DragonFly submit List (threaded) for 2004-12
[
Date Prev][
Date Next]
[
Thread Prev][
Thread Next]
[
Date Index][
Thread Index]
atomic 64 bit add for pentium+
Hi all,
as promised on commits@, here is a generic 64 bit add operator for
Pentium+ and the necessary change for gencount_inc. Also attached is
a small hack for cpuperf, which was used to obtain the numbers below.
The good news is that gencount_inc can be made critical-section free;
the bad news is the performance of cmpxchg8b on the P4. Like so many
other ops, it totally sucks.
My P4 notebook: 115.857nS/loop for cmpxchg8b, compared to 7.517nS/loop
for cmpxchg.
Leaf (AMD64): 6.788nS/loop for cmpxchg8b, compared to 1.293nS/loop for
cmpxchg.
Intel sucks.
Conclusion: The overhead on AMD64 is much less and seems completely
acceptable, for P4 it depends. Matt, what's the speed of critical
sections on P4?
I'd like to get some numbers for other processors as well.
Joerg
Index: atomic.h
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/sys/i386/include/atomic.h,v
retrieving revision 1.8
diff -u -r1.8 atomic.h
--- atomic.h 29 Jul 2004 20:31:13 -0000 1.8
+++ atomic.h 9 Dec 2004 02:26:30 -0000
@@ -159,4 +159,38 @@
#endif
+#if defined(I586_CPU) || defined(I686_CPU)
+/*
+ * Atomically add v to the 64 bit value at *p using cmpxchg8b (Pentium+).
+ *
+ * cmpxchg8b compares EDX:EAX with the 8 bytes at *p.  On a match it
+ * stores ECX:EBX there and sets ZF; otherwise it clears ZF and loads the
+ * current contents of *p into EDX:EAX, so the retry needs no reload.
+ *
+ * The two initial 32 bit loads are not atomic as a pair.  A torn
+ * snapshot just makes the first cmpxchg8b fail and hand back a
+ * consistent value for the next attempt.
+ */
+static __inline
+void
+atomic_add_long_long(unsigned long long *p, unsigned long long v)
+{
+	__asm __volatile(
+	    "movl %0, %%eax\n"
+	    "\tmovl 4+%0, %%edx\n"
+	    "1:\n"
+	    /* Rebuild the sum from the untouched addend on every attempt. */
+	    "\tmovl %1, %%ebx\n"
+	    "\tmovl %2, %%ecx\n"
+	    "\taddl %%eax, %%ebx\n"
+	    /* adcl folds the carry out of the low word into the high word. */
+	    "\tadcl %%edx, %%ecx\n"
+	    "\tlock; cmpxchg8b %0\n"
+	    "\tjnz 1b"
+	    : "+m" (*p)
+	    : "g" ((u_long)v), "g" ((u_long)(v >> 32))
+	    : "memory", "cc", "eax", "ebx", "ecx", "edx");
+}
+#endif
+
#endif /* ! _MACHINE_ATOMIC_H_ */
Index: gencount.h
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/sys/i386/include/gencount.h,v
retrieving revision 1.1
diff -u -r1.1 gencount.h
--- gencount.h 8 Dec 2004 23:19:51 -0000 1.1
+++ gencount.h 9 Dec 2004 02:39:04 -0000
@@ -41,6 +41,8 @@
#error "no user-servicable parts inside"
#endif
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
#include <sys/types.h>
#include <sys/thread2.h>
@@ -60,6 +62,12 @@
static __inline void
gencount_inc(gencount_t *gencnt)
{
+#if defined(I586_CPU) || defined(I686_CPU)
+ if (cpu_feature & CPUID_CX8) {
+ atomic_add_long_long((uint64_t *)gencnt, 1);
+ return;
+ }
+#endif
crit_enter();
if (++gencnt->high == 0)
++gencnt->low;
Index: Makefile
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/test/cpuperf/Makefile,v
retrieving revision 1.2
diff -u -r1.2 Makefile
--- Makefile 9 Feb 2004 18:15:35 -0000 1.2
+++ Makefile 9 Dec 2004 02:43:30 -0000
@@ -4,7 +4,7 @@
.PATH: ${.CURDIR}/../sysperf
TARGETS=/tmp/cpu_add /tmp/cpu_ladd /tmp/cpu_cmpadd /tmp/cpu_cmpexg \
- /tmp/cpu_lcmpexg /tmp/cpu_call
+ /tmp/cpu_lcmpexg /tmp/cpu_call /tmp/cpu_cmpexg8b /tmp/cpu_lcmpexg8b
CFLAGS= -O2 -g -I../sysperf
@@ -22,9 +22,15 @@
/tmp/cpu_cmpexg: cputest.c cpu_cmpexg.S blib.c
$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
+/tmp/cpu_cmpexg8b: cputest.c cpu_cmpexg8b.S blib.c
+ $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
+
/tmp/cpu_lcmpexg: cputest.c cpu_lcmpexg.S blib.c
$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
+/tmp/cpu_lcmpexg8b: cputest.c cpu_lcmpexg8b.S blib.c
+ $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
+
/tmp/cpu_call: cputest.c cpu_call.S blib.c
$(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET}
Index: cpu_cmpexg8b.S
===================================================================
RCS file: cpu_cmpexg8b.S
diff -N cpu_cmpexg8b.S
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ cpu_cmpexg8b.S 9 Dec 2004 02:41:38 -0000
@@ -0,0 +1,31 @@
+/* $DragonFly: src/test/cpuperf/cpu_cmpexg.S,v 1.1 2004/02/09 18:08:54 dillon Exp $ */
+/* cputest fragment: test_load is test_dummy plus one non-locked cmpxchg8b. */
+ .globl test_dummy
+ .globl test_load
+ .globl test_str
+
+ .p2align 5
+test_dummy: /* baseline: the same instructions as test_load, minus the cmpxchg8b */
+ movl 4(%esp),%ecx /* %ecx = first stack argument, used as a pointer below */
+ movl $0,%edx
+ movl $1,%eax
+ movl %ebx,%edx
+ movl $0,(%ecx) /* zero the dword at (%ecx) */
+ addl $3,%eax
+ ret
+
+ .p2align 5
+test_load: /* measured routine: baseline plus the instruction under test */
+ movl 4(%esp),%ecx /* %ecx = first stack argument; also the high half of the replacement value */
+ movl $0,%edx /* %edx:%eax = 0:1 is the comparand for cmpxchg8b */
+ movl $1,%eax
+ cmpxchg8b (%ecx) /* instruction under test; NOTE(review): %eax is 1 but every call leaves 0 in the low dword, so the compare looks like it fails - confirm the "successful exchange" wording in test_str */
+ movl %ebx,%edx
+ movl $0,(%ecx) /* zero the low dword of the 8-byte operand */
+ addl $3,%eax
+ ret
+
+ .p2align 5
+test_str: /* description string exported for the test driver */
+ .asciz "non-locked cmpxchg8b (successful exchange) in pipeline"
+
Index: cpu_lcmpexg8b.S
===================================================================
RCS file: cpu_lcmpexg8b.S
diff -N cpu_lcmpexg8b.S
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ cpu_lcmpexg8b.S 9 Dec 2004 02:43:02 -0000
@@ -0,0 +1,31 @@
+/* $DragonFly: src/test/cpuperf/cpu_lcmpexg.S,v 1.1 2004/02/09 18:08:54 dillon Exp $ */
+/* cputest fragment: test_load is test_dummy plus one lock-prefixed cmpxchg8b. */
+ .globl test_dummy
+ .globl test_load
+ .globl test_str
+
+ .p2align 5
+test_dummy: /* baseline: the same instructions as test_load, minus the locked cmpxchg8b */
+ movl 4(%esp),%ecx /* %ecx = first stack argument, used as a pointer below */
+ movl $0,%edx
+ movl $1,%eax
+ movl %ebx,%edx
+ movl $0,(%ecx) /* zero the dword at (%ecx) */
+ addl $3,%eax
+ ret
+
+ .p2align 5
+test_load: /* measured routine: baseline plus the instruction under test */
+ movl 4(%esp),%ecx /* %ecx = first stack argument; also the high half of the replacement value */
+ movl $0,%edx /* %edx:%eax = 0:1 is the comparand for cmpxchg8b */
+ movl $1,%eax
+ lock; cmpxchg8b (%ecx) /* instruction under test; NOTE(review): %eax is 1 but every call leaves 0 in the low dword, so the compare looks like it fails - confirm the "successful exchange" wording in test_str */
+ movl %ebx,%edx
+ movl $0,(%ecx) /* zero the low dword of the 8-byte operand */
+ addl $3,%eax
+ ret
+
+ .p2align 5
+test_str: /* description string exported for the test driver */
+ .asciz "bus-locked cmpxchg8b (successful exchange) in pipeline"
+
Index: cputest.c
===================================================================
RCS file: /home/joerg/wd/repository/dragonflybsd/src/test/cpuperf/cputest.c,v
retrieving revision 1.1
diff -u -r1.1 cputest.c
--- cputest.c 9 Feb 2004 18:08:54 -0000 1.1
+++ cputest.c 9 Dec 2004 02:48:36 -0000
@@ -11,7 +11,7 @@
extern char test_str[];
-int junk;
+int junk[10];
int
main(int ac, char **av)
@@ -24,19 +24,19 @@
printf("CPUTEST %s\n", test_str);
start_timing();
for (i = 0; ; ++i) {
- test_load(&junk);
+ test_load(junk);
if ((i & 65535) == 0 && get_timing() > 1000000)
break;
}
ttl = i * 4;
start_timing();
for (i = 0; i < ttl; ++i) {
- test_dummy(&junk);
+ test_dummy(junk);
}
us1 = get_timing();
start_timing();
for (i = 0; i < ttl; ++i) {
- test_load(&junk);
+ test_load(junk);
}
us2 = get_timing();
stop_timing2(ttl, us2 - us1, "instruction overhead:");
[
Date Prev][
Date Next]
[
Thread Prev][
Thread Next]
[
Date Index][
Thread Index]