
- membar_write on x86_64 is empty by default (on amd64 stores are always ordered)

- x86/x86_64 lock optimizations: spinning on a lock should now be friendlier to
the other CPUs' caches (at the extra cost of a cmp on memory + a jump); also tried
to arrange the instructions a little better to allow for some parallel
execution.

- x86 unlocks with xchg by default (some x86s reorder stores, so a simple mov is
unsafe); see the usage sketch below
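
None of this changes the locking API itself, only the x86/x86_64 internals; callers keep the usual pattern. A minimal usage sketch, assuming the lock header's usual get_lock()/release_lock() pair is in scope (the counter and the wrapper function are made up for illustration):

    static fl_lock_t counter_lock = 0;
    static int counter;

    static void counter_inc(void)
    {
        get_lock(&counter_lock);      /* spins via tsl(); with SPIN_OPTIMIZE most
                                         spin iterations are plain reads */
        counter++;
        release_lock(&counter_lock);  /* on SMP x86 this is now an xchg, not a mov */
    }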

Andrei Pelinescu-Onciul authored on 03/04/2006 19:03:16
Showing 2 changed files
... ...
@@ -31,6 +31,8 @@
  *
  *  Config defines:   - NOSMP
  *                    - X86_OOSTORE (out of order store, defined by default)
+ *                    - X86_64_OOSTORE, like X86_OOSTORE, but for x86_64 cpus,
+ *                      default off
  *                    - __CPU_x86_64 (64 bit mode, long and void* is 64 bit and
  *                                    the cpu has all of the mfence, lfence
  *                                    and sfence instructions)
... ...
@@ -79,7 +81,7 @@
 */
 #define membar() 		asm volatile( " mfence \n\t " : : : "memory" )
 #define membar_read()	asm volatile( " lfence \n\t " : : : "memory" )
-#ifdef X86_OOSTORE
+#ifdef X86_64_OOSTORE
 #define membar_write()	asm volatile( " sfence \n\t " : : : "memory" )
 #else
 #define membar_write()	asm volatile ("" : : : "memory") /* gcc don't cache*/
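
For context, membar_write() exists to order two stores as seen by another cpu, e.g. in a publish pattern. A hedged sketch, assuming the macros above are in scope (shared_data and data_ready are illustrative names, not part of the header):

    extern volatile int shared_data;
    extern volatile int data_ready;

    static void publish(int v)
    {
        shared_data = v;   /* 1. write the payload */
        membar_write();    /* 2. order the two stores: sfence only when *_OOSTORE
                              is defined, otherwise just a compiler barrier */
        data_ready = 1;    /* 3. only then raise the flag */
    }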
... ...
@@ -47,7 +47,8 @@
  *  2006-04-03  optimization: call lock_get memory barrier outside tsl,in the 
  *               calling function, only if the lock operation succeeded
  *               (membar_getlock()) (andrei)
- *              added try_lock()  (andrei)
+ *              added try_lock(); more x86 optimizations, x86  release_lock
+ *               fix (andrei)
  *
  */
 
... ...
@@ -64,6 +65,12 @@
 #endif
 
 
+
+#define SPIN_OPTIMIZE /* if defined optimize spining on the lock:
+                         try first the lock with non-atomic/non memory locking
+                         operations, and only if the lock appears to be free
+                         switch to the more expensive version */
+
 typedef  volatile int fl_lock_t;
 
 
... ...
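The SPIN_OPTIMIZE comment in the hunk above describes the classic test-and-test-and-set idea; the tsl() hunk below implements it in inline asm. A rough C equivalent using the GCC __atomic builtins, only for illustration (my_try_once() is a made-up name, not part of the lock header):

    /* returns 0 if the lock was acquired, non-zero if it is already held */
    static inline int my_try_once(volatile int* lock)
    {
        if (*lock != 0)        /* cheap test: a plain read, served from the
                                  local cache while someone else holds the lock */
            return 1;
        /* the lock looks free: only now pay for the bus/cache-line locking
           atomic exchange */
        return __atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE);
    }
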
@@ -130,16 +137,24 @@ inline static int tsl(fl_lock_t* lock)
 #if defined(__CPU_i386) || defined(__CPU_x86_64)
 
 #ifdef NOSMP
-	val=0;
 	asm volatile(
-		" btsl $0, %1 \n\t"
-		" adcl $0, %0 \n\t"
-		: "=q" (val), "=m" (*lock) : "0"(val) : "memory", "cc"
+		" xor %0, %0 \n\t"
+		" btsl $0, %2 \n\t"
+		" setc %b0 \n\t"
+		: "=q" (val), "=m" (*lock) : "m"(*lock) : "memory", "cc"
 	);
 #else
-	val=1;
-	asm volatile( 
-		" xchg %b1, %0" : "=q" (val), "=m" (*lock) : "0" (val) : "memory"
+	asm volatile(
+#ifdef SPIN_OPTIMIZE
+		" cmpb $0, %2 \n\t"
+		" mov $1, %0 \n\t"
+		" jnz 1f \n\t"
+#else
+		" mov $1, %0 \n\t"
+#endif
+		" xchgb %2, %b0 \n\t"
+		"1: \n\t"
+		: "=q" (val), "=m" (*lock) : "m"(*lock) : "memory"
 	);
 #endif /*NOSMP*/
 #elif defined(__CPU_sparc64) || defined(__CPU_sparc)
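
As the changelog above notes, the acquire barrier is issued by the caller via membar_getlock(), and only once the lock operation succeeded. A hedged sketch of such a calling loop (my_get_lock() is illustrative; the real lock-get function in the header may differ in details):

    static inline void my_get_lock(fl_lock_t* lock)
    {
        while (tsl(lock)) {
            /* busy wait; with SPIN_OPTIMIZE most iterations take the cheap
               cmpb path and never execute the locked xchgb */
        }
        membar_getlock(); /* acquire barrier, only after the lock was won */
    }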
... ...
@@ -250,11 +265,27 @@ inline static int try_lock(fl_lock_t* lock)
 
 inline static void release_lock(fl_lock_t* lock)
 {
-#if defined(__CPU_i386) || defined(__CPU_x86_64)
+#if defined(__CPU_i386) 
+#ifdef NOSMP
 	asm volatile(
-		" movb $0, %0" : "=m"(*lock) : : "memory"
-		/*" xchg %b0, %1" : "=q" (val), "=m" (*lock) : "0" (val) : "memory"*/
+		" movb $0, %0 \n\t" 
+		: "=m"(*lock) : : "memory"
 	); 
+#else /* ! NOSMP */
+	int val;
+	/* a simple mov $0, (lock) does not force StoreStore ordering on all
+	   x86 versions and it doesn't seem to force LoadStore either */
+	asm volatile(
+		" xchgb %b0, %1 \n\t"
+		: "=q" (val), "=m" (*lock) : "0" (0) : "memory"
+	);
+#endif /* NOSMP */
+#elif defined(__CPU_x86_64)
+	asm volatile(
+		" movb $0, %0 \n\t" /* on amd64 membar StoreStore | LoadStore is 
+							   implicit (at least on the same mem. type) */
+		: "=m"(*lock) : : "memory"
+	);
 #elif defined(__CPU_sparc64) || defined(__CPU_sparc)
 	asm volatile(
 #ifndef NOSMP