Chromium Code Reviews
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is an internal atomic implementation, use base/atomicops.h instead.
//
// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.

#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_
#define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

namespace base {
namespace subtle {

// Memory barriers on ARM are funky, but the kernel is here to help:
//
// * ARMv5 didn't support SMP; there is no memory barrier instruction at
//   all on this architecture, or when targeting its machine code.
//
// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by
//   writing a random value to a very specific coprocessor register.
//
// * On ARMv7, the "dmb" instruction is used to perform a full memory
//   barrier (though writing to the coprocessor will still work).
//   However, on single-core devices (e.g. Nexus One or Nexus S),
//   this instruction will take up to 200 ns, which is huge, even though
//   it's completely unneeded on these devices.
//
// * There is no easy way to determine at runtime if the device is
//   single- or multi-core. However, the kernel provides a useful helper
//   function at a fixed memory address (0xffff0fa0), which will always
//   perform a memory barrier in the most efficient way. I.e. on single-
//   core devices, this is an empty function that exits immediately.
//   On multi-core devices, it implements a full memory barrier.
//
//   Note that this helper function doesn't modify any register or memory.
//   See the comment in Barrier_AtomicIncrement() to see why this is
//   important.
//
// * This source could be compiled to ARMv5 machine code that runs on a
//   multi-core ARMv6 or ARMv7 device. In this case, memory barriers
//   are needed for correct execution. Always call the kernel helper, even
//   when targeting ARMv5TE.
//

#define LINUX_ARM_KERNEL_MEMORY_BARRIER 0xffff0fa0

inline void MemoryBarrier() {
  ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)();

JF
2013/06/03 18:29:27
Shouldn't this have a compiler barrier to prevent the compiler from reordering memory accesses around the call?

digit1
2013/06/04 09:05:28
I believe that all function calls are implicit compiler barriers.

}

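// Illustrative sketch (not part of the patch under review): the thread
// above discusses compiler barriers. A pure compiler barrier prevents
// the compiler from reordering or caching memory accesses across it,
// without emitting any hardware barrier instruction; with GCC it is an
// empty asm statement with a "memory" clobber. A call to an opaque,
// non-inlined function (like the kernel helper above) has the same
// compiler-level effect, which is the point made in the reply.
inline void CompilerBarrierSketch() {
  __asm__ __volatile__("" : : : "memory");
}
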
#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_6__)

// On ARMv6 and higher, it is possible to use the ldrex/strex
// instructions directly to implement fast atomic operations.
inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  int reloop;
  do {
    // The following is equivalent to:
    //
    //   prev_value = LDREX(ptr)
    //   reloop = 0
    //   if (prev_value == old_value)
    //     reloop = STREX(ptr, new_value)
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  mov %1, #0\n"
                         "  teq %0, %4\n"
#ifdef __thumb2__

JF
2013/06/03 18:29:27
I don't think this is needed, on ARM it blocks sho…

digit1
2013/06/04 09:05:28
As of today, it's definitely needed with version o…

                         "  it eq\n"
#endif
                         "  strexeq %1, %5, [%3]\n"
                         : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(old_value), "r"(new_value)
                         : "cc");
  } while (reloop != 0);
  return prev_value;
}

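// Illustrative usage sketch (not part of the patch under review):
// callers typically wrap the CAS primitive in a retry loop. This
// hypothetical helper raises *ptr to at least |candidate|:
inline Atomic32 UpdateMaxSketch(volatile Atomic32* ptr, Atomic32 candidate) {
  for (;;) {
    Atomic32 current = *ptr;
    if (candidate <= current)
      return current;  // *ptr is already large enough.
    if (NoBarrier_CompareAndSwap(ptr, current, candidate) == current)
      return candidate;  // Our value was installed.
    // Another thread changed *ptr in the meantime; retry.
  }
}
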
inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  MemoryBarrier();
  return result;
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  MemoryBarrier();
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

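// Illustrative usage sketch (not part of the patch under review): the
// acquire/release CAS pair above maps onto the classic lock protocol,
// with acquire semantics on lock and release semantics on unlock.
// Release_Store is defined near the end of this header.
class SpinLockSketch {
 public:
  SpinLockSketch() : state_(0) {}
  void Lock() {
    // Spin until state_ goes from 0 (free) to 1 (held); the barrier in
    // Acquire_CompareAndSwap keeps the critical section from starting
    // before the lock is observed as taken.
    while (Acquire_CompareAndSwap(&state_, 0, 1) != 0) {
    }
  }
  void Unlock() {
    // The barrier in Release_Store keeps the critical section from
    // leaking past the unlocking store.
    Release_Store(&state_, 0);
  }
 private:
  volatile Atomic32 state_;
};
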
inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  Atomic32 value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   value = LDREX(ptr)
    //   value += increment
    //   reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  add %0, %0, %4\n"
                         "  strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment)
                         : "cc");
  } while (reloop);
  return value;
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  Atomic32 value;
  uint32_t barrier = LINUX_ARM_KERNEL_MEMORY_BARRIER;
  int reloop;
  // The following is very subtle. This function is supposed
  // to perform a "full memory barrier", i.e. it's supposed to
  // enforce both acquire and release semantics at the same time.
  //
  // A naive way to implement this is to perform two MemoryBarrier()
  // calls, one before and one after the actual atomic operation, as in:
  //
  //   MemoryBarrier();
  //   NoBarrier_AtomicIncrement();
  //   MemoryBarrier();
  //
  // However, memory barriers are costly, especially on multi-core
  // devices, so the code below uses a single one and looks like:
  //
  //   LDREX ...
  //   MemoryBarrier()
  //   STREX ...
  //
  // That's because this is equivalent to performing a "load-acquire"
  // (LDREX + MemoryBarrier()) followed by a "store-release"
  // (MemoryBarrier() + STREX).
  //
  // Note that when looping is needed, due to failed STREX operations,
  // MemoryBarrier() will be called in each iteration. However, it
  // takes a minimum of 3 iterations for this code to perform worse than
  // the naive one. If this happens, the cache line at that address is
  // already highly contended and performance will probably be very bad
  // already.
  //
  // IMPORTANT: This works because the MemoryBarrier() kernel helper
  // never modifies any register or memory!
  do {
    // Equivalent to:
    //
    //   value = LDREX(ptr)
    //   value += increment
    //   barrier()
    //   reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  add %0, %0, %4\n"
                         "  blx %5\n"

Dmitry Vyukov
2013/06/03 18:01:22
yeah, this is very subtle
the following does not m…

digit1
2013/06/04 09:05:28
Good point, I've restored the two-memory-barriers approach.

                         "  strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment), "r"(barrier)
                         : "cc");

Dmitry Vyukov
2013/06/03 18:01:22
add "memory"

digit1
2013/06/04 09:05:28
Done, thanks. Also, I removed the superfluous "cc" clobber.

  } while (reloop);
  return value;
}

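// Illustrative sketch (not part of the patch under review): the naive
// variant described in the comment above, with one kernel barrier on
// each side of the unordered increment. The inline-assembly version
// above instead folds a single barrier between the LDREX and STREX.
inline Atomic32 NaiveBarrier_AtomicIncrementSketch(volatile Atomic32* ptr,
                                                   Atomic32 increment) {
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}
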
inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   old_value = LDREX(ptr)
    //   reloop = STREX(ptr, new_value)
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  strex %1, %4, [%3]\n"
                         : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(new_value)
                         : "cc");
  } while (reloop != 0);
  return old_value;
}

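// Illustrative usage sketch (not part of the patch under review):
// thread-safe reference counting is a classic client of the two
// increment flavors above. The increment needs no ordering, but the
// decrement must be a full barrier so that every access to the object
// happens-before its destruction.
struct RefCountSketch {
  RefCountSketch() : count_(1) {}  // Starts owned by the creator.
  void AddRef() { NoBarrier_AtomicIncrement(&count_, 1); }
  // Returns true when the caller is responsible for deleting the object.
  bool Release() { return Barrier_AtomicIncrement(&count_, -1) == 0; }
  volatile Atomic32 count_;
};
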
#else

// The kernel also provides a helper function to perform an atomic
// compare-and-swap operation at the hard-wired address 0xffff0fc0.
// On ARMv5, this is implemented by a special code path that the kernel
// detects and treats specially when thread pre-emption happens.
// On ARMv6 and higher, it uses LDREX/STREX instructions instead.
//
// Note that this always performs a full memory barrier; there is no
// need to add calls to MemoryBarrier() before or after it.
typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value,

Dmitry Vyukov
2013/06/03 18:01:22
I would use int as the return type, it's 0/1 rather than an Atomic32 value.

Dmitry Vyukov
2013/06/03 18:01:22
add a comment that this function lies with its return type.

digit1
2013/06/04 09:05:28
Done.

                                           Atomic32 new_value,
                                           volatile Atomic32* ptr);
LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) =
    (LinuxKernelCmpxchgFunc)0xffff0fc0;

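// Illustrative sketch (not part of the patch under review): the helper
// returns 0 when the swap succeeded and non-zero otherwise (including
// failures where *ptr still holds old_value), so callers retry in a
// loop, as the functions below do. A single attempt looks like this
// hypothetical wrapper:
inline bool TryCmpxchgOnceSketch(volatile Atomic32* ptr,
                                 Atomic32 old_value,
                                 Atomic32 new_value) {
  return pLinuxKernelCmpxchg(old_value, new_value,
                             const_cast<Atomic32*>(ptr)) == 0;
}
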
inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value = *ptr;
  do {
    if (!pLinuxKernelCmpxchg(
            old_value, new_value, const_cast<Atomic32*>(ptr))) {
      // A zero result means the swap succeeded, so the previous value
      // of *ptr was |old_value|.
      return old_value;
    }
    // The helper may fail even when *ptr still holds |old_value|, so
    // re-read the current value and retry only in that case.
    prev_value = *ptr;

Dmitry Vyukov
2013/06/03 18:01:22
it does not have dmb after the load (provided that…

digit1
2013/06/04 09:05:28
Fixed. Note that this part of the code comes from Google Gears.

  } while (prev_value == old_value);
  return prev_value;
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  do {
    old_value = *ptr;
  } while (
      pLinuxKernelCmpxchg(old_value, new_value, const_cast<Atomic32*>(ptr)));
  return old_value;
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  return Barrier_AtomicIncrement(ptr, increment);
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  for (;;) {
    // Atomically exchange the old value with an incremented one.
    Atomic32 old_value = *ptr;
    Atomic32 new_value = old_value + increment;
    if (pLinuxKernelCmpxchg(old_value, new_value,
                            const_cast<Atomic32*>(ptr)) == 0) {
      // The exchange took place as expected.
      return new_value;
    }
    // Otherwise, *ptr changed mid-loop and we need to retry.
  }
}

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

#endif  // __ARM_ARCH_7A__ || __ARM_ARCH_6__

inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {

JF
2013/06/03 18:29:27
This is only atomic when aligned, right? A comment would help.

digit1
2013/06/04 09:05:28
Yes, I've added a comment to the source code. I do…

  *ptr = value;
}

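// Illustrative sketch (not part of the patch under review): as the
// thread above notes, plain 32-bit loads and stores are only atomic on
// ARM when the address is 4-byte aligned. A hypothetical check callers
// could assert on before using the plain load/store functions:
inline bool IsAligned32Sketch(const volatile Atomic32* ptr) {
  // unsigned long is pointer-sized on 32-bit ARM.
  return (reinterpret_cast<unsigned long>(ptr) & 3) == 0;
}
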
inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
  MemoryBarrier();
}

inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
  MemoryBarrier();
  *ptr = value;
}

inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }

inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
  Atomic32 value = *ptr;
  MemoryBarrier();
  return value;
}

inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
  MemoryBarrier();
  return *ptr;
}

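// Illustrative usage sketch (not part of the patch under review):
// Release_Store and Acquire_Load implement the publish/consume
// pattern. The barrier in Release_Store keeps the payload write from
// being reordered after the flag store; the barrier in Acquire_Load
// keeps the payload read from being hoisted before the flag load.
// The two globals are hypothetical and assumed defined elsewhere.
extern Atomic32 g_payload_sketch;
extern volatile Atomic32 g_ready_sketch;

inline void PublishSketch(Atomic32 payload) {
  g_payload_sketch = payload;         // Plain write to the payload...
  Release_Store(&g_ready_sketch, 1);  // ...made visible before the flag.
}

inline bool TryConsumeSketch(Atomic32* out) {
  if (Acquire_Load(&g_ready_sketch) == 0)
    return false;                     // Not published yet.
  *out = g_payload_sketch;            // Safe to read after the flag load.
  return true;
}
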
#undef LINUX_ARM_KERNEL_MEMORY_BARRIER

}  // namespace base::subtle
}  // namespace base

#endif  // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_