Chromium Code Reviews
| Index: base/atomicops_internals_arm_gcc.h |
| diff --git a/base/atomicops_internals_arm_gcc.h b/base/atomicops_internals_arm_gcc.h |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..d8751b250675dd5a5f1e0193fa282c60ea5bf32b |
| --- /dev/null |
| +++ b/base/atomicops_internals_arm_gcc.h |
| @@ -0,0 +1,296 @@ |
| +// Copyright (c) 2009 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +// This file is an internal atomic implementation, use base/atomicops.h instead. |
| +// |
| +// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. |
| + |
| +#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ |
| +#define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ |
| + |
| +namespace base { |
| +namespace subtle { |
| + |
| +// Memory barriers on ARM are funky, but the kernel is here to help: |
| +// |
| +// * ARMv5 didn't support SMP, so there is no memory barrier instruction |
| +// at all on this architecture, or when targeting its machine code. |
| +// |
| +// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by |
| +// writing a random value to a very specific coprocessor register. |
| +// |
| +// * On ARMv7, the "dmb" instruction is used to perform a full memory |
| +// barrier (though writing to the co-processor will still work). |
| +// However, on single-core devices (e.g. Nexus One or Nexus S), |
| +// this instruction takes up to 200 ns, which is huge, even though |
| +// it's completely unneeded on these devices. |
| +// |
| +// * There is no easy way to determine at runtime if the device is |
| +// single or multi-core. However, the kernel provides a useful helper |
| +// function at a fixed memory address (0xffff0fa0), which will always |
| +// perform a memory barrier in the most efficient way. I.e. on single |
| +// core devices, this is an empty function that exits immediately. |
| +// On multi-core devices, it implements a full memory barrier. |
| +// |
| +// Note that this helper function doesn't modify any register or memory. |
| +// See the comment in Barrier_AtomicIncrement() to see why it is |
| +// important. |
| +// |
| +// * This source could be compiled to ARMv5 machine code that runs on a |
| +// multi-core ARMv6 or ARMv7 device. In this case, memory barriers |
| +// are needed for correct execution. Always call the kernel helper, even |
| +// when targeting ARMv5TE. |
| +// |
| + |
| +#define LINUX_ARM_KERNEL_MEMORY_BARRIER 0xffff0fa0 |
| + |
| +inline void MemoryBarrier() { |
| + ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)(); |
JF 2013/06/03 18:29:27:
Shouldn't this have a compiler barrier to prevent the compiler from reordering memory accesses around it?
digit1 2013/06/04 09:05:28:
I believe that all function calls are implicit compiler barriers.
| +} |
| + |
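The thread above asks whether MemoryBarrier() also needs an explicit compiler barrier. A minimal sketch (not part of this patch; the function name is hypothetical) of what that would look like if one did not want to rely on the call itself being opaque to the compiler:

    inline void MemoryBarrierWithCompilerBarrier() {
      ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)();
      // Empty asm with a "memory" clobber: prevents the compiler from
      // reordering or caching memory accesses across this point.
      __asm__ __volatile__("" : : : "memory");
    }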
| +#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_6__) |
| + |
| +// On ARMv6 and higher, it is possible to use the ldrex/strex |
| +// instructions directly to implement fast atomic operations. |
| +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, |
| + Atomic32 old_value, |
| + Atomic32 new_value) { |
| + Atomic32 prev_value; |
| + int reloop; |
| + do { |
| + // The following is equivalent to: |
| + // |
| + // prev_value = LDREX(ptr) |
| + // reloop = 0 |
| + // if (prev_value == old_value) |
| + // reloop = STREX(ptr, new_value) |
| + __asm__ __volatile__(" ldrex %0, [%3]\n" |
| + " mov %1, #0\n" |
| + " teq %0, %4\n" |
| +#ifdef __thumb2__ |
JF 2013/06/03 18:29:27:
I don't think this is needed, on ARM it blocks sho…
digit1 2013/06/04 09:05:28:
As of today, it's definitely needed with version o…
| + " it eq\n" |
| +#endif |
| + " strexeq %1, %5, [%3]\n" |
| + : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr) |
| + : "r"(ptr), "r"(old_value), "r"(new_value) |
| + : "cc"); |
| + } while (reloop != 0); |
| + return prev_value; |
| +} |
| + |
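For comparison, roughly the same operation could be expressed with GCC's __sync builtin. Note that the builtin implies a full barrier, so it is strictly stronger than NoBarrier_CompareAndSwap above; this is a sketch for illustration, not what the patch uses, and the function name is made up:

    inline Atomic32 CompareAndSwap_FullBarrier(volatile Atomic32* ptr,
                                               Atomic32 old_value,
                                               Atomic32 new_value) {
      // Returns the value at *ptr before the operation, like the
      // hand-written LDREX/STREX loop, but with full-barrier semantics.
      return __sync_val_compare_and_swap(ptr, old_value, new_value);
    }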
| +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, |
| + Atomic32 old_value, |
| + Atomic32 new_value) { |
| + Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value); |
| + MemoryBarrier(); |
| + return result; |
| +} |
| + |
| +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, |
| + Atomic32 old_value, |
| + Atomic32 new_value) { |
| + MemoryBarrier(); |
| + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); |
| +} |
| + |
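As a usage sketch, the acquire/release CAS variants above are the kind of primitive a simple lock would build on (hypothetical SpinLock type, not part of this patch; Release_Store is defined further down in this header):

    struct SpinLock {
      volatile Atomic32 state_;  // 0 = unlocked, 1 = locked

      void Lock() {
        // Acquire semantics: reads/writes in the critical section cannot
        // move above the successful CAS.
        while (Acquire_CompareAndSwap(&state_, 0, 1) != 0) {
          // Spin until the lock is observed free.
        }
      }

      void Unlock() {
        // Release semantics: writes in the critical section become visible
        // before the lock is observed as free.
        Release_Store(&state_, 0);
      }
    };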
| +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, |
| + Atomic32 increment) { |
| + Atomic32 value; |
| + int reloop; |
| + do { |
| + // Equivalent to: |
| + // |
| + // value = LDREX(ptr) |
| + // value += increment |
| + // reloop = STREX(ptr, value) |
| + // |
| + __asm__ __volatile__(" ldrex %0, [%3]\n" |
| + " add %0, %0, %4\n" |
| + " strex %1, %0, [%3]\n" |
| + : "=&r"(value), "=&r"(reloop), "+m"(*ptr) |
| + : "r"(ptr), "r"(increment) |
| + : "cc"); |
| + } while (reloop); |
| + return value; |
| +} |
| + |
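For readers less familiar with LDREX/STREX, the loop above is equivalent in effect to a plain CAS loop built on NoBarrier_CompareAndSwap (illustrative restatement only; the inline assembly is what the patch actually uses, and the function name is hypothetical):

    inline Atomic32 AtomicIncrement_CasLoop(volatile Atomic32* ptr,
                                            Atomic32 increment) {
      Atomic32 old_value, new_value;
      do {
        old_value = *ptr;
        new_value = old_value + increment;
        // Retry if another thread modified *ptr between the load and the CAS.
      } while (NoBarrier_CompareAndSwap(ptr, old_value, new_value) != old_value);
      return new_value;
    }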
| +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, |
| + Atomic32 increment) { |
| + Atomic32 value; |
| + uint32_t barrier = LINUX_ARM_KERNEL_MEMORY_BARRIER; |
| + int reloop; |
| + // The following is very subtle. This function is supposed |
| + // to perform a "full memory barrier", i.e. it's supposed to |
| + // enforce both acquire and release semantics at the same time. |
| + // |
| + // A naive way to implement this is to perform two MemoryBarrier() calls, |
| + // one before, and one after the actual atomic operation, as in: |
| + // |
| + // MemoryBarrier(); |
| + // NoBarrier_AtomicIncrement(); |
| + // MemoryBarrier(); |
| + // |
| + // However, memory barriers are costly, especially on multi-core |
| + // devices, so the code below uses a single one and looks like: |
| + // |
| + // LDREX ... |
| + // MemoryBarrier() |
| + // STREX .. |
| + // |
| + // That's because this is equivalent to performing a "load-acquire" |
| + // (LDREX + MemoryBarrier()) followed by a "store-release" |
| + // (MemoryBarrier() + STREX). |
| + // |
| + // Note that when looping is needed due to failed STREX operations, |
| + // MemoryBarrier() will be called on each iteration. However, it |
| + // takes a minimum of 3 iterations for this code to perform worse than |
| + // the naive version. If this happens, the cache line is already |
| + // highly contended and performance will probably be very bad |
| + // already. |
| + // |
| + // IMPORTANT: This works because the MemoryBarrier() kernel helper |
| + // never modifies any register or memory! |
| + do { |
| + // Equivalent to: |
| + // |
| + // value = LDREX(ptr) |
| + // value += increment |
| + // barrier(); |
| + // reloop = STREX(ptr, value) |
| + // |
| + __asm__ __volatile__(" ldrex %0, [%3]\n" |
| + " add %0, %0, %4\n" |
| + " blx %5\n" |
Dmitry Vyukov 2013/06/03 18:01:22:
yeah, this is very subtle
the following does not m…
digit1 2013/06/04 09:05:28:
Good point, I've restored the two-memory-barriers version.
| + " strex %1, %0, [%3]\n" |
| + : "=&r"(value), "=&r"(reloop), "+m"(*ptr) |
| + : "r"(ptr), "r"(increment), "r"(barrier) |
| + : "cc"); |
Dmitry Vyukov 2013/06/03 18:01:22:
add "memory"
digit1 2013/06/04 09:05:28:
Done, thanks. Also, I removed the superfluous "cc" clobber.
| + } while (reloop); |
| + return value; |
| +} |
| + |
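The "naive" two-barrier form described in the comment block above (and which the review thread says was eventually restored) would look roughly like this sketch; the function name is illustrative only:

    inline Atomic32 Barrier_AtomicIncrement_TwoBarriers(volatile Atomic32* ptr,
                                                        Atomic32 increment) {
      MemoryBarrier();  // acquire side
      Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
      MemoryBarrier();  // release side
      return result;
    }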
| +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, |
| + Atomic32 new_value) { |
| + Atomic32 old_value; |
| + int reloop; |
| + do { |
| + // old_value = LDREX(ptr) |
| + // reloop = STREX(ptr, new_value) |
| + __asm__ __volatile__(" ldrex %0, [%3]\n" |
| + " strex %1, %4, [%3]\n" |
| + : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr) |
| + : "r"(ptr), "r"(new_value) |
| + : "cc"); |
| + } while (reloop != 0); |
| + return old_value; |
| +} |
| + |
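Equivalently, the exchange can be phrased as a CAS loop, which is essentially what the ARMv5 fallback below does with the kernel cmpxchg helper (illustrative sketch only; the name is hypothetical):

    inline Atomic32 AtomicExchange_CasLoop(volatile Atomic32* ptr,
                                           Atomic32 new_value) {
      Atomic32 old_value;
      do {
        old_value = *ptr;
        // Retry until no other thread changes *ptr between the load and CAS.
      } while (NoBarrier_CompareAndSwap(ptr, old_value, new_value) != old_value);
      return old_value;
    }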
| +#else |
| + |
| +// The kernel also provides a helper function to perform an atomic |
| +// compare-and-swap operation at the hard-wired address 0xffff0fc0. |
| +// On ARMv5, this is implemented by a special code path that the kernel |
| +// detects and treats specially when thread pre-emption happens. |
| +// On ARMv6 and higher, it uses LDREX/STREX instructions instead. |
| +// |
| +// Note that this always performs a full memory barrier, so there is no |
| +// need to add calls to MemoryBarrier() before or after it. |
| +typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value, |
Dmitry Vyukov 2013/06/03 18:01:22:
I would use int as the return type, it's 0/1 rathe…
Dmitry Vyukov 2013/06/03 18:01:22:
add a comment that this function lies with return…
digit1 2013/06/04 09:05:28:
Done.
| + Atomic32 new_value, |
| + volatile Atomic32* ptr); |
| +LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) = |
| + (LinuxKernelCmpxchgFunc)0xffff0fc0; |
| + |
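The review thread above suggests using int for the helper's return type, since the call sites below treat the result as a success/failure flag (0 on success, non-zero otherwise) rather than as an Atomic32 value. A sketch of that variant; the names with the "Alt" suffix are hypothetical:

    // Same fixed kernel address, but with the return type the review suggests:
    // 0 when the compare-and-swap succeeded, non-zero when it failed.
    typedef int (*LinuxKernelCmpxchgFuncAlt)(Atomic32 old_value,
                                             Atomic32 new_value,
                                             volatile Atomic32* ptr);
    LinuxKernelCmpxchgFuncAlt pLinuxKernelCmpxchgAlt __attribute__((weak)) =
        (LinuxKernelCmpxchgFuncAlt)0xffff0fc0;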
| +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, |
| + Atomic32 old_value, |
| + Atomic32 new_value) { |
| + Atomic32 prev_value = *ptr; |
| + do { |
| + if (!pLinuxKernelCmpxchg( |
| + old_value, new_value, const_cast<Atomic32*>(ptr))) { |
| + return old_value; |
| + } |
| + prev_value = *ptr; |
Dmitry Vyukov 2013/06/03 18:01:22:
it does not have dmb after the load (provided that…
digit1 2013/06/04 09:05:28:
Fixed. Note that this part of the code comes from…
| + } while (prev_value == old_value); |
| + return prev_value; |
| +} |
| + |
| +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, |
| + Atomic32 new_value) { |
| + Atomic32 old_value; |
| + do { |
| + old_value = *ptr; |
| + } while ( |
| + pLinuxKernelCmpxchg(old_value, new_value, const_cast<Atomic32*>(ptr))); |
| + return old_value; |
| +} |
| + |
| +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, |
| + Atomic32 increment) { |
| + return Barrier_AtomicIncrement(ptr, increment); |
| +} |
| + |
| +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, |
| + Atomic32 increment) { |
| + for (;;) { |
| + // Atomic exchange the old value with an incremented one. |
| + Atomic32 old_value = *ptr; |
| + Atomic32 new_value = old_value + increment; |
| + if (pLinuxKernelCmpxchg(old_value, new_value, const_cast<Atomic32*>(ptr)) == |
| + 0) { |
| + // The exchange took place as expected. |
| + return new_value; |
| + } |
| + // Otherwise, *ptr changed mid-loop and we need to retry. |
| + } |
| +} |
| + |
| +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, |
| + Atomic32 old_value, |
| + Atomic32 new_value) { |
| + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); |
| +} |
| + |
| +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, |
| + Atomic32 old_value, |
| + Atomic32 new_value) { |
| + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); |
| +} |
| + |
| +#endif // __ARM_ARCH_6__ || __ARM_ARCH_7A__ |
| + |
| +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { |
JF 2013/06/03 18:29:27:
This is only atomic when aligned, right? A comment…
digit1 2013/06/04 09:05:28:
Yes, I've added a comment to the source code. I do…
| + *ptr = value; |
| +} |
| + |
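Regarding the alignment question in the thread above: plain 32-bit ARM loads and stores are only single-copy atomic when the address is 4-byte aligned. A hypothetical debug-only check, assuming <cassert> and <stdint.h> were available (not part of this patch), could look like:

    inline void NoBarrier_Store_Checked(volatile Atomic32* ptr, Atomic32 value) {
      // Plain stores are only atomic on ARM for naturally aligned addresses.
      assert((reinterpret_cast<uintptr_t>(ptr) & 3) == 0);
      *ptr = value;
    }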
| +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { |
| + *ptr = value; |
| + MemoryBarrier(); |
| +} |
| + |
| +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { |
| + MemoryBarrier(); |
| + *ptr = value; |
| +} |
| + |
| +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; } |
| + |
| +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { |
| + Atomic32 value = *ptr; |
| + MemoryBarrier(); |
| + return value; |
| +} |
| + |
| +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { |
| + MemoryBarrier(); |
| + return *ptr; |
| +} |
| + |
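To illustrate how these operations are meant to pair up, here is a hypothetical publish/consume example (the names g_data, g_ready, Producer, and Consumer are illustrative, not part of this patch):

    Atomic32 g_data = 0;
    Atomic32 g_ready = 0;

    void Producer() {
      g_data = 42;                   // ordinary store
      Release_Store(&g_ready, 1);    // barrier, then store: publishes g_data
    }

    Atomic32 Consumer() {
      if (Acquire_Load(&g_ready)) {  // load, then barrier
        return g_data;               // guaranteed to observe 42
      }
      return 0;                      // flag not set yet
    }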
| +#undef LINUX_ARM_KERNEL_MEMORY_BARRIER |
| + |
| +} // namespace base::subtle |
| +} // namespace base |
| + |
| +#endif // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ |