base/atomicops_internals_arm_gcc.h - Issue 16335007: Improve the implementation of atomic operations on Linux/ARM (including Android/ARM).

Side by Side Diff: base/atomicops_internals_arm_gcc.h

Issue 16335007: Improve the implementation of atomic operations on Linux/ARM (including Android/ARM). (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Add comment about Linux 2.6.24 Created 7 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 // This file is an internal atomic implementation, use base/atomicops.h instead.

	6 //

	7 // LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.

	8

	9 #ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

	10 #define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

	11

	12 namespace base {

	13 namespace subtle {

	14

	15 // Memory barriers on ARM are funky, but the kernel is here to help:

	16 //

	17 // * ARMv5 didn't support SMP, there is no memory barrier instruction at

	18 // all on this architecture, or when targeting its machine code.

	19 //

	20 // * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by

	21 // writing a random value to a very specific coprocessor register.

	22 //

	23 // * On ARMv7, the "dmb" instruction is used to perform a full memory

	24 // barrier (though writing to the co-processor will still work).

	25 // However, on single core devices (e.g. Nexus One, or Nexus S),

	26 // this instruction will take up to 200 ns, which is huge, even though

	27 // it's completely un-needed on these devices.

	28 //

	29 // * There is no easy way to determine at runtime if the device is

	30 // single or multi-core. However, the kernel provides a useful helper

	31 // function at a fixed memory address (0xffff0fa0), which will always

	32 // perform a memory barrier in the most efficient way. I.e. on single

	33 // core devices, this is an empty function that exits immediately.

	34 // On multi-core devices, it implements a full memory barrier.

	35 //

	36 // Note that this helper function doesn't modify any register or memory.

	37 // See the comment in Barrier_AtomicIncrement() to see why it is

	38 // important.

	39 //

	40 // * This source could be compiled to ARMv5 machine code that runs on a

	41 // multi-core ARMv6 or ARMv7 device. In this case, memory barriers

	42 // are needed for correct execution. Always call the kernel helper, even

	43 // when targeting ARMv5TE.

	44 //

	45

	46 #define LINUX_ARM_KERNEL_MEMORY_BARRIER 0xffff0fa0

	47

	48 inline void MemoryBarrier() {

	49 // Note: This is a function call, which is also an implicit compiler

	50 // barrier.

	51 ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)();

	52 }

	53

	54 #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_6__)

	55

	56 // On ARMv6 and higher, it is possible to use ldrex/strex instructions

	57 // directly to implement fast atomic operations.

	58 inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,

	59 Atomic32 old_value,

	60 Atomic32 new_value) {

	61 Atomic32 prev_value;

	62 int reloop;

	63 do {

	64 // The following is equivalent to:

	65 //

	66 // prev_value = LDREX(ptr)

	67 // reloop = 0

	68 // if (prev_value == old_value)

	69 // reloop = STREX(ptr, new_value)

	70 __asm__ __volatile__(" ldrex %0, [%3]\n"

	71 " mov %1, #0\n"

	72 " teq %0, %4\n"

	73 #ifdef __thumb2__

	74 " it eq\n"

	75 #endif

	76 " strexeq %1, %5, [%3]\n"

	77 : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)

	78 : "r"(ptr), "r"(old_value), "r"(new_value)

	79 : "cc", "memory");

	80 } while (reloop != 0);

	81 return prev_value;

	82 }

	83

	84 inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,

	85 Atomic32 old_value,

	86 Atomic32 new_value) {

	87 Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);

	88 MemoryBarrier();

	89 return result;

	90 }

	91

	92 inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,

	93 Atomic32 old_value,

	94 Atomic32 new_value) {

	95 MemoryBarrier();

	96 return NoBarrier_CompareAndSwap(ptr, old_value, new_value);

	97 }

	98

	99 inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,

	100 Atomic32 increment) {

	101 Atomic32 value;

	102 int reloop;

	103 do {

	104 // Equivalent to:

	105 //

	106 // value = LDREX(ptr)

	107 // value += increment

	108 // reloop = STREX(ptr, value)

	109 //

	110 __asm__ __volatile__(" ldrex %0, [%3]\n"

	111 " add %0, %0, %4\n"

	112 " strex %1, %0, [%3]\n"

	113 : "=&r"(value), "=&r"(reloop), "+m"(*ptr)

	114 : "r"(ptr), "r"(increment)

	115 : "memory");

	116 } while (reloop);

	117 return value;

	118 }

	119

	120 inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,

	121 Atomic32 increment) {

	122 // TODO(digit): Investigate if it's possible to implement this with

	123 // a single MemoryBarrier() operation between the LDREX and STREX.

	124 // See http://crbug.com/246514

	125 MemoryBarrier();

	126 Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);

	127 MemoryBarrier();

	128 return result;

	129 }

	130

	131 inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,

	132 Atomic32 new_value) {

	133 Atomic32 old_value;

	134 int reloop;

	135 do {

	136 // old_value = LDREX(ptr)

	137 // fail = STREX(ptr, new_value)

	138 __asm__ __volatile__(" ldrex %0, [%3]\n"

	139 " strex %1, %4, [%3]\n"

	140 : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)

	141 : "r"(ptr), "r"(new_value)

	142 : "memory");
	Dmitry Vyukov 2013/06/04 09:26:00: I think that NoBarrier functions do not require the "memory" clobber, because they properly enumerate all inputs/outputs. However, there is probably no code in Chromium for which it makes any performance difference (stats collection?), so I would use "memory", "cc" everywhere just to be safe.
	digit1 2013/06/04 13:24:18: Thanks for the details, I've put "cc" and "memory" in all fragments.
	143 } while (reloop != 0);

	144 return old_value;

	145 }

	146

	147 #else

	148

	149 // The kernel also provides a helper function to perform an atomic

	150 // compare-and-swap operation at the hard-wired address 0xffff0fc0.

	151 // On ARMv5, this is implemented by a special code path that the kernel

	152 // detects and treats specially when thread pre-emption happens.

	153 // On ARMv6 and higher, it uses LDREX/STREX instructions instead.

	154 //

	155 // Note that this always performs a full memory barrier, so there is no

	156 // need to add calls to MemoryBarrier() before or after it. It also

	157 // returns 0 on success, and a non-zero value on failure.

	158 //

	159 // Available and reliable since Linux 2.6.24. (Note that the first Android releases
	Dmitry Vyukov 2013/06/04 09:26:00: It was available before 2.6.24, but it was returning spurious failures (it fails even if old_value == *ptr).
	digit1* 2013/06/04 11:51:56: Thanks. I'm changing this to "Available and reliable since..."
	digit1 2013/06/04 13:24:18: (In reply to digit1, 2013/06/04 11:51:56.) Done.
	160 // used 2.6.29, and ChromeOS is currently at 3.4, iirc, so this should

	161 // only be a concern for people running _really_ old custom Linux/ARM

	162 // distributions).

	163 typedef int (*LinuxKernelCmpxchgFunc)(Atomic32 old_value,

	164 Atomic32 new_value,

	165 volatile Atomic32* ptr);

	166 LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) =

	167 (LinuxKernelCmpxchgFunc)0xffff0fc0;

	168

	169 inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,

	170 Atomic32 old_value,

	171 Atomic32 new_value) {

	172 Atomic32 prev_value;

	173 for (;;) {

	174 prev_value = *ptr;

	175 if (prev_value != old_value)

	176 return prev_value;
	Dmitry Vyukov 2013/06/04 09:26:00: This does not have acquire semantics either. We load ptr, see that it's different from old_value and return. The caller can act based on the value returned from NoBarrier_CompareAndSwap(), e.g. if it's equal to 1, load some global var. This needs a dmb in between.
	digit1* 2013/06/04 11:51:56: I see. Is it correct that this remark should only apply to Acquire_CompareAndSwap()? I.e. that NoBarrier_CompareAndSwap() and Release_CompareAndSwap() don't need this extra dmb before the return?
	digit1 2013/06/04 12:00:37: Also, please note that it looks like the NoBarrier_CompareAndSwap() implementation in base/atomicops_internals_gcc.h (currently used on Android/ARM and NaCL) has the same "issue". A bit off-topic, but base/atomicops_internal_mips_gcc.h is even more interesting. The NoBarrier_CompareAndSwap() inline assembly also doesn't use any memory barrier, and Barrier_CompareAndSwap() places _compiler_ barriers before/after a call to NoBarrier_CompareAndSwap().
	Dmitry Vyukov 2013/06/04 12:49:32: (In reply to digit1, 2013/06/04 11:51:56.) Yes, that's correct. But the Acquire version does not need a dmb if pLinuxKernelCmpxchg() succeeds, because it includes the barrier. For this reason I prefer to do it the other way around, i.e. if the cheapest atomic RMW is Barrier, then implement the Barrier version and then forward Acquire/Release/NoBarrier to Barrier with a comment that there is no more efficient implementation. But if you forward Barrier into NoBarrier, that's kind of questionable. I know that it's implemented this way in atomicops for historical reasons.
	Dmitry Vyukov 2013/06/04 12:49:32: (In reply to digit1, 2013/06/04 12:00:37.) I would not be surprised if there are bugs.
	digit1 2013/06/04 13:24:18: I've uploaded a patch where only Acquire_CompareAndSwap() performs the extra memory barrier before returning. Please let me know if this is still not correct.
	177 if (!pLinuxKernelCmpxchg(old_value, new_value, ptr))

	178 return old_value;

	179 }

	180 }

	181

	182 inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,

	183 Atomic32 new_value) {

	184 Atomic32 old_value;

	185 do {

	186 old_value = *ptr;

	187 } while (pLinuxKernelCmpxchg(old_value, new_value, ptr));

	188 return old_value;

	189 }

	190

	191 inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,

	192 Atomic32 increment) {

	193 return Barrier_AtomicIncrement(ptr, increment);

	194 }

	195

	196 inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,

	197 Atomic32 increment) {

	198 for (;;) {

	199 // Atomic exchange the old value with an incremented one.

	200 Atomic32 old_value = *ptr;

	201 Atomic32 new_value = old_value + increment;

	202 if (!pLinuxKernelCmpxchg(old_value, new_value, ptr)) {

	203 // The exchange took place as expected.

	204 return new_value;

	205 }

	206 // Otherwise, *ptr changed mid-loop and we need to retry.

	207 }

	208 }

	209

	210 inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,

	211 Atomic32 old_value,

	212 Atomic32 new_value) {

	213 return NoBarrier_CompareAndSwap(ptr, old_value, new_value);

	214 }

	215

	216 inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,

	217 Atomic32 old_value,

	218 Atomic32 new_value) {

	219 return NoBarrier_CompareAndSwap(ptr, old_value, new_value);

	220 }

	221

	222 #endif // __ARM_ARCH_6__ || __ARM_ARCH_7A__

	223

	224 // NOTE: Atomicity of the following load and store operations is only

	225 // guaranteed in case of 32-bit alignment of |ptr| values.

	226

	227 inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {

	228 *ptr = value;

	229 }

	230

	231 inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {

	232 *ptr = value;

	233 MemoryBarrier();

	234 }

	235

	236 inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {

	237 MemoryBarrier();

	238 *ptr = value;

	239 }

	240

	241 inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }

	242

	243 inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {

	244 Atomic32 value = *ptr;

	245 MemoryBarrier();

	246 return value;

	247 }

	248

	249 inline Atomic32 Release_Load(volatile const Atomic32* ptr) {

	250 MemoryBarrier();

	251 return *ptr;

	252 }

	253

	254 #undef LINUX_ARM_KERNEL_MEMORY_BARRIER

	255

	256 } // namespace base::subtle

	257 } // namespace base

	258

	259 #endif // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

OLD	NEW

« no previous file with comments | « base/atomicops.h ('k') | no next file » | no next file with comments »