Chromium Code Reviews
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is an internal atomic implementation, use base/atomicops.h instead.
//
// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.

#ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_
#define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_

namespace base {
namespace subtle {

// Memory barriers on ARM are funky, but the kernel is here to help:
//
// * ARMv5 didn't support SMP; there is no memory barrier instruction at
//   all on this architecture, or when targeting its machine code.
//
// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by
//   writing a random value to a very specific coprocessor register.
//
// * On ARMv7, the "dmb" instruction is used to perform a full memory
//   barrier (though writing to the coprocessor will still work).
//   However, on single-core devices (e.g. Nexus One or Nexus S),
//   this instruction will take up to 200 ns, which is huge, even though
//   it's completely unneeded on these devices.
//
// * There is no easy way to determine at runtime if the device is
//   single- or multi-core. However, the kernel provides a useful helper
//   function at a fixed memory address (0xffff0fa0), which will always
//   perform a memory barrier in the most efficient way. I.e. on single-
//   core devices, this is an empty function that exits immediately.
//   On multi-core devices, it implements a full memory barrier.
//
//   Note that this helper function doesn't modify any register or memory.
//   See the comment in Barrier_AtomicIncrement() to see why this is
//   important.
//
// * This source could be compiled to ARMv5 machine code that runs on a
//   multi-core ARMv6 or ARMv7 device. In this case, memory barriers
//   are needed for correct execution. Always call the kernel helper, even
//   when targeting ARMv5TE.
//

#define LINUX_ARM_KERNEL_MEMORY_BARRIER 0xffff0fa0

inline void MemoryBarrier() {
  ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)();

JF
2013/06/03 18:29:27
Shouldn't this have a compiler barrier to prevent the compiler from reordering memory accesses around the call?

digit1
2013/06/04 09:05:28
I believe that all function calls are implicit compiler barriers.

}

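// Illustrative sketch (not part of the patch under review): the thread
// above discusses compiler barriers. A pure compiler barrier prevents
// the compiler from reordering or caching memory accesses across it,
// without emitting any hardware barrier instruction; with GCC it is an
// empty asm statement with a "memory" clobber. A call to an opaque,
// non-inlined function (like the kernel helper above) has the same
// compiler-level effect, which is the point made in the reply.
inline void CompilerBarrierSketch() {
  __asm__ __volatile__("" : : : "memory");
}
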
#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_6__)

// On ARMv6 and higher, it is possible to use the ldrex/strex
// instructions directly to implement fast atomic operations.
inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  int reloop;
  do {
    // The following is equivalent to:
    //
    //   prev_value = LDREX(ptr)
    //   reloop = 0
    //   if (prev_value == old_value)
    //     reloop = STREX(ptr, new_value)
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  mov %1, #0\n"
                         "  teq %0, %4\n"
#ifdef __thumb2__

JF
2013/06/03 18:29:27
I don't think this is needed, on ARM it blocks sho…

digit1
2013/06/04 09:05:28
As of today, it's definitely needed with version o…

                         "  it eq\n"
#endif
                         "  strexeq %1, %5, [%3]\n"
                         : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(old_value), "r"(new_value)
                         : "cc");
  } while (reloop != 0);
  return prev_value;
}

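// Illustrative usage sketch (not part of the patch under review):
// callers typically wrap the CAS primitive in a retry loop. This
// hypothetical helper raises *ptr to at least |candidate|:
inline Atomic32 UpdateMaxSketch(volatile Atomic32* ptr, Atomic32 candidate) {
  for (;;) {
    Atomic32 current = *ptr;
    if (candidate <= current)
      return current;  // *ptr is already large enough.
    if (NoBarrier_CompareAndSwap(ptr, current, candidate) == current)
      return candidate;  // Our value was installed.
    // Another thread changed *ptr in the meantime; retry.
  }
}
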
inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  MemoryBarrier();
  return result;
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  MemoryBarrier();
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

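// Illustrative usage sketch (not part of the patch under review): the
// acquire/release CAS pair above maps onto the classic lock protocol,
// with acquire semantics on lock and release semantics on unlock.
// Release_Store is defined near the end of this header.
class SpinLockSketch {
 public:
  SpinLockSketch() : state_(0) {}
  void Lock() {
    // Spin until state_ goes from 0 (free) to 1 (held); the barrier in
    // Acquire_CompareAndSwap keeps the critical section from starting
    // before the lock is observed as taken.
    while (Acquire_CompareAndSwap(&state_, 0, 1) != 0) {
    }
  }
  void Unlock() {
    // The barrier in Release_Store keeps the critical section from
    // leaking past the unlocking store.
    Release_Store(&state_, 0);
  }
 private:
  volatile Atomic32 state_;
};
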
inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  Atomic32 value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   value = LDREX(ptr)
    //   value += increment
    //   reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  add %0, %0, %4\n"
                         "  strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment)
                         : "cc");
  } while (reloop);
  return value;
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  Atomic32 value;
  uint32_t barrier = LINUX_ARM_KERNEL_MEMORY_BARRIER;
  int reloop;
  // The following is very subtle. This function is supposed
  // to perform a "full memory barrier", i.e. it's supposed to
  // enforce both acquire and release semantics at the same time.
  //
  // A naive way to implement this is to perform two MemoryBarrier()
  // calls, one before and one after the actual atomic operation, as in:
  //
  //   MemoryBarrier();
  //   NoBarrier_AtomicIncrement();
  //   MemoryBarrier();
  //
  // However, memory barriers are costly, especially on multi-core
  // devices, so the code below uses a single one and looks like:
  //
  //   LDREX ...
  //   MemoryBarrier()
  //   STREX ...
  //
  // That's because this is equivalent to performing a "load-acquire"
  // (LDREX + MemoryBarrier()) followed by a "store-release"
  // (MemoryBarrier() + STREX).
  //
  // Note that when looping is needed, due to failed STREX operations,
  // MemoryBarrier() will be called in each iteration. However, it
  // takes a minimum of 3 iterations for this code to perform worse than
  // the naive one. If this happens, the cache line at that address is
  // already highly contended and performance will probably be very bad
  // already.
  //
  // IMPORTANT: This works because the MemoryBarrier() kernel helper
  // never modifies any register or memory!
  do {
    // Equivalent to:
    //
    //   value = LDREX(ptr)
    //   value += increment
    //   barrier()
    //   reloop = STREX(ptr, value)
    //
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  add %0, %0, %4\n"
                         "  blx %5\n"

Dmitry Vyukov
2013/06/03 18:01:22
yeah, this is very subtle
the following does not m…

digit1
2013/06/04 09:05:28
Good point, I've restored the two-memory-barriers approach.

                         "  strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment), "r"(barrier)
                         : "cc");

Dmitry Vyukov
2013/06/03 18:01:22
add "memory"

digit1
2013/06/04 09:05:28
Done, thanks. Also, I removed the superfluous "cc" clobber.

  } while (reloop);
  return value;
}

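// Illustrative sketch (not part of the patch under review): the naive
// variant described in the comment above, with one kernel barrier on
// each side of the unordered increment. The inline-assembly version
// above instead folds a single barrier between the LDREX and STREX.
inline Atomic32 NaiveBarrier_AtomicIncrementSketch(volatile Atomic32* ptr,
                                                   Atomic32 increment) {
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}
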
inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   old_value = LDREX(ptr)
    //   reloop = STREX(ptr, new_value)
    __asm__ __volatile__("  ldrex %0, [%3]\n"
                         "  strex %1, %4, [%3]\n"
                         : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(new_value)
                         : "cc");
  } while (reloop != 0);
  return old_value;
}

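// Illustrative usage sketch (not part of the patch under review):
// thread-safe reference counting is a classic client of the two
// increment flavors above. The increment needs no ordering, but the
// decrement must be a full barrier so that every access to the object
// happens-before its destruction.
struct RefCountSketch {
  RefCountSketch() : count_(1) {}  // Starts owned by the creator.
  void AddRef() { NoBarrier_AtomicIncrement(&count_, 1); }
  // Returns true when the caller is responsible for deleting the object.
  bool Release() { return Barrier_AtomicIncrement(&count_, -1) == 0; }
  volatile Atomic32 count_;
};
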
#else

// The kernel also provides a helper function to perform an atomic
// compare-and-swap operation at the hard-wired address 0xffff0fc0.
// On ARMv5, this is implemented by a special code path that the kernel
// detects and treats specially when thread pre-emption happens.
// On ARMv6 and higher, it uses LDREX/STREX instructions instead.
//
// Note that this always performs a full memory barrier; there is no
// need to add calls to MemoryBarrier() before or after it.
typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value,

Dmitry Vyukov
2013/06/03 18:01:22
I would use int as the return type, it's 0/1 rather than an Atomic32 value.

Dmitry Vyukov
2013/06/03 18:01:22
add a comment that this function lies with its return type.

digit1
2013/06/04 09:05:28
Done.

                                           Atomic32 new_value,
                                           volatile Atomic32* ptr);
LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) =
    (LinuxKernelCmpxchgFunc)0xffff0fc0;

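// Illustrative sketch (not part of the patch under review): the helper
// returns 0 when the swap succeeded and non-zero otherwise (including
// failures where *ptr still holds old_value), so callers retry in a
// loop, as the functions below do. A single attempt looks like this
// hypothetical wrapper:
inline bool TryCmpxchgOnceSketch(volatile Atomic32* ptr,
                                 Atomic32 old_value,
                                 Atomic32 new_value) {
  return pLinuxKernelCmpxchg(old_value, new_value,
                             const_cast<Atomic32*>(ptr)) == 0;
}
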
inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value = *ptr;
  do {
    if (!pLinuxKernelCmpxchg(
            old_value, new_value, const_cast<Atomic32*>(ptr))) {
      // A zero result means the swap succeeded, so the previous value
      // of *ptr was |old_value|.
      return old_value;
    }
    // The helper may fail even when *ptr still holds |old_value|, so
    // re-read the current value and retry only in that case.
    prev_value = *ptr;

Dmitry Vyukov
2013/06/03 18:01:22
it does not have dmb after the load (provided that…

digit1
2013/06/04 09:05:28
Fixed. Note that this part of the code comes from Google Gears.

  } while (prev_value == old_value);
  return prev_value;
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  do {
    old_value = *ptr;
  } while (
      pLinuxKernelCmpxchg(old_value, new_value, const_cast<Atomic32*>(ptr)));
  return old_value;
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  return Barrier_AtomicIncrement(ptr, increment);
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  for (;;) {
    // Atomically exchange the old value with an incremented one.
    Atomic32 old_value = *ptr;
    Atomic32 new_value = old_value + increment;
    if (pLinuxKernelCmpxchg(old_value, new_value,
                            const_cast<Atomic32*>(ptr)) == 0) {
      // The exchange took place as expected.
      return new_value;
    }
    // Otherwise, *ptr changed mid-loop and we need to retry.
  }
}

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

#endif  // __ARM_ARCH_7A__ || __ARM_ARCH_6__

inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {

JF
2013/06/03 18:29:27
This is only atomic when aligned, right? A comment would help.

digit1
2013/06/04 09:05:28
Yes, I've added a comment to the source code. I do…

  *ptr = value;
}

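// Illustrative sketch (not part of the patch under review): as the
// thread above notes, plain 32-bit loads and stores are only atomic on
// ARM when the address is 4-byte aligned. A hypothetical check callers
// could assert on before using the plain load/store functions:
inline bool IsAligned32Sketch(const volatile Atomic32* ptr) {
  // unsigned long is pointer-sized on 32-bit ARM.
  return (reinterpret_cast<unsigned long>(ptr) & 3) == 0;
}
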
inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
  MemoryBarrier();
}

inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
  MemoryBarrier();
  *ptr = value;
}

inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }

inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
  Atomic32 value = *ptr;
  MemoryBarrier();
  return value;
}

inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
  MemoryBarrier();
  return *ptr;
}

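// Illustrative usage sketch (not part of the patch under review):
// Release_Store and Acquire_Load implement the publish/consume
// pattern. The barrier in Release_Store keeps the payload write from
// being reordered after the flag store; the barrier in Acquire_Load
// keeps the payload read from being hoisted before the flag load.
// The two globals are hypothetical and assumed defined elsewhere.
extern Atomic32 g_payload_sketch;
extern volatile Atomic32 g_ready_sketch;

inline void PublishSketch(Atomic32 payload) {
  g_payload_sketch = payload;         // Plain write to the payload...
  Release_Store(&g_ready_sketch, 1);  // ...made visible before the flag.
}

inline bool TryConsumeSketch(Atomic32* out) {
  if (Acquire_Load(&g_ready_sketch) == 0)
    return false;                     // Not published yet.
  *out = g_payload_sketch;            // Safe to read after the flag load.
  return true;
}
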
#undef LINUX_ARM_KERNEL_MEMORY_BARRIER

}  // namespace base::subtle
}  // namespace base

#endif  // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_