Chromium Code Reviews

Side by Side Diff: base/atomicops_internals_arm_gcc.h

Issue 16335007: Improve the implementation of atomic operations on Linux/ARM (including Android/ARM). (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 7 years, 6 months ago
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // This file is an internal atomic implementation, use base/atomicops.h instead.
6 //
7 // LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.
8
9 #ifndef BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_
10 #define BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_
11
12 namespace base {
13 namespace subtle {
14
15 // Memory barriers on ARM are funky, but the kernel is here to help:
16 //
17 // * ARMv5 doesn't support SMP, so there is no memory barrier instruction
18 // at all on this architecture, nor when targeting its machine code.
19 //
20 // * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by
21 // writing any value to a dedicated coprocessor register.
22 //
23 // * On ARMv7, the "dmb" instruction is used to perform a full memory
24 // barrier (though writing to the co-processor will still work).
25 // However, on single-core devices (e.g. Nexus One or Nexus S),
26 // this instruction can take up to 200 ns, which is huge, even though
27 // it's completely unnecessary on these devices.
28 //
29 // * There is no easy way to determine at runtime if the device is
30 // single or multi-core. However, the kernel provides a useful helper
31 // function at a fixed memory address (0xffff0fa0), which will always
32 // perform a memory barrier in the most efficient way. I.e., on
33 // single-core devices, this is an empty function that returns immediately.
34 // On multi-core devices, it implements a full memory barrier.
35 //
36 // Note that this helper function doesn't modify any register or memory.
37 // See the comment in Barrier_AtomicIncrement() to see why it is
38 // important.
39 //
40 // * This source could be compiled to ARMv5 machine code that runs on a
41 // multi-core ARMv6 or ARMv7 device. In this case, memory barriers
42 // are needed for correct execution. Always call the kernel helper, even
43 // when targeting ARMv5TE.
44 //
45
46 #define LINUX_ARM_KERNEL_MEMORY_BARRIER 0xffff0fa0
47
48 inline void MemoryBarrier() {
49 ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)();
JF 2013/06/03 18:29:27 Shouldn't this have a compiler barrier to prevent the compiler from reordering memory accesses around the call?
digit1 2013/06/04 09:05:28 I believe that all function calls are implicit compiler barriers.
50 }
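
To make the comment block above concrete, here is a hedged sketch (not part of this patch) of what the compile-time alternatives would look like as inline assembly; the function name MemoryBarrierInline and the ARMv6 coprocessor encoding are assumptions based on the ARM architecture manuals, not code from this CL:

// Illustrative sketch only: per-architecture barriers selected at compile
// time, as described in the comment block above.
inline void MemoryBarrierInline() {
#if defined(__ARM_ARCH_7A__)
  // ARMv7: dedicated full data memory barrier instruction.
  __asm__ __volatile__("dmb sy" : : : "memory");
#elif defined(__ARM_ARCH_6__)
  // ARMv6: write any value to the CP15 "Data Memory Barrier" register.
  __asm__ __volatile__("mcr p15, 0, %0, c7, c10, 5" : : "r"(0) : "memory");
#else
  // ARMv5 and below: no barrier instruction; rely on the kernel helper.
  ((void (*)(void))LINUX_ARM_KERNEL_MEMORY_BARRIER)();
#endif
}
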
51
52 #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_6__)
53
54 // On ARMv6 and higher, it is possible to use ldrex/strex instructions
55 // directly to implement fast atomic operations.
56 inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
57 Atomic32 old_value,
58 Atomic32 new_value) {
59 Atomic32 prev_value;
60 int reloop;
61 do {
62 // The following is equivalent to:
63 //
64 // prev_value = LDREX(ptr)
65 // reloop = 0
66 // if (prev_value == old_value)
67 // reloop = STREX(ptr, new_value)
68 __asm__ __volatile__(" ldrex %0, [%3]\n"
69 " mov %1, #0\n"
70 " teq %0, %4\n"
71 #ifdef __thumb2__
JF 2013/06/03 18:29:27 I don't think this is needed, on ARM it blocks sho
digit1 2013/06/04 09:05:28 As of today, it's definitely needed with version o
72 " it eq\n"
73 #endif
74 " strexeq %1, %5, [%3]\n"
75 : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
76 : "r"(ptr), "r"(old_value), "r"(new_value)
77 : "cc");
78 } while (reloop != 0);
79 return prev_value;
80 }
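
As a usage illustration (not part of the patch), a typical retry loop built on top of NoBarrier_CompareAndSwap could look like the hypothetical helper below:

// Hypothetical example: increment *ptr only while it is positive, using
// the compare-and-swap primitive defined above.
inline bool TryIncrementIfPositive(volatile Atomic32* ptr) {
  for (;;) {
    Atomic32 old_value = *ptr;
    if (old_value <= 0)
      return false;
    // The store only takes effect if no other thread changed *ptr in the
    // meantime; the return value is the value the CAS actually observed.
    if (NoBarrier_CompareAndSwap(ptr, old_value, old_value + 1) == old_value)
      return true;
    // Lost the race: reload and retry.
  }
}
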
81
82 inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
83 Atomic32 old_value,
84 Atomic32 new_value) {
85 Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
86 MemoryBarrier();
87 return result;
88 }
89
90 inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
91 Atomic32 old_value,
92 Atomic32 new_value) {
93 MemoryBarrier();
94 return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
95 }
96
97 inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
98 Atomic32 increment) {
99 Atomic32 value;
100 int reloop;
101 do {
102 // Equivalent to:
103 //
104 // value = LDREX(ptr)
105 // value += increment
106 // reloop = STREX(ptr, value)
107 //
108 __asm__ __volatile__(" ldrex %0, [%3]\n"
109 " add %0, %0, %4\n"
110 " strex %1, %0, [%3]\n"
111 : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
112 : "r"(ptr), "r"(increment)
113 : "cc");
114 } while (reloop);
115 return value;
116 }
117
118 inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
119 Atomic32 increment) {
120 Atomic32 value;
121 uint32_t barrier = LINUX_ARM_KERNEL_MEMORY_BARRIER;
122 int reloop;
123 // The following is very subtle. This function is supposed
124 // to perform a "full memory barrier", i.e. it's supposed to
125 // enforce both acquire and release semantics at the same time.
126 //
127 // A naive way to implement this is to perform two MemoryBarrier() calls,
128 // one before, and one after the actual atomic operation, as in:
129 //
130 // MemoryBarrier();
131 // NoBarrier_AtomicIncrement();
132 // MemoryBarrier();
133 //
134 // However, memory barriers are costly, especially on multi-core
135 // devices, so the code below uses a single one and looks like:
136 //
137 // LDREX ...
138 // MemoryBarrier()
139 // STREX ..
140 //
141 // That's because this is equivalent to performing a "load-acquire"
142 // (LDREX + MemoryBarrier()) followed by a "store-release"
143 // (MemoryBarrier() + STREX).
144 //
145 // Note that when looping is needed due to failed STREX operations,
146 // MemoryBarrier() will be called in each iteration. However, it
147 // takes a minimum of 3 iterations for this code to perform worse than
148 // the naive one. If this happens, the cache line is already
149 // highly contended, and performance will probably be poor
150 // anyway.
151 //
152 // IMPORTANT: This works because the MemoryBarrier() kernel helper
153 // never modifies any register or memory!
154 do {
155 // Equivalent to:
156 //
157 // value = LDREX(ptr)
158 // value += increment
159 // barrier();
160 // reloop = STREX(ptr, value)
161 //
162 __asm__ __volatile__(" ldrex %0, [%3]\n"
163 " add %0, %0, %4\n"
164 " blx %5\n"
Dmitry Vyukov 2013/06/03 18:01:22 yeah, this is very subtle the following does not m
digit1 2013/06/04 09:05:28 Good point, I've restored the two-memory-barriers version.
165 " strex %1, %0, [%3]\n"
166 : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
167 : "r"(ptr), "r"(increment), "r"(barrier)
168 : "cc");
Dmitry Vyukov 2013/06/03 18:01:22 add "memory"
digit1 2013/06/04 09:05:28 Done, thanks. Also, I removed the superfluous "cc" clobber.
169 } while (reloop);
170 return value;
171 }
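
For reference (not part of this patch set), the "naive" two-barrier formulation mentioned in the comment above, which the reply above says was restored in a later patch set, is just a thin wrapper; the name used here is hypothetical:

// Sketch of the simpler, two-barrier formulation discussed above: a full
// memory barrier before and after a plain atomic increment.
inline Atomic32 Barrier_AtomicIncrement_Sketch(volatile Atomic32* ptr,
                                               Atomic32 increment) {
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}
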
172
173 inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
174 Atomic32 new_value) {
175 Atomic32 old_value;
176 int reloop;
177 do {
178 // old_value = LDREX(ptr)
179 // reloop = STREX(ptr, new_value)
180 __asm__ __volatile__(" ldrex %0, [%3]\n"
181 " strex %1, %4, [%3]\n"
182 : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
183 : "r"(ptr), "r"(new_value)
184 : "cc");
185 } while (reloop != 0);
186 return old_value;
187 }
188
189 #else
190
191 // The kernel also provides a helper function to perform an atomic
192 // compare-and-swap operation at the hard-wired address 0xffff0fc0.
193 // On ARMv5, this is implemented by a special code path that the kernel
194 // detects and handles correctly when thread pre-emption happens.
195 // On ARMv6 and higher, it uses LDREX/STREX instructions instead.
196 //
197 // Note that this always performs a full memory barrier, so there is no
198 // need to add calls to MemoryBarrier() before or after it.
199 typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value,
Dmitry Vyukov 2013/06/03 18:01:22 I would use int as the return type, it's 0/1 rather than an Atomic32 value.
Dmitry Vyukov 2013/06/03 18:01:22 add a comment that this function lies about its return type.
digit1 2013/06/04 09:05:28 Done.
200 Atomic32 new_value,
201 volatile Atomic32* ptr);
202 LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg __attribute__((weak)) =
203 (LinuxKernelCmpxchgFunc)0xffff0fc0;
204
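To illustrate the review suggestion above about the return type (a sketch, not part of this patch; the wrapper name is hypothetical), the kernel helper's 0/1 success convention could be surfaced explicitly:

// Hypothetical wrapper: the kernel cmpxchg helper returns 0 when the swap
// happened and non-zero when it failed, so expose that as a bool rather
// than as an Atomic32.
inline bool LinuxKernelCmpxchgSucceeded(Atomic32 old_value,
                                        Atomic32 new_value,
                                        volatile Atomic32* ptr) {
  return pLinuxKernelCmpxchg(old_value, new_value, ptr) == 0;
}
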
205 inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
206 Atomic32 old_value,
207 Atomic32 new_value) {
208 Atomic32 prev_value = *ptr;
209 do {
210 if (!pLinuxKernelCmpxchg(
211 old_value, new_value, const_cast<Atomic32*>(ptr))) {
212 return old_value;
213 }
214 prev_value = *ptr;
Dmitry Vyukov 2013/06/03 18:01:22 it does not have dmb after the load (provided that
digit1 2013/06/04 09:05:28 Fixed. Note that this part of the code comes from the original Google Gears implementation.
215 } while (prev_value == old_value);
216 return prev_value;
217 }
218
219 inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
220 Atomic32 new_value) {
221 Atomic32 old_value;
222 do {
223 old_value = *ptr;
224 } while (
225 pLinuxKernelCmpxchg(old_value, new_value, const_cast<Atomic32*>(ptr)));
226 return old_value;
227 }
228
229 inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
230 Atomic32 increment) {
231 return Barrier_AtomicIncrement(ptr, increment);
232 }
233
234 inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
235 Atomic32 increment) {
236 for (;;) {
237 // Atomically exchange the old value with an incremented one.
238 Atomic32 old_value = *ptr;
239 Atomic32 new_value = old_value + increment;
240 if (pLinuxKernelCmpxchg(old_value, new_value, const_cast<Atomic32*>(ptr)) ==
241 0) {
242 // The exchange took place as expected.
243 return new_value;
244 }
245 // Otherwise, *ptr changed mid-loop and we need to retry.
246 }
247
248 }
249
250 inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
251 Atomic32 old_value,
252 Atomic32 new_value) {
253 return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
254 }
255
256 inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
257 Atomic32 old_value,
258 Atomic32 new_value) {
259 return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
260 }
261
262 #endif // __ARM_ARCH_6__ || __ARM_ARCH_7A__
263
264 inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
JF 2013/06/03 18:29:27 This is only atomic when aligned, right? A comment would be nice.
digit1 2013/06/04 09:05:28 Yes, I've added a comment to the source code. I do
265 *ptr = value;
266 }
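
As the review comment above notes, these plain loads and stores are only atomic when the address is naturally aligned; a hedged, debug-only check (not part of the patch, name hypothetical) might look like:

#include <assert.h>
#include <stdint.h>

// Hypothetical debug helper: plain 32-bit loads and stores are only
// guaranteed to be atomic when |ptr| is 4-byte aligned.
inline void DcheckAtomic32Alignment(volatile const Atomic32* ptr) {
  assert((reinterpret_cast<uintptr_t>(ptr) & 3u) == 0);
}
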
267
268 inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
269 *ptr = value;
270 MemoryBarrier();
271 }
272
273 inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
274 MemoryBarrier();
275 *ptr = value;
276 }
277
278 inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }
279
280 inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
281 Atomic32 value = *ptr;
282 MemoryBarrier();
283 return value;
284 }
285
286 inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
287 MemoryBarrier();
288 return *ptr;
289 }
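
For context (not part of the patch), Release_Store and Acquire_Load are typically paired to publish data between threads; a hypothetical sketch, with g_data and g_ready assumed to be defined elsewhere:

// Hypothetical producer/consumer handoff using the primitives above.
extern int g_data;        // assumed payload, written by the producer
extern Atomic32 g_ready;  // assumed flag, initially 0

inline void PublishData(int value) {
  g_data = value;              // 1. write the payload
  Release_Store(&g_ready, 1);  // 2. barrier, then set the flag
}

inline bool TryConsumeData(int* out) {
  if (Acquire_Load(&g_ready) == 0)  // load the flag, then barrier
    return false;
  *out = g_data;  // safe: the payload write is visible once the flag is seen
  return true;
}
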
290
291 #undef LINUX_ARM_KERNEL_MEMORY_BARRIER
292
293 } // namespace base::subtle
294 } // namespace base
295
296 #endif // BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_