Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(286)

Side by Side Diff: src/arm/codegen-arm.cc

Issue 17858002: ARM: Implement memcpy using NEON. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
Patch Set: Remove "unaligned accesses" from C++ code Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
105 OS::ProtectCode(buffer, actual_size); 105 OS::ProtectCode(buffer, actual_size);
106 106
107 #if !defined(USE_SIMULATOR) 107 #if !defined(USE_SIMULATOR)
108 return FUNCTION_CAST<UnaryMathFunction>(buffer); 108 return FUNCTION_CAST<UnaryMathFunction>(buffer);
109 #else 109 #else
110 fast_exp_arm_machine_code = buffer; 110 fast_exp_arm_machine_code = buffer;
111 return &fast_exp_simulator; 111 return &fast_exp_simulator;
112 #endif 112 #endif
113 } 113 }
114 114
115 #if defined(V8_HOST_ARCH_ARM)
116 OS::MemCopyUint8Function CreateMemCopyUint8Function(
117 OS::MemCopyUint8Function stub) {
118 #if defined(USE_SIMULATOR)
119 return stub;
120 #else
121 if (Serializer::enabled() || !CpuFeatures::IsSupported(UNALIGNED_ACCESSES)) {
122 return stub;
123 }
124 size_t actual_size;
125 byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB, &actual_size, true));
126 if (buffer == NULL) return stub;
127
128 MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));
129
130 Register dest = r0;
131 Register src = r1;
132 Register chars = r2;
133 Register temp1 = r3;
134 Label less_4;
135
136 if (CpuFeatures::IsSupported(NEON)) {
137 Label loop, less_256, less_128, less_64, less_32, _16_or_less, _8_or_less;
ulan 2013/07/09 15:16:32 The main loop copies 64 byte at a time. Why don't
vincent.belliard.fr 2013/07/10 15:30:38 With ARM, pld starts a load but doesn't wait for t
138 Label size_less_than_8;
139 __ pld(MemOperand(src, 0));
140
141 __ cmp(chars, Operand(8));
142 __ b(lt, &size_less_than_8);
143 __ cmp(chars, Operand(32));
144 __ b(lt, &less_32);
145 if (CpuFeatures::cache_line_size() == 32) {
146 __ pld(MemOperand(src, 32));
147 }
148 __ cmp(chars, Operand(64));
149 __ b(lt, &less_64);
150 __ pld(MemOperand(src, 64));
151 if (CpuFeatures::cache_line_size() == 32) {
152 __ pld(MemOperand(src, 96));
153 }
154 __ cmp(chars, Operand(128));
155 __ b(lt, &less_128);
156 __ pld(MemOperand(src, 128));
157 if (CpuFeatures::cache_line_size() == 32) {
158 __ pld(MemOperand(src, 160));
159 }
160 __ pld(MemOperand(src, 192));
161 if (CpuFeatures::cache_line_size() == 32) {
162 __ pld(MemOperand(src, 224));
163 }
164 __ cmp(chars, Operand(256));
165 __ b(lt, &less_256);
166 __ sub(chars, chars, Operand(256));
167
168 __ bind(&loop);
169 __ pld(MemOperand(src, 256));
170 __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex));
171 __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex));
172 if (CpuFeatures::cache_line_size() == 32) {
173 __ pld(MemOperand(src, 256));
ulan 2013/07/09 15:16:32 Shouldn't this be __ pld(MemOperand(src, 256 - 32)
vincent.belliard.fr 2013/07/10 15:30:38 Done.
174 }
175 __ sub(chars, chars, Operand(64), SetCC);
176 __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex));
177 __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex));
178 __ b(ge, &loop);
179 __ add(chars, chars, Operand(256));
180
181 __ bind(&less_256);
182 __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex));
183 __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex));
184 __ sub(chars, chars, Operand(128));
185 __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex));
186 __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex));
187 __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex));
188 __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex));
189 __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex));
190 __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex));
191 __ cmp(chars, Operand(64));
192 __ b(lt, &less_64);
193
194 __ bind(&less_128);
195 __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex));
196 __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex));
197 __ sub(chars, chars, Operand(64));
198 __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex));
199 __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex));
200
201 __ bind(&less_64);
202 __ cmp(chars, Operand(32));
203 __ b(lt, &less_32);
204 __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex));
205 __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex));
206 __ sub(chars, chars, Operand(32));
207
208 __ bind(&less_32);
209 __ cmp(chars, Operand(16));
210 __ b(le, &_16_or_less);
211 __ vld1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(src, PostIndex));
212 __ vst1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(dest, PostIndex));
213 __ sub(chars, chars, Operand(16));
214
215 __ bind(&_16_or_less);
216 __ cmp(chars, Operand(8));
217 __ b(le, &_8_or_less);
218 __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src, PostIndex));
219 __ vst1(Neon8, NeonListOperand(d0), NeonMemOperand(dest, PostIndex));
220 __ sub(chars, chars, Operand(8));
221
222 // Do a last copy which may overlap with the previous copy (up to 8 bytes).
223 __ bind(&_8_or_less);
224 __ rsb(chars, chars, Operand(8));
225 __ sub(src, src, Operand(chars));
226 __ sub(dest, dest, Operand(chars));
227 __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src));
228 __ vst1(Neon8, NeonListOperand(d0), NeonMemOperand(dest));
229
230 __ Ret();
231
232 __ bind(&size_less_than_8);
233
234 __ bic(temp1, chars, Operand(0x3), SetCC);
235 __ b(&less_4, eq);
236 __ ldr(temp1, MemOperand(src, 4, PostIndex));
237 __ str(temp1, MemOperand(dest, 4, PostIndex));
238 } else {
239 Register temp2 = ip;
240 Label loop;
241
242 __ bic(temp2, chars, Operand(0x3), SetCC);
243 __ b(&less_4, eq);
244 __ add(temp2, dest, temp2);
245
246 __ bind(&loop);
247 __ ldr(temp1, MemOperand(src, 4, PostIndex));
248 __ str(temp1, MemOperand(dest, 4, PostIndex));
249 __ cmp(dest, temp2);
250 __ b(&loop, ne);
251 }
252
253 __ bind(&less_4);
254 __ mov(chars, Operand(chars, LSL, 31), SetCC);
255 // bit0 => Z (ne), bit1 => C (cs)
256 __ ldrh(temp1, MemOperand(src, 2, PostIndex), cs);
257 __ strh(temp1, MemOperand(dest, 2, PostIndex), cs);
258 __ ldrb(temp1, MemOperand(src), ne);
259 __ strb(temp1, MemOperand(dest), ne);
260 __ Ret();
261
262 CodeDesc desc;
263 masm.GetCode(&desc);
264 ASSERT(!RelocInfo::RequiresRelocation(desc));
265
266 CPU::FlushICache(buffer, actual_size);
267 OS::ProtectCode(buffer, actual_size);
268 return FUNCTION_CAST<OS::MemCopyUint8Function>(buffer);
269 #endif
270 }
271
272 // Convert 8 to 16. The number of character to copy must be at least 8.
273 OS::MemCopyUint16Uint8Function CreateMemCopyUint16Uint8Function(
274 OS::MemCopyUint16Uint8Function stub) {
275 #if defined(USE_SIMULATOR)
276 return stub;
277 #else
278 if (Serializer::enabled() || !CpuFeatures::IsSupported(UNALIGNED_ACCESSES)) {
279 return stub;
280 }
281 size_t actual_size;
282 byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB, &actual_size, true));
283 if (buffer == NULL) return stub;
284
285 MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));
286
287 Register dest = r0;
288 Register src = r1;
289 Register chars = r2;
290 if (CpuFeatures::IsSupported(NEON)) {
291 Register temp = r3;
292 Label loop;
293
294 __ bic(temp, chars, Operand(0x7));
295 __ sub(chars, chars, Operand(temp));
296 __ add(temp, dest, Operand(temp, LSL, 1));
297
298 __ bind(&loop);
299 __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src, PostIndex));
300 __ vmovl(NeonU8, q0, d0);
301 __ vst1(Neon16, NeonListOperand(d0, 2), NeonMemOperand(dest, PostIndex));
302 __ cmp(dest, temp);
303 __ b(&loop, ne);
304
305 // Do a last copy which will overlap with the previous copy (1 to 8 bytes).
306 __ rsb(chars, chars, Operand(8));
307 __ sub(src, src, Operand(chars));
308 __ sub(dest, dest, Operand(chars, LSL, 1));
309 __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src));
310 __ vmovl(NeonU8, q0, d0);
311 __ vst1(Neon16, NeonListOperand(d0, 2), NeonMemOperand(dest));
312 __ Ret();
313 } else {
314 Register temp1 = r3;
315 Register temp2 = ip;
316 Register temp3 = lr;
317 Register temp4 = r4;
318 Label loop;
319 Label not_two;
320
321 __ Push(lr, r4);
322 __ bic(temp2, chars, Operand(0x3));
323 __ add(temp2, dest, Operand(temp2, LSL, 1));
324
325 __ bind(&loop);
326 __ ldr(temp1, MemOperand(src, 4, PostIndex));
327 __ uxtb16(temp3, Operand(temp1, ROR, 0));
328 __ uxtb16(temp4, Operand(temp1, ROR, 8));
329 __ pkhbt(temp1, temp3, Operand(temp4, LSL, 16));
330 __ str(temp1, MemOperand(dest));
331 __ pkhtb(temp1, temp4, Operand(temp3, ASR, 16));
332 __ str(temp1, MemOperand(dest, 4));
333 __ add(dest, dest, Operand(8));
334 __ cmp(dest, temp2);
335 __ b(&loop, ne);
336
337 __ mov(chars, Operand(chars, LSL, 31), SetCC); // bit0 => ne, bit1 => cs
338 __ b(&not_two, cc);
339 __ ldrh(temp1, MemOperand(src, 2, PostIndex));
340 __ uxtb(temp3, Operand(temp1, ROR, 8));
341 __ mov(temp3, Operand(temp3, LSL, 16));
342 __ uxtab(temp3, temp3, Operand(temp1, ROR, 0));
343 __ str(temp3, MemOperand(dest, 4, PostIndex));
344 __ bind(&not_two);
345 __ ldrb(temp1, MemOperand(src), ne);
346 __ strh(temp1, MemOperand(dest), ne);
347 __ Pop(pc, r4);
348 }
349
350 CodeDesc desc;
351 masm.GetCode(&desc);
352
353 CPU::FlushICache(buffer, actual_size);
354 OS::ProtectCode(buffer, actual_size);
355
356 return FUNCTION_CAST<OS::MemCopyUint16Uint8Function>(buffer);
357 #endif
358 }
359 #endif
115 360
116 #undef __ 361 #undef __
117 362
118 363
// Returns the address of sqrt as the UnaryMathFunction; no code is generated
// at runtime here (presumably the C library sqrt -- confirm which overload).
119 UnaryMathFunction CreateSqrtFunction() { 364 UnaryMathFunction CreateSqrtFunction() {
120 return &sqrt; 365 return &sqrt;
121 } 366 }
122 367
123 // ------------------------------------------------------------------------- 368 // -------------------------------------------------------------------------
124 // Platform-specific RuntimeCallHelper functions. 369 // Platform-specific RuntimeCallHelper functions.
(...skipping 512 matching lines...) Expand 10 before | Expand all | Expand 10 after
637 patcher.masm()->add(r0, pc, Operand(-8)); 882 patcher.masm()->add(r0, pc, Operand(-8));
638 patcher.masm()->ldr(pc, MemOperand(pc, -4)); 883 patcher.masm()->ldr(pc, MemOperand(pc, -4));
639 patcher.masm()->dd(reinterpret_cast<uint32_t>(stub->instruction_start())); 884 patcher.masm()->dd(reinterpret_cast<uint32_t>(stub->instruction_start()));
640 } 885 }
641 } 886 }
642 887
643 888
644 } } // namespace v8::internal 889 } } // namespace v8::internal
645 890
646 #endif // V8_TARGET_ARCH_ARM 891 #endif // V8_TARGET_ARCH_ARM
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698