Index: src/arm/codegen-arm.cc |
diff --git a/src/arm/codegen-arm.cc b/src/arm/codegen-arm.cc |
index 60de5fc4f7ff2fb18b1d74f710f8225041d15522..fe714a77cd19af2f0d4f3acc5f04a11604cc0b3d 100644 |
--- a/src/arm/codegen-arm.cc |
+++ b/src/arm/codegen-arm.cc |
@@ -112,6 +112,251 @@ UnaryMathFunction CreateExpFunction() { |
#endif |
} |
+#if defined(V8_HOST_ARCH_ARM) |
+// Generates a native ARM byte-copy routine into a freshly allocated buffer |
+// and returns it as an OS::MemCopyUint8Function. |
+// |
+// The caller-supplied stub is returned unchanged when native code cannot or |
+// should not be generated: in simulator builds, while the serializer is |
+// enabled, on CPUs without unaligned access support, or when the buffer |
+// allocation fails. |
+// |
+// The generated code takes its operands in r0 (dest), r1 (src) and |
+// r2 (chars, the number of bytes to copy). It uses r3 as scratch, plus ip |
+// on the non-NEON path and d0-d7 on the NEON path. |
+OS::MemCopyUint8Function CreateMemCopyUint8Function( |
+    OS::MemCopyUint8Function stub) { |
+#if defined(USE_SIMULATOR) |
+  return stub; |
+#else |
+  if (Serializer::enabled() || !CpuFeatures::IsSupported(UNALIGNED_ACCESSES)) { |
+    return stub; |
+  } |
+  size_t actual_size; |
+  byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB, &actual_size, true)); |
+  if (buffer == NULL) return stub; |
+ |
+  MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size)); |
+ |
+  Register dest = r0; |
+  Register src = r1; |
+  Register chars = r2; |
+  Register temp1 = r3; |
+  // Shared tail for both code paths: copies the final 0..3 bytes. |
+  Label less_4; |
+ |
+  if (CpuFeatures::IsSupported(NEON)) { |
+    Label loop, less_256, less_128, less_64, less_32, _16_or_less, _8_or_less; |
ulan
2013/07/09 15:16:32
The main loop copies 64 byte at a time. Why don't
vincent.belliard.fr
2013/07/10 15:30:38
With ARM, pld starts a load but doesn't wait for t
|
+    Label size_less_than_8; |
+    __ pld(MemOperand(src, 0)); |
+ |
+    // Dispatch on the size. Along the way, prefetch the source bytes the |
+    // chosen path is going to read; when cache lines are 32 bytes wide an |
+    // extra pld is issued for the second half of every 64-byte chunk. |
+    __ cmp(chars, Operand(8)); |
+    __ b(lt, &size_less_than_8); |
+    __ cmp(chars, Operand(32)); |
+    __ b(lt, &less_32); |
+    if (CpuFeatures::cache_line_size() == 32) { |
+      __ pld(MemOperand(src, 32)); |
+    } |
+    __ cmp(chars, Operand(64)); |
+    __ b(lt, &less_64); |
+    __ pld(MemOperand(src, 64)); |
+    if (CpuFeatures::cache_line_size() == 32) { |
+      __ pld(MemOperand(src, 96)); |
+    } |
+    __ cmp(chars, Operand(128)); |
+    __ b(lt, &less_128); |
+    __ pld(MemOperand(src, 128)); |
+    if (CpuFeatures::cache_line_size() == 32) { |
+      __ pld(MemOperand(src, 160)); |
+    } |
+    __ pld(MemOperand(src, 192)); |
+    if (CpuFeatures::cache_line_size() == 32) { |
+      __ pld(MemOperand(src, 224)); |
+    } |
+    __ cmp(chars, Operand(256)); |
+    __ b(lt, &less_256); |
+    // Bias the counter by 256 so that the loop keeps running only while at |
+    // least 256 bytes were left at the start of the iteration. |
+    __ sub(chars, chars, Operand(256)); |
+ |
+    // Main loop: 64 bytes per iteration through d0-d7. |
+    __ bind(&loop); |
+    __ pld(MemOperand(src, 256)); |
+    __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex)); |
+    __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex)); |
+    if (CpuFeatures::cache_line_size() == 32) { |
+      // src has already been advanced by 64 by the two post-indexed loads, |
+      // so 256 - 32 addresses the 32-byte line that follows the one |
+      // prefetched at the top of this iteration. Using 256 here would hit |
+      // the line the next iteration prefetches anyway and leave a gap. |
+      __ pld(MemOperand(src, 256 - 32)); |
ulan
2013/07/09 15:16:32
Shouldn't this be __ pld(MemOperand(src, 256 - 32)
vincent.belliard.fr
2013/07/10 15:30:38
Done.
|
+    } |
+    __ sub(chars, chars, Operand(64), SetCC); |
+    __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex)); |
+    __ b(ge, &loop); |
+    // Undo the bias; 192..255 bytes remain at this point. |
+    __ add(chars, chars, Operand(256)); |
+ |
+    // 128..255 bytes left: copy 128 unconditionally. |
+    __ bind(&less_256); |
+    __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex)); |
+    __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex)); |
+    __ sub(chars, chars, Operand(128)); |
+    __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex)); |
+    __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex)); |
+    __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex)); |
+    __ cmp(chars, Operand(64)); |
+    __ b(lt, &less_64); |
+ |
+    // 64..127 bytes left: copy 64 unconditionally. |
+    __ bind(&less_128); |
+    __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex)); |
+    __ vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(src, PostIndex)); |
+    __ sub(chars, chars, Operand(64)); |
+    __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(dest, PostIndex)); |
+ |
+    // Fewer than 64 bytes left: copy 32 if at least that many remain. |
+    __ bind(&less_64); |
+    __ cmp(chars, Operand(32)); |
+    __ b(lt, &less_32); |
+    __ vld1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(src, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d0, 4), NeonMemOperand(dest, PostIndex)); |
+    __ sub(chars, chars, Operand(32)); |
+ |
+    // Fewer than 32 bytes left: copy 16 if more than 16 remain. |
+    __ bind(&less_32); |
+    __ cmp(chars, Operand(16)); |
+    __ b(le, &_16_or_less); |
+    __ vld1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(src, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d0, 2), NeonMemOperand(dest, PostIndex)); |
+    __ sub(chars, chars, Operand(16)); |
+ |
+    // At most 16 bytes left: copy 8 if more than 8 remain. |
+    __ bind(&_16_or_less); |
+    __ cmp(chars, Operand(8)); |
+    __ b(le, &_8_or_less); |
+    __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src, PostIndex)); |
+    __ vst1(Neon8, NeonListOperand(d0), NeonMemOperand(dest, PostIndex)); |
+    __ sub(chars, chars, Operand(8)); |
+ |
+    // Do a last copy which may overlap with the previous copy (up to 8 bytes). |
+    // Both pointers are moved back by 8 - chars so that this 8-byte copy |
+    // ends exactly at the end of the range. Every path reaching this label |
+    // started with at least 8 bytes, so moving back stays inside the range. |
+    __ bind(&_8_or_less); |
+    __ rsb(chars, chars, Operand(8)); |
+    __ sub(src, src, Operand(chars)); |
+    __ sub(dest, dest, Operand(chars)); |
+    __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src)); |
+    __ vst1(Neon8, NeonListOperand(d0), NeonMemOperand(dest)); |
+ |
+    __ Ret(); |
+ |
+    // Fewer than 8 bytes in total: copy one word if chars >= 4, then fall |
+    // through to the shared less_4 tail below. |
+    __ bind(&size_less_than_8); |
+ |
+    __ bic(temp1, chars, Operand(0x3), SetCC); |
+    __ b(&less_4, eq); |
+    __ ldr(temp1, MemOperand(src, 4, PostIndex)); |
+    __ str(temp1, MemOperand(dest, 4, PostIndex)); |
+  } else { |
+    Register temp2 = ip; |
+    Label loop; |
+ |
+    // temp2 = chars rounded down to a multiple of 4; nothing to do in the |
+    // word loop if that is zero. |
+    __ bic(temp2, chars, Operand(0x3), SetCC); |
+    __ b(&less_4, eq); |
+    // temp2 = destination address at which the word loop must stop. |
+    __ add(temp2, dest, temp2); |
+ |
+    // Copy one 32-bit word per iteration. |
+    __ bind(&loop); |
+    __ ldr(temp1, MemOperand(src, 4, PostIndex)); |
+    __ str(temp1, MemOperand(dest, 4, PostIndex)); |
+    __ cmp(dest, temp2); |
+    __ b(&loop, ne); |
+  } |
+ |
+  // Copy the remaining chars & 3 bytes: a halfword if bit 1 of chars is |
+  // set, then a byte if bit 0 is set. |
+  __ bind(&less_4); |
+  __ mov(chars, Operand(chars, LSL, 31), SetCC); |
+  // bit0 => Z (ne), bit1 => C (cs) |
+  __ ldrh(temp1, MemOperand(src, 2, PostIndex), cs); |
+  __ strh(temp1, MemOperand(dest, 2, PostIndex), cs); |
+  __ ldrb(temp1, MemOperand(src), ne); |
+  __ strb(temp1, MemOperand(dest), ne); |
+  __ Ret(); |
+ |
+  CodeDesc desc; |
+  masm.GetCode(&desc); |
+  // The buffer is executed in place, so the code must not need relocation. |
+  ASSERT(!RelocInfo::RequiresRelocation(desc)); |
+ |
+  CPU::FlushICache(buffer, actual_size); |
+  OS::ProtectCode(buffer, actual_size); |
+  return FUNCTION_CAST<OS::MemCopyUint8Function>(buffer); |
+#endif |
+} |
+ |
+// Convert 8 to 16. The number of characters to copy must be at least 8. |
+// |
+// Generates a native ARM routine that widens 8-bit characters to 16-bit |
+// characters (zero-extending each one) and returns it as an |
+// OS::MemCopyUint16Uint8Function. The caller-supplied stub is returned |
+// unchanged in simulator builds, while the serializer is enabled, on CPUs |
+// without unaligned access support, or when the buffer allocation fails. |
+// |
+// The generated code takes its operands in r0 (dest), r1 (src) and |
+// r2 (chars, the number of characters). Both copy loops test their exit |
+// condition only after the first iteration, and the NEON tail always |
+// re-copies a full group of 8, hence the minimum length requirement. |
+OS::MemCopyUint16Uint8Function CreateMemCopyUint16Uint8Function( |
+    OS::MemCopyUint16Uint8Function stub) { |
+#if defined(USE_SIMULATOR) |
+  return stub; |
+#else |
+  if (Serializer::enabled() || !CpuFeatures::IsSupported(UNALIGNED_ACCESSES)) { |
+    return stub; |
+  } |
+  size_t actual_size; |
+  byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB, &actual_size, true)); |
+  if (buffer == NULL) return stub; |
+ |
+  MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size)); |
+ |
+  Register dest = r0; |
+  Register src = r1; |
+  Register chars = r2; |
+  if (CpuFeatures::IsSupported(NEON)) { |
+    Register temp = r3; |
+    Label loop; |
+ |
+    // temp = chars rounded down to a multiple of 8; chars = chars & 7. |
+    __ bic(temp, chars, Operand(0x7)); |
+    __ sub(chars, chars, Operand(temp)); |
+    // temp = destination address at which the loop must stop (each source |
+    // character produces two destination bytes, hence the shift). |
+    __ add(temp, dest, Operand(temp, LSL, 1)); |
+ |
+    // Widen 8 characters per iteration: load 8 bytes into d0, zero-extend |
+    // them to eight 16-bit lanes in q0, store the resulting 16 bytes. |
+    __ bind(&loop); |
+    __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src, PostIndex)); |
+    __ vmovl(NeonU8, q0, d0); |
+    __ vst1(Neon16, NeonListOperand(d0, 2), NeonMemOperand(dest, PostIndex)); |
+    __ cmp(dest, temp); |
+    __ b(&loop, ne); |
+ |
+    // Do a last copy which will overlap with the previous copy (1 to 8 bytes). |
+    // src moves back by 8 - chars characters and dest by twice that, so the |
+    // final group of 8 ends exactly at the end of the range. |
+    __ rsb(chars, chars, Operand(8)); |
+    __ sub(src, src, Operand(chars)); |
+    __ sub(dest, dest, Operand(chars, LSL, 1)); |
+    __ vld1(Neon8, NeonListOperand(d0), NeonMemOperand(src)); |
+    __ vmovl(NeonU8, q0, d0); |
+    __ vst1(Neon16, NeonListOperand(d0, 2), NeonMemOperand(dest)); |
+    __ Ret(); |
+  } else { |
+    Register temp1 = r3; |
+    Register temp2 = ip; |
+    Register temp3 = lr; |
+    Register temp4 = r4; |
+    Label loop; |
+    Label not_two; |
+ |
+    // lr and r4 are used as scratch registers below, so save them first. |
+    __ Push(lr, r4); |
+    // temp2 = destination address at which the 4-character loop must stop. |
+    __ bic(temp2, chars, Operand(0x3)); |
+    __ add(temp2, dest, Operand(temp2, LSL, 1)); |
+ |
+    // Widen 4 characters per iteration. |
+    __ bind(&loop); |
+    __ ldr(temp1, MemOperand(src, 4, PostIndex)); |
+    // temp3 = bytes 0 and 2, temp4 = bytes 1 and 3, each zero-extended |
+    // into a 16-bit half of the register. |
+    __ uxtb16(temp3, Operand(temp1, ROR, 0)); |
+    __ uxtb16(temp4, Operand(temp1, ROR, 8)); |
+    // Repack into source order: first word = characters 0 and 1, second |
+    // word = characters 2 and 3. |
+    __ pkhbt(temp1, temp3, Operand(temp4, LSL, 16)); |
+    __ str(temp1, MemOperand(dest)); |
+    __ pkhtb(temp1, temp4, Operand(temp3, ASR, 16)); |
+    __ str(temp1, MemOperand(dest, 4)); |
+    __ add(dest, dest, Operand(8)); |
+    __ cmp(dest, temp2); |
+    __ b(&loop, ne); |
+ |
+    // Handle the remaining chars & 3 characters. None of the instructions |
+    // between this flag-setting mov and the conditional ldrb/strh below |
+    // update the flags. |
+    __ mov(chars, Operand(chars, LSL, 31), SetCC);  // bit0 => ne, bit1 => cs |
+    __ b(&not_two, cc); |
+    // Two characters: load both bytes and spread them into the low and |
+    // high halfwords of one word. |
+    __ ldrh(temp1, MemOperand(src, 2, PostIndex)); |
+    __ uxtb(temp3, Operand(temp1, ROR, 8)); |
+    __ mov(temp3, Operand(temp3, LSL, 16)); |
+    __ uxtab(temp3, temp3, Operand(temp1, ROR, 0)); |
+    __ str(temp3, MemOperand(dest, 4, PostIndex)); |
+    __ bind(&not_two); |
+    // One last character if bit 0 of chars was set. |
+    __ ldrb(temp1, MemOperand(src), ne); |
+    __ strh(temp1, MemOperand(dest), ne); |
+    // Restore r4 and return by popping the saved lr straight into pc. |
+    __ Pop(pc, r4); |
+  } |
+ |
+  CodeDesc desc; |
+  masm.GetCode(&desc); |
+  // The buffer is executed in place, so the code must not need relocation. |
+  ASSERT(!RelocInfo::RequiresRelocation(desc)); |
+ |
+  CPU::FlushICache(buffer, actual_size); |
+  OS::ProtectCode(buffer, actual_size); |
+ |
+  return FUNCTION_CAST<OS::MemCopyUint16Uint8Function>(buffer); |
+#endif |
+} |
+#endif |
#undef __ |