Index: simd/jsimd_arm_neon.S
===================================================================
--- simd/jsimd_arm_neon.S (revision 134206)
+++ simd/jsimd_arm_neon.S (working copy)
@@ -2157,3 +2157,241 @@
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ *                                 JDIMENSION downsampled_width,
+ *                                 JSAMPARRAY input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ * this code; eliminating it could potentially yield a performance
+ * improvement of up to a few tens of percent on Cortex-A8/Cortex-A9.
+ */
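+
+/*
+ * A scalar C sketch of the "fancy" (triangle filter) h2v1 upsampling rule
+ * that the NEON code below vectorizes.  This is only an illustrative sketch
+ * with hypothetical identifiers (it assumes width >= 2), not library source:
+ *
+ *   // Upsample one row of 'width' pixels to '2 * width' pixels.  Interior
+ *   // outputs take 3/4 of the nearer and 1/4 of the farther source pixel;
+ *   // the +1/+2 biases give the two phases different rounding.
+ *   static void fancy_upsample_row(const unsigned char *in,
+ *                                  unsigned char *out, int width)
+ *   {
+ *     out[0] = in[0];                              // first pixel is copied
+ *     out[1] = (unsigned char)((in[0] * 3 + in[1] + 2) >> 2);
+ *     for (int i = 1; i < width - 1; i++) {
+ *       out[2 * i]     = (unsigned char)((in[i] * 3 + in[i - 1] + 1) >> 2);
+ *       out[2 * i + 1] = (unsigned char)((in[i] * 3 + in[i + 1] + 2) >> 2);
+ *     }
+ *     out[2 * width - 2] =
+ *       (unsigned char)((in[width - 1] * 3 + in[width - 2] + 1) >> 2);
+ *     out[2 * width - 1] = in[width - 1];          // last pixel is copied
+ *   }
+ */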
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded into q0. The previous 16 source pixels are kept in q1.
+ * The shifted-by-one source pixels are constructed in q2 from q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding the +1 bias.
+ */
+.macro upsample16 OUTPTR, INPTR
+ vld1.8 {q0}, [\INPTR]!
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ vmov q1, q0 /* back up the source pixels in q1 */
+ vrshrn.u16 d6, q8, #2
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
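+
+/*
+ * Illustrative per-lane view of what one 'upsample16' step computes.  This
+ * is only a sketch in C notation; cur[], prev[], out_a[] and out_b[] are
+ * hypothetical arrays standing for the current pixels (q0), the
+ * shifted-by-one pixels (q2) and the two result vectors:
+ *
+ *   out_a[i] = (unsigned char)((3 * prev[i] + cur[i] + 2) >> 2);  // d6/d7 (vrshrn adds +2)
+ *   out_b[i] = (unsigned char)((3 * cur[i] + prev[i] + 1) >> 2);  // d8/d9 (+1 bias from q15)
+ *
+ * The final vst2.8 interleaves the two result vectors, so every source
+ * pixel contributes a pair of adjacent destination pixels.
+ */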
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the
+ * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
+ * the even and odd groups of 16 pixels, which is why the "vmov q1, q0"
+ * instruction is not needed. This unrolling also allows loads and stores
+ * to be reordered to hide the multiplication latency and reduce stalls.
+ * (A scalar analogue of this role swap is sketched below, after the macro.)
+ */
+.macro upsample32 OUTPTR, INPTR
+ /* even 16 pixels group */
+ vld1.8 {q0}, [\INPTR]!
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ /* odd 16 pixels group */
+ vld1.8 {q1}, [\INPTR]!
+ vrshrn.u16 d6, q8, #2
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vmovl.u8 q8, d2
+ vext.8 q2, q0, q1, #15
+ vmovl.u8 q9, d3
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d2, d28
+ vmlal.u8 q11, d3, d28
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
+ vrshrn.u16 d6, q8, #2
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
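+
+/*
+ * Illustrative C analogue of the even/odd role swap used above (hypothetical
+ * names, shown only as a sketch; tail handling is omitted).  A straight loop
+ * has to copy the current group into the "previous" variable on every
+ * iteration; unrolling by two and alternating the roles of the two variables
+ * removes that copy:
+ *
+ *   static void sum_adjacent(const unsigned char *in, unsigned short *out,
+ *                            int n)
+ *   {
+ *     unsigned char prev = in[0], cur;
+ *     for (int i = 1; i + 1 < n; i += 2) {
+ *       cur = in[i];            // even step: 'prev' holds the older value
+ *       out[i - 1] = prev + cur;
+ *       prev = in[i + 1];       // odd step: roles swapped, no extra copy
+ *       out[i] = cur + prev;
+ *     }
+ *   }
+ */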
+
+/*
+ * Upsample a row of WIDTH source pixels from INPTR to 2*WIDTH destination
+ * pixels at OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+ /* special case for the first and last pixels */
+ sub \WIDTH, \WIDTH, #1
+ add \OUTPTR, \OUTPTR, #1
+ ldrb \TMP1, [\INPTR, \WIDTH]
+ strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
+ ldrb \TMP1, [\INPTR], #1
+ strb \TMP1, [\OUTPTR, #-1]
+ vmov.8 d3[7], \TMP1
+
+ subs \WIDTH, \WIDTH, #32
+ blt 5f
+0: /* process 32 pixels per iteration */
+ upsample32 \OUTPTR, \INPTR
+ subs \WIDTH, \WIDTH, #32
+ bge 0b
+5:
+ adds \WIDTH, \WIDTH, #16
+ blt 1f
+0: /* process 16 pixels if needed */
+ upsample16 \OUTPTR, \INPTR
+ subs \WIDTH, \WIDTH, #16
+1:
+ adds \WIDTH, \WIDTH, #16
+ beq 9f
+
+ /* load the remaining 1-15 pixels */
+ add \INPTR, \INPTR, \WIDTH
+ tst \WIDTH, #1
+ beq 2f
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #2
+ beq 2f
+ vext.8 d0, d0, d0, #6
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[1]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #4
+ beq 2f
+ vrev64.32 d0, d0
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[3]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[2]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[1]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #8
+ beq 2f
+ vmov d1, d0
+ sub \INPTR, \INPTR, #8
+ vld1.8 {d0}, [\INPTR]
+2: /* upsample the remaining pixels */
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ vrshrn.u16 d10, q8, #2
+ vrshrn.u16 d12, q9, #2
+ vshrn.u16 d11, q10, #2
+ vshrn.u16 d13, q11, #2
+ vzip.8 d10, d11
+ vzip.8 d12, d13
+ /* store the remaining pixels */
+ tst \WIDTH, #8
+ beq 2f
+ vst1.8 {d10, d11}, [\OUTPTR]!
+ vmov q5, q6
+2:
+ tst \WIDTH, #4
+ beq 2f
+ vst1.8 {d10}, [\OUTPTR]!
+ vmov d10, d11
+2:
+ tst \WIDTH, #2
+ beq 2f
+ vst1.8 {d10[0]}, [\OUTPTR]!
+ vst1.8 {d10[1]}, [\OUTPTR]!
+ vst1.8 {d10[2]}, [\OUTPTR]!
+ vst1.8 {d10[3]}, [\OUTPTR]!
+ vext.8 d10, d10, d10, #4
+2:
+ tst \WIDTH, #1
+ beq 2f
+ vst1.8 {d10[0]}, [\OUTPTR]!
+ vst1.8 {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
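+
+/*
+ * The 1-15 pixel tail above is handled by decomposing the remainder into
+ * its binary components (8 + 4 + 2 + 1) rather than looping per byte.
+ * Illustrative C sketch of the same idea applied to a plain copy
+ * (hypothetical helper, shown only to make the pattern explicit):
+ *
+ *   #include <string.h>
+ *
+ *   static void copy_tail(unsigned char *dst, const unsigned char *src,
+ *                         unsigned int remainder)   // remainder is 0..15
+ *   {
+ *     if (remainder & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
+ *     if (remainder & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
+ *     if (remainder & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
+ *     if (remainder & 1) { *dst = *src; }
+ *   }
+ */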
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+ MAX_V_SAMP_FACTOR .req r0
+ DOWNSAMPLED_WIDTH .req r1
+ INPUT_DATA .req r2
+ OUTPUT_DATA_PTR .req r3
+ OUTPUT_DATA .req OUTPUT_DATA_PTR
+
+ OUTPTR .req r4
+ INPTR .req r5
+ WIDTH .req ip
+ TMP .req lr
+
+ push {r4, r5, r6, lr}
+ vpush {d8-d15}
+
+ ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
+ cmp MAX_V_SAMP_FACTOR, #0
+ ble 99f
+
+ /* initialize constants */
+ vmov.u8 d28, #3
+ vmov.u16 q15, #1
+11:
+ ldr INPTR, [INPUT_DATA], #4
+ ldr OUTPTR, [OUTPUT_DATA], #4
+ mov WIDTH, DOWNSAMPLED_WIDTH
+ upsample_row OUTPTR, INPTR, WIDTH, TMP
+ subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+ bgt 11b
+
+99:
+ vpop {d8-d15}
+ pop {r4, r5, r6, pc}
+
+ .unreq MAX_V_SAMP_FACTOR
+ .unreq DOWNSAMPLED_WIDTH
+ .unreq INPUT_DATA
+ .unreq OUTPUT_DATA_PTR
+ .unreq OUTPUT_DATA
+
+ .unreq OUTPTR
+ .unreq INPTR
+ .unreq WIDTH
+ .unreq TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row
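+
+/*
+ * Illustrative C view of the row loop in the function above (hypothetical
+ * helper name; the argument types follow the prototype in the comment at
+ * the top of this function):
+ *
+ *   void h2v1_fancy_upsample(int max_v_samp_factor,
+ *                            JDIMENSION downsampled_width,
+ *                            JSAMPARRAY input_data,
+ *                            JSAMPARRAY *output_data_ptr)
+ *   {
+ *     JSAMPARRAY output_data = *output_data_ptr;
+ *     for (int row = 0; row < max_v_samp_factor; row++)
+ *       fancy_upsample_row(input_data[row], output_data[row],
+ *                          (int)downsampled_width);
+ *     // fancy_upsample_row is the scalar sketch shown near the top of
+ *     // this section.
+ *   }
+ */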