OLD | NEW |
1 /* | 1 /* |
2 * ARM NEON optimizations for libjpeg-turbo | 2 * ARM NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
7 * | 7 * |
8 * This software is provided 'as-is', without any express or implied | 8 * This software is provided 'as-is', without any express or implied |
9 * warranty. In no event will the authors be held liable for any damages | 9 * warranty. In no event will the authors be held liable for any damages |
10 * arising from the use of this software. | 10 * arising from the use of this software. |
(...skipping 2139 matching lines...)
2150 bx lr /* return */ | 2150 bx lr /* return */ |
2151 | 2151 |
2152 .unreq COEF_BLOCK | 2152 .unreq COEF_BLOCK |
2153 .unreq DIVISORS | 2153 .unreq DIVISORS |
2154 .unreq WORKSPACE | 2154 .unreq WORKSPACE |
2155 .unreq RECIPROCAL | 2155 .unreq RECIPROCAL |
2156 .unreq CORRECTION | 2156 .unreq CORRECTION |
2157 .unreq SHIFT | 2157 .unreq SHIFT |
2158 .unreq LOOP_COUNT | 2158 .unreq LOOP_COUNT |
2159 .endfunc | 2159 .endfunc |
| 2160 |
| 2161 /*****************************************************************************/ |
| 2162 |
| 2163 /* |
| 2164 * GLOBAL(void) |
| 2165 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, |
| 2166 * JDIMENSION downsampled_width, |
| 2167 * JSAMPARRAY input_data, |
| 2168 * JSAMPARRAY * output_data_ptr); |
| 2169 * |
| 2170 * Note: the use of unaligned writes is the main remaining bottleneck |
| 2171 * in this code; eliminating it could potentially yield a performance |
| 2172 * improvement of up to tens of percent on Cortex-A8/Cortex-A9. |
| 2173 */ |
| 2174 |
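For reference, a minimal scalar C sketch of the output this routine is meant to
produce, assuming the usual libjpeg "fancy" h2v1 weights (3/4 of the nearer
source pixel plus 1/4 of the further one, with +1/+2 rounding biases); the type
and function names below are illustrative and not part of the patch.

    #include <stddef.h>

    typedef unsigned char sample_t;            /* stand-in for JSAMPLE */

    /* Upsample one row: 'width' source pixels -> 2*width destination pixels. */
    static void h2v1_fancy_upsample_row_ref(const sample_t *in, sample_t *out,
                                            size_t width)
    {
        if (width == 0)
            return;
        if (width == 1) {                      /* degenerate row: just replicate */
            out[0] = out[1] = in[0];
            return;
        }
        out[0] = in[0];                        /* first and last outputs are copies */
        out[2 * width - 1] = in[width - 1];
        out[1] = (sample_t)((3 * in[0] + in[1] + 2) >> 2);
        out[2 * (width - 1)] =
            (sample_t)((3 * in[width - 1] + in[width - 2] + 1) >> 2);
        for (size_t i = 1; i + 1 < width; i++) {
            /* 3 parts of the nearer source pixel plus 1 part of the further one */
            out[2 * i]     = (sample_t)((3 * in[i] + in[i - 1] + 1) >> 2);
            out[2 * i + 1] = (sample_t)((3 * in[i] + in[i + 1] + 2) >> 2);
        }
    }

    /* Row-pointer driver mirroring the prototype above: input_data and
     * *output_data_ptr are arrays of row pointers. */
    static void h2v1_fancy_upsample_ref(int max_v_samp_factor,
                                        size_t downsampled_width,
                                        sample_t **input_data,
                                        sample_t ***output_data_ptr)
    {
        sample_t **output_data = *output_data_ptr;
        for (int row = 0; row < max_v_samp_factor; row++)
            h2v1_fancy_upsample_row_ref(input_data[row], output_data[row],
                                        downsampled_width);
    }

In the NEON code below, the +2 rounding comes for free from the rounding
narrowing shift (vrshrn), while the +1 bias is added explicitly through q15.
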
| 2175 /* |
| 2176 * Upsample 16 source pixels to 32 destination pixels. The new 16 source |
| 2177 * pixels are loaded into q0. The previous 16 source pixels are kept in q1. |
| 2178 * The shifted-by-one source pixels are constructed in q2 from q0 and q1. |
| 2179 * Register d28 is used for the multiplication by 3. Register q15 is used |
| 2180 * for adding the +1 bias. |
| 2181 */ |
| 2182 .macro upsample16 OUTPTR, INPTR |
| 2183 vld1.8 {q0}, [\INPTR]! |
| 2184 vmovl.u8 q8, d0 |
| 2185 vext.8 q2, q1, q0, #15 |
| 2186 vmovl.u8 q9, d1 |
| 2187 vaddw.u8 q10, q15, d4 |
| 2188 vaddw.u8 q11, q15, d5 |
| 2189 vmlal.u8 q8, d4, d28 |
| 2190 vmlal.u8 q9, d5, d28 |
| 2191 vmlal.u8 q10, d0, d28 |
| 2192 vmlal.u8 q11, d1, d28 |
| 2193 vmov q1, q0 /* back up source pixels to q1 */ |
| 2194 vrshrn.u16 d6, q8, #2 |
| 2195 vrshrn.u16 d7, q9, #2 |
| 2196 vshrn.u16 d8, q10, #2 |
| 2197 vshrn.u16 d9, q11, #2 |
| 2198 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
| 2199 .endm |
| 2200 |
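For readers more comfortable with intrinsics than with NEON assembly, here is a
rough arm_neon.h rendering of one 'upsample16' step, keeping the register roles
described in the comment above; the function and variable names are
illustrative and not part of the patch.

    #include <arm_neon.h>

    /* One 'upsample16' step: consume 16 new source pixels, emit 32 output
     * pixels. 'prev' plays the role of q1 (the previous 16 source pixels). */
    static inline uint8x16_t upsample16_step(uint8_t **outptr,
                                             const uint8_t **inptr,
                                             uint8x16_t prev)
    {
        const uint8x8_t  three = vdup_n_u8(3);      /* d28 */
        const uint16x8_t bias1 = vdupq_n_u16(1);    /* q15 */

        uint8x16_t cur = vld1q_u8(*inptr);  *inptr += 16;   /* q0           */
        uint8x16_t sh  = vextq_u8(prev, cur, 15);           /* q2 = in[i-1] */

        /* (in[i] + 3*in[i-1] + 2) >> 2: the odd-indexed destination pixels;
         * the +2 comes from the rounding narrowing shift below */
        uint16x8_t a_lo = vmlal_u8(vmovl_u8(vget_low_u8(cur)),
                                   vget_low_u8(sh), three);
        uint16x8_t a_hi = vmlal_u8(vmovl_u8(vget_high_u8(cur)),
                                   vget_high_u8(sh), three);
        /* (3*in[i] + in[i-1] + 1) >> 2: the even-indexed destination pixels;
         * the +1 bias comes from q15 */
        uint16x8_t b_lo = vmlal_u8(vaddw_u8(bias1, vget_low_u8(sh)),
                                   vget_low_u8(cur), three);
        uint16x8_t b_hi = vmlal_u8(vaddw_u8(bias1, vget_high_u8(sh)),
                                   vget_high_u8(cur), three);

        uint8x16x2_t out;
        out.val[0] = vcombine_u8(vrshrn_n_u16(a_lo, 2), vrshrn_n_u16(a_hi, 2));
        out.val[1] = vcombine_u8(vshrn_n_u16(b_lo, 2),  vshrn_n_u16(b_hi, 2));
        vst2q_u8(*outptr, out);  *outptr += 32;             /* interleaved store */

        return cur;             /* becomes 'prev' (q1) for the next step */
    }

The 'upsample32' macro that follows is essentially two of these steps with the
roles of q0 and q1 swapped between them, which removes the copy back into q1
and lets the second load and the first store be scheduled around the multiplies.
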
| 2201 /* |
| 2202 * Upsample 32 source pixels to 64 destination pixels. Compared to the |
| 2203 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for |
| 2204 * the even and odd groups of 16 pixels, which is why the "vmov q1, q0" |
| 2205 * instruction is not needed. This unrolling also allows loads and stores |
| 2206 * to be reordered to compensate for multiplication latency and reduce stalls. |
| 2207 */ |
| 2208 .macro upsample32 OUTPTR, INPTR |
| 2209 /* even 16 pixels group */ |
| 2210 vld1.8 {q0}, [\INPTR]! |
| 2211 vmovl.u8 q8, d0 |
| 2212 vext.8 q2, q1, q0, #15 |
| 2213 vmovl.u8 q9, d1 |
| 2214 vaddw.u8 q10, q15, d4 |
| 2215 vaddw.u8 q11, q15, d5 |
| 2216 vmlal.u8 q8, d4, d28 |
| 2217 vmlal.u8 q9, d5, d28 |
| 2218 vmlal.u8 q10, d0, d28 |
| 2219 vmlal.u8 q11, d1, d28 |
| 2220 /* odd 16 pixels group */ |
| 2221 vld1.8 {q1}, [\INPTR]! |
| 2222 vrshrn.u16 d6, q8, #2 |
| 2223 vrshrn.u16 d7, q9, #2 |
| 2224 vshrn.u16 d8, q10, #2 |
| 2225 vshrn.u16 d9, q11, #2 |
| 2226 vmovl.u8 q8, d2 |
| 2227 vext.8 q2, q0, q1, #15 |
| 2228 vmovl.u8 q9, d3 |
| 2229 vaddw.u8 q10, q15, d4 |
| 2230 vaddw.u8 q11, q15, d5 |
| 2231 vmlal.u8 q8, d4, d28 |
| 2232 vmlal.u8 q9, d5, d28 |
| 2233 vmlal.u8 q10, d2, d28 |
| 2234 vmlal.u8 q11, d3, d28 |
| 2235 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
| 2236 vrshrn.u16 d6, q8, #2 |
| 2237 vrshrn.u16 d7, q9, #2 |
| 2238 vshrn.u16 d8, q10, #2 |
| 2239 vshrn.u16 d9, q11, #2 |
| 2240 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
| 2241 .endm |
| 2242 |
| 2243 /* |
| 2244 * Upsample a row of WIDTH source pixels from INPTR to 2*WIDTH destination pixels at OUTPTR. |
| 2245 */ |
| 2246 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 |
| 2247 /* special case for the first and last pixels */ |
| 2248 sub \WIDTH, \WIDTH, #1 |
| 2249 add \OUTPTR, \OUTPTR, #1 |
| 2250 ldrb \TMP1, [\INPTR, \WIDTH] |
| 2251 strb \TMP1, [\OUTPTR, \WIDTH, asl #1] |
| 2252 ldrb \TMP1, [\INPTR], #1 |
| 2253 strb \TMP1, [\OUTPTR, #-1] |
| 2254 vmov.8 d3[7], \TMP1 |
| 2255 |
| 2256 subs \WIDTH, \WIDTH, #32 |
| 2257 blt 5f |
| 2258 0: /* process 32 pixels per iteration */ |
| 2259 upsample32 \OUTPTR, \INPTR |
| 2260 subs \WIDTH, \WIDTH, #32 |
| 2261 bge 0b |
| 2262 5: |
| 2263 adds \WIDTH, \WIDTH, #16 |
| 2264 blt 1f |
| 2265 0: /* process 16 pixels if needed */ |
| 2266 upsample16 \OUTPTR, \INPTR |
| 2267 subs \WIDTH, \WIDTH, #16 |
| 2268 1: |
| 2269 adds \WIDTH, \WIDTH, #16 |
| 2270 beq 9f |
| 2271 |
| 2272 /* load the remaining 1-15 pixels */ |
| 2273 add \INPTR, \INPTR, \WIDTH |
| 2274 tst \WIDTH, #1 |
| 2275 beq 2f |
| 2276 sub \INPTR, \INPTR, #1 |
| 2277 vld1.8 {d0[0]}, [\INPTR] |
| 2278 2: |
| 2279 tst \WIDTH, #2 |
| 2280 beq 2f |
| 2281 vext.8 d0, d0, d0, #6 |
| 2282 sub \INPTR, \INPTR, #1 |
| 2283 vld1.8 {d0[1]}, [\INPTR] |
| 2284 sub \INPTR, \INPTR, #1 |
| 2285 vld1.8 {d0[0]}, [\INPTR] |
| 2286 2: |
| 2287 tst \WIDTH, #4 |
| 2288 beq 2f |
| 2289 vrev64.32 d0, d0 |
| 2290 sub \INPTR, \INPTR, #1 |
| 2291 vld1.8 {d0[3]}, [\INPTR] |
| 2292 sub \INPTR, \INPTR, #1 |
| 2293 vld1.8 {d0[2]}, [\INPTR] |
| 2294 sub \INPTR, \INPTR, #1 |
| 2295 vld1.8 {d0[1]}, [\INPTR] |
| 2296 sub \INPTR, \INPTR, #1 |
| 2297 vld1.8 {d0[0]}, [\INPTR] |
| 2298 2: |
| 2299 tst \WIDTH, #8 |
| 2300 beq 2f |
| 2301 vmov d1, d0 |
| 2302 sub \INPTR, \INPTR, #8 |
| 2303 vld1.8 {d0}, [\INPTR] |
| 2304 2: /* upsample the remaining pixels */ |
| 2305 vmovl.u8 q8, d0 |
| 2306 vext.8 q2, q1, q0, #15 |
| 2307 vmovl.u8 q9, d1 |
| 2308 vaddw.u8 q10, q15, d4 |
| 2309 vaddw.u8 q11, q15, d5 |
| 2310 vmlal.u8 q8, d4, d28 |
| 2311 vmlal.u8 q9, d5, d28 |
| 2312 vmlal.u8 q10, d0, d28 |
| 2313 vmlal.u8 q11, d1, d28 |
| 2314 vrshrn.u16 d10, q8, #2 |
| 2315 vrshrn.u16 d12, q9, #2 |
| 2316 vshrn.u16 d11, q10, #2 |
| 2317 vshrn.u16 d13, q11, #2 |
| 2318 vzip.8 d10, d11 |
| 2319 vzip.8 d12, d13 |
| 2320 /* store the remaining pixels */ |
| 2321 tst \WIDTH, #8 |
| 2322 beq 2f |
| 2323 vst1.8 {d10, d11}, [\OUTPTR]! |
| 2324 vmov q5, q6 |
| 2325 2: |
| 2326 tst \WIDTH, #4 |
| 2327 beq 2f |
| 2328 vst1.8 {d10}, [\OUTPTR]! |
| 2329 vmov d10, d11 |
| 2330 2: |
| 2331 tst \WIDTH, #2 |
| 2332 beq 2f |
| 2333 vst1.8 {d10[0]}, [\OUTPTR]! |
| 2334 vst1.8 {d10[1]}, [\OUTPTR]! |
| 2335 vst1.8 {d10[2]}, [\OUTPTR]! |
| 2336 vst1.8 {d10[3]}, [\OUTPTR]! |
| 2337 vext.8 d10, d10, d10, #4 |
| 2338 2: |
| 2339 tst \WIDTH, #1 |
| 2340 beq 2f |
| 2341 vst1.8 {d10[0]}, [\OUTPTR]! |
| 2342 vst1.8 {d10[1]}, [\OUTPTR]! |
| 2343 2: |
| 2344 9: |
| 2345 .endm |
| 2346 |
| 2347 asm_function jsimd_h2v1_fancy_upsample_neon |
| 2348 |
| 2349 MAX_V_SAMP_FACTOR .req r0 |
| 2350 DOWNSAMPLED_WIDTH .req r1 |
| 2351 INPUT_DATA .req r2 |
| 2352 OUTPUT_DATA_PTR .req r3 |
| 2353 OUTPUT_DATA .req OUTPUT_DATA_PTR |
| 2354 |
| 2355 OUTPTR .req r4 |
| 2356 INPTR .req r5 |
| 2357 WIDTH .req ip |
| 2358 TMP .req lr |
| 2359 |
| 2360 push {r4, r5, r6, lr} |
| 2361 vpush {d8-d15} |
| 2362 |
| 2363 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] |
| 2364 cmp MAX_V_SAMP_FACTOR, #0 |
| 2365 ble 99f |
| 2366 |
| 2367 /* initialize constants */ |
| 2368 vmov.u8 d28, #3 |
| 2369 vmov.u16 q15, #1 |
| 2370 11: |
| 2371 ldr INPTR, [INPUT_DATA], #4 |
| 2372 ldr OUTPTR, [OUTPUT_DATA], #4 |
| 2373 mov WIDTH, DOWNSAMPLED_WIDTH |
| 2374 upsample_row OUTPTR, INPTR, WIDTH, TMP |
| 2375 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 |
| 2376 bgt 11b |
| 2377 |
| 2378 99: |
| 2379 vpop {d8-d15} |
| 2380 pop {r4, r5, r6, pc} |
| 2381 |
| 2382 .unreq MAX_V_SAMP_FACTOR |
| 2383 .unreq DOWNSAMPLED_WIDTH |
| 2384 .unreq INPUT_DATA |
| 2385 .unreq OUTPUT_DATA_PTR |
| 2386 .unreq OUTPUT_DATA |
| 2387 |
| 2388 .unreq OUTPTR |
| 2389 .unreq INPTR |
| 2390 .unreq WIDTH |
| 2391 .unreq TMP |
| 2392 |
| 2393 .endfunc |
| 2394 |
| 2395 .purgem upsample16 |
| 2396 .purgem upsample32 |
| 2397 .purgem upsample_row |
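
A minimal, hypothetical test harness for the new entry point, assuming libjpeg's
usual 8-bit sample typedefs (JSAMPLE as unsigned char, JDIMENSION as unsigned
int), a build compiled for ARM with NEON and linked against this file, and that
the expected output follows the scalar sketch given earlier; everything here
except jsimd_h2v1_fancy_upsample_neon itself is illustrative.

    #include <stdio.h>
    #include <string.h>

    typedef unsigned char JSAMPLE;     /* assumes BITS_IN_JSAMPLE == 8 */
    typedef unsigned int JDIMENSION;
    typedef JSAMPLE *JSAMPROW;
    typedef JSAMPROW *JSAMPARRAY;

    void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                        JDIMENSION downsampled_width,
                                        JSAMPARRAY input_data,
                                        JSAMPARRAY *output_data_ptr);

    int main(void)
    {
        enum { W = 52 };            /* exercises the 32-, 16- and 1-15-pixel tail paths */
        JSAMPLE in_row[W], out_row[2 * W], expected[2 * W];
        JSAMPROW in_rows[1] = { in_row }, out_rows[1] = { out_row };
        JSAMPARRAY output_data = out_rows;

        for (int i = 0; i < W; i++)
            in_row[i] = (JSAMPLE)(i * 7 + 3);

        /* scalar expectation: 3/4 * nearer + 1/4 * further, +1/+2 rounding */
        expected[0] = in_row[0];
        expected[2 * W - 1] = in_row[W - 1];
        expected[1] = (JSAMPLE)((3 * in_row[0] + in_row[1] + 2) >> 2);
        expected[2 * (W - 1)] =
            (JSAMPLE)((3 * in_row[W - 1] + in_row[W - 2] + 1) >> 2);
        for (int i = 1; i < W - 1; i++) {
            expected[2 * i]     = (JSAMPLE)((3 * in_row[i] + in_row[i - 1] + 1) >> 2);
            expected[2 * i + 1] = (JSAMPLE)((3 * in_row[i] + in_row[i + 1] + 2) >> 2);
        }

        jsimd_h2v1_fancy_upsample_neon(1, W, in_rows, &output_data);
        printf("%s\n", memcmp(out_row, expected, sizeof(out_row)) ? "MISMATCH" : "ok");
        return 0;
    }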