Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: simd/jsimd_arm_neon.S

Issue 10386084: Update libjpeg-turbo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libjpeg_turbo/
Patch Set: Created 8 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « simd/jsimd_arm.c ('k') | simd/jsimd_i386.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * ARM NEON optimizations for libjpeg-turbo 2 * ARM NEON optimizations for libjpeg-turbo
3 * 3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved. 5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * 7 *
8 * This software is provided 'as-is', without any express or implied 8 * This software is provided 'as-is', without any express or implied
9 * warranty. In no event will the authors be held liable for any damages 9 * warranty. In no event will the authors be held liable for any damages
10 * arising from the use of this software. 10 * arising from the use of this software.
(...skipping 2139 matching lines...) Expand 10 before | Expand all | Expand 10 after
2150 bx lr /* return */ 2150 bx lr /* return */
2151 2151
2152 .unreq COEF_BLOCK 2152 .unreq COEF_BLOCK
2153 .unreq DIVISORS 2153 .unreq DIVISORS
2154 .unreq WORKSPACE 2154 .unreq WORKSPACE
2155 .unreq RECIPROCAL 2155 .unreq RECIPROCAL
2156 .unreq CORRECTION 2156 .unreq CORRECTION
2157 .unreq SHIFT 2157 .unreq SHIFT
2158 .unreq LOOP_COUNT 2158 .unreq LOOP_COUNT
2159 .endfunc 2159 .endfunc
2160
2161 /*****************************************************************************/
2162
2163 /*
2164 * GLOBAL(void)
2165 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
2166 * JDIMENSION downsampled_width,
2167 * JSAMPARRAY input_data,
2168 * JSAMPARRAY * output_data_ptr);
2169 *
2170 * Note: the use of unaligned writes is the main remaining bottleneck in
2171 * this code; solving it could potentially yield a performance
2172 * improvement of up to tens of percent on Cortex-A8/Cortex-A9.
2173 */
2174
2175 /*
2176 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2177 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2178 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2179 * Register d28 is used for multiplication by 3. Register q15 is used
2180 * for adding +1 bias.
2181 */
2182 .macro upsample16 OUTPTR, INPTR /* writes q0-q4 and q8-q11; reads q1 (previous pixels), d28 and q15 */
2183 vld1.8 {q0}, [\INPTR]! /* q0 = next 16 source bytes; INPTR is post-incremented */
2184 vmovl.u8 q8, d0 /* q8 = low 8 current pixels widened to 16 bits */
2185 vext.8 q2, q1, q0, #15 /* q2 = top byte of q1 followed by the low 15 bytes of q0 (previous-neighbour pixels) */
2186 vmovl.u8 q9, d1 /* q9 = high 8 current pixels widened to 16 bits */
2187 vaddw.u8 q10, q15, d4 /* q10 = q15 + neighbour pixels (low half) */
2188 vaddw.u8 q11, q15, d5 /* q11 = q15 + neighbour pixels (high half) */
2189 vmlal.u8 q8, d4, d28 /* q8 = current + neighbour * d28 */
2190 vmlal.u8 q9, d5, d28
2191 vmlal.u8 q10, d0, d28 /* q10 = q15 + neighbour + current * d28 */
2192 vmlal.u8 q11, d1, d28
2193 vmov q1, q0 /* backup source pixels to q1 */
2194 vrshrn.u16 d6, q8, #2 /* narrow with rounding: (x + 2) >> 2 */
2195 vrshrn.u16 d7, q9, #2
2196 vshrn.u16 d8, q10, #2 /* narrow without rounding: x >> 2 (bias was added via q15) */
2197 vshrn.u16 d9, q11, #2
2198 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store 32 bytes, interleaving d6/d7 bytes with d8/d9 bytes; OUTPTR is post-incremented */
2199 .endm
2200
2201 /*
2202 * Upsample 32 source pixels to 64 destination pixels. Compared to the 'upsample16'
2203 * macro, the roles of the q0 and q1 registers are reversed for even and odd
2204 * groups of 16 pixels, which is why "vmov q1, q0" instructions are not needed.
2205 * This unrolling also allows loads and stores to be reordered to compensate
2206 * for multiplication latency and reduce stalls.
2207 */
2208 .macro upsample32 OUTPTR, INPTR /* writes q0-q4 and q8-q11; reads d28 and q15; on exit q1 holds the most recent 16 source pixels */
2209 /* even 16 pixels group */
2210 vld1.8 {q0}, [\INPTR]! /* q0 = current pixels, q1 = previous pixels for this group */
2211 vmovl.u8 q8, d0
2212 vext.8 q2, q1, q0, #15 /* q2 = top byte of q1 followed by the low 15 bytes of q0 */
2213 vmovl.u8 q9, d1
2214 vaddw.u8 q10, q15, d4
2215 vaddw.u8 q11, q15, d5
2216 vmlal.u8 q8, d4, d28
2217 vmlal.u8 q9, d5, d28
2218 vmlal.u8 q10, d0, d28
2219 vmlal.u8 q11, d1, d28
2220 /* odd 16 pixels group */
2221 vld1.8 {q1}, [\INPTR]! /* q1 = current pixels, q0 = previous pixels for this group */
2222 vrshrn.u16 d6, q8, #2 /* narrow the even-group results before q8-q11 are reused below */
2223 vrshrn.u16 d7, q9, #2
2224 vshrn.u16 d8, q10, #2
2225 vshrn.u16 d9, q11, #2
2226 vmovl.u8 q8, d2
2227 vext.8 q2, q0, q1, #15 /* q2 = top byte of q0 followed by the low 15 bytes of q1 */
2228 vmovl.u8 q9, d3
2229 vaddw.u8 q10, q15, d4
2230 vaddw.u8 q11, q15, d5
2231 vmlal.u8 q8, d4, d28
2232 vmlal.u8 q9, d5, d28
2233 vmlal.u8 q10, d2, d28
2234 vmlal.u8 q11, d3, d28
2235 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the 32 output bytes of the even group */
2236 vrshrn.u16 d6, q8, #2
2237 vrshrn.u16 d7, q9, #2
2238 vshrn.u16 d8, q10, #2
2239 vshrn.u16 d9, q11, #2
2240 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the 32 output bytes of the odd group */
2241 .endm
2242
2243 /*
2244 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2245 */
2246 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 /* all four register arguments are modified; also uses q0-q6, q8-q11, d28 and q15 */
2247 /* special case for the first and last pixels */
2248 sub \WIDTH, \WIDTH, #1 /* WIDTH = index of the last source pixel */
2249 add \OUTPTR, \OUTPTR, #1 /* OUTPTR now points at the second output byte */
2250 ldrb \TMP1, [\INPTR, \WIDTH] /* TMP1 = last source pixel */
2251 strb \TMP1, [\OUTPTR, \WIDTH, asl #1] /* copy it to OUTPTR + 2 * WIDTH */
2252 ldrb \TMP1, [\INPTR], #1 /* TMP1 = first source pixel; INPTR advances to the second one */
2253 strb \TMP1, [\OUTPTR, #-1] /* copy it to the first output byte */
2254 vmov.8 d3[7], \TMP1 /* seed the top byte of q1 so the first vext sees it as the previous pixel */
2255
2256 subs \WIDTH, \WIDTH, #32
2257 blt 5f /* fewer than 32 pixels left: skip the 32-pixel loop */
2258 0: /* process 32 pixels per iteration */
2259 upsample32 \OUTPTR, \INPTR
2260 subs \WIDTH, \WIDTH, #32
2261 bge 0b
2262 5:
2263 adds \WIDTH, \WIDTH, #16 /* WIDTH = (pixels left) - 16 */
2264 blt 1f /* fewer than 16 pixels left: skip the 16-pixel step */
2265 0: /* process 16 pixels if needed */
2266 upsample16 \OUTPTR, \INPTR
2267 subs \WIDTH, \WIDTH, #16
2268 1:
2269 adds \WIDTH, \WIDTH, #16 /* WIDTH = pixels left (0-15) */
2270 beq 9f /* nothing left: done */
2271
2272 /* load the remaining 1-15 pixels */
2273 add \INPTR, \INPTR, \WIDTH /* point just past the remaining pixels; they are loaded backwards, one chunk per set bit of WIDTH */
2274 tst \WIDTH, #1
2275 beq 2f
2276 sub \INPTR, \INPTR, #1
2277 vld1.8 {d0[0]}, [\INPTR]
2278 2:
2279 tst \WIDTH, #2
2280 beq 2f
2281 vext.8 d0, d0, d0, #6 /* rotate d0 so bytes already loaded move up by two lanes */
2282 sub \INPTR, \INPTR, #1
2283 vld1.8 {d0[1]}, [\INPTR]
2284 sub \INPTR, \INPTR, #1
2285 vld1.8 {d0[0]}, [\INPTR]
2286 2:
2287 tst \WIDTH, #4
2288 beq 2f
2289 vrev64.32 d0, d0 /* swap 32-bit halves so bytes already loaded move to the upper four lanes */
2290 sub \INPTR, \INPTR, #1
2291 vld1.8 {d0[3]}, [\INPTR]
2292 sub \INPTR, \INPTR, #1
2293 vld1.8 {d0[2]}, [\INPTR]
2294 sub \INPTR, \INPTR, #1
2295 vld1.8 {d0[1]}, [\INPTR]
2296 sub \INPTR, \INPTR, #1
2297 vld1.8 {d0[0]}, [\INPTR]
2298 2:
2299 tst \WIDTH, #8
2300 beq 2f
2301 vmov d1, d0 /* move bytes already loaded to the upper half of q0 */
2302 sub \INPTR, \INPTR, #8
2303 vld1.8 {d0}, [\INPTR]
2304 2: /* upsample the remaining pixels */
2305 vmovl.u8 q8, d0
2306 vext.8 q2, q1, q0, #15 /* q2 = top byte of q1 followed by the low 15 bytes of q0 */
2307 vmovl.u8 q9, d1
2308 vaddw.u8 q10, q15, d4
2309 vaddw.u8 q11, q15, d5
2310 vmlal.u8 q8, d4, d28
2311 vmlal.u8 q9, d5, d28
2312 vmlal.u8 q10, d0, d28
2313 vmlal.u8 q11, d1, d28
2314 vrshrn.u16 d10, q8, #2
2315 vrshrn.u16 d12, q9, #2
2316 vshrn.u16 d11, q10, #2
2317 vshrn.u16 d13, q11, #2
2318 vzip.8 d10, d11 /* interleave in registers: q5 = first 16 output bytes in memory order */
2319 vzip.8 d12, d13 /* q6 = next 16 output bytes in memory order */
2320 /* store the remaining pixels */
2321 tst \WIDTH, #8
2322 beq 2f
2323 vst1.8 {d10, d11}, [\OUTPTR]! /* 8 source pixels -> 16 output bytes */
2324 vmov q5, q6 /* bring the not-yet-stored bytes down into q5 */
2325 2:
2326 tst \WIDTH, #4
2327 beq 2f
2328 vst1.8 {d10}, [\OUTPTR]! /* 4 source pixels -> 8 output bytes */
2329 vmov d10, d11
2330 2:
2331 tst \WIDTH, #2
2332 beq 2f
2333 vst1.8 {d10[0]}, [\OUTPTR]! /* 2 source pixels -> 4 output bytes, stored one byte at a time */
2334 vst1.8 {d10[1]}, [\OUTPTR]!
2335 vst1.8 {d10[2]}, [\OUTPTR]!
2336 vst1.8 {d10[3]}, [\OUTPTR]!
2337 vext.8 d10, d10, d10, #4 /* rotate so the next unstored bytes are in lanes 0-1 */
2338 2:
2339 tst \WIDTH, #1
2340 beq 2f
2341 vst1.8 {d10[0]}, [\OUTPTR]! /* 1 source pixel -> 2 output bytes */
2342 vst1.8 {d10[1]}, [\OUTPTR]!
2343 2:
2344 9:
2345 .endm
2346
2347 asm_function jsimd_h2v1_fancy_upsample_neon
2348
2349 MAX_V_SAMP_FACTOR .req r0 /* also used as the row countdown in the loop below */
2350 DOWNSAMPLED_WIDTH .req r1
2351 INPUT_DATA .req r2
2352 OUTPUT_DATA_PTR .req r3
2353 OUTPUT_DATA .req OUTPUT_DATA_PTR /* alias: r3 is reused once the pointer has been dereferenced */
2354
2355 OUTPTR .req r4
2356 INPTR .req r5
2357 WIDTH .req ip
2358 TMP .req lr
2359
2360 push {r4, r5, r6, lr} /* NOTE(review): r6 is not referenced below; presumably saved to keep the stack 8-byte aligned - confirm */
2361 vpush {d8-d15} /* the upsample macros write d8-d13, which are callee-saved per the procedure call standard */
2362
2363 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] /* OUTPUT_DATA = *output_data_ptr */
2364 cmp MAX_V_SAMP_FACTOR, #0
2365 ble 99f /* no rows to process */
2366
2367 /* initialize constants */
2368 vmov.u8 d28, #3 /* multiplier used by the vmlal.u8 instructions in the macros */
2369 vmov.u16 q15, #1 /* +1 bias used by the vaddw.u8 instructions in the macros */
2370 11:
2371 ldr INPTR, [INPUT_DATA], #4 /* fetch the next input row pointer */
2372 ldr OUTPTR, [OUTPUT_DATA], #4 /* fetch the next output row pointer */
2373 mov WIDTH, DOWNSAMPLED_WIDTH /* upsample_row modifies WIDTH, so reload it for every row */
2374 upsample_row OUTPTR, INPTR, WIDTH, TMP
2375 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2376 bgt 11b /* loop until all rows are done */
2377
2378 99:
2379 vpop {d8-d15}
2380 pop {r4, r5, r6, pc} /* restore registers and return by loading the saved lr into pc */
2381
2382 .unreq MAX_V_SAMP_FACTOR
2383 .unreq DOWNSAMPLED_WIDTH
2384 .unreq INPUT_DATA
2385 .unreq OUTPUT_DATA_PTR
2386 .unreq OUTPUT_DATA
2387
2388 .unreq OUTPTR
2389 .unreq INPTR
2390 .unreq WIDTH
2391 .unreq TMP
2392
2393 .endfunc
2394
2395 .purgem upsample16 /* drop the helper macro definitions now that the function has been assembled */
2396 .purgem upsample32
2397 .purgem upsample_row
OLDNEW
« no previous file with comments | « simd/jsimd_arm.c ('k') | simd/jsimd_i386.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698