Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(271)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 22566002: ARM Skia NEON patches - 24 - S32_D565_Blend_Dither slight speedup/bugfix (Closed) Base URL: https://skia.googlecode.com/svn/trunk
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm.h" 8 #include "SkBlitRow_opts_arm.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 304 matching lines...) Expand 10 before | Expand all | Expand 10 after
315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
319 319
320 }; 320 };
321 321
322 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 322 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
323 int count, U8CPU alpha, int x, int y) 323 int count, U8CPU alpha, int x, int y)
324 { 324 {
325 /* select row and offset for dither array */
326 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
327 325
328 /* rescale alpha to range 0 - 256 */ 326 SkASSERT(255 > alpha);
327
328 // rescale alpha to range 1 - 256
329 int scale = SkAlpha255To256(alpha); 329 int scale = SkAlpha255To256(alpha);
330 330
331 asm volatile ( 331 if (count >= 8) {
332 "vld1.8 {d31}, [%[dstart]] \n\t" // loa d dither values 332 /* select row and offset for dither array */
333 "vshr.u8 d30, d31, #1 \n\t" // cal c. green dither values 333 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
334 "vdup.16 d6, %[scale] \n\t" // dup licate scale into neon reg
335 "vmov.i8 d29, #0x3f \n\t" // set up green mask
336 "vmov.i8 d28, #0x1f \n\t" // set up blue mask
337 "1: \n\t"
338 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // loa d 8 pixels and split into argb
339 "vshr.u8 d22, d0, #5 \n\t" // cal c. red >> 5
340 "vshr.u8 d23, d1, #6 \n\t" // cal c. green >> 6
341 "vshr.u8 d24, d2, #5 \n\t" // cal c. blue >> 5
342 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen
343 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen
344 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen
345 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result
346 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result
347 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result
348 "vshrn.i16 d22, q8, #3 \n\t" // shi ft right and narrow to 5 bits
349 "vshrn.i16 d23, q9, #2 \n\t" // shi ft right and narrow to 6 bits
350 "vshrn.i16 d24, q10, #3 \n\t" // shi ft right and narrow to 5 bits
351 // load 8 pixels from dst, extract rgb
352 "vld1.16 {d0, d1}, [%[dst]] \n\t" // loa d 8 pixels
353 "vshrn.i16 d17, q0, #5 \n\t" // shi ft green down to bottom 6 bits
354 "vmovn.i16 d18, q0 \n\t" // nar row to get blue as bytes
355 "vshr.u16 q0, q0, #11 \n\t" // shi ft down to extract red
356 "vand d17, d17, d29 \n\t" // and green with green mask
357 "vand d18, d18, d28 \n\t" // and blue with blue mask
358 "vmovn.i16 d16, q0 \n\t" // nar row to get red as bytes
359 // src = {d22 (r), d23 (g), d24 (b)}
360 // dst = {d16 (r), d17 (g), d18 (b)}
361 // subtract dst from src and widen
362 "vsubl.s8 q0, d22, d16 \n\t" // sub tract red src from dst
363 "vsubl.s8 q1, d23, d17 \n\t" // sub tract green src from dst
364 "vsubl.s8 q2, d24, d18 \n\t" // sub tract blue src from dst
365 // multiply diffs by scale and shift
366 "vmul.i16 q0, q0, d6[0] \n\t" // mul tiply red by scale
367 "vmul.i16 q1, q1, d6[0] \n\t" // mul tiply blue by scale
368 "vmul.i16 q2, q2, d6[0] \n\t" // mul tiply green by scale
369 "subs %[count], %[count], #8 \n\t" // dec rement loop counter
370 "vshrn.i16 d0, q0, #8 \n\t" // shi ft down red by 8 and narrow
371 "vshrn.i16 d2, q1, #8 \n\t" // shi ft down green by 8 and narrow
372 "vshrn.i16 d4, q2, #8 \n\t" // shi ft down blue by 8 and narrow
373 // add dst to result
374 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red
375 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green
376 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue
377 // put result into 565 format
378 "vsli.i16 q2, q1, #5 \n\t" // shi ft up green and insert into blue
379 "vsli.i16 q2, q0, #11 \n\t" // shi ft up red and insert into blue
380 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // sto re result
381 "bgt 1b \n\t" // loo p if count > 0
382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
383 : [dstart] "r" (dstart), [scale] "r" (scale)
384 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d 16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30" , "d31"
385 );
386 334
387 DITHER_565_SCAN(y); 335 uint8x8_t vdither = vld1_u8(dstart); // load dither values
336 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value s
388 337
389 while((count & 7) > 0) 338 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne on reg
390 { 339 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
391 SkPMColor c = *src++;
392 340
393 int dither = DITHER_VALUE(x); 341 do {
394 int sr = SkGetPackedR32(c);
395 int sg = SkGetPackedG32(c);
396 int sb = SkGetPackedB32(c);
397 sr = SkDITHER_R32To565(sr, dither);
398 sg = SkDITHER_G32To565(sg, dither);
399 sb = SkDITHER_B32To565(sb, dither);
400 342
401 uint16_t d = *dst; 343 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
402 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 344 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
403 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 345 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
404 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 346 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
405 DITHER_INC_X(x); 347 uint16x8_t vdst;
406 count--; 348 uint16x8_t vdst_r, vdst_g, vdst_b;
349 int16x8_t vres_r, vres_g, vres_b;
350 int8x8_t vres8_r, vres8_g, vres8_b;
351
352 // Load source and add dither
353 {
354 register uint8x8_t d0 asm("d0");
355 register uint8x8_t d1 asm("d1");
356 register uint8x8_t d2 asm("d2");
357 register uint8x8_t d3 asm("d3");
358
359 asm (
360 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
361 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
362 :
363 );
364 vsrc_g = d1;
365 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
366 vsrc_r = d2; vsrc_b = d0;
367 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
368 vsrc_r = d0; vsrc_b = d2;
369 #endif
370 }
371
372 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
373 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
374 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
375
376 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
377 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an d widen
378 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a nd widen
379
380 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr om result
381 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
382 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue f rom result
383
384 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
385 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
386 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
387
388 // Load dst and unpack
389 vdst = vld1q_u16(dst);
390 vdst_g = vshrq_n_u16(vdst, 5); // shift down to ge t green
391 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
392 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
393
394 // subtract dst from src and widen
395 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_ s16_u16(vdst_r));
396 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_ s16_u16(vdst_g));
397 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_ s16_u16(vdst_b));
398
399 // multiply diffs by scale and shift
400 vres_r = vmulq_s16(vres_r, vscale);
401 vres_g = vmulq_s16(vres_g, vscale);
402 vres_b = vmulq_s16(vres_b, vscale);
403
404 vres8_r = vshrn_n_s16(vres_r, 8);
405 vres8_g = vshrn_n_s16(vres_g, 8);
406 vres8_b = vshrn_n_s16(vres_b, 8);
407
408 // add dst to result
409 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
410 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
411 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
412
413 // put result into 565 format
414 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and ins ert into blue
415 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and inser t into blue
416
417 // Store result
418 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
419
420 // Next iteration
421 dst += 8;
422 count -= 8;
423
424 } while (count >= 8);
425 }
426
427 // Leftovers
428 if (count > 0) {
429 int scale = SkAlpha255To256(alpha);
430 DITHER_565_SCAN(y);
431 do {
432 SkPMColor c = *src++;
433 SkPMColorAssert(c);
434
435 int dither = DITHER_VALUE(x);
436 int sr = SkGetPackedR32(c);
437 int sg = SkGetPackedG32(c);
438 int sb = SkGetPackedB32(c);
439 sr = SkDITHER_R32To565(sr, dither);
440 sg = SkDITHER_G32To565(sg, dither);
441 sb = SkDITHER_B32To565(sb, dither);
442
443 uint16_t d = *dst;
444 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
445 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
446 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
447 DITHER_INC_X(x);
448 } while (--count != 0);
407 } 449 }
408 } 450 }
409 451
410 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 452 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
411 const SkPMColor* SK_RESTRICT src, 453 const SkPMColor* SK_RESTRICT src,
412 int count, U8CPU alpha) { 454 int count, U8CPU alpha) {
413 455
414 SkASSERT(255 == alpha); 456 SkASSERT(255 == alpha);
415 if (count > 0) { 457 if (count > 0) {
416 458
(...skipping 851 matching lines...) Expand 10 before | Expand all | Expand 10 after
1268 * case where we do not inspect the src alpha. 1310 * case where we do not inspect the src alpha.
1269 */ 1311 */
1270 #if SK_A32_SHIFT == 24 1312 #if SK_A32_SHIFT == 24
1271 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1313 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1314 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1273 #else 1315 #else
1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1316 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1275 #endif 1317 #endif
1276 S32A_Blend_BlitRow32_arm // S32A_Blend 1318 S32A_Blend_BlitRow32_arm // S32A_Blend
1277 }; 1319 };
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698