src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 22566002: ARM Skia NEON patches - 24 - S32_D565_Blend_Dither slight speedup/bugfix

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 22566002: ARM Skia NEON patches - 24 - S32_D565_Blend_Dither slight speedup/bugfix (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm.h"	8 #include "SkBlitRow_opts_arm.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 304 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,	315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,

316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,	316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,

317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,	317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,

318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,	318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

319	319

320 };	320 };

321	321

322 void S32_D565_Blend_Dither_neon(uint16_t dst, const SkPMColor src,	322 void S32_D565_Blend_Dither_neon(uint16_t dst, const SkPMColor src,

323 int count, U8CPU alpha, int x, int y)	323 int count, U8CPU alpha, int x, int y)

324 {	324 {

325 /* select row and offset for dither array */

326 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];

327	325

328 /* rescale alpha to range 0 - 256 */	326 SkASSERT(255 > alpha);

	327

	328 // rescale alpha to range 1 - 256

329 int scale = SkAlpha255To256(alpha);	329 int scale = SkAlpha255To256(alpha);

330	330

331 asm volatile (	331 if (count >= 8) {

332 "vld1.8 {d31}, [%[dstart]] \n\t" // loa d dither values	332 /* select row and offset for dither array */

333 "vshr.u8 d30, d31, #1 \n\t" // cal c. green dither values	333 const uint8_t dstart = &gDitherMatrix_Neon[(y&3)12 + (x&3)];

334 "vdup.16 d6, %[scale] \n\t" // dup licate scale into neon reg

335 "vmov.i8 d29, #0x3f \n\t" // set up green mask

336 "vmov.i8 d28, #0x1f \n\t" // set up blue mask

337 "1: \n\t"

338 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // loa d 8 pixels and split into argb

339 "vshr.u8 d22, d0, #5 \n\t" // cal c. red >> 5

340 "vshr.u8 d23, d1, #6 \n\t" // cal c. green >> 6

341 "vshr.u8 d24, d2, #5 \n\t" // cal c. blue >> 5

342 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen

343 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen

344 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen

345 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result

346 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result

347 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result

348 "vshrn.i16 d22, q8, #3 \n\t" // shi ft right and narrow to 5 bits

349 "vshrn.i16 d23, q9, #2 \n\t" // shi ft right and narrow to 6 bits

350 "vshrn.i16 d24, q10, #3 \n\t" // shi ft right and narrow to 5 bits

351 // load 8 pixels from dst, extract rgb

352 "vld1.16 {d0, d1}, [%[dst]] \n\t" // loa d 8 pixels

353 "vshrn.i16 d17, q0, #5 \n\t" // shi ft green down to bottom 6 bits

354 "vmovn.i16 d18, q0 \n\t" // nar row to get blue as bytes

355 "vshr.u16 q0, q0, #11 \n\t" // shi ft down to extract red

356 "vand d17, d17, d29 \n\t" // and green with green mask

357 "vand d18, d18, d28 \n\t" // and blue with blue mask

358 "vmovn.i16 d16, q0 \n\t" // nar row to get red as bytes

359 // src = {d22 (r), d23 (g), d24 (b)}

360 // dst = {d16 (r), d17 (g), d18 (b)}

361 // subtract dst from src and widen

362 "vsubl.s8 q0, d22, d16 \n\t" // sub tract red src from dst

363 "vsubl.s8 q1, d23, d17 \n\t" // sub tract green src from dst

364 "vsubl.s8 q2, d24, d18 \n\t" // sub tract blue src from dst

365 // multiply diffs by scale and shift

366 "vmul.i16 q0, q0, d6[0] \n\t" // mul tiply red by scale

367 "vmul.i16 q1, q1, d6[0] \n\t" // mul tiply blue by scale

368 "vmul.i16 q2, q2, d6[0] \n\t" // mul tiply green by scale

369 "subs %[count], %[count], #8 \n\t" // dec rement loop counter

370 "vshrn.i16 d0, q0, #8 \n\t" // shi ft down red by 8 and narrow

371 "vshrn.i16 d2, q1, #8 \n\t" // shi ft down green by 8 and narrow

372 "vshrn.i16 d4, q2, #8 \n\t" // shi ft down blue by 8 and narrow

373 // add dst to result

374 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red

375 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green

376 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue

377 // put result into 565 format

378 "vsli.i16 q2, q1, #5 \n\t" // shi ft up green and insert into blue

379 "vsli.i16 q2, q0, #11 \n\t" // shi ft up red and insert into blue

380 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // sto re result

381 "bgt 1b \n\t" // loo p if count > 0

382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)

383 : [dstart] "r" (dstart), [scale] "r" (scale)

384 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d 16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30" , "d31"

385 );

386	334

387 DITHER_565_SCAN(y);	335 uint8x8_t vdither = vld1_u8(dstart); // load dither values

	336 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value s

388	337

389 while((count & 7) > 0)	338 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne on reg

390 {	339 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask

391 SkPMColor c = *src++;

392	340

393 int dither = DITHER_VALUE(x);	341 do {

394 int sr = SkGetPackedR32(c);

395 int sg = SkGetPackedG32(c);

396 int sb = SkGetPackedB32(c);

397 sr = SkDITHER_R32To565(sr, dither);

398 sg = SkDITHER_G32To565(sg, dither);

399 sb = SkDITHER_B32To565(sb, dither);

400	342

401 uint16_t d = *dst;	343 uint8x8_t vsrc_r, vsrc_g, vsrc_b;

402 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),	344 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;

403 SkAlphaBlend(sg, SkGetPackedG16(d), scale),	345 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;

404 SkAlphaBlend(sb, SkGetPackedB16(d), scale));	346 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;

405 DITHER_INC_X(x);	347 uint16x8_t vdst;

406 count--;	348 uint16x8_t vdst_r, vdst_g, vdst_b;

	349 int16x8_t vres_r, vres_g, vres_b;

	350 int8x8_t vres8_r, vres8_g, vres8_b;

	351

	352 // Load source and add dither

	353 {

	354 register uint8x8_t d0 asm("d0");

	355 register uint8x8_t d1 asm("d1");

	356 register uint8x8_t d2 asm("d2");

	357 register uint8x8_t d3 asm("d3");

	358

	359 asm (

	360 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"

	361 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)

	362 :

	363 );

	364 vsrc_g = d1;

	365 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)

	366 vsrc_r = d2; vsrc_b = d0;

	367 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)

	368 vsrc_r = d0; vsrc_b = d2;

	369 #endif

	370 }

	371

	372 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6

	373 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5

	374 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

	375

	376 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen

	377 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an d widen

	378 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a nd widen

	379

	380 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr om result

	381 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result

	382 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue f rom result

	383

	384 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);

	385 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);

	386 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

	387

	388 // Load dst and unpack

	389 vdst = vld1q_u16(dst);

	390 vdst_g = vshrq_n_u16(vdst, 5); // shift down to ge t green

	391 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red

	392 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue

	393

	394 // subtract dst from src and widen

	395 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_ s16_u16(vdst_r));

	396 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_ s16_u16(vdst_g));

	397 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_ s16_u16(vdst_b));

	398

	399 // multiply diffs by scale and shift

	400 vres_r = vmulq_s16(vres_r, vscale);

	401 vres_g = vmulq_s16(vres_g, vscale);

	402 vres_b = vmulq_s16(vres_b, vscale);

	403

	404 vres8_r = vshrn_n_s16(vres_r, 8);

	405 vres8_g = vshrn_n_s16(vres_g, 8);

	406 vres8_b = vshrn_n_s16(vres_b, 8);

	407

	408 // add dst to result

	409 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);

	410 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);

	411 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

	412

	413 // put result into 565 format

	414 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and ins ert into blue

	415 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and inser t into blue

	416

	417 // Store result

	418 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

	419

	420 // Next iteration

	421 dst += 8;

	422 count -= 8;

	423

	424 } while (count >= 8);

	425 }

	426

	427 // Leftovers

	428 if (count > 0) {

	429 int scale = SkAlpha255To256(alpha);

	430 DITHER_565_SCAN(y);

	431 do {

	432 SkPMColor c = *src++;

	433 SkPMColorAssert(c);

	434

	435 int dither = DITHER_VALUE(x);

	436 int sr = SkGetPackedR32(c);

	437 int sg = SkGetPackedG32(c);

	438 int sb = SkGetPackedB32(c);

	439 sr = SkDITHER_R32To565(sr, dither);

	440 sg = SkDITHER_G32To565(sg, dither);

	441 sb = SkDITHER_B32To565(sb, dither);

	442

	443 uint16_t d = *dst;

	444 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),

	445 SkAlphaBlend(sg, SkGetPackedG16(d), scale),

	446 SkAlphaBlend(sb, SkGetPackedB16(d), scale));

	447 DITHER_INC_X(x);

	448 } while (--count != 0);

407 }	449 }

408 }	450 }

409	451

410 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,	452 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,

411 const SkPMColor* SK_RESTRICT src,	453 const SkPMColor* SK_RESTRICT src,

412 int count, U8CPU alpha) {	454 int count, U8CPU alpha) {

413	455

414 SkASSERT(255 == alpha);	456 SkASSERT(255 == alpha);

415 if (count > 0) {	457 if (count > 0) {

416	458

(...skipping 851 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1268 * case where we do not inspect the src alpha.	1310 * case where we do not inspect the src alpha.

1269 */	1311 */

1270 #if SK_A32_SHIFT == 24	1312 #if SK_A32_SHIFT == 24

1271 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1313 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1314 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1273 #else	1315 #else

1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1316 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1275 #endif	1317 #endif

1276 S32A_Blend_BlitRow32_arm // S32A_Blend	1318 S32A_Blend_BlitRow32_arm // S32A_Blend

1277 };	1319 };

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »