OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm.h" | 8 #include "SkBlitRow_opts_arm.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 304 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, | 315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, |
316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, | 316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, |
317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, | 317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, |
318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, | 318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, |
319 | 319 |
320 }; | 320 }; |
321 | 321 |
322 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, | 322 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, |
323 int count, U8CPU alpha, int x, int y) | 323 int count, U8CPU alpha, int x, int y) |
324 { | 324 { |
325 /* select row and offset for dither array */ | |
326 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; | |
327 | 325 |
328 /* rescale alpha to range 0 - 256 */ | 326 SkASSERT(255 > alpha); |
| 327 |
| 328 // rescale alpha to range 1 - 256 |
329 int scale = SkAlpha255To256(alpha); | 329 int scale = SkAlpha255To256(alpha); |
330 | 330 |
331 asm volatile ( | 331 if (count >= 8) { |
332 "vld1.8 {d31}, [%[dstart]] \n\t" // loa
d dither values | 332 /* select row and offset for dither array */ |
333 "vshr.u8 d30, d31, #1 \n\t" // cal
c. green dither values | 333 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
334 "vdup.16 d6, %[scale] \n\t" // dup
licate scale into neon reg | |
335 "vmov.i8 d29, #0x3f \n\t" // set
up green mask | |
336 "vmov.i8 d28, #0x1f \n\t" // set
up blue mask | |
337 "1: \n\t" | |
338 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // loa
d 8 pixels and split into argb | |
339 "vshr.u8 d22, d0, #5 \n\t" // cal
c. red >> 5 | |
340 "vshr.u8 d23, d1, #6 \n\t" // cal
c. green >> 6 | |
341 "vshr.u8 d24, d2, #5 \n\t" // cal
c. blue >> 5 | |
342 "vaddl.u8 q8, d0, d31 \n\t" // add
in dither to red and widen | |
343 "vaddl.u8 q9, d1, d30 \n\t" // add
in dither to green and widen | |
344 "vaddl.u8 q10, d2, d31 \n\t" // add
in dither to blue and widen | |
345 "vsubw.u8 q8, q8, d22 \n\t" // sub
shifted red from result | |
346 "vsubw.u8 q9, q9, d23 \n\t" // sub
shifted green from result | |
347 "vsubw.u8 q10, q10, d24 \n\t" // sub
shifted blue from result | |
348 "vshrn.i16 d22, q8, #3 \n\t" // shi
ft right and narrow to 5 bits | |
349 "vshrn.i16 d23, q9, #2 \n\t" // shi
ft right and narrow to 6 bits | |
350 "vshrn.i16 d24, q10, #3 \n\t" // shi
ft right and narrow to 5 bits | |
351 // load 8 pixels from dst, extract rgb | |
352 "vld1.16 {d0, d1}, [%[dst]] \n\t" // loa
d 8 pixels | |
353 "vshrn.i16 d17, q0, #5 \n\t" // shi
ft green down to bottom 6 bits | |
354 "vmovn.i16 d18, q0 \n\t" // nar
row to get blue as bytes | |
355 "vshr.u16 q0, q0, #11 \n\t" // shi
ft down to extract red | |
356 "vand d17, d17, d29 \n\t" // and
green with green mask | |
357 "vand d18, d18, d28 \n\t" // and
blue with blue mask | |
358 "vmovn.i16 d16, q0 \n\t" // nar
row to get red as bytes | |
359 // src = {d22 (r), d23 (g), d24 (b)} | |
360 // dst = {d16 (r), d17 (g), d18 (b)} | |
361 // subtract dst from src and widen | |
362 "vsubl.s8 q0, d22, d16 \n\t" // sub
tract red src from dst | |
363 "vsubl.s8 q1, d23, d17 \n\t" // sub
tract green src from dst | |
364 "vsubl.s8 q2, d24, d18 \n\t" // sub
tract blue src from dst | |
365 // multiply diffs by scale and shift | |
366 "vmul.i16 q0, q0, d6[0] \n\t" // mul
tiply red by scale | |
367 "vmul.i16 q1, q1, d6[0] \n\t" // mul
tiply blue by scale | |
368 "vmul.i16 q2, q2, d6[0] \n\t" // mul
tiply green by scale | |
369 "subs %[count], %[count], #8 \n\t" // dec
rement loop counter | |
370 "vshrn.i16 d0, q0, #8 \n\t" // shi
ft down red by 8 and narrow | |
371 "vshrn.i16 d2, q1, #8 \n\t" // shi
ft down green by 8 and narrow | |
372 "vshrn.i16 d4, q2, #8 \n\t" // shi
ft down blue by 8 and narrow | |
373 // add dst to result | |
374 "vaddl.s8 q0, d0, d16 \n\t" // add
dst to red | |
375 "vaddl.s8 q1, d2, d17 \n\t" // add
dst to green | |
376 "vaddl.s8 q2, d4, d18 \n\t" // add
dst to blue | |
377 // put result into 565 format | |
378 "vsli.i16 q2, q1, #5 \n\t" // shi
ft up green and insert into blue | |
379 "vsli.i16 q2, q0, #11 \n\t" // shi
ft up red and insert into blue | |
380 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // sto
re result | |
381 "bgt 1b \n\t" // loo
p if count > 0 | |
382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) | |
383 : [dstart] "r" (dstart), [scale] "r" (scale) | |
384 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d
16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30"
, "d31" | |
385 ); | |
386 | 334 |
387 DITHER_565_SCAN(y); | 335 uint8x8_t vdither = vld1_u8(dstart); // load dither values |
| 336 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither value
s |
388 | 337 |
389 while((count & 7) > 0) | 338 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into ne
on reg |
390 { | 339 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask |
391 SkPMColor c = *src++; | |
392 | 340 |
393 int dither = DITHER_VALUE(x); | 341 do { |
394 int sr = SkGetPackedR32(c); | |
395 int sg = SkGetPackedG32(c); | |
396 int sb = SkGetPackedB32(c); | |
397 sr = SkDITHER_R32To565(sr, dither); | |
398 sg = SkDITHER_G32To565(sg, dither); | |
399 sb = SkDITHER_B32To565(sb, dither); | |
400 | 342 |
401 uint16_t d = *dst; | 343 uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
402 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), | 344 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
403 SkAlphaBlend(sg, SkGetPackedG16(d), scale), | 345 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
404 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); | 346 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; |
405 DITHER_INC_X(x); | 347 uint16x8_t vdst; |
406 count--; | 348 uint16x8_t vdst_r, vdst_g, vdst_b; |
| 349 int16x8_t vres_r, vres_g, vres_b; |
| 350 int8x8_t vres8_r, vres8_g, vres8_b; |
| 351 |
| 352 // Load source and add dither |
| 353 { |
| 354 register uint8x8_t d0 asm("d0"); |
| 355 register uint8x8_t d1 asm("d1"); |
| 356 register uint8x8_t d2 asm("d2"); |
| 357 register uint8x8_t d3 asm("d3"); |
| 358 |
| 359 asm ( |
| 360 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| 361 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| 362 : |
| 363 ); |
| 364 vsrc_g = d1; |
| 365 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| 366 vsrc_r = d2; vsrc_b = d0; |
| 367 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| 368 vsrc_r = d0; vsrc_b = d2; |
| 369 #endif |
| 370 } |
| 371 |
| 372 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
| 373 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
| 374 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 |
| 375 |
| 376 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green
and widen |
| 377 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red an
d widen |
| 378 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue a
nd widen |
| 379 |
| 380 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red fr
om result |
| 381 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green
from result |
| 382 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue f
rom result |
| 383 |
| 384 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3); |
| 385 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2); |
| 386 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3); |
| 387 |
| 388 // Load dst and unpack |
| 389 vdst = vld1q_u16(dst); |
| 390 vdst_g = vshrq_n_u16(vdst, 5); // shift down to ge
t green |
| 391 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to
extract red |
| 392 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue |
| 393 |
| 394 // subtract dst from src and widen |
| 395 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_
s16_u16(vdst_r)); |
| 396 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_
s16_u16(vdst_g)); |
| 397 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_
s16_u16(vdst_b)); |
| 398 |
| 399 // multiply diffs by scale and shift |
| 400 vres_r = vmulq_s16(vres_r, vscale); |
| 401 vres_g = vmulq_s16(vres_g, vscale); |
| 402 vres_b = vmulq_s16(vres_b, vscale); |
| 403 |
| 404 vres8_r = vshrn_n_s16(vres_r, 8); |
| 405 vres8_g = vshrn_n_s16(vres_g, 8); |
| 406 vres8_b = vshrn_n_s16(vres_b, 8); |
| 407 |
| 408 // add dst to result |
| 409 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r); |
| 410 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g); |
| 411 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b); |
| 412 |
| 413 // put result into 565 format |
| 414 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and ins
ert into blue |
| 415 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and inser
t into blue |
| 416 |
| 417 // Store result |
| 418 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b)); |
| 419 |
| 420 // Next iteration |
| 421 dst += 8; |
| 422 count -= 8; |
| 423 |
| 424 } while (count >= 8); |
| 425 } |
| 426 |
| 427 // Leftovers |
| 428 if (count > 0) { |
| 429 int scale = SkAlpha255To256(alpha); |
| 430 DITHER_565_SCAN(y); |
| 431 do { |
| 432 SkPMColor c = *src++; |
| 433 SkPMColorAssert(c); |
| 434 |
| 435 int dither = DITHER_VALUE(x); |
| 436 int sr = SkGetPackedR32(c); |
| 437 int sg = SkGetPackedG32(c); |
| 438 int sb = SkGetPackedB32(c); |
| 439 sr = SkDITHER_R32To565(sr, dither); |
| 440 sg = SkDITHER_G32To565(sg, dither); |
| 441 sb = SkDITHER_B32To565(sb, dither); |
| 442 |
| 443 uint16_t d = *dst; |
| 444 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), |
| 445 SkAlphaBlend(sg, SkGetPackedG16(d), scale), |
| 446 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); |
| 447 DITHER_INC_X(x); |
| 448 } while (--count != 0); |
407 } | 449 } |
408 } | 450 } |
409 | 451 |
410 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, | 452 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
411 const SkPMColor* SK_RESTRICT src, | 453 const SkPMColor* SK_RESTRICT src, |
412 int count, U8CPU alpha) { | 454 int count, U8CPU alpha) { |
413 | 455 |
414 SkASSERT(255 == alpha); | 456 SkASSERT(255 == alpha); |
415 if (count > 0) { | 457 if (count > 0) { |
416 | 458 |
(...skipping 851 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1268 * case where we do not inspect the src alpha. | 1310 * case where we do not inspect the src alpha. |
1269 */ | 1311 */ |
1270 #if SK_A32_SHIFT == 24 | 1312 #if SK_A32_SHIFT == 24 |
1271 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1313 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1272 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1314 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1273 #else | 1315 #else |
1274 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1316 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1275 #endif | 1317 #endif |
1276 S32A_Blend_BlitRow32_arm // S32A_Blend | 1318 S32A_Blend_BlitRow32_arm // S32A_Blend |
1277 }; | 1319 }; |
OLD | NEW |