| OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // This code is licensed under the same terms as WebM: | 3 // Use of this source code is governed by a BSD-style license |
| 4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // that can be found in the COPYING file in the root of the source |
| 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // tree. An additional intellectual property rights grant can be found |
| | 6 // in the file PATENTS. All contributing project authors may |
| | 7 // be found in the AUTHORS file in the root of the source tree. |
| 6 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 7 // | 9 // |
| 8 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
| 9 // | 11 // |
| 10 // adapted from libvpx (http://www.webmproject.org/code/) | 12 // adapted from libvpx (http://www.webmproject.org/code/) |
| 11 | 13 |
| 12 #include "./dsp.h" | 14 #include "./dsp.h" |
| 13 | 15 |
| 14 #if defined(__cplusplus) || defined(c_plusplus) | 16 #if defined(__cplusplus) || defined(c_plusplus) |
| 15 extern "C" { | 17 extern "C" { |
| (...skipping 299 matching lines...) |
| 315 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 | 317 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 |
| 316 | 318 |
| 317 "vceq.s16 d4, d7, #0 \n" | 319 "vceq.s16 d4, d7, #0 \n" |
| 318 | 320 |
| 319 "vshr.s16 d0, d0, #4 \n" | 321 "vshr.s16 d0, d0, #4 \n" |
| 320 "vshr.s16 d2, d2, #4 \n" | 322 "vshr.s16 d2, d2, #4 \n" |
| 321 | 323 |
| 322 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 | 324 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 |
| 323 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 | 325 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 |
| 324 | 326 |
| 325 "vmvn.s16 d4, d4 \n" | 327 "vmvn d4, d4 \n" // !(d1 == 0) |
| 326 // op[4] = (c1*2217 + d1*5352 + 12000)>>16 | 328 // op[4] = (c1*2217 + d1*5352 + 12000)>>16 |
| 327 "vshrn.s32 d1, q11, #16 \n" | 329 "vshrn.s32 d1, q11, #16 \n" |
| 328 // op[4] += (d1!=0) | 330 // op[4] += (d1!=0) |
| 329 "vsub.s16 d1, d1, d4 \n" | 331 "vsub.s16 d1, d1, d4 \n" |
| 330 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 | 332 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 |
| 331 "vshrn.s32 d3, q12, #16 \n" | 333 "vshrn.s32 d3, q12, #16 \n" |
| 332 | 334 |
| 333 // set result to out array | 335 // set result to out array |
| 334 "vst1.16 {q0, q1}, [%[out]] \n" | 336 "vst1.16 {q0, q1}, [%[out]] \n" |
| 335 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), | 337 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), |
| (...skipping 20 matching lines...) |
| 356 "vld1.16 d3[1], [%[in]], %[kStep] \n" | 358 "vld1.16 d3[1], [%[in]], %[kStep] \n" |
| 357 "vld1.16 d0[2], [%[in]], %[kStep] \n" | 359 "vld1.16 d0[2], [%[in]], %[kStep] \n" |
| 358 "vld1.16 d1[2], [%[in]], %[kStep] \n" | 360 "vld1.16 d1[2], [%[in]], %[kStep] \n" |
| 359 "vld1.16 d2[2], [%[in]], %[kStep] \n" | 361 "vld1.16 d2[2], [%[in]], %[kStep] \n" |
| 360 "vld1.16 d3[2], [%[in]], %[kStep] \n" | 362 "vld1.16 d3[2], [%[in]], %[kStep] \n" |
| 361 "vld1.16 d0[3], [%[in]], %[kStep] \n" | 363 "vld1.16 d0[3], [%[in]], %[kStep] \n" |
| 362 "vld1.16 d1[3], [%[in]], %[kStep] \n" | 364 "vld1.16 d1[3], [%[in]], %[kStep] \n" |
| 363 "vld1.16 d2[3], [%[in]], %[kStep] \n" | 365 "vld1.16 d2[3], [%[in]], %[kStep] \n" |
| 364 "vld1.16 d3[3], [%[in]], %[kStep] \n" | 366 "vld1.16 d3[3], [%[in]], %[kStep] \n" |
| 365 | 367 |
| 366 "vaddl.s16 q2, d0, d2 \n" | 368 "vaddl.s16 q2, d0, d2 \n" // a0=(in[0*16]+in[2*16]) |
| 367 "vshl.s32 q2, q2, #2 \n" // a0=(in[0*16]+in[2*16])<<2 | 369 "vaddl.s16 q3, d1, d3 \n" // a1=(in[1*16]+in[3*16]) |
| 368 "vaddl.s16 q3, d1, d3 \n" | 370 "vsubl.s16 q4, d1, d3 \n" // a2=(in[1*16]-in[3*16]) |
| 369 "vshl.s32 q3, q3, #2 \n" // a1=(in[1*16]+in[3*16])<<2 | 371 "vsubl.s16 q5, d0, d2 \n" // a3=(in[0*16]-in[2*16]) |
| 370 "vsubl.s16 q4, d1, d3 \n" | |
| 371 "vshl.s32 q4, q4, #2 \n" // a2=(in[1*16]-in[3*16])<<2 | |
| 372 "vsubl.s16 q5, d0, d2 \n" | |
| 373 "vshl.s32 q5, q5, #2 \n" // a3=(in[0*16]-in[2*16])<<2 | |
| 374 | 372 |
| 375 "vceq.s32 q10, q2, #0 \n" | 373 "vqadd.s32 q6, q2, q3 \n" // a0 + a1 |
| 376 "vmvn.s32 q10, q10 \n" // (a0 != 0) | |
| 377 "vqadd.s32 q6, q2, q3 \n" // (a0 + a1) | |
| 378 "vqsub.s32 q6, q6, q10 \n" // (a0 + a1) + (a0 != 0) | |
| 379 "vqadd.s32 q7, q5, q4 \n" // a3 + a2 | 374 "vqadd.s32 q7, q5, q4 \n" // a3 + a2 |
| 380 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 | 375 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 |
| 381 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 | 376 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 |
| 382 | 377 |
| 383 // Transpose | 378 // Transpose |
| 384 // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7] | 379 // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7] |
| 385 // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15] | 380 // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15] |
| 386 "vswp d13, d16 \n" // vtrn.64 q0, q2 | 381 "vswp d13, d16 \n" // vtrn.64 q0, q2 |
| 387 "vswp d15, d18 \n" // vtrn.64 q1, q3 | 382 "vswp d15, d18 \n" // vtrn.64 q1, q3 |
| 388 "vtrn.32 q6, q7 \n" | 383 "vtrn.32 q6, q7 \n" |
| 389 "vtrn.32 q8, q9 \n" | 384 "vtrn.32 q8, q9 \n" |
| 390 | 385 |
| 391 "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8] | 386 "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8] |
| 392 "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12] | 387 "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12] |
| 393 "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12] | 388 "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12] |
| 394 "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8] | 389 "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8] |
| 395 | 390 |
| 396 "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1 | 391 "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1 |
| 397 "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2 | 392 "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2 |
| 398 "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 | 393 "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 |
| 399 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 | 394 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 |
| 400 | 395 |
| 401 "vmov.s32 q0, #3 \n" // q0 = 3 | 396 "vshrn.s32 d18, q4, #1 \n" // b0 >> 1 |
| 402 | 397 "vshrn.s32 d19, q5, #1 \n" // b1 >> 1 |
| 403 "vcgt.s32 q1, q4, #0 \n" // (b0>0) | 398 "vshrn.s32 d20, q6, #1 \n" // b2 >> 1 |
| 404 "vqsub.s32 q2, q4, q1 \n" // (b0+(b0>0)) | 399 "vshrn.s32 d21, q7, #1 \n" // b3 >> 1 |
| 405 "vqadd.s32 q3, q2, q0 \n" // (b0+(b0>0)+3) | |
| 406 "vshrn.s32 d18, q3, #3 \n" // (b0+(b0>0)+3) >> 3 | |
| 407 | |
| 408 "vcgt.s32 q1, q5, #0 \n" // (b1>0) | |
| 409 "vqsub.s32 q2, q5, q1 \n" // (b1+(b1>0)) | |
| 410 "vqadd.s32 q3, q2, q0 \n" // (b1+(b1>0)+3) | |
| 411 "vshrn.s32 d19, q3, #3 \n" // (b1+(b1>0)+3) >> 3 | |
| 412 | |
| 413 "vcgt.s32 q1, q6, #0 \n" // (b2>0) | |
| 414 "vqsub.s32 q2, q6, q1 \n" // (b2+(b2>0)) | |
| 415 "vqadd.s32 q3, q2, q0 \n" // (b2+(b2>0)+3) | |
| 416 "vshrn.s32 d20, q3, #3 \n" // (b2+(b2>0)+3) >> 3 | |
| 417 | |
| 418 "vcgt.s32 q1, q7, #0 \n" // (b3>0) | |
| 419 "vqsub.s32 q2, q7, q1 \n" // (b3+(b3>0)) | |
| 420 "vqadd.s32 q3, q2, q0 \n" // (b3+(b3>0)+3) | |
| 421 "vshrn.s32 d21, q3, #3 \n" // (b3+(b3>0)+3) >> 3 | |
| 422 | 400 |
| 423 "vst1.16 {q9, q10}, [%[out]] \n" | 401 "vst1.16 {q9, q10}, [%[out]] \n" |
| 424 | 402 |
| 425 : [in] "+r"(in) | 403 : [in] "+r"(in) |
| 426 : [kStep] "r"(kStep), [out] "r"(out) | 404 : [kStep] "r"(kStep), [out] "r"(out) |
| 427 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", | 405 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", |
| 428 "q6", "q7", "q8", "q9", "q10" // clobbered | 406 "q6", "q7", "q8", "q9", "q10" // clobbered |
| 429 ) ; | 407 ) ; |
| 430 } | 408 } |
| 431 | 409 |
| (...skipping 220 matching lines...) |
| 652 VP8FTransformWHT = FTransformWHT; | 630 VP8FTransformWHT = FTransformWHT; |
| 653 | 631 |
| 654 VP8TDisto4x4 = Disto4x4; | 632 VP8TDisto4x4 = Disto4x4; |
| 655 VP8TDisto16x16 = Disto16x16; | 633 VP8TDisto16x16 = Disto16x16; |
| 656 #endif // WEBP_USE_NEON | 634 #endif // WEBP_USE_NEON |
| 657 } | 635 } |
| 658 | 636 |
| 659 #if defined(__cplusplus) || defined(c_plusplus) | 637 #if defined(__cplusplus) || defined(c_plusplus) |
| 660 } // extern "C" | 638 } // extern "C" |
| 661 #endif | 639 #endif |
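
For reference, the first hunk only touches the tail of the 4x4 forward transform: the `vceq`/`vmvn`/`vsub` triple is a branch-free way of applying the `op[4] += (d1 != 0)` correction, and the change simply drops the data-type suffix on `vmvn` (a bitwise NOT, where the suffix carries no meaning) and documents the resulting mask. A minimal one-lane sketch of the trick, with a hypothetical scalar helper standing in for one 16-bit lane:

```c
#include <stdint.h>

/* One 16-bit lane of the vceq/vmvn/vsub sequence used to compute
 *   op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0),
 * following the variable names used in the asm comments above. */
static int16_t RoundOp4(int c1, int d1) {
  int16_t op4 = (int16_t)((c1 * 2217 + d1 * 5352 + 12000) >> 16);
  int16_t mask = (d1 == 0) ? -1 : 0;  /* vceq.s16: all-ones where d1 == 0 */
  mask = (int16_t)~mask;              /* vmvn:     all-ones where d1 != 0 */
  op4 -= mask;                        /* vsub.s16: subtracting -1 adds 1  */
  return op4;
}
```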
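
The FTransformWHT hunk is the substantive change: the rewritten asm drops the `<<2` pre-scaling and the `(a0 != 0)` correction from the first pass, and replaces the final `(b + (b > 0) + 3) >> 3` rounding with a plain `b >> 1`. Below is a plain-C model of what the NEW column computes, assuming the usual `FTransformWHT(const int16_t* in, int16_t* out)` signature and a `kStep` of 16 coefficients (both taken from context, not shown in this hunk); the saturating `vqadd`/`vqsub` ops are modeled as ordinary integer adds:

```c
#include <stdint.h>

/* Plain-C model of the rewritten NEON FTransformWHT (NEW column). */
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {        /* 4 groups of vld1.16 lane loads */
    const int a0 = in[0 * 16] + in[2 * 16];  /* vaddl.s16 q2 */
    const int a1 = in[1 * 16] + in[3 * 16];  /* vaddl.s16 q3 */
    const int a2 = in[1 * 16] - in[3 * 16];  /* vsubl.s16 q4 */
    const int a3 = in[0 * 16] - in[2 * 16];  /* vsubl.s16 q5 */
    tmp[0 + i * 4] = a0 + a1;                /* no '(a0 != 0)' correction anymore */
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i) {                  /* second pass, after the transpose */
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    out[ 0 + i] = (int16_t)((a0 + a1) >> 1); /* vshrn.s32 #1 replaces the old  */
    out[ 4 + i] = (int16_t)((a3 + a2) >> 1); /* '(b + (b > 0) + 3) >> 3' round */
    out[ 8 + i] = (int16_t)((a3 - a2) >> 1);
    out[12 + i] = (int16_t)((a0 - a1) >> 1);
  }
}
```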