OLD | NEW |
1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // Use of this source code is governed by a BSD-style license |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // that can be found in the COPYING file in the root of the source |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
6 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
7 // | 9 // |
8 // ARM NEON version of speed-critical encoding functions. | 10 // ARM NEON version of speed-critical encoding functions. |
9 // | 11 // |
10 // adapted from libvpx (http://www.webmproject.org/code/) | 12 // adapted from libvpx (http://www.webmproject.org/code/) |
11 | 13 |
12 #include "./dsp.h" | 14 #include "./dsp.h" |
13 | 15 |
14 #if defined(__cplusplus) || defined(c_plusplus) | 16 #if defined(__cplusplus) || defined(c_plusplus) |
15 extern "C" { | 17 extern "C" { |
(...skipping 299 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
315 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 | 317 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 |
316 | 318 |
317 "vceq.s16 d4, d7, #0 \n" | 319 "vceq.s16 d4, d7, #0 \n" |
318 | 320 |
319 "vshr.s16 d0, d0, #4 \n" | 321 "vshr.s16 d0, d0, #4 \n" |
320 "vshr.s16 d2, d2, #4 \n" | 322 "vshr.s16 d2, d2, #4 \n" |
321 | 323 |
322 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 | 324 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 |
323 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 | 325 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 |
324 | 326 |
325 "vmvn.s16 d4, d4 \n" | 327 "vmvn d4, d4 \n" // !(d1 == 0) |
326 // op[4] = (c1*2217 + d1*5352 + 12000)>>16 | 328 // op[4] = (c1*2217 + d1*5352 + 12000)>>16 |
327 "vshrn.s32 d1, q11, #16 \n" | 329 "vshrn.s32 d1, q11, #16 \n" |
328 // op[4] += (d1!=0) | 330 // op[4] += (d1!=0) |
329 "vsub.s16 d1, d1, d4 \n" | 331 "vsub.s16 d1, d1, d4 \n" |
330 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 | 332 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 |
331 "vshrn.s32 d3, q12, #16 \n" | 333 "vshrn.s32 d3, q12, #16 \n" |
332 | 334 |
333 // set result to out array | 335 // set result to out array |
334 "vst1.16 {q0, q1}, [%[out]] \n" | 336 "vst1.16 {q0, q1}, [%[out]] \n" |
335 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), | 337 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), |
(...skipping 20 matching lines...) Expand all Loading... |
356 "vld1.16 d3[1], [%[in]], %[kStep] \n" | 358 "vld1.16 d3[1], [%[in]], %[kStep] \n" |
357 "vld1.16 d0[2], [%[in]], %[kStep] \n" | 359 "vld1.16 d0[2], [%[in]], %[kStep] \n" |
358 "vld1.16 d1[2], [%[in]], %[kStep] \n" | 360 "vld1.16 d1[2], [%[in]], %[kStep] \n" |
359 "vld1.16 d2[2], [%[in]], %[kStep] \n" | 361 "vld1.16 d2[2], [%[in]], %[kStep] \n" |
360 "vld1.16 d3[2], [%[in]], %[kStep] \n" | 362 "vld1.16 d3[2], [%[in]], %[kStep] \n" |
361 "vld1.16 d0[3], [%[in]], %[kStep] \n" | 363 "vld1.16 d0[3], [%[in]], %[kStep] \n" |
362 "vld1.16 d1[3], [%[in]], %[kStep] \n" | 364 "vld1.16 d1[3], [%[in]], %[kStep] \n" |
363 "vld1.16 d2[3], [%[in]], %[kStep] \n" | 365 "vld1.16 d2[3], [%[in]], %[kStep] \n" |
364 "vld1.16 d3[3], [%[in]], %[kStep] \n" | 366 "vld1.16 d3[3], [%[in]], %[kStep] \n" |
365 | 367 |
366 "vaddl.s16 q2, d0, d2 \n" | 368 "vaddl.s16 q2, d0, d2 \n" // a0=(in[0*16]+in[2*16]) |
367 "vshl.s32 q2, q2, #2 \n" // a0=(in[0*16]+in[2*16])<<2 | 369 "vaddl.s16 q3, d1, d3 \n" // a1=(in[1*16]+in[3*16]) |
368 "vaddl.s16 q3, d1, d3 \n" | 370 "vsubl.s16 q4, d1, d3 \n" // a2=(in[1*16]-in[3*16]) |
369 "vshl.s32 q3, q3, #2 \n" // a1=(in[1*16]+in[3*16])<<2 | 371 "vsubl.s16 q5, d0, d2 \n" // a3=(in[0*16]-in[2*16]) |
370 "vsubl.s16 q4, d1, d3 \n" | |
371 "vshl.s32 q4, q4, #2 \n" // a2=(in[1*16]-in[3*16])<<2 | |
372 "vsubl.s16 q5, d0, d2 \n" | |
373 "vshl.s32 q5, q5, #2 \n" // a3=(in[0*16]-in[2*16])<<2 | |
374 | 372 |
375 "vceq.s32 q10, q2, #0 \n" | 373 "vqadd.s32 q6, q2, q3 \n" // a0 + a1 |
376 "vmvn.s32 q10, q10 \n" // (a0 != 0) | |
377 "vqadd.s32 q6, q2, q3 \n" // (a0 + a1) | |
378 "vqsub.s32 q6, q6, q10 \n" // (a0 + a1) + (a0 != 0) | |
379 "vqadd.s32 q7, q5, q4 \n" // a3 + a2 | 374 "vqadd.s32 q7, q5, q4 \n" // a3 + a2 |
380 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 | 375 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 |
381 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 | 376 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 |
382 | 377 |
383 // Transpose | 378 // Transpose |
384 // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7] | 379 // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7] |
385 // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15] | 380 // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15] |
386 "vswp d13, d16 \n" // vtrn.64 q6, q8 | 381 "vswp d13, d16 \n" // vtrn.64 q6, q8 |
387 "vswp d15, d18 \n" // vtrn.64 q7, q9 | 382 "vswp d15, d18 \n" // vtrn.64 q7, q9 |
388 "vtrn.32 q6, q7 \n" | 383 "vtrn.32 q6, q7 \n" |
389 "vtrn.32 q8, q9 \n" | 384 "vtrn.32 q8, q9 \n" |
390 | 385 |
391 "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8] | 386 "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8] |
392 "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12] | 387 "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12] |
393 "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12] | 388 "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12] |
394 "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8] | 389 "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8] |
395 | 390 |
396 "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1 | 391 "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1 |
397 "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2 | 392 "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2 |
398 "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 | 393 "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 |
399 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 | 394 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 |
400 | 395 |
401 "vmov.s32 q0, #3 \n" // q0 = 3 | 396 "vshrn.s32 d18, q4, #1 \n" // b0 >> 1 |
402 | 397 "vshrn.s32 d19, q5, #1 \n" // b1 >> 1 |
403 "vcgt.s32 q1, q4, #0 \n" // (b0>0) | 398 "vshrn.s32 d20, q6, #1 \n" // b2 >> 1 |
404 "vqsub.s32 q2, q4, q1 \n" // (b0+(b0>0)) | 399 "vshrn.s32 d21, q7, #1 \n" // b3 >> 1 |
405 "vqadd.s32 q3, q2, q0 \n" // (b0+(b0>0)+3) | |
406 "vshrn.s32 d18, q3, #3 \n" // (b0+(b0>0)+3) >> 3 | |
407 | |
408 "vcgt.s32 q1, q5, #0 \n" // (b1>0) | |
409 "vqsub.s32 q2, q5, q1 \n" // (b1+(b1>0)) | |
410 "vqadd.s32 q3, q2, q0 \n" // (b1+(b1>0)+3) | |
411 "vshrn.s32 d19, q3, #3 \n" // (b1+(b1>0)+3) >> 3 | |
412 | |
413 "vcgt.s32 q1, q6, #0 \n" // (b2>0) | |
414 "vqsub.s32 q2, q6, q1 \n" // (b2+(b2>0)) | |
415 "vqadd.s32 q3, q2, q0 \n" // (b2+(b2>0)+3) | |
416 "vshrn.s32 d20, q3, #3 \n" // (b2+(b2>0)+3) >> 3 | |
417 | |
418 "vcgt.s32 q1, q7, #0 \n" // (b3>0) | |
419 "vqsub.s32 q2, q7, q1 \n" // (b3+(b3>0)) | |
420 "vqadd.s32 q3, q2, q0 \n" // (b3+(b3>0)+3) | |
421 "vshrn.s32 d21, q3, #3 \n" // (b3+(b3>0)+3) >> 3 | |
422 | 400 |
423 "vst1.16 {q9, q10}, [%[out]] \n" | 401 "vst1.16 {q9, q10}, [%[out]] \n" |
424 | 402 |
425 : [in] "+r"(in) | 403 : [in] "+r"(in) |
426 : [kStep] "r"(kStep), [out] "r"(out) | 404 : [kStep] "r"(kStep), [out] "r"(out) |
427 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", | 405 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", |
428 "q6", "q7", "q8", "q9", "q10" // clobbered | 406 "q6", "q7", "q8", "q9", "q10" // clobbered |
429 ) ; | 407 ) ; |
430 } | 408 } |
431 | 409 |
(...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
652 VP8FTransformWHT = FTransformWHT; | 630 VP8FTransformWHT = FTransformWHT; |
653 | 631 |
654 VP8TDisto4x4 = Disto4x4; | 632 VP8TDisto4x4 = Disto4x4; |
655 VP8TDisto16x16 = Disto16x16; | 633 VP8TDisto16x16 = Disto16x16; |
656 #endif // WEBP_USE_NEON | 634 #endif // WEBP_USE_NEON |
657 } | 635 } |
658 | 636 |
659 #if defined(__cplusplus) || defined(c_plusplus) | 637 #if defined(__cplusplus) || defined(c_plusplus) |
660 } // extern "C" | 638 } // extern "C" |
661 #endif | 639 #endif |
OLD | NEW |