OLD | NEW |
1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // Use of this source code is governed by a BSD-style license |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // that can be found in the COPYING file in the root of the source |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
6 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
7 // | 9 // |
8 // SSE2 version of speed-critical encoding functions. | 10 // SSE2 version of speed-critical encoding functions. |
9 // | 11 // |
10 // Author: Christian Duvivier (cduvivier@google.com) | 12 // Author: Christian Duvivier (cduvivier@google.com) |
11 | 13 |
12 #include "./dsp.h" | 14 #include "./dsp.h" |
13 | 15 |
14 #if defined(__cplusplus) || defined(c_plusplus) | 16 #if defined(__cplusplus) || defined(c_plusplus) |
15 extern "C" { | 17 extern "C" { |
(...skipping 430 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
446 // -> f1 = f1 + 1 - (a3 == 0) | 448 // -> f1 = f1 + 1 - (a3 == 0) |
447 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); | 449 const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); |
448 | 450 |
449 _mm_storel_epi64((__m128i*)&out[ 0], d0); | 451 _mm_storel_epi64((__m128i*)&out[ 0], d0); |
450 _mm_storel_epi64((__m128i*)&out[ 4], g1); | 452 _mm_storel_epi64((__m128i*)&out[ 4], g1); |
451 _mm_storel_epi64((__m128i*)&out[ 8], d2); | 453 _mm_storel_epi64((__m128i*)&out[ 8], d2); |
452 _mm_storel_epi64((__m128i*)&out[12], f3); | 454 _mm_storel_epi64((__m128i*)&out[12], f3); |
453 } | 455 } |
454 } | 456 } |
455 | 457 |
// Forward Walsh-Hadamard transform of the macroblock's DC coefficients,
// SSE2 flavor. The horizontal pass is done in scalar code; the vertical
// pass processes all four columns at once with SSE2.
// NOTE(review): input layout (DC values at a stride of 16, rows of four
// spaced 64 int16's apart) is assumed from the indexing here — confirm
// against the scalar FTransformWHT and its callers.
static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
  int16_t horiz[16];
  int row;
  // Horizontal butterflies, one row of four DC values per iteration.
  for (row = 0; row < 4; ++row, in += 64) {
    const int s02 = in[0 * 16] + in[2 * 16];
    const int s13 = in[1 * 16] + in[3 * 16];
    const int d13 = in[1 * 16] - in[3 * 16];
    const int d02 = in[0 * 16] - in[2 * 16];
    horiz[4 * row + 0] = s02 + s13;
    horiz[4 * row + 1] = d02 + d13;
    horiz[4 * row + 2] = d02 - d13;
    horiz[4 * row + 3] = s02 - s13;
  }
  // Vertical butterflies on the four columns simultaneously, followed by
  // the final >>1 scaling. Saturating adds/subs keep intermediate sums
  // within the 16-bit lanes.
  {
    const __m128i r0 = _mm_loadl_epi64((__m128i*)&horiz[0]);
    const __m128i r1 = _mm_loadl_epi64((__m128i*)&horiz[4]);
    const __m128i r2 = _mm_loadl_epi64((__m128i*)&horiz[8]);
    const __m128i r3 = _mm_loadl_epi64((__m128i*)&horiz[12]);
    const __m128i sum02 = _mm_add_epi16(r0, r2);
    const __m128i sum13 = _mm_add_epi16(r1, r3);
    const __m128i dif13 = _mm_sub_epi16(r1, r3);
    const __m128i dif02 = _mm_sub_epi16(r0, r2);
    const __m128i out0 = _mm_srai_epi16(_mm_adds_epi16(sum02, sum13), 1);
    const __m128i out1 = _mm_srai_epi16(_mm_adds_epi16(dif02, dif13), 1);
    const __m128i out2 = _mm_srai_epi16(_mm_subs_epi16(dif02, dif13), 1);
    const __m128i out3 = _mm_srai_epi16(_mm_subs_epi16(sum02, sum13), 1);
    _mm_storel_epi64((__m128i*)&out[ 0], out0);
    _mm_storel_epi64((__m128i*)&out[ 4], out1);
    _mm_storel_epi64((__m128i*)&out[ 8], out2);
    _mm_storel_epi64((__m128i*)&out[12], out3);
  }
}
| 490 |
456 //------------------------------------------------------------------------------ | 491 //------------------------------------------------------------------------------ |
457 // Metric | 492 // Metric |
458 | 493 |
459 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, | 494 static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, |
460 int num_quads, int do_16) { | 495 int num_quads, int do_16) { |
461 const __m128i zero = _mm_setzero_si128(); | 496 const __m128i zero = _mm_setzero_si128(); |
462 __m128i sum1 = zero; | 497 __m128i sum1 = zero; |
463 __m128i sum2 = zero; | 498 __m128i sum2 = zero; |
464 | 499 |
465 while (num_quads-- > 0) { | 500 while (num_quads-- > 0) { |
(...skipping 446 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
912 // Entry point | 947 // Entry point |
913 | 948 |
914 extern void VP8EncDspInitSSE2(void); | 949 extern void VP8EncDspInitSSE2(void); |
915 | 950 |
// Installs the SSE2 implementations from this file into the encoder's
// dsp function-pointer table. Compiles to a no-op when WEBP_USE_SSE2 is
// not defined, so it is always safe to call from the generic init path.
void VP8EncDspInitSSE2(void) {
#if defined(WEBP_USE_SSE2)
  VP8CollectHistogram = CollectHistogramSSE2;
  VP8EncQuantizeBlock = QuantizeBlockSSE2;
  VP8ITransform = ITransformSSE2;
  VP8FTransform = FTransformSSE2;
  VP8FTransformWHT = FTransformWHTSSE2;
  VP8SSE16x16 = SSE16x16SSE2;
  VP8SSE16x8 = SSE16x8SSE2;
  VP8SSE8x8 = SSE8x8SSE2;
  VP8SSE4x4 = SSE4x4SSE2;
  VP8TDisto4x4 = Disto4x4SSE2;
  VP8TDisto16x16 = Disto16x16SSE2;
#endif   // WEBP_USE_SSE2
}
930 | 966 |
931 #if defined(__cplusplus) || defined(c_plusplus) | 967 #if defined(__cplusplus) || defined(c_plusplus) |
932 } // extern "C" | 968 } // extern "C" |
933 #endif | 969 #endif |
OLD | NEW |