// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif

#include "remoting/host/differ_block.h"
#include "remoting/host/differ_block_internal.h"

namespace remoting {

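// Compares two blocks row by row and returns 1 as soon as any byte differs,
// or 0 if the blocks are identical. Each iteration handles one 64-byte row
// (16 pixels, assuming 32 bits per pixel, hence the _W16 suffix).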
extern int BlockDifference_SSE2_W16(const uint8* image1, const uint8* image2,
                                    int stride) {
  __m128i acc = _mm_setzero_si128();
  __m128i v0;
  __m128i v1;
  __m128i sad;
  for (int y = 0; y < kBlockSize; ++y) {
    const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
    const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
    // Accumulate the sum of absolute differences of the 64-byte row,
    // 16 bytes at a time.
    v0 = _mm_loadu_si128(i1);
    v1 = _mm_loadu_si128(i2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 1);
    v1 = _mm_loadu_si128(i2 + 1);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 2);
    v1 = _mm_loadu_si128(i2 + 2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 3);
    v1 = _mm_loadu_si128(i2 + 3);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);

    // This essentially means sad = acc >> 64. Each _mm_sad_epu8 above leaves
    // its partial sum in the low 16 bits of a 64-bit half of acc, so after
    // adding the two halves we only care about the lower 16 bits.
    sad = _mm_shuffle_epi32(acc, 0xEE);
    sad = _mm_adds_epu16(sad, acc);
    int diff = _mm_cvtsi128_si32(sad);
    if (diff)
      return 1;
    image1 += stride;
    image2 += stride;
  }
  return 0;
}

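// Same as BlockDifference_SSE2_W16, but each row is 128 bytes (32 pixels,
// assuming 32 bits per pixel).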
extern int BlockDifference_SSE2_W32(const uint8* image1, const uint8* image2,
                                    int stride) {
  __m128i acc = _mm_setzero_si128();
  __m128i v0;
  __m128i v1;
  __m128i sad;
  for (int y = 0; y < kBlockSize; ++y) {
    const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
    const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
    // Accumulate the sum of absolute differences of the 128-byte row,
    // 16 bytes at a time.
    v0 = _mm_loadu_si128(i1);
    v1 = _mm_loadu_si128(i2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 1);
    v1 = _mm_loadu_si128(i2 + 1);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 2);
    v1 = _mm_loadu_si128(i2 + 2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 3);
    v1 = _mm_loadu_si128(i2 + 3);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 4);
    v1 = _mm_loadu_si128(i2 + 4);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 5);
    v1 = _mm_loadu_si128(i2 + 5);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 6);
    v1 = _mm_loadu_si128(i2 + 6);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 7);
    v1 = _mm_loadu_si128(i2 + 7);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);

    // This essentially means sad = acc >> 64. Each _mm_sad_epu8 above leaves
    // its partial sum in the low 16 bits of a 64-bit half of acc, so after
    // adding the two halves we only care about the lower 16 bits.
    sad = _mm_shuffle_epi32(acc, 0xEE);
    sad = _mm_adds_epu16(sad, acc);
    int diff = _mm_cvtsi128_si32(sad);
    if (diff)
      return 1;
    image1 += stride;
    image2 += stride;
  }
  return 0;
}

}  // namespace remoting
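
// A minimal usage sketch, assuming kBlockSize is 16 (so a _W16 block is
// 16 rows of 64 bytes) and the two blocks are stored contiguously with a
// stride of 64 bytes:
//
//   uint8 a[16 * 64] = {0};
//   uint8 b[16 * 64] = {0};
//   remoting::BlockDifference_SSE2_W16(a, b, 64);  // Returns 0: identical.
//   b[100] = 0xff;
//   remoting::BlockDifference_SSE2_W16(a, b, 64);  // Returns 1: they differ.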