// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif

#include "remoting/host/differ_block.h"
#include "remoting/host/differ_block_internal.h"

namespace remoting {

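// Compares a 16-pixel-wide block (64 bytes per row, assuming 4 bytes per
// pixel) across kBlockSize rows. Returns 1 as soon as a differing row is
// found, 0 if the two blocks are identical.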
extern int BlockDifference_SSE2_W16(const uint8* image1, const uint8* image2,
                                    int stride) {
  __m128i acc = _mm_setzero_si128();
  __m128i v0;
  __m128i v1;
  __m128i sad;
  for (int y = 0; y < kBlockSize; ++y) {
    const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
    const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
    v0 = _mm_loadu_si128(i1);
    v1 = _mm_loadu_si128(i2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 1);
    v1 = _mm_loadu_si128(i2 + 1);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 2);
    v1 = _mm_loadu_si128(i2 + 2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 3);
    v1 = _mm_loadu_si128(i2 + 3);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);

    // This essentially means sad = acc >> 64; the add below folds the upper
    // 64-bit SAD sum into the lower one. We only care about the lower 16
    // bits.
    sad = _mm_shuffle_epi32(acc, 0xEE);
    sad = _mm_adds_epu16(sad, acc);
    int diff = _mm_cvtsi128_si32(sad);
    if (diff)
      return 1;
    image1 += stride;
    image2 += stride;
  }
  return 0;
}

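// Same as above for a 32-pixel-wide block (128 bytes per row, assuming 4
// bytes per pixel), with each row comparison unrolled to eight 16-byte loads.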
extern int BlockDifference_SSE2_W32(const uint8* image1, const uint8* image2,
                                    int stride) {
  __m128i acc = _mm_setzero_si128();
  __m128i v0;
  __m128i v1;
  __m128i sad;
  for (int y = 0; y < kBlockSize; ++y) {
    const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
    const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
    v0 = _mm_loadu_si128(i1);
    v1 = _mm_loadu_si128(i2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 1);
    v1 = _mm_loadu_si128(i2 + 1);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 2);
    v1 = _mm_loadu_si128(i2 + 2);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 3);
    v1 = _mm_loadu_si128(i2 + 3);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 4);
    v1 = _mm_loadu_si128(i2 + 4);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 5);
    v1 = _mm_loadu_si128(i2 + 5);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 6);
    v1 = _mm_loadu_si128(i2 + 6);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);
    v0 = _mm_loadu_si128(i1 + 7);
    v1 = _mm_loadu_si128(i2 + 7);
    sad = _mm_sad_epu8(v0, v1);
    acc = _mm_adds_epu16(acc, sad);

    // This essentially means sad = acc >> 64; the add below folds the upper
    // 64-bit SAD sum into the lower one. We only care about the lower 16
    // bits.
    sad = _mm_shuffle_epi32(acc, 0xEE);
    sad = _mm_adds_epu16(sad, acc);
    int diff = _mm_cvtsi128_si32(sad);
    if (diff)
      return 1;
    image1 += stride;
    image2 += stride;
  }
  return 0;
}

}  // namespace remoting
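
// A minimal sketch of how a caller could dispatch to these helpers. The
// wrapper name, its width parameter, and the assumption that the caller has
// already verified SSE2 support are illustrative only and not part of this
// file.
//
//   int BlockDifferenceForWidth(const uint8* image1, const uint8* image2,
//                               int stride, int width_pixels) {
//     if (width_pixels == 32)
//       return BlockDifference_SSE2_W32(image1, image2, stride);
//     if (width_pixels == 16)
//       return BlockDifference_SSE2_W16(image1, image2, stride);
//     return -1;  // Unsupported width; a real caller would fall back to C.
//   }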