OLD | NEW |
| (Empty) |
1 // Copyright 2011 Google Inc. | |
2 // | |
3 // This code is licensed under the same terms as WebM: | |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | |
6 // ----------------------------------------------------------------------------- | |
7 // | |
8 // speed-critical functions. | |
9 // | |
10 // Author: Skal (pascal.massimino@gmail.com) | |
11 | |
12 #include <assert.h> | |
13 #include "vp8enci.h" | |
14 | |
15 #if defined(__cplusplus) || defined(c_plusplus) | |
16 extern "C" { | |
17 #endif | |
18 | |
19 //------------------------------------------------------------------------------ | |
20 // Compute susceptibility based on DCT-coeff histograms: | |
21 // the higher, the "easier" the macroblock is to compress. | |
22 | |
// Clamps 'alpha' to the [0, 255] range.
static int ClipAlpha(int alpha) {
  if (alpha < 0) return 0;
  if (alpha > 255) return 255;
  return alpha;
}
26 | |
27 int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) { | |
28 int num = 0, den = 0, val = 0; | |
29 int k; | |
30 int alpha; | |
31 // note: changing this loop to avoid the numerous "k + 1" slows things down. | |
32 for (k = 0; k < MAX_COEFF_THRESH; ++k) { | |
33 if (histo[k + 1]) { | |
34 val += histo[k + 1]; | |
35 num += val * (k + 1); | |
36 den += (k + 1) * (k + 1); | |
37 } | |
38 } | |
39 // we scale the value to a usable [0..255] range | |
40 alpha = den ? 10 * num / den - 5 : 0; | |
41 return ClipAlpha(alpha); | |
42 } | |
43 | |
44 static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, | |
45 int start_block, int end_block) { | |
46 int histo[MAX_COEFF_THRESH + 1] = { 0 }; | |
47 int16_t out[16]; | |
48 int j, k; | |
49 for (j = start_block; j < end_block; ++j) { | |
50 VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out); | |
51 | |
52 // Convert coefficients to bin (within out[]). | |
53 for (k = 0; k < 16; ++k) { | |
54 const int v = abs(out[k]) >> 2; | |
55 out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v; | |
56 } | |
57 | |
58 // Use bin to update histogram. | |
59 for (k = 0; k < 16; ++k) { | |
60 histo[out[k]]++; | |
61 } | |
62 } | |
63 | |
64 return VP8GetAlpha(histo); | |
65 } | |
66 | |
67 //------------------------------------------------------------------------------ | |
68 // run-time tables (~4k) | |
69 | |
// clip1[255 + i] holds i clipped to [0,255], for i in [-255,510].
static uint8_t clip1[255 + 510 + 1];

// Set to 1 once clip1[] has been filled. It is declared 'volatile' with the
// intent of having it written _last_, after the table contents.
// NOTE(review): 'volatile' alone gives no inter-thread ordering guarantee in
// ISO C; confirm that initialization cannot race with table readers.
static volatile int tables_ok = 0;

// Fills clip1[] on the first call; later calls return immediately.
static void InitTables(void) {
  int i;
  if (tables_ok) return;
  for (i = -255; i <= 255 + 255; ++i) {
    uint8_t clipped;
    if (i < 0) {
      clipped = 0;
    } else if (i > 255) {
      clipped = 255;
    } else {
      clipped = (uint8_t)i;
    }
    clip1[255 + i] = clipped;
  }
  tables_ok = 1;
}
85 | |
// Saturates 'v' to [0, 255]. The mask test is the fast path: it succeeds
// exactly when v already fits in 8 bits.
static inline uint8_t clip_8b(int v) {
  if ((v & ~0xff) == 0) {
    return (uint8_t)v;
  }
  return (v < 0) ? 0 : 255;
}
89 | |
90 //------------------------------------------------------------------------------ | |
91 // Transforms (Paragraph 14.4) | |
92 | |
93 #define STORE(x, y, v) \ | |
94 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) | |
95 | |
96 static const int kC1 = 20091 + (1 << 16); | |
97 static const int kC2 = 35468; | |
98 #define MUL(a, b) (((a) * (b)) >> 16) | |
99 | |
100 static inline void ITransformOne(const uint8_t* ref, const int16_t* in, | |
101 uint8_t* dst) { | |
102 int C[4 * 4], *tmp; | |
103 int i; | |
104 tmp = C; | |
105 for (i = 0; i < 4; ++i) { // vertical pass | |
106 const int a = in[0] + in[8]; | |
107 const int b = in[0] - in[8]; | |
108 const int c = MUL(in[4], kC2) - MUL(in[12], kC1); | |
109 const int d = MUL(in[4], kC1) + MUL(in[12], kC2); | |
110 tmp[0] = a + d; | |
111 tmp[1] = b + c; | |
112 tmp[2] = b - c; | |
113 tmp[3] = a - d; | |
114 tmp += 4; | |
115 in++; | |
116 } | |
117 | |
118 tmp = C; | |
119 for (i = 0; i < 4; ++i) { // horizontal pass | |
120 const int dc = tmp[0] + 4; | |
121 const int a = dc + tmp[8]; | |
122 const int b = dc - tmp[8]; | |
123 const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1); | |
124 const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2); | |
125 STORE(0, i, a + d); | |
126 STORE(1, i, b + c); | |
127 STORE(2, i, b - c); | |
128 STORE(3, i, a - d); | |
129 tmp++; | |
130 } | |
131 } | |
132 | |
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block uses in[16..31] and is located
// 4 samples to the right in both 'ref' and 'dst'.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
140 | |
141 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { | |
142 int i; | |
143 int tmp[16]; | |
144 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { | |
145 const int d0 = src[0] - ref[0]; | |
146 const int d1 = src[1] - ref[1]; | |
147 const int d2 = src[2] - ref[2]; | |
148 const int d3 = src[3] - ref[3]; | |
149 const int a0 = (d0 + d3) << 3; | |
150 const int a1 = (d1 + d2) << 3; | |
151 const int a2 = (d1 - d2) << 3; | |
152 const int a3 = (d0 - d3) << 3; | |
153 tmp[0 + i * 4] = (a0 + a1); | |
154 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12; | |
155 tmp[2 + i * 4] = (a0 - a1); | |
156 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12; | |
157 } | |
158 for (i = 0; i < 4; ++i) { | |
159 const int a0 = (tmp[0 + i] + tmp[12 + i]); | |
160 const int a1 = (tmp[4 + i] + tmp[ 8 + i]); | |
161 const int a2 = (tmp[4 + i] - tmp[ 8 + i]); | |
162 const int a3 = (tmp[0 + i] - tmp[12 + i]); | |
163 out[0 + i] = (a0 + a1 + 7) >> 4; | |
164 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); | |
165 out[8 + i] = (a0 - a1 + 7) >> 4; | |
166 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); | |
167 } | |
168 } | |
169 | |
// Inverse Walsh-Hadamard-style transform of the 16 values in[0..15].
// The 16 results are scattered into 'out' with a stride of 16 entries:
// result k is written to out[16 * k], k = 0..15.
static void ITransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int col, row;
  // First pass, one column of 'in' per iteration.
  for (col = 0; col < 4; ++col) {
    const int sum_outer  = in[col]     + in[col + 12];
    const int sum_inner  = in[col + 4] + in[col + 8];
    const int diff_inner = in[col + 4] - in[col + 8];
    const int diff_outer = in[col]     - in[col + 12];
    tmp[col]      = sum_outer  + sum_inner;
    tmp[col + 4]  = diff_outer + diff_inner;
    tmp[col + 8]  = sum_outer  - sum_inner;
    tmp[col + 12] = diff_outer - diff_inner;
  }
  // Second pass, one row of 'tmp' per iteration; each row produces four
  // outputs spaced 16 entries apart, and rows are 64 entries apart.
  for (row = 0; row < 4; ++row) {
    const int* const t = tmp + 4 * row;
    int16_t* const dst = out + 64 * row;
    const int dc = t[0] + 3;    // w/ rounder
    const int sum_outer  = dc   + t[3];
    const int sum_inner  = t[1] + t[2];
    const int diff_inner = t[1] - t[2];
    const int diff_outer = dc   - t[3];
    dst[ 0] = (sum_outer  + sum_inner)  >> 3;
    dst[16] = (diff_outer + diff_inner) >> 3;
    dst[32] = (sum_outer  - sum_inner)  >> 3;
    dst[48] = (diff_outer - diff_inner) >> 3;
  }
}
196 | |
// Forward Walsh-Hadamard-style transform.
//   in:  16 input values, gathered with a stride of 16 entries
//        (in[0], in[16], ..., in[240]).
//   out: receives the 16 output values, stored contiguously in out[0..15].
static void FTransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  // First pass: four inputs (16 entries apart) per iteration.
  for (i = 0; i < 4; ++i, in += 64) {
    // Scale by 4 with a multiplication: the operands can be negative and
    // left-shifting a negative value is undefined behavior in C.
    const int a0 = (in[0 * 16] + in[2 * 16]) * 4;
    const int a1 = (in[1 * 16] + in[3 * 16]) * 4;
    const int a2 = (in[1 * 16] - in[3 * 16]) * 4;
    const int a3 = (in[0 * 16] - in[2 * 16]) * 4;
    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Second pass: one column of 'tmp' per iteration, with rounding before the
  // final division by 8.
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
  }
}
225 | |
226 #undef MUL | |
227 #undef STORE | |
228 | |
229 //------------------------------------------------------------------------------ | |
230 // Intra predictions | |
231 | |
232 #define OUT(x, y) dst[(x) + (y) * BPS] | |
233 | |
234 static inline void Fill(uint8_t* dst, int value, int size) { | |
235 int j; | |
236 for (j = 0; j < size; ++j) { | |
237 memset(dst + j * BPS, value, size); | |
238 } | |
239 } | |
240 | |
241 static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) { | |
242 int j; | |
243 if (top) { | |
244 for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); | |
245 } else { | |
246 Fill(dst, 127, size); | |
247 } | |
248 } | |
249 | |
250 static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) { | |
251 if (left) { | |
252 int j; | |
253 for (j = 0; j < size; ++j) { | |
254 memset(dst + j * BPS, left[j], size); | |
255 } | |
256 } else { | |
257 Fill(dst, 129, size); | |
258 } | |
259 } | |
260 | |
261 static inline void TrueMotion(uint8_t* dst, const uint8_t* left, | |
262 const uint8_t* top, int size) { | |
263 int y; | |
264 if (left) { | |
265 if (top) { | |
266 const uint8_t* const clip = clip1 + 255 - left[-1]; | |
267 for (y = 0; y < size; ++y) { | |
268 const uint8_t* const clip_table = clip + left[y]; | |
269 int x; | |
270 for (x = 0; x < size; ++x) { | |
271 dst[x] = clip_table[top[x]]; | |
272 } | |
273 dst += BPS; | |
274 } | |
275 } else { | |
276 HorizontalPred(dst, left, size); | |
277 } | |
278 } else { | |
279 // true motion without left samples (hence: with default 129 value) | |
280 // is equivalent to VE prediction where you just copy the top samples. | |
281 // Note that if top samples are not available, the default value is | |
282 // then 129, and not 127 as in the VerticalPred case. | |
283 if (top) { | |
284 VerticalPred(dst, top, size); | |
285 } else { | |
286 Fill(dst, 129, size); | |
287 } | |
288 } | |
289 } | |
290 | |
// DC prediction: fills the size x size block with a single value computed as
//   (sum(top[0..size-1]) + sum(left[0..size-1]) + round) >> shift.
// When only one of 'top' / 'left' is available, its sum is counted twice.
// When neither is available, the value is 0x80.
static inline void DCMode(uint8_t* dst, const uint8_t* left,
                          const uint8_t* top,
                          int size, int round, int shift) {
  int dc;
  if (!top && !left) {
    dc = 0x80;
  } else {
    int sum_top = 0;
    int sum_left = 0;
    int j;
    if (top) {
      for (j = 0; j < size; ++j) sum_top += top[j];
    }
    if (left) {
      for (j = 0; j < size; ++j) sum_left += left[j];
    }
    // Substitute the missing border by the available one.
    if (!top) {
      sum_top = sum_left;
    } else if (!left) {
      sum_left = sum_top;
    }
    dc = (sum_top + sum_left + round) >> shift;
  }
  Fill(dst, dc, size);
}
313 | |
314 //------------------------------------------------------------------------------ | |
315 // Chroma 8x8 prediction (paragraph 12.2) | |
316 | |
317 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, | |
318 const uint8_t* top) { | |
319 // U block | |
320 DCMode(C8DC8 + dst, left, top, 8, 8, 4); | |
321 VerticalPred(C8VE8 + dst, top, 8); | |
322 HorizontalPred(C8HE8 + dst, left, 8); | |
323 TrueMotion(C8TM8 + dst, left, top, 8); | |
324 // V block | |
325 dst += 8; | |
326 if (top) top += 8; | |
327 if (left) left += 16; | |
328 DCMode(C8DC8 + dst, left, top, 8, 8, 4); | |
329 VerticalPred(C8VE8 + dst, top, 8); | |
330 HorizontalPred(C8HE8 + dst, left, 8); | |
331 TrueMotion(C8TM8 + dst, left, top, 8); | |
332 } | |
333 | |
334 //------------------------------------------------------------------------------ | |
335 // luma 16x16 prediction (paragraph 12.3) | |
336 | |
337 static void Intra16Preds(uint8_t* dst, | |
338 const uint8_t* left, const uint8_t* top) { | |
339 DCMode(I16DC16 + dst, left, top, 16, 16, 5); | |
340 VerticalPred(I16VE16 + dst, top, 16); | |
341 HorizontalPred(I16HE16 + dst, left, 16); | |
342 TrueMotion(I16TM16 + dst, left, top, 16); | |
343 } | |
344 | |
345 //------------------------------------------------------------------------------ | |
346 // luma 4x4 prediction | |
347 | |
348 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) | |
349 #define AVG2(a, b) (((a) + (b) + 1) >> 1) | |
350 | |
351 static void VE4(uint8_t* dst, const uint8_t* top) { // vertical | |
352 const uint8_t vals[4] = { | |
353 AVG3(top[-1], top[0], top[1]), | |
354 AVG3(top[ 0], top[1], top[2]), | |
355 AVG3(top[ 1], top[2], top[3]), | |
356 AVG3(top[ 2], top[3], top[4]) | |
357 }; | |
358 int i; | |
359 for (i = 0; i < 4; ++i) { | |
360 memcpy(dst + i * BPS, vals, 4); | |
361 } | |
362 } | |
363 | |
364 static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal | |
365 const int X = top[-1]; | |
366 const int I = top[-2]; | |
367 const int J = top[-3]; | |
368 const int K = top[-4]; | |
369 const int L = top[-5]; | |
370 *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J); | |
371 *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K); | |
372 *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L); | |
373 *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L); | |
374 } | |
375 | |
// DC 4x4 prediction: fills the block with the rounded average of the eight
// samples top[0..3] and top[-5..-2].
static void DC4(uint8_t* dst, const uint8_t* top) {
  const uint8_t* const left = top - 5;
  uint32_t sum = 4;   // rounder for the division by 8
  int i;
  for (i = 0; i < 4; ++i) {
    sum += top[i] + left[i];
  }
  Fill(dst, sum >> 3, 4);
}
382 | |
383 static void RD4(uint8_t* dst, const uint8_t* top) { | |
384 const int X = top[-1]; | |
385 const int I = top[-2]; | |
386 const int J = top[-3]; | |
387 const int K = top[-4]; | |
388 const int L = top[-5]; | |
389 const int A = top[0]; | |
390 const int B = top[1]; | |
391 const int C = top[2]; | |
392 const int D = top[3]; | |
393 OUT(0, 3) = AVG3(J, K, L); | |
394 OUT(0, 2) = OUT(1, 3) = AVG3(I, J, K); | |
395 OUT(0, 1) = OUT(1, 2) = OUT(2, 3) = AVG3(X, I, J); | |
396 OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I); | |
397 OUT(1, 0) = OUT(2, 1) = OUT(3, 2) = AVG3(B, A, X); | |
398 OUT(2, 0) = OUT(3, 1) = AVG3(C, B, A); | |
399 OUT(3, 0) = AVG3(D, C, B); | |
400 } | |
401 | |
402 static void LD4(uint8_t* dst, const uint8_t* top) { | |
403 const int A = top[0]; | |
404 const int B = top[1]; | |
405 const int C = top[2]; | |
406 const int D = top[3]; | |
407 const int E = top[4]; | |
408 const int F = top[5]; | |
409 const int G = top[6]; | |
410 const int H = top[7]; | |
411 OUT(0, 0) = AVG3(A, B, C); | |
412 OUT(1, 0) = OUT(0, 1) = AVG3(B, C, D); | |
413 OUT(2, 0) = OUT(1, 1) = OUT(0, 2) = AVG3(C, D, E); | |
414 OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F); | |
415 OUT(3, 1) = OUT(2, 2) = OUT(1, 3) = AVG3(E, F, G); | |
416 OUT(3, 2) = OUT(2, 3) = AVG3(F, G, H); | |
417 OUT(3, 3) = AVG3(G, H, H); | |
418 } | |
419 | |
420 static void VR4(uint8_t* dst, const uint8_t* top) { | |
421 const int X = top[-1]; | |
422 const int I = top[-2]; | |
423 const int J = top[-3]; | |
424 const int K = top[-4]; | |
425 const int A = top[0]; | |
426 const int B = top[1]; | |
427 const int C = top[2]; | |
428 const int D = top[3]; | |
429 OUT(0, 0) = OUT(1, 2) = AVG2(X, A); | |
430 OUT(1, 0) = OUT(2, 2) = AVG2(A, B); | |
431 OUT(2, 0) = OUT(3, 2) = AVG2(B, C); | |
432 OUT(3, 0) = AVG2(C, D); | |
433 | |
434 OUT(0, 3) = AVG3(K, J, I); | |
435 OUT(0, 2) = AVG3(J, I, X); | |
436 OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A); | |
437 OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B); | |
438 OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C); | |
439 OUT(3, 1) = AVG3(B, C, D); | |
440 } | |
441 | |
442 static void VL4(uint8_t* dst, const uint8_t* top) { | |
443 const int A = top[0]; | |
444 const int B = top[1]; | |
445 const int C = top[2]; | |
446 const int D = top[3]; | |
447 const int E = top[4]; | |
448 const int F = top[5]; | |
449 const int G = top[6]; | |
450 const int H = top[7]; | |
451 OUT(0, 0) = AVG2(A, B); | |
452 OUT(1, 0) = OUT(0, 2) = AVG2(B, C); | |
453 OUT(2, 0) = OUT(1, 2) = AVG2(C, D); | |
454 OUT(3, 0) = OUT(2, 2) = AVG2(D, E); | |
455 | |
456 OUT(0, 1) = AVG3(A, B, C); | |
457 OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D); | |
458 OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E); | |
459 OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F); | |
460 OUT(3, 2) = AVG3(E, F, G); | |
461 OUT(3, 3) = AVG3(F, G, H); | |
462 } | |
463 | |
464 static void HU4(uint8_t* dst, const uint8_t* top) { | |
465 const int I = top[-2]; | |
466 const int J = top[-3]; | |
467 const int K = top[-4]; | |
468 const int L = top[-5]; | |
469 OUT(0, 0) = AVG2(I, J); | |
470 OUT(2, 0) = OUT(0, 1) = AVG2(J, K); | |
471 OUT(2, 1) = OUT(0, 2) = AVG2(K, L); | |
472 OUT(1, 0) = AVG3(I, J, K); | |
473 OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L); | |
474 OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L); | |
475 OUT(3, 2) = OUT(2, 2) = | |
476 OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L; | |
477 } | |
478 | |
479 static void HD4(uint8_t* dst, const uint8_t* top) { | |
480 const int X = top[-1]; | |
481 const int I = top[-2]; | |
482 const int J = top[-3]; | |
483 const int K = top[-4]; | |
484 const int L = top[-5]; | |
485 const int A = top[0]; | |
486 const int B = top[1]; | |
487 const int C = top[2]; | |
488 | |
489 OUT(0, 0) = OUT(2, 1) = AVG2(I, X); | |
490 OUT(0, 1) = OUT(2, 2) = AVG2(J, I); | |
491 OUT(0, 2) = OUT(2, 3) = AVG2(K, J); | |
492 OUT(0, 3) = AVG2(L, K); | |
493 | |
494 OUT(3, 0) = AVG3(A, B, C); | |
495 OUT(2, 0) = AVG3(X, A, B); | |
496 OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A); | |
497 OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X); | |
498 OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I); | |
499 OUT(1, 3) = AVG3(L, K, J); | |
500 } | |
501 | |
502 static void TM4(uint8_t* dst, const uint8_t* top) { | |
503 int x, y; | |
504 const uint8_t* const clip = clip1 + 255 - top[-1]; | |
505 for (y = 0; y < 4; ++y) { | |
506 const uint8_t* const clip_table = clip + top[-2 - y]; | |
507 for (x = 0; x < 4; ++x) { | |
508 dst[x] = clip_table[top[x]]; | |
509 } | |
510 dst += BPS; | |
511 } | |
512 } | |
513 | |
514 #undef AVG3 | |
515 #undef AVG2 | |
516 | |
517 // Left samples are top[-5 .. -2], top_left is top[-1], top are | |
518 // located at top[0..3], and top right is top[4..7] | |
519 static void Intra4Preds(uint8_t* dst, const uint8_t* top) { | |
520 DC4(I4DC4 + dst, top); | |
521 TM4(I4TM4 + dst, top); | |
522 VE4(I4VE4 + dst, top); | |
523 HE4(I4HE4 + dst, top); | |
524 RD4(I4RD4 + dst, top); | |
525 VR4(I4VR4 + dst, top); | |
526 LD4(I4LD4 + dst, top); | |
527 VL4(I4VL4 + dst, top); | |
528 HD4(I4HD4 + dst, top); | |
529 HU4(I4HU4 + dst, top); | |
530 } | |
531 | |
532 //------------------------------------------------------------------------------ | |
533 // Metric | |
534 | |
535 static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) { | |
536 int count = 0; | |
537 int y, x; | |
538 for (y = 0; y < h; ++y) { | |
539 for (x = 0; x < w; ++x) { | |
540 const int diff = (int)a[x] - b[x]; | |
541 count += diff * diff; | |
542 } | |
543 a += BPS; | |
544 b += BPS; | |
545 } | |
546 return count; | |
547 } | |
548 | |
// Fixed-size entry points for GetSSE(); the name gives width x height.
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  const int width = 16, height = 16;
  return GetSSE(a, b, width, height);
}

static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  const int width = 16, height = 8;
  return GetSSE(a, b, width, height);
}

static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const int width = 8, height = 8;
  return GetSSE(a, b, width, height);
}

static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const int width = 4, height = 4;
  return GetSSE(a, b, width, height);
}
561 | |
562 //------------------------------------------------------------------------------ | |
563 // Texture distortion | |
564 // | |
565 // We try to match the spectral content (weighted) between source and | |
566 // reconstructed samples. | |
567 | |
568 // Hadamard transform | |
569 // Returns the weighted sum of the absolute value of transformed coefficients. | |
570 static int TTransform(const uint8_t* in, const uint16_t* w) { | |
571 int sum = 0; | |
572 int tmp[16]; | |
573 int i; | |
574 // horizontal pass | |
575 for (i = 0; i < 4; ++i, in += BPS) { | |
576 const int a0 = (in[0] + in[2]) << 2; | |
577 const int a1 = (in[1] + in[3]) << 2; | |
578 const int a2 = (in[1] - in[3]) << 2; | |
579 const int a3 = (in[0] - in[2]) << 2; | |
580 tmp[0 + i * 4] = a0 + a1 + (a0 != 0); | |
581 tmp[1 + i * 4] = a3 + a2; | |
582 tmp[2 + i * 4] = a3 - a2; | |
583 tmp[3 + i * 4] = a0 - a1; | |
584 } | |
585 // vertical pass | |
586 for (i = 0; i < 4; ++i, ++w) { | |
587 const int a0 = (tmp[0 + i] + tmp[8 + i]); | |
588 const int a1 = (tmp[4 + i] + tmp[12+ i]); | |
589 const int a2 = (tmp[4 + i] - tmp[12+ i]); | |
590 const int a3 = (tmp[0 + i] - tmp[8 + i]); | |
591 const int b0 = a0 + a1; | |
592 const int b1 = a3 + a2; | |
593 const int b2 = a3 - a2; | |
594 const int b3 = a0 - a1; | |
595 // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 | |
596 sum += w[ 0] * ((abs(b0) + 3) >> 3); | |
597 sum += w[ 4] * ((abs(b1) + 3) >> 3); | |
598 sum += w[ 8] * ((abs(b2) + 3) >> 3); | |
599 sum += w[12] * ((abs(b3) + 3) >> 3); | |
600 } | |
601 return sum; | |
602 } | |
603 | |
// Texture distortion between two 4x4 blocks: the absolute difference of
// their weighted transform sums, divided by 16 with rounding.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int energy_a = TTransform(a, w);
  const int energy_b = TTransform(b, w);
  const int delta = abs(energy_b - energy_a);
  return (delta + 8) >> 4;
}
610 | |
611 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, | |
612 const uint16_t* const w) { | |
613 int D = 0; | |
614 int x, y; | |
615 for (y = 0; y < 16 * BPS; y += 4 * BPS) { | |
616 for (x = 0; x < 16; x += 4) { | |
617 D += Disto4x4(a + x + y, b + x + y, w); | |
618 } | |
619 } | |
620 return D; | |
621 } | |
622 | |
623 //------------------------------------------------------------------------------ | |
624 // Quantization | |
625 // | |
626 | |
627 // Simple quantization | |
628 static int QuantizeBlock(int16_t in[16], int16_t out[16], | |
629 int n, const VP8Matrix* const mtx) { | |
630 int last = -1; | |
631 for (; n < 16; ++n) { | |
632 const int j = VP8Zigzag[n]; | |
633 const int sign = (in[j] < 0); | |
634 int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; | |
635 if (coeff > 2047) coeff = 2047; | |
636 if (coeff > mtx->zthresh_[j]) { | |
637 const int Q = mtx->q_[j]; | |
638 const int iQ = mtx->iq_[j]; | |
639 const int B = mtx->bias_[j]; | |
640 out[n] = QUANTDIV(coeff, iQ, B); | |
641 if (sign) out[n] = -out[n]; | |
642 in[j] = out[n] * Q; | |
643 if (out[n]) last = n; | |
644 } else { | |
645 out[n] = 0; | |
646 in[j] = 0; | |
647 } | |
648 } | |
649 return (last >= 0); | |
650 } | |
651 | |
652 //------------------------------------------------------------------------------ | |
653 // Block copy | |
654 | |
655 static inline void Copy(const uint8_t* src, uint8_t* dst, int size) { | |
656 int y; | |
657 for (y = 0; y < size; ++y) { | |
658 memcpy(dst, src, size); | |
659 src += BPS; | |
660 dst += BPS; | |
661 } | |
662 } | |
663 | |
// Fixed-size entry points for Copy().
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 4);
}

static void Copy8x8(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 8);
}

static void Copy16x16(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16);
}
667 | |
668 //------------------------------------------------------------------------------ | |
669 // SSE2 detection. | |
670 // | |
671 | |
// GetCPUInfo(cpu_info, info_type): executes the CPUID instruction with
// EAX = info_type and stores the resulting EAX, EBX, ECX and EDX values in
// cpu_info[0], cpu_info[1], cpu_info[2] and cpu_info[3] respectively.
#if defined(__pic__) && defined(__i386__)
// 32-bit PIC variant: EBX is copied to EDI before CPUID and swapped back
// right after, so EBX is left unchanged and its CPUID result is returned
// through EDI. (Presumably because EBX is reserved in 32-bit PIC code.)
static inline void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
    : "=a"(cpu_info[0]),
      "=D"(cpu_info[1]),
      "=c"(cpu_info[2]),
      "=d"(cpu_info[3])
    : "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
// Plain variant: the four registers are read back directly.
static inline void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]),
      "=b"(cpu_info[1]),
      "=c"(cpu_info[2]),
      "=d"(cpu_info[3])
    : "a"(info_type));
}
#elif defined(_MSC_VER)  // Visual C++
// Map directly onto the compiler's __cpuid.
#define GetCPUInfo __cpuid
#endif
691 | |
692 #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) | |
693 static int x86CPUInfo(CPUFeature feature) { | |
694 int cpu_info[4]; | |
695 GetCPUInfo(cpu_info, 1); | |
696 if (feature == kSSE2) { | |
697 return 0 != (cpu_info[3] & 0x04000000); | |
698 } | |
699 if (feature == kSSE3) { | |
700 return 0 != (cpu_info[2] & 0x00000001); | |
701 } | |
702 return 0; | |
703 } | |
704 VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo; | |
705 #else | |
706 VP8CPUInfo VP8EncGetCPUInfo = NULL; | |
707 #endif | |
708 | |
709 // Speed-critical function pointers. We have to initialize them to the default | |
710 // implementations within VP8EncDspInit(). | |
711 VP8CHisto VP8CollectHistogram; | |
712 VP8Idct VP8ITransform; | |
713 VP8Fdct VP8FTransform; | |
714 VP8WHT VP8ITransformWHT; | |
715 VP8WHT VP8FTransformWHT; | |
716 VP8Intra4Preds VP8EncPredLuma4; | |
717 VP8IntraPreds VP8EncPredLuma16; | |
718 VP8IntraPreds VP8EncPredChroma8; | |
719 VP8Metric VP8SSE16x16; | |
720 VP8Metric VP8SSE8x8; | |
721 VP8Metric VP8SSE16x8; | |
722 VP8Metric VP8SSE4x4; | |
723 VP8WMetric VP8TDisto4x4; | |
724 VP8WMetric VP8TDisto16x16; | |
725 VP8QuantizeBlock VP8EncQuantizeBlock; | |
726 VP8BlockCopy VP8Copy4x4; | |
727 VP8BlockCopy VP8Copy8x8; | |
728 VP8BlockCopy VP8Copy16x16; | |
729 | |
730 extern void VP8EncDspInitSSE2(void); | |
731 | |
732 void VP8EncDspInit(void) { | |
733 InitTables(); | |
734 | |
735 // default C implementations | |
736 VP8CollectHistogram = CollectHistogram; | |
737 VP8ITransform = ITransform; | |
738 VP8FTransform = FTransform; | |
739 VP8ITransformWHT = ITransformWHT; | |
740 VP8FTransformWHT = FTransformWHT; | |
741 VP8EncPredLuma4 = Intra4Preds; | |
742 VP8EncPredLuma16 = Intra16Preds; | |
743 VP8EncPredChroma8 = IntraChromaPreds; | |
744 VP8SSE16x16 = SSE16x16; | |
745 VP8SSE8x8 = SSE8x8; | |
746 VP8SSE16x8 = SSE16x8; | |
747 VP8SSE4x4 = SSE4x4; | |
748 VP8TDisto4x4 = Disto4x4; | |
749 VP8TDisto16x16 = Disto16x16; | |
750 VP8EncQuantizeBlock = QuantizeBlock; | |
751 VP8Copy4x4 = Copy4x4; | |
752 VP8Copy8x8 = Copy8x8; | |
753 VP8Copy16x16 = Copy16x16; | |
754 | |
755 // If defined, use CPUInfo() to overwrite some pointers with faster versions. | |
756 if (VP8EncGetCPUInfo) { | |
757 if (VP8EncGetCPUInfo(kSSE2)) { | |
758 #if defined(__SSE2__) || defined(_MSC_VER) | |
759 VP8EncDspInitSSE2(); | |
760 #endif | |
761 } | |
762 if (VP8EncGetCPUInfo(kSSE3)) { | |
763 // later we'll plug some SSE3 variant here | |
764 } | |
765 } | |
766 } | |
767 | |
768 #if defined(__cplusplus) || defined(c_plusplus) | |
769 } // extern "C" | |
770 #endif | |
OLD | NEW |