source/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c - Issue 13474006: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c

Issue 13474006: libvpx: Pull from upstream (Closed) Base URL: https://src.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <emmintrin.h> /* SSE2 */

	12 #include "vpx/vpx_integer.h"

	13 #include "vpx_ports/emmintrin_compat.h"

	14

	/* Sum of absolute differences (SAD) between a 16-pixel-wide, 3-row block of
	 * source bytes and the co-located block of reference bytes.
	 *
	 * src_ptr / ref_ptr point at the first byte of row 0; src_stride / ref_stride
	 * are the byte distances between consecutive rows. 16 bytes are read from each
	 * of rows 0, 1 and 2 of both buffers using unaligned loads, so no particular
	 * alignment is required.
	 *
	 * Returns the sum of |src - ref| over all 48 byte pairs.
	 */
	unsigned int vp9_sad16x3_sse2(
	    const unsigned char *src_ptr,
	    int src_stride,
	    const unsigned char *ref_ptr,
	    int ref_stride) {
	  __m128i s0, s1, s2;
	  __m128i r0, r1, r2;
	  __m128i sad;

	  /* One 16-byte row per register. */
	  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
	  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
	  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));

	  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
	  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
	  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));

	  /* _mm_sad_epu8 leaves one partial sum in the low 16 bits of each 64-bit
	   * half. Three rows give at most 3 * 8 * 255 per half, so the 16-bit adds
	   * below cannot overflow.
	   */
	  sad = _mm_sad_epu8(s0, r0);
	  sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1));
	  sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2));

	  /* Fold the upper half's partial sum onto the lower half. */
	  sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));

	  return (unsigned int)_mm_cvtsi128_si32(sad);
	}

	39

	/* Sum of absolute differences (SAD) between a 3-pixel-wide, 16-row block of
	 * source bytes and the co-located block of reference bytes.
	 *
	 * src_ptr / ref_ptr point at column 0 of row 0; src_stride / ref_stride are
	 * the byte distances between consecutive rows.
	 *
	 * Memory access: every row is fetched with a 4-byte load even though only 3
	 * bytes are used. For ref_ptr the load always covers columns 0..3. For src_ptr
	 * it covers columns 0..3, except when (src_ptr & 3) == 1, where the pointer is
	 * moved back one byte and the load covers columns -1..2. The unused byte is
	 * discarded before the SAD is computed, but it must be readable.
	 *
	 * Returns the sum of |src - ref| over all 48 byte pairs.
	 */
	unsigned int vp9_sad3x16_sse2(
	    const unsigned char *src_ptr,
	    int src_stride,
	    const unsigned char *ref_ptr,
	    int ref_stride) {
	  int r;
	  __m128i s0, s1, s2, s3;
	  __m128i r0, r1, r2, r3;
	  __m128i sad = _mm_setzero_si128();
	  __m128i mask;
	  const int offset = (uintptr_t)src_ptr & 3;

	  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
	   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
	   * takes much less time.
	   */
	  if (offset == 1) {
	    src_ptr -= 1;
	  }

	  /* mask = 0xffffffffffff0000ffffffffffff0000 */
	  mask = _mm_cmpeq_epi32(sad, sad);
	  mask = _mm_slli_epi64(mask, 16);

	  /* Four rows per iteration. */
	  for (r = 0; r < 16; r += 4) {
	    s0 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 0 * src_stride));
	    s1 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 1 * src_stride));
	    s2 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 2 * src_stride));
	    s3 = _mm_cvtsi32_si128(*(const int *)(src_ptr + 3 * src_stride));
	    r0 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 0 * ref_stride));
	    r1 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 1 * ref_stride));
	    r2 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 2 * ref_stride));
	    r3 = _mm_cvtsi32_si128(*(const int *)(ref_ptr + 3 * ref_stride));

	    /* Byte-interleave rows 0/1 and rows 2/3, then place the two 8-byte
	     * results in the low and high halves of one register. Within each half
	     * the bytes are ordered by column, two bytes (one per row) per column.
	     */
	    s0 = _mm_unpacklo_epi8(s0, s1);
	    r0 = _mm_unpacklo_epi8(r0, r1);
	    s2 = _mm_unpacklo_epi8(s2, s3);
	    r2 = _mm_unpacklo_epi8(r2, r3);
	    s0 = _mm_unpacklo_epi64(s0, s2);
	    r0 = _mm_unpacklo_epi64(r0, r2);

	    /* Throw out the extra byte of each row. Afterwards bytes 2..7 of each
	     * half hold columns 0..2 and bytes 0..1 are zero, for both src and ref.
	     */
	    if (offset == 1) {
	      /* src was loaded from column -1: zero the two lowest bytes. */
	      s0 = _mm_and_si128(s0, mask);
	    } else {
	      /* src was loaded from column 0: shift column 3 out of the top. */
	      s0 = _mm_slli_epi64(s0, 16);
	    }
	    /* ref is always loaded from column 0, so it is always shifted. */
	    r0 = _mm_slli_epi64(r0, 16);

	    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));

	    src_ptr += src_stride * 4;
	    ref_ptr += ref_stride * 4;
	  }

	  /* Fold the upper half's partial sum onto the lower half. */
	  sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
	  return (unsigned int)_mm_cvtsi128_si32(sad);
	}

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_loopfilter_x86.c ('k') | source/libvpx/vp9/common/x86/vp9_sadmxn_x86.c » ('j') | no next file with comments »