Chromium Code Reviews
Unified diff: media/base/sinc_resampler.cc

Issue 10803003: Add SSE optimizations to SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fix compile error. Created 8 years, 4 months ago
 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
 //
 // |----------------|-----------------------------------------|----------------|
 //
 //                       kBlockSize + kKernelSize / 2
 //   <--------------------------------------------------------->
(...skipping 18 matching lines...)
 // 5) Goto (2) until all of input is consumed.
 //
 // Note: we're glossing over how the sub-sample handling works with
 // |virtual_source_idx_|, etc.

 // MSVC++ requires this to be set before any other includes to get M_PI.
 #define _USE_MATH_DEFINES

 #include "media/base/sinc_resampler.h"

+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+#include <xmmintrin.h>
+#endif
 #include <cmath>

+#include "base/cpu.h"
 #include "base/logging.h"

 namespace media {

 enum {
   // The kernel size can be adjusted for quality (higher is better) at the
-  // expense of performance. Must be an even number.
+  // expense of performance. Must be a multiple of 32.
   // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
   kKernelSize = 32,

   // The number of destination frames generated per processing pass. Affects
   // how often and for how much SincResampler calls back for input. Must be
   // greater than kKernelSize.
   kBlockSize = 512,

   // The kernel offset count is used for interpolation and is the number of
   // sub-sample kernel shifts. Can be adjusted for quality (higher is better)
   // at the expense of allocating more memory.
   kKernelOffsetCount = 32,
   kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),

   // The size (in samples) of the internal buffer used by the resampler.
   kBufferSize = kBlockSize + kKernelSize
 };
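For concreteness, the constants above imply kKernelStorageSize = 32 * (32 + 1) = 1056 floats (4224 bytes) and kBufferSize = 512 + 32 = 544 floats. The "+ 1" buys one spare kernel: Convolve reads k2 = k1 + kKernelSize, so when offset_idx is at its maximum of kKernelOffsetCount - 1, k2 points into that extra slot. A minimal compile-time check of this arithmetic (illustrative only, not part of the patch):

  static_assert(32 * (32 + 1) == 1056, "kKernelStorageSize in floats");
  static_assert(512 + 32 == 544, "kBufferSize in floats");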

 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
     : io_sample_rate_ratio_(io_sample_rate_ratio),
       virtual_source_idx_(0),
       buffer_primed_(false),
       read_cb_(read_cb),
-      // TODO(dalecurtis): When we switch to AVX/SSE optimization, we'll need to
-      // allocate with 32-byte alignment and ensure they're sized % 32 bytes.
-      kernel_storage_(new float[kKernelStorageSize]),
-      input_buffer_(new float[kBufferSize]),
+      // Create input buffers with a 16-byte alignment for SSE optimizations.
+      kernel_storage_(static_cast<float*>(
+          base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),
+      input_buffer_(static_cast<float*>(
+          base::AlignedAlloc(sizeof(float) * kBufferSize, 16))),
       // Setup various region pointers in the buffer (see diagram above).
       r0_(input_buffer_.get() + kKernelSize / 2),
       r1_(input_buffer_.get()),
       r2_(r0_),
       r3_(r0_ + kBlockSize - kKernelSize / 2),
       r4_(r0_ + kBlockSize),
       r5_(r0_ + kKernelSize / 2) {
-  DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!";
+  // Ensure kKernelSize is a multiple of 32 for easy SSE optimizations; causes
+  // r0_ and r5_ (used for input) to always be 16-byte aligned by virtue of
+  // input_buffer_ being 16-byte aligned.
+  DCHECK_EQ(kKernelSize % 32, 0) << "kKernelSize must be a multiple of 32!";
   DCHECK_GT(kBlockSize, kKernelSize)
       << "kBlockSize must be greater than kKernelSize!";
   // Basic sanity checks to ensure buffer regions are laid out correctly:
   // r0_ and r2_ should always be the same position.
   DCHECK_EQ(r0_, r2_);
   // r1_ at the beginning of the buffer.
   DCHECK_EQ(r1_, input_buffer_.get());
   // r1_ left of r2_, r2_ left of r5_ and r1_, r2_ size correct.
   DCHECK_EQ(r2_ - r1_, r5_ - r2_);
   // r3_ left of r4_, r5_ left of r0_ and r3_ size correct.
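Working out the initializer list above with kKernelSize = 32 and kBlockSize = 512, the regions sit at these float offsets from input_buffer_: r1_ = 0, r0_ = r2_ = 16, r5_ = 32, r3_ = 512, r4_ = 528. The alignment claim then follows directly: input_buffer_ comes from AlignedAlloc(..., 16), and kKernelSize / 2 = 16 floats = 64 bytes, so r0_ and r5_ inherit 16-byte alignment. A standalone sketch of that check (illustrative, not patch code):

  #include <cassert>
  #include <cstdint>

  void CheckR0Alignment(const float* input_buffer) {
    // input_buffer is assumed 16-byte aligned, as AlignedAlloc guarantees.
    const float* r0 = input_buffer + 16;  // kKernelSize / 2 floats = 64 bytes.
    assert((reinterpret_cast<uintptr_t>(r0) & 0x0F) == 0);
  }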
(...skipping 43 matching lines...)
       double s =
           sinc_scale_factor * M_PI * (i - kKernelSize / 2 - subsample_offset);
       double sinc = (!s ? 1.0 : sin(s) / s) * sinc_scale_factor;

       // Compute Blackman window, matching the offset of the sinc().
       double x = (i - subsample_offset) / kKernelSize;
       double window = kA0 - kA1 * cos(2.0 * M_PI * x) + kA2
           * cos(4.0 * M_PI * x);

       // Window the sinc() function and store at the correct offset.
-      kernel_storage_[i + offset_idx * kKernelSize] = sinc * window;
+      kernel_storage_.get()[i + offset_idx * kKernelSize] = sinc * window;
     }
   }
 }
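Each iteration above computes one kernel tap: sinc() scaled by the low-pass cutoff factor, multiplied by a Blackman window evaluated at the same sub-sample offset. Pulled out as a free function it looks like the sketch below; note that kA0/kA1/kA2 are defined in lines elided from this diff, so the standard Blackman coefficients are assumed here.

  #include <cmath>

  double WindowedSinc(int i, double subsample_offset,
                      double sinc_scale_factor) {
    const int kKernelSize = 32;
    // Assumed standard Blackman coefficients; the actual values live in the
    // elided lines above.
    const double kA0 = 0.42, kA1 = 0.50, kA2 = 0.08;
    double s =
        sinc_scale_factor * M_PI * (i - kKernelSize / 2 - subsample_offset);
    double sinc = (s == 0 ? 1.0 : sin(s) / s) * sinc_scale_factor;
    double x = (i - subsample_offset) / kKernelSize;
    double window = kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 * cos(4.0 * M_PI * x);
    return sinc * window;
  }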

 void SincResampler::Resample(float* destination, int frames) {
   int remaining_frames = frames;

   // Step (1) -- Prime the input buffer at the start of the input stream.
   if (!buffer_primed_) {
     read_cb_.Run(r0_, kBlockSize + kKernelSize / 2);
     buffer_primed_ = true;
   }

   // Step (2) -- Resample!
   while (remaining_frames) {
     while (virtual_source_idx_ < kBlockSize) {
       // |virtual_source_idx_| lies in between two kernel offsets so figure out
       // what they are.
       int source_idx = static_cast<int>(virtual_source_idx_);
       double subsample_remainder = virtual_source_idx_ - source_idx;

       double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;
       int offset_idx = static_cast<int>(virtual_offset_idx);

+      // We'll compute "convolutions" for the two kernels which straddle
+      // |virtual_source_idx_|.
       float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
       float* k2 = k1 + kKernelSize;

       // Initialize input pointer based on quantized |virtual_source_idx_|.
       float* input_ptr = r1_ + source_idx;

-      // We'll compute "convolutions" for the two kernels which straddle
-      // |virtual_source_idx_|.
-      float sum1 = 0;
-      float sum2 = 0;
-
       // Figure out how much to weight each kernel's "convolution".
       double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
-
-      // Generate a single output sample.
-      int n = kKernelSize;
-      float input;
-      // TODO(dalecurtis): For initial commit, I've ripped out all the SSE
-      // optimizations, these definitely need to go back in before release.
-      while (n--) {
-        input = *input_ptr++;
-        sum1 += input * *k1++;
-        sum2 += input * *k2++;
-      }
-
-      // Linearly interpolate the two "convolutions".
-      double result = (1.0 - kernel_interpolation_factor) * sum1
-          + kernel_interpolation_factor * sum2;
-
-      *destination++ = result;
+      *destination++ = Convolve(
+          input_ptr, k1, k2, kernel_interpolation_factor);

       // Advance the virtual index.
       virtual_source_idx_ += io_sample_rate_ratio_;

       if (!--remaining_frames)
         return;
     }

     // Wrap back around to the start.
     virtual_source_idx_ -= kBlockSize;

     // Step (3) Copy r3_ to r1_ and r4_ to r2_.
     // This wraps the last input frames back to the start of the buffer.
     memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
     memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));

     // Step (4)
     // Refresh the buffer with more input.
     read_cb_.Run(r5_, kBlockSize);
   }
 }
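To make the index arithmetic in the inner loop concrete: with io_sample_rate_ratio_ = 1.5 and virtual_source_idx_ = 4.7, the loop computes source_idx = 4, subsample_remainder = 0.7, virtual_offset_idx = 0.7 * 32 = 22.4, offset_idx = 22, and kernel_interpolation_factor ~= 0.4, so the output sample blends the convolutions against kernels 22 and 23 with weights 0.6 and 0.4. A standalone sketch of that decomposition (illustrative, not patch code):

  struct KernelIndex {
    int source_idx;               // Integer part: input offset from r1_.
    int offset_idx;               // Which pre-computed sub-sample kernel.
    double interpolation_factor;  // Blend weight toward the next kernel.
  };

  KernelIndex Decompose(double virtual_source_idx, int kernel_offset_count) {
    KernelIndex k;
    k.source_idx = static_cast<int>(virtual_source_idx);
    double subsample_remainder = virtual_source_idx - k.source_idx;
    double virtual_offset_idx = subsample_remainder * kernel_offset_count;
    k.offset_idx = static_cast<int>(virtual_offset_idx);
    k.interpolation_factor = virtual_offset_idx - k.offset_idx;
    return k;
  }
  // Decompose(4.7, 32) yields {4, 22, ~0.4}.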

 int SincResampler::ChunkSize() {
   return kBlockSize / io_sample_rate_ratio_;
 }
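In other words, ChunkSize() is how many destination frames one kBlockSize-frame input block yields: with io_sample_rate_ratio_ = 0.5 (output rate twice the input rate) it returns 1024, and with a ratio of 2.0 it returns 256.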

+float SincResampler::Convolve(const float* input_ptr, const float* k1,
+                              const float* k2,
+                              double kernel_interpolation_factor) {
+  // Rely on function level static initialization to keep ConvolveProc selection
+  // thread safe.
+  typedef float (*ConvolveProc)(const float* src, const float* k1,
+                                const float* k2,
+                                double kernel_interpolation_factor);
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+  static const ConvolveProc kConvolveProc =
+      base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
+#else
+  static const ConvolveProc kConvolveProc = Convolve_C;
+#endif
+
+  return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
+}
+
+float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
+                                const float* k2,
+                                double kernel_interpolation_factor) {
+  float sum1 = 0;
+  float sum2 = 0;
+
+  // Generate a single output sample. Unrolling this loop hurt performance in
+  // local testing.
+  int n = kKernelSize;
+  while (n--) {
+    sum1 += *input_ptr * *k1++;
+    sum2 += *input_ptr++ * *k2++;
+  }
+
+  // Linearly interpolate the two "convolutions".
+  return (1.0 - kernel_interpolation_factor) * sum1
+      + kernel_interpolation_factor * sum2;
+}
+
+#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
+float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
+                                  const float* k2,
+                                  double kernel_interpolation_factor) {
+  // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
+  // so long as kKernelSize is a multiple of 16.
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
+  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
+
+  __m128 m_input;
+  __m128 m_sums1 = _mm_setzero_ps();
+  __m128 m_sums2 = _mm_setzero_ps();
+
+  // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
+  // these loops hurt performance in local testing.
+  if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_loadu_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  } else {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_load_ps(input_ptr + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
+  m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
+  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+  // Sum components together.
+  float result;
+  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+  _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
+      m_sums2, m_sums2, 1)));
+
+  return result;
+}
+#endif
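The last three intrinsics in Convolve_SSE perform a horizontal sum of the four float lanes of m_sums1. A scalar sketch of what they compute (illustrative only):

  // m_sums1 holds four partial sums {s0, s1, s2, s3}.
  float HorizontalSum(const float s[4]) {
    // _mm_movehl_ps + _mm_add_ps pair up the high and low halves...
    float lo = s[0] + s[2];
    float hi = s[1] + s[3];
    // ...then _mm_shuffle_ps(..., 1) + _mm_add_ss fold the remaining pair.
    return lo + hi;
  }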
+
 }  // namespace media