Line data Source code
1 : /*
2 : * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : // Modified from the Chromium original:
12 : // src/media/base/sinc_resampler.cc
13 :
14 : // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_
15 : // and r4_ will move after the first load):
16 : //
// |----------------|-----------------------------------------|----------------|
//
//                                         request_frames_
//                   <--------------------------------------------------------->
//                                     r0_ (during first load)
//
//  kKernelSize / 2   kKernelSize / 2         kKernelSize / 2   kKernelSize / 2
// <---------------> <--------------->       <---------------> <--------------->
//        r1_               r2_                     r3_               r4_
//
//                           block_size_ == r4_ - r2_
//                   <--------------------------------------->
//
//                                                 request_frames_
//                                    <------------------ ... ----------------->
//                                             r0_ (during second load)
33 : //
34 : // On the second request r0_ slides to the right by kKernelSize / 2 and r3_, r4_
35 : // and block_size_ are reinitialized via step (3) in the algorithm below.
36 : //
37 : // These new regions remain constant until a Flush() occurs. While complicated,
38 : // this allows us to reduce jitter by always requesting the same amount from the
39 : // provided callback.
40 : //
41 : // The algorithm:
42 : //
43 : // 1) Allocate input_buffer of size: request_frames_ + kKernelSize; this ensures
44 : // there's enough room to read request_frames_ from the callback into region
45 : // r0_ (which will move between the first and subsequent passes).
46 : //
47 : // 2) Let r1_, r2_ each represent half the kernel centered around r0_:
48 : //
49 : // r0_ = input_buffer_ + kKernelSize / 2
50 : // r1_ = input_buffer_
51 : // r2_ = r0_
52 : //
53 : // r0_ is always request_frames_ in size. r1_, r2_ are kKernelSize / 2 in
54 : // size. r1_ must be zero initialized to avoid convolution with garbage (see
55 : // step (5) for why).
56 : //
57 : // 3) Let r3_, r4_ each represent half the kernel right aligned with the end of
58 : // r0_ and choose block_size_ as the distance in frames between r4_ and r2_:
59 : //
60 : // r3_ = r0_ + request_frames_ - kKernelSize
61 : // r4_ = r0_ + request_frames_ - kKernelSize / 2
62 : // block_size_ = r4_ - r2_ = request_frames_ - kKernelSize / 2
63 : //
64 : // 4) Consume request_frames_ frames into r0_.
65 : //
66 : // 5) Position kernel centered at start of r2_ and generate output frames until
67 : // the kernel is centered at the start of r4_ or we've finished generating
68 : // all the output frames.
69 : //
70 : // 6) Wrap left over data from the r3_ to r1_ and r4_ to r2_.
71 : //
72 : // 7) If we're on the second load, in order to avoid overwriting the frames we
73 : // just wrapped from r4_ we need to slide r0_ to the right by the size of
74 : // r4_, which is kKernelSize / 2:
75 : //
76 : // r0_ = r0_ + kKernelSize / 2 = input_buffer_ + kKernelSize
77 : //
78 : // r3_, r4_, and block_size_ then need to be reinitialized, so goto (3).
79 : //
80 : // 8) Else, if we're not on the second load, goto (4).
81 : //
82 : // Note: we're glossing over how the sub-sample handling works with
83 : // |virtual_source_idx_|, etc.
84 :
85 : // MSVC++ requires this to be set before any other includes to get M_PI.
86 : #ifndef _USE_MATH_DEFINES
87 : #define _USE_MATH_DEFINES
88 : #endif
89 :
90 : #include <math.h>
91 : #include <string.h>
92 :
93 : #include <limits>
94 :
95 : #include "webrtc/base/checks.h"
96 : #include "webrtc/common_audio/resampler/sinc_resampler.h"
97 : #include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
98 : #include "webrtc/typedefs.h"
99 :
100 : namespace {
101 :
// Returns the scale applied to the sinc() argument for a given input/output
// sample-rate ratio, i.e. the normalized cutoff frequency of the low-pass
// filter.
//
// |io_ratio| > 1.0 yields 1.0 / |io_ratio|; any other value yields 1.0. The
// result is then multiplied by 0.9.
double SincScaleFactor(double io_ratio) {
  double normalized_cutoff;
  if (io_ratio > 1.0) {
    normalized_cutoff = 1.0 / io_ratio;
  } else {
    normalized_cutoff = 1.0;
  }

  // A windowed sinc is not a true brick-wall filter: the transition from pass
  // band to stop band is gradual. Pull the cutoff slightly downward to avoid
  // some aliasing at the very high end.
  // TODO(crogers): this value is empirical and to be more exact should vary
  // depending on kKernelSize.
  return normalized_cutoff * 0.9;
}
117 :
118 : } // namespace
119 :
120 : namespace webrtc {
121 :
122 : const size_t SincResampler::kKernelSize;
123 :
124 : // If we know the minimum architecture at compile time, avoid CPU detection.
125 : #if defined(WEBRTC_ARCH_X86_FAMILY)
126 : #if defined(__SSE2__)
127 : #define CONVOLVE_FUNC Convolve_SSE
128 0 : void SincResampler::InitializeCPUSpecificFeatures() {}
129 : #else
130 : // x86 CPU detection required. Function will be set by
131 : // InitializeCPUSpecificFeatures().
132 : // TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
133 : #define CONVOLVE_FUNC convolve_proc_
134 :
135 : void SincResampler::InitializeCPUSpecificFeatures() {
136 : convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
137 : }
138 : #endif
139 : #elif defined(WEBRTC_HAS_NEON)
140 : #define CONVOLVE_FUNC Convolve_NEON
141 : void SincResampler::InitializeCPUSpecificFeatures() {}
142 : #else
143 : // Unknown architecture.
144 : #define CONVOLVE_FUNC Convolve_C
145 : void SincResampler::InitializeCPUSpecificFeatures() {}
146 : #endif
147 :
148 0 : SincResampler::SincResampler(double io_sample_rate_ratio,
149 : size_t request_frames,
150 0 : SincResamplerCallback* read_cb)
151 : : io_sample_rate_ratio_(io_sample_rate_ratio),
152 : read_cb_(read_cb),
153 : request_frames_(request_frames),
154 0 : input_buffer_size_(request_frames_ + kKernelSize),
155 : // Create input buffers with a 16-byte alignment for SSE optimizations.
156 : kernel_storage_(static_cast<float*>(
157 0 : AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
158 : kernel_pre_sinc_storage_(static_cast<float*>(
159 0 : AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
160 : kernel_window_storage_(static_cast<float*>(
161 0 : AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
162 : input_buffer_(static_cast<float*>(
163 0 : AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
164 : #if defined(WEBRTC_CPU_DETECTION)
165 : convolve_proc_(NULL),
166 : #endif
167 0 : r1_(input_buffer_.get()),
168 0 : r2_(input_buffer_.get() + kKernelSize / 2) {
169 : #if defined(WEBRTC_CPU_DETECTION)
170 : InitializeCPUSpecificFeatures();
171 : RTC_DCHECK(convolve_proc_);
172 : #endif
173 0 : RTC_DCHECK_GT(request_frames_, 0);
174 0 : Flush();
175 0 : RTC_DCHECK_GT(block_size_, kKernelSize);
176 :
177 0 : memset(kernel_storage_.get(), 0,
178 0 : sizeof(*kernel_storage_.get()) * kKernelStorageSize);
179 0 : memset(kernel_pre_sinc_storage_.get(), 0,
180 0 : sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);
181 0 : memset(kernel_window_storage_.get(), 0,
182 0 : sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);
183 :
184 0 : InitializeKernel();
185 0 : }
186 :
187 0 : SincResampler::~SincResampler() {}
188 :
189 0 : void SincResampler::UpdateRegions(bool second_load) {
190 : // Setup various region pointers in the buffer (see diagram above). If we're
191 : // on the second load we need to slide r0_ to the right by kKernelSize / 2.
192 0 : r0_ = input_buffer_.get() + (second_load ? kKernelSize : kKernelSize / 2);
193 0 : r3_ = r0_ + request_frames_ - kKernelSize;
194 0 : r4_ = r0_ + request_frames_ - kKernelSize / 2;
195 0 : block_size_ = r4_ - r2_;
196 :
197 : // r1_ at the beginning of the buffer.
198 0 : RTC_DCHECK_EQ(r1_, input_buffer_.get());
199 : // r1_ left of r2_, r4_ left of r3_ and size correct.
200 0 : RTC_DCHECK_EQ(r2_ - r1_, r4_ - r3_);
201 : // r2_ left of r3.
202 0 : RTC_DCHECK_LT(r2_, r3_);
203 0 : }
204 :
205 0 : void SincResampler::InitializeKernel() {
206 : // Blackman window parameters.
207 : static const double kAlpha = 0.16;
208 : static const double kA0 = 0.5 * (1.0 - kAlpha);
209 : static const double kA1 = 0.5;
210 : static const double kA2 = 0.5 * kAlpha;
211 :
212 : // Generates a set of windowed sinc() kernels.
213 : // We generate a range of sub-sample offsets from 0.0 to 1.0.
214 0 : const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_);
215 0 : for (size_t offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) {
216 : const float subsample_offset =
217 0 : static_cast<float>(offset_idx) / kKernelOffsetCount;
218 :
219 0 : for (size_t i = 0; i < kKernelSize; ++i) {
220 0 : const size_t idx = i + offset_idx * kKernelSize;
221 0 : const float pre_sinc = static_cast<float>(M_PI *
222 0 : (static_cast<int>(i) - static_cast<int>(kKernelSize / 2) -
223 0 : subsample_offset));
224 0 : kernel_pre_sinc_storage_[idx] = pre_sinc;
225 :
226 : // Compute Blackman window, matching the offset of the sinc().
227 0 : const float x = (i - subsample_offset) / kKernelSize;
228 0 : const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) +
229 0 : kA2 * cos(4.0 * M_PI * x));
230 0 : kernel_window_storage_[idx] = window;
231 :
232 : // Compute the sinc with offset, then window the sinc() function and store
233 : // at the correct offset.
234 0 : kernel_storage_[idx] = static_cast<float>(window *
235 0 : ((pre_sinc == 0) ?
236 : sinc_scale_factor :
237 0 : (sin(sinc_scale_factor * pre_sinc) / pre_sinc)));
238 : }
239 : }
240 0 : }
241 :
242 0 : void SincResampler::SetRatio(double io_sample_rate_ratio) {
243 0 : if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) <
244 0 : std::numeric_limits<double>::epsilon()) {
245 0 : return;
246 : }
247 :
248 0 : io_sample_rate_ratio_ = io_sample_rate_ratio;
249 :
250 : // Optimize reinitialization by reusing values which are independent of
251 : // |sinc_scale_factor|. Provides a 3x speedup.
252 0 : const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_);
253 0 : for (size_t offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) {
254 0 : for (size_t i = 0; i < kKernelSize; ++i) {
255 0 : const size_t idx = i + offset_idx * kKernelSize;
256 0 : const float window = kernel_window_storage_[idx];
257 0 : const float pre_sinc = kernel_pre_sinc_storage_[idx];
258 :
259 0 : kernel_storage_[idx] = static_cast<float>(window *
260 0 : ((pre_sinc == 0) ?
261 : sinc_scale_factor :
262 0 : (sin(sinc_scale_factor * pre_sinc) / pre_sinc)));
263 : }
264 : }
265 : }
266 :
267 0 : void SincResampler::Resample(size_t frames, float* destination) {
268 0 : size_t remaining_frames = frames;
269 :
270 : // Step (1) -- Prime the input buffer at the start of the input stream.
271 0 : if (!buffer_primed_ && remaining_frames) {
272 0 : read_cb_->Run(request_frames_, r0_);
273 0 : buffer_primed_ = true;
274 : }
275 :
276 : // Step (2) -- Resample! const what we can outside of the loop for speed. It
277 : // actually has an impact on ARM performance. See inner loop comment below.
278 0 : const double current_io_ratio = io_sample_rate_ratio_;
279 0 : const float* const kernel_ptr = kernel_storage_.get();
280 0 : while (remaining_frames) {
281 : // |i| may be negative if the last Resample() call ended on an iteration
282 : // that put |virtual_source_idx_| over the limit.
283 : //
284 : // Note: The loop construct here can severely impact performance on ARM
285 : // or when built with clang. See https://codereview.chromium.org/18566009/
286 0 : for (int i = static_cast<int>(
287 0 : ceil((block_size_ - virtual_source_idx_) / current_io_ratio));
288 0 : i > 0; --i) {
289 0 : RTC_DCHECK_LT(virtual_source_idx_, block_size_);
290 :
291 : // |virtual_source_idx_| lies in between two kernel offsets so figure out
292 : // what they are.
293 0 : const int source_idx = static_cast<int>(virtual_source_idx_);
294 0 : const double subsample_remainder = virtual_source_idx_ - source_idx;
295 :
296 : const double virtual_offset_idx =
297 0 : subsample_remainder * kKernelOffsetCount;
298 0 : const int offset_idx = static_cast<int>(virtual_offset_idx);
299 :
300 : // We'll compute "convolutions" for the two kernels which straddle
301 : // |virtual_source_idx_|.
302 0 : const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
303 0 : const float* const k2 = k1 + kKernelSize;
304 :
305 : // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be
306 : // true so long as kKernelSize is a multiple of 16.
307 0 : RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 16);
308 0 : RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 16);
309 :
310 : // Initialize input pointer based on quantized |virtual_source_idx_|.
311 0 : const float* const input_ptr = r1_ + source_idx;
312 :
313 : // Figure out how much to weight each kernel's "convolution".
314 : const double kernel_interpolation_factor =
315 0 : virtual_offset_idx - offset_idx;
316 0 : *destination++ = CONVOLVE_FUNC(
317 : input_ptr, k1, k2, kernel_interpolation_factor);
318 :
319 : // Advance the virtual index.
320 0 : virtual_source_idx_ += current_io_ratio;
321 :
322 0 : if (!--remaining_frames)
323 0 : return;
324 : }
325 :
326 : // Wrap back around to the start.
327 0 : virtual_source_idx_ -= block_size_;
328 :
329 : // Step (3) -- Copy r3_, r4_ to r1_, r2_.
330 : // This wraps the last input frames back to the start of the buffer.
331 0 : memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize);
332 :
333 : // Step (4) -- Reinitialize regions if necessary.
334 0 : if (r0_ == r2_)
335 0 : UpdateRegions(true);
336 :
337 : // Step (5) -- Refresh the buffer with more input.
338 0 : read_cb_->Run(request_frames_, r0_);
339 : }
340 : }
341 :
342 : #undef CONVOLVE_FUNC
343 :
344 0 : size_t SincResampler::ChunkSize() const {
345 0 : return static_cast<size_t>(block_size_ / io_sample_rate_ratio_);
346 : }
347 :
348 0 : void SincResampler::Flush() {
349 0 : virtual_source_idx_ = 0;
350 0 : buffer_primed_ = false;
351 0 : memset(input_buffer_.get(), 0,
352 0 : sizeof(*input_buffer_.get()) * input_buffer_size_);
353 0 : UpdateRegions(false);
354 0 : }
355 :
356 0 : float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
357 : const float* k2,
358 : double kernel_interpolation_factor) {
359 0 : float sum1 = 0;
360 0 : float sum2 = 0;
361 :
362 : // Generate a single output sample. Unrolling this loop hurt performance in
363 : // local testing.
364 0 : size_t n = kKernelSize;
365 0 : while (n--) {
366 0 : sum1 += *input_ptr * *k1++;
367 0 : sum2 += *input_ptr++ * *k2++;
368 : }
369 :
370 : // Linearly interpolate the two "convolutions".
371 0 : return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
372 0 : kernel_interpolation_factor * sum2);
373 : }
374 :
375 : } // namespace webrtc
|