Line data Source code
1 : /*
2 : * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "webrtc/modules/audio_coding/neteq/time_stretch.h"
12 :
13 : #include <algorithm> // min, max
14 : #include <memory>
15 :
16 : #include "webrtc/base/safe_conversions.h"
17 : #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
18 : #include "webrtc/modules/audio_coding/neteq/background_noise.h"
19 : #include "webrtc/modules/audio_coding/neteq/cross_correlation.h"
20 : #include "webrtc/modules/audio_coding/neteq/dsp_helper.h"
21 :
22 : namespace webrtc {
23 :
24 0 : TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
25 : size_t input_len,
26 : bool fast_mode,
27 : AudioMultiVector* output,
28 : size_t* length_change_samples) {
29 : // Pre-calculate common multiplication with |fs_mult_|.
30 : size_t fs_mult_120 =
31 0 : static_cast<size_t>(fs_mult_ * 120); // Corresponds to 15 ms.
32 :
33 : const int16_t* signal;
34 0 : std::unique_ptr<int16_t[]> signal_array;
35 : size_t signal_len;
36 0 : if (num_channels_ == 1) {
37 0 : signal = input;
38 0 : signal_len = input_len;
39 : } else {
40 : // We want |signal| to be only the first channel of |input|, which is
41 : // interleaved. Thus, we take the first sample, skip forward |num_channels|
42 : // samples, and continue like that.
43 0 : signal_len = input_len / num_channels_;
44 0 : signal_array.reset(new int16_t[signal_len]);
45 0 : signal = signal_array.get();
46 0 : size_t j = master_channel_;
47 0 : for (size_t i = 0; i < signal_len; ++i) {
48 0 : signal_array[i] = input[j];
49 0 : j += num_channels_;
50 : }
51 : }
52 :
53 : // Find maximum absolute value of input signal.
54 0 : max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);
55 :
56 : // Downsample to 4 kHz sample rate and calculate auto-correlation.
57 0 : DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
58 0 : sample_rate_hz_, true /* compensate delay*/,
59 0 : downsampled_input_);
60 0 : AutoCorrelation();
61 :
62 : // Find the strongest correlation peak.
63 : static const size_t kNumPeaks = 1;
64 : size_t peak_index;
65 : int16_t peak_value;
66 0 : DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
67 0 : fs_mult_, &peak_index, &peak_value);
68 : // Assert that |peak_index| stays within boundaries.
69 0 : assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);
70 :
71 : // Compensate peak_index for displaced starting position. The displacement
72 : // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
73 : // domain, while the |peak_index| is in the original sample rate; hence, the
74 : // multiplication by fs_mult_ * 2.
75 0 : peak_index += kMinLag * fs_mult_ * 2;
76 : // Assert that |peak_index| stays within boundaries.
77 0 : assert(peak_index >= static_cast<size_t>(20 * fs_mult_));
78 0 : assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
79 :
80 : // Calculate scaling to ensure that |peak_index| samples can be square-summed
81 : // without overflowing.
82 0 : int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
83 0 : WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));
84 0 : scaling = std::max(0, scaling);
85 :
86 : // |vec1| starts at 15 ms minus one pitch period.
87 0 : const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
88 : // |vec2| start at 15 ms.
89 0 : const int16_t* vec2 = &signal[fs_mult_120];
90 : // Calculate energies for |vec1| and |vec2|, assuming they both contain
91 : // |peak_index| samples.
92 : int32_t vec1_energy =
93 0 : WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
94 : int32_t vec2_energy =
95 0 : WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
96 :
97 : // Calculate cross-correlation between |vec1| and |vec2|.
98 : int32_t cross_corr =
99 0 : WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
100 :
101 : // Check if the signal seems to be active speech or not (simple VAD).
102 0 : bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index,
103 0 : scaling);
104 :
105 : int16_t best_correlation;
106 0 : if (!active_speech) {
107 0 : SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
108 : } else {
109 : // Calculate correlation:
110 : // cross_corr / sqrt(vec1_energy * vec2_energy).
111 :
112 : // Start with calculating scale values.
113 0 : int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
114 0 : int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
115 :
116 : // Make sure total scaling is even (to simplify scale factor after sqrt).
117 0 : if ((energy1_scale + energy2_scale) & 1) {
118 : // The sum is odd.
119 0 : energy1_scale += 1;
120 : }
121 :
122 : // Scale energies to int16_t.
123 : int16_t vec1_energy_int16 =
124 0 : static_cast<int16_t>(vec1_energy >> energy1_scale);
125 : int16_t vec2_energy_int16 =
126 0 : static_cast<int16_t>(vec2_energy >> energy2_scale);
127 :
128 : // Calculate square-root of energy product.
129 0 : int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 *
130 0 : vec2_energy_int16);
131 :
132 : // Calculate cross_corr / sqrt(en1*en2) in Q14.
133 0 : int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
134 0 : cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
135 0 : cross_corr = std::max(0, cross_corr); // Don't use if negative.
136 0 : best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
137 : // Make sure |best_correlation| is no larger than 1 in Q14.
138 0 : best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
139 : }
140 :
141 :
142 : // Check accelerate criteria and stretch the signal.
143 : ReturnCodes return_value =
144 0 : CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
145 0 : active_speech, fast_mode, output);
146 0 : switch (return_value) {
147 : case kSuccess:
148 0 : *length_change_samples = peak_index;
149 0 : break;
150 : case kSuccessLowEnergy:
151 0 : *length_change_samples = peak_index;
152 0 : break;
153 : case kNoStretch:
154 : case kError:
155 0 : *length_change_samples = 0;
156 0 : break;
157 : }
158 0 : return return_value;
159 : }
160 :
161 0 : void TimeStretch::AutoCorrelation() {
162 : // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
163 : int32_t auto_corr[kCorrelationLen];
164 : CrossCorrelationWithAutoShift(
165 0 : &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
166 0 : kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);
167 :
168 : // Normalize correlation to 14 bits and write to |auto_correlation_|.
169 0 : int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
170 0 : int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
171 0 : WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
172 0 : auto_corr, scaling);
173 0 : }
174 :
175 0 : bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy,
176 : size_t peak_index, int scaling) const {
177 : // Check if the signal seems to be active speech or not (simple VAD).
178 : // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
179 : // 8 * background_noise_energy, then we say that the signal contains no
180 : // active speech.
181 : // Rewrite the inequality as:
182 : // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
183 : // The two sides of the inequality will be denoted |left_side| and
184 : // |right_side|.
185 0 : int32_t left_side = (vec1_energy + vec2_energy) / 16;
186 : int32_t right_side;
187 0 : if (background_noise_.initialized()) {
188 0 : right_side = background_noise_.Energy(master_channel_);
189 : } else {
190 : // If noise parameters have not been estimated, use a fixed threshold.
191 0 : right_side = 75000;
192 : }
193 0 : int right_scale = 16 - WebRtcSpl_NormW32(right_side);
194 0 : right_scale = std::max(0, right_scale);
195 0 : left_side = left_side >> right_scale;
196 0 : right_side =
197 0 : rtc::checked_cast<int32_t>(peak_index) * (right_side >> right_scale);
198 :
199 : // Scale |left_side| properly before comparing with |right_side|.
200 : // (|scaling| is the scale factor before energy calculation, thus the scale
201 : // factor for the energy is 2 * scaling.)
202 0 : if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
203 : // Cannot scale only |left_side|, must scale |right_side| too.
204 0 : int temp_scale = WebRtcSpl_NormW32(left_side);
205 0 : left_side = left_side << temp_scale;
206 0 : right_side = right_side >> (2 * scaling - temp_scale);
207 : } else {
208 0 : left_side = left_side << 2 * scaling;
209 : }
210 0 : return left_side > right_side;
211 : }
212 :
213 : } // namespace webrtc
|