Line data Source code
1 : /*
2 : * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"

#include <math.h>
#include <stdio.h>
#include <string.h>

#include "webrtc/base/checks.h"
#include "webrtc/common_audio/fft4g.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
extern "C" {
#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
}
#include "webrtc/modules/include/module_common_types.h"
28 :
29 : namespace webrtc {
30 :
31 : // The following structures are declared anonymous in iSAC's structs.h. To
32 : // forward declare them, we use this derived class trick.
33 : struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
34 : struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
35 :
36 : static const float kFrequencyResolution =
37 : kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
38 : static const int kSilenceRms = 5;
39 :
40 : // TODO(turajs): Make a Create or Init for VadAudioProc.
41 0 : VadAudioProc::VadAudioProc()
42 : : audio_buffer_(),
43 : num_buffer_samples_(kNumPastSignalSamples),
44 : log_old_gain_(-2),
45 : old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).
46 0 : pitch_analysis_handle_(new PitchAnalysisStruct),
47 0 : pre_filter_handle_(new PreFiltBankstr),
48 : high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
49 : kFilterOrder,
50 : kCoeffDenominator,
51 0 : kFilterOrder)) {
52 : static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
53 : sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
54 : "lpc analysis window incorrect size");
55 : static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
56 : "correlation weight incorrect size");
57 :
58 : // TODO(turajs): Are we doing too much in the constructor?
59 : float data[kDftSize];
60 : // Make FFT to initialize.
61 0 : ip_[0] = 0;
62 0 : WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
63 : // TODO(turajs): Need to initialize high-pass filter.
64 :
65 : // Initialize iSAC components.
66 0 : WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
67 0 : WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
68 0 : }
69 :
70 0 : VadAudioProc::~VadAudioProc() {
71 0 : }
72 :
73 0 : void VadAudioProc::ResetBuffer() {
74 0 : memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
75 0 : sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
76 0 : num_buffer_samples_ = kNumPastSignalSamples;
77 0 : }
78 :
79 0 : int VadAudioProc::ExtractFeatures(const int16_t* frame,
80 : size_t length,
81 : AudioFeatures* features) {
82 0 : features->num_frames = 0;
83 0 : if (length != kNumSubframeSamples) {
84 0 : return -1;
85 : }
86 :
87 : // High-pass filter to remove the DC component and very low frequency content.
88 : // We have experienced that this high-pass filtering improves voice/non-voiced
89 : // classification.
90 0 : if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
91 0 : &audio_buffer_[num_buffer_samples_]) != 0) {
92 0 : return -1;
93 : }
94 :
95 0 : num_buffer_samples_ += kNumSubframeSamples;
96 0 : if (num_buffer_samples_ < kBufferLength) {
97 0 : return 0;
98 : }
99 0 : RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
100 0 : features->num_frames = kNum10msSubframes;
101 0 : features->silence = false;
102 :
103 0 : Rms(features->rms, kMaxNumFrames);
104 0 : for (size_t i = 0; i < kNum10msSubframes; ++i) {
105 0 : if (features->rms[i] < kSilenceRms) {
106 : // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
107 : // Bail out here instead.
108 0 : features->silence = true;
109 0 : ResetBuffer();
110 0 : return 0;
111 : }
112 : }
113 :
114 0 : PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
115 0 : kMaxNumFrames);
116 0 : FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
117 0 : ResetBuffer();
118 0 : return 0;
119 : }
120 :
121 : // Computes |kLpcOrder + 1| correlation coefficients.
122 0 : void VadAudioProc::SubframeCorrelation(double* corr,
123 : size_t length_corr,
124 : size_t subframe_index) {
125 0 : RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
126 : double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
127 0 : size_t buffer_index = subframe_index * kNumSubframeSamples;
128 :
129 0 : for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
130 0 : windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
131 :
132 : WebRtcIsac_AutoCorr(corr, windowed_audio,
133 0 : kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
134 0 : }
135 :
136 : // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
137 : // The analysis window is 15 ms long and it is centered on the first half of
138 : // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
139 : // first half of each 10 ms subframe.
140 0 : void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
141 0 : RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
142 : double corr[kLpcOrder + 1];
143 : double reflec_coeff[kLpcOrder];
144 0 : for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
145 0 : i++, offset_lpc += kLpcOrder + 1) {
146 0 : SubframeCorrelation(corr, kLpcOrder + 1, i);
147 0 : corr[0] *= 1.0001;
148 : // This makes Lev-Durb a bit more stable.
149 0 : for (size_t k = 0; k < kLpcOrder + 1; k++) {
150 0 : corr[k] *= kCorrWeight[k];
151 : }
152 0 : WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
153 : }
154 0 : }
155 :
156 : // Fit a second order curve to these 3 points and find the location of the
157 : // extremum. The points are inverted before curve fitting.
158 0 : static float QuadraticInterpolation(float prev_val,
159 : float curr_val,
160 : float next_val) {
161 : // Doing the interpolation in |1 / A(z)|^2.
162 0 : float fractional_index = 0;
163 0 : next_val = 1.0f / next_val;
164 0 : prev_val = 1.0f / prev_val;
165 0 : curr_val = 1.0f / curr_val;
166 :
167 0 : fractional_index =
168 0 : -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
169 0 : RTC_DCHECK_LT(fabs(fractional_index), 1);
170 0 : return fractional_index;
171 : }
172 :
173 : // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
174 : // of the input signal. The local maximum of the spectral envelope corresponds
175 : // with the local minimum of A(z). It saves complexity, as we save one
176 : // inversion. Furthermore, we find the first local maximum of magnitude squared,
177 : // to save on one square root.
178 0 : void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
179 : size_t length_f_peak) {
180 0 : RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
181 : double lpc[kNum10msSubframes * (kLpcOrder + 1)];
182 : // For all sub-frames.
183 0 : GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
184 :
185 0 : const size_t kNumDftCoefficients = kDftSize / 2 + 1;
186 : float data[kDftSize];
187 :
188 0 : for (size_t i = 0; i < kNum10msSubframes; i++) {
189 : // Convert to float with zero pad.
190 0 : memset(data, 0, sizeof(data));
191 0 : for (size_t n = 0; n < kLpcOrder + 1; n++) {
192 0 : data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
193 : }
194 : // Transform to frequency domain.
195 0 : WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
196 :
197 0 : size_t index_peak = 0;
198 0 : float prev_magn_sqr = data[0] * data[0];
199 0 : float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
200 : float next_magn_sqr;
201 0 : bool found_peak = false;
202 0 : for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
203 0 : next_magn_sqr =
204 0 : data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
205 0 : if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
206 0 : found_peak = true;
207 0 : index_peak = n - 1;
208 0 : break;
209 : }
210 0 : prev_magn_sqr = curr_magn_sqr;
211 0 : curr_magn_sqr = next_magn_sqr;
212 : }
213 0 : float fractional_index = 0;
214 0 : if (!found_peak) {
215 : // Checking if |kNumDftCoefficients - 1| is the local minimum.
216 0 : next_magn_sqr = data[1] * data[1];
217 0 : if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
218 0 : index_peak = kNumDftCoefficients - 1;
219 : }
220 : } else {
221 : // A peak is found, do a simple quadratic interpolation to get a more
222 : // accurate estimate of the peak location.
223 : fractional_index =
224 0 : QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
225 : }
226 0 : f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
227 : }
228 0 : }
229 :
230 : // Using iSAC functions to estimate pitch gains & lags.
231 0 : void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
232 : double* pitch_lags_hz,
233 : size_t length) {
234 : // TODO(turajs): This can be "imported" from iSAC & and the next two
235 : // constants.
236 0 : RTC_DCHECK_GE(length, kNum10msSubframes);
237 0 : const int kNumPitchSubframes = 4;
238 : double gains[kNumPitchSubframes];
239 : double lags[kNumPitchSubframes];
240 :
241 0 : const int kNumSubbandFrameSamples = 240;
242 0 : const int kNumLookaheadSamples = 24;
243 :
244 : float lower[kNumSubbandFrameSamples];
245 : float upper[kNumSubbandFrameSamples];
246 : double lower_lookahead[kNumSubbandFrameSamples];
247 : double upper_lookahead[kNumSubbandFrameSamples];
248 : double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
249 : kNumLookaheadSamples];
250 :
251 : // Split signal to lower and upper bands
252 0 : WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
253 : upper, lower_lookahead, upper_lookahead,
254 0 : pre_filter_handle_.get());
255 : WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
256 0 : pitch_analysis_handle_.get(), lags, gains);
257 :
258 : // Lags are computed on lower-band signal with sampling rate half of the
259 : // input signal.
260 0 : GetSubframesPitchParameters(
261 : kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
262 0 : &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
263 0 : }
264 :
265 0 : void VadAudioProc::Rms(double* rms, size_t length_rms) {
266 0 : RTC_DCHECK_GE(length_rms, kNum10msSubframes);
267 0 : size_t offset = kNumPastSignalSamples;
268 0 : for (size_t i = 0; i < kNum10msSubframes; i++) {
269 0 : rms[i] = 0;
270 0 : for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
271 0 : rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
272 0 : rms[i] = sqrt(rms[i] / kNumSubframeSamples);
273 : }
274 0 : }
275 :
276 : } // namespace webrtc
|