Line data Source code
1 : /*
2 : * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "webrtc/common_audio/vad/vad_filterbank.h"
12 :
13 : #include "webrtc/base/checks.h"
14 : #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
15 : #include "webrtc/typedefs.h"
16 :
17 : // Constants used in LogOfEnergy().
18 : static const int16_t kLogConst = 24660; // 160*log10(2) in Q9.
19 : static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10
20 :
21 : // Coefficients used by HighPassFilter, Q14.
22 : static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
23 : static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 };
24 :
25 : // Allpass filter coefficients, upper and lower, in Q15.
26 : // Upper: 0.64, Lower: 0.17
27 : static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };
28 :
29 : // Adjustment for division with two in SplitFilter.
30 : static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };
31 :
32 : // High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
33 : // sampled at 500 Hz.
34 : //
35 : // - data_in [i] : Input audio data sampled at 500 Hz.
36 : // - data_length [i] : Length of input and output data.
37 : // - filter_state [i/o] : State of the filter.
38 : // - data_out [o] : Output audio data in the frequency interval
39 : // 80 - 250 Hz.
40 0 : static void HighPassFilter(const int16_t* data_in, size_t data_length,
41 : int16_t* filter_state, int16_t* data_out) {
42 : size_t i;
43 0 : const int16_t* in_ptr = data_in;
44 0 : int16_t* out_ptr = data_out;
45 0 : int32_t tmp32 = 0;
46 :
47 :
48 : // The sum of the absolute values of the impulse response:
49 : // The zero/pole-filter has a max amplification of a single sample of: 1.4546
50 : // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
51 : // The all-zero section has a max amplification of a single sample of: 1.6189
52 : // Impulse response: 0.4047 -0.8094 0.4047 0 0 0
53 : // The all-pole section has a max amplification of a single sample of: 1.9931
54 : // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
55 :
56 0 : for (i = 0; i < data_length; i++) {
57 : // All-zero section (filter coefficients in Q14).
58 0 : tmp32 = kHpZeroCoefs[0] * *in_ptr;
59 0 : tmp32 += kHpZeroCoefs[1] * filter_state[0];
60 0 : tmp32 += kHpZeroCoefs[2] * filter_state[1];
61 0 : filter_state[1] = filter_state[0];
62 0 : filter_state[0] = *in_ptr++;
63 :
64 : // All-pole section (filter coefficients in Q14).
65 0 : tmp32 -= kHpPoleCoefs[1] * filter_state[2];
66 0 : tmp32 -= kHpPoleCoefs[2] * filter_state[3];
67 0 : filter_state[3] = filter_state[2];
68 0 : filter_state[2] = (int16_t) (tmp32 >> 14);
69 0 : *out_ptr++ = filter_state[2];
70 : }
71 0 : }
72 :
73 : // All pass filtering of |data_in|, used before splitting the signal into two
74 : // frequency bands (low pass vs high pass).
75 : // Note that |data_in| and |data_out| can NOT correspond to the same address.
76 : //
77 : // - data_in [i] : Input audio signal given in Q0.
78 : // - data_length [i] : Length of input and output data.
79 : // - filter_coefficient [i] : Given in Q15.
80 : // - filter_state [i/o] : State of the filter given in Q(-1).
81 : // - data_out [o] : Output audio signal given in Q(-1).
82 0 : static void AllPassFilter(const int16_t* data_in, size_t data_length,
83 : int16_t filter_coefficient, int16_t* filter_state,
84 : int16_t* data_out) {
85 : // The filter can only cause overflow (in the w16 output variable)
86 : // if more than 4 consecutive input numbers are of maximum value and
87 : // has the the same sign as the impulse responses first taps.
88 : // First 6 taps of the impulse response:
89 : // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
90 :
91 : size_t i;
92 0 : int16_t tmp16 = 0;
93 0 : int32_t tmp32 = 0;
94 0 : int32_t state32 = ((int32_t) (*filter_state) << 16); // Q15
95 :
96 0 : for (i = 0; i < data_length; i++) {
97 0 : tmp32 = state32 + filter_coefficient * *data_in;
98 0 : tmp16 = (int16_t) (tmp32 >> 16); // Q(-1)
99 0 : *data_out++ = tmp16;
100 0 : state32 = (*data_in << 14) - filter_coefficient * tmp16; // Q14
101 0 : state32 <<= 1; // Q15.
102 0 : data_in += 2;
103 : }
104 :
105 0 : *filter_state = (int16_t) (state32 >> 16); // Q(-1)
106 0 : }
107 :
108 : // Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
109 : // an upper (high pass) part and a lower (low pass) part respectively.
110 : //
111 : // - data_in [i] : Input audio data to be split into two frequency bands.
112 : // - data_length [i] : Length of |data_in|.
113 : // - upper_state [i/o] : State of the upper filter, given in Q(-1).
114 : // - lower_state [i/o] : State of the lower filter, given in Q(-1).
115 : // - hp_data_out [o] : Output audio data of the upper half of the spectrum.
116 : // The length is |data_length| / 2.
117 : // - lp_data_out [o] : Output audio data of the lower half of the spectrum.
118 : // The length is |data_length| / 2.
119 0 : static void SplitFilter(const int16_t* data_in, size_t data_length,
120 : int16_t* upper_state, int16_t* lower_state,
121 : int16_t* hp_data_out, int16_t* lp_data_out) {
122 : size_t i;
123 0 : size_t half_length = data_length >> 1; // Downsampling by 2.
124 : int16_t tmp_out;
125 :
126 : // All-pass filtering upper branch.
127 0 : AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
128 : hp_data_out);
129 :
130 : // All-pass filtering lower branch.
131 0 : AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
132 : lp_data_out);
133 :
134 : // Make LP and HP signals.
135 0 : for (i = 0; i < half_length; i++) {
136 0 : tmp_out = *hp_data_out;
137 0 : *hp_data_out++ -= *lp_data_out;
138 0 : *lp_data_out++ += tmp_out;
139 : }
140 0 : }
141 :
142 : // Calculates the energy of |data_in| in dB, and also updates an overall
143 : // |total_energy| if necessary.
144 : //
145 : // - data_in [i] : Input audio data for energy calculation.
146 : // - data_length [i] : Length of input data.
147 : // - offset [i] : Offset value added to |log_energy|.
148 : // - total_energy [i/o] : An external energy updated with the energy of
149 : // |data_in|.
150 : // NOTE: |total_energy| is only updated if
151 : // |total_energy| <= |kMinEnergy|.
152 : // - log_energy [o] : 10 * log10("energy of |data_in|") given in Q4.
153 0 : static void LogOfEnergy(const int16_t* data_in, size_t data_length,
154 : int16_t offset, int16_t* total_energy,
155 : int16_t* log_energy) {
156 : // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
157 0 : int tot_rshifts = 0;
158 : // The |energy| will be normalized to 15 bits. We use unsigned integer because
159 : // we eventually will mask out the fractional part.
160 0 : uint32_t energy = 0;
161 :
162 0 : RTC_DCHECK(data_in);
163 0 : RTC_DCHECK_GT(data_length, 0);
164 :
165 0 : energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
166 : &tot_rshifts);
167 :
168 0 : if (energy != 0) {
169 : // By construction, normalizing to 15 bits is equivalent with 17 leading
170 : // zeros of an unsigned 32 bit value.
171 0 : int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
172 : // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
173 : // (14 << 10), which is what we initialize |log2_energy| with. For a more
174 : // detailed derivations, see below.
175 0 : int16_t log2_energy = kLogEnergyIntPart;
176 :
177 0 : tot_rshifts += normalizing_rshifts;
178 : // Normalize |energy| to 15 bits.
179 : // |tot_rshifts| is now the total number of right shifts performed on
180 : // |energy| after normalization. This means that |energy| is in
181 : // Q(-tot_rshifts).
182 0 : if (normalizing_rshifts < 0) {
183 0 : energy <<= -normalizing_rshifts;
184 : } else {
185 0 : energy >>= normalizing_rshifts;
186 : }
187 :
188 : // Calculate the energy of |data_in| in dB, in Q4.
189 : //
190 : // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
191 : // 160 * log10(|energy| * 2^|tot_rshifts|) =
192 : // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
193 : // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
194 : // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
195 : // |kLogConst| * (|log2_energy| + |tot_rshifts|)
196 : //
197 : // We know by construction that |energy| is normalized to 15 bits. Hence,
198 : // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
199 : // Further, we'd like |log2_energy| in Q10
200 : // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
201 : // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
202 : // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
203 : // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
204 : // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
205 : //
206 : // Note that frac_Q15 = (|energy| & 0x00003FFF)
207 :
208 : // Calculate and add the fractional part to |log2_energy|.
209 0 : log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);
210 :
211 : // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
212 : // Note that we in our derivation above have accounted for an output in Q4.
213 0 : *log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
214 0 : ((tot_rshifts * kLogConst) >> 9));
215 :
216 0 : if (*log_energy < 0) {
217 0 : *log_energy = 0;
218 : }
219 : } else {
220 0 : *log_energy = offset;
221 0 : return;
222 : }
223 :
224 0 : *log_energy += offset;
225 :
226 : // Update the approximate |total_energy| with the energy of |data_in|, if
227 : // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
228 : // energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
229 0 : if (*total_energy <= kMinEnergy) {
230 0 : if (tot_rshifts >= 0) {
231 : // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
232 : // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
233 0 : *total_energy += kMinEnergy + 1;
234 : } else {
235 : // By construction |energy| is represented by 15 bits, hence any number of
236 : // right shifted |energy| will fit in an int16_t. In addition, adding the
237 : // value to |total_energy| is wrap around safe as long as
238 : // |kMinEnergy| < 8192.
239 0 : *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0.
240 : }
241 : }
242 : }
243 :
244 0 : int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
245 : size_t data_length, int16_t* features) {
246 0 : int16_t total_energy = 0;
247 : // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
248 : // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
249 : // have at most 120 samples after the first split and at most 60 samples after
250 : // the second split.
251 : int16_t hp_120[120], lp_120[120];
252 : int16_t hp_60[60], lp_60[60];
253 0 : const size_t half_data_length = data_length >> 1;
254 0 : size_t length = half_data_length; // |data_length| / 2, corresponds to
255 : // bandwidth = 2000 Hz after downsampling.
256 :
257 : // Initialize variables for the first SplitFilter().
258 0 : int frequency_band = 0;
259 0 : const int16_t* in_ptr = data_in; // [0 - 4000] Hz.
260 0 : int16_t* hp_out_ptr = hp_120; // [2000 - 4000] Hz.
261 0 : int16_t* lp_out_ptr = lp_120; // [0 - 2000] Hz.
262 :
263 0 : RTC_DCHECK_LE(data_length, 240);
264 : RTC_DCHECK_LT(4, kNumChannels - 1); // Checking maximum |frequency_band|.
265 :
266 : // Split at 2000 Hz and downsample.
267 0 : SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
268 : &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
269 :
270 : // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
271 0 : frequency_band = 1;
272 0 : in_ptr = hp_120; // [2000 - 4000] Hz.
273 0 : hp_out_ptr = hp_60; // [3000 - 4000] Hz.
274 0 : lp_out_ptr = lp_60; // [2000 - 3000] Hz.
275 0 : SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
276 : &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
277 :
278 : // Energy in 3000 Hz - 4000 Hz.
279 0 : length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
280 :
281 0 : LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
282 :
283 : // Energy in 2000 Hz - 3000 Hz.
284 0 : LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
285 :
286 : // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
287 0 : frequency_band = 2;
288 0 : in_ptr = lp_120; // [0 - 2000] Hz.
289 0 : hp_out_ptr = hp_60; // [1000 - 2000] Hz.
290 0 : lp_out_ptr = lp_60; // [0 - 1000] Hz.
291 0 : length = half_data_length; // |data_length| / 2 <=> bandwidth = 2000 Hz.
292 0 : SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
293 : &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
294 :
295 : // Energy in 1000 Hz - 2000 Hz.
296 0 : length >>= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
297 0 : LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
298 :
299 : // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
300 0 : frequency_band = 3;
301 0 : in_ptr = lp_60; // [0 - 1000] Hz.
302 0 : hp_out_ptr = hp_120; // [500 - 1000] Hz.
303 0 : lp_out_ptr = lp_120; // [0 - 500] Hz.
304 0 : SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
305 : &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
306 :
307 : // Energy in 500 Hz - 1000 Hz.
308 0 : length >>= 1; // |data_length| / 8 <=> bandwidth = 500 Hz.
309 0 : LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
310 :
311 : // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
312 0 : frequency_band = 4;
313 0 : in_ptr = lp_120; // [0 - 500] Hz.
314 0 : hp_out_ptr = hp_60; // [250 - 500] Hz.
315 0 : lp_out_ptr = lp_60; // [0 - 250] Hz.
316 0 : SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
317 : &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
318 :
319 : // Energy in 250 Hz - 500 Hz.
320 0 : length >>= 1; // |data_length| / 16 <=> bandwidth = 250 Hz.
321 0 : LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
322 :
323 : // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
324 0 : HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);
325 :
326 : // Energy in 80 Hz - 250 Hz.
327 0 : LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
328 :
329 0 : return total_energy;
330 : }
|