Line data Source code
1 : /*
2 : * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "webrtc/common_audio/vad/vad_core.h"
12 :
13 : #include "webrtc/base/sanitizer.h"
14 : #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
15 : #include "webrtc/common_audio/vad/vad_filterbank.h"
16 : #include "webrtc/common_audio/vad/vad_gmm.h"
17 : #include "webrtc/common_audio/vad/vad_sp.h"
18 : #include "webrtc/typedefs.h"
19 :
20 : // Spectrum Weighting
21 : static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
22 : static const int16_t kNoiseUpdateConst = 655; // Q15
23 : static const int16_t kSpeechUpdateConst = 6554; // Q15
24 : static const int16_t kBackEta = 154; // Q8
25 : // Minimum difference between the two models, Q5
26 : static const int16_t kMinimumDifference[kNumChannels] = {
27 : 544, 544, 576, 576, 576, 576 };
28 : // Upper limit of mean value for speech model, Q7
29 : static const int16_t kMaximumSpeech[kNumChannels] = {
30 : 11392, 11392, 11520, 11520, 11520, 11520 };
31 : // Minimum value for mean value
32 : static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
33 : // Upper limit of mean value for noise model, Q7
34 : static const int16_t kMaximumNoise[kNumChannels] = {
35 : 9216, 9088, 8960, 8832, 8704, 8576 };
36 : // Start values for the Gaussian models, Q7
37 : // Weights for the two Gaussians for the six channels (noise)
38 : static const int16_t kNoiseDataWeights[kTableSize] = {
39 : 34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
40 : // Weights for the two Gaussians for the six channels (speech)
41 : static const int16_t kSpeechDataWeights[kTableSize] = {
42 : 48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
43 : // Means for the two Gaussians for the six channels (noise)
44 : static const int16_t kNoiseDataMeans[kTableSize] = {
45 : 6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
46 : // Means for the two Gaussians for the six channels (speech)
47 : static const int16_t kSpeechDataMeans[kTableSize] = {
48 : 8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
49 : };
50 : // Stds for the two Gaussians for the six channels (noise)
51 : static const int16_t kNoiseDataStds[kTableSize] = {
52 : 378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
53 : // Stds for the two Gaussians for the six channels (speech)
54 : static const int16_t kSpeechDataStds[kTableSize] = {
55 : 555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
56 :
57 : // Constants used in GmmProbability().
58 : //
59 : // Maximum number of counted speech (VAD = 1) frames in a row.
60 : static const int16_t kMaxSpeechFrames = 6;
61 : // Minimum standard deviation for both speech and noise.
62 : static const int16_t kMinStd = 384;
63 :
64 : // Constants in WebRtcVad_InitCore().
65 : // Default aggressiveness mode.
66 : static const short kDefaultMode = 0;
67 : static const int kInitCheck = 42;
68 :
69 : // Constants used in WebRtcVad_set_mode_core().
70 : //
71 : // Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
72 : //
73 : // Mode 0, Quality.
74 : static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
75 : static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
76 : static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
77 : static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
78 : // Mode 1, Low bitrate.
79 : static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
80 : static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
81 : static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
82 : static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
83 : // Mode 2, Aggressive.
84 : static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
85 : static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
86 : static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
87 : static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
88 : // Mode 3, Very aggressive.
89 : static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
90 : static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
91 : static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
92 : static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
93 :
94 : // Calculates the weighted average w.r.t. number of Gaussians. The |data| are
95 : // updated with an |offset| before averaging.
96 : //
97 : // - data [i/o] : Data to average.
98 : // - offset [i] : An offset added to |data|.
99 : // - weights [i] : Weights used for averaging.
100 : //
101 : // returns : The weighted average.
102 0 : static int32_t WeightedAverage(int16_t* data, int16_t offset,
103 : const int16_t* weights) {
104 : int k;
105 0 : int32_t weighted_average = 0;
106 :
107 0 : for (k = 0; k < kNumGaussians; k++) {
108 0 : data[k * kNumChannels] += offset;
109 0 : weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
110 : }
111 0 : return weighted_average;
112 : }
113 :
114 : // An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
115 : // undefined behavior, so not a good idea; this just makes UBSan ignore the
116 : // violation, so that our old code can continue to do what it's always been
117 : // doing.)
118 0 : static inline int32_t OverflowingMulS16ByS32ToS32(int16_t a, int32_t b)
119 : RTC_NO_SANITIZE("signed-integer-overflow") {
120 0 : return a * b;
121 : }
122 :
123 : // Calculates the probabilities for both speech and background noise using
124 : // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
125 : // type of signal is most probable.
126 : //
127 : // - self [i/o] : Pointer to VAD instance
128 : // - features [i] : Feature vector of length |kNumChannels|
129 : // = log10(energy in frequency band)
130 : // - total_power [i] : Total power in audio frame.
131 : // - frame_length [i] : Number of input samples
132 : //
133 : // - returns : the VAD decision (0 - noise, 1 - speech).
134 0 : static int16_t GmmProbability(VadInstT* self, int16_t* features,
135 : int16_t total_power, size_t frame_length) {
136 : int channel, k;
137 : int16_t feature_minimum;
138 : int16_t h0, h1;
139 : int16_t log_likelihood_ratio;
140 0 : int16_t vadflag = 0;
141 : int16_t shifts_h0, shifts_h1;
142 : int16_t tmp_s16, tmp1_s16, tmp2_s16;
143 : int16_t diff;
144 : int gaussian;
145 : int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
146 : int16_t delt, ndelt;
147 : int16_t maxspe, maxmu;
148 : int16_t deltaN[kTableSize], deltaS[kTableSize];
149 0 : int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0.
150 0 : int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0.
151 : int32_t h0_test, h1_test;
152 : int32_t tmp1_s32, tmp2_s32;
153 0 : int32_t sum_log_likelihood_ratios = 0;
154 : int32_t noise_global_mean, speech_global_mean;
155 : int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
156 : int16_t overhead1, overhead2, individualTest, totalTest;
157 :
158 : // Set various thresholds based on frame lengths (80, 160 or 240 samples).
159 0 : if (frame_length == 80) {
160 0 : overhead1 = self->over_hang_max_1[0];
161 0 : overhead2 = self->over_hang_max_2[0];
162 0 : individualTest = self->individual[0];
163 0 : totalTest = self->total[0];
164 0 : } else if (frame_length == 160) {
165 0 : overhead1 = self->over_hang_max_1[1];
166 0 : overhead2 = self->over_hang_max_2[1];
167 0 : individualTest = self->individual[1];
168 0 : totalTest = self->total[1];
169 : } else {
170 0 : overhead1 = self->over_hang_max_1[2];
171 0 : overhead2 = self->over_hang_max_2[2];
172 0 : individualTest = self->individual[2];
173 0 : totalTest = self->total[2];
174 : }
175 :
176 0 : if (total_power > kMinEnergy) {
177 : // The signal power of current frame is large enough for processing. The
178 : // processing consists of two parts:
179 : // 1) Calculating the likelihood of speech and thereby a VAD decision.
180 : // 2) Updating the underlying model, w.r.t., the decision made.
181 :
182 : // The detection scheme is an LRT with hypothesis
183 : // H0: Noise
184 : // H1: Speech
185 : //
186 : // We combine a global LRT with local tests, for each frequency sub-band,
187 : // here defined as |channel|.
188 0 : for (channel = 0; channel < kNumChannels; channel++) {
189 : // For each channel we model the probability with a GMM consisting of
190 : // |kNumGaussians|, with different means and standard deviations depending
191 : // on H0 or H1.
192 0 : h0_test = 0;
193 0 : h1_test = 0;
194 0 : for (k = 0; k < kNumGaussians; k++) {
195 0 : gaussian = channel + k * kNumChannels;
196 : // Probability under H0, that is, probability of frame being noise.
197 : // Value given in Q27 = Q7 * Q20.
198 0 : tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
199 0 : self->noise_means[gaussian],
200 0 : self->noise_stds[gaussian],
201 : &deltaN[gaussian]);
202 0 : noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
203 0 : h0_test += noise_probability[k]; // Q27
204 :
205 : // Probability under H1, that is, probability of frame being speech.
206 : // Value given in Q27 = Q7 * Q20.
207 0 : tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
208 0 : self->speech_means[gaussian],
209 0 : self->speech_stds[gaussian],
210 : &deltaS[gaussian]);
211 0 : speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
212 0 : h1_test += speech_probability[k]; // Q27
213 : }
214 :
215 : // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}).
216 : // Approximation:
217 : // log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q)
218 : // = log2(h1_test) - log2(h0_test)
219 : // = log2(2^(31-shifts_h1)*(1+b1))
220 : // - log2(2^(31-shifts_h0)*(1+b0))
221 : // = shifts_h0 - shifts_h1
222 : // + log2(1+b1) - log2(1+b0)
223 : // ~= shifts_h0 - shifts_h1
224 : //
225 : // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
226 : // Further, b0 and b1 are independent and on the average the two terms
227 : // cancel.
228 0 : shifts_h0 = WebRtcSpl_NormW32(h0_test);
229 0 : shifts_h1 = WebRtcSpl_NormW32(h1_test);
230 0 : if (h0_test == 0) {
231 0 : shifts_h0 = 31;
232 : }
233 0 : if (h1_test == 0) {
234 0 : shifts_h1 = 31;
235 : }
236 0 : log_likelihood_ratio = shifts_h0 - shifts_h1;
237 :
238 : // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
239 : // used for the global VAD decision.
240 0 : sum_log_likelihood_ratios +=
241 0 : (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
242 :
243 : // Local VAD decision.
244 0 : if ((log_likelihood_ratio << 2) > individualTest) {
245 0 : vadflag = 1;
246 : }
247 :
248 : // TODO(bjornv): The conditional probabilities below are applied on the
249 : // hard coded number of Gaussians set to two. Find a way to generalize.
250 : // Calculate local noise probabilities used later when updating the GMM.
251 0 : h0 = (int16_t) (h0_test >> 12); // Q15
252 0 : if (h0 > 0) {
253 : // High probability of noise. Assign conditional probabilities for each
254 : // Gaussian in the GMM.
255 0 : tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2; // Q29
256 0 : ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0); // Q14
257 0 : ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
258 : } else {
259 : // Low noise probability. Assign conditional probability 1 to the first
260 : // Gaussian and 0 to the rest (which is already set at initialization).
261 0 : ngprvec[channel] = 16384;
262 : }
263 :
264 : // Calculate local speech probabilities used later when updating the GMM.
265 0 : h1 = (int16_t) (h1_test >> 12); // Q15
266 0 : if (h1 > 0) {
267 : // High probability of speech. Assign conditional probabilities for each
268 : // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
269 0 : tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2; // Q29
270 0 : sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1); // Q14
271 0 : sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
272 : }
273 : }
274 :
275 : // Make a global VAD decision.
276 0 : vadflag |= (sum_log_likelihood_ratios >= totalTest);
277 :
278 : // Update the model parameters.
279 0 : maxspe = 12800;
280 0 : for (channel = 0; channel < kNumChannels; channel++) {
281 :
282 : // Get minimum value in past which is used for long term correction in Q4.
283 0 : feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
284 :
285 : // Compute the "global" mean, that is the sum of the two means weighted.
286 0 : noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
287 : &kNoiseDataWeights[channel]);
288 0 : tmp1_s16 = (int16_t) (noise_global_mean >> 6); // Q8
289 :
290 0 : for (k = 0; k < kNumGaussians; k++) {
291 0 : gaussian = channel + k * kNumChannels;
292 :
293 0 : nmk = self->noise_means[gaussian];
294 0 : smk = self->speech_means[gaussian];
295 0 : nsk = self->noise_stds[gaussian];
296 0 : ssk = self->speech_stds[gaussian];
297 :
298 : // Update noise mean vector if the frame consists of noise only.
299 0 : nmk2 = nmk;
300 0 : if (!vadflag) {
301 : // deltaN = (x-mu)/sigma^2
302 : // ngprvec[k] = |noise_probability[k]| /
303 : // (|noise_probability[0]| + |noise_probability[1]|)
304 :
305 : // (Q14 * Q11 >> 11) = Q14.
306 0 : delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
307 : // Q7 + (Q14 * Q15 >> 22) = Q7.
308 0 : nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
309 : }
310 :
311 : // Long term correction of the noise mean.
312 : // Q8 - Q8 = Q8.
313 0 : ndelt = (feature_minimum << 4) - tmp1_s16;
314 : // Q7 + (Q8 * Q8) >> 9 = Q7.
315 0 : nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);
316 :
317 : // Control that the noise mean does not drift to much.
318 0 : tmp_s16 = (int16_t) ((k + 5) << 7);
319 0 : if (nmk3 < tmp_s16) {
320 0 : nmk3 = tmp_s16;
321 : }
322 0 : tmp_s16 = (int16_t) ((72 + k - channel) << 7);
323 0 : if (nmk3 > tmp_s16) {
324 0 : nmk3 = tmp_s16;
325 : }
326 0 : self->noise_means[gaussian] = nmk3;
327 :
328 0 : if (vadflag) {
329 : // Update speech mean vector:
330 : // |deltaS| = (x-mu)/sigma^2
331 : // sgprvec[k] = |speech_probability[k]| /
332 : // (|speech_probability[0]| + |speech_probability[1]|)
333 :
334 : // (Q14 * Q11) >> 11 = Q14.
335 0 : delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
336 : // Q14 * Q15 >> 21 = Q8.
337 0 : tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
338 : // Q7 + (Q8 >> 1) = Q7. With rounding.
339 0 : smk2 = smk + ((tmp_s16 + 1) >> 1);
340 :
341 : // Control that the speech mean does not drift to much.
342 0 : maxmu = maxspe + 640;
343 0 : if (smk2 < kMinimumMean[k]) {
344 0 : smk2 = kMinimumMean[k];
345 : }
346 0 : if (smk2 > maxmu) {
347 0 : smk2 = maxmu;
348 : }
349 0 : self->speech_means[gaussian] = smk2; // Q7.
350 :
351 : // (Q7 >> 3) = Q4. With rounding.
352 0 : tmp_s16 = ((smk + 4) >> 3);
353 :
354 0 : tmp_s16 = features[channel] - tmp_s16; // Q4
355 : // (Q11 * Q4 >> 3) = Q12.
356 0 : tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
357 0 : tmp2_s32 = tmp1_s32 - 4096;
358 0 : tmp_s16 = sgprvec[gaussian] >> 2;
359 : // (Q14 >> 2) * Q12 = Q24.
360 0 : tmp1_s32 = tmp_s16 * tmp2_s32;
361 :
362 0 : tmp2_s32 = tmp1_s32 >> 4; // Q20
363 :
364 : // 0.1 * Q20 / Q7 = Q13.
365 0 : if (tmp2_s32 > 0) {
366 0 : tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
367 : } else {
368 0 : tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
369 0 : tmp_s16 = -tmp_s16;
370 : }
371 : // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
372 : // Note that division by 4 equals shift by 2, hence,
373 : // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
374 0 : tmp_s16 += 128; // Rounding.
375 0 : ssk += (tmp_s16 >> 8);
376 0 : if (ssk < kMinStd) {
377 0 : ssk = kMinStd;
378 : }
379 0 : self->speech_stds[gaussian] = ssk;
380 : } else {
381 : // Update GMM variance vectors.
382 : // deltaN * (features[channel] - nmk) - 1
383 : // Q4 - (Q7 >> 3) = Q4.
384 0 : tmp_s16 = features[channel] - (nmk >> 3);
385 : // (Q11 * Q4 >> 3) = Q12.
386 0 : tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
387 0 : tmp1_s32 -= 4096;
388 :
389 : // (Q14 >> 2) * Q12 = Q24.
390 0 : tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
391 0 : tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
392 : // Q20 * approx 0.001 (2^-10=0.0009766), hence,
393 : // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
394 0 : tmp1_s32 = tmp2_s32 >> 14;
395 :
396 : // Q20 / Q7 = Q13.
397 0 : if (tmp1_s32 > 0) {
398 0 : tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
399 : } else {
400 0 : tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
401 0 : tmp_s16 = -tmp_s16;
402 : }
403 0 : tmp_s16 += 32; // Rounding
404 0 : nsk += tmp_s16 >> 6; // Q13 >> 6 = Q7.
405 0 : if (nsk < kMinStd) {
406 0 : nsk = kMinStd;
407 : }
408 0 : self->noise_stds[gaussian] = nsk;
409 : }
410 : }
411 :
412 : // Separate models if they are too close.
413 : // |noise_global_mean| in Q14 (= Q7 * Q7).
414 0 : noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
415 : &kNoiseDataWeights[channel]);
416 :
417 : // |speech_global_mean| in Q14 (= Q7 * Q7).
418 0 : speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
419 : &kSpeechDataWeights[channel]);
420 :
421 : // |diff| = "global" speech mean - "global" noise mean.
422 : // (Q14 >> 9) - (Q14 >> 9) = Q5.
423 0 : diff = (int16_t) (speech_global_mean >> 9) -
424 0 : (int16_t) (noise_global_mean >> 9);
425 0 : if (diff < kMinimumDifference[channel]) {
426 0 : tmp_s16 = kMinimumDifference[channel] - diff;
427 :
428 : // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
429 : // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
430 0 : tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
431 0 : tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
432 :
433 : // Move Gaussian means for speech model by |tmp1_s16| and update
434 : // |speech_global_mean|. Note that |self->speech_means[channel]| is
435 : // changed after the call.
436 0 : speech_global_mean = WeightedAverage(&self->speech_means[channel],
437 : tmp1_s16,
438 : &kSpeechDataWeights[channel]);
439 :
440 : // Move Gaussian means for noise model by -|tmp2_s16| and update
441 : // |noise_global_mean|. Note that |self->noise_means[channel]| is
442 : // changed after the call.
443 0 : noise_global_mean = WeightedAverage(&self->noise_means[channel],
444 : -tmp2_s16,
445 : &kNoiseDataWeights[channel]);
446 : }
447 :
448 : // Control that the speech & noise means do not drift to much.
449 0 : maxspe = kMaximumSpeech[channel];
450 0 : tmp2_s16 = (int16_t) (speech_global_mean >> 7);
451 0 : if (tmp2_s16 > maxspe) {
452 : // Upper limit of speech model.
453 0 : tmp2_s16 -= maxspe;
454 :
455 0 : for (k = 0; k < kNumGaussians; k++) {
456 0 : self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
457 : }
458 : }
459 :
460 0 : tmp2_s16 = (int16_t) (noise_global_mean >> 7);
461 0 : if (tmp2_s16 > kMaximumNoise[channel]) {
462 0 : tmp2_s16 -= kMaximumNoise[channel];
463 :
464 0 : for (k = 0; k < kNumGaussians; k++) {
465 0 : self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
466 : }
467 : }
468 : }
469 0 : self->frame_counter++;
470 : }
471 :
472 : // Smooth with respect to transition hysteresis.
473 0 : if (!vadflag) {
474 0 : if (self->over_hang > 0) {
475 0 : vadflag = 2 + self->over_hang;
476 0 : self->over_hang--;
477 : }
478 0 : self->num_of_speech = 0;
479 : } else {
480 0 : self->num_of_speech++;
481 0 : if (self->num_of_speech > kMaxSpeechFrames) {
482 0 : self->num_of_speech = kMaxSpeechFrames;
483 0 : self->over_hang = overhead2;
484 : } else {
485 0 : self->over_hang = overhead1;
486 : }
487 : }
488 0 : return vadflag;
489 : }
490 :
491 : // Initialize the VAD. Set aggressiveness mode to default value.
492 0 : int WebRtcVad_InitCore(VadInstT* self) {
493 : int i;
494 :
495 0 : if (self == NULL) {
496 0 : return -1;
497 : }
498 :
499 : // Initialization of general struct variables.
500 0 : self->vad = 1; // Speech active (=1).
501 0 : self->frame_counter = 0;
502 0 : self->over_hang = 0;
503 0 : self->num_of_speech = 0;
504 :
505 : // Initialization of downsampling filter state.
506 0 : memset(self->downsampling_filter_states, 0,
507 : sizeof(self->downsampling_filter_states));
508 :
509 : // Initialization of 48 to 8 kHz downsampling.
510 0 : WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
511 :
512 : // Read initial PDF parameters.
513 0 : for (i = 0; i < kTableSize; i++) {
514 0 : self->noise_means[i] = kNoiseDataMeans[i];
515 0 : self->speech_means[i] = kSpeechDataMeans[i];
516 0 : self->noise_stds[i] = kNoiseDataStds[i];
517 0 : self->speech_stds[i] = kSpeechDataStds[i];
518 : }
519 :
520 : // Initialize Index and Minimum value vectors.
521 0 : for (i = 0; i < 16 * kNumChannels; i++) {
522 0 : self->low_value_vector[i] = 10000;
523 0 : self->index_vector[i] = 0;
524 : }
525 :
526 : // Initialize splitting filter states.
527 0 : memset(self->upper_state, 0, sizeof(self->upper_state));
528 0 : memset(self->lower_state, 0, sizeof(self->lower_state));
529 :
530 : // Initialize high pass filter states.
531 0 : memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
532 :
533 : // Initialize mean value memory, for WebRtcVad_FindMinimum().
534 0 : for (i = 0; i < kNumChannels; i++) {
535 0 : self->mean_value[i] = 1600;
536 : }
537 :
538 : // Set aggressiveness mode to default (=|kDefaultMode|).
539 0 : if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
540 0 : return -1;
541 : }
542 :
543 0 : self->init_flag = kInitCheck;
544 :
545 0 : return 0;
546 : }
547 :
548 : // Set aggressiveness mode
549 0 : int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
550 0 : int return_value = 0;
551 :
552 0 : switch (mode) {
553 : case 0:
554 : // Quality mode.
555 0 : memcpy(self->over_hang_max_1, kOverHangMax1Q,
556 : sizeof(self->over_hang_max_1));
557 0 : memcpy(self->over_hang_max_2, kOverHangMax2Q,
558 : sizeof(self->over_hang_max_2));
559 0 : memcpy(self->individual, kLocalThresholdQ,
560 : sizeof(self->individual));
561 0 : memcpy(self->total, kGlobalThresholdQ,
562 : sizeof(self->total));
563 0 : break;
564 : case 1:
565 : // Low bitrate mode.
566 0 : memcpy(self->over_hang_max_1, kOverHangMax1LBR,
567 : sizeof(self->over_hang_max_1));
568 0 : memcpy(self->over_hang_max_2, kOverHangMax2LBR,
569 : sizeof(self->over_hang_max_2));
570 0 : memcpy(self->individual, kLocalThresholdLBR,
571 : sizeof(self->individual));
572 0 : memcpy(self->total, kGlobalThresholdLBR,
573 : sizeof(self->total));
574 0 : break;
575 : case 2:
576 : // Aggressive mode.
577 0 : memcpy(self->over_hang_max_1, kOverHangMax1AGG,
578 : sizeof(self->over_hang_max_1));
579 0 : memcpy(self->over_hang_max_2, kOverHangMax2AGG,
580 : sizeof(self->over_hang_max_2));
581 0 : memcpy(self->individual, kLocalThresholdAGG,
582 : sizeof(self->individual));
583 0 : memcpy(self->total, kGlobalThresholdAGG,
584 : sizeof(self->total));
585 0 : break;
586 : case 3:
587 : // Very aggressive mode.
588 0 : memcpy(self->over_hang_max_1, kOverHangMax1VAG,
589 : sizeof(self->over_hang_max_1));
590 0 : memcpy(self->over_hang_max_2, kOverHangMax2VAG,
591 : sizeof(self->over_hang_max_2));
592 0 : memcpy(self->individual, kLocalThresholdVAG,
593 : sizeof(self->individual));
594 0 : memcpy(self->total, kGlobalThresholdVAG,
595 : sizeof(self->total));
596 0 : break;
597 : default:
598 0 : return_value = -1;
599 0 : break;
600 : }
601 :
602 0 : return return_value;
603 : }
604 :
605 : // Calculate VAD decision by first extracting feature values and then calculate
606 : // probability for both speech and background noise.
607 :
608 0 : int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
609 : size_t frame_length) {
610 : int vad;
611 : size_t i;
612 : int16_t speech_nb[240]; // 30 ms in 8 kHz.
613 : // |tmp_mem| is a temporary memory used by resample function, length is
614 : // frame length in 10 ms (480 samples) + 256 extra.
615 0 : int32_t tmp_mem[480 + 256] = { 0 };
616 0 : const size_t kFrameLen10ms48khz = 480;
617 0 : const size_t kFrameLen10ms8khz = 80;
618 0 : size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;
619 :
620 0 : for (i = 0; i < num_10ms_frames; i++) {
621 0 : WebRtcSpl_Resample48khzTo8khz(speech_frame,
622 0 : &speech_nb[i * kFrameLen10ms8khz],
623 : &inst->state_48_to_8,
624 : tmp_mem);
625 : }
626 :
627 : // Do VAD on an 8 kHz signal
628 0 : vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
629 :
630 0 : return vad;
631 : }
632 :
633 0 : int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
634 : size_t frame_length)
635 : {
636 : size_t len;
637 : int vad;
638 : int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
639 : int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
640 :
641 :
642 : // Downsample signal 32->16->8 before doing VAD
643 0 : WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
644 : frame_length);
645 0 : len = frame_length / 2;
646 :
647 0 : WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
648 0 : len /= 2;
649 :
650 : // Do VAD on an 8 kHz signal
651 0 : vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
652 :
653 0 : return vad;
654 : }
655 :
656 0 : int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
657 : size_t frame_length)
658 : {
659 : size_t len;
660 : int vad;
661 : int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
662 :
663 : // Wideband: Downsample signal before doing VAD
664 0 : WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
665 : frame_length);
666 :
667 0 : len = frame_length / 2;
668 0 : vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
669 :
670 0 : return vad;
671 : }
672 :
673 0 : int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
674 : size_t frame_length)
675 : {
676 : int16_t feature_vector[kNumChannels], total_power;
677 :
678 : // Get power in the bands
679 0 : total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
680 : feature_vector);
681 :
682 : // Make a VAD
683 0 : inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
684 :
685 0 : return inst->vad;
686 : }
|