LCOV - code coverage report
Current view: top level - media/webrtc/trunk/webrtc/common_audio/vad - vad_core.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 242 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 9 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include "webrtc/common_audio/vad/vad_core.h"
      12             : 
      13             : #include "webrtc/base/sanitizer.h"
      14             : #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
      15             : #include "webrtc/common_audio/vad/vad_filterbank.h"
      16             : #include "webrtc/common_audio/vad/vad_gmm.h"
      17             : #include "webrtc/common_audio/vad/vad_sp.h"
      18             : #include "webrtc/typedefs.h"
      19             : 
      20             : // Spectrum weighting: per-channel weights applied to the log likelihood ratios in the global VAD decision.
      21             : static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
      22             : static const int16_t kNoiseUpdateConst = 655; // Q15. Scales the noise mean update.
      23             : static const int16_t kSpeechUpdateConst = 6554; // Q15. Scales the speech mean update.
      24             : static const int16_t kBackEta = 154; // Q8. Scales the long term correction of the noise means.
      25             : // Minimum difference between the two models, Q5
      26             : static const int16_t kMinimumDifference[kNumChannels] = {
      27             :     544, 544, 576, 576, 576, 576 };
      28             : // Upper limit of mean value for speech model, Q7
      29             : static const int16_t kMaximumSpeech[kNumChannels] = {
      30             :     11392, 11392, 11520, 11520, 11520, 11520 };
      31             : // Lower limit of mean value for speech model (one entry per Gaussian), Q7
      32             : static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
      33             : // Upper limit of mean value for noise model, Q7
      34             : static const int16_t kMaximumNoise[kNumChannels] = {
      35             :     9216, 9088, 8960, 8832, 8704, 8576 };
      36             : // Start values for the Gaussian models, Q7
      37             : // Weights for the two Gaussians for the six channels (noise)
      38             : static const int16_t kNoiseDataWeights[kTableSize] = {
      39             :     34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
      40             : // Weights for the two Gaussians for the six channels (speech)
      41             : static const int16_t kSpeechDataWeights[kTableSize] = {
      42             :     48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
      43             : // Means for the two Gaussians for the six channels (noise)
      44             : static const int16_t kNoiseDataMeans[kTableSize] = {
      45             :     6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
      46             : // Means for the two Gaussians for the six channels (speech)
      47             : static const int16_t kSpeechDataMeans[kTableSize] = {
      48             :     8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
      49             : };
      50             : // Stds for the two Gaussians for the six channels (noise)
      51             : static const int16_t kNoiseDataStds[kTableSize] = {
      52             :     378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
      53             : // Stds for the two Gaussians for the six channels (speech)
      54             : static const int16_t kSpeechDataStds[kTableSize] = {
      55             :     555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
      56             : 
      57             : // Constants used in GmmProbability().
      58             : //
      59             : // Maximum number of counted speech (VAD = 1) frames in a row. When exceeded, the counter saturates and the second over-hang length (|over_hang_max_2|) is selected.
      60             : static const int16_t kMaxSpeechFrames = 6;
      61             : // Minimum standard deviation for both speech and noise, Q7.
      62             : static const int16_t kMinStd = 384;
      63             : 
      64             : // Constants in WebRtcVad_InitCore().
      65             : // Default aggressiveness mode.
      66             : static const short kDefaultMode = 0;
      67             : static const int kInitCheck = 42;  // NOTE(review): not referenced in the visible code; presumably an "initialized" marker value - confirm.
      68             : 
      69             : // Constants used in WebRtcVad_set_mode_core().
      70             : //
      71             : // Over-hang maxima and local/global thresholds. Each array has one entry per frame length (10 ms, 20 ms and 30 ms).
      72             : //
      73             : // Mode 0, Quality.
      74             : static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
      75             : static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
      76             : static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
      77             : static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
      78             : // Mode 1, Low bitrate.
      79             : static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
      80             : static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
      81             : static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
      82             : static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
      83             : // Mode 2, Aggressive.
      84             : static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
      85             : static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
      86             : static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
      87             : static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
      88             : // Mode 3, Very aggressive.
      89             : static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
      90             : static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
      91             : static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
      92             : static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
      93             : 
      94             : // Calculates the weighted average w.r.t. number of Gaussians. The |data| are
      95             : // updated in place with an |offset| before averaging.
      96             : // Both |data| and |weights| are read with a stride of |kNumChannels|: element k * kNumChannels belongs to Gaussian k.
      97             : // - data     [i/o] : Data to average. The used elements are overwritten with (data + offset).
      98             : // - offset   [i]   : An offset added to each used element of |data|.
      99             : // - weights  [i]   : Weights used for averaging, indexed like |data|.
     100             : //
     101             : // returns          : The sum over all Gaussians of (data + offset) * weight. No normalization is applied here.
     102           0 : static int32_t WeightedAverage(int16_t* data, int16_t offset,
     103             :                                const int16_t* weights) {
     104             :   int k;
     105           0 :   int32_t weighted_average = 0;
     106             : 
     107           0 :   for (k = 0; k < kNumGaussians; k++) {
     108           0 :     data[k * kNumChannels] += offset;  // Written back to the caller's array.
     109           0 :     weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
     110             :   }
     111           0 :   return weighted_average;
     112             : }
     113             : 
     114             : // An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
     115             : // undefined behavior, so not a good idea; this just makes UBSan ignore the
     116             : // violation, so that our old code can continue to do what it's always been
     117             : // doing.) Used by GmmProbability() when updating the noise standard deviations.
     118           0 : static inline int32_t OverflowingMulS16ByS32ToS32(int16_t a, int32_t b)
     119             :     RTC_NO_SANITIZE("signed-integer-overflow") {
     120           0 :   return a * b;  // |a| undergoes integer promotion before the multiply.
     121             : }
     122             : 
     123             : // Calculates the probabilities for both speech and background noise using
     124             : // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
     125             : // type of signal is most probable.
     126             : //
     127             : // - self           [i/o] : Pointer to VAD instance
     128             : // - features       [i]   : Feature vector of length |kNumChannels|
     129             : //                          = log10(energy in frequency band)
     130             : // - total_power    [i]   : Total power in audio frame.
     131             : // - frame_length   [i]   : Number of input samples
     132             : //
     133             : // - returns              : the VAD decision (0 - noise, 1 - speech). When no speech is detected but |self->over_hang| > 0, 2 + |self->over_hang| is returned instead of 0.
     134           0 : static int16_t GmmProbability(VadInstT* self, int16_t* features,
     135             :                               int16_t total_power, size_t frame_length) {
     136             :   int channel, k;
     137             :   int16_t feature_minimum;
     138             :   int16_t h0, h1;
     139             :   int16_t log_likelihood_ratio;
     140           0 :   int16_t vadflag = 0;
     141             :   int16_t shifts_h0, shifts_h1;
     142             :   int16_t tmp_s16, tmp1_s16, tmp2_s16;
     143             :   int16_t diff;
     144             :   int gaussian;
     145             :   int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
     146             :   int16_t delt, ndelt;
     147             :   int16_t maxspe, maxmu;
     148             :   int16_t deltaN[kTableSize], deltaS[kTableSize];
     149           0 :   int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
     150           0 :   int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
     151             :   int32_t h0_test, h1_test;
     152             :   int32_t tmp1_s32, tmp2_s32;
     153           0 :   int32_t sum_log_likelihood_ratios = 0;
     154             :   int32_t noise_global_mean, speech_global_mean;
     155             :   int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
     156             :   int16_t overhead1, overhead2, individualTest, totalTest;
     157             : 
     158             :   // Set various thresholds based on frame lengths (80, 160 or 240 samples). Any other length uses the 240-sample entries.
     159           0 :   if (frame_length == 80) {
     160           0 :     overhead1 = self->over_hang_max_1[0];
     161           0 :     overhead2 = self->over_hang_max_2[0];
     162           0 :     individualTest = self->individual[0];
     163           0 :     totalTest = self->total[0];
     164           0 :   } else if (frame_length == 160) {
     165           0 :     overhead1 = self->over_hang_max_1[1];
     166           0 :     overhead2 = self->over_hang_max_2[1];
     167           0 :     individualTest = self->individual[1];
     168           0 :     totalTest = self->total[1];
     169             :   } else {
     170           0 :     overhead1 = self->over_hang_max_1[2];
     171           0 :     overhead2 = self->over_hang_max_2[2];
     172           0 :     individualTest = self->individual[2];
     173           0 :     totalTest = self->total[2];
     174             :   }
     175             : 
     176           0 :   if (total_power > kMinEnergy) {
     177             :     // The signal power of current frame is large enough for processing. The
     178             :     // processing consists of two parts:
     179             :     // 1) Calculating the likelihood of speech and thereby a VAD decision.
     180             :     // 2) Updating the underlying model, w.r.t., the decision made.
     181             : 
     182             :     // The detection scheme is an LRT with hypothesis
     183             :     // H0: Noise
     184             :     // H1: Speech
     185             :     //
     186             :     // We combine a global LRT with local tests, for each frequency sub-band,
     187             :     // here defined as |channel|.
     188           0 :     for (channel = 0; channel < kNumChannels; channel++) {
     189             :       // For each channel we model the probability with a GMM consisting of
     190             :       // |kNumGaussians|, with different means and standard deviations depending
     191             :       // on H0 or H1.
     192           0 :       h0_test = 0;
     193           0 :       h1_test = 0;
     194           0 :       for (k = 0; k < kNumGaussians; k++) {
     195           0 :         gaussian = channel + k * kNumChannels;
     196             :         // Probability under H0, that is, probability of frame being noise.
     197             :         // Value given in Q27 = Q7 * Q20.
     198           0 :         tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
     199           0 :                                                  self->noise_means[gaussian],
     200           0 :                                                  self->noise_stds[gaussian],
     201             :                                                  &deltaN[gaussian]);
     202           0 :         noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
     203           0 :         h0_test += noise_probability[k];  // Q27
     204             : 
     205             :         // Probability under H1, that is, probability of frame being speech.
     206             :         // Value given in Q27 = Q7 * Q20.
     207           0 :         tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
     208           0 :                                                  self->speech_means[gaussian],
     209           0 :                                                  self->speech_stds[gaussian],
     210             :                                                  &deltaS[gaussian]);
     211           0 :         speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
     212           0 :         h1_test += speech_probability[k];  // Q27
     213             :       }
     214             : 
     215             :       // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
     216             :       // Approximation:
     217             :       // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
     218             :       //                           = log2(h1_test) - log2(h0_test)
     219             :       //                           = log2(2^(31-shifts_h1)*(1+b1))
     220             :       //                             - log2(2^(31-shifts_h0)*(1+b0))
     221             :       //                           = shifts_h0 - shifts_h1
     222             :       //                             + log2(1+b1) - log2(1+b0)
     223             :       //                          ~= shifts_h0 - shifts_h1
     224             :       //
     225             :       // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
     226             :       // Further, b0 and b1 are independent and on the average the two terms
     227             :       // cancel.
     228           0 :       shifts_h0 = WebRtcSpl_NormW32(h0_test);
     229           0 :       shifts_h1 = WebRtcSpl_NormW32(h1_test);
     230           0 :       if (h0_test == 0) {
     231           0 :         shifts_h0 = 31;
     232             :       }
     233           0 :       if (h1_test == 0) {
     234           0 :         shifts_h1 = 31;
     235             :       }
     236           0 :       log_likelihood_ratio = shifts_h0 - shifts_h1;
     237             : 
     238             :       // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
     239             :       // used for the global VAD decision.
     240           0 :       sum_log_likelihood_ratios +=
     241           0 :           (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
     242             : 
     243             :       // Local VAD decision.
     244           0 :       if ((log_likelihood_ratio << 2) > individualTest) {
     245           0 :         vadflag = 1;
     246             :       }
     247             : 
     248             :       // TODO(bjornv): The conditional probabilities below are applied on the
     249             :       // hard coded number of Gaussians set to two. Find a way to generalize.
     250             :       // Calculate local noise probabilities used later when updating the GMM.
     251           0 :       h0 = (int16_t) (h0_test >> 12);  // Q15
     252           0 :       if (h0 > 0) {
     253             :         // High probability of noise. Assign conditional probabilities for each
     254             :         // Gaussian in the GMM.
     255           0 :         tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
     256           0 :         ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
     257           0 :         ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
     258             :       } else {
     259             :         // Low noise probability. Assign conditional probability 1 to the first
     260             :         // Gaussian and 0 to the rest (which is already set at initialization).
     261           0 :         ngprvec[channel] = 16384;
     262             :       }
     263             : 
     264             :       // Calculate local speech probabilities used later when updating the GMM.
     265           0 :       h1 = (int16_t) (h1_test >> 12);  // Q15
     266           0 :       if (h1 > 0) {
     267             :         // High probability of speech. Assign conditional probabilities for each
     268             :         // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
     269           0 :         tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
     270           0 :         sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
     271           0 :         sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
     272             :       }
     273             :     }
     274             : 
     275             :     // Make a global VAD decision.
     276           0 :     vadflag |= (sum_log_likelihood_ratios >= totalTest);
     277             : 
     278             :     // Update the model parameters.
     279           0 :     maxspe = 12800;
     280           0 :     for (channel = 0; channel < kNumChannels; channel++) {
     281             : 
     282             :       // Get minimum value in past which is used for long term correction in Q4.
     283           0 :       feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
     284             : 
     285             :       // Compute the "global" mean, that is the sum of the two means weighted.
     286           0 :       noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
     287             :                                           &kNoiseDataWeights[channel]);
     288           0 :       tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8
     289             : 
     290           0 :       for (k = 0; k < kNumGaussians; k++) {
     291           0 :         gaussian = channel + k * kNumChannels;
     292             : 
     293           0 :         nmk = self->noise_means[gaussian];
     294           0 :         smk = self->speech_means[gaussian];
     295           0 :         nsk = self->noise_stds[gaussian];
     296           0 :         ssk = self->speech_stds[gaussian];
     297             : 
     298             :         // Update noise mean vector if the frame consists of noise only.
     299           0 :         nmk2 = nmk;
     300           0 :         if (!vadflag) {
     301             :           // deltaN = (x-mu)/sigma^2
     302             :           // ngprvec[k] = |noise_probability[k]| /
     303             :           //   (|noise_probability[0]| + |noise_probability[1]|)
     304             : 
     305             :           // (Q14 * Q11 >> 11) = Q14.
     306           0 :           delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
     307             :           // Q7 + (Q14 * Q15 >> 22) = Q7.
     308           0 :           nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
     309             :         }
     310             : 
     311             :         // Long term correction of the noise mean.
     312             :         // Q8 - Q8 = Q8.
     313           0 :         ndelt = (feature_minimum << 4) - tmp1_s16;
     314             :         // Q7 + (Q8 * Q8) >> 9 = Q7.
     315           0 :         nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);
     316             : 
     317             :         // Control that the noise mean does not drift too much.
     318           0 :         tmp_s16 = (int16_t) ((k + 5) << 7);
     319           0 :         if (nmk3 < tmp_s16) {
     320           0 :           nmk3 = tmp_s16;
     321             :         }
     322           0 :         tmp_s16 = (int16_t) ((72 + k - channel) << 7);
     323           0 :         if (nmk3 > tmp_s16) {
     324           0 :           nmk3 = tmp_s16;
     325             :         }
     326           0 :         self->noise_means[gaussian] = nmk3;
     327             : 
     328           0 :         if (vadflag) {
     329             :           // Update speech mean vector:
     330             :           // |deltaS| = (x-mu)/sigma^2
     331             :           // sgprvec[k] = |speech_probability[k]| /
     332             :           //   (|speech_probability[0]| + |speech_probability[1]|)
     333             : 
     334             :           // (Q14 * Q11) >> 11 = Q14.
     335           0 :           delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
     336             :           // Q14 * Q15 >> 21 = Q8.
     337           0 :           tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
     338             :           // Q7 + (Q8 >> 1) = Q7. With rounding.
     339           0 :           smk2 = smk + ((tmp_s16 + 1) >> 1);
     340             : 
     341             :           // Control that the speech mean does not drift too much. Note: |maxspe| is refreshed only at the end of each channel iteration, so this uses 12800 for channel 0 and kMaximumSpeech[channel - 1] otherwise.
     342           0 :           maxmu = maxspe + 640;
     343           0 :           if (smk2 < kMinimumMean[k]) {
     344           0 :             smk2 = kMinimumMean[k];
     345             :           }
     346           0 :           if (smk2 > maxmu) {
     347           0 :             smk2 = maxmu;
     348             :           }
     349           0 :           self->speech_means[gaussian] = smk2;  // Q7.
     350             : 
     351             :           // (Q7 >> 3) = Q4. With rounding.
     352           0 :           tmp_s16 = ((smk + 4) >> 3);
     353             : 
     354           0 :           tmp_s16 = features[channel] - tmp_s16;  // Q4
     355             :           // (Q11 * Q4 >> 3) = Q12.
     356           0 :           tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
     357           0 :           tmp2_s32 = tmp1_s32 - 4096;
     358           0 :           tmp_s16 = sgprvec[gaussian] >> 2;
     359             :           // (Q14 >> 2) * Q12 = Q24.
     360           0 :           tmp1_s32 = tmp_s16 * tmp2_s32;
     361             : 
     362           0 :           tmp2_s32 = tmp1_s32 >> 4;  // Q20
     363             : 
     364             :           // 0.1 * Q20 / Q7 = Q13.
     365           0 :           if (tmp2_s32 > 0) {
     366           0 :             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
     367             :           } else {
     368           0 :             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
     369           0 :             tmp_s16 = -tmp_s16;
     370             :           }
     371             :           // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
     372             :           // Note that division by 4 equals shift by 2, hence,
     373             :           // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
     374           0 :           tmp_s16 += 128;  // Rounding.
     375           0 :           ssk += (tmp_s16 >> 8);
     376           0 :           if (ssk < kMinStd) {
     377           0 :             ssk = kMinStd;
     378             :           }
     379           0 :           self->speech_stds[gaussian] = ssk;
     380             :         } else {
     381             :           // Update GMM variance vectors.
     382             :           // deltaN * (features[channel] - nmk) - 1
     383             :           // Q4 - (Q7 >> 3) = Q4.
     384           0 :           tmp_s16 = features[channel] - (nmk >> 3);
     385             :           // (Q11 * Q4 >> 3) = Q12.
     386           0 :           tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
     387           0 :           tmp1_s32 -= 4096;
     388             : 
     389             :           // (Q14 >> 2) * Q12 = Q24.
     390           0 :           tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
     391           0 :           tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
     392             :           // Q20 * approx 0.001 (2^-10=0.0009766), hence,
     393             :           // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
     394           0 :           tmp1_s32 = tmp2_s32 >> 14;
     395             : 
     396             :           // Q20 / Q7 = Q13.
     397           0 :           if (tmp1_s32 > 0) {
     398           0 :             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
     399             :           } else {
     400           0 :             tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
     401           0 :             tmp_s16 = -tmp_s16;
     402             :           }
     403           0 :           tmp_s16 += 32;  // Rounding
     404           0 :           nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
     405           0 :           if (nsk < kMinStd) {
     406           0 :             nsk = kMinStd;
     407             :           }
     408           0 :           self->noise_stds[gaussian] = nsk;
     409             :         }
     410             :       }
     411             : 
     412             :       // Separate models if they are too close.
     413             :       // |noise_global_mean| in Q14 (= Q7 * Q7).
     414           0 :       noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
     415             :                                           &kNoiseDataWeights[channel]);
     416             : 
     417             :       // |speech_global_mean| in Q14 (= Q7 * Q7).
     418           0 :       speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
     419             :                                            &kSpeechDataWeights[channel]);
     420             : 
     421             :       // |diff| = "global" speech mean - "global" noise mean.
     422             :       // (Q14 >> 9) - (Q14 >> 9) = Q5.
     423           0 :       diff = (int16_t) (speech_global_mean >> 9) -
     424           0 :           (int16_t) (noise_global_mean >> 9);
     425           0 :       if (diff < kMinimumDifference[channel]) {
     426           0 :         tmp_s16 = kMinimumDifference[channel] - diff;
     427             : 
     428             :         // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
     429             :         // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
     430           0 :         tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
     431           0 :         tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);
     432             : 
     433             :         // Move Gaussian means for speech model by |tmp1_s16| and update
     434             :         // |speech_global_mean|. Note that |self->speech_means[channel]| is
     435             :         // changed after the call.
     436           0 :         speech_global_mean = WeightedAverage(&self->speech_means[channel],
     437             :                                              tmp1_s16,
     438             :                                              &kSpeechDataWeights[channel]);
     439             : 
     440             :         // Move Gaussian means for noise model by -|tmp2_s16| and update
     441             :         // |noise_global_mean|. Note that |self->noise_means[channel]| is
     442             :         // changed after the call.
     443           0 :         noise_global_mean = WeightedAverage(&self->noise_means[channel],
     444             :                                             -tmp2_s16,
     445             :                                             &kNoiseDataWeights[channel]);
     446             :       }
     447             : 
     448             :       // Control that the speech & noise means do not drift too much.
     449           0 :       maxspe = kMaximumSpeech[channel];
     450           0 :       tmp2_s16 = (int16_t) (speech_global_mean >> 7);
     451           0 :       if (tmp2_s16 > maxspe) {
     452             :         // Upper limit of speech model.
     453           0 :         tmp2_s16 -= maxspe;
     454             : 
     455           0 :         for (k = 0; k < kNumGaussians; k++) {
     456           0 :           self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
     457             :         }
     458             :       }
     459             : 
     460           0 :       tmp2_s16 = (int16_t) (noise_global_mean >> 7);
     461           0 :       if (tmp2_s16 > kMaximumNoise[channel]) {
     462           0 :         tmp2_s16 -= kMaximumNoise[channel];
     463             : 
     464           0 :         for (k = 0; k < kNumGaussians; k++) {
     465           0 :           self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
     466             :         }
     467             :       }
     468             :     }
     469           0 :     self->frame_counter++;
     470             :   }
     471             : 
     472             :   // Smooth with respect to transition hysteresis.
     473           0 :   if (!vadflag) {
     474           0 :     if (self->over_hang > 0) {
     475           0 :       vadflag = 2 + self->over_hang;
     476           0 :       self->over_hang--;
     477             :     }
     478           0 :     self->num_of_speech = 0;
     479             :   } else {
     480           0 :     self->num_of_speech++;
     481           0 :     if (self->num_of_speech > kMaxSpeechFrames) {
     482           0 :       self->num_of_speech = kMaxSpeechFrames;
     483           0 :       self->over_hang = overhead2;
     484             :     } else {
     485           0 :       self->over_hang = overhead1;
     486             :     }
     487             :   }
     488           0 :   return vadflag;
     489             : }
     490             : 
     491             : // Initialize the VAD. Set aggressiveness mode to default value.
     492           0 : int WebRtcVad_InitCore(VadInstT* self) {
     493             :   int i;
     494             : 
     495           0 :   if (self == NULL) {
     496           0 :     return -1;
     497             :   }
     498             : 
     499             :   // Initialization of general struct variables.
     500           0 :   self->vad = 1;  // Speech active (=1).
     501           0 :   self->frame_counter = 0;
     502           0 :   self->over_hang = 0;
     503           0 :   self->num_of_speech = 0;
     504             : 
     505             :   // Initialization of downsampling filter state.
     506           0 :   memset(self->downsampling_filter_states, 0,
     507             :          sizeof(self->downsampling_filter_states));
     508             : 
     509             :   // Initialization of 48 to 8 kHz downsampling.
     510           0 :   WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
     511             : 
     512             :   // Read initial PDF parameters.
     513           0 :   for (i = 0; i < kTableSize; i++) {
     514           0 :     self->noise_means[i] = kNoiseDataMeans[i];
     515           0 :     self->speech_means[i] = kSpeechDataMeans[i];
     516           0 :     self->noise_stds[i] = kNoiseDataStds[i];
     517           0 :     self->speech_stds[i] = kSpeechDataStds[i];
     518             :   }
     519             : 
     520             :   // Initialize Index and Minimum value vectors.
     521           0 :   for (i = 0; i < 16 * kNumChannels; i++) {
     522           0 :     self->low_value_vector[i] = 10000;
     523           0 :     self->index_vector[i] = 0;
     524             :   }
     525             : 
     526             :   // Initialize splitting filter states.
     527           0 :   memset(self->upper_state, 0, sizeof(self->upper_state));
     528           0 :   memset(self->lower_state, 0, sizeof(self->lower_state));
     529             : 
     530             :   // Initialize high pass filter states.
     531           0 :   memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
     532             : 
     533             :   // Initialize mean value memory, for WebRtcVad_FindMinimum().
     534           0 :   for (i = 0; i < kNumChannels; i++) {
     535           0 :     self->mean_value[i] = 1600;
     536             :   }
     537             : 
     538             :   // Set aggressiveness mode to default (=|kDefaultMode|).
     539           0 :   if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
     540           0 :     return -1;
     541             :   }
     542             : 
     543           0 :   self->init_flag = kInitCheck;
     544             : 
     545           0 :   return 0;
     546             : }
     547             : 
     548             : // Set aggressiveness mode
     549           0 : int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
     550           0 :   int return_value = 0;
     551             : 
     552           0 :   switch (mode) {
     553             :     case 0:
     554             :       // Quality mode.
     555           0 :       memcpy(self->over_hang_max_1, kOverHangMax1Q,
     556             :              sizeof(self->over_hang_max_1));
     557           0 :       memcpy(self->over_hang_max_2, kOverHangMax2Q,
     558             :              sizeof(self->over_hang_max_2));
     559           0 :       memcpy(self->individual, kLocalThresholdQ,
     560             :              sizeof(self->individual));
     561           0 :       memcpy(self->total, kGlobalThresholdQ,
     562             :              sizeof(self->total));
     563           0 :       break;
     564             :     case 1:
     565             :       // Low bitrate mode.
     566           0 :       memcpy(self->over_hang_max_1, kOverHangMax1LBR,
     567             :              sizeof(self->over_hang_max_1));
     568           0 :       memcpy(self->over_hang_max_2, kOverHangMax2LBR,
     569             :              sizeof(self->over_hang_max_2));
     570           0 :       memcpy(self->individual, kLocalThresholdLBR,
     571             :              sizeof(self->individual));
     572           0 :       memcpy(self->total, kGlobalThresholdLBR,
     573             :              sizeof(self->total));
     574           0 :       break;
     575             :     case 2:
     576             :       // Aggressive mode.
     577           0 :       memcpy(self->over_hang_max_1, kOverHangMax1AGG,
     578             :              sizeof(self->over_hang_max_1));
     579           0 :       memcpy(self->over_hang_max_2, kOverHangMax2AGG,
     580             :              sizeof(self->over_hang_max_2));
     581           0 :       memcpy(self->individual, kLocalThresholdAGG,
     582             :              sizeof(self->individual));
     583           0 :       memcpy(self->total, kGlobalThresholdAGG,
     584             :              sizeof(self->total));
     585           0 :       break;
     586             :     case 3:
     587             :       // Very aggressive mode.
     588           0 :       memcpy(self->over_hang_max_1, kOverHangMax1VAG,
     589             :              sizeof(self->over_hang_max_1));
     590           0 :       memcpy(self->over_hang_max_2, kOverHangMax2VAG,
     591             :              sizeof(self->over_hang_max_2));
     592           0 :       memcpy(self->individual, kLocalThresholdVAG,
     593             :              sizeof(self->individual));
     594           0 :       memcpy(self->total, kGlobalThresholdVAG,
     595             :              sizeof(self->total));
     596           0 :       break;
     597             :     default:
     598           0 :       return_value = -1;
     599           0 :       break;
     600             :   }
     601             : 
     602           0 :   return return_value;
     603             : }
     604             : 
     605             : // Calculate VAD decision by first extracting feature values and then calculate
     606             : // probability for both speech and background noise.
     607             : 
     608           0 : int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
     609             :                            size_t frame_length) {
     610             :   int vad;
     611             :   size_t i;
     612             :   int16_t speech_nb[240];  // 30 ms in 8 kHz.
     613             :   // |tmp_mem| is a temporary memory used by resample function, length is
     614             :   // frame length in 10 ms (480 samples) + 256 extra.
     615           0 :   int32_t tmp_mem[480 + 256] = { 0 };
     616           0 :   const size_t kFrameLen10ms48khz = 480;
     617           0 :   const size_t kFrameLen10ms8khz = 80;
     618           0 :   size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;
     619             : 
     620           0 :   for (i = 0; i < num_10ms_frames; i++) {
     621           0 :     WebRtcSpl_Resample48khzTo8khz(speech_frame,
     622           0 :                                   &speech_nb[i * kFrameLen10ms8khz],
     623             :                                   &inst->state_48_to_8,
     624             :                                   tmp_mem);
     625             :   }
     626             : 
     627             :   // Do VAD on an 8 kHz signal
     628           0 :   vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
     629             : 
     630           0 :   return vad;
     631             : }
     632             : 
     633           0 : int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
     634             :                            size_t frame_length)
     635             : {
     636             :     size_t len;
     637             :     int vad;
     638             :     int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
     639             :     int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
     640             : 
     641             : 
     642             :     // Downsample signal 32->16->8 before doing VAD
     643           0 :     WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
     644             :                            frame_length);
     645           0 :     len = frame_length / 2;
     646             : 
     647           0 :     WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
     648           0 :     len /= 2;
     649             : 
     650             :     // Do VAD on an 8 kHz signal
     651           0 :     vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
     652             : 
     653           0 :     return vad;
     654             : }
     655             : 
     656           0 : int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
     657             :                            size_t frame_length)
     658             : {
     659             :     size_t len;
     660             :     int vad;
     661             :     int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
     662             : 
     663             :     // Wideband: Downsample signal before doing VAD
     664           0 :     WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
     665             :                            frame_length);
     666             : 
     667           0 :     len = frame_length / 2;
     668           0 :     vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
     669             : 
     670           0 :     return vad;
     671             : }
     672             : 
     673           0 : int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
     674             :                           size_t frame_length)
     675             : {
     676             :     int16_t feature_vector[kNumChannels], total_power;
     677             : 
     678             :     // Get power in the bands
     679           0 :     total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
     680             :                                               feature_vector);
     681             : 
     682             :     // Make a VAD
     683           0 :     inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
     684             : 
     685           0 :     return inst->vad;
     686             : }

Generated by: LCOV version 1.13