LCOV - output.info - media/webrtc/trunk/webrtc/modules/audio_processing/vad/vad_audio

LCOV - code coverage report

Current view:	top level - media/webrtc/trunk/webrtc/modules/audio_processing/vad - vad_audio_proc.cc (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	116	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	10	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
      12             : 
      13             : #include <math.h>
      14             : #include <stdio.h>
      15             : 
      16             : #include "webrtc/base/checks.h"
      17             : #include "webrtc/common_audio/fft4g.h"
      18             : #include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
      19             : #include "webrtc/modules/audio_processing/vad/pitch_internal.h"
      20             : #include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
      21             : extern "C" {
      22             : #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
      23             : #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
      24             : #include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
      25             : #include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
      26             : }
      27             : #include "webrtc/modules/include/module_common_types.h"
      28             : 
      29             : namespace webrtc {
      30             : 
      31             : // The following structures are declared anonymous in iSAC's structs.h. To
      32             : // forward declare them, we use this derived class trick.
      33             : struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
      34             : struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
      35             : 
      36             : static const float kFrequencyResolution =
      37             :     kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
      38             : static const int kSilenceRms = 5;
      39             : 
      40             : // TODO(turajs): Make a Create or Init for VadAudioProc.
      41           0 : VadAudioProc::VadAudioProc()
      42             :     : audio_buffer_(),
      43             :       num_buffer_samples_(kNumPastSignalSamples),
      44             :       log_old_gain_(-2),
      45             :       old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
      46           0 :       pitch_analysis_handle_(new PitchAnalysisStruct),
      47           0 :       pre_filter_handle_(new PreFiltBankstr),
      48             :       high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
      49             :                                                kFilterOrder,
      50             :                                                kCoeffDenominator,
      51           0 :                                                kFilterOrder)) {
      52             :   static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
      53             :                     sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
      54             :                 "lpc analysis window incorrect size");
      55             :   static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
      56             :                 "correlation weight incorrect size");
      57             : 
      58             :   // TODO(turajs): Are we doing too much in the constructor?
      59             :   float data[kDftSize];
      60             :   // Make FFT to initialize.
      61           0 :   ip_[0] = 0;
      62           0 :   WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
      63             :   // TODO(turajs): Need to initialize high-pass filter.
      64             : 
      65             :   // Initialize iSAC components.
      66           0 :   WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
      67           0 :   WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
      68           0 : }
      69             : 
      70           0 : VadAudioProc::~VadAudioProc() {
      71           0 : }
      72             : 
      73           0 : void VadAudioProc::ResetBuffer() {
      74           0 :   memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
      75           0 :          sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
      76           0 :   num_buffer_samples_ = kNumPastSignalSamples;
      77           0 : }
      78             : 
      79           0 : int VadAudioProc::ExtractFeatures(const int16_t* frame,
      80             :                                   size_t length,
      81             :                                   AudioFeatures* features) {
      82           0 :   features->num_frames = 0;
      83           0 :   if (length != kNumSubframeSamples) {
      84           0 :     return -1;
      85             :   }
      86             : 
      87             :   // High-pass filter to remove the DC component and very low frequency content.
      88             :   // We have experienced that this high-pass filtering improves voice/non-voiced
      89             :   // classification.
      90           0 :   if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
      91           0 :                                 &audio_buffer_[num_buffer_samples_]) != 0) {
      92           0 :     return -1;
      93             :   }
      94             : 
      95           0 :   num_buffer_samples_ += kNumSubframeSamples;
      96           0 :   if (num_buffer_samples_ < kBufferLength) {
      97           0 :     return 0;
      98             :   }
      99           0 :   RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
     100           0 :   features->num_frames = kNum10msSubframes;
     101           0 :   features->silence = false;
     102             : 
     103           0 :   Rms(features->rms, kMaxNumFrames);
     104           0 :   for (size_t i = 0; i < kNum10msSubframes; ++i) {
     105           0 :     if (features->rms[i] < kSilenceRms) {
     106             :       // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
     107             :       // Bail out here instead.
     108           0 :       features->silence = true;
     109           0 :       ResetBuffer();
     110           0 :       return 0;
     111             :     }
     112             :   }
     113             : 
     114           0 :   PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
     115           0 :                 kMaxNumFrames);
     116           0 :   FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
     117           0 :   ResetBuffer();
     118           0 :   return 0;
     119             : }
     120             : 
     121             : // Computes |kLpcOrder + 1| correlation coefficients.
     122           0 : void VadAudioProc::SubframeCorrelation(double* corr,
     123             :                                        size_t length_corr,
     124             :                                        size_t subframe_index) {
     125           0 :   RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
     126             :   double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
     127           0 :   size_t buffer_index = subframe_index * kNumSubframeSamples;
     128             : 
     129           0 :   for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
     130           0 :     windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
     131             : 
     132             :   WebRtcIsac_AutoCorr(corr, windowed_audio,
     133           0 :                       kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
     134           0 : }
     135             : 
     136             : // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
     137             : // The analysis window is 15 ms long and it is centered on the first half of
     138             : // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
     139             : // first half of each 10 ms subframe.
     140           0 : void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
     141           0 :   RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
     142             :   double corr[kLpcOrder + 1];
     143             :   double reflec_coeff[kLpcOrder];
     144           0 :   for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
     145           0 :        i++, offset_lpc += kLpcOrder + 1) {
     146           0 :     SubframeCorrelation(corr, kLpcOrder + 1, i);
     147           0 :     corr[0] *= 1.0001;
     148             :     // This makes Lev-Durb a bit more stable.
     149           0 :     for (size_t k = 0; k < kLpcOrder + 1; k++) {
     150           0 :       corr[k] *= kCorrWeight[k];
     151             :     }
     152           0 :     WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
     153             :   }
     154           0 : }
     155             : 
     156             : // Fit a second order curve to these 3 points and find the location of the
     157             : // extremum. The points are inverted before curve fitting.
     158           0 : static float QuadraticInterpolation(float prev_val,
     159             :                                     float curr_val,
     160             :                                     float next_val) {
     161             :   // Doing the interpolation in |1 / A(z)|^2.
     162           0 :   float fractional_index = 0;
     163           0 :   next_val = 1.0f / next_val;
     164           0 :   prev_val = 1.0f / prev_val;
     165           0 :   curr_val = 1.0f / curr_val;
     166             : 
     167           0 :   fractional_index =
     168           0 :       -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
     169           0 :   RTC_DCHECK_LT(fabs(fractional_index), 1);
     170           0 :   return fractional_index;
     171             : }
     172             : 
     173             : // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
     174             : // of the input signal. The local maximum of the spectral envelope corresponds
     175             : // with the local minimum of A(z). It saves complexity, as we save one
     176             : // inversion. Furthermore, we find the first local maximum of magnitude squared,
     177             : // to save on one square root.
     178           0 : void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
     179             :                                           size_t length_f_peak) {
     180           0 :   RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
     181             :   double lpc[kNum10msSubframes * (kLpcOrder + 1)];
     182             :   // For all sub-frames.
     183           0 :   GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
     184             : 
     185           0 :   const size_t kNumDftCoefficients = kDftSize / 2 + 1;
     186             :   float data[kDftSize];
     187             : 
     188           0 :   for (size_t i = 0; i < kNum10msSubframes; i++) {
     189             :     // Convert to float with zero pad.
     190           0 :     memset(data, 0, sizeof(data));
     191           0 :     for (size_t n = 0; n < kLpcOrder + 1; n++) {
     192           0 :       data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
     193             :     }
     194             :     // Transform to frequency domain.
     195           0 :     WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
     196             : 
     197           0 :     size_t index_peak = 0;
     198           0 :     float prev_magn_sqr = data[0] * data[0];
     199           0 :     float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
     200             :     float next_magn_sqr;
     201           0 :     bool found_peak = false;
     202           0 :     for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
     203           0 :       next_magn_sqr =
     204           0 :           data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
     205           0 :       if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
     206           0 :         found_peak = true;
     207           0 :         index_peak = n - 1;
     208           0 :         break;
     209             :       }
     210           0 :       prev_magn_sqr = curr_magn_sqr;
     211           0 :       curr_magn_sqr = next_magn_sqr;
     212             :     }
     213           0 :     float fractional_index = 0;
     214           0 :     if (!found_peak) {
     215             :       // Checking if |kNumDftCoefficients - 1| is the local minimum.
     216           0 :       next_magn_sqr = data[1] * data[1];
     217           0 :       if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
     218           0 :         index_peak = kNumDftCoefficients - 1;
     219             :       }
     220             :     } else {
     221             :       // A peak is found, do a simple quadratic interpolation to get a more
     222             :       // accurate estimate of the peak location.
     223             :       fractional_index =
     224           0 :           QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
     225             :     }
     226           0 :     f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
     227             :   }
     228           0 : }
     229             : 
     230             : // Using iSAC functions to estimate pitch gains & lags.
     231           0 : void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
     232             :                                  double* pitch_lags_hz,
     233             :                                  size_t length) {
     234             :   // TODO(turajs): This can be "imported" from iSAC & and the next two
     235             :   // constants.
     236           0 :   RTC_DCHECK_GE(length, kNum10msSubframes);
     237           0 :   const int kNumPitchSubframes = 4;
     238             :   double gains[kNumPitchSubframes];
     239             :   double lags[kNumPitchSubframes];
     240             : 
     241           0 :   const int kNumSubbandFrameSamples = 240;
     242           0 :   const int kNumLookaheadSamples = 24;
     243             : 
     244             :   float lower[kNumSubbandFrameSamples];
     245             :   float upper[kNumSubbandFrameSamples];
     246             :   double lower_lookahead[kNumSubbandFrameSamples];
     247             :   double upper_lookahead[kNumSubbandFrameSamples];
     248             :   double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
     249             :                                     kNumLookaheadSamples];
     250             : 
     251             :   // Split signal to lower and upper bands
     252           0 :   WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
     253             :                                  upper, lower_lookahead, upper_lookahead,
     254           0 :                                  pre_filter_handle_.get());
     255             :   WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
     256           0 :                            pitch_analysis_handle_.get(), lags, gains);
     257             : 
     258             :   // Lags are computed on lower-band signal with sampling rate half of the
     259             :   // input signal.
     260           0 :   GetSubframesPitchParameters(
     261             :       kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
     262           0 :       &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
     263           0 : }
     264             : 
     265           0 : void VadAudioProc::Rms(double* rms, size_t length_rms) {
     266           0 :   RTC_DCHECK_GE(length_rms, kNum10msSubframes);
     267           0 :   size_t offset = kNumPastSignalSamples;
     268           0 :   for (size_t i = 0; i < kNum10msSubframes; i++) {
     269           0 :     rms[i] = 0;
     270           0 :     for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
     271           0 :       rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
     272           0 :     rms[i] = sqrt(rms[i] / kNumSubframeSamples);
     273             :   }
     274           0 : }
     275             : 
     276             : }  // namespace webrtc

Generated by: LCOV version 1.13