LCOV - code coverage report
Current view: top level - dom/media/webspeech/recognition - energy_endpointer.cc (source / functions) Hit Total Coverage
Test: output.info Lines: 0 183 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 23 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright (c) 2013 The Chromium Authors. All rights reserved.
       2             : //
       3             : // Redistribution and use in source and binary forms, with or without
       4             : // modification, are permitted provided that the following conditions are
       5             : // met:
       6             : //
       7             : //    * Redistributions of source code must retain the above copyright
       8             : // notice, this list of conditions and the following disclaimer.
       9             : //    * Redistributions in binary form must reproduce the above
      10             : // copyright notice, this list of conditions and the following disclaimer
      11             : // in the documentation and/or other materials provided with the
      12             : // distribution.
      13             : //    * Neither the name of Google Inc. nor the names of its
      14             : // contributors may be used to endorse or promote products derived from
      15             : // this software without specific prior written permission.
      16             : //
      17             : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      18             : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      19             : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      20             : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
      21             : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
      22             : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
      23             : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
      24             : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
      25             : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
      26             : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
      27             : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      28             : 
      29             : #include "energy_endpointer.h"
      30             : 
      31             : #include <math.h>
      32             : 
      33             : namespace {
      34             : 
      35             : // Returns the RMS of the input about its mean: sqrt(mean(x^2) - mean(x)^2).
      36           0 : float RMS(const int16_t* samples, int num_samples) {
      37           0 :   int64_t ssq_int64_t = 0;  // running sum of squared samples
      38           0 :   int64_t sum_int64_t = 0;  // running sum of samples
      39           0 :   for (int i = 0; i < num_samples; ++i) {
      40           0 :     sum_int64_t += samples[i];
      41           0 :     ssq_int64_t += samples[i] * samples[i];  // product is formed in int, then widened
      42             :   }
      43             :   // Now convert to floating point (double).
      44           0 :   double sum = static_cast<double>(sum_int64_t);
      45           0 :   sum /= num_samples;  // sum now holds the mean; NOTE(review): num_samples == 0 is not guarded
      46           0 :   double ssq = static_cast<double>(ssq_int64_t);
      47           0 :   return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
      48             : }
      49             : 
      50           0 : int64_t Secs2Usecs(float seconds) {  // Converts seconds to whole microseconds.
      51           0 :   return static_cast<int64_t>(0.5 + (1.0e6 * seconds));  // +0.5 then truncating cast: rounds non-negative inputs to nearest
      52             : }
      53             : 
      54           0 : float GetDecibel(float value) {  // Converts a linear level to dB as 20 * log10(value).
      55           0 :   if (value > 1.0e-100)
      56           0 :     return 20 * log10(value);
      57           0 :   return -2000.0;  // fixed sentinel for inputs that are not above 1.0e-100
      58             : }
      59             : 
      60             : }  // namespace
      61             : 
      62             : namespace mozilla {
      63             : 
      64             : // Stores threshold-crossing histories for making decisions about the speech
      65             : // state. Entries live in a fixed-size vector that is overwritten circularly.
      66           0 : class EnergyEndpointer::HistoryRing {
      67             :  public:
      68           0 :   HistoryRing() : insertion_index_(0) {}  // ring is empty until SetRing() is called
      69             : 
      70             :   // Resets the ring to |size| elements each with state |initial_state|
      71             :   void SetRing(int size, bool initial_state);
      72             : 
      73             :   // Inserts a new entry into the ring and drops the oldest entry.
      74             :   void Insert(int64_t time_us, bool decision);
      75             : 
      76             :   // Returns the time in microseconds of the most recently added entry.
      77             :   int64_t EndTime() const;
      78             : 
      79             :   // Returns the sum of all intervals during which 'decision' is true within
      80             :   // the time in seconds specified by 'duration'. The returned interval is
      81             :   // in seconds.
      82             :   float RingSum(float duration_sec);
      83             : 
      84             :  private:
      85             :   struct DecisionPoint {  // one ring entry
      86             :     int64_t time_us;  // timestamp passed to Insert(); SetRing() seeds it with -1
      87             :     bool decision;  // decision flag recorded for that timestamp
      88             :   };
      89             : 
      90             :   std::vector<DecisionPoint> decision_points_;
      91             :   int insertion_index_;  // Index at which the next item gets added/inserted.
      92             : 
      93             :   HistoryRing(const HistoryRing&);  // declared private, no definition visible here -- presumably to disallow copying
      94             :   void operator=(const HistoryRing&);  // same as above for assignment
      95             : };
      96             : 
      97           0 : void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {  // Discards all history and rebuilds the ring with |size| entries.
      98           0 :   insertion_index_ = 0;  // next Insert() writes slot 0
      99           0 :   decision_points_.clear();  // clear first so resize() fills every slot with |init|
     100           0 :   DecisionPoint init = { -1, initial_state };  // time_us == -1 for every fresh slot
     101           0 :   decision_points_.resize(size, init);
     102           0 : }
     103             : 
     104           0 : void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {  // Overwrites the slot at the insertion index, then advances it.
     105           0 :   decision_points_[insertion_index_].time_us = time_us;
     106           0 :   decision_points_[insertion_index_].decision = decision;
     107           0 :   insertion_index_ = (insertion_index_ + 1) % decision_points_.size();  // wrap around; NOTE(review): no guard for an empty ring (size() == 0)
     108           0 : }
     109             : 
     110           0 : int64_t EnergyEndpointer::HistoryRing::EndTime() const {  // Returns time_us of the slot just before the insertion index.
     111           0 :   int ind = insertion_index_ - 1;  // slot written by the latest Insert()
     112           0 :   if (ind < 0)
     113           0 :     ind = decision_points_.size() - 1;  // wrap to the last slot; NOTE(review): no guard for an empty ring
     114           0 :   return decision_points_[ind].time_us;
     115             : }
     116             : 
     117           0 : float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {  // Sums "on" intervals walking backwards from the newest entry.
     118           0 :   if (!decision_points_.size())
     119           0 :     return 0.0;  // empty ring: nothing to sum
     120             : 
     121           0 :   int64_t sum_us = 0;  // accumulated on-time in microseconds
     122           0 :   int ind = insertion_index_ - 1;  // start at the most recently inserted slot
     123           0 :   if (ind < 0)
     124           0 :     ind = decision_points_.size() - 1;
     125           0 :   int64_t end_us = decision_points_[ind].time_us;
     126           0 :   bool is_on = decision_points_[ind].decision;
     127           0 :   int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));  // window start = newest time - duration
     128           0 :   if (start_us < 0)
     129           0 :     start_us = 0;  // clamp the window start at time zero
     130           0 :   size_t n_summed = 1;  // n points ==> (n-1) intervals
     131           0 :   while ((decision_points_[ind].time_us > start_us) &&
     132           0 :          (n_summed < decision_points_.size())) {  // stop at the window start or after visiting every slot once
     133           0 :     --ind;  // step to the previous (older) slot, wrapping at the front
     134           0 :     if (ind < 0)
     135           0 :       ind = decision_points_.size() - 1;
     136           0 :     if (is_on)  // is_on still holds the decision of the later of the two points
     137           0 :       sum_us += end_us - decision_points_[ind].time_us;
     138           0 :     is_on = decision_points_[ind].decision;
     139           0 :     end_us = decision_points_[ind].time_us;
     140           0 :     n_summed++;
     141             :   }
     142             : 
     143           0 :   return 1.0e-6f * sum_us;  //  Returns total time that was super threshold, converted to seconds.
     144             : }
     145             : 
     146           0 : EnergyEndpointer::EnergyEndpointer()  // Sets every listed member to a fixed default.
     147             :     : status_(EP_PRE_SPEECH),  // state machine starts in pre-speech
     148             :       offset_confirm_dur_sec_(0),
     149             :       endpointer_time_us_(0),
     150             :       fast_update_frames_(0),
     151             :       frame_counter_(0),
     152             :       max_window_dur_(4.0),  // replaced in Init() by the largest configured window
     153             :       sample_rate_(0),
     154           0 :       history_(new HistoryRing()),  // ring starts empty; Restart() sizes it via SetRing()
     155             :       decision_threshold_(0),
     156             :       estimating_environment_(false),
     157             :       noise_level_(0),
     158             :       rms_adapt_(0),
     159             :       start_lag_(0),
     160             :       end_lag_(0),
     161           0 :       user_input_start_time_us_(0) {
     162           0 : }
     163             : 
     164           0 : EnergyEndpointer::~EnergyEndpointer() {  // Empty body: nothing is released explicitly here.
     165           0 : }
     166             : 
     167           0 : int EnergyEndpointer::TimeToFrame(float time) const {  // Converts a duration to a frame count; |time| is in the unit of params_.frame_period().
     168           0 :   return static_cast<int32_t>(0.5 + (time / params_.frame_period()));  // +0.5 then truncating cast
     169             : }
     170             : 
     171           0 : void EnergyEndpointer::Restart(bool reset_threshold) {  // Returns to EP_PRE_SPEECH and rebuilds the history ring.
     172           0 :   status_ = EP_PRE_SPEECH;
     173           0 :   user_input_start_time_us_ = 0;
     174             : 
     175           0 :   if (reset_threshold) {  // optionally restore threshold and level estimates from params_
     176           0 :     decision_threshold_ = params_.decision_threshold();
     177           0 :     rms_adapt_ = decision_threshold_;
     178           0 :     noise_level_ = params_.decision_threshold() / 2.0f;  // half the configured decision threshold
     179           0 :     frame_counter_ = 0;  // Used for rapid initial update of levels.
     180             :   }
     181             : 
     182             :   // Set up the memories to hold the history windows. Every entry starts
     183           0 :   history_->SetRing(TimeToFrame(max_window_dur_), false);  // with decision == false
     184             : 
     185             :   // Flag that indicates that current input should be used for
     186             :   // estimating the environment. The user has not yet started input
     187             :   // by e.g. pressing the push-to-talk button. By default, this is
     188             :   // false for backward compatibility.
     189           0 :   estimating_environment_ = false;
     190           0 : }
     191             : 
     192           0 : void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {  // Stores |params| and derives all dependent settings.
     193           0 :   params_ = params;
     194             : 
     195             :   // Find the longest history interval to be used, and make the ring
     196             :   // large enough to accommodate that number of frames.  NOTE: This
     197             :   // depends upon ep_frame_period being set correctly in the factory
     198             :   // that did this instantiation.
     199           0 :   max_window_dur_ = params_.onset_window();
     200           0 :   if (params_.speech_on_window() > max_window_dur_)
     201           0 :     max_window_dur_ = params_.speech_on_window();
     202           0 :   if (params_.offset_window() > max_window_dur_)
     203           0 :     max_window_dur_ = params_.offset_window();
     204           0 :   Restart(true);  // sizes the ring from max_window_dur_ and resets thresholds from params_
     205             : 
     206           0 :   offset_confirm_dur_sec_ = params_.offset_window() -
     207           0 :                             params_.offset_confirm_dur();
     208           0 :   if (offset_confirm_dur_sec_ < 0.0)
     209           0 :     offset_confirm_dur_sec_ = 0.0;  // clamp: never negative
     210             : 
     211           0 :   user_input_start_time_us_ = 0;  // already zeroed by Restart(true) above; repeated here
     212             : 
     213             :   // Flag that indicates that current input should be used for
     214             :   // estimating the environment. The user has not yet started input
     215             :   // by e.g. pressing the push-to-talk button. By default, this is
     216             :   // false for backward compatibility.
     217           0 :   estimating_environment_ = false;  // also already cleared by Restart(true)
     218             :   // The initial value of the noise and speech levels is inconsequential.
     219             :   // The level of the first frame will overwrite these values.
     220           0 :   noise_level_ = params_.decision_threshold() / 2.0f;  // same value Restart(true) assigned
     221           0 :   fast_update_frames_ =
     222           0 :       static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());  // truncating cast, no rounding
     223             : 
     224           0 :   frame_counter_ = 0;  // Used for rapid initial update of levels.
     225             : 
     226           0 :   sample_rate_ = params_.sample_rate();
     227           0 :   start_lag_ = static_cast<int>(sample_rate_ /
     228           0 :                                 params_.max_fundamental_frequency());  // samples per period at the highest fundamental
     229           0 :   end_lag_ = static_cast<int>(sample_rate_ /
     230           0 :                               params_.min_fundamental_frequency());  // samples per period at the lowest fundamental
     231           0 : }
     232             : 
     233           0 : void EnergyEndpointer::StartSession() {  // Begins a session by fully restarting the endpointer.
     234           0 :   Restart(true);  // true: thresholds and level estimates are reset as well
     235           0 : }
     236             : 
     237           0 : void EnergyEndpointer::EndSession() {  // Ends a session; only the status changes.
     238           0 :   status_ = EP_POST_SPEECH;  // ProcessAudioFrame()'s switch has no case for this state (hits default)
     239           0 : }
     240             : 
     241           0 : void EnergyEndpointer::SetEnvironmentEstimationMode() {  // Restarts and switches to environment-estimation mode.
     242           0 :   Restart(true);
     243           0 :   estimating_environment_ = true;  // set after Restart(), which clears this flag
     244           0 : }
     245             : 
     246           0 : void EnergyEndpointer::SetUserInputMode() {  // Leaves environment-estimation mode.
     247           0 :   estimating_environment_ = false;
     248           0 :   user_input_start_time_us_ = endpointer_time_us_;  // time of the last processed frame marks the start of user input
     249           0 : }
     250             : 
     251           0 : void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,  // Consumes one frame: updates state, thresholds and levels.
     252             :                                          const int16_t* samples,
     253             :                                          int num_samples,
     254             :                                          float* rms_out) {  // rms_out may be null (checked before use)
     255           0 :   endpointer_time_us_ = time_us;
     256           0 :   float rms = RMS(samples, num_samples);
     257             : 
     258             :   // Check that this is user input audio vs. pre-input adaptation audio.
     259             :   // Input audio starts when the user indicates start of input, by e.g.
     260             :   // pressing push-to-talk. Audio received prior to that is used to update
     261             :   // noise and speech level estimates.
     262           0 :   if (!estimating_environment_) {
     263           0 :     bool decision = false;
     264           0 :     if ((endpointer_time_us_ - user_input_start_time_us_) <
     265           0 :         Secs2Usecs(params_.contamination_rejection_period())) {  // still inside the rejection period after input start
     266           0 :       decision = false;
     267             :       //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
     268             :     } else {
     269           0 :       decision = (rms > decision_threshold_);  // frame counts as "on" when its level exceeds the threshold
     270             :     }
     271             : 
     272           0 :     history_->Insert(endpointer_time_us_, decision);
     273             : 
     274           0 :     switch (status_) {  // state machine driven by on-time sums over the history ring
     275             :       case EP_PRE_SPEECH:
     276           0 :         if (history_->RingSum(params_.onset_window()) >
     277           0 :             params_.onset_detect_dur()) {
     278           0 :           status_ = EP_POSSIBLE_ONSET;
     279             :         }
     280           0 :         break;
     281             : 
     282             :       case EP_POSSIBLE_ONSET: {
     283           0 :         float tsum = history_->RingSum(params_.onset_window());
     284           0 :         if (tsum > params_.onset_confirm_dur()) {
     285           0 :           status_ = EP_SPEECH_PRESENT;
     286             :         } else {  // If signal is not maintained, drop back to pre-speech.
     287           0 :           if (tsum <= params_.onset_detect_dur())
     288           0 :             status_ = EP_PRE_SPEECH;
     289             :         }
     290           0 :         break;
     291             :       }
     292             : 
     293             :       case EP_SPEECH_PRESENT: {
     294             :         // To induce hysteresis in the state residency, we allow a
     295             :         // smaller residency time in the on_ring, than was required to
     296             :         // enter the SPEECH_PRESENT state.
     297           0 :         float on_time = history_->RingSum(params_.speech_on_window());
     298           0 :         if (on_time < params_.on_maintain_dur())
     299           0 :           status_ = EP_POSSIBLE_OFFSET;
     300           0 :         break;
     301             :       }
     302             : 
     303             :       case EP_POSSIBLE_OFFSET:
     304           0 :         if (history_->RingSum(params_.offset_window()) <=
     305           0 :             offset_confirm_dur_sec_) {
     306             :           // Note that this offset time may be beyond the end
     307             :           // of the input buffer in a real-time system.  It will be up
     308             :           // to the RecognizerSession to decide what to do.
     309           0 :           status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
     310             :         } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
     311           0 :           if (history_->RingSum(params_.speech_on_window()) >=
     312           0 :               params_.on_maintain_dur())
     313           0 :             status_ = EP_SPEECH_PRESENT;
     314             :         }
     315           0 :         break;
     316             : 
     317             :       default:
     318           0 :         break;  // any other status is left untouched
     319             :     }
     320             : 
     321             :     // If this is a quiet, non-speech region, slowly adapt the detection
     322             :     // threshold to be about 6dB above the average RMS.
     323           0 :     if ((!decision) && (status_ == EP_PRE_SPEECH)) {
     324           0 :       decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);  // exponential average toward 2 * rms
     325           0 :       rms_adapt_ = decision_threshold_;
     326             :     } else {
     327             :       // If this is in a speech region, adapt the decision threshold to
     328             :       // be about 10dB below the average RMS. If the noise level is high,
     329             :       // the threshold is pushed up.
     330             :       // Adaptation up to a higher level is 5 times faster than decay to
     331             :       // a lower level.
     332           0 :       if ((status_ == EP_SPEECH_PRESENT) && decision) {
     333           0 :         if (rms_adapt_ > rms) {
     334           0 :           rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);  // decay toward a lower level (weight 0.01)
     335             :         } else {
     336           0 :           rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);  // rise toward a higher level (weight 0.05)
     337             :         }
     338           0 :         float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
     339           0 :         decision_threshold_ = (.90f * decision_threshold_) +
     340           0 :                               (0.10f * target_threshold);
     341             :       }
     342             :     }
     343             : 
     344             :     // Set a floor
     345           0 :     if (decision_threshold_ < params_.min_decision_threshold())
     346           0 :       decision_threshold_ = params_.min_decision_threshold();
     347             :   }
     348             : 
     349             :   // Update speech and noise levels. This runs for every frame, in both modes.
     350           0 :   UpdateLevels(rms);
     351           0 :   ++frame_counter_;
     352             : 
     353           0 :   if (rms_out)
     354           0 :     *rms_out = GetDecibel(rms);  // the frame level is reported in decibels, not as the linear value
     355           0 : }
     356             : 
     357           0 : float EnergyEndpointer::GetNoiseLevelDb() const {  // Returns the current noise level estimate in decibels.
     358           0 :   return GetDecibel(noise_level_);
     359             : }
     360             : 
     361           0 : void EnergyEndpointer::UpdateLevels(float rms) {  // Folds one frame's level into noise_level_ (and maybe the threshold).
     362             :   // Update quickly initially. We assume this is noise and that
     363             :   // speech is 6dB above the noise.
     364           0 :   if (frame_counter_ < fast_update_frames_) {
     365             :     // Alpha increases from 0 to (k-1)/k where k is the number of time
     366             :     // steps in the initial adaptation period.
     367           0 :     float alpha = static_cast<float>(frame_counter_) /
     368           0 :         static_cast<float>(fast_update_frames_);
     369           0 :     noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);  // alpha == 0 on frame 0, so noise_level_ becomes rms
     370             :     //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
     371             :   } else {
     372             :     // Update Noise level. The noise level adapts quickly downward, but
     373             :     // slowly upward. Besides being reported by GetNoiseLevelDb(), noise_level_
     374             :     // also feeds decision_threshold_ below and in ProcessAudioFrame().
     375           0 :     if (noise_level_ < rms)
     376           0 :       noise_level_ = (0.999f * noise_level_) + (0.001f * rms);  // slow rise (weight 0.001)
     377             :     else
     378           0 :       noise_level_ = (0.95f * noise_level_) + (0.05f * rms);  // faster fall (weight 0.05)
     379             :   }
     380           0 :   if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {  // in these cases the threshold tracks the noise level directly
     381           0 :     decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
     382             :     // Set a floor
     383           0 :     if (decision_threshold_ < params_.min_decision_threshold())
     384           0 :       decision_threshold_ = params_.min_decision_threshold();
     385             :   }
     386           0 : }
     387             : 
     388           0 : EpStatus EnergyEndpointer::Status(int64_t* status_time)  const {  // Returns the current state and reports the newest history time.
     389           0 :   *status_time = history_->EndTime();  // status_time is dereferenced unconditionally, so it must be non-null
     390           0 :   return status_;
     391             : }
     392             : 
     393             : }  // namespace mozilla

Generated by: LCOV version 1.13