Line data Source code
1 : // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 : //
3 : // Redistribution and use in source and binary forms, with or without
4 : // modification, are permitted provided that the following conditions are
5 : // met:
6 : //
7 : // * Redistributions of source code must retain the above copyright
8 : // notice, this list of conditions and the following disclaimer.
9 : // * Redistributions in binary form must reproduce the above
10 : // copyright notice, this list of conditions and the following disclaimer
11 : // in the documentation and/or other materials provided with the
12 : // distribution.
13 : // * Neither the name of Google Inc. nor the names of its
14 : // contributors may be used to endorse or promote products derived from
15 : // this software without specific prior written permission.
16 : //
17 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 :
29 : #include "energy_endpointer.h"
30 :
31 : #include <math.h>
32 :
33 : namespace {
34 :
// Returns the RMS of the input signal measured about its mean, i.e. the
// square root of (mean of squares - square of mean). This is the population
// standard deviation of the samples, so a constant (pure DC) signal yields 0.
//
// samples:     buffer holding at least |num_samples| 16-bit values.
// num_samples: number of values to read. When it is zero or negative the
//              buffer is not read and 0 is returned; dividing by it would
//              otherwise yield NaN.
float RMS(const int16_t* samples, int num_samples) {
  if (num_samples <= 0)
    return 0.0f;

  // Accumulate in 64-bit integers so that neither the running sum nor the
  // running sum of squares can overflow for any realistic frame length.
  int64_t ssq_int64_t = 0;
  int64_t sum_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // Now convert to floating point.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;
  const double variance = mean_square - (mean * mean);

  // Rounding in the two divisions can leave the variance of a nearly constant
  // signal marginally below zero; sqrt() of that would be NaN.
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
49 :
// Converts a duration in seconds to whole microseconds. One half is added
// before the truncating cast, which rounds non-negative inputs to the
// nearest microsecond.
int64_t Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64_t>(0.5 + microseconds);
}
53 :
// Converts a linear amplitude to decibels (20 * log10). Inputs that do not
// exceed 1.0e-100 (zero, negative values and NaN) map to a floor of -2000
// instead of being passed to log10().
float GetDecibel(float value) {
  const bool above_floor = value > 1.0e-100;
  if (!above_floor)
    return -2000.0;
  return 20 * log10(value);
}
59 :
60 : } // namespace
61 :
62 : namespace mozilla {
63 :
64 : // Stores threshold-crossing histories for making decisions about the speech
65 : // state.
66 0 : class EnergyEndpointer::HistoryRing {
67 : public:
68 0 : HistoryRing() : insertion_index_(0) {}
69 :
70 : // Resets the ring to |size| elements each with state |initial_state|
71 : void SetRing(int size, bool initial_state);
72 :
73 : // Inserts a new entry into the ring and drops the oldest entry.
74 : void Insert(int64_t time_us, bool decision);
75 :
76 : // Returns the time in microseconds of the most recently added entry.
77 : int64_t EndTime() const;
78 :
79 : // Returns the sum of all intervals during which 'decision' is true within
80 : // the time in seconds specified by 'duration'. The returned interval is
81 : // in seconds.
82 : float RingSum(float duration_sec);
83 :
84 : private:
85 : struct DecisionPoint {
86 : int64_t time_us;
87 : bool decision;
88 : };
89 :
90 : std::vector<DecisionPoint> decision_points_;
91 : int insertion_index_; // Index at which the next item gets added/inserted.
92 :
93 : HistoryRing(const HistoryRing&);
94 : void operator=(const HistoryRing&);
95 : };
96 :
97 0 : void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
98 0 : insertion_index_ = 0;
99 0 : decision_points_.clear();
100 0 : DecisionPoint init = { -1, initial_state };
101 0 : decision_points_.resize(size, init);
102 0 : }
103 :
104 0 : void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
105 0 : decision_points_[insertion_index_].time_us = time_us;
106 0 : decision_points_[insertion_index_].decision = decision;
107 0 : insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
108 0 : }
109 :
110 0 : int64_t EnergyEndpointer::HistoryRing::EndTime() const {
111 0 : int ind = insertion_index_ - 1;
112 0 : if (ind < 0)
113 0 : ind = decision_points_.size() - 1;
114 0 : return decision_points_[ind].time_us;
115 : }
116 :
117 0 : float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
118 0 : if (!decision_points_.size())
119 0 : return 0.0;
120 :
121 0 : int64_t sum_us = 0;
122 0 : int ind = insertion_index_ - 1;
123 0 : if (ind < 0)
124 0 : ind = decision_points_.size() - 1;
125 0 : int64_t end_us = decision_points_[ind].time_us;
126 0 : bool is_on = decision_points_[ind].decision;
127 0 : int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
128 0 : if (start_us < 0)
129 0 : start_us = 0;
130 0 : size_t n_summed = 1; // n points ==> (n-1) intervals
131 0 : while ((decision_points_[ind].time_us > start_us) &&
132 0 : (n_summed < decision_points_.size())) {
133 0 : --ind;
134 0 : if (ind < 0)
135 0 : ind = decision_points_.size() - 1;
136 0 : if (is_on)
137 0 : sum_us += end_us - decision_points_[ind].time_us;
138 0 : is_on = decision_points_[ind].decision;
139 0 : end_us = decision_points_[ind].time_us;
140 0 : n_summed++;
141 : }
142 :
143 0 : return 1.0e-6f * sum_us; // Returns total time that was super threshold.
144 : }
145 :
146 0 : EnergyEndpointer::EnergyEndpointer()
147 : : status_(EP_PRE_SPEECH),
148 : offset_confirm_dur_sec_(0),
149 : endpointer_time_us_(0),
150 : fast_update_frames_(0),
151 : frame_counter_(0),
152 : max_window_dur_(4.0),
153 : sample_rate_(0),
154 0 : history_(new HistoryRing()),
155 : decision_threshold_(0),
156 : estimating_environment_(false),
157 : noise_level_(0),
158 : rms_adapt_(0),
159 : start_lag_(0),
160 : end_lag_(0),
161 0 : user_input_start_time_us_(0) {
162 0 : }
163 :
164 0 : EnergyEndpointer::~EnergyEndpointer() {
165 0 : }
166 :
167 0 : int EnergyEndpointer::TimeToFrame(float time) const {
168 0 : return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
169 : }
170 :
171 0 : void EnergyEndpointer::Restart(bool reset_threshold) {
172 0 : status_ = EP_PRE_SPEECH;
173 0 : user_input_start_time_us_ = 0;
174 :
175 0 : if (reset_threshold) {
176 0 : decision_threshold_ = params_.decision_threshold();
177 0 : rms_adapt_ = decision_threshold_;
178 0 : noise_level_ = params_.decision_threshold() / 2.0f;
179 0 : frame_counter_ = 0; // Used for rapid initial update of levels.
180 : }
181 :
182 : // Set up the memories to hold the history windows.
183 0 : history_->SetRing(TimeToFrame(max_window_dur_), false);
184 :
185 : // Flag that indicates that current input should be used for
186 : // estimating the environment. The user has not yet started input
187 : // by e.g. pressed the push-to-talk button. By default, this is
188 : // false for backward compatibility.
189 0 : estimating_environment_ = false;
190 0 : }
191 :
192 0 : void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
193 0 : params_ = params;
194 :
195 : // Find the longest history interval to be used, and make the ring
196 : // large enough to accommodate that number of frames. NOTE: This
197 : // depends upon ep_frame_period being set correctly in the factory
198 : // that did this instantiation.
199 0 : max_window_dur_ = params_.onset_window();
200 0 : if (params_.speech_on_window() > max_window_dur_)
201 0 : max_window_dur_ = params_.speech_on_window();
202 0 : if (params_.offset_window() > max_window_dur_)
203 0 : max_window_dur_ = params_.offset_window();
204 0 : Restart(true);
205 :
206 0 : offset_confirm_dur_sec_ = params_.offset_window() -
207 0 : params_.offset_confirm_dur();
208 0 : if (offset_confirm_dur_sec_ < 0.0)
209 0 : offset_confirm_dur_sec_ = 0.0;
210 :
211 0 : user_input_start_time_us_ = 0;
212 :
213 : // Flag that indicates that current input should be used for
214 : // estimating the environment. The user has not yet started input
215 : // by e.g. pressed the push-to-talk button. By default, this is
216 : // false for backward compatibility.
217 0 : estimating_environment_ = false;
218 : // The initial value of the noise and speech levels is inconsequential.
219 : // The level of the first frame will overwrite these values.
220 0 : noise_level_ = params_.decision_threshold() / 2.0f;
221 0 : fast_update_frames_ =
222 0 : static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
223 :
224 0 : frame_counter_ = 0; // Used for rapid initial update of levels.
225 :
226 0 : sample_rate_ = params_.sample_rate();
227 0 : start_lag_ = static_cast<int>(sample_rate_ /
228 0 : params_.max_fundamental_frequency());
229 0 : end_lag_ = static_cast<int>(sample_rate_ /
230 0 : params_.min_fundamental_frequency());
231 0 : }
232 :
233 0 : void EnergyEndpointer::StartSession() {
234 0 : Restart(true);
235 0 : }
236 :
237 0 : void EnergyEndpointer::EndSession() {
238 0 : status_ = EP_POST_SPEECH;
239 0 : }
240 :
241 0 : void EnergyEndpointer::SetEnvironmentEstimationMode() {
242 0 : Restart(true);
243 0 : estimating_environment_ = true;
244 0 : }
245 :
246 0 : void EnergyEndpointer::SetUserInputMode() {
247 0 : estimating_environment_ = false;
248 0 : user_input_start_time_us_ = endpointer_time_us_;
249 0 : }
250 :
251 0 : void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
252 : const int16_t* samples,
253 : int num_samples,
254 : float* rms_out) {
255 0 : endpointer_time_us_ = time_us;
256 0 : float rms = RMS(samples, num_samples);
257 :
258 : // Check that this is user input audio vs. pre-input adaptation audio.
259 : // Input audio starts when the user indicates start of input, by e.g.
260 : // pressing push-to-talk. Audio recieved prior to that is used to update
261 : // noise and speech level estimates.
262 0 : if (!estimating_environment_) {
263 0 : bool decision = false;
264 0 : if ((endpointer_time_us_ - user_input_start_time_us_) <
265 0 : Secs2Usecs(params_.contamination_rejection_period())) {
266 0 : decision = false;
267 : //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
268 : } else {
269 0 : decision = (rms > decision_threshold_);
270 : }
271 :
272 0 : history_->Insert(endpointer_time_us_, decision);
273 :
274 0 : switch (status_) {
275 : case EP_PRE_SPEECH:
276 0 : if (history_->RingSum(params_.onset_window()) >
277 0 : params_.onset_detect_dur()) {
278 0 : status_ = EP_POSSIBLE_ONSET;
279 : }
280 0 : break;
281 :
282 : case EP_POSSIBLE_ONSET: {
283 0 : float tsum = history_->RingSum(params_.onset_window());
284 0 : if (tsum > params_.onset_confirm_dur()) {
285 0 : status_ = EP_SPEECH_PRESENT;
286 : } else { // If signal is not maintained, drop back to pre-speech.
287 0 : if (tsum <= params_.onset_detect_dur())
288 0 : status_ = EP_PRE_SPEECH;
289 : }
290 0 : break;
291 : }
292 :
293 : case EP_SPEECH_PRESENT: {
294 : // To induce hysteresis in the state residency, we allow a
295 : // smaller residency time in the on_ring, than was required to
296 : // enter the SPEECH_PERSENT state.
297 0 : float on_time = history_->RingSum(params_.speech_on_window());
298 0 : if (on_time < params_.on_maintain_dur())
299 0 : status_ = EP_POSSIBLE_OFFSET;
300 0 : break;
301 : }
302 :
303 : case EP_POSSIBLE_OFFSET:
304 0 : if (history_->RingSum(params_.offset_window()) <=
305 0 : offset_confirm_dur_sec_) {
306 : // Note that this offset time may be beyond the end
307 : // of the input buffer in a real-time system. It will be up
308 : // to the RecognizerSession to decide what to do.
309 0 : status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
310 : } else { // If speech picks up again we allow return to SPEECH_PRESENT.
311 0 : if (history_->RingSum(params_.speech_on_window()) >=
312 0 : params_.on_maintain_dur())
313 0 : status_ = EP_SPEECH_PRESENT;
314 : }
315 0 : break;
316 :
317 : default:
318 0 : break;
319 : }
320 :
321 : // If this is a quiet, non-speech region, slowly adapt the detection
322 : // threshold to be about 6dB above the average RMS.
323 0 : if ((!decision) && (status_ == EP_PRE_SPEECH)) {
324 0 : decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
325 0 : rms_adapt_ = decision_threshold_;
326 : } else {
327 : // If this is in a speech region, adapt the decision threshold to
328 : // be about 10dB below the average RMS. If the noise level is high,
329 : // the threshold is pushed up.
330 : // Adaptation up to a higher level is 5 times faster than decay to
331 : // a lower level.
332 0 : if ((status_ == EP_SPEECH_PRESENT) && decision) {
333 0 : if (rms_adapt_ > rms) {
334 0 : rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
335 : } else {
336 0 : rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
337 : }
338 0 : float target_threshold = 0.3f * rms_adapt_ + noise_level_;
339 0 : decision_threshold_ = (.90f * decision_threshold_) +
340 0 : (0.10f * target_threshold);
341 : }
342 : }
343 :
344 : // Set a floor
345 0 : if (decision_threshold_ < params_.min_decision_threshold())
346 0 : decision_threshold_ = params_.min_decision_threshold();
347 : }
348 :
349 : // Update speech and noise levels.
350 0 : UpdateLevels(rms);
351 0 : ++frame_counter_;
352 :
353 0 : if (rms_out)
354 0 : *rms_out = GetDecibel(rms);
355 0 : }
356 :
357 0 : float EnergyEndpointer::GetNoiseLevelDb() const {
358 0 : return GetDecibel(noise_level_);
359 : }
360 :
361 0 : void EnergyEndpointer::UpdateLevels(float rms) {
362 : // Update quickly initially. We assume this is noise and that
363 : // speech is 6dB above the noise.
364 0 : if (frame_counter_ < fast_update_frames_) {
365 : // Alpha increases from 0 to (k-1)/k where k is the number of time
366 : // steps in the initial adaptation period.
367 0 : float alpha = static_cast<float>(frame_counter_) /
368 0 : static_cast<float>(fast_update_frames_);
369 0 : noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
370 : //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
371 : } else {
372 : // Update Noise level. The noise level adapts quickly downward, but
373 : // slowly upward. The noise_level_ parameter is not currently used
374 : // for threshold adaptation. It is used for UI feedback.
375 0 : if (noise_level_ < rms)
376 0 : noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
377 : else
378 0 : noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
379 : }
380 0 : if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
381 0 : decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
382 : // Set a floor
383 0 : if (decision_threshold_ < params_.min_decision_threshold())
384 0 : decision_threshold_ = params_.min_decision_threshold();
385 : }
386 0 : }
387 :
388 0 : EpStatus EnergyEndpointer::Status(int64_t* status_time) const {
389 0 : *status_time = history_->EndTime();
390 0 : return status_;
391 : }
392 :
393 : } // namespace mozilla
|