Line data Source code
1 : // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 : //
3 : // Redistribution and use in source and binary forms, with or without
4 : // modification, are permitted provided that the following conditions are
5 : // met:
6 : //
7 : // * Redistributions of source code must retain the above copyright
8 : // notice, this list of conditions and the following disclaimer.
9 : // * Redistributions in binary form must reproduce the above
10 : // copyright notice, this list of conditions and the following disclaimer
11 : // in the documentation and/or other materials provided with the
12 : // distribution.
13 : // * Neither the name of Google Inc. nor the names of its
14 : // contributors may be used to endorse or promote products derived from
15 : // this software without specific prior written permission.
16 : //
17 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 :
29 : #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
30 : #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
31 :
32 : #include "energy_endpointer.h"
33 :
34 : namespace mozilla {
35 :
36 : struct AudioChunk;
37 :
38 : // A simple interface to the underlying energy-endpointer implementation. This
39 : // class lets callers provide audio as it is being recorded and lets them poll
40 : // to find out when the user has stopped speaking.
41 : //
42 : // There are two events that may trigger the end of speech:
43 : //
44 : // speechInputPossiblyComplete event:
45 : //
46 : // Signals that silence/noise has been detected for a *short* amount of
47 : // time after some speech has been detected. It can be used for low latency
48 : // UI feedback. To disable it, set it to a large amount.
49 : //
50 : // speechInputComplete event:
51 : //
52 : // This event is intended to signal end of input and to stop recording.
53 : // The amount of time to wait after speech is set by
54 : // speech_input_complete_silence_length_us_ and optionally two other
55 : // parameters (see below).
56 : // This time can be held constant, or can change as more speech is detected.
57 : // In the latter case, the time changes after a set amount of time from the
58 : // *beginning* of speech. This is motivated by the expectation that there
59 : // will be two distinct types of inputs: short search queries and longer
60 : // dictation style input.
61 : //
62 : // Three parameters are used to define the piecewise constant timeout function.
63 : // The timeout length is speech_input_complete_silence_length until
64 : // long_speech_length, when it changes to
65 : // long_speech_input_complete_silence_length.
66 0 : class Endpointer {
67 : public:
68 : explicit Endpointer(int sample_rate);
69 :
70 : // Start the endpointer. This should be called at the beginning of a session.
71 : void StartSession();
72 :
73 : // Stop the endpointer.
74 : void EndSession();
75 :
76 : // Start environment estimation. Audio will be used for environment estimation
77 : // i.e. noise level estimation.
78 : void SetEnvironmentEstimationMode();
79 :
80 : // Start user input. This should be called when the user indicates start of
81 : // input, e.g. by pressing a button.
82 : void SetUserInputMode();
83 :
84 : // Process a segment of audio, which may be more than one frame.
85 : // The status of the last frame will be returned.
86 : EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
87 :
88 : // Get the status of the endpointer.
89 : EpStatus Status(int64_t *time_us);
90 :
91 : // Get the expected frame size for audio chunks. Audio chunks are expected
92 : // to contain a number of samples that is a multiple of this number, and extra
93 : // samples will be dropped.
94 0 : int32_t FrameSize() const {
95 0 : return frame_size_;
96 : }
97 :
98 : // Returns true if the endpointer detected reasonable audio levels above
99 : // background noise which could be user speech, false if not.
100 0 : bool DidStartReceivingSpeech() const {
101 0 : return speech_previously_detected_;
102 : }
103 :
104 : bool IsEstimatingEnvironment() const {
105 : return energy_endpointer_.estimating_environment();
106 : }
107 :
108 0 : void set_speech_input_complete_silence_length(int64_t time_us) {
109 0 : speech_input_complete_silence_length_us_ = time_us;
110 0 : }
111 :
112 0 : void set_long_speech_input_complete_silence_length(int64_t time_us) {
113 0 : long_speech_input_complete_silence_length_us_ = time_us;
114 0 : }
115 :
116 : void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
117 : speech_input_possibly_complete_silence_length_us_ = time_us;
118 : }
119 :
120 0 : void set_long_speech_length(int64_t time_us) {
121 0 : long_speech_length_us_ = time_us;
122 0 : }
123 :
124 0 : bool speech_input_complete() const {
125 0 : return speech_input_complete_;
126 : }
127 :
128 : // RMS background noise level in dB.
129 : float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
130 :
131 : private:
132 : // Reset internal states. Helper method common to initial input utterance
133 : // and following input utternaces.
134 : void Reset();
135 :
136 : // Minimum allowable length of speech input.
137 : int64_t speech_input_minimum_length_us_;
138 :
139 : // The speechInputPossiblyComplete event signals that silence/noise has been
140 : // detected for a *short* amount of time after some speech has been detected.
141 : // This proporty specifies the time period.
142 : int64_t speech_input_possibly_complete_silence_length_us_;
143 :
144 : // The speechInputComplete event signals that silence/noise has been
145 : // detected for a *long* amount of time after some speech has been detected.
146 : // This property specifies the time period.
147 : int64_t speech_input_complete_silence_length_us_;
148 :
149 : // Same as above, this specifies the required silence period after speech
150 : // detection. This period is used instead of
151 : // speech_input_complete_silence_length_ when the utterance is longer than
152 : // long_speech_length_. This parameter is optional.
153 : int64_t long_speech_input_complete_silence_length_us_;
154 :
155 : // The period of time after which the endpointer should consider
156 : // long_speech_input_complete_silence_length_ as a valid silence period
157 : // instead of speech_input_complete_silence_length_. This parameter is
158 : // optional.
159 : int64_t long_speech_length_us_;
160 :
161 : // First speech onset time, used in determination of speech complete timeout.
162 : int64_t speech_start_time_us_;
163 :
164 : // Most recent end time, used in determination of speech complete timeout.
165 : int64_t speech_end_time_us_;
166 :
167 : int64_t audio_frame_time_us_;
168 : EpStatus old_ep_status_;
169 : bool waiting_for_speech_possibly_complete_timeout_;
170 : bool waiting_for_speech_complete_timeout_;
171 : bool speech_previously_detected_;
172 : bool speech_input_complete_;
173 : EnergyEndpointer energy_endpointer_;
174 : int sample_rate_;
175 : int32_t frame_size_;
176 : };
177 :
178 : } // namespace mozilla
179 :
180 : #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
|