Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim:set ts=2 sw=2 sts=2 et cindent: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #ifndef mozilla_dom_SpeechRecognition_h
8 : #define mozilla_dom_SpeechRecognition_h
9 :
10 : #include "mozilla/Attributes.h"
11 : #include "mozilla/DOMEventTargetHelper.h"
12 : #include "nsCOMPtr.h"
13 : #include "nsString.h"
14 : #include "nsWrapperCache.h"
15 : #include "nsTArray.h"
16 : #include "js/TypeDecls.h"
17 :
18 : #include "DOMMediaStream.h"
19 : #include "nsIDOMNavigatorUserMedia.h"
20 : #include "nsITimer.h"
21 : #include "MediaStreamGraph.h"
22 : #include "AudioSegment.h"
23 : #include "mozilla/WeakPtr.h"
24 :
25 : #include "SpeechGrammarList.h"
26 : #include "SpeechRecognitionResultList.h"
27 : #include "SpeechStreamListener.h"
28 : #include "nsISpeechRecognitionService.h"
29 : #include "endpointer.h"
30 :
31 : #include "mozilla/dom/BindingDeclarations.h"
32 : #include "mozilla/dom/SpeechRecognitionError.h"
33 :
34 : namespace mozilla {
35 :
36 : class DOMMediaStream;
37 :
38 : namespace dom {
39 :
40 : #define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent"
41 : #define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"
42 :
43 : class GlobalObject;
44 : class SpeechEvent;
45 :
46 : LogModule* GetSpeechRecognitionLog();
47 : #define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
48 :
/**
 * DOM implementation of the WebSpeech API's SpeechRecognition interface.
 *
 * Drives a small finite state machine (see FSMState) that moves audio from a
 * DOMMediaStream, through an energy Endpointer for speech detection, into an
 * nsISpeechRecognitionService backend. Results and errors are surfaced to
 * content via the DOM events declared with IMPL_EVENT_HANDLER below.
 *
 * Observes test-harness notifications (the SPEECH_RECOGNITION_TEST_* topics
 * above) via nsIObserver, and hands out weak references so listeners do not
 * keep it alive.
 */
class SpeechRecognition final : public DOMEventTargetHelper,
                                public nsIObserver,
                                public SupportsWeakPtr<SpeechRecognition>
{
public:
  MOZ_DECLARE_WEAKREFERENCE_TYPENAME(SpeechRecognition)
  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);

  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition, DOMEventTargetHelper)

  NS_DECL_NSIOBSERVER

  nsISupports* GetParentObject() const;

  JSObject* WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto) override;

  // Permission gate for exposing the interface to the given global.
  static bool IsAuthorized(JSContext* aCx, JSObject* aGlobal);

  // WebIDL constructor: `new SpeechRecognition()` in content JS.
  static already_AddRefed<SpeechRecognition>
  Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);

  // --- WebIDL attribute accessors -----------------------------------------

  already_AddRefed<SpeechGrammarList> Grammars() const;

  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);

  void GetLang(nsString& aRetVal) const;

  void SetLang(const nsAString& aArg);

  bool GetContinuous(ErrorResult& aRv) const;

  void SetContinuous(bool aArg, ErrorResult& aRv);

  bool InterimResults() const;

  void SetInterimResults(bool aArg);

  uint32_t MaxAlternatives() const;

  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;

  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);

  // --- WebIDL methods ------------------------------------------------------

  // Begin capture; if aStream is provided it is used directly, otherwise
  // capture presumably goes through getUserMedia (see the callback classes
  // below) — confirm against the .cpp.
  void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
             CallerType aCallerType, ErrorResult& aRv);

  void Stop();

  void Abort();

  // DOM event handler attributes (onaudiostart, onresult, ...).
  IMPL_EVENT_HANDLER(audiostart)
  IMPL_EVENT_HANDLER(soundstart)
  IMPL_EVENT_HANDLER(speechstart)
  IMPL_EVENT_HANDLER(speechend)
  IMPL_EVENT_HANDLER(soundend)
  IMPL_EVENT_HANDLER(audioend)
  IMPL_EVENT_HANDLER(result)
  IMPL_EVENT_HANDLER(nomatch)
  IMPL_EVENT_HANDLER(error)
  IMPL_EVENT_HANDLER(start)
  IMPL_EVENT_HANDLER(end)

  // Inputs to the recognition state machine; carried by SpeechEvent.
  enum EventType {
    EVENT_START,
    EVENT_STOP,
    EVENT_ABORT,
    EVENT_AUDIO_DATA,
    EVENT_AUDIO_ERROR,
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
    EVENT_RECOGNITIONSERVICE_ERROR,
    EVENT_COUNT
  };

  // Queue an error event of aErrorType carrying aErrorCode/aMessage.
  void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage);
  // Append samples to mAudioSamplesBuffer; returns how many were consumed.
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
  // Split a raw sample run into mAudioSamplesPerChunk-sized SharedBuffers.
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<RefPtr<SharedBuffer>>& aResult);
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
  // Entry point for captured audio; aProvider keeps the listener (and us)
  // alive until the resulting event is processed (see SpeechEvent::mProvider).
  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);

  friend class SpeechEvent;
private:
  virtual ~SpeechRecognition() {};

  // States of the recognition session FSM (see Transition()).
  enum FSMState {
    STATE_IDLE,
    STATE_STARTING,
    STATE_ESTIMATING,          // estimating environment noise for the endpointer
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
    STATE_COUNT
  };

  void SetState(FSMState state);
  // True when mCurrentState is within [begin, end] — TODO confirm inclusivity
  // against the .cpp.
  bool StateBetween(FSMState begin, FSMState end);

  bool SetRecognitionService(ErrorResult& aRv);
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  // getUserMedia success callback; holds a strong ref back to the recognizer
  // so it survives until the stream arrives.
  class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK

    explicit GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaSuccessCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  // getUserMedia failure callback; mirrors the success callback above.
  class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
  {
  public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK

    explicit GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
      : mRecognition(aRecognition)
    {}

  private:
    virtual ~GetUserMediaErrorCallback() {}

    RefPtr<SpeechRecognition> mRecognition;
  };

  NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
  NS_IMETHOD StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);

  // FSM driver: ProcessEvent validates, Transition dispatches to one of the
  // state-action methods below based on (mCurrentState, aEvent->mType).
  void ProcessEvent(SpeechEvent* aEvent);
  void Transition(SpeechEvent* aEvent);

  void Reset();
  void ResetAndEnd();
  void WaitForAudioData(SpeechEvent* aEvent);
  void StartedAudioCapture(SpeechEvent* aEvent);
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
  void WaitForEstimation(SpeechEvent* aEvent);
  void DetectSpeech(SpeechEvent* aEvent);
  void WaitForSpeechEnd(SpeechEvent* aEvent);
  void NotifyFinalResult(SpeechEvent* aEvent);
  void DoNothing(SpeechEvent* aEvent);
  void AbortSilently(SpeechEvent* aEvent);
  void AbortError(SpeechEvent* aEvent);

  RefPtr<DOMMediaStream> mDOMStream;
  RefPtr<SpeechStreamListener> mSpeechListener;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;

  FSMState mCurrentState;

  // Energy-based voice activity detector fed by ProcessAudioSegment.
  Endpointer mEndpointer;
  // Number of samples used for the STATE_ESTIMATING noise-estimation phase.
  uint32_t mEstimationSamples;

  uint32_t mAudioSamplesPerChunk;

  // buffer holds one chunk of mAudioSamplesPerChunk
  // samples before feeding it to mEndpointer
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
  // Count of valid samples currently in mAudioSamplesBuffer.
  uint32_t mBufferedSamples;

  // Fires if speech is not detected in time — TODO confirm against the .cpp.
  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
  bool mAborted;

  // BCP-47 language tag exposed via the `lang` attribute.
  nsString mLang;

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
  //
  // 1. Default value MUST be false
  // 2. If true, interim results SHOULD be returned
  // 3. If false, interim results MUST NOT be returned
  //
  // Pocketsphinx does not return interm results; so, defaulting
  // mInterimResults to false, then ignoring its subsequent value
  // is a conforming implementation.
  bool mInterimResults;

  // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
  //
  // 1. Default value is 1
  // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives per result"
  //
  // Pocketsphinx can only return at maximum a single SpeechRecognitionAlternative
  // per SpeechRecognitionResult. So defaulting mMaxAlternatives to 1, for all non
  // zero values ignoring mMaxAlternatives while for a 0 value returning no
  // SpeechRecognitionAlternative per result is a conforming implementation.
  uint32_t mMaxAlternatives;

  // Handles SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC notifications from
  // the test harness (see nsIObserver above).
  void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);

  // Human-readable names for SR_LOG diagnostics.
  const char* GetName(FSMState aId);
  const char* GetName(SpeechEvent* aId);
};
256 :
257 : class SpeechEvent : public Runnable
258 : {
259 : public:
260 0 : SpeechEvent(SpeechRecognition* aRecognition,
261 : SpeechRecognition::EventType aType)
262 0 : : Runnable("dom::SpeechEvent")
263 : , mAudioSegment(0)
264 : , mRecognitionResultList(nullptr)
265 : , mError(nullptr)
266 : , mRecognition(aRecognition)
267 : , mType(aType)
268 0 : , mTrackRate(0)
269 : {
270 0 : }
271 :
272 : ~SpeechEvent();
273 :
274 : NS_IMETHOD Run() override;
275 : AudioSegment* mAudioSegment;
276 : RefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: make this a session being passed which also has index and stuff
277 : RefPtr<SpeechRecognitionError> mError;
278 :
279 : friend class SpeechRecognition;
280 : private:
281 : SpeechRecognition* mRecognition;
282 :
283 : // for AUDIO_DATA events, keep a reference to the provider
284 : // of the data (i.e., the SpeechStreamListener) to ensure it
285 : // is kept alive (and keeps SpeechRecognition alive) until this
286 : // event gets processed.
287 : RefPtr<MediaStreamListener> mProvider;
288 : SpeechRecognition::EventType mType;
289 : TrackRate mTrackRate;
290 : };
291 :
292 : } // namespace dom
293 :
294 : inline nsISupports*
295 0 : ToSupports(dom::SpeechRecognition* aRec)
296 : {
297 0 : return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
298 : }
299 :
300 : } // namespace mozilla
301 :
302 : #endif
|