/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/BindingUtils.h"
#include "mozilla/dom/Element.h"
#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/dom/MediaStreamError.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Preferences.h"
#include "MediaPrefs.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "DOMMediaStream.h"
#include "endpointer.h"

#include "mozilla/dom/SpeechRecognitionEvent.h"
#include "nsContentUtils.h"
#include "nsIDocument.h"
#include "nsIObserverService.h"
#include "nsIPermissionManager.h"
#include "nsIPrincipal.h"
#include "nsPIDOMWindow.h"
#include "nsServiceManagerUtils.h"
#include "nsQueryObject.h"

#include <algorithm>

// Undo the windows.h damage
#if defined(XP_WIN) && defined(GetMessage)
#undef GetMessage
#endif

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE_PREFIX "pocketsphinx-"
#define DEFAULT_RECOGNITION_SERVICE "pocketsphinx-en-US"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of samples corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms.
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

LogModule*
GetSpeechRecognitionLog()
{
  static LazyLogModule sLog("SpeechRecognition");
  return sLog;
}
#define SR_LOG(...) MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

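// Pick the recognition service implementation: an explicit language selects
// "pocketsphinx-<lang>", otherwise the media.webspeech.service.default pref
// is consulted, falling back to pocketsphinx-en-US. When the fake-service
// test pref is set, the fake service is substituted regardless.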
already_AddRefed<nsISpeechRecognitionService>
GetSpeechRecognitionService(const nsAString& aLang)
{
  nsAutoCString speechRecognitionServiceCID;

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);
  nsAutoCString speechRecognitionService;

  if (!aLang.IsEmpty()) {
    speechRecognitionService =
      NS_LITERAL_CSTRING(DEFAULT_RECOGNITION_SERVICE_PREFIX) +
      NS_ConvertUTF16toUTF8(aLang);
  } else if (!prefValue.IsEmpty()) {
    speechRecognitionService = prefValue;
  } else {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  }

  if (MediaPrefs::WebSpeechFakeRecognitionService()) {
    speechRecognitionServiceCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
  } else {
    speechRecognitionServiceCID =
      NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
      speechRecognitionService;
  }

  nsresult rv;
  nsCOMPtr<nsISpeechRecognitionService> recognitionService;
  recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  return recognitionService.forget();
}

NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition, DOMEventTargetHelper, mDOMStream, mSpeechGrammarList)

NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION_INHERITED(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
  , mSpeechGrammarList(new SpeechGrammarList(GetParentObject()))
  , mInterimResults(false)
  , mMaxAlternatives(1)
{
  SR_LOG("created SpeechRecognition");

  if (MediaPrefs::WebSpeechTestEnabled()) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

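  // The endpointer lengths below appear to be in microseconds (1250000 =
  // 1.25s, 2500000 = 2.5s, 3 * 1000000 = 3s); see endpointer.h for the
  // authoritative units.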
  mEndpointer.set_speech_input_complete_silence_length(
    Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000));
  mEndpointer.set_long_speech_input_complete_silence_length(
    Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000));
  mEndpointer.set_long_speech_length(
    Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

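// True when mCurrentState lies between begin and end, inclusive; this relies
// on the declaration order of the FSMState enum.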
bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx, JS::Handle<JSObject*> aGivenProto)
{
  return SpeechRecognitionBinding::Wrap(aCx, this, aGivenProto);
}

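// Recognition is authorized when the WebSpeech recognition pref is enabled
// and the page either holds the "speech-recognition" permission or one of
// the force-enable/test prefs is set.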
bool
SpeechRecognition::IsAuthorized(JSContext* aCx, JSObject* aGlobal)
{
  nsCOMPtr<nsIPrincipal> principal = nsContentUtils::ObjectPrincipal(aGlobal);

  nsresult rv;
  nsCOMPtr<nsIPermissionManager> mgr = do_GetService(NS_PERMISSIONMANAGER_CONTRACTID, &rv);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return false;
  }

  uint32_t speechRecognition = nsIPermissionManager::UNKNOWN_ACTION;
  rv = mgr->TestExactPermissionFromPrincipal(principal, "speech-recognition", &speechRecognition);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return false;
  }

  bool hasPermission = (speechRecognition == nsIPermissionManager::ALLOW_ACTION);

  return (hasPermission || MediaPrefs::WebSpeechRecognitionForceEnabled()
          || MediaPrefs::WebSpeechTestEnabled())
         && MediaPrefs::WebSpeechRecognitionEnabled();
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  RefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}

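// Drive the FSM: every (state, event) pair maps to exactly one handler below.
// Pairs that should be impossible deliberately MOZ_CRASH.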
void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    // The endpointer reports a level estimate through |out|; we only need
    // the state it accumulates internally, so the value is discarded.
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
  return samples;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 * DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 * the state is still what the method expected it to be.
 ****************************************************************************/

void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  // Dispatching "audiostart" may have spun the event loop and moved the FSM;
  // only fire "start" if we are still estimating (see rule 3 above).
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just inform
      // the service.
      StopRecordingAndRecognize(aEvent);
    }
  }
}

void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  RootedDictionary<SpeechRecognitionEventInit> init(RootingCx());
  init.mBubbles = true;
  init.mCancelable = false;
  // init.mResultIndex = 0;
  init.mResults = aEvent->mRecognitionResultList;
  init.mInterpretation = JS::NullValue();
  // init.mEmma = nullptr;

  RefPtr<SpeechRecognitionEvent> event =
    SpeechRecognitionEvent::Constructor(this, NS_LITERAL_STRING("result"), init);
  event->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(event, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (mDOMStream) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  if (NS_WARN_IF(!mDOMStream->GetPlaybackStream())) {
    return NS_ERROR_UNEXPECTED;
  }
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetPlaybackStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  // Arm the one-shot no-speech timeout; if it fires before speech is
  // detected, Observe() reports a "No speech detected" error.
  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // We only really need to remove the listener explicitly when testing:
  // our JS code still holds a reference to mDOMStream, so merely assigning
  // it nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetPlaybackStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

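// nsIObserver entry point: handles the no-speech timeout armed in
// StartRecording() as well as the test-only topics registered in the
// constructor.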
NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (MediaPrefs::WebSpeechFakeFSMEvents() &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else {
    NS_ASSERTION(MediaPrefs::WebSpeechFakeRecognitionService(),
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::Grammars() const
{
  RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList;
  return speechGrammarList.forget();
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg)
{
  mSpeechGrammarList = &aArg;
}

void
SpeechRecognition::GetLang(nsString& aRetVal) const
{
  aRetVal = mLang;
}

void
SpeechRecognition::SetLang(const nsAString& aArg)
{
  mLang = aArg;
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::InterimResults() const
{
  return mInterimResults;
}

void
SpeechRecognition::SetInterimResults(bool aArg)
{
  mInterimResults = aArg;
}

uint32_t
SpeechRecognition::MaxAlternatives() const
{
  return mMaxAlternatives;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg)
{
  mMaxAlternatives = aArg;
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

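// Entry point for start(): validates state, picks a recognition service,
// checks the grammar list, then records either the caller-supplied stream
// or a microphone stream obtained through MediaManager, and finally queues
// EVENT_START for the FSM.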
void
SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream,
                         CallerType aCallerType,
                         ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  if (!SetRecognitionService(aRv)) {
    return;
  }

  if (!ValidateAndSetGrammarList(aRv)) {
    return;
  }

  nsresult rv;
  rv = mRecognitionService->Initialize(this);
  if (NS_WARN_IF(NS_FAILED(rv))) {
    return;
  }

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (aStream.WasPassed()) {
    StartRecording(&aStream.Value());
  } else {
    AutoNoJSAPI();
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this),
                          aCallerType);
  }

  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

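// Select a recognition service from this object's lang attribute when set,
// falling back to the lang of the document's root element (see the spec
// link below).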
bool
SpeechRecognition::SetRecognitionService(ErrorResult& aRv)
{
  // See: https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang
  if (!mLang.IsEmpty()) {
    mRecognitionService = GetSpeechRecognitionService(mLang);

    if (!mRecognitionService) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }

    return true;
  }

  nsCOMPtr<nsPIDOMWindowInner> window = GetOwner();
  if (!window) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }
  nsCOMPtr<nsIDocument> document = window->GetExtantDoc();
  if (!document) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }
  nsCOMPtr<Element> element = document->GetRootElement();
  if (!element) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  nsAutoString lang;
  element->GetLang(lang);
  mRecognitionService = GetSpeechRecognitionService(lang);

  if (!mRecognitionService) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  return true;
}

bool
SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv)
{
  if (!mSpeechGrammarList) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  uint32_t grammarListLength = mSpeechGrammarList->Length();
  if (0 == grammarListLength) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return false;
  }

  for (uint32_t count = 0; count < grammarListLength; ++count) {
    RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv);
    if (aRv.Failed()) {
      return false;
    }
    if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList(speechGrammar.get(), nullptr))) {
      aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
      return false;
    }
  }

  return true;
}

void
SpeechRecognition::Stop()
{
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  RefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage);

  RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until it holds a full chunk
 * of mAudioSamplesPerChunk samples.
 * Updates mBufferedSamples and returns the number of samples that were
 * buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a buffer of samples into chunks of mAudioSamplesPerChunk samples,
 * appending each chunk to the array received as argument.
 * Returns the offset of the end of the last complete chunk that was created;
 * any remaining samples are left for the caller to buffer.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<RefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    RefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk.forget());
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    RefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    AutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk,
                          PRINCIPAL_HANDLE_NONE);
  }

  return segment;
}

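// Called off the main thread with freshly captured audio. Buffers and chunks
// the samples, then forwards them to the FSM on the main thread as an
// EVENT_AUDIO_DATA SpeechEvent.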
void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider, TrackRate aTrackRate)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  RefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  event->mTrackRate = aTrackRate;
  NS_DispatchToMainThread(event);
}

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  RefPtr<DOMMediaStream> stream = do_QueryObject(aStream);
  if (!stream) {
    return NS_ERROR_NO_INTERFACE;
  }
  mRecognition->StartRecording(stream);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(nsISupports* aError)
{
  RefPtr<MediaStreamError> error = do_QueryObject(aError);
  if (!error) {
    return NS_OK;
  }
  SpeechRecognitionErrorCode errorCode;

  nsAutoString name;
  error->GetName(name);
  if (name.EqualsLiteral("PERMISSION_DENIED")) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  nsAutoString message;
  error->GetMessage(message);
  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              message);
  return NS_OK;
}

} // namespace dom
} // namespace mozilla
|