Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(289)

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9972008: Refactoring of chrome speech recognition architecture (CL1.6) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Rebased from master. Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/browser/speech/speech_recognizer_impl.h" 5 #include "content/browser/speech/speech_recognizer_impl.h"
6 6
7 #include "base/basictypes.h" 7 #include "base/basictypes.h"
8 #include "base/bind.h" 8 #include "base/bind.h"
9 #include "base/time.h" 9 #include "base/time.h"
10 #include "content/browser/browser_main_loop.h" 10 #include "content/browser/browser_main_loop.h"
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
56 if (samples[i] <= -32767 || samples[i] >= 32767) { 56 if (samples[i] <= -32767 || samples[i] >= 32767) {
57 if (++clipping_samples > kThreshold) 57 if (++clipping_samples > kThreshold)
58 return true; 58 return true;
59 } 59 }
60 } 60 }
61 return false; 61 return false;
62 } 62 }
63 63
64 } // namespace 64 } // namespace
65 65
66 // TODO(primiano) Create(...) is transitional (until we fix speech input
67 // extensions) and should be removed soon. The manager should be the only one
68 // knowing the existence of SpeechRecognizer(Impl), thus the only one in charge
69 // of instantiating it.
66 SpeechRecognizer* SpeechRecognizer::Create( 70 SpeechRecognizer* SpeechRecognizer::Create(
67 SpeechRecognitionEventListener* listener, 71 SpeechRecognitionEventListener* listener,
68 int caller_id, 72 int session_id,
69 const std::string& language, 73 const std::string& language,
70 const std::string& grammar, 74 const std::string& grammar,
71 net::URLRequestContextGetter* context_getter, 75 net::URLRequestContextGetter* context_getter,
72 bool filter_profanities, 76 bool filter_profanities,
73 const std::string& hardware_info, 77 const std::string& hardware_info,
74 const std::string& origin_url) { 78 const std::string& origin_url) {
75 speech::GoogleOneShotRemoteEngineConfig remote_engine_config; 79 speech::GoogleOneShotRemoteEngineConfig remote_engine_config;
76 remote_engine_config.language = language; 80 remote_engine_config.language = language;
77 remote_engine_config.grammar = grammar; 81 remote_engine_config.grammar = grammar;
78 remote_engine_config.audio_sample_rate = 82 remote_engine_config.audio_sample_rate =
79 speech::SpeechRecognizerImpl::kAudioSampleRate; 83 speech::SpeechRecognizerImpl::kAudioSampleRate;
80 remote_engine_config.audio_num_bits_per_sample = 84 remote_engine_config.audio_num_bits_per_sample =
81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; 85 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
82 remote_engine_config.filter_profanities = filter_profanities; 86 remote_engine_config.filter_profanities = filter_profanities;
83 remote_engine_config.hardware_info = hardware_info; 87 remote_engine_config.hardware_info = hardware_info;
84 remote_engine_config.origin_url = origin_url; 88 remote_engine_config.origin_url = origin_url;
85 89
86 // SpeechRecognizerImpl takes ownership of google_remote_engine. 90 // SpeechRecognizerImpl takes ownership of google_remote_engine.
87 speech::GoogleOneShotRemoteEngine* google_remote_engine = 91 speech::GoogleOneShotRemoteEngine* google_remote_engine =
88 new speech::GoogleOneShotRemoteEngine(context_getter); 92 new speech::GoogleOneShotRemoteEngine(context_getter);
89 google_remote_engine->SetConfig(remote_engine_config); 93 google_remote_engine->SetConfig(remote_engine_config);
90 94
91 return new speech::SpeechRecognizerImpl(listener, 95 return new speech::SpeechRecognizerImpl(listener,
92 caller_id, 96 session_id,
93 google_remote_engine); 97 google_remote_engine);
94 } 98 }
95 99
96 namespace speech { 100 namespace speech {
97 101
98 const int SpeechRecognizerImpl::kAudioSampleRate = 16000; 102 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
99 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO; 103 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;
100 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; 104 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
101 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; 105 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
102 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; 106 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
103 107
104 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, 108 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
105 kNumBitsPerAudioSample_must_be_a_multiple_of_8); 109 kNumBitsPerAudioSample_must_be_a_multiple_of_8);
106 110
107 SpeechRecognizerImpl::SpeechRecognizerImpl( 111 SpeechRecognizerImpl::SpeechRecognizerImpl(
108 SpeechRecognitionEventListener* listener, 112 SpeechRecognitionEventListener* listener,
109 int caller_id, 113 int session_id,
110 SpeechRecognitionEngine* engine) 114 SpeechRecognitionEngine* engine)
111 : listener_(listener), 115 : listener_(listener),
112 testing_audio_manager_(NULL), 116 testing_audio_manager_(NULL),
113 recognition_engine_(engine), 117 recognition_engine_(engine),
114 endpointer_(kAudioSampleRate), 118 endpointer_(kAudioSampleRate),
115 caller_id_(caller_id), 119 session_id_(session_id),
116 is_dispatching_event_(false), 120 is_dispatching_event_(false),
117 state_(STATE_IDLE) { 121 state_(STATE_IDLE) {
118 DCHECK(listener_ != NULL); 122 DCHECK(listener_ != NULL);
119 DCHECK(recognition_engine_ != NULL); 123 DCHECK(recognition_engine_ != NULL);
120 endpointer_.set_speech_input_complete_silence_length( 124 endpointer_.set_speech_input_complete_silence_length(
121 base::Time::kMicrosecondsPerSecond / 2); 125 base::Time::kMicrosecondsPerSecond / 2);
122 endpointer_.set_long_speech_input_complete_silence_length( 126 endpointer_.set_long_speech_input_complete_silence_length(
123 base::Time::kMicrosecondsPerSecond); 127 base::Time::kMicrosecondsPerSecond);
124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); 128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
125 endpointer_.StartSession(); 129 endpointer_.StartSession();
(...skipping 273 matching lines...) Expand 10 before | Expand all | Expand 10 after
399 DCHECK(recognition_engine_.get() != NULL); 403 DCHECK(recognition_engine_.get() != NULL);
400 DCHECK(!IsCapturingAudio()); 404 DCHECK(!IsCapturingAudio());
401 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? 405 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
402 testing_audio_manager_ : 406 testing_audio_manager_ :
403 BrowserMainLoop::GetAudioManager(); 407 BrowserMainLoop::GetAudioManager();
404 DCHECK(audio_manager != NULL); 408 DCHECK(audio_manager != NULL);
405 409
406 DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; 410 DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
407 num_samples_recorded_ = 0; 411 num_samples_recorded_ = 0;
408 audio_level_ = 0; 412 audio_level_ = 0;
409 listener_->OnRecognitionStart(caller_id_); 413 listener_->OnRecognitionStart(session_id_);
410 414
411 if (!audio_manager->HasAudioInputDevices()) { 415 if (!audio_manager->HasAudioInputDevices()) {
412 return AbortWithError(SpeechRecognitionError( 416 return AbortWithError(SpeechRecognitionError(
413 content::SPEECH_RECOGNITION_ERROR_AUDIO, 417 content::SPEECH_RECOGNITION_ERROR_AUDIO,
414 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); 418 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
415 } 419 }
416 420
417 if (audio_manager->IsRecordingInProcess()) { 421 if (audio_manager->IsRecordingInProcess()) {
418 return AbortWithError(SpeechRecognitionError( 422 return AbortWithError(SpeechRecognitionError(
419 content::SPEECH_RECOGNITION_ERROR_AUDIO, 423 content::SPEECH_RECOGNITION_ERROR_AUDIO,
(...skipping 20 matching lines...) Expand all
440 audio_controller_->Record(); 444 audio_controller_->Record();
441 return STATE_STARTING; 445 return STATE_STARTING;
442 } 446 }
443 447
444 SpeechRecognizerImpl::FSMState 448 SpeechRecognizerImpl::FSMState
445 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { 449 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
446 // This is the first audio packet captured, so the recognition engine is 450 // This is the first audio packet captured, so the recognition engine is
447 // started and the delegate notified about the event. 451 // started and the delegate notified about the event.
448 DCHECK(recognition_engine_.get() != NULL); 452 DCHECK(recognition_engine_.get() != NULL);
449 recognition_engine_->StartRecognition(); 453 recognition_engine_->StartRecognition();
450 listener_->OnAudioStart(caller_id_); 454 listener_->OnAudioStart(session_id_);
451 455
452 // This is a little hack, since TakeAudioChunk() is already called by 456 // This is a little hack, since TakeAudioChunk() is already called by
453 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping 457 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
454 // the first audio chunk captured after opening the audio device. 458 // the first audio chunk captured after opening the audio device.
455 recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); 459 recognition_engine_->TakeAudioChunk(*(event_args.audio_data));
456 return STATE_ESTIMATING_ENVIRONMENT; 460 return STATE_ESTIMATING_ENVIRONMENT;
457 } 461 }
458 462
459 SpeechRecognizerImpl::FSMState 463 SpeechRecognizerImpl::FSMState
460 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { 464 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
461 DCHECK(endpointer_.IsEstimatingEnvironment()); 465 DCHECK(endpointer_.IsEstimatingEnvironment());
462 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { 466 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
463 endpointer_.SetUserInputMode(); 467 endpointer_.SetUserInputMode();
464 listener_->OnEnvironmentEstimationComplete(caller_id_); 468 listener_->OnEnvironmentEstimationComplete(session_id_);
465 return STATE_WAITING_FOR_SPEECH; 469 return STATE_WAITING_FOR_SPEECH;
466 } else { 470 } else {
467 return STATE_ESTIMATING_ENVIRONMENT; 471 return STATE_ESTIMATING_ENVIRONMENT;
468 } 472 }
469 } 473 }
470 474
471 SpeechRecognizerImpl::FSMState 475 SpeechRecognizerImpl::FSMState
472 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { 476 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
473 if (endpointer_.DidStartReceivingSpeech()) { 477 if (endpointer_.DidStartReceivingSpeech()) {
474 listener_->OnSoundStart(caller_id_); 478 listener_->OnSoundStart(session_id_);
475 return STATE_RECOGNIZING; 479 return STATE_RECOGNIZING;
476 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { 480 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
477 return AbortWithError( 481 return AbortWithError(
478 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); 482 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
479 } 483 }
480 return STATE_WAITING_FOR_SPEECH; 484 return STATE_WAITING_FOR_SPEECH;
481 } 485 }
482 486
483 SpeechRecognizerImpl::FSMState 487 SpeechRecognizerImpl::FSMState
484 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { 488 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
485 if (endpointer_.speech_input_complete()) { 489 if (endpointer_.speech_input_complete()) {
486 return StopCaptureAndWaitForResult(event_args); 490 return StopCaptureAndWaitForResult(event_args);
487 } 491 }
488 return STATE_RECOGNIZING; 492 return STATE_RECOGNIZING;
489 } 493 }
490 494
491 SpeechRecognizerImpl::FSMState 495 SpeechRecognizerImpl::FSMState
492 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { 496 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
493 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); 497 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);
494 498
495 DVLOG(1) << "Concluding recognition"; 499 DVLOG(1) << "Concluding recognition";
496 CloseAudioControllerAsynchronously(); 500 CloseAudioControllerAsynchronously();
497 recognition_engine_->AudioChunksEnded(); 501 recognition_engine_->AudioChunksEnded();
498 502
499 if (state_ > STATE_WAITING_FOR_SPEECH) 503 if (state_ > STATE_WAITING_FOR_SPEECH)
500 listener_->OnSoundEnd(caller_id_); 504 listener_->OnSoundEnd(session_id_);
501 505
502 listener_->OnAudioEnd(caller_id_); 506 listener_->OnAudioEnd(session_id_);
503 return STATE_WAITING_FINAL_RESULT; 507 return STATE_WAITING_FINAL_RESULT;
504 } 508 }
505 509
506 SpeechRecognizerImpl::FSMState 510 SpeechRecognizerImpl::FSMState
507 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { 511 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {
508 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of 512 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of
509 // other specific error sources (so that it was an explicit abort request). 513 // other specific error sources (so that it was an explicit abort request).
510 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught by 514 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught by
511 // ChromeSpeechRecognitionManagerDelegate and would cause an exception. 515 // ChromeSpeechRecognitionManagerDelegate and would cause an exception.
512 // JS support will probably need it in future. 516 // JS support will probably need it in future.
(...skipping 18 matching lines...) Expand all
531 535
532 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; 536 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";
533 537
534 // The recognition engine is initialized only after STATE_STARTING. 538 // The recognition engine is initialized only after STATE_STARTING.
535 if (state_ > STATE_STARTING) { 539 if (state_ > STATE_STARTING) {
536 DCHECK(recognition_engine_.get() != NULL); 540 DCHECK(recognition_engine_.get() != NULL);
537 recognition_engine_->EndRecognition(); 541 recognition_engine_->EndRecognition();
538 } 542 }
539 543
540 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) 544 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
541 listener_->OnSoundEnd(caller_id_); 545 listener_->OnSoundEnd(session_id_);
542 546
543 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) 547 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
544 listener_->OnAudioEnd(caller_id_); 548 listener_->OnAudioEnd(session_id_);
545 549
546 if (error != NULL) 550 if (error != NULL)
547 listener_->OnRecognitionError(caller_id_, *error); 551 listener_->OnRecognitionError(session_id_, *error);
548 552
549 listener_->OnRecognitionEnd(caller_id_); 553 listener_->OnRecognitionEnd(session_id_);
550 554
551 return STATE_IDLE; 555 return STATE_IDLE;
552 } 556 }
553 557
554 SpeechRecognizerImpl::FSMState 558 SpeechRecognizerImpl::FSMState
555 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { 559 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
556 // This is in preparation for future speech recognition functions. 560 // This is in preparation for future speech recognition functions.
557 NOTREACHED(); 561 NOTREACHED();
558 return state_; 562 return state_;
559 } 563 }
560 564
561 SpeechRecognizerImpl::FSMState 565 SpeechRecognizerImpl::FSMState
562 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { 566 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
563 const SpeechRecognitionResult& result = event_args.engine_result; 567 const SpeechRecognitionResult& result = event_args.engine_result;
564 DVLOG(1) << "Got valid result"; 568 DVLOG(1) << "Got valid result";
565 recognition_engine_->EndRecognition(); 569 recognition_engine_->EndRecognition();
566 listener_->OnRecognitionResult(caller_id_, result); 570 listener_->OnRecognitionResult(session_id_, result);
567 listener_->OnRecognitionEnd(caller_id_); 571 listener_->OnRecognitionEnd(session_id_);
568 return STATE_IDLE; 572 return STATE_IDLE;
569 } 573 }
570 574
571 SpeechRecognizerImpl::FSMState 575 SpeechRecognizerImpl::FSMState
572 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { 576 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
573 return state_; // Just keep the current state. 577 return state_; // Just keep the current state.
574 } 578 }
575 579
576 SpeechRecognizerImpl::FSMState 580 SpeechRecognizerImpl::FSMState
577 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { 581 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
(...skipping 30 matching lines...) Expand all
608 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : 612 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
609 kDownSmoothingFactor; 613 kDownSmoothingFactor;
610 audio_level_ += (level - audio_level_) * smoothing_factor; 614 audio_level_ += (level - audio_level_) * smoothing_factor;
611 615
612 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / 616 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
613 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); 617 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
614 noise_level = std::min(std::max(0.0f, noise_level), 618 noise_level = std::min(std::max(0.0f, noise_level),
615 kAudioMeterRangeMaxUnclipped); 619 kAudioMeterRangeMaxUnclipped);
616 620
617 listener_->OnAudioLevelsChange( 621 listener_->OnAudioLevelsChange(
618 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); 622 session_id_, clip_detected ? 1.0f : audio_level_, noise_level);
619 } 623 }
620 624
621 const SpeechRecognitionEngine& 625 const SpeechRecognitionEngine&
622 SpeechRecognizerImpl::recognition_engine() const { 626 SpeechRecognizerImpl::recognition_engine() const {
623 return *(recognition_engine_.get()); 627 return *(recognition_engine_.get());
624 } 628 }
625 629
626 void SpeechRecognizerImpl::SetAudioManagerForTesting( 630 void SpeechRecognizerImpl::SetAudioManagerForTesting(
627 AudioManager* audio_manager) { 631 AudioManager* audio_manager) {
628 testing_audio_manager_ = audio_manager; 632 testing_audio_manager_ = audio_manager;
629 } 633 }
630 634
631 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) 635 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
632 : event(event_value), 636 : event(event_value),
633 audio_error_code(0), 637 audio_error_code(0),
634 audio_data(NULL), 638 audio_data(NULL),
635 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { 639 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {
636 } 640 }
637 641
638 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { 642 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
639 } 643 }
640 644
641 } // namespace speech 645 } // namespace speech
OLDNEW
« no previous file with comments | « content/browser/speech/speech_recognizer_impl.h ('k') | content/browser/speech/speech_recognizer_impl_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698