content/browser/speech/speech_recognizer_impl.cc - Issue 9972008: Refactoring of chrome speech recognition architecture (CL1.6)

Side by Side Diff: content/browser/speech/speech_recognizer_impl.cc

Issue 9972008: Refactoring of chrome speech recognition architecture (CL1.6) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Rebased from master. Created 8 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/browser/speech/speech_recognizer_impl.h"	5 #include "content/browser/speech/speech_recognizer_impl.h"

6	6

7 #include "base/basictypes.h"	7 #include "base/basictypes.h"

8 #include "base/bind.h"	8 #include "base/bind.h"

9 #include "base/time.h"	9 #include "base/time.h"

10 #include "content/browser/browser_main_loop.h"	10 #include "content/browser/browser_main_loop.h"

(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
56 if (samples[i] <= -32767 || samples[i] >= 32767) {	56 if (samples[i] <= -32767 || samples[i] >= 32767) {

57 if (++clipping_samples > kThreshold)	57 if (++clipping_samples > kThreshold)

58 return true;	58 return true;

59 }	59 }

60 }	60 }

61 return false;	61 return false;

62 }	62 }

63	63

64 } // namespace	64 } // namespace

65	65

	66 // TODO(primiano) Create(...) is transitional (until we fix speech input

	67 // extensions) and should be removed soon. The manager should be the only one

	68 // knowing the existence of SpeechRecognizer(Impl), thus the only one in charge

	69 // of instantiating it.

66 SpeechRecognizer* SpeechRecognizer::Create(	70 SpeechRecognizer* SpeechRecognizer::Create(

67 SpeechRecognitionEventListener* listener,	71 SpeechRecognitionEventListener* listener,

68 int caller_id,	72 int session_id,

69 const std::string& language,	73 const std::string& language,

70 const std::string& grammar,	74 const std::string& grammar,

71 net::URLRequestContextGetter* context_getter,	75 net::URLRequestContextGetter* context_getter,

72 bool filter_profanities,	76 bool filter_profanities,

73 const std::string& hardware_info,	77 const std::string& hardware_info,

74 const std::string& origin_url) {	78 const std::string& origin_url) {

75 speech::GoogleOneShotRemoteEngineConfig remote_engine_config;	79 speech::GoogleOneShotRemoteEngineConfig remote_engine_config;

76 remote_engine_config.language = language;	80 remote_engine_config.language = language;

77 remote_engine_config.grammar = grammar;	81 remote_engine_config.grammar = grammar;

78 remote_engine_config.audio_sample_rate =	82 remote_engine_config.audio_sample_rate =

79 speech::SpeechRecognizerImpl::kAudioSampleRate;	83 speech::SpeechRecognizerImpl::kAudioSampleRate;

80 remote_engine_config.audio_num_bits_per_sample =	84 remote_engine_config.audio_num_bits_per_sample =

81 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;	85 speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;

82 remote_engine_config.filter_profanities = filter_profanities;	86 remote_engine_config.filter_profanities = filter_profanities;

83 remote_engine_config.hardware_info = hardware_info;	87 remote_engine_config.hardware_info = hardware_info;

84 remote_engine_config.origin_url = origin_url;	88 remote_engine_config.origin_url = origin_url;

85	89

86 // SpeechRecognizerImpl takes ownership of google_remote_engine.	90 // SpeechRecognizerImpl takes ownership of google_remote_engine.

87 speech::GoogleOneShotRemoteEngine* google_remote_engine =	91 speech::GoogleOneShotRemoteEngine* google_remote_engine =

88 new speech::GoogleOneShotRemoteEngine(context_getter);	92 new speech::GoogleOneShotRemoteEngine(context_getter);

89 google_remote_engine->SetConfig(remote_engine_config);	93 google_remote_engine->SetConfig(remote_engine_config);

90	94

91 return new speech::SpeechRecognizerImpl(listener,	95 return new speech::SpeechRecognizerImpl(listener,

92 caller_id,	96 session_id,

93 google_remote_engine);	97 google_remote_engine);

94 }	98 }

95	99

96 namespace speech {	100 namespace speech {

97	101

98 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;	102 const int SpeechRecognizerImpl::kAudioSampleRate = 16000;

99 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;	103 const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;

100 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;	104 const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;

101 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;	105 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;

102 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;	106 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;

103	107

104 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,	108 COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,

105 kNumBitsPerAudioSample_must_be_a_multiple_of_8);	109 kNumBitsPerAudioSample_must_be_a_multiple_of_8);

106	110

107 SpeechRecognizerImpl::SpeechRecognizerImpl(	111 SpeechRecognizerImpl::SpeechRecognizerImpl(

108 SpeechRecognitionEventListener* listener,	112 SpeechRecognitionEventListener* listener,

109 int caller_id,	113 int session_id,

110 SpeechRecognitionEngine* engine)	114 SpeechRecognitionEngine* engine)

111 : listener_(listener),	115 : listener_(listener),

112 testing_audio_manager_(NULL),	116 testing_audio_manager_(NULL),

113 recognition_engine_(engine),	117 recognition_engine_(engine),

114 endpointer_(kAudioSampleRate),	118 endpointer_(kAudioSampleRate),

115 caller_id_(caller_id),	119 session_id_(session_id),

116 is_dispatching_event_(false),	120 is_dispatching_event_(false),

117 state_(STATE_IDLE) {	121 state_(STATE_IDLE) {

118 DCHECK(listener_ != NULL);	122 DCHECK(listener_ != NULL);

119 DCHECK(recognition_engine_ != NULL);	123 DCHECK(recognition_engine_ != NULL);

120 endpointer_.set_speech_input_complete_silence_length(	124 endpointer_.set_speech_input_complete_silence_length(

121 base::Time::kMicrosecondsPerSecond / 2);	125 base::Time::kMicrosecondsPerSecond / 2);

122 endpointer_.set_long_speech_input_complete_silence_length(	126 endpointer_.set_long_speech_input_complete_silence_length(

123 base::Time::kMicrosecondsPerSecond);	127 base::Time::kMicrosecondsPerSecond);

124 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);	128 endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);

125 endpointer_.StartSession();	129 endpointer_.StartSession();

(...skipping 273 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
399 DCHECK(recognition_engine_.get() != NULL);	403 DCHECK(recognition_engine_.get() != NULL);

400 DCHECK(!IsCapturingAudio());	404 DCHECK(!IsCapturingAudio());

401 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?	405 AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?

402 testing_audio_manager_ :	406 testing_audio_manager_ :

403 BrowserMainLoop::GetAudioManager();	407 BrowserMainLoop::GetAudioManager();

404 DCHECK(audio_manager != NULL);	408 DCHECK(audio_manager != NULL);

405	409

406 DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";	410 DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";

407 num_samples_recorded_ = 0;	411 num_samples_recorded_ = 0;

408 audio_level_ = 0;	412 audio_level_ = 0;

409 listener_->OnRecognitionStart(caller_id_);	413 listener_->OnRecognitionStart(session_id_);

410	414

411 if (!audio_manager->HasAudioInputDevices()) {	415 if (!audio_manager->HasAudioInputDevices()) {

412 return AbortWithError(SpeechRecognitionError(	416 return AbortWithError(SpeechRecognitionError(

413 content::SPEECH_RECOGNITION_ERROR_AUDIO,	417 content::SPEECH_RECOGNITION_ERROR_AUDIO,

414 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));	418 content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));

415 }	419 }

416	420

417 if (audio_manager->IsRecordingInProcess()) {	421 if (audio_manager->IsRecordingInProcess()) {

418 return AbortWithError(SpeechRecognitionError(	422 return AbortWithError(SpeechRecognitionError(

419 content::SPEECH_RECOGNITION_ERROR_AUDIO,	423 content::SPEECH_RECOGNITION_ERROR_AUDIO,

(...skipping 20 matching lines...) Expand all Loading...
440 audio_controller_->Record();	444 audio_controller_->Record();

441 return STATE_STARTING;	445 return STATE_STARTING;

442 }	446 }

443	447

444 SpeechRecognizerImpl::FSMState	448 SpeechRecognizerImpl::FSMState

445 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {	449 SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {

446 // This is the first audio packet captured, so the recognition engine is	450 // This is the first audio packet captured, so the recognition engine is

447 // started and the delegate notified about the event.	451 // started and the delegate notified about the event.

448 DCHECK(recognition_engine_.get() != NULL);	452 DCHECK(recognition_engine_.get() != NULL);

449 recognition_engine_->StartRecognition();	453 recognition_engine_->StartRecognition();

450 listener_->OnAudioStart(caller_id_);	454 listener_->OnAudioStart(session_id_);

451	455

452 // This is a little hack, since TakeAudioChunk() is already called by	456 // This is a little hack, since TakeAudioChunk() is already called by

453 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping	457 // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping

454 // the first audio chunk captured after opening the audio device.	458 // the first audio chunk captured after opening the audio device.

455 recognition_engine_->TakeAudioChunk(*(event_args.audio_data));	459 recognition_engine_->TakeAudioChunk(*(event_args.audio_data));

456 return STATE_ESTIMATING_ENVIRONMENT;	460 return STATE_ESTIMATING_ENVIRONMENT;

457 }	461 }

458	462

459 SpeechRecognizerImpl::FSMState	463 SpeechRecognizerImpl::FSMState

460 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {	464 SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {

461 DCHECK(endpointer_.IsEstimatingEnvironment());	465 DCHECK(endpointer_.IsEstimatingEnvironment());

462 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {	466 if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {

463 endpointer_.SetUserInputMode();	467 endpointer_.SetUserInputMode();

464 listener_->OnEnvironmentEstimationComplete(caller_id_);	468 listener_->OnEnvironmentEstimationComplete(session_id_);

465 return STATE_WAITING_FOR_SPEECH;	469 return STATE_WAITING_FOR_SPEECH;

466 } else {	470 } else {

467 return STATE_ESTIMATING_ENVIRONMENT;	471 return STATE_ESTIMATING_ENVIRONMENT;

468 }	472 }

469 }	473 }

470	474

471 SpeechRecognizerImpl::FSMState	475 SpeechRecognizerImpl::FSMState

472 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {	476 SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {

473 if (endpointer_.DidStartReceivingSpeech()) {	477 if (endpointer_.DidStartReceivingSpeech()) {

474 listener_->OnSoundStart(caller_id_);	478 listener_->OnSoundStart(session_id_);

475 return STATE_RECOGNIZING;	479 return STATE_RECOGNIZING;

476 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {	480 } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {

477 return AbortWithError(	481 return AbortWithError(

478 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));	482 SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));

479 }	483 }

480 return STATE_WAITING_FOR_SPEECH;	484 return STATE_WAITING_FOR_SPEECH;

481 }	485 }

482	486

483 SpeechRecognizerImpl::FSMState	487 SpeechRecognizerImpl::FSMState

484 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {	488 SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {

485 if (endpointer_.speech_input_complete()) {	489 if (endpointer_.speech_input_complete()) {

486 return StopCaptureAndWaitForResult(event_args);	490 return StopCaptureAndWaitForResult(event_args);

487 }	491 }

488 return STATE_RECOGNIZING;	492 return STATE_RECOGNIZING;

489 }	493 }

490	494

491 SpeechRecognizerImpl::FSMState	495 SpeechRecognizerImpl::FSMState

492 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {	496 SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {

493 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);	497 DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

494	498

495 DVLOG(1) << "Concluding recognition";	499 DVLOG(1) << "Concluding recognition";

496 CloseAudioControllerAsynchronously();	500 CloseAudioControllerAsynchronously();

497 recognition_engine_->AudioChunksEnded();	501 recognition_engine_->AudioChunksEnded();

498	502

499 if (state_ > STATE_WAITING_FOR_SPEECH)	503 if (state_ > STATE_WAITING_FOR_SPEECH)

500 listener_->OnSoundEnd(caller_id_);	504 listener_->OnSoundEnd(session_id_);

501	505

502 listener_->OnAudioEnd(caller_id_);	506 listener_->OnAudioEnd(session_id_);

503 return STATE_WAITING_FINAL_RESULT;	507 return STATE_WAITING_FINAL_RESULT;

504 }	508 }

505	509

506 SpeechRecognizerImpl::FSMState	510 SpeechRecognizerImpl::FSMState

507 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {	511 SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {

508 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of	512 // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of

509 // other specific error sources (so that it was an explicit abort request).	513 // other specific error sources (so that it was an explicit abort request).

510 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught by	514 // However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught by

511 // ChromeSpeechRecognitionManagerDelegate and would cause an exception.	515 // ChromeSpeechRecognitionManagerDelegate and would cause an exception.

512 // JS support will probably need it in future.	516 // JS support will probably need it in future.

(...skipping 18 matching lines...) Expand all Loading...
531	535

532 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";	536 DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";

533	537

534 // The recognition engine is initialized only after STATE_STARTING.	538 // The recognition engine is initialized only after STATE_STARTING.

535 if (state_ > STATE_STARTING) {	539 if (state_ > STATE_STARTING) {

536 DCHECK(recognition_engine_.get() != NULL);	540 DCHECK(recognition_engine_.get() != NULL);

537 recognition_engine_->EndRecognition();	541 recognition_engine_->EndRecognition();

538 }	542 }

539	543

540 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)	544 if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)

541 listener_->OnSoundEnd(caller_id_);	545 listener_->OnSoundEnd(session_id_);

542	546

543 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)	547 if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)

544 listener_->OnAudioEnd(caller_id_);	548 listener_->OnAudioEnd(session_id_);

545	549

546 if (error != NULL)	550 if (error != NULL)

547 listener_->OnRecognitionError(caller_id_, *error);	551 listener_->OnRecognitionError(session_id_, *error);

548	552

549 listener_->OnRecognitionEnd(caller_id_);	553 listener_->OnRecognitionEnd(session_id_);

550	554

551 return STATE_IDLE;	555 return STATE_IDLE;

552 }	556 }

553	557

554 SpeechRecognizerImpl::FSMState	558 SpeechRecognizerImpl::FSMState

555 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {	559 SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {

556 // This is in preparation for future speech recognition functions.	560 // This is in preparation for future speech recognition functions.

557 NOTREACHED();	561 NOTREACHED();

558 return state_;	562 return state_;

559 }	563 }

560	564

561 SpeechRecognizerImpl::FSMState	565 SpeechRecognizerImpl::FSMState

562 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {	566 SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {

563 const SpeechRecognitionResult& result = event_args.engine_result;	567 const SpeechRecognitionResult& result = event_args.engine_result;

564 DVLOG(1) << "Got valid result";	568 DVLOG(1) << "Got valid result";

565 recognition_engine_->EndRecognition();	569 recognition_engine_->EndRecognition();

566 listener_->OnRecognitionResult(caller_id_, result);	570 listener_->OnRecognitionResult(session_id_, result);

567 listener_->OnRecognitionEnd(caller_id_);	571 listener_->OnRecognitionEnd(session_id_);

568 return STATE_IDLE;	572 return STATE_IDLE;

569 }	573 }

570	574

571 SpeechRecognizerImpl::FSMState	575 SpeechRecognizerImpl::FSMState

572 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {	576 SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {

573 return state_; // Just keep the current state.	577 return state_; // Just keep the current state.

574 }	578 }

575	579

576 SpeechRecognizerImpl::FSMState	580 SpeechRecognizerImpl::FSMState

577 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {	581 SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {

(...skipping 30 matching lines...) Expand all Loading...
608 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :	612 const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :

609 kDownSmoothingFactor;	613 kDownSmoothingFactor;

610 audio_level_ += (level - audio_level_) * smoothing_factor;	614 audio_level_ += (level - audio_level_) * smoothing_factor;

611	615

612 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /	616 float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /

613 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);	617 (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);

614 noise_level = std::min(std::max(0.0f, noise_level),	618 noise_level = std::min(std::max(0.0f, noise_level),

615 kAudioMeterRangeMaxUnclipped);	619 kAudioMeterRangeMaxUnclipped);

616	620

617 listener_->OnAudioLevelsChange(	621 listener_->OnAudioLevelsChange(

618 caller_id_, clip_detected ? 1.0f : audio_level_, noise_level);	622 session_id_, clip_detected ? 1.0f : audio_level_, noise_level);

619 }	623 }

620	624

621 const SpeechRecognitionEngine&	625 const SpeechRecognitionEngine&

622 SpeechRecognizerImpl::recognition_engine() const {	626 SpeechRecognizerImpl::recognition_engine() const {

623 return *(recognition_engine_.get());	627 return *(recognition_engine_.get());

624 }	628 }

625	629

626 void SpeechRecognizerImpl::SetAudioManagerForTesting(	630 void SpeechRecognizerImpl::SetAudioManagerForTesting(

627 AudioManager* audio_manager) {	631 AudioManager* audio_manager) {

628 testing_audio_manager_ = audio_manager;	632 testing_audio_manager_ = audio_manager;

629 }	633 }

630	634

631 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)	635 SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)

632 : event(event_value),	636 : event(event_value),

633 audio_error_code(0),	637 audio_error_code(0),

634 audio_data(NULL),	638 audio_data(NULL),

635 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {	639 engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {

636 }	640 }

637	641

638 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {	642 SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {

639 }	643 }

640	644

641 } // namespace speech	645 } // namespace speech

OLD	NEW

« no previous file with comments | « content/browser/speech/speech_recognizer_impl.h ('k') | content/browser/speech/speech_recognizer_impl_unittest.cc » ('j') | no next file with comments »