Bottle SpeechRecognizerModule::waitNextRecognition(int timeout)
{
    yInfo() << "Recognition: blocking mode on";
    Bottle bOutGrammar;

    bool gotSomething = false;
    double endTime = Time::now() + timeout / 1000.0;
    interruptRecognition = false;

    cout << endl;
    yInfo() << "=========== GO Waiting for recog! ===========";
    while (Time::now() < endTime && !gotSomething && !interruptRecognition)
    {
        //std::cout<<".";
        const float ConfidenceThreshold = 0.3f;
        SPEVENT curEvent;
        ULONG fetched = 0;
        HRESULT hr = S_OK;

        m_cpRecoCtxt->GetEvents(1, &curEvent, &fetched);

        while (fetched > 0)
        {
            yInfo() << " received something in waitNextRecognition";
            gotSomething = true;

            ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
            CSpDynamicString dstrText;
            result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
            string fullSentence = ws2s(dstrText);
            yInfo() << fullSentence;
            if (m_useTalkBack)
                say(fullSentence);
            bOutGrammar.addString(fullSentence);

            SPPHRASE* pPhrase = NULL;
            result->GetPhrase(&pPhrase);
            bOutGrammar.addList() = toBottle(pPhrase, &pPhrase->Rule);
            yInfo() << "Sending semantic bottle : " << bOutGrammar.toString();
            m_cpRecoCtxt->GetEvents(1, &curEvent, &fetched);

            if (m_forwardSound)
            {
                yarp::sig::Sound& rawSnd = m_portSound.prepare();
                rawSnd = toSound(result);
                m_portSound.write();
            }
        }
    }

    if (interruptRecognition)
    {
        yDebug() << "interrupted speech recognizer!";
    }

    yInfo() << "Recognition: blocking mode off";
    return bOutGrammar;
}
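A polling loop like the one above only sees events the recognition context has been configured to deliver. The sketch below is not taken from this module; the function name and the grammar file name are placeholders, but the SAPI calls (SetInterest, CreateGrammar, LoadCmdFromFile, SetRuleState) are the same ones used in the later examples and show the setup such a loop assumes.

// Minimal setup sketch, assuming an ISpRecoContext already created via
// ISpRecognizer::CreateRecoContext and a SRGS grammar file on disk.
// "grammar.grxml" is a placeholder file name.
HRESULT setupRecognitionContext(ISpRecoContext* pRecoCtxt, ISpRecoGrammar** ppGrammar)
{
    // Deliver only SPEI_RECOGNITION events, so GetEvents() in the wait loop
    // does not have to filter other event types.
    HRESULT hr = pRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
    if (FAILED(hr)) return hr;

    // Load a command-and-control grammar and activate its top-level rules.
    hr = pRecoCtxt->CreateGrammar(1, ppGrammar);
    if (FAILED(hr)) return hr;
    hr = (*ppGrammar)->LoadCmdFromFile(L"grammar.grxml", SPLO_DYNAMIC);
    if (FAILED(hr)) return hr;
    return (*ppGrammar)->SetRuleState(NULL, NULL, SPRS_ACTIVE);
}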
// Audio processing
void ThisApp::speech_process()
{
    // Confidence threshold
    const float ConfidenceThreshold = 0.3f;
    SPEVENT curEvent = { SPEI_UNDEFINED, SPET_LPARAM_IS_UNDEFINED, 0, 0, 0, 0 };
    ULONG fetched = 0;
    HRESULT hr = S_OK;

    // Fetch events
    m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        // Make sure this is a recognition event
        switch (curEvent.eEventId)
        {
        case SPEI_RECOGNITION:
            // Make sure the lParam is an object
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType)
            {
                ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = nullptr;
                // Get the recognized phrase
                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr))
                {
#ifdef _DEBUG
                    // In DEBUG builds, print the recognized string
                    WCHAR* pwszFirstWord;
                    result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pwszFirstWord, nullptr);
                    _cwprintf(pwszFirstWord);
                    ::CoTaskMemFree(pwszFirstWord);
#endif
                    if ((pPhrase->pProperties != nullptr) && (pPhrase->pProperties->pFirstChild != nullptr))
                    {
                        const SPPHRASEPROPERTY* pSemanticTag = pPhrase->pProperties->pFirstChild;
#ifdef _DEBUG
                        _cwprintf(L" 置信度:%d%%\n", (int)(pSemanticTag->SREngineConfidence * 100.f));
#endif
                        if (pSemanticTag->SREngineConfidence > ConfidenceThreshold)
                        {
                            speech_behavior(pSemanticTag);
                        }
                    }
                    ::CoTaskMemFree(pPhrase);
                }
            }
            break;
        }
        m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    }
    return;
}
/// <summary>
/// Process recently triggered speech recognition events.
/// </summary>
void KinectReader::ProcessSpeech(Input* input)
{
    m_pSpeechController = new SpeechController(input);

    const float ConfidenceThreshold = 0.3f;
    SPEVENT curEvent;
    ULONG fetched = 0;
    HRESULT hr = S_OK;

    m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        switch (curEvent.eEventId)
        {
        case SPEI_RECOGNITION:
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType)
            {
                // this is an ISpRecoResult
                ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = NULL;

                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr))
                {
                    if ((pPhrase->pProperties != NULL) && (pPhrase->pProperties->pFirstChild != NULL))
                    {
                        const SPPHRASEPROPERTY* pSemanticTag = pPhrase->pProperties->pFirstChild;
                        if (pSemanticTag->SREngineConfidence > ConfidenceThreshold)
                        {
                            SpeechAction action = MapSpeechTagToAction(pSemanticTag->pszValue);
                            m_pSpeechController->DoAction(action);
                        }
                    }
                    ::CoTaskMemFree(pPhrase);
                }
            }
            break;
        }

        m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    }

    return;
}
string SpeechProvider::processSpeech()
{
    const float ConfidenceThreshold = 0.3f;
    SPEVENT curEvent = { SPEI_UNDEFINED, SPET_LPARAM_IS_UNDEFINED, 0, 0, 0, 0 };
    ULONG fetched = 0;
    HRESULT hr = S_OK;
    string fresult;

    speechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        switch (curEvent.eEventId)
        {
        case SPEI_RECOGNITION:
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType)
            {
                // this is an ISpRecoResult
                ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = NULL;

                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr))
                {
                    wstring s = pPhrase->pElements->pszDisplayText;
                    wcout << L"Phrase Spoken : " << s << endl;
                    fresult = string(s.begin(), s.end());
                    ::CoTaskMemFree(pPhrase);
                }
            }
            break;
        }
        speechContext->GetEvents(1, &curEvent, &fetched);
    }
    return fresult;
}
/**
This is called when SAPI 5.1 has an event.

In the textless case, we only handle the SPEI_RECOGNITION event. We aren't
looking at SPEI_HYPOTHESIS. This might be an error; we might be more robust
by handling both.

We process the event and add the phonemes we get to the result list.
**/
void sapi_textless_lipsync::callback()
{
    CSpEvent event;                        // the event
    ISpRecoResult *pRecoResult;            // RecoResult from the event
    SPPHRASE *pSpPhrase;                   // phrase from RecoResult
    SPRECORESULTTIMES pRecoResultTimes;    // result times from RecoResult
    WCHAR phone_buffer[256];               // phoneme buffer for conversion
    long msStart;                          // time stamp of the result

    while (event.GetFrom(this->m_recogCntxt) == S_OK)
    {
        if (event.eEventId == SPEI_RECOGNITION /*|| event.eEventId == SPEI_HYPOTHESIS */)
        {
            // for textless we only accept full recognition. This might be an area
            // to watch out for

            // pull out the result object
            pRecoResult = event.RecoResult();

            // pull the whole text from the result
            CSpDynamicString pSapiText;
            pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);

            // get the start time for the phrase. we use this as an offset for the phrase
            // elements. Not sure if this is correct.
            pRecoResult->GetResultTimes(&pRecoResultTimes);
            msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

            // extract the phrase object
            pRecoResult->GetPhrase(&pSpPhrase);

            if (pSpPhrase != NULL)
            {
                // Process each element of the phrase. These should be our orthographs.
                const SPPHRASEELEMENT *p = pSpPhrase->pElements;
                const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
                while (p != pEnd)
                {
                    // for each phrase element we create a marker that contains the
                    // time stamps along with the phonemes associated with it.
                    alignment_result al;
                    al.m_orthography = p->pszDisplayText;

                    // Get the phonemes
                    ULONG j = 0;
                    SPPHONEID phn[2];
                    phn[1] = 0x00;
                    while (p->pszPronunciation[j] != 0)
                    {
                        // process each phoneme
                        phn[0] = p->pszPronunciation[j];
                        m_phnCvt->IdToPhone(phn, phone_buffer);
                        al.m_phonemes.push_back(phone_buffer);
                        j++;
                    }

                    // start time of the ortheme
                    al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
                    // end time of the ortheme
                    al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
                    al.m_msEnd += al.m_msStart;

                    // add it to the results
                    m_results.push_back(al);
                    p++;
                }
            }
        }
        else if (event.eEventId == SPEI_END_SR_STREAM)
        {
            // This event occurs when the stream has finished processing.
            // we set a flag to indicate that things are done.
            m_bDone = TRUE;
        }
    }
}
/**
This is called by SAPI 5.1 when it has an event.

We use the CSpEvent class provided by their SDK to simplify the processing.
Basically, when we get a SPEI_RECOGNITION event or a SPEI_HYPOTHESIS event we
process them the same way. Hypotheses are more likely; for all but very short
files, SPEI_RECOGNITION is a rarity.

Since the hypotheses will include duplicate data, we have a decision: we can
save the newest hypothesis, or we can save the one which generates the most
alignments. Empirically, it seems that sticking with the longest result works
best. But perhaps this is not so.
**/
void sapi_textbased_lipsync::callback()
{
    //USES_CONVERSION;
    CSpEvent event;
    ISpRecoResult *pRecoResult;            // RecoResult from the event
    SPPHRASE *pSpPhrase;                   // phrase from RecoResult
    SPRECORESULTTIMES pRecoResultTimes;    // result times from RecoResult
    WCHAR phone_buffer[256];               // buffer for the phonemes
    UINT msStart;                          // start time of the phrase

    // Process the events
    while (event.GetFrom(this->m_recogCntxt) == S_OK)
    {
        if (event.eEventId == SPEI_RECOGNITION || event.eEventId == SPEI_HYPOTHESIS)
        {
            // text based has to accept hypotheses or it mostly fails unless the
            // script is very short

            // pull out the result object
            pRecoResult = event.RecoResult();

            // pull the whole text from the result
            CSpDynamicString pSapiText;
            pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);

            // get the start time for the phrase. we use this as an offset for the phrase
            // elements. Not sure if this is correct.
            pRecoResult->GetResultTimes(&pRecoResultTimes);
            msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

            std::wstring strPrintText = pSapiText;
            std::cerr << "hypothesis: " << wstring_2_string(strPrintText) << std::endl;

            // if the new results are longer than existing results in orthographic form
            // we accept the results and process the phonemes. Otherwise, we skip it
            if ((wcslen(pSapiText) > this->m_strResults.size()))
            {
                m_strResults = pSapiText;

                // clear the old results. This hypothesis trumps it
                this->m_results.clear();

                // extract the phrase object
                pRecoResult->GetPhrase(&pSpPhrase);

                if (pSpPhrase != NULL)
                {
                    // Process each element of the phrase. These should be our orthographs.
                    const SPPHRASEELEMENT *p = pSpPhrase->pElements;
                    const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
                    while (p != pEnd)
                    {
                        // for each phrase element we create a marker that contains the
                        // time stamps along with the phonemes associated with it.
                        alignment_result al;
                        al.m_orthography = p->pszDisplayText;

                        // Get the phonemes
                        ULONG j = 0;
                        SPPHONEID phn[2];
                        phn[1] = 0x00;
                        while (p->pszPronunciation[j] != 0)
                        {
                            // process each phoneme
                            phn[0] = p->pszPronunciation[j];
                            m_phnCvt->IdToPhone(phn, phone_buffer);
                            al.m_phonemes.push_back(phone_buffer);
                            j++;
                        }

                        // start time of the ortheme
                        al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
                        // end time of the ortheme
                        al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
                        al.m_msEnd += al.m_msStart;

                        // add it to the results
                        m_results.push_back(al);
                        p++;
                    }
                }
            }
        }
        else if (event.eEventId == SPEI_END_SR_STREAM)
        {
            // This event occurs when the stream has finished processing.
            // we set a flag to indicate that things are done.
            m_bDone = TRUE;
        }
    }
}
void CASRwrapper::GetText(std::wstring& speechRes, float* pconfidence, int requestedAlternates, std::wstring alternates[], float alternatesConfidence[])
{
    //HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&m_pVoice);
    //hr = m_pVoice->Speak(speechRes, 0, NULL);
    //m_pVoice->Release();
    //m_pVoice = NULL;

    const ULONG maxEvents = 10;
    SPEVENT events[maxEvents];
    ULONG eventCount;
    HRESULT hr;
    hr = m_cpRecoCtxt->GetEvents(maxEvents, events, &eventCount);

    // Warning: hr equals S_FALSE if everything is OK
    // but eventCount < requestedEventCount
    if (!(hr == S_OK || hr == S_FALSE))
    {
        return;
    }
    if (eventCount > 1)
    {
        speechRes.assign(L"More than one event!");
        return;
    }

    ISpRecoResult* recoResult;
    recoResult = reinterpret_cast<ISpRecoResult*>(events[0].lParam);

    wchar_t* text;
    hr = recoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &text, NULL);
    speechRes.assign(text);
    //if (confidence != NULL)
    //    *confidence = recoResult->pElements->SREngineConfidence;
    CoTaskMemFree(text);

    if (requestedAlternates == 0 && pconfidence == NULL)
        return;

    const USHORT MAX_ALTERNATES = 100;
    if (requestedAlternates > MAX_ALTERNATES)
        requestedAlternates = MAX_ALTERNATES;
    if (requestedAlternates == 0) // in case only the confidence was asked for, i.e. pconfidence != NULL
        requestedAlternates = 1;

    CComPtr<ISpPhraseAlt> pcpPhraseAlt[MAX_ALTERNATES];
    SPPHRASE* pPhrase;
    std::string betterResult;
    float ConfidenceMax = 0.0;
    ULONG ulCount;
    //std::list<std::string> lWordsRec;

    // Retrieve information about the recognized phrase
    hr = recoResult->GetPhrase(&pPhrase);

    if (SUCCEEDED(hr))
    {
        // Retrieve a list of alternative phrases related to the recognized phrase
        hr = recoResult->GetAlternates(pPhrase->Rule.ulFirstElement, pPhrase->Rule.ulCountOfElements,
                                       requestedAlternates, (ISpPhraseAlt**)pcpPhraseAlt, &ulCount);
    }

    if (SUCCEEDED(hr))
    {
        // Browse the list of alternative phrases in order of highest likelihood with the original phrase
        for (unsigned int i = 0; i < ulCount; i++)
        {
            SPPHRASE* pPhraseAlt;
            CSpDynamicString pwszAlternate;

            // Retrieve information about the current alternative phrase
            pcpPhraseAlt[i]->GetPhrase(&pPhraseAlt);

            // Get the phrase's entire text string
            hr = pcpPhraseAlt[i]->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pwszAlternate, NULL);
            if (SUCCEEDED(hr))
            {
                if (i == 1 && pconfidence != NULL)
                    *pconfidence = pPhraseAlt->pElements->SREngineConfidence;
                if (alternatesConfidence != NULL)
                    alternatesConfidence[i] = pPhraseAlt->pElements->SREngineConfidence;
                if (alternates != NULL)
                    alternates[i] = pwszAlternate.Copy(); // .CopyToChar();
            }
        }
    }
}
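A hypothetical call site for GetText above, shown only to illustrate the parameter roles; the instance name asr, its prior initialization, and the alternate count are assumptions, and only the signature comes from the code.

// Usage sketch, assuming an initialized CASRwrapper named `asr` with a
// pending recognition event. kAlternates is an arbitrary example value.
std::wstring text;
float confidence = 0.0f;
const int kAlternates = 5;
std::wstring alternates[kAlternates];
float alternatesConfidence[kAlternates] = {};

asr.GetText(text, &confidence, kAlternates, alternates, alternatesConfidence);
std::wcout << L"Best: " << text << L" (confidence " << confidence << L")" << std::endl;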
int _tmain( int argc, _TCHAR* argv[] )
{
    cv::setUseOptimized( true );

    // Create and initialize the Kinect sensor instance
    INuiSensor* pSensor;
    HRESULT hResult = S_OK;
    hResult = NuiCreateSensorByIndex( 0, &pSensor );
    if( FAILED( hResult ) ){
        std::cerr << "Error : NuiCreateSensorByIndex" << std::endl;
        return -1;
    }

    hResult = pSensor->NuiInitialize( NUI_INITIALIZE_FLAG_USES_AUDIO );
    if( FAILED( hResult ) ){
        std::cerr << "Error : NuiInitialize" << std::endl;
        return -1;
    }

    // Initialize the audio stream (InitializeAudioStream)
    std::cout << "InitializeAudioStream" << std::endl;
    INuiAudioBeam* pNuiAudioSource;
    hResult = pSensor->NuiGetAudioSource( &pNuiAudioSource );
    if( FAILED( hResult ) ){
        std::cerr << "Error : NuiGetAudioSource" << std::endl;
        return -1;
    }

    IMediaObject* pMediaObject = nullptr;
    IPropertyStore* pPropertyStore = nullptr;
    pNuiAudioSource->QueryInterface( IID_IMediaObject, reinterpret_cast<void**>( &pMediaObject ) );
    pNuiAudioSource->QueryInterface( IID_IPropertyStore, reinterpret_cast<void**>( &pPropertyStore ) );

    PROPVARIANT propvariant;
    PropVariantInit( &propvariant );
    propvariant.vt = VT_I4;
    propvariant.lVal = static_cast<LONG>( 4 );
    pPropertyStore->SetValue( MFPKEY_WMAAECMA_SYSTEM_MODE, propvariant );
    PropVariantClear( &propvariant );

    WAVEFORMATEX waveFormat = { AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0 };
    DMO_MEDIA_TYPE mediaType = { 0 };
    MoInitMediaType( &mediaType, sizeof( WAVEFORMATEX ) );
    mediaType.majortype = MEDIATYPE_Audio;
    mediaType.subtype = MEDIASUBTYPE_PCM;
    mediaType.lSampleSize = 0;
    mediaType.bFixedSizeSamples = true;
    mediaType.bTemporalCompression = false;
    mediaType.formattype = FORMAT_WaveFormatEx;
    memcpy( mediaType.pbFormat, &waveFormat, sizeof( WAVEFORMATEX ) );

    pMediaObject->SetOutputType( 0, &mediaType, 0 );

    KinectAudioStream* audioStream = new KinectAudioStream( pMediaObject );

    IStream* pStream = nullptr;
    audioStream->QueryInterface( IID_IStream, reinterpret_cast<void**>( &pStream ) );

    CoInitialize( nullptr );
    ISpStream* pSpeechStream = nullptr;
    CoCreateInstance( CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), reinterpret_cast<void**>( &pSpeechStream ) );
    pSpeechStream->SetBaseStream( pStream, SPDFID_WaveFormatEx, &waveFormat );

    MoFreeMediaType( &mediaType );
    pStream->Release();
    pPropertyStore->Release();
    pMediaObject->Release();
    pNuiAudioSource->Release();

    // Create the speech recognizer (CreateSpeechRecognizer)
    std::cout << "CreateSpeechRecognizer" << std::endl;
    ISpRecognizer* pSpeechRecognizer;
    CoCreateInstance( CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), reinterpret_cast<void**>( &pSpeechRecognizer ) );
    pSpeechRecognizer->SetInput( pSpeechStream, false );

    /*
    // If ATL is available (Professional edition or higher), it is easier to use SpFindBestToken (sphelper.h).
    ISpObjectToken* pEngineToken = nullptr;
    SpFindBestToken( SPCAT_RECOGNIZERS, L"Language=411;Kinect=True", NULL, &pEngineToken ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
    */

    ///*
    // If ATL can't be used (e.g. Express edition), this is an alternative to SpFindBestToken (sphelper.h).
    const wchar_t* pVendorPreferred = L"VendorPreferred";
    const unsigned long lengthVendorPreferred = static_cast<unsigned long>( wcslen( pVendorPreferred ) );
    unsigned long length;
    ULongAdd( lengthVendorPreferred, 1, &length );
    wchar_t* pAttribsVendorPreferred = new wchar_t[ length ];
    StringCchCopyW( pAttribsVendorPreferred, length, pVendorPreferred );

    ISpObjectTokenCategory* pTokenCategory = nullptr;
    CoCreateInstance( CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, __uuidof(ISpObjectTokenCategory), reinterpret_cast<void**>( &pTokenCategory ) );
    pTokenCategory->SetId( SPCAT_RECOGNIZERS, false );

    IEnumSpObjectTokens* pEnumTokens = nullptr;
    CoCreateInstance( CLSID_SpMMAudioEnum, nullptr, CLSCTX_ALL, __uuidof(IEnumSpObjectTokens), reinterpret_cast<void**>( &pEnumTokens ) );
    pTokenCategory->EnumTokens( L"Language=411;Kinect=True", pAttribsVendorPreferred, &pEnumTokens ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"

    delete[] pAttribsVendorPreferred;

    ISpObjectToken* pEngineToken = nullptr;
    pEnumTokens->Next( 1, &pEngineToken, nullptr );
    //*/

    pSpeechRecognizer->SetRecognizer( pEngineToken );
    ISpRecoContext* pSpeechContext;
    pSpeechRecognizer->CreateRecoContext( &pSpeechContext );

    pEngineToken->Release();
    ///*
    pTokenCategory->Release();
    pEnumTokens->Release();
    //*/

    // Create the speech recognition grammar (LoadSpeechGrammar)
    std::cout << "LoadSpeechGrammar" << std::endl;
    ISpRecoGrammar* pSpeechGrammar;
    pSpeechContext->CreateGrammar( 1, &pSpeechGrammar );
    pSpeechGrammar->LoadCmdFromFile( L"SpeechRecognition_Ja.grxml", /*SPLO_STATIC*/SPLO_DYNAMIC ); // http://www.w3.org/TR/speech-grammar/ (UTF-8/CRLF)

    audioStream->StartCapture();
    pSpeechGrammar->SetRuleState( nullptr, nullptr, SPRS_ACTIVE );
    pSpeechRecognizer->SetRecoState( SPRST_ACTIVE_ALWAYS );
    pSpeechContext->SetInterest( SPFEI( SPEI_RECOGNITION ), SPFEI( SPEI_RECOGNITION ) );
    pSpeechContext->Resume( 0 );

    HANDLE hSpeechEvent = INVALID_HANDLE_VALUE;
    hSpeechEvent = pSpeechContext->GetNotifyEventHandle();
    HANDLE hEvents[1] = { hSpeechEvent };

    int width = 640;
    int height = 480;
    cv::Mat audioMat = cv::Mat::zeros( height, width, CV_8UC3 );
    cv::namedWindow( "Audio" );

    bool exit = false;

    std::cout << std::endl << "Speech Recognition Start..."
              << std::endl << std::endl;

    while( 1 ){
        // Wait for the speech event to be signaled
        ResetEvent( hSpeechEvent );
        unsigned long waitObject = MsgWaitForMultipleObjectsEx( ARRAYSIZE( hEvents ), hEvents, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE );

        if( waitObject == WAIT_OBJECT_0 ){
            // Fetch the recognition events
            const float confidenceThreshold = 0.3f;
            SPEVENT eventStatus;
            unsigned long eventFetch = 0;
            pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
            while( eventFetch > 0 ){
                switch( eventStatus.eEventId ){
                    // Speech recognition events (SPEI_HYPOTHESIS: hypothesis, SPEI_RECOGNITION: recognition)
                    case SPEI_HYPOTHESIS:
                    case SPEI_RECOGNITION:
                        if( eventStatus.elParamType == SPET_LPARAM_IS_OBJECT ){
                            // Get the phrase
                            ISpRecoResult* pRecoResult = reinterpret_cast<ISpRecoResult*>( eventStatus.lParam );
                            SPPHRASE* pPhrase = nullptr;
                            hResult = pRecoResult->GetPhrase( &pPhrase );
                            if( SUCCEEDED( hResult ) ){
                                if( ( pPhrase->pProperties != nullptr ) && ( pPhrase->pProperties->pFirstChild != nullptr ) ){
                                    // Compare against the phrase tags defined in the grammar
                                    const SPPHRASEPROPERTY* pSemantic = pPhrase->pProperties->pFirstChild;
                                    if( pSemantic->SREngineConfidence > confidenceThreshold ){
                                        if( wcscmp( L"あか", pSemantic->pszValue ) == 0 ){ // "red"
                                            std::cout << "あか" << std::endl;
                                            audioMat = cv::Scalar( 0, 0, 255 );
                                        }
                                        else if( wcscmp( L"みどり", pSemantic->pszValue ) == 0 ){ // "green"
                                            std::cout << "みどり" << std::endl;
                                            audioMat = cv::Scalar( 0, 255, 0 );
                                        }
                                        else if( wcscmp( L"あお", pSemantic->pszValue ) == 0 ){ // "blue"
                                            std::cout << "あお" << std::endl;
                                            audioMat = cv::Scalar( 255, 0, 0 );
                                        }
                                        else if( wcscmp( L"おわり", pSemantic->pszValue ) == 0 ){ // "end"
                                            exit = true;
                                        }
                                    }
                                }
                                CoTaskMemFree( pPhrase );
                            }
                        }
                        break;

                    default:
                        break;
                }
                pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
            }
        }

        // Show the window
        cv::imshow( "Audio", audioMat );

        // Check for loop exit (Esc key)
        if( cv::waitKey( 30 ) == VK_ESCAPE || exit ){
            break;
        }
    }

    // Cleanup
    audioStream->StopCapture();
    pSpeechRecognizer->SetRecoState( SPRST_INACTIVE );
    CoUninitialize();
    pSensor->NuiShutdown();
    CloseHandle( hSpeechEvent );

    cv::destroyAllWindows();

    return 0;
}
void RSpeechRecognition::CallbackRule()
{
    USES_CONVERSION;
    HRESULT hr;

    std::string dictationString;

    CSpEvent ruleEvent;
    hr = ruleEvent.GetFrom( this->RuleRecoCtxt );
    if ( FAILED(hr) )  return;

    // The recognition result
    ISpRecoResult* result;
    result = ruleEvent.RecoResult();

    // Get the recognized string
    CSpDynamicString dstrText;
    hr = result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
    if ( FAILED(hr) )  return;
    this->ResultString = W2A(dstrText);

    // Run the audio portion of the rule-based recognition result through dictation once more.
    // This filters out spurious matches.
    {
        CComPtr<ISpStreamFormat> resultStream;
        hr = result->GetAudio( 0, 0, &resultStream );
        if ( FAILED(hr) )  return;

        // Read the input from this audio stream
        hr = this->DictationEngine->SetInput( resultStream, TRUE);
        if ( FAILED(hr) )  return;

        hr = this->DictationGrammar->SetDictationState(SPRS_ACTIVE );
        if ( FAILED(hr) )  return;

        hr = this->DictationRecoCtxt->WaitForNotifyEvent(10000); // 10-second timeout
        if ( FAILED(hr) )  return;

        hr = this->DictationGrammar->SetDictationState(SPRS_INACTIVE );
        if ( FAILED(hr) )  return;

        CSpEvent tempevent;
        hr = tempevent.GetFrom( this->DictationRecoCtxt );
        if ( FAILED(hr) )  return;

        // The dictation recognition result
        ISpRecoResult* tempresult;
        tempresult = tempevent.RecoResult();

        // Get the recognized string
        CSpDynamicString tempdstrText;
        hr = tempresult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &tempdstrText, NULL);
        if ( FAILED(hr) )  return;
        dictationString = W2A(tempdstrText);

        // Narrow down with the dictation filter
        if ( dictationString.find(this->DicticationFilterWord) == std::string::npos )
        {
            // Rejected by the filter
            this->FlagCleanup();
            return;
        }
    }

    // When XML was used for recognition, retrieve the assigned property values.
    SPPHRASE *pPhrase;
    hr = result->GetPhrase(&pPhrase);
    if ( FAILED(hr) )  return;

    this->ResultMap.clear();
    const SPPHRASEPROPERTY *pProp;
    for (pProp = pPhrase->pProperties; pProp; pProp = pProp->pNextSibling)
    {
        this->ResultMap[ W2A(pProp->pszName) ] = W2A(pProp->pszValue);
    }
    CoTaskMemFree(pPhrase);

    // Notify that a command was recognized
    SendMessage(this->CallbackWindowHandle , this->CallbackWindowMesage , 0 , 0);

    this->FlagCleanup();
}
// Run the audio portion of the rule-based recognition result through dictation once more.
// This filters out spurious matches.
xreturn::r<std::string> Recognition_SAPI::convertDictation(ISpRecoResult* result, const std::string& ruleName)
{
    HRESULT hr;
    _USE_WINDOWS_ENCODING;

    CComPtr<ISpStreamFormat> resultStream;
    {
        hr = result->GetAudio( 0, 1, &resultStream );
        if(FAILED(hr)) return xreturn::windowsError(hr);

        // Read the input from this audio stream
        hr = this->DictationEngine->SetInput( resultStream, TRUE);
        if(FAILED(hr)) return xreturn::windowsError(hr);

        hr = this->DictationGrammar->SetRuleState(ruleName.empty() ? NULL : _A2W(ruleName.c_str()), NULL, SPRS_ACTIVE );
        if(FAILED(hr)) return xreturn::windowsError(hr);

        hr = this->DictationRecoCtxt->WaitForNotifyEvent(2000); // 2-second timeout
        if(FAILED(hr)) return xreturn::windowsError(hr);

        hr = this->DictationGrammar->SetRuleState(NULL, NULL, SPRS_INACTIVE );
        if(FAILED(hr)) return xreturn::windowsError(hr);

        {
            CSpEvent tempevent;
            hr = tempevent.GetFrom( this->DictationRecoCtxt );
            if(FAILED(hr)) return xreturn::windowsError(hr);

            if (tempevent.eEventId == SPEI_RECOGNITION)
            {
                // The dictation recognition result
                ISpRecoResult* tempresult;
                {
                    tempresult = tempevent.RecoResult();

                    // Get the recognized string
                    CSpDynamicString tempdstrText;
                    hr = tempresult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &tempdstrText, NULL);
                    if(FAILED(hr)) return xreturn::windowsError(hr);

                    SPPHRASE *pPhrase;
                    hr = tempresult->GetPhrase(&pPhrase);
                    if ( FAILED(hr) ) return xreturn::windowsError(hr);

                    double confidence = pPhrase->pElements->SREngineConfidence;

                    std::string ret = _W2A(tempdstrText);
                    this->PoolMainWindow->SyncInvokeLog(std::string() + "ディクテーションフィルター :" + ret + " " + num2str(confidence), LOG_LEVEL_DEBUG);
                    if (confidence <= 0.60)
                    {
                        this->PoolMainWindow->SyncInvokeLog(std::string() + "ディクテーションフィルター棄却", LOG_LEVEL_DEBUG);
                        return "";
                    }
                    return ret;
                }
            }
        }
    }

    // Unknown: no recognition event arrived
    return "";
}
// Callback invoked when a recognition event fires
xreturn::r<bool> Recognition_SAPI::CallbackReco()
{
    HRESULT hr;

    // Average engine confidence
    double SREngineConfidenceAvg;
    // Regex captures
    std::map<std::string , std::string> capture;
    // Confidence of the call-word ("yobikake") portion
    double yobikakeEngineConfidence;
    // Callback ID
    unsigned int funcID;
    // Whether this is a temporary rule
    bool isTemporaryRule;
    // Result of running the rule match through dictation recognition
    std::string dictationString;
    // The entire matched string
    std::string matchString;

    // Retrieve and analyze the match result
    {
        CSpEvent ruleEvent;
        hr = ruleEvent.GetFrom( this->RuleRecoCtxt );
        if ( FAILED(hr) ) return xreturn::windowsError(hr);

        if ( ruleEvent.eEventId != SPEI_RECOGNITION )
        {
            return false;
        }
        this->PoolMainWindow->SyncInvokeLog("SPEI_RECOGNITION" ,LOG_LEVEL_DEBUG);

        {
            // The recognition result
            ISpRecoResult* result;
            result = ruleEvent.RecoResult();

            SPPHRASE *pPhrase;
            hr = result->GetPhrase(&pPhrase);
            if ( FAILED(hr) ) return xreturn::windowsError(hr);

            PhraseTo phraseTo(pPhrase);
            if (phraseTo.IsError())
            {
                this->PoolMainWindow->AsyncInvoke( [=](){
                    this->PoolMainWindow->ScriptManager.BadVoiceRecogntion(-5,"","",0,0,false);
                } );
                return false;
            }

            // Average engine confidence
            SREngineConfidenceAvg = phraseTo.GetSREngineConfidenceAvg();
            // Regex captures
            capture = phraseTo.GetRegexpCapture();
            // Confidence of the call-word portion
            yobikakeEngineConfidence = phraseTo.GetYobikakeEngineConfidence();
            // Callback ID
            funcID = phraseTo.GetFuncID();
            // Temporary rule?
            isTemporaryRule = phraseTo.IsTemporaryRule();
            // The matched string
            matchString = phraseTo.GetAllString();

            if ( !isTemporaryRule )
            {
                // Run the rule match through dictation recognition as a cross-check.
//              dictationString = this->convertDictation(result,"FilterRule");
//              if ( ! this->checkDictation(dictationString) )
//              {
//                  dictationString = this->convertDictation(result,"FilterRule2");
//                  if ( ! this->checkDictation(dictationString) )
//                  {
                        dictationString = this->convertDictation(result,"");
//                  }
//              }
            }
        }
    }

    if ( funcID == UINT_MAX || funcID >= this->CallbackDictionary.size() )
    {
        // A match with no callback target is abnormal.
        return xreturn::error("マッチした後のコールバック関数ID " + num2str(funcID) + " が存在しません" );
    }

    if ( isTemporaryRule )
    {
        // Temporary rule
        if (SREngineConfidenceAvg < this->TemporaryRuleConfidenceFilter)
        {
            // BAD: confidence too low
            this->PoolMainWindow->AsyncInvoke( [=](){
                this->PoolMainWindow->ScriptManager.BadVoiceRecogntion(-1,matchString,"",0,SREngineConfidenceAvg,false);
            } );
            return false;
        }

        // Matched successfully, so invoke the callback
        this->PoolMainWindow->SyncInvokePopupMessage("音声認識",matchString);
        this->PoolMainWindow->AsyncInvoke( [=](){
            this->PoolMainWindow->ScriptManager.VoiceRecogntion(this->CallbackDictionary[funcID],capture,"",0,SREngineConfidenceAvg);
        } );
        return true;
    }

    // Dictation check
    bool dictationCheck = this->checkDictation(dictationString);
    if (this->UseDictationFilter)
    {
        if (! dictationCheck )
        {
            // The dictation check failed
            this->PoolMainWindow->AsyncInvoke( [=](){
                this->PoolMainWindow->ScriptManager.BadVoiceRecogntion(-2,matchString,dictationString,yobikakeEngineConfidence,SREngineConfidenceAvg,dictationCheck);
            } );
            return false;
        }
    }

    // Confidence of the call-word portion
    if (yobikakeEngineConfidence < this->YobikakeRuleConfidenceFilter )
    {
        // The call-word confidence is too low
        this->PoolMainWindow->AsyncInvoke( [=](){
            this->PoolMainWindow->ScriptManager.BadVoiceRecogntion(-3,matchString,dictationString,yobikakeEngineConfidence,SREngineConfidenceAvg,dictationCheck);
        } );
        return false;
    }

    // Overall confidence
    if (SREngineConfidenceAvg < this->BasicRuleConfidenceFilter )
    {
        // The overall confidence is too low
        this->PoolMainWindow->AsyncInvoke( [=](){
            this->PoolMainWindow->ScriptManager.BadVoiceRecogntion(-4,matchString,dictationString,yobikakeEngineConfidence,SREngineConfidenceAvg,dictationCheck);
        } );
        return false;
    }

    // Matched, so invoke the callback
    this->PoolMainWindow->SyncInvokePopupMessage("音声認識",matchString);
    this->PoolMainWindow->AsyncInvoke( [=](){
        this->PoolMainWindow->ScriptManager.VoiceRecogntion(this->CallbackDictionary[funcID],capture,dictationString,yobikakeEngineConfidence,SREngineConfidenceAvg);
    } );
    return true;
}