/// <summary> /// Start recognizing speech asynchronously. /// </summary> /// <returns> /// <para>S_OK on success, otherwise failure code.</para> /// </returns> HRESULT KinectReader::StartSpeechRecognition() { HRESULT hr = m_pKinectAudioStream->StartCapture(); if (SUCCEEDED(hr)) { // Specify that all top level rules in grammar are now active m_pSpeechGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE); // Specify that engine should always be reading audio m_pSpeechRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS); // Specify that we're only interested in receiving recognition events m_pSpeechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); // Ensure that engine is recognizing speech and not in paused state hr = m_pSpeechContext->Resume(0); if (SUCCEEDED(hr)) { m_hSpeechEvent = m_pSpeechContext->GetNotifyEventHandle(); } } return hr; }
bool KinectSpeechRecognizer::StartRecognition() { HRESULT hr = S_OK; // Specify that engine should always be reading audio hr = speechRecognizer->SetRecoState(SPRST_ACTIVE); if (!SUCCEEDED(hr)) { return false; } // Specify that we're only interested in receiving recognition events hr = speechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); if (FAILED(hr)) { return false; } // Ensure that engine is recognizing speech and not in paused state hr = speechContext->Resume(0); if (FAILED(hr)) { return false; } if(SUCCEEDED(hr)) { isPaused = false; return true; } else return false; }
// Initializes the shared SAPI recognizer, its recognition context,
// the notification callback, event interest, and the command grammar
// loaded from the module's SRGRAMMAR resource.
// Returns KSR_OK on success; on the first failing step it tears the
// partial setup down via KeeperSpeechExit() and returns a reason code
// identifying which step failed.
KEEPERSPEECH_API KEEPERSPEECH_REASON __cdecl KeeperSpeechInit(void)
{
    KEEPERSPEECH_REASON failure = KSR_NOT_KNOWN;
    HRESULT hr;

    // Each step must succeed before the next may run; the first failure
    // selects the reason code and falls through to cleanup below.
    if (FAILED(hr = state.engine.CoCreateInstance(CLSID_SpSharedRecognizer)))
        failure = KSR_CREATE_ENGINE;
    else if (FAILED(hr = state.engine->CreateRecoContext(&state.recog)))
        failure = KSR_CREATE_RECOG_CONTEXT;
    else if (FAILED(hr = state.recog->SetNotifyCallbackFunction(recognitionCallback, 0, 0)))
        failure = KSR_SET_NOTIFY;
    else if (FAILED(hr = state.recog->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION))))
        failure = KSR_SET_INTEREST;
    else if (FAILED(hr = state.recog->CreateGrammar(1, &state.grammar)))
        failure = KSR_CREATE_GRAMMAR;
    else if (FAILED(hr = state.grammar->LoadCmdFromResource(hModule,
                                                            MAKEINTRESOURCEW(IDR_COMMAND_GRAMMAR),
                                                            L"SRGRAMMAR",
                                                            MAKELANGID(LANG_NEUTRAL, SUBLANG_NEUTRAL),
                                                            SPLO_DYNAMIC)))
        failure = KSR_LOAD_GRAMMAR;
    else if (FAILED(hr = state.grammar->SetRuleState(NULL, NULL, SPRS_ACTIVE)))
        failure = KSR_ACTIVATE_GRAMMAR;
    else
        return KSR_OK;

    // Roll back whatever was partially created before reporting why.
    KeeperSpeechExit();
    return failure;
}
/* Registers window-message notification for the end-of-stream TTS event.
 * pV   - an ISpVoice* (passed as void* by the caller); ignored if NULL.
 * hwnd - window that receives PNM_SOUND_ENDS when speech output finishes.
 * BUG FIX: the original tested `hr == S_OK`, which rejects any other COM
 * success code; SUCCEEDED(hr) is the correct check. */
void SetVoiceNotification(void * pV, HWND hwnd){
    HRESULT hr;
    ISpVoice *pVoice = pV;
    if(pVoice){
        /* Only the end-of-input-stream event is of interest here. */
        hr = ISpVoice_SetInterest(pVoice, SPFEI(SPEI_END_INPUT_STREAM), SPFEI(SPEI_END_INPUT_STREAM));
        if(SUCCEEDED(hr)){
            hr = ISpVoice_SetNotifyWindowMessage(pVoice, hwnd, PNM_SOUND_ENDS, 0, 0);
        }
    }
}
bool SapiService::Init() { PROFILER_LABEL_FUNC(js::ProfileEntry::Category::OTHER); MOZ_ASSERT(!mInitialized); if (Preferences::GetBool("media.webspeech.synth.test") || !Preferences::GetBool("media.webspeech.synth.enabled")) { // When enabled, we shouldn't add OS backend (Bug 1160844) return false; } if (FAILED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, getter_AddRefs(mSapiClient)))) { return false; } // Set interest for all the events we are interested in ULONGLONG eventMask = SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_TTS_BOOKMARK) | SPFEI(SPEI_WORD_BOUNDARY) | SPFEI(SPEI_SENTENCE_BOUNDARY) | SPFEI(SPEI_END_INPUT_STREAM); if (FAILED(mSapiClient->SetInterest(eventMask, eventMask))) { return false; } // Get all the voices from sapi and register in the SynthVoiceRegistry if (!RegisterVoices()) { return false; } // Set the callback function for receiving the events mSapiClient->SetNotifyCallbackFunction( (SPNOTIFYCALLBACK*) SapiService::SpeechEventCallback, (WPARAM) this, 0); mInitialized = true; return true; }
int main(int argc, char* argv[]) { CComPtr<IEnumSpObjectTokens> voice_tokens; if (FAILED(::CoInitialize(NULL))) return FALSE; std::cout << "initialized\n"; // cout << "creating voice\n"; if (S_OK != cpVoice.CoCreateInstance(CLSID_SpVoice)) return FALSE; std::cout << "voice initialized" << endl; ULONGLONG event_mask = SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_TTS_BOOKMARK) | SPFEI(SPEI_WORD_BOUNDARY) | SPFEI(SPEI_SENTENCE_BOUNDARY) | SPFEI(SPEI_END_INPUT_STREAM); cpVoice->SetInterest(event_mask, event_mask); std::cout << "interests set" << endl; cpVoice->SetNotifyCallbackFunction(SpeechEventCallback, 0, 0); std::cout << "callback function set" << endl; cpVoice->Speak(L"This should work", SPF_ASYNC, &stream_number_); HANDLE hWait = cpVoice->SpeakCompleteEvent(); WaitAndPumpMessagesWithTimeout(hWait, INFINITE); cpVoice.Release(); ::CoUninitialize(); return TRUE; }
// Speaks Qtext through the TTS voice attached to a shared recognition
// context, then sleeps sleepTime milliseconds to let the asynchronous
// speech play out. COM is initialized and uninitialized around the work.
void speak(string Qtext, int sleepTime)
{
    HRESULT hr = E_FAIL;
    CSpDynamicString Qtextout;

    if (SUCCEEDED(hr = ::CoInitialize(NULL)))
    {
        {
            CComPtr<ISpRecoContext> ctx;
            CComPtr<ISpRecoGrammar> grammar;
            CComPtr<ISpVoice> voice;

            hr = ctx.CoCreateInstance(CLSID_SpSharedRecoContext);
            if (SUCCEEDED(hr))
                hr = ctx->GetVoice(&voice);

            // Prepare the context exactly as the recognizer path does:
            // win32-event notification, recognition events only, retained
            // audio, and an active static dictation grammar.
            bool ready = ctx && voice;
            if (ready) ready = SUCCEEDED(hr = ctx->SetNotifyWin32Event());
            if (ready) ready = SUCCEEDED(hr = ctx->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)));
            if (ready) ready = SUCCEEDED(hr = ctx->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL));
            if (ready) ready = SUCCEEDED(hr = ctx->CreateGrammar(0, &grammar));
            if (ready) ready = SUCCEEDED(hr = grammar->LoadDictation(NULL, SPLO_STATIC));
            if (ready) ready = SUCCEEDED(hr = grammar->SetDictationState(SPRS_ACTIVE));

            if (ready)
            {
                USES_CONVERSION;
                CComPtr<ISpRecoResult> result;
                Qtextout = Qtext.c_str();
                // Fire-and-forget speech, then give it time to finish.
                voice->Speak(Qtextout, SPF_ASYNC, NULL);
                Sleep(sleepTime);
            }
        }
        ::CoUninitialize();
    }
}
void SpeechRecognition::InitContext() { //init com if (FAILED(::CoInitialize(nullptr))) { Log("Failed to initialize com library", LogEntry::Error); return; } HRESULT hr; ISpRecognizer* recognizer; hr = CoCreateInstance(CLSID_SpSharedRecognizer, nullptr, CLSCTX_ALL, IID_ISpRecognizer, reinterpret_cast<void**>(&recognizer)); CheckReturn(hr); hr = recognizer->CreateRecoContext(&recoContext); CheckReturn(hr); //pause context hr = recoContext->Pause(0); CheckReturn(hr); //make grammar library ISpRecoGrammar* recoGrammar = InitGrammar(); handleEvent = recoContext->GetNotifyEventHandle(); if (handleEvent == INVALID_HANDLE_VALUE) CheckReturn(E_FAIL); ULONGLONG interest = SPFEI(SPEI_RECOGNITION); hr = recoContext->SetInterest(interest, interest); CheckReturn(hr); //Activate grammar hr = recoGrammar->SetRuleState(RULENAME, 0, SPRS_ACTIVE); CheckReturn(hr); //enable context again hr = recoContext->Resume(0); CheckReturn(hr); std::cout << "Waiting for mocha...." << std::endl; Update(); std::cout << "Hello!" << std::endl; }
// CMyDlg 메시지 처리기입니다. BOOL CMyDlg::OnInitSpeech() { HRESULT hr = S_OK; hr = cpRecoEngine.CoCreateInstance(CLSID_SpInprocRecognizer); if (SUCCEEDED(hr)) { hr = cpRecoEngine->CreateRecoContext(&m_cpRecoCtxt); } if (SUCCEEDED(hr)) { hr = m_cpRecoCtxt->SetNotifyWindowMessage(m_hWnd, WM_RECOEVENT, 0, 0); } if (SUCCEEDED(hr)) { const ULONGLONG ullInterest = SPFEI(SPEI_RECOGNITION); hr = m_cpRecoCtxt->SetInterest(ullInterest, ullInterest); } CComPtr<ISpAudio> cpAudio; hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio); hr = cpRecoEngine->SetInput(cpAudio, TRUE); hr = cpRecoEngine->SetRecoState(SPRST_ACTIVE); if (SUCCEEDED(hr)) hr = m_cpRecoCtxt->CreateGrammar(GID_DICTATION, &m_cpDictationGrammar); if (SUCCEEDED(hr)) hr = m_cpDictationGrammar->LoadDictation(NULL, SPLO_STATIC); if (SUCCEEDED(hr)) hr = m_cpDictationGrammar->SetDictationState(SPRS_ACTIVE); if (FAILED(hr)) m_cpDictationGrammar.Release(); return (hr == S_OK); }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Module implementation

// One-shot module configuration: reads options from the ResourceFinder,
// builds the whole SAPI pipeline (engine, context, audio input, three
// grammars), opens all YARP ports, connects to iSpeak, and finally
// enables the file grammar. Returns true only if every SAPI call
// succeeded. NOTE: the SAPI steps are accumulated into a single flag
// rather than checked individually, so a failure is only reported at
// the end; statement order matters (input/state before grammars).
bool SpeechRecognizerModule::configure(ResourceFinder &rf )
{
    // Module options from the resource finder.
    setName( rf.check("name",Value("speechRecognizer")).asString().c_str() );
    m_timeout = rf.check("timeout",Value(10000)).asInt();
    USE_LEGACY = !rf.check("noLegacy");
    m_forwardSound = rf.check("forwardSound");
    m_tmpFileFolder = rf.getHomeContextPath().c_str();
    interruptRecognition = false;

    //Deal with speech recognition
    // Resolve the grammar file path and convert it to a wide string for SAPI.
    string grammarFile = rf.check("grammarFile",Value("defaultGrammar.grxml")).asString().c_str();
    grammarFile = rf.findFile(grammarFile).c_str();
    // NOTE: cwgrammarfile points into `tmp`, which must stay alive until
    // LoadCmdFromFile below has been called (it does — same scope).
    std::wstring tmp = s2ws(grammarFile);
    LPCWSTR cwgrammarfile = tmp.c_str();

    m_useTalkBack = rf.check("talkback");

    //Initialise the speech crap
    // Every SAPI result is AND-ed into one flag; first assignment seeds it.
    bool everythingIsFine = true;
    HRESULT hr;
    everythingIsFine = SUCCEEDED( m_cpRecoEngine.CoCreateInstance(CLSID_SpInprocRecognizer));
    everythingIsFine &= SUCCEEDED( SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &m_cpAudio));
    everythingIsFine &= SUCCEEDED( m_cpRecoEngine->CreateRecoContext( &m_cpRecoCtxt ));

    // Here, all we are interested in is the beginning and ends of sounds, as well as
    // when the engine has recognized something
    const ULONGLONG ullInterest = SPFEI(SPEI_RECOGNITION);
    everythingIsFine &= SUCCEEDED(m_cpRecoCtxt->SetInterest(ullInterest, ullInterest));

    // set the input for the engine
    everythingIsFine &= SUCCEEDED( m_cpRecoEngine->SetInput(m_cpAudio, TRUE));
    everythingIsFine &= SUCCEEDED( m_cpRecoEngine->SetRecoState( SPRST_ACTIVE ));

    //Load grammar from file (grammar id 1), created disabled until the end.
    everythingIsFine &= SUCCEEDED( m_cpRecoCtxt->CreateGrammar( 1, &m_cpGrammarFromFile ));
    everythingIsFine &= SUCCEEDED( m_cpGrammarFromFile->SetGrammarState(SPGS_DISABLED));
    everythingIsFine &= SUCCEEDED( m_cpGrammarFromFile->LoadCmdFromFile(cwgrammarfile, SPLO_DYNAMIC));
    // everythingIsFine &= loadGrammarFromRf(rf);

    //Create a runtime grammar (grammar id 2), also disabled for now.
    everythingIsFine &= SUCCEEDED( m_cpRecoCtxt->CreateGrammar( 2, &m_cpGrammarRuntime ));
    everythingIsFine &= SUCCEEDED( m_cpGrammarRuntime->SetGrammarState(SPGS_DISABLED));

    //Create a dictation grammar, loaded but left inactive.
    everythingIsFine &= SUCCEEDED(m_cpRecoCtxt->CreateGrammar( GID_DICTATION, &m_cpGrammarDictation ));
    everythingIsFine &= SUCCEEDED(m_cpGrammarDictation->LoadDictation(NULL, SPLO_STATIC));
    everythingIsFine &= SUCCEEDED(m_cpGrammarDictation->SetDictationState(SPRS_INACTIVE));

    //Setup thing for the raw audio processing
    // NOTE(review): this hr is never checked — retained-audio setup can
    // fail without affecting everythingIsFine; confirm that is intended.
    everythingIsFine &= SUCCEEDED(m_cAudioFmt.AssignFormat(SPSF_22kHz16BitMono));
    hr = m_cpRecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, &m_cAudioFmt.FormatId(), m_cAudioFmt.WaveFormatExPtr());

    //everythingIsFine &= SUCCEEDED(hr = SPBindToFile((const WCHAR *)"C:\\temp.wav", SPFM_CREATE_ALWAYS, &m_streamFormat, &m_cAudioFmt.FormatId(), m_cAudioFmt.WaveFormatExPtr()));
    //CComPtr <ISpStream> cpStream = NULL;
    //CSpStreamFormat cAudioFmt;
    //hr = cAudioFmt.AssignFormat(SPSF_22kHz16BitMono);
    //hr = SPBindToFile((const WCHAR *)"c:\\ttstemp.wav", SPFM_CREATE_ALWAYS, &cpStream, &cAudioFmt.FormatId(), cAudioFmt.WaveFormatExPtr());

    if( everythingIsFine )
    {
        // Open the module's YARP ports (recognition streams, sound, iSpeak, rpc).
        string pName = "/";
        pName += getName().c_str();
        pName += "/recog/continuous:o";
        m_portContinuousRecognition.open( pName.c_str() );

        pName = "/";
        pName += getName().c_str();
        pName += "/recog/continuousGrammar:o";
        m_portContinuousRecognitionGrammar.open( pName.c_str() );

        pName = "/";
        pName += getName().c_str();
        pName += "/recog/sound:o";
        m_portSound.open(pName.c_str());

        //iSpeak
        pName = "/";
        pName += getName().c_str();
        pName += "/tts/iSpeak:o";
        m_port2iSpeak.open( pName.c_str() );

        pName = "/";
        pName += getName().c_str();
        pName += "/tts/iSpeak/rpc";
        m_port2iSpeakRpc.open( pName.c_str() );

        // Best-effort connection to iSpeak; failure only warns.
        if (Network::connect(m_port2iSpeak.getName().c_str(),"/iSpeak")&&Network::connect(m_port2iSpeakRpc.getName().c_str(),"/iSpeak/rpc"))
            yInfo() <<"Connection to iSpeak succesfull" ;
        else
            yWarning() <<"Unable to connect to iSpeak. Connect manually." ;

        pName = "/";
        pName += getName().c_str();
        pName += "/rpc";
        m_portRPC.open( pName.c_str() );
        attach(m_portRPC);

        //Start recognition
        //everythingIsFine &= SUCCEEDED(m_cpRecoEngine->SetRecoState(SPRST_ACTIVE_ALWAYS));
        // Only now activate and enable the file grammar.
        everythingIsFine &= SUCCEEDED(m_cpGrammarFromFile->SetRuleState(NULL, NULL, SPRS_ACTIVE));
        everythingIsFine &= SUCCEEDED( m_cpGrammarFromFile->SetGrammarState(SPGS_ENABLED));
    }

    return (everythingIsFine);
}
int voiceRecognition(string Qtext) { HRESULT hr = E_FAIL; int word=0; CSpDynamicString Qtextout; if (SUCCEEDED(hr = ::CoInitialize(NULL))) { { CComPtr<ISpRecoContext> cpRecoCtxt; CComPtr<ISpRecoGrammar> cpGrammar; CComPtr<ISpVoice> cpVoice; hr = cpRecoCtxt.CoCreateInstance(CLSID_SpSharedRecoContext); if (SUCCEEDED(hr)) { hr = cpRecoCtxt->GetVoice(&cpVoice); } if (cpRecoCtxt && cpVoice && SUCCEEDED(hr = cpRecoCtxt->SetNotifyWin32Event()) && SUCCEEDED(hr = cpRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION))) && SUCCEEDED(hr = cpRecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL)) && SUCCEEDED(hr = cpRecoCtxt->CreateGrammar(0, &cpGrammar)) && SUCCEEDED(hr = cpGrammar->LoadDictation(NULL, SPLO_STATIC)) && SUCCEEDED(hr = cpGrammar->SetDictationState(SPRS_ACTIVE))) { USES_CONVERSION; CComPtr<ISpRecoResult> cpResult; Qtextout.operator=(Qtext.c_str()); cpVoice->Speak(Qtextout, SPF_ASYNC, NULL); if (SUCCEEDED(hr = BlockForResult(cpRecoCtxt, &cpResult))) { cpGrammar->SetDictationState(SPRS_INACTIVE); CSpDynamicString dstrText; if (SUCCEEDED(cpResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL))) { cpResult.Release(); } if (_wcsicmp(dstrText, L"True") == 0) { word = 1; //break; } if (_wcsicmp(dstrText, L"Two") == 0) { word = 1; //break; } if (_wcsicmp(dstrText, L"False") == 0) { word = 2; //break; } if (_wcsicmp(dstrText, L"Falls") == 0) { word = 2; //break; } if (_wcsicmp(dstrText, L"Follows") == 0) { word = 2; //break; } if (_wcsicmp(dstrText, L"A") == 0) { word = 3; //break; } if (_wcsicmp(dstrText, L"Eight") == 0) { word = 3; //break; } if (_wcsicmp(dstrText, L"B") == 0) { word = 4; //break; } if (_wcsicmp(dstrText, L"Bee") == 0) { word = 4; //break; } if (_wcsicmp(dstrText, L"C") == 0) { word = 5; //break; } if (_wcsicmp(dstrText, L"See") == 0) { word = 5; //break; } if (_wcsicmp(dstrText, L"Fire") == 0) { word = 6; //break; } if (_wcsicmp(dstrText, L"Leave") == 0) { word = 7; //break; } if (_wcsicmp(dstrText, L"Leave 
it") == 0) { word = 7; //break; } if (_wcsicmp(dstrText, L"We've") == 0) { word = 7; //break; } if (_wcsicmp(dstrText, L"Quit") == 0) { word = 7; //break; } if (_wcsicmp(dstrText, L"Quits") == 0) { word = 7; //break; } if (_wcsicmp(dstrText, L"Switch") == 0) { word = 8; //break; } if (_wcsicmp(dstrText, L"Change") == 0) { word = 8; //break; } cout << dstrText.CopyToChar(); cpGrammar->SetDictationState(SPRS_ACTIVE); } } } ::CoUninitialize(); } return word; }
// Initializes voice-command recognition: creates an in-process engine
// and context, routes events to hWnd as `event_id` messages, loads the
// command grammar (preferring data\phrases.xml over the compiled-in
// SRGRAMMAR resource), wires the default audio input, and activates the
// rules. On failure the partial setup is torn down via VOICEREC_deinit().
// Returns true only if every step succeeded.
bool VOICEREC_init(HWND hWnd, int event_id, int grammar_id, int command_resource)
{
    HRESULT hr = S_OK;

    // Single-pass "loop" used purely for break-on-error control flow.
    while (true)
    {
        // create a recognition engine
        hr = p_recogEngine.CoCreateInstance(CLSID_SpInprocRecognizer);
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR, "Failed to create a recognition engine\n","Error");
            printf("Failed to create a recognition engine\n");
            break;
        }

        // create the command recognition context
        hr = p_recogEngine->CreateRecoContext( &p_recogContext );
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to create the command recognition context\n","Error");
            printf("Failed to create the command recognition context\n");
            break;
        }

        // Let SR know that window we want it to send event information to, and using
        // what message
        hr = p_recogContext->SetNotifyWindowMessage( hWnd, event_id, 0, 0 );
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to SetNotifyWindowMessage\n","Error");
            break;
        }

        // Tell SR what types of events interest us. Here we only care about command
        // recognition.
        hr = p_recogContext->SetInterest( SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION) );
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to set events\n","Error");
            break;
        }

        // Create a grammar
        hr = p_recogContext->CreateGrammar(grammar_id, &p_grammarObject);
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to create grammar\n","Error");
            break;
        }

        // Load our grammar from data\phrases.xml, or if that doesn't exist, from the compiled in
        // user defined ("SRGRAMMAR") resource type.
        hr = p_grammarObject->LoadCmdFromFile(L"data\\phrases.xml", SPLO_STATIC);
        if (FAILED(hr))
        {
            hr = p_grammarObject->LoadCmdFromResource(NULL, MAKEINTRESOURCEW(command_resource),
                                                      L"SRGRAMMAR", MAKELANGID(LANG_NEUTRAL, SUBLANG_NEUTRAL),
                                                      SPLO_STATIC);
            if (FAILED(hr))
            {
                os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to load resource SRGRAMMAR\n","Error");
                break;
            }
        }

        hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio);
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to get default audio input\n", "Error");
            break;
        }

        // Set the audio input to our token.
        hr = p_recogEngine->SetInput(cpAudio, TRUE);
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to set audio input\n", "Error");
            // BUG FIX: the original fell through here; initialization kept
            // going and a later S_OK could overwrite hr, so the function
            // reported success despite having no audio input.
            break;
        }

        // Set rules to active, we are now listening for commands
        hr = p_grammarObject->SetRuleState(NULL, NULL, SPRS_ACTIVE );
        if (FAILED(hr))
        {
            os::dialogs::Message(os::dialogs::MESSAGEBOX_ERROR,"Failed to set listening for commands\n","Error");
            break;
        }

        break;
    }

    // if we failed and have a partially setup SAPI, close it all down
    if (FAILED(hr))
    {
        VOICEREC_deinit();
    }

    os::events::addEventListener(SDL_SYSWMEVENT, os::events::DEFAULT_LISTENER_WEIGHT, system_event_handler);

    return ( hr == S_OK);
}
/**********************************************************
* COperator::InitializeSapi *
*---------------------------*
*   Description:
*       Various SAPI initializations.
*   Return:
*       S_OK if SAPI initialized successfully
*       Return values of failed SAPI initialization
*       functions
***********************************************************/
HRESULT COperator::InitializeSapi()
{
    // Create a voice for speaking on this machine
    HRESULT hr = m_cpLocalVoice.CoCreateInstance( CLSID_SpVoice );
    if ( FAILED( hr ) )
    {
        DoMessage( L"Could not create a TTS voice on the local machine" );
        return hr;
    }

    // Create a reco engine for recognizing speech over the phone.
    // This is an inproc recognizer since it will likely be
    // using a format other than the default.
    if ( FAILED( hr = m_cpIncomingRecognizer.CoCreateInstance( CLSID_SpInprocRecognizer ) ) )
    {
        DoMessage(L"CoCreateInstance on inproc reco engine failed");
        return hr;
    }

    // Create a reco context for this engine
    if ( FAILED( hr = m_cpIncomingRecognizer->CreateRecoContext( &m_cpIncomingRecoCtxt ) ) )
    {
        DoMessage(L"Could not create recognition context");
        return hr;
    }

    // Set interest only in PHRASE_START, RECOGNITION, FALSE_RECOGNITION
    const ULONGLONG ullInterest = SPFEI(SPEI_PHRASE_START) |
                                  SPFEI(SPEI_RECOGNITION) |
                                  SPFEI(SPEI_FALSE_RECOGNITION);
    if ( FAILED( hr = m_cpIncomingRecoCtxt->SetInterest( ullInterest, ullInterest ) ) )
    {
        DoMessage(L"Could not set interest in SAPI events");
        return hr;
    }

    // Retain recognized audio
    if ( FAILED( hr = m_cpIncomingRecoCtxt->SetAudioOptions( SPAO_RETAIN_AUDIO, NULL, NULL ) ) )
    {
        DoMessage(L"Could not set audio options to retain recognized audio");
        return hr;
    }

    // Create a dictation grammar and load it
    if ( FAILED( hr = m_cpIncomingRecoCtxt->CreateGrammar( 0, &m_cpDictGrammar ) ) )
    {
        DoMessage(L"Could not create dictation grammar");
        return hr;
    }
    if ( FAILED( hr = m_cpDictGrammar->LoadDictation( NULL, SPLO_STATIC ) ) )
    {
        DoMessage(L"Could not load dictation");
        return hr;
    }

    // Create a voice for talking on the phone
    if ( FAILED( hr = m_cpIncomingRecoCtxt->GetVoice( &m_cpOutgoingVoice ) ) )
    {
        DoMessage(L"Could not create a TTS voice for speaking over the phone");
        return hr;
    }

    return S_OK;
}   /* COperator::InitializeSapi */
// Constructor for the MS SAPI caption source. Sets up an in-process
// recognizer for the requested language, an event-driven recognition
// context, a dictation grammar, and a worker thread running main_thread().
// The whole body is a function-try-block: any `const char*` or HRError
// thrown during setup is logged and converted to
// CAPTIONS_ERROR_GENERIC_FAIL for the caller.
mssapi_captions::mssapi_captions(
		captions_cb callback,
		const std::string &lang)
	try : captions_handler(callback, AUDIO_FORMAT_16BIT, 16000)
{
	HRESULT hr;

	// Widen the language tag by simple char cast (ASCII locale names),
	// then resolve it to an LCID for the recognizer token query.
	std::wstring wlang;
	wlang.resize(lang.size());
	for (size_t i = 0; i < lang.size(); i++)
		wlang[i] = (wchar_t)lang[i];
	LCID lang_id = LocaleNameToLCID(wlang.c_str(), 0);

	wchar_t lang_str[32];
	_snwprintf(lang_str, 31, L"language=%x", (int)lang_id);

	// Auto-reset event used to signal the worker thread to stop.
	stop = CreateEvent(nullptr, false, false, nullptr);
	if (!stop.Valid())
		throw "Failed to create event";

	// Pick the best installed recognizer for the requested language.
	hr = SpFindBestToken(SPCAT_RECOGNIZERS, lang_str, nullptr, &token);
	if (FAILED(hr))
		throw HRError("SpFindBestToken failed", hr);

	hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
			__uuidof(ISpRecognizer), (void**)&recognizer);
	if (FAILED(hr))
		throw HRError("CoCreateInstance for recognizer failed", hr);

	hr = recognizer->SetRecognizer(token);
	if (FAILED(hr))
		throw HRError("SetRecognizer failed", hr);

	// Keep the engine inactive until the worker thread starts it.
	hr = recognizer->SetRecoState(SPRST_INACTIVE);
	if (FAILED(hr))
		throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);

	hr = recognizer->CreateRecoContext(&context);
	if (FAILED(hr))
		throw HRError("CreateRecoContext failed", hr);

	// Only recognition results and end-of-stream wake the worker.
	ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
			SPFEI(SPEI_END_SR_STREAM);
	hr = context->SetInterest(interest, interest);
	if (FAILED(hr))
		throw HRError("SetInterest failed", hr);

	hr = context->SetNotifyWin32Event();
	if (FAILED(hr))
		throw HRError("SetNotifyWin32Event", hr);

	notify = context->GetNotifyEventHandle();
	if (notify == INVALID_HANDLE_VALUE)
		throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);

	// Feed the recognizer from our own audio stream at the output rate.
	// CaptionStream starts with a reference we don't need; SetInput
	// takes its own, so drop ours immediately.
	size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
	audio = new CaptionStream((DWORD)sample_rate, this);
	audio->Release();

	hr = recognizer->SetInput(audio, false);
	if (FAILED(hr))
		throw HRError("SetInput failed", hr);

	hr = context->CreateGrammar(1, &grammar);
	if (FAILED(hr))
		throw HRError("CreateGrammar failed", hr);

	hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
	if (FAILED(hr))
		throw HRError("LoadDictation failed", hr);

	// Spawn the worker; std::thread construction can itself throw.
	try {
		t = std::thread([this] () {main_thread();});
	} catch (...) {
		throw "Failed to create thread";
	}

} catch (const char *err) {
	blog(LOG_WARNING, "%s: %s", __FUNCTION__, err);
	throw CAPTIONS_ERROR_GENERIC_FAIL;

} catch (HRError err) {
	blog(LOG_WARNING, "%s: %s (%lX)", __FUNCTION__, err.str, err.hr);
	throw CAPTIONS_ERROR_GENERIC_FAIL;
}
/******************************************************************************
* InitSAPI *
*----------*
*   Description:
*       Called once to get SAPI started.
*
******************************************************************************/
HRESULT InitSAPI( HWND hWnd )
{
    HRESULT hr = S_OK;

    // do/while(0) so any failed step can bail straight to the cleanup check.
    do
    {
        // create a recognition engine
        if (FAILED(hr = g_cpEngine.CoCreateInstance(CLSID_SpSharedRecognizer)))
            break;

        // create the command recognition context
        if (FAILED(hr = g_cpEngine->CreateRecoContext(&g_cpRecoCtxt)))
            break;

        // Let SR know that window we want it to send event information to, and using
        // what message
        if (FAILED(hr = g_cpRecoCtxt->SetNotifyWindowMessage(hWnd, WM_RECOEVENT, 0, 0)))
            break;

        // Tell SR what types of events interest us. Here we only care about command
        // recognition.
        if (FAILED(hr = g_cpRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION))))
            break;

        // Load our grammar, which is the compiled form of simple.xml bound into this executable as a
        // user defined ("SRGRAMMAR") resource type.
        if (FAILED(hr = g_cpRecoCtxt->CreateGrammar(GRAMMARID1, &g_cpCmdGrammar)))
            break;
        if (FAILED(hr = g_cpCmdGrammar->LoadCmdFromResource(NULL, MAKEINTRESOURCEW(IDR_CMD_CFG),
                                                            L"SRGRAMMAR",
                                                            MAKELANGID(LANG_NEUTRAL, SUBLANG_NEUTRAL),
                                                            SPLO_DYNAMIC)))
            break;

        // Set navigation rule to active, espresso order rule to inactive
        if (FAILED(hr = g_cpCmdGrammar->SetRuleIdState(VID_EspressoDrinks, SPRS_INACTIVE)))
            break;
        if (FAILED(hr = g_cpCmdGrammar->SetRuleIdState(VID_Navigation, SPRS_ACTIVE)))
            break;

        // Get the default voice associated with our reco context
        hr = g_cpRecoCtxt->GetVoice(&g_cpVoice);
    } while (0);

    // if we failed and have a partially setup SAPI, close it all down
    if (FAILED(hr))
    {
        CleanupSAPI();
    }

    return hr;
}
//The main AISA initializer. bool AISALib::init(HWND vHwnd) { HRESULT hr = CoInitialize(NULL); AL_FAILED_GOTO_MSG(hr, logger, "CoInitialize..failed\n", exit); hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_INPROC_SERVER, IID_ISpRecognizer, (void **) &recognizer); AL_FAILED_GOTO_MSG(hr, logger, "CoCreateInstance CLSID_SpInprocRecognizer -> IID_ISpRecognizer..failed\n", exit); hr = recognizer->CreateRecoContext(&recogCtx); AL_FAILED_GOTO_MSG(hr, logger, "recognizer->CreateRecoContext..failed\n", exit); hr = recogCtx->SetNotifyWindowMessage(vHwnd, WM_RECOGEVENT, 0, 0 ); AL_FAILED_GOTO_MSG(hr, logger, "recogCtx->SetNotifyWindowMessage..failed\n", exit); const ULONGLONG ullInterest = SPFEI(SPEI_RECOGNITION); hr = recogCtx->SetInterest(ullInterest, ullInterest); AL_FAILED_GOTO_MSG(hr, logger, "recogCtx->SetInterest..failed\n", exit); hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **) &audioInCat); AL_FAILED_GOTO_MSG(hr, logger, "CoCreateInstance(CLSID_SpObjectTokenCategory..failed\n", exit); hr = audioInCat->SetId(SPCAT_AUDIOIN, false); AL_FAILED_GOTO_MSG(hr, logger, "audioInCat->SetId(SPCAT_AUDIOIN\n", exit); hr = audioInCat->EnumTokens(NULL, NULL, &cpEnum); AL_FAILED_GOTO_MSG(hr, logger, "audioInCat->EnumTokens..failed\n", exit); hr = cpEnum->Next(1, &token, NULL); AL_FAILED_GOTO_MSG(hr, logger, "cpEnum->Next..failed\n", exit); hr= token->CreateInstance(NULL, CLSCTX_INPROC_SERVER, IID_ISpAudio, (void **) &audio); AL_FAILED_GOTO_MSG(hr, logger, "token->CreateInstance IID_ISpAudio..failed\n", exit); AL_RELEASE(token, logger, "Release token..\n"); AL_RELEASE(cpEnum, logger, "Release cpEnum..\n"); AL_RELEASE(audioInCat, logger, "Release audioInCat..\n"); hr = recognizer->SetInput(audio, TRUE); AL_FAILED_GOTO_MSG(hr, logger, "recognizer->SetInput..failed\n", exit); hr = recognizer->SetRecoState(SPRST_ACTIVE); AL_FAILED_GOTO_MSG(hr, logger, "recognizer->SetRecoState SPRST_ACTIVE..failed\n", 
exit); hr = recogCtx->CreateGrammar(GID_DICTATION, &grammar); AL_FAILED_GOTO_MSG(hr, logger, "recogCtx->CreateGrammar..failed\n", exit); hr = grammar->LoadDictation(NULL, SPLO_STATIC); AL_FAILED_GOTO_MSG(hr, logger, "grammar->LoadDictation SPLO_STATIC..failed\n", exit); hr = grammar->SetDictationState(SPRS_ACTIVE); AL_FAILED_GOTO_MSG(hr, logger, "grammar->SetDictationState SPRS_ACTIVE..failed\n", exit); hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, IID_ISpVoice, (void **) &voice); AL_FAILED_GOTO_MSG(hr, logger, "CoCreateInstance(CLSID_SpVoice..failed.\n", exit); voice->SetNotifyWindowMessage(vHwnd, WM_TTSEVENT, 0, 0); hr = voice->SetInterest( SPFEI_ALL_TTS_EVENTS, SPFEI_ALL_TTS_EVENTS ); AL_FAILED_GOTO_MSG(hr, logger, "voice->SetInterest SPFEI_ALL_TTS_EVENTS..failed \n", exit); return true; exit: destroy(); return false; }
// Window procedure: WM_CREATE sets up the shared SAPI recognizer with a
// command grammar loaded from er.xml; WM_RECOEVENT maps the recognized
// Chinese phrase to a QQ-window action and echoes it in the client area.
LRESULT CALLBACK WndProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam)
{
    HDC hdc;
    PAINTSTRUCT ps;

    switch (message)
    {
    case WM_CREATE:
    {
        // Initialize COM for this thread (apartment-threaded for UI).
        ::CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);

        // Create the shared recognition engine.
        HRESULT hr = m_cpRecoEngine.CoCreateInstance(CLSID_SpSharedRecognizer);

        // Create the recognition context.
        if (SUCCEEDED(hr))
        {
            hr = m_cpRecoEngine->CreateRecoContext(&m_cpRecoCtxt);
        }
        else MessageBox(hwnd, TEXT("error1"), TEXT("error"), S_OK);

        // Route recognition events to this window as WM_RECOEVENT.
        if (SUCCEEDED(hr))
        {
            hr = m_cpRecoCtxt->SetNotifyWindowMessage(hwnd, WM_RECOEVENT, 0, 0);
        }
        else MessageBox(hwnd, TEXT("error2"), TEXT("error"), S_OK);

        // Subscribe to sound-start / recognition / sound-end events.
        if (SUCCEEDED(hr))
        {
            ULONGLONG ullMyEvents = SPFEI(SPEI_SOUND_START) | SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_SOUND_END);
            hr = m_cpRecoCtxt->SetInterest(ullMyEvents, ullMyEvents);
        }
        else MessageBox(hwnd, TEXT("error3"), TEXT("error"), S_OK);

        // Create the command grammar and load it from er.xml.
        b_Cmd_Grammar = TRUE;
        if (FAILED(hr))
        {
            MessageBox(hwnd, TEXT("error4"), TEXT("error"), S_OK);
        }
        hr = m_cpRecoCtxt->CreateGrammar(GID_CMD_GR, &m_cpCmdGramma);

        // BUG FIX: the original declared WCHAR wszXMLFile[20] but told
        // MultiByteToWideChar the buffer held 256 characters — a latent
        // stack overflow. The buffer is now actually 256 wide chars.
        WCHAR wszXMLFile[256] = L"er.xml";
        MultiByteToWideChar(CP_ACP, 0, (LPCSTR)"er.xml", -1, wszXMLFile, 256);
        hr = m_cpCmdGramma->LoadCmdFromFile(wszXMLFile, SPLO_DYNAMIC);
        if (FAILED(hr))
        {
            MessageBox(hwnd, TEXT("error5"), TEXT("error"), S_OK);
        }
        b_initSR = TRUE;

        // Activate the grammar so recognition starts.
        hr = m_cpCmdGramma->SetRuleState(NULL, NULL, SPRS_ACTIVE);
        return 0;
    }
    case WM_RECOEVENT:
    {
        RECT rect;
        GetClientRect(hwnd, &rect);
        hdc = GetDC(hwnd);
        USES_CONVERSION;
        CSpEvent event;

        while (event.GetFrom(m_cpRecoCtxt) == S_OK)
        {
            switch (event.eEventId)
            {
            case SPEI_RECOGNITION:
            {
                static const WCHAR wszUnrecognized[] = L"<Unrecognized>";
                CSpDynamicString dstrText;
                // Fetch the recognized text (whole phrase).
                if (FAILED(event.RecoResult()->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL)))
                {
                    dstrText = wszUnrecognized;
                }
                BSTR SRout;
                dstrText.CopyToBSTR(&SRout);
                char* lpszText2 = _com_util::ConvertBSTRToString(SRout);

                if (b_Cmd_Grammar)
                {
                    // BUG FIX: the original called strstr with the arguments
                    // reversed (command as haystack, recognized text as
                    // needle), so a fragment like "企鹅" matched every
                    // command and fired all eight actions at once.
                    if (strstr(lpszText2, "打开企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("打开企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        openQQ();
                    }
                    if (strstr(lpszText2, "关闭企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("关闭企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        closeQQ();
                    }
                    if (strstr(lpszText2, "隐藏企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("隐藏企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        yincangQQ();
                    }
                    if (strstr(lpszText2, "显示企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("显示企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        showQQ();
                    }
                    if (strstr(lpszText2, "上移企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("上移企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        top();
                    }
                    if (strstr(lpszText2, "下移企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("下移企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        bottom();
                    }
                    if (strstr(lpszText2, "左移企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("左移企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        left();
                    }
                    if (strstr(lpszText2, "右移企鹅") != NULL)
                    {
                        DrawText(hdc, TEXT("右移企鹅"), -1, &rect, DT_SINGLELINE | DT_CENTER | DT_VCENTER);
                        right();
                    }
                }

                // BUG FIX: the converted string and BSTR were leaked on
                // every recognition event in the original.
                delete[] lpszText2;
                SysFreeString(SRout);
            }
            }
        }

        // BUG FIX: the DC obtained with GetDC was never released.
        ReleaseDC(hwnd, hdc);
        return TRUE;
    }
    case WM_PAINT:
        hdc = BeginPaint(hwnd, &ps);
        EndPaint(hwnd, &ps);
        return 0;
    case WM_DESTROY:
        PostQuitMessage(0);
        return 0;
    }
    return DefWindowProc(hwnd, message, wParam, lParam);
}
// Speech initialization is done here.
//
// Builds an in-process SAPI dictation recognizer:
//   engine -> context -> Win32-event notification -> recognition-only event
//   interest -> dictation grammar, then selects the audio source:
//   - if sPathToFile or pMemStream is given, wraps it in an ISpStream and
//     feeds the recognizer from it (44 kHz 16-bit mono assumed);
//   - otherwise binds the default microphone and sets the engine active.
//
// Parameters:
//   sPathToFile - path to a wav file to recognize; empty string means "no file".
//   pMemStream  - optional in-memory audio stream; takes precedence over the
//                 file path when non-NULL.
// Returns: S_OK on success, otherwise the first failing HRESULT. On failure
//          the dictation grammar is released.
HRESULT CASRwrapper::InitSpeech(std::wstring sPathToFile, IStream * pMemStream)
{
	HRESULT hr = S_OK;

	// Create the in-process recognition engine.
	hr = cpRecoEngine.CoCreateInstance(CLSID_SpInprocRecognizer);

	// Create the recognition context from the engine.
	if (SUCCEEDED(hr))
	{
		hr = cpRecoEngine->CreateRecoContext(&m_cpRecoCtxt);
	}

	if (SUCCEEDED(hr))
	{
		// NOTE(review): wparam/lparam are only used by the commented-out
		// callback variant below; they are unused with SetNotifyWin32Event.
		WPARAM wparam = NULL;
		LPARAM lparam = NULL;
		// Signal recognition events through a Win32 event handle.
		hr = m_cpRecoCtxt->SetNotifyWin32Event();
		//hr = m_cpRecoCtxt->SetNotifyCallbackFunction(SpRecCallback,wparam,lparam);
		// hr = m_cpRecoCtxt->SetNotifyWindowMessage(m_hWnd, WM_RECOEVENT, 0, 0);
	}

	if (SUCCEEDED(hr))
	{
		// This specifies which of the recognition events are going to trigger
		// notifications. Here only successful recognitions are of interest.
		const ULONGLONG ullInterest = SPFEI(SPEI_RECOGNITION);
		hr = m_cpRecoCtxt->SetInterest(ullInterest, ullInterest);
	}

	if (SUCCEEDED(hr))
	{
		// Specifies that the grammar we want is a dictation grammar.
		// Initializes the grammar (m_cpDictationGrammar).
		hr = m_cpRecoCtxt->CreateGrammar(GID_DICTATION, &m_cpDictationGrammar);
	}
	if (SUCCEEDED(hr))
	{
		// Load the dictation topic for the grammar.
		hr = m_cpDictationGrammar->LoadDictation(NULL, SPLO_STATIC);
	}

	// Audio source selection: stream/file input vs. default microphone.
	if (!sPathToFile.empty() || pMemStream != NULL)
	{
		CComPtr<ISpStream> cpInputStream;
		if (SUCCEEDED(hr))
		{
			// Create basic SAPI stream object.
			// NOTE: The helper SpBindToFile can be used to perform the following operations.
			hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
		}

		CSpStreamFormat sInputFormat;
		// Generate a WaveFormatEx structure, assuming the wav format is
		// 44 kHz, 16-bit, mono. NOTE(review): inputs in other formats are not
		// validated here — confirm callers only supply this format.
		if (SUCCEEDED(hr))
		{
			hr = sInputFormat.AssignFormat(SPSF_44kHz16BitMono);
		}

		if (pMemStream != NULL)
		{
			// In-memory stream takes precedence over the file path.
			if (SUCCEEDED(hr))
			{
				hr = cpInputStream->SetBaseStream(pMemStream, SPDFID_WaveFormatEx, sInputFormat.WaveFormatExPtr());
			}
		}
		else
		{
			// Bind the wav file read-only, since it will only be accessed by
			// the SR engine.
			if (SUCCEEDED(hr))
			{
				hr = cpInputStream->BindToFile(sPathToFile.c_str(), SPFM_OPEN_READONLY, &(sInputFormat.FormatId()), sInputFormat.WaveFormatExPtr(), SPFEI_ALL_EVENTS);
			}
		}

		if (SUCCEEDED(hr))
		{
			// Connect wav input to the recognizer. SAPI will negotiate
			// mismatched engine/input audio formats using system audio codecs,
			// so the second parameter is not important - use default of TRUE.
			hr = cpRecoEngine->SetInput(cpInputStream, TRUE);
		}
		// NOTE(review): unlike the microphone branch, the engine is not set to
		// SPRST_ACTIVE here — presumably started elsewhere; confirm callers.
	}
	else // connect to mic
	{
		// Create the default audio-input object.
		CComPtr<ISpAudio> cpAudio;
		if (SUCCEEDED(hr))
		{
			hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio);
		}

		// Set the input for the engine.
		if (SUCCEEDED(hr))
		{
			hr = cpRecoEngine->SetInput(cpAudio, TRUE);
		}

		// Start the engine listening.
		if (SUCCEEDED(hr))
		{
			hr = cpRecoEngine->SetRecoState(SPRST_ACTIVE);
		}
	}

	if (FAILED(hr))
	{
		// Release the grammar on any failure so the object is left inert.
		m_cpDictationGrammar.Release();
	}

	return hr;
}
/********************************************************************************************* * CSimpleDict::InitDialog() * Creates the recognition context and activates the grammar. * Returns TRUE iff successful. **********************************************************************************************/ bool CSimpleDict::InitDialog( HWND hDlg ) { m_hDlg = hDlg; HRESULT hr = S_OK; CComPtr<ISpRecognizer> cpRecoEngine; hr = cpRecoEngine.CoCreateInstance(CLSID_SpInprocRecognizer); if( SUCCEEDED( hr ) ) { hr = cpRecoEngine->CreateRecoContext( &m_cpRecoCtxt ); } // Set recognition notification for dictation if (SUCCEEDED(hr)) { hr = m_cpRecoCtxt->SetNotifyWindowMessage( hDlg, WM_RECOEVENT, 0, 0 ); } if (SUCCEEDED(hr)) { // This specifies which of the recognition events are going to trigger notifications. // Here, all we are interested in is the beginning and ends of sounds, as well as // when the engine has recognized something const ULONGLONG ullInterest = SPFEI(SPEI_RECOGNITION); hr = m_cpRecoCtxt->SetInterest(ullInterest, ullInterest); } // create default audio object CComPtr<ISpAudio> cpAudio; hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio); // set the input for the engine hr = cpRecoEngine->SetInput(cpAudio, TRUE); hr = cpRecoEngine->SetRecoState( SPRST_ACTIVE ); if (SUCCEEDED(hr)) { // Specifies that the grammar we want is a dictation grammar. // Initializes the grammar (m_cpDictationGrammar) hr = m_cpRecoCtxt->CreateGrammar( GID_DICTATION, &m_cpDictationGrammar ); } if (SUCCEEDED(hr)) { hr = m_cpDictationGrammar->LoadDictation(NULL, SPLO_STATIC); } if (SUCCEEDED(hr)) { hr = m_cpDictationGrammar->SetDictationState( SPRS_ACTIVE ); } if (FAILED(hr)) { m_cpDictationGrammar.Release(); } return (hr == S_OK); }
HRESULT SREngine::InitializeSapi(WId hWnd, UINT Msg) { HRESULT hr = S_OK; //FOR ONE NOT FOR ALL /* 独享模式的配置 */ hr = m_cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer); //独享模式 if (FAILED(hr)) { QMessageBox::information(NULL, "Error", "Create recognizer error", MB_OK); return hr; } hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &m_cpAudio); //建立默认的音频输入对象 if (FAILED(hr)) { QMessageBox::information(NULL, "Error", "Create default audio object error", MB_OK); return hr; } hr = m_cpRecognizer->SetInput(m_cpAudio, TRUE); //设置识别引擎输入源 if (FAILED(hr)) { QMessageBox::information(NULL, "Error", "Error setINPUT", MB_OK); return hr; } hr = m_cpRecognizer->CreateRecoContext(&m_cpRecoContext); //创建识别上下文接口 if (FAILED(hr)) { QMessageBox::information(NULL, "Error", "Error CreateRecoContext", MB_OK); return hr; } hr = m_cpRecoContext->SetNotifyWindowMessage((HWND)hWnd, Msg, 0, 0); //设置识别消息 if (FAILED(hr)) { QMessageBox::information(NULL, "Error", "Error SetNotifyWindowMessage", MB_OK); return hr; } const ULONGLONG ullInterest = SPFEI(SPEI_SOUND_START) | SPFEI(SPEI_SOUND_END) | SPFEI(SPEI_PHRASE_START) | SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_FALSE_RECOGNITION) | SPFEI(SPEI_HYPOTHESIS) | SPFEI(SPEI_INTERFERENCE) | SPFEI(SPEI_RECO_OTHER_CONTEXT) | SPFEI(SPEI_REQUEST_UI) | SPFEI(SPEI_RECO_STATE_CHANGE) | SPFEI(SPEI_PROPERTY_NUM_CHANGE) | SPFEI(SPEI_PROPERTY_STRING_CHANGE); hr = m_cpRecoContext->SetInterest(ullInterest, ullInterest); //设置感兴趣的事件 if (FAILED(hr)) { QMessageBox::information(NULL, "Error", "Error set interest", MB_OK); } return hr; }
int main(int argc, char* argv[]) { HRESULT hr = E_FAIL; bool fUseTTS = true; // turn TTS play back on or off bool fReplay = true; // turn Audio replay on or off // Process optional arguments if (argc > 1) { int i; for (i = 1; i < argc; i++) { if (_stricmp(argv[i], "-noTTS") == 0) { fUseTTS = false; continue; } if (_stricmp(argv[i], "-noReplay") == 0) { fReplay = false; continue; } printf ("Usage: %s [-noTTS] [-noReplay] \n", argv[0]); return hr; } } if (SUCCEEDED(hr = ::CoInitialize(NULL))) { { CComPtr<ISpRecoContext> cpRecoCtxt; CComPtr<ISpRecoGrammar> cpGrammar; CComPtr<ISpVoice> cpVoice; hr = cpRecoCtxt.CoCreateInstance(CLSID_SpSharedRecoContext); if(SUCCEEDED(hr)) { hr = cpRecoCtxt->GetVoice(&cpVoice); } if (cpRecoCtxt && cpVoice && SUCCEEDED(hr = cpRecoCtxt->SetNotifyWin32Event()) && SUCCEEDED(hr = cpRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION))) && SUCCEEDED(hr = cpRecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL)) && SUCCEEDED(hr = cpRecoCtxt->CreateGrammar(0, &cpGrammar)) && SUCCEEDED(hr = cpGrammar->LoadDictation(NULL, SPLO_STATIC)) && SUCCEEDED(hr = cpGrammar->SetDictationState(SPRS_ACTIVE))) { USES_CONVERSION; const WCHAR * const pchStop = StopWord(); CComPtr<ISpRecoResult> cpResult; printf( "I will repeat everything you say.\nSay \"%s\" to exit.\n", W2A(pchStop) ); while (SUCCEEDED(hr = BlockForResult(cpRecoCtxt, &cpResult))) { cpGrammar->SetDictationState( SPRS_INACTIVE ); CSpDynamicString dstrText; if (SUCCEEDED(cpResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL))) { printf("I heard: %s\n", W2A(dstrText)); if (fUseTTS) { cpVoice->Speak( L"I heard", SPF_ASYNC, NULL); cpVoice->Speak( dstrText, SPF_ASYNC, NULL ); } if (fReplay) { if (fUseTTS) cpVoice->Speak( L"when you said", SPF_ASYNC, NULL); else printf ("\twhen you said...\n"); cpResult->SpeakAudio(NULL, 0, NULL, NULL); } cpResult.Release(); } if (_wcsicmp(dstrText, pchStop) == 0) { break; } cpGrammar->SetDictationState( SPRS_ACTIVE ); } } } 
::CoUninitialize(); } return hr; }
//音声認識のためのオブジェクトの構築. void RSpeechRecognition::Create(const string & inToWave,const string & inGrammarXML) throw(RComException) { USES_CONVERSION; HRESULT hr; // 認識エンジンオブジェクトの作成 // CLSID_SpSharedRecognizer 共有オブジェクト // CLSID_SpInprocRecognizer アプリ内動作 if ( inToWave.empty() ) { // hr = this->Engine.CoCreateInstance(CLSID_SpSharedRecognizer); // if(FAILED(hr)) throw RComException(hr , "CLSID_SpSharedRecognizer 構築 に失敗"); hr = this->Engine.CoCreateInstance(CLSID_SpInprocRecognizer); if(FAILED(hr)) throw RComException(hr , "CLSID_SpInprocRecognizer 構築 に失敗"); CComPtr<ISpAudio> cpAudio; hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio); if(FAILED(hr)) throw RComException(hr , "SpCreateDefaultObjectFromCategoryId に失敗"); //認識エンジンのエンジンのディフォルトに設定する。 hr = this->Engine->SetInput(cpAudio, TRUE); if(FAILED(hr)) throw RComException(hr , "SetInput に失敗"); // hr = this->Engine->SetRecoState( SPRST_ACTIVE ); // if(FAILED(hr)) throw RComException(hr , "SetRecoState に失敗"); } else { CComPtr<ISpStream> cpStream; hr = this->Engine.CoCreateInstance(CLSID_SpInprocRecognizer); if(FAILED(hr)) throw RComException(hr , "CLSID_SpInprocRecognizer 構築 に失敗"); hr = cpStream.CoCreateInstance(CLSID_SpStream); if(FAILED(hr)) throw RComException(hr , "CoCreateInstance CLSID_SpStream に失敗"); hr = cpStream->BindToFile( A2W( inToWave.c_str() ) , SPFM_OPEN_READONLY , NULL , NULL, SPFEI_ALL_EVENTS); if(FAILED(hr)) throw RComException(hr , "BindToFile に失敗"); hr = this->Engine->SetInput( cpStream, TRUE); if(FAILED(hr)) throw RComException( this->Engine , CLSID_SpSharedRecognizer , hr , "SetInput に失敗"); } // 認識コンテクストオブジェクトの作成 hr = this->Engine->CreateRecoContext(&this->RecoCtxt); if(FAILED(hr)) throw RComException(hr , "CreateRecoContext に失敗"); hr = this->RecoCtxt->SetNotifyWin32Event(); if ( FAILED(hr) ) throw RComException(hr , "SetNotifyWin32Event に失敗"); hr = this->RecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); if ( FAILED(hr) ) throw RComException(hr , "SetInterest に失敗"); 
hr = this->RecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL); if ( FAILED(hr) ) throw RComException(hr , "SetAudioOptions に失敗"); //メインとなる文法の作成 hr = this->RecoCtxt->CreateGrammar(0, &this->DictationGrammar); if ( FAILED(hr) ) throw RComException(hr , "CreateGrammar に失敗"); hr = this->DictationGrammar->LoadDictation(NULL, SPLO_STATIC); if ( FAILED(hr) ) throw RComException(hr , "LoadDictation に失敗"); if ( inGrammarXML.empty() ) { //録音開始 hr = this->DictationGrammar->SetDictationState( SPRS_ACTIVE ); if ( FAILED(hr) ) throw RComException(hr , "SetDictationState に失敗"); } else { //ユーザ指定ファイルからのロード hr = this->DictationGrammar->LoadCmdFromFile( A2W( inGrammarXML.c_str() ) ,SPLO_STATIC); if ( FAILED(hr) ) throw RComException(hr , "LoadCmdFromFile に失敗"); //録音開始 hr = this->DictationGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE ); if ( FAILED(hr) ) throw RComException(hr , "SetRuleState に失敗"); } }
*/ #include "stdafx.h" #include "sapi_lipsync.h" #include "phone_estimate.h" #include "sapi_util.h" /////////////////////////////////////////////////////////////////////////////// /// constants /////////////////////////////////////////////////////////////////////////////// #define GID_LIPSYNC 0 // grammar identifier. /// Interest level for event in SAPI const ULONGLONG ullInterest = SPFEI(SPEI_SOUND_START) | SPFEI(SPEI_SOUND_END) | SPFEI(SPEI_PHRASE_START) | SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_FALSE_RECOGNITION) | SPFEI(SPEI_HYPOTHESIS) | SPFEI(SPEI_INTERFERENCE) | SPFEI(SPEI_RECO_OTHER_CONTEXT) | SPFEI(SPEI_REQUEST_UI) | SPFEI(SPEI_RECO_STATE_CHANGE) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_PROPERTY_NUM_CHANGE) | SPFEI(SPEI_PROPERTY_STRING_CHANGE); /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// // sapi_lipsync class implementation /////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// 初始化语音识别 HRESULT ThisApp::init_speech_recognizer(){ HRESULT hr = S_OK; // 创建语音输入流 if (SUCCEEDED(hr)){ hr = CoCreateInstance(CLSID_SpStream, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), (void**)&m_pSpeechStream);; } // 与我们的Kinect语音输入相连接 if (SUCCEEDED(hr)){ WAVEFORMATEX wft = { WAVE_FORMAT_PCM, // PCM编码 1, // 单声道 16000, // 采样率为16KHz 32000, // 每分钟数据流 = 采样率 * 对齐 2, // 对齐 : 单声道 * 样本深度 = 2byte 16, // 样本深度 16BIT 0 // 额外数据 }; // 设置状态 hr = m_pSpeechStream->SetBaseStream(m_p16BitPCMAudioStream, SPDFID_WaveFormatEx, &wft); } // 创建语音识别对象 if (SUCCEEDED(hr)){ ISpObjectToken *pEngineToken = nullptr; // 创建语言识别器 hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), (void**)&m_pSpeechRecognizer); if (SUCCEEDED(hr)) { // 连接我们创建的语音输入流对象 m_pSpeechRecognizer->SetInput(m_pSpeechStream, TRUE); // 创建待识别语言 这里选择大陆汉语(zh-cn) // 目前没有Kinect的汉语语音识别包 有的话可以设置"language=804;Kinect=Ture" hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"Language=804", nullptr, &pEngineToken); if (SUCCEEDED(hr)) { // 设置待识别语言 m_pSpeechRecognizer->SetRecognizer(pEngineToken); // 创建语音识别上下文 hr = m_pSpeechRecognizer->CreateRecoContext(&m_pSpeechContext); // 适应性 ON! 
防止因长时间的处理而导致识别能力的退化 if (SUCCEEDED(hr)) { hr = m_pSpeechRecognizer->SetPropertyNum(L"AdaptationOn", 0); } } } SafeRelease(pEngineToken); } // 创建语法 if (SUCCEEDED(hr)){ hr = m_pSpeechContext->CreateGrammar(1, &m_pSpeechGrammar); } // 加载静态SRGS语法文件 if (SUCCEEDED(hr)){ hr = m_pSpeechGrammar->LoadCmdFromFile(s_GrammarFileName, SPLO_STATIC); } // 激活语法规则 if (SUCCEEDED(hr)){ hr = m_pSpeechGrammar->SetRuleState(nullptr, nullptr, SPRS_ACTIVE); } // 设置识别器一直读取数据 if (SUCCEEDED(hr)){ hr = m_pSpeechRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS); } // 设置对识别事件感兴趣 if (SUCCEEDED(hr)){ hr = m_pSpeechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); } // 保证语音识别处于激活状态 if (SUCCEEDED(hr)){ hr = m_pSpeechContext->Resume(0); } // 获取识别事件 if (SUCCEEDED(hr)){ m_p16BitPCMAudioStream->SetSpeechState(TRUE); m_hSpeechEvent = m_pSpeechContext->GetNotifyEventHandle(); printf_s("init_speech_recognizer succeeded\n"); } #ifdef _DEBUG else printf_s("init_speech_recognizer failed\n"); #endif return hr; }
/******************************************************************************
* CounterPaneProc *
*-----------------*
*   Description:
*       Handles messages specifically for the counter (order) pane.
*       WM_ESPRESSOORDER carries (in lParam) an array of {ulId, text} pairs
*       terminated by ulId == 0; the pane sorts them, builds the order
*       sentence in g_szCounterDisplay, speaks it, and frees the array.
*       The other messages repaint, re-prompt, or hand off to the office pane.
*
******************************************************************************/
LRESULT CounterPaneProc( HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam )
{
    USES_CONVERSION;
    HRESULT hr;

    switch ( message )
    {
        case WM_ESPRESSOORDER:
        {
            // lParam is a heap array of ID_TEXT handed over by the sender;
            // this handler owns and frees it below.
            _ASSERTE( lParam );
            KillTimer( hWnd, 0 );
            ID_TEXT *pulIds = (ID_TEXT *) lParam;
            int i = 0, ilen = 0;
            TCHAR szTempBuf[NORMAL_LOADSTRING];
            TCHAR szSpace[] = _T(" ");
            int iTemplen;
            g_szCounterDisplay[0] = '\0';

            // Count the entries (array is terminated by ulId == 0).
            while ( 0 != pulIds[i].ulId )
            {
                i++;
            }
            // Selection-sort the entries ascending by ulId so the order words
            // come out in phrase order.
            for ( int j = 0; j < i; j++ )
            {
                int iminIndex = j;
                for ( int k = j; k < i; k++ )
                {
                    if ( pulIds[iminIndex].ulId > pulIds[k].ulId )
                    {
                        iminIndex = k;
                    }
                }
                // Swap element j with the minimum of the unsorted tail.
                ULONG ulId = pulIds[iminIndex].ulId;
                WCHAR *pwstr = pulIds[iminIndex].pwstrCoMemText;
                pulIds[iminIndex].pwstrCoMemText = pulIds[j].pwstrCoMemText;
                pulIds[j].pwstrCoMemText = pwstr;
                pulIds[iminIndex].ulId = pulIds[j].ulId;
                pulIds[j].ulId = ulId;
            }

            i = 0;
            // Put in the first order words if we actually have an order.
            if ( 0 != pulIds[0].ulId )
            {
                iTemplen = LoadString( g_hInst, IDS_ORDERBEGIN, szTempBuf, NORMAL_LOADSTRING );
                lstrcat( g_szCounterDisplay + ilen, szTempBuf );
                ilen += iTemplen;
            }

            // Append each order word, space-separated, tracking ilen so we
            // never overrun g_szCounterDisplay.
            while ( i < MAX_ID_ARRAY && 0 != pulIds[i].ulId )
            {
                TCHAR *pTempStr = W2T( pulIds[i].pwstrCoMemText );
                iTemplen = lstrlen( pTempStr );

                // We'll quit now so we dont overrun the buffer.
                if ( ilen + iTemplen >= MAX_LOADSTRING )
                {
                    break;
                }

                if ( i > 0 )
                {
                    lstrcat( g_szCounterDisplay + ilen, szSpace );
                    ilen += 1;
                }

                lstrcat( g_szCounterDisplay, pTempStr );
                ilen += iTemplen;
                i++;
            }

            // Put the thank you on this order.
            if ( 0 < i )
            {
                iTemplen = LoadString( g_hInst, IDS_ORDEREND, szTempBuf, NORMAL_LOADSTRING );
                if ( ilen + iTemplen < MAX_LOADSTRING )
                {
                    lstrcat( g_szCounterDisplay + ilen, szTempBuf );
                    ilen += iTemplen;
                }
            }

            InvalidateRect( hWnd, NULL, TRUE );
            // Timer 0 reverts the display to the prompt after TIMEOUT ms.
            SetTimer( hWnd, 0, TIMEOUT, NULL );

            // Speak the order.
            g_cpVoice->Speak( T2W(g_szCounterDisplay), SPF_ASYNC, NULL);

            // Delete the CoTaskMem we were given initially by ISpPhrase->GetText.
            i = 0;
            while ( i < MAX_ID_ARRAY && 0 != pulIds[i].ulId )
            {
                CoTaskMemFree( pulIds[i].pwstrCoMemText );
                i++;
            }
            delete [] pulIds;

            return ( 1 );
        }

        case WM_PAINT:
            CounterPanePaint( hWnd, g_szCounterDisplay );
            return ( 1 );

        case WM_INITPANE:
            LoadString( g_hInst, IDS_PLEASEORDER, g_szCounterDisplay, MAX_LOADSTRING );
            // Set the rule recognizing an espresso order to active, now that we are ready for it.
            g_cpCmdGrammar->SetRuleIdState( VID_EspressoDrinks, SPRS_ACTIVE );
            // Set our interests to include false recognitions.
            hr = g_cpRecoCtxt->SetInterest( SPFEI(SPEI_RECOGNITION)|SPFEI(SPEI_FALSE_RECOGNITION),
                                            SPFEI(SPEI_RECOGNITION)|SPFEI(SPEI_FALSE_RECOGNITION) );
            _ASSERTE( SUCCEEDED( hr ) );
            // Speak the welcome string.
            g_cpVoice->Speak( T2W(g_szCounterDisplay), SPF_ASYNC, NULL);
            return ( 1 );

        case WM_TIMER:
            // Revert back to 'go ahead and order' message.
            LoadString( g_hInst, IDS_PLEASEORDER, g_szCounterDisplay, MAX_LOADSTRING );
            InvalidateRect( hWnd, NULL, TRUE );
            // Speak the welcome string.
            g_cpVoice->Speak( T2W(g_szCounterDisplay), SPF_ASYNC, NULL);
            KillTimer( hWnd, 0 );
            return ( 1 );

        case WM_GOTOOFFICE:
            KillTimer( hWnd, 0 );
            // Set the rule recognizing an espresso order to inactive,
            // since you cant order from the office.
            g_cpCmdGrammar->SetRuleIdState( VID_EspressoDrinks, SPRS_INACTIVE );
            // Set our interests to include only recognitions.
            hr = g_cpRecoCtxt->SetInterest( SPFEI(SPEI_RECOGNITION),SPFEI(SPEI_RECOGNITION) );
            _ASSERTE( SUCCEEDED( hr ) );
            // Set the right message handler and repaint.
            g_fpCurrentPane = OfficePaneProc;
            PostMessage( hWnd, WM_INITPANE, NULL, NULL );
            InvalidateRect( hWnd, NULL, TRUE );
            return ( 1 );

        case WM_DIDNTUNDERSTAND:
            KillTimer( hWnd, 0 );
            LoadString( g_hInst, IDS_DIDNTUNDERSTAND, g_szCounterDisplay, MAX_LOADSTRING );
            InvalidateRect( hWnd, NULL, TRUE );
            // Speak the didn't understand string.
            g_cpVoice->Speak( T2W(g_szCounterDisplay), SPF_ASYNC, NULL);
            SetTimer( hWnd, 0, TIMEOUT, NULL );
            return ( 1 );
    }

    return ( 0 );
}
//-----------------------------------------------------------------------------
// Purpose: Given a wave file and a string of words "text", creates a CFG from the
//  sentence and stores the resulting words/phonemes in CSentence
// Input  : *wavname -
//			text -
//			sentence -
//			(*pfnPrint -
// Output : SR_RESULT — SR_RESULT_SUCCESS on a full recognition,
//          SR_RESULT_FAILED if the engine reported a false recognition,
//          SR_RESULT_ERROR on any setup error.
//-----------------------------------------------------------------------------
SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
{
	// Assume failure
	SR_RESULT result = SR_RESULT_ERROR;

	if ( text.Length() <= 0 )
	{
		pfnPrint( "Error: no rule / text specified\n" );
		return result;
	}

	USES_CONVERSION;
	HRESULT hr;

	CUtlVector < WORDRULETYPE > wordRules;

	CComPtr<ISpStream> cpInputStream;
	CComPtr<ISpRecognizer> cpRecognizer;
	CComPtr<ISpRecoContext> cpRecoContext;
	CComPtr<ISpRecoGrammar> cpRecoGrammar;
	CComPtr<ISpPhoneConverter>  cpPhoneConv;

	// Create basic SAPI stream object
	// NOTE: The helper SpBindToFile can be used to perform the following operations
	hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Stream object not installed?\n" );
		return result;
	}

	CSpStreamFormat sInputFormat;

	// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
	//   for read-only access, since it will only be access by the SR engine
	hr = cpInputStream->BindToFile(
		T2W(wavname),
		SPFM_OPEN_READONLY,
		NULL,
		sInputFormat.WaveFormatExPtr(),
		SPFEI_ALL_EVENTS );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: couldn't open wav file %s\n", wavname );
		return result;
	}

	// Create in-process speech recognition engine
	hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 In process recognizer object not installed?\n" );
		return result;
	}

	// Create recognition context to receive events
	hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to create recognizer context\n" );
		return result;
	}

	// Create a grammar
	hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to create recognizer grammar\n" );
		return result;
	}

	LANGID englishID = 0x409; // 1033 decimal

	bool userSpecified = false;
	LANGID langID = SpGetUserDefaultUILanguage();

	// Allow commandline override
	if ( CommandLine()->FindParm( "-languageid" ) != 0 )
	{
		userSpecified = true;
		langID = CommandLine()->ParmValue( "-languageid", langID );
	}

	// Create a phoneme converter ( so we can convert to IPA codes )
	hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
	if ( FAILED( hr ) )
	{
		// Fall back to English if the requested language has no converter.
		if ( langID != englishID )
		{
			if ( userSpecified )
			{
				pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
			}
			else
			{
				pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
			}

			// Try english!!!
			langID = englishID;
			hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
		}

		if ( FAILED( hr ) )
		{
			pfnPrint( "Error: SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
			return result;
		}
		else
		{
			pfnPrint( "Note: SAPI 5.1 Falling back to use english -languageid %i\n", langID );
		}
	}
	else if ( userSpecified )
	{
		pfnPrint( "Note: SAPI 5.1 Using user specified -languageid %i\n",langID );
	}

	SPSTATEHANDLE hStateRoot;
	// create/re-create Root level rule of grammar
	hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to create root rule\n" );
		return result;
	}

	// Inactivate it so we can alter it
	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to deactivate grammar rules\n" );
		return result;
	}

	// Create the rule set from the words in text
	{
		CSpDynamicString currentWord;

		WCHAR *pos = ( WCHAR * )text;
		WCHAR str[ 2 ];
		str[1]= 0;

		// Tokenize the text on spaces; each word becomes a grammar rule.
		while ( *pos )
		{
			if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
			{
				// Add word to rule set
				if ( currentWord.Length() > 0 )
				{
					AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
					currentWord.Clear();
				}
				pos++;
				continue;
			}

			// Skip anything that's inside a [ xxx ] pair.
			if ( *pos == L'[' )
			{
				while ( *pos && *pos != L']' )
				{
					pos++;
				}
				if ( *pos )
				{
					pos++;
				}
				continue;
			}

			// Accumulate the current word one character at a time.
			str[ 0 ] = *pos;
			currentWord.Append( str );
			pos++;
		}

		// Flush the final word.
		if ( currentWord.Length() > 0 )
		{
			AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
		}

		if ( wordRules.Size() <= 0 )
		{
			pfnPrint( "Error: Text %s contained no usable words\n", text );
			return result;
		}

		// Build all word to word transitions in the grammar
		if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
		{
			pfnPrint( "Error: Rule set for %s could not be generated\n", text );
			return result;
		}
	}

	// check for recognitions and end of stream event
	const ULONGLONG ullInterest =
		SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) |
		SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
	hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to set interest level\n" );
		return result;
	}

	// use Win32 events for command-line style application
	hr = cpRecoContext->SetNotifyWin32Event();
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to set win32 notify event\n" );
		return result;
	}

	// connect wav input to recognizer
	// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
	hr = cpRecognizer->SetInput(cpInputStream, TRUE);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to associate input stream\n" );
		return result;
	}

	// Activate the CFG ( rather than using dictation )
	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
	if ( FAILED( hr ) )
	{
		// Decode the documented SetRuleState failure codes for the user.
		switch ( hr )
		{
		case E_INVALIDARG:
			pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
			break;
		case SP_STREAM_UNINITIALIZED:
			pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
			break;
		case SPERR_UNINITIALIZED:
			pfnPrint( "The object has not been properly initialized.\n");
			break;
		case SPERR_UNSUPPORTED_FORMAT:
			pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
			break;
		case SPERR_NOT_TOPLEVEL_RULE:
			pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
			break;
		default:
			pfnPrint( "Unknown error\n" );
			break;
		}
		pfnPrint( "Error: SAPI 5.1 Unable to activate rule set\n" );
		return result;
	}

	// while events occur, continue processing
	// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
	BOOL fEndStreamReached = FALSE;
	while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
	{
		CSpEvent spEvent;

		// pull all queued events from the reco context's event queue
		while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
		{
			// Check event type
			switch (spEvent.eEventId)
			{
			case SPEI_INTERFERENCE:
				{
					// Report audio-quality problems to the caller's printer.
					SPINTERFERENCE interference = spEvent.Interference();

					switch ( interference )
					{
					case SPINTERFERENCE_NONE:
						pfnPrint( "[ I None ]\r\n" );
						break;
					case SPINTERFERENCE_NOISE:
						pfnPrint( "[ I Noise ]\r\n" );
						break;
					case SPINTERFERENCE_NOSIGNAL:
						pfnPrint( "[ I No Signal ]\r\n" );
						break;
					case SPINTERFERENCE_TOOLOUD:
						pfnPrint( "[ I Too Loud ]\r\n" );
						break;
					case SPINTERFERENCE_TOOQUIET:
						pfnPrint( "[ I Too Quiet ]\r\n" );
						break;
					case SPINTERFERENCE_TOOFAST:
						pfnPrint( "[ I Too Fast ]\r\n" );
						break;
					case SPINTERFERENCE_TOOSLOW:
						pfnPrint( "[ I Too Slow ]\r\n" );
						break;
					default:
						break;
					}
				}
				break;
			case SPEI_PHRASE_START:
				pfnPrint( "Phrase Start\r\n" );
				sentence.MarkNewPhraseBase();
				break;

			case SPEI_HYPOTHESIS:
			case SPEI_RECOGNITION:
			case SPEI_FALSE_RECOGNITION:
				{
					CComPtr<ISpRecoResult> cpResult;
					cpResult = spEvent.RecoResult();

					CSpDynamicString dstrText;
					if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
					{
						dstrText = L"(Unrecognized)";

						result = SR_RESULT_FAILED;

						// It's possible that the failed recog might have more words, so see if that's the case
						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
					}
					else
					{
						// Hypothesis or recognition success
						cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);

						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );

						if ( spEvent.eEventId == SPEI_RECOGNITION )
						{
							result = SR_RESULT_SUCCESS;
						}

						pfnPrint( va( "%s%s\r\n", spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "", dstrText.CopyToChar() ) );
					}

					cpResult.Release();
				}
				break;
				// end of the wav file was reached by the speech recognition engine
			case SPEI_END_SR_STREAM:
				fEndStreamReached = TRUE;
				break;
			}

			// clear any event data/object references
			spEvent.Clear();
		}// END event pulling loop - break on empty event queue OR end stream
	}// END event polling loop - break on event timeout OR end stream

	// Deactivate rule
	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to deactivate rule set\n" );
		return result;
	}

	// close the input stream, since we're done with it
	// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
	hr = cpInputStream->Close();
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: SAPI 5.1 Unable to close input stream\n" );
		return result;
	}

	return result;
}
/// <summary>
/// Ad-hoc SAPI test routine that exercises several scenarios back to back:
/// (1) in-process recognition from the default microphone with retained audio
///     spoken back through TTS,
/// (2) TTS synthesis of "Hello World" into c:\ttstemp.wav,
/// (3) programmatic grammar construction via ISpGrammarBuilder,
/// (4) a second recognition setup reading from Test.wav, and
/// (5) enumeration of installed female voices speaking an XML prompt.
/// NOTE(review): this function contains several defects which are flagged
/// inline below rather than fixed, because the intended fix direction is
/// ambiguous in places (e.g. how cpGrammarBuilder was meant to be obtained).
/// </summary>
void Sound::test() {
    ISpVoice * pVoice = NULL;
    ISpObjectToken* pVoiceToken=nullptr;
    IEnumSpObjectTokens* pEnum;  // NOTE(review): never Release()d — leaked on every exit path below.
    ULONG ulCount = 0;

    // COM must be initialized before any SAPI object is created.
    if (FAILED(::CoInitialize(NULL))) {
        return;
    }

    HRESULT hr = S_OK;

    // Find the best matching installed en-us recognizer.
    CComPtr<ISpObjectToken> cpRecognizerToken;
    if (SUCCEEDED(hr)) {
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken);
    }

    // Create the in-process recognizer and immediately set its state to inactive
    // so nothing is recognized until the grammar and input are fully configured.
    CComPtr<ISpRecognizer> cpRecognizer;
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
    }
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
    }
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
    }

    // Create a new recognition context from the recognizer.
    CComPtr<ISpRecoContext> cpContext;
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->CreateRecoContext(&cpContext);
    }

    // Subscribe to the speech recognition event.
    // NOTE(review): despite the original comment mentioning the end-stream
    // event, only SPEI_RECOGNITION is registered here.
    if (SUCCEEDED(hr)) {
        ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION);
        hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
    }

    // Establish a Win32 event to signal when speech events are available.
    HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;
    if (SUCCEEDED(hr)) {
        hr = cpContext->SetNotifyWin32Event();
    }
    if (SUCCEEDED(hr)) {
        hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();
        if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent) {
            // Notification handle unsupported.
            hr = E_NOINTERFACE;
        }
    }

    // Initialize an audio object to use the default audio input of the system
    // and set the recognizer to use it.
    CComPtr<ISpAudio> cpAudioIn;
    if (SUCCEEDED(hr)) {
        hr = cpAudioIn.CoCreateInstance(CLSID_SpMMAudioIn);
    }
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetInput(cpAudioIn, TRUE);
    }

    // Populate a WAVEFORMATEX struct with our desired retained-audio format.
    WAVEFORMATEX* pWfexCoMemRetainedAudioFormat = NULL;  // CoTaskMem-allocated by SpConvertStreamFormatEnum.
    GUID guidRetainedAudioFormat = GUID_NULL;
    if (SUCCEEDED(hr)) {
        hr = SpConvertStreamFormatEnum(SPSF_16kHz16BitMono, &guidRetainedAudioFormat, &pWfexCoMemRetainedAudioFormat);
    }

    // Instruct the recognizer to retain the audio from its recognition results.
    if (SUCCEEDED(hr)) {
        hr = cpContext->SetAudioOptions(SPAO_RETAIN_AUDIO, &guidRetainedAudioFormat, pWfexCoMemRetainedAudioFormat);
    }
    if (NULL != pWfexCoMemRetainedAudioFormat) {
        CoTaskMemFree(pWfexCoMemRetainedAudioFormat);
    }

    // Create a new grammar and load an SRGS grammar from file.
    CComPtr<ISpRecoGrammar> cpGrammar;
    if (SUCCEEDED(hr)) {
        hr = cpContext->CreateGrammar(0, &cpGrammar);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);
    }

    // Set all top-level rules in the new grammar to the active state.
    if (SUCCEEDED(hr)) {
        hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
    }

    // Set the recognizer state to active to begin recognition.
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
    }

    // Establish a separate Win32 event to signal the event loop exit.
    // NOTE(review): hExitEvent is never signaled by anything (the loop only
    // exits via timeout) and is never CloseHandle()d — leaked.
    HANDLE hExitEvent = CreateEventW(NULL, FALSE, FALSE, NULL);

    // Collect the events listened for to pump the speech event loop.
    HANDLE rghEvents[] = { hSpeechNotifyEvent, hExitEvent };

    // Speech recognition event loop.
    BOOL fContinue = TRUE;
    while (fContinue && SUCCEEDED(hr)) {
        // Wait for either a speech event or an exit event, with a 15 second timeout.
        DWORD dwMessage = WaitForMultipleObjects(sp_countof(rghEvents), rghEvents, FALSE, 15000);
        switch (dwMessage) {
            // With the WaitForMultipleObjects call above, WAIT_OBJECT_0 is a
            // speech event from hSpeechNotifyEvent.
            case WAIT_OBJECT_0:
            {
                // Sequentially grab the available speech events from the speech
                // event queue.
                CSpEvent spevent;
                while (S_OK == spevent.GetFrom(cpContext)) {
                    switch (spevent.eEventId) {
                        case SPEI_RECOGNITION:
                        {
                            // Retrieve the recognition result and output the text of that result.
                            ISpRecoResult* pResult = spevent.RecoResult();
                            LPWSTR pszCoMemResultText = NULL;
                            hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pszCoMemResultText, NULL);
                            if (SUCCEEDED(hr)) {
                                wprintf(L"Recognition event received, text=\"%s\"\r\n", pszCoMemResultText);
                            }

                            // Also retrieve the retained audio we requested.
                            CComPtr<ISpStreamFormat> cpRetainedAudio;
                            if (SUCCEEDED(hr)) {
                                hr = pResult->GetAudio(0, 0, &cpRetainedAudio);
                            }

                            // To demonstrate, we'll speak the retained audio back using ISpVoice.
                            CComPtr<ISpVoice> cpVoice;
                            if (SUCCEEDED(hr)) {
                                hr = cpVoice.CoCreateInstance(CLSID_SpVoice);
                            }
                            if (SUCCEEDED(hr)) {
                                hr = cpVoice->SpeakStream(cpRetainedAudio, SPF_DEFAULT, 0);
                            }

                            if (NULL != pszCoMemResultText) {
                                CoTaskMemFree(pszCoMemResultText);
                            }
                            break;
                        }
                    }
                }
                break;
            }
            case WAIT_OBJECT_0 + 1:
            case WAIT_TIMEOUT:
            {
                // Exit event or timeout; discontinue the speech loop.
                fContinue = FALSE;
                //break;  // NOTE(review): harmless only because this is the last case.
            }
        }
    }

    // NOTE(review): CoUninitialize is called here AND again at the end of the
    // function, but CoInitialize was called only once — unbalanced. All of the
    // SAPI CComPtrs above are still alive at this point, so tearing COM down
    // here is unsafe; one of the two calls should be removed.
    CoUninitialize();

    // --- Scenario 2: synthesize "Hello World" into a .wav file. ---
    CComPtr <ISpVoice> cpVoice;
    CComPtr <ISpStream> cpStream;
    CSpStreamFormat cAudioFmt;

    // Create a SAPI Voice.
    hr = cpVoice.CoCreateInstance(CLSID_SpVoice);

    // Set the audio format.
    if (SUCCEEDED(hr)) {
        hr = cAudioFmt.AssignFormat(SPSF_22kHz16BitMono);
    }

    // Call SPBindToFile, a SAPI helper method, to bind the audio stream to the file.
    if (SUCCEEDED(hr)) {
        hr = SPBindToFile(L"c:\\ttstemp.wav", SPFM_CREATE_ALWAYS, &cpStream, &cAudioFmt.FormatId(), cAudioFmt.WaveFormatExPtr());
    }

    // Set the output to cpStream so that the output audio data will be stored in cpStream.
    if (SUCCEEDED(hr)) {
        hr = cpVoice->SetOutput(cpStream, TRUE);
    }

    // Speak the text "hello world" synchronously.
    if (SUCCEEDED(hr)) {
        hr = cpVoice->Speak(L"Hello World", SPF_DEFAULT, NULL);
    }

    // Close the stream.
    if (SUCCEEDED(hr)) {
        hr = cpStream->Close();
    }

    // Release the stream and voice object.
    cpStream.Release();
    cpVoice.Release();

    // --- Scenario 3: build a grammar programmatically. ---
    // NOTE(review): cpGrammarBuilder is a default-constructed (null) CComPtr and
    // is never attached to any object before use — the GetRule call below
    // dereferences a null pointer and will crash. ISpGrammarBuilder is normally
    // obtained from an ISpRecoGrammar (which inherits it); presumably this was
    // meant to reuse cpGrammar — TODO confirm intended source and fix.
    CComPtr<ISpGrammarBuilder> cpGrammarBuilder;
    SPSTATEHANDLE hStateTravel;

    // Create (if rule does not already exist) top-level Rule, defaulting to Active.
    hr = cpGrammarBuilder->GetRule(L"Travel", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateTravel);

    // Approach 1: List all possible phrases.
    // This is the most intuitive approach, and it does not sacrifice efficiency
    // because the grammar builder will merge shared sub-phrases when possible.
    // There is only one root state, hStateTravel, and the terminal NULL state,
    // and there are six unique transitions between root state and NULL state.
    /* XML Approximation:
        <rule id="Travel">
            <item> fly to Seattle </item>
            <item> fly to New York </item>
            <item> fly to Washington DC </item>
            <item> drive to Seattle </item>
            <item> drive to New York </item>
            <item> drive to Washington DC </item>
        </rule>
    */
    // Create set of peer phrases, each containing complete phrase.
    // Note: the word delimiter is set as " ", so that the text we
    // attach to the transition can be multiple words (for example,
    // "fly to Seattle" is implicitly "fly" + "to" + "Seattle"):
    if (SUCCEEDED(hr)) {
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to New York", L" ", SPWT_LEXICAL, 1, NULL);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to New York", L" ", SPWT_LEXICAL, 1, NULL);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
    }

    // --- Scenario 4: recognition from a .wav file. ---
    // NOTE(review): this section reuses cpRecognizerToken, cpRecognizer,
    // cpContext and cpGrammar, which already hold interfaces from scenario 1.
    // CComPtr::operator& and CComPtr::CoCreateInstance assert on a non-NULL
    // pointer in debug builds and would leak/misbehave in release builds; each
    // of these CComPtrs should be Release()d (or fresh locals used) first.
    //CComPtr<ISpObjectToken> cpRecognizerToken;
    if (SUCCEEDED(hr)) {
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken);
    }

    // Create the in-process recognizer and immediately set its state to inactive.
    //CComPtr<ISpRecognizer> cpRecognizer;
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
    }
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
    }
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
    }

    // Create a new recognition context from the recognizer.
    //CComPtr<ISpRecoContext> cpContext;
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->CreateRecoContext(&cpContext);
    }

    // Subscribe to the speech recognition event and end stream event.
    if (SUCCEEDED(hr)) {
        ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM);
        hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
    }

    // Establish a Win32 event to signal when speech events are available.
    //HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;
    if (SUCCEEDED(hr)) {
        hr = cpContext->SetNotifyWin32Event();
    }
    // NOTE(review): duplicated call — SetNotifyWin32Event is invoked twice in a
    // row; the second call is redundant.
    if (SUCCEEDED(hr)) {
        hr = cpContext->SetNotifyWin32Event();
    }
    if (SUCCEEDED(hr)) {
        hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();
        if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent) {
            // Notification handle unsupported.
            // NOTE(review): unlike scenario 1, hr is NOT set to a failure code
            // here, so setup continues without a usable notification handle.
            //hr = SPERR_UNITIALIZED;
        }
    }

    // Set up an audio input stream using a .wav file and set the recognizer's input.
    CComPtr<ISpStream> cpInputStream;
    if (SUCCEEDED(hr)) {
        hr = SPBindToFile(L"Test.wav", SPFM_OPEN_READONLY, &cpInputStream);
    }
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetInput(cpInputStream, TRUE);
    }

    // Create a new grammar and load an SRGS grammar from file.
    //CComPtr<ISpRecoGrammar> cpGrammar;
    if (SUCCEEDED(hr)) {
        hr = cpContext->CreateGrammar(0, &cpGrammar);
    }
    if (SUCCEEDED(hr)) {
        hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);
    }

    // Set all top-level rules in the new grammar to the active state.
    if (SUCCEEDED(hr)) {
        hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
    }

    // Finally, set the recognizer state to active to begin recognition.
    if (SUCCEEDED(hr)) {
        hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
    }

    // --- Scenario 5: enumerate installed female voices and speak an XML prompt. ---
    hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&pVoice);
    if (SUCCEEDED(hr)) {
        hr = SpEnumTokens(SPCAT_VOICES, L"Gender=Female", NULL, &pEnum);
        if (SUCCEEDED(hr)) {
            // Get the number of voices.
            hr = pEnum->GetCount(&ulCount);
        }

        // Obtain a list of available voice tokens, set
        // the voice to the token, and call Speak.
        while (SUCCEEDED(hr) && ulCount--) {
            if (pVoiceToken != nullptr) {
                pVoiceToken->Release();
            }
            if (SUCCEEDED(hr)) {
                hr = pEnum->Next(1, &pVoiceToken, NULL);
            }
            if (SUCCEEDED(hr)) {
                hr = pVoice->SetVoice(pVoiceToken);
            }
            if (SUCCEEDED(hr)) {
                wchar_t* start = L"<?xml version=\"1.0\" encoding=\"ISO - 8859 - 1\"?><speak version = \"1.0\" xmlns = \"http://www.w3.org/2001/10/synthesis\" xml:lang = \"en-US\">";
                wchar_t* end = L"</speak>";
                const wchar_t *xml = L"<voice required = \"Gender=Male\"> hi! <prosody pitch=\"fast\"> This is low pitch. </prosody><prosody volume=\"x - loud\"> This is extra loud volume. </prosody>";
                // NOTE(review): the composed string s (start + xml + end) is
                // built but never used — Speak is passed the bare xml fragment
                // instead. Presumably s.c_str() was intended; TODO confirm.
                wstring s = start;
                s += xml;
                s += end;
                // NOTE(review): SPF_ASYNC means Speak returns immediately; the
                // next loop iteration may Release the token and re-call SetVoice
                // while the previous utterance is still playing.
                hr = pVoice->Speak(xml, SPF_IS_XML| SPF_ASYNC, 0);
                //hr = pVoice->Speak(L"How are you?", SPF_DEFAULT, NULL);
            }
        }
        /*
        if (SUCCEEDED(hr)) {
            hr = pEnum->Next(1, &pVoiceToken, NULL);
            if (SUCCEEDED(hr)) {
                hr = pVoice->SetVoice(pVoiceToken);
                // Set the output to the default audio device.
                if (SUCCEEDED(hr)) {
                    hr = pVoice->SetOutput(NULL, TRUE);
                    if (SUCCEEDED(hr)) {
                        hr = pVoice->Speak(L"Hello, world!", SPF_DEFAULT, 0);
                    }
                }
            }
        }
        */
        // NOTE(review): pVoiceToken (last iteration) and pEnum are never
        // Release()d here — leaked.
        pVoice->Release();
    }
    ::CoUninitialize();
}
// Kinect (v1) speech-recognition demo: captures the Kinect microphone array,
// routes it through a SAPI in-process recognizer loaded with a Japanese SRGS
// grammar, and paints an OpenCV window red/green/blue when the words
// "aka"/"midori"/"ao" are recognized; "owari" (or Esc) exits.
// Returns 0 on success, -1 if Kinect initialization fails.
// NOTE(review): most COM/SAPI calls below ignore their HRESULTs, and
// pSpeechStream / pSpeechRecognizer / pSpeechContext / pSpeechGrammar /
// audioStream / pSensor are never Released/deleted before return — acceptable
// for a sample, but worth fixing for production use.
int _tmain( int argc, _TCHAR* argv[] )
{
	cv::setUseOptimized( true );

	// Create and initialize the Kinect sensor instance.
	INuiSensor* pSensor;
	HRESULT hResult = S_OK;
	hResult = NuiCreateSensorByIndex( 0, &pSensor );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiCreateSensorByIndex" << std::endl;
		return -1;
	}

	hResult = pSensor->NuiInitialize( NUI_INITIALIZE_FLAG_USES_AUDIO );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiInitialize" << std::endl;
		return -1;
	}

	// Initialize the audio stream (InitializeAudioStream).
	std::cout << "InitializeAudioStream" << std::endl;
	INuiAudioBeam* pNuiAudioSource;
	hResult = pSensor->NuiGetAudioSource( &pNuiAudioSource );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiGetAudioSource" << std::endl;
		return -1;
	}

	IMediaObject* pMediaObject = nullptr;
	IPropertyStore* pPropertyStore = nullptr;
	pNuiAudioSource->QueryInterface( IID_IMediaObject, reinterpret_cast<void**>( &pMediaObject ) );
	pNuiAudioSource->QueryInterface( IID_IPropertyStore, reinterpret_cast<void**>( &pPropertyStore ) );

	// Configure the audio DMO's AEC system mode.
	// 4 is presumably OPTIBEAM_ARRAY_ONLY (mic-array beamforming, no AEC) —
	// TODO confirm against the MFPKEY_WMAAECMA_SYSTEM_MODE documentation.
	PROPVARIANT propvariant;
	PropVariantInit( &propvariant );
	propvariant.vt = VT_I4;
	propvariant.lVal = static_cast<LONG>( 4 );
	pPropertyStore->SetValue( MFPKEY_WMAAECMA_SYSTEM_MODE, propvariant );
	PropVariantClear( &propvariant );

	// Describe the DMO's PCM output format (constants defined elsewhere in this file).
	WAVEFORMATEX waveFormat = { AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0 };
	DMO_MEDIA_TYPE mediaType = { 0 };
	// MoInitMediaType allocates sizeof(WAVEFORMATEX) bytes for pbFormat,
	// which the memcpy below fills in.
	MoInitMediaType( &mediaType, sizeof( WAVEFORMATEX ) );
	mediaType.majortype = MEDIATYPE_Audio;
	mediaType.subtype = MEDIASUBTYPE_PCM;
	mediaType.lSampleSize = 0;
	mediaType.bFixedSizeSamples = true;
	mediaType.bTemporalCompression = false;
	mediaType.formattype = FORMAT_WaveFormatEx;
	memcpy( mediaType.pbFormat, &waveFormat, sizeof( WAVEFORMATEX ) );

	pMediaObject->SetOutputType( 0, &mediaType, 0 );

	// Wrap the DMO in an IStream adapter so SAPI can read from it.
	KinectAudioStream* audioStream = new KinectAudioStream( pMediaObject );
	IStream* pStream = nullptr;
	audioStream->QueryInterface( IID_IStream, reinterpret_cast<void**>( &pStream ) );

	// NOTE(review): CoInitialize is called only here, after the Kinect/DMO
	// interfaces above were already obtained — presumably the Kinect runtime
	// initializes COM internally; verify.
	CoInitialize( nullptr );
	ISpStream* pSpeechStream = nullptr;
	CoCreateInstance( CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), reinterpret_cast<void**>( &pSpeechStream ) );
	pSpeechStream->SetBaseStream( pStream, SPDFID_WaveFormatEx, &waveFormat );

	MoFreeMediaType( &mediaType );
	pStream->Release();
	pPropertyStore->Release();
	pMediaObject->Release();
	pNuiAudioSource->Release();

	// Create the speech recognizer (CreateSpeechRecognizer).
	std::cout << "CreateSpeechRecognizer" << std::endl;
	ISpRecognizer* pSpeechRecognizer;
	CoCreateInstance( CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), reinterpret_cast<void**>( &pSpeechRecognizer ) );
	pSpeechRecognizer->SetInput( pSpeechStream, false );

	/*
	// If can use ATL, easier to using SpFindBestToken(sphelper.h). When using Professional or more.
	ISpObjectToken* pEngineToken = nullptr;
	SpFindBestToken( SPCAT_RECOGNIZERS, L"Language=411;Kinect=True", NULL, &pEngineToken ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	*/

	///*
	// If can't use ATL, alternative to using SpFIndBestToken(sphelper.h). When using Express.
	// Manually enumerate recognizer tokens matching the Kinect Japanese engine,
	// preferring the vendor's own engine ("VendorPreferred").
	const wchar_t* pVendorPreferred = L"VendorPreferred";
	const unsigned long lengthVendorPreferred = static_cast<unsigned long>( wcslen( pVendorPreferred ) );
	unsigned long length;
	ULongAdd( lengthVendorPreferred, 1, &length );
	wchar_t* pAttribsVendorPreferred = new wchar_t[ length ];
	StringCchCopyW( pAttribsVendorPreferred, length, pVendorPreferred );

	ISpObjectTokenCategory* pTokenCategory = nullptr;
	CoCreateInstance( CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, __uuidof(ISpObjectTokenCategory), reinterpret_cast<void**>( &pTokenCategory ) );
	pTokenCategory->SetId( SPCAT_RECOGNIZERS, false );

	IEnumSpObjectTokens* pEnumTokens = nullptr;
	// NOTE(review): CLSID_SpMMAudioEnum looks like the wrong CLSID for an
	// ISpObjectTokens enumerator; the pointer is immediately overwritten by
	// EnumTokens below anyway (leaking the first object) — verify.
	CoCreateInstance( CLSID_SpMMAudioEnum, nullptr, CLSCTX_ALL, __uuidof(IEnumSpObjectTokens), reinterpret_cast<void**>( &pEnumTokens ) );
	pTokenCategory->EnumTokens( L"Language=411;Kinect=True", pAttribsVendorPreferred, &pEnumTokens ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	delete[] pAttribsVendorPreferred;

	ISpObjectToken* pEngineToken = nullptr;
	pEnumTokens->Next( 1, &pEngineToken, nullptr );
	//*/

	pSpeechRecognizer->SetRecognizer( pEngineToken );
	ISpRecoContext* pSpeechContext;
	pSpeechRecognizer->CreateRecoContext( &pSpeechContext );

	pEngineToken->Release();
	///*
	pTokenCategory->Release();
	pEnumTokens->Release();
	//*/

	// Create the speech recognition grammar (LoadSpeechGrammar).
	std::cout << "LoadSpeechGrammar" << std::endl;
	ISpRecoGrammar* pSpeechGrammar;
	pSpeechContext->CreateGrammar( 1, &pSpeechGrammar );
	pSpeechGrammar->LoadCmdFromFile( L"SpeechRecognition_Ja.grxml", /*SPLO_STATIC*/SPLO_DYNAMIC ); // http://www.w3.org/TR/speech-grammar/ (UTF-8/CRLF)

	// Start capturing audio and enable recognition.
	audioStream->StartCapture();
	pSpeechGrammar->SetRuleState( nullptr, nullptr, SPRS_ACTIVE );
	pSpeechRecognizer->SetRecoState( SPRST_ACTIVE_ALWAYS );
	pSpeechContext->SetInterest( SPFEI( SPEI_RECOGNITION ), SPFEI( SPEI_RECOGNITION ) );
	pSpeechContext->Resume( 0 );

	// SAPI owns this event handle; we only wait on it (do not CloseHandle it
	// while the context is alive).
	HANDLE hSpeechEvent = INVALID_HANDLE_VALUE;
	hSpeechEvent = pSpeechContext->GetNotifyEventHandle();
	HANDLE hEvents[1] = { hSpeechEvent };

	int width = 640;
	int height = 480;
	cv::Mat audioMat = cv::Mat::zeros( height, width, CV_8UC3 );
	cv::namedWindow( "Audio" );

	bool exit = false;

	std::cout << std::endl << "Speech Recognition Start..." << std::endl << std::endl;

	while( 1 ){
		// Wait for a speech event.
		// NOTE(review): manually ResetEvent-ing a handle owned by SAPI is
		// questionable — SAPI normally manages this event itself; verify.
		ResetEvent( hSpeechEvent );
		unsigned long waitObject = MsgWaitForMultipleObjectsEx( ARRAYSIZE( hEvents ), hEvents, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE );

		if( waitObject == WAIT_OBJECT_0 ){
			// Fetch queued speech events one at a time.
			const float confidenceThreshold = 0.3f;
			SPEVENT eventStatus;
			unsigned long eventFetch = 0;
			pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			while( eventFetch > 0 ){
				switch( eventStatus.eEventId ){
					// Recognition events (SPEI_HYPOTHESIS: tentative, SPEI_RECOGNITION: final).
					case SPEI_HYPOTHESIS:
					case SPEI_RECOGNITION:
						if( eventStatus.elParamType == SPET_LPARAM_IS_OBJECT ){
							// Retrieve the recognized phrase.
							ISpRecoResult* pRecoResult = reinterpret_cast<ISpRecoResult*>( eventStatus.lParam );
							SPPHRASE* pPhrase = nullptr;
							hResult = pRecoResult->GetPhrase( &pPhrase );
							if( SUCCEEDED( hResult ) ){
								if( ( pPhrase->pProperties != nullptr ) && ( pPhrase->pProperties->pFirstChild != nullptr ) ){
									// Compare the semantic tag against the grammar's phrase tags.
									const SPPHRASEPROPERTY* pSemantic = pPhrase->pProperties->pFirstChild;
									if( pSemantic->SREngineConfidence > confidenceThreshold ){
										if( wcscmp( L"あか", pSemantic->pszValue ) == 0 ){
											// "aka" (red)
											std::cout << "あか" << std::endl;
											audioMat = cv::Scalar( 0, 0, 255 );
										}
										else if( wcscmp( L"みどり", pSemantic->pszValue ) == 0 ){
											// "midori" (green)
											std::cout << "みどり" << std::endl;
											audioMat = cv::Scalar( 0, 255, 0 );
										}
										else if( wcscmp( L"あお", pSemantic->pszValue ) == 0 ){
											// "ao" (blue)
											std::cout << "あお" << std::endl;
											audioMat = cv::Scalar( 255, 0, 0 );
										}
										else if( wcscmp( L"おわり", pSemantic->pszValue ) == 0 ){
											// "owari" (end) — request loop exit.
											exit = true;
										}
									}
								}
								CoTaskMemFree( pPhrase );
							}
						}
						break;

					default:
						break;
				}
				pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			}
		}

		// Display.
		cv::imshow( "Audio", audioMat );

		// Exit on Esc key or the recognized exit phrase.
		if( cv::waitKey( 30 ) == VK_ESCAPE || exit ){
			break;
		}
	}

	// Cleanup.
	// NOTE(review): CoUninitialize runs before NuiShutdown/CloseHandle and
	// while all SAPI interfaces are still alive; CloseHandle is also called on
	// the SAPI-owned notify handle. The teardown order deserves a review.
	audioStream->StopCapture();
	pSpeechRecognizer->SetRecoState( SPRST_INACTIVE );

	CoUninitialize();
	pSensor->NuiShutdown();
	CloseHandle( hSpeechEvent );

	cv::destroyAllWindows();

	return 0;
}
void SpeechRecognizer::setEnabled(bool enabled) { if (enabled == _enabled || !_comInitialized) { return; } _enabled = enabled; if (_enabled) { HRESULT hr = S_OK; // Set up dedicated recognizer instead of using shared Windows recognizer. // - By default, shared recognizer's commands like "move left" override any added here. // - Unless do SetGrammarState(SPGS_EXCLUSIVE) on shared recognizer but then non-Interface commands don't work at all. // - With dedicated recognizer, user can choose whether to have Windows recognizer running in addition to Interface's. if (SUCCEEDED(hr)) { hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_ALL, IID_ISpRecognizer, (void**)&_speechRecognizer); } if (SUCCEEDED(hr)) { ISpObjectToken* audioToken; ISpObjectTokenCategory* audioTokenCategory; hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory, (void**)&audioTokenCategory); if (SUCCEEDED(hr)) { hr = audioTokenCategory->SetId(SPCAT_AUDIOIN, TRUE); } if (SUCCEEDED(hr)) { WCHAR * tokenID; hr = audioTokenCategory->GetDefaultTokenId(&tokenID); if (SUCCEEDED(hr)) { hr = CoCreateInstance(CLSID_SpObjectToken, NULL, CLSCTX_ALL, IID_ISpObjectToken, (void**)&audioToken); if (SUCCEEDED(hr)) { hr = audioToken->SetId(NULL, tokenID, FALSE); } ::CoTaskMemFree(tokenID); } } if (SUCCEEDED(hr)) { hr = static_cast<ISpRecognizer*>(_speechRecognizer)->SetInput(audioToken, TRUE); } } if (SUCCEEDED(hr)) { hr = static_cast<ISpRecognizer*>(_speechRecognizer) ->CreateRecoContext(reinterpret_cast<ISpRecoContext**>(&_speechRecognizerContext)); if (FAILED(hr)) { static_cast<ISpRecognizer*>(_speechRecognizer)->Release(); } } // Set up event notification mechanism. 
if (SUCCEEDED(hr)) { hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)->SetNotifyWin32Event(); } if (SUCCEEDED(hr)) { _commandRecognizedEvent = static_cast<ISpRecoContext*>(_speechRecognizerContext)->GetNotifyEventHandle(); if (_commandRecognizedEvent) { _commandRecognizedNotifier->setHandle(_commandRecognizedEvent); _commandRecognizedNotifier->setEnabled(true); } else { hr = S_FALSE; } } // Set which events to be notified of. if (SUCCEEDED(hr)) { hr = static_cast<ISpRecoContext*>(_speechRecognizerContext) ->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); } // Create grammar and load commands. if (SUCCEEDED(hr)) { hr = static_cast<ISpRecoContext*>(_speechRecognizerContext) ->CreateGrammar(NULL, reinterpret_cast<ISpRecoGrammar**>(&_speechRecognizerGrammar)); } if (SUCCEEDED(hr)) { reloadCommands(); } _enabled = SUCCEEDED(hr); qDebug() << "Speech recognition" << (_enabled ? "enabled" : "enable failed"); } else { _commandRecognizedNotifier->setEnabled(false); static_cast<ISpRecoContext*>(_speechRecognizerContext)->Release(); static_cast<ISpRecognizer*>(_speechRecognizer)->Release(); qDebug() << "Speech recognition disabled"; } emit enabledUpdated(_enabled); }
//音声認識のためのオブジェクトの構築. void RSpeechRecognition::Create(const std::string & inDicticationFilterWord , const std::string & inGrammarXML , HWND inWindow , UINT inCallbackMesage ) { USES_CONVERSION; HRESULT hr; this->DicticationFilterWord = inDicticationFilterWord; this->CallbackWindowHandle = inWindow; this->CallbackWindowMesage = inCallbackMesage; //Dictation { CComPtr<ISpAudio> cpAudio; hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->DictationEngine.CoCreateInstance(CLSID_SpInprocRecognizer); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->DictationEngine->CreateRecoContext(&this->DictationRecoCtxt); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->DictationRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->DictationRecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL); if(FAILED(hr)) AfxThrowOleException(hr); //認識器始動 hr = this->DictationRecoCtxt->CreateGrammar(0, &this->DictationGrammar); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->DictationGrammar->LoadDictation(NULL, SPLO_STATIC); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->DictationRecoCtxt->SetNotifyWin32Event(); if(FAILED(hr)) AfxThrowOleException(hr); } //ルールベースのエンジンを作る. 
{ CComPtr<ISpAudio> cpAudio; hr = SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOIN, &cpAudio); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleEngine.CoCreateInstance(CLSID_SpInprocRecognizer); if(FAILED(hr)) AfxThrowOleException(hr); //オーディオから読み込んでね hr = this->RuleEngine->SetInput( cpAudio, TRUE); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleEngine->CreateRecoContext(&this->RuleRecoCtxt); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleRecoCtxt->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION)); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleRecoCtxt->SetAudioOptions(SPAO_RETAIN_AUDIO, NULL, NULL); if(FAILED(hr)) AfxThrowOleException(hr); //認識器始動 hr = this->RuleRecoCtxt->CreateGrammar(0, &this->RuleGrammar); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleGrammar->LoadDictation(NULL, SPLO_STATIC); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleGrammar->LoadCmdFromFile( A2W( inGrammarXML.c_str() ) ,SPLO_STATIC); if(FAILED(hr)) AfxThrowOleException(hr); hr = this->RuleRecoCtxt->SetNotifyCallbackFunction(__callbackRule , (WPARAM)this , 0); if(FAILED(hr)) AfxThrowOleException(hr); //録音開始 hr = this->RuleGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE ); if(FAILED(hr)) AfxThrowOleException(hr); } this->FlagCleanup(); }