std::wstring SpeechRecognition::GetText()
{
    const ULONG maxEvents = 10;
    SPEVENT events[maxEvents];
    ULONG eventCount;

    // GetEvents returns S_FALSE when fewer than maxEvents events were fetched.
    HRESULT hr = recoContext->GetEvents(maxEvents, events, &eventCount);
    if (!(hr == S_OK || hr == S_FALSE)) {
        CheckReturn(hr);
    }
    if (eventCount == 0)
        return std::wstring();

    ISpRecoResult* recoResult =
        reinterpret_cast<ISpRecoResult*>(events[0].lParam);

    wchar_t* text;
    hr = recoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                             &text, NULL);
    CheckReturn(hr);

    std::wstring outText(text);
    CoTaskMemFree(text);
    return outText;
}
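// CheckReturn() is a project helper that is not shown in this snippet. A
// minimal sketch of what it is assumed to do (fail loudly on a bad HRESULT);
// the real helper may log or throw a project-specific exception instead:
#include <stdexcept>

static void CheckReturn(HRESULT hr)
{
    if (FAILED(hr))
        throw std::runtime_error("SAPI call failed"); // hypothetical policy
}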
void mssapi_captions::main_thread()
try {
    HRESULT hr;

    os_set_thread_name(__FUNCTION__);

    hr = grammar->SetDictationState(SPRS_ACTIVE);
    if (FAILED(hr))
        throw HRError("SetDictationState failed", hr);

    hr = recognizer->SetRecoState(SPRST_ACTIVE);
    if (FAILED(hr))
        throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);

    HANDLE events[] = {notify, stop};

    started = true;

    for (;;) {
        DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
        if (ret != WAIT_OBJECT_0)
            break;

        CSpEvent event;
        bool exit = false;

        while (event.GetFrom(context) == S_OK) {
            if (event.eEventId == SPEI_RECOGNITION) {
                ISpRecoResult *result = event.RecoResult();
                CoTaskMemPtr<wchar_t> text;

                hr = result->GetText((ULONG)-1, (ULONG)-1,
                                     true, &text, nullptr);
                if (FAILED(hr))
                    continue;

                char text_utf8[512];
                os_wcs_to_utf8(text, 0, text_utf8, 512);

                callback(text_utf8);

                blog(LOG_DEBUG, "\"%s\"", text_utf8);

            } else if (event.eEventId == SPEI_END_SR_STREAM) {
                exit = true;
                break;
            }
        }

        if (exit)
            break;
    }

    audio->Stop();

} catch (HRError err) {
    blog(LOG_WARNING, "%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
}
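// HRError is not defined in these snippets (it is thrown here and again in
// obs_captions::main_thread below). A minimal sketch of the carrier struct
// the usage implies, holding an error string plus the failing HRESULT:
struct HRError {
    const char *str;
    HRESULT hr;
    inline HRError(const char *str, HRESULT hr) : str(str), hr(hr) {}
};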
Bottle SpeechRecognizerModule::waitNextRecognition(int timeout)
{
    yInfo() << "Recognition: blocking mode on";
    Bottle bOutGrammar;

    bool gotSomething = false;
    double endTime = Time::now() + timeout / 1000.0;
    interruptRecognition = false;

    yInfo() << "=========== GO Waiting for recog! ===========";

    while (Time::now() < endTime && !gotSomething && !interruptRecognition) {
        SPEVENT curEvent;
        ULONG fetched = 0;

        m_cpRecoCtxt->GetEvents(1, &curEvent, &fetched);

        while (fetched > 0) {
            yInfo() << "received something in waitNextRecognition";
            gotSomething = true;
            ISpRecoResult* result =
                reinterpret_cast<ISpRecoResult*>(curEvent.lParam);

            // Convert the recognized sentence to a string.
            CSpDynamicString dstrText;
            result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE,
                            &dstrText, NULL);
            string fullSentence = ws2s(dstrText);
            yInfo() << fullSentence;

            if (m_useTalkBack)
                say(fullSentence);
            bOutGrammar.addString(fullSentence);

            SPPHRASE* pPhrase = NULL;
            result->GetPhrase(&pPhrase);
            bOutGrammar.addList() = toBottle(pPhrase, &pPhrase->Rule);
            yInfo() << "Sending semantic bottle : " << bOutGrammar.toString();

            m_cpRecoCtxt->GetEvents(1, &curEvent, &fetched);

            if (m_forwardSound) {
                yarp::sig::Sound& rawSnd = m_portSound.prepare();
                rawSnd = toSound(result);
                m_portSound.write();
            }
        }
    }

    if (interruptRecognition)
        yDebug() << "interrupted speech recognizer!";

    yInfo() << "Recognition: blocking mode off";
    return bOutGrammar;
}
// Speech processing
void ThisApp::speech_process()
{
    // Confidence threshold
    const float ConfidenceThreshold = 0.3f;
    SPEVENT curEvent = { SPEI_UNDEFINED, SPET_LPARAM_IS_UNDEFINED, 0, 0, 0, 0 };
    ULONG fetched = 0;
    HRESULT hr = S_OK;

    // Fetch an event
    m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0) {
        switch (curEvent.eEventId) {
        // Make sure this is a recognition event...
        case SPEI_RECOGNITION:
            // ...whose lParam carries an object
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType) {
                ISpRecoResult* result =
                    reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = nullptr;

                // Get the recognized phrase
                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr)) {
#ifdef _DEBUG
                    // In DEBUG builds, print the recognized string
                    WCHAR* pwszFirstWord;
                    result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE,
                                    TRUE, &pwszFirstWord, nullptr);
                    _cwprintf(pwszFirstWord);
                    ::CoTaskMemFree(pwszFirstWord);
#endif
                    if ((pPhrase->pProperties != nullptr) &&
                        (pPhrase->pProperties->pFirstChild != nullptr)) {
                        const SPPHRASEPROPERTY* pSemanticTag =
                            pPhrase->pProperties->pFirstChild;
#ifdef _DEBUG
                        // Confidence, as a percentage
                        _cwprintf(L" confidence: %d%%\n",
                                  (int)(pSemanticTag->SREngineConfidence * 100.f));
#endif
                        if (pSemanticTag->SREngineConfidence > ConfidenceThreshold) {
                            speech_behavior(pSemanticTag);
                        }
                    }
                    ::CoTaskMemFree(pPhrase);
                }
            }
            break;
        }
        m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
    }
    return;
}
list< pair<string, double> > SpeechRecognizerModule::waitNextRecognitionLEGACY(int timeout)
{
    yInfo() << "Recognition LEGACY: blocking mode on";
    list< pair<string, double> > recognitionResults;

    bool gotSomething = false;
    double endTime = Time::now() + timeout / 1000.0;

    while (Time::now() < endTime && !gotSomething && !interruptRecognition) {
        SPEVENT curEvent;
        ULONG fetched = 0;

        m_cpRecoCtxt->GetEvents(1, &curEvent, &fetched);

        while (fetched > 0) {
            gotSomething = true;
            ISpRecoResult* result =
                reinterpret_cast<ISpRecoResult*>(curEvent.lParam);

            // Convert the caught sentence to strings.
            CSpDynamicString dstrText;
            result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE,
                            &dstrText, NULL);
            string fullSentence = ws2s(dstrText);
            yInfo() << fullSentence;
            if (m_useTalkBack)
                say(fullSentence);

            vector<string> words = split(fullSentence, ' ');
            for (unsigned int w = 0; w < words.size(); w++) {
                // TODO: extract the confidence value somehow...
                recognitionResults.push_back(make_pair(words[w], -1.0));
            }

            m_cpRecoCtxt->GetEvents(1, &curEvent, &fetched);
        }
    }
    interruptRecognition = false;
    yInfo() << "Recognition: blocking mode off";
    return recognitionResults;
}
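// ws2s() and split() are project helpers not shown here. A minimal sketch of
// what they are assumed to do (a lossy wide-to-narrow conversion and a simple
// tokenizer); the real helpers may do proper encoding conversion instead:
#include <sstream>
#include <string>
#include <vector>

static std::string ws2s(const std::wstring& w)
{
    return std::string(w.begin(), w.end()); // assumes ASCII-range text
}

static std::vector<std::string> split(const std::string& s, char delim)
{
    std::vector<std::string> out;
    std::istringstream iss(s);
    std::string tok;
    while (std::getline(iss, tok, delim))
        out.push_back(tok);
    return out;
}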
void SpeechRecognizer::notifyCommandRecognized(void* handle)
{
    SPEVENT eventItem;
    memset(&eventItem, 0, sizeof(SPEVENT));
    HRESULT hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
                     ->GetEvents(1, &eventItem, NULL);
    if (SUCCEEDED(hr)) {
        if (eventItem.eEventId == SPEI_RECOGNITION &&
            eventItem.elParamType == SPET_LPARAM_IS_OBJECT) {
            ISpRecoResult* recognitionResult =
                reinterpret_cast<ISpRecoResult*>(eventItem.lParam);

            wchar_t* pText;
            hr = recognitionResult->GetText(SP_GETWHOLEPHRASE,
                                            SP_GETWHOLEPHRASE, FALSE,
                                            &pText, NULL);
            if (SUCCEEDED(hr)) {
                QString text = QString::fromWCharArray(pText);
                handleCommandRecognized(text.toStdString().c_str());
                ::CoTaskMemFree(pText);
            }
            recognitionResult->Release();
        }
    }
}
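// Nothing in this snippet shows how notifyCommandRecognized() gets invoked.
// A plausible wiring sketch, assuming the context was registered for callback
// notifications via ISpNotifySource (SPNOTIFYCALLBACK is a plain
// void __stdcall (WPARAM, LPARAM) function):
static void __stdcall onSpeechEvent(WPARAM /*wParam*/, LPARAM lParam)
{
    // lParam carries whatever was registered; here, the recognizer instance.
    reinterpret_cast<SpeechRecognizer*>(lParam)->notifyCommandRecognized(nullptr);
}

// ...during setup, after creating the recognition context:
//   context->SetNotifyCallbackFunction(&onSpeechEvent, 0,
//                                      reinterpret_cast<LPARAM>(this));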
/**
    This is called when SAPI 5.1 has an event.

    In the textless case we only handle the SPEI_RECOGNITION event; we are
    not looking at SPEI_HYPOTHESIS. This might be an error: we might be more
    robust by handling both.

    We process the event and add the phonemes we get to the result list.
**/
void sapi_textless_lipsync::callback()
{
    CSpEvent event;                     // the event
    ISpRecoResult *pRecoResult;         // reco result from the event
    SPPHRASE *pSpPhrase;                // phrase from the reco result
    SPRECORESULTTIMES pRecoResultTimes; // result times from the reco result
    WCHAR phone_buffer[256];            // phoneme buffer for conversion
    long msStart;                       // time stamp of the result

    while (event.GetFrom(this->m_recogCntxt) == S_OK) {
        if (event.eEventId == SPEI_RECOGNITION
            /*|| event.eEventId == SPEI_HYPOTHESIS */) {
            // For textless we only accept full recognitions. This might be
            // an area to watch out for.

            // pull out the result object
            pRecoResult = event.RecoResult();

            // pull the whole text from the result
            CSpDynamicString pSapiText;
            pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                                 &pSapiText, NULL);

            // Get the start time for the phrase; we use it as an offset for
            // the phrase elements. Not sure if this is correct.
            pRecoResult->GetResultTimes(&pRecoResultTimes);
            msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

            // extract the phrase object
            pRecoResult->GetPhrase(&pSpPhrase);

            if (pSpPhrase != NULL) {
                // Process each element of the phrase. These should be our
                // orthographic words.
                const SPPHRASEELEMENT *p = pSpPhrase->pElements;
                const SPPHRASEELEMENT *pEnd =
                    p + pSpPhrase->Rule.ulCountOfElements;
                while (p != pEnd) {
                    // For each phrase element create a marker that carries
                    // the time stamps along with the associated phonemes.
                    alignment_result al;
                    al.m_orthography = p->pszDisplayText;

                    // get the phonemes
                    ULONG j = 0;
                    SPPHONEID phn[2];
                    phn[1] = 0x00;
                    while (p->pszPronunciation[j] != 0) {
                        // process each phoneme
                        phn[0] = p->pszPronunciation[j];
                        m_phnCvt->IdToPhone(phn, phone_buffer);
                        al.m_phonemes.push_back(phone_buffer);
                        j++;
                    }

                    // start and end time of the ortheme
                    al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
                    al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
                    al.m_msEnd += al.m_msStart;

                    // add it to the results
                    m_results.push_back(al);
                    p++;
                }
                ::CoTaskMemFree(pSpPhrase); // GetPhrase allocates with CoTaskMemAlloc
            }
        } else if (event.eEventId == SPEI_END_SR_STREAM) {
            // The stream has finished processing; flag that we are done.
            m_bDone = TRUE;
        }
    }
}
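// sapi_time_to_milli() and bytes_to_milli() are project helpers not shown in
// these snippets. A plausible sketch, assuming the SAPI convention of 100-ns
// units for stream times and a 16 kHz / 16-bit / mono stream (32000 bytes
// per second); the real helpers likely derive the byte rate from the actual
// wave format:
static long sapi_time_to_milli(ULONGLONG t100ns)
{
    return (long)(t100ns / 10000); // 100-ns ticks -> milliseconds
}

static long bytes_to_milli(ULONG bytes)
{
    const ULONGLONG bytesPerSec = 16000ULL * 2; // assumed: 16 kHz, 16-bit, mono
    return (long)((bytes * 1000ULL) / bytesPerSec);
}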
/**
    This is called by SAPI 5.1 when it has an event.

    We use the CSpEvent class provided by the SDK to simplify processing.
    Basically, we treat a SPEI_RECOGNITION event and a SPEI_HYPOTHESIS event
    the same way. Hypotheses are far more likely; for all but very short
    files, a SPEI_RECOGNITION is a rarity.

    Since hypotheses include duplicate data, we have a decision: keep the
    newest hypothesis, or keep the one that generates the most alignments.
    Empirically, sticking with the longest result seems to work best, but
    perhaps this is not so.
**/
void sapi_textbased_lipsync::callback()
{
    //USES_CONVERSION;
    CSpEvent event;
    ISpRecoResult *pRecoResult;         // reco result from the event
    SPPHRASE *pSpPhrase;                // phrase from the reco result
    SPRECORESULTTIMES pRecoResultTimes; // result times from the reco result
    WCHAR phone_buffer[256];            // buffer for the phonemes
    UINT msStart;                       // start time of the phrase

    // process the events
    while (event.GetFrom(this->m_recogCntxt) == S_OK) {
        if (event.eEventId == SPEI_RECOGNITION ||
            event.eEventId == SPEI_HYPOTHESIS) {
            // Text-based lipsync has to accept hypotheses, or it mostly
            // fails unless the script is very short.

            // pull out the result object
            pRecoResult = event.RecoResult();

            // pull the whole text from the result
            CSpDynamicString pSapiText;
            pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                                 &pSapiText, NULL);

            // Get the start time for the phrase; we use it as an offset for
            // the phrase elements. Not sure if this is correct.
            pRecoResult->GetResultTimes(&pRecoResultTimes);
            msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

            std::wstring strPrintText = pSapiText;
            std::cerr << "hypothesis: " << wstring_2_string(strPrintText)
                      << std::endl;

            // If the new result is longer than the existing result in
            // orthographic form, accept it and process the phonemes;
            // otherwise, skip it.
            if (wcslen(pSapiText) > this->m_strResults.size()) {
                m_strResults = pSapiText;

                // clear the old results; this hypothesis trumps them
                this->m_results.clear();

                // extract the phrase object
                pRecoResult->GetPhrase(&pSpPhrase);
                if (pSpPhrase != NULL) {
                    // Process each element of the phrase. These should be
                    // our orthographic words.
                    const SPPHRASEELEMENT *p = pSpPhrase->pElements;
                    const SPPHRASEELEMENT *pEnd =
                        p + pSpPhrase->Rule.ulCountOfElements;
                    while (p != pEnd) {
                        // For each phrase element create a marker that
                        // carries the time stamps and associated phonemes.
                        alignment_result al;
                        al.m_orthography = p->pszDisplayText;

                        // get the phonemes
                        ULONG j = 0;
                        SPPHONEID phn[2];
                        phn[1] = 0x00;
                        while (p->pszPronunciation[j] != 0) {
                            // process each phoneme
                            phn[0] = p->pszPronunciation[j];
                            m_phnCvt->IdToPhone(phn, phone_buffer);
                            al.m_phonemes.push_back(phone_buffer);
                            j++;
                        }

                        // start and end time of the ortheme
                        al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
                        al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
                        al.m_msEnd += al.m_msStart;

                        // add it to the results
                        m_results.push_back(al);
                        p++;
                    }
                    ::CoTaskMemFree(pSpPhrase);
                }
            }
        } else if (event.eEventId == SPEI_END_SR_STREAM) {
            // The stream has finished processing; flag that we are done.
            m_bDone = TRUE;
        }
    }
}
void obs_captions::main_thread()
try {
    ComPtr<CaptionStream>  audio;
    ComPtr<ISpObjectToken> token;
    ComPtr<ISpRecoGrammar> grammar;
    ComPtr<ISpRecognizer>  recognizer;
    ComPtr<ISpRecoContext> context;
    HRESULT hr;

    auto cb = [&](const struct audio_data *audio_data, bool muted)
    {
        audio->PushAudio(audio_data, muted);
    };
    using cb_t = decltype(cb);

    auto pre_cb = [](void *param, obs_source_t*,
                     const struct audio_data *audio_data, bool muted)
    {
        return (*static_cast<cb_t*>(param))(audio_data, muted);
    };

    os_set_thread_name(__FUNCTION__);

    CoInitialize(nullptr);

    wchar_t lang_str[32];
    _snwprintf(lang_str, 31, L"language=%x", (int)captions->lang_id);

    hr = SpFindBestToken(SPCAT_RECOGNIZERS, lang_str, nullptr, &token);
    if (FAILED(hr))
        throw HRError("SpFindBestToken failed", hr);

    hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_ALL,
                          __uuidof(ISpRecognizer), (void**)&recognizer);
    if (FAILED(hr))
        throw HRError("CoCreateInstance for recognizer failed", hr);

    hr = recognizer->SetRecognizer(token);
    if (FAILED(hr))
        throw HRError("SetRecognizer failed", hr);

    hr = recognizer->SetRecoState(SPRST_INACTIVE);
    if (FAILED(hr))
        throw HRError("SetRecoState(SPRST_INACTIVE) failed", hr);

    hr = recognizer->CreateRecoContext(&context);
    if (FAILED(hr))
        throw HRError("CreateRecoContext failed", hr);

    ULONGLONG interest = SPFEI(SPEI_RECOGNITION) |
                         SPFEI(SPEI_END_SR_STREAM);
    hr = context->SetInterest(interest, interest);
    if (FAILED(hr))
        throw HRError("SetInterest failed", hr);

    HANDLE notify;

    hr = context->SetNotifyWin32Event();
    if (FAILED(hr))
        throw HRError("SetNotifyWin32Event failed", hr);

    notify = context->GetNotifyEventHandle();
    if (notify == INVALID_HANDLE_VALUE)
        throw HRError("GetNotifyEventHandle failed", E_NOINTERFACE);

    size_t sample_rate = audio_output_get_sample_rate(obs_get_audio());
    audio = new CaptionStream((DWORD)sample_rate);
    audio->Release(); // the ComPtr assignment already added a reference

    hr = recognizer->SetInput(audio, false);
    if (FAILED(hr))
        throw HRError("SetInput failed", hr);

    hr = context->CreateGrammar(1, &grammar);
    if (FAILED(hr))
        throw HRError("CreateGrammar failed", hr);

    hr = grammar->LoadDictation(nullptr, SPLO_STATIC);
    if (FAILED(hr))
        throw HRError("LoadDictation failed", hr);

    hr = grammar->SetDictationState(SPRS_ACTIVE);
    if (FAILED(hr))
        throw HRError("SetDictationState failed", hr);

    hr = recognizer->SetRecoState(SPRST_ACTIVE);
    if (FAILED(hr))
        throw HRError("SetRecoState(SPRST_ACTIVE) failed", hr);

    HANDLE events[] = {notify, stop_event};

    {
        captions->source = GetWeakSourceByName(
                captions->source_name.c_str());

        OBSSource strong = OBSGetStrongRef(source);
        if (strong)
            obs_source_add_audio_capture_callback(strong, pre_cb, &cb);
    }

    for (;;) {
        DWORD ret = WaitForMultipleObjects(2, events, false, INFINITE);
        if (ret != WAIT_OBJECT_0)
            break;

        CSpEvent event;
        bool exit = false;

        while (event.GetFrom(context) == S_OK) {
            if (event.eEventId == SPEI_RECOGNITION) {
                ISpRecoResult *result = event.RecoResult();
                CoTaskMemPtr<wchar_t> text;

                hr = result->GetText((ULONG)-1, (ULONG)-1,
                                     true, &text, nullptr);
                if (FAILED(hr))
                    continue;

                char text_utf8[512];
                os_wcs_to_utf8(text, 0, text_utf8, 512);

                obs_output_t *output =
                        obs_frontend_get_streaming_output();
                if (output)
                    obs_output_output_caption_text1(output, text_utf8);

                debug("\"%s\"", text_utf8);

                obs_output_release(output);

            } else if (event.eEventId == SPEI_END_SR_STREAM) {
                exit = true;
                break;
            }
        }

        if (exit)
            break;
    }

    {
        OBSSource strong = OBSGetStrongRef(source);
        if (strong)
            obs_source_remove_audio_capture_callback(strong, pre_cb, &cb);
    }

    audio->Stop();

    CoUninitialize();

} catch (HRError err) {
    error("%s failed: %s (%lX)", __FUNCTION__, err.str, err.hr);
    CoUninitialize();
    captions->th.detach();
}
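// Design note on the cb/pre_cb pair above: obs_source_add_audio_capture_callback
// takes a plain C function pointer plus a void* parameter, so the capturing
// lambda `cb` cannot be passed directly. The captureless lambda `pre_cb`
// decays to a function pointer and forwards the void* back into `cb`, a
// common trampoline pattern for bridging C callbacks to C++ closures.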
void Sound::test()
{
    ISpVoice* pVoice = NULL;
    ISpObjectToken* pVoiceToken = nullptr;
    IEnumSpObjectTokens* pEnum;
    ULONG ulCount = 0;

    if (FAILED(::CoInitialize(NULL)))
        return;

    HRESULT hr = S_OK;

    // Find the best matching installed en-US recognizer.
    CComPtr<ISpObjectToken> cpRecognizerToken;
    if (SUCCEEDED(hr))
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL,
                             &cpRecognizerToken);

    // Create the in-process recognizer and immediately set its state to inactive.
    CComPtr<ISpRecognizer> cpRecognizer;
    if (SUCCEEDED(hr))
        hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);

    // Create a new recognition context from the recognizer.
    CComPtr<ISpRecoContext> cpContext;
    if (SUCCEEDED(hr))
        hr = cpRecognizer->CreateRecoContext(&cpContext);

    // Subscribe to the speech recognition event.
    if (SUCCEEDED(hr)) {
        ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION);
        hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
    }

    // Establish a Win32 event to signal when speech events are available.
    HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;
    if (SUCCEEDED(hr))
        hr = cpContext->SetNotifyWin32Event();
    if (SUCCEEDED(hr)) {
        hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();
        if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent)
            hr = E_NOINTERFACE; // notification handle unsupported
    }

    // Initialize an audio object to use the system's default audio input
    // and set the recognizer to use it.
    CComPtr<ISpAudio> cpAudioIn;
    if (SUCCEEDED(hr))
        hr = cpAudioIn.CoCreateInstance(CLSID_SpMMAudioIn);
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetInput(cpAudioIn, TRUE);

    // Populate a WAVEFORMATEX struct with our desired retained-audio format.
    WAVEFORMATEX* pWfexCoMemRetainedAudioFormat = NULL;
    GUID guidRetainedAudioFormat = GUID_NULL;
    if (SUCCEEDED(hr))
        hr = SpConvertStreamFormatEnum(SPSF_16kHz16BitMono,
                                       &guidRetainedAudioFormat,
                                       &pWfexCoMemRetainedAudioFormat);

    // Instruct the recognizer to retain the audio from its recognition results.
    if (SUCCEEDED(hr))
        hr = cpContext->SetAudioOptions(SPAO_RETAIN_AUDIO,
                                        &guidRetainedAudioFormat,
                                        pWfexCoMemRetainedAudioFormat);
    if (NULL != pWfexCoMemRetainedAudioFormat)
        CoTaskMemFree(pWfexCoMemRetainedAudioFormat);

    // Create a new grammar and load an SRGS grammar from file.
    CComPtr<ISpRecoGrammar> cpGrammar;
    if (SUCCEEDED(hr))
        hr = cpContext->CreateGrammar(0, &cpGrammar);
    if (SUCCEEDED(hr))
        hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);

    // Set all top-level rules in the new grammar to the active state.
    if (SUCCEEDED(hr))
        hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);

    // Set the recognizer state to active to begin recognition.
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);

    // Establish a separate Win32 event to signal the event loop exit.
    HANDLE hExitEvent = CreateEventW(NULL, FALSE, FALSE, NULL);

    // Collect the events listened for to pump the speech event loop.
    HANDLE rghEvents[] = { hSpeechNotifyEvent, hExitEvent };

    // Speech recognition event loop.
    BOOL fContinue = TRUE;
    while (fContinue && SUCCEEDED(hr)) {
        // Wait for either a speech event or an exit event, with a
        // 15-second timeout.
        DWORD dwMessage = WaitForMultipleObjects(sp_countof(rghEvents),
                                                 rghEvents, FALSE, 15000);
        switch (dwMessage) {
        // With the WaitForMultipleObjects call above, WAIT_OBJECT_0 is a
        // speech event from hSpeechNotifyEvent.
        case WAIT_OBJECT_0:
        {
            // Sequentially drain the available speech events from the queue.
            CSpEvent spevent;
            while (S_OK == spevent.GetFrom(cpContext)) {
                switch (spevent.eEventId) {
                case SPEI_RECOGNITION:
                {
                    // Retrieve the recognition result and output its text.
                    ISpRecoResult* pResult = spevent.RecoResult();
                    LPWSTR pszCoMemResultText = NULL;
                    hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE,
                                          TRUE, &pszCoMemResultText, NULL);
                    if (SUCCEEDED(hr))
                        wprintf(L"Recognition event received, text=\"%s\"\r\n",
                                pszCoMemResultText);

                    // Also retrieve the retained audio we requested.
                    CComPtr<ISpStreamFormat> cpRetainedAudio;
                    if (SUCCEEDED(hr))
                        hr = pResult->GetAudio(0, 0, &cpRetainedAudio);

                    // To demonstrate, speak the retained audio back with ISpVoice.
                    CComPtr<ISpVoice> cpVoice;
                    if (SUCCEEDED(hr))
                        hr = cpVoice.CoCreateInstance(CLSID_SpVoice);
                    if (SUCCEEDED(hr))
                        hr = cpVoice->SpeakStream(cpRetainedAudio, SPF_DEFAULT, 0);

                    if (NULL != pszCoMemResultText)
                        CoTaskMemFree(pszCoMemResultText);
                    break;
                }
                }
            }
            break;
        }
        case WAIT_OBJECT_0 + 1:
        case WAIT_TIMEOUT:
            // Exit event or timeout; discontinue the speech loop.
            fContinue = FALSE;
            break;
        }
    }

    CComPtr<ISpVoice>  cpVoice;
    CComPtr<ISpStream> cpStream;
    CSpStreamFormat    cAudioFmt;

    // Create a SAPI voice.
    hr = cpVoice.CoCreateInstance(CLSID_SpVoice);

    // Set the audio format.
    if (SUCCEEDED(hr))
        hr = cAudioFmt.AssignFormat(SPSF_22kHz16BitMono);

    // Call SPBindToFile, a SAPI helper, to bind the audio stream to the file.
    if (SUCCEEDED(hr))
        hr = SPBindToFile(L"c:\\ttstemp.wav", SPFM_CREATE_ALWAYS, &cpStream,
                          &cAudioFmt.FormatId(), cAudioFmt.WaveFormatExPtr());

    // Set the output to cpStream so the output audio data is stored there.
    if (SUCCEEDED(hr))
        hr = cpVoice->SetOutput(cpStream, TRUE);

    // Speak the text "hello world" synchronously.
    if (SUCCEEDED(hr))
        hr = cpVoice->Speak(L"Hello World", SPF_DEFAULT, NULL);

    // Close the stream.
    if (SUCCEEDED(hr))
        hr = cpStream->Close();

    // Release the stream and voice objects.
    cpStream.Release();
    cpVoice.Release();

    CComPtr<ISpGrammarBuilder> cpGrammarBuilder;
    SPSTATEHANDLE hStateTravel;

    // ISpRecoGrammar inherits ISpGrammarBuilder, so the reco grammar created
    // above can serve as the grammar builder.
    if (SUCCEEDED(hr))
        hr = cpGrammar.QueryInterface(&cpGrammarBuilder);

    // Create (if the rule does not already exist) the top-level rule,
    // defaulting to active.
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->GetRule(L"Travel", 0,
                                       SPRAF_TopLevel | SPRAF_Active, TRUE,
                                       &hStateTravel);

    // Approach 1: List all possible phrases.
    // This is the most intuitive approach, and it does not sacrifice
    // efficiency, because the grammar builder merges shared sub-phrases when
    // possible. There is only one root state, hStateTravel, plus the terminal
    // NULL state, and six unique transitions between them.
    /* XML approximation:
    <rule id="Travel">
        <item> fly to Seattle </item>
        <item> fly to New York </item>
        <item> fly to Washington DC </item>
        <item> drive to Seattle </item>
        <item> drive to New York </item>
        <item> drive to Washington DC </item>
    </rule>
    */
    // Create a set of peer phrases, each containing a complete phrase.
    // Note: the word delimiter is set as " ", so that the text we attach to
    // the transition can be multiple words (for example, "fly to Seattle" is
    // implicitly "fly" + "to" + "Seattle"):
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL,
                L"fly to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL,
                L"fly to New York", L" ", SPWT_LEXICAL, 1, NULL);
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL,
                L"fly to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL,
                L"drive to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL,
                L"drive to New York", L" ", SPWT_LEXICAL, 1, NULL);
    if (SUCCEEDED(hr))
        hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL,
                L"drive to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);

    // Release the COM pointers reused from the earlier section before
    // rebinding them (CComPtr asserts when its & operator is taken while
    // it still holds an interface).
    cpRecognizerToken.Release();
    cpRecognizer.Release();
    cpContext.Release();
    cpGrammar.Release();

    // Find the best matching installed en-US recognizer.
    if (SUCCEEDED(hr))
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL,
                             &cpRecognizerToken);

    // Create the in-process recognizer and immediately set its state to inactive.
    if (SUCCEEDED(hr))
        hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);

    // Create a new recognition context from the recognizer.
    if (SUCCEEDED(hr))
        hr = cpRecognizer->CreateRecoContext(&cpContext);

    // Subscribe to the speech recognition and end-of-stream events.
    if (SUCCEEDED(hr)) {
        ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION) |
                                     SPFEI(SPEI_END_SR_STREAM);
        hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
    }

    // Establish a Win32 event to signal when speech events are available.
    if (SUCCEEDED(hr))
        hr = cpContext->SetNotifyWin32Event();
    if (SUCCEEDED(hr)) {
        hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();
        if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent) {
            // Notification handle unsupported.
            //hr = SPERR_UNINITIALIZED;
        }
    }

    // Set up an audio input stream using a .wav file and set the
    // recognizer's input.
    CComPtr<ISpStream> cpInputStream;
    if (SUCCEEDED(hr))
        hr = SPBindToFile(L"Test.wav", SPFM_OPEN_READONLY, &cpInputStream);
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetInput(cpInputStream, TRUE);

    // Create a new grammar and load an SRGS grammar from file.
    if (SUCCEEDED(hr))
        hr = cpContext->CreateGrammar(0, &cpGrammar);
    if (SUCCEEDED(hr))
        hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);

    // Set all top-level rules in the new grammar to the active state.
    if (SUCCEEDED(hr))
        hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);

    // Finally, set the recognizer state to active to begin recognition.
    if (SUCCEEDED(hr))
        hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);

    hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice,
                          (void **)&pVoice);
    if (SUCCEEDED(hr)) {
        hr = SpEnumTokens(SPCAT_VOICES, L"Gender=Female", NULL, &pEnum);
        if (SUCCEEDED(hr)) {
            // Get the number of voices.
            hr = pEnum->GetCount(&ulCount);
        }

        // Obtain a list of available voice tokens, set the voice to each
        // token in turn, and call Speak.
        while (SUCCEEDED(hr) && ulCount--) {
            if (pVoiceToken != nullptr)
                pVoiceToken->Release();
            if (SUCCEEDED(hr))
                hr = pEnum->Next(1, &pVoiceToken, NULL);
            if (SUCCEEDED(hr))
                hr = pVoice->SetVoice(pVoiceToken);
            if (SUCCEEDED(hr)) {
                const wchar_t* start =
                    L"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
                    L"<speak version=\"1.0\" "
                    L"xmlns=\"http://www.w3.org/2001/10/synthesis\" "
                    L"xml:lang=\"en-US\">";
                const wchar_t* end = L"</speak>";
                const wchar_t* xml =
                    L"<voice required=\"Gender=Male\"> hi! "
                    L"<prosody pitch=\"fast\"> This is low pitch. </prosody>"
                    L"<prosody volume=\"x-loud\"> This is extra loud volume. "
                    L"</prosody>";
                wstring s = start;
                s += xml;
                s += end;
                hr = pVoice->Speak(s.c_str(), SPF_IS_XML | SPF_ASYNC, 0);
                //hr = pVoice->Speak(L"How are you?", SPF_DEFAULT, NULL);
            }
        }
        /*
        if (SUCCEEDED(hr)) {
            hr = pEnum->Next(1, &pVoiceToken, NULL);
            if (SUCCEEDED(hr)) {
                hr = pVoice->SetVoice(pVoiceToken);
                // Set the output to the default audio device.
                if (SUCCEEDED(hr)) {
                    hr = pVoice->SetOutput(NULL, TRUE);
                    if (SUCCEEDED(hr)) {
                        hr = pVoice->Speak(L"Hello, world!", SPF_DEFAULT, 0);
                    }
                }
            }
        }
        */
        if (pVoiceToken != nullptr)
            pVoiceToken->Release();
        pEnum->Release();
        pVoice->Release();
    }
    ::CoUninitialize();
}
void CASRwrapper::GetText(std::wstring& speechRes, float* pconfidence,
                          int requestedAlternates, std::wstring alternates[],
                          float alternatesConfidence[])
{
    const ULONG maxEvents = 10;
    SPEVENT events[maxEvents];
    ULONG eventCount;
    HRESULT hr;

    // Note: hr equals S_FALSE if everything is OK but
    // eventCount < requestedEventCount.
    hr = m_cpRecoCtxt->GetEvents(maxEvents, events, &eventCount);
    if (!(hr == S_OK || hr == S_FALSE))
        return;

    if (eventCount > 1) {
        speechRes.assign(L"More than one event!");
        return;
    }

    ISpRecoResult* recoResult =
        reinterpret_cast<ISpRecoResult*>(events[0].lParam);

    wchar_t* text;
    hr = recoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                             &text, NULL);
    speechRes.assign(text);
    CoTaskMemFree(text);

    if (requestedAlternates == 0 && pconfidence == NULL)
        return;

    const USHORT MAX_ALTERNATES = 100;
    if (requestedAlternates > MAX_ALTERNATES)
        requestedAlternates = MAX_ALTERNATES;
    if (requestedAlternates == 0)   // caller asked only for the confidence
        requestedAlternates = 1;

    CComPtr<ISpPhraseAlt> pcpPhraseAlt[MAX_ALTERNATES];
    SPPHRASE* pPhrase = nullptr;
    ULONG ulCount;

    // Retrieve information about the recognized phrase.
    hr = recoResult->GetPhrase(&pPhrase);

    if (SUCCEEDED(hr)) {
        // Retrieve a list of alternative phrases related to the
        // recognized phrase.
        hr = recoResult->GetAlternates(pPhrase->Rule.ulFirstElement,
                                       pPhrase->Rule.ulCountOfElements,
                                       requestedAlternates,
                                       (ISpPhraseAlt**)pcpPhraseAlt,
                                       &ulCount);
    }

    if (SUCCEEDED(hr)) {
        // Browse the list of alternative phrases, ordered by likelihood of
        // matching the original phrase.
        for (unsigned int i = 0; i < ulCount; i++) {
            SPPHRASE* pPhraseAlt = nullptr;
            CSpDynamicString pwszAlternate;

            // Retrieve information about the current alternative phrase.
            pcpPhraseAlt[i]->GetPhrase(&pPhraseAlt);

            // Get the phrase's entire text string.
            hr = pcpPhraseAlt[i]->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE,
                                          TRUE, &pwszAlternate, NULL);
            if (SUCCEEDED(hr)) {
                // The first (most likely) alternate supplies the confidence
                // reported through pconfidence.
                if (i == 0 && pconfidence != NULL)
                    *pconfidence = pPhraseAlt->pElements->SREngineConfidence;
                if (alternatesConfidence != NULL)
                    alternatesConfidence[i] =
                        pPhraseAlt->pElements->SREngineConfidence;
                if (alternates != NULL)
                    alternates[i] = pwszAlternate.Copy();
            }
            if (pPhraseAlt)
                CoTaskMemFree(pPhraseAlt);
        }
    }
    if (pPhrase)
        CoTaskMemFree(pPhrase);
}
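// A hypothetical caller, to show how the out-parameters of
// CASRwrapper::GetText line up (the names and array size are illustrative
// only; entries beyond the returned alternate count stay untouched):
void exampleUsage(CASRwrapper& asr)
{
    std::wstring text;
    float confidence = 0.0f;
    const int kAlts = 5;
    std::wstring alternates[kAlts];
    float altConfidence[kAlts] = {};

    asr.GetText(text, &confidence, kAlts, alternates, altConfidence);
    wprintf(L"best: %s (%.2f)\n", text.c_str(), confidence);
}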
/**********************************************************
* COperator::HandleCall *
*-----------------------*
*   Description:
*       Deals with the call
*   Return:
*       S_OK
*       Failed return value of ISpMMSysAudio::SetState(),
*       ISpVoice::Speak()
************************************************************/
HRESULT COperator::HandleCall()
{
    // PLEASE NOTE: This is a single-threaded app, so if the caller hangs up
    // after the call-handling sequence has started, the app will not be
    // notified until after the entire call sequence has finished.
    // If you want to be able to cut the call-handling short because the
    // caller hung up, you need a separate thread listening for TAPI's
    // CS_DISCONNECT notification.

    _ASSERTE(m_cpMMSysAudioOut);
    HRESULT hr = S_OK;

    // Now that the call is connected, we can start up the audio output.
    hr = m_cpOutgoingVoice->Speak(L"Hello, please say something to me",
                                  0, NULL);

    // Start listening.
    if (SUCCEEDED(hr))
        hr = m_cpDictGrammar->SetDictationState(SPRS_ACTIVE);

    // We are expecting a PHRASE_START followed by either a RECOGNITION or a
    // FALSE_RECOGNITION.

    // Wait for the PHRASE_START.
    CSpEvent event;
    WORD eLastEventID = SPEI_FALSE_RECOGNITION;
    hr = m_cpIncomingRecoCtxt->WaitForNotifyEvent(CALLER_TIMEOUT);
    if (SUCCEEDED(hr))
        hr = event.GetFrom(m_cpIncomingRecoCtxt);

    // Enter this block only if we have not timed out (the user started
    // speaking).
    if ((S_OK == hr) && (SPEI_PHRASE_START == event.eEventId)) {
        // The caller has started to speak; block "forever" until the result
        // (or lack thereof) comes back. This is all right, since every
        // PHRASE_START is guaranteed to be followed by a RECOGNITION or
        // FALSE_RECOGNITION.
        hr = m_cpIncomingRecoCtxt->WaitForNotifyEvent(INFINITE);
        if (S_OK == hr) {
            // Get the RECOGNITION or FALSE_RECOGNITION.
            hr = event.GetFrom(m_cpIncomingRecoCtxt);
            eLastEventID = event.eEventId;

            // This had better be either a RECOGNITION or FALSE_RECOGNITION!
            _ASSERTE((SPEI_RECOGNITION == event.eEventId) ||
                     (SPEI_FALSE_RECOGNITION == event.eEventId));
        }
    }

    // Make sure a recognition result was actually received (as opposed to a
    // false recognition or a timeout on the caller).
    WCHAR *pwszCoMemText = NULL;
    ISpRecoResult *pResult = NULL;
    if (SUCCEEDED(hr) && (SPEI_RECOGNITION == event.eEventId)) {
        // Get the text of the result.
        pResult = event.RecoResult();
        BYTE bDisplayAttr;
        hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                              &pwszCoMemText, &bDisplayAttr);
    }

    if (SUCCEEDED(hr) && pResult) {
        // Speak the result back locally.
        m_cpLocalVoice->Speak(L"I think the person on the phone said",
                              SPF_ASYNC, 0);
        m_cpLocalVoice->Speak(pwszCoMemText, SPF_ASYNC, 0);
        m_cpLocalVoice->Speak(L"when he said", SPF_ASYNC, 0);

        // Get the audio so that the local voice can speak it back.
        CComPtr<ISpStreamFormat> cpStreamFormat;
        HRESULT hrAudio = pResult->GetAudio(0, 0, &cpStreamFormat);
        if (SUCCEEDED(hrAudio)) {
            m_cpLocalVoice->SpeakStream(cpStreamFormat, SPF_ASYNC, 0);
        } else {
            m_cpLocalVoice->Speak(L"no audio was available", SPF_ASYNC, 0);
        }
    }

    // Stop listening.
    if (SUCCEEDED(hr))
        hr = m_cpDictGrammar->SetDictationState(SPRS_INACTIVE);

    // Close the audio input so that we can open the audio output
    // (half-duplex device).
    if (SUCCEEDED(hr))
        hr = m_cpMMSysAudioIn->SetState(SPAS_CLOSED, 0);

    // The caller may have hung up on us, in which case we don't want to do
    // the following.
    if (m_pCall) {
        if (pResult) {
            // There's a result to play back.
            if (SUCCEEDED(hr))
                hr = m_cpOutgoingVoice->Speak(L"I think I heard you say", 0, 0);
            if (SUCCEEDED(hr))
                hr = m_cpOutgoingVoice->Speak(pwszCoMemText, 0, 0);
            if (SUCCEEDED(hr))
                hr = m_cpOutgoingVoice->Speak(L"when you said", 0, 0);
            if (SUCCEEDED(hr))
                hr = pResult->SpeakAudio(NULL, 0, NULL, NULL);
        } else {
            // The caller didn't say anything.
            if (SUCCEEDED(hr))
                hr = m_cpOutgoingVoice->Speak(
                        L"I don't believe you said anything!", 0, 0);
        }
        if (SUCCEEDED(hr))
            hr = m_cpOutgoingVoice->Speak(L"OK bye now", 0, 0);
    } else {
        m_cpLocalVoice->Speak(L"Prematurely terminated call", 0, 0);
    }

    if (pwszCoMemText)
        ::CoTaskMemFree(pwszCoMemText);

    return m_pCall ? hr : TAPI_E_DROPPED;
} /* COperator::HandleCall */
// Start recognition
void RSpeechRecognition::Listen() throw(RComException)
{
    USES_CONVERSION;
    HRESULT hr;
    CSpEvent event;

    // Block until the recognition finishes.
    hr = this->RecoCtxt->WaitForNotifyEvent(INFINITE);
    if (FAILED(hr))
        throw RComException(hr, "WaitForNotifyEvent failed");

    hr = event.GetFrom(this->RecoCtxt);
    if (FAILED(hr))
        throw RComException(hr, "GetFrom failed");

    // The recognition result.
    ISpRecoResult* result;
    result = event.RecoResult();

    // Get the recognized string.
    CSpDynamicString dstrText;
    hr = result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE,
                         &dstrText, NULL);
    if (FAILED(hr))
        throw RComException(hr, "failed to get the recognized text");

    this->ResultString = W2A(dstrText);

    // When XML was used for recognition, collect the assigned
    // property/value results.
    SPPHRASE *pPhrase;
    event.RecoResult()->GetPhrase(&pPhrase);
    const SPPHRASEPROPERTY *pProp;
    for (pProp = pPhrase->pProperties; pProp; pProp = pProp->pNextSibling) {
        this->ResultMap[W2A(pProp->pszName)] = W2A(pProp->pszValue);
    }
    CoTaskMemFree(pPhrase);

    /*
    // For debugging: save the captured audio to a wav file.
    {
        CComPtr<ISpStreamFormat> ResultStream;
        CComPtr<ISpVoice> voice;
        hr = this->RecoCtxt->GetVoice(&voice);
        if (FAILED(hr)) throw RComException(hr, "GetVoice failed");

        hr = event.RecoResult()->GetAudio(0, 0, &ResultStream);
        if (FAILED(hr)) throw RComException(hr, "GetAudio failed");

        {
            CComPtr<ISpStream> cpWavStream;
            CComPtr<ISpStreamFormat> cpOldStream;
            CSpStreamFormat OriginalFmt;
            voice->GetOutputStream(&cpOldStream);
            OriginalFmt.AssignFormat(cpOldStream);
            hr = SPBindToFile(
                    L"C:\\Users\\rti\\Desktop\\naichichi\\test\\output.wav",
                    SPFM_CREATE_ALWAYS, &cpWavStream,
                    &OriginalFmt.FormatId(), OriginalFmt.WaveFormatExPtr());
            voice->SetOutput(cpWavStream, TRUE);
        }
    }
    */
}
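// Note: USES_CONVERSION and the W2A()/A2W() macros used above come from
// ATL's <atlconv.h>. They convert between wide and narrow strings using
// stack storage, so a pointer returned by W2A() must not outlive the
// enclosing scope; assigning it to a std::string, as done here, is safe.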
void RSpeechRecognition::CallbackRule()
{
    USES_CONVERSION;
    HRESULT hr;

    CSpEvent ruleEvent;
    hr = ruleEvent.GetFrom(this->RuleRecoCtxt);
    if (FAILED(hr))
        return;

    // The recognition result.
    ISpRecoResult* result;
    result = ruleEvent.RecoResult();

    // Get the recognized string.
    CSpDynamicString dstrText;
    hr = result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE,
                         &dstrText, NULL);
    if (FAILED(hr))
        return;
    this->ResultString = W2A(dstrText);

    // Run the audio portion of the rule-based recognition through the
    // dictation engine once more. This filters out excessive matches.
    {
        CComPtr<ISpStreamFormat> resultStream;
        hr = result->GetAudio(0, 0, &resultStream);
        if (FAILED(hr))
            return;

        // Read the input from the retained audio.
        hr = this->DictationEngine->SetInput(resultStream, TRUE);
        if (FAILED(hr))
            return;

        hr = this->DictationGrammar->SetDictationState(SPRS_ACTIVE);
        if (FAILED(hr))
            return;

        hr = this->DictationRecoCtxt->WaitForNotifyEvent(10000); // 10-second timeout
        if (FAILED(hr))
            return;

        hr = this->DictationGrammar->SetDictationState(SPRS_INACTIVE);
        if (FAILED(hr))
            return;

        CSpEvent tempevent;
        hr = tempevent.GetFrom(this->DictationRecoCtxt);
        if (FAILED(hr))
            return;

        // The dictation result.
        ISpRecoResult* tempresult;
        tempresult = tempevent.RecoResult();

        // Get the dictated string.
        CSpDynamicString tempdstrText;
        hr = tempresult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE,
                                 &tempdstrText, NULL);
        if (FAILED(hr))
            return;
        std::string dictationString = W2A(tempdstrText);

        // Narrow the result down with the dictation filter word.
        if (dictationString.find(this->DicticationFilterWord) == std::string::npos) {
            // Rejected by the filter.
            this->FlagCleanup();
            return;
        }
    }

    // When XML was used for recognition, collect the assigned
    // property/value results.
    SPPHRASE *pPhrase;
    hr = result->GetPhrase(&pPhrase);
    if (FAILED(hr))
        return;

    this->ResultMap.clear();
    const SPPHRASEPROPERTY *pProp;
    for (pProp = pPhrase->pProperties; pProp; pProp = pProp->pNextSibling) {
        this->ResultMap[W2A(pProp->pszName)] = W2A(pProp->pszValue);
    }
    CoTaskMemFree(pPhrase);

    // Notify that a command was recognized.
    SendMessage(this->CallbackWindowHandle, this->CallbackWindowMesage, 0, 0);

    this->FlagCleanup();
}
// Run the audio portion of the rule-based recognition result through the
// dictation engine once more. This filters out excessive matches.
xreturn::r<std::string> Recognition_SAPI::convertDictation(
        ISpRecoResult* result, const std::string& ruleName)
{
    HRESULT hr;
    _USE_WINDOWS_ENCODING;

    CComPtr<ISpStreamFormat> resultStream;
    {
        hr = result->GetAudio(0, 1, &resultStream);
        if (FAILED(hr)) return xreturn::windowsError(hr);

        // Read the input from the retained audio.
        hr = this->DictationEngine->SetInput(resultStream, TRUE);
        if (FAILED(hr)) return xreturn::windowsError(hr);

        hr = this->DictationGrammar->SetRuleState(
                ruleName.empty() ? NULL : _A2W(ruleName.c_str()),
                NULL, SPRS_ACTIVE);
        if (FAILED(hr)) return xreturn::windowsError(hr);

        hr = this->DictationRecoCtxt->WaitForNotifyEvent(2000); // 2-second timeout
        if (FAILED(hr)) return xreturn::windowsError(hr);

        hr = this->DictationGrammar->SetRuleState(NULL, NULL, SPRS_INACTIVE);
        if (FAILED(hr)) return xreturn::windowsError(hr);

        {
            CSpEvent tempevent;
            hr = tempevent.GetFrom(this->DictationRecoCtxt);
            if (FAILED(hr)) return xreturn::windowsError(hr);

            if (tempevent.eEventId == SPEI_RECOGNITION) {
                // The dictation result.
                ISpRecoResult* tempresult;
                {
                    tempresult = tempevent.RecoResult();

                    // Get the dictated string.
                    CSpDynamicString tempdstrText;
                    hr = tempresult->GetText(SP_GETWHOLEPHRASE,
                                             SP_GETWHOLEPHRASE, TRUE,
                                             &tempdstrText, NULL);
                    if (FAILED(hr)) return xreturn::windowsError(hr);

                    SPPHRASE *pPhrase;
                    hr = tempresult->GetPhrase(&pPhrase);
                    if (FAILED(hr)) return xreturn::windowsError(hr);

                    double confidence = pPhrase->pElements->SREngineConfidence;
                    CoTaskMemFree(pPhrase);

                    std::string ret = _W2A(tempdstrText);
                    this->PoolMainWindow->SyncInvokeLog(std::string()
                            + "dictation filter: " + ret + " "
                            + num2str(confidence), LOG_LEVEL_DEBUG);

                    if (confidence <= 0.60) {
                        this->PoolMainWindow->SyncInvokeLog(std::string()
                                + "rejected by the dictation filter",
                                LOG_LEVEL_DEBUG);
                        return "";
                    }
                    return ret;
                }
            }
        }
    }
    // Unknown: no recognition event arrived.
    return "";
}