//请求指定语言的语音 bool SpBox::RequireLanguage(LPCWSTR langid) { IEnumSpObjectTokens* pEnum; //语音总选择 ISpObjectToken* pVoiceToken; //当前语音选择 ULONG ulCount = 0; //选择个数 WCHAR* pVoiceLang; //语音LangId WCHAR* pVoiceText; //语音说明文字 HRESULT hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &pEnum); if (SUCCEEDED(hr)) { hr = pEnum->GetCount(&ulCount); } while (SUCCEEDED(hr) && ulCount--){ if (SUCCEEDED(hr)) pEnum->Next(1, &pVoiceToken, 0); if (SUCCEEDED(hr)) hr = SpGetLanguage(pVoiceToken, &pVoiceLang); if (SUCCEEDED(hr) && !_wcsnicmp(pVoiceLang, langid, 3)) { Token = pVoiceToken; VoiceObj->SetVoice(Token); SpGetDescription(Token, &pVoiceText, 0); Description = pVoiceText; return true; } } return false; }
bool EnumerateVoices() { HRESULT comResult = S_OK; ISpObjectTokenCategory * comTokenCategory = NULL; IEnumSpObjectTokens * comVoices = NULL; ULONG comVoicesCount = 0; // Init speech api comResult = ::CoCreateInstance( CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, IID_ISpVoice, (LPVOID*)&comVoice); wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to instantiate speech API")); // Generate enumeration of voices comResult = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory); wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to instantiate a TokenCategory")); comResult = comTokenCategory->SetId(SPCAT_VOICES, false); wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to to set location to find the installed voices.") _T("Likely the key that I am looking for does not exist and thus ") _T("there are no installed voices on this system.")); comResult = comTokenCategory->EnumTokens(NULL, NULL, &comVoices); wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to enumerate the installed voices. Check that this system") _T(" has voices installed.")); comResult = comVoices->GetCount(&comVoicesCount); wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable get a count of the installed voices.")); while( comVoicesCount > 0 ) { ISpObjectToken * comAVoice = NULL; comVoices->Next(1, &comAVoice, NULL); // retrieve just one LPWSTR id = NULL; comAVoice->GetStringValue(NULL, &id); size_t idlength = wcslen(id); wxLogDebug(_T(" Got string of length %ld:"), idlength); for(size_t i = 0; i < idlength; i++) { wxLogDebug(_T(" %04X"), id[i]); } voices.push_back(VoiceData(wxString(id, wxMBConvUTF16(), wcslen(id)), comAVoice)); #ifdef __WXDEBUG__ enumerateObjectToken(comAVoice); #endif comAVoice->Release(); comVoicesCount--; } comTokenCategory->Release(); return true; }
void Speech::selectOutputDevice() { HRESULT hr = S_OK; ISpObjectToken* cpAudioOutToken; IEnumSpObjectTokens* cpEnum; ULONG ulCount = 0; if (FAILED(CoInitialize(NULL))) return; if (SUCCEEDED (hr)) { hr = SpEnumTokens( SPCAT_AUDIOOUT, NULL, NULL, &cpEnum); } if (SUCCEEDED (hr)) { hr = cpEnum->GetCount( &ulCount); } std::cout << "Select Audio Device:" << std::endl; unsigned int i=0; vector<wchar_t*> deviceIds; while (SUCCEEDED(hr) && i < ulCount) { i++; if (SUCCEEDED (hr)) { hr = cpEnum->Next( 1, &cpAudioOutToken, NULL ); } wchar_t* deviceName = nullptr; SpGetDescription(cpAudioOutToken, &deviceName); wchar_t* deviceId = nullptr; hr = cpAudioOutToken->GetStringValue(L"DeviceId", &deviceId); deviceIds.push_back(deviceId); std::wcout << "(" << i << ") " << deviceName << std::endl; } cpAudioOutToken->Release(); cpEnum->Release(); CoUninitialize(); i=5; while(i < 1 || i > ulCount) { cin >> i; if(i < 1 || i > ulCount) { cout << "Invalid Input." << endl; cin.clear(); cin.ignore(10000, '\n'); } } outputDevice = deviceIds.at(i-1); return; }
void Speech::speechOutput(const char* str) { ISpVoice* pVoice = NULL; ISpObjectToken* cpAudioOutToken; IEnumSpObjectTokens* cpEnum; ULONG ulCount = 0; if (FAILED(::CoInitialize(NULL))) return; HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&pVoice); if (SUCCEEDED (hr)) { hr = SpEnumTokens( SPCAT_AUDIOOUT, NULL, NULL, &cpEnum); } if (SUCCEEDED (hr)) { hr = cpEnum->GetCount( &ulCount); } while (SUCCEEDED(hr) && ulCount--) { hr = cpEnum->Next( 1, &cpAudioOutToken, NULL ); wchar_t* deviceId = nullptr; if(SUCCEEDED(hr)) { hr = cpAudioOutToken->GetStringValue(L"DeviceId", &deviceId); if(wcscmp(outputDevice, deviceId) == 0) break; } } if (SUCCEEDED (hr)) { hr = pVoice->SetOutput( cpAudioOutToken, TRUE ); } if(SUCCEEDED(hr)) { const size_t cSize = strlen(str)+1; wchar_t* wc = new wchar_t[cSize]; size_t ret; mbstowcs_s (&ret, wc, cSize, str, cSize); hr = pVoice->Speak(wc, NULL, NULL); pVoice->Release(); pVoice = NULL; } cpEnum->Release(); cpAudioOutToken->Release(); ::CoUninitialize(); }
// Log every SAPI object token registered under the given category: one
// description line per token, then a dump of the token's data key.
void DumpCategory(LPCWSTR category) {
    IEnumSpObjectTokens *pTokens = nullptr;
    HRESULT hr = SpEnumTokens(category, nullptr, nullptr, &pTokens);
    if (SPERR_NOT_FOUND == hr) {
        // An absent category registry key is a normal, non-error condition.
        LOG(L" None found.");
        return;
    }
    if (FAILED(hr)) {
        ERR(L"SpEnumTokens failed: hr = 0x%08x", hr);
        return;
    }
    ReleaseOnExit releaseTokens(pTokens);

    ULONG count = 0;
    hr = pTokens->GetCount(&count);
    if (FAILED(hr)) {
        ERR(L"IEnumSpObjectTokens::GetCount failed: hr = 0x%08x", hr);
        return;
    }

    for (ULONG index = 0; index < count; index++) {
        ISpObjectToken *pToken = nullptr;
        hr = pTokens->Next(1, &pToken, nullptr);
        if (FAILED(hr)) {
            ERR(L"IEnumSpObjectTokens::Next failed: hr = 0x%08x", hr);
            return;
        }
        ReleaseOnExit releaseToken(pToken);

        LPWSTR text = nullptr;
        hr = SpGetDescription(pToken, &text);
        if (FAILED(hr)) {
            // A token without a readable description is skipped, not fatal.
            ERR(L"SpGetDescription failed: hr = 0x%08x", hr);
            continue;
        }
        CoTaskMemFreeOnExit freeText(text);

        LOG(L" #%u: %s", index + 1, text);
        EnumDataKey(2, pToken);
    }
}
// Enumerate the installed SAPI voices and return their token ids as UTF-8
// strings. Creates (and releases) the global Voice_device to verify that
// the speech API is available. Returns an empty vector on any failure.
SCP_vector<SCP_string> speech_enumerate_voices()
{
#ifdef _WIN32
	HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&Voice_device);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	// This code is mostly copied from wxLauncher
	ISpObjectTokenCategory * comTokenCategory = NULL;
	IEnumSpObjectTokens * comVoices = NULL;
	ULONG comVoicesCount = 0;
	SCP_vector<SCP_string> voices;

	// Generate enumeration of voices
	hr = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	if (SUCCEEDED(hr)) {
		hr = comTokenCategory->SetId(SPCAT_VOICES, false);
	}
	if (SUCCEEDED(hr)) {
		hr = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	}
	if (SUCCEEDED(hr)) {
		hr = comVoices->GetCount(&comVoicesCount);
	}

	if (SUCCEEDED(hr)) {
		while (comVoicesCount > 0) {
			ISpObjectToken * comAVoice = NULL;
			hr = comVoices->Next(1, &comAVoice, NULL); // retrieve just one
			// Bug fix: a failed Next() was ignored, which would dereference
			// a null token below.
			if (FAILED(hr) || comAVoice == NULL) {
				break;
			}

			LPWSTR id = NULL;
			if (SUCCEEDED(comAVoice->GetStringValue(NULL, &id)) && id != NULL) {
				auto idlength = wcslen(id);
				// Convert the UTF-16 token id to UTF-8 for SCP_string.
				auto buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, nullptr, 0, nullptr, nullptr);
				if (buffer_size > 0) {
					SCP_string voiceName;
					voiceName.resize(buffer_size);
					buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, &voiceName[0], buffer_size, nullptr, nullptr);
					voices.push_back(voiceName);
				}
				CoTaskMemFree(id);
			}
			comAVoice->Release();
			comVoicesCount--;
		}
	}

	// Unified cleanup. Previously every early return leaked Voice_device and
	// comTokenCategory, and comVoices was never released at all.
	if (comVoices != NULL) {
		comVoices->Release();
	}
	if (comTokenCategory != NULL) {
		comTokenCategory->Release();
	}
	Voice_device->Release();
	return voices;
#else
	STUB_FUNCTION;

	return SCP_vector<SCP_string>();
#endif
}
// VLC text-renderer "Create" callback: allocate filter state, create the
// SAPI voice, optionally pick a specific voice via the "sapi-voice"
// integer variable, and route speech to the default audio output.
// Note: failure to create/configure the voice is deliberately non-fatal;
// the filter is still installed and RenderText must tolerate a null voice.
static int Create (vlc_object_t *p_this)
{
    filter_t *p_filter = (filter_t *)p_this;
    filter_sys_t *p_sys;
    HRESULT hr;

    p_filter->p_sys = p_sys = (filter_sys_t*) malloc(sizeof(filter_sys_t));
    if (!p_sys)
        return VLC_ENOMEM;

    if (TryEnterMTA(p_this))
        goto error;

    p_sys->cpVoice = nullptr;
    hr = CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_INPROC_SERVER, IID_ISpVoice, (void**) &p_sys->cpVoice);
    if (SUCCEEDED(hr)) {
        ISpObjectToken*      cpVoiceToken = nullptr;
        IEnumSpObjectTokens* cpEnum = nullptr;
        ULONG ulCount = 0;

        hr = SpEnumTokens(SPCAT_VOICES, nullptr, nullptr, &cpEnum);
        if (SUCCEEDED(hr))
        {
            // Get the number of voices.
            hr = cpEnum->GetCount(&ulCount);
            if (SUCCEEDED (hr))
            {
                int voiceIndex = var_InheritInteger(p_this, "sapi-voice");
                if (voiceIndex > -1)
                {
                    // Bug fix: compare in the unsigned domain only after the
                    // negative case is excluded (was a signed/unsigned
                    // mismatch between int and ULONG).
                    if ((ULONG)voiceIndex < ulCount)
                    {
                        hr = cpEnum->Item(voiceIndex, &cpVoiceToken);
                        if (SUCCEEDED(hr)) {
                            hr = p_sys->cpVoice->SetVoice(cpVoiceToken);
                            if (SUCCEEDED(hr)) {
                                msg_Dbg(p_this, "Selected voice %d", voiceIndex);
                            }
                            else {
                                msg_Err(p_this, "Failed to set voice %d", voiceIndex);
                            }
                            cpVoiceToken->Release();
                            cpVoiceToken = nullptr;
                        }
                    }
                    else
                        msg_Err(p_this, "Voice index exceeds available count");
                }
            }
            cpEnum->Release();
            cpEnum = nullptr;
        }

        if (SUCCEEDED(hr)) {
            hr = p_sys->cpVoice->SetOutput(nullptr, TRUE);
        }
    }
    else
    {
        msg_Err(p_filter, "Could not create SpVoice");
    }

    LeaveMTA();

    p_filter->pf_render_text = RenderText;
    return VLC_SUCCESS;

error:
    p_filter->p_sys = NULL;   // bug fix: don't leave a dangling pointer
    free(p_sys);
    return VLC_EGENERIC;
}
// End-to-end SAPI 5.x exercise routine. It chains several independent
// samples: (1) microphone recognition with retained audio echoed back via
// TTS, (2) TTS of "Hello World" into c:\ttstemp.wav, (3) a programmatic
// "Travel" grammar built with ISpGrammarBuilder, (4) recognition from
// Test.wav, and (5) enumeration of female voices speaking XML markup.
// NOTE(review): 'hr' is reused across all sections, so a failure in one
// section silently suppresses the following ones.
void Sound::test() {
	ISpVoice * pVoice = NULL;
	ISpObjectToken* pVoiceToken=nullptr;
	IEnumSpObjectTokens* pEnum;
	ULONG ulCount = 0;
	// COM must be initialized before any SAPI call.
	if (FAILED(::CoInitialize(NULL))) {
		return;
	}
	HRESULT hr = S_OK;
	// Find the best matching installed en-us recognizer.
	CComPtr<ISpObjectToken> cpRecognizerToken;
	if (SUCCEEDED(hr)) {
		hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken);
	}
	// Create the in-process recognizer and immediately set its state to inactive.
	CComPtr<ISpRecognizer> cpRecognizer;
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
	}
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
	}
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
	}
	// Create a new recognition context from the recognizer.
	CComPtr<ISpRecoContext> cpContext;
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->CreateRecoContext(&cpContext);
	}
	// Subscribe to the speech recognition event and end stream event.
	if (SUCCEEDED(hr)) {
		ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION);
		hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
	}
	// Establish a Win32 event to signal when speech events are available.
	HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;
	if (SUCCEEDED(hr)) {
		hr = cpContext->SetNotifyWin32Event();
	}
	if (SUCCEEDED(hr)) {
		hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();
		if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent) {
			// Notification handle unsupported.
			hr = E_NOINTERFACE;
		}
	}
	// Initialize an audio object to use the default audio input of the system and set the recognizer to use it.
	CComPtr<ISpAudio> cpAudioIn;
	if (SUCCEEDED(hr)) {
		hr = cpAudioIn.CoCreateInstance(CLSID_SpMMAudioIn);
	}
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetInput(cpAudioIn, TRUE);
	}
	// Populate a WAVEFORMATEX struct with our desired output audio format. information.
	WAVEFORMATEX* pWfexCoMemRetainedAudioFormat = NULL;
	GUID guidRetainedAudioFormat = GUID_NULL;
	if (SUCCEEDED(hr)) {
		hr = SpConvertStreamFormatEnum(SPSF_16kHz16BitMono, &guidRetainedAudioFormat, &pWfexCoMemRetainedAudioFormat);
	}
	// Instruct the recognizer to retain the audio from its recognition results.
	if (SUCCEEDED(hr)) {
		hr = cpContext->SetAudioOptions(SPAO_RETAIN_AUDIO, &guidRetainedAudioFormat, pWfexCoMemRetainedAudioFormat);
	}
	// The format struct is CoTaskMem-allocated by SpConvertStreamFormatEnum.
	if (NULL != pWfexCoMemRetainedAudioFormat) {
		CoTaskMemFree(pWfexCoMemRetainedAudioFormat);
	}
	// Create a new grammar and load an SRGS grammar from file.
	CComPtr<ISpRecoGrammar> cpGrammar;
	if (SUCCEEDED(hr)) {
		hr = cpContext->CreateGrammar(0, &cpGrammar);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);
	}
	// Set all top-level rules in the new grammar to the active state.
	if (SUCCEEDED(hr)) {
		hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
	}
	// Set the recognizer state to active to begin recognition.
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
	}
	// Establish a separate Win32 event to signal the event loop exit.
	HANDLE hExitEvent = CreateEventW(NULL, FALSE, FALSE, NULL);
	// Collect the events listened for to pump the speech event loop.
	HANDLE rghEvents[] = { hSpeechNotifyEvent, hExitEvent };
	// Speech recognition event loop.
	BOOL fContinue = TRUE;
	while (fContinue && SUCCEEDED(hr)) {
		// Wait for either a speech event or an exit event, with a 15 second timeout.
		DWORD dwMessage = WaitForMultipleObjects(sp_countof(rghEvents), rghEvents, FALSE, 15000);
		switch (dwMessage) {
			// With the WaitForMultipleObjects call above, WAIT_OBJECT_0 is a speech event from hSpeechNotifyEvent.
			case WAIT_OBJECT_0: {
				// Sequentially grab the available speech events from the speech event queue.
				CSpEvent spevent;
				while (S_OK == spevent.GetFrom(cpContext)) {
					switch (spevent.eEventId) {
						case SPEI_RECOGNITION: {
							// Retrieve the recognition result and output the text of that result.
							ISpRecoResult* pResult = spevent.RecoResult();
							LPWSTR pszCoMemResultText = NULL;
							hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pszCoMemResultText, NULL);
							if (SUCCEEDED(hr)) {
								wprintf(L"Recognition event received, text=\"%s\"\r\n", pszCoMemResultText);
							}
							// Also retrieve the retained audio we requested.
							CComPtr<ISpStreamFormat> cpRetainedAudio;
							if (SUCCEEDED(hr)) {
								hr = pResult->GetAudio(0, 0, &cpRetainedAudio);
							}
							// To demonstrate, we'll speak the retained audio back using ISpVoice.
							CComPtr<ISpVoice> cpVoice;
							if (SUCCEEDED(hr)) {
								hr = cpVoice.CoCreateInstance(CLSID_SpVoice);
							}
							if (SUCCEEDED(hr)) {
								hr = cpVoice->SpeakStream(cpRetainedAudio, SPF_DEFAULT, 0);
							}
							if (NULL != pszCoMemResultText) {
								CoTaskMemFree(pszCoMemResultText);
							}
							break;
						}
					}
				}
				break;
			}
			case WAIT_OBJECT_0 + 1:
			case WAIT_TIMEOUT: {
				// Exit event or timeout; discontinue the speech loop.
				fContinue = FALSE;
				//break;
			}
		}
	}
	CoUninitialize();
	// ---- Section 2: synchronous TTS into a .wav file ----
	// NOTE(review): COM was uninitialized just above, yet SAPI objects are
	// created below; this only works if another CoInitialize is outstanding
	// on this thread — confirm against the caller.
	CComPtr <ISpVoice> cpVoice;
	CComPtr <ISpStream> cpStream;
	CSpStreamFormat cAudioFmt;
	//Create a SAPI Voice
	hr = cpVoice.CoCreateInstance(CLSID_SpVoice);
	//Set the audio format
	if (SUCCEEDED(hr)) {
		hr = cAudioFmt.AssignFormat(SPSF_22kHz16BitMono);
	}
	//Call SPBindToFile, a SAPI helper method, to bind the audio stream to the file
	if (SUCCEEDED(hr)) {
		hr = SPBindToFile(L"c:\\ttstemp.wav", SPFM_CREATE_ALWAYS, &cpStream, &cAudioFmt.FormatId(), cAudioFmt.WaveFormatExPtr());
	}
	//set the output to cpStream so that the output audio data will be stored in cpStream
	if (SUCCEEDED(hr)) {
		hr = cpVoice->SetOutput(cpStream, TRUE);
	}
	//Speak the text "hello world" synchronously
	if (SUCCEEDED(hr)) {
		hr = cpVoice->Speak(L"Hello World", SPF_DEFAULT, NULL);
	}
	//close the stream
	if (SUCCEEDED(hr)) {
		hr = cpStream->Close();
	}
	//Release the stream and voice object
	cpStream.Release();
	cpVoice.Release();
	// ---- Section 3: build a grammar programmatically ----
	CComPtr<ISpGrammarBuilder> cpGrammarBuilder;
	SPSTATEHANDLE hStateTravel;
	// Create (if rule does not already exist)
	// top-level Rule, defaulting to Active.
	// NOTE(review): cpGrammarBuilder is never CoCreated/assigned before this
	// call, so this dereferences a null CComPtr — confirm intended source.
	hr = cpGrammarBuilder->GetRule(L"Travel", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateTravel);
	// Approach 1: List all possible phrases.
	// This is the most intuitive approach, and it does not sacrifice efficiency
	// because the grammar builder will merge shared sub-phrases when possible.
	// There is only one root state, hStateTravel, and the terminal NULL state,
	// and there are six unique transitions between root state and NULL state.
	/* XML Approximation:
	<rule id="Travel">
	<item> fly to Seattle </item>
	<item> fly to New York </item>
	<item> fly to Washington DC </item>
	<item> drive to Seattle </item>
	<item> drive to New York </item>
	<item> drive to Washington DC </item>
	</rule>
	*/
	// Create set of peer phrases, each containing complete phrase.
	// Note: the word delimiter is set as " ", so that the text we
	// attach to the transition can be multiple words (for example,
	// "fly to Seattle" is implicitly "fly" + "to" + "Seattle"):
	if (SUCCEEDED(hr)) {
		hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to New York", L" ", SPWT_LEXICAL, 1, NULL);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to New York", L" ", SPWT_LEXICAL, 1, NULL);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
	}
	// ---- Section 4: recognition again, now from a .wav file ----
	// Find the best matching installed en-US recognizer.
	//CComPtr<ISpObjectToken> cpRecognizerToken;
	if (SUCCEEDED(hr)) {
		hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken);
	}
	// Create the in-process recognizer and immediately set its state to inactive.
	//CComPtr<ISpRecognizer> cpRecognizer;
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
	}
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
	}
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
	}
	// Create a new recognition context from the recognizer.
	//CComPtr<ISpRecoContext> cpContext;
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->CreateRecoContext(&cpContext);
	}
	// Subscribe to the speech recognition event and end stream event.
	if (SUCCEEDED(hr)) {
		ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM);
		hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
	}
	// Establish a Win32 event to signal when speech events are available.
	//HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;
	if (SUCCEEDED(hr)) {
		hr = cpContext->SetNotifyWin32Event();
	}
	// NOTE(review): SetNotifyWin32Event is called twice in a row here; the
	// second call looks like an accidental duplicate.
	if (SUCCEEDED(hr)) {
		hr = cpContext->SetNotifyWin32Event();
	}
	if (SUCCEEDED(hr)) {
		hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();
		if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent) {
			// Notification handle unsupported
			//hr = SPERR_UNITIALIZED;
		}
	}
	// Set up an audio input stream using a .wav file and set the recognizer's input.
	CComPtr<ISpStream> cpInputStream;
	if (SUCCEEDED(hr)) {
		hr = SPBindToFile(L"Test.wav", SPFM_OPEN_READONLY, &cpInputStream);
	}
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetInput(cpInputStream, TRUE);
	}
	// Create a new grammar and load an SRGS grammar from file.
	//CComPtr<ISpRecoGrammar> cpGrammar;
	if (SUCCEEDED(hr)) {
		hr = cpContext->CreateGrammar(0, &cpGrammar);
	}
	if (SUCCEEDED(hr)) {
		hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);
	}
	// Set all top-level rules in the new grammar to the active state.
	if (SUCCEEDED(hr)) {
		hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
	}
	// Finally, set the recognizer state to active to begin recognition.
	if (SUCCEEDED(hr)) {
		hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
	}
	// ---- Section 5: enumerate female voices and speak XML markup ----
	hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&pVoice);
	if (SUCCEEDED(hr)) {
		hr = SpEnumTokens(SPCAT_VOICES, L"Gender=Female", NULL, &pEnum);
		if (SUCCEEDED(hr)) {
			// Get the number of voices.
			hr = pEnum->GetCount(&ulCount);
		}
		// Obtain a list of available voice tokens, set
		// the voice to the token, and call Speak.
		while (SUCCEEDED(hr) && ulCount--) {
			// Release the token kept from the previous iteration.
			if (pVoiceToken != nullptr) {
				pVoiceToken->Release();
			}
			if (SUCCEEDED(hr)) {
				hr = pEnum->Next(1, &pVoiceToken, NULL);
			}
			if (SUCCEEDED(hr)) {
				hr = pVoice->SetVoice(pVoiceToken);
			}
			if (SUCCEEDED(hr)) {
				// NOTE(review): the wrapped string 's' is built but never
				// used; Speak() is passed the bare 'xml' fragment instead.
				wchar_t* start = L"<?xml version=\"1.0\" encoding=\"ISO - 8859 - 1\"?><speak version = \"1.0\" xmlns = \"http://www.w3.org/2001/10/synthesis\" xml:lang = \"en-US\">";
				wchar_t* end = L"</speak>";
				const wchar_t *xml = L"<voice required = \"Gender=Male\"> hi! <prosody pitch=\"fast\"> This is low pitch. </prosody><prosody volume=\"x - loud\"> This is extra loud volume. </prosody>";
				wstring s = start;
				s += xml;
				s += end;
				hr = pVoice->Speak(xml, SPF_IS_XML| SPF_ASYNC, 0);
				//hr = pVoice->Speak(L"How are you?", SPF_DEFAULT, NULL);
			}
		}
		/*
		if (SUCCEEDED(hr)) {
			hr = pEnum->Next(1, &pVoiceToken, NULL);
			if (SUCCEEDED(hr)) {
				hr = pVoice->SetVoice(pVoiceToken);
				// Set the output to the default audio device.
				if (SUCCEEDED(hr)) {
					hr = pVoice->SetOutput(NULL, TRUE);
					if (SUCCEEDED(hr)) {
						hr = pVoice->Speak(L"Hello, world!", SPF_DEFAULT, 0);
					}
				}
			}
		}
		*/
		pVoice->Release();
	}
	::CoUninitialize();
}
// Kinect + SAPI speech-recognition demo. Captures the Kinect microphone
// array through the echo-cancelling DMO, feeds it to an in-process SAPI
// recognizer loaded with a Japanese SRGS grammar, and fills an OpenCV
// window with red/green/blue when the words "あか"/"みどり"/"あお" are
// recognized; "おわり" (or Esc) exits.
int _tmain( int argc, _TCHAR* argv[] )
{
	cv::setUseOptimized( true );

	// Create and initialize the Kinect sensor (audio only).
	INuiSensor* pSensor;
	HRESULT hResult = S_OK;
	hResult = NuiCreateSensorByIndex( 0, &pSensor );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiCreateSensorByIndex" << std::endl;
		return -1;
	}
	hResult = pSensor->NuiInitialize( NUI_INITIALIZE_FLAG_USES_AUDIO );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiInitialize" << std::endl;
		return -1;
	}

	// Initialize the audio stream (InitializeAudioStream).
	std::cout << "InitializeAudioStream" << std::endl;
	INuiAudioBeam* pNuiAudioSource;
	hResult = pSensor->NuiGetAudioSource( &pNuiAudioSource );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiGetAudioSource" << std::endl;
		return -1;
	}
	// The audio beam exposes both a DMO (for audio data) and a property
	// store (for DMO configuration).
	IMediaObject* pMediaObject = nullptr;
	IPropertyStore* pPropertyStore = nullptr;
	pNuiAudioSource->QueryInterface( IID_IMediaObject, reinterpret_cast<void**>( &pMediaObject ) );
	pNuiAudioSource->QueryInterface( IID_IPropertyStore, reinterpret_cast<void**>( &pPropertyStore ) );
	// Set the AEC DMO system mode to 4 — NOTE(review): presumably
	// OPTIBEAM_ARRAY_ONLY (mic-array beamforming without AEC); confirm
	// against the Voice Capture DSP documentation.
	PROPVARIANT propvariant;
	PropVariantInit( &propvariant );
	propvariant.vt = VT_I4;
	propvariant.lVal = static_cast<LONG>( 4 );
	pPropertyStore->SetValue( MFPKEY_WMAAECMA_SYSTEM_MODE, propvariant );
	PropVariantClear( &propvariant );
	// Describe the PCM format the DMO should output (constants defined
	// elsewhere in this project).
	WAVEFORMATEX waveFormat = { AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0 };
	DMO_MEDIA_TYPE mediaType = { 0 };
	MoInitMediaType( &mediaType, sizeof( WAVEFORMATEX ) );
	mediaType.majortype = MEDIATYPE_Audio;
	mediaType.subtype = MEDIASUBTYPE_PCM;
	mediaType.lSampleSize = 0;
	mediaType.bFixedSizeSamples = true;
	mediaType.bTemporalCompression = false;
	mediaType.formattype = FORMAT_WaveFormatEx;
	memcpy( mediaType.pbFormat, &waveFormat, sizeof( WAVEFORMATEX ) );
	pMediaObject->SetOutputType( 0, &mediaType, 0 );
	// Wrap the DMO in an IStream that the SAPI stream can read from.
	KinectAudioStream* audioStream = new KinectAudioStream( pMediaObject );
	IStream* pStream = nullptr;
	audioStream->QueryInterface( IID_IStream, reinterpret_cast<void**>( &pStream ) );
	CoInitialize( nullptr );
	ISpStream* pSpeechStream = nullptr;
	CoCreateInstance( CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), reinterpret_cast<void**>( &pSpeechStream ) );
	pSpeechStream->SetBaseStream( pStream, SPDFID_WaveFormatEx, &waveFormat );
	MoFreeMediaType( &mediaType );
	pStream->Release();
	pPropertyStore->Release();
	pMediaObject->Release();
	pNuiAudioSource->Release();

	// Create the speech recognizer (CreateSpeechRecognizer).
	std::cout << "CreateSpeechRecognizer" << std::endl;
	ISpRecognizer* pSpeechRecognizer;
	CoCreateInstance( CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), reinterpret_cast<void**>( &pSpeechRecognizer ) );
	pSpeechRecognizer->SetInput( pSpeechStream, false );
	/*
	// If can use ATL, easier to using SpFindBestToken(sphelper.h). When using Professional or more.
	ISpObjectToken* pEngineToken = nullptr;
	SpFindBestToken( SPCAT_RECOGNIZERS, L"Language=411;Kinect=True", NULL, &pEngineToken ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	*/
	///*
	// If can't use ATL, alternative to using SpFIndBestToken(sphelper.h). When using Express.
	const wchar_t* pVendorPreferred = L"VendorPreferred";
	const unsigned long lengthVendorPreferred = static_cast<unsigned long>( wcslen( pVendorPreferred ) );
	unsigned long length;
	ULongAdd( lengthVendorPreferred, 1, &length );
	wchar_t* pAttribsVendorPreferred = new wchar_t[ length ];
	StringCchCopyW( pAttribsVendorPreferred, length, pVendorPreferred );
	ISpObjectTokenCategory* pTokenCategory = nullptr;
	CoCreateInstance( CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, __uuidof(ISpObjectTokenCategory), reinterpret_cast<void**>( &pTokenCategory ) );
	pTokenCategory->SetId( SPCAT_RECOGNIZERS, false );
	IEnumSpObjectTokens* pEnumTokens = nullptr;
	// NOTE(review): the enumerator created here is immediately overwritten
	// by EnumTokens() below, leaking the CLSID_SpMMAudioEnum instance — the
	// CoCreateInstance call looks redundant.
	CoCreateInstance( CLSID_SpMMAudioEnum, nullptr, CLSCTX_ALL, __uuidof(IEnumSpObjectTokens), reinterpret_cast<void**>( &pEnumTokens ) );
	pTokenCategory->EnumTokens( L"Language=411;Kinect=True", pAttribsVendorPreferred, &pEnumTokens ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	delete[] pAttribsVendorPreferred;
	ISpObjectToken* pEngineToken = nullptr;
	pEnumTokens->Next( 1, &pEngineToken, nullptr );
	//*/
	pSpeechRecognizer->SetRecognizer( pEngineToken );
	ISpRecoContext* pSpeechContext;
	pSpeechRecognizer->CreateRecoContext( &pSpeechContext );
	pEngineToken->Release();
	///*
	pTokenCategory->Release();
	pEnumTokens->Release();
	//*/

	// Load the recognition grammar (LoadSpeechGrammar).
	std::cout << "LoadSpeechGrammar" << std::endl;
	ISpRecoGrammar* pSpeechGrammar;
	pSpeechContext->CreateGrammar( 1, &pSpeechGrammar );
	pSpeechGrammar->LoadCmdFromFile( L"SpeechRecognition_Ja.grxml", /*SPLO_STATIC*/SPLO_DYNAMIC ); // http://www.w3.org/TR/speech-grammar/ (UTF-8/CRLF)

	// Start capture and activate recognition.
	audioStream->StartCapture();
	pSpeechGrammar->SetRuleState( nullptr, nullptr, SPRS_ACTIVE );
	pSpeechRecognizer->SetRecoState( SPRST_ACTIVE_ALWAYS );
	pSpeechContext->SetInterest( SPFEI( SPEI_RECOGNITION ), SPFEI( SPEI_RECOGNITION ) );
	pSpeechContext->Resume( 0 );

	// Event handle signalled when speech events are queued.
	HANDLE hSpeechEvent = INVALID_HANDLE_VALUE;
	hSpeechEvent = pSpeechContext->GetNotifyEventHandle();
	HANDLE hEvents[1] = { hSpeechEvent };

	int width = 640;
	int height = 480;
	cv::Mat audioMat = cv::Mat::zeros( height, width, CV_8UC3 );
	cv::namedWindow( "Audio" );
	bool exit = false;

	std::cout << std::endl << "Speech Recognition Start..." << std::endl << std::endl;

	while( 1 ){
		// Wait until a speech event (or window input) is available.
		ResetEvent( hSpeechEvent );
		unsigned long waitObject = MsgWaitForMultipleObjectsEx( ARRAYSIZE( hEvents ), hEvents, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE );
		if( waitObject == WAIT_OBJECT_0 ){
			// Drain the queued speech events.
			const float confidenceThreshold = 0.3f;
			SPEVENT eventStatus;
			unsigned long eventFetch = 0;
			pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			while( eventFetch > 0 ){
				switch( eventStatus.eEventId ){
					// Recognition events (SPEI_HYPOTHESIS: tentative result,
					// SPEI_RECOGNITION: final result).
					case SPEI_HYPOTHESIS:
					case SPEI_RECOGNITION:
						if( eventStatus.elParamType == SPET_LPARAM_IS_OBJECT ){
							// Extract the recognized phrase.
							ISpRecoResult* pRecoResult = reinterpret_cast<ISpRecoResult*>( eventStatus.lParam );
							SPPHRASE* pPhrase = nullptr;
							hResult = pRecoResult->GetPhrase( &pPhrase );
							if( SUCCEEDED( hResult ) ){
								if( ( pPhrase->pProperties != nullptr ) && ( pPhrase->pProperties->pFirstChild != nullptr ) ){
									// Compare against the grammar's phrase tags.
									const SPPHRASEPROPERTY* pSemantic = pPhrase->pProperties->pFirstChild;
									if( pSemantic->SREngineConfidence > confidenceThreshold ){
										if( wcscmp( L"あか", pSemantic->pszValue ) == 0 ){
											// "red"
											std::cout << "あか" << std::endl;
											audioMat = cv::Scalar( 0, 0, 255 );
										}
										else if( wcscmp( L"みどり", pSemantic->pszValue ) == 0 ){
											// "green"
											std::cout << "みどり" << std::endl;
											audioMat = cv::Scalar( 0, 255, 0 );
										}
										else if( wcscmp( L"あお", pSemantic->pszValue ) == 0 ){
											// "blue"
											std::cout << "あお" << std::endl;
											audioMat = cv::Scalar( 255, 0, 0 );
										}
										else if( wcscmp( L"おわり", pSemantic->pszValue ) == 0 ){
											// "end" — request loop exit.
											exit = true;
										}
									}
								}
								CoTaskMemFree( pPhrase );
							}
						}
						break;
					default:
						break;
				}
				pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			}
		}

		// Show the current color.
		cv::imshow( "Audio", audioMat );

		// Loop exit check (Esc key or the "おわり" command).
		if( cv::waitKey( 30 ) == VK_ESCAPE || exit ){
			break;
		}
	}

	// Shutdown.
	audioStream->StopCapture();
	pSpeechRecognizer->SetRecoState( SPRST_INACTIVE );
	CoUninitialize();
	pSensor->NuiShutdown();
	CloseHandle( hSpeechEvent );

	cv::destroyAllWindows();

	return 0;
}