bool EnumerateVoices() {
	HRESULT comResult = S_OK;
	ISpObjectTokenCategory* comTokenCategory = NULL;
	IEnumSpObjectTokens* comVoices = NULL;
	ULONG comVoicesCount = 0;

	// Init speech api
	comResult = ::CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER,
		IID_ISpVoice, (LPVOID*)&comVoice);
	wxCHECK_MSG(SUCCEEDED(comResult), false,
		_T("Unable to instantiate speech API"));

	// Generate enumeration of voices
	comResult = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER,
		IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	wxCHECK_MSG(SUCCEEDED(comResult), false,
		_T("Unable to instantiate a TokenCategory"));

	comResult = comTokenCategory->SetId(SPCAT_VOICES, false);
	wxCHECK_MSG(SUCCEEDED(comResult), false,
		_T("Unable to set the location to find the installed voices. ")
		_T("Likely the key that I am looking for does not exist and thus ")
		_T("there are no installed voices on this system."));

	comResult = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	wxCHECK_MSG(SUCCEEDED(comResult), false,
		_T("Unable to enumerate the installed voices. Check that this system")
		_T(" has voices installed."));

	comResult = comVoices->GetCount(&comVoicesCount);
	wxCHECK_MSG(SUCCEEDED(comResult), false,
		_T("Unable to get a count of the installed voices."));

	while (comVoicesCount > 0) {
		ISpObjectToken* comAVoice = NULL;
		comVoices->Next(1, &comAVoice, NULL); // retrieve just one

		LPWSTR id = NULL;
		comAVoice->GetStringValue(NULL, &id);

		size_t idlength = wcslen(id);
		wxLogDebug(_T(" Got string of length %ld:"), idlength);
		for (size_t i = 0; i < idlength; i++) {
			wxLogDebug(_T(" %04X"), id[i]);
		}

		voices.push_back(VoiceData(wxString(id, wxMBConvUTF16(), wcslen(id)), comAVoice));
		::CoTaskMemFree(id); // GetStringValue() allocates the string with CoTaskMemAlloc()

#ifdef __WXDEBUG__
		enumerateObjectToken(comAVoice);
#endif
		comAVoice->Release();
		comVoicesCount--;
	}
	comVoices->Release();
	comTokenCategory->Release();

	return true;
}
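// Hedged sketch, not part of the original wxLauncher source: EnumerateVoices()
// releases its COM interfaces by hand and still leaks them on the wxCHECK_MSG
// early returns. Assuming ATL's CComPtr is not available, a minimal RAII guard
// like the one below (the name ComReleaser is my own) releases tokens and
// enumerators on every exit path.
template <typename T>
class ComReleaser {
public:
	explicit ComReleaser(T* ptr = NULL) : ptr_(ptr) {}
	~ComReleaser() { if (ptr_) ptr_->Release(); }
	T** operator&() { return &ptr_; }        // for CoCreateInstance/EnumTokens out-params
	T* operator->() const { return ptr_; }
	T* get() const { return ptr_; }
private:
	ComReleaser(const ComReleaser&);          // non-copyable
	ComReleaser& operator=(const ComReleaser&);
	T* ptr_;
};
// Usage sketch:
//   ComReleaser<ISpObjectTokenCategory> category;
//   ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER,
//                      IID_ISpObjectTokenCategory, (LPVOID*)&category);
//   category->SetId(SPCAT_VOICES, false);   // released automatically when it goes out of scope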
void SpeechRecognizer::setEnabled(bool enabled) {
	if (enabled == _enabled || !_comInitialized) {
		return;
	}

	_enabled = enabled;

	if (_enabled) {
		HRESULT hr = S_OK;

		// Set up a dedicated recognizer instead of using the shared Windows recognizer.
		// - By default, the shared recognizer's commands such as "move left" override any added here.
		// - SetGrammarState(SPGS_EXCLUSIVE) on the shared recognizer avoids that, but then non-Interface commands don't work at all.
		// - With a dedicated recognizer, the user can choose whether to keep the Windows recognizer running in addition to Interface's.
		if (SUCCEEDED(hr)) {
			hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_ALL,
				IID_ISpRecognizer, (void**)&_speechRecognizer);
		}

		if (SUCCEEDED(hr)) {
			ISpObjectToken* audioToken;
			ISpObjectTokenCategory* audioTokenCategory;
			hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL,
				IID_ISpObjectTokenCategory, (void**)&audioTokenCategory);
			if (SUCCEEDED(hr)) {
				hr = audioTokenCategory->SetId(SPCAT_AUDIOIN, TRUE);
			}
			if (SUCCEEDED(hr)) {
				WCHAR* tokenID;
				hr = audioTokenCategory->GetDefaultTokenId(&tokenID);
				if (SUCCEEDED(hr)) {
					hr = CoCreateInstance(CLSID_SpObjectToken, NULL, CLSCTX_ALL,
						IID_ISpObjectToken, (void**)&audioToken);
					if (SUCCEEDED(hr)) {
						hr = audioToken->SetId(NULL, tokenID, FALSE);
					}
					::CoTaskMemFree(tokenID);
				}
			}
			if (SUCCEEDED(hr)) {
				hr = static_cast<ISpRecognizer*>(_speechRecognizer)->SetInput(audioToken, TRUE);
			}
		}

		if (SUCCEEDED(hr)) {
			hr = static_cast<ISpRecognizer*>(_speechRecognizer)
				->CreateRecoContext(reinterpret_cast<ISpRecoContext**>(&_speechRecognizerContext));
			if (FAILED(hr)) {
				static_cast<ISpRecognizer*>(_speechRecognizer)->Release();
			}
		}

		// Set up event notification mechanism.
		if (SUCCEEDED(hr)) {
			hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)->SetNotifyWin32Event();
		}
		if (SUCCEEDED(hr)) {
			_commandRecognizedEvent = static_cast<ISpRecoContext*>(_speechRecognizerContext)->GetNotifyEventHandle();
			if (_commandRecognizedEvent) {
				_commandRecognizedNotifier->setHandle(_commandRecognizedEvent);
				_commandRecognizedNotifier->setEnabled(true);
			} else {
				hr = S_FALSE;
			}
		}

		// Set which events to be notified of.
		if (SUCCEEDED(hr)) {
			hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
				->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
		}

		// Create grammar and load commands.
		if (SUCCEEDED(hr)) {
			hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
				->CreateGrammar(NULL, reinterpret_cast<ISpRecoGrammar**>(&_speechRecognizerGrammar));
		}
		if (SUCCEEDED(hr)) {
			reloadCommands();
		}

		_enabled = SUCCEEDED(hr);

		qDebug() << "Speech recognition" << (_enabled ? "enabled" : "enable failed");
	} else {
		_commandRecognizedNotifier->setEnabled(false);
		static_cast<ISpRecoContext*>(_speechRecognizerContext)->Release();
		static_cast<ISpRecognizer*>(_speechRecognizer)->Release();
		qDebug() << "Speech recognition disabled";
	}

	emit enabledUpdated(_enabled);
}
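// Hedged sketch, not part of the original source: reloadCommands() is not shown
// above. One plausible way to (re)load spoken commands into the grammar created
// by CreateGrammar() is to build a dynamic rule and add each phrase as a word
// transition. The function name, the "Commands" rule name, and the phrase list
// are assumptions, not the project's actual implementation.
#include <windows.h>
#include <sapi.h>
#include <string>
#include <vector>

bool loadCommandPhrases(ISpRecoGrammar* grammar, const std::vector<std::wstring>& phrases) {
	SPSTATEHANDLE initialState;
	HRESULT hr = grammar->ResetGrammar(::GetUserDefaultUILanguage());
	if (SUCCEEDED(hr)) {
		// Create (or fetch) a top-level rule to hold the command phrases.
		hr = grammar->GetRule(L"Commands", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &initialState);
	}
	for (size_t i = 0; SUCCEEDED(hr) && i < phrases.size(); ++i) {
		// Each phrase is one transition from the rule's initial state to the
		// implicit final state (NULL as the "to" state).
		hr = grammar->AddWordTransition(initialState, NULL, phrases[i].c_str(),
			L" ", SPWT_LEXICAL, 1.0f, NULL);
	}
	if (SUCCEEDED(hr)) {
		hr = grammar->Commit(0);                             // compile the dynamic grammar
	}
	if (SUCCEEDED(hr)) {
		hr = grammar->SetRuleState(NULL, NULL, SPRS_ACTIVE); // activate all top-level rules
	}
	return SUCCEEDED(hr);
}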
SCP_vector<SCP_string> speech_enumerate_voices() {
#ifdef _WIN32
	HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void**)&Voice_device);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	// This code is mostly copied from wxLauncher
	ISpObjectTokenCategory* comTokenCategory = NULL;
	IEnumSpObjectTokens* comVoices = NULL;
	ULONG comVoicesCount = 0;

	// Generate enumeration of voices
	hr = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_INPROC_SERVER,
		IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comTokenCategory->SetId(SPCAT_VOICES, false);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comVoices->GetCount(&comVoicesCount);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	SCP_vector<SCP_string> voices;
	while (comVoicesCount > 0) {
		ISpObjectToken* comAVoice = NULL;
		comVoices->Next(1, &comAVoice, NULL); // retrieve just one

		LPWSTR id = NULL;
		comAVoice->GetStringValue(NULL, &id);

		auto idlength = wcslen(id);
		auto buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, nullptr, 0, nullptr, nullptr);

		if (buffer_size > 0) {
			SCP_string voiceName;
			voiceName.resize(buffer_size);

			buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, &voiceName[0], buffer_size, nullptr, nullptr);

			voices.push_back(voiceName);
		}

		CoTaskMemFree(id);

		comAVoice->Release();
		comVoicesCount--;
	}
	comTokenCategory->Release();

	Voice_device->Release();

	return voices;
#else
	STUB_FUNCTION;

	return SCP_vector<SCP_string>();
#endif
}
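// Hedged usage sketch, not part of the original source: assuming SCP_vector and
// SCP_string behave like std::vector and std::string (their usual definitions in
// the FreeSpace Open codebase), the enumeration above can be used to list the
// installed SAPI voices, e.g. for a launcher drop-down.
#include <cstdio>

void print_installed_voices() {
	SCP_vector<SCP_string> voices = speech_enumerate_voices();
	if (voices.empty()) {
		std::printf("No SAPI voices found (or speech is stubbed on this platform)\n");
		return;
	}
	for (size_t i = 0; i < voices.size(); ++i) {
		std::printf("Voice %u: %s\n", (unsigned)i, voices[i].c_str());
	}
}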
int _tmain( int argc, _TCHAR* argv[] )
{
	cv::setUseOptimized( true );

	// Create and initialize the Kinect sensor instance
	INuiSensor* pSensor;
	HRESULT hResult = S_OK;
	hResult = NuiCreateSensorByIndex( 0, &pSensor );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiCreateSensorByIndex" << std::endl;
		return -1;
	}

	hResult = pSensor->NuiInitialize( NUI_INITIALIZE_FLAG_USES_AUDIO );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiInitialize" << std::endl;
		return -1;
	}

	// Initialize the audio stream (InitializeAudioStream)
	std::cout << "InitializeAudioStream" << std::endl;
	INuiAudioBeam* pNuiAudioSource;
	hResult = pSensor->NuiGetAudioSource( &pNuiAudioSource );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiGetAudioSource" << std::endl;
		return -1;
	}

	IMediaObject* pMediaObject = nullptr;
	IPropertyStore* pPropertyStore = nullptr;
	pNuiAudioSource->QueryInterface( IID_IMediaObject, reinterpret_cast<void**>( &pMediaObject ) );
	pNuiAudioSource->QueryInterface( IID_IPropertyStore, reinterpret_cast<void**>( &pPropertyStore ) );

	PROPVARIANT propvariant;
	PropVariantInit( &propvariant );
	propvariant.vt = VT_I4;
	propvariant.lVal = static_cast<LONG>( 4 );
	pPropertyStore->SetValue( MFPKEY_WMAAECMA_SYSTEM_MODE, propvariant );
	PropVariantClear( &propvariant );

	WAVEFORMATEX waveFormat = { AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0 };

	DMO_MEDIA_TYPE mediaType = { 0 };
	MoInitMediaType( &mediaType, sizeof( WAVEFORMATEX ) );
	mediaType.majortype = MEDIATYPE_Audio;
	mediaType.subtype = MEDIASUBTYPE_PCM;
	mediaType.lSampleSize = 0;
	mediaType.bFixedSizeSamples = true;
	mediaType.bTemporalCompression = false;
	mediaType.formattype = FORMAT_WaveFormatEx;
	memcpy( mediaType.pbFormat, &waveFormat, sizeof( WAVEFORMATEX ) );

	pMediaObject->SetOutputType( 0, &mediaType, 0 );

	KinectAudioStream* audioStream = new KinectAudioStream( pMediaObject );
	IStream* pStream = nullptr;
	audioStream->QueryInterface( IID_IStream, reinterpret_cast<void**>( &pStream ) );

	CoInitialize( nullptr );
	ISpStream* pSpeechStream = nullptr;
	CoCreateInstance( CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), reinterpret_cast<void**>( &pSpeechStream ) );
	pSpeechStream->SetBaseStream( pStream, SPDFID_WaveFormatEx, &waveFormat );

	MoFreeMediaType( &mediaType );
	pStream->Release();
	pPropertyStore->Release();
	pMediaObject->Release();
	pNuiAudioSource->Release();

	// Create the speech recognizer (CreateSpeechRecognizer)
	std::cout << "CreateSpeechRecognizer" << std::endl;
	ISpRecognizer* pSpeechRecognizer;
	CoCreateInstance( CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), reinterpret_cast<void**>( &pSpeechRecognizer ) );
	pSpeechRecognizer->SetInput( pSpeechStream, false );

	/*
	// If ATL is available (Visual Studio Professional or higher), it is easier to use SpFindBestToken (sphelper.h).
	ISpObjectToken* pEngineToken = nullptr;
	SpFindBestToken( SPCAT_RECOGNIZERS, L"Language=411;Kinect=True", NULL, &pEngineToken ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	*/

	///*
	// If ATL is not available (e.g. the Express edition), enumerate the recognizer tokens by hand as an alternative to SpFindBestToken (sphelper.h).
	const wchar_t* pVendorPreferred = L"VendorPreferred";
	const unsigned long lengthVendorPreferred = static_cast<unsigned long>( wcslen( pVendorPreferred ) );
	unsigned long length;
	ULongAdd( lengthVendorPreferred, 1, &length );
	wchar_t* pAttribsVendorPreferred = new wchar_t[ length ];
	StringCchCopyW( pAttribsVendorPreferred, length, pVendorPreferred );

	ISpObjectTokenCategory* pTokenCategory = nullptr;
	CoCreateInstance( CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, __uuidof(ISpObjectTokenCategory), reinterpret_cast<void**>( &pTokenCategory ) );
	pTokenCategory->SetId( SPCAT_RECOGNIZERS, false );

	IEnumSpObjectTokens* pEnumTokens = nullptr;
	CoCreateInstance( CLSID_SpMMAudioEnum, nullptr, CLSCTX_ALL, __uuidof(IEnumSpObjectTokens), reinterpret_cast<void**>( &pEnumTokens ) );
	pTokenCategory->EnumTokens( L"Language=411;Kinect=True", pAttribsVendorPreferred, &pEnumTokens ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	delete[] pAttribsVendorPreferred;

	ISpObjectToken* pEngineToken = nullptr;
	pEnumTokens->Next( 1, &pEngineToken, nullptr );
	//*/

	pSpeechRecognizer->SetRecognizer( pEngineToken );
	ISpRecoContext* pSpeechContext;
	pSpeechRecognizer->CreateRecoContext( &pSpeechContext );

	pEngineToken->Release();
	///*
	pTokenCategory->Release();
	pEnumTokens->Release();
	//*/

	// Load the speech recognition grammar (LoadSpeechGrammar)
	std::cout << "LoadSpeechGrammar" << std::endl;
	ISpRecoGrammar* pSpeechGrammar;
	pSpeechContext->CreateGrammar( 1, &pSpeechGrammar );
	pSpeechGrammar->LoadCmdFromFile( L"SpeechRecognition_Ja.grxml", /*SPLO_STATIC*/SPLO_DYNAMIC ); // http://www.w3.org/TR/speech-grammar/ (UTF-8/CRLF)

	audioStream->StartCapture();
	pSpeechGrammar->SetRuleState( nullptr, nullptr, SPRS_ACTIVE );
	pSpeechRecognizer->SetRecoState( SPRST_ACTIVE_ALWAYS );
	pSpeechContext->SetInterest( SPFEI( SPEI_RECOGNITION ), SPFEI( SPEI_RECOGNITION ) );
	pSpeechContext->Resume( 0 );

	HANDLE hSpeechEvent = INVALID_HANDLE_VALUE;
	hSpeechEvent = pSpeechContext->GetNotifyEventHandle();
	HANDLE hEvents[1] = { hSpeechEvent };

	int width = 640;
	int height = 480;
	cv::Mat audioMat = cv::Mat::zeros( height, width, CV_8UC3 );
	cv::namedWindow( "Audio" );

	bool exit = false;

	std::cout << std::endl << "Speech Recognition Start..." << std::endl << std::endl;

	while( 1 ){
		// Wait for the speech event to be signalled
		ResetEvent( hSpeechEvent );
		unsigned long waitObject = MsgWaitForMultipleObjectsEx( ARRAYSIZE( hEvents ), hEvents, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE );

		if( waitObject == WAIT_OBJECT_0 ){
			// Retrieve the queued events
			const float confidenceThreshold = 0.3f;
			SPEVENT eventStatus;
			unsigned long eventFetch = 0;
			pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			while( eventFetch > 0 ){
				switch( eventStatus.eEventId ){
					// Speech recognition events (SPEI_HYPOTHESIS: hypothesis, SPEI_RECOGNITION: recognition)
					case SPEI_HYPOTHESIS:
					case SPEI_RECOGNITION:
						if( eventStatus.elParamType == SPET_LPARAM_IS_OBJECT ){
							// Retrieve the recognized phrase
							ISpRecoResult* pRecoResult = reinterpret_cast<ISpRecoResult*>( eventStatus.lParam );
							SPPHRASE* pPhrase = nullptr;
							hResult = pRecoResult->GetPhrase( &pPhrase );
							if( SUCCEEDED( hResult ) ){
								if( ( pPhrase->pProperties != nullptr ) && ( pPhrase->pProperties->pFirstChild != nullptr ) ){
									// Compare against the phrase tags defined in the grammar
									const SPPHRASEPROPERTY* pSemantic = pPhrase->pProperties->pFirstChild;
									if( pSemantic->SREngineConfidence > confidenceThreshold ){
										if( wcscmp( L"あか", pSemantic->pszValue ) == 0 ){ // "red"
											std::cout << "あか" << std::endl;
											audioMat = cv::Scalar( 0, 0, 255 );
										}
										else if( wcscmp( L"みどり", pSemantic->pszValue ) == 0 ){ // "green"
											std::cout << "みどり" << std::endl;
											audioMat = cv::Scalar( 0, 255, 0 );
										}
										else if( wcscmp( L"あお", pSemantic->pszValue ) == 0 ){ // "blue"
											std::cout << "あお" << std::endl;
											audioMat = cv::Scalar( 255, 0, 0 );
										}
										else if( wcscmp( L"おわり", pSemantic->pszValue ) == 0 ){ // "end"
											exit = true;
										}
									}
								}
								CoTaskMemFree( pPhrase );
							}
						}
						break;

					default:
						break;
				}
				pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			}
		}

		// Show the window
		cv::imshow( "Audio", audioMat );

		// Exit the loop on the Esc key (or the "おわり" command)
		if( cv::waitKey( 30 ) == VK_ESCAPE || exit ){
			break;
		}
	}

	// Cleanup
	audioStream->StopCapture();
	pSpeechRecognizer->SetRecoState( SPRST_INACTIVE );
	CoUninitialize();
	pSensor->NuiShutdown();
	CloseHandle( hSpeechEvent );

	cv::destroyAllWindows();

	return 0;
}
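// Hedged sketch, not part of the original source: the actual SpeechRecognition_Ja.grxml
// is not shown here. The loop above compares pSemantic->pszValue against あか/みどり/あお/おわり,
// which matches an SRGS grammar with literal semantic tags in the style of the Kinect SDK
// speech samples. The exact tag-format and layout below are assumptions, kept as a C++ raw
// string so the example stays in the document's language.
const wchar_t* kGrammarSketch = LR"GRXML(
<grammar version="1.0" xml:lang="ja-JP" root="colorRule"
         tag-format="semantics/1.0-literals"
         xmlns="http://www.w3.org/2001/06/grammar">
  <rule id="colorRule">
    <one-of>
      <item>あか<tag>あか</tag></item>     <!-- red   -> audioMat = BGR(0,0,255) -->
      <item>みどり<tag>みどり</tag></item> <!-- green -> audioMat = BGR(0,255,0) -->
      <item>あお<tag>あお</tag></item>     <!-- blue  -> audioMat = BGR(255,0,0) -->
      <item>おわり<tag>おわり</tag></item> <!-- end   -> exits the loop -->
    </one-of>
  </rule>
</grammar>
)GRXML";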