Ejemplo n.º 1
0
bool EnumerateVoices() {
	HRESULT comResult = S_OK;
	ISpObjectTokenCategory * comTokenCategory = NULL;
	IEnumSpObjectTokens * comVoices = NULL;
	ULONG comVoicesCount = 0;
	
	// Init speech api
	comResult = ::CoCreateInstance(
		CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, IID_ISpVoice, (LPVOID*)&comVoice);
	wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to instantiate speech API"));

	// Generate enumeration of voices
	comResult = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL,
		CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to instantiate a TokenCategory"));

	comResult = comTokenCategory->SetId(SPCAT_VOICES, false);
	wxCHECK_MSG( SUCCEEDED(comResult), false,
		_T("Unable to to set location to find the installed voices.")
		_T("Likely the key that I am looking for does not exist and thus ")
		_T("there are no installed voices on this system."));

	comResult = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	wxCHECK_MSG( SUCCEEDED(comResult), false,
		_T("Unable to enumerate the installed voices.  Check that this system")
		_T(" has voices installed."));

	comResult = comVoices->GetCount(&comVoicesCount);
	wxCHECK_MSG( SUCCEEDED(comResult), false,
		_T("Unable get a count of the installed voices."));

	while( comVoicesCount > 0 ) {
		ISpObjectToken * comAVoice = NULL;
		comVoices->Next(1, &comAVoice, NULL); // retrieve just one
		LPWSTR id = NULL;
		comAVoice->GetStringValue(NULL, &id);
		size_t idlength = wcslen(id);
		wxLogDebug(_T("  Got string of length %ld:"), idlength);
		for(size_t i = 0; i < idlength; i++) {
			wxLogDebug(_T("   %04X"), id[i]);
		}
		voices.push_back(VoiceData(wxString(id, wxMBConvUTF16(), wcslen(id)), comAVoice));

#ifdef __WXDEBUG__
		enumerateObjectToken(comAVoice);
#endif
		comAVoice->Release();
		comVoicesCount--;
	}

	comTokenCategory->Release();

	return true;
}
Ejemplo n.º 2
0
void SpeechRecognizer::setEnabled(bool enabled) {
    if (enabled == _enabled || !_comInitialized) {
        return;
    }

    _enabled = enabled;

    if (_enabled) {

        HRESULT hr = S_OK;

        // Set up dedicated recognizer instead of using shared Windows recognizer.
        // - By default, shared recognizer's commands like "move left" override any added here.
        // - Unless do SetGrammarState(SPGS_EXCLUSIVE) on shared recognizer but then non-Interface commands don't work at all.
        // - With dedicated recognizer, user can choose whether to have Windows recognizer running in addition to Interface's.
        if (SUCCEEDED(hr)) {
            hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_ALL, IID_ISpRecognizer, (void**)&_speechRecognizer);
        }
        if (SUCCEEDED(hr)) {
            ISpObjectToken* audioToken;
            ISpObjectTokenCategory* audioTokenCategory;
            hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory, 
                (void**)&audioTokenCategory);
            if (SUCCEEDED(hr)) {
                hr = audioTokenCategory->SetId(SPCAT_AUDIOIN, TRUE);
            }
            if (SUCCEEDED(hr)) {
                WCHAR * tokenID;
                hr = audioTokenCategory->GetDefaultTokenId(&tokenID);
                if (SUCCEEDED(hr)) {
                    hr = CoCreateInstance(CLSID_SpObjectToken, NULL, CLSCTX_ALL, IID_ISpObjectToken, (void**)&audioToken);
                    if (SUCCEEDED(hr)) {
                        hr = audioToken->SetId(NULL, tokenID, FALSE);
                    }
                    ::CoTaskMemFree(tokenID);
                }
            }
            if (SUCCEEDED(hr)) {
                hr = static_cast<ISpRecognizer*>(_speechRecognizer)->SetInput(audioToken, TRUE);
            }

        }
        if (SUCCEEDED(hr)) {
            hr = static_cast<ISpRecognizer*>(_speechRecognizer)
                ->CreateRecoContext(reinterpret_cast<ISpRecoContext**>(&_speechRecognizerContext));
            if (FAILED(hr)) {
                static_cast<ISpRecognizer*>(_speechRecognizer)->Release();
            }
        }

        // Set up event notification mechanism.
        if (SUCCEEDED(hr)) {
            hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)->SetNotifyWin32Event();
        }
        if (SUCCEEDED(hr)) {
            _commandRecognizedEvent = static_cast<ISpRecoContext*>(_speechRecognizerContext)->GetNotifyEventHandle();
            if (_commandRecognizedEvent) {
                _commandRecognizedNotifier->setHandle(_commandRecognizedEvent);
                _commandRecognizedNotifier->setEnabled(true);
            } else {
                hr = S_FALSE;
            }
        }
        
        // Set which events to be notified of.
        if (SUCCEEDED(hr)) {
            hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
                ->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
        }

        // Create grammar and load commands.
        if (SUCCEEDED(hr)) {
            hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
                ->CreateGrammar(NULL, reinterpret_cast<ISpRecoGrammar**>(&_speechRecognizerGrammar));
        }
        if (SUCCEEDED(hr)) {
            reloadCommands();
        }
        
        _enabled = SUCCEEDED(hr);

        qDebug() << "Speech recognition" << (_enabled ? "enabled" : "enable failed");

    } else {
        _commandRecognizedNotifier->setEnabled(false);
        static_cast<ISpRecoContext*>(_speechRecognizerContext)->Release();
        static_cast<ISpRecognizer*>(_speechRecognizer)->Release();
        qDebug() << "Speech recognition disabled";
    }

    emit enabledUpdated(_enabled);
}
Ejemplo n.º 3
0
SCP_vector<SCP_string> speech_enumerate_voices()
{
#ifdef _WIN32
	HRESULT hr = CoCreateInstance(
		CLSID_SpVoice,
		NULL,
		CLSCTX_ALL,
		IID_ISpVoice,
		(void **)&Voice_device);

	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	// This code is mostly copied from wxLauncher
	ISpObjectTokenCategory * comTokenCategory = NULL;
	IEnumSpObjectTokens * comVoices = NULL;
	ULONG comVoicesCount = 0;

	// Generate enumeration of voices
	hr = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL,
		CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comTokenCategory->SetId(SPCAT_VOICES, false);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comVoices->GetCount(&comVoicesCount);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	SCP_vector<SCP_string> voices;
	while (comVoicesCount > 0) {
		ISpObjectToken * comAVoice = NULL;

		comVoices->Next(1, &comAVoice, NULL); // retrieve just one

		LPWSTR id = NULL;
		comAVoice->GetStringValue(NULL, &id);

		auto idlength = wcslen(id);
		auto buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, nullptr, 0, nullptr, nullptr);

		if (buffer_size > 0) {
			SCP_string voiceName;
			voiceName.resize(buffer_size);
			buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, &voiceName[0], buffer_size, nullptr, nullptr);

			voices.push_back(voiceName);
		}

		CoTaskMemFree(id);
		comAVoice->Release();
		comVoicesCount--;
	}

	comTokenCategory->Release();

	Voice_device->Release();

	return voices;
#else
	STUB_FUNCTION;

	return SCP_vector<SCP_string>();
#endif
}
int _tmain( int argc, _TCHAR* argv[] )
{
	cv::setUseOptimized( true );

	// Kinectのインスタンス生成、初期化
	INuiSensor* pSensor;
	HRESULT hResult = S_OK;
	hResult = NuiCreateSensorByIndex( 0, &pSensor );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiCreateSensorByIndex" << std::endl;
		return -1;
	}

	hResult = pSensor->NuiInitialize( NUI_INITIALIZE_FLAG_USES_AUDIO );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiInitialize" << std::endl;
		return -1;
	}

	// Audioストリームの初期化(InitializeAudioStream)
	std::cout << "InitializeAudioStream" << std::endl;
	INuiAudioBeam* pNuiAudioSource;
	hResult = pSensor->NuiGetAudioSource( &pNuiAudioSource );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiGetAudioSource" << std::endl;
		return -1;
	}

	IMediaObject* pMediaObject = nullptr;
	IPropertyStore* pPropertyStore = nullptr;
	pNuiAudioSource->QueryInterface( IID_IMediaObject, reinterpret_cast<void**>( &pMediaObject ) );
	pNuiAudioSource->QueryInterface( IID_IPropertyStore, reinterpret_cast<void**>( &pPropertyStore ) );

	PROPVARIANT propvariant;
	PropVariantInit( &propvariant );
	propvariant.vt = VT_I4;
	propvariant.lVal = static_cast<LONG>( 4 );
	pPropertyStore->SetValue( MFPKEY_WMAAECMA_SYSTEM_MODE, propvariant );
	PropVariantClear( &propvariant );

	WAVEFORMATEX waveFormat = { AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0 };
	DMO_MEDIA_TYPE mediaType = { 0 };
	MoInitMediaType( &mediaType, sizeof( WAVEFORMATEX ) );

	mediaType.majortype = MEDIATYPE_Audio;
	mediaType.subtype = MEDIASUBTYPE_PCM;
	mediaType.lSampleSize = 0;
	mediaType.bFixedSizeSamples = true;
	mediaType.bTemporalCompression = false;
	mediaType.formattype = FORMAT_WaveFormatEx;
	memcpy( mediaType.pbFormat, &waveFormat, sizeof( WAVEFORMATEX ) );

	pMediaObject->SetOutputType( 0, &mediaType, 0 ); 

	KinectAudioStream* audioStream = new KinectAudioStream( pMediaObject );

	IStream* pStream = nullptr;
	audioStream->QueryInterface( IID_IStream, reinterpret_cast<void**>( &pStream ) );

	CoInitialize( nullptr );
	ISpStream* pSpeechStream = nullptr;
	CoCreateInstance( CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), reinterpret_cast<void**>( &pSpeechStream ) );

	pSpeechStream->SetBaseStream( pStream, SPDFID_WaveFormatEx, &waveFormat );

	MoFreeMediaType( &mediaType );
	pStream->Release();
	pPropertyStore->Release();
	pMediaObject->Release();
	pNuiAudioSource->Release();

	// 音声認識器を作成(CreateSpeechRecognizer)
	std::cout << "CreateSpeechRecognizer" << std::endl;
	ISpRecognizer* pSpeechRecognizer;
	CoCreateInstance( CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), reinterpret_cast<void**>( &pSpeechRecognizer ) );

	pSpeechRecognizer->SetInput( pSpeechStream, false );

	/*
	// If can use ATL, easier to using SpFindBestToken(sphelper.h). When using Professional or more.
	ISpObjectToken* pEngineToken = nullptr;
	SpFindBestToken( SPCAT_RECOGNIZERS, L"Language=411;Kinect=True", NULL, &pEngineToken ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	*/

	///*
	// If can't use ATL, alternative to using SpFIndBestToken(sphelper.h). When using Express.
	const wchar_t* pVendorPreferred = L"VendorPreferred";
	const unsigned long lengthVendorPreferred = static_cast<unsigned long>( wcslen( pVendorPreferred ) );
	unsigned long length;
	ULongAdd( lengthVendorPreferred, 1, &length );
	wchar_t* pAttribsVendorPreferred = new wchar_t[ length ];
	StringCchCopyW( pAttribsVendorPreferred, length, pVendorPreferred );

	ISpObjectTokenCategory* pTokenCategory = nullptr;
	CoCreateInstance( CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, __uuidof(ISpObjectTokenCategory), reinterpret_cast<void**>( &pTokenCategory ) );

	pTokenCategory->SetId( SPCAT_RECOGNIZERS, false );

	IEnumSpObjectTokens* pEnumTokens = nullptr;
	CoCreateInstance( CLSID_SpMMAudioEnum, nullptr, CLSCTX_ALL, __uuidof(IEnumSpObjectTokens), reinterpret_cast<void**>( &pEnumTokens ) );

	pTokenCategory->EnumTokens( L"Language=411;Kinect=True", pAttribsVendorPreferred, &pEnumTokens ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"

	delete[] pAttribsVendorPreferred;
	
	ISpObjectToken* pEngineToken = nullptr;
	pEnumTokens->Next( 1, &pEngineToken, nullptr );
	//*/

	pSpeechRecognizer->SetRecognizer( pEngineToken );
	
	ISpRecoContext* pSpeechContext;
	pSpeechRecognizer->CreateRecoContext( &pSpeechContext );

	pEngineToken->Release();
	///*
	pTokenCategory->Release();
	pEnumTokens->Release();
	//*/

	// 音声認識辞書の作成(LoadSpeechGrammar)
	std::cout << "LoadSpeechGrammar" << std::endl;
	ISpRecoGrammar* pSpeechGrammar;
	pSpeechContext->CreateGrammar( 1, &pSpeechGrammar );

	pSpeechGrammar->LoadCmdFromFile( L"SpeechRecognition_Ja.grxml", /*SPLO_STATIC*/SPLO_DYNAMIC ); // http://www.w3.org/TR/speech-grammar/ (UTF-8/CRLF)
	
	audioStream->StartCapture();
	pSpeechGrammar->SetRuleState( nullptr, nullptr, SPRS_ACTIVE );
	pSpeechRecognizer->SetRecoState( SPRST_ACTIVE_ALWAYS );
	pSpeechContext->SetInterest( SPFEI( SPEI_RECOGNITION ), SPFEI( SPEI_RECOGNITION ) );
	pSpeechContext->Resume( 0 );

	HANDLE hSpeechEvent = INVALID_HANDLE_VALUE;
	hSpeechEvent = pSpeechContext->GetNotifyEventHandle();
	HANDLE hEvents[1] = { hSpeechEvent };

	int width = 640;
	int height = 480;

	cv::Mat audioMat = cv::Mat::zeros( height, width, CV_8UC3 );
	cv::namedWindow( "Audio" );

	bool exit = false;

	std::cout << std::endl << "Speech Recognition Start..." << std::endl << std::endl;

	while( 1 ){
		// イベントの更新待ち
		ResetEvent( hSpeechEvent );
		unsigned long waitObject = MsgWaitForMultipleObjectsEx( ARRAYSIZE( hEvents ), hEvents, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE );

		if( waitObject == WAIT_OBJECT_0 ){
			// イベントの取得
			const float confidenceThreshold = 0.3f;
			SPEVENT eventStatus;
			unsigned long eventFetch = 0;
			pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			while( eventFetch > 0 ){
				switch( eventStatus.eEventId ){
					// 音声認識イベント(SPEI_HYPOTHESIS:推定またはSPEI_RECOGNITION:認識)
					case SPEI_HYPOTHESIS:
					case SPEI_RECOGNITION:
						if( eventStatus.elParamType == SPET_LPARAM_IS_OBJECT ){
							// フレーズの取得
							ISpRecoResult* pRecoResult = reinterpret_cast<ISpRecoResult*>( eventStatus.lParam );
							SPPHRASE* pPhrase = nullptr;
							hResult = pRecoResult->GetPhrase( &pPhrase );
							if( SUCCEEDED( hResult ) ){
								if( ( pPhrase->pProperties != nullptr ) && ( pPhrase->pProperties->pFirstChild != nullptr ) ){
									// 辞書のフレーズタグと比較
									const SPPHRASEPROPERTY* pSemantic = pPhrase->pProperties->pFirstChild;
									if( pSemantic->SREngineConfidence > confidenceThreshold ){
										if( wcscmp( L"あか", pSemantic->pszValue ) == 0 ){
											std::cout << "あか" << std::endl;
											audioMat = cv::Scalar( 0, 0, 255 );
										}
										else if( wcscmp( L"みどり", pSemantic->pszValue ) == 0 ){
											std::cout << "みどり" << std::endl;
											audioMat = cv::Scalar( 0, 255, 0 );
										}
										else if( wcscmp( L"あお", pSemantic->pszValue ) == 0 ){
											std::cout << "あお" << std::endl;
											audioMat = cv::Scalar( 255, 0, 0 );
										}
										else if( wcscmp( L"おわり", pSemantic->pszValue ) == 0 ){
											exit = true;
										}
									}
								}
								CoTaskMemFree( pPhrase );
							}
						}
						break;

					default:
						break;
				}
				pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			}
		}

		// 表示
		cv::imshow( "Audio", audioMat );

		// ループの終了判定(Escキー)
		if( cv::waitKey( 30 ) == VK_ESCAPE || exit ){
			break;
		}
	}

	// 終了処理
	audioStream->StopCapture();
	pSpeechRecognizer->SetRecoState( SPRST_INACTIVE );
	CoUninitialize();
	pSensor->NuiShutdown();
	CloseHandle( hSpeechEvent );

	cv::destroyAllWindows();

	return 0;
}