Ejemplo n.º 1
0
Archivo: main.cpp Proyecto: df32/TTS
//请求指定语言的语音
bool SpBox::RequireLanguage(LPCWSTR langid)
{
	IEnumSpObjectTokens*    pEnum;				//语音总选择
	ISpObjectToken*			pVoiceToken;		//当前语音选择
	ULONG					ulCount = 0;		//选择个数
	WCHAR*					pVoiceLang;			//语音LangId
	WCHAR*					pVoiceText;			//语音说明文字

	HRESULT hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &pEnum);
	if (SUCCEEDED(hr))
	{
		hr = pEnum->GetCount(&ulCount);
	}

	while (SUCCEEDED(hr) && ulCount--){

		if (SUCCEEDED(hr))
			pEnum->Next(1, &pVoiceToken, 0);
		
		if (SUCCEEDED(hr))
			hr = SpGetLanguage(pVoiceToken, &pVoiceLang);

		if (SUCCEEDED(hr) && !_wcsnicmp(pVoiceLang, langid, 3))
		{
			Token = pVoiceToken;
			VoiceObj->SetVoice(Token);
			SpGetDescription(Token, &pVoiceText, 0);
			Description = pVoiceText;
			return true;
		}
	}
	return false;
}
Ejemplo n.º 2
0
bool EnumerateVoices() {
	HRESULT comResult = S_OK;
	ISpObjectTokenCategory * comTokenCategory = NULL;
	IEnumSpObjectTokens * comVoices = NULL;
	ULONG comVoicesCount = 0;
	
	// Init speech api
	comResult = ::CoCreateInstance(
		CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, IID_ISpVoice, (LPVOID*)&comVoice);
	wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to instantiate speech API"));

	// Generate enumeration of voices
	comResult = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL,
		CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	wxCHECK_MSG( SUCCEEDED(comResult), false, _T("Unable to instantiate a TokenCategory"));

	comResult = comTokenCategory->SetId(SPCAT_VOICES, false);
	wxCHECK_MSG( SUCCEEDED(comResult), false,
		_T("Unable to to set location to find the installed voices.")
		_T("Likely the key that I am looking for does not exist and thus ")
		_T("there are no installed voices on this system."));

	comResult = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	wxCHECK_MSG( SUCCEEDED(comResult), false,
		_T("Unable to enumerate the installed voices.  Check that this system")
		_T(" has voices installed."));

	comResult = comVoices->GetCount(&comVoicesCount);
	wxCHECK_MSG( SUCCEEDED(comResult), false,
		_T("Unable get a count of the installed voices."));

	while( comVoicesCount > 0 ) {
		ISpObjectToken * comAVoice = NULL;
		comVoices->Next(1, &comAVoice, NULL); // retrieve just one
		LPWSTR id = NULL;
		comAVoice->GetStringValue(NULL, &id);
		size_t idlength = wcslen(id);
		wxLogDebug(_T("  Got string of length %ld:"), idlength);
		for(size_t i = 0; i < idlength; i++) {
			wxLogDebug(_T("   %04X"), id[i]);
		}
		voices.push_back(VoiceData(wxString(id, wxMBConvUTF16(), wcslen(id)), comAVoice));

#ifdef __WXDEBUG__
		enumerateObjectToken(comAVoice);
#endif
		comAVoice->Release();
		comVoicesCount--;
	}

	comTokenCategory->Release();

	return true;
}
Ejemplo n.º 3
0
void Speech::selectOutputDevice() {
	HRESULT	hr = S_OK;
	ISpObjectToken*	cpAudioOutToken;
	IEnumSpObjectTokens* cpEnum;
	ULONG ulCount = 0;

	if (FAILED(CoInitialize(NULL)))
        return; 

	if (SUCCEEDED (hr))	{
	   hr = SpEnumTokens( SPCAT_AUDIOOUT, NULL, NULL, &cpEnum);
	}
	if (SUCCEEDED (hr))	{
	   hr = cpEnum->GetCount( &ulCount);
	}

	std::cout << "Select Audio Device:" << std::endl;
	unsigned int i=0;

	vector<wchar_t*> deviceIds;
	while (SUCCEEDED(hr) && i < ulCount) {
		i++;
		if (SUCCEEDED (hr))	{
		  hr = cpEnum->Next( 1, &cpAudioOutToken, NULL );
		}
		wchar_t* deviceName = nullptr;
		SpGetDescription(cpAudioOutToken, &deviceName);
		wchar_t* deviceId = nullptr;
        hr = cpAudioOutToken->GetStringValue(L"DeviceId", &deviceId);

		deviceIds.push_back(deviceId);
		std::wcout << "(" << i << ") " << deviceName << std::endl;	  
	}
	cpAudioOutToken->Release();
	cpEnum->Release();
	CoUninitialize();
	i=5;
	while(i < 1 || i > ulCount) {

		cin >> i;
		if(i < 1 || i > ulCount) {
			cout << "Invalid Input." << endl;
			cin.clear();
			cin.ignore(10000, '\n');
		}
	}
	outputDevice = deviceIds.at(i-1);
	return;
}
Ejemplo n.º 4
0
void Speech::speechOutput(const char* str) {

	ISpVoice* pVoice = NULL;
	ISpObjectToken* cpAudioOutToken;
	IEnumSpObjectTokens* cpEnum;
	ULONG ulCount = 0;

    if (FAILED(::CoInitialize(NULL)))
        return;

    HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&pVoice);   

	if (SUCCEEDED (hr))	{
	   hr = SpEnumTokens( SPCAT_AUDIOOUT, NULL, NULL, &cpEnum);
	}
	if (SUCCEEDED (hr))	{
	   hr = cpEnum->GetCount( &ulCount);
	}

	while (SUCCEEDED(hr) && ulCount--) {
		hr = cpEnum->Next( 1, &cpAudioOutToken, NULL );
		wchar_t* deviceId = nullptr;
		if(SUCCEEDED(hr)) {
			hr = cpAudioOutToken->GetStringValue(L"DeviceId", &deviceId);
			if(wcscmp(outputDevice, deviceId) == 0) break;
		}
	}

	if (SUCCEEDED (hr))	{
		hr = pVoice->SetOutput( cpAudioOutToken, TRUE );
	}

	if(SUCCEEDED(hr)) {		
		const size_t cSize = strlen(str)+1;
		wchar_t* wc = new wchar_t[cSize];
		size_t ret;
		mbstowcs_s (&ret, wc, cSize, str, cSize);

		hr = pVoice->Speak(wc, NULL, NULL);        
	
		pVoice->Release();
        pVoice = NULL;
	}
	
	cpEnum->Release();
	cpAudioOutToken->Release();
    ::CoUninitialize();
}
void DumpCategory(LPCWSTR category) {
    // enumerate tokens in each category
    IEnumSpObjectTokens *pEnumSpObjectTokens = nullptr;
    HRESULT hr = SpEnumTokens(category, nullptr, nullptr, &pEnumSpObjectTokens);
    if (SPERR_NOT_FOUND == hr) {
        LOG(L"  None found.");
        return;
    } else if (FAILED(hr)) {
        ERR(L"SpEnumTokens failed: hr = 0x%08x", hr);
        return;
    }
    ReleaseOnExit rEnumSpObjectTokens(pEnumSpObjectTokens);
    
    ULONG nTokens = 0;
    hr = pEnumSpObjectTokens->GetCount(&nTokens);
    if (FAILED(hr)) {
        ERR(L"IEnumSpObjectTokens::GetCount failed: hr = 0x%08x", hr);
        return;
    }
    
    for (ULONG token = 0; token < nTokens; token++) {
        ISpObjectToken *pSpObjectToken = nullptr;
        hr = pEnumSpObjectTokens->Next(1, &pSpObjectToken, nullptr);
        if (FAILED(hr)) {
            ERR(L"IEnumSpObjectTokens::Next failed: hr = 0x%08x", hr);
            return;
        }
        ReleaseOnExit rSpObjectToken(pSpObjectToken);
        
        LPWSTR description = nullptr;
        hr = SpGetDescription(pSpObjectToken, &description);
        if (FAILED(hr)) {
            ERR(L"SpGetDescription failed: hr = 0x%08x", hr);
            continue;
        }
        CoTaskMemFreeOnExit fDescription(description);
        
        LOG(L"  #%u: %s", token + 1, description);
 
        EnumDataKey(2, pSpObjectToken);
    }
}
Ejemplo n.º 6
0
SCP_vector<SCP_string> speech_enumerate_voices()
{
#ifdef _WIN32
	HRESULT hr = CoCreateInstance(
		CLSID_SpVoice,
		NULL,
		CLSCTX_ALL,
		IID_ISpVoice,
		(void **)&Voice_device);

	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	// This code is mostly copied from wxLauncher
	ISpObjectTokenCategory * comTokenCategory = NULL;
	IEnumSpObjectTokens * comVoices = NULL;
	ULONG comVoicesCount = 0;

	// Generate enumeration of voices
	hr = ::CoCreateInstance(CLSID_SpObjectTokenCategory, NULL,
		CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (LPVOID*)&comTokenCategory);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comTokenCategory->SetId(SPCAT_VOICES, false);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comTokenCategory->EnumTokens(NULL, NULL, &comVoices);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	hr = comVoices->GetCount(&comVoicesCount);
	if (FAILED(hr)) {
		return SCP_vector<SCP_string>();
	}

	SCP_vector<SCP_string> voices;
	while (comVoicesCount > 0) {
		ISpObjectToken * comAVoice = NULL;

		comVoices->Next(1, &comAVoice, NULL); // retrieve just one

		LPWSTR id = NULL;
		comAVoice->GetStringValue(NULL, &id);

		auto idlength = wcslen(id);
		auto buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, nullptr, 0, nullptr, nullptr);

		if (buffer_size > 0) {
			SCP_string voiceName;
			voiceName.resize(buffer_size);
			buffer_size = WideCharToMultiByte(CP_UTF8, 0, id, (int)idlength, &voiceName[0], buffer_size, nullptr, nullptr);

			voices.push_back(voiceName);
		}

		CoTaskMemFree(id);
		comAVoice->Release();
		comVoicesCount--;
	}

	comTokenCategory->Release();

	Voice_device->Release();

	return voices;
#else
	STUB_FUNCTION;

	return SCP_vector<SCP_string>();
#endif
}
static int  Create (vlc_object_t *p_this)
{
    filter_t *p_filter = (filter_t *)p_this;
    filter_sys_t *p_sys;
    HRESULT hr;

    p_filter->p_sys = p_sys = (filter_sys_t*) malloc(sizeof(filter_sys_t));
    if (!p_sys)
        return VLC_ENOMEM;

    if (TryEnterMTA(p_this))
        goto error;

    p_sys->cpVoice = nullptr;
    hr = CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_INPROC_SERVER, IID_ISpVoice, (void**) &p_sys->cpVoice);
    if (SUCCEEDED(hr)) {
        ISpObjectToken*        cpVoiceToken = nullptr;
        IEnumSpObjectTokens*   cpEnum = nullptr;
        ULONG ulCount = 0;

        hr = SpEnumTokens(SPCAT_VOICES, nullptr, nullptr, &cpEnum);
        if (SUCCEEDED(hr)) {
            // Get the number of voices.
            hr = cpEnum->GetCount(&ulCount);
            if (SUCCEEDED (hr))
            {
                int voiceIndex = var_InheritInteger(p_this, "sapi-voice");
                if (voiceIndex > - 1)
                {
                    if (voiceIndex < ulCount) {
                        hr = cpEnum->Item(voiceIndex, &cpVoiceToken);
                        if (SUCCEEDED(hr)) {
                            hr = p_sys->cpVoice->SetVoice(cpVoiceToken);
                            if (SUCCEEDED(hr)) {
                                msg_Dbg(p_this, "Selected voice %d", voiceIndex);
                            }
                            else {
                                msg_Err(p_this, "Failed to set voice %d", voiceIndex);
                            }
                            cpVoiceToken->Release();
                            cpVoiceToken = nullptr;
                        }
                    }
                    else
                        msg_Err(p_this, "Voice index exceeds available count");
                }
            }
            cpEnum->Release();
            cpEnum = nullptr;
        }

        if (SUCCEEDED(hr)) {
            hr = p_sys->cpVoice->SetOutput(nullptr, TRUE);
        }
    }
    else
    {
        msg_Err(p_filter, "Could not create SpVoice");
    }

    LeaveMTA();

    p_filter->pf_render_text = RenderText;

    return VLC_SUCCESS;

error:
    free(p_sys);
    return VLC_EGENERIC;
}
Ejemplo n.º 8
0
	void Sound::test() {

		ISpVoice * pVoice = NULL;
		ISpObjectToken*        pVoiceToken=nullptr;
		IEnumSpObjectTokens*   pEnum;
		ULONG                  ulCount = 0;

		if (FAILED(::CoInitialize(NULL)))
		{
			return;
		}
		HRESULT hr = S_OK;

		// Find the best matching installed en-us recognizer.
		CComPtr<ISpObjectToken> cpRecognizerToken;

		if (SUCCEEDED(hr))
		{
			hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken);
		}

		// Create the in-process recognizer and immediately set its state to inactive.
		CComPtr<ISpRecognizer> cpRecognizer;

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
		}

		// Create a new recognition context from the recognizer.
		CComPtr<ISpRecoContext> cpContext;

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->CreateRecoContext(&cpContext);
		}

		// Subscribe to the speech recognition event and end stream event.
		if (SUCCEEDED(hr))
		{
			ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION);
			hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
		}

		// Establish a Win32 event to signal when speech events are available.
		HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;

		if (SUCCEEDED(hr))
		{
			hr = cpContext->SetNotifyWin32Event();
		}

		if (SUCCEEDED(hr))
		{
			hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();

			if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent)
			{
				// Notification handle unsupported.
				hr = E_NOINTERFACE;
			}
		}

		// Initialize an audio object to use the default audio input of the system and set the recognizer to use it.
		CComPtr<ISpAudio> cpAudioIn;

		if (SUCCEEDED(hr))
		{
			hr = cpAudioIn.CoCreateInstance(CLSID_SpMMAudioIn);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetInput(cpAudioIn, TRUE);
		}

		// Populate a WAVEFORMATEX struct with our desired output audio format. information.
		WAVEFORMATEX* pWfexCoMemRetainedAudioFormat = NULL;
		GUID guidRetainedAudioFormat = GUID_NULL;

		if (SUCCEEDED(hr))
		{
			hr = SpConvertStreamFormatEnum(SPSF_16kHz16BitMono, &guidRetainedAudioFormat, &pWfexCoMemRetainedAudioFormat);
		}

		// Instruct the recognizer to retain the audio from its recognition results.
		if (SUCCEEDED(hr))
		{
			hr = cpContext->SetAudioOptions(SPAO_RETAIN_AUDIO, &guidRetainedAudioFormat, pWfexCoMemRetainedAudioFormat);
		}

		if (NULL != pWfexCoMemRetainedAudioFormat)
		{
			CoTaskMemFree(pWfexCoMemRetainedAudioFormat);
		}

		// Create a new grammar and load an SRGS grammar from file.
		CComPtr<ISpRecoGrammar> cpGrammar;

		if (SUCCEEDED(hr))
		{
			hr = cpContext->CreateGrammar(0, &cpGrammar);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);
		}

		// Set all top-level rules in the new grammar to the active state.
		if (SUCCEEDED(hr))
		{
			hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
		}

		// Set the recognizer state to active to begin recognition.
		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
		}

		// Establish a separate Win32 event to signal the event loop exit.
		HANDLE hExitEvent = CreateEventW(NULL, FALSE, FALSE, NULL);

		// Collect the events listened for to pump the speech event loop.
		HANDLE rghEvents[] = { hSpeechNotifyEvent, hExitEvent };

		// Speech recognition event loop.
		BOOL fContinue = TRUE;

		while (fContinue && SUCCEEDED(hr))
		{
			// Wait for either a speech event or an exit event, with a 15 second timeout.
			DWORD dwMessage = WaitForMultipleObjects(sp_countof(rghEvents), rghEvents, FALSE, 15000);

			switch (dwMessage)
			{
				// With the WaitForMultipleObjects call above, WAIT_OBJECT_0 is a speech event from hSpeechNotifyEvent.
			case WAIT_OBJECT_0:
			{
				// Sequentially grab the available speech events from the speech event queue.
				CSpEvent spevent;

				while (S_OK == spevent.GetFrom(cpContext))
				{
					switch (spevent.eEventId)
					{
					case SPEI_RECOGNITION:
					{
						// Retrieve the recognition result and output the text of that result.
						ISpRecoResult* pResult = spevent.RecoResult();

						LPWSTR pszCoMemResultText = NULL;
						hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pszCoMemResultText, NULL);

						if (SUCCEEDED(hr))
						{
							wprintf(L"Recognition event received, text=\"%s\"\r\n", pszCoMemResultText);
						}

						// Also retrieve the retained audio we requested.
						CComPtr<ISpStreamFormat> cpRetainedAudio;

						if (SUCCEEDED(hr))
						{
							hr = pResult->GetAudio(0, 0, &cpRetainedAudio);
						}

						// To demonstrate, we'll speak the retained audio back using ISpVoice.
						CComPtr<ISpVoice> cpVoice;

						if (SUCCEEDED(hr))
						{
							hr = cpVoice.CoCreateInstance(CLSID_SpVoice);
						}

						if (SUCCEEDED(hr))
						{
							hr = cpVoice->SpeakStream(cpRetainedAudio, SPF_DEFAULT, 0);
						}

						if (NULL != pszCoMemResultText)
						{
							CoTaskMemFree(pszCoMemResultText);
						}

						break;
					}
					}
				}

				break;
			}
			case WAIT_OBJECT_0 + 1:
			case WAIT_TIMEOUT:
			{
				// Exit event or timeout; discontinue the speech loop.
				fContinue = FALSE;
				//break;
			}
			}
		}

	CoUninitialize();

		CComPtr <ISpVoice>		cpVoice;
		CComPtr <ISpStream>		cpStream;
		CSpStreamFormat			cAudioFmt;

		//Create a SAPI Voice
		hr = cpVoice.CoCreateInstance(CLSID_SpVoice);

		//Set the audio format
		if (SUCCEEDED(hr))
		{
			hr = cAudioFmt.AssignFormat(SPSF_22kHz16BitMono);
		}

		//Call SPBindToFile, a SAPI helper method,  to bind the audio stream to the file
		if (SUCCEEDED(hr))
		{

			hr = SPBindToFile(L"c:\\ttstemp.wav", SPFM_CREATE_ALWAYS,
				&cpStream, &cAudioFmt.FormatId(), cAudioFmt.WaveFormatExPtr());
		}

		//set the output to cpStream so that the output audio data will be stored in cpStream
		if (SUCCEEDED(hr))
		{
			hr = cpVoice->SetOutput(cpStream, TRUE);
		}

		//Speak the text "hello world" synchronously
		if (SUCCEEDED(hr))
		{
			hr = cpVoice->Speak(L"Hello World", SPF_DEFAULT, NULL);
		}

		//close the stream
		if (SUCCEEDED(hr))
		{
			hr = cpStream->Close();
		}

		//Release the stream and voice object
		cpStream.Release();
		cpVoice.Release();

		CComPtr<ISpGrammarBuilder>    cpGrammarBuilder;
		SPSTATEHANDLE                 hStateTravel;
		// Create (if rule does not already exist)
		// top-level Rule, defaulting to Active.
		hr = cpGrammarBuilder->GetRule(L"Travel", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateTravel);

		// Approach 1: List all possible phrases.
		// This is the most intuitive approach, and it does not sacrifice efficiency
		// because the grammar builder will merge shared sub-phrases when possible.
		// There is only one root state, hStateTravel, and the terminal NULL state,
		// and there are six unique transitions between root state and NULL state.

		/* XML Approximation:
		<rule id="Travel">
		<item> fly to Seattle </item>
		<item> fly to New York </item>
		<item> fly to Washington DC </item>
		<item> drive to Seattle </item>
		<item> drive to New York </item>
		<item> drive to Washington DC </item>
		</rule>
		*/

		// Create set of peer phrases, each containing complete phrase.
		// Note: the word delimiter is set as " ", so that the text we
		// attach to the transition can be multiple words (for example,
		// "fly to Seattle" is implicitly "fly" + "to" + "Seattle"):
		if (SUCCEEDED(hr))
		{
			hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
		}
		if (SUCCEEDED(hr))
		{
			hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to New York", L" ", SPWT_LEXICAL, 1, NULL);
		}
		if (SUCCEEDED(hr))
		{
			hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"fly to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
		}
		if (SUCCEEDED(hr))
		{
			hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to Seattle", L" ", SPWT_LEXICAL, 1, NULL);
		}
		if (SUCCEEDED(hr))
		{
			hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to New York", L" ", SPWT_LEXICAL, 1, NULL);
		}
		if (SUCCEEDED(hr))
		{
			hr = cpGrammarBuilder->AddWordTransition(hStateTravel, NULL, L"drive to Washington DC", L" ", SPWT_LEXICAL, 1, NULL);
		}
		// Find the best matching installed en-US recognizer.
		//CComPtr<ISpObjectToken> cpRecognizerToken;

		if (SUCCEEDED(hr))
		{
			hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken);
		}

		// Create the in-process recognizer and immediately set its state to inactive.
		//CComPtr<ISpRecognizer> cpRecognizer;

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
		}

		// Create a new recognition context from the recognizer.
		//CComPtr<ISpRecoContext> cpContext;

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->CreateRecoContext(&cpContext);
		}

		// Subscribe to the speech recognition event and end stream event.
		if (SUCCEEDED(hr))
		{
			ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM);
			hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
		}

		// Establish a Win32 event to signal when speech events are available.
		//HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;

		if (SUCCEEDED(hr))
		{
			hr = cpContext->SetNotifyWin32Event();
		}

		if (SUCCEEDED(hr))
		{
			hr = cpContext->SetNotifyWin32Event();
		}

		if (SUCCEEDED(hr))
		{
			hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();

			if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent)
			{
				// Notification handle unsupported
				//hr = SPERR_UNITIALIZED;
			}
		}
		// Set up an audio input stream using a .wav file and set the recognizer's input.
		CComPtr<ISpStream> cpInputStream;

		if (SUCCEEDED(hr))
		{
			hr = SPBindToFile(L"Test.wav", SPFM_OPEN_READONLY, &cpInputStream);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetInput(cpInputStream, TRUE);
		}

		// Create a new grammar and load an SRGS grammar from file.
		//CComPtr<ISpRecoGrammar> cpGrammar;

		if (SUCCEEDED(hr))
		{
			hr = cpContext->CreateGrammar(0, &cpGrammar);
		}

		if (SUCCEEDED(hr))
		{
			hr = cpGrammar->LoadCmdFromFile(L"grammar.grxml", SPLO_STATIC);
		}

		// Set all top-level rules in the new grammar to the active state.
		if (SUCCEEDED(hr))
		{
			hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
		}

		// Finally, set the recognizer state to active to begin recognition.
		if (SUCCEEDED(hr))
		{
			hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
		}

		 hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void     **)&pVoice);
		if (SUCCEEDED(hr)) {
			hr = SpEnumTokens(SPCAT_VOICES, L"Gender=Female", NULL, &pEnum);
			if (SUCCEEDED(hr))
			{
				// Get the number of voices.
				hr = pEnum->GetCount(&ulCount);
			}

			// Obtain a list of available voice tokens, set
			// the voice to the token, and call Speak.
			while (SUCCEEDED(hr) && ulCount--)			{
				if (pVoiceToken != nullptr) {
					pVoiceToken->Release();
				}

				if (SUCCEEDED(hr))
				{
					hr = pEnum->Next(1, &pVoiceToken, NULL);
				}

				if (SUCCEEDED(hr))
				{
					hr = pVoice->SetVoice(pVoiceToken);
				}

				if (SUCCEEDED(hr))
				{
					wchar_t* start = L"<?xml version=\"1.0\" encoding=\"ISO - 8859 - 1\"?><speak version = \"1.0\" xmlns = \"http://www.w3.org/2001/10/synthesis\"	xml:lang = \"en-US\">";
					wchar_t* end = L"</speak>";
					const wchar_t *xml = L"<voice required = \"Gender=Male\"> hi! <prosody pitch=\"fast\"> This is low pitch. </prosody><prosody volume=\"x - loud\"> This is extra loud volume. </prosody>";
					wstring s = start;
					s += xml;
					s += end;
					
					hr = pVoice->Speak(xml, SPF_IS_XML| SPF_ASYNC, 0);
					//hr = pVoice->Speak(L"How are you?", SPF_DEFAULT, NULL);
				}

			}
			/*
			if (SUCCEEDED(hr)) {
				hr = pEnum->Next(1, &pVoiceToken, NULL);
				if (SUCCEEDED(hr)) {
					hr = pVoice->SetVoice(pVoiceToken);
					// Set the output to the default audio device.
					if (SUCCEEDED(hr)) {
						hr = pVoice->SetOutput(NULL, TRUE);
						if (SUCCEEDED(hr)) {
							hr = pVoice->Speak(L"Hello, world!", SPF_DEFAULT, 0);
						}
					}
				}
			}
			*/
			pVoice->Release();
		}
		::CoUninitialize();
	}
int _tmain( int argc, _TCHAR* argv[] )
{
	cv::setUseOptimized( true );

	// Kinectのインスタンス生成、初期化
	INuiSensor* pSensor;
	HRESULT hResult = S_OK;
	hResult = NuiCreateSensorByIndex( 0, &pSensor );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiCreateSensorByIndex" << std::endl;
		return -1;
	}

	hResult = pSensor->NuiInitialize( NUI_INITIALIZE_FLAG_USES_AUDIO );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiInitialize" << std::endl;
		return -1;
	}

	// Audioストリームの初期化(InitializeAudioStream)
	std::cout << "InitializeAudioStream" << std::endl;
	INuiAudioBeam* pNuiAudioSource;
	hResult = pSensor->NuiGetAudioSource( &pNuiAudioSource );
	if( FAILED( hResult ) ){
		std::cerr << "Error : NuiGetAudioSource" << std::endl;
		return -1;
	}

	IMediaObject* pMediaObject = nullptr;
	IPropertyStore* pPropertyStore = nullptr;
	pNuiAudioSource->QueryInterface( IID_IMediaObject, reinterpret_cast<void**>( &pMediaObject ) );
	pNuiAudioSource->QueryInterface( IID_IPropertyStore, reinterpret_cast<void**>( &pPropertyStore ) );

	PROPVARIANT propvariant;
	PropVariantInit( &propvariant );
	propvariant.vt = VT_I4;
	propvariant.lVal = static_cast<LONG>( 4 );
	pPropertyStore->SetValue( MFPKEY_WMAAECMA_SYSTEM_MODE, propvariant );
	PropVariantClear( &propvariant );

	WAVEFORMATEX waveFormat = { AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0 };
	DMO_MEDIA_TYPE mediaType = { 0 };
	MoInitMediaType( &mediaType, sizeof( WAVEFORMATEX ) );

	mediaType.majortype = MEDIATYPE_Audio;
	mediaType.subtype = MEDIASUBTYPE_PCM;
	mediaType.lSampleSize = 0;
	mediaType.bFixedSizeSamples = true;
	mediaType.bTemporalCompression = false;
	mediaType.formattype = FORMAT_WaveFormatEx;
	memcpy( mediaType.pbFormat, &waveFormat, sizeof( WAVEFORMATEX ) );

	pMediaObject->SetOutputType( 0, &mediaType, 0 ); 

	KinectAudioStream* audioStream = new KinectAudioStream( pMediaObject );

	IStream* pStream = nullptr;
	audioStream->QueryInterface( IID_IStream, reinterpret_cast<void**>( &pStream ) );

	CoInitialize( nullptr );
	ISpStream* pSpeechStream = nullptr;
	CoCreateInstance( CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), reinterpret_cast<void**>( &pSpeechStream ) );

	pSpeechStream->SetBaseStream( pStream, SPDFID_WaveFormatEx, &waveFormat );

	MoFreeMediaType( &mediaType );
	pStream->Release();
	pPropertyStore->Release();
	pMediaObject->Release();
	pNuiAudioSource->Release();

	// 音声認識器を作成(CreateSpeechRecognizer)
	std::cout << "CreateSpeechRecognizer" << std::endl;
	ISpRecognizer* pSpeechRecognizer;
	CoCreateInstance( CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), reinterpret_cast<void**>( &pSpeechRecognizer ) );

	pSpeechRecognizer->SetInput( pSpeechStream, false );

	/*
	// If can use ATL, easier to using SpFindBestToken(sphelper.h). When using Professional or more.
	ISpObjectToken* pEngineToken = nullptr;
	SpFindBestToken( SPCAT_RECOGNIZERS, L"Language=411;Kinect=True", NULL, &pEngineToken ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"
	*/

	///*
	// If can't use ATL, alternative to using SpFIndBestToken(sphelper.h). When using Express.
	const wchar_t* pVendorPreferred = L"VendorPreferred";
	const unsigned long lengthVendorPreferred = static_cast<unsigned long>( wcslen( pVendorPreferred ) );
	unsigned long length;
	ULongAdd( lengthVendorPreferred, 1, &length );
	wchar_t* pAttribsVendorPreferred = new wchar_t[ length ];
	StringCchCopyW( pAttribsVendorPreferred, length, pVendorPreferred );

	ISpObjectTokenCategory* pTokenCategory = nullptr;
	CoCreateInstance( CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, __uuidof(ISpObjectTokenCategory), reinterpret_cast<void**>( &pTokenCategory ) );

	pTokenCategory->SetId( SPCAT_RECOGNIZERS, false );

	IEnumSpObjectTokens* pEnumTokens = nullptr;
	CoCreateInstance( CLSID_SpMMAudioEnum, nullptr, CLSCTX_ALL, __uuidof(IEnumSpObjectTokens), reinterpret_cast<void**>( &pEnumTokens ) );

	pTokenCategory->EnumTokens( L"Language=411;Kinect=True", pAttribsVendorPreferred, &pEnumTokens ); // Japanese "Language=411;Kinect=True" English "Language=409;Kinect=True"

	delete[] pAttribsVendorPreferred;
	
	ISpObjectToken* pEngineToken = nullptr;
	pEnumTokens->Next( 1, &pEngineToken, nullptr );
	//*/

	pSpeechRecognizer->SetRecognizer( pEngineToken );
	
	ISpRecoContext* pSpeechContext;
	pSpeechRecognizer->CreateRecoContext( &pSpeechContext );

	pEngineToken->Release();
	///*
	pTokenCategory->Release();
	pEnumTokens->Release();
	//*/

	// 音声認識辞書の作成(LoadSpeechGrammar)
	std::cout << "LoadSpeechGrammar" << std::endl;
	ISpRecoGrammar* pSpeechGrammar;
	pSpeechContext->CreateGrammar( 1, &pSpeechGrammar );

	pSpeechGrammar->LoadCmdFromFile( L"SpeechRecognition_Ja.grxml", /*SPLO_STATIC*/SPLO_DYNAMIC ); // http://www.w3.org/TR/speech-grammar/ (UTF-8/CRLF)
	
	audioStream->StartCapture();
	pSpeechGrammar->SetRuleState( nullptr, nullptr, SPRS_ACTIVE );
	pSpeechRecognizer->SetRecoState( SPRST_ACTIVE_ALWAYS );
	pSpeechContext->SetInterest( SPFEI( SPEI_RECOGNITION ), SPFEI( SPEI_RECOGNITION ) );
	pSpeechContext->Resume( 0 );

	HANDLE hSpeechEvent = INVALID_HANDLE_VALUE;
	hSpeechEvent = pSpeechContext->GetNotifyEventHandle();
	HANDLE hEvents[1] = { hSpeechEvent };

	int width = 640;
	int height = 480;

	cv::Mat audioMat = cv::Mat::zeros( height, width, CV_8UC3 );
	cv::namedWindow( "Audio" );

	bool exit = false;

	std::cout << std::endl << "Speech Recognition Start..." << std::endl << std::endl;

	while( 1 ){
		// イベントの更新待ち
		ResetEvent( hSpeechEvent );
		unsigned long waitObject = MsgWaitForMultipleObjectsEx( ARRAYSIZE( hEvents ), hEvents, INFINITE, QS_ALLINPUT, MWMO_INPUTAVAILABLE );

		if( waitObject == WAIT_OBJECT_0 ){
			// イベントの取得
			const float confidenceThreshold = 0.3f;
			SPEVENT eventStatus;
			unsigned long eventFetch = 0;
			pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			while( eventFetch > 0 ){
				switch( eventStatus.eEventId ){
					// 音声認識イベント(SPEI_HYPOTHESIS:推定またはSPEI_RECOGNITION:認識)
					case SPEI_HYPOTHESIS:
					case SPEI_RECOGNITION:
						if( eventStatus.elParamType == SPET_LPARAM_IS_OBJECT ){
							// フレーズの取得
							ISpRecoResult* pRecoResult = reinterpret_cast<ISpRecoResult*>( eventStatus.lParam );
							SPPHRASE* pPhrase = nullptr;
							hResult = pRecoResult->GetPhrase( &pPhrase );
							if( SUCCEEDED( hResult ) ){
								if( ( pPhrase->pProperties != nullptr ) && ( pPhrase->pProperties->pFirstChild != nullptr ) ){
									// 辞書のフレーズタグと比較
									const SPPHRASEPROPERTY* pSemantic = pPhrase->pProperties->pFirstChild;
									if( pSemantic->SREngineConfidence > confidenceThreshold ){
										if( wcscmp( L"あか", pSemantic->pszValue ) == 0 ){
											std::cout << "あか" << std::endl;
											audioMat = cv::Scalar( 0, 0, 255 );
										}
										else if( wcscmp( L"みどり", pSemantic->pszValue ) == 0 ){
											std::cout << "みどり" << std::endl;
											audioMat = cv::Scalar( 0, 255, 0 );
										}
										else if( wcscmp( L"あお", pSemantic->pszValue ) == 0 ){
											std::cout << "あお" << std::endl;
											audioMat = cv::Scalar( 255, 0, 0 );
										}
										else if( wcscmp( L"おわり", pSemantic->pszValue ) == 0 ){
											exit = true;
										}
									}
								}
								CoTaskMemFree( pPhrase );
							}
						}
						break;

					default:
						break;
				}
				pSpeechContext->GetEvents( 1, &eventStatus, &eventFetch );
			}
		}

		// 表示
		cv::imshow( "Audio", audioMat );

		// ループの終了判定(Escキー)
		if( cv::waitKey( 30 ) == VK_ESCAPE || exit ){
			break;
		}
	}

	// 終了処理
	audioStream->StopCapture();
	pSpeechRecognizer->SetRecoState( SPRST_INACTIVE );
	CoUninitialize();
	pSensor->NuiShutdown();
	CloseHandle( hSpeechEvent );

	cv::destroyAllWindows();

	return 0;
}