/** This is called when SAPI 5.1 has an event.
    In the textless case we handle only the SPEI_RECOGNITION event and
    ignore SPEI_HYPOTHESIS. This might be an error; handling both would
    probably be more robust. We process the event and add the phonemes we
    get to the result list. **/
void sapi_textless_lipsync::callback()
{
    CSpEvent event;                     // the event
    ISpRecoResult *pRecoResult;         // recoResult from the event
    SPPHRASE *pSpPhrase;                // phrase from recoResult
    SPRECORESULTTIMES pRecoResultTimes; // result times from RecoResult
    WCHAR phone_buffer[256];            // phoneme buffer for conversion
    long msStart;                       // time stamp of the result

    while (event.GetFrom(this->m_recogCntxt) == S_OK)
    {
        if (event.eEventId == SPEI_RECOGNITION /* || event.eEventId == SPEI_HYPOTHESIS */)
        {
            // For textless alignment we accept only full recognitions.
            // This might be an area to watch out for.

            // pull out the result object
            pRecoResult = event.RecoResult();

            // pull the whole text from the result
            CSpDynamicString pSapiText;
            pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                                 &pSapiText, NULL);

            // Get the start time of the phrase; we use it as an offset for
            // the phrase elements. Not sure if this is correct.
            pRecoResult->GetResultTimes(&pRecoResultTimes);
            msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

            // extract the phrase object
            pRecoResult->GetPhrase(&pSpPhrase);

            if (pSpPhrase != NULL)
            {
                // Process each element of the phrase. These should be our
                // orthographs.
                const SPPHRASEELEMENT *p = pSpPhrase->pElements;
                const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
                while (p != pEnd)
                {
                    // For each phrase element we create a marker that
                    // contains the time stamps along with the phonemes
                    // associated with it.
                    alignment_result al;
                    al.m_orthography = p->pszDisplayText;

                    // convert each phoneme id in the pronunciation into a
                    // phoneme label
                    ULONG j = 0;
                    SPPHONEID phn[2];
                    phn[1] = 0x00;
                    while (p->pszPronunciation[j] != 0)
                    {
                        phn[0] = p->pszPronunciation[j];
                        m_phnCvt->IdToPhone(phn, phone_buffer);
                        al.m_phonemes.push_back(phone_buffer);
                        j++;
                    }

                    // start time of the ortheme
                    al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
                    // end time of the ortheme
                    al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
                    al.m_msEnd += al.m_msStart;

                    // add it to the results
                    m_results.push_back(al);
                    p++;
                }

                // GetPhrase allocates the phrase with CoTaskMemAlloc, so it
                // must be freed to avoid a leak.
                ::CoTaskMemFree(pSpPhrase);
            }
        }
        else if (event.eEventId == SPEI_END_SR_STREAM)
        {
            // The stream has finished processing; set a flag to indicate
            // that things are done.
            m_bDone = TRUE;
        }
    }
}
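/** The two conversion helpers used above are defined elsewhere in the
    project. The sketch below is NOT the original implementation; it is a
    plausible reconstruction. SAPI stream times (SPRECORESULTTIMES.ullStart)
    are in 100-nanosecond units, so that conversion is well defined. The
    byte-offset conversion depends on the audio format; here we assume
    16-bit mono PCM at a sample rate given by the hypothetical
    AUDIO_SAMPLES_PER_SEC constant, which must match the input stream. **/
static const ULONG AUDIO_SAMPLES_PER_SEC  = 16000; // assumed input format
static const ULONG AUDIO_BYTES_PER_SAMPLE = 2;     // 16-bit mono PCM

// Convert a SAPI 100-nanosecond stream time to milliseconds.
static long sapi_time_to_milli_sketch(ULONGLONG ullSapiTime)
{
    return static_cast<long>(ullSapiTime / 10000); // 10,000 x 100ns = 1 ms
}

// Convert a byte offset/size within the audio stream to milliseconds.
static long bytes_to_milli_sketch(ULONG cbBytes)
{
    const ULONG bytesPerMilli =
        (AUDIO_SAMPLES_PER_SEC * AUDIO_BYTES_PER_SAMPLE) / 1000;
    return static_cast<long>(cbBytes / bytesPerMilli);
}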
/** This is called by SAPI 5.1 when it has an event. We use the CSpEvent
    class provided by the SDK to simplify the processing. When we get a
    SPEI_RECOGNITION event or a SPEI_HYPOTHESIS event we process them the
    same way. Hypotheses are far more common; for all but very short files,
    SPEI_RECOGNITION is a rarity. Since the hypotheses contain duplicate
    data, we have a decision to make: save the newest hypothesis, or save
    the one that generates the most alignments. Empirically, keeping the
    longest result seems to work best, but perhaps this is not so. **/
void sapi_textbased_lipsync::callback()
{
    //USES_CONVERSION;
    CSpEvent event;                     // the event
    ISpRecoResult *pRecoResult;         // recoResult from the event
    SPPHRASE *pSpPhrase;                // phrase from recoResult
    SPRECORESULTTIMES pRecoResultTimes; // result times from RecoResult
    WCHAR phone_buffer[256];            // buffer for the phonemes
    UINT msStart;                       // start time of the phrase

    // Process the events
    while (event.GetFrom(this->m_recogCntxt) == S_OK)
    {
        if (event.eEventId == SPEI_RECOGNITION ||
            event.eEventId == SPEI_HYPOTHESIS)
        {
            // Text-based alignment has to accept hypotheses or it mostly
            // fails unless the script is very short.

            // pull out the result object
            pRecoResult = event.RecoResult();

            // pull the whole text from the result
            CSpDynamicString pSapiText;
            pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE,
                                 &pSapiText, NULL);

            // Get the start time of the phrase; we use it as an offset for
            // the phrase elements. Not sure if this is correct.
            pRecoResult->GetResultTimes(&pRecoResultTimes);
            msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

            std::wstring strPrintText = pSapiText;
            std::cerr << "hypothesis: " << wstring_2_string(strPrintText)
                      << std::endl;

            // If the new result is longer than the existing result in its
            // orthographic form, accept it and process its phonemes;
            // otherwise skip it.
            if (wcslen(pSapiText) > this->m_strResults.size())
            {
                m_strResults = pSapiText;

                // clear the old results; this hypothesis trumps them
                this->m_results.clear();

                // extract the phrase object
                pRecoResult->GetPhrase(&pSpPhrase);

                if (pSpPhrase != NULL)
                {
                    // Process each element of the phrase. These should be
                    // our orthographs.
                    const SPPHRASEELEMENT *p = pSpPhrase->pElements;
                    const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
                    while (p != pEnd)
                    {
                        // For each phrase element we create a marker that
                        // contains the time stamps along with the phonemes
                        // associated with it.
                        alignment_result al;
                        al.m_orthography = p->pszDisplayText;

                        // convert each phoneme id in the pronunciation into
                        // a phoneme label
                        ULONG j = 0;
                        SPPHONEID phn[2];
                        phn[1] = 0x00;
                        while (p->pszPronunciation[j] != 0)
                        {
                            phn[0] = p->pszPronunciation[j];
                            m_phnCvt->IdToPhone(phn, phone_buffer);
                            al.m_phonemes.push_back(phone_buffer);
                            j++;
                        }

                        // start time of the ortheme
                        al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
                        // end time of the ortheme
                        al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
                        al.m_msEnd += al.m_msStart;

                        // add it to the results
                        m_results.push_back(al);
                        p++;
                    }

                    // GetPhrase allocates the phrase with CoTaskMemAlloc,
                    // so it must be freed to avoid a leak.
                    ::CoTaskMemFree(pSpPhrase);
                }
            }
        }
        else if (event.eEventId == SPEI_END_SR_STREAM)
        {
            // The stream has finished processing; set a flag to indicate
            // that things are done.
            m_bDone = TRUE;
        }
    }
}
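/** Both callbacks above populate alignment_result records. The real
    definition lives in a header that is not part of this excerpt; the
    sketch below is an assumption inferred from the usage above. Field
    names match the usage, but the actual types may differ. Assumes
    <string> and <vector> are included. **/
struct alignment_result_sketch
{
    std::wstring              m_orthography; // display text of the phrase element
    std::vector<std::wstring> m_phonemes;    // phoneme labels from IdToPhone
    long                      m_msStart;     // element start, ms from stream start
    long                      m_msEnd;       // element end, ms from stream start
};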