/**
	This is called when SAPI 5.1 has an event.

	In the textless case we only handle the SPEI_RECOGNITION event; we do not
	look at SPEI_HYPOTHESIS. This might be an error, and handling both could
	make the code more robust.

	We process the event and append the phonemes we get to the result list.
**/
void sapi_textless_lipsync::callback()
{
	CSpEvent event;						// the event

	ISpRecoResult *pRecoResult;			// recoResult from the event
	SPPHRASE      *pSpPhrase;			// phrase from recoResult
	SPRECORESULTTIMES pRecoResultTimes;	// result times from RecoResult
	WCHAR phone_buffer[256];			// phoneme buffer for conversion
	long msStart;						// start time of the result in milliseconds

    while (event.GetFrom(this->m_recogCntxt) == S_OK)
    {
		if (event.eEventId == SPEI_RECOGNITION /*|| event.eEventId == SPEI_HYPOTHESIS */)
		{	
			// for textless we only accept full recognition. This might be an area
			// to watch out for
			
			// pull out the result object
			pRecoResult = event.RecoResult();

			// pull the whole text from the result
			CSpDynamicString pSapiText;
			pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);

            // get the start time for the phrase. we use this as an offset for the phrase
			// elements. Not sure if this is correct.
			pRecoResult->GetResultTimes(&pRecoResultTimes);
			msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

			// extract the phrase object
			pRecoResult->GetPhrase(&pSpPhrase);

			if (pSpPhrase != NULL)
			{
				// Process each element of the phrase. These should be our
				// orthographs.
				const SPPHRASEELEMENT *p = pSpPhrase->pElements;
				const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
				while (p != pEnd)
				{
					// for each phrase element we create a marker
					// that contains the time stamps along with the
					// phonemes associated with it.
					alignment_result al;
					al.m_orthography = p->pszDisplayText;
					// Get the phonemes. IdToPhone expects a null-terminated
					// string of phone ids, so we convert one id at a time.
					ULONG j = 0;
					SPPHONEID phn[2];
					phn[1] = 0x00;
					while (p->pszPronunciation[j] != 0)
					{
						// convert each phoneme id to its display form
						phn[0] = p->pszPronunciation[j];
						m_phnCvt->IdToPhone(phn, phone_buffer);
						al.m_phonemes.push_back(phone_buffer);
						j++;
					}
					// start time of the ortheme
					al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
					// end time of the ortheme
					al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
					al.m_msEnd += al.m_msStart;
					// add it to the results
					m_results.push_back(al);

					p++;
				}
				// GetPhrase allocates the phrase with CoTaskMemAlloc;
				// free it so we don't leak on every recognition
				::CoTaskMemFree(pSpPhrase);
			}
		}
		else if (event.eEventId == SPEI_END_SR_STREAM)
		{
			// This event occurs when the stream has finished processing.
            // we set a flag to indicate that things are done.
			m_bDone = TRUE;        
		}
    }
}
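
/**
	Both callbacks lean on two small conversion helpers that are not defined
	in this file. A minimal sketch of what they might look like follows. It
	relies on SAPI reporting stream times in 100-nanosecond units, and it
	assumes the audio stream is 16 kHz, 16-bit mono PCM; a real implementation
	should derive the byte rate from the WAVEFORMATEX actually in use.
**/
static long sapi_time_to_milli(ULONGLONG ullSapiTime)
{
	// SAPI stream times are in 100 ns units: 10,000 units per millisecond
	return (long)(ullSapiTime / 10000);
}

static long bytes_to_milli(ULONG ulBytes)
{
	// assumed format: 16,000 samples/s * 2 bytes/sample * 1 channel
	// = 32,000 bytes/s = 32 bytes per millisecond
	const ULONG bytesPerMilli = 32;
	return (long)(ulBytes / bytesPerMilli);
}
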
/**
	This is called by SAPI 5.1 when it has an event.

	We use the CSpEvent class provided by their SDK to simplify the processing.

	Basically, when we get a SPEI_RECOGNITION event or a SPEI_HYPOTHESIS event
	we process them the same way. Hypotheses are far more common; for all but
	very short files, a full SPEI_RECOGNITION is a rarity.

	Since the hypotheses will include duplicate data, we have a decision to
	make: save the newest hypothesis, or save the one that generates the most
	alignments. Empirically, sticking with the longest result seems to work
	best, but perhaps this is not so.
**/
void sapi_textbased_lipsync::callback()
{
	CSpEvent event;

	ISpRecoResult *pRecoResult;			// recoResult from the event
	SPPHRASE      *pSpPhrase;			// phrase from recoResult
	SPRECORESULTTIMES pRecoResultTimes;	// result times from RecoResult
	WCHAR phone_buffer[256];			// buffer for the phonemes
	UINT msStart;						// start time of the phrase in milliseconds

	// Process the events
	while (event.GetFrom(this->m_recogCntxt) == S_OK)
    {
		if (event.eEventId == SPEI_RECOGNITION || event.eEventId == SPEI_HYPOTHESIS)
		{
			// text-based alignment has to accept hypotheses, or it mostly
			// fails unless the script is very short

			// pull out the result object
			pRecoResult = event.RecoResult();

			// pull the whole text from the result
			CSpDynamicString pSapiText;
			pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);

			// get the start time for the phrase. we use this as an offset for the phrase
			// elements. Not sure if this is correct.
			pRecoResult->GetResultTimes(&pRecoResultTimes);
			msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);

			std::wstring strPrintText = pSapiText;
			std::cerr << "hypothesis: " << wstring_2_string(strPrintText) << std::endl;
			// if the new result is longer than the existing result in orthographic
			// form, we accept it and process the phonemes. Otherwise we skip it.
			if (wcslen(pSapiText) > this->m_strResults.size())
            {
				m_strResults = pSapiText;
				// clear the old results; this hypothesis trumps them
				this->m_results.clear();

				// extract the phrase object
				pRecoResult->GetPhrase(&pSpPhrase);

				if (pSpPhrase != NULL)
				{
					// Process each element of the phrase. These should be our
					// orthographs.
					const SPPHRASEELEMENT *p = pSpPhrase->pElements;
					const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
					while (p != pEnd)
					{
						// for each phrase element we create a marker
						// that contains the time stamps along with the
						// phonemes associated with it.
						alignment_result al;
						al.m_orthography = p->pszDisplayText;
						// Get the phonemes. IdToPhone expects a null-terminated
						// string of phone ids, so we convert one id at a time.
						ULONG j = 0;
						SPPHONEID phn[2];
						phn[1] = 0x00;
						while (p->pszPronunciation[j] != 0)
						{
							// convert each phoneme id to its display form
							phn[0] = p->pszPronunciation[j];
							m_phnCvt->IdToPhone(phn, phone_buffer);
							al.m_phonemes.push_back(phone_buffer);
							j++;
						}

						// start time of the ortheme
						al.m_msStart = msStart + bytes_to_milli(p->ulAudioStreamOffset);
						// end time of the ortheme
						al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
						al.m_msEnd += al.m_msStart;
						// add it to the results
						m_results.push_back(al);

						p++;
					}
					// GetPhrase allocates the phrase with CoTaskMemAlloc;
					// free it so we don't leak on every hypothesis
					::CoTaskMemFree(pSpPhrase);
				}
            }
		}
		else if (event.eEventId == SPEI_END_SR_STREAM)
		{
			// This event occurs when the stream has finished processing.
            // we set a flag to indicate that things are done.
			m_bDone = TRUE;        
		}
    }
}
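
/**
	For reference, a sketch of the alignment_result record as the two
	callbacks use it. This shape is inferred from the member accesses above
	(m_orthography, m_phonemes, m_msStart, m_msEnd); the actual declaration
	lives elsewhere in this project, so treat this as an illustration rather
	than the real definition.
**/
#include <string>
#include <vector>

struct alignment_result
{
	std::wstring m_orthography;				// display text of the phrase element
	std::vector<std::wstring> m_phonemes;	// phoneme strings from IdToPhone
	long m_msStart;							// start time in milliseconds
	long m_msEnd;							// end time in milliseconds
};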