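/*
 * Synthesize a Sound from text with the eSpeak engine.
 * Optionally also return a TextGrid with the alignment (via *tg)
 * and the Table of synthesis events (via *events).
 */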
autoSound SpeechSynthesizer_to_Sound (SpeechSynthesizer me, const char32 *text, autoTextGrid *tg, autoTable *events) {
	try {
		int fsamp = espeak_Initialize (AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr,   // buflength 0: use espeak's default buffer length
			espeakINITIALIZE_PHONEME_EVENTS|espeakINITIALIZE_PHONEME_IPA);
		if (fsamp == -1) {
			Melder_throw (U"Internal espeak error.");
		}
		int synth_flags = espeakCHARS_WCHAR;
		if (my d_inputTextFormat == SpeechSynthesizer_INPUT_TAGGEDTEXT) {
			synth_flags |= espeakSSML;
		}
		if (my d_inputTextFormat != SpeechSynthesizer_INPUT_TEXTONLY) {
			synth_flags |= espeakPHONEMES;
		}
		option_phoneme_events = espeakINITIALIZE_PHONEME_EVENTS; // extern int option_phoneme_events;
		if (my d_outputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA) {
			option_phoneme_events |= espeakINITIALIZE_PHONEME_IPA;
		}

		espeak_SetParameter (espeakRATE, my d_wordsPerMinute, 0);
		espeak_SetParameter (espeakPITCH, my d_pitchAdjustment, 0);
		espeak_SetParameter (espeakRANGE, my d_pitchRange, 0);
		const char32 *voiceLanguageCode = SpeechSynthesizer_getVoiceLanguageCodeFromName (me, my d_voiceLanguageName);
		const char32 *voiceVariantCode = SpeechSynthesizer_getVoiceVariantCodeFromName (me, my d_voiceVariantName);
		espeakdata_SetVoiceByName ((const char *) Melder_peek32to8 (voiceLanguageCode), 
			(const char *) Melder_peek32to8 (voiceVariantCode));

		espeak_SetParameter (espeakWORDGAP, my d_wordgap * 100, 0); // espeak wordgap is in units of 10 ms
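		// for example, a wordgap of 0.25 s becomes 0.25 * 100 = 25 units of 10 ms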
		espeak_SetParameter (espeakCAPITALS, 0, 0);
		espeak_SetParameter (espeakPUNCTUATION, espeakPUNCT_NONE, 0);

		espeak_SetSynthCallback (synthCallback);
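		// synthCallback is expected to append the synthesized samples to my d_wav
		// and to record the synthesis events in my d_events (both are read below)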

		my d_events = Table_createWithColumnNames (0, U"time type type-t t-pos length a-pos sample id uniq");

		#ifdef _WIN32
			wchar_t *textW = Melder_peek32toW (text);
			espeak_Synth (textW, wcslen (textW) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
		#else
			espeak_Synth (text, str32len (text) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
		#endif

		espeak_Terminate ();
		autoSound thee = buffer_to_Sound (my d_wav, my d_numberOfSamples, my d_internalSamplingFrequency);

		if (my d_samplingFrequency != my d_internalSamplingFrequency) {
			thee = Sound_resample (thee.get(), my d_samplingFrequency, 50);   // 50: interpolation precision
		}
		my d_numberOfSamples = 0; // re-use the wav-buffer
		if (tg) {
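			// extend the domain of the events table so that the resulting TextGrid
			// covers the complete Sound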
			double xmin = Table_getNumericValue_Assert (my d_events.get(), 1, 1);
			if (xmin > thy xmin) {
				xmin = thy xmin;
			}
			double xmax = Table_getNumericValue_Assert (my d_events.get(), my d_events -> rows.size, 1);
			if (xmax < thy xmax) {
				xmax = thy xmax;
			}
			autoTextGrid tg1 = Table_to_TextGrid (my d_events.get(), text, xmin, xmax);
			*tg = TextGrid_extractPart (tg1.get(), thy xmin, thy xmax, 0);
		}
		if (events) {
			Table_setEventTypeString (my d_events.get());
			*events = my d_events.move();
		}
		my d_events.reset();
		return thee;
	} catch (MelderError) {
		espeak_Terminate ();
		Melder_throw (U"Text not played.");
	}
}
// We assume that the Sound and the SpeechSynthesizer have the same samplingFrequency
// switch off warnings about silence detection
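/*
 * Overall approach: trim leading and trailing silences from the Sound, synthesize the
 * interval's text, align the synthesized and the real sound with DTW, and map the
 * synthesis TextGrid through the DTW path back onto the original time domain.
 */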
autoTextGrid SpeechSynthesizer_and_Sound_and_TextInterval_align (SpeechSynthesizer me, Sound thee, TextInterval him, double silenceThreshold, double minSilenceDuration, double minSoundingDuration) {
	try {
		if (thy xmin != his xmin || thy xmax != his xmax) {
			Melder_throw (U"Domains of Sound and TextGrid must be equal.");
		}
		if (fabs (1.0 / thy dx - my d_samplingFrequency) > NUMfpp -> eps) {
			Melder_throw (U"The sampling frequencies of the SpeechSynthesizer and the Sound must be equal.");
		}
		long numberOfTokens = Melder_countTokens (his text);
		if (numberOfTokens == 0) {
			Melder_throw (U"The interval has no text.");
		}
		// Remove silent intervals from start and end of sounds
		double minPitch = 200, timeStep = 0.005, precision = thy dx;
		double t1_thee, t2_thee;
		autoSound s_thee = Sound_trimSilencesAtStartAndEnd (thee, 0.0, minPitch, timeStep,
			silenceThreshold, minSilenceDuration, minSoundingDuration, &t1_thee, &t2_thee);
		double s_thee_duration = s_thee -> xmax - s_thee -> xmin;
		bool hasSilence_thee = fabs (t1_thee - thy xmin) > precision || fabs (t2_thee - thy xmax) > precision;

		if (my d_estimateWordsPerMinute) {
			// estimate the speaking rate as the number of words per minute from the text
			double wordsPerMinute_rawTokens = 60.0 * numberOfTokens / s_thee_duration;
			// compensate for long words: assume 5 characters per word
			double wordsPerMinute_rawText = 60.0 * (str32len (his text) / 5.0) / s_thee_duration;
			my d_wordsPerMinute = (long) floor (0.5 * (wordsPerMinute_rawTokens + wordsPerMinute_rawText));
		}
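		// For example (hypothetical numbers): 12 tokens and 90 characters in 6 s of speech give
		// wordsPerMinute_rawTokens = 60 * 12 / 6 = 120 and wordsPerMinute_rawText = 60 * (90 / 5) / 6 = 180,
		// so d_wordsPerMinute becomes floor (0.5 * (120 + 180)) = 150.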
		autoTextGrid tg2;
		autoSound s2 = SpeechSynthesizer_and_TextInterval_to_Sound (me, him, & tg2);
		autoTextGrid silentTextGrid;
		/*
		 * For the synthesizer the silence threshold has to be below -30 dB, otherwise fricatives will
		 * not be detected as sounding. This is OK because the silences are at almost zero amplitude.
		 * We also have to decrease the minimum silence and minimum sounding durations to catch,
		 * for example, the final plosive "t" of the word "text".
		 */
		double s2_silenceThreshold = -40.0, s2_minSilenceDuration = 0.05, s2_minSoundingDuration = 0.05;
		double t1_s2, t2_s2;
		autoSound s_s2 = Sound_trimSilencesAtStartAndEnd (s2.peek(), 0.0, minPitch, timeStep,
			s2_silenceThreshold, s2_minSilenceDuration, s2_minSoundingDuration, & t1_s2, & t2_s2);
		double s_s2_duration = s_s2 -> xmax - s_s2 -> xmin;
		bool hasSilence_s2 = fabs (t1_s2 - s2 -> xmin) > precision || fabs (t2_s2 - s2 -> xmax) > precision;
		if (hasSilence_s2) {
			silentTextGrid = TextGrid_extractPart (tg2.peek(), t1_s2, t2_s2, true);
		}
		double analysisWidth = 0.02, dt = 0.005, band = 0.0;
		// compare the durations of the two sounds to get an indication of the slope constraint for the DTW
		double slope = s_thee_duration / s_s2_duration;
		slope = slope > 1 ? slope : 1 / slope;   // make the duration ratio >= 1
		// the more similar the durations, the stricter the slope constraint
		// (1 = none, 2 = 1/3, 3 = 1/2, 4 = 2/3; cf. the commented-out alternative below)
		int constraint = slope < 1.5 ? 4 : (slope < 2 ? 3 : (slope < 3 ? 2 : 1));
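		// For example (hypothetical durations): 2.0 s versus 1.2 s gives slope = 2.0 / 1.2 ≈ 1.67,
		// which selects constraint level 3 (a 1/2 slope constraint).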
		//autoMFCC m1 = Sound_to_MFCC ((hasSilence_thee ? s_thee.peek() : thee),
		//	numberOfCoefficients, analysisWidth, dt, f1_mel, fmax_mel, df_mel);
		//autoMFCC m2 = Sound_to_MFCC ((hasSilence_s2 ? s_s2.peek() : s2.peek()),
		//	numberOfCoefficients, analysisWidth, dt, f1_mel, fmax_mel, df_mel);
		//double wc = 1, wle = 0, wr = 0, wer = 0, dtr = 0;
		//int matchStart = 1, matchEnd = 1, constraint = 4; // no 1/3 1/2 2/3
		//autoDTW dtw = CCs_to_DTW (m1.peek(), m2.peek(), wc, wle, wr, wer, dtr, matchStart, matchEnd, constraint);
		autoDTW dtw = Sounds_to_DTW ((hasSilence_thee ? s_thee.peek() : thee),
			(hasSilence_s2 ? s_s2.peek() : s2.peek()), analysisWidth, dt, band, constraint);
		autoTextGrid result = DTW_and_TextGrid_to_TextGrid (dtw.peek(),
			(hasSilence_s2 ? silentTextGrid.peek() : tg2.peek()), precision);
		if (hasSilence_thee) {
			if (t1_thee > thy xmin) {
				TextGrid_setEarlierStartTime (result.peek(), thy xmin, U"", U"");
			}
			if (t2_thee < thy xmax) {
				TextGrid_setLaterEndTime (result.peek(), thy xmax, U"", U"");
			}
		}
		return result;
	} catch (MelderError) {
		Melder_throw (U"Sound and TextInterval not aligned.");
	}
}