/*
	Synthesize `text` with the espeak engine and return the result as a Sound.
	Optionally also returns:
	- *tg:     a TextGrid built from the synthesis events, cut to the Sound's domain;
	- *events: the raw event Table collected during synthesis.
	Either output pointer may be null, in which case that output is skipped.
	NOTE(review): espeak state is global — this function initializes and terminates
	the engine on every call (also on the error path), so it is not reentrant.
*/
autoSound SpeechSynthesizer_to_Sound (SpeechSynthesizer me, const char32 *text, autoTextGrid *tg, autoTable *events) {
	try {
		// Synchronous output: espeak delivers samples through synthCallback below,
		// which fills my d_wav / my d_events. fsamp is the engine sampling frequency,
		// or -1 on failure.
		int fsamp = espeak_Initialize (AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr, // 5000ms
			espeakINITIALIZE_PHONEME_EVENTS|espeakINITIALIZE_PHONEME_IPA);
		if (fsamp == -1) {
			Melder_throw (U"Internal espeak error.");
		}
		// Input is passed as wchar_t (see the _WIN32 branch below); enable SSML
		// parsing for tagged text, and phoneme mnemonics unless the input is plain text.
		int synth_flags = espeakCHARS_WCHAR;
		if (my d_inputTextFormat == SpeechSynthesizer_INPUT_TAGGEDTEXT) {
			synth_flags |= espeakSSML;
		}
		if (my d_inputTextFormat != SpeechSynthesizer_INPUT_TEXTONLY) {
			synth_flags |= espeakPHONEMES;
		}
		// Request phoneme events from the engine; add IPA coding if configured.
		option_phoneme_events = espeakINITIALIZE_PHONEME_EVENTS; // extern int option_phoneme_events;
		if (my d_outputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA) {
			option_phoneme_events |= espeakINITIALIZE_PHONEME_IPA;
		}
		// Push the synthesizer's prosody settings into the (global) espeak state.
		espeak_SetParameter (espeakRATE, my d_wordsPerMinute, 0);
		espeak_SetParameter (espeakPITCH, my d_pitchAdjustment, 0);
		espeak_SetParameter (espeakRANGE, my d_pitchRange, 0);
		const char32 *voiceLanguageCode = SpeechSynthesizer_getVoiceLanguageCodeFromName (me, my d_voiceLanguageName);
		const char32 *voiceVariantCode = SpeechSynthesizer_getVoiceVariantCodeFromName (me, my d_voiceVariantName);
		espeakdata_SetVoiceByName ((const char *) Melder_peek32to8 (voiceLanguageCode),
			(const char *) Melder_peek32to8 (voiceVariantCode));
		espeak_SetParameter (espeakWORDGAP, my d_wordgap * 100, 0); // espeak wordgap is in units of 10 ms
		espeak_SetParameter (espeakCAPITALS, 0, 0);
		espeak_SetParameter (espeakPUNCTUATION, espeakPUNCT_NONE, 0);
		espeak_SetSynthCallback (synthCallback);
		// Fresh event table; synthCallback appends one row per espeak event.
		my d_events = Table_createWithColumnNames (0, U"time type type-t t-pos length a-pos sample id uniq");
#ifdef _WIN32
		// On Windows wchar_t is 16-bit, so convert from char32 before handing to espeak.
		wchar_t *textW = Melder_peek32toW (text);
		espeak_Synth (textW, wcslen (textW) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
#else
		espeak_Synth (text, str32len (text) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
#endif
		espeak_Terminate ();
		// Wrap the buffered samples in a Sound; resample if the requested output
		// frequency differs from espeak's internal one.
		autoSound thee = buffer_to_Sound (my d_wav, my d_numberOfSamples, my d_internalSamplingFrequency);
		if (my d_samplingFrequency != my d_internalSamplingFrequency) {
			thee = Sound_resample (thee.get(), my d_samplingFrequency, 50);
		}
		my d_numberOfSamples = 0; // re-use the wav-buffer
		if (tg) {
			// Widen the event-table domain to at least the Sound's domain before
			// building the TextGrid, then cut the result back to the Sound's domain.
			double xmin = Table_getNumericValue_Assert (my d_events.get(), 1, 1);
			if (xmin > thy xmin) {
				xmin = thy xmin;
			}
			double xmax = Table_getNumericValue_Assert (my d_events.get(), my d_events -> rows.size, 1);
			if (xmax < thy xmax) {
				xmax = thy xmax;
			}
			autoTextGrid tg1 = Table_to_TextGrid (my d_events.get(), text, xmin, xmax);
			*tg = TextGrid_extractPart (tg1.get(), thy xmin, thy xmax, 0);
		}
		if (events) {
			// Transfer ownership of the event table to the caller.
			Table_setEventTypeString (my d_events.get());
			*events = my d_events.move();
		}
		my d_events.reset();
		return thee;
	} catch (MelderError) {
		espeak_Terminate ();   // release the global engine even on failure
		Melder_throw (U"Text not played.");
	}
}
// We assume that the Sound and the SpeechSynthesizer have the same samplingFrequency
// switch off warnings about silence detection
/*
	Align the text of a TextInterval with a Sound by
	(1) trimming leading/trailing silences from the real Sound,
	(2) synthesizing the interval's text and trimming that too,
	(3) running a DTW between the two trimmed sounds, and
	(4) mapping the synthesized TextGrid through the DTW onto the real Sound's time axis.
	Side effect: when my d_estimateWordsPerMinute is set, my d_wordsPerMinute is
	overwritten with a speaking-rate estimate derived from the interval's text.
*/
autoTextGrid SpeechSynthesizer_and_Sound_and_TextInterval_align (SpeechSynthesizer me, Sound thee, TextInterval him, double silenceThreshold, double minSilenceDuration, double minSoundingDuration) {
	try {
		if (thy xmin != his xmin || thy xmax != his xmax) {
			Melder_throw (U"Domains of Sound and TextGrid must be equal.");
		}
		if (fabs (1.0 / thy dx - my d_samplingFrequency) > NUMfpp -> eps) {
			Melder_throw (U"The sampling frequencies of the SpeechSynthesizer and the Sound must be equal.");
		}
		long numberOfTokens = Melder_countTokens (his text);
		if (numberOfTokens == 0) {
			Melder_throw (U"The interval has no text.");
		}
		// Remove silent intervals from start and end of sounds
		double minPitch = 200, timeStep = 0.005, precision = thy dx;
		double t1_thee, t2_thee;   // start/end times of the sounding part of the real Sound
		autoSound s_thee = Sound_trimSilencesAtStartAndEnd (thee, 0.0, minPitch, timeStep,
			silenceThreshold, minSilenceDuration, minSoundingDuration, &t1_thee, &t2_thee);
		double s_thee_duration = s_thee -> xmax - s_thee -> xmin;
		// Did trimming actually remove anything (beyond one sample of tolerance)?
		bool hasSilence_thee = fabs (t1_thee - thy xmin) > precision || fabs (t2_thee - thy xmax) > precision;
		if (my d_estimateWordsPerMinute) {
			// estimate speaking rate with the number of words per minute from the text
			double wordsPerMinute_rawTokens = 60.0 * numberOfTokens / s_thee_duration;
			// compensation for long words: 5 characters / word
			double wordsPerMinute_rawText = 60.0 * (str32len (his text) / 5.0) / s_thee_duration;
			my d_wordsPerMinute = (long) floor (0.5 * (wordsPerMinute_rawTokens + wordsPerMinute_rawText));
		}
		// Synthesize the interval's text; tg2 is the synthesized segmentation.
		autoTextGrid tg2;
		autoSound s2 = SpeechSynthesizer_and_TextInterval_to_Sound (me, him, & tg2);
		autoTextGrid silentTextGrid;
		/*
		 * For the synthesizer the silence threshold has to be < -30 dB, otherwise fricatives will not
		 * be found as sounding! This is ok since silences are almost at zero amplitudes.
		 * We also have to decrease the minimum silence and minimum sounding duration to catch, for example,
		 * the final plosive "t" from the word "text".
		 */
		double s2_silenceThreshold = -40.0, s2_minSilenceDuration = 0.05, s2_minSoundingDuration = 0.05;
		double t1_s2, t2_s2;   // start/end times of the sounding part of the synthesized sound
		autoSound s_s2 = Sound_trimSilencesAtStartAndEnd (s2.peek(), 0.0, minPitch, timeStep,
			s2_silenceThreshold, s2_minSilenceDuration, s2_minSoundingDuration, & t1_s2, & t2_s2);
		double s_s2_duration = s_s2 -> xmax - s_s2 -> xmin;
		bool hasSilence_s2 = fabs (t1_s2 - s2 -> xmin) > precision || fabs (t2_s2 - s2 -> xmax) > precision;
		if (hasSilence_s2) {
			// Keep only the part of the synthesized TextGrid that matches the trimmed sound.
			silentTextGrid = TextGrid_extractPart (tg2.peek(), t1_s2, t2_s2, true);
		}
		double analysisWidth = 0.02, dt = 0.005, band = 0.0;
		// compare the durations of the two sounds to get an indication of the slope constraint of the DTW
		double slope = s_thee_duration / s_s2_duration;
		slope = slope > 1 ? slope : 1 / slope;
		// Looser duration ratio -> looser DTW slope constraint (4 = tightest, 1 = loosest).
		int constraint = slope < 1.5 ? 4 : (slope < 2 ? 3 : (slope < 3 ? 2 : 1));
		// Earlier MFCC-based DTW variant, kept for reference:
		//autoMFCC m1 = Sound_to_MFCC ((hasSilence_thee ? s_thee.peek() : thee),
		//	 numberOfCoefficients, analysisWidth, dt, f1_mel, fmax_mel, df_mel);
		//autoMFCC m2 = Sound_to_MFCC ((hasSilence_s2 ? s_s2.peek() : s2.peek()),
		//	 numberOfCoefficients, analysisWidth, dt, f1_mel, fmax_mel, df_mel);
		//double wc = 1, wle = 0, wr = 0, wer = 0, dtr = 0;
		//int matchStart = 1, matchEnd = 1, constraint = 4; // no 1/3 1/2 2/3
		//autoDTW dtw = CCs_to_DTW (m1.peek(), m2.peek(), wc, wle, wr, wer, dtr, matchStart, matchEnd, constraint);
		// DTW between the (possibly trimmed) real and synthesized sounds.
		autoDTW dtw = Sounds_to_DTW ((hasSilence_thee ? s_thee.peek() : thee),
			(hasSilence_s2 ? s_s2.peek() : s2.peek()), analysisWidth, dt, band, constraint);
		// Map the synthesized segmentation (trimmed version if we trimmed) through the DTW.
		autoTextGrid result = DTW_and_TextGrid_to_TextGrid (dtw.peek(),
			(hasSilence_s2 ? silentTextGrid.peek() : tg2.peek()), precision);
		if (hasSilence_thee) {
			// Re-extend the result to the full domain of the original Sound,
			// padding the trimmed silences with empty intervals.
			if (t1_thee > thy xmin) {
				TextGrid_setEarlierStartTime (result.peek(), thy xmin, U"", U"");
			}
			if (t2_thee < thy xmax) {
				TextGrid_setLaterEndTime (result.peek(), thy xmax, U"", U"");
			}
		}
		return result;
	} catch (MelderError) {
		Melder_throw (U"Sound and TextInterval not aligned.");
	}
}