void Sound_playPart (Sound me, double tmin, double tmax,
    int (*callback) (void *closure, int phase, double tmin, double tmax, double t), void *closure)
{
    try {
        long ifsamp = lround (1.0 / my dx);
        long bestSampleRate = MelderAudio_getOutputBestSampleRate (ifsamp);
        if (ifsamp == bestSampleRate) {
            struct SoundPlay *thee = (struct SoundPlay *) & thePlayingSound;
            double *fromLeft = my z [1], *fromRight = my ny > 1 ? my z [2] : NULL;
            MelderAudio_stopPlaying (MelderAudio_IMPLICIT);
            long i1, i2;
            if ((thy numberOfSamples = Matrix_getWindowSamplesX (me, tmin, tmax, & i1, & i2)) < 1)
                return;
            thy tmin = tmin;
            thy tmax = tmax;
            thy dt = my dx;
            thy t1 = my x1;
            thy callback = callback;
            thy closure = closure;
            thy silenceBefore = (long) (ifsamp * MelderAudio_getOutputSilenceBefore ());
            thy silenceAfter = (long) (ifsamp * MelderAudio_getOutputSilenceAfter ());
            int numberOfChannels = my ny;
            NUMvector_free (thy buffer, 1);   // just in case
            thy buffer = NUMvector <short> (1, (i2 - i1 + 1 + thy silenceBefore + thy silenceAfter) * numberOfChannels);
            thy i1 = i1;
            thy i2 = i2;
            short *to = thy buffer + thy silenceBefore * numberOfChannels;
            if (numberOfChannels > 2) {
                for (long i = i1; i <= i2; i ++) {
                    for (long chan = 1; chan <= my ny; chan ++) {
                        long value = (long) round (my z [chan] [i] * 32768.0);
                        * ++ to = value < -32768 ? -32768 : value > 32767 ? 32767 : value;   // round and clip to 16 bits
                    }
                }
            } else if (numberOfChannels == 2) {
                for (long i = i1; i <= i2; i ++) {
                    long valueLeft = (long) round (fromLeft [i] * 32768.0);
                    * ++ to = valueLeft < -32768 ? -32768 : valueLeft > 32767 ? 32767 : valueLeft;
                    long valueRight = (long) round (fromRight [i] * 32768.0);
                    * ++ to = valueRight < -32768 ? -32768 : valueRight > 32767 ? 32767 : valueRight;
                }
            } else {
                for (long i = i1; i <= i2; i ++) {
                    long value = (long) round (fromLeft [i] * 32768.0);
                    * ++ to = value < -32768 ? -32768 : value > 32767 ? 32767 : value;
                }
            }
            if (thy callback)
                thy callback (thy closure, 1, tmin, tmax, tmin);   // phase 1: playing is starting
            MelderAudio_play16 (thy buffer + 1, ifsamp,
                thy silenceBefore + thy numberOfSamples + thy silenceAfter,
                numberOfChannels, melderPlayCallback, thee);
        } else {
            autoSound resampled = Sound_resample (me, bestSampleRate, 1);   // fast resampling: precision 1
            Sound_playPart (resampled.peek(), tmin, tmax, callback, closure);   // recursively
        }
    } catch (MelderError) {
        Melder_throw (me, U": not played.");
    }
}
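/*
    Illustrative sketch (not part of the original sources): the 16-bit conversion used
    in the playing loops above, isolated as a standalone helper. A double sample,
    nominally in [-1, 1), is scaled by 32768, rounded, and clamped to the range of a
    signed 16-bit integer. The helper name is hypothetical.
*/
#include <cmath>
#include <cstdint>

static int16_t doubleTo16Bit (double sample) {
    long value = (long) std::round (sample * 32768.0);
    if (value < -32768) value = -32768;   // clip negative overflow
    if (value > 32767) value = 32767;     // clip positive overflow (e.g. for sample == 1.0)
    return (int16_t) value;
}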
autoFormant Sound_to_Formant_any (Sound me, double dt, int numberOfPoles, double maximumFrequency,
    double halfdt_window, int which, double preemphasisFrequency, double safetyMargin)
{
    double nyquist = 0.5 / my dx;
    autoSound sound;
    if (maximumFrequency <= 0.0 || fabs (maximumFrequency / nyquist - 1.0) < 1.0e-12) {
        sound = Data_copy (me);   // will be modified
    } else {
        sound = Sound_resample (me, maximumFrequency * 2.0, 50);
    }
    return Sound_to_Formant_any_inline (sound.peek(), dt, numberOfPoles, halfdt_window, which, preemphasisFrequency, safetyMargin);
}
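/*
    Sketch (an assumption, not part of the sources): the resampling decision above in
    isolation. If the requested formant ceiling already equals the Nyquist frequency
    of the sound (within rounding error), no resampling is needed; otherwise the sound
    is resampled so that its new Nyquist frequency equals the ceiling. The helper name
    is hypothetical.
*/
#include <cmath>

static bool needsResampling (double maximumFrequency, double samplingPeriod) {
    double nyquist = 0.5 / samplingPeriod;
    return maximumFrequency > 0.0 && std::fabs (maximumFrequency / nyquist - 1.0) >= 1.0e-12;
}
/*
    Example: a 44100-Hz sound (samplingPeriod = 1/44100) analysed with a 5500-Hz
    ceiling gives needsResampling (5500.0, 1.0 / 44100.0) == true, so the sound is
    first downsampled to 11000 Hz.
*/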
autoSpectrum Spectrum_resample (Spectrum me, long numberOfFrequencies) {
    try {
        double newSamplingFrequency = (1.0 / my dx) * numberOfFrequencies / my nx;
        // Resample the real and the imaginary part!
        autoSound thee = Sound_resample ((Sound) me, newSamplingFrequency, 50);
        autoSpectrum him = Spectrum_create (my xmax, numberOfFrequencies);
        NUMmatrix_copyElements<double> (thy z, his z, 1, 2, 1, numberOfFrequencies);
        return him;
    } catch (MelderError) {
        Melder_throw (me, U": not resampled.");
    }
}
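/*
    Worked note (illustrative): a Spectrum stores its real and imaginary parts as the
    two rows of its matrix, so casting it to a Sound and resampling interpolates both
    rows at once. For a spectrum with nx = 1025 bins of width dx = 10 Hz, resampling
    to numberOfFrequencies = 2048 bins uses
        newSamplingFrequency = (1 / 10) * 2048 / 1025 ≈ 0.1998 "samples per Hz",
    i.e. the bin spacing shrinks to about 5.005 Hz over the same frequency range.
*/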
PowerCepstrogram Sound_to_PowerCepstrogram (Sound me, double pitchFloor, double dt, double maximumFrequency, double preEmphasisFrequency) {
    try {
        // The minimum analysis window contains 3 periods of the lowest pitch.
        double analysisWidth = 3.0 / pitchFloor;
        double windowDuration = 2.0 * analysisWidth;   // Gaussian window
        long nFrames;
        // Convenience: analyse the whole sound into one Cepstrogram frame.
        if (windowDuration > my dx * my nx) {
            windowDuration = my dx * my nx;
        }
        double t1, samplingFrequency = 2.0 * maximumFrequency;
        autoSound sound = Sound_resample (me, samplingFrequency, 50);
        Sound_preEmphasis (sound.peek(), preEmphasisFrequency);
        Sampled_shortTermAnalysis (me, windowDuration, dt, & nFrames, & t1);
        autoSound sframe = Sound_createSimple (1, windowDuration, samplingFrequency);
        autoSound window = Sound_createGaussian (windowDuration, samplingFrequency);
        // Find out the size of the FFT.
        long nfft = 2;
        while (nfft < sframe -> nx) {
            nfft *= 2;
        }
        long nq = nfft / 2 + 1;
        double qmax = 0.5 * nfft / samplingFrequency, dq = qmax / (nq - 1);
        autoPowerCepstrogram thee = PowerCepstrogram_create (my xmin, my xmax, nFrames, dt, t1, 0, qmax, nq, dq, 0);
        autoMelderProgress progress (L"Cepstrogram analysis");
        for (long iframe = 1; iframe <= nFrames; iframe ++) {
            double t = Sampled_indexToX (thee.peek(), iframe);
            Sound_into_Sound (sound.peek(), sframe.peek(), t - windowDuration / 2);
            Vector_subtractMean (sframe.peek());
            Sounds_multiply (sframe.peek(), window.peek());
            autoSpectrum spec = Sound_to_Spectrum (sframe.peek(), 1);   // FFT: yes
            autoPowerCepstrum cepstrum = Spectrum_to_PowerCepstrum (spec.peek());
            for (long i = 1; i <= nq; i ++) {
                thy z [i] [iframe] = cepstrum -> z [1] [i];
            }
            if ((iframe % 10) == 1) {
                Melder_progress ((double) iframe / nFrames, L"PowerCepstrogram analysis of frame ",
                    Melder_integer (iframe), L" out of ", Melder_integer (nFrames), L".");
            }
        }
        return thee.transfer();
    } catch (MelderError) {
        Melder_throw (me, ": no PowerCepstrogram created.");
    }
}
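/*
    Worked example (illustrative, not part of the sources): the sizes that the routine
    above derives for typical values pitchFloor = 60 Hz and maximumFrequency = 5000 Hz.
*/
#include <cstdio>

int main () {
    double pitchFloor = 60.0, maximumFrequency = 5000.0;
    double analysisWidth = 3.0 / pitchFloor;             // 0.05 s: three periods of the lowest pitch
    double windowDuration = 2.0 * analysisWidth;         // 0.10 s Gaussian window
    double samplingFrequency = 2.0 * maximumFrequency;   // 10000 Hz after resampling
    long frameSamples = (long) (windowDuration * samplingFrequency);   // 1000 samples per frame
    long nfft = 2;
    while (nfft < frameSamples) nfft *= 2;               // 1024
    long nq = nfft / 2 + 1;                              // 513 quefrency bins
    double qmax = 0.5 * nfft / samplingFrequency;        // 0.0512 s
    std::printf ("nfft = %ld, nq = %ld, qmax = %g s\n", nfft, nq, qmax);
    return 0;
}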
Cepstrogram Sound_to_Cepstrogram (Sound me, double analysisWidth, double dt, double maximumFrequency) {
    try {
        double windowDuration = 2.0 * analysisWidth;   // Gaussian window
        long nFrames;
        // Convenience: analyse the whole sound into one Cepstrogram frame.
        if (windowDuration > my dx * my nx) {
            windowDuration = my dx * my nx;
        }
        double t1, samplingFrequency = 2.0 * maximumFrequency;
        autoSound sound = Sound_resample (me, samplingFrequency, 50);
        Sampled_shortTermAnalysis (me, windowDuration, dt, & nFrames, & t1);
        autoSound sframe = Sound_createSimple (1, windowDuration, samplingFrequency);
        autoSound window = Sound_createGaussian (windowDuration, samplingFrequency);
        double qmin, qmax, dq, q1;
        long nq;
        {
            // Laziness: find out the proper quefrency dimensions by transforming one (still silent) frame.
            autoSpectrum spec = Sound_to_Spectrum (sframe.peek(), 1);
            autoCepstrum cepstrum = Spectrum_to_Cepstrum (spec.peek());
            qmin = cepstrum -> xmin;
            qmax = cepstrum -> xmax;
            dq = cepstrum -> dx;
            q1 = cepstrum -> x1;
            nq = cepstrum -> nx;
        }
        autoCepstrogram thee = Cepstrogram_create (my xmin, my xmax, nFrames, dt, t1, qmin, qmax, nq, dq, q1);
        autoMelderProgress progress (L"Cepstrogram analysis");
        for (long iframe = 1; iframe <= nFrames; iframe ++) {
            double t = Sampled_indexToX (thee.peek(), iframe);
            Sound_into_Sound (sound.peek(), sframe.peek(), t - windowDuration / 2);
            Vector_subtractMean (sframe.peek());
            Sounds_multiply (sframe.peek(), window.peek());
            autoSpectrum spec = Sound_to_Spectrum (sframe.peek(), 1);
            autoCepstrum cepstrum = Spectrum_to_Cepstrum (spec.peek());
            for (long i = 1; i <= nq; i ++) {
                thy z [i] [iframe] = cepstrum -> z [1] [i];
            }
            if ((iframe % 10) == 1) {
                Melder_progress ((double) iframe / nFrames, L"Cepstrogram analysis of frame ",
                    Melder_integer (iframe), L" out of ", Melder_integer (nFrames), L".");
            }
        }
        return thee.transfer();
    } catch (MelderError) {
        Melder_throw (me, ": no Cepstrogram created.");
    }
}
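/*
    Hypothetical usage sketch (the variable `sound` is assumed to be a Sound created
    elsewhere): a cepstrogram with a 40-ms analysis width, a 2-ms time step, and
    analysis frequencies up to 5 kHz.
*/
autoCepstrogram cepstrogram = Sound_to_Cepstrogram (sound, 0.04, 0.002, 5000.0);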
static autoSound synthesize_pulses_lpc (Manipulation me) {
    try {
        if (! my lpc) {
            if (! my sound)
                Melder_throw (U"Missing original sound.");
            autoSound sound10k = Sound_resample (my sound.get(), 10000.0, 50);
            my lpc = Sound_to_LPC_burg (sound10k.get(), 20, 0.025, 0.01, 50.0);
        }
        if (! my pulses)
            Melder_throw (U"Missing pulses analysis.");
        autoSound train = PointProcess_to_Sound_pulseTrain (my pulses.get(), 1.0 / my lpc -> samplingPeriod, 0.7, 0.05, 30);
        train -> dx = my lpc -> samplingPeriod;   // to be exact
        Sound_PointProcess_fillVoiceless (train.get(), my pulses.get());
        autoSound result = LPC_and_Sound_filter (my lpc.get(), train.get(), true);
        NUMdeemphasize_f (result -> z [1], result -> nx, result -> dx, 50.0);
        Vector_scale (result.get(), 0.99);
        return result;
    } catch (MelderError) {
        Melder_throw (me, U": LPC synthesis not performed.");
    }
}
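/*
    Sketch (an assumption about the standard technique, not the Praat implementation
    of NUMdeemphasize_f): first-order de-emphasis, the inverse of the pre-emphasis
    filter x[n] -= a * x[n-1] with a = exp (-2 * pi * frequency * dx). It restores the
    spectral tilt that pre-emphasis removed before the LPC analysis. Praat's 1-based
    array convention is used, so x points at element 0 of an array with valid
    indices 1..n. The helper name is hypothetical.
*/
#include <cmath>

static void deemphasize (double *x, long n, double dx, double frequencyHz) {
    const double pi = 3.141592653589793;
    double a = std::exp (-2.0 * pi * frequencyHz * dx);
    for (long i = 2; i <= n; i ++)
        x [i] += a * x [i - 1];   // one-pole IIR filter, run forward through the signal
}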
Formant Sound_to_Formant_robust (Sound me, double dt_in, int numberOfPoles, double maximumFrequency,
    double halfdt_window, double preEmphasisFrequency, double safetyMargin, double k,
    int itermax, double tol, int wantlocation)
{
    double dt = dt_in > 0.0 ? dt_in : halfdt_window / 4.0;
    double nyquist = 0.5 / my dx;
    long predictionOrder = 2 * numberOfPoles;   // despite its name, the parameter apparently counts formants: each formant needs a complex-conjugate pole pair
    try {
        autoSound sound;
        if (maximumFrequency <= 0.0 || fabs (maximumFrequency / nyquist - 1.0) < 1.0e-12) {
            sound.reset (Data_copy (me));   // will be modified
        } else {
            sound.reset (Sound_resample (me, maximumFrequency * 2.0, 50));
        }
        autoLPC lpc = Sound_to_LPC_auto (sound.peek(), predictionOrder, halfdt_window, dt, preEmphasisFrequency);
        autoLPC lpcr = LPC_and_Sound_to_LPC_robust (lpc.peek(), sound.peek(), halfdt_window, preEmphasisFrequency, k, itermax, tol, wantlocation);
        autoFormant thee = LPC_to_Formant (lpcr.peek(), safetyMargin);
        return thee.transfer();
    } catch (MelderError) {
        Melder_throw (me, ": no robust Formant created.");
    }
}
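/*
    Hypothetical usage sketch (the variable `sound` is assumed): robust formant
    analysis with 5 pole pairs, a 5500-Hz ceiling, a 25-ms half window length,
    50-Hz pre-emphasis, a 50-Hz safety margin, Huber tuning constant k = 1.5,
    at most 50 reweighting iterations, and location estimation enabled. Passing
    dt_in = 0.0 lets the routine pick halfdt_window / 4 as the time step.
*/
autoFormant formant (Sound_to_Formant_robust (sound, 0.0, 5, 5500.0, 0.025, 50.0, 50.0, 1.5, 50, 1e-6, 1));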
/*
    Gain is used as a constant amplitude multiplier within a frame of duration my dx.
    Future alternative: convolve gain with a smoother.
*/
autoSound LPC_and_Sound_filter (LPC me, Sound thee, int useGain) {
    try {
        double xmin = my xmin > thy xmin ? my xmin : thy xmin;
        double xmax = my xmax < thy xmax ? my xmax : thy xmax;
        if (xmin >= xmax) {
            Melder_throw (U"Domains of Sound [", thy xmin, U",", thy xmax, U"] and LPC [",
                my xmin, U",", my xmax, U"] do not overlap.");
        }
        // Resample the sound if the sampling frequencies do not match.
        autoSound source;
        if (my samplingPeriod != thy dx) {
            source = Sound_resample (thee, 1.0 / my samplingPeriod, 50);
            thee = source.get();   // reference copy; no ownership transferred
        }
        autoSound him = Data_copy (thee);
        double *x = his z [1];
        long ifirst = Sampled_xToHighIndex (thee, xmin);
        long ilast = Sampled_xToLowIndex (thee, xmax);
        for (long i = ifirst; i <= ilast; i ++) {
            double t = his x1 + (i - 1) * his dx;   // Sampled_indexToX (him, i)
            long iFrame = lround ((t - my x1) / my dx + 1.0);   // Sampled_xToNearestIndex (me, t)
            if (iFrame < 1) {
                continue;
            }
            if (iFrame > my nx) {
                break;
            }
            double *a = my d_frames [iFrame]. a;
            long m = i > my d_frames [iFrame]. nCoefficients ? my d_frames [iFrame]. nCoefficients : i - 1;
            for (long j = 1; j <= m; j ++) {
                x [i] -= a [j] * x [i - j];   // all-pole synthesis filtering
            }
        }
        // Make the samples before the first frame and after the last frame zero.
        for (long i = 1; i < ifirst; i ++) {
            x [i] = 0.0;
        }
        for (long i = ilast + 1; i <= his nx; i ++) {
            x [i] = 0.0;
        }
        if (useGain) {
            for (long i = ifirst; i <= ilast; i ++) {
                double t = his x1 + (i - 1) * his dx;   // Sampled_indexToX (him, i)
                double riFrame = (t - my x1) / my dx + 1;   // Sampled_xToIndex (me, t)
                long iFrame = (long) floor (riFrame);
                double phase = riFrame - iFrame;
                if (iFrame < 0 || iFrame > my nx) {
                    x [i] = 0.0;
                } else if (iFrame == 0) {
                    x [i] *= sqrt (my d_frames [1]. gain) * phase;
                } else if (iFrame == my nx) {
                    x [i] *= sqrt (my d_frames [my nx]. gain) * (1.0 - phase);
                } else {
                    // Linear interpolation of the square root of the gain between adjacent frames.
                    x [i] *= phase * sqrt (my d_frames [iFrame + 1]. gain) + (1.0 - phase) * sqrt (my d_frames [iFrame]. gain);
                }
            }
        }
        return him;
    } catch (MelderError) {
        Melder_throw (thee, U": not filtered.");
    }
}
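/*
    Sketch (illustrative, not part of the sources): the in-place all-pole recursion
    from the filtering loop above, for a single frame with LPC coefficients a[1..m].
    Filtering the excitation e through 1 / A(z) gives s[n] = e[n] - sum_{j=1..m} a[j] s[n-j].
    Praat's 1-based array convention is used, so x points at element 0 of an array
    with valid indices 1..n. The helper name is hypothetical.
*/
static void filterAllPoleInPlace (double *x, long n, const double *a, long m) {
    for (long i = 1; i <= n; i ++) {
        long mi = i - 1 < m ? i - 1 : m;   // near the start we can only look back i - 1 samples
        for (long j = 1; j <= mi; j ++)
            x [i] -= a [j] * x [i - j];
    }
}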
PowerCepstrogram Sound_to_PowerCepstrogram_hillenbrand (Sound me, double minimumPitch, double dt) {
    try {
        // The minimum analysis window contains 3 periods of the lowest pitch.
        double analysisWidth = 3.0 / minimumPitch;
        if (analysisWidth > my dx * my nx) {
            analysisWidth = my dx * my nx;
        }
        double t1, samplingFrequency = 1.0 / my dx;
        autoSound thee;
        if (samplingFrequency > 30000) {
            samplingFrequency = samplingFrequency / 2;
            thee.reset (Sound_resample (me, samplingFrequency, 1));
        } else {
            thee.reset (Data_copy (me));
        }
        // Pre-emphasis with fixed coefficient 0.9.
        for (long i = thy nx; i > 1; i --) {
            thy z [1] [i] -= 0.9 * thy z [1] [i - 1];
        }
        long nosInWindow = analysisWidth * samplingFrequency, nFrames;
        if (nosInWindow < 8) {
            Melder_throw ("Analysis window too short.");
        }
        Sampled_shortTermAnalysis (thee.peek(), analysisWidth, dt, & nFrames, & t1);
        autoNUMvector<double> hamming (1, nosInWindow);
        for (long i = 1; i <= nosInWindow; i ++) {
            hamming [i] = 0.54 - 0.46 * cos (2 * NUMpi * (i - 1) / (nosInWindow - 1));
        }
        long nfft = 8;   // minimum possible
        while (nfft < nosInWindow) {
            nfft *= 2;
        }
        long nfftdiv2 = nfft / 2;
        autoNUMvector<double> fftbuf (1, nfft);   // "complex" array
        autoNUMvector<double> spectrum (1, nfftdiv2 + 1);   // +1 needed
        autoNUMfft_Table fftTable;
        NUMfft_Table_init (& fftTable, nfft);   // sound to spectrum
        double qmax = 0.5 * nfft / samplingFrequency, dq = qmax / (nfftdiv2 + 1);
        autoPowerCepstrogram him = PowerCepstrogram_create (my xmin, my xmax, nFrames, dt, t1, 0, qmax, nfftdiv2 + 1, dq, 0);
        autoMelderProgress progress (L"Cepstrogram analysis");
        for (long iframe = 1; iframe <= nFrames; iframe ++) {
            double tbegin = t1 + (iframe - 1) * dt - analysisWidth / 2;
            tbegin = tbegin < thy xmin ? thy xmin : tbegin;
            long istart = Sampled_xToIndex (thee.peek(), tbegin);
            istart = istart < 1 ? 1 : istart;
            long iend = istart + nosInWindow - 1;
            iend = iend > thy nx ? thy nx : iend;
            for (long i = 1; i <= nosInWindow; i ++) {
                fftbuf [i] = thy z [1] [istart + i - 1] * hamming [i];
            }
            for (long i = nosInWindow + 1; i <= nfft; i ++) {
                fftbuf [i] = 0;
            }
            NUMfft_forward (& fftTable, fftbuf.peek());
            complexfftoutput_to_power (fftbuf.peek(), nfft, spectrum.peek(), true);   // log10 (|fft|^2)
            // Subtract the average of the log power spectrum.
            double specmean = spectrum [1];
            for (long i = 2; i <= nfftdiv2 + 1; i ++) {
                specmean += spectrum [i];
            }
            specmean /= nfftdiv2 + 1;
            for (long i = 1; i <= nfftdiv2 + 1; i ++) {
                spectrum [i] -= specmean;
            }
            /*
                Here we diverge from Hillenbrand, who takes the fft of half of the spectral values.
                He forgets that the actual spectrum has nfft/2 + 1 values. Therefore, we take the
                inverse transform, because this keeps the number of samples a power of 2.
                At the same time this results in twice as many numbers in the quefrency domain,
                i.e. we end up with nfft/2 + 1 numbers, while Hillenbrand has only nfft/4!
            */
            fftbuf [1] = spectrum [1];
            for (long i = 2; i < nfftdiv2 + 1; i ++) {
                fftbuf [i + i - 2] = spectrum [i];
                fftbuf [i + i - 1] = 0;
            }
            fftbuf [nfft] = spectrum [nfftdiv2 + 1];
            NUMfft_backward (& fftTable, fftbuf.peek());
            for (long i = 1; i <= nfftdiv2 + 1; i ++) {
                his z [i] [iframe] = fftbuf [i] * fftbuf [i];
            }
            if ((iframe % 10) == 1) {
                Melder_progress ((double) iframe / nFrames, L"Cepstrogram analysis of frame ",
                    Melder_integer (iframe), L" out of ", Melder_integer (nFrames), L".");
            }
        }
        return him.transfer();
    } catch (MelderError) {
        Melder_throw (me, ": no PowerCepstrogram created.");
    }
}
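/*
    Sketch (illustrative, not from the sources): the fixed-coefficient pre-emphasis
    and the Hamming window from the routine above, as standalone 0-based helpers.
    The function names are hypothetical.
*/
#include <cmath>
#include <vector>

static void preEmphasis09 (std::vector<double> & x) {
    // Run backwards so that every sample is differenced against its unmodified predecessor.
    for (long i = (long) x.size () - 1; i >= 1; i --)
        x [i] -= 0.9 * x [i - 1];
}

static std::vector<double> hammingWindow (long n) {
    const double pi = 3.141592653589793;
    std::vector<double> w (n);
    for (long i = 0; i < n; i ++)
        w [i] = 0.54 - 0.46 * std::cos (2.0 * pi * i / (n - 1));   // w[0] = w[n-1] = 0.08
    return w;
}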
autoSound SpeechSynthesizer_to_Sound (SpeechSynthesizer me, const char32 *text, autoTextGrid *tg, autoTable *events) {
    try {
        int fsamp = espeak_Initialize (AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr,   // 5000 ms
            espeakINITIALIZE_PHONEME_EVENTS | espeakINITIALIZE_PHONEME_IPA);
        if (fsamp == -1) {
            Melder_throw (U"Internal espeak error.");
        }
        int synth_flags = espeakCHARS_WCHAR;
        if (my d_inputTextFormat == SpeechSynthesizer_INPUT_TAGGEDTEXT) {
            synth_flags |= espeakSSML;
        }
        if (my d_inputTextFormat != SpeechSynthesizer_INPUT_TEXTONLY) {
            synth_flags |= espeakPHONEMES;
        }
        option_phoneme_events = espeakINITIALIZE_PHONEME_EVENTS;   // extern int option_phoneme_events;
        if (my d_outputPhonemeCoding == SpeechSynthesizer_PHONEMECODINGS_IPA) {
            option_phoneme_events |= espeakINITIALIZE_PHONEME_IPA;
        }
        espeak_SetParameter (espeakRATE, my d_wordsPerMinute, 0);
        espeak_SetParameter (espeakPITCH, my d_pitchAdjustment, 0);
        espeak_SetParameter (espeakRANGE, my d_pitchRange, 0);
        const char32 *voiceLanguageCode = SpeechSynthesizer_getVoiceLanguageCodeFromName (me, my d_voiceLanguageName);
        const char32 *voiceVariantCode = SpeechSynthesizer_getVoiceVariantCodeFromName (me, my d_voiceVariantName);
        espeakdata_SetVoiceByName ((const char *) Melder_peek32to8 (voiceLanguageCode),
            (const char *) Melder_peek32to8 (voiceVariantCode));
        espeak_SetParameter (espeakWORDGAP, my d_wordgap * 100, 0);   // espeak wordgap is in units of 10 ms
        espeak_SetParameter (espeakCAPITALS, 0, 0);
        espeak_SetParameter (espeakPUNCTUATION, espeakPUNCT_NONE, 0);
        espeak_SetSynthCallback (synthCallback);
        my d_events = Table_createWithColumnNames (0, U"time type type-t t-pos length a-pos sample id uniq");
        #ifdef _WIN32
            wchar_t *textW = Melder_peek32toW (text);
            espeak_Synth (textW, wcslen (textW) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
        #else
            espeak_Synth (text, str32len (text) + 1, 0, POS_CHARACTER, 0, synth_flags, nullptr, me);
        #endif
        espeak_Terminate ();
        autoSound thee = buffer_to_Sound (my d_wav, my d_numberOfSamples, my d_internalSamplingFrequency);
        if (my d_samplingFrequency != my d_internalSamplingFrequency) {
            thee = Sound_resample (thee.get(), my d_samplingFrequency, 50);
        }
        my d_numberOfSamples = 0;   // re-use the wav buffer
        if (tg) {
            double xmin = Table_getNumericValue_Assert (my d_events.get(), 1, 1);
            if (xmin > thy xmin) {
                xmin = thy xmin;
            }
            double xmax = Table_getNumericValue_Assert (my d_events.get(), my d_events -> rows.size, 1);
            if (xmax < thy xmax) {
                xmax = thy xmax;
            }
            autoTextGrid tg1 = Table_to_TextGrid (my d_events.get(), text, xmin, xmax);
            *tg = TextGrid_extractPart (tg1.get(), thy xmin, thy xmax, 0);
        }
        if (events) {
            Table_setEventTypeString (my d_events.get());
            *events = my d_events.move();
        }
        my d_events.reset();
        return thee;
    } catch (MelderError) {
        espeak_Terminate ();
        Melder_throw (U"Text not synthesized.");
    }
}
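/*
    Hypothetical usage sketch (the variable `synthesizer` is assumed to be a
    SpeechSynthesizer created elsewhere): synthesize a sentence and also collect
    the alignment TextGrid and the raw event Table, using only the signature of
    SpeechSynthesizer_to_Sound shown above.
*/
autoTextGrid alignment;
autoTable synthesisEvents;
autoSound speech = SpeechSynthesizer_to_Sound (synthesizer, U"This is a test.", & alignment, & synthesisEvents);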
Pitch Sound_to_Pitch_shs (Sound me, double timeStep, double minimumPitch, double maximumFrequency,
    double ceiling, long maxnSubharmonics, long maxnCandidates, double compressionFactor, long nPointsPerOctave)
{
    try {
        double firstTime, newSamplingFrequency = 2 * maximumFrequency;
        double windowDuration = 2 / minimumPitch, halfWindow = windowDuration / 2;
        double atans = nPointsPerOctave * NUMlog2 (65.0 / 50.0) - 1;
        // Number of speech samples in the downsampled signal in each frame:
        // 100 for windowDuration == 0.04 and newSamplingFrequency == 2500.
        long nx = lround (windowDuration * newSamplingFrequency);
        // The minimum number of points for the fft is 256.
        long nfft = 1;
        while ((nfft *= 2) < nx || nfft <= 128) {
            ;
        }
        long nfft2 = nfft / 2 + 1;
        double frameDuration = nfft / newSamplingFrequency;
        double df = newSamplingFrequency / nfft;
        // The number of points on the octave scale.
        double fminl2 = NUMlog2 (minimumPitch), fmaxl2 = NUMlog2 (maximumFrequency);
        long nFrequencyPoints = (long) floor ((fmaxl2 - fminl2) * nPointsPerOctave);
        double dfl2 = (fmaxl2 - fminl2) / (nFrequencyPoints - 1);
        autoSound sound = Sound_resample (me, newSamplingFrequency, 50);
        long numberOfFrames;
        Sampled_shortTermAnalysis (sound.peek(), windowDuration, timeStep, & numberOfFrames, & firstTime);
        autoSound frame = Sound_createSimple (1, frameDuration, newSamplingFrequency);
        autoSound hamming = Sound_createHamming (nx / newSamplingFrequency, newSamplingFrequency);
        autoPitch thee = Pitch_create (my xmin, my xmax, numberOfFrames, timeStep, firstTime, ceiling, maxnCandidates);
        autoNUMvector<double> cc (1, numberOfFrames);
        autoNUMvector<double> specAmp (1, nfft2);
        autoNUMvector<double> fl2 (1, nfft2);
        autoNUMvector<double> yv2 (1, nfft2);
        autoNUMvector<double> arctg (1, nFrequencyPoints);
        autoNUMvector<double> al2 (1, nFrequencyPoints);
        Melder_assert (frame -> nx >= nx);
        Melder_assert (hamming -> nx == nx);
        // Compute the absolute value of the globally largest amplitude w.r.t. the global mean.
        double globalMean, globalPeak;
        Sound_localMean (sound.peek(), sound -> xmin, sound -> xmax, & globalMean);
        Sound_localPeak (sound.peek(), sound -> xmin, sound -> xmax, globalMean, & globalPeak);
        /*
            For the cubic spline interpolation we need the frequencies on an octave scale,
            i.e., a log2 scale. All frequencies must be DIFFERENT, otherwise the cubic spline
            interpolation will give corrupt results. Because log2 (f == 0) is not defined,
            we use the heuristic: f[2] - f[1] == f[3] - f[2].
        */
        for (long i = 2; i <= nfft2; i ++) {
            fl2 [i] = NUMlog2 ((i - 1) * df);
        }
        fl2 [1] = 2 * fl2 [2] - fl2 [3];
        // Calculate frequencies regularly spaced on a log2 scale, and the frequency weighting function.
        for (long i = 1; i <= nFrequencyPoints; i ++) {
            arctg [i] = 0.5 + atan (3 * (i - atans) / nPointsPerOctave) / NUMpi;
        }
        // Perform the analysis on all frames.
        for (long i = 1; i <= numberOfFrames; i ++) {
            Pitch_Frame pitchFrame = & thy frame [i];
            double hm = 1, f0, pitch_strength, localMean, localPeak;
            double tmid = Sampled_indexToX (thee.peek(), i);   // the centre of this frame
            long nx_tmp = frame -> nx;
            // Copy a frame from the sound, apply a Hamming window, and get the local 'intensity'.
            frame -> nx = nx;   // begin ugly hack: temporarily pretend the frame has only nx samples
            Sound_into_Sound (sound.peek(), frame.peek(), tmid - halfWindow);
            Sounds_multiply (frame.peek(), hamming.peek());
            Sound_localMean (sound.peek(), tmid - 3 * halfWindow, tmid + 3 * halfWindow, & localMean);
            Sound_localPeak (sound.peek(), tmid - halfWindow, tmid + halfWindow, localMean, & localPeak);
            pitchFrame -> intensity = localPeak > globalPeak ? 1 : localPeak / globalPeak;
            frame -> nx = nx_tmp;   // end of ugly hack
            // Get the Fourier spectrum.
            autoSpectrum spec = Sound_to_Spectrum (frame.peek(), 1);
            Melder_assert (spec -> nx == nfft2);
            // From complex spectrum to amplitude spectrum.
            for (long j = 1; j <= nfft2; j ++) {
                double rs = spec -> z [1] [j], is = spec -> z [2] [j];
                specAmp [j] = sqrt (rs * rs + is * is);
            }
            // Enhance the peaks in the spectrum.
            spec_enhance_SHS (specAmp.peek(), nfft2);
            // Smooth the enhanced spectrum.
            spec_smoooth_SHS (specAmp.peek(), nfft2);
            // Go to a logarithmic scale and perform cubic spline interpolation to get
            // spectral values for the increased number of frequency points.
            NUMspline (fl2.peek(), specAmp.peek(), nfft2, 1e30, 1e30, yv2.peek());
            for (long j = 1; j <= nFrequencyPoints; j ++) {
                double f = fminl2 + (j - 1) * dfl2;
                NUMsplint (fl2.peek(), specAmp.peek(), yv2.peek(), nfft2, f, & al2 [j]);
            }
            // Multiply by the frequency selectivity of the auditory system.
            for (long j = 1; j <= nFrequencyPoints; j ++) {
                al2 [j] = al2 [j] > 0 ? al2 [j] * arctg [j] : 0;
            }
            // The subharmonic summation. Shift spectra in octaves and sum.
            Pitch_Frame_init (pitchFrame, maxnCandidates);
            autoNUMvector<double> sumspec (1, nFrequencyPoints);
            pitchFrame -> nCandidates = 0;   /* !!!!! */
            for (long m = 1; m <= maxnSubharmonics + 1; m ++) {
                long kb = 1 + (long) floor (nPointsPerOctave * NUMlog2 (m));
                for (long k = kb; k <= nFrequencyPoints; k ++) {
                    sumspec [k - kb + 1] += al2 [k] * hm;
                }
                hm *= compressionFactor;
            }
            // First register the voiceless candidate (always present).
            Pitch_Frame_addPitch (pitchFrame, 0, 0, maxnCandidates);
            /*
                Get the best local estimates for the pitch as the maxima of the subharmonic
                sum spectrum, by parabolic interpolation on three points.
                The formula for a parabola with a maximum is: y(x) = a - b (x - c)^2, with a, b, c >= 0.
                The three points are (-x, y1), (0, y2) and (x, y3).
                The solution for a (the maximum) and c (the position) is:
                    a = (2 y1 (4 y2 + y3) - y1^2 - (y3 - 4 y2)^2) / (8 (y1 - 2 y2 + y3))
                    c = dx (y1 - y3) / (2 (y1 - 2 y2 + y3))
                    (b = (2 y2 - y1 - y3) / (2 dx^2))
            */
            for (long k = 2; k <= nFrequencyPoints - 1; k ++) {
                double y1 = sumspec [k - 1], y2 = sumspec [k], y3 = sumspec [k + 1];
                if (y2 > y1 && y2 >= y3) {
                    double denum = y1 - 2 * y2 + y3, tmp = y3 - 4 * y2;
                    double x = dfl2 * (y1 - y3) / (2 * denum);
                    double f = pow (2, fminl2 + (k - 1) * dfl2 + x);
                    double strength = (2 * y1 * (4 * y2 + y3) - y1 * y1 - tmp * tmp) / (8 * denum);
                    Pitch_Frame_addPitch (pitchFrame, f, strength, maxnCandidates);
                }
            }
            /*
                Check whether f0 corresponds to an actual periodicity T = 1 / f0: correlate two
                signal periods of duration T, one starting at the middle of the interval and one
                starting T seconds before. If there is periodicity, the correlation coefficient
                should be high. However, some sounds do not show any regularity, or show very low
                frequency and regularity, and nevertheless have a definite pitch, e.g. Shepard sounds.
            */
            Pitch_Frame_getPitch (pitchFrame, & f0, & pitch_strength);
            if (f0 > 0) {
                cc [i] = Sound_correlateParts (sound.peek(), tmid - 1.0 / f0, tmid, 1.0 / f0);
            }
        }
        // Base the voiced/unvoiced decision on the correlation coefficients,
        // and resize the pitch strengths w.r.t. the cc.
        double vuvCriterium = 0.52;
        for (long i = 1; i <= numberOfFrames; i ++) {
            Pitch_Frame_resizeStrengths (& thy frame [i], cc [i], vuvCriterium);
        }
        return thee.transfer();
    } catch (MelderError) {
        Melder_throw (me, U": no Pitch (shs) created.");
    }
}
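/*
    Sketch (illustrative, not part of the sources): the three-point parabolic peak
    interpolation from the comment above, as a standalone helper. Given samples
    (-dx, y1), (0, y2), (dx, y3) with y2 a local maximum, the fitted parabola
    y(x) = a - b (x - c)^2 has height a and peak offset c relative to the centre
    point. The helper name is hypothetical.
*/
static void parabolicPeak (double y1, double y2, double y3, double dx, double *height, double *offset) {
    double denom = y1 - 2.0 * y2 + y3;   // negative at a genuine maximum
    double tmp = y3 - 4.0 * y2;
    *offset = dx * (y1 - y3) / (2.0 * denom);
    *height = (2.0 * y1 * (4.0 * y2 + y3) - y1 * y1 - tmp * tmp) / (8.0 * denom);
}
/*
    Example: y1 = 0, y2 = 1, y3 = 0 gives offset 0 and height 1, because the three
    points already lie symmetrically around the maximum.
*/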