autoPitch SPINET_to_Pitch (SPINET me, double harmonicFallOffSlope, double ceiling, int maxnCandidates) {
	try {
		long nPointsPerOctave = 48;
		double fmin = NUMerbToHertz (Sampled2_rowToY (me, 1));
		double fmax = NUMerbToHertz (Sampled2_rowToY (me, my ny));
		double fminl2 = NUMlog2 (fmin), fmaxl2 = NUMlog2 (fmax);
		double points = (fmaxl2 - fminl2) * nPointsPerOctave;
		double dfl2 = (fmaxl2 - fminl2) / (points - 1);
		long nFrequencyPoints = (long) floor (points);
		long maxHarmonic = (long) floor (fmax / fmin);
		double maxStrength = 0.0, unvoicedCriterium = 0.45, maxPower = 0.0;

		if (nFrequencyPoints < 2) {
			Melder_throw (U"Frequency range too small.");
		}
		if (ceiling <= fmin) {
			Melder_throw (U"Ceiling is smaller than centre frequency of lowest filter.");
		}

		autoPitch thee = Pitch_create (my xmin, my xmax, my nx, my dx, my x1, ceiling, maxnCandidates);
		autoNUMvector<double> power (1, my nx);
		autoNUMvector<double> pitch (1, nFrequencyPoints);
		autoNUMvector<double> sumspec (1, nFrequencyPoints);
		autoNUMvector<double> y (1, my ny);
		autoNUMvector<double> yv2 (1, my ny);
		autoNUMvector<double> fl2 (1, my ny);

		// From ERB's to log (f)

		for (long i = 1; i <= my ny; i++) {
			double f = NUMerbToHertz (my y1 + (i - 1) * my dy);
			fl2[i] = NUMlog2 (f);
		}

		// Determine global maximum power in frame

		for (long j = 1; j <= my nx; j++) {
			double p = 0.0;
			for (long i = 1; i <= my ny; i++) {
				p += my s[i][j];
			}
			if (p > maxPower) {
				maxPower = p;
			}
			power[j] = p;
		}
		if (maxPower == 0.0) {
			Melder_throw (U"No power");
		}

		for (long j = 1; j <= my nx; j++) {
			Pitch_Frame pitchFrame = &thy frame[j];
			pitchFrame -> intensity = power[j] / maxPower;
			for (long i = 1; i <= my ny; i++) {
				y[i] = my s[i][j];
			}
			NUMspline (fl2.peek(), y.peek(), my ny, 1e30, 1e30, yv2.peek());
			for (long k = 1; k <= nFrequencyPoints; k++) {
				double f = fminl2 + (k - 1) * dfl2;
				NUMsplint (fl2.peek(), y.peek(), yv2.peek(), my ny, f, & pitch[k]);
				sumspec[k] = 0.0;
			}

			// Formula (8): weighted harmonic summation.

			for (long m = 1; m <= maxHarmonic; m++) {
				double hm = 1 - harmonicFallOffSlope * NUMlog2 (m);
				long kb = 1 + (long) floor (nPointsPerOctave * NUMlog2 (m));
				for (long k = kb; k <= nFrequencyPoints; k++) {
					if (pitch[k] > 0.0) {
						sumspec[k - kb + 1] += pitch[k] * hm;
					}
				}
			}

			// into Pitch object

			Pitch_Frame_init (pitchFrame, maxnCandidates);
			pitchFrame -> nCandidates = 0; /* !!!!! */
			Pitch_Frame_addPitch (pitchFrame, 0, 0, maxnCandidates); /* unvoiced */

			for (long k = 2; k <= nFrequencyPoints - 1; k++) {
				double y1 = sumspec[k - 1], y2 = sumspec[k], y3 = sumspec[k + 1];
				if (y2 > y1 && y2 >= y3) {
					double denum = y1 - 2.0 * y2 + y3, tmp = y3 - 4.0 * y2;
					double x = dfl2 * (y1 - y3) / (2 * denum);
					double f = pow (2.0, fminl2 + (k - 1) * dfl2 + x);
					double strength = (2.0 * y1 * (4.0 * y2 + y3) - y1 * y1 - tmp * tmp) / (8.0 * denum);
					if (strength > maxStrength) {
						maxStrength = strength;
					}
					Pitch_Frame_addPitch (pitchFrame, f, strength, maxnCandidates);
				}
			}
		}

		// Scale the pitch strengths

		for (long j = 1; j <= my nx; j++) {
			double f0, localStrength;
			Pitch_Frame_getPitch (&thy frame[j], &f0, &localStrength);
			Pitch_Frame_resizeStrengths (&thy frame[j], localStrength / maxStrength, unvoicedCriterium);
		}
		return thee;
	} catch (MelderError) {
		Melder_throw (me, U": no Pitch created.");
	}
}
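/*
	Illustrative sketch (not part of the original source): the candidate search above locates local
	maxima of the summed spectrum and refines them by fitting a parabola y(x) = a - b (x - c)^2
	through three equally spaced points (-dx, y1), (0, y2), (dx, y3). The hypothetical helper below
	shows that interpolation in isolation; its name and signature are assumptions made for clarity.
	In SPINET_to_Pitch the resulting offset is added on the log2 frequency grid and converted back
	to Hertz with pow (2.0, ...), so a candidate may fall between grid points.
*/
static void example_parabolicPeakInterpolation (double y1, double y2, double y3, double dx,
	double *peakOffset, double *peakHeight)
{
	// Assumes y2 is a local maximum (y2 > y1 && y2 >= y3), so the denominator is negative and nonzero.
	double denum = y1 - 2.0 * y2 + y3;
	double tmp = y3 - 4.0 * y2;
	*peakOffset = dx * (y1 - y3) / (2.0 * denum);   // c: offset of the maximum from the middle point
	*peakHeight = (2.0 * y1 * (4.0 * y2 + y3) - y1 * y1 - tmp * tmp) / (8.0 * denum);   // a: interpolated maximum
}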
Pitch Sound_to_Pitch_shs (Sound me, double timeStep, double minimumPitch, double maximumFrequency, double ceiling, long maxnSubharmonics, long maxnCandidates, double compressionFactor, long nPointsPerOctave) {
	try {
		double firstTime, newSamplingFrequency = 2 * maximumFrequency;
		double windowDuration = 2 / minimumPitch, halfWindow = windowDuration / 2;
		double atans = nPointsPerOctave * NUMlog2 (65.0 / 50.0) - 1;

		// Number of speech samples in the downsampled signal in each frame:
		// 100 for windowDuration == 0.04 and newSamplingFrequency == 2500

		long nx = lround (windowDuration * newSamplingFrequency);

		// The minimum number of points for the fft is 256.

		long nfft = 1;
		while ( (nfft *= 2) < nx || nfft <= 128) {
			;
		}
		long nfft2 = nfft / 2 + 1;
		double frameDuration = nfft / newSamplingFrequency;
		double df = newSamplingFrequency / nfft;

		// The number of points on the octave scale

		double fminl2 = NUMlog2 (minimumPitch), fmaxl2 = NUMlog2 (maximumFrequency);
		long nFrequencyPoints = (long) floor ((fmaxl2 - fminl2) * nPointsPerOctave);
		double dfl2 = (fmaxl2 - fminl2) / (nFrequencyPoints - 1);

		autoSound sound = Sound_resample (me, newSamplingFrequency, 50);
		long numberOfFrames;
		Sampled_shortTermAnalysis (sound.peek(), windowDuration, timeStep, &numberOfFrames, &firstTime);
		autoSound frame = Sound_createSimple (1, frameDuration, newSamplingFrequency);
		autoSound hamming = Sound_createHamming (nx / newSamplingFrequency, newSamplingFrequency);
		autoPitch thee = Pitch_create (my xmin, my xmax, numberOfFrames, timeStep, firstTime, ceiling, maxnCandidates);
		autoNUMvector<double> cc (1, numberOfFrames);
		autoNUMvector<double> specAmp (1, nfft2);
		autoNUMvector<double> fl2 (1, nfft2);
		autoNUMvector<double> yv2 (1, nfft2);
		autoNUMvector<double> arctg (1, nFrequencyPoints);
		autoNUMvector<double> al2 (1, nFrequencyPoints);

		Melder_assert (frame->nx >= nx);
		Melder_assert (hamming->nx == nx);

		// Compute the absolute value of the globally largest amplitude w.r.t. the global mean.

		double globalMean, globalPeak;
		Sound_localMean (sound.peek(), sound -> xmin, sound -> xmax, &globalMean);
		Sound_localPeak (sound.peek(), sound -> xmin, sound -> xmax, globalMean, &globalPeak);

		/*
			For the cubic spline interpolation we need the frequencies on an octave scale,
			i.e., a log2 scale. All frequencies must be DIFFERENT, otherwise the cubic spline
			interpolation will give corrupt results.
			Because log2 (f == 0) is not defined, we use the heuristic: f[2] - f[1] == f[3] - f[2].
		*/

		for (long i = 2; i <= nfft2; i++) {
			fl2[i] = NUMlog2 ( (i - 1) * df);
		}
		fl2[1] = 2 * fl2[2] - fl2[3];

		// Calculate frequencies regularly spaced on a log2-scale and
		// the frequency weighting function.

		for (long i = 1; i <= nFrequencyPoints; i++) {
			arctg[i] = 0.5 + atan (3 * (i - atans) / nPointsPerOctave) / NUMpi;
		}

		// Perform the analysis on all frames.

		for (long i = 1; i <= numberOfFrames; i++) {
			Pitch_Frame pitchFrame = &thy frame[i];
			double hm = 1, f0, pitch_strength, localMean, localPeak;
			double tmid = Sampled_indexToX (thee.peek(), i);   /* The center of this frame */
			long nx_tmp = frame -> nx;

			// Copy a frame from the sound, apply a Hamming window. Get local 'intensity'.

			frame -> nx = nx;   /* begin ugly hack */
			Sound_into_Sound (sound.peek(), frame.peek(), tmid - halfWindow);
			Sounds_multiply (frame.peek(), hamming.peek());
			Sound_localMean (sound.peek(), tmid - 3 * halfWindow, tmid + 3 * halfWindow, &localMean);
			Sound_localPeak (sound.peek(), tmid - halfWindow, tmid + halfWindow, localMean, &localPeak);
			pitchFrame -> intensity = localPeak > globalPeak ? 1 : localPeak / globalPeak;
			frame -> nx = nx_tmp;   /* end ugly hack */

			// Get the Fourier spectrum.

			autoSpectrum spec = Sound_to_Spectrum (frame.peek(), 1);
			Melder_assert (spec->nx == nfft2);

			// From complex spectrum to amplitude spectrum.

			for (long j = 1; j <= nfft2; j++) {
				double rs = spec -> z[1][j], is = spec -> z[2][j];
				specAmp[j] = sqrt (rs * rs + is * is);
			}

			// Enhance the peaks in the spectrum.

			spec_enhance_SHS (specAmp.peek(), nfft2);

			// Smooth the enhanced spectrum.

			spec_smoooth_SHS (specAmp.peek(), nfft2);

			// Go to a logarithmic scale and perform cubic spline interpolation to get
			// spectral values for the increased number of frequency points.

			NUMspline (fl2.peek(), specAmp.peek(), nfft2, 1e30, 1e30, yv2.peek());
			for (long j = 1; j <= nFrequencyPoints; j++) {
				double f = fminl2 + (j - 1) * dfl2;
				NUMsplint (fl2.peek(), specAmp.peek(), yv2.peek(), nfft2, f, &al2[j]);
			}

			// Multiply by frequency selectivity of the auditory system.

			for (long j = 1; j <= nFrequencyPoints; j++) {
				al2[j] = al2[j] > 0 ? al2[j] * arctg[j] : 0;
			}

			// The subharmonic summation. Shift spectra in octaves and sum.

			Pitch_Frame_init (pitchFrame, maxnCandidates);
			autoNUMvector<double> sumspec (1, nFrequencyPoints);
			pitchFrame -> nCandidates = 0; /* !!!!! */

			for (long m = 1; m <= maxnSubharmonics + 1; m++) {
				long kb = 1 + (long) floor (nPointsPerOctave * NUMlog2 (m));
				for (long k = kb; k <= nFrequencyPoints; k++) {
					sumspec[k - kb + 1] += al2[k] * hm;
				}
				hm *= compressionFactor;
			}

			// First register the voiceless candidate (always present).

			Pitch_Frame_addPitch (pitchFrame, 0, 0, maxnCandidates);

			/*
				Get the best local estimates for the pitch as the maxima of the
				subharmonic sum spectrum by parabolic interpolation on three points.
				The formula for a parabola with a maximum is:
					y(x) = a - b (x - c)^2, with a, b, c >= 0.
				The three points are (-x, y1), (0, y2) and (x, y3).
				The solution for a (the maximum) and c (the position) is:
					a = (2 y1 (4 y2 + y3) - y1^2 - (y3 - 4 y2)^2) / (8 (y1 - 2 y2 + y3))
					c = dx (y1 - y3) / (2 (y1 - 2 y2 + y3))
					(b = (2 y2 - y1 - y3) / (2 dx^2))
			*/

			for (long k = 2; k <= nFrequencyPoints - 1; k++) {
				double y1 = sumspec[k - 1], y2 = sumspec[k], y3 = sumspec[k + 1];
				if (y2 > y1 && y2 >= y3) {
					double denum = y1 - 2 * y2 + y3, tmp = y3 - 4 * y2;
					double x = dfl2 * (y1 - y3) / (2 * denum);
					double f = pow (2, fminl2 + (k - 1) * dfl2 + x);
					double strength = (2 * y1 * (4 * y2 + y3) - y1 * y1 - tmp * tmp) / (8 * denum);
					Pitch_Frame_addPitch (pitchFrame, f, strength, maxnCandidates);
				}
			}

			/*
				Check whether f0 corresponds to an actual periodicity T = 1 / f0:
				correlate two signal periods of duration T, one starting at the middle
				of the interval and one starting T seconds before. If there is
				periodicity the correlation coefficient should be high.

				However, some sounds do not show any regularity, or very low frequency
				and regularity, and nevertheless have a definite pitch, e.g. Shepard sounds.
			*/

			Pitch_Frame_getPitch (pitchFrame, &f0, &pitch_strength);
			if (f0 > 0) {
				cc[i] = Sound_correlateParts (sound.peek(), tmid - 1.0 / f0, tmid, 1.0 / f0);
			}
		}

		// Base the V/UV decision on the correlation coefficients:
		// resize the pitch strengths w.r.t. the cc.

		double vuvCriterium = 0.52;
		for (long i = 1; i <= numberOfFrames; i++) {
			Pitch_Frame_resizeStrengths (& thy frame[i], cc[i], vuvCriterium);
		}
		return thee.transfer();
	} catch (MelderError) {
		Melder_throw (me, U": no Pitch (shs) created.");
	}
}
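/*
	Usage sketch (illustrative only): a call to the SHS analysis above, showing the parameter order
	and units. The wrapper name and the parameter values are assumptions chosen for illustration,
	not documented defaults; with maximumFrequency 1250 Hz and minimumPitch 50 Hz the analysis runs
	at the 2500 Hz / 0.04 s setting mentioned in the frame-size comment above.
*/
static Pitch example_shsAnalysis (Sound sound) {
	return Sound_to_Pitch_shs (sound,
		0.01,     // timeStep: analysis step (s)
		50.0,     // minimumPitch: lowest candidate; also determines the window duration (Hz)
		1250.0,   // maximumFrequency: highest spectral component used (Hz)
		500.0,    // ceiling: candidate frequency ceiling passed to Pitch_create (Hz)
		15,       // maxnSubharmonics: octave shifts added on top of the fundamental
		15,       // maxnCandidates: candidates kept per frame
		0.84,     // compressionFactor: weight applied to each successive subharmonic
		48);      // nPointsPerOctave: resolution of the log2 frequency grid
}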