Sound_to_Pitch2.cpp 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. /* Sound_to_Pitch2.c
  2. *
  3. * Copyright (C) 1993-2017 David Weenink
  4. *
  5. * This code is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This code is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this work. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. /*
  19. djmw 20020813 GPL header
  20. djmw 20021106 Latest modification
  21. djmw 20041124 Changed call to Sound_to_Spectrum.
  22. djmw 20070103 Sound interface changes
  23. */
  24. #include "Sound_to_Pitch2.h"
  25. #include "Pitch_extensions.h"
  26. #include "Sound_and_Spectrum.h"
  27. #include "Sound_to_SPINET.h"
  28. #include "SPINET_to_Pitch.h"
  29. #include "NUM2.h"
  30. static int spec_enhance_SHS (double a [], integer n) {
  31. if (n < 2) {
  32. return 0;
  33. }
  34. autoNUMvector<integer> posmax (1, (n + 1) / 2);
  35. integer nmax = 0;
  36. if (a [1] > a [2]) {
  37. posmax [++ nmax] = 1;
  38. }
  39. for (integer i = 2; i <= n - 1; i ++) if (a [i] > a [i - 1] && a [i] >= a [i + 1]) {
  40. posmax [++ nmax] = i;
  41. }
  42. if (a [n] > a [n - 1]) {
  43. posmax [++ nmax] = n;
  44. }
  45. if (nmax == 1) {
  46. for (integer j = 1; j <= posmax [1] - 3; j ++) {
  47. a [j] = 0;
  48. }
  49. for (integer j = posmax [1] + 3; j <= n; j ++) {
  50. a [j] = 0;
  51. }
  52. } else {
  53. for (integer i = 2; i <= nmax; i ++) {
  54. for (integer j = posmax [i - 1] + 3; j <= posmax [i] - 3; j ++) {
  55. a [j] = 0.0;
  56. }
  57. }
  58. }
  59. return 1;
  60. }
  61. static void spec_smoooth_SHS (double a [], integer n) {
  62. double ai, aim1 = 0;
  63. for (integer i = 1; i <= n - 1; i ++) {
  64. ai = a [i]; a [i] = (aim1 + 2 * ai + a [i + 1]) / 4; aim1 = ai;
  65. }
  66. }
  67. autoPitch Sound_to_Pitch_shs (Sound me, double timeStep, double minimumPitch, double maximumFrequency, double ceiling, integer maxnSubharmonics, integer maxnCandidates, double compressionFactor, integer nPointsPerOctave) {
  68. try {
  69. double firstTime, newSamplingFrequency = 2.0 * maximumFrequency;
  70. double windowDuration = 2 / minimumPitch, halfWindow = 0.5 * windowDuration;
  71. double atans = nPointsPerOctave * NUMlog2 (65.0 / 50.0) - 1.0;
  72. /*
  73. Number of speech samples in the downsampled signal in each frame:
  74. 100 for windowDuration == 0.04 and newSamplingFrequency == 2500
  75. */
  76. integer nx = Melder_iround (windowDuration * newSamplingFrequency);
  77. /*
  78. The minimum number of points for the FFT is 256.
  79. */
  80. integer nfft = 1;
  81. while ((nfft *= 2) < nx || nfft <= 128) {
  82. ;
  83. }
  84. integer nfft2 = nfft / 2 + 1;
  85. double frameDuration = nfft / newSamplingFrequency;
  86. double df = newSamplingFrequency / nfft;
  87. /*
  88. The number of points on the octave scale.
  89. */
  90. double fminl2 = NUMlog2 (minimumPitch), fmaxl2 = NUMlog2 (maximumFrequency);
  91. integer nFrequencyPoints = Melder_ifloor ((fmaxl2 - fminl2) * nPointsPerOctave);
  92. double dfl2 = (fmaxl2 - fminl2) / (nFrequencyPoints - 1);
  93. autoSound sound = Sound_resample (me, newSamplingFrequency, 50);
  94. integer numberOfFrames;
  95. Sampled_shortTermAnalysis (sound.get(), windowDuration, timeStep, & numberOfFrames, & firstTime);
  96. autoSound frame = Sound_createSimple (1, frameDuration, newSamplingFrequency);
  97. autoSound hamming = Sound_createHamming (nx / newSamplingFrequency, newSamplingFrequency);
  98. autoPitch thee = Pitch_create (my xmin, my xmax, numberOfFrames, timeStep, firstTime, ceiling, maxnCandidates);
  99. autoNUMvector<double> cc (1, numberOfFrames);
  100. autoNUMvector<double> specAmp (1, nfft2);
  101. autoNUMvector<double> fl2 (1, nfft2);
  102. autoNUMvector<double> yv2 (1, nfft2);
  103. autoNUMvector<double> arctg (1, nFrequencyPoints);
  104. autoNUMvector<double> al2 (1, nFrequencyPoints);
  105. Melder_assert (frame -> nx >= nx);
  106. Melder_assert (hamming -> nx == nx);
  107. // Compute the absolute value of the globally largest amplitude w.r.t. the global mean.
  108. double globalMean, globalPeak;
  109. Sound_localMean (sound.get(), sound -> xmin, sound -> xmax, & globalMean);
  110. Sound_localPeak (sound.get(), sound -> xmin, sound -> xmax, globalMean, & globalPeak);
  111. /*
  112. For the cubic spline interpolation we need the frequencies on an octave
  113. scale, i.e., a log2 scale. All frequencies should be DIFFERENT, otherwise
  114. the cubic spline interpolation will give corrupt results.
  115. Because log2(f==0) is not defined, we use the heuristic: f [2] - f [1] == f [3] - f [2].
  116. */
  117. for (integer i = 2; i <= nfft2; i ++) {
  118. fl2 [i] = NUMlog2 ((i - 1) * df);
  119. }
  120. fl2 [1] = 2 * fl2 [2] - fl2 [3];
  121. /*
  122. Calculate frequencies regularly spaced on a log2-scale and the frequency weighting function.
  123. */
  124. for (integer i = 1; i <= nFrequencyPoints; i ++) {
  125. arctg [i] = 0.5 + atan (3.0 * (i - atans) / nPointsPerOctave) / NUMpi;
  126. }
  127. /*
  128. Perform the analysis on all frames.
  129. */
  130. for (integer i = 1; i <= numberOfFrames; i ++) {
  131. Pitch_Frame pitchFrame = & thy frame [i];
  132. double hm = 1.0, f0, pitch_strength, localMean, localPeak;
  133. double tmid = Sampled_indexToX (thee.get(), i); // The center of this frame
  134. integer nx_tmp = frame -> nx;
  135. // Copy a frame from the sound, apply a hamming window. Get local 'intensity'
  136. frame -> nx = nx; /*begin vies */
  137. Sound_into_Sound (sound.get(), frame.get(), tmid - halfWindow);
  138. Sounds_multiply (frame.get(), hamming.get());
  139. Sound_localMean (sound.get(), tmid - 3 * halfWindow, tmid + 3 * halfWindow, & localMean);
  140. Sound_localPeak (sound.get(), tmid - halfWindow, tmid + halfWindow, localMean, & localPeak);
  141. pitchFrame -> intensity = localPeak > globalPeak ? 1 : localPeak / globalPeak;
  142. frame -> nx = nx_tmp; /* einde vies */
  143. // Get the Fourier spectrum.
  144. autoSpectrum spec = Sound_to_Spectrum (frame.get(), true);
  145. Melder_assert (spec->nx == nfft2);
  146. // From complex spectrum to amplitude spectrum.
  147. for (integer j = 1; j <= nfft2; j ++) {
  148. double rs = spec -> z [1] [j], is = spec -> z [2] [j];
  149. specAmp [j] = sqrt (rs * rs + is * is);
  150. }
  151. // Enhance the peaks in the spectrum.
  152. spec_enhance_SHS (specAmp.peek(), nfft2);
  153. // Smooth the enhanced spectrum.
  154. spec_smoooth_SHS (specAmp.peek(), nfft2);
  155. // Go to a logarithmic scale and perform cubic spline interpolation to get
  156. // spectral values for the increased number of frequency points.
  157. NUMcubicSplineInterpolation_getSecondDerivatives (fl2.peek(), specAmp.peek(), nfft2, 1e30, 1e30, yv2.peek());
  158. for (integer j = 1; j <= nFrequencyPoints; j ++) {
  159. double f = fminl2 + (j - 1) * dfl2;
  160. al2 [j] = NUMcubicSplineInterpolation (fl2.peek(), specAmp.peek(), yv2.peek(), nfft2, f);
  161. }
  162. // Multiply by frequency selectivity of the auditory system.
  163. for (integer j = 1; j <= nFrequencyPoints; j ++) {
  164. al2 [j] = al2 [j] > 0 ? al2 [j] * arctg [j] : 0.0;
  165. }
  166. // The subharmonic summation. Shift spectra in octaves and sum.
  167. Pitch_Frame_init (pitchFrame, maxnCandidates);
  168. autoNUMvector<double> sumspec (1, nFrequencyPoints);
  169. pitchFrame -> nCandidates = 0; /* !!!!! */
  170. for (integer m = 1; m <= maxnSubharmonics + 1; m ++) {
  171. integer kb = 1 + Melder_ifloor (nPointsPerOctave * NUMlog2 (m));
  172. for (integer k = kb; k <= nFrequencyPoints; k ++) {
  173. sumspec [k - kb + 1] += al2 [k] * hm;
  174. }
  175. hm *= compressionFactor;
  176. }
  177. // First register the voiceless candidate (always present).
  178. Pitch_Frame_addPitch (pitchFrame, 0, 0, maxnCandidates);
  179. /*
  180. Get the best local estimates for the pitch as the maxima of the
  181. subharmonic sum spectrum by parabolic interpolation on three points:
  182. The formula for a parabole with a maximum is:
  183. y(x) = a - b (x - c)^2 with a, b, c >= 0
  184. The three points are (-x, y1), (0, y2) and (x, y3).
  185. The solution for a (the maximum) and c (the position) is:
  186. a = (2 y1 (4 y2 + y3) - y1^2 - (y3 - 4 y2)^2)/( 8 (y1 - 2 y2 + y3)
  187. c = dx (y1 - y3) / (2 (y1 - 2 y2 + y3))
  188. (b = (2 y2 - y1 - y3) / (2 dx^2) )
  189. */
  190. for (integer k = 2; k <= nFrequencyPoints - 1; k ++) {
  191. double y1 = sumspec [k - 1], y2 = sumspec [k], y3 = sumspec [k + 1];
  192. if (y2 > y1 && y2 >= y3) {
  193. double denum = y1 - 2 * y2 + y3, tmp = y3 - 4 * y2;
  194. double x = dfl2 * (y1 - y3) / (2 * denum);
  195. double f = pow (2, fminl2 + (k - 1) * dfl2 + x);
  196. double strength = (2 * y1 * (4 * y2 + y3) - y1 * y1 - tmp * tmp) / (8 * denum);
  197. Pitch_Frame_addPitch (pitchFrame, f, strength, maxnCandidates);
  198. }
  199. }
  200. /*
  201. Check whether f0 corresponds to an actual periodicity T = 1 / f0:
  202. correlate two signal periods of duration T, one starting at the
  203. middle of the interval and one starting T seconds before.
  204. If there is periodicity the correlation coefficient should be high.
  205. However, some sounds do not show any regularity, or very low
  206. frequency and regularity, and nevertheless have a definite
  207. pitch, e.g. Shepard sounds.
  208. */
  209. Pitch_Frame_getPitch (pitchFrame, &f0, &pitch_strength);
  210. if (f0 > 0) {
  211. cc [i] = Sound_correlateParts (sound.get(), tmid - 1.0 / f0, tmid, 1.0 / f0);
  212. }
  213. }
  214. // Base V/UV decision on correlation coefficients.
  215. // Resize the pitch strengths w.r.t. the cc.
  216. double vuvCriterium = 0.52;
  217. for (integer i = 1; i <= numberOfFrames; i ++) {
  218. Pitch_Frame_resizeStrengths (& thy frame [i], cc [i], vuvCriterium);
  219. }
  220. return thee;
  221. } catch (MelderError) {
  222. Melder_throw (me, U": no Pitch (shs) created.");
  223. }
  224. }
  225. autoPitch Sound_to_Pitch_SPINET (Sound me, double timeStep, double windowDuration, double minimumFrequencyHz, double maximumFrequencyHz, integer nFilters, double ceiling, int maxnCandidates) {
  226. try {
  227. autoSPINET him = Sound_to_SPINET (me, timeStep, windowDuration, minimumFrequencyHz,
  228. maximumFrequencyHz, nFilters, 0.4, 0.6);
  229. autoPitch thee = SPINET_to_Pitch (him.get(), 0.15, ceiling, maxnCandidates);
  230. return thee;
  231. } catch (MelderError) {
  232. Melder_throw (me, U": no Pitch (SPINET) created.");
  233. }
  234. }
  235. /* End of file Sound_to_Pitch2.cpp */