#!/usr/bin/env python3
"""corpusanalysis.py -- corpus preprocessing, phone statistics, weight search.

Run as a script, this module:

1. passes every recording/transcript pair found under ``./corpus`` to
   ``preprocessing.process``, writing results under ``./corpus-processed``;
2. parses each resulting ``output/recording.tsv`` with ``phones.parse`` and
   balances the 'm' and 'f' groups to the same size;
3. writes per-phoneme summary statistics to ``stats.json``;
4. brute-forces a three-component weight vector for
   ``resonance.compute_resonance`` and writes the best one to
   ``weights.json``.
"""
# Debug aid: shows how the module was launched (script vs. package member).
print('corpusanalysis.py: __package__ ->', __package__)
import json
import os
import subprocess
from collections import defaultdict
from statistics import mean, median, stdev

from library import preprocessing, phones, resonance
  7. corpus_dir = './corpus'
  8. processed_corpus_dir = './corpus-processed'
  9. if not os.path.exists(processed_corpus_dir):
  10. os.mkdir(processed_corpus_dir)
  11. subprocess.run(['chmod', '777', '-R', processed_corpus_dir])
  12. for directory in os.listdir(corpus_dir):
  13. transcript = ''
  14. recording = None
  15. for filename in os.listdir(corpus_dir + '/' + directory):
  16. if '.txt' in filename:
  17. with open(corpus_dir + '/' + directory + '/' + filename) as f:
  18. transcript = f.read()
  19. elif len([
  20. ftype for ftype in
  21. ['.wav', '.mp3', '.ogg', '.opus']
  22. if ftype in filename]
  23. ) == 1:
  24. with open(corpus_dir + '/' + directory + '/' + filename, 'rb') as f:
  25. recording = f.read()
  26. out_dir = processed_corpus_dir + '/' + directory
  27. if not os.path.exists(out_dir): os.mkdir(out_dir)
  28. subprocess.run(['chmod', '777', '-R', out_dir])
  29. preprocessing.process(recording, transcript, out_dir)
  30. m_count = 0
  31. f_count = 0
  32. m_data = []
  33. f_data = []
  34. for directory in os.listdir(processed_corpus_dir):
  35. tsv_file = (processed_corpus_dir + '/' + directory +
  36. '/output/recording.tsv')
  37. if os.path.exists(tsv_file):
  38. with open(tsv_file) as f:
  39. tsv_text = f.read()
  40. if directory[0] == 'm':
  41. m_data.append(phones.parse(tsv_text))
  42. m_count += 1
  43. if directory[0] == 'f':
  44. f_data.append(phones.parse(tsv_text))
  45. f_count += 1
  46. if len(f_data) > len(m_data):
  47. f_data = f_data[0:len(m_data)]
  48. if len(m_data) > len(f_data):
  49. m_data = m_data[0:len(f_data)]
  50. print('m_count', m_count)
  51. print('f_count', f_count)
  52. print('len(f_data)', len(f_data))
  53. print('len(m_data)', len(m_data))
  54. population_phones = defaultdict(list)
  55. for data in m_data + f_data:
  56. for phone in data['phones']:
  57. if (phone['F'] and phone['F'][0] and phone['F'][1] and
  58. phone['F'][2] and phone['F'][3]
  59. ):
  60. population_phones[phone['phoneme']].append(phone)
  61. phone_stats = {}
  62. for phoneme in population_phones:
  63. print(phoneme)
  64. Fs = [[phone['F'][i] for phone in population_phones[phoneme] ]
  65. for i in range(4)]
  66. for i in range(4):
  67. print('\tf' + str(i), mean(Fs[i]), stdev(Fs[i]))
  68. phone_stats[phoneme] = [
  69. { 'mean' : mean(Fs[i]),
  70. 'stdev': stdev(Fs[i]),
  71. 'median': median(Fs[i]),
  72. 'max': max(Fs[i]),
  73. 'min': min(Fs[i]),
  74. } for i in range(4)
  75. ]
  76. with open('stats.json', 'w') as f:
  77. f.write(json.dumps(phone_stats))
  78. # A little brute-forcing never hurt anyone.
  79. granularity = 56
  80. weights_candidates = []
  81. for i in range(granularity + 1):
  82. for j in range(granularity + 1):
  83. for k in range(granularity + 1):
  84. if i + j + k == granularity:
  85. weights_candidates.append([
  86. i / granularity,
  87. j / granularity,
  88. k / granularity
  89. ])
  90. max_accuracy = 0
  91. winner = None
  92. for weights in weights_candidates:
  93. for data in m_data + f_data:
  94. resonance.compute_resonance(data, weights)
  95. median_resonance = median([data['meanResonance'] for data in m_data + f_data])
  96. correct_count = 0
  97. total = 0
  98. for data in m_data:
  99. if data['meanResonance'] <= median_resonance:
  100. correct_count += 1
  101. total += 1
  102. for data in f_data:
  103. if data['meanResonance'] >= median_resonance:
  104. correct_count += 1
  105. total += 1
  106. accuracy = correct_count / total
  107. if accuracy >= max_accuracy:
  108. max_accuracy = accuracy
  109. winner = weights
  110. print(weights, accuracy)
  111. print('Best weight:', winner)
  112. with open('weights.json', 'w') as f:
  113. f.write(json.dumps(winner))