CreatePinyinTable.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Convert the SCIM pinyin file to a Python dictionary
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import os
  9. import sys
  10. import string
  11. import re
  12. import getopt
  13. import PrintLog
  14. def usage(message):
  15. if None != message:
  16. print('error: {0:s}'.format(message))
  17. print('usage: {0:s} <options>'.format(os.path.basename(__file__)))
  18. print(' --help This message')
  19. print(' --verbose Enable verbose output')
  20. print(' --input=file Source SCIM Pinyin table [pinyin_table.txt]')
  21. print(' --output=file Output Python file [PinyinTable.py]')
  22. exit(1)
# matches one decimal digit; make_pinyin() treats the first match in a
# syllable as its tone number
DIGIT_RE = re.compile(r'\d')

# accented variants of each plain vowel, indexed by (tone number - 1);
# index 4 is also the entry make_pinyin() selects when a syllable carries
# no tone digit at all
VOWELS = {
    u'a': u'āáăàȧ',
    u'e': u'ēéĕèė',
    u'i': u'īíĭìi',
    u'o': u'ōóŏòȯ',
    u'u': u'ūúŭùů',
}
  31. def make_pinyin(text):
  32. """convert text like an3 to ăn"""
  33. global DIGIT_RE
  34. global VOWELS
  35. m = DIGIT_RE.search(text)
  36. if m:
  37. text = text[:m.end(0)]
  38. tone = int(text[-1]) - 1
  39. text = text[:-1]
  40. else:
  41. tone = 4
  42. for i in range(0, len(text)):
  43. try:
  44. if text[i] in u'iu' and text[i+1] in u'aeiou':
  45. text = text[:i + 1] + VOWELS[text[i + 1]][tone] + text[i + 2:]
  46. break
  47. except IndexError:
  48. pass
  49. if text[i] in u'aeiou':
  50. text = text[:i] + VOWELS[text[i]][tone] + text[i + 1:]
  51. break
  52. return text
  53. def generate_line(f, k, d):
  54. """output one line of Python data"""
  55. f.write(u'u\'{0:s}\': ['.format(k).encode('utf-8'))
  56. p = d.pop(0)
  57. f.write(u'u\'{0:s}\''.format(p).encode('utf-8'))
  58. for p in d:
  59. f.write(u',{1:s}u\'{0:s}\''.format(p,' ' * (6 - len(p))).encode('utf-8'))
  60. f.write(u'],{0:s}'.format(' ' * (6 - len(p))))
  61. def generate_output(filename, items_per_line, pinyin):
  62. """Create a Python module"""
  63. with open(filename, 'wb') as f:
  64. f.write("""#! /usr/bin/env python
  65. # -*- coding: utf-8 -*-
  66. # generated file - do not modify
  67. # this is a simple dictionary using the CJK character as the key
  68. # the data is a list of alternative pronunciations in Pinyin
  69. # with accented vowels to indicate the tone
  70. """)
  71. i = 0
  72. multiples = {}
  73. f.write('pinyin = {')
  74. pre_space = '\n '
  75. for k, d in pinyin.items():
  76. if 1 != len(d):
  77. multiples[k] = d
  78. continue
  79. elif 0 == i:
  80. f.write(pre_space)
  81. i = items_per_line
  82. generate_line(f, k, d)
  83. i -= 1
  84. for k, d in multiples.items():
  85. f.write(pre_space)
  86. generate_line(f, k, d)
  87. f.write('\n}\n')
  88. def main():
  89. """ main processing"""
  90. global verbose
  91. try:
  92. opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:',
  93. ['help', 'verbose',
  94. 'input=',
  95. 'output=',
  96. ])
  97. except getopt.GetoptError, err:
  98. usage(err)
  99. verbose = False
  100. input_file_name = 'pinyin_table.txt'
  101. output_file_name = 'PinyinTable.py'
  102. for opt, arg in opts:
  103. if opt in ('-v', '--verbose'):
  104. verbose = True
  105. elif opt in ('-h', '--help'):
  106. usage(None)
  107. elif opt in ('-i', '--input'):
  108. input_file_name = arg
  109. elif opt in ('-o', '--output'):
  110. output_file_name = arg
  111. else:
  112. usage('unhandled option: ' + opt)
  113. if [] != args:
  114. usage('Extraneous argument(s)')
  115. PrintLog.message(u'Reading Data File: {0:s}'.format(input_file_name))
  116. errors = False
  117. pinyin = {}
  118. with open(input_file_name, 'rb') as f:
  119. PrintLog.message(u'File Header: {0:s}'.format(f.readline().strip()))
  120. PrintLog.message(u'File Version: {0:s}'.format(f.readline().strip()))
  121. expected_lines = int(f.readline())
  122. line_count = 0
  123. char_count = 0
  124. for line in f:
  125. line_count += 1
  126. n = line.strip().split()
  127. phonetic = make_pinyin(n.pop(0))
  128. item_count = int(n.pop(0))
  129. if len(n) != item_count:
  130. PrintLog.message(u'Error: incorrect item count, expected: {0:d} got: {1:d}'.format(item_count, len(n)))
  131. errors = True
  132. break
  133. for s in n:
  134. cjk = unicode(s, 'utf-8')[0]
  135. if cjk in pinyin:
  136. pinyin[cjk] += [phonetic]
  137. else:
  138. pinyin[cjk] = [phonetic]
  139. char_count += 1
  140. if line_count == expected_lines:
  141. PrintLog.message(u'Counted CJK glyphs: {0:d}'.format(char_count))
  142. PrintLog.message(u'Expected Lines: {0:d}'.format(expected_lines))
  143. PrintLog.message(u'Counted Lines: {0:d}'.format(line_count))
  144. else:
  145. PrintLog.message(u'Error: linecount miosmatch: {0:d} != {1:d}'.format(expected_lines, line_count))
  146. errors = True
  147. if errors:
  148. PrintLog.message(u'Error: failed to read data file')
  149. return 1
  150. else:
  151. PrintLog.message(u'Data Read Completed Sucessfully')
  152. text = u'欧洲,软件+互联网[用统一码] 歐洲,軟體及網際網路[讓統一碼] ABC 西安 先'
  153. expected = u'ōuzhōu,ruănjiàn+hùliánwăng[yòngtŏngyīmă] ōuzhōu,ruăntĭjíwăngjìwănglù[ràngtŏngyīmă] ABC xīān xiān'
  154. result = u''
  155. for c in text:
  156. if c in pinyin:
  157. result += pinyin[c][0]
  158. else:
  159. result += c
  160. if result == expected:
  161. PrintLog.message(u'Creating: {0:s}'.format(output_file_name))
  162. generate_output(output_file_name, 6, pinyin)
  163. PrintLog.message(u'Finished: {0:s}'.format(output_file_name))
  164. else:
  165. PrintLog.message(u'Error in test:')
  166. PrintLog.message(u'input: {0:s}'.format(text))
  167. PrintLog.message(u'output: {0:s}'.format(result))
  168. PrintLog.message(u'expected: {0:s}'.format(expected))
  169. return 2
  170. return 0
# run the program
# (only when executed as a script; the value returned by main() becomes
# the process exit status)
if __name__ == "__main__":
    sys.exit(main())