UCD_features_generated_h.praat 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. # UCD_features_generated_h.praat
  2. # Paul Boersma 20180805
  3. Text writing preferences: "UTF-8"
  4. table = Read Table from semicolon-separated file: "UnicodeData.txt"
  5. numberOfRows = Get number of rows
  6. stopwatch
  7. outfile$ = "../../kar/UCD_features_generated.h"
  8. writeFileLine: outfile$, "/* This file was generated automatically by generate/Unicode/UCD_features_generated_h.praat */"
  9. previousLineCodePoint = -1
  10. for irow from 1 to numberOfRows
  11. code$ = Get value: irow, "code"
  12. lineCodePoint = number ("0x" + code$)
  13. goto finished lineCodePoint > 0x2FFFF
  14. while lineCodePoint > previousLineCodePoint + 1
  15. previousLineCodePoint += 1
  16. if isFirst
  17. line$ = "/* " + hexadecimal$ (previousLineCodePoint, 4) + " " + (name$ - "First>" + "Next>") + " */" + newline$
  18. line$ += tab$ + "{ " + categoryFeature$ +
  19. ... ", 0x" + hexadecimal$ (previousLineCodePoint, 4) + ", 0x" + hexadecimal$ (previousLineCodePoint, 4) +
  20. ... ", 0x" + hexadecimal$ (previousLineCodePoint, 4) + ", '\0', '\0' }," + newline$
  21. else
  22. line$ = "/* " + hexadecimal$ (previousLineCodePoint, 4) + " UNASSIGNED */" + newline$
  23. line$ += tab$ + "{ kUCD_UNASSIGNED " +
  24. ... ", 0x" + hexadecimal$ (previousLineCodePoint, 4) + ", 0x" + hexadecimal$ (previousLineCodePoint, 4) +
  25. ... ", 0x" + hexadecimal$ (previousLineCodePoint, 4) + ", '\0', '\0' }," + newline$
  26. endif
  27. appendFile: outfile$, line$
  28. endwhile
  29. name$ = Get value: irow, "name"
  30. isFirst = ( index (name$, " First>") <> 0 )
  31. categoryCode$ = Get value: irow, "category"
  32. alternativeName$ = Get value: irow, "dum1"
  33. upper$ = Get value: irow, "upper"
  34. lower$ = Get value: irow, "lower"
  35. title$ = Get value: irow, "title"
  36. symbol$ = if lineCodePoint >= 0xD800 and lineCodePoint <= 0xDFFF
  37. ... then " SURROGATE " else unicode$ (lineCodePoint) fi
  38. line$ = "/* " + code$ + " " +
  39. ... if name$ <> "<control>" then "(x" + symbol$ + "x) " else "" fi +
  40. ... name$ + " " +
  41. ... if alternativeName$ <> "" then "(" + alternativeName$ + ") " else "" fi +
  42. ... "*/" + newline$
  43. majorCategoryCode$ = mid$ (categoryCode$, 1, 1)
  44. minorCategoryCode$ = mid$ (categoryCode$, 2, 1)
  45. if majorCategoryCode$ = "L"
  46. categoryFeature$ = "mUCD_WORD_CHARACTER"
  47. if minorCategoryCode$ = "u"
  48. categoryFeature$ += " | mUCD_UPPERCASE_LETTER"
  49. elsif minorCategoryCode$ = "l"
  50. categoryFeature$ += " | mUCD_LOWERCASE_LETTER"
  51. elsif minorCategoryCode$ = "t"
  52. categoryFeature$ += " | mUCD_TITLECASE_LETTER"
  53. elsif minorCategoryCode$ = "m"
  54. categoryFeature$ += " | mUCD_MODIFIER_LETTER"
  55. elsif minorCategoryCode$ = "o"
  56. categoryFeature$ += " | mUCD_OTHER_LETTER"
  57. else
  58. exitScript: "Unknown letter category code in row ", irow, "."
  59. endif
  60. elsif majorCategoryCode$ = "M"
  61. categoryFeature$ = "mUCD_WORD_CHARACTER"
  62. if minorCategoryCode$ = "n"
  63. categoryFeature$ += " | mUCD_NONSPACING_MARK"
  64. elsif minorCategoryCode$ = "c"
  65. categoryFeature$ += " | mUCD_SPACING_MARK"
  66. elsif minorCategoryCode$ = "e"
  67. categoryFeature$ += " | mUCD_ENCLOSING_MARK"
  68. else
  69. exitScript: "Unknown mark category code in row ", irow, "."
  70. endif
  71. elsif majorCategoryCode$ = "N"
  72. categoryFeature$ = "mUCD_WORD_CHARACTER"
  73. if minorCategoryCode$ = "d"
  74. categoryFeature$ += " | mUCD_DECIMAL_NUMBER"
  75. elsif minorCategoryCode$ = "l"
  76. categoryFeature$ += " | mUCD_LETTER_NUMBER"
  77. elsif minorCategoryCode$ = "o"
  78. categoryFeature$ += " | mUCD_OTHER_NUMBER"
  79. else
  80. exitScript: "Unknown number category code in row ", irow, "."
  81. endif
  82. elsif majorCategoryCode$ = "P"
  83. if minorCategoryCode$ = "c"
  84. categoryFeature$ = "mUCD_WORD_CHARACTER | mUCD_CONNECTOR_PUNCTUATION"
  85. elsif minorCategoryCode$ = "d"
  86. categoryFeature$ = "mUCD_DASH_PUNCTUATION"
  87. elsif minorCategoryCode$ = "s"
  88. categoryFeature$ = "mUCD_OPEN_PUNCTUATION"
  89. elsif minorCategoryCode$ = "e"
  90. categoryFeature$ = "mUCD_CLOSE_PUNCTUATION"
  91. elsif minorCategoryCode$ = "i"
  92. categoryFeature$ = "mUCD_INITIAL_PUNCTUATION"
  93. elsif minorCategoryCode$ = "f"
  94. categoryFeature$ = "mUCD_FINAL_PUNCTUATION"
  95. elsif minorCategoryCode$ = "o"
  96. categoryFeature$ = "mUCD_OTHER_PUNCTUATION"
  97. else
  98. exitScript: "Unknown punctuation category code in row ", irow, "."
  99. endif
  100. elsif majorCategoryCode$ = "S"
  101. if minorCategoryCode$ = "m"
  102. categoryFeature$ = "mUCD_MATH_SYMBOL"
  103. elsif minorCategoryCode$ = "c"
  104. categoryFeature$ = "mUCD_CURRENCY_SYMBOL"
  105. elsif minorCategoryCode$ = "k"
  106. categoryFeature$ = "mUCD_MODIFIER_SYMBOL"
  107. elsif minorCategoryCode$ = "o"
  108. categoryFeature$ = "mUCD_OTHER_SYMBOL"
  109. else
  110. exitScript: "Unknown symbol category code in row ", irow, "."
  111. endif
  112. elsif majorCategoryCode$ = "Z"
  113. if minorCategoryCode$ = "s"
  114. if lineCodePoint = 0x00A0 or lineCodePoint = 0x202F
  115. categoryFeature$ = "mUCD_NON_BREAKING_SPACE"
  116. else
  117. categoryFeature$ = "mUCD_BREAKING_SPACE"
  118. endif
  119. elsif minorCategoryCode$ = "l"
  120. categoryFeature$ = "mUCD_LINE_SEPARATOR"
  121. elsif minorCategoryCode$ = "p"
  122. categoryFeature$ = "mUCD_PARAGRAPH_SEPARATOR"
  123. else
  124. exitScript: "Unknown separator category code in row ", irow, "."
  125. endif
  126. elsif majorCategoryCode$ = "C"
  127. if minorCategoryCode$ = "c"
  128. categoryFeature$ = "mUCD_CONTROL"
  129. #
  130. # Special cases that diverge from UnicodeData.txt.
  131. #
  132. if lineCodePoint = 0
  133. categoryFeature$ += " | mUCD_NULL"
  134. elsif lineCodePoint = 9 ; tab
  135. categoryFeature$ += " | mUCD_BREAKING_SPACE"
  136. elsif (lineCodePoint >= 10 and lineCodePoint <= 13) or lineCodePoint = 0x0085 ; line feed, vertical tab, form feed, carriage return, next line
  137. categoryFeature$ += " | mUCD_LINE_SEPARATOR"
  138. endif
  139. elsif minorCategoryCode$ = "f"
  140. categoryFeature$ = "mUCD_FORMAT"
  141. elsif minorCategoryCode$ = "s"
  142. categoryFeature$ = "kUCD_UNASSIGNED"
  143. elsif minorCategoryCode$ = "o"
  144. categoryFeature$ = "mUCD_PRIVATE_USE"
  145. elsif minorCategoryCode$ = "n"
  146. categoryFeature$ = "kUCD_UNASSIGNED"
  147. else
  148. exitScript: "Unknown other category code in row ", irow, "."
  149. endif
  150. else
  151. categoryFeature$ = "0"
  152. endif
  153. lower$ = if lower$ = "" then code$ else lower$ fi
  154. upper$ = if upper$ = "" then code$ else upper$ fi
  155. title$ = if title$ = "" then code$ else title$ fi
  156. line$ += tab$ + "{ " + categoryFeature$ +
  157. ... ", 0x" + upper$ + ", 0x" + lower$ + ", 0x" + title$ + ", '\0', '\0' }," + newline$
  158. appendFile: outfile$, line$
  159. previousLineCodePoint = lineCodePoint
  160. endfor
  161. label finished
  162. appendInfoLine: stopwatch