melder_kar.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. #ifndef _melder_kar_h_
  2. #define _melder_kar_h_
  3. /* melder_kar.h
  4. *
  5. * Copyright (C) 1992-2018 Paul Boersma
  6. *
  7. * This code is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * This code is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. * See the GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this work. If not, see <http://www.gnu.org/licenses/>.
  19. */
  /*
      Limits used by the character-classification helpers in this header.
  */
  #define kUCD_TOP_OF_ASCII  127       /* upper bound used by the Melder_isAscii... table lookups */
  #define kUCD_TOP_OF_LIST   0x2FA1D   /* highest valid index into theUnicodeDatabase */
  #define kUCD_UNASSIGNED    0         /* NOTE(review): presumably the feature value of an unassigned code point -- confirm */
  /*
      Feature bits stored in UCD_CodePointInfo::features.
      Every elementary feature occupies one bit (bits 0 through 30);
      the composite masks are bitwise ORs of elementary features.
  */
  enum {
      /* Letters. */
      mUCD_UPPERCASE_LETTER = 1 << 0,
      mUCD_LOWERCASE_LETTER = 1 << 1,
      mUCD_TITLECASE_LETTER = 1 << 2,
      mUCD_CASED_LETTER = mUCD_UPPERCASE_LETTER | mUCD_LOWERCASE_LETTER | mUCD_TITLECASE_LETTER,
      mUCD_MODIFIER_LETTER = 1 << 3,
      mUCD_OTHER_LETTER = 1 << 4,
      mUCD_LETTER = mUCD_CASED_LETTER | mUCD_MODIFIER_LETTER | mUCD_OTHER_LETTER,

      /* Marks. */
      mUCD_NONSPACING_MARK = 1 << 5,
      mUCD_SPACING_MARK = 1 << 6,
      mUCD_ENCLOSING_MARK = 1 << 7,
      mUCD_MARK = mUCD_NONSPACING_MARK | mUCD_SPACING_MARK | mUCD_ENCLOSING_MARK,

      /* Numbers. */
      mUCD_DECIMAL_NUMBER = 1 << 8,
      mUCD_LETTER_NUMBER = 1 << 9,
      mUCD_OTHER_NUMBER = 1 << 10,
      mUCD_NUMBER = mUCD_DECIMAL_NUMBER | mUCD_LETTER_NUMBER | mUCD_OTHER_NUMBER,

      /* Punctuation. */
      mUCD_CONNECTOR_PUNCTUATION = 1 << 11,
      mUCD_DASH_PUNCTUATION = 1 << 12,
      mUCD_OPEN_PUNCTUATION = 1 << 13,
      mUCD_CLOSE_PUNCTUATION = 1 << 14,
      mUCD_INITIAL_PUNCTUATION = 1 << 15,
      mUCD_FINAL_PUNCTUATION = 1 << 16,
      mUCD_OTHER_PUNCTUATION = 1 << 17,
      mUCD_PUNCTUATION =
          mUCD_CONNECTOR_PUNCTUATION | mUCD_DASH_PUNCTUATION |
          mUCD_OPEN_PUNCTUATION | mUCD_CLOSE_PUNCTUATION |
          mUCD_INITIAL_PUNCTUATION | mUCD_FINAL_PUNCTUATION |
          mUCD_OTHER_PUNCTUATION,

      /* Symbols. */
      mUCD_MATH_SYMBOL = 1 << 18,
      mUCD_CURRENCY_SYMBOL = 1 << 19,
      mUCD_MODIFIER_SYMBOL = 1 << 20,
      mUCD_OTHER_SYMBOL = 1 << 21,
      mUCD_SYMBOL = mUCD_MATH_SYMBOL | mUCD_CURRENCY_SYMBOL | mUCD_MODIFIER_SYMBOL | mUCD_OTHER_SYMBOL,

      /* Separators. */
      mUCD_BREAKING_SPACE = 1 << 22,
      /*
          A non-breaking space keeps *lines* together;
          it still separates *words*, despite interpretations elsewhere.
      */
      mUCD_NON_BREAKING_SPACE = 1 << 23,
      mUCD_SPACE_SEPARATOR = mUCD_BREAKING_SPACE | mUCD_NON_BREAKING_SPACE,
      mUCD_LINE_SEPARATOR = 1 << 24,
      mUCD_PARAGRAPH_SEPARATOR = 1 << 25,
      mUCD_NEWLINE = mUCD_LINE_SEPARATOR | mUCD_PARAGRAPH_SEPARATOR,
      mUCD_SEPARATOR = mUCD_SPACE_SEPARATOR | mUCD_NEWLINE,

      /* Remaining elementary features. */
      mUCD_CONTROL = 1 << 26,
      mUCD_FORMAT = 1 << 27,
      mUCD_PRIVATE_USE = 1 << 28,
      mUCD_WORD_CHARACTER = 1 << 29,
      mUCD_NULL = 1 << 30,

      /* Composite masks used by the classification helpers. */
      mUCD_ALPHANUMERIC = mUCD_LETTER | mUCD_NUMBER,
      mUCD_END_OF_INK = mUCD_SEPARATOR | mUCD_NULL,   // any separator, or the null feature
      mUCD_END_OF_LINE = mUCD_NEWLINE | mUCD_NULL,   // any newline, or the null feature
  };
  68. struct UCD_CodePointInfo {
  69. uint32 features;
  70. char32 upperCase, lowerCase, titleCase;
  71. char first, second;
  72. };
  73. extern UCD_CodePointInfo theUnicodeDatabase [1+kUCD_TOP_OF_LIST];
  74. /*
  75. Praat is an internationalized program, which means it has to work in the same way
  76. wherever on earth it is used. This means that Praat has to be blind to localized settings,
  77. such as what counts as a space and what combinations of characters
  78. count as pairs of lower case and upper case.
  79. To be able to use Praat all over the world, we therefore define one single
  80. "international locale", which is simply based on the Unicode features of each code point.
  81. */
  82. /*
  83. Internationalize std::isblank ():
  84. */
  85. inline static bool Melder_isHorizontalSpace (char32 kar) {
  86. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_SPACE_SEPARATOR) != 0;
  87. }
  88. inline static void Melder_skipHorizontalSpace (char32 **p_text) {
  89. while (Melder_isHorizontalSpace (**p_text)) (*p_text) ++;
  90. }
  91. inline static char32 * Melder_findEndOfHorizontalSpace (char32 *p) {
  92. while (Melder_isHorizontalSpace (*p)) p ++;
  93. return p;
  94. }
  95. inline static const char32 * Melder_findEndOfHorizontalSpace (const char32 *p) {
  96. while (Melder_isHorizontalSpace (*p)) p ++;
  97. return p;
  98. }
  99. inline static bool Melder_isAsciiHorizontalSpace (char32 kar) {
  100. return kar == U'\t' || kar == U' ';
  101. }
  102. inline static bool Melder_isVerticalSpace (char32 kar) {
  103. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_NEWLINE) != 0;
  104. }
  105. inline static bool Melder_isAsciiVerticalSpace (char32 kar) {
  106. return kar >= 10 && kar <= 13; // \n, \v, \f, \r
  107. }
  108. /*
  109. Internationalize std::isspace ():
  110. */
  111. inline static bool Melder_isHorizontalOrVerticalSpace (char32 kar) {
  112. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_SEPARATOR) != 0;
  113. }
  114. inline static bool Melder_isAsciiHorizontalOrVerticalSpace (char32 kar) {
  115. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_SEPARATOR) != 0;
  116. }
  117. inline static void Melder_skipHorizontalOrVerticalSpace (char32 **p_text) {
  118. while (Melder_isHorizontalOrVerticalSpace (**p_text)) (*p_text) ++;
  119. }
  120. inline static void Melder_skipHorizontalOrVerticalSpace (const char32 **p_text) {
  121. while (Melder_isHorizontalOrVerticalSpace (**p_text)) (*p_text) ++;
  122. }
  123. inline static bool Melder_isEndOfInk (char32 kar) {
  124. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_INK) != 0;
  125. }
  126. inline static bool Melder_isEndOfLine (char32 kar) {
  127. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_LINE) != 0;
  128. }
  129. inline static bool Melder_isEndOfText (char32 kar) {
  130. return kar == U'\0';
  131. }
  132. inline static bool Melder_staysWithinInk (char32 kar) {
  133. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_INK) == 0;
  134. }
  135. inline static bool Melder_staysWithinLine (char32 kar) {
  136. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_LINE) == 0;
  137. }
  138. inline static void Melder_skipToEndOfLine (char32 **p_text) {
  139. while (Melder_staysWithinLine (**p_text)) (*p_text) ++;
  140. }
  141. inline static char32 * Melder_findEndOfInk (char32 *p) {
  142. while (Melder_staysWithinInk (*p)) p ++;
  143. return p;
  144. }
  145. inline static const char32 * Melder_findEndOfInk (const char32 *p) {
  146. while (Melder_staysWithinInk (*p)) p ++;
  147. return p;
  148. }
  149. inline static char32 * Melder_findEndOfLine (char32 *p) {
  150. while (Melder_staysWithinLine (*p)) p ++;
  151. return p;
  152. }
  153. inline static const char32 * Melder_findEndOfLine (const char32 *p) {
  154. while (Melder_staysWithinLine (*p)) p ++;
  155. return p;
  156. }
  157. /*
  158. Internationalize std::isalpha ():
  159. */
  160. inline static bool Melder_isLetter (char32 kar) {
  161. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_LETTER) != 0;
  162. }
  163. inline static bool Melder_isAsciiLetter (char32 kar) {
  164. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_LETTER) != 0;
  165. }
  166. /*
  167. Internationalize std::isupper ():
  168. */
  169. inline static bool Melder_isUpperCaseLetter (char32 kar) {
  170. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_UPPERCASE_LETTER) != 0;
  171. }
  172. inline static bool Melder_isAsciiUpperCaseLetter (char32 kar) {
  173. return kar >= U'A' && kar <= U'Z';
  174. }
  175. /*
  176. Internationalize std::islower ():
  177. */
  178. inline static bool Melder_isLowerCaseLetter (char32 kar) {
  179. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_LOWERCASE_LETTER) != 0;
  180. }
  181. inline static bool Melder_isAsciiLowerCaseLetter (char32 kar) {
  182. return kar >= U'a' && kar <= U'z';
  183. }
  184. inline static bool Melder_isTitleCaseLetter (char32 kar) {
  185. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_TITLECASE_LETTER) != 0;
  186. }
  187. inline static bool Melder_isAsciiTitleCaseLetter (char32 kar) {
  188. return kar >= U'A' && kar <= U'Z';
  189. }
  190. /*
  191. Internationalize std::isdigit ():
  192. */
  193. inline static bool Melder_isDecimalNumber (char32 kar) {
  194. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_DECIMAL_NUMBER) != 0;
  195. }
  196. inline static bool Melder_isAsciiDecimalNumber (char32 kar) {
  197. return kar >= U'0' && kar <= U'9';
  198. }
  199. /*
  200. We cannot really internationalize std::isxdigit ():
  201. */
  202. inline static bool Melder_isHexadecimalDigit (char32 kar) {
  203. return kar >= U'0' && kar <= U'9' || kar >= U'A' && kar <= U'Z' || kar >= U'a' && kar <= U'z';
  204. }
  205. /*
  206. Internationalize std::isalnum ():
  207. */
  208. inline static bool Melder_isAlphanumeric (char32 kar) {
  209. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_ALPHANUMERIC) != 0;
  210. }
  211. inline static bool Melder_isAsciiAlphanumeric (char32 kar) {
  212. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_ALPHANUMERIC) != 0;
  213. }
  214. inline static bool Melder_isWordCharacter (char32 kar) {
  215. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_WORD_CHARACTER) != 0;
  216. }
  217. inline static bool Melder_isAsciiWordCharacter (char32 kar) {
  218. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_WORD_CHARACTER) != 0;
  219. }
  220. /*
  221. The standard library further contains std::ispunct (), std::iscntrl (), std::isprint (), std::isgraph ().
  222. These have very little use nowadays, so only for completeness do we include versions of them here,
  223. which are correct at least for ASCII arguments.
  224. Of these four functions, Melder_hasInk () is not yet correct for all Unicode points,
  225. as approximately one half of the mUCD_FORMAT points are inkless as well.
  226. */
  227. inline static bool Melder_isPunctuationOrSymbol (char32 kar) {
  228. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & (mUCD_PUNCTUATION | mUCD_SYMBOL)) != 0;
  229. }
  230. inline static bool Melder_isAsciiPunctuationOrSymbol (char32 kar) { // same as std::ispunct() with default C locale
  231. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & (mUCD_PUNCTUATION | mUCD_SYMBOL)) != 0;
  232. }
  233. inline static bool Melder_isControl (char32 kar) {
  234. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) != 0;
  235. }
  236. inline static bool Melder_isAsciiControl (char32 kar) { // same as std::iscntrl() with default C locale
  237. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) != 0;
  238. }
  239. inline static bool Melder_isPrintable (char32 kar) {
  240. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) == 0;
  241. }
  242. inline static bool Melder_isAsciiPrintable (char32 kar) { // same as std::isprint() with default C locale
  243. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) == 0;
  244. }
  245. inline static bool Melder_hasInk (char32 kar) {
  246. return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & (mUCD_CONTROL | mUCD_SEPARATOR)) == 0;
  247. }
  248. inline static bool Melder_hasAsciiInk (char32 kar) { // same as std::isgraph() with default C locale
  249. return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & (mUCD_CONTROL | mUCD_SEPARATOR)) == 0;
  250. }
  251. /*
  252. Internationalize std::toupper () and std::tolower ():
  253. */
  254. inline static char32 Melder_toUpperCase (char32 kar) {
  255. return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. upperCase : kar;
  256. }
  257. inline static char32 Melder_toLowerCase (char32 kar) {
  258. return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. lowerCase : kar;
  259. }
  260. inline static char32 Melder_toTitleCase (char32 kar) {
  261. return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. titleCase : kar;
  262. }
  263. /*
  264. Search functions instantiating strspn() but much faster (CHECK).
  265. */
  266. inline static const char32 * Melder_findInk (conststring32 str) noexcept {
  267. if (! str)
  268. return nullptr;
  269. const char32 *p = & str [0];
  270. for (; ! Melder_hasInk (*p); p ++) {
  271. if (*p == U'\0')
  272. return nullptr; // not found
  273. }
  274. return p;
  275. }
  276. /* End of file melder_kar.h */
  277. #endif