ucd.h 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070
  1. /* Unicode Character Database API
  2. *
  3. * Copyright (C) 2012-2017 Reece H. Dunn
  4. *
  5. * This file is part of ucd-tools.
  6. *
  7. * ucd-tools is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, either version 3 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * ucd-tools is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with ucd-tools. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #ifndef UNICODE_CHARACTER_DATA_H
  21. #define UNICODE_CHARACTER_DATA_H
  22. #include <stdint.h>
  23. #ifdef __cplusplus
  24. extern "C"
  25. {
  26. #endif
  27. /** @brief Represents a Unicode codepoint.
  28. */
  29. typedef uint32_t codepoint_t;
  30. /** @brief Unicode General Category Groups
  31. * @see http://www.unicode.org/reports/tr44/
  32. */
  33. typedef enum ucd_category_group_
  34. {
  35. UCD_CATEGORY_GROUP_C, /**< @brief Other */
  36. UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
  37. UCD_CATEGORY_GROUP_L, /**< @brief Letter */
  38. UCD_CATEGORY_GROUP_M, /**< @brief Mark */
  39. UCD_CATEGORY_GROUP_N, /**< @brief Number */
  40. UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
  41. UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
  42. UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
  43. } ucd_category_group;
  44. /** @brief Get a string representation of the category_group enumeration value.
  45. *
  46. * @param c The value to get the string representation for.
  47. *
  48. * @return The string representation, or "-" if the value is not recognized.
  49. */
  50. const char *ucd_get_category_group_string(ucd_category_group c);
  51. /** @brief Unicode General Category Values
  52. * @see http://www.unicode.org/reports/tr44/
  53. */
  54. typedef enum ucd_category_
  55. {
  56. UCD_CATEGORY_Cc, /**< @brief Control Character */
  57. UCD_CATEGORY_Cf, /**< @brief Format Control Character */
  58. UCD_CATEGORY_Cn, /**< @brief Unassigned */
  59. UCD_CATEGORY_Co, /**< @brief Private Use */
  60. UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
  61. UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
  62. UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
  63. UCD_CATEGORY_Lm, /**< @brief Letter Modifier */
  64. UCD_CATEGORY_Lo, /**< @brief Other Letter */
  65. UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
  66. UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
  67. UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
  68. UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
  69. UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
  70. UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
  71. UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
  72. UCD_CATEGORY_No, /**< @brief Other Number */
  73. UCD_CATEGORY_Pc, /**< @brief Connector */
  74. UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
  75. UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
  76. UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
  77. UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
  78. UCD_CATEGORY_Po, /**< @brief Other */
  79. UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
  80. UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
  81. UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
  82. UCD_CATEGORY_Sm, /**< @brief Math Symbol */
  83. UCD_CATEGORY_So, /**< @brief Other Symbol */
  84. UCD_CATEGORY_Zl, /**< @brief Line Separator */
  85. UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
  86. UCD_CATEGORY_Zs, /**< @brief Space Separator */
  87. } ucd_category;
  88. /** @brief Get a string representation of the category enumeration value.
  89. *
  90. * @param c The value to get the string representation for.
  91. *
  92. * @return The string representation, or "--" if the value is not recognized.
  93. */
  94. const char *ucd_get_category_string(ucd_category c);
  95. /** @brief Lookup the General Category Group for a General Category.
  96. *
  97. * @param c The General Category to lookup.
  98. * @return The General Category Group of the General Category.
  99. */
  100. ucd_category_group ucd_get_category_group_for_category(ucd_category c);
  101. /** @brief Lookup the General Category Group for a Unicode codepoint.
  102. *
  103. * @param c The Unicode codepoint to lookup.
  104. * @return The General Category Group of the Unicode codepoint.
  105. */
  106. ucd_category_group ucd_lookup_category_group(codepoint_t c);
  107. /** @brief Lookup the General Category for a Unicode codepoint.
  108. *
  109. * @param c The Unicode codepoint to lookup.
  110. * @return The General Category of the Unicode codepoint.
  111. */
  112. ucd_category ucd_lookup_category(codepoint_t c);
  113. /** @brief Unicode Script
  114. * @see http://www.iana.org/assignments/language-subtag-registry
  115. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  116. */
  117. typedef enum ucd_script_
  118. {
  119. UCD_SCRIPT_Adlm, /**< @brief Adlam Script */
  120. UCD_SCRIPT_Afak, /**< @brief Afaka Script */
  121. UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
  122. UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
  123. UCD_SCRIPT_Arab, /**< @brief Arabic Script */
  124. UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
  125. UCD_SCRIPT_Armn, /**< @brief Armenian Script */
  126. UCD_SCRIPT_Avst, /**< @brief Avestan Script */
  127. UCD_SCRIPT_Bali, /**< @brief Balinese Script */
  128. UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
  129. UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
  130. UCD_SCRIPT_Batk, /**< @brief Batak Script */
  131. UCD_SCRIPT_Beng, /**< @brief Bengali Script */
  132. UCD_SCRIPT_Bhks, /**< @brief Bhaiksuki Script */
  133. UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
  134. UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
  135. UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
  136. UCD_SCRIPT_Brai, /**< @brief Braille Script */
  137. UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
  138. UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
  139. UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
  140. UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  141. UCD_SCRIPT_Cari, /**< @brief Carian Script */
  142. UCD_SCRIPT_Cham, /**< @brief Cham Script */
  143. UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
  144. UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
  145. UCD_SCRIPT_Copt, /**< @brief Coptic Script */
  146. UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
  147. UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
  148. UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
  149. UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
  150. UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
  151. UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
  152. UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
  153. UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
  154. UCD_SCRIPT_Egyp, /**< @brief Egyptian Hiegoglyphs */
  155. UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
  156. UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
  157. UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
  158. UCD_SCRIPT_Geor, /**< @brief Geirgian Script */
  159. UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
  160. UCD_SCRIPT_Gonm, /**< @brief Masaram Gondi */
  161. UCD_SCRIPT_Goth, /**< @brief Gothic Script */
  162. UCD_SCRIPT_Gran, /**< @brief Grantha Script */
  163. UCD_SCRIPT_Grek, /**< @brief Greek Script */
  164. UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
  165. UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
  166. UCD_SCRIPT_Hang, /**< @brief Hangul Script */
  167. UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
  168. UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
  169. UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
  170. UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
  171. UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
  172. UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
  173. UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
  174. UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
  175. UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
  176. UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
  177. UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
  178. UCD_SCRIPT_Inds, /**< @brief Indus Script */
  179. UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
  180. UCD_SCRIPT_Java, /**< @brief Javanese Script */
  181. UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
  182. UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
  183. UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
  184. UCD_SCRIPT_Kana, /**< @brief Katakana Script */
  185. UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
  186. UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
  187. UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
  188. UCD_SCRIPT_Knda, /**< @brief Kannada Script */
  189. UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
  190. UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
  191. UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
  192. UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
  193. UCD_SCRIPT_Laoo, /**< @brief Lao Script */
  194. UCD_SCRIPT_Latf, /**< @brief Latin Script (Fractur Variant) */
  195. UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
  196. UCD_SCRIPT_Latn, /**< @brief Latin Script */
  197. UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
  198. UCD_SCRIPT_Limb, /**< @brief Limbu Script */
  199. UCD_SCRIPT_Lina, /**< @brief Linear A Script */
  200. UCD_SCRIPT_Linb, /**< @brief Linear B Script */
  201. UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
  202. UCD_SCRIPT_Loma, /**< @brief Loma Script */
  203. UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
  204. UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
  205. UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
  206. UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
  207. UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
  208. UCD_SCRIPT_Marc, /**< @brief Marchen Script */
  209. UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
  210. UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
  211. UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
  212. UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
  213. UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
  214. UCD_SCRIPT_Modi, /**< @brief Modi Script */
  215. UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
  216. UCD_SCRIPT_Moon, /**< @brief Moon Script */
  217. UCD_SCRIPT_Mroo, /**< @brief Mro Script */
  218. UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
  219. UCD_SCRIPT_Mult, /**< @brief Multani Script */
  220. UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
  221. UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
  222. UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
  223. UCD_SCRIPT_Newa, /**< @brief Newa Script */
  224. UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
  225. UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
  226. UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
  227. UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
  228. UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
  229. UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
  230. UCD_SCRIPT_Orya, /**< @brief Oriya Script */
  231. UCD_SCRIPT_Osge, /**< @brief Osage Script */
  232. UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
  233. UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
  234. UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
  235. UCD_SCRIPT_Perm, /**< @brief Old Permic */
  236. UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
  237. UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
  238. UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
  239. UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
  240. UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
  241. UCD_SCRIPT_Plrd, /**< @brief Miao Script */
  242. UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
  243. UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
  244. UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
  245. UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
  246. UCD_SCRIPT_Runr, /**< @brief Runic Script */
  247. UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
  248. UCD_SCRIPT_Sara, /**< @brief Sarati Script */
  249. UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
  250. UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
  251. UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
  252. UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
  253. UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
  254. UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
  255. UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
  256. UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
  257. UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
  258. UCD_SCRIPT_Soyo, /**< @brief Soyombo */
  259. UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
  260. UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
  261. UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
  262. UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
  263. UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
  264. UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
  265. UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
  266. UCD_SCRIPT_Takr, /**< @brief Takri Script */
  267. UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
  268. UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
  269. UCD_SCRIPT_Taml, /**< @brief Tamil Script */
  270. UCD_SCRIPT_Tang, /**< @brief Tangut Script */
  271. UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
  272. UCD_SCRIPT_Telu, /**< @brief Telugu Script */
  273. UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
  274. UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
  275. UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
  276. UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
  277. UCD_SCRIPT_Thai, /**< @brief Thai Script */
  278. UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
  279. UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
  280. UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
  281. UCD_SCRIPT_Vaii, /**< @brief Vai Script */
  282. UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
  283. UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
  284. UCD_SCRIPT_Wole, /**< @brief Woleai Script */
  285. UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
  286. UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
  287. UCD_SCRIPT_Yiii, /**< @brief Yi Script */
  288. UCD_SCRIPT_Zanb, /**< @brief Zanabazar Square */
  289. UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
  290. UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
  291. UCD_SCRIPT_Zsym, /**< @brief Symbols */
  292. UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
  293. UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
  294. UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
  295. } ucd_script;
  296. /** @brief Get a string representation of the script enumeration value.
  297. *
  298. * @param s The value to get the string representation for.
  299. *
  300. * @return The string representation, or "----" if the value is not recognized.
  301. */
  302. const char *ucd_get_script_string(ucd_script s);
  303. /** @brief Lookup the Script for a Unicode codepoint.
  304. *
  305. * @param c The Unicode codepoint to lookup.
  306. * @return The Script of the Unicode codepoint.
  307. */
  308. ucd_script ucd_lookup_script(codepoint_t c);
  309. /** @brief Properties
  310. */
  311. typedef uint64_t ucd_property;
  312. #define UCD_PROPERTY_WHITE_SPACE 0x0000000000000001ull /**< @brief White_Space */
  313. #define UCD_PROPERTY_BIDI_CONTROL 0x0000000000000002ull /**< @brief Bidi_Control */
  314. #define UCD_PROPERTY_JOIN_CONTROL 0x0000000000000004ull /**< @brief Join_Control */
  315. #define UCD_PROPERTY_DASH 0x0000000000000008ull /**< @brief Dash */
  316. #define UCD_PROPERTY_HYPHEN 0x0000000000000010ull /**< @brief Hyphen */
  317. #define UCD_PROPERTY_QUOTATION_MARK 0x0000000000000020ull /**< @brief Quotation_Mark */
  318. #define UCD_PROPERTY_TERMINAL_PUNCTUATION 0x0000000000000040ull /**< @brief Terminal_Punctuation */
  319. #define UCD_PROPERTY_OTHER_MATH 0x0000000000000080ull /**< @brief Other_Math */
  320. #define UCD_PROPERTY_HEX_DIGIT 0x0000000000000100ull /**< @brief Hex_Digit */
  321. #define UCD_PROPERTY_ASCII_HEX_DIGIT 0x0000000000000200ull /**< @brief ASCII_Hex_Digit */
  322. #define UCD_PROPERTY_OTHER_ALPHABETIC 0x0000000000000400ull /**< @brief Other_Alphabetic */
  323. #define UCD_PROPERTY_IDEOGRAPHIC 0x0000000000000800ull /**< @brief Ideographic */
  324. #define UCD_PROPERTY_DIACRITIC 0x0000000000001000ull /**< @brief Diacritic */
  325. #define UCD_PROPERTY_EXTENDER 0x0000000000002000ull /**< @brief Extender */
  326. #define UCD_PROPERTY_OTHER_LOWERCASE 0x0000000000004000ull /**< @brief Other_Lowercase */
  327. #define UCD_PROPERTY_OTHER_UPPERCASE 0x0000000000008000ull /**< @brief Other_Uppercase */
  328. #define UCD_PROPERTY_NONCHARACTER_CODE_POINT 0x0000000000010000ull /**< @brief Noncharacter_Code_Point */
  329. #define UCD_PROPERTY_OTHER_GRAPHEME_EXTEND 0x0000000000020000ull /**< @brief Other_Grapheme_Extend */
  330. #define UCD_PROPERTY_IDS_BINARY_OPERATOR 0x0000000000040000ull /**< @brief IDS_Binary_Operator */
  331. #define UCD_PROPERTY_IDS_TRINARY_OPERATOR 0x0000000000080000ull /**< @brief IDS_Trinary_Operator */
  332. #define UCD_PROPERTY_RADICAL 0x0000000000100000ull /**< @brief Radical */
  333. #define UCD_PROPERTY_UNIFIED_IDEOGRAPH 0x0000000000200000ull /**< @brief Unified_Ideograph */
  334. #define UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT 0x0000000000400000ull /**< @brief Other_Default_Ignorable_Code_Point */
  335. #define UCD_PROPERTY_DEPRECATED 0x0000000000800000ull /**< @brief Deprecated */
  336. #define UCD_PROPERTY_SOFT_DOTTED 0x0000000001000000ull /**< @brief Soft_Dotted */
  337. #define UCD_PROPERTY_LOGICAL_ORDER_EXCEPTION 0x0000000002000000ull /**< @brief Logical_Order_Exception */
  338. #define UCD_PROPERTY_OTHER_ID_START 0x0000000004000000ull /**< @brief Other_ID_Start */
  339. #define UCD_PROPERTY_OTHER_ID_CONTINUE 0x0000000008000000ull /**< @brief Other_ID_Continue */
  340. #define UCD_PROPERTY_SENTENCE_TERMINAL 0x0000000010000000ull /**< @brief Sentence_Terminal */
  341. #define UCD_PROPERTY_VARIATION_SELECTOR 0x0000000020000000ull /**< @brief Variation_Selector */
  342. #define UCD_PROPERTY_PATTERN_WHITE_SPACE 0x0000000040000000ull /**< @brief Pattern_White_Space */
  343. #define UCD_PROPERTY_PATTERN_SYNTAX 0x0000000080000000ull /**< @brief Pattern_Syntax */
  344. #define UCD_PROPERTY_PREPENDED_CONCATENATION_MARK 0x0000000100000000ull /**< @brief Prepended_Concatenation_Mark */
  345. #define UCD_PROPERTY_EMOJI 0x0000000200000000ull /**< @brief Emoji */
  346. #define UCD_PROPERTY_EMOJI_PRESENTATION 0x0000000400000000ull /**< @brief Emoji_Presentation */
  347. #define UCD_PROPERTY_EMOJI_MODIFIER 0x0000000800000000ull /**< @brief Emoji_Modifier */
  348. #define UCD_PROPERTY_EMOJI_MODIFIER_BASE 0x0000001000000000ull /**< @brief Emoji_Modifier_Base */
  349. #define UCD_PROPERTY_REGIONAL_INDICATOR 0x0000002000000000ull /**< @brief Regional_Indicator */
  350. #define UCD_PROPERTY_EMOJI_COMPONENT 0x0000004000000000ull /**< @brief Emoji_Component */
  351. // eSpeak NG extended properties:
  352. #define ESPEAKNG_PROPERTY_INVERTED_TERMINAL_PUNCTUATION 0x0010000000000000ull /**< @brief Inverted_Terminal_Punctuation */
  353. #define ESPEAKNG_PROPERTY_PUNCTUATION_IN_WORD 0x0020000000000000ull /**< @brief Punctuation_In_Word */
  354. #define ESPEAKNG_PROPERTY_OPTIONAL_SPACE_AFTER 0x0040000000000000ull /**< @brief Optional_Space_After */
  355. #define ESPEAKNG_PROPERTY_EXTENDED_DASH 0x0080000000000000ull /**< @brief Extended_Dash */
  356. #define ESPEAKNG_PROPERTY_PARAGRAPH_SEPARATOR 0x0100000000000000ull /**< @brief Paragraph_Separator */
  357. #define ESPEAKNG_PROPERTY_ELLIPSIS 0x0200000000000000ull /**< @brief Ellipsis */
  358. #define ESPEAKNG_PROPERTY_SEMI_COLON 0x0400000000000000ull /**< @brief Semi_Colon */
  359. #define ESPEAKNG_PROPERTY_COLON 0x0800000000000000ull /**< @brief Colon */
  360. #define ESPEAKNG_PROPERTY_COMMA 0x1000000000000000ull /**< @brief Comma */
  361. #define ESPEAKNG_PROPERTY_EXCLAMATION_MARK 0x2000000000000000ull /**< @brief Exclamation_Mark */
  362. #define ESPEAKNG_PROPERTY_QUESTION_MARK 0x4000000000000000ull /**< @brief Question_Mark */
  363. #define ESPEAKNG_PROPERTY_FULL_STOP 0x8000000000000000ull /**< @brief Full_Stop */
  364. /** @brief Return the properties of the specified codepoint.
  365. *
  366. * @param c The Unicode codepoint to lookup.
  367. * @param category The General Category of the codepoint.
  368. * @return The properties associated with the codepoint.
  369. */
  370. ucd_property ucd_properties(codepoint_t c, ucd_category category);
  371. /** @brief Is the codepoint in the 'alnum' class?
  372. *
  373. * @param c The Unicode codepoint to check.
  374. * @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
  375. */
  376. int ucd_isalnum(codepoint_t c);
  377. /** @brief Is the codepoint in the 'alpha' class?
  378. *
  379. * @param c The Unicode codepoint to check.
  380. * @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
  381. */
  382. int ucd_isalpha(codepoint_t c);
  383. /** @brief Is the codepoint in the 'blank' class?
  384. *
  385. * @param c The Unicode codepoint to check.
  386. * @return Non-zero if the codepoint is in the 'blank' class, zero otherwise.
  387. */
  388. int ucd_isblank(codepoint_t c);
  389. /** @brief Is the codepoint in the 'cntrl' class?
  390. *
  391. * @param c The Unicode codepoint to check.
  392. * @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
  393. */
  394. int ucd_iscntrl(codepoint_t c);
  395. /** @brief Is the codepoint in the 'digit' class?
  396. *
  397. * @param c The Unicode codepoint to check.
  398. * @return Non-zero if the codepoint is in the 'digit' class, zero otherwise.
  399. */
  400. int ucd_isdigit(codepoint_t c);
  401. /** @brief Is the codepoint in the 'graph' class?
  402. *
  403. * @param c The Unicode codepoint to check.
  404. * @return Non-zero if the codepoint is in the 'graph' class, zero otherwise.
  405. */
  406. int ucd_isgraph(codepoint_t c);
  407. /** @brief Is the codepoint in the 'lower' class?
  408. *
  409. * @param c The Unicode codepoint to check.
  410. * @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
  411. */
  412. int ucd_islower(codepoint_t c);
  413. /** @brief Is the codepoint in the 'print' class?
  414. *
  415. * @param c The Unicode codepoint to check.
  416. * @return Non-zero if the codepoint is in the 'print' class, zero otherwise.
  417. */
  418. int ucd_isprint(codepoint_t c);
  419. /** @brief Is the codepoint in the 'punct' class?
  420. *
  421. * @param c The Unicode codepoint to check.
  422. * @return Non-zero if the codepoint is in the 'punct' class, zero otherwise.
  423. */
  424. int ucd_ispunct(codepoint_t c);
  425. /** @brief Is the codepoint in the 'space' class?
  426. *
  427. * @param c The Unicode codepoint to check.
  428. * @return Non-zero if the codepoint is in the 'space' class, zero otherwise.
  429. */
  430. int ucd_isspace(codepoint_t c);
  431. /** @brief Is the codepoint in the 'upper' class?
  432. *
  433. * @param c The Unicode codepoint to check.
  434. * @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
  435. */
  436. int ucd_isupper(codepoint_t c);
  437. /** @brief Is the codepoint in the 'xdigit' class?
  438. *
  439. * @param c The Unicode codepoint to check.
  440. * @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
  441. */
  442. int ucd_isxdigit(codepoint_t c);
  443. /** @brief Convert the Unicode codepoint to upper-case.
  444. *
  445. * This function only uses the simple case mapping present in the
  446. * UnicodeData file. The data in SpecialCasing requires Unicode
  447. * codepoints to be mapped to multiple codepoints.
  448. *
  449. * @param c The Unicode codepoint to convert.
  450. * @return The upper-case Unicode codepoint for this codepoint, or
  451. * this codepoint if there is no upper-case codepoint.
  452. */
  453. codepoint_t ucd_toupper(codepoint_t c);
  454. /** @brief Convert the Unicode codepoint to lower-case.
  455. *
  456. * This function only uses the simple case mapping present in the
  457. * UnicodeData file. The data in SpecialCasing requires Unicode
  458. * codepoints to be mapped to multiple codepoints.
  459. *
  460. * @param c The Unicode codepoint to convert.
  461. * @return The lower-case Unicode codepoint for this codepoint, or
  462. * this codepoint if there is no upper-case codepoint.
  463. */
  464. codepoint_t ucd_tolower(codepoint_t c);
  465. /** @brief Convert the Unicode codepoint to title-case.
  466. *
  467. * This function only uses the simple case mapping present in the
  468. * UnicodeData file. The data in SpecialCasing requires Unicode
  469. * codepoints to be mapped to multiple codepoints.
  470. *
  471. * @param c The Unicode codepoint to convert.
  472. * @return The title-case Unicode codepoint for this codepoint, or
  473. * this codepoint if there is no upper-case codepoint.
  474. */
  475. codepoint_t ucd_totitle(codepoint_t c);
  476. #ifdef __cplusplus
  477. }
  478. /** @brief Unicode Character Database
  479. */
  480. namespace ucd
  481. {
  482. /** @brief Represents a Unicode codepoint.
  483. */
  484. using ::codepoint_t;
  485. /** @brief Unicode General Category Groups
  486. * @see http://www.unicode.org/reports/tr44/
  487. */
  488. enum category_group
  489. {
  490. C = UCD_CATEGORY_GROUP_C, /**< @brief Other */
  491. I = UCD_CATEGORY_GROUP_I, /**< @brief Invalid */
  492. L = UCD_CATEGORY_GROUP_L, /**< @brief Letter */
  493. M = UCD_CATEGORY_GROUP_M, /**< @brief Mark */
  494. N = UCD_CATEGORY_GROUP_N, /**< @brief Number */
  495. P = UCD_CATEGORY_GROUP_P, /**< @brief Punctuation */
  496. S = UCD_CATEGORY_GROUP_S, /**< @brief Symbol */
  497. Z = UCD_CATEGORY_GROUP_Z, /**< @brief Separator */
  498. };
  499. /** @brief Get a string representation of the category_group enumeration value.
  500. *
  501. * @param c The value to get the string representation for.
  502. *
  503. * @return The string representation, or "-" if the value is not recognized.
  504. */
  505. inline const char *get_category_group_string(category_group c)
  506. {
  507. return ucd_get_category_group_string((ucd_category_group)c);
  508. }
  509. /** @brief Unicode General Category Values
  510. * @see http://www.unicode.org/reports/tr44/
  511. */
  512. enum category
  513. {
  514. Cc = UCD_CATEGORY_Cc, /**< @brief Control Character */
  515. Cf = UCD_CATEGORY_Cf, /**< @brief Format Control Character */
  516. Cn = UCD_CATEGORY_Cn, /**< @brief Unassigned */
  517. Co = UCD_CATEGORY_Co, /**< @brief Private Use */
  518. Cs = UCD_CATEGORY_Cs, /**< @brief Surrogate Code Point */
  519. Ii = UCD_CATEGORY_Ii, /**< @brief Invalid Unicode Codepoint */
  520. Ll = UCD_CATEGORY_Ll, /**< @brief Lower Case Letter */
  521. Lm = UCD_CATEGORY_Lm, /**< @brief Letter Modifier */
  522. Lo = UCD_CATEGORY_Lo, /**< @brief Other Letter */
  523. Lt = UCD_CATEGORY_Lt, /**< @brief Title Case Letter */
  524. Lu = UCD_CATEGORY_Lu, /**< @brief Upper Case Letter */
  525. Mc = UCD_CATEGORY_Mc, /**< @brief Spacing Mark */
  526. Me = UCD_CATEGORY_Me, /**< @brief Enclosing Mark */
  527. Mn = UCD_CATEGORY_Mn, /**< @brief Non-Spacing Mark */
  528. Nd = UCD_CATEGORY_Nd, /**< @brief Decimal Digit */
  529. Nl = UCD_CATEGORY_Nl, /**< @brief Letter-Like Number */
  530. No = UCD_CATEGORY_No, /**< @brief Other Number */
  531. Pc = UCD_CATEGORY_Pc, /**< @brief Connector */
  532. Pd = UCD_CATEGORY_Pd, /**< @brief Dash/Hyphen */
  533. Pe = UCD_CATEGORY_Pe, /**< @brief Close Punctuation Mark */
  534. Pf = UCD_CATEGORY_Pf, /**< @brief Final Quotation Mark */
  535. Pi = UCD_CATEGORY_Pi, /**< @brief Initial Quotation Mark */
  536. Po = UCD_CATEGORY_Po, /**< @brief Other */
  537. Ps = UCD_CATEGORY_Ps, /**< @brief Open Punctuation Mark */
  538. Sc = UCD_CATEGORY_Sc, /**< @brief Currency Symbol */
  539. Sk = UCD_CATEGORY_Sk, /**< @brief Modifier Symbol */
  540. Sm = UCD_CATEGORY_Sm, /**< @brief Math Symbol */
  541. So = UCD_CATEGORY_So, /**< @brief Other Symbol */
  542. Zl = UCD_CATEGORY_Zl, /**< @brief Line Separator */
  543. Zp = UCD_CATEGORY_Zp, /**< @brief Paragraph Separator */
  544. Zs = UCD_CATEGORY_Zs, /**< @brief Space Separator */
  545. };
  546. /** @brief Get a string representation of the category enumeration value.
  547. *
  548. * @param c The value to get the string representation for.
  549. *
  550. * @return The string representation, or "--" if the value is not recognized.
  551. */
  552. inline const char *get_category_string(category c)
  553. {
  554. return ucd_get_category_string((ucd_category)c);
  555. }
  556. /** @brief Lookup the General Category Group for a General Category.
  557. *
  558. * @param c The General Category to lookup.
  559. * @return The General Category Group of the General Category.
  560. */
  561. inline category_group lookup_category_group(category c)
  562. {
  563. return (category_group)ucd_get_category_group_for_category((ucd_category)c);
  564. }
  565. /** @brief Lookup the General Category Group for a Unicode codepoint.
  566. *
  567. * @param c The Unicode codepoint to lookup.
  568. * @return The General Category Group of the Unicode codepoint.
  569. */
  570. inline category_group lookup_category_group(codepoint_t c)
  571. {
  572. return (category_group)ucd_lookup_category_group(c);
  573. }
  574. /** @brief Lookup the General Category for a Unicode codepoint.
  575. *
  576. * @param c The Unicode codepoint to lookup.
  577. * @return The General Category of the Unicode codepoint.
  578. */
  579. inline category lookup_category(codepoint_t c)
  580. {
  581. return (category)ucd_lookup_category(c);
  582. }
  583. /** @brief Unicode Script
  584. * @see http://www.iana.org/assignments/language-subtag-registry
  585. * @see http://www.unicode.org/iso15924/iso15924-codes.html
  586. */
  587. enum script
  588. {
  589. Adlm = UCD_SCRIPT_Adlm, /**< @brief Adlam Script */
  590. Afak = UCD_SCRIPT_Afak, /**< @brief Afaka Script */
  591. Aghb = UCD_SCRIPT_Aghb, /**< @brief Caucasian Albanian Script */
  592. Ahom = UCD_SCRIPT_Ahom, /**< @brief Tai Ahom Script */
  593. Arab = UCD_SCRIPT_Arab, /**< @brief Arabic Script */
  594. Armi = UCD_SCRIPT_Armi, /**< @brief Imperial Aramaic Script */
  595. Armn = UCD_SCRIPT_Armn, /**< @brief Armenian Script */
  596. Avst = UCD_SCRIPT_Avst, /**< @brief Avestan Script */
  597. Bali = UCD_SCRIPT_Bali, /**< @brief Balinese Script */
  598. Bamu = UCD_SCRIPT_Bamu, /**< @brief Bamum Script */
  599. Bass = UCD_SCRIPT_Bass, /**< @brief Bassa Vah Script */
  600. Batk = UCD_SCRIPT_Batk, /**< @brief Batak Script */
  601. Beng = UCD_SCRIPT_Beng, /**< @brief Bengali Script */
  602. Bhks = UCD_SCRIPT_Bhks, /**< @brief Bhaiksuki Script */
  603. Blis = UCD_SCRIPT_Blis, /**< @brief Blissymbols Script */
  604. Bopo = UCD_SCRIPT_Bopo, /**< @brief Bopomofo Script */
  605. Brah = UCD_SCRIPT_Brah, /**< @brief Brahmi Script */
  606. Brai = UCD_SCRIPT_Brai, /**< @brief Braille Script */
  607. Bugi = UCD_SCRIPT_Bugi, /**< @brief Buginese Script */
  608. Buhd = UCD_SCRIPT_Buhd, /**< @brief Buhid Script */
  609. Cakm = UCD_SCRIPT_Cakm, /**< @brief Chakma Script */
  610. Cans = UCD_SCRIPT_Cans, /**< @brief Unified Canadian Aboriginal Syllabics */
  611. Cari = UCD_SCRIPT_Cari, /**< @brief Carian Script */
  612. Cham = UCD_SCRIPT_Cham, /**< @brief Cham Script */
  613. Cher = UCD_SCRIPT_Cher, /**< @brief Cherokee Script */
  614. Cirt = UCD_SCRIPT_Cirt, /**< @brief Cirth Script */
  615. Copt = UCD_SCRIPT_Copt, /**< @brief Coptic Script */
  616. Cprt = UCD_SCRIPT_Cprt, /**< @brief Cypriot Script */
  617. Cyrl = UCD_SCRIPT_Cyrl, /**< @brief Cyrillic Script */
  618. Cyrs = UCD_SCRIPT_Cyrs, /**< @brief Cyrillic (Old Church Slavonic variant) Script */
  619. Deva = UCD_SCRIPT_Deva, /**< @brief Devanagari Script */
  620. Dsrt = UCD_SCRIPT_Dsrt, /**< @brief Deseret Script */
  621. Dupl = UCD_SCRIPT_Dupl, /**< @brief Duployan Shorthand Script */
  622. Egyd = UCD_SCRIPT_Egyd, /**< @brief Egyptian Demotic Script */
  623. Egyh = UCD_SCRIPT_Egyh, /**< @brief Egyptian Hieratic Script */
  624. Egyp = UCD_SCRIPT_Egyp, /**< @brief Egyptian Hiegoglyphs */
  625. Elba = UCD_SCRIPT_Elba, /**< @brief Elbasan Script */
  626. Ethi = UCD_SCRIPT_Ethi, /**< @brief Ethiopic Script */
  627. Geok = UCD_SCRIPT_Geok, /**< @brief Khutsuri Script */
  628. Geor = UCD_SCRIPT_Geor, /**< @brief Geirgian Script */
  629. Glag = UCD_SCRIPT_Glag, /**< @brief Glagolitic Script */
  630. Gonm = UCD_SCRIPT_Gonm, /**< @brief Masaram Gondi */
  631. Goth = UCD_SCRIPT_Goth, /**< @brief Gothic Script */
  632. Gran = UCD_SCRIPT_Gran, /**< @brief Grantha Script */
  633. Grek = UCD_SCRIPT_Grek, /**< @brief Greek Script */
  634. Gujr = UCD_SCRIPT_Gujr, /**< @brief Gujarati Script */
  635. Guru = UCD_SCRIPT_Guru, /**< @brief Gurmukhi Script */
  636. Hang = UCD_SCRIPT_Hang, /**< @brief Hangul Script */
  637. Hani = UCD_SCRIPT_Hani, /**< @brief Han (Hanzi, Kanji, Hanja) Script */
  638. Hano = UCD_SCRIPT_Hano, /**< @brief Hanunoo Script */
  639. Hans = UCD_SCRIPT_Hans, /**< @brief Han (Simplified) Script */
  640. Hant = UCD_SCRIPT_Hant, /**< @brief Han (Traditional) Script */
  641. Hatr = UCD_SCRIPT_Hatr, /**< @brief Hatran Script */
  642. Hebr = UCD_SCRIPT_Hebr, /**< @brief Hebrew Script */
  643. Hira = UCD_SCRIPT_Hira, /**< @brief Hiragana Script */
  644. Hluw = UCD_SCRIPT_Hluw, /**< @brief Anatolian Hieroglyphs */
  645. Hmng = UCD_SCRIPT_Hmng, /**< @brief Pahawh Hmong Script */
  646. Hrkt = UCD_SCRIPT_Hrkt, /**< @brief Japanese Syllabaries */
  647. Hung = UCD_SCRIPT_Hung, /**< @brief Old Hungarian Script */
  648. Inds = UCD_SCRIPT_Inds, /**< @brief Indus Script */
  649. Ital = UCD_SCRIPT_Ital, /**< @brief Old Italic Script */
  650. Java = UCD_SCRIPT_Java, /**< @brief Javanese Script */
  651. Jpan = UCD_SCRIPT_Jpan, /**< @brief Japanese (Han + Hiragana + Katakana) Scripts */
  652. Jurc = UCD_SCRIPT_Jurc, /**< @brief Jurchen Script */
  653. Kali = UCD_SCRIPT_Kali, /**< @brief Kayah Li Script */
  654. Kana = UCD_SCRIPT_Kana, /**< @brief Katakana Script */
  655. Khar = UCD_SCRIPT_Khar, /**< @brief Kharoshthi Script */
  656. Khmr = UCD_SCRIPT_Khmr, /**< @brief Khmer Script */
  657. Khoj = UCD_SCRIPT_Khoj, /**< @brief Khojki Script */
  658. Knda = UCD_SCRIPT_Knda, /**< @brief Kannada Script */
  659. Kore = UCD_SCRIPT_Kore, /**< @brief Korean (Hangul + Han) Scripts */
  660. Kpel = UCD_SCRIPT_Kpel, /**< @brief Kpelle Script */
  661. Kthi = UCD_SCRIPT_Kthi, /**< @brief Kaithi Script */
  662. Lana = UCD_SCRIPT_Lana, /**< @brief Tai Tham Script */
  663. Laoo = UCD_SCRIPT_Laoo, /**< @brief Lao Script */
  664. Latf = UCD_SCRIPT_Latf, /**< @brief Latin Script (Fractur Variant) */
  665. Latg = UCD_SCRIPT_Latg, /**< @brief Latin Script (Gaelic Variant) */
  666. Latn = UCD_SCRIPT_Latn, /**< @brief Latin Script */
  667. Lepc = UCD_SCRIPT_Lepc, /**< @brief Lepcha Script */
  668. Limb = UCD_SCRIPT_Limb, /**< @brief Limbu Script */
  669. Lina = UCD_SCRIPT_Lina, /**< @brief Linear A Script */
  670. Linb = UCD_SCRIPT_Linb, /**< @brief Linear B Script */
  671. Lisu = UCD_SCRIPT_Lisu, /**< @brief Lisu Script */
  672. Loma = UCD_SCRIPT_Loma, /**< @brief Loma Script */
  673. Lyci = UCD_SCRIPT_Lyci, /**< @brief Lycian Script */
  674. Lydi = UCD_SCRIPT_Lydi, /**< @brief Lydian Script */
  675. Mahj = UCD_SCRIPT_Mahj, /**< @brief Mahajani Script */
  676. Mand = UCD_SCRIPT_Mand, /**< @brief Mandaic Script */
  677. Mani = UCD_SCRIPT_Mani, /**< @brief Manichaean Script */
  678. Marc = UCD_SCRIPT_Marc, /**< @brief Marchen Script */
  679. Maya = UCD_SCRIPT_Maya, /**< @brief Mayan Hieroglyphs */
  680. Mend = UCD_SCRIPT_Mend, /**< @brief Mende Kikakui Script */
  681. Merc = UCD_SCRIPT_Merc, /**< @brief Meroitic Cursive Script */
  682. Mero = UCD_SCRIPT_Mero, /**< @brief Meroitic Hieroglyphs */
  683. Mlym = UCD_SCRIPT_Mlym, /**< @brief Malayalam Script */
  684. Modi = UCD_SCRIPT_Modi, /**< @brief Modi Script */
  685. Mong = UCD_SCRIPT_Mong, /**< @brief Mongolian Script */
  686. Moon = UCD_SCRIPT_Moon, /**< @brief Moon Script */
  687. Mroo = UCD_SCRIPT_Mroo, /**< @brief Mro Script */
  688. Mtei = UCD_SCRIPT_Mtei, /**< @brief Meitei Mayek Script */
  689. Mult = UCD_SCRIPT_Mult, /**< @brief Multani Script */
  690. Mymr = UCD_SCRIPT_Mymr, /**< @brief Myanmar (Burmese) Script */
  691. Narb = UCD_SCRIPT_Narb, /**< @brief Old North Arabian Script */
  692. Nbat = UCD_SCRIPT_Nbat, /**< @brief Nabataean Script */
  693. Newa = UCD_SCRIPT_Newa, /**< @brief Newa Script */
  694. Nkgb = UCD_SCRIPT_Nkgb, /**< @brief Nakhi Geba Script */
  695. Nkoo = UCD_SCRIPT_Nkoo, /**< @brief N'Ko Script */
  696. Nshu = UCD_SCRIPT_Nshu, /**< @brief Nushu Script */
  697. Ogam = UCD_SCRIPT_Ogam, /**< @brief Ogham Script */
  698. Olck = UCD_SCRIPT_Olck, /**< @brief Ol Chiki Script */
  699. Orkh = UCD_SCRIPT_Orkh, /**< @brief Old Turkic Script */
  700. Orya = UCD_SCRIPT_Orya, /**< @brief Oriya Script */
  701. Osge = UCD_SCRIPT_Osge, /**< @brief Osage Script */
  702. Osma = UCD_SCRIPT_Osma, /**< @brief Osmanya Script */
  703. Palm = UCD_SCRIPT_Palm, /**< @brief Palmyrene Script */
  704. Pauc = UCD_SCRIPT_Pauc, /**< @brief Pau Cin Hau Script */
  705. Perm = UCD_SCRIPT_Perm, /**< @brief Old Permic */
  706. Phag = UCD_SCRIPT_Phag, /**< @brief Phags-Pa Script */
  707. Phli = UCD_SCRIPT_Phli, /**< @brief Inscriptional Pahlavi Script */
  708. Phlp = UCD_SCRIPT_Phlp, /**< @brief Psalter Pahlavi Script */
  709. Phlv = UCD_SCRIPT_Phlv, /**< @brief Book Pahlavi Script */
  710. Phnx = UCD_SCRIPT_Phnx, /**< @brief Phoenician Script */
  711. Plrd = UCD_SCRIPT_Plrd, /**< @brief Miao Script */
  712. Prti = UCD_SCRIPT_Prti, /**< @brief Inscriptional Parthian Script */
  713. Qaak = UCD_SCRIPT_Qaak, /**< @brief Klingon Script (Private Use) */
  714. Rjng = UCD_SCRIPT_Rjng, /**< @brief Rejang Script */
  715. Roro = UCD_SCRIPT_Roro, /**< @brief Rongorongo Script */
  716. Runr = UCD_SCRIPT_Runr, /**< @brief Runic Script */
  717. Samr = UCD_SCRIPT_Samr, /**< @brief Samaritan Script */
  718. Sara = UCD_SCRIPT_Sara, /**< @brief Sarati Script */
  719. Sarb = UCD_SCRIPT_Sarb, /**< @brief Old South Arabian Script */
  720. Saur = UCD_SCRIPT_Saur, /**< @brief Saurashtra Script */
  721. Sgnw = UCD_SCRIPT_Sgnw, /**< @brief Sign Writing */
  722. Shaw = UCD_SCRIPT_Shaw, /**< @brief Shavian Script */
  723. Shrd = UCD_SCRIPT_Shrd, /**< @brief Sharada Script */
  724. Sidd = UCD_SCRIPT_Sidd, /**< @brief Siddham Script */
  725. Sind = UCD_SCRIPT_Sind, /**< @brief Sindhi Script */
  726. Sinh = UCD_SCRIPT_Sinh, /**< @brief Sinhala Script */
  727. Sora = UCD_SCRIPT_Sora, /**< @brief Sora Sompeng Script */
  728. Soyo = UCD_SCRIPT_Soyo, /**< @brief Soyombo */
  729. Sund = UCD_SCRIPT_Sund, /**< @brief Sundanese Script */
  730. Sylo = UCD_SCRIPT_Sylo, /**< @brief Syloti Nagri Script */
  731. Syrc = UCD_SCRIPT_Syrc, /**< @brief Syriac Script */
  732. Syre = UCD_SCRIPT_Syre, /**< @brief Syriac Script (Estrangelo Variant) */
  733. Syrj = UCD_SCRIPT_Syrj, /**< @brief Syriac Script (Western Variant) */
  734. Syrn = UCD_SCRIPT_Syrn, /**< @brief Syriac Script (Eastern Variant) */
  735. Tagb = UCD_SCRIPT_Tagb, /**< @brief Tagbanwa Script */
  736. Takr = UCD_SCRIPT_Takr, /**< @brief Takri Script */
  737. Tale = UCD_SCRIPT_Tale, /**< @brief Tai Le Script */
  738. Talu = UCD_SCRIPT_Talu, /**< @brief New Tai Lue Script */
  739. Taml = UCD_SCRIPT_Taml, /**< @brief Tamil Script */
  740. Tang = UCD_SCRIPT_Tang, /**< @brief Tangut Script */
  741. Tavt = UCD_SCRIPT_Tavt, /**< @brief Tai Viet Script */
  742. Telu = UCD_SCRIPT_Telu, /**< @brief Telugu Script */
  743. Teng = UCD_SCRIPT_Teng, /**< @brief Tengwar Script */
  744. Tfng = UCD_SCRIPT_Tfng, /**< @brief Tifinagh Script */
  745. Tglg = UCD_SCRIPT_Tglg, /**< @brief Tagalog Script */
  746. Thaa = UCD_SCRIPT_Thaa, /**< @brief Thaana Script */
  747. Thai = UCD_SCRIPT_Thai, /**< @brief Thai Script */
  748. Tibt = UCD_SCRIPT_Tibt, /**< @brief Tibetan Script */
  749. Tirh = UCD_SCRIPT_Tirh, /**< @brief Tirhuta Script */
  750. Ugar = UCD_SCRIPT_Ugar, /**< @brief Ugaritic Script */
  751. Vaii = UCD_SCRIPT_Vaii, /**< @brief Vai Script */
  752. Visp = UCD_SCRIPT_Visp, /**< @brief Visible Speech Script */
  753. Wara = UCD_SCRIPT_Wara, /**< @brief Warang Citi Script */
  754. Wole = UCD_SCRIPT_Wole, /**< @brief Woleai Script */
  755. Xpeo = UCD_SCRIPT_Xpeo, /**< @brief Old Persian Script */
  756. Xsux = UCD_SCRIPT_Xsux, /**< @brief Cuneiform Script */
  757. Yiii = UCD_SCRIPT_Yiii, /**< @brief Yi Script */
  758. Zanb = UCD_SCRIPT_Zanb, /**< @brief Zanabazar Square */
  759. Zinh = UCD_SCRIPT_Zinh, /**< @brief Inherited Script */
  760. Zmth = UCD_SCRIPT_Zmth, /**< @brief Mathematical Notation */
  761. Zsym = UCD_SCRIPT_Zsym, /**< @brief Symbols */
  762. Zxxx = UCD_SCRIPT_Zxxx, /**< @brief Unwritten Documents */
  763. Zyyy = UCD_SCRIPT_Zyyy, /**< @brief Undetermined Script */
  764. Zzzz = UCD_SCRIPT_Zzzz, /**< @brief Uncoded Script */
  765. };
  766. /** @brief Get a string representation of the script enumeration value.
  767. *
  768. * @param s The value to get the string representation for.
  769. *
  770. * @return The string representation, or "----" if the value is not recognized.
  771. */
  772. inline const char *get_script_string(script s)
  773. {
  774. return ucd_get_script_string((ucd_script)s);
  775. }
  776. /** @brief Lookup the Script for a Unicode codepoint.
  777. *
  778. * @param c The Unicode codepoint to lookup.
  779. * @return The Script of the Unicode codepoint.
  780. */
  781. inline script lookup_script(codepoint_t c)
  782. {
  783. return (script)ucd_lookup_script(c);
  784. }
  785. /** @brief Properties
  786. */
  787. typedef ucd_property property;
  788. enum
  789. {
  790. White_Space = UCD_PROPERTY_WHITE_SPACE, /**< @brief White_Space */
  791. Bidi_Control = UCD_PROPERTY_BIDI_CONTROL, /**< @brief Bidi_Control */
  792. Join_Control = UCD_PROPERTY_JOIN_CONTROL, /**< @brief Join_Control */
  793. Dash = UCD_PROPERTY_DASH, /**< @brief Dash */
  794. Hyphen = UCD_PROPERTY_HYPHEN, /**< @brief Hyphen */
  795. Quotation_Mark = UCD_PROPERTY_QUOTATION_MARK, /**< @brief Quotation_Mark */
  796. Terminal_Punctuation = UCD_PROPERTY_TERMINAL_PUNCTUATION, /**< @brief Terminal_Punctuation */
  797. Other_Math = UCD_PROPERTY_OTHER_MATH, /**< @brief Other_Math */
  798. Hex_Digit = UCD_PROPERTY_HEX_DIGIT, /**< @brief Hex_Digit */
  799. ASCII_Hex_Digit = UCD_PROPERTY_ASCII_HEX_DIGIT, /**< @brief ASCII_Hex_Digit */
  800. Other_Alphabetic = UCD_PROPERTY_OTHER_ALPHABETIC, /**< @brief Other_Alphabetic */
  801. Ideographic = UCD_PROPERTY_IDEOGRAPHIC, /**< @brief Ideographic */
  802. Diacritic = UCD_PROPERTY_DIACRITIC, /**< @brief Diacritic */
  803. Extender = UCD_PROPERTY_EXTENDER, /**< @brief Extender */
  804. Other_Lowercase = UCD_PROPERTY_OTHER_LOWERCASE, /**< @brief Other_Lowercase */
  805. Other_Uppercase = UCD_PROPERTY_OTHER_UPPERCASE, /**< @brief Other_Uppercase */
  806. Noncharacter_Code_Point = UCD_PROPERTY_NONCHARACTER_CODE_POINT, /**< @brief Noncharacter_Code_Point */
  807. Other_Grapheme_Extend = UCD_PROPERTY_OTHER_GRAPHEME_EXTEND, /**< @brief Other_Grapheme_Extend */
  808. IDS_Binary_Operator = UCD_PROPERTY_IDS_BINARY_OPERATOR, /**< @brief IDS_Binary_Operator */
  809. IDS_Trinary_Operator = UCD_PROPERTY_IDS_TRINARY_OPERATOR, /**< @brief IDS_Trinary_Operator */
  810. Radical = UCD_PROPERTY_RADICAL, /**< @brief Radical */
  811. Unified_Ideograph = UCD_PROPERTY_UNIFIED_IDEOGRAPH, /**< @brief Unified_Ideograph */
  812. Other_Default_Ignorable_Code_Point = UCD_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT, /**< @brief Other_Default_Ignorable_Code_Point */
  813. Deprecated = UCD_PROPERTY_DEPRECATED, /**< @brief Deprecated */
  814. Soft_Dotted = UCD_PROPERTY_SOFT_DOTTED, /**< @brief Soft_Dotted */
  815. Logical_Order_Exception = UCD_PROPERTY_LOGICAL_ORDER_EXCEPTION, /**< @brief Logical_Order_Exception */
  816. Other_ID_Start = UCD_PROPERTY_OTHER_ID_START, /**< @brief Other_ID_Start */
  817. Other_ID_Continue = UCD_PROPERTY_OTHER_ID_CONTINUE, /**< @brief Other_ID_Continue */
  818. Sentence_Terminal = UCD_PROPERTY_SENTENCE_TERMINAL, /**< @brief Sentence_Terminal */
  819. Variation_Selector = UCD_PROPERTY_VARIATION_SELECTOR, /**< @brief Variation_Selector */
  820. Pattern_White_Space = UCD_PROPERTY_PATTERN_WHITE_SPACE, /**< @brief Pattern_White_Space */
  821. Pattern_Syntax = UCD_PROPERTY_PATTERN_SYNTAX, /**< @brief Pattern_Syntax */
  822. Prepended_Concatenation_Mark = UCD_PROPERTY_PREPENDED_CONCATENATION_MARK, /**< @brief Prepended_Concatenation_Mark */
  823. Emoji = UCD_PROPERTY_EMOJI, /**< @brief Emoji */
  824. Emoji_Presentation = UCD_PROPERTY_EMOJI_PRESENTATION, /**< @brief Emoji_Presentation */
  825. Emoji_Modifier = UCD_PROPERTY_EMOJI_MODIFIER, /**< @brief Emoji_Modifier */
  826. Emoji_Modifier_Base = UCD_PROPERTY_EMOJI_MODIFIER_BASE, /**< @brief Emoji_Modifier_Base */
  827. Regional_Indicator = UCD_PROPERTY_REGIONAL_INDICATOR, /**< @brief Regional_Indicator */
  828. Emoji_Component = UCD_PROPERTY_EMOJI_COMPONENT, /**< @brief Emoji_Component */
  829. };
  830. /** @brief Return the properties of the specified codepoint.
  831. *
  832. * @param c The Unicode codepoint to lookup.
  833. * @param cat The General Category of the codepoint.
  834. * @return The properties associated with the codepoint.
  835. */
  836. inline property properties(codepoint_t c, category cat)
  837. {
  838. return (property)ucd_properties(c, (ucd_category)cat);
  839. }
  840. /** @brief Is the codepoint in the 'alnum' class?
  841. *
  842. * @param c The Unicode codepoint to check.
  843. * @return Non-zero if the codepoint is in the 'alnum' class, zero otherwise.
  844. */
  845. inline int isalnum(codepoint_t c)
  846. {
  847. return ucd_isalnum(c);
  848. }
  849. /** @brief Is the codepoint in the 'alpha' class?
  850. *
  851. * @param c The Unicode codepoint to check.
  852. * @return Non-zero if the codepoint is in the 'alpha' class, zero otherwise.
  853. */
  854. inline int isalpha(codepoint_t c)
  855. {
  856. return ucd_isalpha(c);
  857. }
  858. /** @brief Is the codepoint in the 'blank' class?
  859. *
  860. * @param c The Unicode codepoint to check.
  861. * @return Non-zero if the codepoint is in the 'blank' class, zero otherwise.
  862. */
  863. inline int isblank(codepoint_t c)
  864. {
  865. return ucd_isblank(c);
  866. }
  867. /** @brief Is the codepoint in the 'cntrl' class?
  868. *
  869. * @param c The Unicode codepoint to check.
  870. * @return Non-zero if the codepoint is in the 'cntrl' class, zero otherwise.
  871. */
  872. inline int iscntrl(codepoint_t c)
  873. {
  874. return ucd_iscntrl(c);
  875. }
  876. /** @brief Is the codepoint in the 'digit' class?
  877. *
  878. * @param c The Unicode codepoint to check.
  879. * @return Non-zero if the codepoint is in the 'digit' class, zero otherwise.
  880. */
  881. inline int isdigit(codepoint_t c)
  882. {
  883. return ucd_isdigit(c);
  884. }
  885. /** @brief Is the codepoint in the 'graph' class?
  886. *
  887. * @param c The Unicode codepoint to check.
  888. * @return Non-zero if the codepoint is in the 'graph' class, zero otherwise.
  889. */
  890. inline int isgraph(codepoint_t c)
  891. {
  892. return ucd_isgraph(c);
  893. }
  894. /** @brief Is the codepoint in the 'lower' class?
  895. *
  896. * @param c The Unicode codepoint to check.
  897. * @return Non-zero if the codepoint is in the 'lower' class, zero otherwise.
  898. */
  899. inline int islower(codepoint_t c)
  900. {
  901. return ucd_islower(c);
  902. }
  903. /** @brief Is the codepoint in the 'print' class?
  904. *
  905. * @param c The Unicode codepoint to check.
  906. * @return Non-zero if the codepoint is in the 'print' class, zero otherwise.
  907. */
  908. inline int isprint(codepoint_t c)
  909. {
  910. return ucd_isprint(c);
  911. }
  912. /** @brief Is the codepoint in the 'punct' class?
  913. *
  914. * @param c The Unicode codepoint to check.
  915. * @return Non-zero if the codepoint is in the 'punct' class, zero otherwise.
  916. */
  917. inline int ispunct(codepoint_t c)
  918. {
  919. return ucd_ispunct(c);
  920. }
  921. /** @brief Is the codepoint in the 'space' class?
  922. *
  923. * @param c The Unicode codepoint to check.
  924. * @return Non-zero if the codepoint is in the 'space' class, zero otherwise.
  925. */
  926. inline int isspace(codepoint_t c)
  927. {
  928. return ucd_isspace(c);
  929. }
  930. /** @brief Is the codepoint in the 'upper' class?
  931. *
  932. * @param c The Unicode codepoint to check.
  933. * @return Non-zero if the codepoint is in the 'upper' class, zero otherwise.
  934. */
  935. inline int isupper(codepoint_t c)
  936. {
  937. return ucd_isupper(c);
  938. }
  939. /** @brief Is the codepoint in the 'xdigit' class?
  940. *
  941. * @param c The Unicode codepoint to check.
  942. * @return Non-zero if the codepoint is in the 'xdigit' class, zero otherwise.
  943. */
  944. inline int isxdigit(codepoint_t c)
  945. {
  946. return ucd_isxdigit(c);
  947. }
  948. /** @brief Convert the Unicode codepoint to upper-case.
  949. *
  950. * This function only uses the simple case mapping present in the
  951. * UnicodeData file. The data in SpecialCasing requires Unicode
  952. * codepoints to be mapped to multiple codepoints.
  953. *
  954. * @param c The Unicode codepoint to convert.
  955. * @return The upper-case Unicode codepoint for this codepoint, or
  956. * this codepoint if there is no upper-case codepoint.
  957. */
  958. inline codepoint_t toupper(codepoint_t c)
  959. {
  960. return ucd_toupper(c);
  961. }
  962. /** @brief Convert the Unicode codepoint to lower-case.
  963. *
  964. * This function only uses the simple case mapping present in the
  965. * UnicodeData file. The data in SpecialCasing requires Unicode
  966. * codepoints to be mapped to multiple codepoints.
  967. *
  968. * @param c The Unicode codepoint to convert.
  969. * @return The lower-case Unicode codepoint for this codepoint, or
  970. * this codepoint if there is no upper-case codepoint.
  971. */
  972. inline codepoint_t tolower(codepoint_t c)
  973. {
  974. return ucd_tolower(c);
  975. }
  976. /** @brief Convert the Unicode codepoint to title-case.
  977. *
  978. * This function only uses the simple case mapping present in the
  979. * UnicodeData file. The data in SpecialCasing requires Unicode
  980. * codepoints to be mapped to multiple codepoints.
  981. *
  982. * @param c The Unicode codepoint to convert.
  983. * @return The title-case Unicode codepoint for this codepoint, or
  984. * this codepoint if there is no upper-case codepoint.
  985. */
  986. inline codepoint_t totitle(codepoint_t c)
  987. {
  988. return ucd_totitle(c);
  989. }
  990. }
  991. #endif
  992. #endif