# wcwidth.py
  1. #!/usr/bin/env python
  2. # License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
  3. # Imports {{{
  4. import json
  5. import os
  6. import re
  7. import subprocess
  8. import sys
  9. from collections import defaultdict
  10. from collections.abc import Generator, Hashable, Iterable
  11. from contextlib import contextmanager
  12. from functools import lru_cache, partial
  13. from html.entities import html5
  14. from io import StringIO
  15. from math import ceil, log
  16. from typing import (
  17. Callable,
  18. DefaultDict,
  19. Iterator,
  20. Literal,
  21. NamedTuple,
  22. Optional,
  23. Protocol,
  24. Sequence,
  25. TypedDict,
  26. Union,
  27. )
  28. from urllib.request import urlopen
# When run directly, make the script importable as part of the 'gen' package
if __name__ == '__main__' and not __package__:
    import __main__
    __main__.__package__ = 'gen'
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# }}}

# Fetching data {{{
# The 66 Unicode noncharacters: the last two codepoints of every plane
# (U+xxFFFE, U+xxFFFF) plus the contiguous block U+FDD0..U+FDEF.
non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
non_characters |= frozenset(range(0xfdd0, 0xfdf0))
if len(non_characters) != 66:  # sanity check against the count mandated by the standard
    raise SystemExit('non_characters table incorrect')

# U+1F3FB..U+1F3FF: EMOJI MODIFIER FITZPATRICK TYPE-1-2 .. TYPE-6
emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1))
  41. def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]:
  42. url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
  43. bn = os.path.basename(url)
  44. local = os.path.join('/tmp', bn)
  45. if os.path.exists(local):
  46. with open(local, 'rb') as f:
  47. data = f.read()
  48. else:
  49. data = urlopen(url).read()
  50. with open(local, 'wb') as f:
  51. f.write(data)
  52. for line in data.decode('utf-8').splitlines():
  53. line = line.strip()
  54. if line and not line.startswith('#'):
  55. yield line
  56. @lru_cache(maxsize=2)
  57. def unicode_version() -> tuple[int, int, int]:
  58. for line in get_data("ReadMe.txt"):
  59. m = re.search(r'Version\s+(\d+)\.(\d+)\.(\d+)', line)
  60. if m is not None:
  61. return int(m.group(1)), int(m.group(2)), int(m.group(3))
  62. raise ValueError('Could not find Unicode Version')
# }}}

# Parsing Unicode databases {{{
# Map of class names to set of codepoints in class
class_maps: dict[str, set[int]] = {}
# All codepoints whose general category starts with S (symbols)
all_symbols: set[int] = set()
# codepoint -> official Unicode name
name_map: dict[int, str] = {}
# lowercased name word -> codepoints whose names/aliases contain that word
word_search_map: DefaultDict[str, set[int]] = defaultdict(set)
soft_hyphen = 0xad
# The 26 regional indicator symbols (used in flag sequences)
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
# See https://github.com/harfbuzz/harfbuzz/issues/169
marks = set(emoji_skin_tone_modifiers) | flag_codepoints
# Starts as every codepoint; parse_ucd() discards each assigned one
not_assigned = set(range(0, sys.maxunicode))
# PropList.txt property name -> codepoints carrying that property
property_maps: dict[str, set[int]] = defaultdict(set)
# Grapheme_Cluster_Break class -> codepoints in that class
grapheme_segmentation_maps: dict[str, set[int]] = defaultdict(set)
# Name <-> small-integer encodings for GCB classes, set by parse_grapheme_segmentation()
grapheme_break_as_int: dict[str, int] = {}
int_as_grapheme_break: tuple[str, ...] = ()
# Name <-> small-integer encodings for Indic_Conjunct_Break classes
incb_as_int: dict[str, int] = {}
int_as_incb: tuple[str, ...] = ()
incb_map: dict[str, set[int]] = defaultdict(set)
# Codepoints with the Extended_Pictographic property (from emoji-data.txt)
extended_pictographic: set[int] = set()
  83. def parse_prop_list() -> None:
  84. global marks
  85. for line in get_data('ucd/PropList.txt'):
  86. if line.startswith('#'):
  87. continue
  88. cp_or_range, rest = line.split(';', 1)
  89. chars = parse_range_spec(cp_or_range.strip())
  90. name = rest.strip().split()[0]
  91. property_maps[name] |= chars
  92. # see https://www.unicode.org/faq/unsup_char.html#3
  93. marks |= property_maps['Other_Default_Ignorable_Code_Point']
def parse_ucd() -> None:
    """Parse UnicodeData.txt and the nerd-fonts glyph list, populating
    name_map, word_search_map, class_maps, marks, all_symbols and not_assigned."""

    def add_word(w: str, c: int) -> None:
        # Index a name word for codepoint search; skip control characters
        # and single-letter words.
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        if len(w) > 1:
            word_search_map[w.lower()].add(c)

    first: Optional[int] = None
    # HTML5 entity names also become search words for their codepoints
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        name = parts[1] or parts[10]  # fall back to the Unicode 1.0 name
        if name == '<control>':
            name = parts[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints: Union[tuple[int, ...], Iterable[int]] = (codepoint,)
        # '<..., First>'/'<..., Last>' line pairs describe a whole codepoint range
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            if category.startswith('M'):
                marks.add(codepoint)
            elif category.startswith('S'):
                all_symbols.add(codepoint)
            elif category == 'Cf':
                # we add Cf to marks as it contains things like tags and zero
                # width chars. Not sure if *all* of Cf should be treated as
                # combining chars, might need to add individual exceptions in
                # the future.
                marks.add(codepoint)
    # Nerd-font glyphs occupy private-use codepoints absent from UnicodeData.txt
    with open('gen/nerd-fonts-glyphs.txt') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            code, category, name = line.split(' ', 2)
            codepoint = int(code, 16)
            if name and codepoint not in name_map:
                name_map[codepoint] = name.upper()
                for word in name.lower().split():
                    add_word(word, codepoint)
    # Some common synonyms
    word_search_map['bee'] |= word_search_map['honeybee']
    word_search_map['lambda'] |= word_search_map['lamda']
    word_search_map['lamda'] |= word_search_map['lambda']
    word_search_map['diamond'] |= word_search_map['gem']
  155. def parse_range_spec(spec: str) -> set[int]:
  156. spec = spec.strip()
  157. if '..' in spec:
  158. chars_ = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
  159. chars = set(range(chars_[0], chars_[1] + 1))
  160. else:
  161. chars = {int(spec, 16)}
  162. return chars
  163. def split_two(line: str) -> tuple[set[int], str]:
  164. spec, rest = line.split(';', 1)
  165. spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
  166. return parse_range_spec(spec), rest
# Emoji data accumulated from emoji-sequences.txt by the parsers below
all_emoji: set[int] = set()
# Codepoints that can begin an emoji presentation sequence
emoji_presentation_bases: set[int] = set()
narrow_emoji: set[int] = set()  # default text (narrow) presentation
wide_emoji: set[int] = set()  # default emoji (wide) presentation
# First regional-indicator codepoint -> valid second codepoints (flag pairs)
flags: dict[int, list[int]] = {}
  172. def parse_basic_emoji(spec: str) -> None:
  173. parts = list(filter(None, spec.split()))
  174. has_emoji_presentation = len(parts) < 2
  175. chars = parse_range_spec(parts[0])
  176. all_emoji.update(chars)
  177. emoji_presentation_bases.update(chars)
  178. (wide_emoji if has_emoji_presentation else narrow_emoji).update(chars)
  179. def parse_keycap_sequence(spec: str) -> None:
  180. base, fe0f, cc = list(filter(None, spec.split()))
  181. chars = parse_range_spec(base)
  182. all_emoji.update(chars)
  183. emoji_presentation_bases.update(chars)
  184. narrow_emoji.update(chars)
  185. def parse_flag_emoji_sequence(spec: str) -> None:
  186. a, b = list(filter(None, spec.split()))
  187. left, right = int(a, 16), int(b, 16)
  188. chars = {left, right}
  189. all_emoji.update(chars)
  190. wide_emoji.update(chars)
  191. emoji_presentation_bases.update(chars)
  192. flags.setdefault(left, []).append(right)
  193. def parse_emoji_tag_sequence(spec: str) -> None:
  194. a = int(spec.split()[0], 16)
  195. all_emoji.add(a)
  196. wide_emoji.add(a)
  197. emoji_presentation_bases.add(a)
  198. def parse_emoji_modifier_sequence(spec: str) -> None:
  199. a, b = list(filter(None, spec.split()))
  200. char, mod = int(a, 16), int(b, 16)
  201. mod
  202. all_emoji.add(char)
  203. wide_emoji.add(char)
  204. emoji_presentation_bases.add(char)
  205. def parse_emoji() -> None:
  206. for line in get_data('emoji-sequences.txt', 'emoji'):
  207. parts = [x.strip() for x in line.split(';')]
  208. if len(parts) < 2:
  209. continue
  210. data, etype = parts[:2]
  211. if etype == 'Basic_Emoji':
  212. parse_basic_emoji(data)
  213. elif etype == 'Emoji_Keycap_Sequence':
  214. parse_keycap_sequence(data)
  215. elif etype == 'RGI_Emoji_Flag_Sequence':
  216. parse_flag_emoji_sequence(data)
  217. elif etype == 'RGI_Emoji_Tag_Sequence':
  218. parse_emoji_tag_sequence(data)
  219. elif etype == 'RGI_Emoji_Modifier_Sequence':
  220. parse_emoji_modifier_sequence(data)
# East Asian Wide/Fullwidth codepoints, filled in by parse_eaw()
doublewidth: set[int] = set()
# East Asian Ambiguous-width codepoints, filled in by parse_eaw()
ambiguous: set[int] = set()
  223. def parse_eaw() -> None:
  224. global doublewidth, ambiguous
  225. seen: set[int] = set()
  226. for line in get_data('ucd/EastAsianWidth.txt'):
  227. chars, eaw = split_two(line)
  228. if eaw == 'A':
  229. ambiguous |= chars
  230. seen |= chars
  231. elif eaw in ('W', 'F'):
  232. doublewidth |= chars
  233. seen |= chars
  234. doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
  235. doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
  236. doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
  237. doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
  238. doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen
def parse_grapheme_segmentation() -> None:
    """Parse GCB, InCB and Extended_Pictographic data and build the
    name <-> integer encodings used by the segmentation tables.

    NOTE: merely subscripting the defaultdicts below creates their entries,
    which pins each class's position in insertion order and hence its
    integer code.
    """
    global extended_pictographic, grapheme_break_as_int, incb_as_int, int_as_grapheme_break, int_as_incb
    global seg_props_from_int, seg_props_as_int
    grapheme_segmentation_maps['AtStart']  # this is used by the segmentation algorithm, no character has it
    grapheme_segmentation_maps['None']  # this is used by the segmentation algorithm, no character has it
    for line in get_data('ucd/auxiliary/GraphemeBreakProperty.txt'):
        chars, category = split_two(line)
        grapheme_segmentation_maps[category] |= chars
    grapheme_segmentation_maps['Private_Expecting_RI']  # this is used by the segmentation algorithm, no character has it
    grapheme_break_as_int = {x: i for i, x in enumerate(grapheme_segmentation_maps)}
    int_as_grapheme_break = tuple(grapheme_break_as_int)
    incb_map['None']  # used by segmentation algorithm no character has it
    for line in get_data('ucd/DerivedCoreProperties.txt'):
        spec, rest = line.split(';', 1)
        category = rest.strip().split(' ', 1)[0].strip().rstrip(';')
        chars = parse_range_spec(spec.strip())
        if category == 'InCB':
            # Most InCB chars also have a GBP categorization, but not all,
            # there exist some InCB chars that do not have a GBP category
            subcat = rest.strip().split(';')[1].strip().split()[0].strip()
            incb_map[subcat] |= chars
    incb_as_int = {x: i for i, x in enumerate(incb_map)}
    int_as_incb = tuple(incb_as_int)
    for line in get_data('ucd/emoji/emoji-data.txt'):
        chars, category = split_two(line)
        if 'Extended_Pictographic#' == category:
            extended_pictographic |= chars
    seg_props_from_int = {'grapheme_break': int_as_grapheme_break, 'indic_conjunct_break': int_as_incb}
    seg_props_as_int = {'grapheme_break': grapheme_break_as_int, 'indic_conjunct_break': incb_as_int}
class GraphemeSegmentationTest(TypedDict):
    # One case from GraphemeBreakTest.txt: the expected grapheme clusters
    # and the upstream comment describing the rules applied.
    data: tuple[str, ...]
    comment: str
# Accumulated segmentation test cases, serialized by gen_test_data()
grapheme_segmentation_tests: list[GraphemeSegmentationTest] = []
def parse_test_data() -> None:
    """Parse GraphemeBreakTest.txt into grapheme_segmentation_tests.

    In the test file '÷' marks a break opportunity, '×' marks no break, and
    everything else is a hex codepoint.
    """
    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
        t, comment = line.split('#')
        t = t.lstrip('÷').strip().rstrip('÷').strip()
        chars: list[list[str]] = [[]]
        for x in re.split(r'([÷×])', t):
            x = x.strip()
            match x:
                case '÷':
                    # break: start a new cluster
                    chars.append([])
                case '×':
                    # no break: stay in the current cluster
                    pass
                case '':
                    pass
                case _:
                    # hex codepoint, appended to the current cluster
                    ch = chr(int(x, 16))
                    chars[-1].append(ch)
        c = tuple(''.join(c) for c in chars)
        grapheme_segmentation_tests.append({'data': c, 'comment': comment.strip()})
    # Extra hand-written cases not present in the upstream file
    grapheme_segmentation_tests.append({
        'data': (' ', '\xad', ' '),
        'comment': '÷ [0.2] SPACE (Other) ÷ [0.4] SOFT HYPHEN ÷ [999.0] SPACE (Other) ÷ [0.3]'
    })
    grapheme_segmentation_tests.append({
        'data': ('\U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466',),
        'comment': '÷ [0.2] MAN × [9.0] ZERO WIDTH JOINER × [11.0] WOMAN × [9.0] ZERO WIDTH JOINER × [11.0] GIRL × [9.0] ZERO WIDTH JOINER × [11.0] BOY ÷ [0.3]'
    })
  299. # }}}
  300. def write_case(spec: Union[tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
  301. if isinstance(spec, tuple):
  302. if for_go:
  303. v = ', '.join(f'0x{x:x}' for x in range(spec[0], spec[1] + 1))
  304. p(f'\t\tcase {v}:')
  305. else:
  306. p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
  307. else:
  308. p(f'\t\tcase 0x{spec:x}:')
@contextmanager
def create_header(path: str, include_data_types: bool = True) -> Generator[Callable[..., None], None, None]:
    """Open a generated C/Go source file, write the standard preamble, and
    yield a print-to-that-file function; the trailer is written on exit."""
    with open(path, 'w') as f:
        p = partial(print, file=f)
        p('// Unicode data, built from the Unicode Standard', '.'.join(map(str, unicode_version())))
        p(f'// Code generated by {os.path.basename(__file__)}, DO NOT EDIT.', end='\n\n')
        if path.endswith('.h'):
            p('#pragma once')
        if include_data_types:
            p('#include "data-types.h"\n')
            p('START_ALLOW_CASE_RANGE')
        p()
        yield p
        p()
        if include_data_types:
            p('END_ALLOW_CASE_RANGE')
def gen_names() -> None:
    """Write tools/unicode_names/names.txt: one line per named codepoint with
    its name words and any extra search aliases."""
    aliases_map: dict[int, set[str]] = {}
    for word, codepoints in word_search_map.items():
        for cp in codepoints:
            aliases_map.setdefault(cp, set()).add(word)
    if len(name_map) > 0xffff:
        # NOTE(review): presumably the consumer stores counts in 16 bits — confirm
        raise Exception('Too many named codepoints')
    with open('tools/unicode_names/names.txt', 'w') as f:
        print(len(name_map), len(word_search_map), file=f)
        for cp in sorted(name_map):
            name = name_map[cp]
            words = name.lower().split()
            # aliases = search words not already present in the official name
            aliases = aliases_map.get(cp, set()) - set(words)
            end = '\n'
            if aliases:
                end = '\t' + ' '.join(sorted(aliases)) + end
            print(cp, *words, end=end, file=f)
  342. def gofmt(*files: str) -> None:
  343. subprocess.check_call(['gofmt', '-w', '-s'] + list(files))
def gen_rowcolumn_diacritics() -> None:
    """Generate the C mapping of row/column diacritic codepoints to ordinals
    and the matching Go codepoint array."""
    # codes of all row/column diacritics
    codes = []
    with open("gen/rowcolumn-diacritics.txt") as file:
        for line in file.readlines():
            if line.startswith('#'):
                continue
            code = int(line.split(";")[0], 16)
            codes.append(code)
    go_file = 'tools/utils/images/rowcolumn_diacritics.go'
    with create_header('kitty/rowcolumn-diacritics.c') as p, create_header(go_file, include_data_types=False) as g:
        p('int diacritic_to_num(char_type code) {')
        p('\tswitch (code) {')
        g('package images')
        g(f'var NumberToDiacritic = [{len(codes)}]rune''{')
        g(', '.join(f'0x{x:x}' for x in codes) + ',')
        g('}')

        # Coalesce consecutive codes into one C case-range per run.
        # range_end is exclusive while the loop accumulates.
        range_start_num = 1
        range_start = 0
        range_end = 0

        def print_range() -> None:
            # Emit the accumulated [range_start, range_end) run as a case range.
            # NOTE(review): write_case emits an *inclusive* range, so passing the
            # exclusive range_end looks like it covers one extra codepoint — confirm.
            if range_start >= range_end:
                return
            write_case((range_start, range_end), p)
            p('\t\treturn code - ' + hex(range_start) + ' + ' +
              str(range_start_num) + ';')

        for code in codes:
            if range_end == code:
                range_end += 1
            else:
                print_range()
                range_start_num += range_end - range_start
                range_start = code
                range_end = code + 1
        print_range()

        p('\t}')
        p('\treturn 0;')
        p('}')
    gofmt(go_file)
  383. def gen_test_data() -> None:
  384. with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
  385. f.write(json.dumps(grapheme_segmentation_tests, indent=2, ensure_ascii=False).encode())
  386. def getsize(data: Iterable[int]) -> Literal[1, 2, 4]:
  387. # return smallest possible integer size for the given array
  388. maxdata = max(data)
  389. if maxdata < 256:
  390. return 1
  391. if maxdata < 65536:
  392. return 2
  393. return 4
  394. def mask_for(bits: int) -> int:
  395. return ~((~0) << bits)
def splitbins[T: Hashable](t: tuple[T, ...], property_size: int, use_fixed_shift: int = 0) -> tuple[list[int], list[int], list[T], int]:
    """Split table t into a multi-level lookup (t1, t2, t3, shift).

    t3 holds the distinct property values (property_size bytes each) and t2
    indexes into it, so that conceptually
    t[i] == t3[t2[(t1[i >> shift] << shift) + (i & mask_for(shift))]].
    Every candidate shift is tried and the one minimizing memory use wins,
    unless use_fixed_shift pins it.
    """
    if use_fixed_shift:
        candidates = range(use_fixed_shift, use_fixed_shift + 1)
    else:
        n = len(t)-1  # last valid index
        maxshift = 0  # the most we can shift n and still have something left
        if n > 0:
            while n >> 1:
                n >>= 1
                maxshift += 1
        candidates = range(maxshift + 1)
    # Deduplicate property values: t3 = distinct values, t_int = t recoded
    # as indices into t3.
    t3: list[T] = []
    tmap: dict[T, int] = {}
    seen = set()
    for x in t:
        if x not in seen:
            seen.add(x)
            tmap[x] = len(t3)
            t3.append(x)
    t_int = tuple(tmap[x] for x in t)
    bytesz = sys.maxsize

    def memsize() -> int:
        # Total size, choosing the cheaper of storing properties directly in
        # t2 (sz2) vs indirecting through t3 (sz3).
        ans = len(t1)*getsize(t1)
        sz3 = len(t3)*property_size + len(t2)*getsize(t2)
        sz2 = len(t2) * property_size
        return ans + min(sz2, sz3)

    for shift in candidates:
        t1: list[int] = []
        t2: list[int] = []
        size = 2**shift
        bincache: dict[tuple[int, ...], int] = {}
        # Chop t_int into bins of 2**shift entries, sharing identical bins
        for i in range(0, len(t_int), size):
            bin = t_int[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = memsize()
        if b < bytesz:
            best = t1, t2, shift
            bytesz = b
    t1, t2, shift = best
    return t1, t2, t3, shift
class Property(Protocol):
    """Structural type for table cell values: they render themselves as C and
    Go literals and report their storage width."""
    @property
    def as_c(self) -> str:
        # C literal for this value
        return ''

    @property
    def as_go(self) -> str:
        # Go literal for this value
        return ''

    @classmethod
    def bitsize(cls) -> int:
        # storage width in bits
        return 0
  452. def get_types(sz: int) -> tuple[str, str]:
  453. sz *= 8
  454. return f'uint{sz}_t', f'uint{sz}'
def gen_multistage_table(
    c: Callable[..., None], g: Callable[..., None], t1: Sequence[int], t2: Sequence[int], t3: Sequence[Property], shift: int, input_max_val: int
) -> None:
    """Emit C (via c) and Go (via g) sources for a lookup table produced by
    splitbins(), including an unchecked Go accessor function.

    Uses three levels when property values are wider than t2's index type,
    otherwise collapses t2/t3 into a single property array.
    """
    t1_type_sz = getsize(t1)
    ctype_t1, gotype_t1 = get_types(t1_type_sz)
    mask = mask_for(shift)
    name = t3[0].__class__.__name__  # the property type's name labels the generated tables
    t2_type_sz = getsize(tuple(range(len(t3))))
    ctype_t2, gotype_t2 = get_types(t2_type_sz)
    t3_type_sz = t3[0].bitsize() // 8
    lname = name.lower()
    input_type = get_types(getsize((input_max_val,)))[1]
    # Output t1
    c(f'static const char_type {name}_mask = {mask}u;')
    c(f'static const char_type {name}_shift = {shift}u;')
    c(f'static const {ctype_t1} {name}_t1[{len(t1)}] = ''{')
    c(f'\t{", ".join(map(str, t1))}')
    c('};')
    g(f'const {lname}_mask = {mask}')
    g(f'const {lname}_shift = {shift}')
    g(f'var {lname}_t1 = [{len(t1)}]{gotype_t1}''{')
    g(f'\t{", ".join(map(str, t1))},')
    g('}')
    bytesz = len(t1) * t1_type_sz
    if t3_type_sz > t2_type_sz:  # needs 3 levels
        bytesz += len(t2) * t2_type_sz + len(t3) * t3_type_sz
        c(f'static const {ctype_t2} {name}_t2[{len(t2)}] = ''{')
        c(f'\t{", ".join(map(str, t2))}')
        c('};')
        items = '\n\t'.join(x.as_c + f', // {i}' for i, x in enumerate(t3))
        c(f'static const {name} {name}_t3[{len(t3)}] = ''{')
        c(f'\t{items}')
        c('};')
        g(f'var {lname}_t2 = [{len(t2)}]{gotype_t2}''{')
        g(f'\t{", ".join(map(str, t2))},')
        g('}')
        items = '\n\t'.join(x.as_go + f', // {i}' for i, x in enumerate(t3))
        g(f'var {lname}_t3 = [{len(t3)}]{name}''{')
        g(f'\t{items}')
        g('}')
        g(f'''
// Array accessor function that avoids bounds checking
func {lname}_for(x {input_type}) {name} {{
	t1 := uintptr(*(*{gotype_t1})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t1[0])) + uintptr(x>>{lname}_shift)*{t1_type_sz})))
	t1_shifted := (t1 << {lname}_shift) + (uintptr(x) & {lname}_mask)
	t2 := uintptr(*(*{gotype_t2})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t2[0])) + t1_shifted*{t2_type_sz})))
	return *(*{name})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t3[0])) + t2*{t3_type_sz}))
}}
''')
    else:
        # Two levels suffice: inline the property values directly into t2
        t3 = tuple(t3[i] for i in t2)
        bytesz += len(t3) * t3_type_sz
        items = '\n\t'.join(x.as_c + ',' for x in t3)
        c(f'static const {name} {name}_t2[{len(t3)}] = ''{')
        c(f'\t{items}')
        c('};')
        items = '\n\t'.join(x.as_go + ',' for x in t3)
        g(f'var {lname}_t2 = [{len(t3)}]{name}''{')
        g(f'\t{items}')
        g('}')
        g(f'''
// Array accessor function that avoids bounds checking
func {lname}_for(x {input_type}) {name} {{
	t1 := uintptr(*(*{gotype_t1})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t1[0])) + uintptr(x>>{lname}_shift)*{t1_type_sz})))
	t1_shifted := (t1 << {lname}_shift) + (uintptr(x) & {lname}_mask)
	return *(*{name})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t2[0])) + t1_shifted*{t3_type_sz}))
}}
''')
    print(f'Size of {name} table: {ceil(bytesz/1024)}KB with {shift} bit shift')
# Fixed t1 shift for the character-width multistage table — presumably must
# match the consumers of the generated table; TODO confirm against callers.
width_shift = 4
  525. def bitsize(maxval: int) -> int: # number of bits needed to store maxval
  526. return ceil(log(maxval, 2))
  527. def clamped_bitsize(val: int) -> int:
  528. if val <= 8:
  529. return 8
  530. if val <= 16:
  531. return 16
  532. if val <= 32:
  533. return 32
  534. if val <= 64:
  535. return 64
  536. raise ValueError('Too many fields')
  537. def bitfield_from_int(
  538. fields: dict[str, int], x: int, int_to_str: dict[str, tuple[str, ...]]
  539. ) -> dict[str, str | bool]:
  540. # first field is least significant, last field is most significant
  541. args: dict[str, str | bool] = {}
  542. for f, shift in fields.items():
  543. mask = mask_for(shift)
  544. val = x & mask
  545. if shift == 1:
  546. args[f] = bool(val)
  547. else:
  548. args[f] = int_to_str[f][val]
  549. x >>= shift
  550. return args
  551. def bitfield_as_int(
  552. fields: dict[str, int], vals: Sequence[bool | str], str_maps: dict[str, dict[str, int]]
  553. ) -> int:
  554. # first field is least significant, last field is most significant
  555. ans = shift = 0
  556. for i, (f, width) in enumerate(fields.items()):
  557. qval = vals[i]
  558. if isinstance(qval, str):
  559. val = str_maps[f][qval]
  560. else:
  561. val = int(qval)
  562. ans |= val << shift
  563. shift += width
  564. return ans
# Per-segmentation-property int <-> name tables, filled in by parse_grapheme_segmentation()
seg_props_from_int: dict[str, tuple[str, ...]] = {}
seg_props_as_int: dict[str, dict[str, int]] = {}
class GraphemeSegmentationProps(NamedTuple):
    """Per-character segmentation properties, packable into a bitfield.

    NOTE(review): fields() applies int() to each field default to obtain the
    field's bit width (bools contribute 1 bit), so the string defaults are
    expected to hold digit strings; the empty-string defaults seen here would
    make int('') raise, so they appear to be replaced before use — confirm.
    """
    grapheme_break: str = ''  # set at runtime
    indic_conjunct_break: str = ''  # set at runtime
    is_extended_pictographic: bool = True

    @classmethod
    def used_bits(cls) -> int:
        # Total bits consumed: sum of per-field widths encoded in the defaults
        return sum(int(cls._field_defaults[f]) for f in cls._fields)

    @classmethod
    def bitsize(cls) -> int:
        return clamped_bitsize(cls.used_bits())

    @classmethod
    def fields(cls) -> dict[str, int]:
        # field name -> bit width
        return {f: int(cls._field_defaults[f]) for f in cls._fields}

    @classmethod
    def from_int(cls, x: int) -> 'GraphemeSegmentationProps':
        args = bitfield_from_int(cls.fields(), x, seg_props_from_int)
        return cls(**args)  # type: ignore

    def __int__(self) -> int:
        return bitfield_as_int(self.fields(), self, seg_props_as_int)
# GCB classes that force a break on both sides (rules GB4/GB5)
control_grapheme_breaks = 'CR', 'LF', 'Control'
# InCB classes that can extend a consonant sequence (rule GB9c)
linker_or_extend = 'Linker', 'Extend'
def bitfield_declaration_as_c(name: str, fields: dict[str, int], *alternate_fields: dict[str, int]) -> str:
    """Render a C union of packed bitfield struct view(s) plus a raw integer
    `val` view, with a static_assert pinning the total size."""
    base_size = clamped_bitsize(sum(fields.values()))
    base_type = f'uint{base_size}_t'
    ans = [f'// {name}Declaration: uses {sum(fields.values())} bits {{''{{', f'typedef union {name} {{']

    def struct(fields: dict[str, int]) -> Iterator[str]:
        if not fields:
            return
        empty = base_size - sum(fields.values())  # unused padding bits
        yield ' struct __attribute__((packed)) {'
        yield '#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__'
        # Fields are reversed on big endian so the bit layout matches little endian
        for f, width in reversed(fields.items()):
            yield f' uint{clamped_bitsize(width)}_t {f} : {width};'
        if empty:
            yield f' uint{clamped_bitsize(empty)}_t : {empty};'
        yield '#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__'
        if empty:
            yield f' uint{clamped_bitsize(empty)}_t : {empty};'
        for f, width in fields.items():
            yield f' uint{clamped_bitsize(width)}_t {f} : {width};'
        yield '#else'
        yield '#error "Unsupported endianness"'
        yield '#endif'
        yield ' };'

    ans.extend(struct(fields))
    for fields in alternate_fields:
        ans.extend(struct(fields))
    ans.append(f' {base_type} val;')
    ans.append(f'}} {name};')
    ans.append(f'static_assert(sizeof({name}) == sizeof({base_type}), "Fix the ordering of {name}");')
    ans.append(f'// End{name}Declaration }}''}}')
    return '\n'.join(ans)
class GraphemeSegmentationState(NamedTuple):
    """The rolling state of the UAX #29 grapheme segmentation automaton.

    NOTE(review): as with GraphemeSegmentationProps, fields() reads each
    field's bit width out of int(default); the empty-string default for
    grapheme_break looks like it is replaced before use — confirm.
    """
    grapheme_break: str = ''  # set at runtime
    # True if the last character ends a sequence of Indic_Conjunct_Break values: consonant {extend|linker}*
    incb_consonant_extended: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values: consonant {extend|linker}* linker
    incb_consonant_extended_linker: bool = True
    # True if the last character ends a sequence of Indic_Conjunct_Break values: consonant {extend|linker}* linker {extend|linker}*
    incb_consonant_extended_linker_extended: bool = True
    # True if the last character ends an emoji modifier sequence \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence: bool = True
    # True if the last character was immediately preceded by an emoji modifier sequence \p{Extended_Pictographic} Extend*
    emoji_modifier_sequence_before_last_char: bool = True

    @classmethod
    def make(cls) -> 'GraphemeSegmentationState':
        # Initial state: before any character has been seen
        return GraphemeSegmentationState('AtStart', False, False, False, False, False)

    @classmethod
    def fields(cls) -> dict[str, int]:
        # field name -> bit width
        return {f: int(cls._field_defaults[f]) for f in cls._fields}

    @classmethod
    def from_int(cls, x: int) -> 'GraphemeSegmentationState':
        args = bitfield_from_int(cls.fields(), x, {'grapheme_break': int_as_grapheme_break})
        return cls(**args)  # type: ignore

    def __int__(self) -> int:
        return bitfield_as_int(self.fields(), self, seg_props_as_int)

    @classmethod
    def c_declaration(cls) -> str:
        return bitfield_declaration_as_c(cls.__name__, cls.fields())

    @classmethod
    def used_bits(cls) -> int:
        return sum(int(cls._field_defaults[f]) for f in cls._fields)

    @classmethod
    def bitsize(cls) -> int:
        return clamped_bitsize(cls.used_bits())

    def add_to_current_cell(self, p: GraphemeSegmentationProps) -> 'GraphemeSegmentationResult':
        """Apply the UAX #29 rules to one incoming character, returning the
        next state and whether the character joins the current cluster."""
        prev = self.grapheme_break
        prop = p.grapheme_break
        incb = p.indic_conjunct_break
        add_to_cell = False
        if self.grapheme_break == 'AtStart':
            add_to_cell = True
            if prop == 'Regional_Indicator':
                prop = 'Private_Expecting_RI'
        else:
            # No break between CR and LF (GB3).
            if prev == 'CR' and prop == 'LF':
                add_to_cell = True
            # Break before and after controls (GB4, GB5).
            elif prev in control_grapheme_breaks or prop in control_grapheme_breaks:
                pass
            # No break between Hangul syllable sequences (GB6, GB7, GB8).
            elif (
                (prev == 'L' and prop in ('L', 'V', 'LV', 'LVT')) or
                (prev in ('LV', 'V') and prop in ('V', 'T')) or
                (prev in ('LVT', 'T') and prop == 'T')
            ):
                add_to_cell = True
            # No break before: extending characters or ZWJ (GB9), SpacingMarks (GB9a), Prepend characters (GB9b).
            elif prop in ('Extend', 'ZWJ', 'SpacingMark') or prev == 'Prepend':
                add_to_cell = True
            # No break within certain combinations of Indic_Conjunct_Break values
            # Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c).
            elif self.incb_consonant_extended_linker_extended and incb == 'Consonant':
                add_to_cell = True
            # No break within emoji modifier sequences or emoji zwj sequences (GB11).
            elif prev == 'ZWJ' and self.emoji_modifier_sequence_before_last_char and p.is_extended_pictographic:
                add_to_cell = True
            # No break between RI if there is an odd number of RI characters before (GB12, GB13).
            elif prop == 'Regional_Indicator':
                if prev == 'Private_Expecting_RI':
                    add_to_cell = True
                else:
                    prop = 'Private_Expecting_RI'
            # Break everywhere else GB999
        # Roll the InCB / emoji tracking state forward for the next character
        incb_consonant_extended_linker = self.incb_consonant_extended and incb == 'Linker'
        incb_consonant_extended_linker_extended = incb_consonant_extended_linker or (
            self.incb_consonant_extended_linker_extended and incb in linker_or_extend)
        incb_consonant_extended = incb == 'Consonant' or (
            self.incb_consonant_extended and incb in linker_or_extend)
        emoji_modifier_sequence_before_last_char = self.emoji_modifier_sequence
        emoji_modifier_sequence = (self.emoji_modifier_sequence and prop == 'Extend') or p.is_extended_pictographic
        return GraphemeSegmentationResult(GraphemeSegmentationState(
            grapheme_break=prop, incb_consonant_extended=incb_consonant_extended,
            incb_consonant_extended_linker=incb_consonant_extended_linker,
            incb_consonant_extended_linker_extended=incb_consonant_extended_linker_extended,
            emoji_modifier_sequence=emoji_modifier_sequence, emoji_modifier_sequence_before_last_char=emoji_modifier_sequence_before_last_char
        ), add_to_cell)
  705. def split_into_graphemes(props: Sequence[GraphemeSegmentationProps], text: str) -> Iterator[str]:
  706. s = GraphemeSegmentationState.make()
  707. pos = 0
  708. for i, ch in enumerate(text):
  709. p = props[ord(ch)]
  710. s, add_to_cell = s.add_to_current_cell(p)
  711. if not add_to_cell:
  712. yield text[pos:i]
  713. pos = i
  714. if pos < len(text):
  715. yield text[pos:]
  716. def split_into_graphemes_with_table(
  717. props: Sequence['GraphemeSegmentationProps'], table: Sequence['GraphemeSegmentationResult'], text: str,
  718. ) -> Iterator[str]:
  719. s = GraphemeSegmentationResult.make()
  720. pos = 0
  721. for i, ch in enumerate(text):
  722. k = int(GraphemeSegmentationKey(s.new_state, props[ord(ch)]))
  723. s = table[k]
  724. if not s.add_to_current_cell:
  725. yield text[pos:i]
  726. pos = i
  727. if pos < len(text):
  728. yield text[pos:]
  729. def test_grapheme_segmentation(split_into_graphemes: Callable[[str], Iterator[str]]) -> None:
  730. for test in grapheme_segmentation_tests:
  731. expected = test['data']
  732. actual = tuple(split_into_graphemes(''.join(test['data'])))
  733. if expected != actual:
  734. def as_codepoints(text: str) -> str:
  735. return ' '.join(hex(ord(x))[2:] for x in text)
  736. qe = tuple(map(as_codepoints, expected))
  737. qa = tuple(map(as_codepoints, actual))
  738. raise SystemExit(f'Failed to split graphemes for: {test["comment"]}\n{expected!r} {qe} != {actual!r} {qa}')
class GraphemeSegmentationKey(NamedTuple):
    """A (state, char-properties) pair that indexes the precomputed segmentation table."""
    state: GraphemeSegmentationState
    char: GraphemeSegmentationProps

    @classmethod
    def from_int(cls, x: int) -> 'GraphemeSegmentationKey':
        # Inverse of __int__(): high bits are the state, low bits the char properties.
        shift = GraphemeSegmentationProps.used_bits()
        mask = mask_for(shift)
        state = GraphemeSegmentationState.from_int(x >> shift)
        char = GraphemeSegmentationProps.from_int(x & mask)
        return GraphemeSegmentationKey(state, char)

    def __int__(self) -> int:
        # State occupies the high bits, char properties the low bits.
        shift = GraphemeSegmentationProps.used_bits()
        return int(self.state) << shift | int(self.char)

    def result(self) -> 'GraphemeSegmentationResult':
        # The table entry for this key: successor state plus the join decision.
        return self.state.add_to_current_cell(self.char)

    @classmethod
    def code_to_convert_to_int(cls, for_go: bool = False) -> str:
        """Generate C (default) or Go source for a function computing this key at runtime."""
        lines: list[str] = []
        a = lines.append
        shift = GraphemeSegmentationProps.used_bits()
        if for_go:
            base_type = f'uint{GraphemeSegmentationState.bitsize()}'
            a(f'func grapheme_segmentation_key(r GraphemeSegmentationResult, ch CharProps) ({base_type}) ''{')
            a(f'\treturn (r.State() << {shift}) | ch.GraphemeSegmentationProperty()')
            a('}')
        else:
            base_type = f'uint{GraphemeSegmentationState.bitsize()}_t'
            a(f'static inline {base_type} {cls.__name__}(GraphemeSegmentationResult r, CharProps ch)' '{')
            a(f'\treturn (r.state << {shift}) | ch.grapheme_segmentation_property;')
            a('}')
        return '\n'.join(lines)
class GraphemeSegmentationResult(NamedTuple):
    """Result of one segmentation step: the successor state and the add-to-cell decision."""
    new_state: GraphemeSegmentationState = GraphemeSegmentationState()
    add_to_current_cell: bool = True

    @classmethod
    def used_bits(cls) -> int:
        # All of GraphemeSegmentationState's bits plus one bit for add_to_current_cell.
        return sum(int(GraphemeSegmentationState._field_defaults[f]) for f in GraphemeSegmentationState._fields) + 1

    @classmethod
    def bitsize(cls) -> int:
        # Storage size rounded up to a standard integer width by clamped_bitsize.
        return clamped_bitsize(cls.used_bits())

    @classmethod
    def make(cls) -> 'GraphemeSegmentationResult':
        # Initial value for table-driven iteration from the start of a text.
        return GraphemeSegmentationResult(GraphemeSegmentationState.make(), False)

    @classmethod
    def go_fields(cls) -> Sequence[str]:
        # "name width" specs for the Go bitfield generator. add_to_current_cell is
        # listed first and the state fields reversed, so the packed layout matches
        # the C declaration below (add_to_current_cell in the top bit).
        ans = []
        ans.append('add_to_current_cell 1')
        for f, width in reversed(GraphemeSegmentationState.fields().items()):
            ans.append(f'{f} {width}')
        return tuple(ans)

    @property
    def as_go(self) -> str:
        # Render this result as a Go integer expression, packing fields from the
        # LSB upwards in reverse go_fields() order.
        shift = 0
        parts = []
        for f in reversed(GraphemeSegmentationResult.go_fields()):
            f, _, w = f.partition(' ')
            bits = int(w)
            if f != 'add_to_current_cell':
                x = getattr(self.new_state, f)
                if f == 'grapheme_break':
                    # Symbolic enum constant rather than a bare number, for readability.
                    x = f'GraphemeSegmentationResult(GBP_{x})'
                else:
                    x = int(x)
            else:
                x = int(self.add_to_current_cell)
            mask = '0b' + '1' * bits
            parts.append(f'(({x} & {mask}) << {shift})')
            shift += bits
        return ' | '.join(parts)

    @classmethod
    def go_extra(cls) -> str:
        # Go helper that extracts just the state bits from a packed result.
        bits = GraphemeSegmentationState.used_bits()
        base_type = f'uint{GraphemeSegmentationState.bitsize()}'
        return f'''
func (r GraphemeSegmentationResult) State() (ans {base_type}) {{
	return {base_type}(r) & {mask_for(bits)}
}}
'''

    @property
    def as_c(self) -> str:
        # Render this result as a C designated-initializer literal.
        parts = []
        for f in GraphemeSegmentationState._fields:
            x = getattr(self.new_state, f)
            match f:
                case 'grapheme_break':
                    x = f'GBP_{x}'
                case _:
                    x = int(x)
            parts.append(f'.{f}={x}')
        parts.append(f'.add_to_current_cell={int(self.add_to_current_cell)}')
        return '{' + ', '.join(parts) + '}'

    @classmethod
    def c_declaration(cls) -> str:
        fields = {'add_to_current_cell': 1}
        sfields = GraphemeSegmentationState.fields()
        fields.update(sfields)
        bits = sum(sfields.values())
        # dont know if the alternate state access works in big endian
        return bitfield_declaration_as_c('GraphemeSegmentationResult', fields, {'state': bits})
class CharProps(NamedTuple):
    """Per-codepoint properties packed into a bitfield.

    Convention: each field's default value is its bit width (int defaults are
    widths; bool defaults of True mean one bit). String fields get their widths
    assigned at runtime in gen_char_props() once the UCD data is parsed.
    """
    width: int = 3
    is_emoji: bool = True
    category: str = ''  # set at runtime
    is_emoji_presentation_base: bool = True
    # derived properties for fast lookup
    is_invalid: bool = True
    is_non_rendered: bool = True
    is_symbol: bool = True
    is_combining_char: bool = True
    is_word_char: bool = True
    is_punctuation: bool = True
    # needed for grapheme segmentation set as LSB bits for easy conversion to GraphemeSegmentationProps
    grapheme_break: str = ''  # set at runtime
    indic_conjunct_break: str = ''  # set at runtime
    is_extended_pictographic: bool = True

    @classmethod
    def bitsize(cls) -> int:
        # Sum of per-field bit widths, rounded up to a standard integer width.
        ans = sum(int(cls._field_defaults[f]) for f in cls._fields)
        return clamped_bitsize(ans)

    @classmethod
    def go_fields(cls) -> Sequence[str]:
        # "name width" specs for the Go bitfield generator; width is stored
        # biased by width_shift, hence the rename to shifted_width.
        ans = []
        for f in cls._fields:
            bits = int(cls._field_defaults[f])
            if f == 'width':
                f = 'shifted_width'
            ans.append(f'{f} {bits}')
        return tuple(ans)

    @property
    def as_go(self) -> str:
        # Render this CharProps as a Go integer expression, packing fields from
        # the LSB upwards in reverse go_fields() order.
        shift = 0
        parts = []
        for f in reversed(self.go_fields()):
            f, _, w = f.partition(' ')
            if f == 'shifted_width':
                f = 'width'
            x = getattr(self, f)
            match f:
                case 'width':
                    # Bias so negative sentinel widths fit in an unsigned field.
                    x += width_shift
                case 'grapheme_break':
                    x = f'CharProps(GBP_{x})'
                case 'indic_conjunct_break':
                    x = f'CharProps(ICB_{x})'
                case 'category':
                    x = f'CharProps(UC_{x})'
                case _:
                    x = int(x)
            bits = int(w)
            mask = '0b' + '1' * bits
            parts.append(f'(({x} & {mask}) << {shift})')
            shift += bits
        return ' | '.join(parts)

    @classmethod
    def go_extra(cls) -> str:
        # Go accessors: Width() undoes the width_shift bias;
        # GraphemeSegmentationProperty() packs grapheme_break,
        # indic_conjunct_break and is_extended_pictographic into the LSB layout
        # expected by grapheme_segmentation_key().
        base_type = f'uint{GraphemeSegmentationState.bitsize()}'
        f = GraphemeSegmentationProps.fields()
        s = f['grapheme_break'] + f['indic_conjunct_break']
        return f'''
func (s CharProps) Width() int {{
	return int(s.Shifted_width()) - {width_shift}
}}
func (s CharProps) GraphemeSegmentationProperty() {base_type} {{
	return {base_type}(s.Grapheme_break() | (s.Indic_conjunct_break() << {f["grapheme_break"]}) | (s.Is_extended_pictographic() << {s}))
}}
'''

    @property
    def as_c(self) -> str:
        # Render this CharProps as a C designated-initializer literal.
        parts = []
        for f in self._fields:
            x = getattr(self, f)
            match f:
                case 'width':
                    x += width_shift
                    f = 'shifted_width'
                case 'grapheme_break':
                    x = f'GBP_{x}'
                case 'indic_conjunct_break':
                    x = f'ICB_{x}'
                case 'category':
                    x = f'UC_{x}'
                case _:
                    x = int(x)
            parts.append(f'.{f}={x}')
        return '{' + ', '.join(parts) + '}'

    @classmethod
    def fields(cls) -> dict[str, int]:
        # Field name -> bit width, with width renamed to shifted_width.
        return {'shifted_width' if f == 'width' else f: int(cls._field_defaults[f]) for f in cls._fields}

    @classmethod
    def c_declaration(cls) -> str:
        # Dont know if grapheme_segmentation_property in alternate works on big endian
        alternate = {
            'grapheme_segmentation_property': sum(int(cls._field_defaults[f]) for f in GraphemeSegmentationProps._fields)
        }
        return bitfield_declaration_as_c(cls.__name__, cls.fields(), alternate)
  934. def generate_enum(p: Callable[..., None], gp: Callable[..., None], name: str, *items: str, prefix: str = '') -> None:
  935. p(f'typedef enum {name} {{') # }}
  936. gp(f'type {name} uint8\n')
  937. gp('const (') # )
  938. for i, x in enumerate(items):
  939. x = prefix + x
  940. p(f'\t{x},')
  941. if i == 0:
  942. gp(f'{x} {name} = iota')
  943. else:
  944. gp(x)
  945. p(f'}} {name};')
  946. gp(')')
  947. p('')
  948. gp('')
  949. def category_set(predicate: Callable[[str], bool]) -> set[int]:
  950. ans = set()
  951. for c, chs in class_maps.items():
  952. if predicate(c):
  953. ans |= chs
  954. return ans
  955. def top_level_category(q: str) -> set[int]:
  956. return category_set(lambda x: x[0] in q)
  957. def patch_declaration(name: str, decl: str, raw: str) -> str:
  958. begin = f'// {name}Declaration'
  959. end = f'// End{name}Declaration }}''}}'
  960. return re.sub(rf'{begin}.+?{end}', decl.rstrip(), raw, flags=re.DOTALL)
def gen_char_props() -> None:
    """Build the per-codepoint property tables and write the generated C and Go sources.

    Reads the parsed UCD data (module globals), validates grapheme segmentation
    against the UCD tests, compresses the tables with splitbins(), and patches
    the declarations into kitty/char-props-data.h, kitty/char-props.h and
    tools/wcswidth/char-props-data.go.
    """
    # Bit widths of these fields depend on how many values the parsed UCD data
    # actually has, so they are filled in at runtime (+1 on category for 'Cn').
    CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps)))
    CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map)))
    CharProps._field_defaults['category'] = str(bitsize(len(class_maps) + 1))
    GraphemeSegmentationProps._field_defaults['grapheme_break'] = CharProps._field_defaults['grapheme_break']
    GraphemeSegmentationProps._field_defaults['indic_conjunct_break'] = CharProps._field_defaults['indic_conjunct_break']
    GraphemeSegmentationState._field_defaults['grapheme_break'] = GraphemeSegmentationProps._field_defaults['grapheme_break']
    invalid = class_maps['Cc'] | class_maps['Cs'] | non_characters
    non_printing = invalid | class_maps['Cf']
    # 0xfe00-0xfe0f are the variation selectors, also treated as non-rendered.
    non_rendered = non_printing | property_maps['Other_Default_Ignorable_Code_Point'] | set(range(0xfe00, 0xfe0f + 1))
    is_word_char = top_level_category('LN')
    is_punctuation = top_level_category('P')
    width_map: dict[int, int] = {}
    cat_map: dict[int, str] = {}
    for cat, chs in class_maps.items():
        for ch in chs:
            cat_map[ch] = cat

    def aw(s: Iterable[int], width: int) -> None:
        # Assign width to the codepoints in s, but entries already present in
        # width_map win -- so earlier aw() calls have higher priority.
        nonlocal width_map
        d = dict.fromkeys(s, width)
        d.update(width_map)
        width_map = d

    # Highest priority first; negative widths are sentinels decoded elsewhere.
    aw(flag_codepoints, 2)
    aw(doublewidth, 2)
    aw(wide_emoji, 2)
    aw(marks | {0}, 0)
    aw(non_printing, -1)
    aw(ambiguous, -2)
    aw(class_maps['Co'], -3)  # Private use
    aw(not_assigned, -4)
    gs_map: dict[int, str] = {}
    icb_map: dict[int, str] = {}
    for name, cps in grapheme_segmentation_maps.items():
        gs_map.update(dict.fromkeys(cps, name))
    for name, cps in incb_map.items():
        icb_map.update(dict.fromkeys(cps, name))
    # One CharProps entry for every codepoint; unlisted codepoints get width 1.
    prop_array = tuple(
        CharProps(
            width=width_map.get(ch, 1), grapheme_break=gs_map.get(ch, 'None'), indic_conjunct_break=icb_map.get(ch, 'None'),
            is_invalid=ch in invalid, is_non_rendered=ch in non_rendered, is_emoji=ch in all_emoji, is_symbol=ch in all_symbols,
            is_extended_pictographic=ch in extended_pictographic, is_emoji_presentation_base=ch in emoji_presentation_bases,
            is_combining_char=ch in marks, category=cat_map.get(ch, 'Cn'), is_word_char=ch in is_word_char,
            is_punctuation=ch in is_punctuation,
        ) for ch in range(sys.maxunicode + 1))
    gsprops = tuple(GraphemeSegmentationProps(
        grapheme_break=x.grapheme_break, indic_conjunct_break=x.indic_conjunct_break,
        is_extended_pictographic=x.is_extended_pictographic) for x in prop_array)
    # Sanity-check the direct implementation, then the table-driven one.
    test_grapheme_segmentation(partial(split_into_graphemes, gsprops))
    # NOTE(review): 1 << 16 assumes state bits + prop bits == 16; confirm it equals
    # GraphemeSegmentationState.used_bits() + GraphemeSegmentationProps.used_bits().
    gseg_results = tuple(GraphemeSegmentationKey.from_int(i).result() for i in range(1 << 16))
    test_grapheme_segmentation(partial(split_into_graphemes_with_table, gsprops, gseg_results))
    # Compress the flat arrays into multi-stage lookup tables.
    t1, t2, t3, t_shift = splitbins(prop_array, CharProps.bitsize() // 8)
    g1, g2, g3, g_shift = splitbins(gseg_results, GraphemeSegmentationResult.bitsize() // 8)
    from .bitfields import make_bitfield
    buf = StringIO()
    # cen collects declarations destined for kitty/char-props.h (patched in below).
    cen = partial(print, file=buf)
    with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
        gp = partial(print, file=gof)
        gp('package wcswidth')
        gp('import "unsafe"')
        gp(f'const MAX_UNICODE = {sys.maxunicode}')
        gp(f'const UNICODE_LIMIT = {sys.maxunicode + 1}')
        cen('// UCBDeclaration {{''{')
        cen(f'#define MAX_UNICODE ({sys.maxunicode}u)')
        generate_enum(cen, gp, 'GraphemeBreakProperty', *grapheme_segmentation_maps, prefix='GBP_')
        generate_enum(c, gp, 'IndicConjunctBreak', *incb_map, prefix='ICB_')
        # The extra 'Cn' entry is the fallback category for unassigned codepoints.
        generate_enum(cen, gp, 'UnicodeCategory', 'Cn', *class_maps, prefix='UC_')
        cen('// EndUCBDeclaration }}''}')
        gp(make_bitfield('tools/wcswidth', 'CharProps', *CharProps.go_fields(), add_package=False)[1])
        gp(make_bitfield('tools/wcswidth', 'GraphemeSegmentationResult', *GraphemeSegmentationResult.go_fields(), add_package=False)[1])
        gp(CharProps.go_extra())
        gp(GraphemeSegmentationResult.go_extra())
        gen_multistage_table(c, gp, t1, t2, t3, t_shift, len(prop_array)-1)
        gen_multistage_table(c, gp, g1, g2, g3, g_shift, len(gseg_results)-1)
        c(GraphemeSegmentationKey.code_to_convert_to_int())
        c(GraphemeSegmentationState.c_declaration())
        gp(GraphemeSegmentationKey.code_to_convert_to_int(for_go=True))
    gofmt(gof.name)
    with open('kitty/char-props.h', 'r+') as f:
        raw = f.read()
        nraw = re.sub(r'\d+/\*=width_shift\*/', f'{width_shift}/*=width_shift*/', raw)
        nraw = patch_declaration('CharProps', CharProps.c_declaration(), nraw)
        nraw = patch_declaration('GraphemeSegmentationResult', GraphemeSegmentationResult.c_declaration(), nraw)
        nraw = patch_declaration('UCB', buf.getvalue(), nraw)
        if nraw != raw:
            # Only rewrite the header when its content actually changed.
            f.seek(0)
            f.truncate()
            f.write(nraw)
  1048. def main(args: list[str]=sys.argv) -> None:
  1049. parse_ucd()
  1050. parse_prop_list()
  1051. parse_emoji()
  1052. parse_eaw()
  1053. parse_grapheme_segmentation()
  1054. parse_test_data()
  1055. gen_names()
  1056. gen_rowcolumn_diacritics()
  1057. gen_test_data()
  1058. gen_char_props()
if __name__ == '__main__':
    import runpy
    # runpy.run_path() on a directory executes its __main__.py and returns the
    # resulting module globals; we then invoke that module's main() with a
    # synthetic argv. Presumably done so this file works both run directly and
    # via the package entry point -- confirm against the directory's __main__.py.
    m = runpy.run_path(os.path.dirname(os.path.abspath(__file__)))
    m['main']([sys.executable, 'wcwidth'])