wcwidth.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. #!/usr/bin/env python
  2. # License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
  3. import os
  4. import re
  5. import subprocess
  6. import sys
  7. from collections import defaultdict
  8. from collections.abc import Generator, Iterable
  9. from contextlib import contextmanager
  10. from functools import lru_cache, partial
  11. from html.entities import html5
  12. from itertools import groupby
  13. from operator import itemgetter
  14. from typing import (
  15. Callable,
  16. DefaultDict,
  17. Optional,
  18. Union,
  19. )
  20. from urllib.request import urlopen
  21. if __name__ == '__main__' and not __package__:
  22. import __main__
  23. __main__.__package__ = 'gen'
  24. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  25. non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
  26. non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
  27. non_characters |= frozenset(range(0xfdd0, 0xfdf0))
  28. if len(non_characters) != 66:
  29. raise SystemExit('non_characters table incorrect')
  30. emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1))
  31. def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]:
  32. url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
  33. bn = os.path.basename(url)
  34. local = os.path.join('/tmp', bn)
  35. if os.path.exists(local):
  36. with open(local, 'rb') as f:
  37. data = f.read()
  38. else:
  39. data = urlopen(url).read()
  40. with open(local, 'wb') as f:
  41. f.write(data)
  42. for line in data.decode('utf-8').splitlines():
  43. line = line.strip()
  44. if line and not line.startswith('#'):
  45. yield line
  46. @lru_cache(maxsize=2)
  47. def unicode_version() -> tuple[int, int, int]:
  48. for line in get_data("ReadMe.txt"):
  49. m = re.search(r'Version\s+(\d+)\.(\d+)\.(\d+)', line)
  50. if m is not None:
  51. return int(m.group(1)), int(m.group(2)), int(m.group(3))
  52. raise ValueError('Could not find Unicode Version')
  53. # Map of class names to set of codepoints in class
  54. class_maps: dict[str, set[int]] = {}
  55. all_symbols: set[int] = set()
  56. name_map: dict[int, str] = {}
  57. word_search_map: DefaultDict[str, set[int]] = defaultdict(set)
  58. soft_hyphen = 0xad
  59. flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
  60. # See https://github.com/harfbuzz/harfbuzz/issues/169
  61. marks = set(emoji_skin_tone_modifiers) | flag_codepoints
  62. not_assigned = set(range(0, sys.maxunicode))
  63. property_maps: dict[str, set[int]] = defaultdict(set)
  64. def parse_prop_list() -> None:
  65. global marks
  66. for line in get_data('ucd/PropList.txt'):
  67. if line.startswith('#'):
  68. continue
  69. cp_or_range, rest = line.split(';', 1)
  70. chars = parse_range_spec(cp_or_range.strip())
  71. name = rest.strip().split()[0]
  72. property_maps[name] |= chars
  73. # see https://www.unicode.org/faq/unsup_char.html#3
  74. marks |= property_maps['Other_Default_Ignorable_Code_Point']
  75. def parse_ucd() -> None:
  76. def add_word(w: str, c: int) -> None:
  77. if c <= 32 or c == 127 or 128 <= c <= 159:
  78. return
  79. if len(w) > 1:
  80. word_search_map[w.lower()].add(c)
  81. first: Optional[int] = None
  82. for word, c in html5.items():
  83. if len(c) == 1:
  84. add_word(word.rstrip(';'), ord(c))
  85. word_search_map['nnbsp'].add(0x202f)
  86. for line in get_data('ucd/UnicodeData.txt'):
  87. parts = [x.strip() for x in line.split(';')]
  88. codepoint = int(parts[0], 16)
  89. name = parts[1] or parts[10]
  90. if name == '<control>':
  91. name = parts[10]
  92. if name:
  93. name_map[codepoint] = name
  94. for word in name.lower().split():
  95. add_word(word, codepoint)
  96. category = parts[2]
  97. s = class_maps.setdefault(category, set())
  98. desc = parts[1]
  99. codepoints: Union[tuple[int, ...], Iterable[int]] = (codepoint,)
  100. if first is None:
  101. if desc.endswith(', First>'):
  102. first = codepoint
  103. continue
  104. else:
  105. codepoints = range(first, codepoint + 1)
  106. first = None
  107. for codepoint in codepoints:
  108. s.add(codepoint)
  109. not_assigned.discard(codepoint)
  110. if category.startswith('M'):
  111. marks.add(codepoint)
  112. elif category.startswith('S'):
  113. all_symbols.add(codepoint)
  114. elif category == 'Cf':
  115. # we add Cf to marks as it contains things like tags and zero
  116. # width chars. Not sure if *all* of Cf should be treated as
  117. # combining chars, might need to add individual exceptions in
  118. # the future.
  119. marks.add(codepoint)
  120. with open('gen/nerd-fonts-glyphs.txt') as f:
  121. for line in f:
  122. line = line.strip()
  123. if not line or line.startswith('#'):
  124. continue
  125. code, category, name = line.split(' ', 2)
  126. codepoint = int(code, 16)
  127. if name and codepoint not in name_map:
  128. name_map[codepoint] = name.upper()
  129. for word in name.lower().split():
  130. add_word(word, codepoint)
  131. # Some common synonyms
  132. word_search_map['bee'] |= word_search_map['honeybee']
  133. word_search_map['lambda'] |= word_search_map['lamda']
  134. word_search_map['lamda'] |= word_search_map['lambda']
  135. word_search_map['diamond'] |= word_search_map['gem']
  136. def parse_range_spec(spec: str) -> set[int]:
  137. spec = spec.strip()
  138. if '..' in spec:
  139. chars_ = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
  140. chars = set(range(chars_[0], chars_[1] + 1))
  141. else:
  142. chars = {int(spec, 16)}
  143. return chars
  144. def split_two(line: str) -> tuple[set[int], str]:
  145. spec, rest = line.split(';', 1)
  146. spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
  147. return parse_range_spec(spec), rest
  148. all_emoji: set[int] = set()
  149. emoji_presentation_bases: set[int] = set()
  150. narrow_emoji: set[int] = set()
  151. wide_emoji: set[int] = set()
  152. flags: dict[int, list[int]] = {}
  153. def parse_basic_emoji(spec: str) -> None:
  154. parts = list(filter(None, spec.split()))
  155. has_emoji_presentation = len(parts) < 2
  156. chars = parse_range_spec(parts[0])
  157. all_emoji.update(chars)
  158. emoji_presentation_bases.update(chars)
  159. (wide_emoji if has_emoji_presentation else narrow_emoji).update(chars)
  160. def parse_keycap_sequence(spec: str) -> None:
  161. base, fe0f, cc = list(filter(None, spec.split()))
  162. chars = parse_range_spec(base)
  163. all_emoji.update(chars)
  164. emoji_presentation_bases.update(chars)
  165. narrow_emoji.update(chars)
  166. def parse_flag_emoji_sequence(spec: str) -> None:
  167. a, b = list(filter(None, spec.split()))
  168. left, right = int(a, 16), int(b, 16)
  169. chars = {left, right}
  170. all_emoji.update(chars)
  171. wide_emoji.update(chars)
  172. emoji_presentation_bases.update(chars)
  173. flags.setdefault(left, []).append(right)
  174. def parse_emoji_tag_sequence(spec: str) -> None:
  175. a = int(spec.split()[0], 16)
  176. all_emoji.add(a)
  177. wide_emoji.add(a)
  178. emoji_presentation_bases.add(a)
  179. def parse_emoji_modifier_sequence(spec: str) -> None:
  180. a, b = list(filter(None, spec.split()))
  181. char, mod = int(a, 16), int(b, 16)
  182. mod
  183. all_emoji.add(char)
  184. wide_emoji.add(char)
  185. emoji_presentation_bases.add(char)
  186. def parse_emoji() -> None:
  187. for line in get_data('emoji-sequences.txt', 'emoji'):
  188. parts = [x.strip() for x in line.split(';')]
  189. if len(parts) < 2:
  190. continue
  191. data, etype = parts[:2]
  192. if etype == 'Basic_Emoji':
  193. parse_basic_emoji(data)
  194. elif etype == 'Emoji_Keycap_Sequence':
  195. parse_keycap_sequence(data)
  196. elif etype == 'RGI_Emoji_Flag_Sequence':
  197. parse_flag_emoji_sequence(data)
  198. elif etype == 'RGI_Emoji_Tag_Sequence':
  199. parse_emoji_tag_sequence(data)
  200. elif etype == 'RGI_Emoji_Modifier_Sequence':
  201. parse_emoji_modifier_sequence(data)
  202. doublewidth: set[int] = set()
  203. ambiguous: set[int] = set()
  204. def parse_eaw() -> None:
  205. global doublewidth, ambiguous
  206. seen: set[int] = set()
  207. for line in get_data('ucd/EastAsianWidth.txt'):
  208. chars, eaw = split_two(line)
  209. if eaw == 'A':
  210. ambiguous |= chars
  211. seen |= chars
  212. elif eaw in ('W', 'F'):
  213. doublewidth |= chars
  214. seen |= chars
  215. doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
  216. doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
  217. doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
  218. doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
  219. doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen
  220. def get_ranges(items: list[int]) -> Generator[Union[int, tuple[int, int]], None, None]:
  221. items.sort()
  222. for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
  223. group = tuple(map(itemgetter(1), g))
  224. a, b = group[0], group[-1]
  225. if a == b:
  226. yield a
  227. else:
  228. yield a, b
  229. def write_case(spec: Union[tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
  230. if isinstance(spec, tuple):
  231. if for_go:
  232. v = ', '.join(f'0x{x:x}' for x in range(spec[0], spec[1] + 1))
  233. p(f'\t\tcase {v}:')
  234. else:
  235. p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
  236. else:
  237. p(f'\t\tcase 0x{spec:x}:')
  238. @contextmanager
  239. def create_header(path: str, include_data_types: bool = True) -> Generator[Callable[..., None], None, None]:
  240. with open(path, 'w') as f:
  241. p = partial(print, file=f)
  242. p('// Unicode data, built from the Unicode Standard', '.'.join(map(str, unicode_version())))
  243. p(f'// Code generated by {os.path.basename(__file__)}, DO NOT EDIT.', end='\n\n')
  244. if path.endswith('.h'):
  245. p('#pragma once')
  246. if include_data_types:
  247. p('#include "data-types.h"\n')
  248. p('START_ALLOW_CASE_RANGE')
  249. p()
  250. yield p
  251. p()
  252. if include_data_types:
  253. p('END_ALLOW_CASE_RANGE')
  254. def gen_emoji() -> None:
  255. with create_header('kitty/emoji.h') as p:
  256. p('static inline bool\nis_emoji(char_type code) {')
  257. p('\tswitch(code) {')
  258. for spec in get_ranges(list(all_emoji)):
  259. write_case(spec, p)
  260. p('\t\t\treturn true;')
  261. p('\t\tdefault: return false;')
  262. p('\t}')
  263. p('\treturn false;\n}')
  264. p('static inline bool\nis_symbol(char_type code) {')
  265. p('\tswitch(code) {')
  266. for spec in get_ranges(list(all_symbols)):
  267. write_case(spec, p)
  268. p('\t\t\treturn true;')
  269. p('\t\tdefault: return false;')
  270. p('\t}')
  271. p('\treturn false;\n}')
  272. def category_test(
  273. name: str,
  274. p: Callable[..., None],
  275. classes: Iterable[str],
  276. comment: str,
  277. use_static: bool = False,
  278. extra_chars: Union[frozenset[int], set[int]] = frozenset(),
  279. exclude: Union[set[int], frozenset[int]] = frozenset(),
  280. least_check_return: Optional[str] = None,
  281. ascii_range: Optional[str] = None
  282. ) -> None:
  283. static = 'static inline ' if use_static else ''
  284. chars: set[int] = set()
  285. for c in classes:
  286. chars |= class_maps[c]
  287. chars |= extra_chars
  288. chars -= exclude
  289. p(f'{static}bool\n{name}(char_type code) {{')
  290. p(f'\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
  291. if least_check_return is not None:
  292. least = min(chars)
  293. p(f'\tif (LIKELY(code < {least})) return {least_check_return};')
  294. if ascii_range is not None:
  295. p(f'\tif (LIKELY(0x20 <= code && code <= 0x7e)) return {ascii_range};')
  296. p('\tswitch(code) {')
  297. for spec in get_ranges(list(chars)):
  298. write_case(spec, p)
  299. p('\t\t\treturn true;')
  300. p('\t} // }}}\n')
  301. p('\treturn false;\n}\n')
  302. def codepoint_to_mark_map(p: Callable[..., None], mark_map: list[int]) -> dict[int, int]:
  303. p('\tswitch(c) { // {{{')
  304. rmap = {c: m for m, c in enumerate(mark_map)}
  305. for spec in get_ranges(mark_map):
  306. if isinstance(spec, tuple):
  307. s = rmap[spec[0]]
  308. cases = ' '.join(f'case {i}:' for i in range(spec[0], spec[1]+1))
  309. p(f'\t\t{cases} return {s} + c - {spec[0]};')
  310. else:
  311. p(f'\t\tcase {spec}: return {rmap[spec]};')
  312. p('default: return 0;')
  313. p('\t} // }}}')
  314. return rmap
  315. def classes_to_regex(classes: Iterable[str], exclude: str = '', for_go: bool = True) -> Iterable[str]:
  316. chars: set[int] = set()
  317. for c in classes:
  318. chars |= class_maps[c]
  319. for x in map(ord, exclude):
  320. chars.discard(x)
  321. if for_go:
  322. def as_string(codepoint: int) -> str:
  323. if codepoint < 256:
  324. return fr'\x{codepoint:02x}'
  325. return fr'\x{{{codepoint:x}}}'
  326. else:
  327. def as_string(codepoint: int) -> str:
  328. if codepoint < 256:
  329. return fr'\x{codepoint:02x}'
  330. if codepoint <= 0xffff:
  331. return fr'\u{codepoint:04x}'
  332. return fr'\U{codepoint:08x}'
  333. for spec in get_ranges(list(chars)):
  334. if isinstance(spec, tuple):
  335. yield '{}-{}'.format(*map(as_string, (spec[0], spec[1])))
  336. else:
  337. yield as_string(spec)
  338. def gen_ucd() -> None:
  339. cz = {c for c in class_maps if c[0] in 'CZ'}
  340. with create_header('kitty/unicode-data.c') as p:
  341. p('#include "unicode-data.h"')
  342. p('START_ALLOW_CASE_RANGE')
  343. category_test(
  344. 'is_combining_char', p,
  345. (),
  346. 'Combining and default ignored characters',
  347. extra_chars=marks,
  348. least_check_return='false'
  349. )
  350. category_test(
  351. 'is_ignored_char', p, 'Cc Cs'.split(),
  352. 'Control characters and non-characters',
  353. extra_chars=non_characters,
  354. ascii_range='false'
  355. )
  356. category_test(
  357. 'is_non_rendered_char', p, 'Cc Cs Cf'.split(),
  358. 'Other_Default_Ignorable_Code_Point and soft hyphen',
  359. extra_chars=property_maps['Other_Default_Ignorable_Code_Point'] | set(range(0xfe00, 0xfe0f + 1)),
  360. ascii_range='false'
  361. )
  362. category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
  363. category_test('is_CZ_category', p, cz, 'C and Z categories')
  364. category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)')
  365. def gen_names() -> None:
  366. aliases_map: dict[int, set[str]] = {}
  367. for word, codepoints in word_search_map.items():
  368. for cp in codepoints:
  369. aliases_map.setdefault(cp, set()).add(word)
  370. if len(name_map) > 0xffff:
  371. raise Exception('Too many named codepoints')
  372. with open('tools/unicode_names/names.txt', 'w') as f:
  373. print(len(name_map), len(word_search_map), file=f)
  374. for cp in sorted(name_map):
  375. name = name_map[cp]
  376. words = name.lower().split()
  377. aliases = aliases_map.get(cp, set()) - set(words)
  378. end = '\n'
  379. if aliases:
  380. end = '\t' + ' '.join(sorted(aliases)) + end
  381. print(cp, *words, end=end, file=f)
  382. def gen_wcwidth() -> None:
  383. seen: set[int] = set()
  384. non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
  385. def add(p: Callable[..., None], comment: str, chars_: Union[set[int], frozenset[int]], ret: int, for_go: bool = False) -> None:
  386. chars = chars_ - seen
  387. seen.update(chars)
  388. p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
  389. for spec in get_ranges(list(chars)):
  390. write_case(spec, p, for_go)
  391. p(f'\t\t\treturn {ret};')
  392. p('\t\t// }}}\n')
  393. def add_all(p: Callable[..., None], for_go: bool = False) -> None:
  394. seen.clear()
  395. add(p, 'Flags', flag_codepoints, 2, for_go)
  396. add(p, 'Marks', marks | {0}, 0, for_go)
  397. add(p, 'Non-printing characters', non_printing, -1, for_go)
  398. add(p, 'Private use', class_maps['Co'], -3, for_go)
  399. add(p, 'Text Presentation', narrow_emoji, 1, for_go)
  400. add(p, 'East Asian ambiguous width', ambiguous, -2, for_go)
  401. add(p, 'East Asian double width', doublewidth, 2, for_go)
  402. add(p, 'Emoji Presentation', wide_emoji, 2, for_go)
  403. add(p, 'Not assigned in the unicode character database', not_assigned, -4, for_go)
  404. p('\t\tdefault:\n\t\t\treturn 1;')
  405. p('\t}')
  406. if for_go:
  407. p('\t}')
  408. else:
  409. p('\treturn 1;\n}')
  410. with create_header('kitty/wcwidth-std.h') as p, open('tools/wcswidth/std.go', 'w') as gof:
  411. gop = partial(print, file=gof)
  412. gop('package wcswidth\n\n')
  413. gop('func Runewidth(code rune) int {')
  414. p('static inline int\nwcwidth_std(int32_t code) {')
  415. p('\tif (LIKELY(0x20 <= code && code <= 0x7e)) { return 1; }')
  416. p('\tswitch(code) {')
  417. gop('\tswitch(code) {')
  418. add_all(p)
  419. add_all(gop, True)
  420. p('static inline bool\nis_emoji_presentation_base(uint32_t code) {')
  421. gop('func IsEmojiPresentationBase(code rune) bool {')
  422. p('\tswitch(code) {')
  423. gop('\tswitch(code) {')
  424. for spec in get_ranges(list(emoji_presentation_bases)):
  425. write_case(spec, p)
  426. write_case(spec, gop, for_go=True)
  427. p('\t\t\treturn true;')
  428. gop('\t\t\treturn true;')
  429. p('\t\tdefault: return false;')
  430. p('\t}')
  431. gop('\t\tdefault:\n\t\t\treturn false')
  432. gop('\t}')
  433. p('\treturn true;\n}')
  434. gop('\n}')
  435. uv = unicode_version()
  436. p(f'#define UNICODE_MAJOR_VERSION {uv[0]}')
  437. p(f'#define UNICODE_MINOR_VERSION {uv[1]}')
  438. p(f'#define UNICODE_PATCH_VERSION {uv[2]}')
  439. gop('var UnicodeDatabaseVersion [3]int = [3]int{' f'{uv[0]}, {uv[1]}, {uv[2]}' + '}')
  440. subprocess.check_call(['gofmt', '-w', '-s', gof.name])
  441. def gen_rowcolumn_diacritics() -> None:
  442. # codes of all row/column diacritics
  443. codes = []
  444. with open("gen/rowcolumn-diacritics.txt") as file:
  445. for line in file.readlines():
  446. if line.startswith('#'):
  447. continue
  448. code = int(line.split(";")[0], 16)
  449. codes.append(code)
  450. go_file = 'tools/utils/images/rowcolumn_diacritics.go'
  451. with create_header('kitty/rowcolumn-diacritics.c') as p, create_header(go_file, include_data_types=False) as g:
  452. p('int diacritic_to_num(char_type code) {')
  453. p('\tswitch (code) {')
  454. g('package images')
  455. g(f'var NumberToDiacritic = [{len(codes)}]rune''{')
  456. g(', '.join(f'0x{x:x}' for x in codes) + ',')
  457. g('}')
  458. range_start_num = 1
  459. range_start = 0
  460. range_end = 0
  461. def print_range() -> None:
  462. if range_start >= range_end:
  463. return
  464. write_case((range_start, range_end), p)
  465. p('\t\treturn code - ' + hex(range_start) + ' + ' +
  466. str(range_start_num) + ';')
  467. for code in codes:
  468. if range_end == code:
  469. range_end += 1
  470. else:
  471. print_range()
  472. range_start_num += range_end - range_start
  473. range_start = code
  474. range_end = code + 1
  475. print_range()
  476. p('\t}')
  477. p('\treturn 0;')
  478. p('}')
  479. subprocess.check_call(['gofmt', '-w', '-s', go_file])
  480. def main(args: list[str]=sys.argv) -> None:
  481. parse_ucd()
  482. parse_prop_list()
  483. parse_emoji()
  484. parse_eaw()
  485. gen_ucd()
  486. gen_wcwidth()
  487. gen_emoji()
  488. gen_names()
  489. gen_rowcolumn_diacritics()
  490. if __name__ == '__main__':
  491. import runpy
  492. m = runpy.run_path(os.path.dirname(os.path.abspath(__file__)))
  493. m['main']([sys.executable, 'wcwidth'])