char_range_fetch.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. #!/usr/bin/env python3
  2. # Script used to dump char ranges for specific properties from
  3. # the Unicode Character Database to the `char_range.inc` file.
  4. # NOTE: This script is deliberately not integrated into the build system;
  5. # you should run it manually whenever you want to update the data.
  6. import os
  7. import sys
  8. from typing import Final, List, Tuple
  9. from urllib.request import urlopen
if __name__ == "__main__":
    # When executed directly, add the directory two levels above this file to the
    # module search path (presumably so the `methods` import that follows resolves
    # from the repository root - confirm against the repository layout).
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
  12. from methods import generate_copyright_header
# Unicode Character Database file (version 16.0.0) that the ranges are read from.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"

# Code point ranges collected by `parse_unicode_data()`, one list per property.
# Each entry is a `(start, end)` tuple; a single code point is stored with
# `start == end`.
xid_start: List[Tuple[int, int]] = []  # "XID_Start" entries, plus the underscore (U+005F).
xid_continue: List[Tuple[int, int]] = []  # "XID_Continue" entries.
uppercase_letter: List[Tuple[int, int]] = []  # "Uppercase" entries.
lowercase_letter: List[Tuple[int, int]] = []  # "Lowercase" entries.
unicode_letter: List[Tuple[int, int]] = []  # "Alphabetic" entries.
  19. def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
  20. if len(ranges) < 2:
  21. return
  22. last_start: int = ranges[0][0]
  23. last_end: int = ranges[0][1]
  24. original_ranges: List[Tuple[int, int]] = ranges[1:]
  25. ranges.clear()
  26. for curr_range in original_ranges:
  27. curr_start: int = curr_range[0]
  28. curr_end: int = curr_range[1]
  29. if last_end + 1 != curr_start:
  30. ranges.append((last_start, last_end))
  31. last_start = curr_start
  32. last_end = curr_end
  33. ranges.append((last_start, last_end))
  34. def parse_unicode_data() -> None:
  35. lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]
  36. for line in lines:
  37. if line.startswith("#") or not line.strip():
  38. continue
  39. split_line: List[str] = line.split(";")
  40. char_range: str = split_line[0].strip()
  41. char_property: str = split_line[1].strip().split("#")[0].strip()
  42. range_start: str = char_range
  43. range_end: str = char_range
  44. if ".." in char_range:
  45. range_start, range_end = char_range.split("..")
  46. range_tuple: Tuple[int, int] = (int(range_start, 16), int(range_end, 16))
  47. if char_property == "XID_Start":
  48. xid_start.append(range_tuple)
  49. elif char_property == "XID_Continue":
  50. xid_continue.append(range_tuple)
  51. elif char_property == "Uppercase":
  52. uppercase_letter.append(range_tuple)
  53. elif char_property == "Lowercase":
  54. lowercase_letter.append(range_tuple)
  55. elif char_property == "Alphabetic":
  56. unicode_letter.append(range_tuple)
  57. # Underscore technically isn't in XID_Start, but for our purposes it's included.
  58. xid_start.append((0x005F, 0x005F))
  59. xid_start.sort(key=lambda x: x[0])
  60. merge_ranges(xid_start)
  61. merge_ranges(xid_continue)
  62. merge_ranges(uppercase_letter)
  63. merge_ranges(lowercase_letter)
  64. merge_ranges(unicode_letter)
  65. def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
  66. result: str = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
  67. for start, end in range_list:
  68. result += f"\t{{ 0x{start:x}, 0x{end:x} }},\n"
  69. result += "};"
  70. return result
  71. def generate_char_range_inc() -> None:
  72. parse_unicode_data()
  73. source: str = generate_copyright_header("char_range.inc")
  74. source += f"""
  75. // This file was generated using the `misc/scripts/char_range_fetch.py` script.
  76. #pragma once
  77. #include "core/typedefs.h"
  78. // Unicode Derived Core Properties
  79. // Source: {URL}
  80. struct CharRange {{
  81. \tchar32_t start;
  82. \tchar32_t end;
  83. }};"""
  84. source += make_array("xid_start", xid_start)
  85. source += make_array("xid_continue", xid_continue)
  86. source += make_array("uppercase_letter", uppercase_letter)
  87. source += make_array("lowercase_letter", lowercase_letter)
  88. source += make_array("unicode_letter", unicode_letter)
  89. source += "\n"
  90. char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
  91. with open(char_range_path, "w", newline="\n") as f:
  92. f.write(source)
  93. print("`char_range.inc` generated successfully.")
# Regenerate `char_range.inc` only when this file is executed directly as a script.
if __name__ == "__main__":
    generate_char_range_inc()