unicode_ranges_fetch.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. #!/usr/bin/env python3
  2. # Script used to dump character block ranges from
  3. # the Unicode Character Database to the `unicode_ranges.inc` file.
  4. # NOTE: This script is deliberately not integrated into the build system;
  5. # you should run it manually whenever you want to update the data.
  6. import os
  7. import sys
  8. from typing import Final, List, Set, Tuple
  9. from urllib.request import urlopen
  10. if __name__ == "__main__":
  11. sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
  12. from methods import generate_copyright_header
  13. URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/Blocks.txt"
  14. ranges: List[Tuple[str, str, str]] = []
  15. exclude_blocks: Set[str] = {
  16. "High Surrogates",
  17. "High Private Use Surrogates",
  18. "Low Surrogates",
  19. "Variation Selectors",
  20. "Specials",
  21. "Egyptian Hieroglyph Format Controls",
  22. "Tags",
  23. "Variation Selectors Supplement",
  24. }
  25. def parse_unicode_data() -> None:
  26. lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]
  27. for line in lines:
  28. if line.startswith("#") or not line.strip():
  29. continue
  30. split_line: List[str] = line.split(";")
  31. char_range: str = split_line[0].strip()
  32. block: str = split_line[1].strip()
  33. if block in exclude_blocks:
  34. continue
  35. range_start, range_end = char_range.split("..")
  36. ranges.append((f"0x{range_start}", f"0x{range_end}", block))
  37. def make_array(array_name: str, ranges: List[Tuple[str, str, str]]) -> str:
  38. result: str = f"static UniRange {array_name}[] = {{\n"
  39. for start, end, block in ranges:
  40. result += f'\t{{ {start}, {end}, U"{block}" }},\n'
  41. result += """\t{ 0x10FFFF, 0x10FFFF, String() }
  42. };\n\n"""
  43. return result
  44. def generate_unicode_ranges_inc() -> None:
  45. parse_unicode_data()
  46. source: str = generate_copyright_header("unicode_ranges.inc")
  47. source += f"""
  48. // This file was generated using the `misc/scripts/unicode_ranges_fetch.py` script.
  49. #ifndef UNICODE_RANGES_INC
  50. #define UNICODE_RANGES_INC
  51. // Unicode Character Blocks
  52. // Source: {URL}
  53. struct UniRange {{
  54. \tint32_t start;
  55. \tint32_t end;
  56. \tString name;
  57. }};\n\n"""
  58. source += make_array("unicode_ranges", ranges)
  59. source += "#endif // UNICODE_RANGES_INC\n"
  60. unicode_ranges_path: str = os.path.join(os.path.dirname(__file__), "../../editor/import/unicode_ranges.inc")
  61. with open(unicode_ranges_path, "w", newline="\n") as f:
  62. f.write(source)
  63. print("`unicode_ranges.inc` generated successfully.")
# Generate the file only when this module is executed as a script.
if __name__ == "__main__":
    generate_unicode_ranges_inc()