123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- #!/usr/bin/env python3
- # Script used to dump Unicode block ranges from
- # the Unicode Character Database to the `unicode_ranges.inc` file.
- # NOTE: This script is deliberately not integrated into the build system;
- # you should run it manually whenever you want to update the data.
- import os
- import sys
- from typing import Final, List, Set, Tuple
- from urllib.request import urlopen
- if __name__ == "__main__":
- sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
- from methods import generate_copyright_header
# Unicode Character Database block list that parse_unicode_data() downloads.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/Blocks.txt"

# (start, end, block name) triples appended by parse_unicode_data(); start and
# end are stored as "0x"-prefixed strings.
ranges: List[Tuple[str, str, str]] = []

# Block names that parse_unicode_data() skips instead of recording.
# NOTE(review): these look like blocks without displayable characters
# (surrogates, selectors, format controls) -- confirm the selection criterion.
exclude_blocks: Set[str] = {
    "High Surrogates",
    "High Private Use Surrogates",
    "Low Surrogates",
    "Variation Selectors",
    "Specials",
    "Egyptian Hieroglyph Format Controls",
    "Tags",
    "Variation Selectors Supplement",
}
def parse_unicode_data() -> None:
    """Download the Unicode block list and record every non-excluded block.

    Fetches ``URL``, skips comment (``#``) and blank lines, and for each
    remaining ``start..end; block name`` entry whose block name is not in
    ``exclude_blocks`` appends ``("0x<start>", "0x<end>", name)`` to the
    module-level ``ranges`` list. Entries are appended, so calling this
    function more than once records them again.

    Raises:
        urllib.error.URLError: If the download fails.
        IndexError: If a data line has no ``;`` separator.
        ValueError: If the range field is not of the form ``start..end``.
    """
    # Use the response as a context manager so the connection is always
    # closed. Decode everything before parsing so that a failed download or
    # decode leaves `ranges` untouched.
    with urlopen(URL) as response:
        lines: List[str] = [raw_line.decode("utf-8") for raw_line in response]

    for line in lines:
        if line.startswith("#") or not line.strip():
            continue

        split_line: List[str] = line.split(";")
        char_range: str = split_line[0].strip()
        block: str = split_line[1].strip()

        if block in exclude_blocks:
            continue

        range_start, range_end = char_range.split("..")
        ranges.append((f"0x{range_start}", f"0x{range_end}", block))
def make_array(array_name: str, ranges: List[Tuple[str, str, str]]) -> str:
    """Render ``ranges`` as the C++ source text of a ``UniRange`` array.

    Args:
        array_name: Identifier used for the generated static array.
        ranges: ``(start, end, block name)`` triples. ``start`` and ``end``
            are emitted verbatim; the name is emitted as a ``U"..."`` literal
            without any escaping.

    Returns:
        The array definition, closed by a ``{ 0x10FFFF, 0x10FFFF, String() }``
        entry and followed by one blank line.
    """
    # Collect the pieces and join once instead of growing a string with `+=`.
    parts: List[str] = [f"static UniRange {array_name}[] = {{\n"]
    parts.extend(f'\t{{ {start}, {end}, U"{block}" }},\n' for start, end, block in ranges)
    # Final entry with an empty name; presumably an end-of-table marker for
    # the code that reads the array -- confirm against the consumer.
    parts.append("\t{ 0x10FFFF, 0x10FFFF, String() }\n};\n\n")
    return "".join(parts)
def generate_unicode_ranges_inc() -> None:
    """Fetch the Unicode block data and write it out as ``unicode_ranges.inc``.

    Calls ``parse_unicode_data()`` to fill the module-level ``ranges`` list,
    then assembles the copyright header, an include guard, the ``UniRange``
    struct definition and the ``unicode_ranges`` array, and writes the result
    to ``../../editor/import/unicode_ranges.inc`` relative to this script.
    Prints a confirmation message once the file has been written.
    """
    parse_unicode_data()

    source: str = generate_copyright_header("unicode_ranges.inc")

    source += f"""
// This file was generated using the `misc/scripts/unicode_ranges_fetch.py` script.
#ifndef UNICODE_RANGES_INC
#define UNICODE_RANGES_INC
// Unicode Character Blocks
// Source: {URL}
struct UniRange {{
\tint32_t start;
\tint32_t end;
\tString name;
}};\n\n"""

    source += make_array("unicode_ranges", ranges)
    source += "#endif // UNICODE_RANGES_INC\n"

    unicode_ranges_path: str = os.path.join(os.path.dirname(__file__), "../../editor/import/unicode_ranges.inc")
    # Pin the encoding so the generated file does not depend on the platform
    # default; newline="\n" keeps LF line endings on every platform.
    with open(unicode_ranges_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(source)

    print("`unicode_ranges.inc` generated successfully.")
# Regenerate the file only when this module is executed directly as a script,
# not when it is imported.
if __name__ == "__main__":
    generate_unicode_ranges_inc()
|