123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- #!/usr/bin/env python3
- # Script used to dump char ranges for specific properties from
- # the Unicode Character Database to the `char_range.inc` file.
- # NOTE: This script is deliberately not integrated into the build system;
- # you should run it manually whenever you want to update the data.
- import os
- import sys
- from typing import Final, List, Tuple
- from urllib.request import urlopen
- if __name__ == "__main__":
- sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
- from methods import generate_copyright_header
# Location of the Unicode Character Database file the ranges are read from.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"

# Inclusive (start, end) code point ranges, one list per derived property.
# They start empty and are filled in by `parse_unicode_data()`.
xid_start: List[Tuple[int, int]] = []
xid_continue: List[Tuple[int, int]] = []
uppercase_letter: List[Tuple[int, int]] = []
lowercase_letter: List[Tuple[int, int]] = []
unicode_letter: List[Tuple[int, int]] = []
def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
    """Coalesce adjacent and overlapping ranges of ``ranges`` in place.

    Every item is an inclusive ``(start, end)`` pair. The pairs are sorted,
    then each run of pairs that touch (``end + 1 == next start``) or overlap
    is collapsed into a single pair. The list object passed in is mutated;
    nothing is returned.

    Args:
        ranges: List of inclusive ``(start, end)`` pairs, rewritten in place.
    """
    if len(ranges) < 2:
        return

    # Sorting first makes the single pass below valid for any input order.
    pending: List[Tuple[int, int]] = sorted(ranges)
    ranges.clear()

    last_start, last_end = pending[0]
    for curr_start, curr_end in pending[1:]:
        if curr_start > last_end + 1:
            # A gap separates the two ranges: flush the finished one.
            ranges.append((last_start, last_end))
            last_start, last_end = curr_start, curr_end
        else:
            # Adjacent or overlapping: extend the current range, never shrink it.
            last_end = max(last_end, curr_end)

    ranges.append((last_start, last_end))
def parse_unicode_data() -> None:
    """Download the file at ``URL`` and fill the module-level range lists.

    Each data line has the form ``<code point or range> ; <property> # ...``.
    Lines starting with ``#`` and blank lines are skipped. Ranges whose
    property is ``XID_Start``, ``XID_Continue``, ``Uppercase``, ``Lowercase``
    or ``Alphabetic`` are appended to the matching list; every other property
    is ignored. Afterwards the underscore is added to ``xid_start`` and all
    five lists are compacted with ``merge_ranges()``.
    """
    # The context manager closes the HTTP response once the body is read.
    with urlopen(URL) as response:
        lines: List[str] = [line.decode("utf-8") for line in response]

    # Property name as written in the data file -> list that collects it.
    targets = {
        "XID_Start": xid_start,
        "XID_Continue": xid_continue,
        "Uppercase": uppercase_letter,
        "Lowercase": lowercase_letter,
        "Alphabetic": unicode_letter,
    }

    for line in lines:
        if line.startswith("#") or not line.strip():
            continue

        split_line: List[str] = line.split(";")
        char_range: str = split_line[0].strip()
        # Drop the trailing "# ..." comment that follows the property name.
        char_property: str = split_line[1].strip().split("#")[0].strip()

        # A single code point is stored as a range of length one.
        range_start: str = char_range
        range_end: str = char_range
        if ".." in char_range:
            range_start, range_end = char_range.split("..")

        target = targets.get(char_property)
        if target is not None:
            target.append((int(range_start, 16), int(range_end, 16)))

    # Underscore technically isn't in XID_Start, but for our purposes it's included.
    xid_start.append((0x005F, 0x005F))
    # Re-sort so the appended underscore sits at its position by start value.
    xid_start.sort(key=lambda x: x[0])

    merge_ranges(xid_start)
    merge_ranges(xid_continue)
    merge_ranges(uppercase_letter)
    merge_ranges(lowercase_letter)
    merge_ranges(unicode_letter)
def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
    """Render ``range_list`` as the source text of a ``CharRange`` array.

    Args:
        array_name: Identifier used for the emitted array.
        range_list: ``(start, end)`` pairs, written as lowercase hex literals.

    Returns:
        Two leading newlines, the array declaration, one tab-indented
        initializer row per pair, and the closing ``};`` without a trailing
        newline.
    """
    opening = f"\n\nconstexpr inline CharRange {array_name}[] = {{\n"
    rows = "".join(f"\t{{ 0x{first:x}, 0x{last:x} }},\n" for first, last in range_list)
    return opening + rows + "};"
def generate_char_range_inc() -> None:
    """Build ``char_range.inc`` from the downloaded data and write it to disk.

    Runs ``parse_unicode_data()``, then concatenates the copyright header,
    a fixed preamble declaring ``struct CharRange``, and one array per
    collected property. The result is written to
    ``../../core/string/char_range.inc`` relative to this script, and a
    confirmation message is printed.
    """
    parse_unicode_data()

    source: str = generate_copyright_header("char_range.inc")

    source += f"""
// This file was generated using the `misc/scripts/char_range_fetch.py` script.
#pragma once
#include "core/typedefs.h"
// Unicode Derived Core Properties
// Source: {URL}
struct CharRange {{
\tchar32_t start;
\tchar32_t end;
}};"""

    # Emitted array name -> collected ranges, in output order.
    arrays = (
        ("xid_start", xid_start),
        ("xid_continue", xid_continue),
        ("uppercase_letter", uppercase_letter),
        ("lowercase_letter", lowercase_letter),
        ("unicode_letter", unicode_letter),
    )
    source += "".join(make_array(name, ranges) for name, ranges in arrays)
    source += "\n"

    char_range_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/char_range.inc")
    # Explicit encoding and newline keep the output independent of the
    # platform's locale and line-ending defaults.
    with open(char_range_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(source)

    print("`char_range.inc` generated successfully.")
# Regenerate the file only when run as a script, not when imported.
if __name__ == "__main__":
    generate_char_range_inc()
|