ucaps_fetch.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/env python3
  2. # Script used to dump case mappings from
  3. # the Unicode Character Database to the `ucaps.h` file.
  4. # NOTE: This script is deliberately not integrated into the build system;
  5. # you should run it manually whenever you want to update the data.
  6. import os
  7. import sys
  8. from typing import Final, List, Tuple
  9. from urllib.request import urlopen
if __name__ == "__main__":
    # When run directly, add the directory two levels above this script to the
    # import path (presumably so the `methods` import below resolves; confirm
    # against the repository layout).
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))

from methods import generate_copyright_header

# Source of the case-mapping data: UnicodeData.txt from the Unicode 16.0.0 release.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt"

# ("0x<code value>", "0x<uppercase mapping>") pairs, appended by parse_unicode_data().
lower_to_upper: List[Tuple[str, str]] = []
# ("0x<code value>", "0x<lowercase mapping>") pairs, appended by parse_unicode_data().
upper_to_lower: List[Tuple[str, str]] = []
  16. def parse_unicode_data() -> None:
  17. lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]
  18. for line in lines:
  19. split_line: List[str] = line.split(";")
  20. code_value: str = split_line[0].strip()
  21. uppercase_mapping: str = split_line[12].strip()
  22. lowercase_mapping: str = split_line[13].strip()
  23. if uppercase_mapping:
  24. lower_to_upper.append((f"0x{code_value}", f"0x{uppercase_mapping}"))
  25. if lowercase_mapping:
  26. upper_to_lower.append((f"0x{code_value}", f"0x{lowercase_mapping}"))
  27. def make_cap_table(table_name: str, len_name: str, table: List[Tuple[str, str]]) -> str:
  28. result: str = f"static const int {table_name}[{len_name}][2] = {{\n"
  29. for first, second in table:
  30. result += f"\t{{ {first}, {second} }},\n"
  31. result += "};\n\n"
  32. return result
  33. def generate_ucaps_fetch() -> None:
  34. parse_unicode_data()
  35. source: str = generate_copyright_header("ucaps.h")
  36. source += f"""
  37. #pragma once
  38. // This file was generated using the `misc/scripts/ucaps_fetch.py` script.
  39. #define LTU_LEN {len(lower_to_upper)}
  40. #define UTL_LEN {len(upper_to_lower)}\n\n"""
  41. source += make_cap_table("caps_table", "LTU_LEN", lower_to_upper)
  42. source += make_cap_table("reverse_caps_table", "UTL_LEN", upper_to_lower)
  43. source += """static int _find_upper(int ch) {
  44. \tint low = 0;
  45. \tint high = LTU_LEN - 1;
  46. \tint middle;
  47. \twhile (low <= high) {
  48. \t\tmiddle = (low + high) / 2;
  49. \t\tif (ch < caps_table[middle][0]) {
  50. \t\t\thigh = middle - 1; // Search low end of array.
  51. \t\t} else if (caps_table[middle][0] < ch) {
  52. \t\t\tlow = middle + 1; // Search high end of array.
  53. \t\t} else {
  54. \t\t\treturn caps_table[middle][1];
  55. \t\t}
  56. \t}
  57. \treturn ch;
  58. }
  59. static int _find_lower(int ch) {
  60. \tint low = 0;
  61. \tint high = UTL_LEN - 1;
  62. \tint middle;
  63. \twhile (low <= high) {
  64. \t\tmiddle = (low + high) / 2;
  65. \t\tif (ch < reverse_caps_table[middle][0]) {
  66. \t\t\thigh = middle - 1; // Search low end of array.
  67. \t\t} else if (reverse_caps_table[middle][0] < ch) {
  68. \t\t\tlow = middle + 1; // Search high end of array.
  69. \t\t} else {
  70. \t\t\treturn reverse_caps_table[middle][1];
  71. \t\t}
  72. \t}
  73. \treturn ch;
  74. }
  75. """
  76. ucaps_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/ucaps.h")
  77. with open(ucaps_path, "w", newline="\n") as f:
  78. f.write(source)
  79. print("`ucaps.h` generated successfully.")
# Regenerate the header only when this file is executed as a script, not on import.
if __name__ == "__main__":
    generate_ucaps_fetch()