update_github_emojis.cr 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. #
  2. # Copyright (c) 2023 supercell
  3. #
  4. # SPDX-License-Identifier: BSD-3-Clause
  5. #
  6. require "http/client"
  7. require "json"
  8. require "option_parser"
  9. require "../src/luce/legacy_emojis"
  10. # Regular expression to parse unicode from GitHub emoji API output filenames.
  11. GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN = Regex.new(%{.*unicode/([A-Fa-f0-9-]+).png})
# URL for GitHub's emoji API. We reconcile with our legacy emoji so that
# we don't change or break anything.
# There are essentially only TWO (2) emoji that change and the
# legacy emoji is still available with an alternate name.
# The 'beetle' emoji changes from `🐞` to `🪲`, legacy available as 'lady_beetle'.
# The 'cricket' emoji changes from `🏏` to `🦗`, legacy available as 'cricket_game'.
# (If the -g flag is used to force using the GitHub Unicode sequences for the
# emoji then additionally the 'email' emoji changes from '✉️' to '📧').
EMOJIS_JSON_RAW_URL = "https://api.github.com/emojis"
# Path of the generated Crystal source file written by `main`. The path is
# relative, so it is resolved against the current working directory.
EMOJIS_FILE_PATH = "src/luce/emojis.cr"
# Shorthand for the legacy emoji table required from
# "../src/luce/legacy_emojis".
LEGACY_EMOJIS = Luce::LegacyEmojis
  23. # This array is ONLY USED when the --use-github-unicodes option is used to
  24. # minimize the visual differences in the output emoji.
  25. LEGACY_EMOJIS_USED_VARIATION_MODIFIER = [
  26. "263a",
  27. "2600",
  28. "2601",
  29. "2744",
  30. "2708",
  31. "260e",
  32. "2702",
  33. "2712",
  34. "270f",
  35. "2764",
  36. "d83c-de37",
  37. "2734",
  38. "3299",
  39. "3297",
  40. "d83c-dd70",
  41. "d83c-dd71",
  42. "d83c-dd7e",
  43. "2668",
  44. "203c",
  45. "2049",
  46. "303d",
  47. "26a0",
  48. "267b",
  49. "2747",
  50. "2733",
  51. "24c2",
  52. "d83c-de02",
  53. "d83c-dd7f",
  54. "23cf",
  55. "25b6",
  56. "25c0",
  57. "27a1",
  58. "2b05",
  59. "2b06",
  60. "2b07",
  61. "2197",
  62. "2198",
  63. "2199",
  64. "2196",
  65. "2195",
  66. "2194",
  67. "21aa",
  68. "21a9",
  69. "2934",
  70. "2935",
  71. "2139",
  72. "3030",
  73. "2714",
  74. "2716",
  75. "00a9",
  76. "00ae",
  77. "2122",
  78. "2611",
  79. "25aa",
  80. "25ab",
  81. "25fc",
  82. "25fb",
  83. "2660",
  84. "2663",
  85. "2665",
  86. "2666",
  87. ]
# Special replacement character '�'.
# `parse_github_filename_into_unicode_string` returns this value when a
# filename cannot be converted, and `main` uses it to recognise and report
# the affected shortcodes as errored.
ERROR_SPECIAL_REPLACEMENT = "\uFFFD"
# Warning text describing the consequences of --use-github-unicodes. It is
# interpolated into the option parser banner and printed by `main` when the
# flag is given.
USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING = "
IMPORTANT NOTE: The use of --use-github-unicodes switch will force using
GitHub Unicode sequences.
This option is essentially here only for completeness, not for
release use.
The slight visual differences of some emoji might also be another
reason using --use-github-unicodes should be considered a *Breaking Change*.
Some test will fail because of the different Unicode sequences
and the emojis.unit file would need to be updated to contain the new
expected GitHub versions of the Unicode sequences of the emoji in order
for the tests to pass.
"
  102. # The GitHub API URL will return a JSON map of all emoji in the form of
  103. # `{ "shortcode":"emojifilename" ... }`.
  104. # The filenames are simply a list of all of the hex string of the
  105. # *essential* Unicode codepoints representing the emoji.
  106. # These sequences exclude the Unicode join zero width (0x200D) and
  107. # variation select (0xFE0F) modifiers. (We will need to add these in to
  108. # build our actually Unicode strings representing the emoji).
  109. # Multiple Unicode codepoints are separated by '-'.
  110. # Examples filenames (single and double code point examples):
  111. # - "https://github.githubassets.com/images/icons/emoji/unicode/1f643.png?v8"
  112. # - "https://github.githubassets.com/images/icons/emoji/unicode/1f1fa-1f1fe.png?v8"
  113. # - "https://github.githubassets.com/images/icons/emoji/unicode/1f469-1f469-1f467-1f466.png?v8"
  114. # NOTE: Some filenames will be GitHub 'custom' emoji that have no Unicode
  115. # equivalent and these will not have hex codepoints, only the GitHub custom name.
  116. # We will ingore these (there are only a 19 and they are mostly pixel art from
  117. # the old Doom game).
  118. # Example GitHub custom emoji filename:
  119. # - "https://github.githubassets.com/images/icons/emoji/godmode.png?v8",
  120. def parse_github_filename_into_unicode_string(emoji_filename : String) : String
  121. variation_selector = 0xFE0F
  122. zero_width_joiner = 0x200D
  123. begin
  124. raw_hex_list = GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN.match(emoji_filename).try &.[1]
  125. # This is the GitHub custom emoji and it is represented by a PNG image only and
  126. # there is no equivalent Unicode. We have to ignore.
  127. return "" if raw_hex_list.nil?
  128. legacy_used_variation_code = LEGACY_EMOJIS_USED_VARIATION_MODIFIER.includes?(raw_hex_list)
  129. raw_code_points_hex = raw_hex_list.split("-").map { |v| "0x#{v}".to_i(prefix: true) }
  130. code_points_hex = [] of Int32
  131. if legacy_used_variation_code
  132. code_points_hex.concat(raw_code_points_hex)
  133. code_points_hex << variation_selector
  134. else
  135. # Now insert the join zero width and variation select modifying Unicode chars.
  136. raw_code_points_hex.each_with_index do |code_point_at_index, i|
  137. code_points_hex << code_point_at_index
  138. if i < (raw_code_points_hex.size - 1)
  139. code_points_hex << variation_selector
  140. # # and 0-9 don't use Zero Width joiner
  141. if code_point_at_index == 0x23 ||
  142. (code_point_at_index >= 0x30 && code_point_at_index <= 0x39)
  143. # Don't add Zero Width Joiner
  144. else
  145. code_points_hex << zero_width_joiner
  146. end
  147. end
  148. end
  149. end
  150. code_points_hex.map(&.chr).join
  151. rescue e
  152. STDERR.puts "Invalid/Non-Conformant emoji filename pattern found \"#{emoji_filename}\"!"
  153. ERROR_SPECIAL_REPLACEMENT
  154. end
  155. end
  156. def main(args = ARGV)
  157. use_legacy_unicode_sequences = true
  158. visualize_unicode_diffs = false
  159. dump_markdown_shortcodes = false
  160. dump_markdown_tooltip_shortcodes = false
  161. parser = OptionParser.new do |p|
  162. p.banner = "Usage update_github_emojis.cr
  163. By default, the legacy Unicode sequences are used (for
  164. maximum visual compatibility with the legacy emoji).
  165. The --use-github-unicodes flag can be used so that the
  166. Unicode sequences from GitHub are used for emoji's that
  167. existed within the legacy set. This will result in a very slight visual
  168. difference for some emoji, but it will result in many more
  169. binary differences when comparing legacy_emoji.cr to emoji.cr.
  170. #{USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING}
  171. The --visualize-different-unicodes flag can be used to visually
  172. verify that any different Unicode sequences produce the same
  173. emoji.
  174. Options:"
  175. p.on("-h", "--help", "Print help text and exit") do
  176. puts p
  177. exit 0
  178. end
  179. p.on("-g", "--use-github-unicodes", "Use the GitHub Unicode sequences instead of leagacy sequences.") do
  180. use_legacy_unicode_sequences = false
  181. end
  182. p.on("-v", "--visualize-different-unicodes", "Visualize any Unicode sequence differences") do
  183. visualize_unicode_diffs = true
  184. end
  185. p.on("-s STYLE", "--dump-markdown-shortcodes STYLE", "Outputs all emoji shortcodes to stdout which can be used in markdown to show and test all emoji") do |value|
  186. dump_markdown_shortcodes = (value.downcase == "plain")
  187. dump_markdown_tooltip_shortcodes = (value.downcase == "tooltip")
  188. unless dump_markdown_shortcodes || dump_markdown_tooltip_shortcodes
  189. STDERR.puts "--dump-markdown-shortcodes can only be one of \"plain\" or \"tooltip\""
  190. exit 1
  191. end
  192. end
  193. p.missing_option do |option|
  194. STDERR.puts "Missing option for #{option} flag"
  195. puts p
  196. exit 1
  197. end
  198. p.invalid_option do |option|
  199. STDERR.puts "Unknown option: #{option}"
  200. puts p
  201. exit 1
  202. end
  203. end
  204. parser.parse args
  205. total_emoji_with_different_unicode_sequences = 0
  206. unless use_legacy_unicode_sequences
  207. # Issue warning of the implications of using full GitHub emoji Unicode sequences
  208. puts USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING
  209. end
  210. if visualize_unicode_diffs
  211. puts "The following emoji have different Unicode sequences from those of legacy versions:"
  212. end
  213. response = HTTP::Client.get URI.parse(EMOJIS_JSON_RAW_URL)
  214. shortcode_to_emoji = Hash(String, String).from_json(response.body)
  215. shortcode_to_emoji = shortcode_to_emoji.transform_values { |value| parse_github_filename_into_unicode_string(value) }
  216. # Now before we proceed we need to 'mix in' any legacy emoji alias shortcodes that
  217. # are missing from the GitHub emoji list.
  218. LEGACY_EMOJIS.each do |shortcode_alias, emoji_unicode|
  219. unless shortcode_to_emoji.includes?(shortcode_alias)
  220. shortcode_to_emoji[shortcode_alias] = emoji_unicode
  221. end
  222. end
  223. emojis_content = String::Builder.new("# GENERATED FILE. DO NOT EDIT.
  224. #
  225. # This file was generated from GitHub's emoji API list endpoint:
  226. # #{EMOJIS_JSON_RAW_URL}
  227. # at #{Time.utc} by the script, tools/update_github_emojis.cr.\n\n")
  228. emojis_content.puts "module Luce"
  229. emojis_content.puts " class Emojis"
  230. emojis_content.puts
  231. emojis_content.puts " # Returns the emoji for *name*, or `nil` if no emoji exists."
  232. emojis_content.puts " def self.[]?(name : String) : String?"
  233. emojis_content.puts " @@hash.fetch(name, nil)"
  234. emojis_content.puts " end"
  235. emojis_content.puts
  236. emojis_content.puts " def self.each(& : Tuple(String, String) -> ) : Nil"
  237. emojis_content.puts " @@hash.each { |key| yield key }"
  238. emojis_content.puts " end"
  239. emojis_content.puts
  240. emojis_content.puts " def self.includes?(key : String) : Bool"
  241. emojis_content.puts " @@hash.includes?(key)"
  242. emojis_content.puts " end"
  243. emojis_content.puts
  244. emojis_content.puts
  245. emojis_content.puts " @@hash : Hash(String, String) = {"
  246. emoji_count = 0
  247. ignored = [] of String
  248. errored = [] of String
  249. # Dump in sorted order now to facilitate comparison with new GitHub emoji
  250. sorted_keys = shortcode_to_emoji.keys.sort!
  251. sorted_keys.each do |shortcode_alias|
  252. emoji_unicode = shortcode_to_emoji[shortcode_alias]
  253. if use_legacy_unicode_sequences &&
  254. LEGACY_EMOJIS.includes?(shortcode_alias) &&
  255. shortcode_alias != "cricket" &&
  256. shortcode_alias != "beetle"
  257. emoji_unicode = LEGACY_EMOJIS[shortcode_alias]
  258. end
  259. if LEGACY_EMOJIS.includes?(shortcode_alias) &&
  260. emoji_unicode != LEGACY_EMOJIS[shortcode_alias]
  261. total_emoji_with_different_unicode_sequences += 1
  262. if visualize_unicode_diffs
  263. puts "#{emoji_unicode} was #{LEGACY_EMOJIS[shortcode_alias]} :#{shortcode_alias}"
  264. end
  265. end
  266. if emoji_unicode != ERROR_SPECIAL_REPLACEMENT && !emoji_unicode.empty?
  267. emojis_content.puts " \"#{shortcode_alias}\" => \"#{emoji_unicode}\","
  268. if dump_markdown_shortcodes
  269. puts ":#{shortcode_alias}:"
  270. elsif dump_markdown_tooltip_shortcodes
  271. puts "[:#{shortcode_alias}:](## \"&colon;#{shortcode_alias}&colon; emoji\")"
  272. end
  273. emoji_count += 1
  274. else
  275. if emoji_unicode == ERROR_SPECIAL_REPLACEMENT
  276. errored << shortcode_alias
  277. else
  278. ignored << shortcode_alias
  279. end
  280. end
  281. end
  282. emojis_content.puts " }"
  283. emojis_content.puts " end"
  284. emojis_content.puts "end"
  285. File.write(EMOJIS_FILE_PATH, emojis_content.to_s)
  286. # We are outputing the markdown to stdout, and presumably it
  287. # is being captured, so we exit now to exclude the summary
  288. # report from being included in the emoji markdown we have
  289. # been outputing.
  290. exit 0 if dump_markdown_shortcodes
  291. puts "Wrote data to #{EMOJIS_FILE_PATH} for #{emoji_count} emoji,
  292. #{total_emoji_with_different_unicode_sequences} emoji's Unicode sequences differ from legacy versions#{visualize_unicode_diffs ? "" : " (run with -v flag to visualize"},
  293. ignoring #{ignored.size}: #{ignored.join(", ")},
  294. errored: #{errored.size} #{errored.join(", ")}."
  295. Process.run("crystal", ["tool", "format", EMOJIS_FILE_PATH])
  296. end
  297. main