123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- #
- # Copyright (c) 2023 supercell
- #
- # SPDX-License-Identifier: BSD-3-Clause
- #
- require "http/client"
- require "json"
- require "option_parser"
- require "../src/luce/legacy_emojis"
# Extracts the hex code point list from a GitHub emoji image URL.
#
# Capture group 1 holds the `-`-separated hex code points, e.g. `1f1fa-1f1fe`
# for ".../images/icons/emoji/unicode/1f1fa-1f1fe.png?v8". URLs without a
# `unicode/<hex>.png` segment (GitHub custom emoji) do not match.
# The dot before `png` is escaped so that only a literal `.png` is accepted.
GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN = %r{.*unicode/([A-Fa-f0-9-]+)\.png}
# GitHub emoji API endpoint listing every shortcode with its image URL.
#
# The generator reconciles the GitHub list with the legacy emoji so that
# nothing changes or breaks for existing shortcodes. Essentially only two
# emoji change, and the legacy emoji stays available under an alternate name:
#
# * `beetle` changes from `🐞` to `🪲`; the legacy emoji is `lady_beetle`.
# * `cricket` changes from `🏏` to `🦗`; the legacy emoji is `cricket_game`.
#
# If the `-g` flag is used to force the GitHub Unicode sequences, the `email`
# emoji additionally changes from '✉️' to '📧'.
EMOJIS_JSON_RAW_URL = "https://api.github.com/emojis"

# Path of the Crystal source file this script writes.
EMOJIS_FILE_PATH = "src/luce/emojis.cr"

# Legacy emoji table the GitHub list is reconciled against.
LEGACY_EMOJIS = Luce::LegacyEmojis
# Hex code point sequences, spelled exactly as they appear in GitHub emoji
# file names, that get a single trailing variation selector when converted
# to a Unicode string.
#
# This list only matters when the --use-github-unicodes option is used, where
# it minimizes the visual differences in the output emoji.
LEGACY_EMOJIS_USED_VARIATION_MODIFIER = %w(
  263a 2600 2601 2744 2708 260e 2702 2712 270f 2764
  d83c-de37 2734 3299 3297 d83c-dd70 d83c-dd71 d83c-dd7e
  2668 203c 2049 303d 26a0 267b 2747 2733 24c2
  d83c-de02 d83c-dd7f
  23cf 25b6 25c0 27a1 2b05 2b06 2b07
  2197 2198 2199 2196 2195 2194
  21aa 21a9 2934 2935 2139 3030
  2714 2716 00a9 00ae 2122 2611
  25aa 25ab 25fc 25fb
  2660 2663 2665 2666
)
# Unicode replacement character '�' (U+FFFD), used as the marker value for an
# emoji file name that could not be converted.
ERROR_SPECIAL_REPLACEMENT = "\u{FFFD}"

# Warning text shown in the usage banner and printed when
# --use-github-unicodes is in effect.
USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING = "
IMPORTANT NOTE: The use of --use-github-unicodes switch will force using
GitHub Unicode sequences.
This option is essentially here only for completeness, not for
release use.
The slight visual differences of some emoji might also be another
reason using --use-github-unicodes should be considered a *Breaking Change*.
Some test will fail because of the different Unicode sequences
and the emojis.unit file would need to be updated to contain the new
expected GitHub versions of the Unicode sequences of the emoji in order
for the tests to pass.
"
# Converts a GitHub emoji image URL into the Unicode string for that emoji.
#
# The GitHub emoji API returns a JSON map of the form
# `{ "shortcode":"emojifilename" ... }`. For a Unicode emoji the file name is
# the list of hex strings of the *essential* code points of the emoji, with
# multiple code points separated by '-'. Example file names:
#
# - "https://github.githubassets.com/images/icons/emoji/unicode/1f643.png?v8"
# - "https://github.githubassets.com/images/icons/emoji/unicode/1f1fa-1f1fe.png?v8"
# - "https://github.githubassets.com/images/icons/emoji/unicode/1f469-1f469-1f467-1f466.png?v8"
#
# These sequences leave out the zero width joiner (0x200D) and variation
# selector (0xFE0F) modifiers, so they are added back here:
#
# - a sequence listed in `LEGACY_EMOJIS_USED_VARIATION_MODIFIER` gets one
#   variation selector appended at the end;
# - in any other sequence, every code point except the last is followed by a
#   variation selector and then a zero width joiner, the joiner being left out
#   after `#` (0x23) and the digits `0`-`9` (0x30-0x39).
#
# NOTE(review): `*` (0x2A) is not exempted from the joiner; confirm whether
# that is intended for the asterisk keycap.
#
# Some file names belong to GitHub 'custom' emoji, which have no Unicode
# equivalent and carry only the GitHub custom name instead of hex code
# points, for example
# "https://github.githubassets.com/images/icons/emoji/godmode.png?v8".
# Those are ignored (there are only 19 of them, mostly pixel art from the old
# Doom game) and yield an empty string.
#
# Returns `ERROR_SPECIAL_REPLACEMENT`, after reporting the file name on
# STDERR, when the hex list cannot be turned into characters.
def parse_github_filename_into_unicode_string(emoji_filename : String) : String
  variation_selector = 0xFE0F
  zero_width_joiner = 0x200D

  hex_sequence = GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN.match(emoji_filename).try &.[1]
  # No hex list means a GitHub custom emoji that exists as a PNG image only.
  return "" if hex_sequence.nil?

  appends_single_selector = LEGACY_EMOJIS_USED_VARIATION_MODIFIER.includes?(hex_sequence)
  essential_code_points = hex_sequence.split("-").map(&.to_i(16))

  full_sequence = [] of Int32
  if appends_single_selector
    full_sequence.concat(essential_code_points)
    full_sequence << variation_selector
  else
    last_index = essential_code_points.size - 1
    essential_code_points.each_with_index do |code_point, index|
      full_sequence << code_point
      # Nothing is inserted after the final code point.
      next if index >= last_index

      full_sequence << variation_selector
      # '#' and 0-9 don't use the zero width joiner.
      digit_or_hash = code_point == 0x23 || (0x30..0x39).includes?(code_point)
      full_sequence << zero_width_joiner unless digit_or_hash
    end
  end

  full_sequence.map(&.chr).join
rescue
  STDERR.puts "Invalid/Non-Conformant emoji filename pattern found \"#{emoji_filename}\"!"
  ERROR_SPECIAL_REPLACEMENT
end
# Regenerates `EMOJIS_FILE_PATH` from GitHub's emoji API.
#
# Steps:
# 1. Parse the command line options in *args*.
# 2. Download the shortcode => image URL map from `EMOJIS_JSON_RAW_URL` and
#    convert every URL with `parse_github_filename_into_unicode_string`.
# 3. Add every legacy shortcode that the GitHub list does not contain.
# 4. Write a `Luce::Emojis` class holding the sorted table, then run
#    `crystal tool format` on the written file.
# 5. Print a summary, unless markdown shortcodes were dumped to stdout.
#
# Options:
# - `-g` / `--use-github-unicodes`: keep the GitHub-derived sequences instead
#   of the legacy ones for shortcodes that exist in the legacy set.
# - `-v` / `--visualize-different-unicodes`: print each emoji whose sequence
#   differs from the legacy one.
# - `-s STYLE` / `--dump-markdown-shortcodes STYLE`: print every written
#   shortcode as markdown; STYLE is "plain" or "tooltip".
#
# Exits with status 1 on bad options or when the API request fails.
def main(args = ARGV)
  use_legacy_unicode_sequences = true
  visualize_unicode_diffs = false
  dump_markdown_shortcodes = false
  dump_markdown_tooltip_shortcodes = false

  parser = OptionParser.new do |p|
    p.banner = <<-BANNER
      Usage update_github_emojis.cr
      By default, the legacy Unicode sequences are used (for
      maximum visual compatibility with the legacy emoji).
      The --use-github-unicodes flag can be used so that the
      Unicode sequences from GitHub are used for emoji's that
      existed within the legacy set. This will result in a very slight visual
      difference for some emoji, but it will result in many more
      binary differences when comparing legacy_emoji.cr to emoji.cr.
      #{USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING}
      The --visualize-different-unicodes flag can be used to visually
      verify that any different Unicode sequences produce the same
      emoji.
      Options:
      BANNER

    p.on("-h", "--help", "Print help text and exit") do
      puts p
      exit 0
    end
    p.on("-g", "--use-github-unicodes", "Use the GitHub Unicode sequences instead of legacy sequences.") do
      use_legacy_unicode_sequences = false
    end
    p.on("-v", "--visualize-different-unicodes", "Visualize any Unicode sequence differences") do
      visualize_unicode_diffs = true
    end
    p.on("-s STYLE", "--dump-markdown-shortcodes STYLE", "Outputs all emoji shortcodes to stdout which can be used in markdown to show and test all emoji") do |value|
      style = value.downcase
      dump_markdown_shortcodes = (style == "plain")
      dump_markdown_tooltip_shortcodes = (style == "tooltip")
      unless dump_markdown_shortcodes || dump_markdown_tooltip_shortcodes
        STDERR.puts "--dump-markdown-shortcodes can only be one of \"plain\" or \"tooltip\""
        exit 1
      end
    end
    p.missing_option do |option|
      STDERR.puts "Missing option for #{option} flag"
      puts p
      exit 1
    end
    p.invalid_option do |option|
      STDERR.puts "Unknown option: #{option}"
      puts p
      exit 1
    end
  end
  parser.parse args

  total_emoji_with_different_unicode_sequences = 0

  unless use_legacy_unicode_sequences
    # Issue warning of the implications of using full GitHub emoji Unicode sequences
    puts USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING
  end
  if visualize_unicode_diffs
    puts "The following emoji have different Unicode sequences from those of legacy versions:"
  end

  response = HTTP::Client.get URI.parse(EMOJIS_JSON_RAW_URL)
  # An error response (e.g. rate limiting) is JSON too and would otherwise be
  # parsed as if it were the emoji map, so stop before generating anything.
  unless response.success?
    STDERR.puts "Request to #{EMOJIS_JSON_RAW_URL} failed with HTTP status #{response.status_code}"
    exit 1
  end

  github_emojis = Hash(String, String).from_json(response.body)
    .transform_values { |value| parse_github_filename_into_unicode_string(value) }

  # Materialize the legacy table once so that membership tests and look-ups
  # below are plain Hash key operations.
  legacy_emojis = {} of String => String
  LEGACY_EMOJIS.each { |shortcode_alias, emoji_unicode| legacy_emojis[shortcode_alias] = emoji_unicode }

  # 'Mix in' any legacy emoji alias shortcodes that are missing from the
  # GitHub emoji list; entries that GitHub provides win over legacy ones here.
  shortcode_to_emoji = legacy_emojis.merge(github_emojis)

  emojis_content = String::Builder.new
  emojis_content.puts "# GENERATED FILE. DO NOT EDIT."
  emojis_content.puts "#"
  emojis_content.puts "# This file was generated from GitHub's emoji API list endpoint:"
  emojis_content.puts "# #{EMOJIS_JSON_RAW_URL}"
  emojis_content.puts "# at #{Time.utc} by the script, tools/update_github_emojis.cr."
  emojis_content.puts
  emojis_content.puts "module Luce"
  emojis_content.puts "  class Emojis"
  emojis_content.puts
  emojis_content.puts "    # Returns the emoji for *name*, or `nil` if no emoji exists."
  emojis_content.puts "    def self.[]?(name : String) : String?"
  emojis_content.puts "      @@hash.fetch(name, nil)"
  emojis_content.puts "    end"
  emojis_content.puts
  emojis_content.puts "    def self.each(& : Tuple(String, String) -> ) : Nil"
  emojis_content.puts "      @@hash.each { |key| yield key }"
  emojis_content.puts "    end"
  emojis_content.puts
  emojis_content.puts "    def self.includes?(key : String) : Bool"
  # `has_key?` tests the shortcode itself rather than {key, value} pairs.
  emojis_content.puts "      @@hash.has_key?(key)"
  emojis_content.puts "    end"
  emojis_content.puts
  emojis_content.puts
  emojis_content.puts "    @@hash : Hash(String, String) = {"

  emoji_count = 0
  ignored = [] of String
  errored = [] of String

  # Dump in sorted order now to facilitate comparison with new GitHub emoji
  shortcode_to_emoji.keys.sort!.each do |shortcode_alias|
    emoji_unicode = shortcode_to_emoji[shortcode_alias]
    legacy_unicode = legacy_emojis[shortcode_alias]?

    if legacy_unicode
      # 'cricket' and 'beetle' always take the GitHub sequence.
      if use_legacy_unicode_sequences && shortcode_alias != "cricket" && shortcode_alias != "beetle"
        emoji_unicode = legacy_unicode
      end
      if emoji_unicode != legacy_unicode
        total_emoji_with_different_unicode_sequences += 1
        if visualize_unicode_diffs
          puts "#{emoji_unicode} was #{legacy_unicode} :#{shortcode_alias}"
        end
      end
    end

    if emoji_unicode == ERROR_SPECIAL_REPLACEMENT
      errored << shortcode_alias
    elsif emoji_unicode.empty?
      # GitHub custom emoji without a Unicode equivalent.
      ignored << shortcode_alias
    else
      emojis_content.puts "      \"#{shortcode_alias}\" => \"#{emoji_unicode}\","
      if dump_markdown_shortcodes
        puts ":#{shortcode_alias}:"
      elsif dump_markdown_tooltip_shortcodes
        puts "[:#{shortcode_alias}:](## \":#{shortcode_alias}: emoji\")"
      end
      emoji_count += 1
    end
  end

  emojis_content.puts "    }"
  emojis_content.puts "  end"
  emojis_content.puts "end"

  File.write(EMOJIS_FILE_PATH, emojis_content.to_s)
  # Format before any early exit so the written file is formatted in every mode.
  Process.run("crystal", ["tool", "format", EMOJIS_FILE_PATH])

  # We are outputting the markdown to stdout, and presumably it
  # is being captured, so we exit now to exclude the summary
  # report from being included in the emoji markdown we have
  # been outputting. This applies to both the plain and tooltip styles.
  exit 0 if dump_markdown_shortcodes || dump_markdown_tooltip_shortcodes

  visualize_hint = visualize_unicode_diffs ? "" : " (run with -v flag to visualize)"
  puts "Wrote data to #{EMOJIS_FILE_PATH} for #{emoji_count} emoji,"
  puts "#{total_emoji_with_different_unicode_sequences} emoji's Unicode sequences differ from legacy versions#{visualize_hint},"
  puts "ignoring #{ignored.size}: #{ignored.join(", ")},"
  puts "errored: #{errored.size} #{errored.join(", ")}."
end

main
|