bluemoon
/
luce


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
							#
# Copyright (c) 2023 supercell
#
# SPDX-License-Identifier: BSD-3-Clause
#
require "http/client"
require "json"
require "option_parser"

require "../src/luce/legacy_emojis"

# Regular expression to parse unicode from GitHub emoji API output filenames.
#
# Capture group 1 is the '-'-separated list of hex code points that follows
# `unicode/` and precedes the `.png` extension.  The dot is escaped so that
# only a literal ".png" terminates the hex list.
GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN = %r{.*unicode/([A-Fa-f0-9-]+)\.png}

# URL for GitHub's emoji API.  We reconcile with our legacy emoji so that
# we don't change or break anything.
# There are essentially only TWO (2) emoji that change and the
# legacy emoji is still available with an alternate name.
# The 'beetle' emoji changes from `🐞` to `🪲`, legacy available as 'lady_beetle'.
# The 'cricket' emoji changes from `🏏` to `🦗`, legacy available as 'cricket_game'.
# (If the -g flag is used to force using the GitHub Unicode sequences for the
# emoji then additionally the 'email' emoji changes from '✉️' to '📧'.)
#
# `EMOJIS_FILE_PATH` is the (relative) path of the Crystal source file that
# this script writes.
EMOJIS_JSON_RAW_URL = "https://api.github.com/emojis"
EMOJIS_FILE_PATH    = "src/luce/emojis.cr"

# Short alias for the legacy emoji table required from
# `../src/luce/legacy_emojis`; this script calls `each`, `includes?` and `[]`
# on it.
LEGACY_EMOJIS = Luce::LegacyEmojis

# This array is ONLY USED when the --use-github-unicodes option is used to
# minimize the visual differences in the output emoji.
#
# Each entry is compared against the raw hex list captured from a GitHub
# emoji filename; a match gets a trailing variation selector (U+FE0F) and no
# other modifiers.  Entries must therefore be written the way the filenames
# are: as hex Unicode *code points* (e.g. "1f237"), never as UTF-16 surrogate
# pairs (e.g. "d83c-de37"), which can not occur in a filename.
LEGACY_EMOJIS_USED_VARIATION_MODIFIER = [
  "263a",
  "2600",
  "2601",
  "2744",
  "2708",
  "260e",
  "2702",
  "2712",
  "270f",
  "2764",
  "1f237",
  "2734",
  "3299",
  "3297",
  "1f170",
  "1f171",
  "1f17e",
  "2668",
  "203c",
  "2049",
  "303d",
  "26a0",
  "267b",
  "2747",
  "2733",
  "24c2",
  "1f202",
  "1f17f",
  "23cf",
  "25b6",
  "25c0",
  "27a1",
  "2b05",
  "2b06",
  "2b07",
  "2197",
  "2198",
  "2199",
  "2196",
  "2195",
  "2194",
  "21aa",
  "21a9",
  "2934",
  "2935",
  "2139",
  "3030",
  "2714",
  "2716",
  "00a9",
  "00ae",
  "2122",
  "2611",
  "25aa",
  "25ab",
  "25fc",
  "25fb",
  "2660",
  "2663",
  "2665",
  "2666",
]

# Special replacement character '�' (U+FFFD).
#
# Used as a sentinel: `parse_github_filename_into_unicode_string` returns it
# when a filename can not be converted, and `main` lists any shortcode mapped
# to it as "errored" instead of writing it to the generated file.
ERROR_SPECIAL_REPLACEMENT = "\uFFFD"

# Warning text about the --use-github-unicodes switch.  It is embedded in the
# option parser's help banner and printed to stdout whenever the switch is
# actually used.
USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING = "
IMPORTANT NOTE: The use of --use-github-unicodes switch will force using
GitHub Unicode sequences.
This option is essentially here only for completeness, not for
release use.
The slight visual differences of some emoji might also be another
reason using --use-github-unicodes should be considered a *Breaking Change*.

Some test will fail because of the different Unicode sequences
and the emojis.unit file would need to be updated to contain the new
expected GitHub versions of the Unicode sequences of the emoji in order
for the tests to pass.
"

# Converts one image URL from the GitHub emoji API into the Unicode string
# for that emoji.
#
# The GitHub API URL returns a JSON map of all emoji in the form of
# `{ "shortcode":"emojifilename" ... }`.
# Each filename is the list of hex strings of the *essential* Unicode code
# points representing the emoji, separated by '-'.
# These sequences exclude the zero width joiner (0x200D) and variation
# selector (0xFE0F) modifiers, so they are added back here to build the
# actual Unicode string representing the emoji.
# Example filenames (single and multiple code point examples):
#  - "https://github.githubassets.com/images/icons/emoji/unicode/1f643.png?v8"
#  - "https://github.githubassets.com/images/icons/emoji/unicode/1f1fa-1f1fe.png?v8"
#  - "https://github.githubassets.com/images/icons/emoji/unicode/1f469-1f469-1f467-1f466.png?v8"
# NOTE: Some filenames are GitHub 'custom' emoji that have no Unicode
# equivalent; these have no hex code points, only the GitHub custom name.
# We ignore these (there are only 19 and they are mostly pixel art from
# the old Doom game).
# Example GitHub custom emoji filename:
#  - "https://github.githubassets.com/images/icons/emoji/godmode.png?v8",
#
# Returns:
#  - the emoji string on success,
#  - `""` when *emoji_filename* does not match
#    `GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN` (a custom emoji),
#  - `ERROR_SPECIAL_REPLACEMENT` (after reporting on STDERR) when the hex list
#    can not be turned into characters.
def parse_github_filename_into_unicode_string(emoji_filename : String) : String
  variation_selector = 0xFE0F
  zero_width_joiner = 0x200D

  raw_hex_list = GITHUB_EMOJI_UNICODE_FROM_FILENAME_PATTERN.match(emoji_filename).try &.[1]
  # This is a GitHub custom emoji: it is represented by a PNG image only and
  # there is no equivalent Unicode. We have to ignore it.
  return "" if raw_hex_list.nil?

  # Raises on an empty or out-of-range piece; handled by the rescue below.
  raw_code_points = raw_hex_list.split("-").map(&.to_i(16))
  code_points = [] of Int32

  if LEGACY_EMOJIS_USED_VARIATION_MODIFIER.includes?(raw_hex_list)
    # Legacy-compatible form: the raw code points plus one trailing
    # variation selector.
    code_points.concat(raw_code_points)
    code_points << variation_selector
  else
    # Re-insert the variation selector and zero width joiner between the
    # code points (nothing is appended after the last one).
    # NOTE(review): every non-keycap pair is joined this way, including
    # regional-indicator flag pairs such as 1f1fa-1f1fe — confirm that is
    # intended for emoji that are not covered by the legacy table.
    last_index = raw_code_points.size - 1
    raw_code_points.each_with_index do |code_point, i|
      code_points << code_point
      next if i >= last_index

      code_points << variation_selector
      # Keycap bases '#', '*' and '0'-'9' are followed directly by the
      # enclosing keycap, so they don't use the zero width joiner.
      keycap_base = code_point == 0x23 ||
                    code_point == 0x2A ||
                    (0x30..0x39).includes?(code_point)
      code_points << zero_width_joiner unless keycap_base
    end
  end

  code_points.map(&.chr).join
rescue
  # Best effort: report the offending filename and hand back the sentinel so
  # the caller can list the shortcode as errored instead of aborting.
  STDERR.puts "Invalid/Non-Conformant emoji filename pattern found \"#{emoji_filename}\"!"
  ERROR_SPECIAL_REPLACEMENT
end

# Entry point of the script.
#
# Parses *args*, downloads the shortcode => filename map from
# `EMOJIS_JSON_RAW_URL`, converts every filename into a Unicode string, mixes
# in legacy shortcodes that GitHub does not list, writes the generated
# `Luce::Emojis` class to `EMOJIS_FILE_PATH`, formats that file with
# `crystal tool format` and finally prints a summary.
#
# When one of the --dump-markdown-shortcodes styles is selected the shortcodes
# are printed to stdout and the summary is skipped.
#
# Exits with status 1 on a bad command line or when the download fails.
def main(args = ARGV)
  use_legacy_unicode_sequences = true
  visualize_unicode_diffs = false
  dump_markdown_shortcodes = false
  dump_markdown_tooltip_shortcodes = false

  parser = OptionParser.new do |p|
    p.banner = "Usage update_github_emojis.cr

By default, the legacy Unicode sequences are used (for
maximum visual compatibility with the legacy emoji).
The --use-github-unicodes flag can be used so that the
Unicode sequences from GitHub are used for emoji's that
existed within the legacy set.  This will result in a very slight visual
difference for some emoji, but it will result in many more
binary differences when comparing legacy_emoji.cr to emoji.cr.
#{USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING}

The --visualize-different-unicodes flag can be used to visually
verify that any different Unicode sequences produce the same
emoji.

Options:"
    p.on("-h", "--help", "Print help text and exit") do
      puts p
      exit 0
    end
    p.on("-g", "--use-github-unicodes", "Use the GitHub Unicode sequences instead of legacy sequences.") do
      use_legacy_unicode_sequences = false
    end
    p.on("-v", "--visualize-different-unicodes", "Visualize any Unicode sequence differences") do
      visualize_unicode_diffs = true
    end
    p.on("-s STYLE", "--dump-markdown-shortcodes STYLE", "Outputs all emoji shortcodes to stdout which can be used in markdown to show and test all emoji") do |value|
      dump_markdown_shortcodes = (value.downcase == "plain")
      dump_markdown_tooltip_shortcodes = (value.downcase == "tooltip")
      unless dump_markdown_shortcodes || dump_markdown_tooltip_shortcodes
        STDERR.puts "--dump-markdown-shortcodes can only be one of \"plain\" or \"tooltip\""
        exit 1
      end
    end
    p.missing_option do |option|
      STDERR.puts "Missing option for #{option} flag"
      puts p
      exit 1
    end
    p.invalid_option do |option|
      STDERR.puts "Unknown option: #{option}"
      puts p
      exit 1
    end
  end

  parser.parse args

  total_emoji_with_different_unicode_sequences = 0
  unless use_legacy_unicode_sequences
    # Issue warning of the implications of using full GitHub emoji Unicode sequences
    puts USE_OF_GITHUB_UNICODE_SEQUENCES_WARNING
  end

  if visualize_unicode_diffs
    puts "The following emoji have different Unicode sequences from those of legacy versions:"
  end

  response = HTTP::Client.get URI.parse(EMOJIS_JSON_RAW_URL)
  # Don't try to parse an error body as the emoji map.
  unless response.success?
    STDERR.puts "Failed to fetch #{EMOJIS_JSON_RAW_URL}: HTTP #{response.status_code}"
    exit 1
  end
  shortcode_to_emoji = Hash(String, String).from_json(response.body)
  shortcode_to_emoji = shortcode_to_emoji.transform_values { |value| parse_github_filename_into_unicode_string(value) }

  # Now before we proceed we need to 'mix in' any legacy emoji alias shortcodes that
  # are missing from the GitHub emoji list.
  # `has_key?` is required here: `Hash#includes?` compares against
  # {key, value} tuples, so it is never true for a bare shortcode and every
  # GitHub entry would be overwritten by its legacy counterpart.
  LEGACY_EMOJIS.each do |shortcode_alias, emoji_unicode|
    unless shortcode_to_emoji.has_key?(shortcode_alias)
      shortcode_to_emoji[shortcode_alias] = emoji_unicode
    end
  end

  emojis_content = String::Builder.new("# GENERATED FILE. DO NOT EDIT.
#
# This file was generated from GitHub's emoji API list endpoint:
# #{EMOJIS_JSON_RAW_URL}
# at #{Time.utc} by the script, tools/update_github_emojis.cr.\n\n")
  emojis_content.puts "module Luce"
  emojis_content.puts "  class Emojis"
  emojis_content.puts
  emojis_content.puts "    # Returns the emoji for *name*, or `nil` if no emoji exists."
  emojis_content.puts "    def self.[]?(name : String) : String?"
  emojis_content.puts "      @@hash.fetch(name, nil)"
  emojis_content.puts "    end"
  emojis_content.puts
  emojis_content.puts "    def self.each(& : Tuple(String, String) -> ) : Nil"
  emojis_content.puts "      @@hash.each { |key| yield key }"
  emojis_content.puts "    end"
  emojis_content.puts
  emojis_content.puts "    def self.includes?(key : String) : Bool"
  # Key lookup, for the same reason as the `has_key?` call above.
  emojis_content.puts "      @@hash.has_key?(key)"
  emojis_content.puts "    end"
  emojis_content.puts
  emojis_content.puts
  emojis_content.puts "    @@hash : Hash(String, String) = {"

  emoji_count = 0
  ignored = [] of String
  errored = [] of String

  # Dump in sorted order now to facilitate comparison with new GitHub emoji
  sorted_keys = shortcode_to_emoji.keys.sort!
  sorted_keys.each do |shortcode_alias|
    emoji_unicode = shortcode_to_emoji[shortcode_alias]

    # In legacy mode keep the legacy sequence for every legacy shortcode,
    # except the two whose emoji deliberately changes.
    if use_legacy_unicode_sequences &&
       LEGACY_EMOJIS.includes?(shortcode_alias) &&
       shortcode_alias != "cricket" &&
       shortcode_alias != "beetle"
      emoji_unicode = LEGACY_EMOJIS[shortcode_alias]
    end
    if LEGACY_EMOJIS.includes?(shortcode_alias) &&
       emoji_unicode != LEGACY_EMOJIS[shortcode_alias]
      total_emoji_with_different_unicode_sequences += 1
      if visualize_unicode_diffs
        puts "#{emoji_unicode} was #{LEGACY_EMOJIS[shortcode_alias]} :#{shortcode_alias}"
      end
    end

    if emoji_unicode == ERROR_SPECIAL_REPLACEMENT
      # The filename could not be converted.
      errored << shortcode_alias
    elsif emoji_unicode.empty?
      # GitHub custom emoji without a Unicode equivalent.
      ignored << shortcode_alias
    else
      emojis_content.puts "      \"#{shortcode_alias}\" => \"#{emoji_unicode}\","
      if dump_markdown_shortcodes
        puts ":#{shortcode_alias}:"
      elsif dump_markdown_tooltip_shortcodes
        puts "[:#{shortcode_alias}:](## \"&colon;#{shortcode_alias}&colon; emoji\")"
      end
      emoji_count += 1
    end
  end

  emojis_content.puts "    }"
  emojis_content.puts "  end"
  emojis_content.puts "end"

  File.write(EMOJIS_FILE_PATH, emojis_content.to_s)

  # Format the generated file before any early exit below, so that it is
  # formatted in the markdown dump modes as well.
  Process.run("crystal", ["tool", "format", EMOJIS_FILE_PATH])

  # We are outputting the markdown to stdout, and presumably it
  # is being captured, so we exit now to exclude the summary
  # report from being included in the emoji markdown we have
  # been outputting.  This applies to both dump styles.
  exit 0 if dump_markdown_shortcodes || dump_markdown_tooltip_shortcodes

  puts "Wrote data to #{EMOJIS_FILE_PATH} for #{emoji_count} emoji,
#{total_emoji_with_different_unicode_sequences} emoji's Unicode sequences differ from legacy versions#{visualize_unicode_diffs ? "" : " (run with -v flag to visualize)"},
ignoring #{ignored.size}: #{ignored.join(", ")},
errored: #{errored.size} #{errored.join(", ")}."
end

main