# generate-emoji-sequences.py 3.6 KB
  1. #!/usr/bin/env python3
  2. import requests
  3. response = requests.get("https://unicode.org/Public/emoji/15.0/emoji-test.txt")
  4. response.raise_for_status()
  5. all_sequences = []
  6. longest_sequence = 0
  7. for line in response.text.splitlines(False):
  8. if len(line) == 0 or line[0] == '#' or line.find("minimally-qualified") != -1:
  9. continue
  10. columns = line.split(";")
  11. codepoints = columns[0].split()
  12. sequence = []
  13. for codepoint in codepoints:
  14. sequence.append(int(codepoint, base=16))
  15. longest_sequence = max(longest_sequence, len(sequence))
  16. all_sequences.append(sequence)
  17. with open("generated/Emoji.hpp", "w") as header_file:
  18. header_file.write("""#pragma once
  19. #include <stdint.h>
  20. #include <stddef.h>
  21. // This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
  22. namespace QuickMedia {
  23. bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
  24. }""")
  25. with open("generated/Emoji.cpp", "w") as source_file:
  26. source_file.write("""#include "Emoji.hpp"
  27. #include <unordered_map>
  28. #include <array>
  29. #include <mglpp/system/Utf8.hpp>
  30. // This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
  31. namespace QuickMedia {
  32. static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
  33. """ % (longest_sequence - 1))
  34. for sequence in all_sequences:
  35. remaining_sequences = [ hex(c) for c in sequence[1:] ]
  36. source_file.write(" { %s, { %s } },\n" % (hex(sequence[0]), ", ".join(remaining_sequences)))
  37. source_file.write(
  38. """ };
  39. bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
  40. uint32_t codepoint;
  41. size_t clen;
  42. if(!mgl::utf8_decode(str, size, &codepoint, &clen))
  43. return false;
  44. const size_t str_start_index = clen;
  45. sequence[0] = codepoint;
  46. auto range = emoji_sequences.equal_range(codepoint);
  47. if(range.first == range.second)
  48. return false;
  49. auto longest_match_it = range.first;
  50. size_t longest_match_byte_length = str_start_index;
  51. bool match_found = false;
  52. for(auto it = range.first, end = range.second; it != end; ++it) {
  53. size_t str_index = str_start_index;
  54. for(size_t i = 0; i < it->second.size(); ++i) {
  55. const uint32_t codepoint_in_sequence = it->second[i];
  56. if(codepoint_in_sequence == 0)
  57. break;
  58. if(str_index >= size)
  59. goto next;
  60. if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
  61. goto next;
  62. if(codepoint != codepoint_in_sequence)
  63. goto next;
  64. str_index += clen;
  65. }
  66. if(str_index >= longest_match_byte_length) {
  67. longest_match_it = it;
  68. longest_match_byte_length = str_index;
  69. }
  70. match_found = true;
  71. next:;
  72. }
  73. if(!match_found)
  74. return false;
  75. size_t sequence_index = 1;
  76. for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
  77. const uint32_t codepoint_in_sequence = longest_match_it->second[i];
  78. if(codepoint_in_sequence == 0)
  79. break;
  80. sequence[sequence_index] = codepoint_in_sequence;
  81. ++sequence_index;
  82. }
  83. sequence_len = sequence_index;
  84. byte_length = longest_match_byte_length;
  85. return true;
  86. }
  87. }
  88. """)