123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
#!/usr/bin/env python3
"""Generate generated/Emoji.hpp and generated/Emoji.cpp from Unicode's emoji-test.txt.

The script downloads the emoji test data, extracts every codepoint sequence
that is not marked "minimally-qualified", and writes a C++ lookup table keyed
by the first codepoint of each sequence together with a matcher function
(QuickMedia::match_emoji_sequence) that picks the longest sequence matching
the start of a UTF-8 string.

Output paths are relative to the current working directory; the "generated"
directory must already exist.
"""

import requests

# Data file that lists one emoji sequence per non-comment line.
EMOJI_TEST_URL = "https://unicode.org/Public/emoji/15.0/emoji-test.txt"

# Without a timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(EMOJI_TEST_URL, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()

# Each entry is one emoji sequence as a list of integer codepoints.
all_sequences = []
for line in response.text.splitlines():
    # Skip blank lines, comment lines and minimally-qualified entries.
    if not line or line.startswith("#") or "minimally-qualified" in line:
        continue
    # The first ';'-separated column holds whitespace-separated hex codepoints.
    codepoint_column = line.split(";")[0]
    all_sequences.append([int(codepoint, base=16) for codepoint in codepoint_column.split()])

# Number of codepoints in the longest sequence (0 when nothing was parsed).
longest_sequence = max((len(sequence) for sequence in all_sequences), default=0)

with open("generated/Emoji.hpp", "w") as header_file:
    header_file.write("""#pragma once
#include <stdint.h>
#include <stddef.h>
// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
namespace QuickMedia {
bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
}""")

with open("generated/Emoji.cpp", "w") as source_file:
    # The map key is the first codepoint of a sequence, so the stored array only
    # needs room for the remaining codepoints: longest_sequence - 1 elements.
    source_file.write("""#include "Emoji.hpp"
#include <unordered_map>
#include <array>
#include <mglpp/system/Utf8.hpp>
// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!
namespace QuickMedia {
static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
""" % (longest_sequence - 1))
    for sequence in all_sequences:
        # One map entry per sequence: { first codepoint, { remaining codepoints } }.
        # Shorter sequences leave trailing array elements unset; the generated
        # matcher stops at the first 0 entry.
        remaining_sequences = [hex(codepoint) for codepoint in sequence[1:]]
        source_file.write(" { %s, { %s } },\n" % (hex(sequence[0]), ", ".join(remaining_sequences)))
    source_file.write(
""" };
bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
uint32_t codepoint;
size_t clen;
if(!mgl::utf8_decode(str, size, &codepoint, &clen))
return false;
const size_t str_start_index = clen;
sequence[0] = codepoint;
auto range = emoji_sequences.equal_range(codepoint);
if(range.first == range.second)
return false;
auto longest_match_it = range.first;
size_t longest_match_byte_length = str_start_index;
bool match_found = false;
for(auto it = range.first, end = range.second; it != end; ++it) {
size_t str_index = str_start_index;
for(size_t i = 0; i < it->second.size(); ++i) {
const uint32_t codepoint_in_sequence = it->second[i];
if(codepoint_in_sequence == 0)
break;
if(str_index >= size)
goto next;
if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
goto next;
if(codepoint != codepoint_in_sequence)
goto next;
str_index += clen;
}
if(str_index >= longest_match_byte_length) {
longest_match_it = it;
longest_match_byte_length = str_index;
}
match_found = true;
next:;
}
if(!match_found)
return false;
size_t sequence_index = 1;
for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
const uint32_t codepoint_in_sequence = longest_match_it->second[i];
if(codepoint_in_sequence == 0)
break;
sequence[sequence_index] = codepoint_in_sequence;
++sequence_index;
}
sequence_len = sequence_index;
byte_length = longest_match_byte_length;
return true;
}
}
""")
|