Ryu18
/
QuickMedia
mirror of https://repo.dec05eba.com/QuickMedia


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
							#!/usr/bin/env python3

import requests

# Download the emoji test data and collect every listed emoji sequence as a
# list of integer codepoints.
# The timeout keeps the generator from hanging forever if the server stalls.
response = requests.get(
    "https://unicode.org/Public/emoji/15.0/emoji-test.txt",
    timeout=30,
)
response.raise_for_status()

# all_sequences: one list of int codepoints per accepted data line.
# longest_sequence: number of codepoints in the longest accepted sequence.
all_sequences = []
longest_sequence = 0

for raw_line in response.text.splitlines():
    line = raw_line.strip()
    # Skip blank lines, comment lines and minimally-qualified entries.
    if not line or line.startswith("#") or "minimally-qualified" in line:
        continue

    # Only the first ';'-separated column is used: whitespace-separated
    # hexadecimal codepoints.
    codepoints = line.split(";")[0].split()
    sequence = [int(codepoint, base=16) for codepoint in codepoints]
    if not sequence:
        # A line without codepoints would produce an empty entry that the
        # table writer cannot index; ignore it instead.
        continue

    longest_sequence = max(longest_sequence, len(sequence))
    all_sequences.append(sequence)

# Public C++ header: declares QuickMedia::match_emoji_sequence and nothing else.
# The text is static, so it is written out verbatim.
header_text = """#pragma once

#include <stdint.h>
#include <stddef.h>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length);
}"""

with open("generated/Emoji.hpp", "w") as hpp_out:
    hpp_out.write(header_text)

# The generated C++ source is assembled from three pieces:
#   1. a prologue with the includes and the opening of the lookup table,
#   2. one table row per emoji sequence,
#   3. an epilogue that closes the table and defines match_emoji_sequence.

# The single %d receives the std::array length. The first codepoint of every
# sequence is the map key, so the array only holds the remaining codepoints.
cpp_prologue = """#include "Emoji.hpp"
#include <unordered_map>
#include <array>
#include <mglpp/system/Utf8.hpp>

// This file was automatically generated with generate-emoji-sequences.py, do not edit manually!

namespace QuickMedia {
    static std::unordered_multimap<uint32_t, std::array<uint32_t, %d>> emoji_sequences = {
"""

# Row layout: { <first codepoint>, { <remaining codepoints, comma separated> } },
cpp_table_row = "        { %s, { %s } },\n"

# The matcher decodes the first codepoint, walks every table row keyed on it
# (stopping at the first 0 entry of a row) and reports the candidate that
# consumed the most bytes of the input.
cpp_epilogue = """    };

    bool match_emoji_sequence(const unsigned char *str, size_t size, uint32_t sequence[32], size_t &sequence_len, size_t &byte_length) {
        uint32_t codepoint;
        size_t clen;
        if(!mgl::utf8_decode(str, size, &codepoint, &clen))
            return false;

        const size_t str_start_index = clen;
        sequence[0] = codepoint;

        auto range = emoji_sequences.equal_range(codepoint);
        if(range.first == range.second)
            return false;

        auto longest_match_it = range.first;
        size_t longest_match_byte_length = str_start_index;
        bool match_found = false;

        for(auto it = range.first, end = range.second; it != end; ++it) {
            size_t str_index = str_start_index;

            for(size_t i = 0; i < it->second.size(); ++i) {
                const uint32_t codepoint_in_sequence = it->second[i];
                if(codepoint_in_sequence == 0)
                    break;

                if(str_index >= size)
                    goto next;

                if(!mgl::utf8_decode(str + str_index, size - str_index, &codepoint, &clen))
                    goto next;

                if(codepoint != codepoint_in_sequence)
                    goto next;

                str_index += clen;
            }

            if(str_index >= longest_match_byte_length) {
                longest_match_it = it;
                longest_match_byte_length = str_index;
            }

            match_found = true;
            next:;
        }

        if(!match_found)
            return false;

        size_t sequence_index = 1;
        for(size_t i = 0; i < longest_match_it->second.size(); ++i) {
            const uint32_t codepoint_in_sequence = longest_match_it->second[i];
            if(codepoint_in_sequence == 0)
                break;

            sequence[sequence_index] = codepoint_in_sequence;
            ++sequence_index;
        }

        sequence_len = sequence_index;
        byte_length = longest_match_byte_length;
        return true;
    }
}
"""

with open("generated/Emoji.cpp", "w") as cpp_out:
    cpp_out.write(cpp_prologue % (longest_sequence - 1))

    for entry in all_sequences:
        key_literal = hex(entry[0])
        tail_literals = ", ".join(map(hex, entry[1:]))
        cpp_out.write(cpp_table_row % (key_literal, tail_literals))

    cpp_out.write(cpp_epilogue)