|
- <?xml version="1.0" encoding="utf-8"?>
- <!-- Created by Leo: http://leoeditor.com/leo_toc.html -->
- <leo_file xmlns:leo="http://leoeditor.com/namespaces/leo-python-editor/1.1" >
- <leo_header file_format="2"/>
- <globals/>
- <preferences/>
- <find_panel_settings/>
- <vnodes>
- <v t="caminhante.20200309141113.1"><vh>@settings</vh>
- <v t="caminhante.20200309141113.2"><vh>NodeActions</vh>
- <v t="caminhante.20200309141113.3"><vh>^@(file|clean) .*\.[ch] [X]</vh></v>
- <v t="caminhante.20200309141113.4"><vh>@run*</vh></v>
- </v>
- </v>
- <v t="caminhante.20200309142214.1"><vh>Utils</vh>
- <v t="caminhante.20200309141700.1"><vh>@run fossil status</vh></v>
- <v t="caminhante.20200309141709.1"><vh>@run sync Git repo</vh></v>
- <v t="caminhante.20200309142127.1"><vh>@run create Git repo</vh></v>
- </v>
- <v t="caminhante.20200309141027.3"><vh>Minimal UTF-8 support</vh>
- <v t="caminhante.20200309141415.1"><vh>@auto README.md</vh></v>
- <v t="caminhante.20200309141148.1"><vh>@clean ./utf8.c</vh>
- <v t="caminhante.20200309141148.2"><vh>static within</vh></v>
- <v t="caminhante.20200309141148.3"><vh>static ascii_char</vh></v>
- <v t="caminhante.20200309141148.4"><vh>static utf8_2bytes_char</vh></v>
- <v t="caminhante.20200309141148.5"><vh>static utf8_3bytes_char</vh></v>
- <v t="caminhante.20200309141148.6"><vh>static utf8_4bytes_char</vh></v>
- <v t="caminhante.20200309141148.7"><vh>uchar_valid</vh></v>
- <v t="caminhante.20200309141148.8"><vh>uchar_bytes</vh></v>
- <v t="caminhante.20200309141148.9"><vh>ustring_length</vh></v>
- <v t="caminhante.20200309141148.10"><vh>ustring_bytes</vh></v>
- <v t="caminhante.20200309141148.11"><vh>cstring_bytes</vh></v>
- <v t="caminhante.20200309141148.12"><vh>next_uchar</vh></v>
- <v t="caminhante.20200309141148.13"><vh>c_to_ustring</vh></v>
- <v t="caminhante.20200309141148.14"><vh>u_to_cstring</vh></v>
- <v t="caminhante.20200309141148.15"><vh>uchar_puts</vh></v>
- <v t="caminhante.20200309141148.16"><vh>ustring_puts</vh></v>
- </v>
- <v t="caminhante.20200309141158.1"><vh>@clean ./utf8.h</vh>
- <v t="caminhante.20200309141158.2"><vh>uchar_valid</vh></v>
- <v t="caminhante.20200309141158.3"><vh>uchar_bytes</vh></v>
- <v t="caminhante.20200309141158.4"><vh>ustring_length</vh></v>
- <v t="caminhante.20200309141158.5"><vh>ustring_bytes</vh></v>
- <v t="caminhante.20200309141158.6"><vh>cstring_bytes</vh></v>
- <v t="caminhante.20200309141158.7"><vh>next_uchar</vh></v>
- <v t="caminhante.20200309141158.8"><vh>c_to_ustring</vh></v>
- <v t="caminhante.20200309141158.9"><vh>u_to_cstring</vh></v>
- <v t="caminhante.20200309141158.10"><vh>uchar_puts</vh></v>
- <v t="caminhante.20200309141158.11"><vh>ustring_puts</vh></v>
- </v>
- <v t="caminhante.20200309145700.1"><vh>@clean Makefile</vh></v>
- </v>
- </vnodes>
- <tnodes>
- <t tx="caminhante.20200309141027.3"></t>
- <t tx="caminhante.20200309141113.1"></t>
- <t tx="caminhante.20200309141113.2">@language python</t>
- <t tx="caminhante.20200309141113.3">import subprocess,os
- if c.isChanged(): c.save()
- os.chdir(c.getNodePath(pClicked))
- filename = ' '.join(pClicked.h.split()[1:])
- g.es('gcc -c ' + filename)
- proc = subprocess.Popen(['gcc','-std=gnu99','-Wall','-Werror','-Wfatal-errors','-D_GNU_SOURCE',filename,'-c','-o','/dev/null'],
- stderr=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
- while True:
- data = proc.stdout.read()
- if len(data) == 0: break
- g.es(data)
- while True:
- data = proc.stderr.read()
- if len(data) == 0: break
- g.es(data)
- </t>
- <t tx="caminhante.20200309141113.4">@language python
- import subprocess
- def getpath (p):
- dict = c.scanAllDirectives(p)
- d = dict.get("path")
- if p.isAnyAtFileNode():
- filename = p.anyAtFileNodeName()
- filename = g.os_path_join(d,filename)
- if filename:
- d = g.os_path_dirname(filename)
- if d is None:
- return ""
- else:
- return g.os_path_normpath(d)
- def execute (cmd):
- # return subprocess.run(cmd,shell=True,universal_newlines=True,stderr=subprocess.STDOUT,stdout=subprocess.PIPE)
- return subprocess.check_output(cmd,shell=True,universal_newlines=True,stderr=subprocess.STDOUT)
- path = getpath(c.p)
- command = c.p.b
- cmdname = c.p.h
- g.es('---- '+cmdname+' ----')
- g.es(execute('cd "'+path+'";\n'+command))
- g.es('---- end ----')</t>
- <t tx="caminhante.20200309141148.1">#include "utf8.h"
- @others
- </t>
- <t tx="caminhante.20200309141148.10">// Number of bytes covered by the leading run of valid UTF-8 sequences in
- // `source`; counting stops at the first position where `uchar_bytes` yields 0.
- size_t ustring_bytes (char* source) {
-   size_t total = 0, width;
-   while ((width = uchar_bytes(source + total)) != 0)
-     total += width;
-   return total;
- }
- </t>
- <t tx="caminhante.20200309141148.11">// Sum of the `.bytes` members of every element before the terminating
- // element (the one with `.bytes == 0`). The terminator adds nothing.
- size_t cstring_bytes (struct uchar* source) {
-   size_t total = 0;
-   for (struct uchar *it = source; it->bytes != 0; ++it)
-     total += it->bytes;
-   return total;
- }
- </t>
- <t tx="caminhante.20200309141148.12">// Decode the UTF-8 sequence at `source` into a `struct uchar`.
- // Returns an all-zero object (`.bytes == 0`) when `uchar_bytes` reports no
- // valid sequence there; unused trailing `chars` are left as 0.
- struct uchar next_uchar (char* source) {
-   struct uchar result = {0};
-   size_t width = uchar_bytes(source);
-   result.bytes = (uint8_t)width;
-   // width is 0..4, so this copies exactly the bytes of the sequence (or nothing).
-   memcpy(result.chars, source, width);
-   return result;
- }
- </t>
- <t tx="caminhante.20200309141148.13">// Convert the leading valid UTF-8 text of `source` into `struct uchar`
- // elements at `destination`, then write one terminating element with
- // `.bytes == 0`.
- void c_to_ustring (char* source, struct uchar* destination) {
-   char *cursor = source;
-   struct uchar *out = destination;
-   for (;;) {
-     struct uchar current = next_uchar(cursor);
-     *out++ = current;
-     // The terminator has just been stored: nothing more to convert.
-     if (current.bytes == 0) break;
-     cursor += current.bytes;
-   }
- }
- </t>
- <t tx="caminhante.20200309141148.14">// Concatenate the bytes of every element of `source` (up to the element with
- // `.bytes == 0`) into `destination` and append a terminating `\0`.
- void u_to_cstring (struct uchar* source, char* destination) {
-   char *out = destination;
-   for (struct uchar *it = source; it->bytes != 0; it++) {
-     memcpy(out, it->chars, it->bytes);
-     out += it->bytes;
-   }
-   *out = '\0';
- }
- </t>
- <t tx="caminhante.20200309141148.15">// Write the bytes of `uc` to file descriptor `fileno`.
- // Returns the number of bytes actually written, or 0 if `write` failed.
- size_t uchar_puts (int fileno, struct uchar *uc) {
-   // write() returns ssize_t: -1 signals an error and must not be converted
-   // to size_t directly, where it would turn into SIZE_MAX.
-   ssize_t written = write(fileno, uc->chars, uc->bytes);
-   return written < 0 ? 0 : (size_t)written;
- }
- </t>
- <t tx="caminhante.20200309141148.16">// Write every element of `ustring` (up to the element with `.bytes == 0`) to
- // file descriptor `fileno`, stopping at the first failed or short write.
- // Returns the total number of bytes written.
- size_t ustring_puts (int fileno, struct uchar *ustring) {
-   size_t written = 0;
-   for (; ustring->bytes != 0; ustring++) {
-     size_t a = uchar_puts(fileno, ustring);
-     // More than was requested can only be an error value such as (size_t)-1:
-     // stop without adding it to the total.
-     if (a > ustring->bytes) break;
-     written += a;
-     // Short write: the bytes that went out are counted, then stop.
-     if (a < ustring->bytes) break;
-   }
-   return written;
- }
- </t>
- <t tx="caminhante.20200309141148.2">// True when `value` lies in the closed interval [lower, greater].
- static inline bool within (unsigned char value, unsigned char lower, unsigned char greater) {
-   return !(value < lower || value > greater);
- }
- </t>
- <t tx="caminhante.20200309141148.3">// True when the first byte of `source` is a 7-bit character other than `\0`
- // (0x01..0x7F).
- static bool ascii_char (char* source) {
-   const unsigned char lead = (unsigned char)source[0];
-   return lead >= 0x01 && lead <= 0x7F;
- }
- </t>
- <t tx="caminhante.20200309141148.4">// True when `source` starts with a well-formed 2-byte UTF-8 sequence
- // (lead 0xC2..0xDF followed by one continuation byte 0x80..0xBF).
- static bool utf8_2bytes_char (char* source) {
-   const unsigned char *s = (const unsigned char*)source;
-   // s[1] is only read once s[0] is a valid lead byte, so a string that ends
-   // at s[0] (e.g. its `\0` terminator) is never read past its end.
-   return within(s[0],0xC2,0xDF) && within(s[1],0x80,0xBF);
- }
- </t>
- <t tx="caminhante.20200309141148.5">// True when `source` starts with a well-formed 3-byte UTF-8 sequence.
- // The allowed range of the second byte depends on the lead byte: 0xE0 excludes
- // overlong encodings and 0xED excludes the surrogate range.
- static bool utf8_3bytes_char (char* source) {
-   // Bytes are examined as unsigned char: with a signed plain `char` a
-   // comparison such as `a == 0xE0` can never be true.
-   const unsigned char *s = (const unsigned char*)source;
-   const unsigned char a = s[0];
-   // Short-circuit evaluation reads s[1] only after a valid lead byte and s[2]
-   // only after a valid second byte, so the string terminator is never passed.
-   return
-     ((a==0xE0             && within(s[1],0xA0,0xBF)) ||
-      (within(a,0xE1,0xEC) && within(s[1],0x80,0xBF)) ||
-      (a==0xED             && within(s[1],0x80,0x9F)) ||
-      (within(a,0xEE,0xEF) && within(s[1],0x80,0xBF)))
-     && within(s[2],0x80,0xBF);
- }
- </t>
- <t tx="caminhante.20200309141148.6">// True when `source` starts with a well-formed 4-byte UTF-8 sequence.
- // Second-byte ranges per lead byte: 0xF0 -> 0x90..0xBF (no overlong forms),
- // 0xF1..0xF3 -> 0x80..0xBF, 0xF4 -> 0x80..0x8F (nothing above U+10FFFF).
- static bool utf8_4bytes_char (char* source) {
-   // Bytes are examined as unsigned char: with a signed plain `char` a
-   // comparison such as `a == 0xF0` can never be true.
-   const unsigned char *s = (const unsigned char*)source;
-   const unsigned char a = s[0];
-   // Short-circuit evaluation reads each following byte only after the previous
-   // one was validated, so the string terminator is never passed.
-   return
-     ((a==0xF0             && within(s[1],0x90,0xBF)) ||
-      (within(a,0xF1,0xF3) && within(s[1],0x80,0xBF)) ||
-      (a==0xF4             && within(s[1],0x80,0x8F)))
-     && within(s[2],0x80,0xBF) && within(s[3],0x80,0xBF);
- }
- </t>
- <t tx="caminhante.20200309141148.7">// True when `source` starts with a non-NUL ASCII byte or a 2-, 3- or 4-byte
- // sequence accepted by the corresponding checker.
- bool uchar_valid (char* source) {
-   if (ascii_char(source)) return true;
-   if (utf8_2bytes_char(source)) return true;
-   if (utf8_3bytes_char(source)) return true;
-   return utf8_4bytes_char(source);
- }
- </t>
- <t tx="caminhante.20200309141148.8">// Length in bytes (1..4) of the sequence at the start of `source`, or 0 when
- // none of the checkers accepts it. Checkers are tried from shortest to longest.
- size_t uchar_bytes (char* source) {
-   if (ascii_char(source)) return 1;
-   if (utf8_2bytes_char(source)) return 2;
-   if (utf8_3bytes_char(source)) return 3;
-   if (utf8_4bytes_char(source)) return 4;
-   return 0;
- }
- </t>
- <t tx="caminhante.20200309141148.9">// Number of consecutive valid sequences at the start of `source`; counting
- // stops at the first position where `uchar_bytes` yields 0.
- size_t ustring_length (char* source) {
-   size_t count = 0, offset = 0, width;
-   while ((width = uchar_bytes(source + offset)) != 0) {
-     offset += width;
-     count++;
-   }
-   return count;
- }
- </t>
- <t tx="caminhante.20200309141158.1">#ifndef _UTF8_H_
- #define _UTF8_H_
- #include <stdlib.h>
- #include <stdio.h>
- #include <stdint.h>
- #include <string.h>
- #include <stdbool.h>
- #include <unistd.h>
- // A unicode string is an array of `struct uchar` objects, terminated by a `struct uchar` with `.bytes == 0`.
- // A `\0` byte isn't considered a valid unicode char.
- struct uchar {
- // Number of bytes of `chars` in use by this character; 0 marks the end of a string.
- uint8_t bytes;
- // The raw UTF-8 bytes and a 32-bit integer view share the same storage.
- union {
- char chars[4];
- uint32_t ichars;
- };
- };
- @others
- #endif
- </t>
- <t tx="caminhante.20200309141158.10">// [ a `struct uchar` object =>
- // side effect: output UTF8 byte sequence at file descriptor, returns number of written bytes ]
- size_t uchar_puts (int fileno, struct uchar* uc);
- </t>
- <t tx="caminhante.20200309141158.11">// [ sequence of `struct uchar` objects =>
- // side effect: output all UTF8 byte sequences at file descriptor, returns number of written bytes ]
- size_t ustring_puts (int fileno, struct uchar* ustring);
- </t>
- <t tx="caminhante.20200309141158.2">// [ valid UTF8 byte sequence => true | false ]
- bool uchar_valid (char* source);
- </t>
- <t tx="caminhante.20200309141158.3">// [ valid UTF8 byte sequence =>
- // number of bytes occupied by a valid UTF8 byte sequence, between 1 and 4 | 0 ]
- size_t uchar_bytes (char* source);
- </t>
- <t tx="caminhante.20200309141158.4">// [ sequence of valid UTF8 byte sequences =>
- // number of valid consecutive UTF8 byte sequences, greater than or equal to 1 | 0 ]
- size_t ustring_length (char* source);
- </t>
- <t tx="caminhante.20200309141158.5">// [ sequence of valid UTF8 byte sequences =>
- // the number of bytes occupied by valid consecutive UTF8 byte sequences,
- // greater than or equal to 1 | 0 ]
- size_t ustring_bytes (char* source);
- </t>
- <t tx="caminhante.20200309141158.6">// [ sequence of `struct uchar` UTF8 byte sequences =>
- // number of bytes it occupies as a conventional `char` array, NOT counting the terminating `\0` ]
- size_t cstring_bytes (struct uchar* source);
- </t>
- <t tx="caminhante.20200309141158.7">// [ valid UTF8 byte sequence =>
- // a correctly initialized `struct uchar` object
- // (`source` is passed by value: the caller advances it by `.bytes`) |
- // a `struct uchar` object with `.bytes == 0` ]
- struct uchar next_uchar (char* source);
- </t>
- <t tx="caminhante.20200309141158.8">// [ a `char` array containing potentially valid UTF8 text =>
- // a `struct uchar` array with all consecutive UTF8 valid byte sequences is written at `*destination`,
- // followed by a terminating element with `.bytes == 0` ]
- // You need to calculate the needed `struct uchar` array length beforehand:
- // `ustring_length(source) + 1` elements (the extra one holds the terminator)
- void c_to_ustring (char* source, struct uchar* destination);
- </t>
- <t tx="caminhante.20200309141158.9">// [ a `struct uchar` array containing potentially valid UTF8 text =>
- // a `\0` terminated `char` array is written at `*destination` ]
- // You need to calculate the needed `char` array length beforehand, summing all
- // `struct uchar` `.bytes` members plus 1 (accounting for an extra `\0` byte at the end),
- // i.e. `cstring_bytes(source) + 1`
- void u_to_cstring (struct uchar* source, char* destination);
- </t>
- <t tx="caminhante.20200309141700.1"># Report the state of the Fossil checkout.
- fossil status</t>
- <t tx="caminhante.20200309141709.1"># Run fossil2git.sh on the Fossil repository from inside ./git (presumably
- # mirroring it into that Git checkout -- confirm against the helper script),
- # then push the `trunk` branch to `origin`.
- # Stop right away when ./git is missing, so that the export and the push can
- # never run in whatever directory the shell happened to start in.
- cd git || exit 1
- fossil2git.sh ../minimal_UTF8.fossil
- git push --set-upstream origin trunk
- echo</t>
- <t tx="caminhante.20200309142127.1"># Create ./git, initialize a Git repository in it and register the `origin` remote.
- mkdir ./git
- cd ./git
- git init
- git remote add origin git@notabug.org:XCaminhante/minimal_UTF8.git</t>
- <t tx="caminhante.20200309142214.1"></t>
- <t tx="caminhante.20200309145700.1">@tabwidth 5
- # -fPIC: build/utf8.o is linked into a shared library, so it has to be
- # compiled as position-independent code.
- CFLAGS := -std=gnu99 -Wall -Werror -Wfatal-errors -D_GNU_SOURCE -O3 -fPIC
- # `all` and `clean` name actions, not files: always run them when requested.
- .PHONY: all clean
- all: build/libminiutf8.so build/libminiutf8.a
- # $< is the first prerequisite (utf8.c); utf8.h is listed only to trigger rebuilds.
- build/utf8.o: utf8.c utf8.h
- 	@mkdir -p build
- 	gcc $(CFLAGS) $< -c -o $@
- build/libminiutf8.a: build/utf8.o
- 	ar cr $@ $<
- 	ranlib $@
- # Link with the same code-generation flags that were used to compile.
- build/libminiutf8.so: build/utf8.o
- 	gcc -shared $(CFLAGS) $< -o $@
- clean:
- 	rm -rfv build</t>
- </tnodes>
- </leo_file>
|