minimal_UTF8.leo 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. <?xml version="1.0" encoding="utf-8"?>
  2. <!-- Created by Leo: http://leoeditor.com/leo_toc.html -->
  3. <leo_file xmlns:leo="http://leoeditor.com/namespaces/leo-python-editor/1.1" >
  4. <leo_header file_format="2"/>
  5. <globals/>
  6. <preferences/>
  7. <find_panel_settings/>
  8. <vnodes>
  9. <v t="caminhante.20200309141113.1"><vh>@settings</vh>
  10. <v t="caminhante.20200309141113.2"><vh>NodeActions</vh>
  11. <v t="caminhante.20200309141113.3"><vh>^@(file|clean) .*\.[ch] [X]</vh></v>
  12. <v t="caminhante.20200309141113.4"><vh>@run*</vh></v>
  13. </v>
  14. </v>
  15. <v t="caminhante.20200309142214.1"><vh>Utils</vh>
  16. <v t="caminhante.20200309141700.1"><vh>@run fossil status</vh></v>
  17. <v t="caminhante.20200309141709.1"><vh>@run sync Git repo</vh></v>
  18. <v t="caminhante.20200309142127.1"><vh>@run create Git repo</vh></v>
  19. </v>
  20. <v t="caminhante.20200309141027.3"><vh>Minimal UTF-8 support</vh>
  21. <v t="caminhante.20200309141415.1"><vh>@auto README.md</vh></v>
  22. <v t="caminhante.20200309141148.1"><vh>@clean ./utf8.c</vh>
  23. <v t="caminhante.20200309141148.2"><vh>static within</vh></v>
  24. <v t="caminhante.20200309141148.3"><vh>static ascii_char</vh></v>
  25. <v t="caminhante.20200309141148.4"><vh>static utf8_2bytes_char</vh></v>
  26. <v t="caminhante.20200309141148.5"><vh>static utf8_3bytes_char</vh></v>
  27. <v t="caminhante.20200309141148.6"><vh>static utf8_4bytes_char</vh></v>
  28. <v t="caminhante.20200309141148.7"><vh>uchar_valid</vh></v>
  29. <v t="caminhante.20200309141148.8"><vh>uchar_bytes</vh></v>
  30. <v t="caminhante.20200309141148.9"><vh>ustring_length</vh></v>
  31. <v t="caminhante.20200309141148.10"><vh>ustring_bytes</vh></v>
  32. <v t="caminhante.20200309141148.11"><vh>cstring_bytes</vh></v>
  33. <v t="caminhante.20200309141148.12"><vh>next_uchar</vh></v>
  34. <v t="caminhante.20200309141148.13"><vh>c_to_ustring</vh></v>
  35. <v t="caminhante.20200309141148.14"><vh>u_to_cstring</vh></v>
  36. <v t="caminhante.20200309141148.15"><vh>uchar_puts</vh></v>
  37. <v t="caminhante.20200309141148.16"><vh>ustring_puts</vh></v>
  38. </v>
  39. <v t="caminhante.20200309141158.1"><vh>@clean ./utf8.h</vh>
  40. <v t="caminhante.20200309141158.2"><vh>uchar_valid</vh></v>
  41. <v t="caminhante.20200309141158.3"><vh>uchar_bytes</vh></v>
  42. <v t="caminhante.20200309141158.4"><vh>ustring_length</vh></v>
  43. <v t="caminhante.20200309141158.5"><vh>ustring_bytes</vh></v>
  44. <v t="caminhante.20200309141158.6"><vh>cstring_bytes</vh></v>
  45. <v t="caminhante.20200309141158.7"><vh>next_uchar</vh></v>
  46. <v t="caminhante.20200309141158.8"><vh>c_to_ustring</vh></v>
  47. <v t="caminhante.20200309141158.9"><vh>u_to_cstring</vh></v>
  48. <v t="caminhante.20200309141158.10"><vh>uchar_puts</vh></v>
  49. <v t="caminhante.20200309141158.11"><vh>ustring_puts</vh></v>
  50. </v>
  51. <v t="caminhante.20200309145700.1"><vh>@clean Makefile</vh></v>
  52. </v>
  53. </vnodes>
  54. <tnodes>
  55. <t tx="caminhante.20200309141027.3"></t>
  56. <t tx="caminhante.20200309141113.1"></t>
  57. <t tx="caminhante.20200309141113.2">@language python</t>
  <t tx="caminhante.20200309141113.3"># NodeActions handler for `@file` / `@clean` nodes whose file name ends in .c or .h.
# Saves the outline if it has unsaved changes, then runs gcc on the file with the
# project's warning flags (the object file is discarded to /dev/null) and echoes
# whatever gcc printed to the Leo log.
# `c`, `g` and `pClicked` are expected to be supplied by the caller of this script.
import subprocess,os
if c.isChanged(): c.save()
os.chdir(c.getNodePath(pClicked))
# The headline is the directive followed by the file name, e.g. "@clean ./utf8.c";
# drop the first word and keep the rest as the file name.
filename = ' '.join(pClicked.h.split()[1:])
g.es('gcc -c ' + filename)
proc = subprocess.Popen(['gcc','-std=gnu99','-Wall','-Werror','-Wfatal-errors','-D_GNU_SOURCE',filename,'-c','-o','/dev/null'],
    stderr=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True, universal_newlines=True)
# communicate() drains both pipes at the same time and waits for gcc to exit.
# Reading stdout to EOF first and stderr afterwards can block forever when the
# child fills the stderr pipe, and never reaps the child process.
out, err = proc.communicate()
for text in (out, err):
    if text: g.es(text)
</t>
  <t tx="caminhante.20200309141113.4">@language python
# NodeActions handler for `@run ...` nodes: runs the body of the selected node as a
# shell script from the node's directory and echoes its combined stdout/stderr to
# the Leo log, framed by the node headline.
# `c` and `g` are expected to be supplied by the caller of this script.
import shlex
import subprocess
def getpath (p):
    """Return the normalized directory in effect at position p, or "" if there is none."""
    directives = c.scanAllDirectives(p)
    d = directives.get("path")
    if p.isAnyAtFileNode():
        # For file nodes, use the directory that contains the file itself.
        filename = p.anyAtFileNodeName()
        filename = g.os_path_join(d,filename)
        if filename:
            d = g.os_path_dirname(filename)
    if d is None:
        return ""
    return g.os_path_normpath(d)
def execute (cmd):
    """Run cmd through the shell and return everything it printed.

    A non-zero exit status is reported at the end of the returned text instead
    of raising, so the output of a failing command still reaches the log.
    """
    try:
        return subprocess.check_output(cmd,shell=True,universal_newlines=True,stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        return (err.output or '') + '[exit status %d]\n' % err.returncode
path = getpath(c.p)
command = c.p.b
cmdname = c.p.h
# Quote the directory so spaces, quotes or `$` in it cannot change the command,
# and refuse to run the body at all if the directory cannot be entered.
prefix = 'cd '+shlex.quote(path)+' || exit 1\n' if path else ''
g.es('---- '+cmdname+' ----')
g.es(execute(prefix+command))
g.es('---- end ----')</t>
  97. <t tx="caminhante.20200309141148.1">#include "utf8.h"
  98. @others
  99. </t>
  <t tx="caminhante.20200309141148.10">// Counts the bytes occupied by the leading run of valid UTF-8 sequences in `source`.
// Scanning stops at the first position where `uchar_bytes` reports 0; that
// position is not counted.
size_t ustring_bytes (char* source) {
  size_t total = 0;
  for (;;) {
    size_t step = uchar_bytes(source + total);
    if (step == 0) break;
    total += step;
  }
  return total;
}
</t>
  <t tx="caminhante.20200309141148.11">// Sums the `.bytes` fields of a `struct uchar` array up to, but not including,
// the terminating element whose `.bytes` is 0. A trailing `\0` byte is not
// part of the returned count.
size_t cstring_bytes (struct uchar* source) {
  size_t total = 0;
  for (struct uchar *cur = source; cur-&gt;bytes != 0; cur++) {
    total += cur-&gt;bytes;
  }
  return total;
}
</t>
  <t tx="caminhante.20200309141148.12">// Builds a `struct uchar` from the UTF-8 sequence that starts at `source`.
// `.bytes` is the length reported by `uchar_bytes`; the unused tail of `.chars`
// is zero. When `uchar_bytes` reports 0 the returned object is all zeros.
struct uchar next_uchar (char* source) {
  struct uchar result = {0};
  size_t width = uchar_bytes(source);
  if (width != 0) {
    result.bytes = width;
    // Only the first `width` bytes are copied; the rest stay 0 from the initializer.
    memcpy(result.chars, source, width);
  }
  return result;
}
</t>
  <t tx="caminhante.20200309141148.13">// Converts the leading run of valid UTF-8 sequences in `source` into
// `struct uchar` elements stored at `destination`.
// The element with `.bytes == 0` that ends the run is stored as well, so
// `destination` receives one element more than the number of sequences converted.
void c_to_ustring (char* source, struct uchar* destination) {
  char *in = source;
  struct uchar *out = destination;
  for (;;) {
    struct uchar item = next_uchar(in);
    *out++ = item;
    if (item.bytes == 0) break;
    in += item.bytes;
  }
}
</t>
  <t tx="caminhante.20200309141148.14">// Concatenates the bytes of every element of `source` (up to the element with
// `.bytes == 0`) into `destination` and appends a `\0` byte.
void u_to_cstring (struct uchar* source, char* destination) {
  char *out = destination;
  for (struct uchar *cur = source; cur-&gt;bytes != 0; cur++) {
    memcpy(out, cur-&gt;chars, cur-&gt;bytes);
    out += cur-&gt;bytes;
  }
  *out = '\0';
}
</t>
  <t tx="caminhante.20200309141148.15">// Writes the bytes of one `struct uchar` to the file descriptor `fileno`.
// Returns the number of bytes actually written.
// `write` reports failure as -1; returning that through `size_t` would turn it
// into a huge byte count, so a failure is reported as 0 bytes written instead
// (errno is left as `write` set it).
size_t uchar_puts (int fileno, struct uchar *uc) {
  ssize_t written = write(fileno, uc-&gt;chars, uc-&gt;bytes);
  return written &lt; 0 ? 0 : (size_t)written;
}
</t>
  <t tx="caminhante.20200309141148.16">// Writes every element of `ustring` (up to the element with `.bytes == 0`) to the
// file descriptor `fileno`, one `uchar_puts` call per element.
// Returns the total number of bytes of the elements that were written completely;
// output stops at the first element that was not.
size_t ustring_puts (int fileno, struct uchar *ustring) {
  size_t written = 0;
  while (ustring-&gt;bytes != 0) {
    size_t a = uchar_puts(fileno,ustring);
    // Anything other than a complete write ends the loop. Comparing with `!=`
    // rather than `&lt;` also catches an error value that was converted to a very
    // large size_t, which would otherwise be added to `written` and wrap it.
    if (a != ustring-&gt;bytes) break;
    written += a;
    ustring++;
  }
  return written;
}
</t>
  <t tx="caminhante.20200309141148.2">// True when `value` lies in the closed interval [lower, greater].
static inline bool within (unsigned char value, unsigned char lower, unsigned char greater) {
  if (value &lt; lower) return false;
  return value &lt;= greater;
}
</t>
  <t tx="caminhante.20200309141148.3">// True when the first byte of `source` is a 7-bit character other than `\0`
// (byte values 0x01 to 0x7F).
static bool ascii_char (char* source) {
  unsigned char first = (unsigned char)source[0];
  return first != 0 &amp;&amp; first &lt; 0x80;
}
</t>
  <t tx="caminhante.20200309141148.4">// True when `source` starts with a well-formed 2-byte UTF-8 sequence:
// lead byte 0xC2..0xDF followed by one continuation byte 0x80..0xBF.
static bool utf8_2bytes_char (char* source) {
  const unsigned char *s = (const unsigned char*)source;
  // The second byte is read only after the lead byte matched. A matching lead
  // byte is never `\0`, so the read stays inside a `\0` terminated string even
  // when this is called on the terminator itself.
  return within(s[0],0xC2,0xDF) &amp;&amp; within(s[1],0x80,0xBF);
}
</t>
  <t tx="caminhante.20200309141148.5">// True when `source` starts with a well-formed 3-byte UTF-8 sequence.
//   lead 0xE0        second byte 0xA0..0xBF  (excludes overlong encodings)
//   lead 0xE1..0xEC  second byte 0x80..0xBF
//   lead 0xED        second byte 0x80..0x9F  (excludes UTF-16 surrogates)
//   lead 0xEE..0xEF  second byte 0x80..0xBF
// The third byte is always 0x80..0xBF.
static bool utf8_3bytes_char (char* source) {
  // Work on unsigned bytes: where plain `char` is signed, a byte such as 0xE0
  // is negative and can never compare equal to the constant 0xE0.
  const unsigned char *s = (const unsigned char*)source;
  unsigned char lead = s[0];
  if (!within(lead,0xE0,0xEF)) return false;
  unsigned char lower = (lead == 0xE0) ? 0xA0 : 0x80;
  unsigned char upper = (lead == 0xED) ? 0x9F : 0xBF;
  // Each byte is read only after the previous one matched, and a matching byte
  // is never `\0`, so nothing past the terminator of the string is read.
  return within(s[1],lower,upper) &amp;&amp; within(s[2],0x80,0xBF);
}
</t>
  <t tx="caminhante.20200309141148.6">// True when `source` starts with a well-formed 4-byte UTF-8 sequence.
//   lead 0xF0        second byte 0x90..0xBF  (excludes overlong encodings)
//   lead 0xF1..0xF3  second byte 0x80..0xBF
//   lead 0xF4        second byte 0x80..0x8F  (nothing above U+10FFFF)
// The third and fourth bytes are always 0x80..0xBF.
static bool utf8_4bytes_char (char* source) {
  // Work on unsigned bytes: where plain `char` is signed, a byte such as 0xF0
  // is negative and can never compare equal to the constant 0xF0.
  const unsigned char *s = (const unsigned char*)source;
  unsigned char lead = s[0];
  if (!within(lead,0xF0,0xF4)) return false;
  unsigned char lower = (lead == 0xF0) ? 0x90 : 0x80;
  unsigned char upper = (lead == 0xF4) ? 0x8F : 0xBF;
  // Each byte is read only after the previous one matched, and a matching byte
  // is never `\0`, so nothing past the terminator of the string is read.
  return within(s[1],lower,upper) &amp;&amp;
         within(s[2],0x80,0xBF) &amp;&amp;
         within(s[3],0x80,0xBF);
}
</t>
  <t tx="caminhante.20200309141148.7">// True when `source` starts with a sequence accepted by any of the four
// per-length checks, tried from 1 byte up to 4 bytes.
bool uchar_valid (char* source) {
  if (ascii_char(source)) return true;
  if (utf8_2bytes_char(source)) return true;
  if (utf8_3bytes_char(source)) return true;
  return utf8_4bytes_char(source);
}
</t>
  <t tx="caminhante.20200309141148.8">// Length in bytes (1 to 4) of the sequence at the start of `source`, taken from
// the first per-length check that accepts it; 0 when none of them does.
size_t uchar_bytes (char* source) {
  if (ascii_char(source)) return 1;
  if (utf8_2bytes_char(source)) return 2;
  if (utf8_3bytes_char(source)) return 3;
  if (utf8_4bytes_char(source)) return 4;
  return 0;
}
</t>
  <t tx="caminhante.20200309141148.9">// Counts the valid UTF-8 sequences at the start of `source`.
// Counting stops at the first position where `uchar_bytes` reports 0.
size_t ustring_length (char* source) {
  size_t count = 0;
  size_t step;
  for (char *cur = source; (step = uchar_bytes(cur)) != 0; cur += step) {
    count++;
  }
  return count;
}
</t>
  <t tx="caminhante.20200309141158.1">// Include guard. The name avoids a leading underscore followed by an uppercase
// letter, because identifiers of that form are reserved for the implementation.
#ifndef UTF8_H_
#define UTF8_H_
#include &lt;stdlib.h&gt;
#include &lt;stdio.h&gt;
#include &lt;stdint.h&gt;
#include &lt;string.h&gt;
#include &lt;stdbool.h&gt;
#include &lt;unistd.h&gt;
// A unicode string is an array of `struct uchar` objects, terminated by a `struct uchar` with `.bytes == 0`.
// A `\0` byte isn't considered a valid unicode char.
struct uchar {
  // Number of bytes of `chars` in use; 0 marks the end of a unicode string.
  uint8_t bytes;
  union {
    // The bytes of one UTF-8 sequence, starting at index 0.
    char chars[4];
    // The same four bytes viewed as a single integer.
    uint32_t ichars;
  };
};
@others
#endif
</t>
  240. <t tx="caminhante.20200309141158.10">// [ a `struct uchar` object =&gt;
  241. // side effect: output UTF8 byte sequence at file descriptor, returns number of written bytes ]
  242. size_t uchar_puts (int fileno, struct uchar* uc);
  243. </t>
  244. <t tx="caminhante.20200309141158.11">// [ sequence of `struct uchar` objects =&gt;
  245. // side effect: output all UTF8 byte sequences at file descriptor, returns number of written bytes ]
  246. size_t ustring_puts (int fileno, struct uchar* ustring);
  247. </t>
  248. <t tx="caminhante.20200309141158.2">// [ valid UTF8 byte sequence =&gt; true | false ]
  249. bool uchar_valid (char* source);
  250. </t>
  251. <t tx="caminhante.20200309141158.3">// [ valid UTF8 byte sequence =&gt;
  252. // number of bytes occupied by a valid UTF8 byte sequence, between 1 and 4 | 0 ]
  253. size_t uchar_bytes (char* source);
  254. </t>
  255. <t tx="caminhante.20200309141158.4">// [ sequence of valid UTF8 byte sequences =&gt;
  256. // number of valid consecutive UTF8 byte sequences, greater or equal than 1 | 0 ]
  257. size_t ustring_length (char* source);
  258. </t>
  259. <t tx="caminhante.20200309141158.5">// [ sequence of valid UTF8 byte sequences =&gt;
  260. // the number of bytes occupied by valid consecutive UTF8 byte sequences,
  261. // greater or equal than 1 | 0 ]
  262. size_t ustring_bytes (char* source);
  263. </t>
  <t tx="caminhante.20200309141158.6">// [ sequence of `struct uchar` UTF8 byte sequences =&gt;
// number of bytes its UTF8 text occupies, NOT counting a terminating `\0`;
// add 1 to get the size of a conventional `\0` terminated `char` array ]
  266. size_t cstring_bytes (struct uchar* source);
  267. </t>
  <t tx="caminhante.20200309141158.7">// [ valid UTF8 byte sequence =&gt;
// a correctly initialized `struct uchar` object
// (`source` is passed by value: the caller advances it by `.bytes` itself) |
// a `struct uchar` object with `.bytes == 0` ]
  272. struct uchar next_uchar (char* source);
  273. </t>
  <t tx="caminhante.20200309141158.8">// [ a `char` array containing potentially valid UTF8 text =&gt;
// a `struct uchar` array with all consecutive UTF8 valid byte sequences is written at `*destination` ]
// You need to calculate the needed `struct uchar` array length beforehand:
// `ustring_length(source) + 1`, the extra element being the `.bytes == 0` terminator
  278. void c_to_ustring (char* source, struct uchar* destination);
  279. </t>
  <t tx="caminhante.20200309141158.9">// [ a `struct uchar` array containing potentially valid UTF8 text =&gt;
// a `\0` terminated `char` array is written at `*destination` ]
// You need to calculate the needed `char` array length beforehand, summing all
// `struct uchar` `.bytes` members plus 1 (accounting for an extra `\0` byte at the end)
  284. void u_to_cstring (struct uchar* source, char* destination);
  285. </t>
  286. <t tx="caminhante.20200309141700.1">fossil status</t>
  <t tx="caminhante.20200309141709.1"># Runs fossil2git.sh on the Fossil repository from inside ./git, then pushes trunk.
# Stop right away if ./git cannot be entered, so the commands below never run
# in the wrong directory.
cd git || exit 1
fossil2git.sh ../minimal_UTF8.fossil
git push --set-upstream origin trunk
# NOTE(review): the trailing echo presumably keeps the final exit status at 0 - confirm.
echo</t>
  <t tx="caminhante.20200309142127.1"># Creates the ./git working copy and points its `origin` remote at the upstream repository.
# -p: do not fail when ./git already exists.
mkdir -p ./git
# Never run `git init` outside ./git.
cd ./git || exit 1
git init
git remote add origin git@notabug.org:XCaminhante/minimal_UTF8.git</t>
  295. <t tx="caminhante.20200309142214.1"></t>
  <t tx="caminhante.20200309145700.1">@tabwidth 5
# Builds utf8.c into build/ as a shared library and as a static library.
# -fPIC: build/utf8.o is linked into a shared object, which needs
# position-independent code.
CFLAGS := -std=gnu99 -Wall -Werror -Wfatal-errors -D_GNU_SOURCE -O3 -fPIC
# `all` and `clean` name actions, not files: always run them, even if a file
# with one of those names exists.
.PHONY: all clean
all: build/libminiutf8.so build/libminiutf8.a
build/utf8.o: utf8.c utf8.h
	@mkdir -p build
	gcc $(CFLAGS) $&lt; -c -o $@
build/libminiutf8.a: build/utf8.o
	ar cr $@ $&lt;
	ranlib $@
# Link with the same flags that were used for compiling.
build/libminiutf8.so: build/utf8.o
	gcc -shared $(CFLAGS) $&lt; -o $@
clean:
	rm -rfv build</t>
  309. </tnodes>
  310. </leo_file>