// utf8.h
// Declarations for handling UTF8 text as arrays of `struct uchar` objects.
//
// NOTE(review): the guard name below starts with an underscore followed by an
// uppercase letter, a form C reserves for the implementation. It is kept
// unchanged here so that anything testing this macro keeps working.
#ifndef _UTF8_H_
#define _UTF8_H_

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <unistd.h>

// A unicode string is an array of `struct uchar` objects, terminated by a
// `struct uchar` with `.bytes == 0`.
// A `\0` byte isn't considered a valid unicode char.
struct uchar {
    // Number of bytes used by this UTF8 byte sequence;
    // 0 marks the terminating element of a unicode string.
    uint8_t bytes;
    // Storage for the byte sequence. Both members overlay the same 4 bytes
    // (anonymous union, so they are accessed directly as `.chars` / `.ichars`).
    union {
        char chars[4];    // the sequence as individual bytes
        uint32_t ichars;  // the same storage viewed as one 32-bit integer
    };
};

// The bracketed contracts below read as:
// [ expected input => result | result otherwise ]

// [ valid UTF8 byte sequence => true | false ]
bool uchar_valid(char *source);

// [ valid UTF8 byte sequence =>
//   number of bytes occupied by a valid UTF8 byte sequence,
//   between 1 and 4 | 0 ]
size_t uchar_bytes(char *source);

// [ sequence of valid UTF8 byte sequences =>
//   number of valid consecutive UTF8 byte sequences,
//   greater than or equal to 1 | 0 ]
size_t ustring_length(char *source);

// [ sequence of valid UTF8 byte sequences =>
//   number of bytes occupied by the valid consecutive UTF8 byte sequences,
//   greater than or equal to 1 | 0 ]
size_t ustring_bytes(char *source);

// [ sequence of `struct uchar` UTF8 byte sequences =>
//   number of bytes required to convert it to a conventional
//   `\0` terminated `char` array ]
size_t cstring_bytes(struct uchar *source);

// [ valid UTF8 byte sequence =>
//   a correctly initialized `struct uchar` object
//   and side effect: source position is incremented |
//   a `struct uchar` object with `.bytes == 0` ]
// NOTE(review): `source` is received by value, so an advance of the position
// cannot reach the caller's pointer through this signature. Confirm the
// intended behavior against the implementation.
struct uchar next_uchar(char *source);

// [ a `char` array containing potentially valid UTF8 text =>
//   a `struct uchar` array with all consecutive valid UTF8 byte sequences
//   is written at `*destination` ]
// You need to calculate the needed `struct uchar` array length beforehand,
// with `ustring_length(source)`.
// NOTE(review): it is not stated here whether room for the terminating
// `.bytes == 0` element must be added to that length. Confirm.
void c_to_ustring(char *source, struct uchar *destination);

// [ a `struct uchar` array containing potentially valid UTF8 text =>
//   a `\0` terminated `char` array is written at `*destination` ]
// You need to calculate the needed `char` array length beforehand, summing
// all `struct uchar` `.bytes` members plus 1 (accounting for an extra `\0`
// byte at the end).
void u_to_cstring(struct uchar *source, char *destination);

// [ a `struct uchar` object =>
//   side effect: outputs its UTF8 byte sequence at the file descriptor
//   `fileno`; returns the number of written bytes ]
size_t uchar_puts(int fileno, struct uchar *uc);

// [ sequence of `struct uchar` objects =>
//   side effect: outputs all UTF8 byte sequences at the file descriptor
//   `fileno`; returns the number of written bytes ]
size_t ustring_puts(int fileno, struct uchar *ustring);

#endif