utf8.cc 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. // Copyright (C) 2003 Mooffie <mooffie@typo.co.il>
  2. //
  3. // This program is free software; you can redistribute it and/or modify
  4. // it under the terms of the GNU General Public License as published by
  5. // the Free Software Foundation; either version 2 of the License, or
  6. // (at your option) any later version.
  7. //
  8. // This program is distributed in the hope that it will be useful,
  9. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. // GNU General Public License for more details.
  12. //
  13. // You should have received a copy of the GNU General Public License
  14. // along with this program; if not, write to the Free Software
  15. // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
  16. #include <config.h>
  17. #include "utf8.h"
  18. #include "univalues.h"
  19. #include "dbg.h"
  20. // utf8_to_unicode() - converts a UTF-8 string to unichars. When an
  21. // incomplete sequence is encountered, *problem will point to its head (it's
  22. // similar to what iconv does).
  23. //
  24. // This function converts UTF-8 to UCS-4 (not to UTF-32) -- that's why it
  25. // recognizes 5- and 6-byte sequences.
  26. int utf8_to_unicode(unichar *dest, const char *s, int len, const char **problem)
  27. {
  28. int length = 0;
  29. const char *end = s + len;
  30. if (problem)
  31. *problem = NULL;
  32. // constant expressions are evaluated at compile time, of course.
  33. #define FRST(t) ((*s & ((1 << (8-t-1)) - 1)) << (t-1)*6)
  34. #define UC(t,n) (*(s+n-1) & 0x3F) << ((t-n)*6)
  35. while (s < end) {
  36. if (!(*s & 0x80)) {
  37. *dest++ = *s;
  38. s++;
  39. } else if ((*s & 0xE0) == 0xC0) {
  40. if ((end - s) >= 2) {
  41. *dest++ = FRST(2) | UC(2,2);
  42. s += 2;
  43. } else {
  44. if (problem)
  45. *problem = s;
  46. break;
  47. }
  48. } else if ((*s & 0xF0) == 0xE0) {
  49. if ((end - s) >= 3) {
  50. *dest++ = FRST(3) | UC(3,2) | UC(3,3);
  51. s += 3;
  52. } else {
  53. if (problem)
  54. *problem = s;
  55. break;
  56. }
  57. } else if ((*s & 0xF8) == 0xF0) {
  58. if ((end - s) >= 4) {
  59. *dest++ = FRST(4) | UC(4,2) | UC(4,3) | UC(4,4);
  60. s += 4;
  61. } else {
  62. if (problem)
  63. *problem = s;
  64. break;
  65. }
  66. } else if ((*s & 0xFC) == 0xF8) {
  67. if ((end - s) >= 5) {
  68. *dest++ = FRST(5) | UC(5,2) | UC(5,3) | UC(5,4) | UC(5,5);
  69. s += 5;
  70. } else {
  71. if (problem)
  72. *problem = s;
  73. break;
  74. }
  75. } else if ((*s & 0xFE) == 0xFC) {
  76. if ((end - s) >= 6) {
  77. *dest++ = FRST(6) | UC(6,2) | UC(6,3) | UC(6,4) | UC(6,5) | UC(6,6);
  78. s += 6;
  79. } else {
  80. if (problem)
  81. *problem = s;
  82. break;
  83. }
  84. } else {
  85. *dest++ = UNI_REPLACEMENT;
  86. s++;
  87. }
  88. length++;
  89. }
  90. return length;
  91. #undef FRST
  92. #undef UC
  93. }
  94. // unicode_to_utf8() - converts unichars to UTF-8.
  95. int unicode_to_utf8(char *dest, const unichar *us, int len)
  96. {
  97. #define UC(n) ((*us >> 6*n) & 0x3F)
  98. #define CNT(n) (((1 << n) - 1) << (8 - n))
  99. int nbytes = 0;
  100. while (len--) {
  101. if (*us < 0x80) {
  102. *dest++ = *us;
  103. nbytes += 1;
  104. } else if (*us < 0x800) {
  105. *dest++ = UC(1) | CNT(2);
  106. *dest++ = UC(0) | 0x80;
  107. nbytes += 2;
  108. } else if (*us < 0x10000) {
  109. *dest++ = UC(2) | CNT(3);
  110. *dest++ = UC(1) | 0x80;
  111. *dest++ = UC(0) | 0x80;
  112. nbytes += 3;
  113. } else if (*us < 0x200000) {
  114. *dest++ = UC(3) | CNT(4);
  115. *dest++ = UC(2) | 0x80;
  116. *dest++ = UC(1) | 0x80;
  117. *dest++ = UC(0) | 0x80;
  118. nbytes += 4;
  119. } else if (*us < 0x4000000) {
  120. *dest++ = UC(4) | CNT(5);
  121. *dest++ = UC(3) | 0x80;
  122. *dest++ = UC(2) | 0x80;
  123. *dest++ = UC(1) | 0x80;
  124. *dest++ = UC(0) | 0x80;
  125. nbytes += 5;
  126. } else {
  127. *dest++ = UC(5) | CNT(6);
  128. *dest++ = UC(4) | 0x80;
  129. *dest++ = UC(3) | 0x80;
  130. *dest++ = UC(2) | 0x80;
  131. *dest++ = UC(1) | 0x80;
  132. *dest++ = UC(0) | 0x80;
  133. nbytes += 6;
  134. }
  135. us++;
  136. }
  137. return nbytes;
  138. #undef UC
  139. #undef CNT
  140. }