converters.cc 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. // Copyright (C) 2003 Mooffie <mooffie@typo.co.il>
  2. //
  3. // This program is free software; you can redistribute it and/or modify
  4. // it under the terms of the GNU General Public License as published by
  5. // the Free Software Foundation; either version 2 of the License, or
  6. // (at your option) any later version.
  7. //
  8. // This program is distributed in the hope that it will be useful,
  9. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. // GNU General Public License for more details.
  12. //
  13. // You should have received a copy of the GNU General Public License
  14. // along with this program; if not, write to the Free Software
  15. // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
  16. #include <config.h>
  17. #include <ctype.h> // toupper
  18. #include "converters.h"
  19. #include "iso88598.h"
  20. #include "utf8.h"
  21. #include "dbg.h"
  // guess_encoding() - guesses the encoding of the first `len` bytes of `buf`.
  //
  // Returns:
  //   "UTF-16" / "UTF-32"  if the buffer starts with the matching BOM;
  //   "UTF-8"              if at least one complete multi-byte UTF-8
  //                        sequence was seen and no illegal sequence;
  //   NULL                 if an illegal UTF-8 sequence was found, or if
  //                        nothing conclusive was seen (e.g. no byte has its
  //                        high bit set).
  //
  // A multi-byte sequence that is cut off by the end of the buffer is neither
  // accepted nor rejected: the verdict reached before it is returned.
  const char *guess_encoding(const char *buf, int len)
  {
  #define IS_UTF8 "UTF-8"
  #define IS_UTF16 "UTF-16"
  #define IS_UTF32 "UTF-32"
  #define IS_NOT_UTF8 NULL
  #define UNKNOWN_TYPE NULL

      // Work on unsigned bytes so comparisons against 0x80..0xFF do not
      // depend on whether plain `char` is signed.
      const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);

      // Check for a BOM (Byte-Order Mark):
      //
      //   UTF-16 big-endian     FE FF
      //   UTF-16 little-endian  FF FE
      //   UTF-32 big-endian     00 00 FE FF
      //   UTF-32 little-endian  FF FE 00 00
      //
      // The 4-byte marks are tested first because the UTF-32 LE mark begins
      // with the UTF-16 LE one.
      if (len >= 4) {
          if (bytes[0] == 0xFF && bytes[1] == 0xFE
                  && bytes[2] == 0x00 && bytes[3] == 0x00)
              return IS_UTF32; // LE
          if (bytes[0] == 0x00 && bytes[1] == 0x00
                  && bytes[2] == 0xFE && bytes[3] == 0xFF)
              return IS_UTF32; // BE
      }
      // The UTF-16 marks need only two bytes, so short buffers are
      // recognized as well.
      if (len >= 2) {
          if (bytes[0] == 0xFE && bytes[1] == 0xFF)
              return IS_UTF16; // BE
          if (bytes[0] == 0xFF && bytes[1] == 0xFE)
              return IS_UTF16; // LE
      }

      // No BOM was found. Go through the string and check for UTF-8
      // sequences: if an illegal sequence is found, it's IS_NOT_UTF8; if
      // all sequences are legal, it's IS_UTF8; if there are no characters
      // with the high bit set, it's UNKNOWN_TYPE.
      const char *result = UNKNOWN_TYPE;
      int pending = 0; // continuation bytes still expected
      for (int i = 0; i < len; i++) {
          const unsigned char c = bytes[i];
          if (pending) {
              // Inside a sequence: only 10xxxxxx bytes are allowed.
              if ((c & 0xC0) != 0x80)
                  return IS_NOT_UTF8;
              if (--pending == 0)
                  result = IS_UTF8;
          } else if (c & 0x80) {
              // Lead byte: the number of leading 1 bits gives the length.
              pending = (c & 0xE0) == 0xC0 ? 1 :
                        (c & 0xF0) == 0xE0 ? 2 :
                        (c & 0xF8) == 0xF0 ? 3 :
                        (c & 0xFC) == 0xF8 ? 4 :
                        (c & 0xFE) == 0xFC ? 5 : 0;
              if (pending == 0)
                  return IS_NOT_UTF8; // stray continuation byte, or FE/FF
          }
      }
      return result;
  }
  77. #ifdef USE_ICONV
  78. IconvConverter::IconvConverter()
  79. {
  80. cd = (iconv_t)-1;
  81. }
  82. IconvConverter::~IconvConverter()
  83. {
  84. if (cd != (iconv_t)-1)
  85. iconv_close(cd);
  86. }
  87. int IconvConverter::set_source_encoding(const char *encoding)
  88. {
  89. cd = iconv_open(INTERNAL_ENCODING, encoding);
  90. if (cd == (iconv_t)-1) {
  91. return false;
  92. }
  93. return true;
  94. }
  95. int IconvConverter::set_target_encoding(const char *encoding)
  96. {
  97. cd = iconv_open(encoding, INTERNAL_ENCODING);
  98. if (cd == (iconv_t)-1) {
  99. return false;
  100. }
  101. return true;
  102. }
  103. int IconvConverter::convert(unichar **dest, char **src, int len)
  104. {
  105. size_t dest_avail = 99999; // :FIXME: I've tried several xxx_MAX constants
  106. // but it makes iconv() fail.
  107. size_t src_bytes_left = len;
  108. size_t result = iconv(cd, (ICONV_CONST char **)src, &src_bytes_left,
  109. (char **)dest, &dest_avail);
  110. return (result == (size_t)-1) ? (int)-1 : (int)result;
  111. }
  112. int IconvConverter::convert(char **dest, unichar **src, int len)
  113. {
  114. size_t dest_avail = 99999; // :FIXME:
  115. size_t src_bytes_left = len * sizeof(unichar);
  116. size_t result;
  117. while (1) {
  118. result = iconv(cd, (ICONV_CONST char **)src, &src_bytes_left,
  119. dest, &dest_avail);
  120. if (ilseq_repr && result == (size_t)-1 && errno == EILSEQ) {
  121. // We're asked to represent EILSEQ as "?".
  122. // we put "?" in **dest, and advance the
  123. // src pointer.
  124. (*src)++;
  125. src_bytes_left -= sizeof(unichar);
  126. (**dest) = '?';
  127. (*dest)++;
  128. dest_avail--;
  129. } else {
  130. break;
  131. }
  132. }
  133. return (result == (size_t)-1) ? (int)-1 : (int)result;
  134. }
  135. #endif
  136. int ISO88598Converter::convert(unichar **dest, char **src, int len)
  137. {
  138. int count = 0;
  139. unichar * &d = *dest;
  140. char * &s = *src;
  141. while (len--) {
  142. *d++ = iso88598_to_unicode(*s++);
  143. count++;
  144. }
  145. return count;
  146. }
  147. int ISO88598Converter::convert(char **dest, unichar **src, int len)
  148. {
  149. int count = 0;
  150. char * &d = *dest;
  151. unichar * &s = *src;
  152. while (len--) {
  153. int ich = unicode_to_iso88598(*s);
  154. if (ich == EOF) {
  155. if (ilseq_repr) {
  156. ich = '?';
  157. } else {
  158. errno = EILSEQ;
  159. return -1;
  160. }
  161. }
  162. *d++ = (char)ich;
  163. s++;
  164. count++;
  165. }
  166. return count;
  167. }
  168. int Latin1Converter::convert(unichar **dest, char **src, int len)
  169. {
  170. int count = len;
  171. unichar * &d = *dest;
  172. char * &s = *src;
  173. while (len--)
  174. *d++ = (unsigned char)*s++;
  175. return count;
  176. }
  177. int Latin1Converter::convert(char **dest, unichar **src, int len)
  178. {
  179. int count = len;
  180. char * &d = *dest;
  181. unichar * &s = *src;
  182. while (len--) {
  183. if (*s > 0xFF) {
  184. if (ilseq_repr) {
  185. *d++ = '?';
  186. s++;
  187. } else {
  188. errno = EILSEQ;
  189. return -1;
  190. }
  191. } else {
  192. *d++ = (char)*s++;
  193. }
  194. }
  195. return count;
  196. }
  197. int UTF8Converter::convert(unichar **dest, char **src, int len)
  198. {
  199. int count = 0;
  200. unichar * &d = *dest;
  201. char * &s = *src;
  202. const char *problem;
  203. count = utf8_to_unicode(d, s, len, &problem);
  204. if (problem) {
  205. d += count;
  206. s = (char *)problem;
  207. errno = EINVAL;
  208. return -1;
  209. } else {
  210. d += count;
  211. s += len;
  212. }
  213. return count;
  214. }
  215. int UTF8Converter::convert(char **dest, unichar **src, int len)
  216. {
  217. char * &d = *dest;
  218. unichar * &s = *src;
  219. int nbytes = unicode_to_utf8(d, s, len);
  220. d += nbytes;
  221. s += len;
  222. return len;
  223. }
  224. Converter *ConverterFactory::get_internal_converter(const char *enc)
  225. {
  226. // canonize the encoding name: remove '-', and upperace.
  227. u8string encoding = u8string(enc).erase_char('-').toupper_ascii();
  228. DBG(1, ("looking for internal '%s' converter\n", encoding.c_str()));
  229. if (encoding == "UTF8")
  230. return new UTF8Converter();
  231. if (encoding == "ISO88598" || encoding == "88598")
  232. return new ISO88598Converter();
  233. if (encoding == "ISO88591" || encoding == "LATIN1"
  234. || encoding == "88591" || encoding == "ASCII"
  235. || encoding == "USASCII")
  236. return new Latin1Converter();
  237. return NULL;
  238. }
  239. Converter *ConverterFactory::get_converter_from(const char *encoding)
  240. {
  241. #ifdef USE_ICONV
  242. IconvConverter *iconv = new IconvConverter();
  243. if (!iconv->set_source_encoding(encoding)) {
  244. delete iconv;
  245. return NULL;
  246. } else {
  247. return iconv;
  248. }
  249. #else
  250. return ConverterFactory::get_internal_converter(encoding);
  251. #endif
  252. }
  253. Converter *ConverterFactory::get_converter_to(const char *encoding)
  254. {
  255. #ifdef USE_ICONV
  256. IconvConverter *iconv = new IconvConverter();
  257. if (!iconv->set_target_encoding(encoding)) {
  258. delete iconv;
  259. return NULL;
  260. } else {
  261. return iconv;
  262. }
  263. #else
  264. return ConverterFactory::get_internal_converter(encoding);
  265. #endif
  266. }