converters.cc 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. // Copyright (C) 2003 Mooffie <mooffie@typo.co.il>
  2. //
  3. // This program is free software; you can redistribute it and/or modify
  4. // it under the terms of the GNU General Public License as published by
  5. // the Free Software Foundation; either version 2 of the License, or
  6. // (at your option) any later version.
  7. //
  8. // This program is distributed in the hope that it will be useful,
  9. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. // GNU General Public License for more details.
  12. //
  13. // You should have received a copy of the GNU General Public License
  14. // along with this program; if not, write to the Free Software
  15. // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
  16. #include <config.h>
  17. #include <ctype.h> // toupper
  18. #include <stdio.h>
  19. #include "converters.h"
  20. #include "iso88598.h"
  21. #include "utf8.h"
  22. #include "dbg.h"
  // guess_encoding() - guesses the encoding of a buffer.
  //
  // Examines the first `len` bytes of `buf` and returns a pointer to a
  // string literal naming the encoding, or NULL:
  //
  //   "UTF-16" / "UTF-32"  the buffer starts with the corresponding BOM;
  //   "UTF-8"              no BOM, at least one complete multi-byte UTF-8
  //                        sequence was seen and no illegal one;
  //   NULL                 an illegal UTF-8 sequence was found, or no
  //                        complete multi-byte sequence was seen (e.g.
  //                        the buffer has no bytes with the high bit set).
  //
  // A multi-byte sequence that is cut off by the end of the buffer is not
  // treated as an error; it simply doesn't count as evidence for UTF-8.
  const char *guess_encoding(const char *buf, int len)
  {
  #define IS_UTF8 "UTF-8"
  #define IS_UTF16 "UTF-16"
  #define IS_UTF32 "UTF-32"
  #define IS_NOT_UTF8 NULL
  #define UNKNOWN_TYPE NULL

      // check for BOM (Byte-Order Mark):
      //
      // UTF-16 big-endian       FE FF
      // UTF-16 little-endian    FF FE
      // UTF-32 big-endian       00 00 FE FF
      // UTF-32 little-endian    FF FE 00 00
      //
      // The two-byte marks only need two bytes of input, so they are
      // looked for even when the buffer is shorter than four bytes.
      if (len >= 2) {
          if (buf[0] == (char)0xFE && buf[1] == (char)0xFF)
              return IS_UTF16; // BE

          if (buf[0] == (char)0xFF && buf[1] == (char)0xFE) {
              // FF FE is also the first half of the UTF-32 LE mark;
              // the next two bytes (when present) tell them apart.
              if (len >= 4 && buf[2] == 0 && buf[3] == 0)
                  return IS_UTF32; // LE
              return IS_UTF16; // LE
          }
      }
      if (len >= 4
              && buf[0] == 0 && buf[1] == 0
              && buf[2] == (char)0xFE && buf[3] == (char)0xFF)
          return IS_UTF32; // BE

      // No BOM was found. Go through the string and check for
      // UTF-8 sequences: if an illegal sequence was found, it's
      // IS_NOT_UTF8, if all sequences are legal, it's IS_UTF8, if
      // it doesn't have characters with high bit set, return
      // UNKNOWN.
      const char *result = UNKNOWN_TYPE;
      int nbytes = 0; // continuation bytes still expected
      for (int i = 0; i < len; i++) {
          const unsigned char c = (unsigned char)buf[i];

          if (nbytes) {
              // Inside a sequence: only 10xxxxxx is acceptable.
              if ((c & 0xC0) != 0x80)
                  return IS_NOT_UTF8;
              if (--nbytes == 0)
                  result = IS_UTF8; // one complete sequence seen
          } else if (c & 0x80) {
              // Lead byte: its high bits give the number of
              // continuation bytes that must follow.
              if ((c & 0xE0) == 0xC0)
                  nbytes = 1;
              else if ((c & 0xF0) == 0xE0)
                  nbytes = 2;
              else if ((c & 0xF8) == 0xF0)
                  nbytes = 3;
              else if ((c & 0xFC) == 0xF8)
                  nbytes = 4;
              else if ((c & 0xFE) == 0xFC)
                  nbytes = 5;
              else
                  return IS_NOT_UTF8; // stray continuation byte, 0xFE or 0xFF
          }
      }
      return result;
  }
  78. #ifdef USE_ICONV
  79. IconvConverter::IconvConverter()
  80. {
  81. cd = (iconv_t)-1;
  82. }
  83. IconvConverter::~IconvConverter()
  84. {
  85. if (cd != (iconv_t)-1)
  86. iconv_close(cd);
  87. }
  88. int IconvConverter::set_source_encoding(const char *encoding)
  89. {
  90. cd = iconv_open(INTERNAL_ENCODING, encoding);
  91. if (cd == (iconv_t)-1) {
  92. return false;
  93. }
  94. return true;
  95. }
  96. int IconvConverter::set_target_encoding(const char *encoding)
  97. {
  98. cd = iconv_open(encoding, INTERNAL_ENCODING);
  99. if (cd == (iconv_t)-1) {
  100. return false;
  101. }
  102. return true;
  103. }
  104. int IconvConverter::convert(unichar **dest, char **src, int len)
  105. {
  106. size_t dest_avail = 99999; // :FIXME: I've tried several xxx_MAX constants
  107. // but it makes iconv() fail.
  108. size_t src_bytes_left = len;
  109. size_t result = iconv(cd, (ICONV_CONST char **)src, &src_bytes_left,
  110. (char **)dest, &dest_avail);
  111. return (result == (size_t)-1) ? (int)-1 : (int)result;
  112. }
  113. int IconvConverter::convert(char **dest, unichar **src, int len)
  114. {
  115. size_t dest_avail = 99999; // :FIXME:
  116. size_t src_bytes_left = len * sizeof(unichar);
  117. size_t result;
  118. while (1) {
  119. result = iconv(cd, (ICONV_CONST char **)src, &src_bytes_left,
  120. dest, &dest_avail);
  121. if (ilseq_repr && result == (size_t)-1 && errno == EILSEQ) {
  122. // We're asked to represent EILSEQ as "?".
  123. // we put "?" in **dest, and advance the
  124. // src pointer.
  125. (*src)++;
  126. src_bytes_left -= sizeof(unichar);
  127. (**dest) = '?';
  128. (*dest)++;
  129. dest_avail--;
  130. } else {
  131. break;
  132. }
  133. }
  134. return (result == (size_t)-1) ? (int)-1 : (int)result;
  135. }
  136. #endif
  137. int ISO88598Converter::convert(unichar **dest, char **src, int len)
  138. {
  139. int count = 0;
  140. unichar * &d = *dest;
  141. char * &s = *src;
  142. while (len--) {
  143. *d++ = iso88598_to_unicode(*s++);
  144. count++;
  145. }
  146. return count;
  147. }
  148. int ISO88598Converter::convert(char **dest, unichar **src, int len)
  149. {
  150. int count = 0;
  151. char * &d = *dest;
  152. unichar * &s = *src;
  153. while (len--) {
  154. int ich = unicode_to_iso88598(*s);
  155. if (ich == EOF) {
  156. if (ilseq_repr) {
  157. ich = '?';
  158. } else {
  159. errno = EILSEQ;
  160. return -1;
  161. }
  162. }
  163. *d++ = (char)ich;
  164. s++;
  165. count++;
  166. }
  167. return count;
  168. }
  169. int Latin1Converter::convert(unichar **dest, char **src, int len)
  170. {
  171. int count = len;
  172. unichar * &d = *dest;
  173. char * &s = *src;
  174. while (len--)
  175. *d++ = (unsigned char)*s++;
  176. return count;
  177. }
  178. int Latin1Converter::convert(char **dest, unichar **src, int len)
  179. {
  180. int count = len;
  181. char * &d = *dest;
  182. unichar * &s = *src;
  183. while (len--) {
  184. if (*s > 0xFF) {
  185. if (ilseq_repr) {
  186. *d++ = '?';
  187. s++;
  188. } else {
  189. errno = EILSEQ;
  190. return -1;
  191. }
  192. } else {
  193. *d++ = (char)*s++;
  194. }
  195. }
  196. return count;
  197. }
  198. int UTF8Converter::convert(unichar **dest, char **src, int len)
  199. {
  200. int count = 0;
  201. unichar * &d = *dest;
  202. char * &s = *src;
  203. const char *problem;
  204. count = utf8_to_unicode(d, s, len, &problem);
  205. if (problem) {
  206. d += count;
  207. s = (char *)problem;
  208. errno = EINVAL;
  209. return -1;
  210. } else {
  211. d += count;
  212. s += len;
  213. }
  214. return count;
  215. }
  216. int UTF8Converter::convert(char **dest, unichar **src, int len)
  217. {
  218. char * &d = *dest;
  219. unichar * &s = *src;
  220. int nbytes = unicode_to_utf8(d, s, len);
  221. d += nbytes;
  222. s += len;
  223. return len;
  224. }
  225. Converter *ConverterFactory::get_internal_converter(const char *enc)
  226. {
  227. // canonize the encoding name: remove '-', and upperace.
  228. u8string encoding = u8string(enc).erase_char('-').toupper_ascii();
  229. DBG(1, ("looking for internal '%s' converter\n", encoding.c_str()));
  230. if (encoding == "UTF8")
  231. return new UTF8Converter();
  232. if (encoding == "ISO88598" || encoding == "88598")
  233. return new ISO88598Converter();
  234. if (encoding == "ISO88591" || encoding == "LATIN1"
  235. || encoding == "88591" || encoding == "ASCII"
  236. || encoding == "USASCII")
  237. return new Latin1Converter();
  238. return NULL;
  239. }
  240. Converter *ConverterFactory::get_converter_from(const char *encoding)
  241. {
  242. #ifdef USE_ICONV
  243. IconvConverter *iconv = new IconvConverter();
  244. if (!iconv->set_source_encoding(encoding)) {
  245. delete iconv;
  246. return NULL;
  247. } else {
  248. return iconv;
  249. }
  250. #else
  251. return ConverterFactory::get_internal_converter(encoding);
  252. #endif
  253. }
  254. Converter *ConverterFactory::get_converter_to(const char *encoding)
  255. {
  256. #ifdef USE_ICONV
  257. IconvConverter *iconv = new IconvConverter();
  258. if (!iconv->set_target_encoding(encoding)) {
  259. delete iconv;
  260. return NULL;
  261. } else {
  262. return iconv;
  263. }
  264. #else
  265. return ConverterFactory::get_internal_converter(encoding);
  266. #endif
  267. }