tzafrir
/
geresh
同期ミラー git://git.tzafrir.org.il/git/geresh.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
							// Copyright (C) 2003 Mooffie <mooffie@typo.co.il>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.

#include <config.h>

#include <ctype.h> // toupper

#include "converters.h"
#include "iso88598.h"
#include "utf8.h"
#include "dbg.h"

// guess_encoding() - guesses the encoding of a string.
//
// Examines the first `len' bytes of `buf' and returns:
//
//   "UTF-16" or "UTF-32"  when the buffer starts with the matching
//                         Byte-Order Mark;
//   "UTF-8"               when there is no BOM, at least one complete
//                         multi-byte sequence was seen and no malformed
//                         sequence was found;
//   NULL                  otherwise: either the buffer has no byte with
//                         the high bit set (encoding unknown), or it
//                         contains bytes that cannot be UTF-8.
//
// A multi-byte sequence that is cut short by the end of the buffer is
// not counted as an error. Lead bytes announcing 5- and 6-byte
// sequences are accepted as well.

const char *guess_encoding(const char *buf, int len)
{
#define IS_UTF8		 "UTF-8"
#define IS_UTF16	 "UTF-16"
#define IS_UTF32	 "UTF-32"
#define IS_NOT_UTF8	 NULL
#define UNKNOWN_TYPE	 NULL

    if (len >= 2) {
	// check for BOM (Byte-Order Mark):
	//
	// UTF-16 big-endian          FE FF
	// UTF-16 little-endian       FF FE
	// UTF-32 big-endian          00 00 FE FF
	// UTF-32 little-endian       FF FE 00 00
	//
	// The UTF-16 marks are only two bytes long, so they must be
	// recognized in buffers shorter than four bytes too; buf[2] and
	// buf[3] are looked at only when they exist.

	const bool has_four = (len >= 4);

	if (buf[0] == (char)0xFE && buf[1] == (char)0xFF)
	    return IS_UTF16; // BE
	if (buf[0] == (char)0xFF && buf[1] == (char)0xFE) {
	    if (has_four && buf[2] == 0 && buf[3] == 0)
		return IS_UTF32; // LE
	    else
		return IS_UTF16; // LE
	}
	if (has_four && buf[0] == 0 && buf[1] == 0
		&& buf[2] == (char)0xFE && buf[3] == (char)0xFF)
	    return IS_UTF32; // BE
    }

    // No BOM was found. Go through the string and check for
    // UTF-8 sequences: if an illegal sequence was found, it's
    // IS_NOT_UTF8, if all sequences are legal, it's IS_UTF8, if
    // it doesn't have characters with high bit set, return
    // UNKNOWN.

    const char *result = UNKNOWN_TYPE;
    int nbytes = 0; // continuation bytes still expected
    for (int i = 0; i < len; i++) {
	if (nbytes) {
	    // Inside a sequence: only 10xxxxxx bytes are allowed.
	    if ((buf[i] & 0xC0) != 0x80) {
		return IS_NOT_UTF8;
	    }
	    if (! --nbytes)
		result = IS_UTF8; // a whole sequence was read
	} else if (buf[i] & 0x80) {
	    // Lead byte: the count of leading 1 bits tells how many
	    // continuation bytes follow.
	    const char &c = buf[i];
	    nbytes = (c & 0xE0) == 0xC0 ? 1 :
		     (c & 0xF0) == 0xE0 ? 2 :
		     (c & 0xF8) == 0xF0 ? 3 :
		     (c & 0xFC) == 0xF8 ? 4 :
		     (c & 0xFE) == 0xFC ? 5 : 0;
	    if (nbytes == 0) {
		// A stray continuation byte, or 0xFE / 0xFF.
		return IS_NOT_UTF8;
	    }
	}
    }
    return result;
}

#ifdef USE_ICONV
// A freshly built converter holds no conversion descriptor yet;
// (iconv_t)-1 is the "not open" marker the destructor tests for.
IconvConverter::IconvConverter()
{
    this->cd = (iconv_t)(-1);
}

// Releases the conversion descriptor, if one was ever opened.
IconvConverter::~IconvConverter()
{
    const bool never_opened = (cd == (iconv_t)-1);
    if (never_opened)
	return;
    iconv_close(cd);
}

// Opens a conversion descriptor that converts from `encoding' to
// INTERNAL_ENCODING.
//
// A descriptor that is still open from an earlier call is closed
// first, so calling this more than once does not leak descriptors.
//
// Returns true on success, false if iconv_open() fails (in which
// case the converter holds no descriptor).
int IconvConverter::set_source_encoding(const char *encoding)
{
    if (cd != (iconv_t)-1) {
	iconv_close(cd);
	cd = (iconv_t)-1;
    }
    cd = iconv_open(INTERNAL_ENCODING, encoding);
    if (cd == (iconv_t)-1) {
	return false;
    }
    return true;
}

// Opens a conversion descriptor that converts from INTERNAL_ENCODING
// to `encoding'.
//
// A descriptor that is still open from an earlier call is closed
// first, so calling this more than once does not leak descriptors.
//
// Returns true on success, false if iconv_open() fails (in which
// case the converter holds no descriptor).
int IconvConverter::set_target_encoding(const char *encoding)
{
    if (cd != (iconv_t)-1) {
	iconv_close(cd);
	cd = (iconv_t)-1;
    }
    cd = iconv_open(encoding, INTERNAL_ENCODING);
    if (cd == (iconv_t)-1) {
	return false;
    }
    return true;
}

// Feeds `len' bytes starting at *src to iconv(), which writes its
// output at *dest. Both pointers are passed to iconv() by address.
//
// Returns -1 when iconv() reports failure, otherwise iconv()'s own
// return value.
int IconvConverter::convert(unichar **dest, char **src, int len)
{
    // :FIXME:  The real capacity of the output buffer is not known
    //          here. Several xxx_MAX constants were tried for this
    //          value but they make iconv() fail.
    size_t out_room = 99999;
    size_t in_left = len;

    const size_t rc = iconv(cd, (ICONV_CONST char **)src, &in_left,
			    (char **)dest, &out_room);
    if (rc == (size_t)-1)
	return (int)-1;
    return (int)rc;
}

// Feeds `len' unichars starting at *src to iconv(), which writes its
// output at *dest.
//
// When `ilseq_repr' is set, a character that iconv() rejects with
// EILSEQ is replaced by "?" and the conversion is resumed with the
// next character; otherwise the first failure ends the conversion.
//
// Returns -1 if the last iconv() call failed, otherwise that call's
// return value.
int IconvConverter::convert(char **dest, unichar **src, int len)
{
    size_t out_room = 99999; // :FIXME:
    size_t in_left = len * sizeof(unichar);
    size_t rc;

    for (;;) {
	rc = iconv(cd, (ICONV_CONST char **)src, &in_left,
		   dest, &out_room);

	const bool substitute =
	    ilseq_repr && rc == (size_t)-1 && errno == EILSEQ;
	if (!substitute)
	    break;

	// Skip the offending input character...
	++(*src);
	in_left -= sizeof(unichar);
	// ...and emit a "?" in its place.
	**dest = '?';
	++(*dest);
	--out_room;
    }

    if (rc == (size_t)-1)
	return (int)-1;
    return (int)rc;
}
#endif

// Converts `len' bytes at *src to unichars at *dest, one byte at a
// time through iso88598_to_unicode(). *src and *dest are left
// pointing just past the converted data.
//
// Returns the number of characters converted.
int ISO88598Converter::convert(unichar **dest, char **src, int len)
{
    unichar * &out = *dest;
    char *    &in  = *src;
    int converted = 0;

    for (; len != 0; --len) {
	*out = iso88598_to_unicode(*in);
	++out;
	++in;
	++converted;
    }
    return converted;
}

int ISO88598Converter::convert(char **dest, unichar **src, int len)
{
    int count = 0;
    char *    &d = *dest;
    unichar * &s = *src;
    while (len--) {
	int ich = unicode_to_iso88598(*s);
	if (ich == EOF) {
	    if (ilseq_repr) {
		ich = '?';
	    } else {
		errno = EILSEQ;
		return -1;
	    }
	}
	*d++ = (char)ich;
	s++;
	count++;
    }
    return count;
}

// Converts `len' bytes at *src to unichars at *dest. Each byte is
// read as an unsigned value and stored unchanged. *src and *dest are
// left pointing just past the converted data.
//
// Returns `len'.
int Latin1Converter::convert(unichar **dest, char **src, int len)
{
    unichar * &out = *dest;
    char *    &in  = *src;

    for (int remaining = len; remaining != 0; --remaining) {
	*out = (unsigned char)*in;
	++out;
	++in;
    }
    return len;
}

int Latin1Converter::convert(char **dest, unichar **src, int len)
{
    int count = len;
    char *    &d = *dest;
    unichar * &s = *src;
    while (len--) {
	if (*s > 0xFF) {
	    if (ilseq_repr) {
		*d++ = '?';
		s++;		
	    } else {
	    	errno = EILSEQ;
	    	return -1;
	    }
	} else {
	    *d++ = (char)*s++;
	}
    }
    return count;
}

// Decodes `len' bytes at *src into unichars at *dest by calling
// utf8_to_unicode(). *dest is advanced by the count that call
// returns.
//
// If utf8_to_unicode() reports a problem position, *src is set to
// that position, errno is set to EINVAL and -1 is returned.
// Otherwise *src is advanced by `len' and the count is returned.
int UTF8Converter::convert(unichar **dest, char **src, int len)
{
    unichar * &out = *dest;
    char *    &in  = *src;
    const char *problem;

    const int decoded = utf8_to_unicode(out, in, len, &problem);
    out += decoded;

    if (problem) {
	in = (char *)problem;
	errno = EINVAL;
	return -1;
    }

    in += len;
    return decoded;
}

// Encodes `len' unichars at *src into bytes at *dest by calling
// unicode_to_utf8(). *dest is advanced by the value that call
// returns, *src by `len'.
//
// Returns `len'.
int UTF8Converter::convert(char **dest, unichar **src, int len)
{
    const int written = unicode_to_utf8(*dest, *src, len);
    *dest += written;
    *src += len;
    return len;
}

// Creates one of the built-in converters (UTF-8, ISO-8859-8, Latin-1)
// for the encoding named `enc'.
//
// Returns a newly allocated converter, or NULL when the name is not
// one of the recognized spellings.
Converter *ConverterFactory::get_internal_converter(const char *enc)
{
    // Canonicalize the encoding name: drop every '-' and uppercase it,
    // so that e.g. "utf-8" and "UTF8" compare equal.
    u8string encoding = u8string(enc).erase_char('-').toupper_ascii();

    DBG(1, ("looking for internal '%s' converter\n", encoding.c_str()));

    Converter *conv = NULL;
    if (encoding == "UTF8") {
	conv = new UTF8Converter();
    } else if (encoding == "ISO88598" || encoding == "88598") {
	conv = new ISO88598Converter();
    } else if (encoding == "ISO88591" || encoding == "LATIN1"
	       || encoding == "88591" || encoding == "ASCII"
	       || encoding == "USASCII") {
	// ASCII is served by the Latin-1 converter as well.
	conv = new Latin1Converter();
    }
    return conv;
}

// Returns a newly allocated converter that reads text in `encoding',
// or NULL if no such converter can be set up.
//
// With USE_ICONV the work is delegated to iconv; without it only the
// built-in converters are available.
Converter *ConverterFactory::get_converter_from(const char *encoding)
{
#ifdef USE_ICONV
    IconvConverter *conv = new IconvConverter();
    if (conv->set_source_encoding(encoding))
	return conv;
    // iconv does not know this encoding.
    delete conv;
    return NULL;
#else
    return ConverterFactory::get_internal_converter(encoding);
#endif
}

// Returns a newly allocated converter that writes text in `encoding',
// or NULL if no such converter can be set up.
//
// With USE_ICONV the work is delegated to iconv; without it only the
// built-in converters are available.
Converter *ConverterFactory::get_converter_to(const char *encoding)
{
#ifdef USE_ICONV
    IconvConverter *conv = new IconvConverter();
    if (conv->set_target_encoding(encoding))
	return conv;
    // iconv does not know this encoding.
    delete conv;
    return NULL;
#else
    return ConverterFactory::get_internal_converter(encoding);
#endif
}