tzafrir
/
geresh
mirror of git://git.tzafrir.org.il/git/geresh.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
							// Copyright (C) 2003 Mooffie <mooffie@typo.co.il>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.

#include <config.h>

#include "utf8.h"
#include "univalues.h"
#include "dbg.h"

// utf8_to_unicode() - converts a UTF-8 string to unichars.
//
// Reads at most `len' bytes starting at `s' and stores one unichar in
// `dest' for every unit decoded. Every stored unichar consumes at least one
// input byte, so at most `len' unichars are stored. Returns the number of
// unichars stored.
//
// When an incomplete sequence is encountered at the end of the input,
// conversion stops and *problem will point to its head (it's similar to
// what iconv does). Otherwise *problem is set to NULL. `problem' itself
// may be NULL if the caller is not interested.
//
// A byte that cannot start a sequence, or a lead byte that is not followed
// by all the continuation bytes (10xxxxxx) it announces, is converted to
// UNI_REPLACEMENT and decoding resumes at the very next byte. This keeps a
// stray lead byte from swallowing the valid characters that follow it.
//
// This function converts UTF-8 to UCS-4 (not to UTF-32) -- that's why it
// recognizes 5- and 6-byte sequences. Overlong encodings are not rejected.

int utf8_to_unicode(unichar *dest, const char *s, int len, const char **problem)
{
    int length = 0;
    const char *end = s + len;

    if (problem)
	*problem = NULL;

    while (s < end) {
	int seqlen;	// bytes announced by the lead byte; 0 means invalid
	int i;

	if (!(*s & 0x80))
	    seqlen = 1;
	else if ((*s & 0xE0) == 0xC0)
	    seqlen = 2;
	else if ((*s & 0xF0) == 0xE0)
	    seqlen = 3;
	else if ((*s & 0xF8) == 0xF0)
	    seqlen = 4;
	else if ((*s & 0xFC) == 0xF8)
	    seqlen = 5;
	else if ((*s & 0xFE) == 0xFC)
	    seqlen = 6;
	else
	    seqlen = 0;

	if (seqlen > 1 && (end - s) < seqlen) {
	    // The sequence is cut short by the end of the input: report
	    // its head and let the caller decide what to do with the tail.
	    if (problem)
		*problem = s;
	    break;
	}

	// Every trailing byte must be a continuation byte; otherwise the
	// lead byte is treated exactly like an invalid one.
	for (i = 1; i < seqlen; i++) {
	    if ((s[i] & 0xC0) != 0x80) {
		seqlen = 0;
		break;
	    }
	}

	if (seqlen == 0) {
	    *dest++ = UNI_REPLACEMENT;
	    s++;
	} else if (seqlen == 1) {
	    *dest++ = *s;
	    s++;
	} else {
	    // The lead byte carries (7 - seqlen) payload bits; each
	    // continuation byte contributes 6 more.
	    unichar ch = *s & (0x7F >> seqlen);
	    for (i = 1; i < seqlen; i++)
		ch = (ch << 6) | (s[i] & 0x3F);
	    *dest++ = ch;
	    s += seqlen;
	}
	length++;
    }
    return length;
}

// unicode_to_utf8() - converts unichars to UTF-8.
//
// Encodes `len' unichars read from `us' into `dest' and returns the number
// of bytes written. No terminator is appended. Each unichar takes between
// 1 and 6 bytes; values of 0x4000000 and above use the 6-byte form.

int unicode_to_utf8(char *dest, const unichar *us, int len)
{
    // Marker bits of the lead byte, indexed by the sequence length.
    static const unsigned char lead_marker[] =
	{ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    int total = 0;

    for (; len--; us++) {
	const unichar ch = *us;
	int seqlen;
	int k;

	// How many bytes does this value need?
	if (ch < 0x80)
	    seqlen = 1;
	else if (ch < 0x800)
	    seqlen = 2;
	else if (ch < 0x10000)
	    seqlen = 3;
	else if (ch < 0x200000)
	    seqlen = 4;
	else if (ch < 0x4000000)
	    seqlen = 5;
	else
	    seqlen = 6;

	if (seqlen == 1) {
	    *dest++ = ch;
	} else {
	    // Lead byte: the topmost 6-bit group plus the length marker.
	    *dest++ = ((ch >> 6 * (seqlen - 1)) & 0x3F) | lead_marker[seqlen];
	    // Continuation bytes: remaining groups, high to low.
	    for (k = seqlen - 2; k >= 0; k--)
		*dest++ = ((ch >> 6 * k) & 0x3F) | 0x80;
	}
	total += seqlen;
    }
    return total;
}