/* Repository: flatwhatson/scheme48 */
							/* Part of Scheme 48 1.9.  See file COPYING for notices and license.
 *
 * Authors: Mike Sperber, Robert Ransom
 */

/*
 * This file defines functions for dealing with a synthetic text
 * encoding called UTF-8of16.  It's like UTF-8, but also encodes
 * unpaired surrogates directly (as ordinary 3-byte sequences), which
 * is what we need for the Windows API, where UTF-16 strings are not
 * guaranteed to be well-formed.
 */

#include <windows.h>

/*
 * Payload-bit masks for the first byte of a UTF-8 sequence, indexed by
 * the number of continuation bytes that follow that first byte.
 */
static char masks[4] =
  {
    0x7f,   /* 0xxxxxxx: single byte, 7 payload bits        */
    0x1f,   /* 110xxxxx: 1 continuation byte, 5 payload bits */
    0x0f,   /* 1110xxxx: 2 continuation bytes, 4 payload bits */
    0x07    /* 11110xxx: 3 continuation bytes, 3 payload bits */
  };
 
/*
 * Convert a NUL-terminated UTF-16 string to UTF-8of16.
 *
 * A high surrogate that is immediately followed by a low surrogate is
 * combined into one supplementary code point and written as 4 bytes.
 * Every other code unit -- including an unpaired surrogate -- is
 * written directly as a 1-, 2- or 3-byte sequence.
 *
 * - NUL-terminates
 * - if utf_8of16 is NULL, we just compute the size
 * - returns size (sans NUL) needed for UTF-8of16
 *
 * The caller must supply a buffer of at least (returned size + 1)
 * bytes; no bounds checking is done here.
 */

int
s48_utf_16_to_utf_8of16(LPWSTR utf_16,
                        unsigned char* utf_8of16)
{
  int p = 0, i = 0;
  while (utf_16[i])
    {
      unsigned int c = utf_16[i];
      ++i;
      /* The terminating NUL is never in DC00..DFFF, so a high surrogate
         at the very end of the string falls through as unpaired. */
      if ((c >= 0xD800) && (c <= 0xDBFF) /* high surrogate */
          && (utf_16[i] >= 0xDC00) && (utf_16[i] <= 0xDFFF)) /* low surrogate */
        {
          /* 0xd7c0 == 0xd800 - (0x10000 >> 10): removes the surrogate
             tag and adds the 0x10000 plane offset in one step. */
          c = ((c - 0xd7c0) << 10) + (utf_16[i] & 0x3ff);
          ++i;
        }

      if (c <= 0x7f)
        {
          /* 0xxxxxxx */
          if (utf_8of16)
            utf_8of16[p] = (unsigned char) c;
          ++p;
        }
      else if (c <= 0x7ff)
        {
          /* 110xxxxx 10xxxxxx */
          if (utf_8of16)
            {
              utf_8of16[p] = (unsigned char) ((c >> 6) + 0xc0);
              utf_8of16[p+1] = (unsigned char) ((c & 0x3f) + 0x80);
            }
          p += 2;
        }
      else if (c <= 0xffff)
        {
          /* 1110xxxx 10xxxxxx 10xxxxxx (also used for lone surrogates) */
          if (utf_8of16)
            {
              utf_8of16[p] = (unsigned char) ((c >> 12) + 0xe0);
              utf_8of16[p+1] = (unsigned char) (((c >> 6) & 0x3f) + 0x80);
              utf_8of16[p+2] = (unsigned char) ((c & 0x3f) + 0x80);
            }
          p += 3;
        }
      else
        {
          /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
          if (utf_8of16)
            {
              utf_8of16[p] = (unsigned char) ((c >> 18) + 0xf0);
              /* second byte carries bits 12..17 as a continuation byte */
              utf_8of16[p+1] = (unsigned char) (((c >> 12) & 0x3f) + 0x80);
              utf_8of16[p+2] = (unsigned char) (((c >> 6) & 0x3f) + 0x80);
              utf_8of16[p+3] = (unsigned char) ((c & 0x3f) + 0x80);
            }
          p += 4;
        }
    }
  if (utf_8of16)
    utf_8of16[p] = 0;
  return p;
}

/*
 * The table, and the associated decoding algorithm, is from
 * Richard Gillam: "Unicode Demystified", chapter 14
 *
 * states[s][b >> 3] is the state entered after reading byte b while in
 * state s:
 *     0    a character is complete (ASCII byte or last continuation byte)
 *   1..3   that many continuation bytes are still expected
 *    -1    b cannot start a sequence; it is consumed and replaced
 *    -2    a continuation byte was expected but b is something else;
 *          the partial sequence is replaced and b is scanned again
 *
 * The element type is signed char because the table holds negative
 * values and plain char may be unsigned.
 */

static const signed char states[4][32] =
  {
    /* state 0: expecting the first byte of a sequence */
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1},
    /* state 1: one continuation byte left */
    {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -2, -2, -2, -2},
    /* state 2: two continuation bytes left */
    {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 1, 1, 1, 1, 1, 1, 1, 1, -2, -2, -2, -2, -2, -2, -2, -2},
    /* state 3: three continuation bytes left */
    {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2},
  };

/*
 * Convert a NUL-terminated UTF-8of16 string to UTF-16.
 *
 * Code points above 0xffff are written as a surrogate pair; everything
 * else (including 3-byte encoded lone surrogates) is written as a
 * single code unit.  Each malformed sequence -- a stray or unexpected
 * byte, a sequence cut off by the terminating NUL, or a value above
 * 0x10ffff -- yields one U+FFFD and is reported through errorp.
 *
 * - NUL-terminates
 * - if utf_16 is NULL, we just compute the size
 * - returns size (sans NUL) needed for UTF-16
 * - if errorp is non-NULL, *errorp is set to 1 if any malformed input
 *   was seen, and to 0 otherwise
 */

int
s48_utf_8of16_to_utf_16(const unsigned char* utf_8of16,
                        LPWSTR utf_16,
                        int* errorp)
{
  int p = 0, q = 0, state = 0, error = 0;
  unsigned int scalar_value = 0;
  unsigned mask = 0;  /* 0 means "next byte is the first of a sequence" */

  while (utf_8of16[q])
    {
      unsigned char c = utf_8of16[q];
      ++q;

      state = states[state][c >> 3];

      switch (state) {
      case 0:
        /* ASCII byte, or final continuation byte (bit 6 is clear, so
           0x7f extracts the same 6 payload bits as 0x3f would) */
        scalar_value += c & 0x7f;

        if (scalar_value > 0x10ffff)
          {
            /* not representable as a surrogate pair */
            if (utf_16)
              utf_16[p] = 0xfffd;
            ++p;
            error = 1;
          }
        else if (scalar_value > 0xffff)
          {
            if (utf_16)
              {
                /* 0xD7C0 == 0xD800 - (0x10000 >> 10) */
                utf_16[p] = (scalar_value >> 10) + 0xD7C0;
                utf_16[p+1] = (scalar_value & 0x3FF) + 0xDC00;
              }
            p += 2;
          }
        else
          {
            if (utf_16)
              utf_16[p] = scalar_value;
            ++p;
          }
        scalar_value = 0;
        mask = 0;
        break;

      case 1:
      case 2:
      case 3:
        /* First byte of a multi-byte sequence: its payload mask depends
           on the sequence length; later bytes carry 6 payload bits. */
        if (mask == 0)
          mask = masks[state];
        scalar_value = (scalar_value + (c & mask)) << 6;
        mask = 0x3f;
        break;

      case -2:
        /* c does not belong to the broken sequence; scan it again */
        --q;
        /* fall thru */

      case -1:
        if (utf_16)
          utf_16[p] = 0xfffd;
        ++p;
        scalar_value = 0;
        state = 0;
        mask = 0;
        error = 1;
        break;
      }
    }

  /* The input ended in the middle of a multi-byte sequence. */
  if (state != 0)
    {
      if (utf_16)
        utf_16[p] = 0xfffd;
      ++p;
      error = 1;
    }

  if (errorp)
    *errorp = error;
  if (utf_16)
    utf_16[p] = 0;
  return p;
}

/*

Ad-hoc test driver (disabled): round-trips a UTF-16 string containing
an unpaired high surrogate and a valid surrogate pair.

#include <stdlib.h>
#include <stdio.h>

int
main(void)
{
  WCHAR t1[] = { 'A', 'B', 0xd800, 0xd900, 0xdfff, 'C', 'D', 0 };

  int size_8 = s48_utf_16_to_utf_8of16(t1, NULL);
  printf("size_8 %d\n", size_8);

  unsigned char c[1000];

  size_8 = s48_utf_16_to_utf_8of16(t1, c);

  printf("size_8 %d\n", size_8);

  {
    int i = 0;
    while (i < size_8)
      {
	printf("%d: %4x\n", i, c[i]);
	++i;
      }
  }

  WCHAR u[1000];
  int error;

  int size_16 = s48_utf_8of16_to_utf_16(c, NULL, &error);
  printf("size_16 %d\n", size_16);

  size_16 = s48_utf_8of16_to_utf_16(c, u, &error);
  printf("size_16 %d\n", size_16);

  {
    int i = 0;
    while (i < size_16)
      {
	printf("%d: %4x\n", i, u[i]);
	++i;
      }
  }
  
}
*/