123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- /* UnicodeReader.java --
- Copyright (C) 2005 Free Software Foundation, Inc.
- This file is part of GNU Classpath.
- GNU Classpath is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
- GNU Classpath is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with GNU Classpath; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- 02110-1301 USA.
- Linking this library statically or dynamically with other modules is
- making a combined work based on this library. Thus, the terms and
- conditions of the GNU General Public License cover the whole
- combination.
- As a special exception, the copyright holders of this library give you
- permission to link this library with independent modules to produce an
- executable, regardless of the license terms of these independent
- modules, and to copy and distribute the resulting executable under
- terms of your choice, provided that you also meet, for each linked
- independent module, the terms and conditions of the license of that
- module. An independent module is a module which is not derived from
- or based on this library. If you modify this library, you may extend
- this exception to your version of the library, but you are not
- obligated to do so. If you do not wish to do so, delete this
- exception statement from your version. */
- package gnu.xml.stream;
- import java.io.IOException;
- import java.io.Reader;
/**
 * A reader that converts UTF-16 characters to Unicode code points.
 * <p>
 * UTF-16 code units are pulled from an underlying {@link Reader}; every
 * well-formed surrogate pair is combined into a single supplementary code
 * point, and any surrogate that is not part of such a pair is reported
 * with an {@link IOException}.
 *
 * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
 */
public class UnicodeReader
{

  /** The underlying source of UTF-16 code units. */
  final Reader in;

  /**
   * Creates a code point reader on top of the given character reader.
   *
   * @param in the reader supplying UTF-16 code units
   */
  UnicodeReader(Reader in)
  {
    this.in = in;
  }

  /**
   * Marks the current position in the underlying reader.
   *
   * @param limit the read-ahead limit; it is doubled before being handed
   * to the underlying reader, as one code point may take two UTF-16 code
   * units
   * @throws IOException if the underlying reader fails to set the mark
   */
  public void mark(int limit)
    throws IOException
  {
    // NOTE: limit * 2 wraps around for limits above Integer.MAX_VALUE / 2.
    in.mark(limit * 2);
  }

  /**
   * Resets the underlying reader to its most recent mark.
   *
   * @throws IOException if the underlying reader fails to reset
   */
  public void reset()
    throws IOException
  {
    in.reset();
  }

  /**
   * Reads a single Unicode code point.
   *
   * @return the code point, or -1 if the underlying reader is exhausted
   * @throws IOException if the underlying reader fails, or if a surrogate
   * is met that is not part of a well-formed pair
   */
  public int read()
    throws IOException
  {
    int unit = in.read();
    if (unit == -1)
      return -1;
    char c = (char) unit;
    if (Character.isHighSurrogate(c))
      {
        // A high surrogate must be followed immediately by a low one.
        int next = in.read();
        if (next == -1 || !Character.isLowSurrogate((char) next))
          throw unpaired(c);
        return Character.toCodePoint(c, (char) next);
      }
    if (Character.isLowSurrogate(c))
      throw unpaired(c);
    return unit;
  }

  /**
   * Reads code points into a portion of an array.
   * <p>
   * At most <code>len</code> UTF-16 code units are requested from the
   * underlying reader in one bulk read. If the last unit obtained is a
   * high surrogate, one further unit is read to complete the pair. As a
   * surrogate pair collapses into one code point, the number of code
   * points stored never exceeds <code>len</code>.
   *
   * @param buf the destination array
   * @param off the index in <code>buf</code> of the first code point stored
   * @param len the maximum number of UTF-16 code units to request
   * @return the number of code points stored in <code>buf</code>, 0 if
   * <code>len</code> is 0, or whatever non-positive value the underlying
   * reader returned (-1 at end of stream)
   * @throws IOException if the underlying reader fails, or if a surrogate
   * is met that is not part of a well-formed pair
   */
  public int read(int[] buf, int off, int len)
    throws IOException
  {
    if (len == 0)
      return 0;
    char[] units = new char[len];
    int available = in.read(units, 0, len);
    if (available <= 0)
      return available;
    int out = off;
    int pos = 0;
    while (pos < available)
      {
        char c = units[pos++];
        if (Character.isHighSurrogate(c))
          {
            int next;
            if (pos < available)
              next = units[pos++];
            else
              // The pair straddles the end of this chunk: fetch its second
              // half directly from the underlying reader.
              next = in.read();
            if (next == -1 || !Character.isLowSurrogate((char) next))
              throw unpaired(c);
            buf[out++] = Character.toCodePoint(c, (char) next);
          }
        else if (Character.isLowSurrogate(c))
          throw unpaired(c);
        else
          buf[out++] = c;
      }
    // Report a count, not the end index, so a non-zero offset is handled.
    return out - off;
  }

  /**
   * Closes the underlying reader.
   *
   * @throws IOException if the underlying reader fails to close
   */
  public void close()
    throws IOException
  {
    in.close();
  }

  /**
   * Returns the specified UTF-16 text as an array of Unicode code
   * points.
   *
   * @param text the UTF-16 text to convert
   * @return a new array holding exactly one entry per code point
   * @throws IOException if the text contains a surrogate that is not part
   * of a well-formed pair, including one at the very end of the text
   */
  public static int[] toCodePointArray(String text)
    throws IOException
  {
    int length = text.length();
    int[] buf = new int[length];
    int count = 0;
    int pos = 0;
    while (pos < length)
      {
        char c = text.charAt(pos++);
        if (Character.isHighSurrogate(c))
          {
            if (pos == length || !Character.isLowSurrogate(text.charAt(pos)))
              throw unpaired(c);
            buf[count++] = Character.toCodePoint(c, text.charAt(pos++));
          }
        else if (Character.isLowSurrogate(c))
          throw unpaired(c);
        else
          buf[count++] = c;
      }
    if (count == length)
      return buf;
    // Surrogate pairs were collapsed: shrink to the number of code points.
    int[] trimmed = new int[count];
    System.arraycopy(buf, 0, trimmed, 0, count);
    return trimmed;
  }

  /** Builds the exception that reports a surrogate lacking its partner. */
  private static IOException unpaired(char c)
  {
    return new IOException("unpaired surrogate: U+" +
                           Integer.toHexString(c));
  }

}
|