Output_UTF8.java 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. /* Copyright (C) 1999, 2000, 2003, 2006 Free Software Foundation
  2. This file is part of libgcj.
  3. This software is copyrighted work licensed under the terms of the
  4. Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
  5. details. */
  6. package gnu.gcj.convert;
  7. /**
  8. * Convert Unicode to UTF8.
  9. * @author Per Bothner <bothner@cygnus.com>
  10. * @date March 1999.
  11. */
  12. public class Output_UTF8 extends UnicodeToBytes
  13. {
  14. public String getName() { return "UTF8"; }
  15. /** True if a surrogate pair should be emitted as a single UTF8 sequence.
  16. * Otherwise, a surrogate pair is treated as two separate characters.
  17. * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
  18. public boolean standardUTF8 = true;
  19. // Saves the previous char if it was a high-surrogate.
  20. char hi_part;
  21. // Value of incomplete character.
  22. int value;
  23. // Number of continuation bytes still to emit.
  24. int bytes_todo;
  25. public int write (char[] inbuffer, int inpos, int inlength)
  26. {
  27. int start_pos = inpos;
  28. int avail = buf.length - count;
  29. for (;;)
  30. {
  31. if (avail == 0 || (inlength == 0 && bytes_todo == 0 && hi_part == 0))
  32. break;
  33. // The algorithm is made more complicated because we want to write
  34. // at least one byte in the output buffer, if there is room for
  35. // that byte, and at least one input character is available.
  36. // This makes the code more robust, since client code will
  37. // always "make progress", even in the complicated cases,
  38. // where the output buffer only has room for only *part* of a
  39. // multi-byte sequence, or the input char buffer only has half
  40. // of a surrogate pair (when standardUTF8 is set), or both.
  41. // Handle continuation characters we did not have room for before.
  42. if (bytes_todo > 0)
  43. {
  44. do
  45. {
  46. bytes_todo--;
  47. buf[count++] = (byte)
  48. (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
  49. avail--;
  50. }
  51. while (bytes_todo > 0 && avail > 0);
  52. continue;
  53. }
  54. // Handle a high surrogate at the end of the input stream.
  55. if (inlength == 0 && hi_part != 0)
  56. {
  57. buf[count++] = (byte) (0xE0 | (hi_part >> 12));
  58. value = hi_part;
  59. hi_part = 0;
  60. avail--;
  61. bytes_todo = 2;
  62. continue;
  63. }
  64. char ch = inbuffer[inpos++];
  65. inlength--;
  66. if (hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
  67. {
  68. // If the previous character was a high surrogate, and we
  69. // don't now have a low surrogate, we print the high
  70. // surrogate as an isolated character.
  71. --inpos;
  72. ++inlength;
  73. buf[count++] = (byte) (0xE0 | (hi_part >> 12));
  74. value = hi_part;
  75. hi_part = 0;
  76. avail--;
  77. bytes_todo = 2;
  78. }
  79. else if (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF)
  80. {
  81. // If this character is a low surrogate and we didn't
  82. // previously see a high surrogate, we do the same thing
  83. // as above.
  84. buf[count++] = (byte) (0xE0 | (ch >> 12));
  85. value = ch;
  86. avail--;
  87. bytes_todo = 2;
  88. }
  89. else if (ch < 128 && (ch != 0 || standardUTF8))
  90. {
  91. avail--;
  92. buf[count++] = (byte) ch;
  93. }
  94. else if (ch <= 0x07FF)
  95. {
  96. buf[count++] = (byte) (0xC0 | (ch >> 6));
  97. avail--;
  98. value = ch;
  99. bytes_todo = 1;
  100. }
  101. else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
  102. {
  103. if (ch <= 0xDBFF) // High surrogates
  104. {
  105. // Just save the high surrogate until the next
  106. // character comes along.
  107. hi_part = ch;
  108. }
  109. else // Low surrogates
  110. {
  111. value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
  112. buf[count++] = (byte) (0xF0 | (value >> 18));
  113. avail--;
  114. bytes_todo = 3;
  115. hi_part = 0;
  116. }
  117. }
  118. else
  119. {
  120. buf[count++] = (byte) (0xE0 | (ch >> 12));
  121. value = ch;
  122. avail--;
  123. bytes_todo = 2;
  124. }
  125. }
  126. return inpos - start_pos;
  127. }
  128. public boolean havePendingBytes()
  129. {
  130. return bytes_todo > 0 || hi_part != 0;
  131. }
  132. }