rune.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. /*
  2. * The authors of this software are Rob Pike and Ken Thompson.
  3. * Copyright (c) 2002 by Lucent Technologies.
  4. * Portions Copyright 2009 The Go Authors. All rights reserved.
  5. * Permission to use, copy, modify, and distribute this software for any
  6. * purpose without fee is hereby granted, provided that this entire notice
  7. * is included in all copies of any software which is or includes a copy
  8. * or modification of this software and in all copies of the supporting
  9. * documentation for such software.
  10. * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
  11. * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
  12. * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
  13. * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
  14. */
  15. /*
  16. * This code is copied, with slight editing due to type differences,
  17. * from a subset of ../lib9/utf/rune.c
  18. */
  19. package runtime
  20. const (
  21. bit1 = 7
  22. bitx = 6
  23. bit2 = 5
  24. bit3 = 4
  25. bit4 = 3
  26. bit5 = 2
  27. t1 = ((1 << (bit1 + 1)) - 1) ^ 0xFF /* 0000 0000 */
  28. tx = ((1 << (bitx + 1)) - 1) ^ 0xFF /* 1000 0000 */
  29. t2 = ((1 << (bit2 + 1)) - 1) ^ 0xFF /* 1100 0000 */
  30. t3 = ((1 << (bit3 + 1)) - 1) ^ 0xFF /* 1110 0000 */
  31. t4 = ((1 << (bit4 + 1)) - 1) ^ 0xFF /* 1111 0000 */
  32. t5 = ((1 << (bit5 + 1)) - 1) ^ 0xFF /* 1111 1000 */
  33. rune1 = (1 << (bit1 + 0*bitx)) - 1 /* 0000 0000 0111 1111 */
  34. rune2 = (1 << (bit2 + 1*bitx)) - 1 /* 0000 0111 1111 1111 */
  35. rune3 = (1 << (bit3 + 2*bitx)) - 1 /* 1111 1111 1111 1111 */
  36. rune4 = (1 << (bit4 + 3*bitx)) - 1 /* 0001 1111 1111 1111 1111 1111 */
  37. maskx = (1 << bitx) - 1 /* 0011 1111 */
  38. testx = maskx ^ 0xFF /* 1100 0000 */
  39. runeerror = 0xFFFD
  40. runeself = 0x80
  41. surrogateMin = 0xD800
  42. surrogateMax = 0xDFFF
  43. bad = runeerror
  44. runemax = 0x10FFFF /* maximum rune value */
  45. )
  46. /*
  47. * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
  48. * This is a slower but "safe" version of the old chartorune
  49. * that works on strings that are not necessarily null-terminated.
  50. *
  51. * If you know for sure that your string is null-terminated,
  52. * chartorune will be a bit faster.
  53. *
  54. * It is guaranteed not to attempt to access "length"
  55. * past the incoming pointer. This is to avoid
  56. * possible access violations. If the string appears to be
  57. * well-formed but incomplete (i.e., to get the whole Rune
  58. * we'd need to read past str+length) then we'll set the Rune
  59. * to Bad and return 0.
  60. *
  61. * Note that if we have decoding problems for other
  62. * reasons, we return 1 instead of 0.
  63. */
  64. func charntorune(s string) (rune, int) {
  65. /* When we're not allowed to read anything */
  66. if len(s) <= 0 {
  67. return bad, 1
  68. }
  69. /*
  70. * one character sequence (7-bit value)
  71. * 00000-0007F => T1
  72. */
  73. c := s[0]
  74. if c < tx {
  75. return rune(c), 1
  76. }
  77. // If we can't read more than one character we must stop
  78. if len(s) <= 1 {
  79. return bad, 1
  80. }
  81. /*
  82. * two character sequence (11-bit value)
  83. * 0080-07FF => t2 tx
  84. */
  85. c1 := s[1] ^ tx
  86. if (c1 & testx) != 0 {
  87. return bad, 1
  88. }
  89. if c < t3 {
  90. if c < t2 {
  91. return bad, 1
  92. }
  93. l := ((rune(c) << bitx) | rune(c1)) & rune2
  94. if l <= rune1 {
  95. return bad, 1
  96. }
  97. return l, 2
  98. }
  99. // If we can't read more than two characters we must stop
  100. if len(s) <= 2 {
  101. return bad, 1
  102. }
  103. /*
  104. * three character sequence (16-bit value)
  105. * 0800-FFFF => t3 tx tx
  106. */
  107. c2 := s[2] ^ tx
  108. if (c2 & testx) != 0 {
  109. return bad, 1
  110. }
  111. if c < t4 {
  112. l := ((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) & rune3
  113. if l <= rune2 {
  114. return bad, 1
  115. }
  116. if surrogateMin <= l && l <= surrogateMax {
  117. return bad, 1
  118. }
  119. return l, 3
  120. }
  121. if len(s) <= 3 {
  122. return bad, 1
  123. }
  124. /*
  125. * four character sequence (21-bit value)
  126. * 10000-1FFFFF => t4 tx tx tx
  127. */
  128. c3 := s[3] ^ tx
  129. if (c3 & testx) != 0 {
  130. return bad, 1
  131. }
  132. if c < t5 {
  133. l := ((((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) << bitx) | rune(c3)) & rune4
  134. if l <= rune3 || l > runemax {
  135. return bad, 1
  136. }
  137. return l, 4
  138. }
  139. // Support for 5-byte or longer UTF-8 would go here, but
  140. // since we don't have that, we'll just return bad.
  141. return bad, 1
  142. }
  143. // runetochar converts r to bytes and writes the result to str.
  144. // returns the number of bytes generated.
  145. func runetochar(str []byte, r rune) int {
  146. /* runes are signed, so convert to unsigned for range check. */
  147. c := uint32(r)
  148. /*
  149. * one character sequence
  150. * 00000-0007F => 00-7F
  151. */
  152. if c <= rune1 {
  153. str[0] = byte(c)
  154. return 1
  155. }
  156. /*
  157. * two character sequence
  158. * 0080-07FF => t2 tx
  159. */
  160. if c <= rune2 {
  161. str[0] = byte(t2 | (c >> (1 * bitx)))
  162. str[1] = byte(tx | (c & maskx))
  163. return 2
  164. }
  165. /*
  166. * If the rune is out of range or a surrogate half, convert it to the error rune.
  167. * Do this test here because the error rune encodes to three bytes.
  168. * Doing it earlier would duplicate work, since an out of range
  169. * rune wouldn't have fit in one or two bytes.
  170. */
  171. if c > runemax {
  172. c = runeerror
  173. }
  174. if surrogateMin <= c && c <= surrogateMax {
  175. c = runeerror
  176. }
  177. /*
  178. * three character sequence
  179. * 0800-FFFF => t3 tx tx
  180. */
  181. if c <= rune3 {
  182. str[0] = byte(t3 | (c >> (2 * bitx)))
  183. str[1] = byte(tx | ((c >> (1 * bitx)) & maskx))
  184. str[2] = byte(tx | (c & maskx))
  185. return 3
  186. }
  187. /*
  188. * four character sequence (21-bit value)
  189. * 10000-1FFFFF => t4 tx tx tx
  190. */
  191. str[0] = byte(t4 | (c >> (3 * bitx)))
  192. str[1] = byte(tx | ((c >> (2 * bitx)) & maskx))
  193. str[2] = byte(tx | ((c >> (1 * bitx)) & maskx))
  194. str[3] = byte(tx | (c & maskx))
  195. return 4
  196. }