u8-mbtouc-aux.c 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. /* Conversion UTF-8 to UCS-4.
  2. Copyright (C) 2001-2002, 2006-2007, 2009-2010 Free Software Foundation, Inc.
  3. Written by Bruno Haible <bruno@clisp.org>, 2001.
  4. This program is free software: you can redistribute it and/or modify it
  5. under the terms of the GNU Lesser General Public License as published
  6. by the Free Software Foundation; either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "unistr.h"
  17. #if defined IN_LIBUNISTRING || HAVE_INLINE
  /* Decode the UTF-8 character that starts at S, of which N bytes are
     available, and store the resulting code point in *PUC.

     S[0] is read unconditionally, so callers are expected to pass N >= 1.

     Return value:
       2, 3 or 4  A well-formed sequence of that many bytes was decoded.
       N          The lead byte announces a sequence longer than the N bytes
                  available.  *PUC is set to U+FFFD.  The trail bytes that
                  are available are not inspected in this case.
       1          The sequence is invalid.  *PUC is set to U+FFFD.

     Only 2-, 3- and 4-byte forms are accepted.  Every other lead byte
     (below 0xc2, or 0xf8 and above) is reported as invalid.  */
  int
  u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
  {
    const uint8_t lead = s[0];
    size_t need;

    /* Sequence length announced by the lead byte; 0 means "not a usable
       lead byte".  */
    if (lead < 0xc2 || lead >= 0xf8)
      need = 0;
    else if (lead < 0xe0)
      need = 2;
    else if (lead < 0xf0)
      need = 3;
    else
      need = 4;

    /* Incomplete multibyte character: consume everything that is left.  */
    if (need != 0 && n < need)
      {
        *puc = 0xfffd;
        return (int) n;
      }

    /* A trail byte has the bit pattern 10xxxxxx, i.e. (b & 0xc0) == 0x80,
       and contributes its low six bits.  Each 'break' below leads to the
       common "invalid multibyte character" exit.  */
    switch (need)
      {
      case 2:
        if ((s[1] & 0xc0) != 0x80)
          break;
        *puc = ((unsigned int) (lead & 0x1f) << 6)
               | (unsigned int) (s[1] & 0x3f);
        return 2;

      case 3:
        if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80)
          break;
        /* 0xe0 followed by less than 0xa0 would encode a value below
           U+0800 (overlong form).  */
        if (lead == 0xe0 && s[1] < 0xa0)
          break;
        /* 0xed followed by 0xa0 or more would encode U+D800..U+DFFF.  */
        if (lead == 0xed && s[1] >= 0xa0)
          break;
        *puc = ((unsigned int) (lead & 0x0f) << 12)
               | ((unsigned int) (s[1] & 0x3f) << 6)
               | (unsigned int) (s[2] & 0x3f);
        return 3;

      case 4:
        if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80
            || (s[3] & 0xc0) != 0x80)
          break;
        /* 0xf0 followed by less than 0x90 would encode a value below
           U+10000 (overlong form).  */
        if (lead == 0xf0 && s[1] < 0x90)
          break;
        /* Anything past 0xf4 0x8f ... would encode a value above
           U+10FFFF.  */
        if (lead > 0xf4 || (lead == 0xf4 && s[1] >= 0x90))
          break;
        *puc = ((unsigned int) (lead & 0x07) << 18)
               | ((unsigned int) (s[1] & 0x3f) << 12)
               | ((unsigned int) (s[2] & 0x3f) << 6)
               | (unsigned int) (s[3] & 0x3f);
        return 4;

      default:
        break;
      }

    /* invalid multibyte character */
    *puc = 0xfffd;
    return 1;
  }
  149. #endif