BLI_string_utf8_test.cc 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. /* Apache License, Version 2.0 */
  2. #include "testing/testing.h"
  3. extern "C" {
  4. #include "BLI_utildefines.h"
  5. #include "BLI_string.h"
  6. #include "BLI_string_utf8.h"
  7. }
  8. /* Note that 'common' utf-8 variants of string functions (like copy, etc.) are tested in
  9. * BLI_string_test.cc. However, the tests below are specific utf-8 conformance ones, and since they
  10. * take up quite a few lines, they deserve their own file. */
  11. /* -------------------------------------------------------------------- */
  12. /* stubs */
  extern "C" {

  /* Each stub is declared right before it is defined (presumably so the definition is never the
   * first declaration the compiler sees -- TODO confirm the warning this is meant to avoid). */

  /* Stub: takes a single wide character, ignores it, and always reports 0. */
  int mk_wcwidth(wchar_t ucs);
  int mk_wcwidth(wchar_t ucs)
  {
    (void)ucs;
    return 0;
  }

  /* Stub: takes a wide string and a count, ignores both, and always reports 0. */
  int mk_wcswidth(const wchar_t *pwcs, size_t n);
  int mk_wcswidth(const wchar_t *pwcs, size_t n)
  {
    (void)pwcs;
    (void)n;
    return 0;
  }

  }
  25. /* -------------------------------------------------------------------- */
  26. /* tests */
  27. /* Breaking strings is confusing here, prefer over-long lines. */
  28. /* clang-format off */
  29. /* Each test is made of a 79-byte (80 with the NUL terminator) string to test, the expected string
  30. * after stripping invalid utf8 bytes, and a single-byte string whose byte value is the expected number of errors.
  31. *
  32. * Based on utf-8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt)
  33. * by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
  34. */
  /* Test table: each row is {input string, expected string after stripping, one-byte string whose byte value is the expected error count}. The final all-NULL row is the terminator the test loop stops on. */
  35. const char *utf8_invalid_tests[][3] = {
  36. // 1 Some correct UTF-8 text
  37. {"You should see the Greek word 'kosme': \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\" |",
  38. "You should see the Greek word 'kosme': \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\" |", "\x00"},
  39. // 2 Boundary condition test cases
  40. // Note that those will pass for us, those are not erroneous unicode code points
  41. // (aside from \x00, which is only valid as string terminator).
  42. // 2.1 First possible sequence of a certain length
  43. {"2.1.1 1 byte (U-00000000): \"\x00\" |",
  44. "2.1.1 1 byte (U-00000000): \"\" |", "\x01"},
  45. {"2.1.2 2 bytes (U-00000080): \"\xc2\x80\" |",
  46. "2.1.2 2 bytes (U-00000080): \"\xc2\x80\" |", "\x00"},
  47. {"2.1.3 3 bytes (U-00000800): \"\xe0\xa0\x80\" |",
  48. "2.1.3 3 bytes (U-00000800): \"\xe0\xa0\x80\" |", "\x00"},
  49. {"2.1.4 4 bytes (U-00010000): \"\xf0\x90\x80\x80\" |",
  50. "2.1.4 4 bytes (U-00010000): \"\xf0\x90\x80\x80\" |", "\x00"},
  51. {"2.1.5 5 bytes (U-00200000): \"\xf8\x88\x80\x80\x80\" |",
  52. "2.1.5 5 bytes (U-00200000): \"\xf8\x88\x80\x80\x80\" |", "\x00"},
  53. {"2.1.6 6 bytes (U-04000000): \"\xfc\x84\x80\x80\x80\x80\" |",
  54. "2.1.6 6 bytes (U-04000000): \"\xfc\x84\x80\x80\x80\x80\" |", "\x00"},
  55. // 2.2 Last possible sequence of a certain length
  56. {"2.2.1 1 byte (U-0000007F): \"\x7f\" |",
  57. "2.2.1 1 byte (U-0000007F): \"\x7f\" |", "\x00"},
  58. {"2.2.2 2 bytes (U-000007FF): \"\xdf\xbf\" |",
  59. "2.2.2 2 bytes (U-000007FF): \"\xdf\xbf\" |", "\x00"},
  60. {"2.2.3 3 bytes (U-0000FFFF): \"\xef\xbf\xbf\" |",
  61. "2.2.3 3 bytes (U-0000FFFF): \"\" |", "\x03"}, /* matches one of 5.3 sequences... */
  62. {"2.2.4 4 bytes (U-001FFFFF): \"\xf7\xbf\xbf\xbf\" |",
  63. "2.2.4 4 bytes (U-001FFFFF): \"\xf7\xbf\xbf\xbf\" |", "\x00"},
  64. {"2.2.5 5 bytes (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\xbf\" |",
  65. "2.2.5 5 bytes (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\xbf\" |", "\x00"},
  66. {"2.2.6 6 bytes (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\xbf\" |",
  67. "2.2.6 6 bytes (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\xbf\" |", "\x00"},
  68. // 2.3 Other boundary conditions
  69. {"2.3.1 U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\" |",
  70. "2.3.1 U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\" |", "\x00"},
  71. {"2.3.2 U-0000E000 = ee 80 80 = \"\xee\x80\x80\" |",
  72. "2.3.2 U-0000E000 = ee 80 80 = \"\xee\x80\x80\" |", "\x00"},
  73. {"2.3.3 U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\" |",
  74. "2.3.3 U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\" |", "\x00"},
  75. {"2.3.4 U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\" |",
  76. "2.3.4 U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\" |", "\x00"},
  77. {"2.3.5 U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\" |",
  78. "2.3.5 U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\" |", "\x00"},
  79. // 3 Malformed sequences
  80. // 3.1 Unexpected continuation bytes
  81. // Each unexpected continuation byte should be separately signaled as a malformed sequence of its own.
  82. {"3.1.1 First continuation byte 0x80: \"\x80\" |",
  83. "3.1.1 First continuation byte 0x80: \"\" |", "\x01"},
  84. {"3.1.2 Last continuation byte 0xbf: \"\xbf\" |",
  85. "3.1.2 Last continuation byte 0xbf: \"\" |", "\x01"},
  86. {"3.1.3 2 continuation bytes: \"\x80\xbf\" |",
  87. "3.1.3 2 continuation bytes: \"\" |", "\x02"},
  88. {"3.1.4 3 continuation bytes: \"\x80\xbf\x80\" |",
  89. "3.1.4 3 continuation bytes: \"\" |", "\x03"},
  90. {"3.1.5 4 continuation bytes: \"\x80\xbf\x80\xbf\" |",
  91. "3.1.5 4 continuation bytes: \"\" |", "\x04"},
  92. {"3.1.6 5 continuation bytes: \"\x80\xbf\x80\xbf\x80\" |",
  93. "3.1.6 5 continuation bytes: \"\" |", "\x05"},
  94. {"3.1.7 6 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\" |",
  95. "3.1.7 6 continuation bytes: \"\" |", "\x06"},
  96. {"3.1.8 7 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\x80\" |",
  97. "3.1.8 7 continuation bytes: \"\" |", "\x07"},
  98. // 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
  99. {"3.1.9 \"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
  100. "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
  101. "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
  102. "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\" |",
  103. "3.1.9 \"\" |", "\x40"},
  104. // 3.2 Lonely start characters
  105. // 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character:
  106. {"3.2.1 \"\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf "
  107. "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \" |",
  108. "3.2.1 \" \" |", "\x20"},
  109. // 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character:
  110. {"3.2.2 \"\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \" |",
  111. "3.2.2 \" \" |", "\x10"},
  112. // 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character:
  113. {"3.2.3 \"\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \" |",
  114. "3.2.3 \" \" |", "\x08"},
  115. // 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character:
  116. {"3.2.4 \"\xf8 \xf9 \xfa \xfb \" |",
  117. "3.2.4 \" \" |", "\x04"},
  118. // 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character (the strings below are still labelled 3.2.4):
  119. {"3.2.4 \"\xfc \xfd \" |",
  120. "3.2.4 \" \" |", "\x02"},
  121. // 3.3 Sequences with last continuation byte missing
  122. // All bytes of an incomplete sequence should be signaled as a single malformed sequence,
  123. // i.e., you should see only a single replacement character in each of the next 10 tests.
  124. // (Characters as in section 2)
  125. {"3.3.1 2-byte sequence with last byte missing (U+0000): \"\xc0\" |",
  126. "3.3.1 2-byte sequence with last byte missing (U+0000): \"\" |", "\x01"},
  127. {"3.3.2 3-byte sequence with last byte missing (U+0000): \"\xe0\x80\" |",
  128. "3.3.2 3-byte sequence with last byte missing (U+0000): \"\" |", "\x02"},
  129. {"3.3.3 4-byte sequence with last byte missing (U+0000): \"\xf0\x80\x80\" |",
  130. "3.3.3 4-byte sequence with last byte missing (U+0000): \"\" |", "\x03"},
  131. {"3.3.4 5-byte sequence with last byte missing (U+0000): \"\xf8\x80\x80\x80\" |",
  132. "3.3.4 5-byte sequence with last byte missing (U+0000): \"\" |", "\x04"},
  133. {"3.3.5 6-byte sequence with last byte missing (U+0000): \"\xfc\x80\x80\x80\x80\" |",
  134. "3.3.5 6-byte sequence with last byte missing (U+0000): \"\" |", "\x05"},
  135. {"3.3.6 2-byte sequence with last byte missing (U-000007FF): \"\xdf\" |",
  136. "3.3.6 2-byte sequence with last byte missing (U-000007FF): \"\" |", "\x01"},
  137. {"3.3.7 3-byte sequence with last byte missing (U-0000FFFF): \"\xef\xbf\" |",
  138. "3.3.7 3-byte sequence with last byte missing (U-0000FFFF): \"\" |", "\x02"},
  139. {"3.3.8 4-byte sequence with last byte missing (U-001FFFFF): \"\xf7\xbf\xbf\" |",
  140. "3.3.8 4-byte sequence with last byte missing (U-001FFFFF): \"\" |", "\x03"},
  141. {"3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\" |",
  142. "3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): \"\" |", "\x04"},
  143. {"3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\" |",
  144. "3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"\" |", "\x05"},
  145. // 3.4 Concatenation of incomplete sequences
  146. // All the 10 sequences of 3.3 concatenated, you should see 10 malformed sequences being signaled:
  147. {"3.4 \"\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80"
  148. "\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf\""
  149. " |",
  150. "3.4 \"\" |", "\x1e"},
  151. // 3.5 Impossible bytes
  152. // The following two bytes cannot appear in a correct UTF-8 string
  153. {"3.5.1 fe = \"\xfe\" |",
  154. "3.5.1 fe = \"\" |", "\x01"},
  155. {"3.5.2 ff = \"\xff\" |",
  156. "3.5.2 ff = \"\" |", "\x01"},
  157. {"3.5.3 fe fe ff ff = \"\xfe\xfe\xff\xff\" |",
  158. "3.5.3 fe fe ff ff = \"\" |", "\x04"},
  159. // 4 Overlong sequences
  160. // The following sequences are not malformed according to the letter of the Unicode 2.0 standard.
  161. // However, they are longer than necessary and a correct UTF-8 encoder is not allowed to produce them.
  162. // A "safe UTF-8 decoder" should reject them just like malformed sequences for two reasons:
  163. // (1) It helps to debug applications if overlong sequences are not treated as valid representations
  164. // of characters, because this helps to spot problems more quickly. (2) Overlong sequences provide
  165. // alternative representations of characters, that could maliciously be used to bypass filters that check
  166. // only for ASCII characters. For instance, a 2-byte encoded line feed (LF) would not be caught by a
  167. // line counter that counts only 0x0a bytes, but it would still be processed as a line feed by an unsafe
  168. // UTF-8 decoder later in the pipeline. From a security point of view, ASCII compatibility of UTF-8
  169. // sequences means also, that ASCII characters are *only* allowed to be represented by ASCII bytes
  170. // in the range 0x00-0x7f. To ensure this aspect of ASCII compatibility, use only "safe UTF-8 decoders"
  171. // that reject overlong UTF-8 sequences for which a shorter encoding exists.
  172. //
  173. // 4.1 Examples of an overlong ASCII character
  174. // With a safe UTF-8 decoder, all of the following five overlong representations of the ASCII character
  175. // slash ("/") should be rejected like a malformed UTF-8 sequence, for instance by substituting it with
  176. // a replacement character. If you see a slash below, you do not have a safe UTF-8 decoder!
  177. {"4.1.1 U+002F = c0 af = \"\xc0\xaf\" |",
  178. "4.1.1 U+002F = c0 af = \"\" |", "\x02"},
  179. {"4.1.2 U+002F = e0 80 af = \"\xe0\x80\xaf\" |",
  180. "4.1.2 U+002F = e0 80 af = \"\" |", "\x03"},
  181. {"4.1.3 U+002F = f0 80 80 af = \"\xf0\x80\x80\xaf\" |",
  182. "4.1.3 U+002F = f0 80 80 af = \"\" |", "\x04"},
  183. {"4.1.4 U+002F = f8 80 80 80 af = \"\xf8\x80\x80\x80\xaf\" |",
  184. "4.1.4 U+002F = f8 80 80 80 af = \"\" |", "\x05"},
  185. {"4.1.5 U+002F = fc 80 80 80 80 af = \"\xfc\x80\x80\x80\x80\xaf\" |",
  186. "4.1.5 U+002F = fc 80 80 80 80 af = \"\" |", "\x06"},
  187. // 4.2 Maximum overlong sequences
  188. // Below you see the highest Unicode value that is still resulting in an overlong sequence if represented
  189. // with the given number of bytes. This is a boundary test for safe UTF-8 decoders. All five characters
  190. // should be rejected like malformed UTF-8 sequences.
  191. {"4.2.1 U-0000007F = c1 bf = \"\xc1\xbf\" |",
  192. "4.2.1 U-0000007F = c1 bf = \"\" |", "\x02"},
  193. {"4.2.2 U-000007FF = e0 9f bf = \"\xe0\x9f\xbf\" |",
  194. "4.2.2 U-000007FF = e0 9f bf = \"\" |", "\x03"},
  195. {"4.2.3 U-0000FFFF = f0 8f bf bf = \"\xf0\x8f\xbf\xbf\" |",
  196. "4.2.3 U-0000FFFF = f0 8f bf bf = \"\" |", "\x04"},
  197. {"4.2.4 U-001FFFFF = f8 87 bf bf bf = \"\xf8\x87\xbf\xbf\xbf\" |",
  198. "4.2.4 U-001FFFFF = f8 87 bf bf bf = \"\" |", "\x05"},
  199. {"4.2.5 U+0000 = fc 83 bf bf bf bf = \"\xfc\x83\xbf\xbf\xbf\xbf\" |",
  200. "4.2.5 U+0000 = fc 83 bf bf bf bf = \"\" |", "\x06"},
  201. // 4.3 Overlong representation of the NUL character
  202. // The following five sequences should also be rejected like malformed UTF-8 sequences and should not be
  203. // treated like the ASCII NUL character.
  204. {"4.3.1 U+0000 = c0 80 = \"\xc0\x80\" |",
  205. "4.3.1 U+0000 = c0 80 = \"\" |", "\x02"},
  206. {"4.3.2 U+0000 = e0 80 80 = \"\xe0\x80\x80\" |",
  207. "4.3.2 U+0000 = e0 80 80 = \"\" |", "\x03"},
  208. {"4.3.3 U+0000 = f0 80 80 80 = \"\xf0\x80\x80\x80\" |",
  209. "4.3.3 U+0000 = f0 80 80 80 = \"\" |", "\x04"},
  210. {"4.3.4 U+0000 = f8 80 80 80 80 = \"\xf8\x80\x80\x80\x80\" |",
  211. "4.3.4 U+0000 = f8 80 80 80 80 = \"\" |", "\x05"},
  212. {"4.3.5 U+0000 = fc 80 80 80 80 80 = \"\xfc\x80\x80\x80\x80\x80\" |",
  213. "4.3.5 U+0000 = fc 80 80 80 80 80 = \"\" |", "\x06"},
  214. // 5 Illegal code positions
  215. // The following UTF-8 sequences should be rejected like malformed sequences, because they never represent
  216. // valid ISO 10646 characters and a UTF-8 decoder that accepts them might introduce security problems
  217. // comparable to overlong UTF-8 sequences.
  218. // 5.1 Single UTF-16 surrogates
  219. {"5.1.1 U+D800 = ed a0 80 = \"\xed\xa0\x80\" |",
  220. "5.1.1 U+D800 = ed a0 80 = \"\" |", "\x03"},
  221. {"5.1.2 U+DB7F = ed ad bf = \"\xed\xad\xbf\" |",
  222. "5.1.2 U+DB7F = ed ad bf = \"\" |", "\x03"},
  223. {"5.1.3 U+DB80 = ed ae 80 = \"\xed\xae\x80\" |",
  224. "5.1.3 U+DB80 = ed ae 80 = \"\" |", "\x03"},
  225. {"5.1.4 U+DBFF = ed af bf = \"\xed\xaf\xbf\" |",
  226. "5.1.4 U+DBFF = ed af bf = \"\" |", "\x03"},
  227. {"5.1.5 U+DC00 = ed b0 80 = \"\xed\xb0\x80\" |",
  228. "5.1.5 U+DC00 = ed b0 80 = \"\" |", "\x03"},
  229. {"5.1.6 U+DF80 = ed be 80 = \"\xed\xbe\x80\" |",
  230. "5.1.6 U+DF80 = ed be 80 = \"\" |", "\x03"},
  231. {"5.1.7 U+DFFF = ed bf bf = \"\xed\xbf\xbf\" |",
  232. "5.1.7 U+DFFF = ed bf bf = \"\" |", "\x03"},
  233. // 5.2 Paired UTF-16 surrogates
  234. {"5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = \"\xed\xa0\x80\xed\xb0\x80\" |",
  235. "5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = \"\" |", "\x06"},
  236. {"5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = \"\xed\xa0\x80\xed\xbf\xbf\" |",
  237. "5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = \"\" |", "\x06"},
  238. {"5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = \"\xed\xad\xbf\xed\xb0\x80\" |",
  239. "5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = \"\" |", "\x06"},
  240. {"5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = \"\xed\xad\xbf\xed\xbf\xbf\" |",
  241. "5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = \"\" |", "\x06"},
  242. {"5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"\xed\xae\x80\xed\xb0\x80\" |",
  243. "5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"\" |", "\x06"},
  244. {"5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = \"\xed\xae\x80\xed\xbf\xbf\" |",
  245. "5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = \"\" |", "\x06"},
  246. {"5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = \"\xed\xaf\xbf\xed\xb0\x80\" |",
  247. "5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = \"\" |", "\x06"},
  248. {"5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = \"\xed\xaf\xbf\xed\xbf\xbf\" |",
  249. "5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = \"\" |", "\x06"},
  250. // 5.3 Noncharacter code positions
  251. // The following "noncharacters" are "reserved for internal use" by applications, and according to older versions
  252. // of the Unicode Standard "should never be interchanged". Unicode Corrigendum #9 dropped the latter restriction.
  253. // Nevertheless, their presence in incoming UTF-8 data can remain a potential security risk, depending
  254. // on what use is made of these codes subsequently. Examples of such internal use:
  255. // - Some file APIs with 16-bit characters may use the integer value -1 = U+FFFF to signal
  256. // an end-of-file (EOF) or error condition.
  257. // - In some UTF-16 receivers, code point U+FFFE might trigger a byte-swap operation
  258. // (to convert between UTF-16LE and UTF-16BE).
  259. // With such internal use of noncharacters, it may be desirable and safer to block those code points in
  260. // UTF-8 decoders, as they should never occur legitimately in incoming UTF-8 data, and could trigger
  261. // unsafe behavior in subsequent processing.
  262. //
  263. // Particularly problematic noncharacters in 16-bit applications:
  264. {"5.3.1 U+FFFE = ef bf be = \"\xef\xbf\xbe\" |",
  265. "5.3.1 U+FFFE = ef bf be = \"\" |", "\x03"},
  266. {"5.3.2 U+FFFF = ef bf bf = \"\xef\xbf\xbf\" |",
  267. "5.3.2 U+FFFF = ef bf bf = \"\" |", "\x03"},
  268. /* For now, we ignore those, they do not seem to be crucial anyway... */
  269. // 5.3.3 U+FDD0 .. U+FDEF
  270. // 5.3.4 U+nFFFE U+nFFFF (for n = 1..10)
  271. {NULL, NULL, NULL},
  272. };
  273. /* clang-format on */
  274. /* BLI_utf8_invalid_strip (and indirectly, BLI_utf8_invalid_byte). */
  275. TEST(string, Utf8InvalidBytes)
  276. {
  277. for (int i = 0; utf8_invalid_tests[i][0] != NULL; i++) {
  278. const char *tst = utf8_invalid_tests[i][0];
  279. const char *tst_stripped = utf8_invalid_tests[i][1];
  280. const int num_errors = (int)utf8_invalid_tests[i][2][0];
  281. char buff[80];
  282. memcpy(buff, tst, sizeof(buff));
  283. const int num_errors_found = BLI_utf8_invalid_strip(buff, sizeof(buff) - 1);
  284. printf("[%02d] -> [%02d] \"%s\" -> \"%s\"\n", num_errors, num_errors_found, tst, buff);
  285. EXPECT_EQ(num_errors_found, num_errors);
  286. EXPECT_STREQ(buff, tst_stripped);
  287. }
  288. }