CleanUpTest.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
  19. if( php_sapi_name() != 'cli' ) {
  20. die( "Run me from the command line please.\n" );
  21. }
  22. /** */
  23. if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
  24. dl( 'php_utfnormal.so' );
  25. }
  26. #ini_set( 'memory_limit', '40M' );
  27. require_once 'PHPUnit/Framework.php';
  28. require_once 'PHPUnit/TextUI/TestRunner.php';
  29. require_once 'UtfNormal.php';
/**
 * Additional tests for the UtfNormal::cleanUp() function, including
 * regression checks for known problems.
 * Requires PHPUnit.
 *
 * @ingroup UtfNormal
 * @private
 */
  38. class CleanUpTest extends PHPUnit_Framework_TestCase {
  39. /** @todo document */
  40. function setUp() {
  41. }
  42. /** @todo document */
  43. function tearDown() {
  44. }
  45. /** @todo document */
  46. function testAscii() {
  47. $text = 'This is plain ASCII text.';
  48. $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
  49. }
  50. /** @todo document */
  51. function testNull() {
  52. $text = "a \x00 null";
  53. $expect = "a \xef\xbf\xbd null";
  54. $this->assertEquals(
  55. bin2hex( $expect ),
  56. bin2hex( UtfNormal::cleanUp( $text ) ) );
  57. }
  58. /** @todo document */
  59. function testLatin() {
  60. $text = "L'\xc3\xa9cole";
  61. $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
  62. }
  63. /** @todo document */
  64. function testLatinNormal() {
  65. $text = "L'e\xcc\x81cole";
  66. $expect = "L'\xc3\xa9cole";
  67. $this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
  68. }
  69. /**
  70. * This test is *very* expensive!
  71. * @todo document
  72. */
  73. function XtestAllChars() {
  74. $rep = UTF8_REPLACEMENT;
  75. global $utfCanonicalComp, $utfCanonicalDecomp;
  76. for( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
  77. $char = codepointToUtf8( $i );
  78. $clean = UtfNormal::cleanUp( $char );
  79. $x = sprintf( "%04X", $i );
  80. if( $i % 0x1000 == 0 ) echo "U+$x\n";
  81. if( $i == 0x0009 ||
  82. $i == 0x000a ||
  83. $i == 0x000d ||
  84. ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
  85. ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
  86. ($i > 0xffff && $i <= UNICODE_MAX ) ) {
  87. if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
  88. $comp = UtfNormal::NFC( $char );
  89. $this->assertEquals(
  90. bin2hex( $comp ),
  91. bin2hex( $clean ),
  92. "U+$x should be decomposed" );
  93. } else {
  94. $this->assertEquals(
  95. bin2hex( $char ),
  96. bin2hex( $clean ),
  97. "U+$x should be intact" );
  98. }
  99. } else {
  100. $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
  101. }
  102. }
  103. }
  104. /** @todo document */
  105. function testAllBytes() {
  106. $this->doTestBytes( '', '' );
  107. $this->doTestBytes( 'x', '' );
  108. $this->doTestBytes( '', 'x' );
  109. $this->doTestBytes( 'x', 'x' );
  110. }
  111. /** @todo document */
  112. function doTestBytes( $head, $tail ) {
  113. for( $i = 0x0; $i < 256; $i++ ) {
  114. $char = $head . chr( $i ) . $tail;
  115. $clean = UtfNormal::cleanUp( $char );
  116. $x = sprintf( "%02X", $i );
  117. if( $i == 0x0009 ||
  118. $i == 0x000a ||
  119. $i == 0x000d ||
  120. ($i > 0x001f && $i < 0x80) ) {
  121. $this->assertEquals(
  122. bin2hex( $char ),
  123. bin2hex( $clean ),
  124. "ASCII byte $x should be intact" );
  125. if( $char != $clean ) return;
  126. } else {
  127. $norm = $head . UTF8_REPLACEMENT . $tail;
  128. $this->assertEquals(
  129. bin2hex( $norm ),
  130. bin2hex( $clean ),
  131. "Forbidden byte $x should be rejected" );
  132. if( $norm != $clean ) return;
  133. }
  134. }
  135. }
  136. /** @todo document */
  137. function testDoubleBytes() {
  138. $this->doTestDoubleBytes( '', '' );
  139. $this->doTestDoubleBytes( 'x', '' );
  140. $this->doTestDoubleBytes( '', 'x' );
  141. $this->doTestDoubleBytes( 'x', 'x' );
  142. }
  143. /**
  144. * @todo document
  145. */
  146. function doTestDoubleBytes( $head, $tail ) {
  147. for( $first = 0xc0; $first < 0x100; $first++ ) {
  148. for( $second = 0x80; $second < 0x100; $second++ ) {
  149. $char = $head . chr( $first ) . chr( $second ) . $tail;
  150. $clean = UtfNormal::cleanUp( $char );
  151. $x = sprintf( "%02X,%02X", $first, $second );
  152. if( $first > 0xc1 &&
  153. $first < 0xe0 &&
  154. $second < 0xc0 ) {
  155. $norm = UtfNormal::NFC( $char );
  156. $this->assertEquals(
  157. bin2hex( $norm ),
  158. bin2hex( $clean ),
  159. "Pair $x should be intact" );
  160. if( $norm != $clean ) return;
  161. } elseif( $first > 0xfd || $second > 0xbf ) {
  162. # fe and ff are not legal head bytes -- expect two replacement chars
  163. $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
  164. $this->assertEquals(
  165. bin2hex( $norm ),
  166. bin2hex( $clean ),
  167. "Forbidden pair $x should be rejected" );
  168. if( $norm != $clean ) return;
  169. } else {
  170. $norm = $head . UTF8_REPLACEMENT . $tail;
  171. $this->assertEquals(
  172. bin2hex( $norm ),
  173. bin2hex( $clean ),
  174. "Forbidden pair $x should be rejected" );
  175. if( $norm != $clean ) return;
  176. }
  177. }
  178. }
  179. }
  180. /** @todo document */
  181. function testTripleBytes() {
  182. $this->doTestTripleBytes( '', '' );
  183. $this->doTestTripleBytes( 'x', '' );
  184. $this->doTestTripleBytes( '', 'x' );
  185. $this->doTestTripleBytes( 'x', 'x' );
  186. }
  187. /** @todo document */
  188. function doTestTripleBytes( $head, $tail ) {
  189. for( $first = 0xc0; $first < 0x100; $first++ ) {
  190. for( $second = 0x80; $second < 0x100; $second++ ) {
  191. #for( $third = 0x80; $third < 0x100; $third++ ) {
  192. for( $third = 0x80; $third < 0x81; $third++ ) {
  193. $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
  194. $clean = UtfNormal::cleanUp( $char );
  195. $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
  196. if( $first >= 0xe0 &&
  197. $first < 0xf0 &&
  198. $second < 0xc0 &&
  199. $third < 0xc0 ) {
  200. if( $first == 0xe0 && $second < 0xa0 ) {
  201. $this->assertEquals(
  202. bin2hex( $head . UTF8_REPLACEMENT . $tail ),
  203. bin2hex( $clean ),
  204. "Overlong triplet $x should be rejected" );
  205. } elseif( $first == 0xed &&
  206. ( chr( $first ) . chr( $second ) . chr( $third )) >= UTF8_SURROGATE_FIRST ) {
  207. $this->assertEquals(
  208. bin2hex( $head . UTF8_REPLACEMENT . $tail ),
  209. bin2hex( $clean ),
  210. "Surrogate triplet $x should be rejected" );
  211. } else {
  212. $this->assertEquals(
  213. bin2hex( UtfNormal::NFC( $char ) ),
  214. bin2hex( $clean ),
  215. "Triplet $x should be intact" );
  216. }
  217. } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
  218. $this->assertEquals(
  219. bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
  220. bin2hex( $clean ),
  221. "Valid 2-byte $x + broken tail" );
  222. } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
  223. $this->assertEquals(
  224. bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
  225. bin2hex( $clean ),
  226. "Broken head + valid 2-byte $x" );
  227. } elseif( ( $first > 0xfd || $second > 0xfd ) &&
  228. ( ( $second > 0xbf && $third > 0xbf ) ||
  229. ( $second < 0xc0 && $third < 0xc0 ) ||
  230. ( $second > 0xfd ) ||
  231. ( $third > 0xfd ) ) ) {
  232. # fe and ff are not legal head bytes -- expect three replacement chars
  233. $this->assertEquals(
  234. bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
  235. bin2hex( $clean ),
  236. "Forbidden triplet $x should be rejected" );
  237. } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
  238. $this->assertEquals(
  239. bin2hex( $head . UTF8_REPLACEMENT . $tail ),
  240. bin2hex( $clean ),
  241. "Forbidden triplet $x should be rejected" );
  242. } else {
  243. $this->assertEquals(
  244. bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
  245. bin2hex( $clean ),
  246. "Forbidden triplet $x should be rejected" );
  247. }
  248. }
  249. }
  250. }
  251. }
  252. /** @todo document */
  253. function testChunkRegression() {
  254. # Check for regression against a chunking bug
  255. $text = "\x46\x55\xb8" .
  256. "\xdc\x96" .
  257. "\xee" .
  258. "\xe7" .
  259. "\x44" .
  260. "\xaa" .
  261. "\x2f\x25";
  262. $expect = "\x46\x55\xef\xbf\xbd" .
  263. "\xdc\x96" .
  264. "\xef\xbf\xbd" .
  265. "\xef\xbf\xbd" .
  266. "\x44" .
  267. "\xef\xbf\xbd" .
  268. "\x2f\x25";
  269. $this->assertEquals(
  270. bin2hex( $expect ),
  271. bin2hex( UtfNormal::cleanUp( $text ) ) );
  272. }
  273. /** @todo document */
  274. function testInterposeRegression() {
  275. $text = "\x4e\x30" .
  276. "\xb1" . # bad tail
  277. "\x3a" .
  278. "\x92" . # bad tail
  279. "\x62\x3a" .
  280. "\x84" . # bad tail
  281. "\x43" .
  282. "\xc6" . # bad head
  283. "\x3f" .
  284. "\x92" . # bad tail
  285. "\xad" . # bad tail
  286. "\x7d" .
  287. "\xd9\x95";
  288. $expect = "\x4e\x30" .
  289. "\xef\xbf\xbd" .
  290. "\x3a" .
  291. "\xef\xbf\xbd" .
  292. "\x62\x3a" .
  293. "\xef\xbf\xbd" .
  294. "\x43" .
  295. "\xef\xbf\xbd" .
  296. "\x3f" .
  297. "\xef\xbf\xbd" .
  298. "\xef\xbf\xbd" .
  299. "\x7d" .
  300. "\xd9\x95";
  301. $this->assertEquals(
  302. bin2hex( $expect ),
  303. bin2hex( UtfNormal::cleanUp( $text ) ) );
  304. }
  305. /** @todo document */
  306. function testOverlongRegression() {
  307. $text = "\x67" .
  308. "\x1a" . # forbidden ascii
  309. "\xea" . # bad head
  310. "\xc1\xa6" . # overlong sequence
  311. "\xad" . # bad tail
  312. "\x1c" . # forbidden ascii
  313. "\xb0" . # bad tail
  314. "\x3c" .
  315. "\x9e"; # bad tail
  316. $expect = "\x67" .
  317. "\xef\xbf\xbd" .
  318. "\xef\xbf\xbd" .
  319. "\xef\xbf\xbd" .
  320. "\xef\xbf\xbd" .
  321. "\xef\xbf\xbd" .
  322. "\xef\xbf\xbd" .
  323. "\x3c" .
  324. "\xef\xbf\xbd";
  325. $this->assertEquals(
  326. bin2hex( $expect ),
  327. bin2hex( UtfNormal::cleanUp( $text ) ) );
  328. }
  329. /** @todo document */
  330. function testSurrogateRegression() {
  331. $text = "\xed\xb4\x96" . # surrogate 0xDD16
  332. "\x83" . # bad tail
  333. "\xb4" . # bad tail
  334. "\xac"; # bad head
  335. $expect = "\xef\xbf\xbd" .
  336. "\xef\xbf\xbd" .
  337. "\xef\xbf\xbd" .
  338. "\xef\xbf\xbd";
  339. $this->assertEquals(
  340. bin2hex( $expect ),
  341. bin2hex( UtfNormal::cleanUp( $text ) ) );
  342. }
  343. /** @todo document */
  344. function testBomRegression() {
  345. $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
  346. "\xb2" . # bad tail
  347. "\xef" . # bad head
  348. "\x59";
  349. $expect = "\xef\xbf\xbd" .
  350. "\xef\xbf\xbd" .
  351. "\xef\xbf\xbd" .
  352. "\x59";
  353. $this->assertEquals(
  354. bin2hex( $expect ),
  355. bin2hex( UtfNormal::cleanUp( $text ) ) );
  356. }
  357. /** @todo document */
  358. function testForbiddenRegression() {
  359. $text = "\xef\xbf\xbf"; # U+FFFF, illegal char
  360. $expect = "\xef\xbf\xbd";
  361. $this->assertEquals(
  362. bin2hex( $expect ),
  363. bin2hex( UtfNormal::cleanUp( $text ) ) );
  364. }
  365. /** @todo document */
  366. function testHangulRegression() {
  367. $text = "\xed\x9c\xaf" . # Hangul char
  368. "\xe1\x87\x81"; # followed by another final jamo
  369. $expect = $text; # Should *not* change.
  370. $this->assertEquals(
  371. bin2hex( $expect ),
  372. bin2hex( UtfNormal::cleanUp( $text ) ) );
  373. }
  374. }
  375. $suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
  376. $result = PHPUnit_TextUI_TestRunner::run( $suite );
  377. if( !$result->wasSuccessful() ) {
  378. exit( -1 );
  379. }
  380. exit( 0 );