api.punycode.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. <?php
  2. /**
  3. * The MIT License (MIT)
  4. *
  5. * Copyright (c) 2013 mk-j, zedwood.com
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining a copy
  8. * of this software and associated documentation files (the "Software"), to deal
  9. * in the Software without restriction, including without limitation the rights
  10. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. * copies of the Software, and to permit persons to whom the Software is
  12. * furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included in all
  15. * copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23. * SOFTWARE.
  24. */
  // The Punycode class below calls mb_* functions; stop immediately when the
  // mbstring extension is not loaded.
  if (!function_exists('mb_internal_encoding')) {
      die('unsupported dependency, mbstring');
  }
  /**
   * Converts host names between their Unicode form and their ASCII
   * ("xn--" prefixed) form using the Punycode algorithm of RFC 3492.
   *
   * Only the bootstring transformation is implemented. Apart from lower-casing
   * the input, no further IDNA/nameprep mapping is applied.
   */
  class Punycode {
      // Bootstring parameters used by the Punycode algorithm.
      const TMIN = 1;
      const TMAX = 26;
      const BASE = 36;
      const INITIAL_N = 128;
      const INITIAL_BIAS = 72;
      const DAMP = 700;
      const SKEW = 38;
      const DELIMITER = '-';

      /**
       * Converts a Unicode host name to its ASCII form, e.g.
       * Punycode::encodeHostName('bücher.de') returns 'xn--bcher-kva.de'.
       *
       * The name is lower-cased and split on "."; labels containing non-ASCII
       * characters are Punycode-encoded and prefixed with "xn--", plain ASCII
       * labels are validated and copied through.
       *
       * @param string $hostname UTF-8 encoded host name
       * @return string the ASCII host name, or $hostname unchanged when it does
       *                not pass is_valid_utf8(), contains an invalid ASCII label,
       *                or a label cannot be encoded
       */
      public static function encodeHostName($hostname) {
          if (!self::is_valid_utf8($hostname)) {
              return $hostname; //invalid
          }
          $encoded = array();
          foreach (explode('.', self::mb_strtolower($hostname)) as $label) {
              // Any byte >= 0x80 means the label holds a non-ASCII character,
              // whatever Unicode plane it belongs to.
              if (preg_match('/[^\x00-\x7F]/', $label)) {
                  try {
                      $encoded[] = 'xn--' . self::encodeLabel($label);
                  } catch (Exception $e) {
                      error_log("[PUNYCODE] error " . $e->getMessage());
                      return $hostname; //label could not be encoded
                  }
              } else if (self::isAsciiLabel($label)) {
                  $encoded[] = $label;
              } else {
                  return $hostname; //invalid domain
              }
          }
          return implode('.', $encoded);
      }

      /**
       * Converts an ASCII host name back to Unicode, e.g.
       * Punycode::decodeHostName('xn--bcher-kva.de') returns 'bücher.de'.
       *
       * The name is lower-cased and split on "."; labels starting with "xn--"
       * are Punycode-decoded, other labels are copied through. A label whose
       * Punycode part cannot be decoded is kept as it was (prefix included).
       *
       * @param string $encoded_hostname ASCII host name
       * @return string|false false when the input contains no host name
       *                      character at all; the input unchanged when a label
       *                      is not a valid ASCII label; otherwise the decoded,
       *                      lower-cased, UTF-8 host name
       */
      public static function decodeHostName($encoded_hostname) {
          // Case-insensitive, because the name is lower-cased right below.
          if (!preg_match('/[a-z\d.-]/i', $encoded_hostname)) {
              return false;
          }
          $decoded = array();
          foreach (explode('.', strtolower($encoded_hostname)) as $label) {
              if (!self::isAsciiLabel($label)) {
                  return $encoded_hostname; //invalid
              }
              if (strpos($label, 'xn--') !== 0) {
                  $decoded[] = $label;
                  continue;
              }
              try {
                  $decoded[] = self::decodeLabel(substr($label, 4));
              } catch (Exception $e) {
                  error_log("[PUNYCODE] error " . $e->getMessage());
                  $decoded[] = $label; //keep the undecodable label intact
              }
          }
          return implode('.', $decoded);
      }

      /**
       * Tells whether $label is a valid ASCII host name label: 1 to 63
       * characters out of letters, digits and "-", neither starting nor
       * ending with "-".
       *
       * @param string $label
       * @return bool
       */
      private static function isAsciiLabel($label) {
          // \z (instead of $) so that a trailing newline is not tolerated.
          return preg_match('/^[a-z\d](?:[a-z\d-]{0,61}[a-z\d])?\z/i', $label) === 1;
      }

      /**
       * Punycode-encodes a single label (without the "xn--" prefix).
       *
       * @param string $input UTF-8 label containing at least one non-ASCII character
       * @return string the encoded label, or $input unchanged when encoding
       *                fails (the failure is written to the error log)
       */
      protected static function encode($input) {
          try {
              return self::encodeLabel($input);
          } catch (Exception $e) {
              error_log("[PUNYCODE] error " . $e->getMessage());
              return $input;
          }
      }

      /**
       * Punycode-decodes a single label (without the "xn--" prefix).
       *
       * @param string $input ASCII Punycode string
       * @return string the decoded UTF-8 label, or $input unchanged when
       *                decoding fails (the failure is written to the error log)
       */
      protected static function decode($input) {
          try {
              return self::decodeLabel($input);
          } catch (Exception $e) {
              error_log("[PUNYCODE] error " . $e->getMessage());
              return $input;
          }
      }

      /**
       * Encoding procedure of RFC 3492, section 6.3.
       *
       * @param string $input UTF-8 label
       * @return string Punycode representation of $input
       * @throws Exception "PunycodeException.BAD_INPUT" when $input has no
       *                   non-ASCII character or cannot be split into code
       *                   points, "PunycodeException.OVERFLOW" on overflow
       */
      private static function encodeLabel($input) {
          // Decode the label once instead of re-extracting every character
          // inside each of the loops below.
          $codePoints = self::toCodePoints($input);
          $inputLength = count($codePoints);

          $n = self::INITIAL_N;
          $delta = 0;
          $bias = self::INITIAL_BIAS;

          // Basic (ASCII) code points are copied to the output verbatim.
          $output = '';
          foreach ($codePoints as $c) {
              if ($c < self::INITIAL_N) {
                  $output .= chr($c);
              }
          }
          $b = strlen($output);
          if ($b === $inputLength) {//no international chars to convert to punycode here
              throw new Exception("PunycodeException.BAD_INPUT");
          }
          if ($b > 0) {
              $output .= self::DELIMITER;
          }

          $h = $b; // number of code points handled so far
          while ($h < $inputLength) {
              // Find the minimum code point >= n
              $m = PHP_INT_MAX;
              foreach ($codePoints as $c) {
                  if ($c >= $n && $c < $m) {
                      $m = $c;
                  }
              }
              if (($m - $n) > (PHP_INT_MAX - $delta) / ($h + 1)) {
                  throw new Exception("PunycodeException.OVERFLOW");
              }
              $delta += ($m - $n) * ($h + 1);
              $n = $m;
              foreach ($codePoints as $c) {
                  if ($c < $n) {
                      $delta++;
                  }
                  if ($c === $n) {
                      // Emit delta as a generalized variable-length integer.
                      $q = $delta;
                      for ($k = self::BASE;; $k += self::BASE) {
                          $t = self::threshold($k, $bias);
                          if ($q < $t) {
                              break;
                          }
                          $output .= chr(self::digit2codepoint($t + ($q - $t) % (self::BASE - $t)));
                          $q = (int) (($q - $t) / (self::BASE - $t)); //integer division
                      }
                      $output .= chr(self::digit2codepoint($q));
                      $bias = self::adapt($delta, $h + 1, $h === $b);
                      $delta = 0;
                      $h++;
                  }
              }
              $delta++;
              $n++;
          }
          return $output;
      }

      /**
       * Decoding procedure of RFC 3492, section 6.2.
       *
       * @param string $input Punycode string
       * @return string decoded UTF-8 string
       * @throws Exception "PunycodeException.BAD_INPUT" for non-ASCII input,
       *                   invalid digits, truncated input or a decoded value
       *                   that is not an encodable code point,
       *                   "PunycodeException.OVERFLOW" on overflow
       */
      private static function decodeLabel($input) {
          // Punycode consists of basic code points only; once that is checked,
          // byte offsets and character offsets coincide.
          if (preg_match('/[^\x00-\x7F]/', $input)) {
              throw new Exception("PunycodeException.BAD_INPUT");
          }
          $n = self::INITIAL_N;
          $i = 0;
          $bias = self::INITIAL_BIAS;
          $inputLength = strlen($input);

          // Everything before the last delimiter is literal output. A delimiter
          // at offset 0 is not consumed and fails as an invalid digit below.
          $d = self::rstrpos($input, self::DELIMITER);
          if ($d > 0) {
              $output = substr($input, 0, $d);
              $d++;
          } else {
              $output = '';
              $d = 0;
          }
          $outputLength = strlen($output);

          while ($d < $inputLength) {
              // Read one generalized variable-length integer into $i.
              $oldi = $i;
              $w = 1;
              for ($k = self::BASE;; $k += self::BASE) {
                  if ($d >= $inputLength) {
                      throw new Exception("PunycodeException.BAD_INPUT");
                  }
                  $digit = self::codepoint2digit(ord($input[$d++]));
                  if ($digit > (PHP_INT_MAX - $i) / $w) {
                      throw new Exception("PunycodeException.OVERFLOW");
                  }
                  $i += $digit * $w;
                  $t = self::threshold($k, $bias);
                  if ($digit < $t) {
                      break;
                  }
                  if ($w > PHP_INT_MAX / (self::BASE - $t)) {
                      throw new Exception("PunycodeException.OVERFLOW");
                  }
                  $w *= (self::BASE - $t);
              }
              $bias = self::adapt($i - $oldi, $outputLength + 1, $oldi === 0);

              // $i wraps around the output: the quotient advances the code
              // point, the remainder is the insertion position.
              $step = (int) ($i / ($outputLength + 1));
              if ($step > PHP_INT_MAX - $n) {
                  throw new Exception("PunycodeException.OVERFLOW");
              }
              $n += $step;
              if ($n > 0x10FFFF || ($n >= 0xD800 && $n <= 0xDFFF)) {
                  // utf8() would return '' here and corrupt the positions
                  throw new Exception("PunycodeException.BAD_INPUT");
              }
              $i %= ($outputLength + 1);
              $output = self::mb_strinsert($output, self::utf8($n), $i);
              $outputLength++;
              $i++;
          }
          return $output;
      }

      /**
       * Splits a UTF-8 string into its code points.
       *
       * @param string $input
       * @return int[]
       * @throws Exception "PunycodeException.BAD_INPUT" when uniord() rejects a character
       */
      private static function toCodePoints($input) {
          $codePoints = array();
          $length = self::mb_strlen($input);
          for ($i = 0; $i < $length; $i++) {
              $codePoint = self::uniord(self::mb_substr($input, $i, 1));
              if ($codePoint === false) {
                  throw new Exception("PunycodeException.BAD_INPUT");
              }
              $codePoints[] = $codePoint;
          }
          return $codePoints;
      }

      /**
       * Threshold t for digit position $k: ($k - $bias) clamped to TMIN..TMAX.
       *
       * @param int $k
       * @param int $bias
       * @return int
       */
      private static function threshold($k, $bias) {
          if ($k <= $bias) {
              return self::TMIN;
          }
          if ($k >= $bias + self::TMAX) {
              return self::TMAX;
          }
          return $k - $bias;
      }

      //adapt patched from:
      //https://github.com/takezoh/php-PunycodeEncoder/blob/master/punycode.php
      /**
       * Bias adaptation run after each encoded/decoded delta.
       *
       * @param int  $delta     the delta that was just processed
       * @param int  $numpoints number of code points handled so far, including the current one
       * @param bool $firsttime true for the very first delta, which is damped by DAMP instead of 2
       * @return int the new bias
       */
      protected static function adapt($delta, $numpoints, $firsttime) {
          $delta = (int) ($firsttime ? $delta / self::DAMP : $delta / 2);
          $delta += (int) ($delta / $numpoints);
          $k = 0;
          while ($delta > (((self::BASE - self::TMIN) * self::TMAX) / 2)) {
              $delta = (int) ($delta / (self::BASE - self::TMIN));
              $k += self::BASE;
          }
          return $k + (int) ((self::BASE - self::TMIN + 1) * $delta / ($delta + self::SKEW));
      }

      /**
       * Maps a digit value to the ASCII code of its character.
       *
       * @param int $d digit value, 0..35
       * @return int ASCII code: 0..25 give 'a'..'z', 26..35 give '0'..'9'
       * @throws Exception "PunycodeException.BAD_INPUT" when $d is out of range
       */
      protected static function digit2codepoint($d) {
          if ($d >= 0 && $d < 26) {
              // 0..25 : 'a'..'z'
              return $d + ord('a');
          }
          if ($d >= 26 && $d < 36) {
              // 26..35 : '0'..'9';
              return $d - 26 + ord('0');
          }
          throw new Exception("PunycodeException.BAD_INPUT");
      }

      /**
       * Maps the ASCII code of a Punycode character to its digit value.
       *
       * @param int $c ASCII code
       * @return int digit value: '0'..'9' give 26..35, letters of either case give 0..25
       * @throws Exception "PunycodeException.BAD_INPUT" for any other code
       */
      protected static function codepoint2digit($c) {
          if ($c >= ord('0') && $c <= ord('9')) {
              // '0'..'9' : 26..35
              return $c - ord('0') + 26;
          }
          if ($c >= ord('a') && $c <= ord('z')) {
              // 'a'..'z' : 0..25
              return $c - ord('a');
          }
          if ($c >= ord('A') && $c <= ord('Z')) {
              // 'A'..'Z' : 0..25
              return $c - ord('A');
          }
          throw new Exception("PunycodeException.BAD_INPUT");
      }

      /**
       * Byte offset of the last occurrence of $needle in $haystack.
       *
       * @param string $haystack
       * @param string $needle
       * @return int|false offset, or false when $needle does not occur
       */
      protected static function rstrpos($haystack, $needle) {
          return strrpos($haystack, $needle);
      }

      /**
       * Inserts $needle into the UTF-8 string $haystack in front of the
       * character at index $position.
       *
       * @param string $haystack
       * @param string $needle
       * @param int    $position character (not byte) index
       * @return string
       */
      protected static function mb_strinsert($haystack, $needle, $position) {
          // The encoding is passed explicitly so the global mbstring internal
          // encoding never has to be changed.
          $head = mb_substr($haystack, 0, $position, 'UTF-8');
          $tail = mb_substr($haystack, $position, mb_strlen($haystack, 'UTF-8'), 'UTF-8');
          return $head . $needle . $tail;
      }

      /**
       * mb_substr() that always treats $str as UTF-8.
       *
       * @param string $str
       * @param int    $start  character index
       * @param int    $length number of characters
       * @return string
       */
      protected static function mb_substr($str, $start, $length) {
          return mb_substr($str, $start, $length, 'UTF-8');
      }

      /**
       * mb_strlen() that always treats $str as UTF-8.
       *
       * @param string $str
       * @return int number of characters
       */
      protected static function mb_strlen($str) {
          return mb_strlen($str, 'UTF-8');
      }

      /**
       * mb_strtolower() that always treats $str as UTF-8.
       *
       * @param string $str
       * @return string
       */
      protected static function mb_strtolower($str) {
          return mb_strtolower($str, 'UTF-8');
      }

      /**
       * Cousin of ord() but for unicode: code point of the first UTF-8
       * character of $c.
       *
       * @param string $c UTF-8 string
       * @return int|false the code point; false when $c is empty, the sequence
       *                   is truncated, the lead byte is not a valid lead byte,
       *                   or the sequence encodes a surrogate (0xd800 to 0xdfff)
       */
      public static function uniord($c) {
          $length = strlen($c);
          if ($length === 0) {
              return false;
          }
          $ord0 = ord($c[0]);
          if ($ord0 <= 127) {
              return $ord0;
          }
          if ($length < 2) {
              return false;
          }
          $ord1 = ord($c[1]);
          if ($ord0 >= 192 && $ord0 <= 223) {
              return ($ord0 - 192) * 64 + ($ord1 - 128);
          }
          if ($ord0 == 0xed && ($ord1 & 0xa0) == 0xa0) {
              return false; //code points, 0xd800 to 0xdfff
          }
          if ($length < 3) {
              return false;
          }
          $ord2 = ord($c[2]);
          if ($ord0 >= 224 && $ord0 <= 239) {
              return ($ord0 - 224) * 4096 + ($ord1 - 128) * 64 + ($ord2 - 128);
          }
          if ($length < 4) {
              return false;
          }
          $ord3 = ord($c[3]);
          if ($ord0 >= 240 && $ord0 <= 247) {
              return ($ord0 - 240) * 262144 + ($ord1 - 128) * 4096 + ($ord2 - 128) * 64 + ($ord3 - 128);
          }
          return false;
      }

      /**
       * Cousin of chr() but for utf8: UTF-8 byte sequence of a code point.
       *
       * @param int $num code point
       * @return string the encoded character, or '' when $num is negative, a
       *                surrogate (0xd800 to 0xdfff) or above 0x10FFFF
       */
      public static function utf8($num) {
          if ($num < 0) {
              return '';
          }
          if ($num <= 0x7F) {
              return chr($num);
          }
          if ($num <= 0x7FF) {
              return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
          }
          if (0xd800 <= $num && $num <= 0xdfff) {
              return ''; //invalid block of utf8
          }
          if ($num <= 0xFFFF) {
              return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
          }
          if ($num <= 0x10FFFF) {
              return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
          }
          return '';
      }

      /**
       * Checks that $string is made of well-formed UTF-8 sequences.
       *
       * Besides malformed sequences and surrogates, this also rejects ASCII
       * control characters other than tab, line feed and carriage return, as
       * well as DEL (0x7f).
       *
       * @param string $string
       * @return bool
       */
      public static function is_valid_utf8($string) {
          for ($i = 0, $ix = strlen($string); $i < $ix; $i++) {
              $c = ord($string[$i]);
              if ($c == 0x09 || $c == 0x0a || $c == 0x0d || (0x20 <= $c && $c <= 0x7e)) {
                  $n = 0; // 0bbbbbbb
              } else if (($c & 0xE0) == 0xC0) {
                  $n = 1; // 110bbbbb
              } else if ($c == 0xed && $i + 1 < $ix && (ord($string[$i + 1]) & 0xa0) == 0xa0) {
                  return false; //code points, 0xd800 to 0xdfff
              } else if (($c & 0xF0) == 0xE0) {
                  $n = 2; // 1110bbbb
              } else if (($c & 0xF8) == 0xF0) {
                  $n = 3; // 11110bbb
              } else {
                  return false;
              }
              for ($j = 0; $j < $n; $j++) { // n bytes matching 10bbbbbb follow ?
                  if (( ++$i == $ix) || ((ord($string[$i]) & 0xC0) != 0x80)) {
                      return false;
                  }
              }
          }
          return true;
      }
  }
  343. }