Ustring.php 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066
  1. <?php
  2. /**
  3. * Hoa
  4. *
  5. *
  6. * @license
  7. *
  8. * New BSD License
  9. *
  10. * Copyright © 2007-2017, Hoa community. All rights reserved.
  11. *
  12. * Redistribution and use in source and binary forms, with or without
  13. * modification, are permitted provided that the following conditions are met:
  14. * * Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * * Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * * Neither the name of the Hoa nor the names of its contributors may be
  20. * used to endorse or promote products derived from this software without
  21. * specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  24. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
  27. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  28. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  29. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  30. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  31. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  32. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33. * POSSIBILITY OF SUCH DAMAGE.
  34. */
  35. namespace Hoa\Ustring;
  36. use Hoa\Consistency;
  37. /**
  38. * Class \Hoa\Ustring.
  39. *
  40. * This class represents a UTF-8 string.
  41. * Please, see:
  42. * • http://www.ietf.org/rfc/rfc3454.txt;
  43. * • http://unicode.org/reports/tr9/;
  44. * • http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt.
  45. *
  46. * @copyright Copyright © 2007-2017 Hoa community
  47. * @license New BSD License
  48. */
  49. class Ustring implements \ArrayAccess, \Countable, \IteratorAggregate
  50. {
  51. /**
  52. * Left-To-Right.
  53. *
  54. * @const int
  55. */
  56. const LTR = 0;
  57. /**
  58. * Right-To-Left.
  59. *
  60. * @const int
  61. */
  62. const RTL = 1;
  63. /**
  64. * ZERO WIDTH NO-BREAK SPACE (ZWNBSP, aka byte-order mark, BOM).
  65. *
  66. * @const int
  67. */
  68. const BOM = 0xfeff;
  69. /**
  70. * LEFT-TO-RIGHT MARK.
  71. *
  72. * @const int
  73. */
  74. const LRM = 0x200e;
  75. /**
  76. * RIGHT-TO-LEFT MARK.
  77. *
  78. * @const int
  79. */
  80. const RLM = 0x200f;
  81. /**
  82. * LEFT-TO-RIGHT EMBEDDING.
  83. *
  84. * @const int
  85. */
  86. const LRE = 0x202a;
  87. /**
  88. * RIGHT-TO-LEFT EMBEDDING.
  89. *
  90. * @const int
  91. */
  92. const RLE = 0x202b;
  93. /**
  94. * POP DIRECTIONAL FORMATTING.
  95. *
  96. * @const int
  97. */
  98. const PDF = 0x202c;
  99. /**
  100. * LEFT-TO-RIGHT OVERRIDE.
  101. *
  102. * @const int
  103. */
  104. const LRO = 0x202d;
  105. /**
  106. * RIGHT-TO-LEFT OVERRIDE.
  107. *
  108. * @const int
  109. */
  110. const RLO = 0x202e;
  111. /**
  112. * Represent the beginning of the string.
  113. *
  114. * @const int
  115. */
  116. const BEGINNING = 1;
  117. /**
  118. * Represent the end of the string.
  119. *
  120. * @const int
  121. */
  122. const END = 2;
  123. /**
  124. * Split: only non-empty pieces are returned.
  125. *
  126. * @const int
  127. */
  128. const WITHOUT_EMPTY = PREG_SPLIT_NO_EMPTY;
  129. /**
  130. * Split: parenthesized expression in the delimiter pattern will be captured
  131. * and returned.
  132. *
  133. * @const int
  134. */
  135. const WITH_DELIMITERS = PREG_SPLIT_DELIM_CAPTURE;
  136. /**
  137. * Split: offsets of captures will be returned.
  138. *
  139. * @const int
  140. */
  141. const WITH_OFFSET = 260; // PREG_OFFSET_CAPTURE
  142. // | PREG_SPLIT_OFFSET_CAPTURE
  143. /**
  144. * Group results by patterns.
  145. *
  146. * @const int
  147. */
  148. const GROUP_BY_PATTERN = PREG_PATTERN_ORDER;
  149. /**
  150. * Group results by tuple (set of patterns).
  151. *
  152. * @const int
  153. */
  154. const GROUP_BY_TUPLE = PREG_SET_ORDER;
  155. /**
  156. * Current string.
  157. *
  158. * @var string
  159. */
  160. protected $_string = null;
  161. /**
  162. * Direction. Please see self::LTR and self::RTL constants.
  163. *
  164. * @var int
  165. */
  166. protected $_direction = null;
  167. /**
  168. * Collator.
  169. *
  170. * @var \Collator
  171. */
  172. protected static $_collator = null;
  /**
   * Build a UTF-8 string, optionally seeded with an initial value.
   *
   * @param   string  $string    Initial content; nothing is appended when
   *                             null is given.
   */
  public function __construct($string = null)
  {
      if (null === $string) {
          return;
      }

      $this->append($string);
  }
  /**
   * Tell whether ext/mbstring is usable, by probing for mb_substr().
   *
   * @return  bool
   */
  public static function checkMbString()
  {
      $available = function_exists('mb_substr');

      return $available;
  }
  /**
   * Tell whether ext/iconv is usable, by probing for iconv().
   *
   * @return  bool
   */
  public static function checkIconv()
  {
      $available = function_exists('iconv');

      return $available;
  }
  /**
   * Append a substring to the current string, i.e. add to the end.
   *
   * @param   string  $substring    Substring to append.
   * @return  \Hoa\Ustring
   */
  public function append($substring)
  {
      $this->_string .= $substring;

      // The cached direction is derived from the first character, which
      // changes when we append to a string that was empty so far.
      $this->_direction = null;

      return $this;
  }
  /**
   * Prepend a substring to the current string, i.e. add to the start.
   *
   * @param   string  $substring    Substring to prepend.
   * @return  \Hoa\Ustring
   */
  public function prepend($substring)
  {
      $this->_string = $substring . $this->_string;

      // The first character may have changed, so the cached direction (which
      // is computed from it) is no longer trustworthy.
      $this->_direction = null;

      return $this;
  }
  /**
   * Pad the current string to a certain length with another string, aka
   * piece. The piece is repeated as many times as needed and truncated so
   * that the result is exactly $length characters long.
   *
   * Nothing happens when the string is already long enough, or when the
   * piece is empty (there is nothing to pad with).
   *
   * @param   int     $length    Target length, in characters.
   * @param   string  $piece     Piece.
   * @param   int     $side      Whether we append at the end or the beginning
   *                             of the current string (self::END or
   *                             self::BEGINNING).
   * @return  \Hoa\Ustring
   */
  public function pad($length, $piece, $side = self::END)
  {
      $difference = $length - $this->count();

      if (0 >= $difference) {
          return $this;
      }

      $pieceLength = mb_strlen($piece);

      // An empty piece would lead to a division by zero below.
      if (0 === $pieceLength) {
          return $this;
      }

      // Whole repetitions first, then the partial piece filling the rest.
      $handle  = str_repeat($piece, (int) ($difference / $pieceLength));
      $handle .= mb_substr($piece, 0, $difference - mb_strlen($handle));

      return
          static::END === $side
              ? $this->append($handle)
              : $this->prepend($handle);
  }
  /**
   * Compare the current string with another one.
   * The result is negative when the current string sorts before $string,
   * positive when it sorts after, and 0 when both are equal. A collator is
   * used when one is available (see self::getCollator()), strcmp()
   * otherwise.
   *
   * @param   mixed  $string    String.
   * @return  int
   */
  public function compare($string)
  {
      $collator = static::getCollator();

      if (null !== $collator) {
          return $collator->compare($this->_string, $string);
      }

      return strcmp($this->_string, (string) $string);
  }
  /**
   * Get the shared collator, creating it on first use from the current
   * LC_COLLATE locale.
   *
   * @return  \Collator    Null when the Collator class does not exist.
   */
  public static function getCollator()
  {
      if (!class_exists('Collator')) {
          return null;
      }

      if (null !== static::$_collator) {
          return static::$_collator;
      }

      static::$_collator = new \Collator(setlocale(LC_COLLATE, null));

      return static::$_collator;
  }
  /**
   * Ensure that the pattern is safe for Unicode: add the “u” option when it
   * is not already part of the pattern modifiers.
   *
   * The modifiers are what follows the last occurrence of the closing
   * delimiter. Bracket-style delimiters, where the closing delimiter differs
   * from the opening one (“(…)”, “[…]”, “{…}” and “<…>”), are supported.
   *
   * @param   string  $pattern    Pattern.
   * @return  string
   */
  public static function safePattern($pattern)
  {
      $pattern = (string) $pattern;
      $opening = mb_substr($pattern, 0, 1);

      if ('' === (string) $opening) {
          return $pattern . 'u';
      }

      // Map an opening bracket to its closing counterpart; any other
      // delimiter closes with itself.
      $closing  = strtr($opening, '([{<', ')]}>');
      $position = mb_strrpos($pattern, $closing);
      $options  =
          false === $position
              ? ''
              : (string) mb_substr($pattern, $position + 1);

      if (false === strpos($options, 'u')) {
          $pattern .= 'u';
      }

      return $pattern;
  }
  /**
   * Run a regular expression (PCRE) match against the current string.
   *
   * @param   string  $pattern    Pattern.
   * @param   array   $matches    Matches (filled by reference).
   * @param   int     $flags      Please, see constants self::WITH_OFFSET,
   *                              self::GROUP_BY_PATTERN and
   *                              self::GROUP_BY_TUPLE.
   * @param   int     $offset     Alternate place from which to start the
   *                              search, expressed in characters.
   * @param   bool    $global     Whether the match is global or not.
   * @return  int
   */
  public function match(
      $pattern,
      &$matches = null,
      $flags    = 0,
      $offset   = 0,
      $global   = false
  ) {
      $pattern = static::safePattern($pattern);

      if (0 !== $flags) {
          // self::WITH_OFFSET also carries the split-only bit: drop it.
          $flags &= ~PREG_SPLIT_OFFSET_CAPTURE;
      } elseif (true === $global) {
          $flags = static::GROUP_BY_PATTERN;
      }

      // PCRE expects an offset in bytes while ours is in characters.
      $byteOffset = strlen(mb_substr($this->_string, 0, $offset));

      if (true !== $global) {
          return preg_match(
              $pattern,
              $this->_string,
              $matches,
              $flags,
              $byteOffset
          );
      }

      return preg_match_all(
          $pattern,
          $this->_string,
          $matches,
          $flags,
          $byteOffset
      );
  }
  /**
   * Perform a regular expression (PCRE) search and replace.
   *
   * Every pattern is made Unicode-safe (see self::safePattern()), whether a
   * single pattern or an array of patterns is given. When $replacement is
   * callable, it is used as a callback receiving the matches; please note
   * that this is also true for a string naming an existing function.
   *
   * @param   mixed   $pattern        Pattern(s).
   * @param   mixed   $replacement    Replacement(s) (please, see
   *                                  preg_replace() documentation).
   * @param   int     $limit          Maximum of replacements. -1 for unbound.
   * @return  \Hoa\Ustring
   */
  public function replace($pattern, $replacement, $limit = -1)
  {
      if (is_array($pattern)) {
          foreach ($pattern as $key => $singlePattern) {
              $pattern[$key] = static::safePattern($singlePattern);
          }
      } else {
          $pattern = static::safePattern($pattern);
      }

      if (false === is_callable($replacement)) {
          $this->_string = preg_replace(
              $pattern,
              $replacement,
              $this->_string,
              $limit
          );
      } else {
          $this->_string = preg_replace_callback(
              $pattern,
              $replacement,
              $this->_string,
              $limit
          );
      }

      // The first character may have been replaced: forget the cached
      // direction.
      $this->_direction = null;

      return $this;
  }
  /**
   * Cut the current string into pieces around a given pattern (PCRE).
   *
   * @param   string  $pattern    Pattern (as a regular expression).
   * @param   int     $limit      Maximum of split. -1 for unbound.
   * @param   int     $flags      Please, see constants self::WITHOUT_EMPTY,
   *                              self::WITH_DELIMITERS, self::WITH_OFFSET.
   * @return  array
   */
  public function split(
      $pattern,
      $limit = -1,
      $flags = self::WITHOUT_EMPTY
  ) {
      $safePattern = static::safePattern($pattern);

      return preg_split($safePattern, $this->_string, $limit, $flags);
  }
  /**
   * Iterate over the characters (not the bytes) of the current string.
   *
   * @return  \ArrayIterator
   */
  public function getIterator()
  {
      // Split on every position that is neither the start nor the end.
      $characters = preg_split('#(?<!^)(?!$)#u', $this->_string);

      return new \ArrayIterator($characters);
  }
  /**
   * Fold the current string to lowercase, in place.
   *
   * @return  \Hoa\Ustring
   */
  public function toLowerCase()
  {
      $lowered       = mb_strtolower($this->_string);
      $this->_string = $lowered;

      return $this;
  }
  /**
   * Fold the current string to uppercase, in place.
   *
   * @return  \Hoa\Ustring
   */
  public function toUpperCase()
  {
      $raised        = mb_strtoupper($this->_string);
      $this->_string = $raised;

      return $this;
  }
  /**
   * Transform a UTF-8 string into an ASCII one.
   * First, try with a transliterator. If not available, will fallback to a
   * normalizer. If not available, will try something homemade (only when
   * $try is true).
   *
   * A string that contains no byte above 0x7f is left untouched.
   *
   * @param   bool  $try    Try something if \Normalizer is not present.
   * @return  \Hoa\Ustring
   * @throws  \Hoa\Ustring\Exception    When neither a transliterator nor
   *                                    \Normalizer is available and $try is
   *                                    false.
   */
  public function toAscii($try = false)
  {
      if (0 === preg_match('#[\x80-\xff]#', $this->_string)) {
          return $this;
      }

      $string         = $this->_string;
      $transId        =
          'Any-Latin; ' .
          '[\p{S}] Name; ' .
          'Latin-ASCII';
      $transliterator = static::getTransliterator($transId);

      if (null !== $transliterator) {
          // Symbols come back as “\N{NAME}”: turn them into “(name)”.
          $ascii = preg_replace_callback(
              '#\\\N\{([A-Z ]+)\}#u',
              function (array $matches) {
                  return '(' . strtolower($matches[1]) . ')';
              },
              $transliterator->transliterate($string)
          );
      } elseif (true === class_exists('Normalizer')) {
          // Decompose, drop the combining marks, then let iconv finish.
          $string = \Normalizer::normalize($string, \Normalizer::NFKD);
          $string = preg_replace('#\p{Mn}+#u', '', $string);
          $ascii  = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
      } else {
          if (false === $try) {
              throw new Exception(
                  '%s needs the class Normalizer to work properly, ' .
                  'or you can force a try by using %1$s(true).',
                  0,
                  __METHOD__
              );
          }

          // iconv may emit an accent mark before the letter: remove it.
          $string = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
          $ascii  = preg_replace('#(?:[\'"`^](\w))#u', '\1', $string);
      }

      $this->_string = $ascii;

      // The script of the first character may have changed: forget the
      // cached direction.
      $this->_direction = null;

      return $this;
  }
  /**
   * Transliterate the string into another.
   * See self::getTransliterator for more information.
   *
   * @param   string  $identifier    Identifier.
   * @param   int     $start         Start.
   * @param   int     $end           End.
   * @return  \Hoa\Ustring
   * @throws  \Hoa\Ustring\Exception    When no transliterator can be
   *                                    obtained.
   */
  public function transliterate($identifier, $start = 0, $end = null)
  {
      if (null === $transliterator = static::getTransliterator($identifier)) {
          throw new Exception(
              '%s needs the class Transliterator to work properly.',
              1,
              __METHOD__
          );
      }

      $this->_string = $transliterator->transliterate($this->_string, $start, $end);

      // The script of the first character may have changed: forget the
      // cached direction.
      $this->_direction = null;

      return $this;
  }
  /**
   * Create a transliterator for a given identifier.
   * See http://userguide.icu-project.org/transforms/general for $identifier.
   *
   * @param   string  $identifier    Identifier.
   * @return  \Transliterator    Null when the Transliterator class does not
   *                             exist; otherwise whatever
   *                             \Transliterator::create() returns.
   */
  public static function getTransliterator($identifier)
  {
      return
          class_exists('Transliterator')
              ? \Transliterator::create($identifier)
              : null;
  }
  /**
   * Strip characters (default \s) from the current string.
   *
   * @param   string  $regex    Characters to remove, as a regular expression
   *                            fragment.
   * @param   int     $side     Whether we trim the beginning, the end or both
   *                            sides, of the current string (bitmask of
   *                            self::BEGINNING and self::END).
   * @return  \Hoa\Ustring
   */
  public function trim($regex = '\s', $side = 3 /* static::BEGINNING | static::END */)
  {
      $repeated     = '(?:' . $regex . ')+';
      $alternatives = [];

      if (0 !== ($side & static::BEGINNING)) {
          $alternatives[] = '(^' . $repeated . ')';
      }

      if (0 !== ($side & static::END)) {
          $alternatives[] = '(' . $repeated . '$)';
      }

      $this->_string    = preg_replace(
          '#' . implode('|', $alternatives) . '#u',
          '',
          $this->_string
      );
      $this->_direction = null;

      return $this;
  }
  /**
   * Compute offset (negative, unbound etc.).
   * A negative offset counts from the end of the string, and an offset that
   * is out of bounds wraps around the string length. For an empty string,
   * the only meaningful offset is 0.
   *
   * @param   int  $offset    Offset.
   * @return  int
   */
  protected function computeOffset($offset)
  {
      $length = mb_strlen((string) $this->_string);

      // Avoid a modulo by zero below.
      if (0 === $length) {
          return 0;
      }

      if (0 > $offset) {
          $offset = -$offset % $length;

          if (0 !== $offset) {
              $offset = $length - $offset;
          }
      } elseif ($offset >= $length) {
          $offset %= $length;
      }

      return $offset;
  }
  /**
   * Get the character located at a specific offset of the current string.
   *
   * @param   int  $offset    Offset (can be negative and unbound).
   * @return  string
   */
  public function offsetGet($offset)
  {
      $position = $this->computeOffset($offset);

      return mb_substr($this->_string, $position, 1);
  }
  /**
   * Replace the character located at a specific offset of the current
   * string.
   *
   * @param   int     $offset    Offset (can be negative and unbound).
   * @param   string  $value     Value.
   * @return  \Hoa\Ustring
   */
  public function offsetSet($offset, $value)
  {
      $position = $this->computeOffset($offset);
      $before   =
          0 < $position
              ? mb_substr($this->_string, 0, $position)
              : null;
      $after    = mb_substr($this->_string, $position + 1);

      $this->_string    = $before . $value . $after;
      $this->_direction = null;

      return $this;
  }
  /**
   * Delete the character located at a specific offset of the current
   * string, by replacing it with nothing.
   *
   * @param   int  $offset    Offset (can be negative and unbound).
   * @return  \Hoa\Ustring
   */
  public function offsetUnset($offset)
  {
      $self = $this->offsetSet($offset, null);

      return $self;
  }
  /**
   * Check if a specific offset exists.
   * Every offset is reported as existing; no check is made against the
   * string.
   *
   * @param   int  $offset    Offset (unused).
   * @return  bool    Always true.
   */
  public function offsetExists($offset)
  {
      $exists = true;

      return $exists;
  }
  /**
   * Reduce the current string to one of its slices, in place.
   *
   * @param   int  $start     Position of first character.
   * @param   int  $length    Maximum number of characters.
   * @return  \Hoa\Ustring
   */
  public function reduce($start, $length = null)
  {
      $this->_string = mb_substr($this->_string, $start, $length);

      // The slice may start with another character: forget the cached
      // direction.
      $this->_direction = null;

      return $this;
  }
  /**
   * Count the characters (not the bytes) of the current string.
   *
   * @return  int
   */
  public function count()
  {
      $length = mb_strlen($this->_string);

      return $length;
  }
  /**
   * Get byte (not character) at a specific offset.
   * A negative offset counts from the end, and an offset that is out of
   * bounds wraps around the number of bytes.
   *
   * @param   int  $offset    Offset (can be negative and unbound).
   * @return  string    The byte, or an empty string when the current string
   *                    is empty.
   */
  public function getByteAt($offset)
  {
      $string = (string) $this->_string;
      $length = strlen($string);

      // Nothing to read, and the modulo below would divide by zero.
      if (0 === $length) {
          return '';
      }

      if (0 > $offset) {
          $offset = -$offset % $length;

          if (0 !== $offset) {
              $offset = $length - $offset;
          }
      } elseif ($offset >= $length) {
          $offset %= $length;
      }

      return $string[$offset];
  }
  /**
   * Count the bytes (not the characters) of the current string.
   *
   * @return  int
   */
  public function getBytesLength()
  {
      $bytes = strlen($this->_string);

      return $bytes;
  }
  /**
   * Get the width of the current string, as computed by mb_strwidth().
   * Useful when printing the string in monotype (some characters need more
   * than one column to be printed).
   *
   * @return  int
   */
  public function getWidth()
  {
      $columns = mb_strwidth($this->_string);

      return $columns;
  }
  /**
   * Get direction of the current string.
   * Please, see the self::LTR and self::RTL constants.
   * The direction is the one of the first character, and is cached until
   * the string changes. A string with no character is left-to-right.
   * It does not yet support embedding directions.
   *
   * @return  int
   */
  public function getDirection()
  {
      if (null === $this->_direction) {
          $first = (string) mb_substr((string) $this->_string, 0, 1);

          // No first character (null or empty string): default to LTR
          // instead of asking for the direction of nothing.
          $this->_direction =
              '' === $first
                  ? static::LTR
                  : static::getCharDirection($first);
      }

      return $this->_direction;
  }
  /**
   * Get direction of a specific character.
   * Please, see the self::LTR and self::RTL constants.
   *
   * @param   string  $char    Character.
   * @return  int
   */
  public static function getCharDirection($char)
  {
      // Inclusive [first, last] ranges of right-to-left code points, in
      // ascending order. Anything outside of them is left-to-right.
      static $rtlRanges = [
          [0x5be,   0x5be],   [0x5c0,   0x5c0],   [0x5c3,   0x5c3],
          [0x5c6,   0x5c6],   [0x5d0,   0x5ea],   [0x5f0,   0x5f4],
          [0x608,   0x608],   [0x60b,   0x60b],   [0x60d,   0x60d],
          [0x61b,   0x61b],   [0x61e,   0x64a],   [0x66d,   0x66f],
          [0x671,   0x6d5],   [0x6e5,   0x6e6],   [0x6ee,   0x6ef],
          [0x6fa,   0x70d],   [0x710,   0x710],   [0x712,   0x72f],
          [0x74d,   0x7a5],   [0x7b1,   0x7b1],   [0x7c0,   0x7ea],
          [0x7f4,   0x7f5],   [0x7fa,   0x7fa],   [0x800,   0x815],
          [0x81a,   0x81a],   [0x824,   0x824],   [0x828,   0x828],
          [0x830,   0x83e],   [0x840,   0x858],   [0x85e,   0x85e],
          [0x200f,  0x200f],
          [0xfb1d,  0xfb1d],  [0xfb1f,  0xfb28],  [0xfb2a,  0xfb36],
          [0xfb38,  0xfb3c],  [0xfb3e,  0xfb3e],  [0xfb40,  0xfb41],
          [0xfb43,  0xfb44],  [0xfb46,  0xfbc1],  [0xfbd3,  0xfd3d],
          [0xfd50,  0xfd8f],  [0xfd92,  0xfdc7],  [0xfdf0,  0xfdfc],
          [0xfe70,  0xfe74],  [0xfe76,  0xfefc],
          [0x10800, 0x10805], [0x10808, 0x10808], [0x1080a, 0x10835],
          [0x10837, 0x10838], [0x1083c, 0x1083c], [0x1083f, 0x10855],
          [0x10857, 0x1085f], [0x10900, 0x1091b], [0x10920, 0x10939],
          [0x1093f, 0x1093f], [0x10a00, 0x10a00], [0x10a10, 0x10a13],
          [0x10a15, 0x10a17], [0x10a19, 0x10a33], [0x10a40, 0x10a47],
          [0x10a50, 0x10a58], [0x10a60, 0x10a7f], [0x10b00, 0x10b35],
          [0x10b40, 0x10b55], [0x10b58, 0x10b72], [0x10b78, 0x10b7f]
      ];

      $c = static::toCode($char);

      foreach ($rtlRanges as $range) {
          if ($range[0] <= $c && $c <= $range[1]) {
              return static::RTL;
          }
      }

      return static::LTR;
  }
  /**
   * Get the number of column positions of a wide-character.
   *
   * This is a PHP implementation of wcwidth() and wcswidth() (defined in IEEE
   * Std 1003.1-2001) for Unicode, by Markus Kuhn. Please, see
   * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
   *
   * The result is 0 for the null character and for non-spacing characters,
   * -1 for control characters (not printable), 2 for wide characters and 1
   * for everything else.
   *
   * @param   string  $char    Character.
   * @return  int
   */
  public static function getCharWidth($char)
  {
      $char = (string) $char;
      $c    = static::toCode($char);

      if (0x0 === $c) {
          return 0;
      }

      // 8-bit control characters.
      if (0x20 > $c || (0x7f <= $c && $c < 0xa0)) {
          return -1;
      }

      // Non-spacing characters (the soft hyphen excepted).
      if (0xad !== $c &&
          0    !== preg_match('#^[\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11ff}\x{200b}]#u', $char)) {
          return 0;
      }

      // Nothing below U+1100 takes two columns.
      if (0x1100 > $c) {
          return 1;
      }

      $isWide =
          0x115f >= $c ||                                    // Hangul Jamo init. consonants
          0x2329 === $c ||
          0x232a === $c ||
          (0x2e80 <= $c && 0xa4cf >= $c && 0x303f !== $c) || // CJK…Yi
          (0xac00 <= $c && 0xd7a3 >= $c) ||                  // Hangul Syllables
          (0xf900 <= $c && 0xfaff >= $c) ||                  // CJK Compatibility Ideographs
          (0xfe10 <= $c && 0xfe19 >= $c) ||                  // Vertical forms
          (0xfe30 <= $c && 0xfe6f >= $c) ||                  // CJK Compatibility Forms
          (0xff00 <= $c && 0xff60 >= $c) ||                  // Fullwidth Forms
          (0xffe0 <= $c && 0xffe6 >= $c) ||
          (0x20000 <= $c && 0x2fffd >= $c) ||
          (0x30000 <= $c && 0x3fffd >= $c);

      return $isWide ? 2 : 1;
  }
  /**
   * Check whether the character is printable or not, i.e. whether it takes
   * at least one column (see self::getCharWidth()).
   *
   * @param   string  $char    Character.
   * @return  bool
   */
  public static function isCharPrintable($char)
  {
      $width = static::getCharWidth($char);

      return $width >= 1;
  }
  /**
   * Get a UTF-8 character from its code point.
   *
   * The character is encoded by hand rather than through the
   * “HTML-ENTITIES” encoding of ext/mbstring, which is deprecated as of
   * PHP 8.2.
   *
   * @param   int  $code    Code point.
   * @return  string    The UTF-8 encoded character, or an empty string when
   *                    $code is not in the range 0…0x10ffff.
   */
  public static function fromCode($code)
  {
      $code = (int) $code;

      if (0 > $code || 0x10ffff < $code) {
          return '';
      }

      if (0x80 > $code) {     // 0xxxxxxx
          return chr($code);
      }

      if (0x800 > $code) {    // 110xxxxx 10xxxxxx
          return
              chr(0xc0 | ($code >> 6)) .
              chr(0x80 | ($code & 0x3f));
      }

      if (0x10000 > $code) {  // 1110xxxx 10xxxxxx 10xxxxxx
          return
              chr(0xe0 | ($code >> 12)) .
              chr(0x80 | (($code >> 6) & 0x3f)) .
              chr(0x80 | ($code & 0x3f));
      }

      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return
          chr(0xf0 | ($code >> 18)) .
          chr(0x80 | (($code >> 12) & 0x3f)) .
          chr(0x80 | (($code >> 6) & 0x3f)) .
          chr(0x80 | ($code & 0x3f));
  }
  /**
   * Get the code point of a specific character.
   * Only the first character of $char is considered. An empty string gives
   * 0, and the missing bytes of a truncated sequence are read as zeros.
   *
   * @param   string  $char    Character.
   * @return  int
   */
  public static function toCode($char)
  {
      $char = (string) $char;

      // Nothing to decode (and no byte to read).
      if ('' === $char) {
          return 0;
      }

      $code  = ord($char[0]);
      $bytes = 1;

      if (!($code & 0x80)) { // 0xxxxxxx
          return $code;
      }

      if (($code & 0xe0) === 0xc0) { // 110xxxxx
          $bytes = 2;
          $code  = $code & ~0xc0;
      } elseif (($code & 0xf0) === 0xe0) { // 1110xxxx
          $bytes = 3;
          $code  = $code & ~0xe0;
      } elseif (($code & 0xf8) === 0xf0) { // 11110xxx
          $bytes = 4;
          $code  = $code & ~0xf0;
      }

      for ($i = 1; $i < $bytes; ++$i) { // 10xxxxxx
          // Do not read past the end of a truncated sequence.
          $continuation = isset($char[$i]) ? ord($char[$i]) : 0;
          $code         = ($code << 6) + ($continuation & ~0x80);
      }

      return $code;
  }
  /**
   * Get a binary representation of a specific character: every byte is
   * written as 8 binary digits.
   *
   * @param   string  $char    Character.
   * @return  string    Null when $char is empty.
   */
  public static function toBinaryCode($char)
  {
      $char = (string) $char;
      $out  = null;

      for ($i = 0, $max = strlen($char); $i < $max; ++$i) {
          // sprintf(), not vsprintf(): the latter requires an array of
          // arguments and rejects a bare integer on PHP 8.
          $out .= sprintf('%08b', ord($char[$i]));
      }

      return $out;
  }
  /**
   * Convert a string from one encoding to another, with iconv().
   *
   * @param   string  $string    String.
   * @param   string  $from      Original encoding.
   * @param   string  $to        Final encoding.
   * @return  string
   * @throws  \Hoa\Ustring\Exception    When ext/iconv is not available.
   */
  public static function transcode($string, $from, $to = 'UTF-8')
  {
      if (true === static::checkIconv()) {
          return iconv($from, $to, $string);
      }

      throw new Exception(
          '%s needs the iconv extension.',
          2,
          __CLASS__
      );
  }
  /**
   * Check if a string is encoded in UTF-8.
   * The check relies on PCRE: an empty pattern with the “u” option only
   * matches when the subject is valid UTF-8.
   *
   * @param   string  $string    String.
   * @return  bool
   */
  public static function isUtf8($string)
  {
      $matched = preg_match('##u', $string);

      return 1 === $matched;
  }
  /**
   * Get a clone of the current object.
   *
   * @return  \Hoa\Ustring
   */
  public function copy()
  {
      $duplicate = clone $this;

      return $duplicate;
  }
  /**
   * Transform the object as a string.
   *
   * @return  string
   */
  public function __toString()
  {
      // The internal string is null until something is appended, and
      // __toString() is required to return a real string.
      return (string) $this->_string;
  }
  924. }
  /**
   * Flex entity.
   * NOTE(review): presumably makes the class reachable under a shorter
   * name; confirm against Hoa\Consistency.
   */
  Consistency::flexEntity('Hoa\Ustring\Ustring');

  // This library cannot work without ext/mbstring: fail as soon as the file
  // is loaded.
  if (!Ustring::checkMbString()) {
      throw new Exception(
          '%s needs the mbstring extension.',
          0,
          __NAMESPACE__ . '\Ustring'
      );
  }