Character.java 137 KB


  1. /* java.lang.Character -- Wrapper class for char, and Unicode subsets
  2. Copyright (C) 1998, 1999, 2001, 2002, 2004, 2005 Free Software Foundation, Inc.
  3. This file is part of GNU Classpath.
  4. GNU Classpath is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2, or (at your option)
  7. any later version.
  8. GNU Classpath is distributed in the hope that it will be useful, but
  9. WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with GNU Classpath; see the file COPYING. If not, write to the
  14. Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  15. 02110-1301 USA.
  16. Linking this library statically or dynamically with other modules is
  17. making a combined work based on this library. Thus, the terms and
  18. conditions of the GNU General Public License cover the whole
  19. combination.
  20. As a special exception, the copyright holders of this library give you
  21. permission to link this library with independent modules to produce an
  22. executable, regardless of the license terms of these independent
  23. modules, and to copy and distribute the resulting executable under
  24. terms of your choice, provided that you also meet, for each linked
  25. independent module, the terms and conditions of the license of that
  26. module. An independent module is a module which is not derived from
  27. or based on this library. If you modify this library, you may extend
  28. this exception to your version of the library, but you are not
  29. obligated to do so. If you do not wish to do so, delete this
  30. exception statement from your version. */
  31. package java.lang;
  32. import gnu.java.lang.CharData;
  33. import java.io.Serializable;
  34. import java.text.Collator;
  35. import java.util.Locale;
  36. /**
  37. * Wrapper class for the primitive char data type. In addition, this class
  38. * allows one to retrieve property information and perform transformations
  39. * on the defined characters in the Unicode Standard, Version 4.0.0.
  40. * java.lang.Character is designed to be very dynamic, and as such, it
  41. * retrieves information on the Unicode character set from a separate
  42. * database, gnu.java.lang.CharData, which can be easily upgraded.
  43. *
  44. * <p>For predicates, boundaries are used to describe
  45. * the set of characters for which the method will return true.
  46. * This syntax uses fairly normal regular expression notation.
  47. * See 5.13 of the Unicode Standard, Version 4.0, for the
  48. * boundary specification.
  49. *
  50. * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a>
  51. * for more information on the Unicode Standard.
  52. *
  53. * @author Tom Tromey (tromey@cygnus.com)
  54. * @author Paul N. Fisher
  55. * @author Jochen Hoenicke
  56. * @author Eric Blake (ebb9@email.byu.edu)
  57. * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
  58. * @see CharData
  59. * @since 1.0
  60. * @status partly updated to 1.5; some things still missing
  61. */
  62. public final class Character implements Serializable, Comparable<Character>
  63. {
  64. /**
  65. * A subset of Unicode blocks.
  66. *
  67. * @author Paul N. Fisher
  68. * @author Eric Blake (ebb9@email.byu.edu)
  69. * @since 1.2
  70. */
  71. public static class Subset
  72. {
  73. /** The name of the subset. */
  74. private final String name;
  75. /**
  76. * Construct a new subset of characters.
  77. *
  78. * @param name the name of the subset
  79. * @throws NullPointerException if name is null
  80. */
  81. protected Subset(String name)
  82. {
  83. // Note that name.toString() is name, unless name was null.
  84. this.name = name.toString();
  85. }
  86. /**
  87. * Compares two Subsets for equality. This is <code>final</code>, and
  88. * restricts the comparison on the <code>==</code> operator, so it returns
  89. * true only for the same object.
  90. *
  91. * @param o the object to compare
  92. * @return true if o is this
  93. */
  94. public final boolean equals(Object o)
  95. {
  96. return o == this;
  97. }
  98. /**
  99. * Makes the original hashCode of Object final, to be consistent with
  100. * equals.
  101. *
  102. * @return the hash code for this object
  103. */
  104. public final int hashCode()
  105. {
  106. return super.hashCode();
  107. }
  108. /**
  109. * Returns the name of the subset.
  110. *
  111. * @return the name
  112. */
  113. public final String toString()
  114. {
  115. return name;
  116. }
  117. } // class Subset
  118. /**
  119. * A family of character subsets in the Unicode specification. A character
  120. * is in at most one of these blocks.
  121. *
  122. * This inner class was generated automatically from
  123. * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts.
  124. * This Unicode definition file can be found on the
  125. * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
  126. * JDK 1.5 uses Unicode version 4.0.0.
  127. *
  128. * @author scripts/unicode-blocks.pl (written by Eric Blake)
  129. * @since 1.2
  130. */
  131. public static final class UnicodeBlock extends Subset
  132. {
  133. /** The first code point of the block; <code>of(int)</code> treats it as inclusive. */
  134. private final int start;
  135. /** The last code point of the block; <code>of(int)</code> treats it as inclusive. */
  136. private final int end;
  137. /** The canonical name of the block according to the Unicode standard, e.g. "Basic Latin". */
  138. private final String canonicalName;
  139. /** How a name passed to <code>forName()</code> is spelled: canonical (contains spaces), canonical with the spaces removed, or the constant's identifier (contains underscores). */
  140. private enum NameType { CANONICAL, NO_SPACES, CONSTANT; }
  /**
   * Creates a block covering a fixed range of code points.  The
   * constructor is private, so instances can only be created from
   * within the enclosing class.
   *
   * @param start the first code point of the range
   * @param end the last code point of the range
   * @param name the block name, which becomes the <code>Subset</code> name
   * @param canonicalName the block's name as written in the Unicode
   *        standard
   */
  private UnicodeBlock(int start, int end, String name,
                       String canonicalName)
  {
    super(name);
    // The three assignments are independent of one another.
    this.canonicalName = canonicalName;
    this.end = end;
    this.start = start;
  }
  158. /**
  159. * Returns the Unicode character block which a character belongs to.
  160. * <strong>Note</strong>: This method does not support the use of
  161. * supplementary characters. For such support, <code>of(int)</code>
  162. * should be used instead.
  163. *
  164. * @param ch the character to look up
  165. * @return the set it belongs to, or null if it is not in one
  166. */
  167. public static UnicodeBlock of(char ch)
  168. {
  169. return of((int) ch);
  170. }
  171. /**
  172. * Returns the Unicode character block which a code point belongs to.
  173. *
  174. * @param codePoint the character to look up
  175. * @return the set it belongs to, or null if it is not in one.
  176. * @throws IllegalArgumentException if the specified code point is
  177. * invalid.
  178. * @since 1.5
  179. */
  180. public static UnicodeBlock of(int codePoint)
  181. {
  182. if (codePoint > MAX_CODE_POINT)
  183. throw new IllegalArgumentException("The supplied integer value is " +
  184. "too large to be a codepoint.");
  185. // Simple binary search for the correct block.
  186. int low = 0;
  187. int hi = sets.length - 1;
  188. while (low <= hi)
  189. {
  190. int mid = (low + hi) >> 1;
  191. UnicodeBlock b = sets[mid];
  192. if (codePoint < b.start)
  193. hi = mid - 1;
  194. else if (codePoint > b.end)
  195. low = mid + 1;
  196. else
  197. return b;
  198. }
  199. return null;
  200. }
  201. /**
  202. * <p>
  203. * Returns the <code>UnicodeBlock</code> with the given name, as defined
  204. * by the Unicode standard. The version of Unicode in use is defined by
  205. * the <code>Character</code> class, and the names are given in the
  206. * <code>Blocks-<version>.txt</code> file corresponding to that version.
  207. * The name may be specified in one of three ways:
  208. * </p>
  209. * <ol>
  210. * <li>The canonical, human-readable name used by the Unicode standard.
  211. * This is the name with all spaces and hyphens retained. For example,
  212. * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li>
  213. * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li>
  214. * <li>The name used for the constants specified by this class, which
  215. * is the canonical name with all spaces and hyphens replaced with
  216. * underscores e.g. `BASIC_LATIN'</li>
  217. * </ol>
  218. * <p>
  219. * The names are compared case-insensitively using the case comparison
  220. * associated with the U.S. English locale. The method recognises the
  221. * previous names used for blocks as well as the current ones. At
  222. * present, this simply means that the deprecated `SURROGATES_AREA'
  223. * will be recognised by this method (the <code>of()</code> methods
  224. * only return one of the three new surrogate blocks).
  225. * </p>
  226. *
  227. * @param blockName the name of the block to look up.
  228. * @return the specified block.
  229. * @throws NullPointerException if the <code>blockName</code> is
  230. * <code>null</code>.
  231. * @throws IllegalArgumentException if the name does not match any Unicode
  232. * block.
  233. * @since 1.5
  234. */
  235. public static final UnicodeBlock forName(String blockName)
  236. {
  237. NameType type;
  238. if (blockName.indexOf(' ') != -1)
  239. type = NameType.CANONICAL;
  240. else if (blockName.indexOf('_') != -1)
  241. type = NameType.CONSTANT;
  242. else
  243. type = NameType.NO_SPACES;
  244. Collator usCollator = Collator.getInstance(Locale.US);
  245. usCollator.setStrength(Collator.PRIMARY);
  246. /* Special case for deprecated blocks not in sets */
  247. switch (type)
  248. {
  249. case CANONICAL:
  250. if (usCollator.compare(blockName, "Surrogates Area") == 0)
  251. return SURROGATES_AREA;
  252. break;
  253. case NO_SPACES:
  254. if (usCollator.compare(blockName, "SurrogatesArea") == 0)
  255. return SURROGATES_AREA;
  256. break;
  257. case CONSTANT:
  258. if (usCollator.compare(blockName, "SURROGATES_AREA") == 0)
  259. return SURROGATES_AREA;
  260. break;
  261. }
  262. /* Other cases */
  263. switch (type)
  264. {
  265. case CANONICAL:
  266. for (UnicodeBlock block : sets)
  267. if (usCollator.compare(blockName, block.canonicalName) == 0)
  268. return block;
  269. break;
  270. case NO_SPACES:
  271. for (UnicodeBlock block : sets)
  272. {
  273. String nsName = block.canonicalName.replaceAll(" ","");
  274. if (usCollator.compare(blockName, nsName) == 0)
  275. return block;
  276. }
  277. break;
  278. case CONSTANT:
  279. for (UnicodeBlock block : sets)
  280. if (usCollator.compare(blockName, block.toString()) == 0)
  281. return block;
  282. break;
  283. }
  284. throw new IllegalArgumentException("No Unicode block found for " +
  285. blockName + ".");
  286. }
  287. /**
  288. * Basic Latin.
  289. * 0x0000 - 0x007F.
  290. */
  291. public static final UnicodeBlock BASIC_LATIN
  292. = new UnicodeBlock(0x0000, 0x007F,
  293. "BASIC_LATIN",
  294. "Basic Latin");
  295. /**
  296. * Latin-1 Supplement.
  297. * 0x0080 - 0x00FF.
  298. */
  299. public static final UnicodeBlock LATIN_1_SUPPLEMENT
  300. = new UnicodeBlock(0x0080, 0x00FF,
  301. "LATIN_1_SUPPLEMENT",
  302. "Latin-1 Supplement");
  303. /**
  304. * Latin Extended-A.
  305. * 0x0100 - 0x017F.
  306. */
  307. public static final UnicodeBlock LATIN_EXTENDED_A
  308. = new UnicodeBlock(0x0100, 0x017F,
  309. "LATIN_EXTENDED_A",
  310. "Latin Extended-A");
  311. /**
  312. * Latin Extended-B.
  313. * 0x0180 - 0x024F.
  314. */
  315. public static final UnicodeBlock LATIN_EXTENDED_B
  316. = new UnicodeBlock(0x0180, 0x024F,
  317. "LATIN_EXTENDED_B",
  318. "Latin Extended-B");
  319. /**
  320. * IPA Extensions.
  321. * 0x0250 - 0x02AF.
  322. */
  323. public static final UnicodeBlock IPA_EXTENSIONS
  324. = new UnicodeBlock(0x0250, 0x02AF,
  325. "IPA_EXTENSIONS",
  326. "IPA Extensions");
  327. /**
  328. * Spacing Modifier Letters.
  329. * 0x02B0 - 0x02FF.
  330. */
  331. public static final UnicodeBlock SPACING_MODIFIER_LETTERS
  332. = new UnicodeBlock(0x02B0, 0x02FF,
  333. "SPACING_MODIFIER_LETTERS",
  334. "Spacing Modifier Letters");
  335. /**
  336. * Combining Diacritical Marks.
  337. * 0x0300 - 0x036F.
  338. */
  339. public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
  340. = new UnicodeBlock(0x0300, 0x036F,
  341. "COMBINING_DIACRITICAL_MARKS",
  342. "Combining Diacritical Marks");
  343. /**
  344. * Greek.
  345. * 0x0370 - 0x03FF.
  346. */
  347. public static final UnicodeBlock GREEK
  348. = new UnicodeBlock(0x0370, 0x03FF,
  349. "GREEK",
  350. "Greek");
  351. /**
  352. * Cyrillic.
  353. * 0x0400 - 0x04FF.
  354. */
  355. public static final UnicodeBlock CYRILLIC
  356. = new UnicodeBlock(0x0400, 0x04FF,
  357. "CYRILLIC",
  358. "Cyrillic");
  359. /**
  360. * Cyrillic Supplementary.
  361. * 0x0500 - 0x052F.
  362. * @since 1.5
  363. */
  364. public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
  365. = new UnicodeBlock(0x0500, 0x052F,
  366. "CYRILLIC_SUPPLEMENTARY",
  367. "Cyrillic Supplementary");
  368. /**
  369. * Armenian.
  370. * 0x0530 - 0x058F.
  371. */
  372. public static final UnicodeBlock ARMENIAN
  373. = new UnicodeBlock(0x0530, 0x058F,
  374. "ARMENIAN",
  375. "Armenian");
  376. /**
  377. * Hebrew.
  378. * 0x0590 - 0x05FF.
  379. */
  380. public static final UnicodeBlock HEBREW
  381. = new UnicodeBlock(0x0590, 0x05FF,
  382. "HEBREW",
  383. "Hebrew");
  384. /**
  385. * Arabic.
  386. * 0x0600 - 0x06FF.
  387. */
  388. public static final UnicodeBlock ARABIC
  389. = new UnicodeBlock(0x0600, 0x06FF,
  390. "ARABIC",
  391. "Arabic");
  392. /**
  393. * Syriac.
  394. * 0x0700 - 0x074F.
  395. * @since 1.4
  396. */
  397. public static final UnicodeBlock SYRIAC
  398. = new UnicodeBlock(0x0700, 0x074F,
  399. "SYRIAC",
  400. "Syriac");
  401. /**
  402. * Thaana.
  403. * 0x0780 - 0x07BF.
  404. * @since 1.4
  405. */
  406. public static final UnicodeBlock THAANA
  407. = new UnicodeBlock(0x0780, 0x07BF,
  408. "THAANA",
  409. "Thaana");
  410. /**
  411. * Devanagari.
  412. * 0x0900 - 0x097F.
  413. */
  414. public static final UnicodeBlock DEVANAGARI
  415. = new UnicodeBlock(0x0900, 0x097F,
  416. "DEVANAGARI",
  417. "Devanagari");
  418. /**
  419. * Bengali.
  420. * 0x0980 - 0x09FF.
  421. */
  422. public static final UnicodeBlock BENGALI
  423. = new UnicodeBlock(0x0980, 0x09FF,
  424. "BENGALI",
  425. "Bengali");
  426. /**
  427. * Gurmukhi.
  428. * 0x0A00 - 0x0A7F.
  429. */
  430. public static final UnicodeBlock GURMUKHI
  431. = new UnicodeBlock(0x0A00, 0x0A7F,
  432. "GURMUKHI",
  433. "Gurmukhi");
  434. /**
  435. * Gujarati.
  436. * 0x0A80 - 0x0AFF.
  437. */
  438. public static final UnicodeBlock GUJARATI
  439. = new UnicodeBlock(0x0A80, 0x0AFF,
  440. "GUJARATI",
  441. "Gujarati");
  442. /**
  443. * Oriya.
  444. * 0x0B00 - 0x0B7F.
  445. */
  446. public static final UnicodeBlock ORIYA
  447. = new UnicodeBlock(0x0B00, 0x0B7F,
  448. "ORIYA",
  449. "Oriya");
  450. /**
  451. * Tamil.
  452. * 0x0B80 - 0x0BFF.
  453. */
  454. public static final UnicodeBlock TAMIL
  455. = new UnicodeBlock(0x0B80, 0x0BFF,
  456. "TAMIL",
  457. "Tamil");
  458. /**
  459. * Telugu.
  460. * 0x0C00 - 0x0C7F.
  461. */
  462. public static final UnicodeBlock TELUGU
  463. = new UnicodeBlock(0x0C00, 0x0C7F,
  464. "TELUGU",
  465. "Telugu");
  466. /**
  467. * Kannada.
  468. * 0x0C80 - 0x0CFF.
  469. */
  470. public static final UnicodeBlock KANNADA
  471. = new UnicodeBlock(0x0C80, 0x0CFF,
  472. "KANNADA",
  473. "Kannada");
  474. /**
  475. * Malayalam.
  476. * 0x0D00 - 0x0D7F.
  477. */
  478. public static final UnicodeBlock MALAYALAM
  479. = new UnicodeBlock(0x0D00, 0x0D7F,
  480. "MALAYALAM",
  481. "Malayalam");
  482. /**
  483. * Sinhala.
  484. * 0x0D80 - 0x0DFF.
  485. * @since 1.4
  486. */
  487. public static final UnicodeBlock SINHALA
  488. = new UnicodeBlock(0x0D80, 0x0DFF,
  489. "SINHALA",
  490. "Sinhala");
  491. /**
  492. * Thai.
  493. * 0x0E00 - 0x0E7F.
  494. */
  495. public static final UnicodeBlock THAI
  496. = new UnicodeBlock(0x0E00, 0x0E7F,
  497. "THAI",
  498. "Thai");
  499. /**
  500. * Lao.
  501. * 0x0E80 - 0x0EFF.
  502. */
  503. public static final UnicodeBlock LAO
  504. = new UnicodeBlock(0x0E80, 0x0EFF,
  505. "LAO",
  506. "Lao");
  507. /**
  508. * Tibetan.
  509. * 0x0F00 - 0x0FFF.
  510. */
  511. public static final UnicodeBlock TIBETAN
  512. = new UnicodeBlock(0x0F00, 0x0FFF,
  513. "TIBETAN",
  514. "Tibetan");
  515. /**
  516. * Myanmar.
  517. * 0x1000 - 0x109F.
  518. * @since 1.4
  519. */
  520. public static final UnicodeBlock MYANMAR
  521. = new UnicodeBlock(0x1000, 0x109F,
  522. "MYANMAR",
  523. "Myanmar");
  524. /**
  525. * Georgian.
  526. * 0x10A0 - 0x10FF.
  527. */
  528. public static final UnicodeBlock GEORGIAN
  529. = new UnicodeBlock(0x10A0, 0x10FF,
  530. "GEORGIAN",
  531. "Georgian");
  532. /**
  533. * Hangul Jamo.
  534. * 0x1100 - 0x11FF.
  535. */
  536. public static final UnicodeBlock HANGUL_JAMO
  537. = new UnicodeBlock(0x1100, 0x11FF,
  538. "HANGUL_JAMO",
  539. "Hangul Jamo");
  540. /**
  541. * Ethiopic.
  542. * 0x1200 - 0x137F.
  543. * @since 1.4
  544. */
  545. public static final UnicodeBlock ETHIOPIC
  546. = new UnicodeBlock(0x1200, 0x137F,
  547. "ETHIOPIC",
  548. "Ethiopic");
  549. /**
  550. * Cherokee.
  551. * 0x13A0 - 0x13FF.
  552. * @since 1.4
  553. */
  554. public static final UnicodeBlock CHEROKEE
  555. = new UnicodeBlock(0x13A0, 0x13FF,
  556. "CHEROKEE",
  557. "Cherokee");
  558. /**
  559. * Unified Canadian Aboriginal Syllabics.
  560. * 0x1400 - 0x167F.
  561. * @since 1.4
  562. */
  563. public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
  564. = new UnicodeBlock(0x1400, 0x167F,
  565. "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
  566. "Unified Canadian Aboriginal Syllabics");
  567. /**
  568. * Ogham.
  569. * 0x1680 - 0x169F.
  570. * @since 1.4
  571. */
  572. public static final UnicodeBlock OGHAM
  573. = new UnicodeBlock(0x1680, 0x169F,
  574. "OGHAM",
  575. "Ogham");
  576. /**
  577. * Runic.
  578. * 0x16A0 - 0x16FF.
  579. * @since 1.4
  580. */
  581. public static final UnicodeBlock RUNIC
  582. = new UnicodeBlock(0x16A0, 0x16FF,
  583. "RUNIC",
  584. "Runic");
  585. /**
  586. * Tagalog.
  587. * 0x1700 - 0x171F.
  588. * @since 1.5
  589. */
  590. public static final UnicodeBlock TAGALOG
  591. = new UnicodeBlock(0x1700, 0x171F,
  592. "TAGALOG",
  593. "Tagalog");
  594. /**
  595. * Hanunoo.
  596. * 0x1720 - 0x173F.
  597. * @since 1.5
  598. */
  599. public static final UnicodeBlock HANUNOO
  600. = new UnicodeBlock(0x1720, 0x173F,
  601. "HANUNOO",
  602. "Hanunoo");
  603. /**
  604. * Buhid.
  605. * 0x1740 - 0x175F.
  606. * @since 1.5
  607. */
  608. public static final UnicodeBlock BUHID
  609. = new UnicodeBlock(0x1740, 0x175F,
  610. "BUHID",
  611. "Buhid");
  612. /**
  613. * Tagbanwa.
  614. * 0x1760 - 0x177F.
  615. * @since 1.5
  616. */
  617. public static final UnicodeBlock TAGBANWA
  618. = new UnicodeBlock(0x1760, 0x177F,
  619. "TAGBANWA",
  620. "Tagbanwa");
  621. /**
  622. * Khmer.
  623. * 0x1780 - 0x17FF.
  624. * @since 1.4
  625. */
  626. public static final UnicodeBlock KHMER
  627. = new UnicodeBlock(0x1780, 0x17FF,
  628. "KHMER",
  629. "Khmer");
  630. /**
  631. * Mongolian.
  632. * 0x1800 - 0x18AF.
  633. * @since 1.4
  634. */
  635. public static final UnicodeBlock MONGOLIAN
  636. = new UnicodeBlock(0x1800, 0x18AF,
  637. "MONGOLIAN",
  638. "Mongolian");
  639. /**
  640. * Limbu.
  641. * 0x1900 - 0x194F.
  642. * @since 1.5
  643. */
  644. public static final UnicodeBlock LIMBU
  645. = new UnicodeBlock(0x1900, 0x194F,
  646. "LIMBU",
  647. "Limbu");
  648. /**
  649. * Tai Le.
  650. * 0x1950 - 0x197F.
  651. * @since 1.5
  652. */
  653. public static final UnicodeBlock TAI_LE
  654. = new UnicodeBlock(0x1950, 0x197F,
  655. "TAI_LE",
  656. "Tai Le");
  657. /**
  658. * Khmer Symbols.
  659. * 0x19E0 - 0x19FF.
  660. * @since 1.5
  661. */
  662. public static final UnicodeBlock KHMER_SYMBOLS
  663. = new UnicodeBlock(0x19E0, 0x19FF,
  664. "KHMER_SYMBOLS",
  665. "Khmer Symbols");
  666. /**
  667. * Phonetic Extensions.
  668. * 0x1D00 - 0x1D7F.
  669. * @since 1.5
  670. */
  671. public static final UnicodeBlock PHONETIC_EXTENSIONS
  672. = new UnicodeBlock(0x1D00, 0x1D7F,
  673. "PHONETIC_EXTENSIONS",
  674. "Phonetic Extensions");
  675. /**
  676. * Latin Extended Additional.
  677. * 0x1E00 - 0x1EFF.
  678. */
  679. public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
  680. = new UnicodeBlock(0x1E00, 0x1EFF,
  681. "LATIN_EXTENDED_ADDITIONAL",
  682. "Latin Extended Additional");
  683. /**
  684. * Greek Extended.
  685. * 0x1F00 - 0x1FFF.
  686. */
  687. public static final UnicodeBlock GREEK_EXTENDED
  688. = new UnicodeBlock(0x1F00, 0x1FFF,
  689. "GREEK_EXTENDED",
  690. "Greek Extended");
  691. /**
  692. * General Punctuation.
  693. * 0x2000 - 0x206F.
  694. */
  695. public static final UnicodeBlock GENERAL_PUNCTUATION
  696. = new UnicodeBlock(0x2000, 0x206F,
  697. "GENERAL_PUNCTUATION",
  698. "General Punctuation");
  699. /**
  700. * Superscripts and Subscripts.
  701. * 0x2070 - 0x209F.
  702. */
  703. public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
  704. = new UnicodeBlock(0x2070, 0x209F,
  705. "SUPERSCRIPTS_AND_SUBSCRIPTS",
  706. "Superscripts and Subscripts");
  707. /**
  708. * Currency Symbols.
  709. * 0x20A0 - 0x20CF.
  710. */
  711. public static final UnicodeBlock CURRENCY_SYMBOLS
  712. = new UnicodeBlock(0x20A0, 0x20CF,
  713. "CURRENCY_SYMBOLS",
  714. "Currency Symbols");
  715. /**
  716. * Combining Marks for Symbols.
  717. * 0x20D0 - 0x20FF.
  718. */
  719. public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
  720. = new UnicodeBlock(0x20D0, 0x20FF,
  721. "COMBINING_MARKS_FOR_SYMBOLS",
  722. "Combining Marks for Symbols");
  723. /**
  724. * Letterlike Symbols.
  725. * 0x2100 - 0x214F.
  726. */
  727. public static final UnicodeBlock LETTERLIKE_SYMBOLS
  728. = new UnicodeBlock(0x2100, 0x214F,
  729. "LETTERLIKE_SYMBOLS",
  730. "Letterlike Symbols");
  731. /**
  732. * Number Forms.
  733. * 0x2150 - 0x218F.
  734. */
  735. public static final UnicodeBlock NUMBER_FORMS
  736. = new UnicodeBlock(0x2150, 0x218F,
  737. "NUMBER_FORMS",
  738. "Number Forms");
  739. /**
  740. * Arrows.
  741. * 0x2190 - 0x21FF.
  742. */
  743. public static final UnicodeBlock ARROWS
  744. = new UnicodeBlock(0x2190, 0x21FF,
  745. "ARROWS",
  746. "Arrows");
  747. /**
  748. * Mathematical Operators.
  749. * 0x2200 - 0x22FF.
  750. */
  751. public static final UnicodeBlock MATHEMATICAL_OPERATORS
  752. = new UnicodeBlock(0x2200, 0x22FF,
  753. "MATHEMATICAL_OPERATORS",
  754. "Mathematical Operators");
  755. /**
  756. * Miscellaneous Technical.
  757. * 0x2300 - 0x23FF.
  758. */
  759. public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
  760. = new UnicodeBlock(0x2300, 0x23FF,
  761. "MISCELLANEOUS_TECHNICAL",
  762. "Miscellaneous Technical");
  763. /**
  764. * Control Pictures.
  765. * 0x2400 - 0x243F.
  766. */
  767. public static final UnicodeBlock CONTROL_PICTURES
  768. = new UnicodeBlock(0x2400, 0x243F,
  769. "CONTROL_PICTURES",
  770. "Control Pictures");
  771. /**
  772. * Optical Character Recognition.
  773. * 0x2440 - 0x245F.
  774. */
  775. public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
  776. = new UnicodeBlock(0x2440, 0x245F,
  777. "OPTICAL_CHARACTER_RECOGNITION",
  778. "Optical Character Recognition");
  779. /**
  780. * Enclosed Alphanumerics.
  781. * 0x2460 - 0x24FF.
  782. */
  783. public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
  784. = new UnicodeBlock(0x2460, 0x24FF,
  785. "ENCLOSED_ALPHANUMERICS",
  786. "Enclosed Alphanumerics");
  787. /**
  788. * Box Drawing.
  789. * 0x2500 - 0x257F.
  790. */
  791. public static final UnicodeBlock BOX_DRAWING
  792. = new UnicodeBlock(0x2500, 0x257F,
  793. "BOX_DRAWING",
  794. "Box Drawing");
  795. /**
  796. * Block Elements.
  797. * 0x2580 - 0x259F.
  798. */
  799. public static final UnicodeBlock BLOCK_ELEMENTS
  800. = new UnicodeBlock(0x2580, 0x259F,
  801. "BLOCK_ELEMENTS",
  802. "Block Elements");
  803. /**
  804. * Geometric Shapes.
  805. * 0x25A0 - 0x25FF.
  806. */
  807. public static final UnicodeBlock GEOMETRIC_SHAPES
  808. = new UnicodeBlock(0x25A0, 0x25FF,
  809. "GEOMETRIC_SHAPES",
  810. "Geometric Shapes");
  811. /**
  812. * Miscellaneous Symbols.
  813. * 0x2600 - 0x26FF.
  814. */
  815. public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
  816. = new UnicodeBlock(0x2600, 0x26FF,
  817. "MISCELLANEOUS_SYMBOLS",
  818. "Miscellaneous Symbols");
  819. /**
  820. * Dingbats.
  821. * 0x2700 - 0x27BF.
  822. */
  823. public static final UnicodeBlock DINGBATS
  824. = new UnicodeBlock(0x2700, 0x27BF,
  825. "DINGBATS",
  826. "Dingbats");
  827. /**
  828. * Miscellaneous Mathematical Symbols-A.
  829. * 0x27C0 - 0x27EF.
  830. * @since 1.5
  831. */
  832. public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
  833. = new UnicodeBlock(0x27C0, 0x27EF,
  834. "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
  835. "Miscellaneous Mathematical Symbols-A");
  836. /**
  837. * Supplemental Arrows-A.
  838. * 0x27F0 - 0x27FF.
  839. * @since 1.5
  840. */
  841. public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
  842. = new UnicodeBlock(0x27F0, 0x27FF,
  843. "SUPPLEMENTAL_ARROWS_A",
  844. "Supplemental Arrows-A");
  845. /**
  846. * Braille Patterns.
  847. * 0x2800 - 0x28FF.
  848. * @since 1.4
  849. */
  850. public static final UnicodeBlock BRAILLE_PATTERNS
  851. = new UnicodeBlock(0x2800, 0x28FF,
  852. "BRAILLE_PATTERNS",
  853. "Braille Patterns");
  854. /**
  855. * Supplemental Arrows-B.
  856. * 0x2900 - 0x297F.
  857. * @since 1.5
  858. */
  859. public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
  860. = new UnicodeBlock(0x2900, 0x297F,
  861. "SUPPLEMENTAL_ARROWS_B",
  862. "Supplemental Arrows-B");
  863. /**
  864. * Miscellaneous Mathematical Symbols-B.
  865. * 0x2980 - 0x29FF.
  866. * @since 1.5
  867. */
  868. public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
  869. = new UnicodeBlock(0x2980, 0x29FF,
  870. "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
  871. "Miscellaneous Mathematical Symbols-B");
  872. /**
  873. * Supplemental Mathematical Operators.
  874. * 0x2A00 - 0x2AFF.
  875. * @since 1.5
  876. */
  877. public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
  878. = new UnicodeBlock(0x2A00, 0x2AFF,
  879. "SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
  880. "Supplemental Mathematical Operators");
  881. /**
  882. * Miscellaneous Symbols and Arrows.
  883. * 0x2B00 - 0x2BFF.
  884. * @since 1.5
  885. */
  886. public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
  887. = new UnicodeBlock(0x2B00, 0x2BFF,
  888. "MISCELLANEOUS_SYMBOLS_AND_ARROWS",
  889. "Miscellaneous Symbols and Arrows");
  890. /**
  891. * CJK Radicals Supplement.
  892. * 0x2E80 - 0x2EFF.
  893. * @since 1.4
  894. */
  895. public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
  896. = new UnicodeBlock(0x2E80, 0x2EFF,
  897. "CJK_RADICALS_SUPPLEMENT",
  898. "CJK Radicals Supplement");
  899. /**
  900. * Kangxi Radicals.
  901. * 0x2F00 - 0x2FDF.
  902. * @since 1.4
  903. */
  904. public static final UnicodeBlock KANGXI_RADICALS
  905. = new UnicodeBlock(0x2F00, 0x2FDF,
  906. "KANGXI_RADICALS",
  907. "Kangxi Radicals");
  908. /**
  909. * Ideographic Description Characters.
  910. * 0x2FF0 - 0x2FFF.
  911. * @since 1.4
  912. */
  913. public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
  914. = new UnicodeBlock(0x2FF0, 0x2FFF,
  915. "IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
  916. "Ideographic Description Characters");
  917. /**
  918. * CJK Symbols and Punctuation.
  919. * 0x3000 - 0x303F.
  920. */
  921. public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
  922. = new UnicodeBlock(0x3000, 0x303F,
  923. "CJK_SYMBOLS_AND_PUNCTUATION",
  924. "CJK Symbols and Punctuation");
  925. /**
  926. * Hiragana.
  927. * 0x3040 - 0x309F.
  928. */
  929. public static final UnicodeBlock HIRAGANA
  930. = new UnicodeBlock(0x3040, 0x309F,
  931. "HIRAGANA",
  932. "Hiragana");
  933. /**
  934. * Katakana.
  935. * 0x30A0 - 0x30FF.
  936. */
  937. public static final UnicodeBlock KATAKANA
  938. = new UnicodeBlock(0x30A0, 0x30FF,
  939. "KATAKANA",
  940. "Katakana");
  941. /**
  942. * Bopomofo.
  943. * 0x3100 - 0x312F.
  944. */
  945. public static final UnicodeBlock BOPOMOFO
  946. = new UnicodeBlock(0x3100, 0x312F,
  947. "BOPOMOFO",
  948. "Bopomofo");
  949. /**
  950. * Hangul Compatibility Jamo.
  951. * 0x3130 - 0x318F.
  952. */
  953. public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
  954. = new UnicodeBlock(0x3130, 0x318F,
  955. "HANGUL_COMPATIBILITY_JAMO",
  956. "Hangul Compatibility Jamo");
  957. /**
  958. * Kanbun.
  959. * 0x3190 - 0x319F.
  960. */
  961. public static final UnicodeBlock KANBUN
  962. = new UnicodeBlock(0x3190, 0x319F,
  963. "KANBUN",
  964. "Kanbun");
  965. /**
  966. * Bopomofo Extended.
  967. * 0x31A0 - 0x31BF.
  968. * @since 1.4
  969. */
  970. public static final UnicodeBlock BOPOMOFO_EXTENDED
  971. = new UnicodeBlock(0x31A0, 0x31BF,
  972. "BOPOMOFO_EXTENDED",
  973. "Bopomofo Extended");
  974. /**
  975. * Katakana Phonetic Extensions.
  976. * 0x31F0 - 0x31FF.
  977. * @since 1.5
  978. */
  979. public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
  980. = new UnicodeBlock(0x31F0, 0x31FF,
  981. "KATAKANA_PHONETIC_EXTENSIONS",
  982. "Katakana Phonetic Extensions");
  983. /**
  984. * Enclosed CJK Letters and Months.
  985. * 0x3200 - 0x32FF.
  986. */
  987. public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
  988. = new UnicodeBlock(0x3200, 0x32FF,
  989. "ENCLOSED_CJK_LETTERS_AND_MONTHS",
  990. "Enclosed CJK Letters and Months");
  991. /**
  992. * CJK Compatibility.
  993. * 0x3300 - 0x33FF.
  994. */
  995. public static final UnicodeBlock CJK_COMPATIBILITY
  996. = new UnicodeBlock(0x3300, 0x33FF,
  997. "CJK_COMPATIBILITY",
  998. "CJK Compatibility");
  999. /**
  1000. * CJK Unified Ideographs Extension A.
  1001. * 0x3400 - 0x4DBF.
  1002. * @since 1.4
  1003. */
  1004. public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
  1005. = new UnicodeBlock(0x3400, 0x4DBF,
  1006. "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
  1007. "CJK Unified Ideographs Extension A");
  1008. /**
  1009. * Yijing Hexagram Symbols.
  1010. * 0x4DC0 - 0x4DFF.
  1011. * @since 1.5
  1012. */
  1013. public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
  1014. = new UnicodeBlock(0x4DC0, 0x4DFF,
  1015. "YIJING_HEXAGRAM_SYMBOLS",
  1016. "Yijing Hexagram Symbols");
  1017. /**
  1018. * CJK Unified Ideographs.
  1019. * 0x4E00 - 0x9FFF.
  1020. */
  1021. public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
  1022. = new UnicodeBlock(0x4E00, 0x9FFF,
  1023. "CJK_UNIFIED_IDEOGRAPHS",
  1024. "CJK Unified Ideographs");
  1025. /**
  1026. * Yi Syllables.
  1027. * 0xA000 - 0xA48F.
  1028. * @since 1.4
  1029. */
  1030. public static final UnicodeBlock YI_SYLLABLES
  1031. = new UnicodeBlock(0xA000, 0xA48F,
  1032. "YI_SYLLABLES",
  1033. "Yi Syllables");
  1034. /**
  1035. * Yi Radicals.
  1036. * 0xA490 - 0xA4CF.
  1037. * @since 1.4
  1038. */
  1039. public static final UnicodeBlock YI_RADICALS
  1040. = new UnicodeBlock(0xA490, 0xA4CF,
  1041. "YI_RADICALS",
  1042. "Yi Radicals");
  1043. /**
  1044. * Hangul Syllables.
  1045. * 0xAC00 - 0xD7AF.
  1046. */
  1047. public static final UnicodeBlock HANGUL_SYLLABLES
  1048. = new UnicodeBlock(0xAC00, 0xD7AF,
  1049. "HANGUL_SYLLABLES",
  1050. "Hangul Syllables");
  1051. /**
  1052. * High Surrogates.
  1053. * 0xD800 - 0xDB7F.
  1054. * @since 1.5
  1055. */
  1056. public static final UnicodeBlock HIGH_SURROGATES
  1057. = new UnicodeBlock(0xD800, 0xDB7F,
  1058. "HIGH_SURROGATES",
  1059. "High Surrogates");
  1060. /**
  1061. * High Private Use Surrogates.
  1062. * 0xDB80 - 0xDBFF.
  1063. * @since 1.5
  1064. */
  1065. public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
  1066. = new UnicodeBlock(0xDB80, 0xDBFF,
  1067. "HIGH_PRIVATE_USE_SURROGATES",
  1068. "High Private Use Surrogates");
  1069. /**
  1070. * Low Surrogates.
  1071. * 0xDC00 - 0xDFFF.
  1072. * @since 1.5
  1073. */
  1074. public static final UnicodeBlock LOW_SURROGATES
  1075. = new UnicodeBlock(0xDC00, 0xDFFF,
  1076. "LOW_SURROGATES",
  1077. "Low Surrogates");
  1078. /**
  1079. * Private Use Area.
  1080. * 0xE000 - 0xF8FF.
  1081. */
  1082. public static final UnicodeBlock PRIVATE_USE_AREA
  1083. = new UnicodeBlock(0xE000, 0xF8FF,
  1084. "PRIVATE_USE_AREA",
  1085. "Private Use Area");
  1086. /**
  1087. * CJK Compatibility Ideographs.
  1088. * 0xF900 - 0xFAFF.
  1089. */
  1090. public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
  1091. = new UnicodeBlock(0xF900, 0xFAFF,
  1092. "CJK_COMPATIBILITY_IDEOGRAPHS",
  1093. "CJK Compatibility Ideographs");
  1094. /**
  1095. * Alphabetic Presentation Forms.
  1096. * 0xFB00 - 0xFB4F.
  1097. */
  1098. public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
  1099. = new UnicodeBlock(0xFB00, 0xFB4F,
  1100. "ALPHABETIC_PRESENTATION_FORMS",
  1101. "Alphabetic Presentation Forms");
  1102. /**
  1103. * Arabic Presentation Forms-A.
  1104. * 0xFB50 - 0xFDFF.
  1105. */
  1106. public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
  1107. = new UnicodeBlock(0xFB50, 0xFDFF,
  1108. "ARABIC_PRESENTATION_FORMS_A",
  1109. "Arabic Presentation Forms-A");
  1110. /**
  1111. * Variation Selectors.
  1112. * 0xFE00 - 0xFE0F.
  1113. * @since 1.5
  1114. */
  1115. public static final UnicodeBlock VARIATION_SELECTORS
  1116. = new UnicodeBlock(0xFE00, 0xFE0F,
  1117. "VARIATION_SELECTORS",
  1118. "Variation Selectors");
  1119. /**
  1120. * Combining Half Marks.
  1121. * 0xFE20 - 0xFE2F.
  1122. */
  1123. public static final UnicodeBlock COMBINING_HALF_MARKS
  1124. = new UnicodeBlock(0xFE20, 0xFE2F,
  1125. "COMBINING_HALF_MARKS",
  1126. "Combining Half Marks");
  1127. /**
  1128. * CJK Compatibility Forms.
  1129. * 0xFE30 - 0xFE4F.
  1130. */
  1131. public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
  1132. = new UnicodeBlock(0xFE30, 0xFE4F,
  1133. "CJK_COMPATIBILITY_FORMS",
  1134. "CJK Compatibility Forms");
  1135. /**
  1136. * Small Form Variants.
  1137. * 0xFE50 - 0xFE6F.
  1138. */
  1139. public static final UnicodeBlock SMALL_FORM_VARIANTS
  1140. = new UnicodeBlock(0xFE50, 0xFE6F,
  1141. "SMALL_FORM_VARIANTS",
  1142. "Small Form Variants");
  1143. /**
  1144. * Arabic Presentation Forms-B.
  1145. * 0xFE70 - 0xFEFF.
  1146. */
  1147. public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
  1148. = new UnicodeBlock(0xFE70, 0xFEFF,
  1149. "ARABIC_PRESENTATION_FORMS_B",
  1150. "Arabic Presentation Forms-B");
  1151. /**
  1152. * Halfwidth and Fullwidth Forms.
  1153. * 0xFF00 - 0xFFEF.
  1154. */
  1155. public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
  1156. = new UnicodeBlock(0xFF00, 0xFFEF,
  1157. "HALFWIDTH_AND_FULLWIDTH_FORMS",
  1158. "Halfwidth and Fullwidth Forms");
  1159. /**
  1160. * Specials.
  1161. * 0xFFF0 - 0xFFFF.
  1162. */
  1163. public static final UnicodeBlock SPECIALS
  1164. = new UnicodeBlock(0xFFF0, 0xFFFF,
  1165. "SPECIALS",
  1166. "Specials");
  1167. /**
  1168. * Linear B Syllabary.
  1169. * 0x10000 - 0x1007F.
  1170. * @since 1.5
  1171. */
  1172. public static final UnicodeBlock LINEAR_B_SYLLABARY
  1173. = new UnicodeBlock(0x10000, 0x1007F,
  1174. "LINEAR_B_SYLLABARY",
  1175. "Linear B Syllabary");
  1176. /**
  1177. * Linear B Ideograms.
  1178. * 0x10080 - 0x100FF.
  1179. * @since 1.5
  1180. */
  1181. public static final UnicodeBlock LINEAR_B_IDEOGRAMS
  1182. = new UnicodeBlock(0x10080, 0x100FF,
  1183. "LINEAR_B_IDEOGRAMS",
  1184. "Linear B Ideograms");
  1185. /**
  1186. * Aegean Numbers.
  1187. * 0x10100 - 0x1013F.
  1188. * @since 1.5
  1189. */
  1190. public static final UnicodeBlock AEGEAN_NUMBERS
  1191. = new UnicodeBlock(0x10100, 0x1013F,
  1192. "AEGEAN_NUMBERS",
  1193. "Aegean Numbers");
  1194. /**
  1195. * Old Italic.
  1196. * 0x10300 - 0x1032F.
  1197. * @since 1.5
  1198. */
  1199. public static final UnicodeBlock OLD_ITALIC
  1200. = new UnicodeBlock(0x10300, 0x1032F,
  1201. "OLD_ITALIC",
  1202. "Old Italic");
  1203. /**
  1204. * Gothic.
  1205. * 0x10330 - 0x1034F.
  1206. * @since 1.5
  1207. */
  1208. public static final UnicodeBlock GOTHIC
  1209. = new UnicodeBlock(0x10330, 0x1034F,
  1210. "GOTHIC",
  1211. "Gothic");
  1212. /**
  1213. * Ugaritic.
  1214. * 0x10380 - 0x1039F.
  1215. * @since 1.5
  1216. */
  1217. public static final UnicodeBlock UGARITIC
  1218. = new UnicodeBlock(0x10380, 0x1039F,
  1219. "UGARITIC",
  1220. "Ugaritic");
  1221. /**
  1222. * Deseret.
  1223. * 0x10400 - 0x1044F.
  1224. * @since 1.5
  1225. */
  1226. public static final UnicodeBlock DESERET
  1227. = new UnicodeBlock(0x10400, 0x1044F,
  1228. "DESERET",
  1229. "Deseret");
  1230. /**
  1231. * Shavian.
  1232. * 0x10450 - 0x1047F.
  1233. * @since 1.5
  1234. */
  1235. public static final UnicodeBlock SHAVIAN
  1236. = new UnicodeBlock(0x10450, 0x1047F,
  1237. "SHAVIAN",
  1238. "Shavian");
  1239. /**
  1240. * Osmanya.
  1241. * 0x10480 - 0x104AF.
  1242. * @since 1.5
  1243. */
  1244. public static final UnicodeBlock OSMANYA
  1245. = new UnicodeBlock(0x10480, 0x104AF,
  1246. "OSMANYA",
  1247. "Osmanya");
  1248. /**
  1249. * Cypriot Syllabary.
  1250. * 0x10800 - 0x1083F.
  1251. * @since 1.5
  1252. */
  1253. public static final UnicodeBlock CYPRIOT_SYLLABARY
  1254. = new UnicodeBlock(0x10800, 0x1083F,
  1255. "CYPRIOT_SYLLABARY",
  1256. "Cypriot Syllabary");
  1257. /**
  1258. * Byzantine Musical Symbols.
  1259. * 0x1D000 - 0x1D0FF.
  1260. * @since 1.5
  1261. */
  1262. public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
  1263. = new UnicodeBlock(0x1D000, 0x1D0FF,
  1264. "BYZANTINE_MUSICAL_SYMBOLS",
  1265. "Byzantine Musical Symbols");
  1266. /**
  1267. * Musical Symbols.
  1268. * 0x1D100 - 0x1D1FF.
  1269. * @since 1.5
  1270. */
  1271. public static final UnicodeBlock MUSICAL_SYMBOLS
  1272. = new UnicodeBlock(0x1D100, 0x1D1FF,
  1273. "MUSICAL_SYMBOLS",
  1274. "Musical Symbols");
  1275. /**
  1276. * Tai Xuan Jing Symbols.
  1277. * 0x1D300 - 0x1D35F.
  1278. * @since 1.5
  1279. */
  1280. public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
  1281. = new UnicodeBlock(0x1D300, 0x1D35F,
  1282. "TAI_XUAN_JING_SYMBOLS",
  1283. "Tai Xuan Jing Symbols");
  1284. /**
  1285. * Mathematical Alphanumeric Symbols.
  1286. * 0x1D400 - 0x1D7FF.
  1287. * @since 1.5
  1288. */
  1289. public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
  1290. = new UnicodeBlock(0x1D400, 0x1D7FF,
  1291. "MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
  1292. "Mathematical Alphanumeric Symbols");
  1293. /**
  1294. * CJK Unified Ideographs Extension B.
  1295. * 0x20000 - 0x2A6DF.
  1296. * @since 1.5
  1297. */
  1298. public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
  1299. = new UnicodeBlock(0x20000, 0x2A6DF,
  1300. "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
  1301. "CJK Unified Ideographs Extension B");
  1302. /**
  1303. * CJK Compatibility Ideographs Supplement.
  1304. * 0x2F800 - 0x2FA1F.
  1305. * @since 1.5
  1306. */
  1307. public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
  1308. = new UnicodeBlock(0x2F800, 0x2FA1F,
  1309. "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
  1310. "CJK Compatibility Ideographs Supplement");
  1311. /**
  1312. * Tags.
  1313. * 0xE0000 - 0xE007F.
  1314. * @since 1.5
  1315. */
  1316. public static final UnicodeBlock TAGS
  1317. = new UnicodeBlock(0xE0000, 0xE007F,
  1318. "TAGS",
  1319. "Tags");
  1320. /**
  1321. * Variation Selectors Supplement.
  1322. * 0xE0100 - 0xE01EF.
  1323. * @since 1.5
  1324. */
  1325. public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
  1326. = new UnicodeBlock(0xE0100, 0xE01EF,
  1327. "VARIATION_SELECTORS_SUPPLEMENT",
  1328. "Variation Selectors Supplement");
  1329. /**
  1330. * Supplementary Private Use Area-A.
  1331. * 0xF0000 - 0xFFFFF.
  1332. * @since 1.5
  1333. */
  1334. public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
  1335. = new UnicodeBlock(0xF0000, 0xFFFFF,
  1336. "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
  1337. "Supplementary Private Use Area-A");
  1338. /**
  1339. * Supplementary Private Use Area-B.
  1340. * 0x100000 - 0x10FFFF.
  1341. * @since 1.5
  1342. */
  1343. public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
  1344. = new UnicodeBlock(0x100000, 0x10FFFF,
  1345. "SUPPLEMENTARY_PRIVATE_USE_AREA_B",
  1346. "Supplementary Private Use Area-B");
  1347. /**
  1348. * Surrogates Area.
  1349. * 0xD800 - 0xDFFF.
  1350. * @deprecated As of 1.5, the three areas,
  1351. * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>,
  1352. * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a>
  1353. * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined
  1354. * by the Unicode standard, should be used in preference to
  1355. * this. These are also returned from calls to <code>of(int)</code>
  1356. * and <code>of(char)</code>.
  1357. */
  1358. @Deprecated
  1359. public static final UnicodeBlock SURROGATES_AREA
  1360. = new UnicodeBlock(0xD800, 0xDFFF,
  1361. "SURROGATES_AREA",
  1362. "Surrogates Area");
  1363. /**
  1364. * The defined subsets.
  1365. */
  1366. private static final UnicodeBlock sets[] = {
  1367. BASIC_LATIN,
  1368. LATIN_1_SUPPLEMENT,
  1369. LATIN_EXTENDED_A,
  1370. LATIN_EXTENDED_B,
  1371. IPA_EXTENSIONS,
  1372. SPACING_MODIFIER_LETTERS,
  1373. COMBINING_DIACRITICAL_MARKS,
  1374. GREEK,
  1375. CYRILLIC,
  1376. CYRILLIC_SUPPLEMENTARY,
  1377. ARMENIAN,
  1378. HEBREW,
  1379. ARABIC,
  1380. SYRIAC,
  1381. THAANA,
  1382. DEVANAGARI,
  1383. BENGALI,
  1384. GURMUKHI,
  1385. GUJARATI,
  1386. ORIYA,
  1387. TAMIL,
  1388. TELUGU,
  1389. KANNADA,
  1390. MALAYALAM,
  1391. SINHALA,
  1392. THAI,
  1393. LAO,
  1394. TIBETAN,
  1395. MYANMAR,
  1396. GEORGIAN,
  1397. HANGUL_JAMO,
  1398. ETHIOPIC,
  1399. CHEROKEE,
  1400. UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
  1401. OGHAM,
  1402. RUNIC,
  1403. TAGALOG,
  1404. HANUNOO,
  1405. BUHID,
  1406. TAGBANWA,
  1407. KHMER,
  1408. MONGOLIAN,
  1409. LIMBU,
  1410. TAI_LE,
  1411. KHMER_SYMBOLS,
  1412. PHONETIC_EXTENSIONS,
  1413. LATIN_EXTENDED_ADDITIONAL,
  1414. GREEK_EXTENDED,
  1415. GENERAL_PUNCTUATION,
  1416. SUPERSCRIPTS_AND_SUBSCRIPTS,
  1417. CURRENCY_SYMBOLS,
  1418. COMBINING_MARKS_FOR_SYMBOLS,
  1419. LETTERLIKE_SYMBOLS,
  1420. NUMBER_FORMS,
  1421. ARROWS,
  1422. MATHEMATICAL_OPERATORS,
  1423. MISCELLANEOUS_TECHNICAL,
  1424. CONTROL_PICTURES,
  1425. OPTICAL_CHARACTER_RECOGNITION,
  1426. ENCLOSED_ALPHANUMERICS,
  1427. BOX_DRAWING,
  1428. BLOCK_ELEMENTS,
  1429. GEOMETRIC_SHAPES,
  1430. MISCELLANEOUS_SYMBOLS,
  1431. DINGBATS,
  1432. MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
  1433. SUPPLEMENTAL_ARROWS_A,
  1434. BRAILLE_PATTERNS,
  1435. SUPPLEMENTAL_ARROWS_B,
  1436. MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
  1437. SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
  1438. MISCELLANEOUS_SYMBOLS_AND_ARROWS,
  1439. CJK_RADICALS_SUPPLEMENT,
  1440. KANGXI_RADICALS,
  1441. IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
  1442. CJK_SYMBOLS_AND_PUNCTUATION,
  1443. HIRAGANA,
  1444. KATAKANA,
  1445. BOPOMOFO,
  1446. HANGUL_COMPATIBILITY_JAMO,
  1447. KANBUN,
  1448. BOPOMOFO_EXTENDED,
  1449. KATAKANA_PHONETIC_EXTENSIONS,
  1450. ENCLOSED_CJK_LETTERS_AND_MONTHS,
  1451. CJK_COMPATIBILITY,
  1452. CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
  1453. YIJING_HEXAGRAM_SYMBOLS,
  1454. CJK_UNIFIED_IDEOGRAPHS,
  1455. YI_SYLLABLES,
  1456. YI_RADICALS,
  1457. HANGUL_SYLLABLES,
  1458. HIGH_SURROGATES,
  1459. HIGH_PRIVATE_USE_SURROGATES,
  1460. LOW_SURROGATES,
  1461. PRIVATE_USE_AREA,
  1462. CJK_COMPATIBILITY_IDEOGRAPHS,
  1463. ALPHABETIC_PRESENTATION_FORMS,
  1464. ARABIC_PRESENTATION_FORMS_A,
  1465. VARIATION_SELECTORS,
  1466. COMBINING_HALF_MARKS,
  1467. CJK_COMPATIBILITY_FORMS,
  1468. SMALL_FORM_VARIANTS,
  1469. ARABIC_PRESENTATION_FORMS_B,
  1470. HALFWIDTH_AND_FULLWIDTH_FORMS,
  1471. SPECIALS,
  1472. LINEAR_B_SYLLABARY,
  1473. LINEAR_B_IDEOGRAMS,
  1474. AEGEAN_NUMBERS,
  1475. OLD_ITALIC,
  1476. GOTHIC,
  1477. UGARITIC,
  1478. DESERET,
  1479. SHAVIAN,
  1480. OSMANYA,
  1481. CYPRIOT_SYLLABARY,
  1482. BYZANTINE_MUSICAL_SYMBOLS,
  1483. MUSICAL_SYMBOLS,
  1484. TAI_XUAN_JING_SYMBOLS,
  1485. MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
  1486. CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
  1487. CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
  1488. TAGS,
  1489. VARIATION_SELECTORS_SUPPLEMENT,
  1490. SUPPLEMENTARY_PRIVATE_USE_AREA_A,
  1491. SUPPLEMENTARY_PRIVATE_USE_AREA_B,
  1492. };
  1493. } // class UnicodeBlock
  1494. /**
  1495. * A class to encompass all the properties of characters in the
  1496. * private use blocks in the Unicode standard. This class extends
  1497. * UnassignedCharacters because the return type from getType() is
  1498. * different.
  1499. * @author Anthony Balkissoon abalkiss at redhat dot com
  1500. *
  1501. */
  1502. private static class PrivateUseCharacters extends UnassignedCharacters
  1503. {
  1504. /**
  1505. * Returns the type of the character cp.
  1506. */
  1507. static int getType(int cp)
  1508. {
  1509. // The upper 2 code points in any plane are considered unassigned,
  1510. // even in the private-use planes.
  1511. if ((cp & 0xffff) >= 0xfffe)
  1512. return UnassignedCharacters.getType(cp);
  1513. return PRIVATE_USE;
  1514. }
  1515. /**
  1516. * Returns true if the character cp is defined.
  1517. */
  1518. static boolean isDefined(int cp)
  1519. {
  1520. // The upper 2 code points in any plane are considered unassigned,
  1521. // even in the private-use planes.
  1522. if ((cp & 0xffff) >= 0xfffe)
  1523. return UnassignedCharacters.isDefined(cp);
  1524. return true;
  1525. }
  1526. /**
  1527. * Gets the directionality for the character cp.
  1528. */
  1529. static byte getDirectionality(int cp)
  1530. {
  1531. if ((cp & 0xffff) >= 0xfffe)
  1532. return UnassignedCharacters.getDirectionality(cp);
  1533. return DIRECTIONALITY_LEFT_TO_RIGHT;
  1534. }
  1535. }
  1536. /**
  1537. * A class to encompass all the properties of code points that are
  1538. * currently undefined in the Unicode standard.
  1539. * @author Anthony Balkissoon abalkiss at redhat dot com
  1540. *
  1541. */
  1542. private static class UnassignedCharacters
  1543. {
  1544. /**
  1545. * Returns the numeric value for the unassigned characters.
  1546. * @param cp the character
  1547. * @param radix the radix (not used)
  1548. * @return the numeric value of this character in this radix
  1549. */
  1550. static int digit(int cp, int radix)
  1551. {
  1552. return -1;
  1553. }
  1554. /**
  1555. * Returns the Unicode directionality property for unassigned
  1556. * characters.
  1557. * @param cp the character
  1558. * @return DIRECTIONALITY_UNDEFINED
  1559. */
  1560. static byte getDirectionality(int cp)
  1561. {
  1562. return DIRECTIONALITY_UNDEFINED;
  1563. }
  1564. /**
  1565. * Returns -1, the numeric value for unassigned Unicode characters.
  1566. * @param cp the character
  1567. * @return -1
  1568. */
  1569. static int getNumericValue(int cp)
  1570. {
  1571. return -1;
  1572. }
  1573. /**
  1574. * Returns UNASSIGNED, the type of unassigned Unicode characters.
  1575. * @param cp the character
  1576. * @return UNASSIGNED
  1577. */
  1578. static int getType(int cp)
  1579. {
  1580. return UNASSIGNED;
  1581. }
  1582. /**
  1583. * Returns false to indiciate that the character is not defined in the
  1584. * Unicode standard.
  1585. * @param cp the character
  1586. * @return false
  1587. */
  1588. static boolean isDefined(int cp)
  1589. {
  1590. return false;
  1591. }
  1592. /**
  1593. * Returns false to indicate that the character is not a digit.
  1594. * @param cp the character
  1595. * @return false
  1596. */
  1597. static boolean isDigit(int cp)
  1598. {
  1599. return false;
  1600. }
  1601. /**
  1602. * Returns false to indicate that the character cannot be ignored
  1603. * within an identifier
  1604. * @param cp the character
  1605. * @return false
  1606. */
  1607. static boolean isIdentifierIgnorable(int cp)
  1608. {
  1609. return false;
  1610. }
  1611. /**
  1612. * Returns false to indicate that the character cannot be part of a
  1613. * Java identifier.
  1614. * @param cp the character
  1615. * @return false
  1616. */
  1617. static boolean isJavaIdentifierPart(int cp)
  1618. {
  1619. return false;
  1620. }
  1621. /**
  1622. * Returns false to indicate that the character cannot be start a
  1623. * Java identifier.
  1624. * @param cp the character
  1625. * @return false
  1626. */
  1627. static boolean isJavaIdentiferStart(int cp)
  1628. {
  1629. return false;
  1630. }
  1631. /**
  1632. * Returns false to indicate that the character is not a letter.
  1633. * @param cp the character
  1634. * @return false
  1635. */
  1636. static boolean isLetter(int cp)
  1637. {
  1638. return false;
  1639. }
  1640. /**
  1641. * Returns false to indicate that the character cannot is neither a letter
  1642. * nor a digit.
  1643. * @param cp the character
  1644. * @return false
  1645. */
  1646. static boolean isLetterOrDigit(int cp)
  1647. {
  1648. return false;
  1649. }
  1650. /**
  1651. * Returns false to indicate that the character is not a lowercase letter.
  1652. * @param cp the character
  1653. * @return false
  1654. */
  1655. static boolean isLowerCase(int cp)
  1656. {
  1657. return false;
  1658. }
  1659. /**
  1660. * Returns false to indicate that the character cannot is not mirrored.
  1661. * @param cp the character
  1662. * @return false
  1663. */
  1664. static boolean isMirrored(int cp)
  1665. {
  1666. return false;
  1667. }
  1668. /**
  1669. * Returns false to indicate that the character is not a space character.
  1670. * @param cp the character
  1671. * @return false
  1672. */
  1673. static boolean isSpaceChar(int cp)
  1674. {
  1675. return false;
  1676. }
  1677. /**
  1678. * Returns false to indicate that the character it not a titlecase letter.
  1679. * @param cp the character
  1680. * @return false
  1681. */
  1682. static boolean isTitleCase(int cp)
  1683. {
  1684. return false;
  1685. }
  1686. /**
  1687. * Returns false to indicate that the character cannot be part of a
  1688. * Unicode identifier.
  1689. * @param cp the character
  1690. * @return false
  1691. */
  1692. static boolean isUnicodeIdentifierPart(int cp)
  1693. {
  1694. return false;
  1695. }
  1696. /**
  1697. * Returns false to indicate that the character cannot start a
  1698. * Unicode identifier.
  1699. * @param cp the character
  1700. * @return false
  1701. */
  1702. static boolean isUnicodeIdentifierStart(int cp)
  1703. {
  1704. return false;
  1705. }
  1706. /**
  1707. * Returns false to indicate that the character is not an uppercase letter.
  1708. * @param cp the character
  1709. * @return false
  1710. */
  1711. static boolean isUpperCase(int cp)
  1712. {
  1713. return false;
  1714. }
  1715. /**
  1716. * Returns false to indicate that the character is not a whitespace
  1717. * character.
  1718. * @param cp the character
  1719. * @return false
  1720. */
  1721. static boolean isWhiteSpace(int cp)
  1722. {
  1723. return false;
  1724. }
  1725. /**
  1726. * Returns cp to indicate this character has no lowercase conversion.
  1727. * @param cp the character
  1728. * @return cp
  1729. */
  1730. static int toLowerCase(int cp)
  1731. {
  1732. return cp;
  1733. }
  1734. /**
  1735. * Returns cp to indicate this character has no titlecase conversion.
  1736. * @param cp the character
  1737. * @return cp
  1738. */
  1739. static int toTitleCase(int cp)
  1740. {
  1741. return cp;
  1742. }
  1743. /**
  1744. * Returns cp to indicate this character has no uppercase conversion.
  1745. * @param cp the character
  1746. * @return cp
  1747. */
  1748. static int toUpperCase(int cp)
  1749. {
  1750. return cp;
  1751. }
  1752. }
  1753. /**
  1754. * The immutable value of this Character.
  1755. *
  1756. * @serial the value of this Character
  1757. */
  1758. private final char value;
  1759. /**
  1760. * Compatible with JDK 1.0+.
  1761. */
  1762. private static final long serialVersionUID = 3786198910865385080L;
  1763. /**
  1764. * Smallest value allowed for radix arguments in Java. This value is 2.
  1765. *
  1766. * @see #digit(char, int)
  1767. * @see #forDigit(int, int)
  1768. * @see Integer#toString(int, int)
  1769. * @see Integer#valueOf(String)
  1770. */
  1771. public static final int MIN_RADIX = 2;
  1772. /**
  1773. * Largest value allowed for radix arguments in Java. This value is 36.
  1774. *
  1775. * @see #digit(char, int)
  1776. * @see #forDigit(int, int)
  1777. * @see Integer#toString(int, int)
  1778. * @see Integer#valueOf(String)
  1779. */
  1780. public static final int MAX_RADIX = 36;
  1781. /**
  1782. * The minimum value the char data type can hold.
  1783. * This value is <code>'\\u0000'</code>.
  1784. */
  1785. public static final char MIN_VALUE = '\u0000';
  1786. /**
  1787. * The maximum value the char data type can hold.
  1788. * This value is <code>'\\uFFFF'</code>.
  1789. */
  1790. public static final char MAX_VALUE = '\uFFFF';
  1791. /**
  1792. * The minimum Unicode 4.0 code point. This value is <code>0</code>.
  1793. * @since 1.5
  1794. */
  1795. public static final int MIN_CODE_POINT = 0;
  1796. /**
  1797. * The maximum Unicode 4.0 code point, which is greater than the range
  1798. * of the char data type.
  1799. * This value is <code>0x10FFFF</code>.
  1800. * @since 1.5
  1801. */
  1802. public static final int MAX_CODE_POINT = 0x10FFFF;
  1803. /**
  1804. * The minimum Unicode high surrogate code unit, or
  1805. * <em>leading-surrogate</em>, in the UTF-16 character encoding.
  1806. * This value is <code>'\\uD800'</code>.
  1807. * @since 1.5
  1808. */
  1809. public static final char MIN_HIGH_SURROGATE = '\uD800';
  1810. /**
  1811. * The maximum Unicode high surrogate code unit, or
  1812. * <em>leading-surrogate</em>, in the UTF-16 character encoding.
  1813. * This value is <code>'\\uDBFF'</code>.
  1814. * @since 1.5
  1815. */
  1816. public static final char MAX_HIGH_SURROGATE = '\uDBFF';
  1817. /**
  1818. * The minimum Unicode low surrogate code unit, or
  1819. * <em>trailing-surrogate</em>, in the UTF-16 character encoding.
  1820. * This value is <code>'\\uDC00'</code>.
  1821. * @since 1.5
  1822. */
  1823. public static final char MIN_LOW_SURROGATE = '\uDC00';
  1824. /**
  1825. * The maximum Unicode low surrogate code unit, or
  1826. * <em>trailing-surrogate</em>, in the UTF-16 character encoding.
  1827. * This value is <code>'\\uDFFF'</code>.
  1828. * @since 1.5
  1829. */
  1830. public static final char MAX_LOW_SURROGATE = '\uDFFF';
  1831. /**
  1832. * The minimum Unicode surrogate code unit in the UTF-16 character encoding.
  1833. * This value is <code>'\\uD800'</code>.
  1834. * @since 1.5
  1835. */
  1836. public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE;
  1837. /**
  1838. * The maximum Unicode surrogate code unit in the UTF-16 character encoding.
  1839. * This value is <code>'\\uDFFF'</code>.
  1840. * @since 1.5
  1841. */
  1842. public static final char MAX_SURROGATE = MAX_LOW_SURROGATE;
  1843. /**
  1844. * The lowest possible supplementary Unicode code point (the first code
  1845. * point outside the basic multilingual plane (BMP)).
  1846. * This value is <code>0x10000</code>.
  1847. */
  1848. public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
  1849. /**
  1850. * Class object representing the primitive char data type.
  1851. *
  1852. * @since 1.1
  1853. */
  1854. public static final Class<Character> TYPE = (Class<Character>) VMClassLoader.getPrimitiveClass('C');
  1855. /**
  1856. * The number of bits needed to represent a <code>char</code>.
  1857. * @since 1.5
  1858. */
  1859. public static final int SIZE = 16;
  // Cache of boxed Character objects, used by boxing conversions via
  // valueOf(). We must cache at least 0..127; MAX_CACHE controls how
  // much we actually cache.
  private static final int MAX_CACHE = 127;

  // The array reference is only assigned here, so it is declared final;
  // the individual slots are filled once by the static initializer below.
  // NOTE(review): assumes no code outside this section reassigns
  // charCache -- confirm against the rest of the file.
  private static final Character[] charCache = new Character[MAX_CACHE + 1];

  static
  {
    // Pre-populate one Character per code unit in 0..MAX_CACHE.
    for (char c = 0; c <= MAX_CACHE; c++)
      charCache[c] = new Character(c);
  }
  1870. /**
  1871. * Lu = Letter, Uppercase (Informative).
  1872. *
  1873. * @since 1.1
  1874. */
  1875. public static final byte UPPERCASE_LETTER = 1;
  1876. /**
  1877. * Ll = Letter, Lowercase (Informative).
  1878. *
  1879. * @since 1.1
  1880. */
  1881. public static final byte LOWERCASE_LETTER = 2;
  1882. /**
  1883. * Lt = Letter, Titlecase (Informative).
  1884. *
  1885. * @since 1.1
  1886. */
  1887. public static final byte TITLECASE_LETTER = 3;
  1888. /**
  1889. * Mn = Mark, Non-Spacing (Normative).
  1890. *
  1891. * @since 1.1
  1892. */
  1893. public static final byte NON_SPACING_MARK = 6;
  1894. /**
  1895. * Mc = Mark, Spacing Combining (Normative).
  1896. *
  1897. * @since 1.1
  1898. */
  1899. public static final byte COMBINING_SPACING_MARK = 8;
  1900. /**
  1901. * Me = Mark, Enclosing (Normative).
  1902. *
  1903. * @since 1.1
  1904. */
  1905. public static final byte ENCLOSING_MARK = 7;
  1906. /**
  1907. * Nd = Number, Decimal Digit (Normative).
  1908. *
  1909. * @since 1.1
  1910. */
  1911. public static final byte DECIMAL_DIGIT_NUMBER = 9;
  1912. /**
  1913. * Nl = Number, Letter (Normative).
  1914. *
  1915. * @since 1.1
  1916. */
  1917. public static final byte LETTER_NUMBER = 10;
  1918. /**
  1919. * No = Number, Other (Normative).
  1920. *
  1921. * @since 1.1
  1922. */
  1923. public static final byte OTHER_NUMBER = 11;
  1924. /**
  1925. * Zs = Separator, Space (Normative).
  1926. *
  1927. * @since 1.1
  1928. */
  1929. public static final byte SPACE_SEPARATOR = 12;
  1930. /**
  1931. * Zl = Separator, Line (Normative).
  1932. *
  1933. * @since 1.1
  1934. */
  1935. public static final byte LINE_SEPARATOR = 13;
  1936. /**
  1937. * Zp = Separator, Paragraph (Normative).
  1938. *
  1939. * @since 1.1
  1940. */
  1941. public static final byte PARAGRAPH_SEPARATOR = 14;
  1942. /**
  1943. * Cc = Other, Control (Normative).
  1944. *
  1945. * @since 1.1
  1946. */
  1947. public static final byte CONTROL = 15;
  1948. /**
  1949. * Cf = Other, Format (Normative).
  1950. *
  1951. * @since 1.1
  1952. */
  1953. public static final byte FORMAT = 16;
  1954. /**
  1955. * Cs = Other, Surrogate (Normative).
  1956. *
  1957. * @since 1.1
  1958. */
  1959. public static final byte SURROGATE = 19;
  1960. /**
  1961. * Co = Other, Private Use (Normative).
  1962. *
  1963. * @since 1.1
  1964. */
  1965. public static final byte PRIVATE_USE = 18;
  1966. /**
  1967. * Cn = Other, Not Assigned (Normative).
  1968. *
  1969. * @since 1.1
  1970. */
  1971. public static final byte UNASSIGNED = 0;
  1972. /**
  1973. * Lm = Letter, Modifier (Informative).
  1974. *
  1975. * @since 1.1
  1976. */
  1977. public static final byte MODIFIER_LETTER = 4;
  1978. /**
  1979. * Lo = Letter, Other (Informative).
  1980. *
  1981. * @since 1.1
  1982. */
  1983. public static final byte OTHER_LETTER = 5;
  1984. /**
  1985. * Pc = Punctuation, Connector (Informative).
  1986. *
  1987. * @since 1.1
  1988. */
  1989. public static final byte CONNECTOR_PUNCTUATION = 23;
  1990. /**
  1991. * Pd = Punctuation, Dash (Informative).
  1992. *
  1993. * @since 1.1
  1994. */
  1995. public static final byte DASH_PUNCTUATION = 20;
  1996. /**
  1997. * Ps = Punctuation, Open (Informative).
  1998. *
  1999. * @since 1.1
  2000. */
  2001. public static final byte START_PUNCTUATION = 21;
  2002. /**
  2003. * Pe = Punctuation, Close (Informative).
  2004. *
  2005. * @since 1.1
  2006. */
  2007. public static final byte END_PUNCTUATION = 22;
  2008. /**
  2009. * Pi = Punctuation, Initial Quote (Informative).
  2010. *
  2011. * @since 1.4
  2012. */
  2013. public static final byte INITIAL_QUOTE_PUNCTUATION = 29;
  2014. /**
  2015. * Pf = Punctuation, Final Quote (Informative).
  2016. *
  2017. * @since 1.4
  2018. */
  2019. public static final byte FINAL_QUOTE_PUNCTUATION = 30;
  2020. /**
  2021. * Po = Punctuation, Other (Informative).
  2022. *
  2023. * @since 1.1
  2024. */
  2025. public static final byte OTHER_PUNCTUATION = 24;
  2026. /**
  2027. * Sm = Symbol, Math (Informative).
  2028. *
  2029. * @since 1.1
  2030. */
  2031. public static final byte MATH_SYMBOL = 25;
  2032. /**
  2033. * Sc = Symbol, Currency (Informative).
  2034. *
  2035. * @since 1.1
  2036. */
  2037. public static final byte CURRENCY_SYMBOL = 26;
  2038. /**
  2039. * Sk = Symbol, Modifier (Informative).
  2040. *
  2041. * @since 1.1
  2042. */
  2043. public static final byte MODIFIER_SYMBOL = 27;
  2044. /**
  2045. * So = Symbol, Other (Informative).
  2046. *
  2047. * @since 1.1
  2048. */
  2049. public static final byte OTHER_SYMBOL = 28;
  // Bidirectional character type constants. UNDEFINED is -1; the defined
  // types are numbered consecutively from 0 to 18.
  /**
   * Undefined bidirectional character type. Undefined char values have
   * undefined directionality in the Unicode specification.
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_UNDEFINED = -1;
  /**
   * Strong bidirectional character type "L" (left-to-right).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0;
  /**
   * Strong bidirectional character type "R" (right-to-left).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1;
  /**
   * Strong bidirectional character type "AL" (right-to-left Arabic).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2;
  /**
   * Weak bidirectional character type "EN" (European number).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3;
  /**
   * Weak bidirectional character type "ES" (European number separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4;
  /**
   * Weak bidirectional character type "ET" (European number terminator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5;
  /**
   * Weak bidirectional character type "AN" (Arabic number).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6;
  /**
   * Weak bidirectional character type "CS" (common number separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7;
  /**
   * Weak bidirectional character type "NSM" (nonspacing mark).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_NONSPACING_MARK = 8;
  /**
   * Weak bidirectional character type "BN" (boundary neutral).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9;
  /**
   * Neutral bidirectional character type "B" (paragraph separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10;
  /**
   * Neutral bidirectional character type "S" (segment separator).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11;
  /**
   * Neutral bidirectional character type "WS" (whitespace).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_WHITESPACE = 12;
  /**
   * Neutral bidirectional character type "ON" (other neutrals).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13;
  /**
   * Strong bidirectional character type "LRE" (left-to-right embedding).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;
  /**
   * Strong bidirectional character type "LRO" (left-to-right override).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15;
  /**
   * Strong bidirectional character type "RLE" (right-to-left embedding).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16;
  /**
   * Strong bidirectional character type "RLO" (right-to-left override).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17;
  /**
   * Weak bidirectional character type "PDF" (pop directional format).
   *
   * @since 1.4
   */
  public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
  /**
   * Stores unicode block offset lookup table. Exploit package visibility of
   * String.value to avoid copying the array.
   * There is one row per entry of CharData.BLOCKS, indices 0 through 16;
   * readCodePoint selects the row with <code>codePoint &gt;&gt;&gt; 16</code>.
   * @see #readCodePoint(int)
   * @see CharData#BLOCKS
   */
  private static final char[][] blocks =
    new char[][]{
      String.zeroBasedStringValue(CharData.BLOCKS[0]),
      String.zeroBasedStringValue(CharData.BLOCKS[1]),
      String.zeroBasedStringValue(CharData.BLOCKS[2]),
      String.zeroBasedStringValue(CharData.BLOCKS[3]),
      String.zeroBasedStringValue(CharData.BLOCKS[4]),
      String.zeroBasedStringValue(CharData.BLOCKS[5]),
      String.zeroBasedStringValue(CharData.BLOCKS[6]),
      String.zeroBasedStringValue(CharData.BLOCKS[7]),
      String.zeroBasedStringValue(CharData.BLOCKS[8]),
      String.zeroBasedStringValue(CharData.BLOCKS[9]),
      String.zeroBasedStringValue(CharData.BLOCKS[10]),
      String.zeroBasedStringValue(CharData.BLOCKS[11]),
      String.zeroBasedStringValue(CharData.BLOCKS[12]),
      String.zeroBasedStringValue(CharData.BLOCKS[13]),
      String.zeroBasedStringValue(CharData.BLOCKS[14]),
      String.zeroBasedStringValue(CharData.BLOCKS[15]),
      String.zeroBasedStringValue(CharData.BLOCKS[16])};
  /**
   * Stores unicode attribute offset lookup table. Exploit package visibility
   * of String.value to avoid copying the array.
   * There is one row per entry of CharData.DATA, indices 0 through 16;
   * readCodePoint selects the row with <code>codePoint &gt;&gt;&gt; 16</code>
   * and returns one element of it.
   * @see CharData#DATA
   */
  private static final char[][] data =
    new char[][]{
      String.zeroBasedStringValue(CharData.DATA[0]),
      String.zeroBasedStringValue(CharData.DATA[1]),
      String.zeroBasedStringValue(CharData.DATA[2]),
      String.zeroBasedStringValue(CharData.DATA[3]),
      String.zeroBasedStringValue(CharData.DATA[4]),
      String.zeroBasedStringValue(CharData.DATA[5]),
      String.zeroBasedStringValue(CharData.DATA[6]),
      String.zeroBasedStringValue(CharData.DATA[7]),
      String.zeroBasedStringValue(CharData.DATA[8]),
      String.zeroBasedStringValue(CharData.DATA[9]),
      String.zeroBasedStringValue(CharData.DATA[10]),
      String.zeroBasedStringValue(CharData.DATA[11]),
      String.zeroBasedStringValue(CharData.DATA[12]),
      String.zeroBasedStringValue(CharData.DATA[13]),
      String.zeroBasedStringValue(CharData.DATA[14]),
      String.zeroBasedStringValue(CharData.DATA[15]),
      String.zeroBasedStringValue(CharData.DATA[16])};
  /**
   * Stores unicode numeric value attribute table. Exploit package visibility
   * of String.value to avoid copying the array.
   * There is one row per entry of CharData.NUM_VALUE, indices 0 through 16.
   * NOTE(review): presumably the rows are selected the same way as in
   * <code>data</code>; the lookups are not part of this excerpt -- confirm.
   * @see CharData#NUM_VALUE
   */
  private static final char[][] numValue =
    new char[][]{
      String.zeroBasedStringValue(CharData.NUM_VALUE[0]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[1]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[2]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[3]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[4]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[5]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[6]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[7]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[8]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[9]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[10]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[11]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[12]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[13]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[14]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[15]),
      String.zeroBasedStringValue(CharData.NUM_VALUE[16])};
  /**
   * Stores unicode uppercase attribute table. Exploit package visibility
   * of String.value to avoid copying the array.
   * There is one row per entry of CharData.UPPER, indices 0 through 16.
   * NOTE(review): presumably the rows are selected the same way as in
   * <code>data</code>; the lookups are not part of this excerpt -- confirm.
   * @see CharData#UPPER
   */
  private static final char[][] upper =
    new char[][]{
      String.zeroBasedStringValue(CharData.UPPER[0]),
      String.zeroBasedStringValue(CharData.UPPER[1]),
      String.zeroBasedStringValue(CharData.UPPER[2]),
      String.zeroBasedStringValue(CharData.UPPER[3]),
      String.zeroBasedStringValue(CharData.UPPER[4]),
      String.zeroBasedStringValue(CharData.UPPER[5]),
      String.zeroBasedStringValue(CharData.UPPER[6]),
      String.zeroBasedStringValue(CharData.UPPER[7]),
      String.zeroBasedStringValue(CharData.UPPER[8]),
      String.zeroBasedStringValue(CharData.UPPER[9]),
      String.zeroBasedStringValue(CharData.UPPER[10]),
      String.zeroBasedStringValue(CharData.UPPER[11]),
      String.zeroBasedStringValue(CharData.UPPER[12]),
      String.zeroBasedStringValue(CharData.UPPER[13]),
      String.zeroBasedStringValue(CharData.UPPER[14]),
      String.zeroBasedStringValue(CharData.UPPER[15]),
      String.zeroBasedStringValue(CharData.UPPER[16])};
  /**
   * Stores unicode lowercase attribute table. Exploit package visibility
   * of String.value to avoid copying the array.
   * There is one row per entry of CharData.LOWER, indices 0 through 16.
   * NOTE(review): presumably the rows are selected the same way as in
   * <code>data</code>; the lookups are not part of this excerpt -- confirm.
   * @see CharData#LOWER
   */
  private static final char[][] lower =
    new char[][]{
      String.zeroBasedStringValue(CharData.LOWER[0]),
      String.zeroBasedStringValue(CharData.LOWER[1]),
      String.zeroBasedStringValue(CharData.LOWER[2]),
      String.zeroBasedStringValue(CharData.LOWER[3]),
      String.zeroBasedStringValue(CharData.LOWER[4]),
      String.zeroBasedStringValue(CharData.LOWER[5]),
      String.zeroBasedStringValue(CharData.LOWER[6]),
      String.zeroBasedStringValue(CharData.LOWER[7]),
      String.zeroBasedStringValue(CharData.LOWER[8]),
      String.zeroBasedStringValue(CharData.LOWER[9]),
      String.zeroBasedStringValue(CharData.LOWER[10]),
      String.zeroBasedStringValue(CharData.LOWER[11]),
      String.zeroBasedStringValue(CharData.LOWER[12]),
      String.zeroBasedStringValue(CharData.LOWER[13]),
      String.zeroBasedStringValue(CharData.LOWER[14]),
      String.zeroBasedStringValue(CharData.LOWER[15]),
      String.zeroBasedStringValue(CharData.LOWER[16])};
  /**
   * Stores unicode direction attribute table. Exploit package visibility
   * of String.value to avoid copying the array.
   * There is one row per entry of CharData.DIRECTION, indices 0 through 16.
   * NOTE(review): presumably the rows are selected the same way as in
   * <code>data</code>; the lookups are not part of this excerpt -- confirm.
   * @see CharData#DIRECTION
   */
  // Package visible for use by String.
  static final char[][] direction =
    new char[][]{
      String.zeroBasedStringValue(CharData.DIRECTION[0]),
      String.zeroBasedStringValue(CharData.DIRECTION[1]),
      String.zeroBasedStringValue(CharData.DIRECTION[2]),
      String.zeroBasedStringValue(CharData.DIRECTION[3]),
      String.zeroBasedStringValue(CharData.DIRECTION[4]),
      String.zeroBasedStringValue(CharData.DIRECTION[5]),
      String.zeroBasedStringValue(CharData.DIRECTION[6]),
      String.zeroBasedStringValue(CharData.DIRECTION[7]),
      String.zeroBasedStringValue(CharData.DIRECTION[8]),
      String.zeroBasedStringValue(CharData.DIRECTION[9]),
      String.zeroBasedStringValue(CharData.DIRECTION[10]),
      String.zeroBasedStringValue(CharData.DIRECTION[11]),
      String.zeroBasedStringValue(CharData.DIRECTION[12]),
      String.zeroBasedStringValue(CharData.DIRECTION[13]),
      String.zeroBasedStringValue(CharData.DIRECTION[14]),
      String.zeroBasedStringValue(CharData.DIRECTION[15]),
      String.zeroBasedStringValue(CharData.DIRECTION[16])};
  /**
   * Stores unicode titlecase table. Exploit package visibility of
   * String.value to avoid copying the array.
   * Unlike the other attribute tables of this class this is a single flat
   * array, built from the one CharData.TITLE string.
   * @see CharData#TITLE
   */
  private static final char[] title = String.zeroBasedStringValue(CharData.TITLE);
  // Bit masks applied to the char returned by readCodePoint. Together they
  // cover the low seven bits: five type bits followed by two flag bits.
  /**
   * Mask for grabbing the type out of the contents of data.
   * Selects the lowest five bits (0x1F).
   * @see CharData#DATA
   */
  private static final int TYPE_MASK = 0x1F;
  /**
   * Mask for grabbing the non-breaking space flag out of the contents of
   * data. Selects the single bit 0x20, directly above the type bits.
   * @see CharData#DATA
   */
  private static final int NO_BREAK_MASK = 0x20;
  /**
   * Mask for grabbing the mirrored directionality flag out of the contents
   * of data. Selects the single bit 0x40, directly above the no-break flag.
   * @see CharData#DATA
   */
  private static final int MIRROR_MASK = 0x40;
  2340. /**
  2341. * Grabs an attribute offset from the Unicode attribute database. The lower
  2342. * 5 bits are the character type, the next 2 bits are flags, and the top
  2343. * 9 bits are the offset into the attribute tables.
  2344. *
  2345. * @param codePoint the character to look up
  2346. * @return the character's attribute offset and type
  2347. * @see #TYPE_MASK
  2348. * @see #NO_BREAK_MASK
  2349. * @see #MIRROR_MASK
  2350. * @see CharData#DATA
  2351. * @see CharData#SHIFT
  2352. */
  2353. // Package visible for use in String.
  2354. static char readCodePoint(int codePoint)
  2355. {
  2356. int plane = codePoint >>> 16;
  2357. char offset = (char) (codePoint & 0xffff);
  2358. return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)];
  2359. }
  /**
   * Wraps up a character.
   *
   * @param value the character to wrap
   */
  public Character(char value)
  {
    // Keep the primitive in the instance field that charValue(), hashCode(),
    // equals() and toString() read back.
    this.value = value;
  }
  2369. /**
  2370. * Returns the character which has been wrapped by this class.
  2371. *
  2372. * @return the character wrapped
  2373. */
  2374. public char charValue()
  2375. {
  2376. return value;
  2377. }
  2378. /**
  2379. * Returns the numerical value (unsigned) of the wrapped character.
  2380. * Range of returned values: 0x0000-0xFFFF.
  2381. *
  2382. * @return the value of the wrapped character
  2383. */
  2384. public int hashCode()
  2385. {
  2386. return value;
  2387. }
  2388. /**
  2389. * Determines if an object is equal to this object. This is only true for
  2390. * another Character object wrapping the same value.
  2391. *
  2392. * @param o object to compare
  2393. * @return true if o is a Character with the same value
  2394. */
  2395. public boolean equals(Object o)
  2396. {
  2397. return o instanceof Character && value == ((Character) o).value;
  2398. }
  2399. /**
  2400. * Converts the wrapped character into a String.
  2401. *
  2402. * @return a String containing one character -- the wrapped character
  2403. * of this instance
  2404. */
  2405. public String toString()
  2406. {
  2407. // Package constructor avoids an array copy.
  2408. return new String(new char[] { value }, 0, 1, true);
  2409. }
  2410. /**
  2411. * Returns a String of length 1 representing the specified character.
  2412. *
  2413. * @param ch the character to convert
  2414. * @return a String containing the character
  2415. * @since 1.4
  2416. */
  2417. public static String toString(char ch)
  2418. {
  2419. // Package constructor avoids an array copy.
  2420. return new String(new char[] { ch }, 0, 1, true);
  2421. }
  2422. /**
  2423. * Determines if a character is a Unicode lowercase letter. For example,
  2424. * <code>'a'</code> is lowercase. Returns true if getType() returns
  2425. * LOWERCASE_LETTER.
  2426. * <br>
  2427. * lowercase = [Ll]
  2428. *
  2429. * @param ch character to test
  2430. * @return true if ch is a Unicode lowercase letter, else false
  2431. * @see #isUpperCase(char)
  2432. * @see #isTitleCase(char)
  2433. * @see #toLowerCase(char)
  2434. * @see #getType(char)
  2435. */
  2436. public static boolean isLowerCase(char ch)
  2437. {
  2438. return isLowerCase((int)ch);
  2439. }
  2440. /**
  2441. * Determines if a character is a Unicode lowercase letter. For example,
  2442. * <code>'a'</code> is lowercase. Returns true if getType() returns
  2443. * LOWERCASE_LETTER.
  2444. * <br>
  2445. * lowercase = [Ll]
  2446. *
  2447. * @param codePoint character to test
   * @return true if codePoint is a Unicode lowercase letter, else false
  2449. * @see #isUpperCase(char)
  2450. * @see #isTitleCase(char)
  2451. * @see #toLowerCase(char)
  2452. * @see #getType(char)
  2453. *
  2454. * @since 1.5
  2455. */
  2456. public static boolean isLowerCase(int codePoint)
  2457. {
  2458. return getType(codePoint) == LOWERCASE_LETTER;
  2459. }
  2460. /**
  2461. * Determines if a character is a Unicode uppercase letter. For example,
  2462. * <code>'A'</code> is uppercase. Returns true if getType() returns
  2463. * UPPERCASE_LETTER.
  2464. * <br>
  2465. * uppercase = [Lu]
  2466. *
  2467. * @param ch character to test
  2468. * @return true if ch is a Unicode uppercase letter, else false
  2469. * @see #isLowerCase(char)
  2470. * @see #isTitleCase(char)
  2471. * @see #toUpperCase(char)
  2472. * @see #getType(char)
  2473. */
  2474. public static boolean isUpperCase(char ch)
  2475. {
  2476. return isUpperCase((int)ch);
  2477. }
  2478. /**
  2479. * Determines if a character is a Unicode uppercase letter. For example,
  2480. * <code>'A'</code> is uppercase. Returns true if getType() returns
  2481. * UPPERCASE_LETTER.
  2482. * <br>
  2483. * uppercase = [Lu]
  2484. *
  2485. * @param codePoint character to test
   * @return true if codePoint is a Unicode uppercase letter, else false
  2487. * @see #isLowerCase(char)
  2488. * @see #isTitleCase(char)
  2489. * @see #toUpperCase(char)
  2490. * @see #getType(char)
  2491. *
  2492. * @since 1.5
  2493. */
  2494. public static boolean isUpperCase(int codePoint)
  2495. {
  2496. return getType(codePoint) == UPPERCASE_LETTER;
  2497. }
  2498. /**
  2499. * Determines if a character is a Unicode titlecase letter. For example,
  2500. * the character "Lj" (Latin capital L with small letter j) is titlecase.
  2501. * True if getType() returns TITLECASE_LETTER.
  2502. * <br>
  2503. * titlecase = [Lt]
  2504. *
  2505. * @param ch character to test
  2506. * @return true if ch is a Unicode titlecase letter, else false
  2507. * @see #isLowerCase(char)
  2508. * @see #isUpperCase(char)
  2509. * @see #toTitleCase(char)
  2510. * @see #getType(char)
  2511. */
  2512. public static boolean isTitleCase(char ch)
  2513. {
  2514. return isTitleCase((int)ch);
  2515. }
  2516. /**
  2517. * Determines if a character is a Unicode titlecase letter. For example,
  2518. * the character "Lj" (Latin capital L with small letter j) is titlecase.
  2519. * True if getType() returns TITLECASE_LETTER.
  2520. * <br>
  2521. * titlecase = [Lt]
  2522. *
  2523. * @param codePoint character to test
   * @return true if codePoint is a Unicode titlecase letter, else false
  2525. * @see #isLowerCase(char)
  2526. * @see #isUpperCase(char)
  2527. * @see #toTitleCase(char)
  2528. * @see #getType(char)
  2529. *
  2530. * @since 1.5
  2531. */
  2532. public static boolean isTitleCase(int codePoint)
  2533. {
  2534. return getType(codePoint) == TITLECASE_LETTER;
  2535. }
  2536. /**
  2537. * Determines if a character is a Unicode decimal digit. For example,
  2538. * <code>'0'</code> is a digit. A character is a Unicode digit if
  2539. * getType() returns DECIMAL_DIGIT_NUMBER.
  2540. * <br>
  2541. * Unicode decimal digit = [Nd]
  2542. *
  2543. * @param ch character to test
  2544. * @return true if ch is a Unicode decimal digit, else false
  2545. * @see #digit(char, int)
  2546. * @see #forDigit(int, int)
  2547. * @see #getType(char)
  2548. */
  2549. public static boolean isDigit(char ch)
  2550. {
  2551. return isDigit((int)ch);
  2552. }
  2553. /**
  2554. * Determines if a character is a Unicode decimal digit. For example,
  2555. * <code>'0'</code> is a digit. A character is a Unicode digit if
  2556. * getType() returns DECIMAL_DIGIT_NUMBER.
  2557. * <br>
  2558. * Unicode decimal digit = [Nd]
  2559. *
  2560. * @param codePoint character to test
   * @return true if codePoint is a Unicode decimal digit, else false
  2562. * @see #digit(char, int)
  2563. * @see #forDigit(int, int)
  2564. * @see #getType(char)
  2565. *
  2566. * @since 1.5
  2567. */
  2568. public static boolean isDigit(int codePoint)
  2569. {
  2570. return getType(codePoint) == DECIMAL_DIGIT_NUMBER;
  2571. }
  2572. /**
  2573. * Determines if a character is part of the Unicode Standard. This is an
  2574. * evolving standard, but covers every character in the data file.
  2575. * <br>
  2576. * defined = not [Cn]
  2577. *
  2578. * @param ch character to test
  2579. * @return true if ch is a Unicode character, else false
  2580. * @see #isDigit(char)
  2581. * @see #isLetter(char)
  2582. * @see #isLetterOrDigit(char)
  2583. * @see #isLowerCase(char)
  2584. * @see #isTitleCase(char)
  2585. * @see #isUpperCase(char)
  2586. */
  2587. public static boolean isDefined(char ch)
  2588. {
  2589. return isDefined((int)ch);
  2590. }
  2591. /**
  2592. * Determines if a character is part of the Unicode Standard. This is an
  2593. * evolving standard, but covers every character in the data file.
  2594. * <br>
  2595. * defined = not [Cn]
  2596. *
  2597. * @param codePoint character to test
   * @return true if codePoint is a Unicode character, else false
  2599. * @see #isDigit(char)
  2600. * @see #isLetter(char)
  2601. * @see #isLetterOrDigit(char)
  2602. * @see #isLowerCase(char)
  2603. * @see #isTitleCase(char)
  2604. * @see #isUpperCase(char)
  2605. *
  2606. * @since 1.5
  2607. */
  2608. public static boolean isDefined(int codePoint)
  2609. {
  2610. return getType(codePoint) != UNASSIGNED;
  2611. }
  2612. /**
  2613. * Determines if a character is a Unicode letter. Not all letters have case,
  2614. * so this may return true when isLowerCase and isUpperCase return false.
  2615. * A character is a Unicode letter if getType() returns one of
  2616. * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
  2617. * or OTHER_LETTER.
  2618. * <br>
  2619. * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
  2620. *
  2621. * @param ch character to test
  2622. * @return true if ch is a Unicode letter, else false
  2623. * @see #isDigit(char)
  2624. * @see #isJavaIdentifierStart(char)
  2625. * @see #isJavaLetter(char)
  2626. * @see #isJavaLetterOrDigit(char)
  2627. * @see #isLetterOrDigit(char)
  2628. * @see #isLowerCase(char)
  2629. * @see #isTitleCase(char)
  2630. * @see #isUnicodeIdentifierStart(char)
  2631. * @see #isUpperCase(char)
  2632. */
  2633. public static boolean isLetter(char ch)
  2634. {
  2635. return isLetter((int)ch);
  2636. }
  2637. /**
  2638. * Determines if a character is a Unicode letter. Not all letters have case,
  2639. * so this may return true when isLowerCase and isUpperCase return false.
  2640. * A character is a Unicode letter if getType() returns one of
  2641. * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER,
  2642. * or OTHER_LETTER.
  2643. * <br>
  2644. * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]
  2645. *
  2646. * @param codePoint character to test
   * @return true if codePoint is a Unicode letter, else false
  2648. * @see #isDigit(char)
  2649. * @see #isJavaIdentifierStart(char)
  2650. * @see #isJavaLetter(char)
  2651. * @see #isJavaLetterOrDigit(char)
  2652. * @see #isLetterOrDigit(char)
  2653. * @see #isLowerCase(char)
  2654. * @see #isTitleCase(char)
  2655. * @see #isUnicodeIdentifierStart(char)
  2656. * @see #isUpperCase(char)
  2657. *
  2658. * @since 1.5
  2659. */
  2660. public static boolean isLetter(int codePoint)
  2661. {
  2662. return ((1 << getType(codePoint))
  2663. & ((1 << UPPERCASE_LETTER)
  2664. | (1 << LOWERCASE_LETTER)
  2665. | (1 << TITLECASE_LETTER)
  2666. | (1 << MODIFIER_LETTER)
  2667. | (1 << OTHER_LETTER))) != 0;
  2668. }
  2669. /**
  2670. * Returns the index into the given CharSequence that is offset
  2671. * <code>codePointOffset</code> code points from <code>index</code>.
  2672. * @param seq the CharSequence
  2673. * @param index the start position in the CharSequence
  2674. * @param codePointOffset the number of code points offset from the start
  2675. * position
  2676. * @return the index into the CharSequence that is codePointOffset code
  2677. * points offset from index
  2678. *
  2679. * @throws NullPointerException if seq is null
  2680. * @throws IndexOutOfBoundsException if index is negative or greater than the
  2681. * length of the sequence.
  2682. * @throws IndexOutOfBoundsException if codePointOffset is positive and the
  2683. * subsequence from index to the end of seq has fewer than codePointOffset
  2684. * code points
  2685. * @throws IndexOutOfBoundsException if codePointOffset is negative and the
  2686. * subsequence from the start of seq to index has fewer than
  2687. * (-codePointOffset) code points
  2688. * @since 1.5
  2689. */
  2690. public static int offsetByCodePoints(CharSequence seq,
  2691. int index,
  2692. int codePointOffset)
  2693. {
  2694. int len = seq.length();
  2695. if (index < 0 || index > len)
  2696. throw new IndexOutOfBoundsException();
  2697. int numToGo = codePointOffset;
  2698. int offset = index;
  2699. int adjust = 1;
  2700. if (numToGo >= 0)
  2701. {
  2702. for (; numToGo > 0; offset++)
  2703. {
  2704. numToGo--;
  2705. if (Character.isHighSurrogate(seq.charAt(offset))
  2706. && (offset + 1) < len
  2707. && Character.isLowSurrogate(seq.charAt(offset + 1)))
  2708. offset++;
  2709. }
  2710. return offset;
  2711. }
  2712. else
  2713. {
  2714. numToGo *= -1;
  2715. for (; numToGo > 0;)
  2716. {
  2717. numToGo--;
  2718. offset--;
  2719. if (Character.isLowSurrogate(seq.charAt(offset))
  2720. && (offset - 1) >= 0
  2721. && Character.isHighSurrogate(seq.charAt(offset - 1)))
  2722. offset--;
  2723. }
  2724. return offset;
  2725. }
  2726. }
  2727. /**
  2728. * Returns the index into the given char subarray that is offset
  2729. * <code>codePointOffset</code> code points from <code>index</code>.
  2730. * @param a the char array
  2731. * @param start the start index of the subarray
  2732. * @param count the length of the subarray
  2733. * @param index the index to be offset
  2734. * @param codePointOffset the number of code points offset from <code>index
  2735. * </code>
  2736. * @return the index into the char array
  2737. *
  2738. * @throws NullPointerException if a is null
  2739. * @throws IndexOutOfBoundsException if start or count is negative or if
  2740. * start + count is greater than the length of the array
  2741. * @throws IndexOutOfBoundsException if index is less than start or larger
  2742. * than start + count
  2743. * @throws IndexOutOfBoundsException if codePointOffset is positive and the
  2744. * subarray from index to start + count - 1 has fewer than codePointOffset
  2745. * code points.
  2746. * @throws IndexOutOfBoundsException if codePointOffset is negative and the
  2747. * subarray from start to index - 1 has fewer than (-codePointOffset) code
  2748. * points
  2749. *
  2750. * @since 1.5
  2751. */
  2752. public static int offsetByCodePoints(char[] a,
  2753. int start,
  2754. int count,
  2755. int index,
  2756. int codePointOffset)
  2757. {
  2758. int len = a.length;
  2759. int end = start + count;
  2760. if (start < 0 || count < 0 || end > len || index < start || index > end)
  2761. throw new IndexOutOfBoundsException();
  2762. int numToGo = codePointOffset;
  2763. int offset = index;
  2764. int adjust = 1;
  2765. if (numToGo >= 0)
  2766. {
  2767. for (; numToGo > 0; offset++)
  2768. {
  2769. numToGo--;
  2770. if (Character.isHighSurrogate(a[offset])
  2771. && (offset + 1) < len
  2772. && Character.isLowSurrogate(a[offset + 1]))
  2773. offset++;
  2774. }
  2775. return offset;
  2776. }
  2777. else
  2778. {
  2779. numToGo *= -1;
  2780. for (; numToGo > 0;)
  2781. {
  2782. numToGo--;
  2783. offset--;
  2784. if (Character.isLowSurrogate(a[offset])
  2785. && (offset - 1) >= 0
  2786. && Character.isHighSurrogate(a[offset - 1]))
  2787. offset--;
  2788. if (offset < start)
  2789. throw new IndexOutOfBoundsException();
  2790. }
  2791. return offset;
  2792. }
  2793. }
  2794. /**
  2795. * Returns the number of Unicode code points in the specified range of the
  2796. * given CharSequence. The first char in the range is at position
  2797. * beginIndex and the last one is at position endIndex - 1. Paired
  2798. * surrogates (supplementary characters are represented by a pair of chars -
  2799. * one from the high surrogates and one from the low surrogates)
  2800. * count as just one code point.
  2801. * @param seq the CharSequence to inspect
  2802. * @param beginIndex the beginning of the range
  2803. * @param endIndex the end of the range
  2804. * @return the number of Unicode code points in the given range of the
  2805. * sequence
  2806. * @throws NullPointerException if seq is null
  2807. * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is
  2808. * larger than the length of seq, or if beginIndex is greater than endIndex.
  2809. * @since 1.5
  2810. */
  2811. public static int codePointCount(CharSequence seq, int beginIndex,
  2812. int endIndex)
  2813. {
  2814. int len = seq.length();
  2815. if (beginIndex < 0 || endIndex > len || beginIndex > endIndex)
  2816. throw new IndexOutOfBoundsException();
  2817. int count = 0;
  2818. for (int i = beginIndex; i < endIndex; i++)
  2819. {
  2820. count++;
  2821. // If there is a pairing, count it only once.
  2822. if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex
  2823. && isLowSurrogate(seq.charAt(i + 1)))
  2824. i ++;
  2825. }
  2826. return count;
  2827. }
  2828. /**
  2829. * Returns the number of Unicode code points in the specified range of the
  2830. * given char array. The first char in the range is at position
  2831. * offset and the length of the range is count. Paired surrogates
  2832. * (supplementary characters are represented by a pair of chars -
  2833. * one from the high surrogates and one from the low surrogates)
  2834. * count as just one code point.
  2835. * @param a the char array to inspect
  2836. * @param offset the beginning of the range
  2837. * @param count the length of the range
  2838. * @return the number of Unicode code points in the given range of the
  2839. * array
  2840. * @throws NullPointerException if a is null
   * @throws IndexOutOfBoundsException if offset or count is negative or if
   * offset + count is larger than the length of a.
  2843. * @since 1.5
  2844. */
  2845. public static int codePointCount(char[] a, int offset,
  2846. int count)
  2847. {
  2848. int len = a.length;
  2849. int end = offset + count;
  2850. if (offset < 0 || count < 0 || end > len)
  2851. throw new IndexOutOfBoundsException();
  2852. int counter = 0;
  2853. for (int i = offset; i < end; i++)
  2854. {
  2855. counter++;
  2856. // If there is a pairing, count it only once.
  2857. if (isHighSurrogate(a[i]) && (i + 1) < end
  2858. && isLowSurrogate(a[i + 1]))
  2859. i ++;
  2860. }
  2861. return counter;
  2862. }
  2863. /**
  2864. * Determines if a character is a Unicode letter or a Unicode digit. This
  2865. * is the combination of isLetter and isDigit.
  2866. * <br>
  2867. * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
  2868. *
  2869. * @param ch character to test
  2870. * @return true if ch is a Unicode letter or a Unicode digit, else false
  2871. * @see #isDigit(char)
  2872. * @see #isJavaIdentifierPart(char)
  2873. * @see #isJavaLetter(char)
  2874. * @see #isJavaLetterOrDigit(char)
  2875. * @see #isLetter(char)
  2876. * @see #isUnicodeIdentifierPart(char)
  2877. */
  2878. public static boolean isLetterOrDigit(char ch)
  2879. {
  2880. return isLetterOrDigit((int)ch);
  2881. }
  2882. /**
  2883. * Determines if a character is a Unicode letter or a Unicode digit. This
  2884. * is the combination of isLetter and isDigit.
  2885. * <br>
  2886. * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd]
  2887. *
  2888. * @param codePoint character to test
   * @return true if codePoint is a Unicode letter or a Unicode digit, else false
  2890. * @see #isDigit(char)
  2891. * @see #isJavaIdentifierPart(char)
  2892. * @see #isJavaLetter(char)
  2893. * @see #isJavaLetterOrDigit(char)
  2894. * @see #isLetter(char)
  2895. * @see #isUnicodeIdentifierPart(char)
  2896. *
  2897. * @since 1.5
  2898. */
  2899. public static boolean isLetterOrDigit(int codePoint)
  2900. {
  2901. return ((1 << getType(codePoint))
  2902. & ((1 << UPPERCASE_LETTER)
  2903. | (1 << LOWERCASE_LETTER)
  2904. | (1 << TITLECASE_LETTER)
  2905. | (1 << MODIFIER_LETTER)
  2906. | (1 << OTHER_LETTER)
  2907. | (1 << DECIMAL_DIGIT_NUMBER))) != 0;
  2908. }
  2909. /**
  2910. * Determines if a character can start a Java identifier. This is the
  2911. * combination of isLetter, any character where getType returns
  2912. * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
  2913. * (like '_').
  2914. *
  2915. * @param ch character to test
  2916. * @return true if ch can start a Java identifier, else false
  2917. * @deprecated Replaced by {@link #isJavaIdentifierStart(char)}
  2918. * @see #isJavaLetterOrDigit(char)
  2919. * @see #isJavaIdentifierStart(char)
  2920. * @see #isJavaIdentifierPart(char)
  2921. * @see #isLetter(char)
  2922. * @see #isLetterOrDigit(char)
  2923. * @see #isUnicodeIdentifierStart(char)
  2924. */
  2925. public static boolean isJavaLetter(char ch)
  2926. {
  2927. return isJavaIdentifierStart(ch);
  2928. }
  2929. /**
  2930. * Determines if a character can follow the first letter in
  2931. * a Java identifier. This is the combination of isJavaLetter (isLetter,
  2932. * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
  2933. * numeric letter (like Roman numerals), combining marks, non-spacing marks,
  2934. * or isIdentifierIgnorable.
  2935. *
  2936. * @param ch character to test
  2937. * @return true if ch can follow the first letter in a Java identifier
  2938. * @deprecated Replaced by {@link #isJavaIdentifierPart(char)}
  2939. * @see #isJavaLetter(char)
  2940. * @see #isJavaIdentifierStart(char)
  2941. * @see #isJavaIdentifierPart(char)
  2942. * @see #isLetter(char)
  2943. * @see #isLetterOrDigit(char)
  2944. * @see #isUnicodeIdentifierPart(char)
  2945. * @see #isIdentifierIgnorable(char)
  2946. */
  2947. public static boolean isJavaLetterOrDigit(char ch)
  2948. {
  2949. return isJavaIdentifierPart(ch);
  2950. }
  2951. /**
  2952. * Determines if a character can start a Java identifier. This is the
  2953. * combination of isLetter, any character where getType returns
  2954. * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
  2955. * (like '_').
  2956. * <br>
  2957. * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
  2958. *
  2959. * @param ch character to test
  2960. * @return true if ch can start a Java identifier, else false
  2961. * @see #isJavaIdentifierPart(char)
  2962. * @see #isLetter(char)
  2963. * @see #isUnicodeIdentifierStart(char)
  2964. * @since 1.1
  2965. */
  2966. public static boolean isJavaIdentifierStart(char ch)
  2967. {
  2968. return isJavaIdentifierStart((int)ch);
  2969. }
  2970. /**
  2971. * Determines if a character can start a Java identifier. This is the
  2972. * combination of isLetter, any character where getType returns
  2973. * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation
  2974. * (like '_').
  2975. * <br>
  2976. * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]
  2977. *
  2978. * @param codePoint character to test
  2979. * @return true if ch can start a Java identifier, else false
  2980. * @see #isJavaIdentifierPart(char)
  2981. * @see #isLetter(char)
  2982. * @see #isUnicodeIdentifierStart(char)
  2983. * @since 1.5
  2984. */
  2985. public static boolean isJavaIdentifierStart(int codePoint)
  2986. {
  2987. return ((1 << getType(codePoint))
  2988. & ((1 << UPPERCASE_LETTER)
  2989. | (1 << LOWERCASE_LETTER)
  2990. | (1 << TITLECASE_LETTER)
  2991. | (1 << MODIFIER_LETTER)
  2992. | (1 << OTHER_LETTER)
  2993. | (1 << LETTER_NUMBER)
  2994. | (1 << CURRENCY_SYMBOL)
  2995. | (1 << CONNECTOR_PUNCTUATION))) != 0;
  2996. }
  2997. /**
  2998. * Determines if a character can follow the first letter in
  2999. * a Java identifier. This is the combination of isJavaLetter (isLetter,
  3000. * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
  3001. * numeric letter (like Roman numerals), combining marks, non-spacing marks,
  3002. * or isIdentifierIgnorable.
  3003. * <br>
  3004. * Java identifier extender =
  3005. * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
  3006. * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
  3007. *
  3008. * @param ch character to test
  3009. * @return true if ch can follow the first letter in a Java identifier
  3010. * @see #isIdentifierIgnorable(char)
  3011. * @see #isJavaIdentifierStart(char)
  3012. * @see #isLetterOrDigit(char)
  3013. * @see #isUnicodeIdentifierPart(char)
  3014. * @since 1.1
  3015. */
  3016. public static boolean isJavaIdentifierPart(char ch)
  3017. {
  3018. return isJavaIdentifierPart((int)ch);
  3019. }
  3020. /**
  3021. * Determines if a character can follow the first letter in
  3022. * a Java identifier. This is the combination of isJavaLetter (isLetter,
  3023. * type of LETTER_NUMBER, currency, connecting punctuation) and digit,
  3024. * numeric letter (like Roman numerals), combining marks, non-spacing marks,
  3025. * or isIdentifierIgnorable.
  3026. * <br>
  3027. * Java identifier extender =
  3028. * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf]
  3029. * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
  3030. *
  3031. * @param codePoint character to test
  3032. * @return true if ch can follow the first letter in a Java identifier
  3033. * @see #isIdentifierIgnorable(char)
  3034. * @see #isJavaIdentifierStart(char)
  3035. * @see #isLetterOrDigit(char)
  3036. * @see #isUnicodeIdentifierPart(char)
  3037. * @since 1.5
  3038. */
  3039. public static boolean isJavaIdentifierPart(int codePoint)
  3040. {
  3041. int category = getType(codePoint);
  3042. return ((1 << category)
  3043. & ((1 << UPPERCASE_LETTER)
  3044. | (1 << LOWERCASE_LETTER)
  3045. | (1 << TITLECASE_LETTER)
  3046. | (1 << MODIFIER_LETTER)
  3047. | (1 << OTHER_LETTER)
  3048. | (1 << NON_SPACING_MARK)
  3049. | (1 << COMBINING_SPACING_MARK)
  3050. | (1 << DECIMAL_DIGIT_NUMBER)
  3051. | (1 << LETTER_NUMBER)
  3052. | (1 << CURRENCY_SYMBOL)
  3053. | (1 << CONNECTOR_PUNCTUATION)
  3054. | (1 << FORMAT))) != 0
  3055. || (category == CONTROL && isIdentifierIgnorable(codePoint));
  3056. }
  3057. /**
  3058. * Determines if a character can start a Unicode identifier. Only
  3059. * letters can start a Unicode identifier, but this includes characters
  3060. * in LETTER_NUMBER.
  3061. * <br>
  3062. * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
  3063. *
  3064. * @param ch character to test
  3065. * @return true if ch can start a Unicode identifier, else false
  3066. * @see #isJavaIdentifierStart(char)
  3067. * @see #isLetter(char)
  3068. * @see #isUnicodeIdentifierPart(char)
  3069. * @since 1.1
  3070. */
  3071. public static boolean isUnicodeIdentifierStart(char ch)
  3072. {
  3073. return isUnicodeIdentifierStart((int)ch);
  3074. }
  3075. /**
  3076. * Determines if a character can start a Unicode identifier. Only
  3077. * letters can start a Unicode identifier, but this includes characters
  3078. * in LETTER_NUMBER.
  3079. * <br>
  3080. * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]
  3081. *
  3082. * @param codePoint character to test
  3083. * @return true if ch can start a Unicode identifier, else false
  3084. * @see #isJavaIdentifierStart(char)
  3085. * @see #isLetter(char)
  3086. * @see #isUnicodeIdentifierPart(char)
  3087. * @since 1.5
  3088. */
  3089. public static boolean isUnicodeIdentifierStart(int codePoint)
  3090. {
  3091. return ((1 << getType(codePoint))
  3092. & ((1 << UPPERCASE_LETTER)
  3093. | (1 << LOWERCASE_LETTER)
  3094. | (1 << TITLECASE_LETTER)
  3095. | (1 << MODIFIER_LETTER)
  3096. | (1 << OTHER_LETTER)
  3097. | (1 << LETTER_NUMBER))) != 0;
  3098. }
  3099. /**
  3100. * Determines if a character can follow the first letter in
  3101. * a Unicode identifier. This includes letters, connecting punctuation,
  3102. * digits, numeric letters, combining marks, non-spacing marks, and
  3103. * isIdentifierIgnorable.
  3104. * <br>
  3105. * Unicode identifier extender =
  3106. * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
  3107. * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
  3108. *
  3109. * @param ch character to test
  3110. * @return true if ch can follow the first letter in a Unicode identifier
  3111. * @see #isIdentifierIgnorable(char)
  3112. * @see #isJavaIdentifierPart(char)
  3113. * @see #isLetterOrDigit(char)
  3114. * @see #isUnicodeIdentifierStart(char)
  3115. * @since 1.1
  3116. */
  3117. public static boolean isUnicodeIdentifierPart(char ch)
  3118. {
  3119. return isUnicodeIdentifierPart((int)ch);
  3120. }
  3121. /**
  3122. * Determines if a character can follow the first letter in
  3123. * a Unicode identifier. This includes letters, connecting punctuation,
  3124. * digits, numeric letters, combining marks, non-spacing marks, and
  3125. * isIdentifierIgnorable.
  3126. * <br>
  3127. * Unicode identifier extender =
  3128. * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]|
  3129. * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F
  3130. *
  3131. * @param codePoint character to test
  3132. * @return true if ch can follow the first letter in a Unicode identifier
  3133. * @see #isIdentifierIgnorable(char)
  3134. * @see #isJavaIdentifierPart(char)
  3135. * @see #isLetterOrDigit(char)
  3136. * @see #isUnicodeIdentifierStart(char)
  3137. * @since 1.5
  3138. */
  3139. public static boolean isUnicodeIdentifierPart(int codePoint)
  3140. {
  3141. int category = getType(codePoint);
  3142. return ((1 << category)
  3143. & ((1 << UPPERCASE_LETTER)
  3144. | (1 << LOWERCASE_LETTER)
  3145. | (1 << TITLECASE_LETTER)
  3146. | (1 << MODIFIER_LETTER)
  3147. | (1 << OTHER_LETTER)
  3148. | (1 << NON_SPACING_MARK)
  3149. | (1 << COMBINING_SPACING_MARK)
  3150. | (1 << DECIMAL_DIGIT_NUMBER)
  3151. | (1 << LETTER_NUMBER)
  3152. | (1 << CONNECTOR_PUNCTUATION)
  3153. | (1 << FORMAT))) != 0
  3154. || (category == CONTROL && isIdentifierIgnorable(codePoint));
  3155. }
  3156. /**
  3157. * Determines if a character is ignorable in a Unicode identifier. This
  3158. * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
  3159. * through <code>'\u0008'</code>, <code>'\u000E'</code> through
  3160. * <code>'\u001B'</code>, and <code>'\u007F'</code> through
  3161. * <code>'\u009F'</code>), and FORMAT characters.
  3162. * <br>
  3163. * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
  3164. * |U+007F-U+009F
  3165. *
  3166. * @param ch character to test
  3167. * @return true if ch is ignorable in a Unicode or Java identifier
  3168. * @see #isJavaIdentifierPart(char)
  3169. * @see #isUnicodeIdentifierPart(char)
  3170. * @since 1.1
  3171. */
  3172. public static boolean isIdentifierIgnorable(char ch)
  3173. {
  3174. return isIdentifierIgnorable((int)ch);
  3175. }
  3176. /**
  3177. * Determines if a character is ignorable in a Unicode identifier. This
  3178. * includes the non-whitespace ISO control characters (<code>'\u0000'</code>
  3179. * through <code>'\u0008'</code>, <code>'\u000E'</code> through
  3180. * <code>'\u001B'</code>, and <code>'\u007F'</code> through
  3181. * <code>'\u009F'</code>), and FORMAT characters.
  3182. * <br>
  3183. * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B
  3184. * |U+007F-U+009F
  3185. *
  3186. * @param codePoint character to test
  3187. * @return true if ch is ignorable in a Unicode or Java identifier
  3188. * @see #isJavaIdentifierPart(char)
  3189. * @see #isUnicodeIdentifierPart(char)
  3190. * @since 1.5
  3191. */
  3192. public static boolean isIdentifierIgnorable(int codePoint)
  3193. {
  3194. if ((codePoint >= 0 && codePoint <= 0x0008)
  3195. || (codePoint >= 0x000E && codePoint <= 0x001B)
  3196. || (codePoint >= 0x007F && codePoint <= 0x009F)
  3197. || getType(codePoint) == FORMAT)
  3198. return true;
  3199. return false;
  3200. }
  3201. /**
  3202. * Converts a Unicode character into its lowercase equivalent mapping.
  3203. * If a mapping does not exist, then the character passed is returned.
  3204. * Note that isLowerCase(toLowerCase(ch)) does not always return true.
  3205. *
  3206. * @param ch character to convert to lowercase
  3207. * @return lowercase mapping of ch, or ch if lowercase mapping does
  3208. * not exist
  3209. * @see #isLowerCase(char)
  3210. * @see #isUpperCase(char)
  3211. * @see #toTitleCase(char)
  3212. * @see #toUpperCase(char)
  3213. */
  3214. public static char toLowerCase(char ch)
  3215. {
  3216. return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch);
  3217. }
  3218. /**
  3219. * Converts a Unicode character into its lowercase equivalent mapping.
  3220. * If a mapping does not exist, then the character passed is returned.
  3221. * Note that isLowerCase(toLowerCase(ch)) does not always return true.
  3222. *
  3223. * @param codePoint character to convert to lowercase
  3224. * @return lowercase mapping of ch, or ch if lowercase mapping does
  3225. * not exist
  3226. * @see #isLowerCase(char)
  3227. * @see #isUpperCase(char)
  3228. * @see #toTitleCase(char)
  3229. * @see #toUpperCase(char)
  3230. *
  3231. * @since 1.5
  3232. */
  3233. public static int toLowerCase(int codePoint)
  3234. {
  3235. // If the code point is unassigned or in one of the private use areas
  3236. // then we delegate the call to the appropriate private static inner class.
  3237. int plane = codePoint >>> 16;
  3238. if (plane > 2 && plane < 14)
  3239. return UnassignedCharacters.toLowerCase(codePoint);
  3240. if (plane > 14)
  3241. return PrivateUseCharacters.toLowerCase(codePoint);
  3242. // The short value stored in lower[plane] is the signed difference between
  3243. // codePoint and its lowercase conversion.
  3244. return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
  3245. }
  3246. /**
  3247. * Converts a Unicode character into its uppercase equivalent mapping.
  3248. * If a mapping does not exist, then the character passed is returned.
  3249. * Note that isUpperCase(toUpperCase(ch)) does not always return true.
  3250. *
  3251. * @param ch character to convert to uppercase
  3252. * @return uppercase mapping of ch, or ch if uppercase mapping does
  3253. * not exist
  3254. * @see #isLowerCase(char)
  3255. * @see #isUpperCase(char)
  3256. * @see #toLowerCase(char)
  3257. * @see #toTitleCase(char)
  3258. */
  3259. public static char toUpperCase(char ch)
  3260. {
  3261. return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch);
  3262. }
  3263. /**
  3264. * Converts a Unicode character into its uppercase equivalent mapping.
  3265. * If a mapping does not exist, then the character passed is returned.
  3266. * Note that isUpperCase(toUpperCase(ch)) does not always return true.
  3267. *
  3268. * @param codePoint character to convert to uppercase
  3269. * @return uppercase mapping of ch, or ch if uppercase mapping does
  3270. * not exist
  3271. * @see #isLowerCase(char)
  3272. * @see #isUpperCase(char)
  3273. * @see #toLowerCase(char)
  3274. * @see #toTitleCase(char)
  3275. *
  3276. * @since 1.5
  3277. */
  3278. public static int toUpperCase(int codePoint)
  3279. {
  3280. // If the code point is unassigned or in one of the private use areas
  3281. // then we delegate the call to the appropriate private static inner class.
  3282. int plane = codePoint >>> 16;
  3283. if (plane > 2 && plane < 14)
  3284. return UnassignedCharacters.toUpperCase(codePoint);
  3285. if (plane > 14)
  3286. return PrivateUseCharacters.toUpperCase(codePoint);
  3287. // The short value stored in upper[plane] is the signed difference between
  3288. // codePoint and its uppercase conversion.
  3289. return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint;
  3290. }
  3291. /**
  3292. * Converts a Unicode character into its titlecase equivalent mapping.
  3293. * If a mapping does not exist, then the character passed is returned.
  3294. * Note that isTitleCase(toTitleCase(ch)) does not always return true.
  3295. *
  3296. * @param ch character to convert to titlecase
  3297. * @return titlecase mapping of ch, or ch if titlecase mapping does
  3298. * not exist
  3299. * @see #isTitleCase(char)
  3300. * @see #toLowerCase(char)
  3301. * @see #toUpperCase(char)
  3302. */
  3303. public static char toTitleCase(char ch)
  3304. {
  3305. // As title is short, it doesn't hurt to exhaustively iterate over it.
  3306. for (int i = title.length - 2; i >= 0; i -= 2)
  3307. if (title[i] == ch)
  3308. return title[i + 1];
  3309. return toUpperCase(ch);
  3310. }
  3311. /**
  3312. * Converts a Unicode character into its titlecase equivalent mapping.
  3313. * If a mapping does not exist, then the character passed is returned.
  3314. * Note that isTitleCase(toTitleCase(ch)) does not always return true.
  3315. *
  3316. * @param codePoint character to convert to titlecase
  3317. * @return titlecase mapping of ch, or ch if titlecase mapping does
  3318. * not exist
  3319. * @see #isTitleCase(char)
  3320. * @see #toLowerCase(char)
  3321. * @see #toUpperCase(char)
  3322. *
  3323. * @since 1.5
  3324. */
  3325. public static int toTitleCase(int codePoint)
  3326. {
  3327. // As of Unicode 4.0.0 no characters outside of plane 0 have
  3328. // titlecase mappings that are different from their uppercase
  3329. // mapping.
  3330. if (codePoint < 0x10000)
  3331. return (int) toTitleCase((char)codePoint);
  3332. return toUpperCase(codePoint);
  3333. }
  3334. /**
  3335. * Converts a character into a digit of the specified radix. If the radix
  3336. * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
  3337. * exceeds the radix, or if ch is not a decimal digit or in the case
  3338. * insensitive set of 'a'-'z', the result is -1.
  3339. * <br>
  3340. * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
  3341. * |U+FF21-U+FF3A|U+FF41-U+FF5A
  3342. *
  3343. * @param ch character to convert into a digit
  3344. * @param radix radix in which ch is a digit
  3345. * @return digit which ch represents in radix, or -1 not a valid digit
  3346. * @see #MIN_RADIX
  3347. * @see #MAX_RADIX
  3348. * @see #forDigit(int, int)
  3349. * @see #isDigit(char)
  3350. * @see #getNumericValue(char)
  3351. */
  3352. public static int digit(char ch, int radix)
  3353. {
  3354. if (radix < MIN_RADIX || radix > MAX_RADIX)
  3355. return -1;
  3356. char attr = readCodePoint((int)ch);
  3357. if (((1 << (attr & TYPE_MASK))
  3358. & ((1 << UPPERCASE_LETTER)
  3359. | (1 << LOWERCASE_LETTER)
  3360. | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
  3361. {
  3362. // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
  3363. int digit = numValue[0][attr >> 7];
  3364. return (digit < radix) ? digit : -1;
  3365. }
  3366. return -1;
  3367. }
  3368. /**
  3369. * Converts a character into a digit of the specified radix. If the radix
  3370. * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch)
  3371. * exceeds the radix, or if ch is not a decimal digit or in the case
  3372. * insensitive set of 'a'-'z', the result is -1.
  3373. * <br>
  3374. * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
  3375. * |U+FF21-U+FF3A|U+FF41-U+FF5A
  3376. *
  3377. * @param codePoint character to convert into a digit
  3378. * @param radix radix in which ch is a digit
  3379. * @return digit which ch represents in radix, or -1 not a valid digit
  3380. * @see #MIN_RADIX
  3381. * @see #MAX_RADIX
  3382. * @see #forDigit(int, int)
  3383. * @see #isDigit(char)
  3384. * @see #getNumericValue(char)
  3385. */
  3386. public static int digit(int codePoint, int radix)
  3387. {
  3388. if (radix < MIN_RADIX || radix > MAX_RADIX)
  3389. return -1;
  3390. // If the code point is unassigned or in one of the private use areas
  3391. // then we delegate the call to the appropriate private static inner class.
  3392. int plane = codePoint >>> 16;
  3393. if (plane > 2 && plane < 14)
  3394. return UnassignedCharacters.digit(codePoint, radix);
  3395. if (plane > 14)
  3396. return PrivateUseCharacters.digit(codePoint, radix);
  3397. char attr = readCodePoint(codePoint);
  3398. if (((1 << (attr & TYPE_MASK))
  3399. & ((1 << UPPERCASE_LETTER)
  3400. | (1 << LOWERCASE_LETTER)
  3401. | (1 << DECIMAL_DIGIT_NUMBER))) != 0)
  3402. {
  3403. // Signedness doesn't matter; 0xffff vs. -1 are both rejected.
  3404. int digit = numValue[plane][attr >> 7];
  3405. // If digit is less than or equal to -3 then the numerical value was
  3406. // too large to fit into numValue and is stored in CharData.LARGENUMS.
  3407. if (digit <= -3)
  3408. digit = CharData.LARGENUMS[-digit - 3];
  3409. return (digit < radix) ? digit : -1;
  3410. }
  3411. return -1;
  3412. }
  3413. /**
  3414. * Returns the Unicode numeric value property of a character. For example,
  3415. * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
  3416. *
  3417. * <p>This method also returns values for the letters A through Z, (not
  3418. * specified by Unicode), in these ranges: <code>'\u0041'</code>
  3419. * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
  3420. * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
  3421. * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
  3422. * <code>'\uFF5A'</code> (full width variants).
  3423. *
  3424. * <p>If the character lacks a numeric value property, -1 is returned.
  3425. * If the character has a numeric value property which is not representable
  3426. * as a nonnegative integer, such as a fraction, -2 is returned.
  3427. *
  3428. * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
  3429. * |U+FF21-U+FF3A|U+FF41-U+FF5A
  3430. *
  3431. * @param ch character from which the numeric value property will
  3432. * be retrieved
  3433. * @return the numeric value property of ch, or -1 if it does not exist, or
  3434. * -2 if it is not representable as a nonnegative integer
  3435. * @see #forDigit(int, int)
  3436. * @see #digit(char, int)
  3437. * @see #isDigit(char)
  3438. * @since 1.1
  3439. */
  3440. public static int getNumericValue(char ch)
  3441. {
  3442. // Treat numValue as signed.
  3443. return (short) numValue[0][readCodePoint((int)ch) >> 7];
  3444. }
  3445. /**
  3446. * Returns the Unicode numeric value property of a character. For example,
  3447. * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
  3448. *
  3449. * <p>This method also returns values for the letters A through Z, (not
  3450. * specified by Unicode), in these ranges: <code>'\u0041'</code>
  3451. * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
  3452. * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
  3453. * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
  3454. * <code>'\uFF5A'</code> (full width variants).
  3455. *
  3456. * <p>If the character lacks a numeric value property, -1 is returned.
  3457. * If the character has a numeric value property which is not representable
  3458. * as a nonnegative integer, such as a fraction, -2 is returned.
  3459. *
  3460. * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
  3461. * |U+FF21-U+FF3A|U+FF41-U+FF5A
  3462. *
  3463. * @param codePoint character from which the numeric value property will
  3464. * be retrieved
  3465. * @return the numeric value property of ch, or -1 if it does not exist, or
  3466. * -2 if it is not representable as a nonnegative integer
  3467. * @see #forDigit(int, int)
  3468. * @see #digit(char, int)
  3469. * @see #isDigit(char)
  3470. * @since 1.5
  3471. */
  3472. public static int getNumericValue(int codePoint)
  3473. {
  3474. // If the code point is unassigned or in one of the private use areas
  3475. // then we delegate the call to the appropriate private static inner class.
  3476. int plane = codePoint >>> 16;
  3477. if (plane > 2 && plane < 14)
  3478. return UnassignedCharacters.getNumericValue(codePoint);
  3479. if (plane > 14)
  3480. return PrivateUseCharacters.getNumericValue(codePoint);
  3481. // If the value N found in numValue[plane] is less than or equal to -3
  3482. // then the numeric value was too big to fit into 16 bits and is
  3483. // stored in CharData.LARGENUMS at offset (-N - 3).
  3484. short num = (short)numValue[plane][readCodePoint(codePoint) >> 7];
  3485. if (num <= -3)
  3486. return CharData.LARGENUMS[-num - 3];
  3487. return num;
  3488. }
  3489. /**
  3490. * Determines if a character is a ISO-LATIN-1 space. This is only the five
  3491. * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>,
  3492. * <code>'\r'</code>, and <code>' '</code>.
  3493. * <br>
  3494. * Java space = U+0020|U+0009|U+000A|U+000C|U+000D
  3495. *
  3496. * @param ch character to test
  3497. * @return true if ch is a space, else false
  3498. * @deprecated Replaced by {@link #isWhitespace(char)}
  3499. * @see #isSpaceChar(char)
  3500. * @see #isWhitespace(char)
  3501. */
  3502. public static boolean isSpace(char ch)
  3503. {
  3504. // Performing the subtraction up front alleviates need to compare longs.
  3505. return ch-- <= ' ' && ((1 << ch)
  3506. & ((1 << (' ' - 1))
  3507. | (1 << ('\t' - 1))
  3508. | (1 << ('\n' - 1))
  3509. | (1 << ('\r' - 1))
  3510. | (1 << ('\f' - 1)))) != 0;
  3511. }
  3512. /**
  3513. * Determines if a character is a Unicode space character. This includes
  3514. * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
  3515. * <br>
  3516. * Unicode space = [Zs]|[Zp]|[Zl]
  3517. *
  3518. * @param ch character to test
  3519. * @return true if ch is a Unicode space, else false
  3520. * @see #isWhitespace(char)
  3521. * @since 1.1
  3522. */
  3523. public static boolean isSpaceChar(char ch)
  3524. {
  3525. return isSpaceChar((int)ch);
  3526. }
  3527. /**
  3528. * Determines if a character is a Unicode space character. This includes
  3529. * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
  3530. * <br>
  3531. * Unicode space = [Zs]|[Zp]|[Zl]
  3532. *
  3533. * @param codePoint character to test
  3534. * @return true if ch is a Unicode space, else false
  3535. * @see #isWhitespace(char)
  3536. * @since 1.5
  3537. */
  3538. public static boolean isSpaceChar(int codePoint)
  3539. {
  3540. return ((1 << getType(codePoint))
  3541. & ((1 << SPACE_SEPARATOR)
  3542. | (1 << LINE_SEPARATOR)
  3543. | (1 << PARAGRAPH_SEPARATOR))) != 0;
  3544. }
  3545. /**
  3546. * Determines if a character is Java whitespace. This includes Unicode
  3547. * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
  3548. * PARAGRAPH_SEPARATOR) except the non-breaking spaces
  3549. * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
  3550. * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
  3551. * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
  3552. * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
  3553. * and <code>'\u001F'</code>.
  3554. * <br>
  3555. * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
  3556. *
  3557. * @param ch character to test
  3558. * @return true if ch is Java whitespace, else false
  3559. * @see #isSpaceChar(char)
  3560. * @since 1.1
  3561. */
  3562. public static boolean isWhitespace(char ch)
  3563. {
  3564. return isWhitespace((int) ch);
  3565. }
  3566. /**
  3567. * Determines if a character is Java whitespace. This includes Unicode
  3568. * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and
  3569. * PARAGRAPH_SEPARATOR) except the non-breaking spaces
  3570. * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>);
  3571. * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>,
  3572. * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>,
  3573. * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>,
  3574. * and <code>'\u001F'</code>.
  3575. * <br>
  3576. * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F
  3577. *
  3578. * @param codePoint character to test
  3579. * @return true if ch is Java whitespace, else false
  3580. * @see #isSpaceChar(char)
  3581. * @since 1.5
  3582. */
  3583. public static boolean isWhitespace(int codePoint)
  3584. {
  3585. int plane = codePoint >>> 16;
  3586. if (plane > 2 && plane < 14)
  3587. return UnassignedCharacters.isWhiteSpace(codePoint);
  3588. if (plane > 14)
  3589. return PrivateUseCharacters.isWhiteSpace(codePoint);
  3590. int attr = readCodePoint(codePoint);
  3591. return ((((1 << (attr & TYPE_MASK))
  3592. & ((1 << SPACE_SEPARATOR)
  3593. | (1 << LINE_SEPARATOR)
  3594. | (1 << PARAGRAPH_SEPARATOR))) != 0)
  3595. && (attr & NO_BREAK_MASK) == 0)
  3596. || (codePoint <= '\u001F' && ((1 << codePoint)
  3597. & ((1 << '\t')
  3598. | (1 << '\n')
  3599. | (1 << '\u000B')
  3600. | (1 << '\u000C')
  3601. | (1 << '\r')
  3602. | (1 << '\u001C')
  3603. | (1 << '\u001D')
  3604. | (1 << '\u001E')
  3605. | (1 << '\u001F'))) != 0);
  3606. }
  3607. /**
  3608. * Determines if a character has the ISO Control property.
  3609. * <br>
  3610. * ISO Control = [Cc]
  3611. *
  3612. * @param ch character to test
  3613. * @return true if ch is an ISO Control character, else false
  3614. * @see #isSpaceChar(char)
  3615. * @see #isWhitespace(char)
  3616. * @since 1.1
  3617. */
  3618. public static boolean isISOControl(char ch)
  3619. {
  3620. return isISOControl((int)ch);
  3621. }
  3622. /**
  3623. * Determines if the character is an ISO Control character. This is true
  3624. * if the code point is in the range [0, 0x001F] or if it is in the range
  3625. * [0x007F, 0x009F].
  3626. * @param codePoint the character to check
  3627. * @return true if the character is in one of the above ranges
  3628. *
  3629. * @since 1.5
  3630. */
  3631. public static boolean isISOControl(int codePoint)
  3632. {
  3633. if ((codePoint >= 0 && codePoint <= 0x001F)
  3634. || (codePoint >= 0x007F && codePoint <= 0x009F))
  3635. return true;
  3636. return false;
  3637. }
  3638. /**
  3639. * Returns the Unicode general category property of a character.
  3640. *
  3641. * @param ch character from which the general category property will
  3642. * be retrieved
  3643. * @return the character category property of ch as an integer
  3644. * @see #UNASSIGNED
  3645. * @see #UPPERCASE_LETTER
  3646. * @see #LOWERCASE_LETTER
  3647. * @see #TITLECASE_LETTER
  3648. * @see #MODIFIER_LETTER
  3649. * @see #OTHER_LETTER
  3650. * @see #NON_SPACING_MARK
  3651. * @see #ENCLOSING_MARK
  3652. * @see #COMBINING_SPACING_MARK
  3653. * @see #DECIMAL_DIGIT_NUMBER
  3654. * @see #LETTER_NUMBER
  3655. * @see #OTHER_NUMBER
  3656. * @see #SPACE_SEPARATOR
  3657. * @see #LINE_SEPARATOR
  3658. * @see #PARAGRAPH_SEPARATOR
  3659. * @see #CONTROL
  3660. * @see #FORMAT
  3661. * @see #PRIVATE_USE
  3662. * @see #SURROGATE
  3663. * @see #DASH_PUNCTUATION
  3664. * @see #START_PUNCTUATION
  3665. * @see #END_PUNCTUATION
  3666. * @see #CONNECTOR_PUNCTUATION
  3667. * @see #OTHER_PUNCTUATION
  3668. * @see #MATH_SYMBOL
  3669. * @see #CURRENCY_SYMBOL
  3670. * @see #MODIFIER_SYMBOL
  3671. * @see #INITIAL_QUOTE_PUNCTUATION
  3672. * @see #FINAL_QUOTE_PUNCTUATION
  3673. * @since 1.1
  3674. */
  3675. public static int getType(char ch)
  3676. {
  3677. return getType((int)ch);
  3678. }
  3679. /**
  3680. * Returns the Unicode general category property of a character.
  3681. *
  3682. * @param codePoint character from which the general category property will
  3683. * be retrieved
  3684. * @return the character category property of ch as an integer
  3685. * @see #UNASSIGNED
  3686. * @see #UPPERCASE_LETTER
  3687. * @see #LOWERCASE_LETTER
  3688. * @see #TITLECASE_LETTER
  3689. * @see #MODIFIER_LETTER
  3690. * @see #OTHER_LETTER
  3691. * @see #NON_SPACING_MARK
  3692. * @see #ENCLOSING_MARK
  3693. * @see #COMBINING_SPACING_MARK
  3694. * @see #DECIMAL_DIGIT_NUMBER
  3695. * @see #LETTER_NUMBER
  3696. * @see #OTHER_NUMBER
  3697. * @see #SPACE_SEPARATOR
  3698. * @see #LINE_SEPARATOR
  3699. * @see #PARAGRAPH_SEPARATOR
  3700. * @see #CONTROL
  3701. * @see #FORMAT
  3702. * @see #PRIVATE_USE
  3703. * @see #SURROGATE
  3704. * @see #DASH_PUNCTUATION
  3705. * @see #START_PUNCTUATION
  3706. * @see #END_PUNCTUATION
  3707. * @see #CONNECTOR_PUNCTUATION
  3708. * @see #OTHER_PUNCTUATION
  3709. * @see #MATH_SYMBOL
  3710. * @see #CURRENCY_SYMBOL
  3711. * @see #MODIFIER_SYMBOL
  3712. * @see #INITIAL_QUOTE_PUNCTUATION
  3713. * @see #FINAL_QUOTE_PUNCTUATION
  3714. *
  3715. * @since 1.5
  3716. */
  3717. public static int getType(int codePoint)
  3718. {
  3719. // If the codePoint is unassigned or in one of the private use areas
  3720. // then we delegate the call to the appropriate private static inner class.
  3721. int plane = codePoint >>> 16;
  3722. if (plane > 2 && plane < 14)
  3723. return UnassignedCharacters.getType(codePoint);
  3724. if (plane > 14)
  3725. return PrivateUseCharacters.getType(codePoint);
  3726. return readCodePoint(codePoint) & TYPE_MASK;
  3727. }
  3728. /**
  3729. * Converts a digit into a character which represents that digit
  3730. * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX,
  3731. * or the digit exceeds the radix, then the null character <code>'\0'</code>
  3732. * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'.
  3733. * <br>
  3734. * return value boundary = U+0030-U+0039|U+0061-U+007A
  3735. *
  3736. * @param digit digit to be converted into a character
  3737. * @param radix radix of digit
  3738. * @return character representing digit in radix, or '\0'
  3739. * @see #MIN_RADIX
  3740. * @see #MAX_RADIX
  3741. * @see #digit(char, int)
  3742. */
  3743. public static char forDigit(int digit, int radix)
  3744. {
  3745. if (radix < MIN_RADIX || radix > MAX_RADIX
  3746. || digit < 0 || digit >= radix)
  3747. return '\0';
  3748. return Number.digits[digit];
  3749. }
  3750. /**
  3751. * Returns the Unicode directionality property of the character. This
  3752. * is used in the visual ordering of text.
  3753. *
  3754. * @param ch the character to look up
  3755. * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
  3756. * @see #DIRECTIONALITY_UNDEFINED
  3757. * @see #DIRECTIONALITY_LEFT_TO_RIGHT
  3758. * @see #DIRECTIONALITY_RIGHT_TO_LEFT
  3759. * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
  3760. * @see #DIRECTIONALITY_EUROPEAN_NUMBER
  3761. * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
  3762. * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
  3763. * @see #DIRECTIONALITY_ARABIC_NUMBER
  3764. * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
  3765. * @see #DIRECTIONALITY_NONSPACING_MARK
  3766. * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
  3767. * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
  3768. * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
  3769. * @see #DIRECTIONALITY_WHITESPACE
  3770. * @see #DIRECTIONALITY_OTHER_NEUTRALS
  3771. * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
  3772. * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
  3773. * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
  3774. * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
  3775. * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
  3776. * @since 1.4
  3777. */
  3778. public static byte getDirectionality(char ch)
  3779. {
  3780. // The result will correctly be signed.
  3781. return getDirectionality((int)ch);
  3782. }
  3783. /**
  3784. * Returns the Unicode directionality property of the character. This
  3785. * is used in the visual ordering of text.
  3786. *
  3787. * @param codePoint the character to look up
  3788. * @return the directionality constant, or DIRECTIONALITY_UNDEFINED
  3789. * @see #DIRECTIONALITY_UNDEFINED
  3790. * @see #DIRECTIONALITY_LEFT_TO_RIGHT
  3791. * @see #DIRECTIONALITY_RIGHT_TO_LEFT
  3792. * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
  3793. * @see #DIRECTIONALITY_EUROPEAN_NUMBER
  3794. * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
  3795. * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR
  3796. * @see #DIRECTIONALITY_ARABIC_NUMBER
  3797. * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR
  3798. * @see #DIRECTIONALITY_NONSPACING_MARK
  3799. * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL
  3800. * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR
  3801. * @see #DIRECTIONALITY_SEGMENT_SEPARATOR
  3802. * @see #DIRECTIONALITY_WHITESPACE
  3803. * @see #DIRECTIONALITY_OTHER_NEUTRALS
  3804. * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING
  3805. * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE
  3806. * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
  3807. * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
  3808. * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
  3809. * @since 1.5
  3810. */
  3811. public static byte getDirectionality(int codePoint)
  3812. {
  3813. // If the code point is unassigned or in one of the private use areas
  3814. // then we delegate the call to the appropriate private static inner class.
  3815. int plane = codePoint >>> 16;
  3816. if (plane > 2 && plane < 14)
  3817. return UnassignedCharacters.getDirectionality(codePoint);
  3818. if (plane > 14)
  3819. return PrivateUseCharacters.getDirectionality(codePoint);
  3820. // The result will correctly be signed.
  3821. return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2);
  3822. }
  3823. /**
  3824. * Determines whether the character is mirrored according to Unicode. For
  3825. * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
  3826. * left-to-right text, but ')' in right-to-left text.
  3827. *
  3828. * @param ch the character to look up
  3829. * @return true if the character is mirrored
  3830. * @since 1.4
  3831. */
  3832. public static boolean isMirrored(char ch)
  3833. {
  3834. return (readCodePoint((int)ch) & MIRROR_MASK) != 0;
  3835. }
  3836. /**
  3837. * Determines whether the character is mirrored according to Unicode. For
  3838. * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in
  3839. * left-to-right text, but ')' in right-to-left text.
  3840. *
  3841. * @param codePoint the character to look up
  3842. * @return true if the character is mirrored
  3843. * @since 1.5
  3844. */
  3845. public static boolean isMirrored(int codePoint)
  3846. {
  3847. // If the code point is unassigned or part of one of the private use areas
  3848. // then we delegate the call to the appropriate private static inner class.
  3849. int plane = codePoint >>> 16;
  3850. if (plane > 2 && plane < 14)
  3851. return UnassignedCharacters.isMirrored(codePoint);
  3852. if (plane > 14)
  3853. return PrivateUseCharacters.isMirrored(codePoint);
  3854. return (readCodePoint(codePoint) & MIRROR_MASK) != 0;
  3855. }
  3856. /**
  3857. * Compares another Character to this Character, numerically.
  3858. *
  3859. * @param anotherCharacter Character to compare with this Character
  3860. * @return a negative integer if this Character is less than
  3861. * anotherCharacter, zero if this Character is equal, and
  3862. * a positive integer if this Character is greater
  3863. * @throws NullPointerException if anotherCharacter is null
  3864. * @since 1.2
  3865. */
  3866. public int compareTo(Character anotherCharacter)
  3867. {
  3868. return value - anotherCharacter.value;
  3869. }
  3870. /**
  3871. * Compares two unboxed char values.
  3872. * The result is positive if the first is greater, negative if the second
  3873. * is greater, and 0 if the two are equal.
  3874. *
  3875. * @param x First value to compare.
  3876. * @param y Second value to compare.
  3877. *
  3878. * @return positive int if the first value is greater, negative if the second
  3879. * is greater, and 0 if the two are equal.
  3880. * @since 1.7
  3881. */
  3882. public static int compare(char x, char y)
  3883. {
  3884. return Character.valueOf(x).compareTo(Character.valueOf(y));
  3885. }
  3886. /**
  3887. * Returns an <code>Character</code> object wrapping the value.
  3888. * In contrast to the <code>Character</code> constructor, this method
  3889. * will cache some values. It is used by boxing conversion.
  3890. *
  3891. * @param val the value to wrap
  3892. * @return the <code>Character</code>
  3893. *
  3894. * @since 1.5
  3895. */
  3896. public static Character valueOf(char val)
  3897. {
  3898. if (val > MAX_CACHE)
  3899. return new Character(val);
  3900. else
  3901. return charCache[val - MIN_VALUE];
  3902. }
  3903. /**
  3904. * Reverse the bytes in val.
  3905. * @since 1.5
  3906. */
  3907. public static char reverseBytes(char val)
  3908. {
  3909. return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00));
  3910. }
  3911. /**
  3912. * Converts a unicode code point to a UTF-16 representation of that
  3913. * code point.
  3914. *
  3915. * @param codePoint the unicode code point
  3916. *
  3917. * @return the UTF-16 representation of that code point
  3918. *
  3919. * @throws IllegalArgumentException if the code point is not a valid
  3920. * unicode code point
  3921. *
  3922. * @since 1.5
  3923. */
  3924. public static char[] toChars(int codePoint)
  3925. {
  3926. if (!isValidCodePoint(codePoint))
  3927. throw new IllegalArgumentException("Illegal Unicode code point : "
  3928. + codePoint);
  3929. char[] result = new char[charCount(codePoint)];
  3930. int ignore = toChars(codePoint, result, 0);
  3931. return result;
  3932. }
  3933. /**
  3934. * Converts a unicode code point to its UTF-16 representation.
  3935. *
  3936. * @param codePoint the unicode code point
  3937. * @param dst the target char array
  3938. * @param dstIndex the start index for the target
  3939. *
  3940. * @return number of characters written to <code>dst</code>
  3941. *
  3942. * @throws IllegalArgumentException if <code>codePoint</code> is not a
  3943. * valid unicode code point
  3944. * @throws NullPointerException if <code>dst</code> is <code>null</code>
  3945. * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid
  3946. * in <code>dst</code> or if the UTF-16 representation does not
  3947. * fit into <code>dst</code>
  3948. *
  3949. * @since 1.5
  3950. */
  3951. public static int toChars(int codePoint, char[] dst, int dstIndex)
  3952. {
  3953. if (!isValidCodePoint(codePoint))
  3954. {
  3955. throw new IllegalArgumentException("not a valid code point: "
  3956. + codePoint);
  3957. }
  3958. int result;
  3959. if (isSupplementaryCodePoint(codePoint))
  3960. {
  3961. // Write second char first to cause IndexOutOfBoundsException
  3962. // immediately.
  3963. final int cp2 = codePoint - 0x10000;
  3964. dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE);
  3965. dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE);
  3966. result = 2;
  3967. }
  3968. else
  3969. {
  3970. dst[dstIndex] = (char) codePoint;
  3971. result = 1;
  3972. }
  3973. return result;
  3974. }
  3975. /**
  3976. * Return number of 16-bit characters required to represent the given
  3977. * code point.
  3978. *
  3979. * @param codePoint a unicode code point
  3980. *
  3981. * @return 2 if codePoint >= 0x10000, 1 otherwise.
  3982. *
  3983. * @since 1.5
  3984. */
  3985. public static int charCount(int codePoint)
  3986. {
  3987. return
  3988. (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT)
  3989. ? 2
  3990. : 1;
  3991. }
  3992. /**
  3993. * Determines whether the specified code point is
  3994. * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode
  3995. * supplementary character range.
  3996. *
  3997. * @param codePoint a Unicode code point
  3998. *
  3999. * @return <code>true</code> if code point is in supplementary range
  4000. *
  4001. * @since 1.5
  4002. */
  4003. public static boolean isSupplementaryCodePoint(int codePoint)
  4004. {
  4005. return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT
  4006. && codePoint <= MAX_CODE_POINT;
  4007. }
  4008. /**
  4009. * Determines whether the specified code point is
  4010. * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point.
  4011. *
  4012. * @param codePoint a Unicode code point
  4013. *
  4014. * @return <code>true</code> if code point is valid
  4015. *
  4016. * @since 1.5
  4017. */
  4018. public static boolean isValidCodePoint(int codePoint)
  4019. {
  4020. return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
  4021. }
  4022. /**
  4023. * Return true if the given character is a high surrogate.
  4024. * @param ch the character
  4025. * @return true if the character is a high surrogate character
  4026. *
  4027. * @since 1.5
  4028. */
  4029. public static boolean isHighSurrogate(char ch)
  4030. {
  4031. return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
  4032. }
  4033. /**
  4034. * Return true if the given character is a low surrogate.
  4035. * @param ch the character
  4036. * @return true if the character is a low surrogate character
  4037. *
  4038. * @since 1.5
  4039. */
  4040. public static boolean isLowSurrogate(char ch)
  4041. {
  4042. return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
  4043. }
  4044. /**
  4045. * Return true if the given characters compose a surrogate pair.
  4046. * This is true if the first character is a high surrogate and the
  4047. * second character is a low surrogate.
  4048. * @param ch1 the first character
  4049. * @param ch2 the first character
  4050. * @return true if the characters compose a surrogate pair
  4051. *
  4052. * @since 1.5
  4053. */
  4054. public static boolean isSurrogatePair(char ch1, char ch2)
  4055. {
  4056. return isHighSurrogate(ch1) && isLowSurrogate(ch2);
  4057. }
  4058. /**
  4059. * Given a valid surrogate pair, this returns the corresponding
  4060. * code point.
  4061. * @param high the high character of the pair
  4062. * @param low the low character of the pair
  4063. * @return the corresponding code point
  4064. *
  4065. * @since 1.5
  4066. */
  4067. public static int toCodePoint(char high, char low)
  4068. {
  4069. return ((high - MIN_HIGH_SURROGATE) * 0x400) +
  4070. (low - MIN_LOW_SURROGATE) + 0x10000;
  4071. }
  4072. /**
  4073. * Get the code point at the specified index in the CharSequence.
  4074. * This is like CharSequence#charAt(int), but if the character is
  4075. * the start of a surrogate pair, and there is a following
  4076. * character, and this character completes the pair, then the
  4077. * corresponding supplementary code point is returned. Otherwise,
  4078. * the character at the index is returned.
  4079. *
  4080. * @param sequence the CharSequence
  4081. * @param index the index of the codepoint to get, starting at 0
  4082. * @return the codepoint at the specified index
  4083. * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
  4084. * @since 1.5
  4085. */
  4086. public static int codePointAt(CharSequence sequence, int index)
  4087. {
  4088. int len = sequence.length();
  4089. if (index < 0 || index >= len)
  4090. throw new IndexOutOfBoundsException();
  4091. char high = sequence.charAt(index);
  4092. if (! isHighSurrogate(high) || ++index >= len)
  4093. return high;
  4094. char low = sequence.charAt(index);
  4095. if (! isLowSurrogate(low))
  4096. return high;
  4097. return toCodePoint(high, low);
  4098. }
  4099. /**
  4100. * Get the code point at the specified index in the CharSequence.
  4101. * If the character is the start of a surrogate pair, and there is a
  4102. * following character, and this character completes the pair, then
  4103. * the corresponding supplementary code point is returned.
  4104. * Otherwise, the character at the index is returned.
  4105. *
  4106. * @param chars the character array in which to look
  4107. * @param index the index of the codepoint to get, starting at 0
  4108. * @return the codepoint at the specified index
  4109. * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
  4110. * @since 1.5
  4111. */
  4112. public static int codePointAt(char[] chars, int index)
  4113. {
  4114. return codePointAt(chars, index, chars.length);
  4115. }
  4116. /**
  4117. * Get the code point at the specified index in the CharSequence.
  4118. * If the character is the start of a surrogate pair, and there is a
  4119. * following character within the specified range, and this
  4120. * character completes the pair, then the corresponding
  4121. * supplementary code point is returned. Otherwise, the character
  4122. * at the index is returned.
  4123. *
  4124. * @param chars the character array in which to look
  4125. * @param index the index of the codepoint to get, starting at 0
  4126. * @param limit the limit past which characters should not be examined
  4127. * @return the codepoint at the specified index
  4128. * @throws IndexOutOfBoundsException if index is negative or &gt;=
  4129. * limit, or if limit is negative or &gt;= the length of the array
  4130. * @since 1.5
  4131. */
  4132. public static int codePointAt(char[] chars, int index, int limit)
  4133. {
  4134. if (index < 0 || index >= limit || limit < 0 || limit > chars.length)
  4135. throw new IndexOutOfBoundsException();
  4136. char high = chars[index];
  4137. if (! isHighSurrogate(high) || ++index >= limit)
  4138. return high;
  4139. char low = chars[index];
  4140. if (! isLowSurrogate(low))
  4141. return high;
  4142. return toCodePoint(high, low);
  4143. }
  4144. /**
  4145. * Get the code point before the specified index. This is like
  4146. * #codePointAt(char[], int), but checks the characters at
  4147. * <code>index-1</code> and <code>index-2</code> to see if they form
  4148. * a supplementary code point. If they do not, the character at
  4149. * <code>index-1</code> is returned.
  4150. *
  4151. * @param chars the character array
  4152. * @param index the index just past the codepoint to get, starting at 0
  4153. * @return the codepoint at the specified index
  4154. * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
  4155. * @since 1.5
  4156. */
  4157. public static int codePointBefore(char[] chars, int index)
  4158. {
  4159. return codePointBefore(chars, index, 1);
  4160. }
  4161. /**
  4162. * Get the code point before the specified index. This is like
  4163. * #codePointAt(char[], int), but checks the characters at
  4164. * <code>index-1</code> and <code>index-2</code> to see if they form
  4165. * a supplementary code point. If they do not, the character at
  4166. * <code>index-1</code> is returned. The start parameter is used to
  4167. * limit the range of the array which may be examined.
  4168. *
  4169. * @param chars the character array
  4170. * @param index the index just past the codepoint to get, starting at 0
  4171. * @param start the index before which characters should not be examined
  4172. * @return the codepoint at the specified index
  4173. * @throws IndexOutOfBoundsException if index is &gt; start or &gt;
  4174. * the length of the array, or if limit is negative or &gt;= the
  4175. * length of the array
  4176. * @since 1.5
  4177. */
  4178. public static int codePointBefore(char[] chars, int index, int start)
  4179. {
  4180. if (index < start || index > chars.length
  4181. || start < 0 || start >= chars.length)
  4182. throw new IndexOutOfBoundsException();
  4183. --index;
  4184. char low = chars[index];
  4185. if (! isLowSurrogate(low) || --index < start)
  4186. return low;
  4187. char high = chars[index];
  4188. if (! isHighSurrogate(high))
  4189. return low;
  4190. return toCodePoint(high, low);
  4191. }
  4192. /**
  4193. * Get the code point before the specified index. This is like
  4194. * #codePointAt(CharSequence, int), but checks the characters at
  4195. * <code>index-1</code> and <code>index-2</code> to see if they form
  4196. * a supplementary code point. If they do not, the character at
  4197. * <code>index-1</code> is returned.
  4198. *
  4199. * @param sequence the CharSequence
  4200. * @param index the index just past the codepoint to get, starting at 0
  4201. * @return the codepoint at the specified index
  4202. * @throws IndexOutOfBoundsException if index is negative or &gt;= length()
  4203. * @since 1.5
  4204. */
  4205. public static int codePointBefore(CharSequence sequence, int index)
  4206. {
  4207. int len = sequence.length();
  4208. if (index < 1 || index > len)
  4209. throw new IndexOutOfBoundsException();
  4210. --index;
  4211. char low = sequence.charAt(index);
  4212. if (! isLowSurrogate(low) || --index < 0)
  4213. return low;
  4214. char high = sequence.charAt(index);
  4215. if (! isHighSurrogate(high))
  4216. return low;
  4217. return toCodePoint(high, low);
  4218. }
  4219. } // class Character