// normalizer2.h (34 KB)
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2009-2013, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: normalizer2.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2009nov22
  16. * created by: Markus W. Scherer
  17. */
  18. #ifndef __NORMALIZER2_H__
  19. #define __NORMALIZER2_H__
  20. /**
  21. * \file
  22. * \brief C++ API: New API for Unicode Normalization.
  23. */
  24. #include "unicode/utypes.h"
  25. #if U_SHOW_CPLUSPLUS_API
  26. #if !UCONFIG_NO_NORMALIZATION
  27. #include "unicode/stringpiece.h"
  28. #include "unicode/uniset.h"
  29. #include "unicode/unistr.h"
  30. #include "unicode/unorm2.h"
  31. U_NAMESPACE_BEGIN
  32. class ByteSink;
  33. /**
  34. * Unicode normalization functionality for standard Unicode normalization or
  35. * for using custom mapping tables.
  36. * All instances of this class are unmodifiable/immutable.
  37. * Instances returned by getInstance() are singletons that must not be deleted by the caller.
  38. * The Normalizer2 class is not intended for public subclassing.
  39. *
  40. * The primary functions are to produce a normalized string and to detect whether
  41. * a string is already normalized.
  42. * The most commonly used normalization forms are those defined in
  43. * http://www.unicode.org/unicode/reports/tr15/
  44. * However, this API supports additional normalization forms for specialized purposes.
  45. * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
  46. * and can be used in implementations of UTS #46.
  47. *
  48. * Not only are the standard compose and decompose modes supplied,
  49. * but additional modes are provided as documented in the Mode enum.
  50. *
  51. * Some of the functions in this class identify normalization boundaries.
  52. * At a normalization boundary, the portions of the string
  53. * before it and starting from it do not interact and can be handled independently.
  54. *
  55. * The spanQuickCheckYes() stops at a normalization boundary.
  56. * When the goal is a normalized string, then the text before the boundary
  57. * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  58. *
  59. * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  60. * a character is guaranteed to be at a normalization boundary,
  61. * regardless of context.
  62. * This is used for moving from one normalization boundary to the next
  63. * or preceding boundary, and for performing iterative normalization.
  64. *
  65. * Iterative normalization is useful when only a small portion of a
  66. * longer string needs to be processed.
  67. * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  68. * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  69. * (to process only the substring for which sort key bytes are computed).
  70. *
  71. * The set of normalization boundaries returned by these functions may not be
  72. * complete: There may be more boundaries that could be returned.
  73. * Different functions may return different boundaries.
  74. * @stable ICU 4.4
  75. */
  76. class U_COMMON_API Normalizer2 : public UObject {
  77. public:
  78. /**
  79. * Destructor.
  80. * @stable ICU 4.4
  81. */
  82. ~Normalizer2();
  83. /**
  84. * Returns a Normalizer2 instance for Unicode NFC normalization.
  85. * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
  86. * Returns an unmodifiable singleton instance. Do not delete it.
  87. * @param errorCode Standard ICU error code. Its input value must
  88. * pass the U_SUCCESS() test, or else the function returns
  89. * immediately. Check for U_FAILURE() on output or use with
  90. * function chaining. (See User Guide for details.)
  91. * @return the requested Normalizer2, if successful
  92. * @stable ICU 49
  93. */
  94. static const Normalizer2 *
  95. getNFCInstance(UErrorCode &errorCode);
  96. /**
  97. * Returns a Normalizer2 instance for Unicode NFD normalization.
  98. * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
  99. * Returns an unmodifiable singleton instance. Do not delete it.
  100. * @param errorCode Standard ICU error code. Its input value must
  101. * pass the U_SUCCESS() test, or else the function returns
  102. * immediately. Check for U_FAILURE() on output or use with
  103. * function chaining. (See User Guide for details.)
  104. * @return the requested Normalizer2, if successful
  105. * @stable ICU 49
  106. */
  107. static const Normalizer2 *
  108. getNFDInstance(UErrorCode &errorCode);
  109. /**
  110. * Returns a Normalizer2 instance for Unicode NFKC normalization.
  111. * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
  112. * Returns an unmodifiable singleton instance. Do not delete it.
  113. * @param errorCode Standard ICU error code. Its input value must
  114. * pass the U_SUCCESS() test, or else the function returns
  115. * immediately. Check for U_FAILURE() on output or use with
  116. * function chaining. (See User Guide for details.)
  117. * @return the requested Normalizer2, if successful
  118. * @stable ICU 49
  119. */
  120. static const Normalizer2 *
  121. getNFKCInstance(UErrorCode &errorCode);
  122. /**
  123. * Returns a Normalizer2 instance for Unicode NFKD normalization.
  124. * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
  125. * Returns an unmodifiable singleton instance. Do not delete it.
  126. * @param errorCode Standard ICU error code. Its input value must
  127. * pass the U_SUCCESS() test, or else the function returns
  128. * immediately. Check for U_FAILURE() on output or use with
  129. * function chaining. (See User Guide for details.)
  130. * @return the requested Normalizer2, if successful
  131. * @stable ICU 49
  132. */
  133. static const Normalizer2 *
  134. getNFKDInstance(UErrorCode &errorCode);
  135. /**
  136. * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
  137. * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
  138. * Returns an unmodifiable singleton instance. Do not delete it.
  139. * @param errorCode Standard ICU error code. Its input value must
  140. * pass the U_SUCCESS() test, or else the function returns
  141. * immediately. Check for U_FAILURE() on output or use with
  142. * function chaining. (See User Guide for details.)
  143. * @return the requested Normalizer2, if successful
  144. * @stable ICU 49
  145. */
  146. static const Normalizer2 *
  147. getNFKCCasefoldInstance(UErrorCode &errorCode);
  148. /**
  149. * Returns a Normalizer2 instance which uses the specified data file
  150. * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
  151. * and which composes or decomposes text according to the specified mode.
  152. * Returns an unmodifiable singleton instance. Do not delete it.
  153. *
  154. * Use packageName=nullptr for data files that are part of ICU's own data.
  155. * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
  156. * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
  157. * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
  158. *
  159. * @param packageName nullptr for ICU built-in data, otherwise application data package name
  160. * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
  161. * @param mode normalization mode (compose or decompose etc.)
  162. * @param errorCode Standard ICU error code. Its input value must
  163. * pass the U_SUCCESS() test, or else the function returns
  164. * immediately. Check for U_FAILURE() on output or use with
  165. * function chaining. (See User Guide for details.)
  166. * @return the requested Normalizer2, if successful
  167. * @stable ICU 4.4
  168. */
  169. static const Normalizer2 *
  170. getInstance(const char *packageName,
  171. const char *name,
  172. UNormalization2Mode mode,
  173. UErrorCode &errorCode);
  174. /**
  175. * Returns the normalized form of the source string.
  176. * @param src source string
  177. * @param errorCode Standard ICU error code. Its input value must
  178. * pass the U_SUCCESS() test, or else the function returns
  179. * immediately. Check for U_FAILURE() on output or use with
  180. * function chaining. (See User Guide for details.)
  181. * @return normalized src
  182. * @stable ICU 4.4
  183. */
  184. UnicodeString
  185. normalize(const UnicodeString &src, UErrorCode &errorCode) const {
  186. UnicodeString result;
  187. normalize(src, result, errorCode);
  188. return result;
  189. }
  190. /**
  191. * Writes the normalized form of the source string to the destination string
  192. * (replacing its contents) and returns the destination string.
  193. * The source and destination strings must be different objects.
  194. * @param src source string
  195. * @param dest destination string; its contents is replaced with normalized src
  196. * @param errorCode Standard ICU error code. Its input value must
  197. * pass the U_SUCCESS() test, or else the function returns
  198. * immediately. Check for U_FAILURE() on output or use with
  199. * function chaining. (See User Guide for details.)
  200. * @return dest
  201. * @stable ICU 4.4
  202. */
  203. virtual UnicodeString &
  204. normalize(const UnicodeString &src,
  205. UnicodeString &dest,
  206. UErrorCode &errorCode) const = 0;
  207. /**
  208. * Normalizes a UTF-8 string and optionally records how source substrings
  209. * relate to changed and unchanged result substrings.
  210. *
  211. * Implemented completely for all built-in modes except for FCD.
  212. * The base class implementation converts to & from UTF-16 and does not support edits.
  213. *
  214. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  215. * @param src Source UTF-8 string.
  216. * @param sink A ByteSink to which the normalized UTF-8 result string is written.
  217. * sink.Flush() is called at the end.
  218. * @param edits Records edits for index mapping, working with styled text,
  219. * and getting only changes (if any).
  220. * The Edits contents is undefined if any error occurs.
  221. * This function calls edits->reset() first unless
  222. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  223. * @param errorCode Standard ICU error code. Its input value must
  224. * pass the U_SUCCESS() test, or else the function returns
  225. * immediately. Check for U_FAILURE() on output or use with
  226. * function chaining. (See User Guide for details.)
  227. * @stable ICU 60
  228. */
  229. virtual void
  230. normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  231. Edits *edits, UErrorCode &errorCode) const;
  232. /**
  233. * Appends the normalized form of the second string to the first string
  234. * (merging them at the boundary) and returns the first string.
  235. * The result is normalized if the first string was normalized.
  236. * The first and second strings must be different objects.
  237. * @param first string, should be normalized
  238. * @param second string, will be normalized
  239. * @param errorCode Standard ICU error code. Its input value must
  240. * pass the U_SUCCESS() test, or else the function returns
  241. * immediately. Check for U_FAILURE() on output or use with
  242. * function chaining. (See User Guide for details.)
  243. * @return first
  244. * @stable ICU 4.4
  245. */
  246. virtual UnicodeString &
  247. normalizeSecondAndAppend(UnicodeString &first,
  248. const UnicodeString &second,
  249. UErrorCode &errorCode) const = 0;
  250. /**
  251. * Appends the second string to the first string
  252. * (merging them at the boundary) and returns the first string.
  253. * The result is normalized if both the strings were normalized.
  254. * The first and second strings must be different objects.
  255. * @param first string, should be normalized
  256. * @param second string, should be normalized
  257. * @param errorCode Standard ICU error code. Its input value must
  258. * pass the U_SUCCESS() test, or else the function returns
  259. * immediately. Check for U_FAILURE() on output or use with
  260. * function chaining. (See User Guide for details.)
  261. * @return first
  262. * @stable ICU 4.4
  263. */
  264. virtual UnicodeString &
  265. append(UnicodeString &first,
  266. const UnicodeString &second,
  267. UErrorCode &errorCode) const = 0;
  268. /**
  269. * Gets the decomposition mapping of c.
  270. * Roughly equivalent to normalizing the String form of c
  271. * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
  272. * returns false and does not write a string
  273. * if c does not have a decomposition mapping in this instance's data.
  274. * This function is independent of the mode of the Normalizer2.
  275. * @param c code point
  276. * @param decomposition String object which will be set to c's
  277. * decomposition mapping, if there is one.
  278. * @return true if c has a decomposition, otherwise false
  279. * @stable ICU 4.6
  280. */
  281. virtual UBool
  282. getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
  283. /**
  284. * Gets the raw decomposition mapping of c.
  285. *
  286. * This is similar to the getDecomposition() method but returns the
  287. * raw decomposition mapping as specified in UnicodeData.txt or
  288. * (for custom data) in the mapping files processed by the gennorm2 tool.
  289. * By contrast, getDecomposition() returns the processed,
  290. * recursively-decomposed version of this mapping.
  291. *
  292. * When used on a standard NFKC Normalizer2 instance,
  293. * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
  294. *
  295. * When used on a standard NFC Normalizer2 instance,
  296. * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
  297. * in this case, the result contains either one or two code points (=1..4 char16_ts).
  298. *
  299. * This function is independent of the mode of the Normalizer2.
  300. * The default implementation returns false.
  301. * @param c code point
  302. * @param decomposition String object which will be set to c's
  303. * raw decomposition mapping, if there is one.
  304. * @return true if c has a decomposition, otherwise false
  305. * @stable ICU 49
  306. */
  307. virtual UBool
  308. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
  309. /**
  310. * Performs pairwise composition of a & b and returns the composite if there is one.
  311. *
  312. * Returns a composite code point c only if c has a two-way mapping to a+b.
  313. * In standard Unicode normalization, this means that
  314. * c has a canonical decomposition to a+b
  315. * and c does not have the Full_Composition_Exclusion property.
  316. *
  317. * This function is independent of the mode of the Normalizer2.
  318. * The default implementation returns a negative value.
  319. * @param a A (normalization starter) code point.
  320. * @param b Another code point.
  321. * @return The non-negative composite code point if there is one; otherwise a negative value.
  322. * @stable ICU 49
  323. */
  324. virtual UChar32
  325. composePair(UChar32 a, UChar32 b) const;
  326. /**
  327. * Gets the combining class of c.
  328. * The default implementation returns 0
  329. * but all standard implementations return the Unicode Canonical_Combining_Class value.
  330. * @param c code point
  331. * @return c's combining class
  332. * @stable ICU 49
  333. */
  334. virtual uint8_t
  335. getCombiningClass(UChar32 c) const;
  336. /**
  337. * Tests if the string is normalized.
  338. * Internally, in cases where the quickCheck() method would return "maybe"
  339. * (which is only possible for the two COMPOSE modes) this method
  340. * resolves to "yes" or "no" to provide a definitive result,
  341. * at the cost of doing more work in those cases.
  342. * @param s input string
  343. * @param errorCode Standard ICU error code. Its input value must
  344. * pass the U_SUCCESS() test, or else the function returns
  345. * immediately. Check for U_FAILURE() on output or use with
  346. * function chaining. (See User Guide for details.)
  347. * @return true if s is normalized
  348. * @stable ICU 4.4
  349. */
  350. virtual UBool
  351. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  352. /**
  353. * Tests if the UTF-8 string is normalized.
  354. * Internally, in cases where the quickCheck() method would return "maybe"
  355. * (which is only possible for the two COMPOSE modes) this method
  356. * resolves to "yes" or "no" to provide a definitive result,
  357. * at the cost of doing more work in those cases.
  358. *
  359. * This works for all normalization modes.
  360. * It is optimized for UTF-8 for all built-in modes except for FCD.
  361. * The base class implementation converts to UTF-16 and calls isNormalized().
  362. *
  363. * @param s UTF-8 input string
  364. * @param errorCode Standard ICU error code. Its input value must
  365. * pass the U_SUCCESS() test, or else the function returns
  366. * immediately. Check for U_FAILURE() on output or use with
  367. * function chaining. (See User Guide for details.)
  368. * @return true if s is normalized
  369. * @stable ICU 60
  370. */
  371. virtual UBool
  372. isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
  373. /**
  374. * Tests if the string is normalized.
  375. * For the two COMPOSE modes, the result could be "maybe" in cases that
  376. * would take a little more work to resolve definitively.
  377. * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
  378. * combination of quick check + normalization, to avoid
  379. * re-checking the "yes" prefix.
  380. * @param s input string
  381. * @param errorCode Standard ICU error code. Its input value must
  382. * pass the U_SUCCESS() test, or else the function returns
  383. * immediately. Check for U_FAILURE() on output or use with
  384. * function chaining. (See User Guide for details.)
  385. * @return UNormalizationCheckResult
  386. * @stable ICU 4.4
  387. */
  388. virtual UNormalizationCheckResult
  389. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  390. /**
  391. * Returns the end of the normalized substring of the input string.
  392. * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
  393. * the substring <code>UnicodeString(s, 0, end)</code>
  394. * will pass the quick check with a "yes" result.
  395. *
  396. * The returned end index is usually one or more characters before the
  397. * "no" or "maybe" character: The end index is at a normalization boundary.
  398. * (See the class documentation for more about normalization boundaries.)
  399. *
  400. * When the goal is a normalized string and most input strings are expected
  401. * to be normalized already, then call this method,
  402. * and if it returns a prefix shorter than the input string,
  403. * copy that prefix and use normalizeSecondAndAppend() for the remainder.
  404. * @param s input string
  405. * @param errorCode Standard ICU error code. Its input value must
  406. * pass the U_SUCCESS() test, or else the function returns
  407. * immediately. Check for U_FAILURE() on output or use with
  408. * function chaining. (See User Guide for details.)
  409. * @return "yes" span end index
  410. * @stable ICU 4.4
  411. */
  412. virtual int32_t
  413. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  414. /**
  415. * Tests if the character always has a normalization boundary before it,
  416. * regardless of context.
  417. * If true, then the character does not normalization-interact with
  418. * preceding characters.
  419. * In other words, a string containing this character can be normalized
  420. * by processing portions before this character and starting from this
  421. * character independently.
  422. * This is used for iterative normalization. See the class documentation for details.
  423. * @param c character to test
  424. * @return true if c has a normalization boundary before it
  425. * @stable ICU 4.4
  426. */
  427. virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
  428. /**
  429. * Tests if the character always has a normalization boundary after it,
  430. * regardless of context.
  431. * If true, then the character does not normalization-interact with
  432. * following characters.
  433. * In other words, a string containing this character can be normalized
  434. * by processing portions up to this character and after this
  435. * character independently.
  436. * This is used for iterative normalization. See the class documentation for details.
  437. * Note that this operation may be significantly slower than hasBoundaryBefore().
  438. * @param c character to test
  439. * @return true if c has a normalization boundary after it
  440. * @stable ICU 4.4
  441. */
  442. virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
  443. /**
  444. * Tests if the character is normalization-inert.
  445. * If true, then the character does not change, nor normalization-interact with
  446. * preceding or following characters.
  447. * In other words, a string containing this character can be normalized
  448. * by processing portions before this character and after this
  449. * character independently.
  450. * This is used for iterative normalization. See the class documentation for details.
  451. * Note that this operation may be significantly slower than hasBoundaryBefore().
  452. * @param c character to test
  453. * @return true if c is normalization-inert
  454. * @stable ICU 4.4
  455. */
  456. virtual UBool isInert(UChar32 c) const = 0;
  457. };
  458. /**
  459. * Normalization filtered by a UnicodeSet.
  460. * Normalizes portions of the text contained in the filter set and leaves
  461. * portions not contained in the filter set unchanged.
  462. * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
  463. * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
  464. * This class implements all of (and only) the Normalizer2 API.
  465. * An instance of this class is unmodifiable/immutable but is constructed and
  466. * must be destructed by the owner.
  467. * @stable ICU 4.4
  468. */
  469. class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
  470. public:
  471. /**
  472. * Constructs a filtered normalizer wrapping any Normalizer2 instance
  473. * and a filter set.
  474. * Both are aliased and must not be modified or deleted while this object
  475. * is used.
  476. * The filter set should be frozen; otherwise the performance will suffer greatly.
  477. * @param n2 wrapped Normalizer2 instance
  478. * @param filterSet UnicodeSet which determines the characters to be normalized
  479. * @stable ICU 4.4
  480. */
  481. FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
  482. norm2(n2), set(filterSet) {}
  483. /**
  484. * Destructor.
  485. * @stable ICU 4.4
  486. */
  487. ~FilteredNormalizer2();
  488. /**
  489. * Writes the normalized form of the source string to the destination string
  490. * (replacing its contents) and returns the destination string.
  491. * The source and destination strings must be different objects.
  492. * @param src source string
  493. * @param dest destination string; its contents is replaced with normalized src
  494. * @param errorCode Standard ICU error code. Its input value must
  495. * pass the U_SUCCESS() test, or else the function returns
  496. * immediately. Check for U_FAILURE() on output or use with
  497. * function chaining. (See User Guide for details.)
  498. * @return dest
  499. * @stable ICU 4.4
  500. */
  501. virtual UnicodeString &
  502. normalize(const UnicodeString &src,
  503. UnicodeString &dest,
  504. UErrorCode &errorCode) const override;
  505. /**
  506. * Normalizes a UTF-8 string and optionally records how source substrings
  507. * relate to changed and unchanged result substrings.
  508. *
  509. * Implemented completely for most built-in modes except for FCD.
  510. * The base class implementation converts to & from UTF-16 and does not support edits.
  511. *
  512. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  513. * @param src Source UTF-8 string.
  514. * @param sink A ByteSink to which the normalized UTF-8 result string is written.
  515. * sink.Flush() is called at the end.
  516. * @param edits Records edits for index mapping, working with styled text,
  517. * and getting only changes (if any).
  518. * The Edits contents is undefined if any error occurs.
  519. * This function calls edits->reset() first unless
  520. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  521. * @param errorCode Standard ICU error code. Its input value must
  522. * pass the U_SUCCESS() test, or else the function returns
  523. * immediately. Check for U_FAILURE() on output or use with
  524. * function chaining. (See User Guide for details.)
  525. * @stable ICU 60
  526. */
  527. virtual void
  528. normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  529. Edits *edits, UErrorCode &errorCode) const override;
  530. /**
  531. * Appends the normalized form of the second string to the first string
  532. * (merging them at the boundary) and returns the first string.
  533. * The result is normalized if the first string was normalized.
  534. * The first and second strings must be different objects.
  535. * @param first string, should be normalized
  536. * @param second string, will be normalized
  537. * @param errorCode Standard ICU error code. Its input value must
  538. * pass the U_SUCCESS() test, or else the function returns
  539. * immediately. Check for U_FAILURE() on output or use with
  540. * function chaining. (See User Guide for details.)
  541. * @return first
  542. * @stable ICU 4.4
  543. */
  544. virtual UnicodeString &
  545. normalizeSecondAndAppend(UnicodeString &first,
  546. const UnicodeString &second,
  547. UErrorCode &errorCode) const override;
  548. /**
  549. * Appends the second string to the first string
  550. * (merging them at the boundary) and returns the first string.
  551. * The result is normalized if both the strings were normalized.
  552. * The first and second strings must be different objects.
  553. * @param first string, should be normalized
  554. * @param second string, should be normalized
  555. * @param errorCode Standard ICU error code. Its input value must
  556. * pass the U_SUCCESS() test, or else the function returns
  557. * immediately. Check for U_FAILURE() on output or use with
  558. * function chaining. (See User Guide for details.)
  559. * @return first
  560. * @stable ICU 4.4
  561. */
  562. virtual UnicodeString &
  563. append(UnicodeString &first,
  564. const UnicodeString &second,
  565. UErrorCode &errorCode) const override;
  566. /**
  567. * Gets the decomposition mapping of c.
  568. * For details see the base class documentation.
  569. *
  570. * This function is independent of the mode of the Normalizer2.
  571. * @param c code point
  572. * @param decomposition String object which will be set to c's
  573. * decomposition mapping, if there is one.
  574. * @return true if c has a decomposition, otherwise false
  575. * @stable ICU 4.6
  576. */
  577. virtual UBool
  578. getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
  579. /**
  580. * Gets the raw decomposition mapping of c.
  581. * For details see the base class documentation.
  582. *
  583. * This function is independent of the mode of the Normalizer2.
  584. * @param c code point
  585. * @param decomposition String object which will be set to c's
  586. * raw decomposition mapping, if there is one.
  587. * @return true if c has a decomposition, otherwise false
  588. * @stable ICU 49
  589. */
  590. virtual UBool
  591. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
  592. /**
  593. * Performs pairwise composition of a & b and returns the composite if there is one.
  594. * For details see the base class documentation.
  595. *
  596. * This function is independent of the mode of the Normalizer2.
  597. * @param a A (normalization starter) code point.
  598. * @param b Another code point.
  599. * @return The non-negative composite code point if there is one; otherwise a negative value.
  600. * @stable ICU 49
  601. */
  602. virtual UChar32
  603. composePair(UChar32 a, UChar32 b) const override;
  604. /**
  605. * Gets the combining class of c.
  606. * The default implementation returns 0
  607. * but all standard implementations return the Unicode Canonical_Combining_Class value.
  608. * @param c code point
  609. * @return c's combining class
  610. * @stable ICU 49
  611. */
  612. virtual uint8_t
  613. getCombiningClass(UChar32 c) const override;
  614. /**
  615. * Tests if the string is normalized.
  616. * For details see the Normalizer2 base class documentation.
  617. * @param s input string to test
  618. * @param errorCode Standard ICU error code. Its input value must
  619. * pass the U_SUCCESS() test, or else the function returns
  620. * immediately. Check for U_FAILURE() on output or use with
  621. * function chaining. (See User Guide for details.)
  622. * @return true if s is normalized
  623. * @stable ICU 4.4
  624. */
  625. virtual UBool
  626. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
  627. /**
  628. * Tests if the UTF-8 string is normalized.
  629. * Internally, in cases where the quickCheck() method would return "maybe"
  630. * (which is only possible for the two COMPOSE modes) this method
  631. * resolves to "yes" or "no" to provide a definitive result,
  632. * at the cost of doing more work in those cases.
  633. *
  634. * This works for all normalization modes.
  635. * It is optimized for UTF-8 for all built-in modes except for FCD.
  636. * The base class implementation converts to UTF-16 and calls isNormalized().
  637. *
  638. * @param s UTF-8 input string to test
  639. * @param errorCode Standard ICU error code. Its input value must
  640. * pass the U_SUCCESS() test, or else the function returns
  641. * immediately. Check for U_FAILURE() on output or use with
  642. * function chaining. (See User Guide for details.)
  643. * @return true if s is normalized (always a definitive answer, never "maybe")
  644. * @stable ICU 60
  645. */
  646. virtual UBool
  647. isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
  648. /**
  649. * Performs a quick check of whether the string is normalized.
  650. * For details see the Normalizer2 base class documentation.
  651. * @param s input string to check
  652. * @param errorCode Standard ICU error code. Its input value must
  653. * pass the U_SUCCESS() test, or else the function returns
  654. * immediately. Check for U_FAILURE() on output or use with
  655. * function chaining. (See User Guide for details.)
  656. * @return UNormalizationCheckResult; the result can be inconclusive ("maybe")
  657. * @stable ICU 4.4
  658. */
  659. virtual UNormalizationCheckResult
  660. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
  661. /**
  662. * Returns the end of the normalized substring of the input string.
  663. * For details see the Normalizer2 base class documentation.
  664. * @param s input string to scan
  665. * @param errorCode Standard ICU error code. Its input value must
  666. * pass the U_SUCCESS() test, or else the function returns
  667. * immediately. Check for U_FAILURE() on output or use with
  668. * function chaining. (See User Guide for details.)
  669. * @return end index of the "yes" span (the normalized substring) in s
  670. * @stable ICU 4.4
  671. */
  672. virtual int32_t
  673. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
  674. /**
  675. * Tests if the character always has a normalization boundary before it,
  676. * regardless of context.
  677. * For details see the Normalizer2 base class documentation.
  678. * @param c code point to test
  679. * @return true if c has a normalization boundary before it
  680. * @stable ICU 4.4
  681. */
  682. virtual UBool hasBoundaryBefore(UChar32 c) const override;
  683. /**
  684. * Tests if the character always has a normalization boundary after it,
  685. * regardless of context.
  686. * For details see the Normalizer2 base class documentation.
  687. * @param c code point to test
  688. * @return true if c has a normalization boundary after it
  689. * @stable ICU 4.4
  690. */
  691. virtual UBool hasBoundaryAfter(UChar32 c) const override;
  692. /**
  693. * Tests if the character is normalization-inert.
  694. * For details see the Normalizer2 base class documentation.
  695. * @param c code point to test
  696. * @return true if c is normalization-inert
  697. * @stable ICU 4.4
  698. */
  699. virtual UBool isInert(UChar32 c) const override;
  700. private:
  // Private normalize() helper: takes a USetSpanCondition in addition to the
  // source string, destination string and error code, and returns a UnicodeString reference.
  // NOTE(review): the definition is not in this header. Presumably spanCondition
  // says whether processing starts on a span inside or outside the filter set,
  // and the returned reference is dest -- confirm against the implementation file.
  701. UnicodeString &
  702. normalize(const UnicodeString &src,
  703. UnicodeString &dest,
  704. USetSpanCondition spanCondition,
  705. UErrorCode &errorCode) const;
  // Private normalizeUTF8() helper: takes the source as a (pointer, length) pair
  // plus a USetSpanCondition, and writes through the ByteSink; nothing is returned.
  // NOTE(review): the definition is not in this header. Presumably edits may be
  // null when no edit recording is wanted and spanCondition plays the same role
  // as in the private normalize() -- confirm against the implementation file.
  706. void
  707. normalizeUTF8(uint32_t options, const char *src, int32_t length,
  708. ByteSink &sink, Edits *edits,
  709. USetSpanCondition spanCondition,
  710. UErrorCode &errorCode) const;
  // Private normalizeSecondAndAppend() helper with an extra doNormalize flag.
  // NOTE(review): the definition is not in this header. Presumably doNormalize
  // selects between normalizing `second` before appending it to `first` and a
  // plain append, and the returned reference is first -- confirm against the
  // implementation file.
  711. UnicodeString &
  712. normalizeSecondAndAppend(UnicodeString &first,
  713. const UnicodeString &second,
  714. UBool doNormalize,
  715. UErrorCode &errorCode) const;
  // Held as a const reference, not a copy: the referenced Normalizer2 must
  // outlive this object. NOTE(review): presumably the wrapped normalizer that
  // the methods of this class delegate to -- confirm.
  716. const Normalizer2 &norm2;
  // Held as a const reference, not a copy: the referenced UnicodeSet must
  // outlive this object. NOTE(review): presumably the filter set that limits
  // which code points norm2 is applied to -- confirm.
  717. const UnicodeSet &set;
  718. };
  719. U_NAMESPACE_END
  720. #endif // !UCONFIG_NO_NORMALIZATION
  721. #endif /* U_SHOW_CPLUSPLUS_API */
  722. #endif // __NORMALIZER2_H__