// loclikely.cpp (13 KB)
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1997-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: loclikely.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2010feb25
  16. * created by: Markus W. Scherer
  17. *
  18. * Code for likely and minimized locale subtags, separated out from other .cpp files
  19. * that then do not depend on resource bundle code and likely-subtags data.
  20. */
  21. #include <utility>
  22. #include "unicode/bytestream.h"
  23. #include "unicode/utypes.h"
  24. #include "unicode/locid.h"
  25. #include "unicode/putil.h"
  26. #include "unicode/uchar.h"
  27. #include "unicode/uloc.h"
  28. #include "unicode/ures.h"
  29. #include "unicode/uscript.h"
  30. #include "bytesinkutil.h"
  31. #include "charstr.h"
  32. #include "cmemory.h"
  33. #include "cstring.h"
  34. #include "loclikelysubtags.h"
  35. #include "ulocimp.h"
  36. namespace {
  37. /**
  38. * Create a tag string from the supplied parameters. The lang, script and region
  39. * parameters may be nullptr pointers. If they are, their corresponding length parameters
  40. * must be less than or equal to 0.
  41. *
  42. * If an illegal argument is provided, the function returns the error
  43. * U_ILLEGAL_ARGUMENT_ERROR.
  44. *
  45. * @param lang The language tag to use.
  46. * @param langLength The length of the language tag.
  47. * @param script The script tag to use.
  48. * @param scriptLength The length of the script tag.
  49. * @param region The region tag to use.
  50. * @param regionLength The length of the region tag.
  51. * @param variant The region tag to use.
  52. * @param variantLength The length of the region tag.
  53. * @param trailing Any trailing data to append to the new tag.
  54. * @param trailingLength The length of the trailing data.
  55. * @param sink The output sink receiving the tag string.
  56. * @param err A pointer to a UErrorCode for error reporting.
  57. **/
  58. void U_CALLCONV
  59. createTagStringWithAlternates(
  60. const char* lang,
  61. int32_t langLength,
  62. const char* script,
  63. int32_t scriptLength,
  64. const char* region,
  65. int32_t regionLength,
  66. const char* variant,
  67. int32_t variantLength,
  68. const char* trailing,
  69. int32_t trailingLength,
  70. icu::ByteSink& sink,
  71. UErrorCode& err) {
  72. if (U_FAILURE(err)) {
  73. return;
  74. }
  75. if (langLength >= ULOC_LANG_CAPACITY ||
  76. scriptLength >= ULOC_SCRIPT_CAPACITY ||
  77. regionLength >= ULOC_COUNTRY_CAPACITY) {
  78. err = U_ILLEGAL_ARGUMENT_ERROR;
  79. return;
  80. }
  81. if (langLength > 0) {
  82. sink.Append(lang, langLength);
  83. }
  84. if (scriptLength > 0) {
  85. sink.Append("_", 1);
  86. sink.Append(script, scriptLength);
  87. }
  88. if (regionLength > 0) {
  89. sink.Append("_", 1);
  90. sink.Append(region, regionLength);
  91. }
  92. if (variantLength > 0) {
  93. if (regionLength == 0) {
  94. /* extra separator is required */
  95. sink.Append("_", 1);
  96. }
  97. sink.Append("_", 1);
  98. sink.Append(variant, variantLength);
  99. }
  100. if (trailingLength > 0) {
  101. /*
  102. * Copy the trailing data into the supplied buffer.
  103. */
  104. sink.Append(trailing, trailingLength);
  105. }
  106. }
  107. bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
  108. int32_t count = 0;
  109. for (int32_t i = 0; i < variantLength; i++) {
  110. if (_isIDSeparator(variant[i])) {
  111. count = 0;
  112. } else if (count == 8) {
  113. return false;
  114. } else {
  115. count++;
  116. }
  117. }
  118. return true;
  119. }
  120. void
  121. _uloc_addLikelySubtags(const char* localeID,
  122. icu::ByteSink& sink,
  123. UErrorCode& err) {
  124. if (U_FAILURE(err)) {
  125. return;
  126. }
  127. if (localeID == nullptr) {
  128. err = U_ILLEGAL_ARGUMENT_ERROR;
  129. return;
  130. }
  131. icu::CharString lang;
  132. icu::CharString script;
  133. icu::CharString region;
  134. icu::CharString variant;
  135. const char* trailing = nullptr;
  136. ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
  137. if (U_FAILURE(err)) {
  138. return;
  139. }
  140. if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
  141. err = U_ILLEGAL_ARGUMENT_ERROR;
  142. return;
  143. }
  144. if (lang.length() == 4) {
  145. if (script.isEmpty()) {
  146. script = std::move(lang);
  147. lang.clear();
  148. } else {
  149. err = U_ILLEGAL_ARGUMENT_ERROR;
  150. return;
  151. }
  152. } else if (lang.length() > 8) {
  153. err = U_ILLEGAL_ARGUMENT_ERROR;
  154. return;
  155. }
  156. int32_t trailingLength = (int32_t)uprv_strlen(trailing);
  157. const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
  158. if (U_FAILURE(err)) {
  159. return;
  160. }
  161. // We need to keep l on the stack because lsr may point into internal
  162. // memory of l.
  163. icu::Locale l = icu::Locale::createFromName(localeID);
  164. if (l.isBogus()) {
  165. err = U_ILLEGAL_ARGUMENT_ERROR;
  166. return;
  167. }
  168. icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
  169. if (U_FAILURE(err)) {
  170. return;
  171. }
  172. const char* language = lsr.language;
  173. if (uprv_strcmp(language, "und") == 0) {
  174. language = "";
  175. }
  176. createTagStringWithAlternates(
  177. language,
  178. (int32_t)uprv_strlen(language),
  179. lsr.script,
  180. (int32_t)uprv_strlen(lsr.script),
  181. lsr.region,
  182. (int32_t)uprv_strlen(lsr.region),
  183. variant.data(),
  184. variant.length(),
  185. trailing,
  186. trailingLength,
  187. sink,
  188. err);
  189. }
  190. void
  191. _uloc_minimizeSubtags(const char* localeID,
  192. icu::ByteSink& sink,
  193. bool favorScript,
  194. UErrorCode& err) {
  195. if (U_FAILURE(err)) {
  196. return;
  197. }
  198. if (localeID == nullptr) {
  199. err = U_ILLEGAL_ARGUMENT_ERROR;
  200. return;
  201. }
  202. icu::CharString lang;
  203. icu::CharString script;
  204. icu::CharString region;
  205. icu::CharString variant;
  206. const char* trailing = nullptr;
  207. ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
  208. if (U_FAILURE(err)) {
  209. return;
  210. }
  211. if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
  212. err = U_ILLEGAL_ARGUMENT_ERROR;
  213. return;
  214. }
  215. int32_t trailingLength = (int32_t)uprv_strlen(trailing);
  216. const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
  217. if (U_FAILURE(err)) {
  218. return;
  219. }
  220. icu::LSR lsr = likelySubtags->minimizeSubtags(
  221. lang.toStringPiece(),
  222. script.toStringPiece(),
  223. region.toStringPiece(),
  224. favorScript,
  225. err);
  226. if (U_FAILURE(err)) {
  227. return;
  228. }
  229. const char* language = lsr.language;
  230. if (uprv_strcmp(language, "und") == 0) {
  231. language = "";
  232. }
  233. createTagStringWithAlternates(
  234. language,
  235. (int32_t)uprv_strlen(language),
  236. lsr.script,
  237. (int32_t)uprv_strlen(lsr.script),
  238. lsr.region,
  239. (int32_t)uprv_strlen(lsr.region),
  240. variant.data(),
  241. variant.length(),
  242. trailing,
  243. trailingLength,
  244. sink,
  245. err);
  246. }
  247. } // namespace
  248. U_CAPI int32_t U_EXPORT2
  249. uloc_addLikelySubtags(const char* localeID,
  250. char* maximizedLocaleID,
  251. int32_t maximizedLocaleIDCapacity,
  252. UErrorCode* status) {
  253. return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
  254. maximizedLocaleID, maximizedLocaleIDCapacity,
  255. [&](icu::ByteSink& sink, UErrorCode& status) {
  256. ulocimp_addLikelySubtags(localeID, sink, status);
  257. },
  258. *status);
  259. }
  260. U_EXPORT icu::CharString
  261. ulocimp_addLikelySubtags(const char* localeID,
  262. UErrorCode& status) {
  263. return icu::ByteSinkUtil::viaByteSinkToCharString(
  264. [&](icu::ByteSink& sink, UErrorCode& status) {
  265. ulocimp_addLikelySubtags(localeID, sink, status);
  266. },
  267. status);
  268. }
  269. U_EXPORT void
  270. ulocimp_addLikelySubtags(const char* localeID,
  271. icu::ByteSink& sink,
  272. UErrorCode& status) {
  273. if (U_FAILURE(status)) { return; }
  274. icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
  275. _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
  276. }
  277. U_CAPI int32_t U_EXPORT2
  278. uloc_minimizeSubtags(const char* localeID,
  279. char* minimizedLocaleID,
  280. int32_t minimizedLocaleIDCapacity,
  281. UErrorCode* status) {
  282. return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
  283. minimizedLocaleID, minimizedLocaleIDCapacity,
  284. [&](icu::ByteSink& sink, UErrorCode& status) {
  285. ulocimp_minimizeSubtags(localeID, sink, false, status);
  286. },
  287. *status);
  288. }
  289. U_EXPORT icu::CharString
  290. ulocimp_minimizeSubtags(const char* localeID,
  291. bool favorScript,
  292. UErrorCode& status) {
  293. return icu::ByteSinkUtil::viaByteSinkToCharString(
  294. [&](icu::ByteSink& sink, UErrorCode& status) {
  295. ulocimp_minimizeSubtags(localeID, sink, favorScript, status);
  296. },
  297. status);
  298. }
  299. U_EXPORT void
  300. ulocimp_minimizeSubtags(const char* localeID,
  301. icu::ByteSink& sink,
  302. bool favorScript,
  303. UErrorCode& status) {
  304. if (U_FAILURE(status)) { return; }
  305. icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
  306. _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
  307. }
  308. // Pairs of (language subtag, + or -) for finding out fast if common languages
  309. // are LTR (minus) or RTL (plus).
  310. static const char LANG_DIR_STRING[] =
  311. "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
  312. // Implemented here because this calls ulocimp_addLikelySubtags().
  313. U_CAPI UBool U_EXPORT2
  314. uloc_isRightToLeft(const char *locale) {
  315. UErrorCode errorCode = U_ZERO_ERROR;
  316. icu::CharString lang;
  317. icu::CharString script;
  318. ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode);
  319. if (U_FAILURE(errorCode) || script.isEmpty()) {
  320. // Fastpath: We know the likely scripts and their writing direction
  321. // for some common languages.
  322. if (!lang.isEmpty()) {
  323. const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
  324. if (langPtr != nullptr) {
  325. switch (langPtr[lang.length()]) {
  326. case '-': return false;
  327. case '+': return true;
  328. default: break; // partial match of a longer code
  329. }
  330. }
  331. }
  332. // Otherwise, find the likely script.
  333. errorCode = U_ZERO_ERROR;
  334. icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
  335. if (U_FAILURE(errorCode)) {
  336. return false;
  337. }
  338. ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
  339. if (U_FAILURE(errorCode) || script.isEmpty()) {
  340. return false;
  341. }
  342. }
  343. UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
  344. return uscript_isRightToLeft(scriptCode);
  345. }
  346. U_NAMESPACE_BEGIN
  347. UBool
  348. Locale::isRightToLeft() const {
  349. return uloc_isRightToLeft(getBaseName());
  350. }
  351. U_NAMESPACE_END
  352. namespace {
  353. icu::CharString
  354. GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) {
  355. icu::CharString result;
  356. // First check for keyword value
  357. icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
  358. int32_t len = kw.length();
  359. if (U_SUCCESS(status) && len >= 3 && len <= 7) {
  360. // chop off the subdivision code (which will generally be "zzzz" anyway)
  361. const char* const data = kw.data();
  362. if (uprv_isASCIILetter(data[0])) {
  363. result.append(uprv_toupper(data[0]), status);
  364. result.append(uprv_toupper(data[1]), status);
  365. } else {
  366. // assume three-digit region code
  367. result.append(data, 3, status);
  368. }
  369. }
  370. return result;
  371. }
  372. } // namespace
  373. U_EXPORT icu::CharString
  374. ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
  375. UErrorCode& status) {
  376. if (U_FAILURE(status)) {
  377. return {};
  378. }
  379. icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
  380. if (U_SUCCESS(status) && rgBuf.isEmpty()) {
  381. // No valid rg keyword value, try for unicode_region_subtag
  382. rgBuf = ulocimp_getRegion(localeID, status);
  383. if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) {
  384. // Second check for sd keyword value
  385. rgBuf = GetRegionFromKey(localeID, "sd", status);
  386. if (U_SUCCESS(status) && rgBuf.isEmpty()) {
  387. // no unicode_region_subtag but inferRegion true, try likely subtags
  388. UErrorCode rgStatus = U_ZERO_ERROR;
  389. icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
  390. if (U_SUCCESS(rgStatus)) {
  391. rgBuf = ulocimp_getRegion(locBuf.data(), status);
  392. }
  393. }
  394. }
  395. }
  396. return rgBuf;
  397. }