nsUTF8Utils.h 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742
  1. /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. #ifndef nsUTF8Utils_h_
  6. #define nsUTF8Utils_h_
  7. // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
  8. // file will provide signatures for the Mozilla abstract string types. It will
  9. // use XPCOM assertion/debugging macros, etc.
  10. #include "nscore.h"
  11. #include "mozilla/Assertions.h"
  12. #include "mozilla/SSE.h"
  13. #include "mozilla/TypeTraits.h"
  14. #include "nsCharTraits.h"
  /**
   * Classifies a single byte of UTF-8 data by its high-order bits.
   *
   * Each predicate masks off the leading bits of the byte and compares them
   * with the marker pattern for one kind of byte. The 5- and 6-byte lead
   * forms are recognised even though they can only encode values beyond
   * U+10FFFF.
   */
  class UTF8traits
  {
  public:
    // 0xxxxxxx: a complete one-byte (ASCII) character.
    static bool isASCII(char aChar)
    {
      return HasLeadBits(aChar, 0x80, 0x00);
    }

    // 10xxxxxx: a continuation byte inside a multi-byte sequence.
    static bool isInSeq(char aChar)
    {
      return HasLeadBits(aChar, 0xC0, 0x80);
    }

    // 110xxxxx: lead byte of a two-byte sequence.
    static bool is2byte(char aChar)
    {
      return HasLeadBits(aChar, 0xE0, 0xC0);
    }

    // 1110xxxx: lead byte of a three-byte sequence.
    static bool is3byte(char aChar)
    {
      return HasLeadBits(aChar, 0xF0, 0xE0);
    }

    // 11110xxx: lead byte of a four-byte sequence.
    static bool is4byte(char aChar)
    {
      return HasLeadBits(aChar, 0xF8, 0xF0);
    }

    // 111110xx: lead byte of a five-byte sequence.
    static bool is5byte(char aChar)
    {
      return HasLeadBits(aChar, 0xFC, 0xF8);
    }

    // 1111110x: lead byte of a six-byte sequence.
    static bool is6byte(char aChar)
    {
      return HasLeadBits(aChar, 0xFE, 0xFC);
    }

  private:
    // True when the bits of aChar selected by aMask equal aPattern. The mask
    // never reaches above bit 7, so the result is the same whether plain
    // char is signed or unsigned.
    static bool HasLeadBits(char aChar, int aMask, int aPattern)
    {
      return (aChar & aMask) == aPattern;
    }
  };
  47. /**
  48. * Extract the next UCS-4 character from the buffer and return it. The
  49. * pointer passed in is advanced to the start of the next character in the
  50. * buffer. If non-null, the parameters err and overlong are filled in to
  51. * indicate that the character was represented by an overlong sequence, or
  52. * that an error occurred.
  53. */
  54. class UTF8CharEnumerator
  55. {
  56. public:
  57. static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
  58. {
  59. NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
  60. const char* p = *aBuffer;
  61. *aErr = false;
  62. if (p >= aEnd) {
  63. *aErr = true;
  64. return 0;
  65. }
  66. char c = *p++;
  67. if (UTF8traits::isASCII(c)) {
  68. *aBuffer = p;
  69. return c;
  70. }
  71. uint32_t ucs4;
  72. uint32_t minUcs4;
  73. int32_t state = 0;
  74. if (!CalcState(c, ucs4, minUcs4, state)) {
  75. NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
  76. *aErr = true;
  77. return 0;
  78. }
  79. while (state--) {
  80. if (p == aEnd) {
  81. *aErr = true;
  82. return 0;
  83. }
  84. c = *p++;
  85. if (!AddByte(c, state, ucs4)) {
  86. *aErr = true;
  87. return 0;
  88. }
  89. }
  90. if (ucs4 < minUcs4) {
  91. // Overlong sequence
  92. ucs4 = UCS2_REPLACEMENT_CHAR;
  93. } else if (ucs4 >= 0xD800 &&
  94. (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
  95. // Surrogates and code points outside the Unicode range.
  96. ucs4 = UCS2_REPLACEMENT_CHAR;
  97. }
  98. *aBuffer = p;
  99. return ucs4;
  100. }
  101. private:
  102. static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
  103. int32_t& aState)
  104. {
  105. if (UTF8traits::is2byte(aChar)) {
  106. aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
  107. aState = 1;
  108. aMinUcs4 = 0x00000080;
  109. } else if (UTF8traits::is3byte(aChar)) {
  110. aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
  111. aState = 2;
  112. aMinUcs4 = 0x00000800;
  113. } else if (UTF8traits::is4byte(aChar)) {
  114. aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
  115. aState = 3;
  116. aMinUcs4 = 0x00010000;
  117. } else if (UTF8traits::is5byte(aChar)) {
  118. aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
  119. aState = 4;
  120. aMinUcs4 = 0x00200000;
  121. } else if (UTF8traits::is6byte(aChar)) {
  122. aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
  123. aState = 5;
  124. aMinUcs4 = 0x04000000;
  125. } else {
  126. return false;
  127. }
  128. return true;
  129. }
  130. static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
  131. {
  132. if (UTF8traits::isInSeq(aChar)) {
  133. int32_t shift = aState * 6;
  134. aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
  135. return true;
  136. }
  137. return false;
  138. }
  139. };
  140. /**
  141. * Extract the next UCS-4 character from the buffer and return it. The
  142. * pointer passed in is advanced to the start of the next character in the
  143. * buffer. If non-null, the err parameter is filled in if an error occurs.
  144. *
  145. * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
  146. * the buffer will be updated to move only a single UCS-2 character.
  147. *
  148. * Any other error returns 0 and does not move the buffer position.
  149. */
  150. class UTF16CharEnumerator
  151. {
  152. public:
  153. static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
  154. bool* aErr = nullptr)
  155. {
  156. NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
  157. const char16_t* p = *aBuffer;
  158. if (p >= aEnd) {
  159. NS_ERROR("No input to work with");
  160. if (aErr) {
  161. *aErr = true;
  162. }
  163. return 0;
  164. }
  165. char16_t c = *p++;
  166. if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
  167. if (aErr) {
  168. *aErr = false;
  169. }
  170. *aBuffer = p;
  171. return c;
  172. } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
  173. if (p == aEnd) {
  174. // Found a high surrogate at the end of the buffer. Flag this
  175. // as an error and return the Unicode replacement
  176. // character 0xFFFD.
  177. NS_WARNING("Unexpected end of buffer after high surrogate");
  178. if (aErr) {
  179. *aErr = true;
  180. }
  181. *aBuffer = p;
  182. return 0xFFFD;
  183. }
  184. // D800- DBFF - High Surrogate
  185. char16_t h = c;
  186. c = *p++;
  187. if (NS_IS_LOW_SURROGATE(c)) {
  188. // DC00- DFFF - Low Surrogate
  189. // N = (H - D800) *400 + 10000 + (L - DC00)
  190. uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
  191. if (aErr) {
  192. *aErr = false;
  193. }
  194. *aBuffer = p;
  195. return ucs4;
  196. } else {
  197. // Found a high surrogate followed by something other than
  198. // a low surrogate. Flag this as an error and return the
  199. // Unicode replacement character 0xFFFD. Note that the
  200. // pointer to the next character points to the second 16-bit
  201. // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
  202. // only the first code unit of an illegal sequence must be
  203. // treated as an illegally terminated code unit sequence
  204. // (also Chapter 3 D91, "isolated [not paired and ill-formed]
  205. // UTF-16 code units in the range D800..DFFF are ill-formed").
  206. NS_WARNING("got a High Surrogate but no low surrogate");
  207. if (aErr) {
  208. *aErr = true;
  209. }
  210. *aBuffer = p - 1;
  211. return 0xFFFD;
  212. }
  213. } else { // U+DC00 - U+DFFF
  214. // DC00- DFFF - Low Surrogate
  215. // Found a low surrogate w/o a preceding high surrogate. Flag
  216. // this as an error and return the Unicode replacement
  217. // character 0xFFFD.
  218. NS_WARNING("got a low Surrogate but no high surrogate");
  219. if (aErr) {
  220. *aErr = true;
  221. }
  222. *aBuffer = p;
  223. return 0xFFFD;
  224. }
  225. MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
  226. }
  227. };
  228. /**
  229. * A character sink (see |copy_string| in nsAlgorithm.h) for converting
  230. * UTF-8 to UTF-16
  231. */
  232. class ConvertUTF8toUTF16
  233. {
  234. public:
  235. typedef char value_type;
  236. typedef char16_t buffer_type;
  237. explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
  238. : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
  239. {
  240. }
  241. size_t Length() const
  242. {
  243. return mBuffer - mStart;
  244. }
  245. bool ErrorEncountered() const
  246. {
  247. return mErrorEncountered;
  248. }
  249. void write(const value_type* aStart, uint32_t aN)
  250. {
  251. if (mErrorEncountered) {
  252. return;
  253. }
  254. // algorithm assumes utf8 units won't
  255. // be spread across fragments
  256. const value_type* p = aStart;
  257. const value_type* end = aStart + aN;
  258. buffer_type* out = mBuffer;
  259. for (; p != end /* && *p */;) {
  260. bool err;
  261. uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
  262. if (err) {
  263. mErrorEncountered = true;
  264. mBuffer = out;
  265. return;
  266. }
  267. if (ucs4 >= PLANE1_BASE) {
  268. *out++ = (buffer_type)H_SURROGATE(ucs4);
  269. *out++ = (buffer_type)L_SURROGATE(ucs4);
  270. } else {
  271. *out++ = ucs4;
  272. }
  273. }
  274. mBuffer = out;
  275. }
  276. void write_terminator()
  277. {
  278. *mBuffer = buffer_type(0);
  279. }
  280. private:
  281. buffer_type* const mStart;
  282. buffer_type* mBuffer;
  283. bool mErrorEncountered;
  284. };
  285. /**
  286. * A character sink (see |copy_string| in nsAlgorithm.h) for computing
  287. * the length of the UTF-16 string equivalent to a UTF-8 string.
  288. */
  289. class CalculateUTF8Length
  290. {
  291. public:
  292. typedef char value_type;
  293. CalculateUTF8Length()
  294. : mLength(0), mErrorEncountered(false)
  295. {
  296. }
  297. size_t Length() const
  298. {
  299. return mLength;
  300. }
  301. void write(const value_type* aStart, uint32_t aN)
  302. {
  303. // ignore any further requests
  304. if (mErrorEncountered) {
  305. return;
  306. }
  307. // algorithm assumes utf8 units won't
  308. // be spread across fragments
  309. const value_type* p = aStart;
  310. const value_type* end = aStart + aN;
  311. for (; p < end /* && *p */; ++mLength) {
  312. if (UTF8traits::isASCII(*p)) {
  313. p += 1;
  314. } else if (UTF8traits::is2byte(*p)) {
  315. p += 2;
  316. } else if (UTF8traits::is3byte(*p)) {
  317. p += 3;
  318. } else if (UTF8traits::is4byte(*p)) {
  319. // Because a UTF-8 sequence of 4 bytes represents a codepoint
  320. // greater than 0xFFFF, it will become a surrogate pair in the
  321. // UTF-16 string, so add 1 more to mLength.
  322. // This doesn't happen with is5byte and is6byte because they
  323. // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
  324. // converted to a single replacement character.
  325. // However, there is one case when a 4 byte UTF-8 sequence will
  326. // only generate 2 UTF-16 bytes. If we have a properly encoded
  327. // sequence, but with an invalid value (too small or too big),
  328. // that will result in a replacement character being written
  329. // This replacement character is encoded as just 1 single
  330. // UTF-16 character, which is 2 bytes.
  331. // The below code therefore only adds 1 to mLength if the UTF8
  332. // data will produce a decoded character which is greater than
  333. // or equal to 0x010000 and less than 0x0110000.
  334. // A 4byte UTF8 character is encoded as
  335. // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  336. // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
  337. // map to bit 17-21 in the final result. If these bits are
  338. // between 0x01 and 0x11, that means that the final result is
  339. // between 0x010000 and 0x110000. The below code reads these
  340. // bits out and assigns them to c, but shifted up 4 bits to
  341. // avoid having to shift twice.
  342. // It doesn't matter what to do in the case where p + 4 > end
  343. // since no UTF16 characters will be written in that case by
  344. // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
  345. // any of the surrogate bits are wrong since no UTF16
  346. // characters will be written in that case either.
  347. if (p + 4 <= end) {
  348. uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
  349. ((uint32_t)(p[1] & 0x30));
  350. if (c >= 0x010 && c < 0x110) {
  351. ++mLength;
  352. }
  353. }
  354. p += 4;
  355. } else if (UTF8traits::is5byte(*p)) {
  356. p += 5;
  357. } else if (UTF8traits::is6byte(*p)) {
  358. p += 6;
  359. } else { // error
  360. ++mLength; // to account for the decrement below
  361. break;
  362. }
  363. }
  364. if (p != end) {
  365. NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
  366. --mLength; // The last multi-byte char wasn't complete, discard it.
  367. mErrorEncountered = true;
  368. }
  369. }
  370. private:
  371. size_t mLength;
  372. bool mErrorEncountered;
  373. };
  374. /**
  375. * A character sink (see |copy_string| in nsAlgorithm.h) for
  376. * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
  377. * (0xEFBFBD in UTF-8).
  378. */
  379. class ConvertUTF16toUTF8
  380. {
  381. public:
  382. typedef char16_t value_type;
  383. typedef char buffer_type;
  384. // The error handling here is more lenient than that in
  385. // |ConvertUTF8toUTF16|, but it's that way for backwards
  386. // compatibility.
  387. explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
  388. : mStart(aBuffer), mBuffer(aBuffer)
  389. {
  390. }
  391. size_t Size() const
  392. {
  393. return mBuffer - mStart;
  394. }
  395. void write(const value_type* aStart, uint32_t aN)
  396. {
  397. buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
  398. for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
  399. value_type c = *p;
  400. if (!(c & 0xFF80)) { // U+0000 - U+007F
  401. *out++ = (char)c;
  402. } else if (!(c & 0xF800)) { // U+0100 - U+07FF
  403. *out++ = 0xC0 | (char)(c >> 6);
  404. *out++ = 0x80 | (char)(0x003F & c);
  405. } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
  406. *out++ = 0xE0 | (char)(c >> 12);
  407. *out++ = 0x80 | (char)(0x003F & (c >> 6));
  408. *out++ = 0x80 | (char)(0x003F & c);
  409. } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
  410. // D800- DBFF - High Surrogate
  411. value_type h = c;
  412. ++p;
  413. if (p == end) {
  414. // Treat broken characters as the Unicode
  415. // replacement character 0xFFFD (0xEFBFBD in
  416. // UTF-8)
  417. *out++ = '\xEF';
  418. *out++ = '\xBF';
  419. *out++ = '\xBD';
  420. NS_WARNING("String ending in half a surrogate pair!");
  421. break;
  422. }
  423. c = *p;
  424. if (NS_IS_LOW_SURROGATE(c)) {
  425. // DC00- DFFF - Low Surrogate
  426. // N = (H - D800) *400 + 10000 + ( L - DC00 )
  427. uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
  428. // 0001 0000-001F FFFF
  429. *out++ = 0xF0 | (char)(ucs4 >> 18);
  430. *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
  431. *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
  432. *out++ = 0x80 | (char)(0x003F & ucs4);
  433. } else {
  434. // Treat broken characters as the Unicode
  435. // replacement character 0xFFFD (0xEFBFBD in
  436. // UTF-8)
  437. *out++ = '\xEF';
  438. *out++ = '\xBF';
  439. *out++ = '\xBD';
  440. // The pointer to the next character points to the second
  441. // 16-bit value, not beyond it, as per Unicode 5.0.0
  442. // Chapter 3 C10, only the first code unit of an illegal
  443. // sequence must be treated as an illegally terminated
  444. // code unit sequence (also Chapter 3 D91, "isolated [not
  445. // paired and ill-formed] UTF-16 code units in the range
  446. // D800..DFFF are ill-formed").
  447. p--;
  448. NS_WARNING("got a High Surrogate but no low surrogate");
  449. }
  450. } else { // U+DC00 - U+DFFF
  451. // Treat broken characters as the Unicode replacement
  452. // character 0xFFFD (0xEFBFBD in UTF-8)
  453. *out++ = '\xEF';
  454. *out++ = '\xBF';
  455. *out++ = '\xBD';
  456. // DC00- DFFF - Low Surrogate
  457. NS_WARNING("got a low Surrogate but no high surrogate");
  458. }
  459. }
  460. mBuffer = out;
  461. }
  462. void write_terminator()
  463. {
  464. *mBuffer = buffer_type(0);
  465. }
  466. private:
  467. buffer_type* const mStart;
  468. buffer_type* mBuffer;
  469. };
  470. /**
  471. * A character sink (see |copy_string| in nsAlgorithm.h) for computing
  472. * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
  473. * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
  474. */
  475. class CalculateUTF8Size
  476. {
  477. public:
  478. typedef char16_t value_type;
  479. CalculateUTF8Size()
  480. : mSize(0)
  481. {
  482. }
  483. size_t Size() const
  484. {
  485. return mSize;
  486. }
  487. void write(const value_type* aStart, uint32_t aN)
  488. {
  489. // Assume UCS2 surrogate pairs won't be spread across fragments.
  490. for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
  491. value_type c = *p;
  492. if (!(c & 0xFF80)) { // U+0000 - U+007F
  493. mSize += 1;
  494. } else if (!(c & 0xF800)) { // U+0100 - U+07FF
  495. mSize += 2;
  496. } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
  497. mSize += 3;
  498. } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
  499. ++p;
  500. if (p == end) {
  501. // Treat broken characters as the Unicode
  502. // replacement character 0xFFFD (0xEFBFBD in
  503. // UTF-8)
  504. mSize += 3;
  505. NS_WARNING("String ending in half a surrogate pair!");
  506. break;
  507. }
  508. c = *p;
  509. if (0xDC00 == (0xFC00 & c)) {
  510. mSize += 4;
  511. } else {
  512. // Treat broken characters as the Unicode
  513. // replacement character 0xFFFD (0xEFBFBD in
  514. // UTF-8)
  515. mSize += 3;
  516. // The next code unit is the second 16-bit value, not
  517. // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
  518. // only the first code unit of an illegal sequence must
  519. // be treated as an illegally terminated code unit
  520. // sequence (also Chapter 3 D91, "isolated [not paired and
  521. // ill-formed] UTF-16 code units in the range D800..DFFF
  522. // are ill-formed").
  523. p--;
  524. NS_WARNING("got a high Surrogate but no low surrogate");
  525. }
  526. } else { // U+DC00 - U+DFFF
  527. // Treat broken characters as the Unicode replacement
  528. // character 0xFFFD (0xEFBFBD in UTF-8)
  529. mSize += 3;
  530. NS_WARNING("got a low Surrogate but no high surrogate");
  531. }
  532. }
  533. }
  534. private:
  535. size_t mSize;
  536. };
  537. #ifdef MOZILLA_INTERNAL_API
  /**
   * A character sink that widens each input char to a char16_t with the
   * same unsigned value (a |reinterpret_cast|-style conversion; no decoding
   * is performed).
   */
  class LossyConvertEncoding8to16
  {
  public:
    typedef char value_type;
    typedef char input_type;
    typedef char16_t output_type;

    explicit LossyConvertEncoding8to16(char16_t* aDestination)
      : mDestination(aDestination)
    {
    }

    // Appends aSourceLength widened characters to the destination. When the
    // build may support SSE2 and the CPU reports it at run time, the work is
    // handed to write_sse2() instead of the scalar loop.
    void
    write(const char* aSource, uint32_t aSourceLength)
    {
  #ifdef MOZILLA_MAY_SUPPORT_SSE2
      if (mozilla::supports_sse2()) {
        write_sse2(aSource, aSourceLength);
        return;
      }
  #endif
      for (const char* const stop = aSource + aSourceLength; aSource < stop;
           ++aSource) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
        *mDestination++ = (char16_t)(unsigned char)(*aSource);
      }
    }

    // Declared here, defined outside this header.
    void
    write_sse2(const char* aSource, uint32_t aSourceLength);

    // Stores a terminating zero at the current output position without
    // advancing it.
    void
    write_terminator()
    {
      *mDestination = (char16_t)(0);
    }

  private:
    char16_t* mDestination;
  };
  /**
   * A character sink that narrows each input char16_t to a char by a plain
   * cast (a |reinterpret_cast|-style conversion; no encoding is performed,
   * so anything above the low byte is lost).
   */
  class LossyConvertEncoding16to8
  {
  public:
    typedef char16_t value_type;
    typedef char16_t input_type;
    typedef char output_type;

    explicit LossyConvertEncoding16to8(char* aDestination)
      : mDestination(aDestination)
    {
    }

    // Appends aSourceLength narrowed characters to the destination. When the
    // build may support SSE2 and the CPU reports it at run time, the work is
    // handed to write_sse2() instead of the scalar loop.
    void
    write(const char16_t* aSource, uint32_t aSourceLength)
    {
  #ifdef MOZILLA_MAY_SUPPORT_SSE2
      if (mozilla::supports_sse2()) {
        write_sse2(aSource, aSourceLength);
        return;
      }
  #endif
      for (const char16_t* const stop = aSource + aSourceLength;
           aSource < stop; ++aSource) {
        *mDestination++ = (char)(*aSource);
      }
    }

  #ifdef MOZILLA_MAY_SUPPORT_SSE2
    // Declared here, defined outside this header.
    void
    write_sse2(const char16_t* aSource, uint32_t aSourceLength);
  #endif

    // Stores a terminating NUL at the current output position without
    // advancing it.
    void
    write_terminator()
    {
      *mDestination = '\0';
    }

  private:
    char* mDestination;
  };
  617. #endif // MOZILLA_INTERNAL_API
  618. template<typename Char, typename UnsignedT>
  619. inline UnsignedT
  620. RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
  621. {
  622. static_assert(mozilla::IsSame<Char, char>::value ||
  623. mozilla::IsSame<Char, unsigned char>::value ||
  624. mozilla::IsSame<Char, signed char>::value,
  625. "UTF-8 data must be in 8-bit units");
  626. static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
  627. while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
  628. --index;
  629. return index;
  630. }
  631. #endif /* !defined(nsUTF8Utils_h_) */