12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742 |
- /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
- /* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
- #ifndef nsUTF8Utils_h_
- #define nsUTF8Utils_h_
- // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
- // file will provide signatures for the Mozilla abstract string types. It will
- // use XPCOM assertion/debugging macros, etc.
- #include "nscore.h"
- #include "mozilla/Assertions.h"
- #include "mozilla/SSE.h"
- #include "mozilla/TypeTraits.h"
- #include "nsCharTraits.h"
/**
 * Classifiers for a single UTF-8 code unit. Each predicate inspects only the
 * high-order bits of the byte it is given:
 *
 *   0xxxxxxx  ASCII                  10xxxxxx  continuation ("in sequence")
 *   110xxxxx  lead of 2-byte form    1110xxxx  lead of 3-byte form
 *   11110xxx  lead of 4-byte form    111110xx  lead of 5-byte form
 *   1111110x  lead of 6-byte form
 */
class UTF8traits
{
public:
  static bool isASCII(char aChar) { return TopBitsAre(aChar, 0x80, 0x00); }
  static bool isInSeq(char aChar) { return TopBitsAre(aChar, 0xC0, 0x80); }
  static bool is2byte(char aChar) { return TopBitsAre(aChar, 0xE0, 0xC0); }
  static bool is3byte(char aChar) { return TopBitsAre(aChar, 0xF0, 0xE0); }
  static bool is4byte(char aChar) { return TopBitsAre(aChar, 0xF8, 0xF0); }
  static bool is5byte(char aChar) { return TopBitsAre(aChar, 0xFC, 0xF8); }
  static bool is6byte(char aChar) { return TopBitsAre(aChar, 0xFE, 0xFC); }

private:
  // True when the bits of aChar selected by aMask equal aPattern. The masks
  // used above never reach beyond the low eight bits, so the sign extension
  // of a negative |char| does not influence the comparison.
  static bool TopBitsAre(char aChar, int aMask, int aPattern)
  {
    return (aChar & aMask) == aPattern;
  }
};
- /**
- * Extract the next UCS-4 character from the buffer and return it. The
- * pointer passed in is advanced to the start of the next character in the
- * buffer. If non-null, the parameters err and overlong are filled in to
- * indicate that the character was represented by an overlong sequence, or
- * that an error occurred.
- */
- class UTF8CharEnumerator
- {
- public:
- static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
- {
- NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
- const char* p = *aBuffer;
- *aErr = false;
- if (p >= aEnd) {
- *aErr = true;
- return 0;
- }
- char c = *p++;
- if (UTF8traits::isASCII(c)) {
- *aBuffer = p;
- return c;
- }
- uint32_t ucs4;
- uint32_t minUcs4;
- int32_t state = 0;
- if (!CalcState(c, ucs4, minUcs4, state)) {
- NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
- *aErr = true;
- return 0;
- }
- while (state--) {
- if (p == aEnd) {
- *aErr = true;
- return 0;
- }
- c = *p++;
- if (!AddByte(c, state, ucs4)) {
- *aErr = true;
- return 0;
- }
- }
- if (ucs4 < minUcs4) {
- // Overlong sequence
- ucs4 = UCS2_REPLACEMENT_CHAR;
- } else if (ucs4 >= 0xD800 &&
- (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
- // Surrogates and code points outside the Unicode range.
- ucs4 = UCS2_REPLACEMENT_CHAR;
- }
- *aBuffer = p;
- return ucs4;
- }
- private:
- static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
- int32_t& aState)
- {
- if (UTF8traits::is2byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
- aState = 1;
- aMinUcs4 = 0x00000080;
- } else if (UTF8traits::is3byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
- aState = 2;
- aMinUcs4 = 0x00000800;
- } else if (UTF8traits::is4byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
- aState = 3;
- aMinUcs4 = 0x00010000;
- } else if (UTF8traits::is5byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
- aState = 4;
- aMinUcs4 = 0x00200000;
- } else if (UTF8traits::is6byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
- aState = 5;
- aMinUcs4 = 0x04000000;
- } else {
- return false;
- }
- return true;
- }
- static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
- {
- if (UTF8traits::isInSeq(aChar)) {
- int32_t shift = aState * 6;
- aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
- return true;
- }
- return false;
- }
- };
- /**
- * Extract the next UCS-4 character from the buffer and return it. The
- * pointer passed in is advanced to the start of the next character in the
- * buffer. If non-null, the err parameter is filled in if an error occurs.
- *
- * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
- * the buffer will be updated to move only a single UCS-2 character.
- *
- * Any other error returns 0 and does not move the buffer position.
- */
- class UTF16CharEnumerator
- {
- public:
- static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
- bool* aErr = nullptr)
- {
- NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
- const char16_t* p = *aBuffer;
- if (p >= aEnd) {
- NS_ERROR("No input to work with");
- if (aErr) {
- *aErr = true;
- }
- return 0;
- }
- char16_t c = *p++;
- if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
- if (aErr) {
- *aErr = false;
- }
- *aBuffer = p;
- return c;
- } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
- if (p == aEnd) {
- // Found a high surrogate at the end of the buffer. Flag this
- // as an error and return the Unicode replacement
- // character 0xFFFD.
- NS_WARNING("Unexpected end of buffer after high surrogate");
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p;
- return 0xFFFD;
- }
- // D800- DBFF - High Surrogate
- char16_t h = c;
- c = *p++;
- if (NS_IS_LOW_SURROGATE(c)) {
- // DC00- DFFF - Low Surrogate
- // N = (H - D800) *400 + 10000 + (L - DC00)
- uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
- if (aErr) {
- *aErr = false;
- }
- *aBuffer = p;
- return ucs4;
- } else {
- // Found a high surrogate followed by something other than
- // a low surrogate. Flag this as an error and return the
- // Unicode replacement character 0xFFFD. Note that the
- // pointer to the next character points to the second 16-bit
- // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
- // only the first code unit of an illegal sequence must be
- // treated as an illegally terminated code unit sequence
- // (also Chapter 3 D91, "isolated [not paired and ill-formed]
- // UTF-16 code units in the range D800..DFFF are ill-formed").
- NS_WARNING("got a High Surrogate but no low surrogate");
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p - 1;
- return 0xFFFD;
- }
- } else { // U+DC00 - U+DFFF
- // DC00- DFFF - Low Surrogate
- // Found a low surrogate w/o a preceding high surrogate. Flag
- // this as an error and return the Unicode replacement
- // character 0xFFFD.
- NS_WARNING("got a low Surrogate but no high surrogate");
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p;
- return 0xFFFD;
- }
- MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
- }
- };
- /**
- * A character sink (see |copy_string| in nsAlgorithm.h) for converting
- * UTF-8 to UTF-16
- */
- class ConvertUTF8toUTF16
- {
- public:
- typedef char value_type;
- typedef char16_t buffer_type;
- explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
- : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
- {
- }
- size_t Length() const
- {
- return mBuffer - mStart;
- }
- bool ErrorEncountered() const
- {
- return mErrorEncountered;
- }
- void write(const value_type* aStart, uint32_t aN)
- {
- if (mErrorEncountered) {
- return;
- }
- // algorithm assumes utf8 units won't
- // be spread across fragments
- const value_type* p = aStart;
- const value_type* end = aStart + aN;
- buffer_type* out = mBuffer;
- for (; p != end /* && *p */;) {
- bool err;
- uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
- if (err) {
- mErrorEncountered = true;
- mBuffer = out;
- return;
- }
- if (ucs4 >= PLANE1_BASE) {
- *out++ = (buffer_type)H_SURROGATE(ucs4);
- *out++ = (buffer_type)L_SURROGATE(ucs4);
- } else {
- *out++ = ucs4;
- }
- }
- mBuffer = out;
- }
- void write_terminator()
- {
- *mBuffer = buffer_type(0);
- }
- private:
- buffer_type* const mStart;
- buffer_type* mBuffer;
- bool mErrorEncountered;
- };
- /**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the length of the UTF-16 string equivalent to a UTF-8 string.
- */
- class CalculateUTF8Length
- {
- public:
- typedef char value_type;
- CalculateUTF8Length()
- : mLength(0), mErrorEncountered(false)
- {
- }
- size_t Length() const
- {
- return mLength;
- }
- void write(const value_type* aStart, uint32_t aN)
- {
- // ignore any further requests
- if (mErrorEncountered) {
- return;
- }
- // algorithm assumes utf8 units won't
- // be spread across fragments
- const value_type* p = aStart;
- const value_type* end = aStart + aN;
- for (; p < end /* && *p */; ++mLength) {
- if (UTF8traits::isASCII(*p)) {
- p += 1;
- } else if (UTF8traits::is2byte(*p)) {
- p += 2;
- } else if (UTF8traits::is3byte(*p)) {
- p += 3;
- } else if (UTF8traits::is4byte(*p)) {
- // Because a UTF-8 sequence of 4 bytes represents a codepoint
- // greater than 0xFFFF, it will become a surrogate pair in the
- // UTF-16 string, so add 1 more to mLength.
- // This doesn't happen with is5byte and is6byte because they
- // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
- // converted to a single replacement character.
- // However, there is one case when a 4 byte UTF-8 sequence will
- // only generate 2 UTF-16 bytes. If we have a properly encoded
- // sequence, but with an invalid value (too small or too big),
- // that will result in a replacement character being written
- // This replacement character is encoded as just 1 single
- // UTF-16 character, which is 2 bytes.
- // The below code therefore only adds 1 to mLength if the UTF8
- // data will produce a decoded character which is greater than
- // or equal to 0x010000 and less than 0x0110000.
- // A 4byte UTF8 character is encoded as
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
- // map to bit 17-21 in the final result. If these bits are
- // between 0x01 and 0x11, that means that the final result is
- // between 0x010000 and 0x110000. The below code reads these
- // bits out and assigns them to c, but shifted up 4 bits to
- // avoid having to shift twice.
- // It doesn't matter what to do in the case where p + 4 > end
- // since no UTF16 characters will be written in that case by
- // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
- // any of the surrogate bits are wrong since no UTF16
- // characters will be written in that case either.
- if (p + 4 <= end) {
- uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
- ((uint32_t)(p[1] & 0x30));
- if (c >= 0x010 && c < 0x110) {
- ++mLength;
- }
- }
- p += 4;
- } else if (UTF8traits::is5byte(*p)) {
- p += 5;
- } else if (UTF8traits::is6byte(*p)) {
- p += 6;
- } else { // error
- ++mLength; // to account for the decrement below
- break;
- }
- }
- if (p != end) {
- NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
- --mLength; // The last multi-byte char wasn't complete, discard it.
- mErrorEncountered = true;
- }
- }
- private:
- size_t mLength;
- bool mErrorEncountered;
- };
- /**
- * A character sink (see |copy_string| in nsAlgorithm.h) for
- * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
- * (0xEFBFBD in UTF-8).
- */
- class ConvertUTF16toUTF8
- {
- public:
- typedef char16_t value_type;
- typedef char buffer_type;
- // The error handling here is more lenient than that in
- // |ConvertUTF8toUTF16|, but it's that way for backwards
- // compatibility.
- explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
- : mStart(aBuffer), mBuffer(aBuffer)
- {
- }
- size_t Size() const
- {
- return mBuffer - mStart;
- }
- void write(const value_type* aStart, uint32_t aN)
- {
- buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
- for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
- value_type c = *p;
- if (!(c & 0xFF80)) { // U+0000 - U+007F
- *out++ = (char)c;
- } else if (!(c & 0xF800)) { // U+0100 - U+07FF
- *out++ = 0xC0 | (char)(c >> 6);
- *out++ = 0x80 | (char)(0x003F & c);
- } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
- *out++ = 0xE0 | (char)(c >> 12);
- *out++ = 0x80 | (char)(0x003F & (c >> 6));
- *out++ = 0x80 | (char)(0x003F & c);
- } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
- // D800- DBFF - High Surrogate
- value_type h = c;
- ++p;
- if (p == end) {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
- NS_WARNING("String ending in half a surrogate pair!");
- break;
- }
- c = *p;
- if (NS_IS_LOW_SURROGATE(c)) {
- // DC00- DFFF - Low Surrogate
- // N = (H - D800) *400 + 10000 + ( L - DC00 )
- uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
- // 0001 0000-001F FFFF
- *out++ = 0xF0 | (char)(ucs4 >> 18);
- *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
- *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
- *out++ = 0x80 | (char)(0x003F & ucs4);
- } else {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
- // The pointer to the next character points to the second
- // 16-bit value, not beyond it, as per Unicode 5.0.0
- // Chapter 3 C10, only the first code unit of an illegal
- // sequence must be treated as an illegally terminated
- // code unit sequence (also Chapter 3 D91, "isolated [not
- // paired and ill-formed] UTF-16 code units in the range
- // D800..DFFF are ill-formed").
- p--;
- NS_WARNING("got a High Surrogate but no low surrogate");
- }
- } else { // U+DC00 - U+DFFF
- // Treat broken characters as the Unicode replacement
- // character 0xFFFD (0xEFBFBD in UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
- // DC00- DFFF - Low Surrogate
- NS_WARNING("got a low Surrogate but no high surrogate");
- }
- }
- mBuffer = out;
- }
- void write_terminator()
- {
- *mBuffer = buffer_type(0);
- }
- private:
- buffer_type* const mStart;
- buffer_type* mBuffer;
- };
- /**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
- * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
- */
- class CalculateUTF8Size
- {
- public:
- typedef char16_t value_type;
- CalculateUTF8Size()
- : mSize(0)
- {
- }
- size_t Size() const
- {
- return mSize;
- }
- void write(const value_type* aStart, uint32_t aN)
- {
- // Assume UCS2 surrogate pairs won't be spread across fragments.
- for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
- value_type c = *p;
- if (!(c & 0xFF80)) { // U+0000 - U+007F
- mSize += 1;
- } else if (!(c & 0xF800)) { // U+0100 - U+07FF
- mSize += 2;
- } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
- mSize += 3;
- } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
- ++p;
- if (p == end) {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- mSize += 3;
- NS_WARNING("String ending in half a surrogate pair!");
- break;
- }
- c = *p;
- if (0xDC00 == (0xFC00 & c)) {
- mSize += 4;
- } else {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- mSize += 3;
- // The next code unit is the second 16-bit value, not
- // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
- // only the first code unit of an illegal sequence must
- // be treated as an illegally terminated code unit
- // sequence (also Chapter 3 D91, "isolated [not paired and
- // ill-formed] UTF-16 code units in the range D800..DFFF
- // are ill-formed").
- p--;
- NS_WARNING("got a high Surrogate but no low surrogate");
- }
- } else { // U+DC00 - U+DFFF
- // Treat broken characters as the Unicode replacement
- // character 0xFFFD (0xEFBFBD in UTF-8)
- mSize += 3;
- NS_WARNING("got a low Surrogate but no high surrogate");
- }
- }
- }
- private:
- size_t mSize;
- };
- #ifdef MOZILLA_INTERNAL_API
/**
 * A character sink that widens 8-bit characters to char16_t.
 *
 * Every input byte is treated as an unsigned value in 0..255 and
 * zero-extended, so no sign extension takes place for bytes >= 0x80.
 * Output is appended at the destination pointer handed to the constructor;
 * the sink does no bounds checking.
 */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  explicit LossyConvertEncoding8to16(char16_t* aDestination)
    : mDestination(aDestination)
  {
  }

  // Widens aSourceLength bytes starting at aSource and advances the
  // destination pointer past what was written.
  void
  write(const char* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    const char* done_writing = aSource + aSourceLength;
    while (aSource < done_writing) {
      // Go through unsigned char first so that bytes >= 0x80 are not
      // sign-extended on platforms where plain char is signed.
      *mDestination++ = (char16_t)(unsigned char)(*aSource++);
    }
  }

  // Only declared where write() can call it, matching
  // LossyConvertEncoding16to8. The definition is not in this header.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2(const char* aSource, uint32_t aSourceLength);
#endif

  // Stores a terminating zero at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;
};
/**
 * A character sink that narrows char16_t code units to char.
 *
 * Each code unit is converted with a plain cast to char, so anything that
 * does not fit in a char loses its upper bits. Output is appended at the
 * destination pointer handed to the constructor; the sink does no bounds
 * checking.
 */
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  explicit LossyConvertEncoding16to8(char* aDestination)
    : mDestination(aDestination)
  {
  }

  // Narrows aSourceLength code units starting at aSource and advances the
  // destination pointer past what was written.
  void
  write(const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for (const char16_t* const stop = aSource + aSourceLength;
         aSource < stop; ++aSource) {
      *mDestination = (char)(*aSource);
      ++mDestination;
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  // SSE2 variant of write(); the definition is not in this header.
  void
  write_sse2(const char16_t* aSource, uint32_t aSourceLength);
#endif

  // Stores a terminating NUL at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char* mDestination;
};
- #endif // MOZILLA_INTERNAL_API
- template<typename Char, typename UnsignedT>
- inline UnsignedT
- RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
- {
- static_assert(mozilla::IsSame<Char, char>::value ||
- mozilla::IsSame<Char, unsigned char>::value ||
- mozilla::IsSame<Char, signed char>::value,
- "UTF-8 data must be in 8-bit units");
- static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
- while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
- --index;
- return index;
- }
- #endif /* !defined(nsUTF8Utils_h_) */
|