12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428 |
- /* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
- /* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
- #include "mozTXTToHTMLConv.h"
- #include "nsNetUtil.h"
- #include "nsUnicharUtils.h"
- #include "nsCRT.h"
- #include "nsIExternalProtocolHandler.h"
- #include "nsIIOService.h"
- #include "nsIURI.h"
- #include <algorithm>
- #ifdef DEBUG_BenB_Perf
- #include "prtime.h"
- #include "prinrval.h"
- #endif
// Multiplier constant (1.2). It is not referenced in this part of the file;
// presumably a buffer growth factor used when sizing output strings -- TODO confirm.
- const double growthRate = 1.2;
- // Bug 183111, editor now replaces multiple spaces with leading
- // 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
- // 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
- // Also recognize the Japanese ideographic space 0x3000 as a space.
- static inline bool IsSpace(const char16_t aChar)
- {
- return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
- }
- // Escape Char will take ch, escape it and append the result to
- // aStringToAppendTo
- void
- mozTXTToHTMLConv::EscapeChar(const char16_t ch, nsString& aStringToAppendTo,
- bool inAttribute)
- {
- switch (ch)
- {
- case '<':
- aStringToAppendTo.AppendLiteral("<");
- break;
- case '>':
- aStringToAppendTo.AppendLiteral(">");
- break;
- case '&':
- aStringToAppendTo.AppendLiteral("&");
- break;
- case '"':
- if (inAttribute)
- {
- aStringToAppendTo.AppendLiteral(""");
- break;
- }
- // else fall through
- MOZ_FALLTHROUGH;
- default:
- aStringToAppendTo += ch;
- }
- return;
- }
- // EscapeStr takes the passed in string and
- // escapes it IN PLACE.
- void
- mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute)
- {
- // the replace substring routines
- // don't seem to work if you have a character
- // in the in string that is also in the replacement
- // string! =(
- //aInString.ReplaceSubstring("&", "&");
- //aInString.ReplaceSubstring("<", "<");
- //aInString.ReplaceSubstring(">", ">");
- for (uint32_t i = 0; i < aInString.Length();)
- {
- switch (aInString[i])
- {
- case '<':
- aInString.Cut(i, 1);
- aInString.Insert(NS_LITERAL_STRING("<"), i);
- i += 4; // skip past the integers we just added
- break;
- case '>':
- aInString.Cut(i, 1);
- aInString.Insert(NS_LITERAL_STRING(">"), i);
- i += 4; // skip past the integers we just added
- break;
- case '&':
- aInString.Cut(i, 1);
- aInString.Insert(NS_LITERAL_STRING("&"), i);
- i += 5; // skip past the integers we just added
- break;
- case '"':
- if (inAttribute)
- {
- aInString.Cut(i, 1);
- aInString.Insert(NS_LITERAL_STRING("""), i);
- i += 6;
- break;
- }
- // else fall through
- MOZ_FALLTHROUGH;
- default:
- i++;
- }
- }
- }
- void
- mozTXTToHTMLConv::UnescapeStr(const char16_t * aInString, int32_t aStartPos, int32_t aLength, nsString& aOutString)
- {
- const char16_t * subString = nullptr;
- for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;)
- {
- int32_t remainingChars = i - aStartPos;
- if (aInString[i] == '&')
- {
- subString = &aInString[i];
- if (!nsCRT::strncmp(subString, u"<", std::min(4, aLength - remainingChars)))
- {
- aOutString.Append(char16_t('<'));
- i += 4;
- }
- else if (!nsCRT::strncmp(subString, u">", std::min(4, aLength - remainingChars)))
- {
- aOutString.Append(char16_t('>'));
- i += 4;
- }
- else if (!nsCRT::strncmp(subString, u"&", std::min(5, aLength - remainingChars)))
- {
- aOutString.Append(char16_t('&'));
- i += 5;
- }
- else if (!nsCRT::strncmp(subString, u""", std::min(6, aLength - remainingChars)))
- {
- aOutString.Append(char16_t('"'));
- i += 6;
- }
- else
- {
- aOutString += aInString[i];
- i++;
- }
- }
- else
- {
- aOutString += aInString[i];
- i++;
- }
- }
- }
- void
- mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t * aInString, int32_t aInLength,
- const uint32_t pos, nsString& aOutString)
- {
- NS_ASSERTION(int32_t(pos) < aInLength, "bad args to CompleteAbbreviatedURL, see bug #190851");
- if (int32_t(pos) >= aInLength)
- return;
- if (aInString[pos] == '@')
- {
- // only pre-pend a mailto url if the string contains a .domain in it..
- //i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
- nsDependentString inString(aInString, aInLength);
- if (inString.FindChar('.', pos) != kNotFound) // if we have a '.' after the @ sign....
- {
- aOutString.AssignLiteral("mailto:");
- aOutString += aInString;
- }
- }
- else if (aInString[pos] == '.')
- {
- if (ItMatchesDelimited(aInString, aInLength,
- u"www.", 4, LT_IGNORE, LT_IGNORE))
- {
- aOutString.AssignLiteral("http://");
- aOutString += aInString;
- }
- else if (ItMatchesDelimited(aInString,aInLength, u"ftp.", 4, LT_IGNORE, LT_IGNORE))
- {
- aOutString.AssignLiteral("ftp://");
- aOutString += aInString;
- }
- }
- }
- bool
- mozTXTToHTMLConv::FindURLStart(const char16_t * aInString, int32_t aInLength,
- const uint32_t pos, const modetype check,
- uint32_t& start)
- {
- switch(check)
- { // no breaks, because end of blocks is never reached
- case RFC1738:
- {
- if (!nsCRT::strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u"<URL:", 5))
- {
- start = pos + 1;
- return true;
- }
- else
- return false;
- }
- case RFC2396E:
- {
- nsString temp(aInString, aInLength);
- int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1);
- if (i != kNotFound && (temp[uint32_t(i)] == '<' ||
- temp[uint32_t(i)] == '"'))
- {
- start = uint32_t(++i);
- return start < pos;
- }
- else
- return false;
- }
- case freetext:
- {
- int32_t i = pos - 1;
- for (; i >= 0 && (
- nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]) ||
- nsCRT::IsAsciiDigit(aInString[uint32_t(i)]) ||
- aInString[uint32_t(i)] == '+' ||
- aInString[uint32_t(i)] == '-' ||
- aInString[uint32_t(i)] == '.'
- ); i--)
- ;
- if (++i >= 0 && uint32_t(i) < pos && nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]))
- {
- start = uint32_t(i);
- return true;
- }
- else
- return false;
- }
- case abbreviated:
- {
- int32_t i = pos - 1;
- // This disallows non-ascii-characters for email.
- // Currently correct, but revisit later after standards changed.
- bool isEmail = aInString[pos] == (char16_t)'@';
- // These chars mark the start of the URL
- for (; i >= 0
- && aInString[uint32_t(i)] != '>' && aInString[uint32_t(i)] != '<'
- && aInString[uint32_t(i)] != '"' && aInString[uint32_t(i)] != '\''
- && aInString[uint32_t(i)] != '`' && aInString[uint32_t(i)] != ','
- && aInString[uint32_t(i)] != '{' && aInString[uint32_t(i)] != '['
- && aInString[uint32_t(i)] != '(' && aInString[uint32_t(i)] != '|'
- && aInString[uint32_t(i)] != '\\'
- && !IsSpace(aInString[uint32_t(i)])
- && (!isEmail || nsCRT::IsAscii(aInString[uint32_t(i)]))
- ; i--)
- ;
- if
- (
- ++i >= 0 && uint32_t(i) < pos
- &&
- (
- nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]) ||
- nsCRT::IsAsciiDigit(aInString[uint32_t(i)])
- )
- )
- {
- start = uint32_t(i);
- return true;
- }
- else
- return false;
- }
- default:
- return false;
- } //switch
- }
- bool
- mozTXTToHTMLConv::FindURLEnd(const char16_t * aInString, int32_t aInStringLength, const uint32_t pos,
- const modetype check, const uint32_t start, uint32_t& end)
- {
- switch(check)
- { // no breaks, because end of blocks is never reached
- case RFC1738:
- case RFC2396E:
- {
- nsString temp(aInString, aInStringLength);
- int32_t i = temp.FindCharInSet(u"<>\"", pos + 1);
- if (i != kNotFound && temp[uint32_t(i--)] ==
- (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"'))
- {
- end = uint32_t(i);
- return end > pos;
- }
- return false;
- }
- case freetext:
- case abbreviated:
- {
- uint32_t i = pos + 1;
- bool isEmail = aInString[pos] == (char16_t)'@';
- bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL
- bool seenOpeningSquareBracket = false; // there is a '[' earlier in the URL
- for (; int32_t(i) < aInStringLength; i++)
- {
- // These chars mark the end of the URL
- if (aInString[i] == '>' || aInString[i] == '<' ||
- aInString[i] == '"' || aInString[i] == '`' ||
- aInString[i] == '}' || aInString[i] == '{' ||
- (aInString[i] == ')' && !seenOpeningParenthesis) ||
- (aInString[i] == ']' && !seenOpeningSquareBracket) ||
- // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
- (aInString[i] == '[' && i > 2 &&
- (aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
- IsSpace(aInString[i]))
- break;
- // Disallow non-ascii-characters for email.
- // Currently correct, but revisit later after standards changed.
- if (isEmail && (
- aInString[i] == '(' || aInString[i] == '\'' ||
- !nsCRT::IsAscii(aInString[i])))
- break;
- if (aInString[i] == '(')
- seenOpeningParenthesis = true;
- if (aInString[i] == '[')
- seenOpeningSquareBracket = true;
- }
- // These chars are allowed in the middle of the URL, but not at end.
- // Technically they are, but are used in normal text after the URL.
- while (--i > pos && (
- aInString[i] == '.' || aInString[i] == ',' || aInString[i] == ';' ||
- aInString[i] == '!' || aInString[i] == '?' || aInString[i] == '-' ||
- aInString[i] == ':' || aInString[i] == '\''
- ))
- ;
- if (i > pos)
- {
- end = i;
- return true;
- }
- return false;
- }
- default:
- return false;
- } //switch
- }
- void
- mozTXTToHTMLConv::CalculateURLBoundaries(const char16_t * aInString, int32_t aInStringLength,
- const uint32_t pos, const uint32_t whathasbeendone,
- const modetype check, const uint32_t start, const uint32_t end,
- nsString& txtURL, nsString& desc,
- int32_t& replaceBefore, int32_t& replaceAfter)
- {
- uint32_t descstart = start;
- switch(check)
- {
- case RFC1738:
- {
- descstart = start - 5;
- desc.Append(&aInString[descstart], end - descstart + 2); // include "<URL:" and ">"
- replaceAfter = end - pos + 1;
- } break;
- case RFC2396E:
- {
- descstart = start - 1;
- desc.Append(&aInString[descstart], end - descstart + 2); // include brackets
- replaceAfter = end - pos + 1;
- } break;
- case freetext:
- case abbreviated:
- {
- descstart = start;
- desc.Append(&aInString[descstart], end - start + 1); // don't include brackets
- replaceAfter = end - pos;
- } break;
- default: break;
- } //switch
- EscapeStr(desc, false);
- txtURL.Append(&aInString[start], end - start + 1);
- txtURL.StripWhitespace();
- // FIX ME
- nsAutoString temp2;
- ScanTXT(&aInString[descstart], pos - descstart, ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
- replaceBefore = temp2.Length();
- return;
- }
- bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL)
- {
- if (!mIOService)
- return false;
- nsAutoCString scheme;
- nsresult rv = mIOService->ExtractScheme(aURL, scheme);
- if(NS_FAILED(rv))
- return false;
- // Get the handler for this scheme.
- nsCOMPtr<nsIProtocolHandler> handler;
- rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
- if(NS_FAILED(rv))
- return false;
- // Is it an external protocol handler? If not, linkify it.
- nsCOMPtr<nsIExternalProtocolHandler> externalHandler = do_QueryInterface(handler);
- if (!externalHandler)
- return true; // handler is built-in, linkify it!
- // If external app exists for the scheme then linkify it.
- bool exists;
- rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
- return(NS_SUCCEEDED(rv) && exists);
- }
- bool
- mozTXTToHTMLConv::CheckURLAndCreateHTML(
- const nsString& txtURL, const nsString& desc, const modetype mode,
- nsString& outputHTML)
- {
- // Create *uri from txtURL
- nsCOMPtr<nsIURI> uri;
- nsresult rv;
- // Lazily initialize mIOService
- if (!mIOService)
- {
- mIOService = do_GetIOService();
- if (!mIOService)
- return false;
- }
- // See if the url should be linkified.
- NS_ConvertUTF16toUTF8 utf8URL(txtURL);
- if (!ShouldLinkify(utf8URL))
- return false;
- // it would be faster if we could just check to see if there is a protocol
- // handler for the url and return instead of actually trying to create a url...
- rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));
- // Real work
- if (NS_SUCCEEDED(rv) && uri)
- {
- outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
- switch(mode)
- {
- case RFC1738:
- outputHTML.AppendLiteral("rfc1738");
- break;
- case RFC2396E:
- outputHTML.AppendLiteral("rfc2396E");
- break;
- case freetext:
- outputHTML.AppendLiteral("freetext");
- break;
- case abbreviated:
- outputHTML.AppendLiteral("abbreviated");
- break;
- default: break;
- }
- nsAutoString escapedURL(txtURL);
- EscapeStr(escapedURL, true);
- outputHTML.AppendLiteral("\" href=\"");
- outputHTML += escapedURL;
- outputHTML.AppendLiteral("\">");
- outputHTML += desc;
- outputHTML.AppendLiteral("</a>");
- return true;
- }
- else
- return false;
- }
- NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t * aInString, int32_t aInLength, int32_t aPos, int32_t * aStartPos, int32_t * aEndPos)
- {
- // call FindURL on the passed in string
- nsAutoString outputHTML; // we'll ignore the generated output HTML
- *aStartPos = -1;
- *aEndPos = -1;
- FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);
- return NS_OK;
- }
- bool
- mozTXTToHTMLConv::FindURL(const char16_t * aInString, int32_t aInLength, const uint32_t pos,
- const uint32_t whathasbeendone,
- nsString& outputHTML, int32_t& replaceBefore, int32_t& replaceAfter)
- {
- enum statetype {unchecked, invalid, startok, endok, success};
- static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};
- statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode
- /* I don't like this abuse of enums as index for the array,
- but I don't know a better method */
- // Define, which modes to check
- /* all modes but abbreviated are checked for text[pos] == ':',
- only abbreviated for '.', RFC2396E and abbreviated for '@' */
- for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
- iState = modetype(iState + 1))
- state[iState] = aInString[pos] == ':' ? unchecked : invalid;
- switch (aInString[pos])
- {
- case '@':
- state[RFC2396E] = unchecked;
- MOZ_FALLTHROUGH;
- case '.':
- state[abbreviated] = unchecked;
- break;
- case ':':
- state[abbreviated] = invalid;
- break;
- default:
- break;
- }
- // Test, first successful mode wins, sequence defined by |ranking|
- int32_t iCheck = 0; // the currently tested modetype
- modetype check = ranking[iCheck];
- for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
- iCheck++)
- /* check state from last run.
- If this is the first, check this one, which isn't = success yet */
- {
- check = ranking[iCheck];
- uint32_t start, end;
- if (state[check] == unchecked)
- if (FindURLStart(aInString, aInLength, pos, check, start))
- state[check] = startok;
- if (state[check] == startok)
- if (FindURLEnd(aInString, aInLength, pos, check, start, end))
- state[check] = endok;
- if (state[check] == endok)
- {
- nsAutoString txtURL, desc;
- int32_t resultReplaceBefore, resultReplaceAfter;
- CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check, start, end,
- txtURL, desc,
- resultReplaceBefore, resultReplaceAfter);
- if (aInString[pos] != ':')
- {
- nsAutoString temp = txtURL;
- txtURL.SetLength(0);
- CompleteAbbreviatedURL(temp.get(),temp.Length(), pos - start, txtURL);
- }
- if (!txtURL.IsEmpty() && CheckURLAndCreateHTML(txtURL, desc, check,
- outputHTML))
- {
- replaceBefore = resultReplaceBefore;
- replaceAfter = resultReplaceAfter;
- state[check] = success;
- }
- } // if
- } // for
- return state[check] == success;
- }
- bool
- mozTXTToHTMLConv::ItMatchesDelimited(const char16_t * aInString,
- int32_t aInLength, const char16_t* rep, int32_t aRepLen,
- LIMTYPE before, LIMTYPE after)
- {
- // this little method gets called a LOT. I found we were spending a
- // lot of time just calculating the length of the variable "rep"
- // over and over again every time we called it. So we're now passing
- // an integer in here.
- int32_t textLen = aInLength;
- if
- (
- ((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER))
- && textLen < aRepLen) ||
- ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER))
- && textLen < aRepLen + 1) ||
- (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER
- && textLen < aRepLen + 2)
- )
- return false;
- char16_t text0 = aInString[0];
- char16_t textAfterPos = aInString[aRepLen + (before == LT_IGNORE ? 0 : 1)];
- if
- (
- (before == LT_ALPHA
- && !nsCRT::IsAsciiAlpha(text0)) ||
- (before == LT_DIGIT
- && !nsCRT::IsAsciiDigit(text0)) ||
- (before == LT_DELIMITER
- &&
- (
- nsCRT::IsAsciiAlpha(text0) ||
- nsCRT::IsAsciiDigit(text0) ||
- text0 == *rep
- )) ||
- (after == LT_ALPHA
- && !nsCRT::IsAsciiAlpha(textAfterPos)) ||
- (after == LT_DIGIT
- && !nsCRT::IsAsciiDigit(textAfterPos)) ||
- (after == LT_DELIMITER
- &&
- (
- nsCRT::IsAsciiAlpha(textAfterPos) ||
- nsCRT::IsAsciiDigit(textAfterPos) ||
- textAfterPos == *rep
- )) ||
- !Substring(Substring(aInString, aInString+aInLength),
- (before == LT_IGNORE ? 0 : 1),
- aRepLen).Equals(Substring(rep, rep+aRepLen),
- nsCaseInsensitiveStringComparator())
- )
- return false;
- return true;
- }
- uint32_t
- mozTXTToHTMLConv::NumberOfMatches(const char16_t * aInString, int32_t aInStringLength,
- const char16_t* rep, int32_t aRepLen, LIMTYPE before, LIMTYPE after)
- {
- uint32_t result = 0;
- for (int32_t i = 0; i < aInStringLength; i++)
- {
- const char16_t * indexIntoString = &aInString[i];
- if (ItMatchesDelimited(indexIntoString, aInStringLength - i, rep, aRepLen, before, after))
- result++;
- }
- return result;
- }
- // NOTE: the converted html for the phrase is appended to aOutString
- // tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
- bool
- mozTXTToHTMLConv::StructPhraseHit(const char16_t * aInString, int32_t aInStringLength, bool col0,
- const char16_t* tagTXT, int32_t aTagTXTLen,
- const char* tagHTML, const char* attributeHTML,
- nsString& aOutString, uint32_t& openTags)
- {
- /* We're searching for the following pattern:
- LT_DELIMITER - "*" - ALPHA -
- [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
- <strong> is only inserted, if existence of a pair could be verified
- We use the first opening/closing tag, if we can choose */
- const char16_t * newOffset = aInString;
- int32_t newLength = aInStringLength;
- if (!col0) // skip the first element?
- {
- newOffset = &aInString[1];
- newLength = aInStringLength - 1;
- }
- // opening tag
- if
- (
- ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
- (col0 ? LT_IGNORE : LT_DELIMITER), LT_ALPHA) // is opening tag
- && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen,
- LT_ALPHA, LT_DELIMITER) // remaining closing tags
- > openTags
- )
- {
- openTags++;
- aOutString.Append('<');
- aOutString.AppendASCII(tagHTML);
- aOutString.Append(char16_t(' '));
- aOutString.AppendASCII(attributeHTML);
- aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
- aOutString.Append(tagTXT);
- aOutString.AppendLiteral("</span>");
- return true;
- }
- // closing tag
- else if (openTags > 0
- && ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen, LT_ALPHA, LT_DELIMITER))
- {
- openTags--;
- aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
- aOutString.Append(tagTXT);
- aOutString.AppendLiteral("</span></");
- aOutString.AppendASCII(tagHTML);
- aOutString.Append(char16_t('>'));
- return true;
- }
- return false;
- }
- bool
- mozTXTToHTMLConv::SmilyHit(const char16_t * aInString, int32_t aLength, bool col0,
- const char* tagTXT, const char* imageName,
- nsString& outputHTML, int32_t& glyphTextLen)
- {
- if ( !aInString || !tagTXT || !imageName )
- return false;
- int32_t tagLen = strlen(tagTXT);
-
- uint32_t delim = (col0 ? 0 : 1) + tagLen;
- if
- (
- (col0 || IsSpace(aInString[0]))
- &&
- (
- aLength <= int32_t(delim) ||
- IsSpace(aInString[delim]) ||
- (aLength > int32_t(delim + 1)
- &&
- (
- aInString[delim] == '.' ||
- aInString[delim] == ',' ||
- aInString[delim] == ';' ||
- aInString[delim] == '8' ||
- aInString[delim] == '>' ||
- aInString[delim] == '!' ||
- aInString[delim] == '?'
- )
- && IsSpace(aInString[delim + 1]))
- )
- && ItMatchesDelimited(aInString, aLength, NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
- col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
- // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
- )
- {
- if (!col0)
- {
- outputHTML.Truncate();
- outputHTML.Append(char16_t(' '));
- }
- outputHTML.AppendLiteral("<span class=\""); // <span class="
- AppendASCIItoUTF16(imageName, outputHTML); // e.g. smiley-frown
- outputHTML.AppendLiteral("\" title=\""); // " title="
- AppendASCIItoUTF16(tagTXT, outputHTML); // smiley tooltip
- outputHTML.AppendLiteral("\"><span>"); // "><span>
- AppendASCIItoUTF16(tagTXT, outputHTML); // original text
- outputHTML.AppendLiteral("</span></span>"); // </span></span>
- glyphTextLen = (col0 ? 0 : 1) + tagLen;
- return true;
- }
- return false;
- }
- // the glyph is appended to aOutputString instead of the original string...
- bool
- mozTXTToHTMLConv::GlyphHit(const char16_t * aInString, int32_t aInLength, bool col0,
- nsString& aOutputString, int32_t& glyphTextLen)
- {
- char16_t text0 = aInString[0];
- char16_t text1 = aInString[1];
- char16_t firstChar = (col0 ? text0 : text1);
- // temporary variable used to store the glyph html text
- nsAutoString outputHTML;
- bool bTestSmilie;
- bool bArg = false;
- int i;
- // refactor some of this mess to avoid code duplication and speed execution a bit
- // there are two cases that need to be tried one after another. To avoid a lot of
- // duplicate code, rolling into a loop
- i = 0;
- while ( i < 2 )
- {
- bTestSmilie = false;
- if ( !i && (firstChar == ':' || firstChar == ';' || firstChar == '=' || firstChar == '>' || firstChar == '8' || firstChar == 'O'))
- {
- // first test passed
- bTestSmilie = true;
- bArg = col0;
- }
- if ( i && col0 && ( text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' || text1 == '8' || text1 == 'O' ) )
- {
- // second test passed
- bTestSmilie = true;
- bArg = false;
- }
- if ( bTestSmilie && (
- SmilyHit(aInString, aInLength, bArg,
- ":-)",
- "moz-smiley-s1", // smile
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":)",
- "moz-smiley-s1", // smile
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-D",
- "moz-smiley-s5", // laughing
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-(",
- "moz-smiley-s2", // frown
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":(",
- "moz-smiley-s2", // frown
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-[",
- "moz-smiley-s6", // embarassed
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ";-)",
- "moz-smiley-s3", // wink
- outputHTML, glyphTextLen) ||
- SmilyHit(aInString, aInLength, col0,
- ";)",
- "moz-smiley-s3", // wink
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-\\",
- "moz-smiley-s7", // undecided
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-P",
- "moz-smiley-s4", // tongue
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ";-P",
- "moz-smiley-s4", // tongue
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- "=-O",
- "moz-smiley-s8", // surprise
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-*",
- "moz-smiley-s9", // kiss
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ">:o",
- "moz-smiley-s10", // yell
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ">:-o",
- "moz-smiley-s10", // yell
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- "8-)",
- "moz-smiley-s11", // cool
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-$",
- "moz-smiley-s12", // money
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-!",
- "moz-smiley-s13", // foot
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- "O:-)",
- "moz-smiley-s14", // innocent
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":'(",
- "moz-smiley-s15", // cry
- outputHTML, glyphTextLen) ||
-
- SmilyHit(aInString, aInLength, bArg,
- ":-X",
- "moz-smiley-s16", // sealed
- outputHTML, glyphTextLen)
- )
- )
- {
- aOutputString.Append(outputHTML);
- return true;
- }
- i++;
- }
- if (text0 == '\f')
- {
- aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
- glyphTextLen = 1;
- return true;
- }
- if (text0 == '+' || text1 == '+')
- {
- if (ItMatchesDelimited(aInString, aInLength,
- u" +/-", 4,
- LT_IGNORE, LT_IGNORE))
- {
- aOutputString.AppendLiteral(" ±");
- glyphTextLen = 4;
- return true;
- }
- if (col0 && ItMatchesDelimited(aInString, aInLength,
- u"+/-", 3,
- LT_IGNORE, LT_IGNORE))
- {
- aOutputString.AppendLiteral("±");
- glyphTextLen = 3;
- return true;
- }
- }
- // x^2 => x<sup>2</sup>, also handle powers x^-2, x^0.5
- // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
- if
- (
- text1 == '^'
- &&
- (
- nsCRT::IsAsciiDigit(text0) || nsCRT::IsAsciiAlpha(text0) ||
- text0 == ')' || text0 == ']' || text0 == '}'
- )
- &&
- (
- (2 < aInLength && nsCRT::IsAsciiDigit(aInString[2])) ||
- (3 < aInLength && aInString[2] == '-' && nsCRT::IsAsciiDigit(aInString[3]))
- )
- )
- {
- // Find first non-digit
- int32_t delimPos = 3; // skip "^" and first digit (or '-')
- for (; delimPos < aInLength
- &&
- (
- nsCRT::IsAsciiDigit(aInString[delimPos]) ||
- (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
- nsCRT::IsAsciiDigit(aInString[delimPos + 1]))
- );
- delimPos++)
- ;
- if (delimPos < aInLength && nsCRT::IsAsciiAlpha(aInString[delimPos]))
- {
- return false;
- }
- outputHTML.Truncate();
- outputHTML += text0;
- outputHTML.AppendLiteral(
- "<sup class=\"moz-txt-sup\">"
- "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
- "^</span>");
- aOutputString.Append(outputHTML);
- aOutputString.Append(&aInString[2], delimPos - 2);
- aOutputString.AppendLiteral("</sup>");
- glyphTextLen = delimPos /* - 1 + 1 */ ;
- return true;
- }
- /*
- The following strings are not substituted:
- |TXT |HTML |Reason
- +------+---------+----------
- -> ← Bug #454
- => ⇐ dito
- <- → dito
- <= ⇒ dito
- (tm) ™ dito
- 1/4 ¼ is triggered by 1/4 Part 1, 2/4 Part 2, ...
- 3/4 ¾ dito
- 1/2 ½ similar
- */
- return false;
- }
- /***************************************************************************
- Library-internal Interface
- ****************************************************************************/
// Constructor. Does no work of its own; mIOService is obtained lazily in
// CheckURLAndCreateHTML.
- mozTXTToHTMLConv::mozTXTToHTMLConv()
- {
- }
// Destructor. Nothing is released explicitly here.
- mozTXTToHTMLConv::~mozTXTToHTMLConv()
- {
- }
// nsISupports boilerplate generated by the NS_IMPL_ISUPPORTS macro, naming the
// interfaces listed below for this class.
- NS_IMPL_ISUPPORTS(mozTXTToHTMLConv,
- mozITXTToHTMLConv,
- nsIStreamConverter,
- nsIStreamListener,
- nsIRequestObserver)
- int32_t
- mozTXTToHTMLConv::CiteLevelTXT(const char16_t *line,
- uint32_t& logLineStart)
- {
- int32_t result = 0;
- int32_t lineLength = NS_strlen(line);
- bool moreCites = true;
- while (moreCites)
- {
- /* E.g. the following lines count as quote:
- > text
- //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
- >text
- //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
- > text
- ] text
- USER> text
- USER] text
- //#endif
- logLineStart is the position of "t" in this example
- */
- uint32_t i = logLineStart;
- #ifdef QUOTE_RECOGNITION_AGGRESSIVE
- for (; int32_t(i) < lineLength && IsSpace(line[i]); i++)
- ;
- for (; int32_t(i) < lineLength && nsCRT::IsAsciiAlpha(line[i])
- && nsCRT::IsUpper(line[i]) ; i++)
- ;
- if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
- #else
- if (int32_t(i) < lineLength && line[i] == '>')
- #endif
- {
- i++;
- if (int32_t(i) < lineLength && line[i] == ' ')
- i++;
- // sendmail/mbox
- // Placed here for performance increase
- const char16_t * indexString = &line[logLineStart];
- // here, |logLineStart < lineLength| is always true
- uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
- if (Substring(indexString,
- indexString+minlength).Equals(Substring(NS_LITERAL_STRING(">From "), 0, minlength),
- nsCaseInsensitiveStringComparator()))
- //XXX RFC2646
- moreCites = false;
- else
- {
- result++;
- logLineStart = i;
- }
- }
- else
- moreCites = false;
- }
- return result;
- }
- void
- mozTXTToHTMLConv::ScanTXT(const char16_t * aInString, int32_t aInStringLength, uint32_t whattodo, nsString& aOutString)
- {
- bool doURLs = 0 != (whattodo & kURLs);
- bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
- bool doStructPhrase = 0 != (whattodo & kStructPhrase);
- uint32_t structPhrase_strong = 0; // Number of currently open tags
- uint32_t structPhrase_underline = 0;
- uint32_t structPhrase_italic = 0;
- uint32_t structPhrase_code = 0;
- nsAutoString outputHTML; // moved here for performance increase
- for(uint32_t i = 0; int32_t(i) < aInStringLength;)
- {
- if (doGlyphSubstitution)
- {
- int32_t glyphTextLen;
- if (GlyphHit(&aInString[i], aInStringLength - i, i == 0, aOutString, glyphTextLen))
- {
- i += glyphTextLen;
- continue;
- }
- }
- if (doStructPhrase)
- {
- const char16_t * newOffset = aInString;
- int32_t newLength = aInStringLength;
- if (i > 0 ) // skip the first element?
- {
- newOffset = &aInString[i-1];
- newLength = aInStringLength - i + 1;
- }
- switch (aInString[i]) // Performance increase
- {
- case '*':
- if (StructPhraseHit(newOffset, newLength, i == 0,
- u"*", 1,
- "b", "class=\"moz-txt-star\"",
- aOutString, structPhrase_strong))
- {
- i++;
- continue;
- }
- break;
- case '/':
- if (StructPhraseHit(newOffset, newLength, i == 0,
- u"/", 1,
- "i", "class=\"moz-txt-slash\"",
- aOutString, structPhrase_italic))
- {
- i++;
- continue;
- }
- break;
- case '_':
- if (StructPhraseHit(newOffset, newLength, i == 0,
- u"_", 1,
- "span" /* <u> is deprecated */,
- "class=\"moz-txt-underscore\"",
- aOutString, structPhrase_underline))
- {
- i++;
- continue;
- }
- break;
- case '|':
- if (StructPhraseHit(newOffset, newLength, i == 0,
- u"|", 1,
- "code", "class=\"moz-txt-verticalline\"",
- aOutString, structPhrase_code))
- {
- i++;
- continue;
- }
- break;
- }
- }
- if (doURLs)
- {
- switch (aInString[i])
- {
- case ':':
- case '@':
- case '.':
- if ( (i == 0 || ((i > 0) && aInString[i - 1] != ' ')) && aInString[i +1] != ' ') // Performance increase
- {
- int32_t replaceBefore;
- int32_t replaceAfter;
- if (FindURL(aInString, aInStringLength, i, whattodo,
- outputHTML, replaceBefore, replaceAfter)
- && structPhrase_strong + structPhrase_italic +
- structPhrase_underline + structPhrase_code == 0
- /* workaround for bug #19445 */ )
- {
- aOutString.Cut(aOutString.Length() - replaceBefore, replaceBefore);
- aOutString += outputHTML;
- i += replaceAfter + 1;
- continue;
- }
- }
- break;
- } //switch
- }
- switch (aInString[i])
- {
- // Special symbols
- case '<':
- case '>':
- case '&':
- EscapeChar(aInString[i], aOutString, false);
- i++;
- break;
- // Normal characters
- default:
- aOutString += aInString[i];
- i++;
- break;
- }
- }
- }
- void
- mozTXTToHTMLConv::ScanHTML(nsString& aInString, uint32_t whattodo, nsString &aOutString)
- {
- // some common variables we were recalculating
- // every time inside the for loop...
- int32_t lengthOfInString = aInString.Length();
- const char16_t * uniBuffer = aInString.get();
- #ifdef DEBUG_BenB_Perf
- PRTime parsing_start = PR_IntervalNow();
- #endif
- // Look for simple entities not included in a tags and scan them.
- // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"),
- // comment tag ("<!--[...]-->"), style tag, script tag or head tag.
- // Unescape the rest (text between tags) and pass it to ScanTXT.
- nsAutoCString canFollow(" \f\n\r\t>");
- for (int32_t i = 0; i < lengthOfInString;)
- {
- if (aInString[i] == '<') // html tag
- {
- int32_t start = i;
- if (i + 2 < lengthOfInString &&
- nsCRT::ToLower(aInString[i + 1]) == 'a' &&
- canFollow.FindChar(aInString[i + 2]) != kNotFound)
- // if a tag, skip until </a>.
- // Make sure there's a white-space character after, not to match "abbr".
- {
- i = aInString.Find("</a>", true, i);
- if (i == kNotFound)
- i = lengthOfInString;
- else
- i += 4;
- }
- else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--"))
- // if out-commended code, skip until -->
- {
- i = aInString.Find("-->", false, i);
- if (i == kNotFound)
- i = lengthOfInString;
- else
- i += 3;
- }
- else if (i + 6 < lengthOfInString &&
- Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") &&
- canFollow.FindChar(aInString[i + 6]) != kNotFound)
- // if style tag, skip until </style>
- {
- i = aInString.Find("</style>", true, i);
- if (i == kNotFound)
- i = lengthOfInString;
- else
- i += 8;
- }
- else if (i + 7 < lengthOfInString &&
- Substring(aInString, i + 1, 6).LowerCaseEqualsASCII("script") &&
- canFollow.FindChar(aInString[i + 7]) != kNotFound)
- // if script tag, skip until </script>
- {
- i = aInString.Find("</script>", true, i);
- if (i == kNotFound)
- i = lengthOfInString;
- else
- i += 9;
- }
- else if (i + 5 < lengthOfInString &&
- Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") &&
- canFollow.FindChar(aInString[i + 5]) != kNotFound)
- // if head tag, skip until </head>
- // Make sure not to match <header>.
- {
- i = aInString.Find("</head>", true, i);
- if (i == kNotFound)
- i = lengthOfInString;
- else
- i += 7;
- }
- else // just skip tag (attributes etc.)
- {
- i = aInString.FindChar('>', i);
- if (i == kNotFound)
- i = lengthOfInString;
- else
- i++;
- }
- aOutString.Append(&uniBuffer[start], i - start);
- }
- else
- {
- uint32_t start = uint32_t(i);
- i = aInString.FindChar('<', i);
- if (i == kNotFound)
- i = lengthOfInString;
-
- nsString tempString;
- tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate));
- UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString);
- ScanTXT(tempString.get(), tempString.Length(), whattodo, aOutString);
- }
- }
- #ifdef DEBUG_BenB_Perf
- printf("ScanHTML time: %d ms\n", PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
- #endif
- }
- /****************************************************************************
- XPCOM Interface
- *****************************************************************************/
- NS_IMETHODIMP
- mozTXTToHTMLConv::Convert(nsIInputStream *aFromStream,
- const char *aFromType,
- const char *aToType,
- nsISupports *aCtxt, nsIInputStream **_retval)
- {
- return NS_ERROR_NOT_IMPLEMENTED;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::AsyncConvertData(const char *aFromType,
- const char *aToType,
- nsIStreamListener *aListener, nsISupports *aCtxt) {
- return NS_ERROR_NOT_IMPLEMENTED;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsISupports *ctxt,
- nsIInputStream *inStr, uint64_t sourceOffset,
- uint32_t count)
- {
- return NS_ERROR_NOT_IMPLEMENTED;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::OnStartRequest(nsIRequest* request, nsISupports *ctxt)
- {
- return NS_ERROR_NOT_IMPLEMENTED;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsISupports *ctxt,
- nsresult aStatus)
- {
- return NS_ERROR_NOT_IMPLEMENTED;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::CiteLevelTXT(const char16_t *line, uint32_t *logLineStart,
- uint32_t *_retval)
- {
- if (!logLineStart || !_retval || !line)
- return NS_ERROR_NULL_POINTER;
- *_retval = CiteLevelTXT(line, *logLineStart);
- return NS_OK;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::ScanTXT(const char16_t *text, uint32_t whattodo,
- char16_t **_retval)
- {
- NS_ENSURE_ARG(text);
- // FIX ME!!!
- nsString outString;
- int32_t inLength = NS_strlen(text);
- // by setting a large capacity up front, we save time
- // when appending characters to the output string because we don't
- // need to reallocate and re-copy the characters already in the out String.
- NS_ASSERTION(inLength, "ScanTXT passed 0 length string");
- if (inLength == 0) {
- *_retval = NS_strdup(text);
- return NS_OK;
- }
- outString.SetCapacity(uint32_t(inLength * growthRate));
- ScanTXT(text, inLength, whattodo, outString);
- *_retval = ToNewUnicode(outString);
- return *_retval ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
- }
- NS_IMETHODIMP
- mozTXTToHTMLConv::ScanHTML(const char16_t *text, uint32_t whattodo,
- char16_t **_retval)
- {
- NS_ENSURE_ARG(text);
- // FIX ME!!!
- nsString outString;
- nsString inString (text); // look at this nasty extra copy of the entire input buffer!
- outString.SetCapacity(uint32_t(inString.Length() * growthRate));
- ScanHTML(inString, whattodo, outString);
- *_retval = ToNewUnicode(outString);
- return *_retval ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
- }
- nsresult
- MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv)
- {
- NS_PRECONDITION(aConv != nullptr, "null ptr");
- if (!aConv)
- return NS_ERROR_NULL_POINTER;
- *aConv = new mozTXTToHTMLConv();
- if (!*aConv)
- return NS_ERROR_OUT_OF_MEMORY;
- NS_ADDREF(*aConv);
- // return (*aConv)->Init();
- return NS_OK;
- }
|