/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifdef DEBUG_BenB_Perf # include "prtime.h" # include "prinrval.h" #endif
using mozilla::IsAscii; using mozilla::IsAsciiAlpha; using mozilla::IsAsciiDigit; using mozilla::Maybe; using mozilla::Some; using mozilla::Span; using mozilla::intl::GraphemeClusterBreakIteratorUtf16; using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16;
// Multiplicative growth factor.
// NOTE(review): presumably used to pre-size output buffers relative to the
// input length -- confirm at the use site.
const double growthRate = 1.2;
// Bug 183111, editor now replaces multiple spaces with leading // 0xA0's and a single ending space, so need to treat 0xA0's as spaces. // 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)" // Also recognize the Japanese ideographic space 0x3000 as a space. staticinlinebool IsSpace(const char16_t aChar) { return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
}
// Escape Char will take ch, escape it and append the result to // aStringToAppendTo void mozTXTToHTMLConv::EscapeChar(const char16_t ch,
nsAString& aStringToAppendTo, bool inAttribute) { switch (ch) { case'<':
aStringToAppendTo.AppendLiteral("<"); break; case'>':
aStringToAppendTo.AppendLiteral(">"); break; case'&':
aStringToAppendTo.AppendLiteral("&"); break; case'"': if (inAttribute) {
aStringToAppendTo.AppendLiteral("""); break;
} // else fall through
[[fallthrough]]; default:
aStringToAppendTo += ch;
}
}
// EscapeStr takes the passed in string and // escapes it IN PLACE. void mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute) { // the replace substring routines // don't seem to work if you have a character // in the in string that is also in the replacement // string! =( // aInString.ReplaceSubstring("&", "&"); // aInString.ReplaceSubstring("<", "<"); // aInString.ReplaceSubstring(">", ">"); for (uint32_t i = 0; i < aInString.Length();) { switch (aInString[i]) { case'<':
aInString.Cut(i, 1);
aInString.InsertLiteral(u"<", i);
i += 4; // skip past the integers we just added break; case'>':
aInString.Cut(i, 1);
aInString.InsertLiteral(u">", i);
i += 4; // skip past the integers we just added break; case'&':
aInString.Cut(i, 1);
aInString.InsertLiteral(u"&", i);
i += 5; // skip past the integers we just added break; case'"': if (inAttribute) {
aInString.Cut(i, 1);
aInString.InsertLiteral(u""", i);
i += 6; break;
} // else fall through
[[fallthrough]]; default:
i++;
}
}
}
void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t* aInString,
int32_t aInLength, const uint32_t pos,
nsString& aOutString) {
NS_ASSERTION(int32_t(pos) < aInLength, "bad args to CompleteAbbreviatedURL, see bug #190851"); if (int32_t(pos) >= aInLength) return;
if (aInString[pos] == '@') { // only pre-pend a mailto url if the string contains a .domain in it.. // i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
nsDependentString inString(aInString, aInLength); if (inString.FindChar('.', pos) !=
kNotFound) // if we have a '.' after the @ sign....
{
aOutString.AssignLiteral("mailto:");
aOutString += aInString;
}
} elseif (aInString[pos] == '.') { if (ItMatchesDelimited(aInString, aInLength, u"www.", 4, LT_IGNORE,
LT_IGNORE)) {
aOutString.AssignLiteral("http://");
aOutString += aInString;
}
}
}
bool mozTXTToHTMLConv::FindURLStart(const char16_t* aInString,
int32_t aInLength, const uint32_t pos, const modetype check, uint32_t& start) { switch (check) { // no breaks, because end of blocks is never reached case RFC1738: { if (!NS_strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u", 5)) {
start = pos + 1; returntrue;
} returnfalse;
} case RFC2396E: {
nsDependentSubstring temp(aInString, aInLength);
int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1); if (i != kNotFound &&
(temp[uint32_t(i)] == '<' || temp[uint32_t(i)] == '"')) {
start = uint32_t(++i); return start < pos;
} returnfalse;
} case freetext: {
int32_t i = pos - 1; for (; i >= 0 &&
(IsAsciiAlpha(aInString[uint32_t(i)]) ||
IsAsciiDigit(aInString[uint32_t(i)]) ||
aInString[uint32_t(i)] == '+' || aInString[uint32_t(i)] == '-' ||
aInString[uint32_t(i)] == '.');
i--) {
;
} if (++i >= 0 && uint32_t(i) < pos &&
IsAsciiAlpha(aInString[uint32_t(i)])) {
start = uint32_t(i); returntrue;
} returnfalse;
} case abbreviated: {
int32_t i = pos - 1; // This disallows non-ascii-characters for email. // Currently correct, but revisit later after standards changed. bool isEmail = aInString[pos] == (char16_t)'@'; // These chars mark the start of the URL for (; i >= 0 && aInString[uint32_t(i)] != '>' &&
aInString[uint32_t(i)] != '<' && aInString[uint32_t(i)] != '"' &&
aInString[uint32_t(i)] != '\'' && aInString[uint32_t(i)] != '`' &&
aInString[uint32_t(i)] != ',' && aInString[uint32_t(i)] != '{' &&
aInString[uint32_t(i)] != '[' && aInString[uint32_t(i)] != '(' &&
aInString[uint32_t(i)] != '|' && aInString[uint32_t(i)] != '\\' &&
!IsSpace(aInString[uint32_t(i)]) &&
(!isEmail || IsAscii(aInString[uint32_t(i)])) &&
(!isEmail || aInString[uint32_t(i)] != ')');
i--) {
;
} if (++i >= 0 && uint32_t(i) < pos &&
(IsAsciiAlpha(aInString[uint32_t(i)]) ||
IsAsciiDigit(aInString[uint32_t(i)]))) {
start = uint32_t(i); returntrue;
} returnfalse;
} default: returnfalse;
} // switch
}
bool mozTXTToHTMLConv::FindURLEnd(const char16_t* aInString,
int32_t aInStringLength, const uint32_t pos, const modetype check, const uint32_t start,
uint32_t& end) { switch (check) { // no breaks, because end of blocks is never reached case RFC1738: case RFC2396E: {
nsDependentSubstring temp(aInString, aInStringLength);
int32_t i = temp.FindCharInSet(u"<>\"", pos + 1); if (i != kNotFound &&
temp[uint32_t(i--)] ==
(check == RFC1738 || temp[start - 1] == '<' ? '>' : '"')) {
end = uint32_t(i); return end > pos;
} returnfalse;
} case freetext: case abbreviated: {
uint32_t i = pos + 1; bool isEmail = aInString[pos] == (char16_t)'@'; bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL bool seenOpeningSquareBracket = false; // there is a '[' earlier in the URL for (; int32_t(i) < aInStringLength; i++) { // These chars mark the end of the URL if (aInString[i] == '>' || aInString[i] == '<' || aInString[i] == '"' ||
aInString[i] == '`' || aInString[i] == '}' || aInString[i] == '{' ||
(aInString[i] == ')' && !seenOpeningParenthesis) ||
(aInString[i] == ']' && !seenOpeningSquareBracket) || // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
(aInString[i] == '[' && i > 2 &&
(aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
IsSpace(aInString[i])) { break;
} // Disallow non-ascii-characters for email. // Currently correct, but revisit later after standards changed. if (isEmail && (aInString[i] == '(' || aInString[i] == '\'' ||
!IsAscii(aInString[i]))) { break;
} if (aInString[i] == '(') seenOpeningParenthesis = true; if (aInString[i] == '[') seenOpeningSquareBracket = true;
} // These chars are allowed in the middle of the URL, but not at end. // Technically they are, but are used in normal text after the URL. while (--i > pos && (aInString[i] == '.' || aInString[i] == ',' ||
aInString[i] == ';' || aInString[i] == '!' ||
aInString[i] == '?' || aInString[i] == '-' ||
aInString[i] == ':' || aInString[i] == '\'')) {
;
} if (i > pos) {
end = i; returntrue;
} returnfalse;
} default: returnfalse;
} // switch
}
void mozTXTToHTMLConv::CalculateURLBoundaries( const char16_t* aInString, int32_t aInStringLength, const uint32_t pos, const uint32_t whathasbeendone, const modetype check, const uint32_t start, const uint32_t end, nsString& txtURL, nsString& desc,
int32_t& replaceBefore, int32_t& replaceAfter) {
uint32_t descstart = start; switch (check) { case RFC1738: {
descstart = start - 5;
desc.Append(&aInString[descstart],
end - descstart + 2); // include "<URL:" and ">"
replaceAfter = end - pos + 1;
} break; case RFC2396E: {
descstart = start - 1;
desc.Append(&aInString[descstart],
end - descstart + 2); // include brackets
replaceAfter = end - pos + 1;
} break; case freetext: case abbreviated: {
descstart = start;
desc.Append(&aInString[descstart],
end - start + 1); // don't include brackets
replaceAfter = end - pos;
} break; default: break;
} // switch
EscapeStr(desc, false);
txtURL.Append(&aInString[start], end - start + 1);
txtURL.StripWhitespace();
// Get the handler for this scheme.
nsCOMPtr<nsIProtocolHandler> handler;
rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler)); if (NS_FAILED(rv)) returnfalse;
// Is it an external protocol handler? If not, linkify it.
nsCOMPtr<nsIExternalProtocolHandler> externalHandler =
do_QueryInterface(handler); if (!externalHandler) returntrue; // handler is built-in, linkify it!
// If external app exists for the scheme then linkify it. bool exists;
rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists); return (NS_SUCCEEDED(rv) && exists);
}
// See if the url should be linkified.
NS_ConvertUTF16toUTF8 utf8URL(txtURL); if (!ShouldLinkify(utf8URL)) returnfalse;
// it would be faster if we could just check to see if there is a protocol // handler for the url and return instead of actually trying to create a // url...
rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));
statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode /* I don't like this abuse of enums as index for the array,
but I don't know a better method */
// Define, which modes to check /* all modes but abbreviated are checked for text[pos] == ':',
only abbreviated for '.', RFC2396E and abbreviated for '@' */ for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
iState = modetype(iState + 1)) {
state[iState] = aInString[pos] == ':' ? unchecked : invalid;
} switch (aInString[pos]) { case'@':
state[RFC2396E] = unchecked;
[[fallthrough]]; case'.':
state[abbreviated] = unchecked; break; case':':
state[abbreviated] = invalid; break; default: break;
}
// Test, first successful mode wins, sequence defined by |ranking|
int32_t iCheck = 0; // the currently tested modetype
modetype check = ranking[iCheck]; for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
iCheck++) /* check state from last run.
If this is the first, check this one, which isn't = success yet */
{
check = ranking[iCheck];
uint32_t start, end;
if (state[check] == unchecked) { if (FindURLStart(aInString, aInLength, pos, check, start)) {
state[check] = startok;
}
}
if (state[check] == startok) { if (FindURLEnd(aInString, aInLength, pos, check, start, end)) {
state[check] = endok;
}
}
bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
int32_t aInLength, const char16_t* rep, int32_t aRepLen,
LIMTYPE before, LIMTYPE after) { // this little method gets called a LOT. I found we were spending a // lot of time just calculating the length of the variable "rep" // over and over again every time we called it. So we're now passing // an integer in here.
int32_t textLen = aInLength;
// Limit lookahead length to avoid pathological O(n^2) behavior; looking so // far ahead is unlikely to be important for cases where styling marked-up // fragments is actually useful anyhow. const uint32_t len =
std::min(2000u, mozilla::AssertedCast<uint32_t>(aInStringLength));
GraphemeClusterBreakIteratorUtf16 ci(Span<const char16_t>(aInString, len)); for (uint32_t pos = 0; pos < len; pos = *ci.Next()) { if (ItMatchesDelimited(aInString + pos, aInStringLength - pos, rep, aRepLen,
before, after)) {
result++;
}
} return result;
}
// NOTE: the converted html for the phrase is appended to aOutString // tagHTML and attributeHTML are plain ASCII (literal strings, in fact) bool mozTXTToHTMLConv::StructPhraseHit( const char16_t* aInString, int32_t aInStringLength, bool col0, const char16_t* tagTXT, int32_t aTagTXTLen, constchar* tagHTML, constchar* attributeHTML, nsAString& aOutString, uint32_t& openTags) { /* We're searching for the following pattern: LT_DELIMITER - "*" - ALPHA - [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER. <strong> is only inserted, if existence of a pair could be verified
We use the first opening/closing tag, if we can choose */
const char16_t* newOffset = aInString;
int32_t newLength = aInStringLength; if (!col0) // skip the first element?
{
newOffset = &aInString[1];
newLength = aInStringLength - 1;
}
// the glyph is appended to aOutputString instead of the original string... bool mozTXTToHTMLConv::GlyphHit(const char16_t* aInString, int32_t aInLength, bool col0, nsAString& aOutputString,
int32_t& glyphTextLen) {
char16_t text0 = aInString[0];
char16_t text1 = aInString[1];
char16_t firstChar = (col0 ? text0 : text1);
// temporary variable used to store the glyph html text
nsAutoString outputHTML; bool bTestSmilie; bool bArg = false; int i;
// refactor some of this mess to avoid code duplication and speed execution a // bit there are two cases that need to be tried one after another. To avoid a // lot of duplicate code, rolling into a loop
i = 0; while (i < 2) {
bTestSmilie = false; if (!i && (firstChar == ':' || firstChar == ';' || firstChar == '=' ||
firstChar == '>' || firstChar == '8' || firstChar == 'O')) { // first test passed
bTestSmilie = true;
bArg = col0;
} if (i && col0 &&
(text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' ||
text1 == '8' || text1 == 'O')) { // second test passed
glyphTextLen = delimPos /* - 1 + 1 */; returntrue;
} /* The following strings are not substituted: |TXT |HTML |Reason +------+---------+---------- -> ← Bug #454 => ⇐ dito <- → dito <= ⇒ dito (tm) ™ dito 1/4 ¼ is triggered by 1/4 Part 1, 2/4 Part 2, ... 3/4 ¾ dito 1/2 ½ similar
*/ returnfalse;
}
// some common variables we were recalculating // every time inside the for loop...
int32_t lengthOfInString = aInString.Length(); const char16_t* uniBuffer = aInString.get();
// Look for simple entities not included in a tags and scan them. // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"), // comment tag ("<!--[...]-->"), style tag, script tag or head tag. // Unescape the rest (text between tags) and pass it to ScanTXT.
nsAutoCString canFollow(" \f\n\r\t>"); for (int32_t i = 0; i < lengthOfInString;) { if (aInString[i] == '<') // html tag
{
int32_t start = i; if (i + 2 < lengthOfInString && nsCRT::ToLower(aInString[i + 1]) == 'a' &&
canFollow.FindChar(aInString[i + 2]) != kNotFound) // if a tag, skip until </a>. // Make sure there's a white-space character after, not to match "abbr".
{
i = aInString.LowerCaseFindASCII("", i); if (i == kNotFound) {
i = lengthOfInString;
} else {
i += 4;
}
} elseif (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--")) // if out-commended code, skip until -->
{
i = aInString.Find(u"-->", i); if (i == kNotFound) {
i = lengthOfInString;
} else {
i += 3;
}
} elseif (i + 6 < lengthOfInString &&
Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") &&
canFollow.FindChar(aInString[i + 6]) != kNotFound) // if style tag, skip until </style>
{
i = aInString.LowerCaseFindASCII("", i); if (i == kNotFound) {
i = lengthOfInString;
} else {
i += 8;
}
} elseif (i + 7 < lengthOfInString &&
Substring(aInString, i + 1, 6)
.LowerCaseEqualsASCII("script") &&
canFollow.FindChar(aInString[i + 7]) != kNotFound) // if script tag, skip until </script>
{
i = aInString.LowerCaseFindASCII("", i); if (i == kNotFound) {
i = lengthOfInString;
} else {
i += 9;
}
} elseif (i + 5 < lengthOfInString &&
Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") &&
canFollow.FindChar(aInString[i + 5]) != kNotFound) // if head tag, skip until </head> // Make sure not to match <header>.
{
i = aInString.LowerCaseFindASCII("", i); if (i == kNotFound) {
i = lengthOfInString;
} else {
i += 7;
}
} else// just skip tag (attributes etc.)
{
i = aInString.FindChar('>', i); if (i == kNotFound) {
i = lengthOfInString;
} else {
i++;
}
}
aOutString.Append(&uniBuffer[start], i - start);
} else {
uint32_t start = uint32_t(i);
i = aInString.FindChar('<', i); if (i == kNotFound) i = lengthOfInString;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.