Quelle mozTXTToHTMLConv.cpp

Sprache: C

/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/TextUtils.h"
#include "mozTXTToHTMLConv.h"
#include "mozilla/intl/Segmenter.h"
#include "mozilla/Maybe.h"
#include "nsIThreadRetargetableStreamListener.h"
#include "nsNetUtil.h"
#include "nsUnicharUtils.h"
#include "nsUnicodeProperties.h"
#include "nsCRT.h"
#include "nsIExternalProtocolHandler.h"
#include "nsIURI.h"

#include <algorithm>

#ifdef DEBUG_BenB_Perf
#  include "prtime.h"
#  include "prinrval.h"
#endif

using mozilla::IsAscii;
using mozilla::IsAsciiAlpha;
using mozilla::IsAsciiDigit;
using mozilla::Maybe;
using mozilla::Some;
using mozilla::Span;
using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16;

const double growthRate = 1.2;

// Bug 183111, editor now replaces multiple spaces with leading
// 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
// 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
// Also recognize the Japanese ideographic space 0x3000 as a space.
static inline bool IsSpace(const char16_t aChar) {
  return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
}

// Escape Char will take ch, escape it and append the result to
// aStringToAppendTo
void mozTXTToHTMLConv::EscapeChar(const char16_t ch,
                                  nsAString& aStringToAppendTo,
                                  bool inAttribute) {
  switch (ch) {
    case '<':
      aStringToAppendTo.AppendLiteral("<");
      break;
    case '>':
      aStringToAppendTo.AppendLiteral(">");
      break;
    case '&':
      aStringToAppendTo.AppendLiteral("&");
      break;
    case '"':
      if (inAttribute) {
        aStringToAppendTo.AppendLiteral(""");
        break;
      }
      // else fall through
      [[fallthrough]];
    default:
      aStringToAppendTo += ch;
  }
}

// EscapeStr takes the passed in string and
// escapes it IN PLACE.
void mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute) {
  // the replace substring routines
  // don't seem to work if you have a character
  // in the in string that is also in the replacement
  // string! =(
  // aInString.ReplaceSubstring("&", "&");
  // aInString.ReplaceSubstring("<", "<");
  // aInString.ReplaceSubstring(">", ">");
  for (uint32_t i = 0; i < aInString.Length();) {
    switch (aInString[i]) {
      case '<':
        aInString.Cut(i, 1);
        aInString.InsertLiteral(u"<", i);
        i += 4;  // skip past the integers we just added
        break;
      case '>':
        aInString.Cut(i, 1);
        aInString.InsertLiteral(u">", i);
        i += 4;  // skip past the integers we just added
        break;
      case '&':
        aInString.Cut(i, 1);
        aInString.InsertLiteral(u"&", i);
        i += 5;  // skip past the integers we just added
        break;
      case '"':
        if (inAttribute) {
          aInString.Cut(i, 1);
          aInString.InsertLiteral(u""", i);
          i += 6;
          break;
        }
        // else fall through
        [[fallthrough]];
      default:
        i++;
    }
  }
}

void mozTXTToHTMLConv::UnescapeStr(const char16_t* aInString, int32_t aStartPos,
                                   int32_t aLength, nsString& aOutString) {
  const char16_t* subString = nullptr;
  for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;) {
    int32_t remainingChars = i - aStartPos;
    if (aInString[i] == '&') {
      subString = &aInString[i];
      if (!NS_strncmp(subString, u"<",
                      std::min(4, aLength - remainingChars))) {
        aOutString.Append(char16_t('<'));
        i += 4;
      } else if (!NS_strncmp(subString, u">",
                             std::min(4, aLength - remainingChars))) {
        aOutString.Append(char16_t('>'));
        i += 4;
      } else if (!NS_strncmp(subString, u"&",
                             std::min(5, aLength - remainingChars))) {
        aOutString.Append(char16_t('&'));
        i += 5;
      } else if (!NS_strncmp(subString, u""",
                             std::min(6, aLength - remainingChars))) {
        aOutString.Append(char16_t('"'));
        i += 6;
      } else {
        aOutString += aInString[i];
        i++;
      }
    } else {
      aOutString += aInString[i];
      i++;
    }
  }
}

void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t* aInString,
                                              int32_t aInLength,
                                              const uint32_t pos,
                                              nsString& aOutString) {
  NS_ASSERTION(int32_t(pos) < aInLength,
               "bad args to CompleteAbbreviatedURL, see bug #190851");
  if (int32_t(pos) >= aInLength) return;

  if (aInString[pos] == '@') {
    // only pre-pend a mailto url if the string contains a .domain in it..
    // i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
    nsDependentString inString(aInString, aInLength);
    if (inString.FindChar('.', pos) !=
        kNotFound)  // if we have a '.' after the @ sign....
    {
      aOutString.AssignLiteral("mailto:");
      aOutString += aInString;
    }
  } else if (aInString[pos] == '.') {
    if (ItMatchesDelimited(aInString, aInLength, u"www.", 4, LT_IGNORE,
                           LT_IGNORE)) {
      aOutString.AssignLiteral("http://");
      aOutString += aInString;
    }
  }
}

bool mozTXTToHTMLConv::FindURLStart(const char16_t* aInString,
                                    int32_t aInLength, const uint32_t pos,
                                    const modetype check, uint32_t& start) {
  switch (check) {  // no breaks, because end of blocks is never reached
    case RFC1738: {
      if (!NS_strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u"<URL:", 5)) {
        start = pos + 1;
        return true;
      }
      return false;
    }
    case RFC2396E: {
      nsDependentSubstring temp(aInString, aInLength);
      int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1);
      if (i != kNotFound &&
          (temp[uint32_t(i)] == '<' || temp[uint32_t(i)] == '"')) {
        start = uint32_t(++i);
        return start < pos;
      }
      return false;
    }
    case freetext: {
      int32_t i = pos - 1;
      for (; i >= 0 &&
             (IsAsciiAlpha(aInString[uint32_t(i)]) ||
              IsAsciiDigit(aInString[uint32_t(i)]) ||
              aInString[uint32_t(i)] == '+' || aInString[uint32_t(i)] == '-' ||
              aInString[uint32_t(i)] == '.');
           i--) {
        ;
      }
      if (++i >= 0 && uint32_t(i) < pos &&
          IsAsciiAlpha(aInString[uint32_t(i)])) {
        start = uint32_t(i);
        return true;
      }
      return false;
    }
    case abbreviated: {
      int32_t i = pos - 1;
      // This disallows non-ascii-characters for email.
      // Currently correct, but revisit later after standards changed.
      bool isEmail = aInString[pos] == (char16_t)'@';
      // These chars mark the start of the URL
      for (; i >= 0 && aInString[uint32_t(i)] != '>' &&
             aInString[uint32_t(i)] != '<' && aInString[uint32_t(i)] != '"' &&
             aInString[uint32_t(i)] != '\'' && aInString[uint32_t(i)] != '`' &&
             aInString[uint32_t(i)] != ',' && aInString[uint32_t(i)] != '{' &&
             aInString[uint32_t(i)] != '[' && aInString[uint32_t(i)] != '(' &&
             aInString[uint32_t(i)] != '|' && aInString[uint32_t(i)] != '\\' &&
             !IsSpace(aInString[uint32_t(i)]) &&
             (!isEmail || IsAscii(aInString[uint32_t(i)])) &&
             (!isEmail || aInString[uint32_t(i)] != ')');
           i--) {
        ;
      }
      if (++i >= 0 && uint32_t(i) < pos &&
          (IsAsciiAlpha(aInString[uint32_t(i)]) ||
           IsAsciiDigit(aInString[uint32_t(i)]))) {
        start = uint32_t(i);
        return true;
      }
      return false;
    }
    default:
      return false;
  }  // switch
}

bool mozTXTToHTMLConv::FindURLEnd(const char16_t* aInString,
                                  int32_t aInStringLength, const uint32_t pos,
                                  const modetype check, const uint32_t start,
                                  uint32_t& end) {
  switch (check) {  // no breaks, because end of blocks is never reached
    case RFC1738:
    case RFC2396E: {
      nsDependentSubstring temp(aInString, aInStringLength);

      int32_t i = temp.FindCharInSet(u"<>\"", pos + 1);
      if (i != kNotFound &&
          temp[uint32_t(i--)] ==
              (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"')) {
        end = uint32_t(i);
        return end > pos;
      }
      return false;
    }
    case freetext:
    case abbreviated: {
      uint32_t i = pos + 1;
      bool isEmail = aInString[pos] == (char16_t)'@';
      bool seenOpeningParenthesis = false;  // there is a '(' earlier in the URL
      bool seenOpeningSquareBracket =
          false;  // there is a '[' earlier in the URL
      for (; int32_t(i) < aInStringLength; i++) {
        // These chars mark the end of the URL
        if (aInString[i] == '>' || aInString[i] == '<' || aInString[i] == '"' ||
            aInString[i] == '`' || aInString[i] == '}' || aInString[i] == '{' ||
            (aInString[i] == ')' && !seenOpeningParenthesis) ||
            (aInString[i] == ']' && !seenOpeningSquareBracket) ||
            // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
            (aInString[i] == '[' && i > 2 &&
             (aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
            IsSpace(aInString[i])) {
          break;
        }
        // Disallow non-ascii-characters for email.
        // Currently correct, but revisit later after standards changed.
        if (isEmail && (aInString[i] == '(' || aInString[i] == '\'' ||
                        !IsAscii(aInString[i]))) {
          break;
        }
        if (aInString[i] == '(') seenOpeningParenthesis = true;
        if (aInString[i] == '[') seenOpeningSquareBracket = true;
      }
      // These chars are allowed in the middle of the URL, but not at end.
      // Technically they are, but are used in normal text after the URL.
      while (--i > pos && (aInString[i] == '.' || aInString[i] == ',' ||
                           aInString[i] == ';' || aInString[i] == '!' ||
                           aInString[i] == '?' || aInString[i] == '-' ||
                           aInString[i] == ':' || aInString[i] == '\'')) {
        ;
      }
      if (i > pos) {
        end = i;
        return true;
      }
      return false;
    }
    default:
      return false;
  }  // switch
}

void mozTXTToHTMLConv::CalculateURLBoundaries(
    const char16_t* aInString, int32_t aInStringLength, const uint32_t pos,
    const uint32_t whathasbeendone, const modetype check, const uint32_t start,
    const uint32_t end, nsString& txtURL, nsString& desc,
    int32_t& replaceBefore, int32_t& replaceAfter) {
  uint32_t descstart = start;
  switch (check) {
    case RFC1738: {
      descstart = start - 5;
      desc.Append(&aInString[descstart],
                  end - descstart + 2);  // include "<URL:" and ">"
      replaceAfter = end - pos + 1;
    } break;
    case RFC2396E: {
      descstart = start - 1;
      desc.Append(&aInString[descstart],
                  end - descstart + 2);  // include brackets
      replaceAfter = end - pos + 1;
    } break;
    case freetext:
    case abbreviated: {
      descstart = start;
      desc.Append(&aInString[descstart],
                  end - start + 1);  // don't include brackets
      replaceAfter = end - pos;
    } break;
    default:
      break;
  }  // switch

  EscapeStr(desc, false);

  txtURL.Append(&aInString[start], end - start + 1);
  txtURL.StripWhitespace();

  // FIX ME
  nsAutoString temp2;
  ScanTXT(nsDependentSubstring(&aInString[descstart], pos - descstart),
          ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
  replaceBefore = temp2.Length();
}

bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL) {
  if (!mIOService) return false;

  nsAutoCString scheme;
  nsresult rv = mIOService->ExtractScheme(aURL, scheme);
  if (NS_FAILED(rv)) return false;

  if (scheme == "http" || scheme == "https" || scheme == "mailto") {
    return true;
  }

  // Get the handler for this scheme.
  nsCOMPtr<nsIProtocolHandler> handler;
  rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
  if (NS_FAILED(rv)) return false;

  // Is it an external protocol handler? If not, linkify it.
  nsCOMPtr<nsIExternalProtocolHandler> externalHandler =
      do_QueryInterface(handler);
  if (!externalHandler) return true;  // handler is built-in, linkify it!

  // If external app exists for the scheme then linkify it.
  bool exists;
  rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
  return (NS_SUCCEEDED(rv) && exists);
}

bool mozTXTToHTMLConv::CheckURLAndCreateHTML(const nsString& txtURL,
                                             const nsString& desc,
                                             const modetype mode,
                                             nsString& outputHTML) {
  // Create *uri from txtURL
  nsCOMPtr<nsIURI> uri;
  nsresult rv;
  // Lazily initialize mIOService
  if (!mIOService) {
    mIOService = do_GetIOService();

    if (!mIOService) return false;
  }

  // See if the url should be linkified.
  NS_ConvertUTF16toUTF8 utf8URL(txtURL);
  if (!ShouldLinkify(utf8URL)) return false;

  // it would be faster if we could just check to see if there is a protocol
  // handler for the url and return instead of actually trying to create a
  // url...
  rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));

  // Real work
  if (NS_SUCCEEDED(rv) && uri) {
    outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
    switch (mode) {
      case RFC1738:
        outputHTML.AppendLiteral("rfc1738");
        break;
      case RFC2396E:
        outputHTML.AppendLiteral("rfc2396E");
        break;
      case freetext:
        outputHTML.AppendLiteral("freetext");
        break;
      case abbreviated:
        outputHTML.AppendLiteral("abbreviated");
        break;
      default:
        break;
    }
    nsAutoString escapedURL(txtURL);
    EscapeStr(escapedURL, true);

    outputHTML.AppendLiteral("\" href=\"");
    outputHTML += escapedURL;
    outputHTML.AppendLiteral("\">");
    outputHTML += desc;
    outputHTML.AppendLiteral("</a>");
    return true;
  }
  return false;
}

NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t* aInString,
                                                   int32_t aInLength,
                                                   int32_t aPos,
                                                   int32_t* aStartPos,
                                                   int32_t* aEndPos) {
  // call FindURL on the passed in string
  nsAutoString outputHTML;  // we'll ignore the generated output HTML

  *aStartPos = -1;
  *aEndPos = -1;

  FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);

  return NS_OK;
}

bool mozTXTToHTMLConv::FindURL(const char16_t* aInString, int32_t aInLength,
                               const uint32_t pos,
                               const uint32_t whathasbeendone,
                               nsString& outputHTML, int32_t& replaceBefore,
                               int32_t& replaceAfter) {
  enum statetype { unchecked, invalid, startok, endok, success };
  static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};

  statetype state[mozTXTToHTMLConv_lastMode + 1];  // 0(=unknown)..lastMode
  /* I don't like this abuse of enums as index for the array,
     but I don't know a better method */

  // Define, which modes to check
  /* all modes but abbreviated are checked for text[pos] == ':',
     only abbreviated for '.', RFC2396E and abbreviated for '@' */
  for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
       iState = modetype(iState + 1)) {
    state[iState] = aInString[pos] == ':' ? unchecked : invalid;
  }
  switch (aInString[pos]) {
    case '@':
      state[RFC2396E] = unchecked;
      [[fallthrough]];
    case '.':
      state[abbreviated] = unchecked;
      break;
    case ':':
      state[abbreviated] = invalid;
      break;
    default:
      break;
  }

  // Test, first successful mode wins, sequence defined by |ranking|
  int32_t iCheck = 0;  // the currently tested modetype
  modetype check = ranking[iCheck];
  for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
       iCheck++)
  /* check state from last run.
     If this is the first, check this one, which isn't = success yet */
  {
    check = ranking[iCheck];

    uint32_t start, end;

    if (state[check] == unchecked) {
      if (FindURLStart(aInString, aInLength, pos, check, start)) {
        state[check] = startok;
      }
    }

    if (state[check] == startok) {
      if (FindURLEnd(aInString, aInLength, pos, check, start, end)) {
        state[check] = endok;
      }
    }

    if (state[check] == endok) {
      nsAutoString txtURL, desc;
      int32_t resultReplaceBefore, resultReplaceAfter;

      CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check,
                             start, end, txtURL, desc, resultReplaceBefore,
                             resultReplaceAfter);

      if (aInString[pos] != ':') {
        nsAutoString temp = txtURL;
        txtURL.SetLength(0);
        CompleteAbbreviatedURL(temp.get(), temp.Length(), pos - start, txtURL);
      }

      if (!txtURL.IsEmpty() &&
          CheckURLAndCreateHTML(txtURL, desc, check, outputHTML)) {
        replaceBefore = resultReplaceBefore;
        replaceAfter = resultReplaceAfter;
        state[check] = success;
      }
    }  // if
  }  // for
  return state[check] == success;
}

static inline bool IsAlpha(const uint32_t aChar) {
  return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kLetter;
}

static inline bool IsDigit(const uint32_t aChar) {
  return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kNumber;
}

bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
                                          int32_t aInLength,
                                          const char16_t* rep, int32_t aRepLen,
                                          LIMTYPE before, LIMTYPE after) {
  // this little method gets called a LOT. I found we were spending a
  // lot of time just calculating the length of the variable "rep"
  // over and over again every time we called it. So we're now passing
  // an integer in here.
  int32_t textLen = aInLength;

  if (((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER)) &&
       textLen < aRepLen) ||
      ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER)) &&
       textLen < aRepLen + 1) ||
      (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER &&
       textLen < aRepLen + 2)) {
    return false;
  }

  uint32_t text0 = aInString[0];
  if (aInLength > 1 && NS_IS_SURROGATE_PAIR(text0, aInString[1])) {
    text0 = SURROGATE_TO_UCS4(text0, aInString[1]);
  }
  // find length of the char/cluster to be ignored
  int32_t ignoreLen = before == LT_IGNORE ? 0 : 1;
  if (ignoreLen) {
    GraphemeClusterBreakIteratorUtf16 ci(
        Span<const char16_t>(aInString, aInLength));
    ignoreLen = *ci.Next();
  }

  int32_t afterIndex = aRepLen + ignoreLen;
  uint32_t textAfterPos = aInString[afterIndex];
  if (aInLength > afterIndex + 1 &&
      NS_IS_SURROGATE_PAIR(textAfterPos, aInString[afterIndex + 1])) {
    textAfterPos = SURROGATE_TO_UCS4(textAfterPos, aInString[afterIndex + 1]);
  }

  return !((before == LT_ALPHA && !IsAlpha(text0)) ||
           (before == LT_DIGIT && !IsDigit(text0)) ||
           (before == LT_DELIMITER &&
            (IsAlpha(text0) || IsDigit(text0) || text0 == *rep)) ||
           (after == LT_ALPHA && !IsAlpha(textAfterPos)) ||
           (after == LT_DIGIT && !IsDigit(textAfterPos)) ||
           (after == LT_DELIMITER &&
            (IsAlpha(textAfterPos) || IsDigit(textAfterPos) ||
             textAfterPos == *rep)) ||
           !Substring(Substring(aInString, aInString + aInLength), ignoreLen,
                      aRepLen)
                .Equals(Substring(rep, rep + aRepLen),
                        nsCaseInsensitiveStringComparator));
}

uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString,
                                           int32_t aInStringLength,
                                           const char16_t* rep, int32_t aRepLen,
                                           LIMTYPE before, LIMTYPE after) {
  uint32_t result = 0;

  // Limit lookahead length to avoid pathological O(n^2) behavior; looking so
  // far ahead is unlikely to be important for cases where styling marked-up
  // fragments is actually useful anyhow.
  const uint32_t len =
      std::min(2000u, mozilla::AssertedCast<uint32_t>(aInStringLength));
  GraphemeClusterBreakIteratorUtf16 ci(Span<const char16_t>(aInString, len));
  for (uint32_t pos = 0; pos < len; pos = *ci.Next()) {
    if (ItMatchesDelimited(aInString + pos, aInStringLength - pos, rep, aRepLen,
                           before, after)) {
      result++;
    }
  }
  return result;
}

// NOTE: the converted html for the phrase is appended to aOutString
// tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
bool mozTXTToHTMLConv::StructPhraseHit(
    const char16_t* aInString, int32_t aInStringLength, bool col0,
    const char16_t* tagTXT, int32_t aTagTXTLen, const char* tagHTML,
    const char* attributeHTML, nsAString& aOutString, uint32_t& openTags) {
  /* We're searching for the following pattern:
     LT_DELIMITER - "*" - ALPHA -
     [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
     <strong> is only inserted, if existence of a pair could be verified
     We use the first opening/closing tag, if we can choose */

  const char16_t* newOffset = aInString;
  int32_t newLength = aInStringLength;
  if (!col0)  // skip the first element?
  {
    newOffset = &aInString[1];
    newLength = aInStringLength - 1;
  }

  // opening tag
  if (ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
                         (col0 ? LT_IGNORE : LT_DELIMITER),
                         LT_ALPHA)  // is opening tag
      && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen, LT_ALPHA,
                         LT_DELIMITER)  // remaining closing tags
             > openTags) {
    openTags++;
    aOutString.Append('<');
    aOutString.AppendASCII(tagHTML);
    aOutString.Append(char16_t(' '));
    aOutString.AppendASCII(attributeHTML);
    aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
    aOutString.Append(tagTXT);
    aOutString.AppendLiteral("</span>");
    return true;
  }

  // closing tag
  if (openTags > 0 && ItMatchesDelimited(aInString, aInStringLength, tagTXT,
                                         aTagTXTLen, LT_ALPHA, LT_DELIMITER)) {
    openTags--;
    aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
    aOutString.Append(tagTXT);
    aOutString.AppendLiteral("</span></");
    aOutString.AppendASCII(tagHTML);
    aOutString.Append(char16_t('>'));
    return true;
  }

  return false;
}

bool mozTXTToHTMLConv::SmilyHit(const char16_t* aInString, int32_t aLength,
                                bool col0, const char* tagTXT,
                                const nsString& imageName, nsString& outputHTML,
                                int32_t& glyphTextLen) {
  if (!aInString || !tagTXT || imageName.IsEmpty()) return false;

  int32_t tagLen = strlen(tagTXT);

  uint32_t delim = (col0 ? 0 : 1) + tagLen;

  if ((col0 || IsSpace(aInString[0])) &&
      (aLength <= int32_t(delim) || IsSpace(aInString[delim]) ||
       (aLength > int32_t(delim + 1) &&
        (aInString[delim] == '.' || aInString[delim] == ',' ||
         aInString[delim] == ';' || aInString[delim] == '8' ||
         aInString[delim] == '>' || aInString[delim] == '!' ||
         aInString[delim] == '?') &&
        IsSpace(aInString[delim + 1]))) &&
      ItMatchesDelimited(aInString, aLength,
                         NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
                         col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
      // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
  ) {
    if (!col0) {
      outputHTML.Truncate();
      outputHTML.Append(char16_t(' '));
    }

    outputHTML.Append(imageName);  // emoji unicode
    glyphTextLen = (col0 ? 0 : 1) + tagLen;
    return true;
  }

  return false;
}

// the glyph is appended to aOutputString instead of the original string...
bool mozTXTToHTMLConv::GlyphHit(const char16_t* aInString, int32_t aInLength,
                                bool col0, nsAString& aOutputString,
                                int32_t& glyphTextLen) {
  char16_t text0 = aInString[0];
  char16_t text1 = aInString[1];
  char16_t firstChar = (col0 ? text0 : text1);

  // temporary variable used to store the glyph html text
  nsAutoString outputHTML;
  bool bTestSmilie;
  bool bArg = false;
  int i;

  // refactor some of this mess to avoid code duplication and speed execution a
  // bit there are two cases that need to be tried one after another. To avoid a
  // lot of duplicate code, rolling into a loop

  i = 0;
  while (i < 2) {
    bTestSmilie = false;
    if (!i && (firstChar == ':' || firstChar == ';' || firstChar == '=' ||
               firstChar == '>' || firstChar == '8' || firstChar == 'O')) {
      // first test passed

      bTestSmilie = true;
      bArg = col0;
    }
    if (i && col0 &&
        (text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' ||
         text1 == '8' || text1 == 'O')) {
      // second test passed

      bTestSmilie = true;
      bArg = false;
    }
    if (bTestSmilie && (SmilyHit(aInString, aInLength, bArg, ":-)",
                                 u"��"_ns,  // smile, U+1F642
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":)",
                                 u"��"_ns,  // smile, U+1F642
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-D",
                                 u"��"_ns,  // laughing, U+1F602
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-(",
                                 u"��"_ns,  // frown, U+1F641
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":(",
                                 u"��"_ns,  // frown, U+1F641
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":$",
                                 u"��"_ns,  // embarassed, U+1F633
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ";-)",
                                 u"��"_ns,  // wink, U+1F609
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, col0, ";)",
                                 u"��"_ns,  // wink, U+1F609
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-\\",
                                 u"��"_ns,  // undecided, U+1F615
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-P",
                                 u"��"_ns,  // tongue, U+1F61B
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ";-P",
                                 u"��"_ns,  // winking face with tongue, U+1F61C
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, "=-O",
                                 u"��"_ns,  // surprise, U+1F62E
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-*",
                                 u"��"_ns,  // kiss, U+1F618
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ">:o",
                                 u"��"_ns,  // swearing, U+1F92C
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ">:-o",
                                 u"��"_ns,  // swearing, U+1F92C
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ">:(",
                                 u"��"_ns,  // angry, U+1F620
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ">:-(",
                                 u"��"_ns,  // angry, U+1F620
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, "8-)",
                                 u"��"_ns,  // cool, U+1F60E
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-$",
                                 u"��"_ns,  // money, U+1F911
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-!",
                                 u"��"_ns,  // foot, U+1F62C
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, "O:-)",
                                 u"��"_ns,  // innocent, U+1F607
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":'(",
                                 u"��"_ns,  // cry, U+1F62D
                                 outputHTML, glyphTextLen) ||

                        SmilyHit(aInString, aInLength, bArg, ":-X",
                                 u"��"_ns,  // sealed, U+1F910
                                 outputHTML, glyphTextLen))) {
      aOutputString.Append(outputHTML);
      return true;
    }
    i++;
  }
  if (text0 == '\f') {
    aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
    glyphTextLen = 1;
    return true;
  }
  if (text0 == '+' || text1 == '+') {
    if (ItMatchesDelimited(aInString, aInLength, u" +/-", 4, LT_IGNORE,
                           LT_IGNORE)) {
      aOutputString.AppendLiteral(" ±");
      glyphTextLen = 4;
      return true;
    }
    if (col0 && ItMatchesDelimited(aInString, aInLength, u"+/-", 3, LT_IGNORE,
                                   LT_IGNORE)) {
      aOutputString.AppendLiteral("±");
      glyphTextLen = 3;
      return true;
    }
  }

  // x^2  =>  x<sup>2</sup>,   also handle powers x^-2,  x^0.5
  // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
  if (text1 == '^' &&
      (IsAsciiDigit(text0) || IsAsciiAlpha(text0) || text0 == ')' ||
       text0 == ']' || text0 == '}') &&
      ((2 < aInLength && IsAsciiDigit(aInString[2])) ||
       (3 < aInLength && aInString[2] == '-' && IsAsciiDigit(aInString[3])))) {
    // Find first non-digit
    int32_t delimPos = 3;  // skip "^" and first digit (or '-')
    for (; delimPos < aInLength &&
           (IsAsciiDigit(aInString[delimPos]) ||
            (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
             IsAsciiDigit(aInString[delimPos + 1])));
         delimPos++) {
      ;
    }

    if (delimPos < aInLength && IsAsciiAlpha(aInString[delimPos])) {
      return false;
    }

    outputHTML.Truncate();
    outputHTML += text0;
    outputHTML.AppendLiteral(
        "<sup class=\"moz-txt-sup\">"
        "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
        "^</span>");

    aOutputString.Append(outputHTML);
    aOutputString.Append(&aInString[2], delimPos - 2);
    aOutputString.AppendLiteral("</sup>");

    glyphTextLen = delimPos /* - 1 + 1 */;
    return true;
  }
  /*
   The following strings are not substituted:
   |TXT   |HTML     |Reason
   +------+---------+----------
    ->     ←    Bug #454
    =>     ⇐    dito
    <-     →    dito
    <=     ⇒    dito
    (tm)   ™   dito
    1/4    ¼  is triggered by 1/4 Part 1, 2/4 Part 2, ...
    3/4    ¾  dito
    1/2    ½  similar
  */
  return false;
}

/***************************************************************************
  Library-internal Interface
****************************************************************************/

NS_IMPL_ISUPPORTS(mozTXTToHTMLConv, mozITXTToHTMLConv, nsIStreamConverter,
                  nsIThreadRetargetableStreamListener, nsIStreamListener,
                  nsIRequestObserver)

int32_t mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line,
                                       uint32_t& logLineStart) {
  int32_t result = 0;
  int32_t lineLength = NS_strlen(line);

  bool moreCites = true;
  while (moreCites) {
    /* E.g. the following lines count as quote:

       > text
       //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
       >text
       //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
           > text
       ] text
       USER> text
       USER] text
       //#endif

       logLineStart is the position of "t" in this example
    */
    uint32_t i = logLineStart;

#ifdef QUOTE_RECOGNITION_AGGRESSIVE
    for (; int32_t(i) < lineLength && IsSpace(line[i]); i++);
    for (; int32_t(i) < lineLength && IsAsciiAlpha(line[i]) &&
           nsCRT::IsUpper(line[i]);
         i++);
    if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
#else
    if (int32_t(i) < lineLength && line[i] == '>')
#endif
    {
      i++;
      if (int32_t(i) < lineLength && line[i] == ' ') i++;
      // sendmail/mbox
      // Placed here for performance increase
      const char16_t* indexString = &line[logLineStart];
      // here, |logLineStart < lineLength| is always true
      uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
      if (Substring(indexString, indexString + minlength)
              .Equals(Substring(u">From "_ns, 0, minlength),
                      nsCaseInsensitiveStringComparator)) {
        // XXX RFC2646
        moreCites = false;
      } else {
        result++;
        logLineStart = i;
      }
    } else {
      moreCites = false;
    }
  }

  return result;
}

NS_IMETHODIMP
mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo,
                          nsAString& aOutString) {
  if (aInString.Length() == 0) {
    aOutString.Truncate();
    return NS_OK;
  }

  if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
                              mozilla::fallible)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  bool doURLs = 0 != (whattodo & kURLs);
  bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
  bool doStructPhrase = 0 != (whattodo & kStructPhrase);

  uint32_t structPhrase_strong = 0;  // Number of currently open tags
  uint32_t structPhrase_underline = 0;
  uint32_t structPhrase_italic = 0;
  uint32_t structPhrase_code = 0;

  uint32_t endOfLastURLOutput = 0;

  nsAutoString outputHTML;  // moved here for performance increase

  const char16_t* rawInputString = aInString.BeginReading();
  uint32_t inLength = aInString.Length();

  const Span<const char16_t> inString(aInString);
  GraphemeClusterBreakIteratorUtf16 ci(inString);
  uint32_t i = 0;
  while (i < inLength) {
    if (doGlyphSubstitution) {
      int32_t glyphTextLen;
      if (GlyphHit(&rawInputString[i], inLength - i, i == 0, aOutString,
                   glyphTextLen)) {
        i = *ci.Seek(i + glyphTextLen - 1);
        continue;
      }
    }

    if (doStructPhrase) {
      const char16_t* newOffset = rawInputString;
      int32_t newLength = aInString.Length();
      if (i > 0)  // skip the first element?
      {
        GraphemeClusterBreakReverseIteratorUtf16 ri(
            Span<const char16_t>(rawInputString, i));
        Maybe<uint32_t> nextPos = ri.Next();
        newOffset += *nextPos;
        newLength -= *nextPos;
      }

      switch (aInString[i])  // Performance increase
      {
        case '*':
          if (StructPhraseHit(newOffset, newLength, i == 0, u"*", 1, "b",
                              "class=\"moz-txt-star\"", aOutString,
                              structPhrase_strong)) {
            i = *ci.Next();
            continue;
          }
          break;
        case '/':
          if (StructPhraseHit(newOffset, newLength, i == 0, u"/", 1, "i",
                              "class=\"moz-txt-slash\"", aOutString,
                              structPhrase_italic)) {
            i = *ci.Next();
            continue;
          }
          break;
        case '_':
          if (StructPhraseHit(newOffset, newLength, i == 0, u"_", 1,
                              "span" /* <u> is deprecated */,
                              "class=\"moz-txt-underscore\"", aOutString,
                              structPhrase_underline)) {
            i = *ci.Next();
            continue;
          }
          break;
        case '|':
          if (StructPhraseHit(newOffset, newLength, i == 0, u"|", 1, "code",
                              "class=\"moz-txt-verticalline\"", aOutString,
                              structPhrase_code)) {
            i = *ci.Next();
            continue;
          }
          break;
      }
    }

    if (doURLs) {
      switch (aInString[i]) {
        case ':':
        case '@':
        case '.':
          if ((i == 0 || ((i > 0) && aInString[i - 1] != ' ')) &&
              ((i == aInString.Length() - 1) ||
               (aInString[i + 1] != ' ')))  // Performance increase
          {
            int32_t replaceBefore;
            int32_t replaceAfter;
            if (FindURL(rawInputString, aInString.Length(), i, whattodo,
                        outputHTML, replaceBefore, replaceAfter) &&
                structPhrase_strong + structPhrase_italic +
                        structPhrase_underline + structPhrase_code ==
                    0
                /* workaround for bug #19445 */) {
              // Don't cut into previously inserted HTML (bug 1509493)
              if (aOutString.Length() - replaceBefore < endOfLastURLOutput) {
                break;
              }
              aOutString.Cut(aOutString.Length() - replaceBefore,
                             replaceBefore);
              aOutString += outputHTML;
              endOfLastURLOutput = aOutString.Length();
              i = *ci.Seek(i + replaceAfter);
              continue;
            }
          }
          break;
      }  // switch
    }

    switch (aInString[i]) {
      // Special symbols
      case '<':
      case '>':
      case '&':
        EscapeChar(aInString[i], aOutString, false);
        i = *ci.Next();
        break;
      // Normal characters
      default: {
        const uint32_t oldIdx = i;
        i = *ci.Next();
        aOutString.Append(inString.FromTo(oldIdx, i));
        break;
      }
    }
  }
  return NS_OK;
}

NS_IMETHODIMP
mozTXTToHTMLConv::ScanHTML(const nsAString& input, uint32_t whattodo,
                           nsAString& aOutString) {
  const nsPromiseFlatString& aInString = PromiseFlatString(input);
  if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
                              mozilla::fallible)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // some common variables we were recalculating
  // every time inside the for loop...
  int32_t lengthOfInString = aInString.Length();
  const char16_t* uniBuffer = aInString.get();

#ifdef DEBUG_BenB_Perf
  PRTime parsing_start = PR_IntervalNow();
#endif

  // Look for simple entities not included in a tags and scan them.
  // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"),
  // comment tag (""), style tag, script tag or head tag.
  // Unescape the rest (text between tags) and pass it to ScanTXT.
  nsAutoCString canFollow(" \f\n\r\t>");
  for (int32_t i = 0; i < lengthOfInString;) {
    if (aInString[i] == '<')  // html tag
    {
      int32_t start = i;
      if (i + 2 < lengthOfInString && nsCRT::ToLower(aInString[i + 1]) == 'a' &&
          canFollow.FindChar(aInString[i + 2]) != kNotFound)
      // if a tag, skip until </a>.
      // Make sure there's a white-space character after, not to match "abbr".
      {
        i = aInString.LowerCaseFindASCII("</a>", i);
        if (i == kNotFound) {
          i = lengthOfInString;
        } else {
          i += 4;
        }
      } else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--"))
      // if out-commended code, skip until -->
      {
        i = aInString.Find(u"-->", i);
        if (i == kNotFound) {
          i = lengthOfInString;
        } else {
          i += 3;
        }
      } else if (i + 6 < lengthOfInString &&
                 Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") &&
                 canFollow.FindChar(aInString[i + 6]) != kNotFound)
      // if style tag, skip until </style>
      {
        i = aInString.LowerCaseFindASCII("</style>", i);
        if (i == kNotFound) {
          i = lengthOfInString;
        } else {
          i += 8;
        }
      } else if (i + 7 < lengthOfInString &&
                 Substring(aInString, i + 1, 6)
                     .LowerCaseEqualsASCII("script") &&
                 canFollow.FindChar(aInString[i + 7]) != kNotFound)
      // if script tag, skip until </script>
      {
        i = aInString.LowerCaseFindASCII("</script>", i);
        if (i == kNotFound) {
          i = lengthOfInString;
        } else {
          i += 9;
        }
      } else if (i + 5 < lengthOfInString &&
                 Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") &&
                 canFollow.FindChar(aInString[i + 5]) != kNotFound)
      // if head tag, skip until </head>
      // Make sure not to match <header>.
      {
        i = aInString.LowerCaseFindASCII("</head>", i);
        if (i == kNotFound) {
          i = lengthOfInString;
        } else {
          i += 7;
        }
      } else  // just skip tag (attributes etc.)
      {
        i = aInString.FindChar('>', i);
        if (i == kNotFound) {
          i = lengthOfInString;
        } else {
          i++;
        }
      }
      aOutString.Append(&uniBuffer[start], i - start);
    } else {
      uint32_t start = uint32_t(i);
      i = aInString.FindChar('<', i);
      if (i == kNotFound) i = lengthOfInString;

      nsAutoStringN<256> tempString;
      tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate));
      UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString);
      ScanTXT(tempString, whattodo, aOutString);
    }
  }

#ifdef DEBUG_BenB_Perf
  printf("ScanHTML time:    %d ms\n",
         PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
#endif
  return NS_OK;
}

/****************************************************************************
  XPCOM Interface
*****************************************************************************/

NS_IMETHODIMP
mozTXTToHTMLConv::Convert(nsIInputStream* aFromStream, const char* aFromType,
                          const char* aToType, nsISupports* aCtxt,
                          nsIInputStream** _retval) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::AsyncConvertData(const char* aFromType, const char* aToType,
                                   nsIStreamListener* aListener,
                                   nsISupports* aCtxt) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::GetConvertedType(const nsACString& aFromType,
                                   nsIChannel* aChannel, nsACString& aToType) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsIInputStream* inStr,
                                  uint64_t sourceOffset, uint32_t count) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnDataFinished(nsresult aStatus) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::CheckListenerChain() { return NS_ERROR_NOT_IMPLEMENTED; }

NS_IMETHODIMP
mozTXTToHTMLConv::MaybeRetarget(nsIRequest* request) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnStartRequest(nsIRequest* request) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsresult aStatus) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line, uint32_t* logLineStart,
                               uint32_t* _retval) {
  if (!logLineStart || !_retval || !line) return NS_ERROR_NULL_POINTER;
  *_retval = CiteLevelTXT(line, *logLineStart);
  return NS_OK;
}

nsresult MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv) {
  MOZ_ASSERT(aConv != nullptr, "null ptr");
  if (!aConv) return NS_ERROR_NULL_POINTER;

  RefPtr<mozTXTToHTMLConv> conv = new mozTXTToHTMLConv();
  conv.forget(aConv);
  //    return (*aConv)->Init();
  return NS_OK;
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.34 Sekunden (vorverarbeitet am 2026-04-26) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.