Quelle mozInlineSpellWordUtil.cpp

Sprache: C

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozInlineSpellWordUtil.h"

#include <algorithm>
#include <utility>

#include "mozilla/BinarySearch.h"
#include "mozilla/EditorBase.h"
#include "mozilla/HTMLEditor.h"
#include "mozilla/Logging.h"
#include "mozilla/dom/Element.h"

#include "nsDebug.h"
#include "nsAtom.h"
#include "nsComponentManagerUtils.h"
#include "nsUnicodeProperties.h"
#include "nsServiceManagerUtils.h"
#include "nsIContent.h"
#include "nsTextFragment.h"
#include "nsRange.h"
#include "nsContentUtils.h"
#include "nsIFrame.h"

using namespace mozilla;

static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};

// IsIgnorableCharacter
//
//    These characters are ones that we should ignore in input.

// Narrow (single-byte) overload: the only ignorable character tested for is
// the byte 0xAD, SOFT HYPHEN.
inline bool IsIgnorableCharacter(char ch) {
  const char softHyphen = static_cast<char>(0xAD);
  return ch == softHyphen;
}

// UTF-16 overload: the ignorable characters are the two soft hyphens below.
inline bool IsIgnorableCharacter(char16_t ch) {
  switch (ch) {
    case 0x00AD:  // SOFT HYPHEN
    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}

// IsConditionalPunctuation
//
//    Some characters (like apostrophes) require characters on each side to be
//    part of a word, and are otherwise punctuation.

// Narrow (single-byte) overload.
inline bool IsConditionalPunctuation(char ch) {
  if (ch == '\'') {  // APOSTROPHE
    return true;
  }
  return ch == static_cast<char>(0xB7);  // MIDDLE DOT (as a single byte)
}

// UTF-16 overload.
inline bool IsConditionalPunctuation(char16_t ch) {
  switch (ch) {
    case '\'':    // APOSTROPHE
    case 0x2019:  // RIGHT SINGLE QUOTATION MARK
    case 0x00B7:  // MIDDLE DOT
      return true;
    default:
      return false;
  }
}

// Returns true for characters that may or may not end up classified as
// CHAR_CLASS_SEPARATOR depending on their context: '@', ':', '.', '/', '-'
// and any conditional punctuation.
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  switch (ch) {
    case '@':
    case ':':
    case '.':
    case '/':
    case '-':
      return true;
    default:
      return IsConditionalPunctuation(ch);
  }
}

// Narrow (single-byte) overload; forwards to the char16_t overload.
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  //
  // Widen through unsigned char: where plain char is signed, a direct cast of
  // a byte >= 0x80 (e.g. 0xB7, MIDDLE DOT) would sign-extend to 0xFFB7 and
  // never match the code points checked by the char16_t overload.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<unsigned char>(ch)));
}

// IsDOMWordSeparator
//
//    Determines if the given character should be considered as a DOM Word
//    separator. Basically, this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is important.
//    For example, we can't have any punctuation that could appear in a URL
//    or email address in this, because those need to always fit into a single
//    DOM word.

// Narrow (single-byte) overload: ASCII whitespace plus the byte 0xA0
// (no-break space).
static bool IsDOMWordSeparator(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      // no-break space
      return ch == static_cast<char>(0xA0);
  }
}

// UTF-16 overload: ASCII whitespace plus a fixed set of non-ASCII spaces.
static bool IsDOMWordSeparator(char16_t ch) {
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;
    default:
      // otherwise not a space
      return false;
  }
}

// Compares this node/offset pair with a range boundary. They are equal when
// the boundary's container is mNode and the boundary has a valid offset equal
// to mOffset.
bool NodeOffset::operator==(
    const mozilla::RangeBoundary& aRangeBoundary) const {
  if (aRangeBoundary.Container() != mNode) {
    return false;
  }

  const Maybe<uint32_t> boundaryOffset =
      aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets);

  MOZ_ASSERT(mOffset >= 0);
  if (boundaryOffset.isNothing()) {
    // No valid offset is available for the boundary.
    return false;
  }
  return static_cast<uint32_t>(mOffset) == *boundaryOffset;
}

// Equal when both endpoints match the start and end boundaries of aRange.
bool NodeOffsetRange::operator==(const nsRange& aRange) const {
  if (!(mBegin == aRange.StartRef())) {
    return false;
  }
  return mEnd == aRange.EndRef();
}

// static
// Builds a word util for aEditorBase. Returns Nothing() (after warning) when
// the editor has no document or no root node.
Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
    const EditorBase& aEditorBase) {
  dom::Document* const doc = aEditorBase.GetDocument();
  if (NS_WARN_IF(!doc)) {
    return Nothing();
  }

  const bool editableOrDesignMode = aEditorBase.IsHTMLEditor();

  // Find the root node for the editor. For contenteditable the mRootNode could
  // change to shadow root if the begin and end are inside the shadowDOM.
  nsINode* const root = aEditorBase.GetRoot();
  if (NS_WARN_IF(!root)) {
    return Nothing();
  }

  return Some(mozInlineSpellWordUtil{*doc, editableOrDesignMode, *root});
}

// True when aNode is a text node whose parent is not an HTML <script> or
// <style> element.
static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
  if (nsIContent* const parentContent = aNode->GetParent()) {
    if (parentContent->IsAnyOfHTMLElements(nsGkAtoms::script,
                                           nsGkAtoms::style)) {
      return false;
    }
  }
  return aNode->IsText();
}

// Callback invoked with each node the traversal climbs out of.
typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);

// Returns the node following aNode in a preorder walk of the DOM tree that
// does not go outside aRoot, or nullptr when there is none.
// aOnLeaveNode (if non-null) is called for every node the walk climbs out of
// on its way up, which is why GetNextNode can't simply be used here.
static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot,
                             OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {
  MOZ_ASSERT(aNode, "Null starting node?");

  // Descend first.
  if (nsINode* firstChild = aNode->GetFirstChild()) {
    return firstChild;
  }

  // Don't look at siblings or otherwise outside of aRoot
  if (aNode == aRoot) {
    return nullptr;
  }

  if (nsINode* sibling = aNode->GetNextSibling()) {
    return sibling;
  }

  // Climb until some ancestor has a next sibling, reporting every node left.
  for (nsINode* leaving = aNode;;) {
    if (aOnLeaveNode) {
      aOnLeaveNode(leaving, aClosure);
    }

    nsINode* const parent = leaving->GetParent();
    if (!parent || parent == aRoot) {
      return nullptr;
    }

    if (nsINode* parentSibling = parent->GetNextSibling()) {
      return parentSibling;
    }
    leaving = parent;
  }
}

// aNode is not a text node. Find the first text node starting at aNode/aOffset
// in a preorder DOM traversal.
static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
                                 const nsINode* aRoot) {
  MOZ_ASSERT(aNode, "Null starting node?");
  MOZ_ASSERT(!IsSpellCheckingTextNode(aNode),
             "FindNextTextNode should start with a non-text node");

  // Begin the search at the aOffset'th child. When aOffset is past the end
  // of the child list, begin at the node that follows aNode's last descendant
  // in preorder instead.
  nsINode* candidate = aNode->GetChildAt_Deprecated(aOffset);
  if (!candidate) {
    candidate = aNode->GetNextNonChildNode(aRoot);
  }

  // Walk forward until a spell-checkable text node (or the end) is reached.
  for (; candidate && !IsSpellCheckingTextNode(candidate);
       candidate = candidate->GetNextNode(aRoot)) {
  }
  return candidate;
}

// mozInlineSpellWordUtil::SetPositionAndEnd
//
//    We have two ranges "hard" and "soft". The hard boundary is simply
//    the scope of the root node. The soft boundary is that which is set
//    by the caller of this class by calling this function. If this function is
//    not called, the soft boundary is the same as the hard boundary.
//
//    When we reach the soft boundary (mSoftText.GetEnd()), we keep
//    going until we reach the end of a word. This allows the caller to set the
//    end of the range to anything, and we will always check whole multiples of
//    words. When we reach the hard boundary we stop no matter what.
//
//    There is no beginning soft boundary. This is because we only go to the
//    previous node once, when finding the previous word boundary in
//    SetPosition(). You might think of the soft boundary as being this initial
//    position.

nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
                                                   int32_t aPositionOffset,
                                                   nsINode* aEndNode,
                                                   int32_t aEndOffset) {
  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
           aPositionOffset, aEndNode, aEndOffset));

  MOZ_ASSERT(aPositionNode, "Null begin node?");
  MOZ_ASSERT(aEndNode, "Null end node?");

  MOZ_ASSERT(mRootNode, "Not initialized");

  // Find an appropriate root if we are dealing with contenteditable nodes
  // which are in the shadow DOM. Both points must share a subtree root.
  if (mIsContentEditableOrDesignMode) {
    nsINode* const subtreeRoot = aPositionNode->SubtreeRoot();
    if (aEndNode->SubtreeRoot() != subtreeRoot) {
      return NS_ERROR_FAILURE;
    }

    if (mozilla::dom::ShadowRoot::FromNode(subtreeRoot)) {
      mRootNode = subtreeRoot;
    }
  }

  mSoftText.Invalidate();

  // A point that is not in a spell-checkable text node is moved to offset 0
  // of the first such text node after it (the node may become null).
  const auto snapToTextNode = [this](nsINode*& aNode, int32_t& aOffset) {
    if (IsSpellCheckingTextNode(aNode)) {
      return;
    }
    aNode = FindNextTextNode(aNode, aOffset, mRootNode);
    aOffset = 0;
  };

  snapToTextNode(aPositionNode, aPositionOffset);
  NodeOffset softBegin(aPositionNode, aPositionOffset);

  snapToTextNode(aEndNode, aEndOffset);
  NodeOffset softEnd(aEndNode, aEndOffset);

  const nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
  if (NS_FAILED(rv)) {
    return rv;
  }

  // Position the iteration on the word containing the soft-text beginning.
  // mNextWordIndex is left untouched when that position cannot be mapped.
  const int32_t softTextOffset =
      MapDOMPositionToSoftTextOffset(mSoftText.GetBegin());
  if (softTextOffset >= 0) {
    mNextWordIndex = FindRealWordContaining(softTextOffset, HINT_END, true);
  }
  return NS_OK;
}

// Rebuilds the soft text and the real-word list for the given soft range
// unless the soft text is already valid. On failure the error from
// BuildRealWords is returned and the soft text stays invalid.
nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin,
                                             NodeOffset aSoftEnd) {
  if (mSoftText.mIsValid) {
    // Nothing to do; the cached state is still usable.
    return NS_OK;
  }

  mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin), std::move(aSoftEnd),
                                    mRootNode);

  mRealWords.Clear();
  Result<RealWords, nsresult> built = BuildRealWords();
  if (built.isErr()) {
    return built.unwrapErr();
  }

  mRealWords = built.unwrap();
  mSoftText.mIsValid = true;
  return NS_OK;
}

// Creates an nsRange spanning aWord by mapping the word's soft-text start and
// end offsets back to DOM positions.
nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
                                                  nsRange** aRange) const {
  NodeOffset wordStart =
      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
  NodeOffset wordEnd =
      MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
  return MakeRange(wordStart, wordEnd, aRange);
}
void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
    const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
  NodeOffset begin =
      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
  NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
  *aNodeOffsetRange = NodeOffsetRange(begin, end);
}

// mozInlineSpellWordUtil::GetRangeForWord

nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
                                                 int32_t aWordOffset,
                                                 nsRange** aRange) {
  // The queried point is used as both soft begin and soft end.
  NodeOffset wordPoint(aWordNode, aWordOffset);

  if (!mSoftText.mIsValid || wordPoint != mSoftText.GetBegin() ||
      wordPoint != mSoftText.GetEnd()) {
    mSoftText.Invalidate();
    const nsresult rv =
        EnsureWords(NodeOffset(wordPoint), NodeOffset(wordPoint));
    if (NS_FAILED(rv)) {
      return rv;
    }
  }

  const int32_t softTextOffset = MapDOMPositionToSoftTextOffset(wordPoint);
  if (softTextOffset >= 0) {
    const int32_t wordIndex =
        FindRealWordContaining(softTextOffset, HINT_BEGIN, false);
    if (wordIndex >= 0) {
      return MakeRangeForWord(mRealWords[wordIndex], aRange);
    }
  }

  // The point could not be mapped, or no word contains it: hand back a
  // collapsed range at the point.
  return MakeRange(wordPoint, wordPoint, aRange);
}

// Copies aLen characters of aInput starting at aPos into aOutput (which is
// emptied first), adjusting characters the spellchecker may not like:
// ignorable characters are dropped and U+2019 becomes a plain apostrophe.
static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
                          nsAString& aOutput) {
  aOutput.Truncate();
  for (int32_t consumed = 0; consumed < aLen; ++consumed) {
    const char16_t original = aInput.CharAt(aPos + consumed);

    // remove ignorable characters from the word
    if (IsIgnorableCharacter(original)) {
      continue;
    }

    // the spellchecker doesn't handle curly apostrophes in all languages,
    // so RIGHT SINGLE QUOTATION MARK is replaced by an apostrophe
    aOutput.Append(original == 0x2019 ? char16_t('\'') : original);
  }
}

// mozInlineSpellWordUtil::GetNextWord
//
//    FIXME-optimization: we shouldn't have to generate a range every single
//    time. It would be better if the inline spellchecker didn't require a
//    range unless the word was misspelled. This may or may not be possible.

bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) {
  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));

  const bool exhausted =
      mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length());
  if (exhausted) {
    // No more words: reset the cursor and tell the caller to skip.
    mNextWordIndex = -1;
    aWord.mSkipChecking = true;
    return false;
  }

  const RealWord& current = mRealWords[mNextWordIndex];
  MakeNodeOffsetRangeForWord(current, &aWord.mNodeOffsetRange);
  ++mNextWordIndex;
  aWord.mSkipChecking = !current.mCheckableWord;
  ::NormalizeWord(mSoftText.GetValue(), current.mSoftTextOffset,
                  current.mLength, aWord.mText);

  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: returning: %s (skip=%d)", __FUNCTION__,
           NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking));

  return true;
}

// mozInlineSpellWordUtil::MakeRange
//
//    Convenience function for creating a range over the current document.

nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
                                           nsRange** aRange) const {
  NS_ENSURE_ARG_POINTER(aBegin.mNode);
  if (!mDocument) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  ErrorResult createError;
  RefPtr<nsRange> newRange = nsRange::Create(
      aBegin.mNode, aBegin.mOffset, aEnd.mNode, aEnd.mOffset, createError);
  if (NS_WARN_IF(createError.Failed())) {
    return createError.StealNSResult();
  }
  MOZ_ASSERT(newRange);

  // Transfer ownership of the new range to the caller.
  newRange.forget(aRange);
  return NS_OK;
}

// static
// Creates an nsRange from aRange. A creation failure only triggers a warning;
// whatever nsRange::Create returned is handed back in either case.
already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
    const NodeOffsetRange& aRange) {
  IgnoredErrorResult createError;
  const NodeOffset& rangeBegin = aRange.Begin();
  const NodeOffset& rangeEnd = aRange.End();
  RefPtr<nsRange> newRange =
      nsRange::Create(rangeBegin.Node(), rangeBegin.Offset(), rangeEnd.Node(),
                      rangeEnd.Offset(), createError);
  NS_WARNING_ASSERTION(!createError.Failed(), "Creating a range failed");
  return newRange.forget();
}

/*********** Word Splitting ************/

// Classifies a given character in the DOM word.
enum CharClass {
  // The character belongs to a word.
  CHAR_CLASS_WORD,
  // The character separates words.
  CHAR_CLASS_SEPARATOR,
  // There is no current character: this is the state a WordSplitState is
  // constructed with, and the state Advance() sets once the offset reaches
  // the end of the text.
  CHAR_CLASS_END_OF_INPUT
};

// Encapsulates DOM-word to real-word splitting.
//
// T is the string type of the DOM word. GetUnicharAt is specialized in this
// file for nsDependentSubstring and nsDependentCSubstring.
template <class T>
struct MOZ_STACK_CLASS WordSplitState {
  // The DOM word being split. Held by reference, so the string must outlive
  // this object.
  const T& mDOMWordText;
  // Current offset into mDOMWordText.
  int32_t mDOMWordOffset;
  // Class of the character at mDOMWordOffset, as last set by Advance().
  CharClass mCurCharClass;

  // Starts at offset 0 with the class set to CHAR_CLASS_END_OF_INPUT; the
  // constructor does not classify the first character.
  explicit WordSplitState(const T& aString)
      : mDOMWordText(aString),
        mDOMWordOffset(0),
        mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}

  // Classifies the character at aIndex. aRecurse says whether the neighbouring
  // characters may be examined to disambiguate conditional punctuation.
  CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
  // Moves to the next character and updates mCurCharClass.
  void Advance();
  // Advances while the current character is a separator.
  void AdvanceThroughSeparators();
  // Advances while the current character is a word character.
  void AdvanceThroughWord();

  // Returns true if the text from the current position looks like a special
  // word such as an email address or URL. This allows arbitrary word breaking
  // rules to be used for these special entities, as long as they can not
  // contain whitespace.
  bool IsSpecialWord() const;

  // Similar to IsSpecialWord except that this takes a split word as
  // input. This checks for things that do not require special word-breaking
  // rules.
  bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;

  // Finds the last sequence of DOM word separators before aBeforeOffset and
  // returns the offset to its first element, or Nothing() if there is none.
  Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
      int32_t aBeforeOffset) const;

  // Returns the character at aIndex widened to char16_t.
  char16_t GetUnicharAt(int32_t aIndex) const;
};

// WordSplitState::ClassifyCharacter
//
//    Classifies the character at aIndex of the DOM word as CHAR_CLASS_WORD or
//    CHAR_CLASS_SEPARATOR. The index one past the end of the text is reported
//    as a separator. aRecurse controls whether conditional punctuation may be
//    disambiguated by looking at its neighbours; when false, conditional
//    punctuation is always a separator.
template <class T>
CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
                                               bool aRecurse) const {
  MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
             "Index out of range");
  if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;

  // this will classify the character, we want to treat "ignorable" characters
  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
  nsUGenCategory charCategory =
      mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
  if (charCategory == nsUGenCategory::kLetter ||
      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
      mDOMWordText[aIndex] == 0x200D /* ZWJ */)
    return CHAR_CLASS_WORD;

  // If conditional punctuation is surrounded immediately on both sides by word
  // characters it also counts as a word character.
  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
    if (!aRecurse) {
      // not allowed to look around, this punctuation counts like a separator
      return CHAR_CLASS_SEPARATOR;
    }

    // check the left-hand character
    if (aIndex == 0) return CHAR_CLASS_SEPARATOR;
    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
      return CHAR_CLASS_SEPARATOR;
    // If the previous character is a word-char, make sure that it's not a
    // special dot character.
    if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR;

    // now we know left char is a word-char, check the right-hand character
    if (aIndex == int32_t(mDOMWordText.Length()) - 1) {
      return CHAR_CLASS_SEPARATOR;
    }

    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
      return CHAR_CLASS_SEPARATOR;
    // If the next character is a word-char, make sure that it's not a
    // special dot character.
    if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR;

    // char on either side is a word, this counts as a word
    return CHAR_CLASS_WORD;
  }

  // The dot character, if appearing at the end of a word, should
  // be considered part of that word.  Example: "etc.", or
  // abbreviations.
  // The preceding character must therefore be a word character (and not
  // itself a dot); a dot that follows a separator stays a separator.
  if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
      mDOMWordText[aIndex - 1] != '.' &&
      ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
    return CHAR_CLASS_WORD;
  }

  // all other punctuation
  if (charCategory == nsUGenCategory::kSeparator ||
      charCategory == nsUGenCategory::kOther ||
      charCategory == nsUGenCategory::kPunctuation ||
      charCategory == nsUGenCategory::kSymbol) {
    // Don't break on hyphens, as hunspell handles them on its own.
    if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
        mDOMWordText[aIndex - 1] != '-' &&
        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
      // A hyphen is only kept inside a word if the previous and next
      // characters are word characters; a trailing hyphen is a separator.
      if (aIndex == int32_t(mDOMWordText.Length()) - 1)
        return CHAR_CLASS_SEPARATOR;
      if (mDOMWordText[aIndex + 1] != '.' &&
          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
        return CHAR_CLASS_WORD;
    }
    return CHAR_CLASS_SEPARATOR;
  }

  // any other character counts as a word
  return CHAR_CLASS_WORD;
}

// WordSplitState::Advance
//
//    Steps to the next character and refreshes mCurCharClass. Reaching the
//    end of the text yields CHAR_CLASS_END_OF_INPUT.
template <class T>
void WordSplitState<T>::Advance() {
  MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index");
  MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
             "Length beyond end");

  ++mDOMWordOffset;
  const bool reachedEnd =
      mDOMWordOffset >= static_cast<int32_t>(mDOMWordText.Length());
  mCurCharClass = reachedEnd ? CHAR_CLASS_END_OF_INPUT
                             : ClassifyCharacter(mDOMWordOffset, true);
}

// WordSplitState::AdvanceThroughSeparators
//
//    Skips forward until the current character is no longer a separator.
template <class T>
void WordSplitState<T>::AdvanceThroughSeparators() {
  for (; mCurCharClass == CHAR_CLASS_SEPARATOR; Advance()) {
  }
}

// WordSplitState::AdvanceThroughWord
//
//    Skips forward until the current character is no longer a word character.
template <class T>
void WordSplitState<T>::AdvanceThroughWord() {
  for (; mCurCharClass == CHAR_CLASS_WORD; Advance()) {
  }
}

// WordSplitState::IsSpecialWord
//
//    Reports whether the text from the current offset to the end of the DOM
//    word looks like an email address or a URL.
template <class T>
bool WordSplitState<T>::IsSpecialWord() const {
  const int32_t textLength = int32_t(mDOMWordText.Length());

  // Search for email addresses. We simply define these as any sequence of
  // characters with an '@' character in the middle. The DOM word is already
  // split on whitespace, so we know that everything to the end is the address.
  int32_t firstColon = -1;
  for (int32_t i = mDOMWordOffset; i < textLength; i++) {
    const auto current = mDOMWordText[i];

    if (current == '@') {
      // Only accept this if there are unambiguous word characters (don't
      // bother recursing to disambiguate apostrophes) on each side. This
      // prevents classifying, e.g. "@home" as an email address.
      //
      // This condition only accepts words with '@' in the middle of them. It
      // works, but the inline spellchecker doesn't like it: if you type
      // "fhsgfh@" that's a misspelled word followed by a symbol, but when you
      // type another letter ("fhsgfh@g") the first word needs to be unmarked
      // as misspelled. It doesn't do this; it only checks the current
      // position for potentially removing a spelling range.
      const bool wordCharBefore =
          i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD;
      if (wordCharBefore && i < textLength - 1 &&
          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
        return true;
      }
      continue;
    }

    if (current != ':' || firstColon >= 0) {
      continue;
    }

    // Remember only the first colon.
    firstColon = i;

    // If the first colon is followed by a slash, consider it a URL
    // This will catch things like asdf://foo.com
    if (firstColon < textLength - 1 && mDOMWordText[firstColon + 1] == '/') {
      return true;
    }
  }

  // Without a colon after at least one character there is no protocol to
  // check; not anything special.
  if (firstColon <= mDOMWordOffset) {
    return false;
  }

  // Check the text before the first colon against some known protocols. It
  // is impossible to check against all protocols, especially since you can
  // plug in new protocols. We also don't want to waste time here checking
  // against a lot of obscure protocols.
  nsString protocol(
      Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
  return protocol.EqualsIgnoreCase("http") ||
         protocol.EqualsIgnoreCase("https") ||
         protocol.EqualsIgnoreCase("news") ||
         protocol.EqualsIgnoreCase("file") ||
         protocol.EqualsIgnoreCase("javascript") ||
         protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp");
}

// WordSplitState::ShouldSkipWord
//
//    Returns true when any character of the word [aStart, aStart + aLength)
//    has the Unicode general category "number".
template <class T>
bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
  const int32_t wordEnd = aStart + aLength;

  int32_t pos = aStart;
  while (pos < wordEnd) {
    const nsUGenCategory category =
        mozilla::unicode::GetGenCategory(GetUnicharAt(pos));
    if (category == nsUGenCategory::kNumber) {
      // the word contains a digit
      return true;
    }
    ++pos;
  }

  // not special
  return false;
}

// Scans backwards from aBeforeOffset - 1 for the nearest run of DOM word
// separators and returns the offset of the first character of that run, or
// Nothing() when no separator precedes aBeforeOffset.
template <class T>
Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence(
    const int32_t aBeforeOffset) const {
  // A position counts as a separator here when it is a definite DOM word
  // separator, or when it is not an ambiguous character and classifies as
  // CHAR_CLASS_SEPARATOR.
  const auto isSeparatorAt = [this](int32_t aIndex) {
    const auto ch = mDOMWordText[aIndex];
    return IsDOMWordSeparator(ch) ||
           (!IsAmbiguousDOMWordSeprator(ch) &&
            ClassifyCharacter(aIndex, true) == CHAR_CLASS_SEPARATOR);
  };

  // Locate the last separator before aBeforeOffset.
  int32_t runStart = aBeforeOffset - 1;
  while (runStart >= 0 && !isSeparatorAt(runStart)) {
    --runStart;
  }
  if (runStart < 0) {
    return Nothing();
  }

  // Be greedy, find as many separators as we can
  while (runStart > 0 && isSeparatorAt(runStart - 1)) {
    --runStart;
  }
  return Some(runStart);
}

// The text already holds char16_t code units, so no widening is needed.
template <>
char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
    int32_t aIndex) const {
  const char16_t codeUnit = mDOMWordText[aIndex];
  return codeUnit;
}

// The text holds single bytes: widen through uint8_t so that bytes >= 0x80 are
// not sign-extended.
template <>
char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
    int32_t aIndex) const {
  const uint8_t byteValue = static_cast<uint8_t>(mDOMWordText[aIndex]);
  return static_cast<char16_t>(byteValue);
}

// True when aNode is an HTML <br> element.
static inline bool IsBRElement(nsINode* aNode) {
  const bool isLineBreak = aNode->IsHTMLElement(nsGkAtoms::br);
  return isLineBreak;
}

/**
* Given a TextNode, finds the last sequence of DOM word separators before
* aBeforeOffset and returns the offset to its first element.
*
* @param aContent the TextNode to check.
* @param aBeforeOffset the offset in the TextNode before which we will search
*        for the DOM separator. You can pass INT32_MAX to search the entire
*        length of the string.
*/
static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
    nsIContent* aContent, int32_t aBeforeOffset) {
  const nsTextFragment* const fragment = aContent->GetText();
  MOZ_ASSERT(fragment, "Where is our text?");

  // Never search past the end of the text.
  const int32_t searchEnd =
      std::min(aBeforeOffset, int32_t(fragment->GetLength()));

  if (!fragment->Is2b()) {
    // Single-byte text.
    nsDependentCSubstring narrowText(fragment->Get1b(), searchEnd);
    WordSplitState<nsDependentCSubstring> narrowState(narrowText);
    return narrowState.FindOffsetOfLastDOMWordSeparatorSequence(searchEnd);
  }

  // Two-byte (char16_t) text.
  nsDependentSubstring wideText(fragment->Get2b(), searchEnd);
  WordSplitState<nsDependentSubstring> wideState(wideText);
  return wideState.FindOffsetOfLastDOMWordSeparatorSequence(searchEnd);
}

/**
* Check if there's a DOM word separator before aBeforeOffset in this node.
* Always returns true if it's a BR element.
* aSeparatorOffset is set to the index of the first character in the last
* separator if any is found (0 for BR elements).
*
* This function does not modify aSeparatorOffset when it returns false.
*/
static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
                                     int32_t* aSeparatorOffset) {
  // A BR element always counts as a separator, at offset 0.
  if (IsBRElement(aNode)) {
    *aSeparatorOffset = 0;
    return true;
  }

  // Only spell-checkable text nodes can contain separators otherwise.
  if (!IsSpellCheckingTextNode(aNode)) {
    return false;
  }

  const Maybe<int32_t> found = FindOffsetOfLastDOMWordSeparatorSequence(
      aNode->AsContent(), aBeforeOffset);
  if (found.isNothing()) {
    // Leave *aSeparatorOffset untouched.
    return false;
  }

  *aSeparatorOffset = *found;
  return true;
}

// Returns true when aNode is an element that breaks words: a <br>, or an
// element with a primary frame whose display is not inline flow or is a list
// item.
static bool IsBreakElement(nsINode* aNode) {
  if (!aNode->IsElement()) {
    return false;
  }

  dom::Element* const elem = aNode->AsElement();
  if (elem->IsHTMLElement(nsGkAtoms::br)) {
    return true;
  }

  // If we don't have a frame, we don't consider ourselves a break
  // element.  In particular, words can span us.
  nsIFrame* const primaryFrame = elem->GetPrimaryFrame();
  if (!primaryFrame) {
    return false;
  }

  // Anything that's not an inline element is a break element.
  // XXXbz should replaced inlines be break elements, though?
  // Also should inline-block and such be break elements?
  const auto* display = primaryFrame->StyleDisplay();
  if (!display->IsInlineFlow()) {
    return true;
  }

  // FIXME(emilio): We should teach the spell checker to deal with generated
  // content (it doesn't at all), then remove the IsListItem() check, as there
  // could be no marker, etc...
  return display->IsListItem();
}

// Closure passed through FindNextNode to CheckLeavingBreakElement.
struct CheckLeavingBreakElementClosure {
  // Set to true once the traversal has left a break element.
  bool mLeftBreakElement;
};

// OnLeaveNodeFunPtr callback: records in the closure (a
// CheckLeavingBreakElementClosure) that a break element was left.
static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
  auto* const state = static_cast<CheckLeavingBreakElementClosure*>(aClosure);
  if (state->mLeftBreakElement) {
    // Already recorded; no need to inspect this node.
    return;
  }
  if (IsBreakElement(aNode)) {
    state->mLeftBreakElement = true;
  }
}

// Normalizes aWord in place using the file-local ::NormalizeWord helper.
void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
  nsAutoString normalized;
  ::NormalizeWord(aWord, 0, aWord.Length(), normalized);
  aWord.Assign(normalized);
}

// Stores aBegin/aEnd as the soft range, moves mBegin back to a DOM word
// boundary, and then rebuilds mValue (the soft text) and mDOMMapping (which
// records, for each contributing text node, where its characters landed in
// mValue).
//
// The function works in two passes:
//  1. Walk backwards from mBegin until a node containing a DOM word separator,
//     a break element, or the edge of aRootNode's subtree is found.
//  2. Walk forwards from that node appending text, and stop once the soft end
//     has been seen and a separator or break element is reached.
void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
    NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) {
  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));

  mBegin = std::move(aBegin);
  mEnd = std::move(aEnd);

  // First we have to work backwards from mBegin to find a text node
  // containing a DOM word separator, a non-inline-element
  // boundary, or the hard start node. That's where we'll start building the
  // soft string from.
  nsINode* node = mBegin.mNode;
  int32_t firstOffsetInNode = 0;
  // Only the first node is searched up to mBegin.mOffset; every earlier node
  // is searched in full (see the INT32_MAX assignment below).
  int32_t checkBeforeOffset = mBegin.mOffset;
  while (node) {
    if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
      if (node == mBegin.mNode) {
        // If we find a word separator on the first node, look at the preceding
        // word on the text node as well.
        if (firstOffsetInNode > 0) {
          // Try to find the previous word boundary in the current node. If
          // we can't find one, start checking previous sibling nodes (if any
          // adjacent ones exist) to see if we can find any text nodes with
          // DOM word separators. We bail out as soon as we see a node that is
          // not a text node, or we run out of previous sibling nodes. In the
          // event that we simply cannot find any preceding word separator, the
          // offset is set to 0, and the soft text beginning node is set to the
          // "most previous" text node before the original starting node, or
          // kept at the original starting node if no previous text nodes exist.
          int32_t newOffset = 0;
          if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
                                        &newOffset)) {
            nsIContent* prevNode = node->GetPreviousSibling();
            while (prevNode && IsSpellCheckingTextNode(prevNode)) {
              mBegin.mNode = prevNode;
              const Maybe<int32_t> separatorOffset =
                  FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX);
              if (separatorOffset) {
                newOffset = *separatorOffset;
                break;
              }
              prevNode = prevNode->GetPreviousSibling();
            }
          }
          firstOffsetInNode = newOffset;
        } else {
          firstOffsetInNode = 0;
        }

        MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
                ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__,
                 mBegin.mOffset, firstOffsetInNode));
        mBegin.mOffset = firstOffsetInNode;
      }
      break;
    }
    checkBeforeOffset = INT32_MAX;
    if (IsBreakElement(node)) {
      // Since GetPrevNode follows tree *preorder*, we're about to traverse up
      // out of 'node'. Since node induces breaks (e.g., it's a block), don't
      // bother trying to look outside it, just stop now.
      break;
    }
    // GetPrevNode below expects aRootNode to be an ancestor of node.
    if (!node->IsInclusiveDescendantOf(aRootNode)) {
      break;
    }
    node = node->GetPrevNode(aRootNode);
  }

  // Now build up the string moving forward through the DOM until we reach
  // the soft end and *then* see a DOM word separator, a non-inline-element
  // boundary, or the hard end node.
  mValue.Truncate();
  mDOMMapping.Clear();
  bool seenSoftEnd = false;
  // NOTE(review): a comment here used to say a buffer is kept outside the
  // loop so that large heap string allocations can be reused; no such buffer
  // is declared in this function as shown.
  while (node) {
    if (node == mEnd.mNode) {
      seenSoftEnd = true;
    }

    bool exit = false;
    if (IsSpellCheckingTextNode(node)) {
      nsIContent* content = static_cast<nsIContent*>(node);
      MOZ_ASSERT(content, "Where is our content?");
      const nsTextFragment* textFragment = content->GetText();
      MOZ_ASSERT(textFragment, "Where is our text?");
      uint32_t lastOffsetInNode = textFragment->GetLength();

      if (seenSoftEnd) {
        // check whether we can stop after this
        // (in the soft end node itself the scan starts at mEnd.mOffset, in
        // later nodes at 0)
        for (uint32_t i =
                 node == mEnd.mNode ? AssertedCast<uint32_t>(mEnd.mOffset) : 0;
             i < textFragment->GetLength(); ++i) {
          if (IsDOMWordSeparator(textFragment->CharAt(i))) {
            exit = true;
            // stop at the first separator after the soft end point
            lastOffsetInNode = i;
            break;
          }
        }
      }

      // Append this node's contribution [firstOffsetInNode, lastOffsetInNode)
      // and record where it lands in mValue.
      if (firstOffsetInNode >= 0 &&
          static_cast<uint32_t>(firstOffsetInNode) < lastOffsetInNode) {
        const uint32_t len = lastOffsetInNode - firstOffsetInNode;
        mDOMMapping.AppendElement(DOMTextMapping(
            NodeOffset(node, firstOffsetInNode), mValue.Length(), len));

        const bool ok = textFragment->AppendTo(
            mValue, static_cast<uint32_t>(firstOffsetInNode), len,
            mozilla::fallible);
        if (!ok) {
          // probably out of memory, remove from mDOMMapping
          mDOMMapping.RemoveLastElement();
          exit = true;
        }
      }

      // Only the first text node starts at a non-zero offset.
      firstOffsetInNode = 0;
    }

    if (exit) break;

    CheckLeavingBreakElementClosure closure = {false};
    node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure);
    if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
      // We left, or are entering, a break element (e.g., block). Maybe we can
      // stop now.
      if (seenSoftEnd) break;
      // Record the break
      // (the appended space has no mDOMMapping entry)
      mValue.Append(' ');
    }
  }

  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: got DOM string: %s", __FUNCTION__,
           NS_ConvertUTF16toUTF8(mValue).get()));
}

// Tokenizes mSoftText.GetValue() into "real words".
//
// The soft text is scanned once. Every maximal run of characters for which
// IsDOMWordSeparator() is false forms a "DOM word"; each DOM word is handed to
// SplitDOMWordAndAppendTo(), which appends the real words it contains.
//
// Returns the collected real words, or the first failure code reported by
// SplitDOMWordAndAppendTo().
auto mozInlineSpellWordUtil::BuildRealWords() const
    -> Result<RealWords, nsresult> {
  const auto& softText = mSoftText.GetValue();
  const int32_t textLength = static_cast<int32_t>(softText.Length());

  RealWords collected;
  // Start offset of the DOM word currently being scanned; negative while we
  // are between DOM words.
  int32_t openWordStart = -1;

  for (int32_t pos = 0; pos < textLength; ++pos) {
    if (!IsDOMWordSeparator(softText.CharAt(pos))) {
      // Inside a DOM word: remember where it began if this is its first char.
      if (openWordStart < 0) {
        openWordStart = pos;
      }
      continue;
    }

    if (openWordStart < 0) {
      // Separator following another separator (or at the very start).
      continue;
    }

    // A separator closes the DOM word [openWordStart, pos).
    const nsresult rv = SplitDOMWordAndAppendTo(openWordStart, pos, collected);
    if (NS_FAILED(rv)) {
      return Err(rv);
    }
    openWordStart = -1;
  }

  // The text may end in the middle of a DOM word; flush that last one.
  if (openWordStart >= 0) {
    const nsresult rv =
        SplitDOMWordAndAppendTo(openWordStart, textLength, collected);
    if (NS_FAILED(rv)) {
      return Err(rv);
    }
  }

  return collected;
}

/*********** Mapping functions between DOM positions / real words and offsets
 * into mSoftText.GetValue() ************/

// Translates a DOM position (node + offset) into an offset into the soft text.
//
// Only the first DOM mapping whose node equals aNodeOffset.mNode is
// considered. Returns the soft-text offset when the position lies inside, or
// exactly at either end of, the text contributed by that mapping. Returns -1
// when the soft text is invalid, when no mapping refers to the node, or when
// the offset falls outside the contributed range.
int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
    const NodeOffset& aNodeOffset) const {
  if (!mSoftText.mIsValid) {
    NS_ERROR("Soft text must be valid if we're to map into it");
    return -1;
  }

  for (const DOMTextMapping& mapping : mSoftText.GetDOMMapping()) {
    if (mapping.mNodeOffset.mNode != aNodeOffset.mNode) {
      continue;
    }

    // Position relative to the start of the text this mapping contributed.
    const int32_t relativeOffset =
        aNodeOffset.mOffset - mapping.mNodeOffset.mOffset;

    // Both ends are inclusive; in particular the offset just past the last
    // contributed character is accepted.
    const bool withinContribution =
        relativeOffset >= 0 && relativeOffset <= mapping.mLength;
    return withinContribution ? mapping.mSoftTextOffset + relativeOffset : -1;
  }

  return -1;
}

namespace {

// Comparator functor for a binary search over elements exposing an
// |mSoftTextOffset| member. It steers the search toward the first element
// whose offset is strictly greater than the target offset.
template <class T>
class FirstLargerOffset {
 public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
      : mSoftTextOffset(aSoftTextOffset) {}

  // Returns -1 when |aEntry| lies beyond the target offset and 1 otherwise.
  // 0 is deliberately never returned: reporting a match would stop the search
  // at an arbitrary equal element instead of running past the last one.
  int operator()(const T& aEntry) const {
    if (mSoftTextOffset < aEntry.mSoftTextOffset) {
      return -1;
    }
    return 1;
  }

 private:
  int32_t mSoftTextOffset;
};

// Searches |aContainer| with BinarySearchIf and FirstLargerOffset for the last
// element whose |mSoftTextOffset| is <= |aSoftTextOffset| and stores its index
// in |*aIndex|.
//
// Returns false (leaving |*aIndex| untouched) only when the container is
// empty. Note that true is also returned when every element has a larger
// offset; |*aIndex| is 0 in that case, so callers must still compare the
// element's offset against |aSoftTextOffset| themselves.
template <class T>
bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,
                              int32_t aSoftTextOffset, size_t* aIndex) {
  const size_t elementCount = aContainer.Length();
  if (elementCount == 0) {
    return false;
  }

  // After this call |*aIndex| is the position of the first element with a
  // larger offset (FirstLargerOffset never reports an exact match).
  BinarySearchIf(aContainer, 0, elementCount,
                 FirstLargerOffset<T>(aSoftTextOffset), aIndex);

  if (*aIndex == 0) {
    // Nothing precedes the first larger element, i.e. all offsets are greater
    // than aSoftTextOffset.
    MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
    return true;
  }

  // The element just before the first larger one is the last one with
  // |mSoftTextOffset <= aSoftTextOffset|.
  --*aIndex;
  return true;
}

}  // namespace

// Translates an offset into the soft text back into a DOM position.
//
// aHint disambiguates offsets that sit on the boundary between two mappings:
// with HINT_END the end of the preceding mapping is preferred over the start
// of the following one. Returns NodeOffset(nullptr, -1) when the soft text is
// invalid, when there are no mappings, or when the offset is not covered by
// the selected mapping.
NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
    int32_t aSoftTextOffset, DOMMapHint aHint) const {
  MOZ_ASSERT(mSoftText.mIsValid,
             "Soft text must be valid if we're to map out of it");
  if (!mSoftText.mIsValid) {
    return NodeOffset(nullptr, -1);
  }

  const auto& mappings = mSoftText.GetDOMMapping();

  // Locate the last mapping, if any, with mSoftTextOffset <= aSoftTextOffset.
  size_t mappingIndex;
  if (!FindLastNongreaterOffset(mappings, aSoftTextOffset, &mappingIndex)) {
    return NodeOffset(nullptr, -1);
  }

  // For HINT_END, an offset that coincides with the end of the previous
  // mapping is reported as that end rather than as the start of this mapping.
  if (aHint == HINT_END && mappingIndex > 0) {
    const DOMTextMapping& previous = mappings[mappingIndex - 1];
    if (previous.mSoftTextOffset + previous.mLength == aSoftTextOffset) {
      return NodeOffset(previous.mNodeOffset.mNode,
                        previous.mNodeOffset.mOffset + previous.mLength);
    }
  }

  // The end of this mapping is accepted even for HINT_BEGIN. That only
  // happens when no mapping starts at this point. (The original author was
  // not fully sure this is OK.)
  const DOMTextMapping& current = mappings[mappingIndex];
  const int32_t relativeOffset = aSoftTextOffset - current.mSoftTextOffset;
  const bool covered =
      relativeOffset >= 0 && relativeOffset <= current.mLength;
  if (!covered) {
    return NodeOffset(nullptr, -1);
  }

  return NodeOffset(current.mNodeOffset.mNode,
                    current.mNodeOffset.mOffset + relativeOffset);
}

// static
// Writes a short textual name for |aHint| into |aResult|: "begin" for
// HINT_BEGIN and "end" for HINT_END. For any other value |aResult| is left
// unchanged.
void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
                                      nsACString& aResult) {
  switch (aHint) {
    case HINT_BEGIN: {
      aResult.AssignLiteral("begin");
      return;
    }
    case HINT_END: {
      aResult.AssignLiteral("end");
      return;
    }
  }
}

int32_t mozInlineSpellWordUtil::FindRealWordContaining(
    int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {
  if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {
    nsAutoCString hint;
    mozInlineSpellWordUtil::ToString(aHint, hint);

    MOZ_LOG(
        sInlineSpellWordUtilLog, LogLevel::Debug,
        ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,
         aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward)));
  }

  MOZ_ASSERT(mSoftText.mIsValid,
             "Soft text must be valid if we're to map out of it");
  if (!mSoftText.mIsValid) return -1;

  // Find the last word, if any, such that mRealWords[index].mSoftTextOffset
  // <= aSoftTextOffset
  size_t index;
  bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
  if (!found) {
    return -1;
  }

  // 'index' is now the last word, if any, such that
  // mSoftTextOffset <= aSoftTextOffset.
  // If we're doing HINT_END, then we may want to return the end of the
  // the previous word instead of the start of this word
  if (aHint == HINT_END && index > 0) {
    const RealWord& word = mRealWords[index - 1];
    if (word.EndOffset() == aSoftTextOffset) {
      return index - 1;
    }
  }

  // We allow ourselves to return the end of this word even if we're
  // doing HINT_BEGIN. This will only happen if there is no word which this
  // point is the start of. I'm not 100% sure this is OK...
  const RealWord& word = mRealWords[index];
  int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
  if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index;

  if (aSearchForward) {
    if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
      // All words have mSoftTextOffset > aSoftTextOffset
      return 0;
    }
    // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
    // Word index+1, if it exists, will be the first with
    // mSoftTextOffset > aSoftTextOffset.
    if (index + 1 < mRealWords.Length()) return index + 1;
  }

  return -1;
}

// mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
//
//    Splits the DOM word occupying [aStart, aEnd) of the soft text into real
//    words and appends them to aRealWords.
//
//    If, after skipping leading separators, the remainder is recognised by
//    WordSplitState::IsSpecialWord(), it is appended as a single RealWord
//    whose last constructor argument is false. Otherwise every word found by
//    AdvanceThroughWord() is appended, with that argument set to the negation
//    of ShouldSkipWord().
//
//    Returns NS_ERROR_OUT_OF_MEMORY if appending fails, NS_OK otherwise.

nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
    int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const {
  nsDependentSubstring domWord(mSoftText.GetValue(), aStart, aEnd - aStart);
  WordSplitState<nsDependentSubstring> splitter(domWord);
  splitter.mCurCharClass = splitter.ClassifyCharacter(0, true);

  splitter.AdvanceThroughSeparators();

  const bool hasContent = splitter.mCurCharClass != CHAR_CLASS_END_OF_INPUT;
  if (hasContent && splitter.IsSpecialWord()) {
    // The special word runs from the current position to the end of the DOM
    // word and is recorded as one entry.
    const int32_t remainingLength =
        splitter.mDOMWordText.Length() - splitter.mDOMWordOffset;
    const bool appended = !!aRealWords.AppendElement(
        RealWord(aStart + splitter.mDOMWordOffset, remainingLength, false),
        fallible);
    return appended ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
  }

  while (splitter.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
    splitter.AdvanceThroughSeparators();
    if (splitter.mCurCharClass == CHAR_CLASS_END_OF_INPUT) {
      break;
    }

    // Remember where this word begins, then run to its end.
    const int32_t beginInDOMWord = splitter.mDOMWordOffset;
    splitter.AdvanceThroughWord();
    const int32_t wordLength = splitter.mDOMWordOffset - beginInDOMWord;

    const bool appended = !!aRealWords.AppendElement(
        RealWord(aStart + beginInDOMWord, wordLength,
                 !splitter.ShouldSkipWord(beginInDOMWord, wordLength)),
        fallible);
    if (!appended) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
  }

  return NS_OK;
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.24 Sekunden (vorverarbeitet am 2026-04-27) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.