/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef Tokenizer_h__
#define Tokenizer_h__
#include <type_traits>
#include "nsString.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/ScopeExit.h"
#include "mozilla/UniquePtr.h"
#include "nsTArray.h"
namespace mozilla {
template <
typename TChar>
class TokenizerBase {
public:
typedef nsTSubstring<TChar> TAString;
typedef nsTString<TChar> TString;
typedef nsTDependentString<TChar> TDependentString;
typedef nsTDependentSubstring<TChar> TDependentSubstring;
static TChar
const sWhitespaces[];
/**
* The analyzer works with elements in the input cut to a sequence of token
* where each token has an elementary type
*/
enum TokenType : uint32_t {
TOKEN_UNKNOWN,
TOKEN_RAW,
TOKEN_ERROR,
TOKEN_INTEGER,
TOKEN_WORD,
TOKEN_CHAR,
TOKEN_WS,
TOKEN_EOL,
TOKEN_EOF,
TOKEN_CUSTOM0 = 1000
};
enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE };
/**
* Class holding the type and the value of a token. It can be manually
* created to allow checks against it via methods of TTokenizer or are results
* of some of the TTokenizer's methods.
*/
class Token {
TokenType mType;
TDependentSubstring mWord;
TString mCustom;
TChar mChar;
uint64_t mInteger;
ECaseSensitivity mCustomCaseInsensitivity;
bool mCustomEnabled;
// If this token is a result of the parsing process, this member is
// referencing a sub-string in the input buffer. If this is externally
// created Token this member is left an empty string.
TDependentSubstring mFragment;
friend class TokenizerBase<TChar>;
void AssignFragment(
typename TAString::const_char_iterator begin,
typename TAString::const_char_iterator end);
static Token Raw();
public:
Token();
Token(
const Token& aOther);
Token&
operator=(
const Token& aOther);
// Static constructors of tokens by type and value
static Token Word(TAString
const& aWord);
static Token
Char(TChar
const aChar);
static Token Number(uint64_t
const aNumber);
static Token Whitespace();
static Token NewLine();
static Token EndOfFile();
static Token Error();
// Compares the two tokens, type must be identical and value
// of one of the tokens must be 'any' or equal.
bool Equals(
const Token& aOther)
const;
TokenType Type()
const {
return mType; }
TChar AsChar()
const;
TDependentSubstring AsString()
const;
uint64_t AsInteger()
const;
TDependentSubstring Fragment()
const {
return mFragment; }
};
/**
* Consumers may register a custom string that, when found in the input, is
* considered a token and returned by Next*() and accepted by Check*()
* methods. AddCustomToken() returns a reference to a token that can then be
* comapred using Token::Equals() againts the output from Next*() or be passed
* to Check*().
*/
Token AddCustomToken(
const TAString& aValue,
ECaseSensitivity aCaseInsensitivity,
bool aEnabled =
true);
template <uint32_t N>
Token AddCustomToken(
const TChar (&aValue)[N],
ECaseSensitivity aCaseInsensitivity,
bool aEnabled =
true) {
return AddCustomToken(TDependentSubstring(aValue, N - 1),
aCaseInsensitivity, aEnabled);
}
void RemoveCustomToken(Token& aToken);
/**
* Only applies to a custom type of a Token (see AddCustomToken above.)
* This turns on and off token recognition. When a custom token is disabled,
* it's ignored as never added as a custom token.
*/
void EnableCustomToken(Token
const& aToken,
bool aEnable);
/**
* Mode of tokenization.
* FULL tokenization, the default, recognizes built-in tokens and any custom
* tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest
* is seen as 'raw'. This mode can be understood as a 'binary' mode.
*/
enum class Mode { FULL, CUSTOM_ONLY };
void SetTokenizingMode(Mode aMode);
/**
* Return false iff the last Check*() call has returned false or when we've
* read past the end of the input string.
*/
[[nodiscard]]
bool HasFailed()
const;
protected:
explicit TokenizerBase(
const TChar* aWhitespaces = nullptr,
const TChar* aAdditionalWordChars = nullptr);
// false if we have already read the EOF token.
bool HasInput()
const;
// Main parsing function, it doesn't shift the read cursor, just returns the
// next token position.
typename TAString::const_char_iterator Parse(Token& aToken)
const;
// Is read cursor at the end?
bool IsEnd(
const typename TAString::const_char_iterator& caret)
const;
// True, when we are at the end of the input data, but it has not been marked
// as complete yet. In that case we cannot proceed with providing a
// multi-TChar token.
bool IsPending(
const typename TAString::const_char_iterator& caret)
const;
// Is read cursor on a character that is a word start?
bool IsWordFirst(
const TChar aInput)
const;
// Is read cursor on a character that is an in-word letter?
bool IsWord(
const TChar aInput)
const;
// Is read cursor on a character that is a valid number?
// TODO - support multiple radix
bool IsNumber(
const TChar aInput)
const;
// Is equal to the given custom token?
bool IsCustom(
const typename TAString::const_char_iterator& caret,
const Token& aCustomToken, uint32_t* aLongest = nullptr)
const;
// Friendly helper to assign a fragment on a Token
static void AssignFragment(Token& aToken,
typename TAString::const_char_iterator begin,
typename TAString::const_char_iterator end);
#ifdef DEBUG
// This is called from inside Tokenizer methods to make sure the token is
// valid.
void Validate(Token
const& aToken);
#endif
// true iff we have already read the EOF token
bool mPastEof;
// true iff the last Check*() call has returned false, reverts to true on
// Rollback() call
bool mHasFailed;
// true if the input string is final (finished), false when we expect more
// data yet to be fed to the tokenizer (see IncrementalTokenizer derived
// class).
bool mInputFinished;
// custom only vs full tokenizing mode, see the Parse() method
Mode mMode;
// minimal raw data chunked delivery during incremental feed
uint32_t mMinRawDelivery;
// Customizable list of whitespaces
const TChar* mWhitespaces;
// Additinal custom word characters
const TChar* mAdditionalWordChars;
// All these point to the original buffer passed to the constructor or to the
// incremental buffer after FeedInput.
typename TAString::const_char_iterator
mCursor;
// Position of the current (actually next to read) token start
typename TAString::const_char_iterator mEnd;
// End of the input position
// This is the list of tokens user has registered with AddCustomToken()
nsTArray<UniquePtr<Token>> mCustomTokens;
uint32_t mNextCustomTokenID;
private:
TokenizerBase() =
delete;
TokenizerBase(
const TokenizerBase&) =
delete;
TokenizerBase(TokenizerBase&&) =
delete;
TokenizerBase(
const TokenizerBase&&) =
delete;
TokenizerBase&
operator=(
const TokenizerBase&) =
delete;
};
/**
* This is a simple implementation of a lexical analyzer or maybe better
* called a tokenizer.
*
* Please use Tokenizer or Tokenizer16 classes, that are specializations
* of this template class. Tokenizer is for ASCII input, Tokenizer16 may
* handle char16_t input, but doesn't recognize whitespaces or numbers
* other than standard `char` specialized Tokenizer class.
*/
template <typename TChar>
class TTokenizer : public TokenizerBase<TChar> {
 public:
  using base = TokenizerBase<TChar>;

  /**
   * @param aSource
   *    The string to parse.
   *    IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer
   *    lifetime. It's up to the consumer to make sure the string's buffer
   *    outlives the TTokenizer!
   * @param aWhitespaces
   *    If non-null TTokenizer will use this custom set of whitespaces for
   *    CheckWhite() and SkipWhites() calls. By default the list consists of
   *    space and tab.
   * @param aAdditionalWordChars
   *    If non-null it will be added to the list of characters that make up a
   *    word. This is useful when you want to accept e.g. '-' in HTTP headers.
   *    By default a word character is considered to be any character for
   *    which upper case is different from lower case.
   *
   * If there is an overlap between aWhitespaces and aAdditionalWordChars, the
   * check for word characters is made first.
   */
  explicit TTokenizer(const typename base::TAString& aSource,
                      const TChar* aWhitespaces = nullptr,
                      const TChar* aAdditionalWordChars = nullptr);
  explicit TTokenizer(const TChar* aSource,
                      const TChar* aWhitespaces = nullptr,
                      const TChar* aAdditionalWordChars = nullptr);

  /**
   * When there is still anything to read from the input, tokenize it, store
   * the token type and value to aToken result and shift the cursor past this
   * just parsed token. Each call to Next() reads another token from the input
   * and shifts the cursor. Returns false if we have passed the end of the
   * input.
   */
  [[nodiscard]] bool Next(typename base::Token& aToken);

  /**
   * Parse the token on the input read cursor position, check its type is
   * equal to aTokenType and if so, put it into aResult, shift the cursor and
   * return true. Otherwise, leave the input read cursor position intact and
   * return false.
   */
  [[nodiscard]] bool Check(const typename base::TokenType aTokenType,
                           typename base::Token& aResult);

  /**
   * Same as above method, just compares both token type and token value
   * passed in aToken. When both the type and the value are equal, shift the
   * cursor and return true. Otherwise return false.
   */
  [[nodiscard]] bool Check(const typename base::Token& aToken);

  /**
   * SkipWhites method (below) may also skip new line characters
   * automatically.
   */
  enum WhiteSkipping {
    /**
     * SkipWhites will only skip what is defined as a white space (default).
     */
    DONT_INCLUDE_NEW_LINE = 0,
    /**
     * SkipWhites will skip defined white spaces as well as new lines
     * automatically.
     */
    INCLUDE_NEW_LINE = 1
  };

  /**
   * Skips any occurrence of whitespaces specified in mWhitespaces member,
   * optionally skip also new lines.
   */
  void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);

  /**
   * Skips all tokens until the given one is found or EOF is hit. The token
   * or EOF are next to read.
   */
  void SkipUntil(const typename base::Token& aToken);

  // These are mostly shortcuts for the Check() methods above.

  /**
   * Check whitespace character is present.
   */
  [[nodiscard]] bool CheckWhite() { return Check(base::Token::Whitespace()); }

  /**
   * Check there is a single character on the read cursor position. If so,
   * shift the read cursor position and return true. Otherwise false.
   */
  [[nodiscard]] bool CheckChar(const TChar aChar) {
    return Check(base::Token::Char(aChar));
  }

  /**
   * This is a customizable version of CheckChar. aClassifier is a function
   * called with value of the character on the current input read position.
   * If this user function returns true, read cursor is shifted and true
   * returned. Otherwise false. The user classification function is not called
   * when we are at or past the end and false is immediately returned.
   */
  [[nodiscard]] bool CheckChar(bool (*aClassifier)(const TChar aChar));

  /**
   * Check for a whole expected word.
   */
  [[nodiscard]] bool CheckWord(const typename base::TAString& aWord) {
    return Check(base::Token::Word(aWord));
  }

  /**
   * Shortcut for literal const word check with compile time length
   * calculation.
   */
  template <uint32_t N>
  [[nodiscard]] bool CheckWord(const TChar (&aWord)[N]) {
    return Check(
        base::Token::Word(typename base::TDependentString(aWord, N - 1)));
  }

  /**
   * Helper to check for a string compound of multiple tokens like "foo bar".
   * The match is binary-exact, a white space or a delimiter character in the
   * phrase must match exactly the characters in the input.
   */
  [[nodiscard]] bool CheckPhrase(const typename base::TAString& aPhrase);

  /**
   * Shortcut for a literal const phrase check with compile time length
   * calculation.
   */
  template <uint32_t N>
  [[nodiscard]] bool CheckPhrase(const TChar (&aPhrase)[N]) {
    return CheckPhrase(typename base::TDependentString(aPhrase, N - 1));
  }

  /**
   * Checks \r, \n or \r\n.
   */
  [[nodiscard]] bool CheckEOL() { return Check(base::Token::NewLine()); }

  /**
   * Checks we are at the end of the input string reading. If so, shift past
   * the end and returns true. Otherwise does nothing and returns false.
   */
  [[nodiscard]] bool CheckEOF() { return Check(base::Token::EndOfFile()); }

  /**
   * These are shortcuts to obtain the value immediately when the token type
   * matches.
   */
  [[nodiscard]] bool ReadChar(TChar* aValue);
  [[nodiscard]] bool ReadChar(bool (*aClassifier)(const TChar aChar),
                              TChar* aValue);
  [[nodiscard]] bool ReadWord(typename base::TAString& aValue);
  [[nodiscard]] bool ReadWord(typename base::TDependentSubstring& aValue);

  /**
   * This is an integer read helper. It returns false and doesn't move the
   * read cursor when any of the following happens:
   *  - the token at the read cursor is not an integer
   *  - the final number doesn't fit the T type
   * Otherwise true is returned, aValue is filled with the integral number
   * and the cursor is moved forward.
   *
   * @param aValue  Output location; must not be null (release-asserted).
   */
  template <typename T>
  [[nodiscard]] bool ReadInteger(T* aValue) {
    MOZ_RELEASE_ASSERT(aValue);

    // Remember the state so it can be restored when the value doesn't fit T.
    typename base::TAString::const_char_iterator rollback = mRollback;
    typename base::TAString::const_char_iterator cursor = base::mCursor;

    typename base::Token t;
    if (!Check(base::TOKEN_INTEGER, t)) {
      return false;
    }

    mozilla::CheckedInt<T> checked(t.AsInteger());
    if (!checked.isValid()) {
      // Move to a state as if Check() call has failed
      mRollback = rollback;
      base::mCursor = cursor;
      base::mHasFailed = true;
      return false;
    }

    *aValue = checked.value();
    return true;
  }

  /**
   * Same as above, but accepts an integer with an optional minus sign.
   * Only available when T is a signed type.
   *
   * @param aValue  Output location; must not be null (release-asserted).
   */
  template <typename T,
            typename V = std::enable_if_t<
                std::is_signed_v<std::remove_pointer_t<T>>,
                std::remove_pointer_t<T>>>
  [[nodiscard]] bool ReadSignedInteger(T* aValue) {
    MOZ_RELEASE_ASSERT(aValue);

    typename base::TAString::const_char_iterator rollback = mRollback;
    typename base::TAString::const_char_iterator cursor = base::mCursor;
    // Every early return below restores the saved state; the guard is only
    // released on success.
    auto revert = MakeScopeExit([&] {
      // Move to a state as if Check() call has failed
      mRollback = rollback;
      base::mCursor = cursor;
      base::mHasFailed = true;
    });

    // Using functional raw access because '-' could be part of the word set
    // making CheckChar('-') not work.
    bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; });

    typename base::Token t;
    if (!Check(base::TOKEN_INTEGER, t)) {
      return false;
    }

    mozilla::CheckedInt<T> checked(t.AsInteger());
    if (minus) {
      checked *= -1;
    }

    if (!checked.isValid()) {
      return false;
    }

    *aValue = checked.value();
    revert.release();
    return true;
  }

  /**
   * This is a hexadecimal read helper. It returns false and doesn't move the
   * read cursor when any of the following happens:
   *  - aPrefixed is true and the input at the read cursor is not an integer
   *    token of value 0 immediately followed by a lower-case 'x'
   *  - no hexadecimal digit (0-9, A-F, a-f) follows
   *  - the final number doesn't fit the T type
   * Otherwise true is returned, aValue is filled with the integral number
   * and the cursor is moved forward.
   *
   * @param aValue     Output location; must not be null (release-asserted).
   * @param aPrefixed  When true (the default) a leading "0x" is required.
   */
  template <typename T>
  [[nodiscard]] bool ReadHexadecimal(T* aValue, bool aPrefixed = true) {
    MOZ_RELEASE_ASSERT(aValue);

    typename base::TAString::const_char_iterator rollback = mRollback;
    typename base::TAString::const_char_iterator cursor = base::mCursor;
    // Every early return below restores the saved state; the guard is only
    // released on success.
    auto revert = MakeScopeExit([&] {
      // Move to a state as if Check() call has failed
      mRollback = rollback;
      base::mCursor = cursor;
      base::mHasFailed = true;
    });

    if (aPrefixed) {
      typename base::Token t;
      // The prefix must be an integer token AND that integer must be 0.
      // Using || (not &&) makes sure a non-zero integer is rejected and that
      // AsInteger() is only evaluated after Check() has succeeded.
      if (!Check(base::TOKEN_INTEGER, t) || t.AsInteger() != 0) {
        return false;
      }

      // Raw character access: 'x' would otherwise be tokenized as (a part
      // of) a word.
      if (!CheckChar([](const TChar aChar) { return aChar == 'x'; })) {
        return false;
      }
    }

    // 'z' is not a hexadecimal digit, so it serves as a marker that no digit
    // has been read at all.
    TChar c = 'z';
    mozilla::CheckedInt<T> resultingNumber = 0;
    while (ReadChar(
        [](const TChar aChar) {
          return (aChar >= '0' && aChar <= '9') ||
                 (aChar >= 'A' && aChar <= 'F') ||
                 (aChar >= 'a' && aChar <= 'f');
        },
        &c)) {
      resultingNumber *= 16;
      // The classifier guarantees c is within one of the three digit ranges,
      // so upper-bound comparisons are enough to tell them apart.
      if (c <= '9') {
        resultingNumber += static_cast<uint64_t>(c - '0');
      } else if (c <= 'F') {
        resultingNumber += static_cast<uint64_t>(c - 'A') + 0xa;
      } else {
        resultingNumber += static_cast<uint64_t>(c - 'a') + 0xa;
      }
    }

    if (c == 'z' || !resultingNumber.isValid()) {
      return false;
    }

    *aValue = resultingNumber.value();
    revert.release();
    return true;
  }

  /**
   * Returns the read cursor position back as it was before the last call of
   * any parsing method of TTokenizer (Next, Check*, Skip*, Read*) so that the
   * last operation can be repeated. Rollback cannot be used multiple times,
   * it only reverts the last successful parse operation. It also cannot be
   * used before any parsing operation has been called on the TTokenizer.
   */
  void Rollback();

  /**
   * Record() and Claim() are collecting the input as it is being parsed to
   * obtain a substring between particular syntax boundaries defined by any
   * recursive descent parser or simple parser the TTokenizer is used to read
   * the input for. Inclusion of a token that has just been parsed can be
   * controlled using an argument.
   */
  enum ClaimInclusion {
    /**
     * Include resulting (or passed) token of the last lexical analyzer
     * operation in the result.
     */
    INCLUDE_LAST,
    /**
     * Do not include it.
     */
    EXCLUDE_LAST
  };

  /**
   * Start the process of recording. Based on aInclude value the beginning of
   * the recorded sub-string is at the current position (EXCLUDE_LAST) or at
   * the position before the last parsed token (INCLUDE_LAST).
   */
  void Record(ClaimInclusion aInclude = EXCLUDE_LAST);

  /**
   * Claim result of the record started with Record() call before. Depending
   * on aInclude the ending of the sub-string result includes or excludes the
   * last parsed or checked token.
   */
  void Claim(typename base::TAString& aResult,
             ClaimInclusion aInclude = EXCLUDE_LAST);
  void Claim(typename base::TDependentSubstring& aResult,
             ClaimInclusion aInclude = EXCLUDE_LAST);

  /**
   * If aToken is found, aResult is set to the substring between the current
   * position and the position of aToken, potentially including aToken
   * depending on aInclude.
   * If aToken isn't found aResult is set to the substring between the current
   * position and the end of the string.
   * If aToken is found, the method returns true. Otherwise it returns false.
   *
   * Calling Rollback() after ReadUntil() will return the read cursor to the
   * position it had before ReadUntil was called.
   */
  [[nodiscard]] bool ReadUntil(const typename base::Token& aToken,
                               typename base::TDependentSubstring& aResult,
                               ClaimInclusion aInclude = EXCLUDE_LAST);
  [[nodiscard]] bool ReadUntil(const typename base::Token& aToken,
                               typename base::TAString& aResult,
                               ClaimInclusion aInclude = EXCLUDE_LAST);

 protected:
  // All these point to the original buffer passed to the TTokenizer's
  // constructor.
  // Position where the recorded sub-string for Claim() is.
  typename base::TAString::const_char_iterator mRecord;
  // Position of the previous token start.
  typename base::TAString::const_char_iterator mRollback;

 private:
  // Not default-constructible, copyable or movable.
  TTokenizer() = delete;
  TTokenizer(const TTokenizer&) = delete;
  TTokenizer(TTokenizer&&) = delete;
  TTokenizer(const TTokenizer&&) = delete;
  TTokenizer& operator=(const TTokenizer&) = delete;
};
// Ready-to-use tokenizer types: Tokenizer works on `char` input,
// Tokenizer16 on `char16_t` input.
using Tokenizer = TTokenizer<char>;
using Tokenizer16 = TTokenizer<char16_t>;
}
// namespace mozilla
#endif // Tokenizer_h__