/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
using mozilla::AsciiAlphanumericToNumber; using mozilla::AssertedCast; using mozilla::DecodeOneUtf8CodePoint; using mozilla::IsAscii; using mozilla::IsAsciiAlpha; using mozilla::IsAsciiDigit; using mozilla::IsAsciiHexDigit; using mozilla::IsTrailingUnit; using mozilla::MakeScopeExit; using mozilla::Maybe; using mozilla::PointerRangeSize; using mozilla::Span; using mozilla::Utf8Unit;
using JS::ReadOnlyCompileOptions; using JS::RegExpFlag; using JS::RegExpFlags;
// One reserved-word table entry: the word's spelling together with the token
// kind stored for it.
struct ReservedWordInfo {
  // C string with reserved word text.
  const char* chars;
  // Token kind recorded alongside |chars|.
  js::frontend::TokenKind tokentype;
};
// Returns a ReservedWordInfo for the specified characters, or nullptr if the // string is not a reserved word. template <typename CharT> staticconst ReservedWordInfo* FindReservedWord(const CharT* s, size_t length) {
MOZ_ASSERT(length != 0);
// Sets up the line-start table: remembers |initialLineNumber| as the number
// of the first line, resets the lookup cache index to zero, and seeds
// |lineStartOffsets_| with |initialOffset| followed by the MAX_PTR sentinel.
SourceCoords::SourceCoords(FrontendContext* fc, uint32_t initialLineNumber,
                           uint32_t initialOffset)
    : lineStartOffsets_(fc), initialLineNum_(initialLineNumber), lastIndex_(0) {
  // The local copy of MAX_PTR is load-bearing.  Appending MAX_PTR directly
  // causes compile errors on GCC and clang.  Declaring
  //
  //   const uint32_t SourceCoords::MAX_PTR;
  //
  // cures GCC/clang but causes bustage on Windows, so copy the value instead.
  uint32_t sentinel = MAX_PTR;

  // Two entries are needed: the start of the first line, then the sentinel.
  // |lineStartOffsets_| has statically-allocated elements, so neither the
  // reservation nor the appends can fail.
  MOZ_ASSERT(lineStartOffsets_.capacity() >= 2);
  MOZ_ALWAYS_TRUE(lineStartOffsets_.reserve(2));
  lineStartOffsets_.infallibleAppend(initialOffset);
  lineStartOffsets_.infallibleAppend(sentinel);
}
if (index == sentinelIndex) { // We haven't seen this newline before. Update lineStartOffsets_ // only if lineStartOffsets_.append succeeds, to keep sentinel. // Otherwise return false to tell TokenStream about OOM.
uint32_t maxPtr = MAX_PTR; if (!lineStartOffsets_.append(maxPtr)) {
static_assert(std::is_same_v<decltype(lineStartOffsets_.allocPolicy()),
TempAllocPolicy&>, "this function's caller depends on it reporting an " "error on failure, as TempAllocPolicy ensures"); returnfalse;
}
lineStartOffsets_[index] = lineStartOffset;
} else { // We have seen this newline before (and ungot it). Do nothing (other // than checking it hasn't mysteriously changed). // This path can be executed after hitting OOM, so check index.
MOZ_ASSERT_IF(index < sentinelIndex,
lineStartOffsets_[index] == lineStartOffset);
} returntrue;
}
for (size_t i = sentinelIndex + 1; i < other.lineStartOffsets_.length();
i++) { if (!lineStartOffsets_.append(other.lineStartOffsets_[i])) { returnfalse;
}
} returntrue;
}
if (lineStartOffsets_[lastIndex_] <= offset) { // If we reach here, offset is on a line the same as or higher than // last time. Check first for the +0, +1, +2 cases, because they // typically cover 85--98% of cases. if (offset < lineStartOffsets_[lastIndex_ + 1]) { return lastIndex_; // index is same as last time
}
// If we reach here, there must be at least one more entry (plus the // sentinel). Try it.
lastIndex_++; if (offset < lineStartOffsets_[lastIndex_ + 1]) { return lastIndex_; // index is one higher than last time
}
// The same logic applies here.
lastIndex_++; if (offset < lineStartOffsets_[lastIndex_ + 1]) { return lastIndex_; // index is two higher than last time
}
// No luck. Oh well, we have a better-than-default starting point for // the binary search.
iMin = lastIndex_ + 1;
MOZ_ASSERT(iMin <
lineStartOffsets_.length() - 1); // -1 due to the sentinel
} else {
iMin = 0;
}
// This is a binary search with deferred detection of equality, which was // marginally faster in this case than a standard binary search. // The -2 is because |lineStartOffsets_.length() - 1| is the sentinel, and we // want one before that.
iMax = lineStartOffsets_.length() - 2; while (iMax > iMin) {
iMid = iMin + (iMax - iMin) / 2; if (offset >= lineStartOffsets_[iMid + 1]) {
iMin = iMid + 1; // offset is above lineStartOffsets_[iMid]
} else {
iMax = iMid; // offset is below or within lineStartOffsets_[iMid]
}
}
bool TokenStreamAnyChars::checkOptions() { // Constrain starting columns to where they will saturate. if (options().column.oneOriginValue() >
JS::LimitedColumnNumberOneOrigin::Limit) {
reportErrorNoOffset(JSMSG_BAD_COLUMN_NUMBER); returnfalse;
}
// |limit| is a code point boundary. if (MOZ_UNLIKELY(*ptr == limit)) { return;
}
// Otherwise rewind past trailing units to the start of the code point. #ifdef DEBUG
size_t retracted = 0; #endif while (MOZ_UNLIKELY(IsTrailingUnit((*ptr)[0]))) {
--*ptr; #ifdef DEBUG
retracted++; #endif
}
MOZ_ASSERT(retracted < 4, "the longest UTF-8 code point is four units, so this should never " "retract more than three units");
}
// |limit| is a code point boundary. if (MOZ_UNLIKELY(*ptr == limit)) { return;
}
// Otherwise the pointer must be retracted by one iff it splits a two-unit // code point. if (MOZ_UNLIKELY(unicode::IsTrailSurrogate((*ptr)[0]))) { // Outside test suites testing garbage WTF-16, it's basically guaranteed // here that |(*ptr)[-1] (*ptr)[0]| is a surrogate pair. if (MOZ_LIKELY(unicode::IsLeadSurrogate((*ptr)[-1]))) {
--*ptr;
}
}
}
// Reset the previous offset/column number offset cache for this line, if the // previous lookup wasn't on this line. if (line != lineOfLastColumnComputation_) {
lineOfLastColumnComputation_ = line;
lastChunkVectorForLine_ = nullptr;
lastOffsetOfComputedColumn_ = start;
lastComputedColumnOffset_ = JS::ColumnNumberUnsignedOffset::zero();
}
// Compute and return the final column number offset from a partially // calculated offset/column number offset, using the last-cached // offset/column number offset if they're more optimal. auto OffsetFromPartial =
[this, offset, &sourceUnits](
uint32_t partialOffset,
JS::ColumnNumberUnsignedOffset partialColumnOffset,
UnitsType unitsType) {
MOZ_ASSERT(partialOffset <= offset);
// If the last lookup on this line was closer to |offset|, use it. if (partialOffset < this->lastOffsetOfComputedColumn_ &&
this->lastOffsetOfComputedColumn_ <= offset) {
partialOffset = this->lastOffsetOfComputedColumn_;
partialColumnOffset = this->lastComputedColumnOffset_;
}
const Unit* begin = sourceUnits.codeUnitPtrAt(partialOffset); const Unit* end = sourceUnits.codeUnitPtrAt(offset);
// We won't add an entry to |longLineColumnInfo_| for lines where the maximum // column has offset less than this value. The most common (non-minified) // long line length is likely 80ch, maybe 100ch, so we use that, rounded up to // the next power of two for efficient division/multiplication below.
constexpr uint32_t ColumnChunkLength = mozilla::tl::RoundUpPow2<100>::value;
// The index within any associated |Vector<ChunkInfo>| of |offset|'s chunk. const uint32_t chunkIndex = offsetInLine / ColumnChunkLength; if (chunkIndex == 0) { // We don't know from an |offset| in the zeroth chunk that this line is even // long. First-chunk info is mostly useless, anyway -- we have |start| // already. So if we have *easy* access to that zeroth chunk, use it -- // otherwise just count pessimally. (This will still benefit from caching // the last column/offset for computations for successive offsets, so it's // not *always* worst-case.)
UnitsType unitsType; if (lastChunkVectorForLine_ && lastChunkVectorForLine_->length() > 0) {
MOZ_ASSERT((*lastChunkVectorForLine_)[0].columnOffset() ==
JS::ColumnNumberUnsignedOffset::zero());
unitsType = (*lastChunkVectorForLine_)[0].unitsType();
} else {
unitsType = UnitsType::PossiblyMultiUnit;
}
// If this line has no chunk vector yet, insert one in the hash map. (The // required index is allocated and filled further down.) if (!lastChunkVectorForLine_) { auto ptr = longLineColumnInfo_.lookupForAdd(line); if (!ptr) { // This could rehash and invalidate a cached vector pointer, but the outer // condition means we don't have a cached pointer. if (!longLineColumnInfo_.add(ptr, line, Vector<ChunkInfo>(fc))) { // In case of OOM, just count columns from the start of the line.
fc->recoverFromOutOfMemory(); return OffsetFromPartial(start, JS::ColumnNumberUnsignedOffset::zero(),
UnitsType::PossiblyMultiUnit);
}
}
// Note that adding elements to this vector won't invalidate this pointer.
lastChunkVectorForLine_ = &ptr->value();
}
auto entriesLen = AssertedCast<uint32_t>(lastChunkVectorForLine_->length()); if (chunkIndex < entriesLen) { // We've computed the chunk |offset| resides in. Compute the column number // from the chunk.
partialOffset = RetractedOffsetOfChunk(chunkIndex);
partialColumnOffset = (*lastChunkVectorForLine_)[chunkIndex].columnOffset();
// This is exact if |chunkIndex| isn't the last chunk.
unitsType = (*lastChunkVectorForLine_)[chunkIndex].unitsType();
// Otherwise the last chunk is pessimistically assumed to contain multi-unit // code points because we haven't fully examined its contents yet -- they // may not have been tokenized yet, they could contain encoding errors, or // they might not even exist.
MOZ_ASSERT_IF(chunkIndex == entriesLen - 1,
(*lastChunkVectorForLine_)[chunkIndex].unitsType() ==
UnitsType::PossiblyMultiUnit);
} else { // Extend the vector from its last entry or the start of the line. (This is // also a suitable partial start point if we must recover from OOM.) if (entriesLen > 0) {
partialOffset = RetractedOffsetOfChunk(entriesLen - 1);
partialColumnOffset =
(*lastChunkVectorForLine_)[entriesLen - 1].columnOffset();
} else {
partialOffset = start;
partialColumnOffset = JS::ColumnNumberUnsignedOffset::zero();
}
if (!lastChunkVectorForLine_->reserve(chunkIndex + 1)) { // As earlier, just start from the greatest offset/column in case of OOM.
fc->recoverFromOutOfMemory(); return OffsetFromPartial(partialOffset, partialColumnOffset,
UnitsType::PossiblyMultiUnit);
}
// OOM is no longer possible now. \o/
// The vector always begins with the column of the line start, i.e. zero, // with chunk units pessimally assumed not single-unit. if (entriesLen == 0) {
lastChunkVectorForLine_->infallibleAppend(
ChunkInfo(JS::ColumnNumberUnsignedOffset::zero(),
UnitsType::PossiblyMultiUnit));
entriesLen++;
}
do { const Unit* const begin = sourceUnits.codeUnitPtrAt(partialOffset); const Unit* chunkLimit = sourceUnits.codeUnitPtrAt(
start + std::min(entriesLen++ * ColumnChunkLength, offsetInLine));
static_assert(
ColumnChunkLength > SourceUnitTraits<Unit>::maxUnitsLength - 1, "any retraction below is assumed to never underflow to the " "preceding chunk, even for the longest code point");
// Prior tokenizing ensured that [begin, limit) is validly encoded, and // |begin < chunkLimit|, so any retraction here can't underflow.
RetractPointerToCodePointBoundary(&chunkLimit, limit);
// If this chunk (which will become non-final at the end of the loop) is // all single-unit code points, annotate the chunk accordingly. if (numUnits == numUTF16CodeUnits) {
lastChunkVectorForLine_->back().guaranteeSingleUnits();
}
lastChunkVectorForLine_->infallibleEmplaceBack(
partialColumnOffset, UnitsType::PossiblyMultiUnit);
} while (entriesLen < chunkIndex + 1);
// We're at a spot in the current final chunk, and final chunks never have // complete units information, so be pessimistic.
unitsType = UnitsType::PossiblyMultiUnit;
}
bool canAddLineOfContext = fillExceptingContext(&err, offset); if (canAddLineOfContext) { if (!internalComputeLineOfContext(&err, offset)) { break;
}
// As this is an encoding error, the computed window-end must be // identical to the location of the error -- any further on and the // window would contain invalid Unicode.
MOZ_ASSERT_IF(err.lineOfContext != nullptr,
err.lineLength == err.tokenOffset);
}
auto notes = MakeUnique<JSErrorNotes>(); if (!notes) {
ReportOutOfMemory(anyChars.fc); break;
}
// The largest encoding of a UTF-8 code point is 4 units. (Encoding an // obsolete 5- or 6-byte code point will complain only about a bad lead // code unit.)
constexpr size_t MaxWidth = sizeof("0xHH 0xHH 0xHH 0xHH");
template <class AnyCharsAccess>
MOZ_COLD void
TokenStreamChars<Utf8Unit, AnyCharsAccess>::badStructurallyValidCodePoint(
char32_t codePoint, uint8_t codePointLength, constchar* reason) { // Construct a string like "0x203D" (including null terminator) to include // in the error message. Write the string end-to-start from end to start // of an adequately sized |char| array, shifting least significant nibbles // off the number and writing the corresponding hex digits until done, then // prefixing with "0x". |codePointStr| points at the incrementally // computed string, within |codePointCharsArray|'s bounds.
// 0x1F'FFFF is the maximum value that can fit in 3+6+6+6 unconstrained // bits in a four-byte UTF-8 code unit sequence.
constexpr size_t MaxHexSize = sizeof( "0x1F" "FFFF"); // including '\0' char codePointCharsArray[MaxHexSize];
// Note that by do-while looping here rather than while-looping, this // writes a '0' when |codePoint == 0|. do {
MOZ_ASSERT(codePointCharsArray < codePointStr);
*--codePointStr = toHexChar(codePoint & 0xF);
codePoint >>= 4;
} while (codePoint);
// If a valid code point is decoded, this function call consumes its code // units. If not, it ungets the lead code unit and invokes the right error // handler, so on failure we must immediately return false.
SourceUnitsIterator iter(this->sourceUnits);
Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePointInline(
lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
onBadTrailingUnit, onBadCodePoint, onNotShortestForm); if (maybeCodePoint.isNothing()) { returnfalse;
}
template <class AnyCharsAccess> bool TokenStreamChars<char16_t, AnyCharsAccess>::getNonAsciiCodePoint(
int32_t lead, char32_t* codePoint) {
MOZ_ASSERT(lead != EOF);
MOZ_ASSERT(!isAsciiCodePoint(lead), "ASCII code unit/point must be handled separately");
MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(), "getNonAsciiCodePoint called incorrectly");
// The code point is usually |lead|: overwrite later if needed.
*codePoint = AssertedCast<char32_t>(lead);
// ECMAScript specifically requires that unpaired UTF-16 surrogates be // treated as the corresponding code point and not as an error. See // <https://tc39.github.io/ecma262/#sec-ecmascript-language-types-string-type>. // Thus this function does not consider any sequence of 16-bit numbers to // be intrinsically in error.
// Dispense with single-unit code points and lone trailing surrogates. if (MOZ_LIKELY(!unicode::IsLeadSurrogate(lead))) { if (MOZ_UNLIKELY(lead == unicode::LINE_SEPARATOR ||
lead == unicode::PARA_SEPARATOR)) { if (!updateLineInfoForEOL()) { #ifdef DEBUG // Assign to a sentinel value to hopefully cause errors.
*codePoint = std::numeric_limits<char32_t>::max(); #endif
MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint)); returnfalse;
}
// Also handle a lead surrogate not paired with a trailing surrogate. if (MOZ_UNLIKELY(
this->sourceUnits.atEnd() ||
!unicode::IsTrailSurrogate(this->sourceUnits.peekCodeUnit()))) {
MOZ_ASSERT(!IsLineTerminator(*codePoint)); returntrue;
}
// Otherwise we have a multi-unit code point.
*codePoint = unicode::UTF16Decode(lead, this->sourceUnits.getCodeUnit());
MOZ_ASSERT(!IsLineTerminator(*codePoint)); returntrue;
}
template <class AnyCharsAccess> bool TokenStreamChars<Utf8Unit, AnyCharsAccess>::getNonAsciiCodePoint(
int32_t unit, char32_t* codePoint) {
MOZ_ASSERT(unit != EOF);
MOZ_ASSERT(!isAsciiCodePoint(unit), "ASCII code unit/point must be handled separately");
Utf8Unit lead = Utf8Unit(static_cast<unsignedchar>(unit));
MOZ_ASSERT(lead == this->sourceUnits.previousCodeUnit(), "getNonAsciiCodePoint called incorrectly");
auto onBadLeadUnit = [this, &lead]() { this->badLeadUnit(lead); };
// This consumes the full, valid code point or ungets |lead| and calls the // appropriate error functor on failure.
SourceUnitsIterator iter(this->sourceUnits);
Maybe<char32_t> maybeCodePoint = DecodeOneUtf8CodePoint(
lead, &iter, SourceUnitsEnd(), onBadLeadUnit, onNotEnoughUnits,
onBadTrailingUnit, onBadCodePoint, onNotShortestForm); if (maybeCodePoint.isNothing()) { returnfalse;
}
char32_t cp = maybeCodePoint.value(); if (MOZ_UNLIKELY(cp == unicode::LINE_SEPARATOR ||
cp == unicode::PARA_SEPARATOR)) { if (!updateLineInfoForEOL()) { #ifdef DEBUG // Assign to a sentinel value to hopefully cause errors.
*codePoint = std::numeric_limits<char32_t>::max(); #endif
MOZ_MAKE_MEM_UNDEFINED(codePoint, sizeof(*codePoint)); returnfalse;
}
template <>
size_t SourceUnits<char16_t>::findWindowStart(size_t offset) const { // This is JS's understanding of UTF-16 that allows lone surrogates, so // we have to exclude lone surrogates from [windowStart, offset) ourselves.
auto HalfWindowSize = [&p, &initial]() { return PointerRangeSize(p, initial);
};
while (true) {
MOZ_ASSERT(earliestPossibleStart <= p);
MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) { break;
}
char16_t c = p[-1];
// This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in // string and template literals. These code points do affect line and // column coordinates, even as they encode their literal values. if (IsLineTerminator(c)) { break;
}
// Don't allow invalid UTF-16 in pre-context. (Current users don't // require this, and this behavior isn't currently imposed on // pre-context, but these facts might change someday.)
if (MOZ_UNLIKELY(unicode::IsLeadSurrogate(c))) { break;
}
// Optimistically include the code unit, reverting below if needed.
p--;
// If it's not a surrogate at all, keep going. if (MOZ_LIKELY(!unicode::IsTrailSurrogate(c))) { continue;
}
// Stop if we don't have a usable surrogate pair. if (HalfWindowSize() >= WindowRadius ||
p <= earliestPossibleStart || // trail surrogate at low end
!unicode::IsLeadSurrogate(p[-1])) // no paired lead surrogate
{
p++; break;
}
template <>
size_t SourceUnits<Utf8Unit>::findWindowStart(size_t offset) const { // |offset| must be the location of the error or somewhere before it, so we // know preceding data is valid UTF-8.
auto HalfWindowSize = [&p, &initial]() { return PointerRangeSize(p, initial);
};
while (true) {
MOZ_ASSERT(earliestPossibleStart <= p);
MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p <= earliestPossibleStart || HalfWindowSize() >= WindowRadius) { break;
}
// Peek backward for a line break, and only decrement if there is none.
uint8_t prev = p[-1].toUint8();
// First check for the ASCII LineTerminators. if (prev == '\r' || prev == '\n') { break;
}
// Now check for the non-ASCII LineTerminators U+2028 LINE SEPARATOR // (0xE2 0x80 0xA8) and U+2029 PARAGRAPH (0xE2 0x80 0xA9). If there // aren't three code units available, some comparison here will fail // before we'd underflow. if (MOZ_UNLIKELY((prev == 0xA8 || prev == 0xA9) &&
p[-2].toUint8() == 0x80 && p[-3].toUint8() == 0xE2)) { break;
}
// Rewind over the non-LineTerminator. This can't underflow // |earliestPossibleStart| because it begins a code point. while (IsTrailingUnit(*--p)) { continue;
}
MOZ_ASSERT(earliestPossibleStart <= p);
// But if we underflowed |WindowRadius|, adjust forward and stop. if (HalfWindowSize() > WindowRadius) {
static_assert(WindowRadius > 3, "skipping over non-lead code units below must not " "advance past |offset|");
auto HalfWindowSize = [&initial, &p]() { return PointerRangeSize(initial, p);
};
while (true) {
MOZ_ASSERT(p <= limit_);
MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p >= limit_ || HalfWindowSize() >= WindowRadius) { break;
}
char16_t c = *p;
// This stops at U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR in // string and template literals. These code points do affect line and // column coordinates, even as they encode their literal values. if (IsLineTerminator(c)) { break;
}
// Don't allow invalid UTF-16 in post-context. (Current users don't // require this, and this behavior isn't currently imposed on // pre-context, but these facts might change someday.)
if (MOZ_UNLIKELY(unicode::IsTrailSurrogate(c))) { break;
}
// Optimistically consume the code unit, ungetting it below if needed.
p++;
// If it's not a surrogate at all, keep going. if (MOZ_LIKELY(!unicode::IsLeadSurrogate(c))) { continue;
}
// Retract if the lead surrogate would stand alone at the end of the // window. if (HalfWindowSize() >= WindowRadius || // split pair
p >= limit_ || // half-pair at end of source
!unicode::IsTrailSurrogate(*p)) // no paired trail surrogate
{
p--; break;
}
auto HalfWindowSize = [&initial, &p]() { return PointerRangeSize(initial, p);
};
while (true) {
MOZ_ASSERT(p <= limit_);
MOZ_ASSERT(HalfWindowSize() <= WindowRadius); if (p >= limit_ || HalfWindowSize() >= WindowRadius) { break;
}
// A non-encoding error might be followed by an encoding error within // |maxEnd|, so we must validate as we go to not include invalid UTF-8 // in the computed window. What joy!
Utf8Unit lead = *p; if (mozilla::IsAscii(lead)) { if (IsSingleUnitLineTerminator(lead)) { break;
}
// If this TokenStreamAnyChars doesn't have location information, try to // get it from the caller. if (!filename_) {
JSContext* maybeCx = context()->maybeCurrentJSContext(); if (maybeCx) {
NonBuiltinFrameIter iter(maybeCx,
FrameIter::FOLLOW_DEBUGGER_EVAL_PREV_LINK,
maybeCx->realm()->principals()); if (!iter.done() && iter.filename()) {
err->filename = JS::ConstUTF8CharsZ(iter.filename());
JS::TaggedColumnNumberOneOrigin columnNumber;
err->lineNumber = iter.computeLine(&columnNumber);
err->columnNumber =
JS::ColumnNumberOneOrigin(columnNumber.oneOriginValue()); returnfalse;
}
}
}
// Otherwise use this TokenStreamAnyChars's location information.
err->filename = filename_; returntrue;
}
template <> inlinevoid SourceUnits<char16_t>::computeWindowOffsetAndLength( const char16_t* encodedWindow, size_t encodedTokenOffset,
size_t* utf16TokenOffset, size_t encodedWindowLength,
size_t* utf16WindowLength) const {
MOZ_ASSERT_UNREACHABLE("shouldn't need to recompute for UTF-16");
}
template <> inlinevoid SourceUnits<Utf8Unit>::computeWindowOffsetAndLength( const Utf8Unit* encodedWindow, size_t encodedTokenOffset,
size_t* utf16TokenOffset, size_t encodedWindowLength,
size_t* utf16WindowLength) const {
MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength, "token offset must be within the window, and the two lambda " "calls below presume this ordering of values");
size_t i = 0; auto ComputeUtf16Count = [&i, &encodedWindow](const Utf8Unit* limit) { while (encodedWindow < limit) {
Utf8Unit lead = *encodedWindow++; if (MOZ_LIKELY(IsAscii(lead))) { // ASCII contributes a single UTF-16 code unit.
i++; continue;
}
Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &encodedWindow, limit);
MOZ_ASSERT(cp.isSome(), "computed window should only contain valid UTF-8");
i += unicode::IsSupplementary(cp.value()) ? 2 : 1;
}
return i;
};
// Compute the token offset from |i == 0| and the initial |encodedWindow|. const Utf8Unit* token = encodedWindow + encodedTokenOffset;
MOZ_ASSERT(token <= encodedWindowEnd);
*utf16TokenOffset = ComputeUtf16Count(token);
// Compute the window length, picking up from |i| and |encodedWindow| that, // in general, were modified just above.
*utf16WindowLength = ComputeUtf16Count(encodedWindowEnd);
}
// Fills |err|'s token offset and line length for the window of source text
// surrounding |offset|, where |offset| is measured in source units of type
// |Unit|.  The window bounds come from |sourceUnits.findWindowStart| and
// |sourceUnits.findWindowEnd|.  An empty window is not an error: the function
// returns true with |err->lineOfContext| still null.
//
// NOTE(review): |encodedWindowLength|, |encodedWindow|, |encodedTokenOffset|
// and |utf16WindowLength| are used below but their definitions are not
// present in this text.  Presumably they are derived from the window bounds
// and a copied line-of-context buffer -- confirm against the complete file.
template <typename Unit> bool TokenStreamCharsBase<Unit>::addLineOfContext(ErrorMetadata* err,
uint32_t offset) const { // Rename the variable to make meaning clearer: an offset into source units // in Unit encoding.
size_t encodedOffset = offset;
// These are also offsets into source units in Unit encoding.
size_t encodedWindowStart = sourceUnits.findWindowStart(encodedOffset);
size_t encodedWindowEnd = sourceUnits.findWindowEnd(encodedOffset);
// Don't add a useless "line" of context when the window ends up empty // because of an invalid encoding at the start of a line. if (encodedWindowLength == 0) {
MOZ_ASSERT(err->lineOfContext == nullptr, "ErrorMetadata::lineOfContext must be null so we don't " "have to set the lineLength/tokenOffset fields"); returntrue;
}
MOZ_ASSERT(encodedTokenOffset <= encodedWindowLength, "token offset must be inside the window");
// The length in UTF-8 code units of a code point is always greater than or // equal to the same code point's length in UTF-16 code points. ASCII code // points are 1 unit in either encoding. Code points in [U+0080, U+10000) // are 2-3 UTF-8 code units to 1 UTF-16 code unit. And code points in // [U+10000, U+10FFFF] are 4 UTF-8 code units to 2 UTF-16 code units. // // Therefore, if encoded window length equals the length in UTF-16 (this is // always the case for Unit=char16_t), the UTF-16 offsets are exactly the // encoded offsets. Otherwise we must convert offset/length from UTF-8 to // UTF-16. if constexpr (std::is_same_v<Unit, char16_t>) {
MOZ_ASSERT(utf16WindowLength == encodedWindowLength, "UTF-16 to UTF-16 shouldn't change window length");
err->tokenOffset = encodedTokenOffset;
err->lineLength = encodedWindowLength;
} else {
static_assert(std::is_same_v<Unit, Utf8Unit>, "should only see UTF-8 here");
// |simple| means the UTF-8 and UTF-16 lengths agree, so the encoded offsets
// can be reused directly; the debug-only check ties that to all-ASCII text.
bool simple = utf16WindowLength == encodedWindowLength; #ifdef DEBUG auto isAscii = [](Unit u) { return IsAscii(u); };
MOZ_ASSERT(std::all_of(encodedWindow, encodedWindow + encodedWindowLength,
isAscii) == simple, "equal window lengths in UTF-8 should correspond only to " "wholly-ASCII text"); #endif if (simple) {
err->tokenOffset = encodedTokenOffset;
err->lineLength = encodedWindowLength;
} else {
// Non-ASCII text: recount the offset and length in UTF-16 code units.
sourceUnits.computeWindowOffsetAndLength(
encodedWindow, encodedTokenOffset, &err->tokenOffset,
encodedWindowLength, &err->lineLength);
}
}
returntrue;
}
// Populates |err| for an error located by |errorOffset|.  When |errorOffset|
// holds NoOffset, the work is delegated to computeErrorMetadataNoOffset() and
// the function returns true.  Otherwise fillExceptingContext() is consulted
// and, if it returns true, internalComputeLineOfContext() supplies the
// result; if not, the function returns true without adding context.
//
// NOTE(review): |offset| is used below but the code that derives it from
// |errorOffset| is not present in this text -- confirm against the complete
// file before relying on this description.
template <typename Unit, class AnyCharsAccess> bool TokenStreamSpecific<Unit, AnyCharsAccess>::computeErrorMetadata(
ErrorMetadata* err, const ErrorOffset& errorOffset) const { if (errorOffset.is<NoOffset>()) {
anyCharsAccess().computeErrorMetadataNoOffset(err); returntrue;
}
// This function's return value isn't a success/failure indication: it // returns true if this TokenStream can be used to provide a line of // context. if (fillExceptingContext(err, offset)) { // Add a line of context from this TokenStream to help with debugging. return internalComputeLineOfContext(err, offset);
}
// We can't fill in any more here. returntrue;
}
// Reports JSMSG_ILLEGAL_CHARACTER for |cp|, passing the code point formatted
// as "U+" followed by at least four uppercase hex digits.  If the formatted
// string can't be allocated, out-of-memory is reported instead.
template <typename Unit, class AnyCharsAccess>
void TokenStreamSpecific<Unit, AnyCharsAccess>::reportIllegalCharacter(
    int32_t cp) {
  if (UniqueChars formatted = JS_smprintf("U+%04X", cp)) {
    error(JSMSG_ILLEGAL_CHARACTER, formatted.get());
    return;
  }

  // JS_smprintf returned null: there is no string to show, so report OOM.
  ReportOutOfMemory(anyCharsAccess().fc);
}
// We have encountered a '\': check for a Unicode escape sequence after it.
// Return the length of the escape sequence and the encoded code point (by
// value) if we found a Unicode escape sequence, and skip all code units
// involved.  Otherwise, return 0 and don't advance along the buffer.
//
// Two forms are recognized after the backslash: "u{...}", which is handed to
// matchExtendedUnicodeEscape(), and "uXXXX" with exactly four hex digits,
// which has length 5.  The returned length counts the code units after the
// '\'.
template <typename Unit, class AnyCharsAccess>
uint32_t GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchUnicodeEscape(
    char32_t* codePoint) {
  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));

  int32_t unit = getCodeUnit();
  if (unit != 'u') {
    // NOTE: |unit| may be EOF here.
    ungetCodeUnit(unit);
    MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
    return 0;
  }

  // Look at the code unit following the 'u' to pick the escape form.
  unit = getCodeUnit();
  if (unit == '{') {
    return matchExtendedUnicodeEscape(codePoint);
  }

  // Try "uXXXX".  Remember each digit consumed so that every one of them can
  // be put back if fewer than four hex digits are present.
  int32_t digits[4];
  uint32_t numDigits = 0;
  uint32_t code = 0;
  while (IsAsciiHexDigit(unit)) {
    digits[numDigits++] = unit;
    code = (code << 4) | AsciiAlphanumericToNumber(unit);
    if (numDigits == 4) {
      *codePoint = code;
      return 5;  // 'u' plus four hex digits
    }
    unit = getCodeUnit();
  }

  // Not an escape.  Restore the stream to just after the '\': first the
  // non-digit that ended the scan, then the digits in reverse order, then
  // the 'u'.
  //
  // NOTE: |unit| may be EOF here, so the first unget may put nothing back.
  ungetCodeUnit(unit);
  while (numDigits > 0) {
    ungetCodeUnit(digits[--numDigits]);
  }
  ungetCodeUnit('u');
  MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('\\'));
  return 0;
}
template <typename Unit, class AnyCharsAccess>
uint32_t
GeneralTokenStreamChars<Unit, AnyCharsAccess>::matchExtendedUnicodeEscape(
char32_t* codePoint) {
MOZ_ASSERT(this->sourceUnits.previousCodeUnit() == Unit('{'));
int32_t unit = getCodeUnit();
// Skip leading zeroes.
uint32_t leadingZeroes = 0; while (unit == '0') {
leadingZeroes++;
unit = getCodeUnit();
}
size_t i = 0;
uint32_t code = 0; while (IsAsciiHexDigit(unit) && i < 6) {
code = (code << 4) | AsciiAlphanumericToNumber(unit);
unit = getCodeUnit();
i++;
}
uint32_t gotten =
2 + // 'u{'
leadingZeroes + i + // significant hexdigits
(unit != EOF); // subtract a get if it didn't contribute to length
// We could point "into" a mistyped escape, e.g. for "\u{41H}" we // could point at the 'H'. But we don't do that now, so the code // unit after the '\' isn't necessarily bad, so just point at the // start of the actually-invalid escape.
ungetCodeUnit('\\');
error(JSMSG_BAD_ESCAPE); returnfalse;
}
}
// Unget the lead code unit before peeking at the full code point.
ungetCodeUnit(unit);
PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint(); if (!peeked.isNone() && unicode::IsIdentifierStart(peeked.codePoint())) {
this->sourceUnits.consumeKnownCodePoint(peeked);
template <typename Unit, class AnyCharsAccess> bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirectives( bool isMultiline, bool shouldWarnDeprecated) { // Match directive comments used in debugging, such as "//# sourceURL" and // "//# sourceMappingURL". Use of "//@" instead of "//#" is deprecated. // // To avoid a crashing bug in IE, several JavaScript transpilers wrap single // line comments containing a source mapping URL inside a multiline // comment. To avoid potentially expensive lookahead and backtracking, we // only check for this case if we encounter a '#' code unit.
bool res = getDisplayURL(isMultiline, shouldWarnDeprecated) &&
getSourceMappingURL(isMultiline, shouldWarnDeprecated); if (!res) {
badToken();
}
// Looks for |directive| (|directiveLength| ASCII code units) at the current
// position inside a comment.  If it is there, the text following it is
// collected up to the first whitespace, EOF, or -- when |isMultiline| is set
// -- the "*/" that closes the comment, and is then copied into
// |*destination|.
//
// Returns true when the directive is absent or its value is empty; neither
// case is an error and |*destination| is not written.  Returns false when the
// deprecation warning (issued if |shouldWarnDeprecated|, naming
// |errorMsgPragma|), an append to the character buffer, or the final copy
// fails.
template <typename Unit, class AnyCharsAccess>
[[nodiscard]] bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDirective(
    bool isMultiline, bool shouldWarnDeprecated, const char* directive,
    uint8_t directiveLength, const char* errorMsgPragma,
    UniquePtr<char16_t[], JS::FreePolicy>* destination) {
  // Stop if we don't find |directive|.  (Note that |directive| must be
  // ASCII, so there are no tricky encoding issues to consider in matching
  // UTF-8/16-agnostically.)
  if (!this->sourceUnits.matchCodeUnits(directive, directiveLength)) {
    return true;
  }

  if (shouldWarnDeprecated) {
    if (!warning(JSMSG_DEPRECATED_PRAGMA, errorMsgPragma)) {
      return false;
    }
  }

  this->charBuffer.clear();

  do {
    int32_t unit = peekCodeUnit();
    if (unit == EOF) {
      break;
    }

    if (MOZ_LIKELY(isAsciiCodePoint(unit))) {
      if (unicode::IsSpace(AssertedCast<Latin1Char>(unit))) {
        break;
      }

      consumeKnownCodeUnit(unit);

      // Debugging directives can occur in both single- and multi-line
      // comments.  If we're currently inside a multi-line comment, we
      // also must recognize multi-line comment terminators.  The '*' is put
      // back so that it is not consumed as part of the directive's value.
      if (isMultiline && unit == '*' && peekCodeUnit() == '/') {
        ungetCodeUnit('*');
        break;
      }

      if (!this->charBuffer.append(unit)) {
        return false;
      }

      continue;
    }

    // This ignores encoding errors: subsequent caller-side code to
    // handle the remaining source text in the comment will do so.
    PeekedCodePoint<Unit> peeked = this->sourceUnits.peekCodePoint();
    if (peeked.isNone() || unicode::IsSpace(peeked.codePoint())) {
      break;
    }

    MOZ_ASSERT(!IsLineTerminator(peeked.codePoint()),
               "!IsSpace must imply !IsLineTerminator or else we'll fail to "
               "maintain line-info/flags for EOL");
    this->sourceUnits.consumeKnownCodePoint(peeked);

    if (!AppendCodePointToCharBuffer(this->charBuffer, peeked.codePoint())) {
      return false;
    }
  } while (true);

  if (this->charBuffer.empty()) {
    // The directive's URL was missing, but comments can contain anything,
    // so it isn't an error.
    return true;
  }

  return copyCharBufferTo(destination);
}
template <typename Unit, class AnyCharsAccess> bool TokenStreamSpecific<Unit, AnyCharsAccess>::getDisplayURL( bool isMultiline, bool shouldWarnDeprecated) { // Match comments of the form "//# sourceURL=<url>" or // "/\* //# sourceURL=<url> *\/" // // Note that while these are labeled "sourceURL" in the source text, // internally we refer to it as a "displayURL" to distinguish what the // developer would like to refer to the source as from the source's actual // URL.
template <typename Unit, class AnyCharsAccess> bool TokenStreamSpecific<Unit, AnyCharsAccess>::getSourceMappingURL( bool isMultiline, bool shouldWarnDeprecated) { // Match comments of the form "//# sourceMappingURL=<url>" or // "/\* //# sourceMappingURL=<url> *\/"
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.