/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "builtin/String.h"
#include "mozilla/Attributes.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/Compiler.h"
#include "mozilla/FloatingPoint.h"
#if JS_HAS_INTL_API
# include
"mozilla/intl/String.h"
#endif
#include "mozilla/Likely.h"
#include "mozilla/Maybe.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Range.h"
#include "mozilla/SIMD.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <limits>
#include <string.h>
#include <type_traits>
#include "jsnum.h"
#include "jstypes.h"
#include "builtin/Array.h"
#if JS_HAS_INTL_API
# include
"builtin/intl/CommonFunctions.h"
# include
"builtin/intl/FormatBuffer.h"
#endif
#include "builtin/RegExp.h"
#include "gc/GC.h"
#include "jit/InlinableNatives.h"
#include "js/Conversions.h"
#include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_*
#if !JS_HAS_INTL_API
# include
"js/LocaleSensitive.h"
#endif
#include "js/Prefs.h"
#include "js/Printer.h"
#include "js/PropertyAndElement.h" // JS_DefineFunctions
#include "js/PropertySpec.h"
#include "js/StableStringChars.h"
#include "js/UniquePtr.h"
#include "util/StringBuilder.h"
#include "util/Unicode.h"
#include "vm/GlobalObject.h"
#include "vm/JSContext.h"
#include "vm/JSObject.h"
#include "vm/RegExpObject.h"
#include "vm/SelfHosting.h"
#include "vm/StaticStrings.h"
#include "vm/ToSource.h" // js::ValueToSource
#include "vm/GeckoProfiler-inl.h"
#include "vm/NativeObject-inl.h"
#include "vm/StringObject-inl.h"
#include "vm/StringType-inl.h"
using namespace js;
using mozilla::AsciiAlphanumericToNumber;
using mozilla::CheckedInt;
using mozilla::EnsureUtf16ValiditySpan;
using mozilla::IsAsciiHexDigit;
using mozilla::PodCopy;
using mozilla::RangedPtr;
using mozilla::SIMD;
using mozilla::Span;
using mozilla::Utf16ValidUpTo;
using JS::AutoCheckCannotGC;
using JS::AutoStableStringChars;
// Converts argument |argno| of |args| to a linear string.
//
// When the call supplied fewer than |argno + 1| arguments, the "undefined"
// name from |cx->names()| is returned instead. Returns nullptr when the
// ToString conversion fails; otherwise returns whatever |ensureLinear|
// produces for the converted string.
static JSLinearString* ArgToLinearString(JSContext* cx, const CallArgs& args,
                                         unsigned argno) {
  const bool argPresent = argno < args.length();
  if (!argPresent) {
    return cx->names().undefined;
  }

  JSString* converted = ToString<CanGC>(cx, args[argno]);
  if (converted == nullptr) {
    return nullptr;
  }

  return converted->ensureLinear(cx);
}
/*
 * Forward declarations for the URI encode/decode natives.
 *
 * They are declared here because the |string_functions| table below refers to
 * them; their definitions are not part of this section of the file.
 */
static bool str_decodeURI(JSContext* cx,
unsigned argc, Value* vp);
static bool str_decodeURI_Component(JSContext* cx,
unsigned argc, Value* vp);
static bool str_encodeURI(JSContext* cx,
unsigned argc, Value* vp);
static bool str_encodeURI_Component(JSContext* cx,
unsigned argc, Value* vp);
/*
* Global string methods
*/
/*
 * ES5 B.2.1 -- worker for escape(), templated on the input character width.
 *
 * Scans |chars[0..length)| and, if any code unit needs escaping, writes the
 * escaped Latin-1 text into |newChars|. Code units below 128 that are flagged
 * in the table below are copied verbatim; other code units below 256 become
 * "%XX" and everything else becomes "%uXXXX" (upper-case hex digits).
 *
 * On success |*newLengthOut| holds the length of the result. When that equals
 * |length| nothing needed escaping and |newChars| has not been allocated.
 * Returns false after reporting an allocation overflow when the result would
 * exceed JSString::MAX_LENGTH, or when allocating |newChars| fails.
 */
template <typename CharT>
static bool Escape(JSContext* cx, const CharT* chars, uint32_t length,
                   StringChars<Latin1Char>& newChars, uint32_t* newLengthOut) {
  // clang-format off
  static const uint8_t shouldPassThrough[128] = {
       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,       /*    !"#$%&'()*+,-./  */
       1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,       /*   0123456789:;<=>?  */
       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,       /*   @ABCDEFGHIJKLMNO  */
       1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,       /*   PQRSTUVWXYZ[\]^_  */
       0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,       /*   `abcdefghijklmno  */
       1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,       /*   pqrstuvwxyz{\}~  DEL */
  };
  // clang-format on

  // True for code units that are emitted unchanged.
  auto passesThrough = [](char16_t unit) -> bool {
    return unit < 128 && shouldPassThrough[unit];
  };

  /*
   * The running length below grows by at most 5 per step and is a valid
   * string length before each step, so the addition can never overflow
   * uint32_t.
   */
  static_assert(JSString::MAX_LENGTH < UINT32_MAX - 5,
                "Adding 5 to valid string length should not overflow");

  /* First pass: compute how long the escaped string will be. */
  uint32_t escapedLength = length;
  for (size_t idx = 0; idx < length; idx++) {
    const char16_t unit = chars[idx];
    if (passesThrough(unit)) {
      continue;
    }

    MOZ_ASSERT(escapedLength <= JSString::MAX_LENGTH);

    /* "%XX" adds two code units, "%uXXXX" adds five. */
    escapedLength += (unit < 256) ? 2 : 5;

    if (MOZ_UNLIKELY(escapedLength > JSString::MAX_LENGTH)) {
      ReportAllocationOverflow(cx);
      return false;
    }
  }

  /* Nothing to escape: report the unchanged length without allocating. */
  if (escapedLength == length) {
    *newLengthOut = escapedLength;
    return true;
  }

  if (!newChars.maybeAlloc(cx, escapedLength)) {
    return false;
  }

  static const char digits[] = "0123456789ABCDEF";

  JS::AutoCheckCannotGC nogc;
  Latin1Char* out = newChars.data(nogc);

  /* Second pass: emit the escaped text. */
  size_t written = 0;
  for (size_t idx = 0; idx < length; idx++) {
    const char16_t unit = chars[idx];
    if (passesThrough(unit)) {
      out[written++] = unit;
      continue;
    }

    out[written++] = '%';
    if (unit >= 256) {
      /* Wide form: "u" followed by the two high nibbles. */
      out[written++] = 'u';
      out[written++] = digits[unit >> 12];
      out[written++] = digits[(unit & 0xF00) >> 8];
    }
    /* Both forms end with the two low nibbles. */
    out[written++] = digits[(unit & 0xF0) >> 4];
    out[written++] = digits[unit & 0xF];
  }
  MOZ_ASSERT(written == escapedLength);

  *newLengthOut = escapedLength;
  return true;
}
// Native implementation of the global escape() function.
//
// Linearizes the first argument, runs Escape() on its characters, and returns
// the input string itself when Escape() reports that nothing changed.
static bool str_escape(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "escape");
  CallArgs args = CallArgsFromVp(argc, vp);

  Rooted<JSLinearString*> input(cx, ArgToLinearString(cx, args, 0));
  if (!input) {
    return false;
  }

  StringChars<Latin1Char> escapedChars(cx);
  uint32_t escapedLength = 0;  // initialize to silence GCC warning

  // Dispatch on the character width; each branch keeps its own no-GC scope
  // around the raw character pointer.
  bool escaped;
  if (input->hasLatin1Chars()) {
    AutoCheckCannotGC nogc;
    escaped = Escape(cx, input->latin1Chars(nogc), input->length(),
                     escapedChars, &escapedLength);
  } else {
    AutoCheckCannotGC nogc;
    escaped = Escape(cx, input->twoByteChars(nogc), input->length(),
                     escapedChars, &escapedLength);
  }
  if (!escaped) {
    return false;
  }

  // Return input if no characters need to be escaped.
  if (escapedLength == input->length()) {
    args.rval().setString(input);
    return true;
  }

  JSString* escapedString =
      escapedChars.toStringDontDeflateNonStatic<CanGC>(cx, escapedLength);
  if (!escapedString) {
    return false;
  }

  args.rval().setString(escapedString);
  return true;
}
// Parses the four code units starting at |chars| as a hexadecimal number.
//
// Returns true and stores the value in |*result| when all four are ASCII hex
// digits; otherwise returns false and leaves |*result| untouched.
template <typename CharT>
static inline bool Unhex4(const RangedPtr<const CharT> chars,
                          char16_t* result) {
  char16_t value = 0;
  for (int pos = 0; pos < 4; pos++) {
    const CharT digit = chars[pos];
    if (!IsAsciiHexDigit(digit)) {
      return false;
    }
    value = (value << 4) + AsciiAlphanumericToNumber(digit);
  }

  *result = value;
  return true;
}
// Parses the two code units starting at |chars| as a hexadecimal number.
//
// Returns true and stores the value in |*result| when both are ASCII hex
// digits; otherwise returns false and leaves |*result| untouched.
template <typename CharT>
static inline bool Unhex2(const RangedPtr<const CharT> chars,
                          char16_t* result) {
  const CharT high = chars[0];
  const CharT low = chars[1];

  const bool bothHex = IsAsciiHexDigit(high) && IsAsciiHexDigit(low);
  if (!bothHex) {
    return false;
  }

  *result =
      (AsciiAlphanumericToNumber(high) << 4) + AsciiAlphanumericToNumber(low);
  return true;
}
// Worker for unescape(), templated on the input character width.
//
// Decodes "%XX" and "%uXXXX" sequences found in |chars|. Output is appended
// to |sb| only once the first valid escape sequence is found, so |sb| is left
// untouched when the input contains none. Returns false if reserving space in
// or appending to |sb| fails.
template <typename CharT>
static bool Unescape(StringBuilder& sb,
                     const mozilla::Range<const CharT> chars) {
  static_assert(JSString::MAX_LENGTH < UINT32_MAX - 6,
                "String length is not near UINT32_MAX");

  // Step 2.
  const uint32_t length = chars.length();

  // Step 4.
  uint32_t k = 0;

  /*
   * Note that the spec algorithm has been optimized to avoid building
   * a string in the case where no escapes are present.
   */
  bool building = false;

  // On the first decoded escape, reserve room for the whole input and copy
  // over the |k| code units that precede it. Later calls are no-ops. Returns
  // false only when the reservation fails.
  auto ensureBuilding = [&]() -> bool {
    if (building) {
      return true;
    }
    building = true;
    if (!sb.reserve(length)) {
      return false;
    }
    sb.infallibleAppend(chars.begin().get(), k);
    return true;
  };

  // Step 5 (step 5.d is the loop increment).
  for (; k < length; k++) {
    // Step 5.a.
    char16_t c = chars[k];

    // Step 5.b.
    if (c == '%') {
      // Steps 5.b.i-ii.
      const bool wideCandidate = k + 6 <= length && chars[k + 1] == 'u';
      if (wideCandidate) {
        if (Unhex4(chars.begin() + k + 2, &c)) {
          if (!ensureBuilding()) {
            return false;
          }
          k += 5;
        }
      } else if (k + 3 <= length) {
        if (Unhex2(chars.begin() + k + 1, &c)) {
          if (!ensureBuilding()) {
            return false;
          }
          k += 2;
        }
      }
    }

    // Step 5.c.
    if (building && !sb.append(c)) {
      return false;
    }
  }

  return true;
}
// ES2018 draft rev f83aa38282c2a60c6916ebc410bfdf105a0f6a54
// B.2.1.2 unescape ( string )
//
// Returns the input string itself when Unescape() left the builder empty.
static bool str_unescape(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "unescape");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Step 1.
  Rooted<JSLinearString*> input(cx, ArgToLinearString(cx, args, 0));
  if (!input) {
    return false;
  }

  // Step 3.
  JSStringBuilder sb(cx);
  if (input->hasTwoByteChars() && !sb.ensureTwoByteChars()) {
    return false;
  }

  // Steps 2, 4-5.
  bool unescaped;
  if (input->hasLatin1Chars()) {
    AutoCheckCannotGC nogc;
    unescaped = Unescape(sb, input->latin1Range(nogc));
  } else {
    AutoCheckCannotGC nogc;
    unescaped = Unescape(sb, input->twoByteRange(nogc));
  }
  if (!unescaped) {
    return false;
  }

  // Step 6.
  if (sb.empty()) {
    // Nothing was appended to the builder; hand back the original string.
    args.rval().setString(input);
    return true;
  }

  JSLinearString* built = sb.finishString();
  if (!built) {
    return false;
  }

  args.rval().setString(built);
  return true;
}
// Native implementation of uneval(): returns ValueToSource() applied to the
// first argument (or to undefined when no argument was passed).
static bool str_uneval(JSContext* cx, unsigned argc, Value* vp) {
  CallArgs args = CallArgsFromVp(argc, vp);

  JSString* source = ValueToSource(cx, args.get(0));
  if (source == nullptr) {
    return false;
  }

  args.rval().setString(source);
  return true;
}
// Table of the global string-related native functions, terminated by
// JS_FS_END. Each JS_FN entry gives the function's name, the native that
// implements it, the value 1 (presumably the declared argument count --
// confirm against the JS_FN macro) and JSPROP_RESOLVING as its flags.
static const JSFunctionSpec string_functions[] = {
JS_FN(
"escape", str_escape, 1, JSPROP_RESOLVING),
JS_FN(
"unescape", str_unescape, 1, JSPROP_RESOLVING),
JS_FN(
"uneval", str_uneval, 1, JSPROP_RESOLVING),
JS_FN(
"decodeURI", str_decodeURI, 1, JSPROP_RESOLVING),
JS_FN(
"encodeURI", str_encodeURI, 1, JSPROP_RESOLVING),
JS_FN(
"decodeURIComponent", str_decodeURI_Component, 1, JSPROP_RESOLVING),
JS_FN(
"encodeURIComponent", str_encodeURI_Component, 1, JSPROP_RESOLVING),
JS_FS_END,
};
// Attributes given to the indexed element properties that str_enumerate and
// str_resolve define on String objects: enumerable, read-only and permanent.
static const unsigned STRING_ELEMENT_ATTRS =
JSPROP_ENUMERATE | JSPROP_READONLY | JSPROP_PERMANENT;
// Enumerate hook for String objects: defines one data element per code unit
// of the wrapped string, each holding the corresponding unit string.
static bool str_enumerate(JSContext* cx, HandleObject obj) {
  RootedString unboxed(cx, obj->as<StringObject>().unbox());
  js::StaticStrings& unitStrings = cx->staticStrings();

  RootedValue element(cx);
  const size_t count = unboxed->length();
  for (size_t index = 0; index < count; index++) {
    JSString* unit = unitStrings.getUnitStringForElement(cx, unboxed, index);
    if (!unit) {
      return false;
    }
    element.setString(unit);

    const bool defined = DefineDataElement(
        cx, obj, index, element, STRING_ELEMENT_ATTRS | JSPROP_RESOLVING);
    if (!defined) {
      return false;
    }
  }

  return true;
}
// mayResolve hook for String objects. Only integer ids can be resolved,
// because str_resolve ignores every non-integer id.
static bool str_mayResolve(const JSAtomState&, jsid id, JSObject*) {
  const bool isIntegerId = id.isInt();
  return isIntegerId;
}
// Resolve hook for String objects.
//
// For an integer |id| that indexes into the wrapped string, defines the
// matching unit-string element on |obj| and sets |*resolvedp| to true. Any
// other id is ignored and |*resolvedp| is left as the caller set it.
static bool str_resolve(JSContext* cx, HandleObject obj, HandleId id,
                        bool* resolvedp) {
  if (!id.isInt()) {
    return true;
  }

  RootedString unboxed(cx, obj->as<StringObject>().unbox());

  const int32_t slot = id.toInt();
  // The unsigned conversion makes negative ids compare as out of range.
  if (size_t(slot) >= unboxed->length()) {
    return true;
  }

  JSString* unit =
      cx->staticStrings().getUnitStringForElement(cx, unboxed, size_t(slot));
  if (!unit) {
    return false;
  }

  RootedValue element(cx, StringValue(unit));
  if (!DefineDataElement(cx, obj, uint32_t(slot), element,
                         STRING_ELEMENT_ATTRS | JSPROP_RESOLVING)) {
    return false;
  }

  *resolvedp = true;
  return true;
}
// Class hooks for StringObject. Only the enumerate, resolve and mayResolve
// hooks are provided; every other slot is null. In this listing each slot's
// name appears in the comment on the line after its value.
static const JSClassOps StringObjectClassOps = {
nullptr,
// addProperty
nullptr,
// delProperty
str_enumerate,
// enumerate
nullptr,
// newEnumerate
str_resolve,
// resolve
str_mayResolve,
// mayResolve
nullptr,
// finalize
nullptr,
// call
nullptr,
// construct
nullptr,
// trace
};
// JSClass for String wrapper objects: named "String", with
// StringObject::RESERVED_SLOTS reserved slots and a cached JSProto_String
// prototype, wired to the class hooks above and to StringObject::classSpec_.
const JSClass StringObject::class_ = {
"String",
JSCLASS_HAS_RESERVED_SLOTS(StringObject::RESERVED_SLOTS) |
JSCLASS_HAS_CACHED_PROTO(JSProto_String),
&StringObjectClassOps,
&StringObject::classSpec_,
};
/*
 * Perform the initial |RequireObjectCoercible(thisv)| and |ToString(thisv)|
 * from nearly all String.prototype.* functions.
 *
 * Primitive strings are returned directly. A null or undefined |thisv|
 * reports JSMSG_INCOMPATIBLE_PROTO (naming |funName|) and returns nullptr.
 * Everything else goes through ToStringSlow, except for String wrapper
 * objects whose conversion is provably unobservable.
 */
static MOZ_ALWAYS_INLINE JSString* ToStringForStringFunction(
    JSContext* cx, const char* funName, HandleValue thisv) {
  if (thisv.isString()) {
    return thisv.toString();
  }

  if (thisv.isNullOrUndefined()) {
    const char* thisKind = thisv.isNull() ? "null" : "undefined";
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_INCOMPATIBLE_PROTO, "String", funName,
                              thisKind);
    return nullptr;
  }

  if (thisv.isObject() && thisv.toObject().is<StringObject>()) {
    StringObject* wrapper = &thisv.toObject().as<StringObject>();

    // We have to make sure that the ToPrimitive call from ToString
    // would be unobservable.
    const bool conversionUnobservable =
        HasNoToPrimitiveMethodPure(wrapper, cx) &&
        HasNativeMethodPure(wrapper, cx->names().toString, str_toString, cx);
    if (conversionUnobservable) {
      return wrapper->unbox();
    }
  }

  return ToStringSlow<CanGC>(cx, thisv);
}
// True when |v| is a primitive string or a StringObject wrapper.
MOZ_ALWAYS_INLINE
bool IsString(HandleValue v) {
  if (v.isString()) {
    return true;
  }
  return v.isObject() && v.toObject().is<StringObject>();
}
// Implementation of String.prototype.toSource for a |this| value that has
// already passed IsString(): produces |(new String("..."))| around the
// double-quoted string contents.
MOZ_ALWAYS_INLINE
bool str_toSource_impl(JSContext* cx, const CallArgs& args) {
  MOZ_ASSERT(IsString(args.thisv()));

  JSString* thisString = ToString<CanGC>(cx, args.thisv());
  if (!thisString) {
    return false;
  }

  UniqueChars quoted = QuoteString(cx, thisString, '"');
  if (!quoted) {
    return false;
  }

  JSStringBuilder sb(cx);
  if (!sb.append("(new String(")) {
    return false;
  }
  if (!sb.append(quoted.get(), strlen(quoted.get()))) {
    return false;
  }
  if (!sb.append("))")) {
    return false;
  }

  JSString* source = sb.finishString();
  if (!source) {
    return false;
  }

  args.rval().setString(source);
  return true;
}
// Native entry point for String.prototype.toSource; dispatches to
// str_toSource_impl through CallNonGenericMethod with IsString as the test.
static bool str_toSource(JSContext* cx, unsigned argc, Value* vp) {
  CallArgs callArgs = CallArgsFromVp(argc, vp);
  const bool ok =
      CallNonGenericMethod<IsString, str_toSource_impl>(cx, callArgs);
  return ok;
}
// Implementation of String.prototype.toString for a |this| value that has
// already passed IsString(): returns the primitive string, unboxing a
// StringObject wrapper if necessary.
MOZ_ALWAYS_INLINE
bool str_toString_impl(JSContext* cx, const CallArgs& args) {
  MOZ_ASSERT(IsString(args.thisv()));

  HandleValue thisv = args.thisv();
  JSString* primitive = thisv.isString()
                            ? thisv.toString()
                            : thisv.toObject().as<StringObject>().unbox();

  args.rval().setString(primitive);
  return true;
}
// Native entry point for String.prototype.toString; dispatches to
// str_toString_impl through CallNonGenericMethod with IsString as the test.
bool js::str_toString(JSContext* cx, unsigned argc, Value* vp) {
  CallArgs callArgs = CallArgsFromVp(argc, vp);
  const bool ok =
      CallNonGenericMethod<IsString, str_toString_impl>(cx, callArgs);
  return ok;
}
// Copies |length| code units from |srcChars| to |destChars|.
//
// When source and destination use different character types, each unit is
// converted by plain assignment. When the types match, the copy is done in
// bulk.
template <typename DestChar, typename SrcChar>
static inline void CopyChars(DestChar* destChars, const SrcChar* srcChars,
                             size_t length) {
  if constexpr (!std::is_same_v<DestChar, SrcChar>) {
    // Mixed widths: convert one code unit at a time.
    const SrcChar* const srcEnd = srcChars + length;
    while (srcChars != srcEnd) {
      *destChars++ = *srcChars++;
    }
  } else {
#if MOZ_IS_GCC
    // Directly call memcpy to work around bug 1863131.
    memcpy(destChars, srcChars, length * sizeof(DestChar));
#else
    PodCopy(destChars, srcChars, length);
#endif
  }
}
// Copies |length| code units of |from|, starting at index |begin|, into |to|,
// reading either the Latin-1 or the two-byte characters of |from|.
template <typename CharT>
static inline void CopyChars(CharT* to, const JSLinearString* from,
                             size_t begin, size_t length) {
  MOZ_ASSERT(begin + length <= from->length());

  JS::AutoCheckCannotGC nogc;
  if (!from->hasLatin1Chars()) {
    CopyChars(to, from->twoByteChars(nogc) + begin, length);
    return;
  }
  CopyChars(to, from->latin1Chars(nogc) + begin, length);
}
// Builds the string made of |lhsLength| code units of |left| starting at
// |begin|, followed by the first |rhsLength| code units of |right|.
//
// The characters are assembled in a stack buffer sized for a fat inline
// string. A matching static string is returned when one exists; otherwise a
// new inline string is allocated.
template <typename CharT>
static JSLinearString* SubstringInlineString(JSContext* cx,
                                             Handle<JSLinearString*> left,
                                             Handle<JSLinearString*> right,
                                             size_t begin, size_t lhsLength,
                                             size_t rhsLength) {
  constexpr size_t MaxLength = std::is_same_v<CharT, Latin1Char>
                                   ? JSFatInlineString::MAX_LENGTH_LATIN1
                                   : JSFatInlineString::MAX_LENGTH_TWO_BYTE;

  const size_t totalLength = lhsLength + rhsLength;
  MOZ_ASSERT(totalLength <= MaxLength, "total length fits in stack chars");
  MOZ_ASSERT(JSInlineString::lengthFits<CharT>(totalLength));

  CharT buffer[MaxLength] = {};
  CharT* const rightPart = buffer + lhsLength;
  CopyChars(buffer, left, begin, lhsLength);
  CopyChars(rightPart, right, 0, rhsLength);

  auto* cached = cx->staticStrings().lookup(buffer, totalLength);
  if (cached) {
    return cached;
  }

  return NewInlineString<CanGC>(cx, buffer, totalLength);
}
// Returns the substring of |str| that starts at |beginInt| and is |lengthInt|
// code units long. Both values must be non-negative and lie within |str|.
//
// Non-rope strings, and ropes that match none of the special cases below,
// yield a dependent string of |str|.
JSString* js::SubstringKernel(JSContext* cx, HandleString str, int32_t beginInt,
                              int32_t lengthInt) {
  MOZ_ASSERT(0 <= beginInt);
  MOZ_ASSERT(0 <= lengthInt);
  MOZ_ASSERT(uint32_t(beginInt) <= str->length());
  MOZ_ASSERT(uint32_t(lengthInt) <= str->length() - beginInt);

  uint32_t begin = beginInt;
  uint32_t len = lengthInt;

  if (!str->isRope()) {
    return NewDependentString(cx, str, begin, len);
  }

  /*
   * Optimization for one level deep ropes.
   * This is common for the following pattern:
   *
   * while() {
   *   text = text.substr(0, x) + "bla" + text.substr(x)
   *   text.charCodeAt(x + 1)
   * }
   */
  JSRope* rope = &str->asRope();

  if (rope->length() == len) {
    // Substring is the full rope.
    MOZ_ASSERT(begin == 0);
    return rope;
  }

  const size_t leftLength = rope->leftChild()->length();

  if (begin + len <= leftLength) {
    // Substring is fully contained within the rope's left child.
    return NewDependentString(cx, rope->leftChild(), begin, len);
  }

  if (begin >= leftLength) {
    // Substring is fully contained within the rope's right child.
    begin -= leftLength;
    return NewDependentString(cx, rope->rightChild(), begin, len);
  }

  // The substring spans both children. Avoid flattening the rope if the
  // children are both linear and the substring fits in an inline string.
  //
  // Note: we could handle longer substrings by allocating a new rope here,
  // but this can result in a lot more rope flattening later on. It's safer to
  // flatten the rope in this case. See bug 1922926.
  MOZ_ASSERT(begin < leftLength && begin + len > leftLength);

  const bool isLatin1 = rope->hasLatin1Chars();
  const bool fitsInline = isLatin1
                              ? JSInlineString::lengthFits<Latin1Char>(len)
                              : JSInlineString::lengthFits<char16_t>(len);
  const bool childrenLinear =
      rope->leftChild()->isLinear() && rope->rightChild()->isLinear();

  if (fitsInline && childrenLinear) {
    Rooted<JSLinearString*> left(cx, &rope->leftChild()->asLinear());
    Rooted<JSLinearString*> right(cx, &rope->rightChild()->asLinear());

    const size_t lhsLength = left->length() - begin;
    const size_t rhsLength = len - lhsLength;

    if (isLatin1) {
      return SubstringInlineString<Latin1Char>(cx, left, right, begin,
                                               lhsLength, rhsLength);
    }
    return SubstringInlineString<char16_t>(cx, left, right, begin, lhsLength,
                                           rhsLength);
  }

  return NewDependentString(cx, str, begin, len);
}
/**
 * U+03A3 GREEK CAPITAL LETTER SIGMA has two different lower case mappings
 * depending on its context:
 * When it's preceded by a cased character and not followed by another cased
 * character, its lower case form is U+03C2 GREEK SMALL LETTER FINAL SIGMA.
 * Otherwise its lower case mapping is U+03C3 GREEK SMALL LETTER SIGMA.
 *
 * Unicode 9.0, §3.13 Default Case Algorithms
 *
 * |chars| points to |length| UTF-16 code units and |index| is the position of
 * the capital sigma within them. The return value is the lower case code unit
 * to use for that sigma. When JS_HAS_INTL_API is not set, the context is not
 * inspected and U+03C3 is always returned.
 */
static char16_t Final_Sigma(
const char16_t* chars, size_t length,
size_t index) {
MOZ_ASSERT(index < length);
MOZ_ASSERT(chars[index] == unicode::GREEK_CAPITAL_LETTER_SIGMA);
MOZ_ASSERT(unicode::ToLowerCase(unicode::GREEK_CAPITAL_LETTER_SIGMA) ==
unicode::GREEK_SMALL_LETTER_SIGMA);
#if JS_HAS_INTL_API
// Tell the analysis the BinaryProperty.contains function pointer called by
// mozilla::intl::String::Is{CaseIgnorable, Cased} cannot GC.
JS::AutoSuppressGCAnalysis nogc;
// Walk backwards from the sigma to the nearest code point that is not
// Case_Ignorable and record whether it is Cased.
bool precededByCased =
false;
for (size_t i = index; i > 0;) {
char16_t c = chars[--i];
char32_t codePoint = c;
// Combine a trail surrogate with the lead surrogate in front of it, if any.
if (unicode::IsTrailSurrogate(c) && i > 0) {
char16_t lead = chars[i - 1];
if (unicode::IsLeadSurrogate(lead)) {
codePoint = unicode::UTF16Decode(lead, c);
i--;
}
}
// Ignore any characters with the property Case_Ignorable.
// NB: We need to skip over all Case_Ignorable characters, even when
// they also have the Cased binary property.
if (mozilla::intl::String::IsCaseIgnorable(codePoint)) {
continue;
}
precededByCased = mozilla::intl::String::IsCased(codePoint);
break;
}
// Not preceded by a cased character: this cannot be a final sigma.
if (!precededByCased) {
return unicode::GREEK_SMALL_LETTER_SIGMA;
}
// Walk forwards from the sigma to the nearest code point that is not
// Case_Ignorable and record whether it is Cased.
bool followedByCased =
false;
for (size_t i = index + 1; i < length;) {
char16_t c = chars[i++];
char32_t codePoint = c;
// Combine a lead surrogate with the trail surrogate after it, if any.
if (unicode::IsLeadSurrogate(c) && i < length) {
char16_t trail = chars[i];
if (unicode::IsTrailSurrogate(trail)) {
codePoint = unicode::UTF16Decode(c, trail);
i++;
}
}
// Ignore any characters with the property Case_Ignorable.
// NB: We need to skip over all Case_Ignorable characters, even when
// they also have the Cased binary property.
if (mozilla::intl::String::IsCaseIgnorable(codePoint)) {
continue;
}
followedByCased = mozilla::intl::String::IsCased(codePoint);
break;
}
// Preceded by a cased character and not followed by one: final sigma.
if (!followedByCased) {
return unicode::GREEK_SMALL_LETTER_FINAL_SIGMA;
}
#endif
return unicode::GREEK_SMALL_LETTER_SIGMA;
}
// Lower-cases |srcChars[startIndex..srcLength)| into |destChars|, writing
// from index |startIndex| onwards.
//
// If |srcLength == destLength| is true, the destination buffer was allocated
// with the same size as the source buffer. When we append characters which
// have special casing mappings, we test |srcLength == destLength| to decide
// if we need to back out and reallocate a sufficiently large destination
// buffer. Otherwise the destination buffer was allocated with the correct
// size to hold all lower case mapped characters, i.e.
// |destLength == ToLowerCaseLength(srcChars, 0, srcLength)| is true.
//
// Returns |srcLength| when every character was converted. Otherwise returns
// the source index of the character that did not fit, so the caller can
// enlarge the buffer and resume from that index.
template <
typename CharT>
static size_t ToLowerCaseImpl(CharT* destChars,
const CharT* srcChars,
size_t startIndex, size_t srcLength,
size_t destLength) {
MOZ_ASSERT(startIndex < srcLength);
MOZ_ASSERT(srcLength <= destLength);
if constexpr (std::is_same_v<CharT, Latin1Char>) {
MOZ_ASSERT(srcLength == destLength);
}
// |i| indexes the source and |j| the destination; they diverge once a
// character expands to two code units.
size_t j = startIndex;
for (size_t i = startIndex; i < srcLength; i++) {
CharT c = srcChars[i];
if constexpr (!std::is_same_v<CharT, Latin1Char>) {
// Surrogate pairs: the lead is copied unchanged and only the trail is
// replaced by its lower case counterpart.
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
char16_t trail = srcChars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
trail = unicode::ToLowerCaseNonBMPTrail(c, trail);
destChars[j++] = c;
destChars[j++] = trail;
i++;
continue;
}
}
// Special case: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
// lowercases to <U+0069 U+0307>.
if (c == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
// Return if the output buffer is too small.
if (srcLength == destLength) {
return i;
}
destChars[j++] = CharT(
'i');
destChars[j++] = CharT(unicode::COMBINING_DOT_ABOVE);
continue;
}
// Special case: U+03A3 GREEK CAPITAL LETTER SIGMA lowercases to
// one of two codepoints depending on context.
if (c == unicode::GREEK_CAPITAL_LETTER_SIGMA) {
destChars[j++] = Final_Sigma(srcChars, srcLength, i);
continue;
}
}
c = unicode::ToLowerCase(c);
destChars[j++] = c;
}
MOZ_ASSERT(j == destLength);
return srcLength;
}
// Returns |length| plus one for every U+0130 found in
// |chars[startIndex..length)|, because that character lower-cases to two
// code units.
static size_t ToLowerCaseLength(const char16_t* chars, size_t startIndex,
                                size_t length) {
  size_t extraUnits = 0;
  for (size_t pos = startIndex; pos < length; pos++) {
    // U+0130 is lowercased to the two-element sequence <U+0069 U+0307>.
    const bool expands =
        chars[pos] == unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
    if (expands) {
      extraUnits++;
    }
  }
  return length + extraUnits;
}
// Lower-cases the linear string |str|, whose characters must be of type
// |CharT|.
//
// Returns |str| itself when no character changes, a static unit string for
// one-element Latin-1 inputs, and otherwise a newly built string. Returns
// nullptr when allocating or growing the character buffer fails.
template <
typename CharT>
static JSLinearString* ToLowerCase(JSContext* cx, JSLinearString* str) {
// Unlike toUpperCase, toLowerCase has the nice invariant that if the
// input is a Latin-1 string, the output is also a Latin-1 string.
StringChars<CharT> newChars(cx);
const size_t length = str->length();
size_t resultLength;
{
// The raw |chars| pointer is only used inside this no-GC scope.
AutoCheckCannotGC nogc;
const CharT* chars = str->chars<CharT>(nogc);
// We don't need extra special casing checks in the loop below,
// because U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+03A3
// GREEK CAPITAL LETTER SIGMA already have simple lower case mappings.
MOZ_ASSERT(unicode::ChangesWhenLowerCased(
unicode::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE),
"U+0130 has a simple lower case mapping");
MOZ_ASSERT(
unicode::ChangesWhenLowerCased(unicode::GREEK_CAPITAL_LETTER_SIGMA),
"U+03A3 has a simple lower case mapping");
// One element Latin-1 strings can be directly retrieved from the
// static strings cache.
if constexpr (std::is_same_v<CharT, Latin1Char>) {
if (length == 1) {
CharT lower = unicode::ToLowerCase(chars[0]);
MOZ_ASSERT(StaticStrings::hasUnit(lower));
return cx->staticStrings().getUnit(lower);
}
}
// Look for the first character that changes when lowercased.
size_t i = 0;
for (; i < length; i++) {
CharT c = chars[i];
if constexpr (!std::is_same_v<CharT, Latin1Char>) {
// Valid surrogate pairs are tested as a unit and skipped together.
if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
CharT trail = chars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
if (unicode::ChangesWhenLowerCasedNonBMP(c, trail)) {
break;
}
i++;
continue;
}
}
}
if (unicode::ChangesWhenLowerCased(c)) {
break;
}
}
// If no character needs to change, return the input string.
if (i == length) {
return str;
}
// First attempt: a buffer of the same length as the input. The unchanged
// prefix is copied verbatim and conversion starts at index |i|.
resultLength = length;
if (!newChars.maybeAlloc(cx, resultLength)) {
return nullptr;
}
PodCopy(newChars.data(nogc), chars, i);
size_t readChars =
ToLowerCaseImpl(newChars.data(nogc), chars, i, length, resultLength);
if constexpr (!std::is_same_v<CharT, Latin1Char>) {
// ToLowerCaseImpl stopped early because a character needs more room:
// grow the buffer to the exact size and resume where it stopped.
if (readChars < length) {
resultLength = ToLowerCaseLength(chars, readChars, length);
if (!newChars.maybeRealloc(cx, length, resultLength)) {
return nullptr;
}
MOZ_ALWAYS_TRUE(length == ToLowerCaseImpl(newChars.data(nogc), chars,
readChars, length,
resultLength));
}
}
else {
MOZ_ASSERT(readChars == length,
"Latin-1 strings don't have special lower case mappings");
}
}
return newChars.
template toStringDontDeflate<CanGC>(cx, resultLength);
}
// Linearizes |string| and lower-cases it with the ToLowerCase instantiation
// that matches its character width. Returns nullptr if linearization fails.
JSLinearString* js::StringToLowerCase(JSContext* cx, JSString* string) {
  JSLinearString* linear = string->ensureLinear(cx);
  if (linear == nullptr) {
    return nullptr;
  }

  return linear->hasLatin1Chars() ? ToLowerCase<Latin1Char>(cx, linear)
                                  : ToLowerCase<char16_t>(cx, linear);
}
// Native implementation of String.prototype.toLowerCase.
static bool str_toLowerCase(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "toLowerCase");
  CallArgs args = CallArgsFromVp(argc, vp);

  JSString* thisString =
      ToStringForStringFunction(cx, "toLowerCase", args.thisv());
  if (!thisString) {
    return false;
  }

  JSString* lowered = StringToLowerCase(cx, thisString);
  if (!lowered) {
    return false;
  }

  args.rval().setString(lowered);
  return true;
}
#if JS_HAS_INTL_API
// String.prototype.toLocaleLowerCase is self-hosted when Intl is exposed,
// with core functionality performed by the intrinsic below.
// Maps the language tag in |str| to the locale to use for case mapping.
//
// Returns "lt", "tr" or "az" when the tag's language subtag is one of those,
// the empty string (the ICU root locale) for every other tag, and nullptr
// when |str| cannot be linearized.
static const char* CaseMappingLocale(JSContext* cx, JSString* str) {
  JSLinearString* locale = str->ensureLinear(cx);
  if (!locale) {
    return nullptr;
  }

  MOZ_ASSERT(locale->length() >= 2, "locale is a valid language tag");

  // Lithuanian, Turkish, and Azeri have language dependent case mappings.
  static const char languagesWithSpecialCasing[][3] = {"lt", "tr", "az"};

  // All strings in |languagesWithSpecialCasing| are of length two, so we
  // only need to compare the first two characters to find a matching locale.
  // ES2017 Intl, §9.2.2 BestAvailableLocale
  const bool twoLetterLanguage =
      locale->length() == 2 || locale->latin1OrTwoByteChar(2) == '-';
  if (!twoLetterLanguage) {
    return "";  // ICU root locale
  }

  const char16_t first = locale->latin1OrTwoByteChar(0);
  const char16_t second = locale->latin1OrTwoByteChar(1);
  for (const auto& language : languagesWithSpecialCasing) {
    if (first == language[0] && second == language[1]) {
      return language;
    }
  }

  return "";  // ICU root locale
}
// True when |locale| is the empty string, i.e. the value CaseMappingLocale
// returns for languages without special casing rules.
static bool HasDefaultCasing(const char* locale) {
  return strcmp(locale, "") == 0;
}
// Intrinsic behind the self-hosted String.prototype.toLocaleLowerCase.
//
// Expects exactly two string arguments: the string to convert and the locale.
// Locales without special casing rules take the StringToLowerCase path; the
// others are converted by mozilla::intl::String::ToLocaleLowerCase.
bool js::intl_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp) {
  CallArgs args = CallArgsFromVp(argc, vp);
  MOZ_ASSERT(args.length() == 2);
  MOZ_ASSERT(args[0].isString());
  MOZ_ASSERT(args[1].isString());

  RootedString string(cx, args[0].toString());

  const char* locale = CaseMappingLocale(cx, args[1].toString());
  if (locale == nullptr) {
    return false;
  }

  // Call String.prototype.toLowerCase() for language independent casing.
  if (HasDefaultCasing(locale)) {
    JSString* lowered = StringToLowerCase(cx, string);
    if (!lowered) {
      return false;
    }
    args.rval().setString(lowered);
    return true;
  }

  AutoStableStringChars stableChars(cx);
  if (!stableChars.initTwoByte(cx, string)) {
    return false;
  }
  mozilla::Range<const char16_t> input = stableChars.twoByteRange();

  // Note: maximum case mapping length is three characters, so the result
  // length might be > INT32_MAX. ICU will fail in this case.
  static_assert(JSString::MAX_LENGTH <= INT32_MAX,
                "String length must fit in int32_t for ICU");

  static const size_t INLINE_CAPACITY = js::intl::INITIAL_CHAR_BUFFER_SIZE;
  intl::FormatBuffer<char16_t, INLINE_CAPACITY> buffer(cx);

  auto converted =
      mozilla::intl::String::ToLocaleLowerCase(locale, input, buffer);
  if (converted.isErr()) {
    intl::ReportInternalError(cx, converted.unwrapErr());
    return false;
  }

  JSString* lowered = buffer.toString(cx);
  if (!lowered) {
    return false;
  }

  args.rval().setString(lowered);
  return true;
}
#else
// When the Intl API is not exposed, String.prototype.toLocaleLowerCase is
// implemented in C++.
// Native implementation of String.prototype.toLocaleLowerCase for builds
// without the Intl API.
//
// Uses the runtime's |localeToLowerCase| callback when one is installed and
// otherwise falls back to the locale-independent StringToLowerCase.
static bool str_toLocaleLowerCase(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype",
                                        "toLocaleLowerCase");
  CallArgs args = CallArgsFromVp(argc, vp);

  RootedString thisString(
      cx, ToStringForStringFunction(cx, "toLocaleLowerCase", args.thisv()));
  if (!thisString) {
    return false;
  }

  /*
   * Forcefully ignore the first (or any) argument and return toLowerCase(),
   * ECMA has reserved that argument, presumably for defining the locale.
   */
  const bool hasLocaleHook =
      cx->runtime()->localeCallbacks &&
      cx->runtime()->localeCallbacks->localeToLowerCase;
  if (hasLocaleHook) {
    RootedValue hookResult(cx);
    if (!cx->runtime()->localeCallbacks->localeToLowerCase(cx, thisString,
                                                           &hookResult)) {
      return false;
    }
    args.rval().set(hookResult);
    return true;
  }

  Rooted<JSLinearString*> linear(cx, thisString->ensureLinear(cx));
  if (!linear) {
    return false;
  }

  JSString* lowered = StringToLowerCase(cx, linear);
  if (!lowered) {
    return false;
  }

  args.rval().setString(lowered);
  return true;
}
#endif // JS_HAS_INTL_API
// Latin-1 overload: reports whether |charCode| has a special upper case
// mapping.
static inline bool ToUpperCaseHasSpecialCasing(Latin1Char charCode) {
  // U+00DF LATIN SMALL LETTER SHARP S is the only Latin-1 code point with
  // special casing rules, so detect it inline.
  const bool isSharpS = (charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);

  MOZ_ASSERT(isSharpS ==
             unicode::ChangesWhenUpperCasedSpecialCasing(charCode));

  return isSharpS;
}
// Two-byte overload: defers to the unicode special-casing query.
static inline bool ToUpperCaseHasSpecialCasing(char16_t charCode) {
  const bool special = unicode::ChangesWhenUpperCasedSpecialCasing(charCode);
  return special;
}
// Latin-1 overload: length of the special upper case mapping of |charCode|,
// which must be U+00DF.
static inline size_t ToUpperCaseLengthSpecialCasing(Latin1Char charCode) {
  MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);

  // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
  constexpr size_t sharpSUpperLength = 2;
  return sharpSUpperLength;
}
// Two-byte overload: length of the special upper case mapping of |charCode|,
// which must have such a mapping.
static inline size_t ToUpperCaseLengthSpecialCasing(char16_t charCode) {
  MOZ_ASSERT(ToUpperCaseHasSpecialCasing(charCode));

  const size_t mappedLength = unicode::LengthUpperCaseSpecialCasing(charCode);
  return mappedLength;
}
// Latin-1 destination overload: writes the upper case mapping of U+00DF
// ("SS") into |elements| at |*index| and advances |*index| by two.
static inline void ToUpperCaseAppendUpperCaseSpecialCasing(char16_t charCode,
                                                           Latin1Char* elements,
                                                           size_t* index) {
  // U+00DF LATIN SMALL LETTER SHARP S is uppercased to two 'S'.
  MOZ_ASSERT(charCode == unicode::LATIN_SMALL_LETTER_SHARP_S);
  static_assert('S' <= JSString::MAX_LATIN1_CHAR, "'S' is a Latin-1 character");

  size_t& cursor = *index;
  elements[cursor++] = 'S';
  elements[cursor++] = 'S';
}
// Two-byte destination overload: forwards to
// unicode::AppendUpperCaseSpecialCasing with the same arguments.
static inline void ToUpperCaseAppendUpperCaseSpecialCasing(char16_t charCode,
                                                           char16_t* elements,
                                                           size_t* index) {
  unicode::AppendUpperCaseSpecialCasing(charCode, elements, index);
}
// Upper-cases |srcChars[startIndex..srcLength)| into |destChars|, writing
// from index |startIndex| onwards.
//
// See ToLowerCaseImpl for an explanation of the parameters.
//
// Returns |srcLength| when every character was converted. When a character
// with a special casing mapping is met while |srcLength == destLength|, the
// source index of that character is returned instead so the caller can
// enlarge the buffer and resume from there.
template <
typename DestChar,
typename SrcChar>
static size_t ToUpperCaseImpl(DestChar* destChars,
const SrcChar* srcChars,
size_t startIndex, size_t srcLength,
size_t destLength) {
static_assert(std::is_same_v<SrcChar, Latin1Char> ||
!std::is_same_v<DestChar, Latin1Char>,
"cannot write non-Latin-1 characters into Latin-1 string");
MOZ_ASSERT(startIndex < srcLength);
MOZ_ASSERT(srcLength <= destLength);
// |i| indexes the source and |j| the destination; they diverge once a
// special casing mapping has been appended.
size_t j = startIndex;
for (size_t i = startIndex; i < srcLength; i++) {
char16_t c = srcChars[i];
if constexpr (!std::is_same_v<DestChar, Latin1Char>) {
// Surrogate pairs: the lead is copied unchanged and only the trail is
// replaced by its upper case counterpart.
if (unicode::IsLeadSurrogate(c) && i + 1 < srcLength) {
char16_t trail = srcChars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
trail = unicode::ToUpperCaseNonBMPTrail(c, trail);
destChars[j++] = c;
destChars[j++] = trail;
i++;
continue;
}
}
}
// The |c > 0x7f| test keeps ASCII characters away from the special casing
// lookup.
if (MOZ_UNLIKELY(c > 0x7f &&
ToUpperCaseHasSpecialCasing(
static_cast<SrcChar>(c)))) {
// Return if the output buffer is too small.
if (srcLength == destLength) {
return i;
}
ToUpperCaseAppendUpperCaseSpecialCasing(c, destChars, &j);
continue;
}
c = unicode::ToUpperCase(c);
if constexpr (std::is_same_v<DestChar, Latin1Char>) {
MOZ_ASSERT(c <= JSString::MAX_LATIN1_CHAR);
}
destChars[j++] = c;
}
MOZ_ASSERT(j == destLength);
return srcLength;
}
// Returns |length| plus the number of extra code units needed by the special
// casing mappings of the characters in |chars[startIndex..length)|.
template <typename CharT>
static size_t ToUpperCaseLength(const CharT* chars, size_t startIndex,
                                size_t length) {
  size_t upperLength = length;

  for (size_t pos = startIndex; pos < length; pos++) {
    const char16_t c = chars[pos];

    // ASCII characters never have a special casing mapping.
    if (c <= 0x7f) {
      continue;
    }

    if (!ToUpperCaseHasSpecialCasing(static_cast<CharT>(c))) {
      continue;
    }

    // The mapping replaces one source unit, hence the "- 1".
    upperLength += ToUpperCaseLengthSpecialCasing(static_cast<CharT>(c)) - 1;
  }

  return upperLength;
}
// Upper-cases |chars[0..length)| into |newChars|. Code units before
// |startIndex| are copied unchanged and conversion starts at |startIndex|.
//
// A buffer of |length| units is tried first. If ToUpperCaseImpl stops early
// because of a special casing mapping, the buffer is grown to the exact size
// and conversion resumes where it stopped. |*resultLength| receives the
// length of the converted text. Returns false when allocating or growing the
// buffer fails.
template <typename DestChar, typename SrcChar>
static inline bool ToUpperCase(JSContext* cx, StringChars<DestChar>& newChars,
                               const SrcChar* chars, size_t startIndex,
                               size_t length, size_t* resultLength) {
  MOZ_ASSERT(startIndex < length);

  AutoCheckCannotGC nogc;

  *resultLength = length;
  if (!newChars.maybeAlloc(cx, length)) {
    return false;
  }

  CopyChars(newChars.data(nogc), chars, startIndex);

  const size_t consumed =
      ToUpperCaseImpl(newChars.data(nogc), chars, startIndex, length, length);
  if (consumed >= length) {
    // Everything fit into the same-sized buffer.
    return true;
  }

  const size_t exactLength = ToUpperCaseLength(chars, consumed, length);
  *resultLength = exactLength;

  if (!newChars.maybeRealloc(cx, length, exactLength)) {
    return false;
  }

  MOZ_ALWAYS_TRUE(length == ToUpperCaseImpl(newChars.data(nogc), chars,
                                            consumed, length, exactLength));
  return true;
}
/**
 * Return the upper-cased form of the linear string |str|, whose characters
 * are stored as |CharT|.
 *
 * Returns |str| itself when no character changes, a static unit string for
 * most one-character Latin-1 inputs, and otherwise a newly created string.
 * Returns nullptr when the ToUpperCase buffer helper fails.
 */
template <
typename CharT>
static JSLinearString* ToUpperCase(JSContext* cx, JSLinearString* str) {
using Latin1StringChars = StringChars<Latin1Char>;
using TwoByteStringChars = StringChars<char16_t>;
// Holds the output buffer; which alternative is constructed is decided below.
mozilla::MaybeOneOf<Latin1StringChars, TwoByteStringChars> newChars;
const size_t length = str->length();
size_t resultLength;
{
// |chars| is only used inside this scope; the final string is created after
// the scope (and its AutoCheckCannotGC) has ended.
AutoCheckCannotGC nogc;
const CharT* chars = str->chars<CharT>(nogc);
// Most one element Latin-1 strings can be directly retrieved from the
// static strings cache.
if constexpr (std::is_same_v<CharT, Latin1Char>) {
if (length == 1) {
Latin1Char c = chars[0];
// These three characters are excluded from the fast path; see the assertion
// below for what distinguishes them.
if (c != unicode::MICRO_SIGN &&
c != unicode::LATIN_SMALL_LETTER_Y_WITH_DIAERESIS &&
c != unicode::LATIN_SMALL_LETTER_SHARP_S) {
char16_t upper = unicode::ToUpperCase(c);
MOZ_ASSERT(upper <= JSString::MAX_LATIN1_CHAR);
MOZ_ASSERT(StaticStrings::hasUnit(upper));
return cx->staticStrings().getUnit(upper);
}
MOZ_ASSERT(unicode::ToUpperCase(c) > JSString::MAX_LATIN1_CHAR ||
ToUpperCaseHasSpecialCasing(c));
}
}
// Look for the first character that changes when uppercased.
size_t i = 0;
for (; i < length; i++) {
CharT c = chars[i];
if constexpr (!std::is_same_v<CharT, Latin1Char>) {
// A lead surrogate followed by a trail surrogate is tested as a pair and,
// if unchanged, skipped as a pair.
if (unicode::IsLeadSurrogate(c) && i + 1 < length) {
CharT trail = chars[i + 1];
if (unicode::IsTrailSurrogate(trail)) {
if (unicode::ChangesWhenUpperCasedNonBMP(c, trail)) {
break;
}
i++;
continue;
}
}
}
if (unicode::ChangesWhenUpperCased(c)) {
break;
}
if (MOZ_UNLIKELY(c > 0x7f && ToUpperCaseHasSpecialCasing(c))) {
break;
}
}
// If no character needs to change, return the input string.
if (i == length) {
return str;
}
// The string changes when uppercased, so we must create a new string.
// Can it be Latin-1?
//
// If the original string is Latin-1, it can -- unless the string
// contains U+00B5 MICRO SIGN or U+00FF SMALL LETTER Y WITH DIAERESIS,
// the only Latin-1 codepoints that don't uppercase within Latin-1.
// Search for those codepoints to decide whether the new string can be
// Latin-1.
// If the original string is a two-byte string, its uppercase form is
// so rarely Latin-1 that we don't even consider creating a new
// Latin-1 string.
if constexpr (std::is_same_v<CharT, Latin1Char>) {
// Only [i, length) is searched; the loop above already examined the
// characters before |i| and found none that change.
bool resultIsLatin1 = std::none_of(chars + i, chars + length, [](
auto c) {
bool upperCaseIsTwoByte =
c == unicode::MICRO_SIGN ||
c == unicode::LATIN_SMALL_LETTER_Y_WITH_DIAERESIS;
MOZ_ASSERT(upperCaseIsTwoByte ==
(unicode::ToUpperCase(c) > JSString::MAX_LATIN1_CHAR));
return upperCaseIsTwoByte;
});
if (resultIsLatin1) {
newChars.construct<Latin1StringChars>(cx);
if (!ToUpperCase(cx, newChars.ref<Latin1StringChars>(), chars, i,
length, &resultLength)) {
return nullptr;
}
}
else {
newChars.construct<TwoByteStringChars>(cx);
if (!ToUpperCase(cx, newChars.ref<TwoByteStringChars>(), chars, i,
length, &resultLength)) {
return nullptr;
}
}
}
else {
newChars.construct<TwoByteStringChars>(cx);
if (!ToUpperCase(cx, newChars.ref<TwoByteStringChars>(), chars, i, length,
&resultLength)) {
return nullptr;
}
}
}
// Turn whichever buffer alternative was constructed into a string of
// |resultLength| characters.
auto toString = [&](
auto& chars) {
return chars.
template toStringDontDeflate<CanGC>(cx, resultLength);
};
return newChars.mapNonEmpty(toString);
}
/**
 * Linearize |string| and upper-case it, dispatching on the character width of
 * the linear string. Returns nullptr if linearization or the conversion fails.
 */
JSLinearString* js::StringToUpperCase(JSContext* cx, JSString* string) {
  JSLinearString* linear = string->ensureLinear(cx);
  if (!linear) {
    return nullptr;
  }

  return linear->hasLatin1Chars() ? ToUpperCase<Latin1Char>(cx, linear)
                                  : ToUpperCase<char16_t>(cx, linear);
}
/**
 * Native implementation of String.prototype.toUpperCase: converts the |this|
 * value to a string and stores its upper-cased form in the return value.
 */
static bool str_toUpperCase(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "toUpperCase");
  CallArgs args = CallArgsFromVp(argc, vp);

  JSString* thisString =
      ToStringForStringFunction(cx, "toUpperCase", args.thisv());
  if (!thisString) {
    return false;
  }

  JSString* upper = StringToUpperCase(cx, thisString);
  if (!upper) {
    return false;
  }

  args.rval().setString(upper);
  return true;
}
#if JS_HAS_INTL_API
// When Intl is exposed, String.prototype.toLocaleUpperCase lives in
// self-hosted code; this intrinsic does the actual conversion for it.
//
// Expects exactly two string arguments: the string to convert and the locale.
bool js::intl_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp) {
  CallArgs args = CallArgsFromVp(argc, vp);
  MOZ_ASSERT(args.length() == 2);
  MOZ_ASSERT(args[0].isString());
  MOZ_ASSERT(args[1].isString());

  RootedString string(cx, args[0].toString());

  const char* locale = CaseMappingLocale(cx, args[1].toString());
  if (!locale) {
    return false;
  }

  // Locales with default casing take the language-independent
  // String.prototype.toUpperCase() path.
  if (HasDefaultCasing(locale)) {
    JSString* upper = js::StringToUpperCase(cx, string);
    if (!upper) {
      return false;
    }

    args.rval().setString(upper);
    return true;
  }

  // Everything else goes through the locale-aware conversion, which works on
  // two-byte characters.
  AutoStableStringChars stableInput(cx);
  if (!stableInput.initTwoByte(cx, string)) {
    return false;
  }
  mozilla::Range<const char16_t> input = stableInput.twoByteRange();

  // Note: maximum case mapping length is three characters, so the result
  // length might be > INT32_MAX. ICU will fail in this case.
  static_assert(JSString::MAX_LENGTH <= INT32_MAX,
                "String length must fit in int32_t for ICU");

  static const size_t INLINE_CAPACITY = js::intl::INITIAL_CHAR_BUFFER_SIZE;
  intl::FormatBuffer<char16_t, INLINE_CAPACITY> output(cx);

  auto status = mozilla::intl::String::ToLocaleUpperCase(locale, input, output);
  if (status.isErr()) {
    intl::ReportInternalError(cx, status.unwrapErr());
    return false;
  }

  JSString* upper = output.toString(cx);
  if (!upper) {
    return false;
  }

  args.rval().setString(upper);
  return true;
}
#else
// When the Intl API is not exposed, String.prototype.toLocaleUpperCase is
// implemented in C++.
/**
 * Native String.prototype.toLocaleUpperCase for builds without the Intl API.
 *
 * Delegates to the embedding's localeToUpperCase callback when one is
 * installed; otherwise behaves like toUpperCase().
 */
static bool str_toLocaleUpperCase(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype",
                                        "toLocaleUpperCase");
  CallArgs args = CallArgsFromVp(argc, vp);

  RootedString str(
      cx, ToStringForStringFunction(cx, "toLocaleUpperCase", args.thisv()));
  if (!str) {
    return false;
  }

  /*
   * Forcefully ignore the first (or any) argument and return toUpperCase(),
   * ECMA has reserved that argument, presumably for defining the locale.
   */
  JSRuntime* rt = cx->runtime();
  if (rt->localeCallbacks && rt->localeCallbacks->localeToUpperCase) {
    RootedValue converted(cx);
    if (!rt->localeCallbacks->localeToUpperCase(cx, str, &converted)) {
      return false;
    }

    args.rval().set(converted);
    return true;
  }

  // No callback installed: use the locale-independent conversion.
  Rooted<JSLinearString*> linear(cx, str->ensureLinear(cx));
  if (!linear) {
    return false;
  }

  JSString* upper = StringToUpperCase(cx, linear);
  if (!upper) {
    return false;
  }

  args.rval().setString(upper);
  return true;
}
#endif // JS_HAS_INTL_API
#if JS_HAS_INTL_API
// String.prototype.localeCompare is self-hosted when Intl functionality is
// exposed, and the only intrinsics it requires are provided in the
// implementation of Intl.Collator.
#else
// Without Intl functionality, String.prototype.localeCompare is a C++ native.
// It defers to the embedding's JSLocaleCallbacks::localeCompare hook when one
// is installed and otherwise falls back to CompareStrings.
static bool str_localeCompare(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype",
                                        "localeCompare");
  CallArgs args = CallArgsFromVp(argc, vp);

  RootedString str(
      cx, ToStringForStringFunction(cx, "localeCompare", args.thisv()));
  if (!str) {
    return false;
  }

  // The string to compare against is the first argument, converted to string.
  RootedString thatStr(cx, ToString<CanGC>(cx, args.get(0)));
  if (!thatStr) {
    return false;
  }

  JSRuntime* rt = cx->runtime();
  if (rt->localeCallbacks && rt->localeCallbacks->localeCompare) {
    RootedValue hookResult(cx);
    if (!rt->localeCallbacks->localeCompare(cx, str, thatStr, &hookResult)) {
      return false;
    }

    args.rval().set(hookResult);
    return true;
  }

  int32_t ordering;
  if (!CompareStrings(cx, str, thatStr, &ordering)) {
    return false;
  }

  args.rval().setInt32(ordering);
  return true;
}
#endif // JS_HAS_INTL_API
#if JS_HAS_INTL_API
// ES2017 draft rev 45e890512fd77add72cc0ee742785f9f6f6482de
// 21.1.3.12 String.prototype.normalize ( [ form ] )
//
// This native only exists when ICU's normalization support is available.
// The input string itself is returned whenever it is already normalized.
static bool str_normalize(JSContext* cx, unsigned argc, Value* vp) {
  using NormalizationForm = mozilla::intl::String::NormalizationForm;
  using AlreadyNormalized = mozilla::intl::String::AlreadyNormalized;

  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "normalize");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Steps 1-2.
  RootedString str(cx,
                   ToStringForStringFunction(cx, "normalize", args.thisv()));
  if (!str) {
    return false;
  }

  // Step 3: NFC is the form used when no (defined) argument is passed.
  NormalizationForm form = NormalizationForm::NFC;
  if (args.hasDefined(0)) {
    // Step 4.
    JSLinearString* formName = ArgToLinearString(cx, args, 0);
    if (!formName) {
      return false;
    }

    // Step 5: only the four standard form names are accepted.
    if (EqualStrings(formName, cx->names().NFC)) {
      form = NormalizationForm::NFC;
    } else if (EqualStrings(formName, cx->names().NFD)) {
      form = NormalizationForm::NFD;
    } else if (EqualStrings(formName, cx->names().NFKC)) {
      form = NormalizationForm::NFKC;
    } else if (EqualStrings(formName, cx->names().NFKD)) {
      form = NormalizationForm::NFKD;
    } else {
      JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                                JSMSG_INVALID_NORMALIZE_FORM);
      return false;
    }
  }

  // Latin-1 strings are already in Normalization Form C.
  if (form == NormalizationForm::NFC && str->hasLatin1Chars()) {
    // Step 7.
    args.rval().setString(str);
    return true;
  }

  // Step 6.
  AutoStableStringChars stable(cx);
  if (!stable.initTwoByte(cx, str)) {
    return false;
  }
  mozilla::Range<const char16_t> source = stable.twoByteRange();

  static const size_t INLINE_CAPACITY = js::intl::INITIAL_CHAR_BUFFER_SIZE;
  intl::FormatBuffer<char16_t, INLINE_CAPACITY> normalizedChars(cx);

  auto outcome =
      mozilla::intl::String::Normalize(form, source, normalizedChars);
  if (outcome.isErr()) {
    intl::ReportInternalError(cx, outcome.unwrapErr());
    return false;
  }

  // Nothing changed: hand back the input string instead of a copy.
  if (outcome.unwrap() == AlreadyNormalized::Yes) {
    // Step 7.
    args.rval().setString(str);
    return true;
  }

  JSString* normalized = normalizedChars.toString(cx);
  if (!normalized) {
    return false;
  }

  // Step 7.
  args.rval().setString(normalized);
  return true;
}
#endif // JS_HAS_INTL_API
/**
 * IsStringWellFormedUnicode ( string )
 * https://tc39.es/ecma262/#sec-isstringwellformedunicode
 *
 * Stores in |*isWellFormedUpTo| the length of the well-formed prefix of
 * |str|; the string is well-formed exactly when that equals its length.
 * Two-byte strings are linearized first. Returns false (leaving
 * |*isWellFormedUpTo| at 0) if linearization fails.
 */
static bool IsStringWellFormedUnicode(JSContext* cx, HandleString str,
                                      size_t* isWellFormedUpTo) {
  MOZ_ASSERT(isWellFormedUpTo);
  *isWellFormedUpTo = 0;

  const size_t length = str->length();

  // Latin1 chars are well-formed.
  if (str->hasLatin1Chars()) {
    *isWellFormedUpTo = length;
    return true;
  }

  JSLinearString* linearStr = str->ensureLinear(cx);
  if (!linearStr) {
    return false;
  }

  {
    AutoCheckCannotGC nogc;
    *isWellFormedUpTo =
        Utf16ValidUpTo(Span{linearStr->twoByteChars(nogc), length});
  }
  return true;
}
/**
 * Well-Formed Unicode Strings (Stage 3 proposal)
 *
 * String.prototype.isWellFormed
 * https://tc39.es/proposal-is-usv-string/#sec-string.prototype.iswellformed
 *
 * Returns true through |args.rval()| when the well-formed prefix of the
 * |this| string covers the entire string.
 */
static bool str_isWellFormed(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "isWellFormed");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Step 1. Let O be ? RequireObjectCoercible(this value).
  // Step 2. Let S be ? ToString(O).
  RootedString str(
      cx, ToStringForStringFunction(cx, "isWellFormed", args.thisv()));
  if (!str) {
    return false;
  }

  // Step 3. Return IsStringWellFormedUnicode(S).
  size_t validPrefixLength;
  if (!IsStringWellFormedUnicode(cx, str, &validPrefixLength)) {
    return false;
  }
  MOZ_ASSERT(validPrefixLength <= str->length());

  args.rval().setBoolean(validPrefixLength == str->length());
  return true;
}
/**
 * Well-Formed Unicode Strings (Stage 3 proposal)
 *
 * String.prototype.toWellFormed
 * https://tc39.es/proposal-is-usv-string/#sec-string.prototype.towellformed
 *
 * A string that is already well-formed is returned as is. Otherwise a
 * two-byte copy is made, the first offending code unit is overwritten with
 * U+FFFD and the rest of the copy is passed to EnsureUtf16ValiditySpan.
 */
static bool str_toWellFormed(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "toWellFormed");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Step 1. Let O be ? RequireObjectCoercible(this value).
  // Step 2. Let S be ? ToString(O).
  RootedString str(
      cx, ToStringForStringFunction(cx, "toWellFormed", args.thisv()));
  if (!str) {
    return false;
  }

  // Step 3. Let strLen be the length of S.
  const size_t strLen = str->length();

  // Fast path: a well-formed string is its own result.
  size_t firstBadIndex;
  if (!IsStringWellFormedUnicode(cx, str, &firstBadIndex)) {
    return false;
  }
  if (firstBadIndex == strLen) {
    args.rval().setString(str);
    return true;
  }
  MOZ_ASSERT(firstBadIndex < strLen);

  // Step 4-6
  StringChars<char16_t> fixedChars(cx);
  if (!fixedChars.maybeAlloc(cx, strLen)) {
    return false;
  }

  {
    AutoCheckCannotGC nogc;

    JSLinearString* linear = str->ensureLinear(cx);
    MOZ_ASSERT(linear, "IsStringWellFormedUnicode linearized the string");

    // Work on a copy of the original characters.
    PodCopy(fixedChars.data(nogc), linear->twoByteChars(nogc), strLen);
    auto copy = mozilla::Span{fixedChars.data(nogc), strLen};

    // The code unit at |firstBadIndex| is known to be bad: replace it.
    copy[firstBadIndex] = unicode::REPLACEMENT_CHARACTER;

    // Everything after it still has to be checked.
    auto tail = copy.From(firstBadIndex + 1);
    if (!tail.IsEmpty()) {
      EnsureUtf16ValiditySpan(tail);
    }
  }

  JSString* wellFormed =
      fixedChars.toStringDontDeflateNonStatic<CanGC>(cx, strLen);
  if (!wellFormed) {
    return false;
  }

  // Step 7. Return result.
  args.rval().setString(wellFormed);
  return true;
}
// Function table exposing the two well-formedness natives, isWellFormed and
// toWellFormed. Both entries pass 0 for the two trailing JS_FN arguments
// (presumably the argument count and the flags -- confirm against JS_FN).
// The table is terminated by JS_FS_END.
static const JSFunctionSpec wellFormed_functions[] = {
JS_FN(
"isWellFormed", str_isWellFormed, 0, 0),
JS_FN(
"toWellFormed", str_toWellFormed, 0, 0),
JS_FS_END,
};
/**
 * Convert |value| to an index into a string of |length| characters.
 *
 * |*result| is set only when the integer value of |value| lies in
 * [0, length); otherwise it is left untouched. Returns false only when
 * ToInteger fails.
 */
static MOZ_ALWAYS_INLINE bool ToStringIndex(JSContext* cx, Handle<Value> value,
                                            size_t length,
                                            mozilla::Maybe<size_t>* result) {
  // Uncommon case: anything that is not already an int32.
  if (MOZ_UNLIKELY(!value.isInt32())) {
    double integer = 0.0;
    if (!ToInteger(cx, value, &integer)) {
      return false;
    }

    if (integer >= 0 && integer < length) {
      *result = mozilla::Some(size_t(integer));
    }
    return true;
  }

  // Common case. Negative int32 values become huge when cast to size_t, so
  // the single comparison rejects them as well.
  const size_t candidate = size_t(value.toInt32());
  if (candidate < length) {
    *result = mozilla::Some(candidate);
  }
  return true;
}
/**
 * Convert |value| to an index into a string of |length| characters, where a
 * negative value counts back from the end of the string.
 *
 * |*result| is set only when the adjusted index lies in [0, length);
 * otherwise it is left untouched. Returns false only when ToInteger fails.
 */
static MOZ_ALWAYS_INLINE bool ToRelativeStringIndex(
    JSContext* cx, Handle<Value> value, size_t length,
    mozilla::Maybe<size_t>* result) {
  // Uncommon case: anything that is not already an int32.
  if (MOZ_UNLIKELY(!value.isInt32())) {
    double relative = 0.0;
    if (!ToInteger(cx, value, &relative)) {
      return false;
    }

    if (relative < 0) {
      relative += length;
    }
    if (0 <= relative && relative < length) {
      *result = mozilla::Some(size_t(relative));
    }
    return true;
  }

  // Common case.
  int32_t relative = value.toInt32();
  if (relative < 0) {
    relative += int32_t(length);
  }

  // A value that is still negative turns into a huge size_t and is rejected.
  if (size_t(relative) < length) {
    *result = mozilla::Some(size_t(relative));
  }
  return true;
}
/**
 * 22.1.3.2 String.prototype.charAt ( pos )
 *
 * ES2024 draft rev 7d2644968bd56d54d2886c012d18698ff3f72c35
 *
 * Yields the empty string for an out-of-range position, otherwise the
 * one-unit string at that position.
 */
static bool str_charAt(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "charAt");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Steps 1-2.
  RootedString str(cx, ToStringForStringFunction(cx, "charAt", args.thisv()));
  if (!str) {
    return false;
  }

  // Step 3.
  mozilla::Maybe<size_t> position{};
  if (!ToStringIndex(cx, args.get(0), str->length(), &position)) {
    return false;
  }

  // Steps 4-5.
  if (position.isNothing()) {
    args.rval().setString(cx->runtime()->emptyString);
    return true;
  }
  MOZ_ASSERT(*position < str->length());

  // Step 6.
  auto* unitString =
      cx->staticStrings().getUnitStringForElement(cx, str, *position);
  if (!unitString) {
    return false;
  }

  args.rval().setString(unitString);
  return true;
}
/**
 * 22.1.3.3 String.prototype.charCodeAt ( pos )
 *
 * ES2024 draft rev 7d2644968bd56d54d2886c012d18698ff3f72c35
 *
 * Yields NaN for an out-of-range position, otherwise the code unit at that
 * position as an int32.
 */
bool js::str_charCodeAt(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "charCodeAt");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Steps 1-2.
  RootedString str(cx,
                   ToStringForStringFunction(cx, "charCodeAt", args.thisv()));
  if (!str) {
    return false;
  }

  // Step 3.
  mozilla::Maybe<size_t> position{};
  if (!ToStringIndex(cx, args.get(0), str->length(), &position)) {
    return false;
  }

  // Steps 4-5.
  if (position.isNothing()) {
    args.rval().setNaN();
    return true;
  }
  MOZ_ASSERT(*position < str->length());

  // Step 6.
  char16_t unit;
  if (!str->getChar(cx, *position, &unit)) {
    return false;
  }

  args.rval().setInt32(unit);
  return true;
}
/**
 * 22.1.3.4 String.prototype.codePointAt ( pos )
 *
 * ES2024 draft rev 7d2644968bd56d54d2886c012d18698ff3f72c35
 *
 * Yields undefined for an out-of-range position, otherwise the code point
 * obtained from JSString::getCodePoint at that position.
 */
bool js::str_codePointAt(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "codePointAt");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Steps 1-2.
  RootedString str(cx,
                   ToStringForStringFunction(cx, "codePointAt", args.thisv()));
  if (!str) {
    return false;
  }

  // Step 3.
  mozilla::Maybe<size_t> position{};
  if (!ToStringIndex(cx, args.get(0), str->length(), &position)) {
    return false;
  }

  // Steps 4-5.
  if (position.isNothing()) {
    args.rval().setUndefined();
    return true;
  }
  MOZ_ASSERT(*position < str->length());

  // Step 6.
  char32_t cp;
  if (!str->getCodePoint(cx, *position, &cp)) {
    return false;
  }

  // Step 7.
  args.rval().setInt32(cp);
  return true;
}
/**
 * 22.1.3.1 String.prototype.at ( index )
 *
 * ES2024 draft rev 7d2644968bd56d54d2886c012d18698ff3f72c35
 *
 * Like charAt, but negative indices count from the end of the string and an
 * out-of-range index yields undefined.
 */
static bool str_at(JSContext* cx, unsigned argc, Value* vp) {
  AutoJSMethodProfilerEntry pseudoFrame(cx, "String.prototype", "at");
  CallArgs args = CallArgsFromVp(argc, vp);

  // Steps 1-2.
  RootedString str(cx, ToStringForStringFunction(cx, "at", args.thisv()));
  if (!str) {
    return false;
  }

  // Steps 3-6.
  mozilla::Maybe<size_t> position{};
  if (!ToRelativeStringIndex(cx, args.get(0), str->length(), &position)) {
    return false;
  }

  // Step 7.
  if (position.isNothing()) {
    args.rval().setUndefined();
    return true;
  }
  MOZ_ASSERT(*position < str->length());

  // Step 8.
  auto* unitString =
      cx->staticStrings().getUnitStringForElement(cx, str, *position);
  if (!unitString) {
    return false;
  }

  args.rval().setString(unitString);
  return true;
}
/*
 * Boyer-Moore-Horspool superlinear search for pat:patlen in text:textlen.
 * The patlen argument must be positive and no greater than sBMHPatLenMax.
 *
 * Return the index of pat in text, or -1 if not found. sBMHBadPattern is
 * returned instead when one of the pattern characters used to build the skip
 * table does not fit into the table.
 */
static const uint32_t sBMHCharSetSize = 256; /* ISO-Latin-1 */
static const uint32_t sBMHPatLenMax = 255;   /* skip table element is uint8_t */
static const int sBMHBadPattern =
    -2; /* return value if pat is not ISO-Latin-1 */

template <typename TextChar, typename PatChar>
static int BoyerMooreHorspool(const TextChar* text, uint32_t textLen,
                              const PatChar* pat, uint32_t patLen) {
  MOZ_ASSERT(0 < patLen && patLen <= sBMHPatLenMax);

  // By default a mismatch shifts the window by the whole pattern length.
  uint8_t skip[sBMHCharSetSize];
  std::fill_n(skip, sBMHCharSetSize, uint8_t(patLen));

  // Characters occurring in the pattern (its last one excepted) shift the
  // window by their distance from the pattern's end.
  const uint32_t patLast = patLen - 1;
  for (uint32_t p = 0; p < patLast; p++) {
    const char16_t patChar = pat[p];
    if (patChar >= sBMHCharSetSize) {
      return sBMHBadPattern;
    }
    skip[patChar] = uint8_t(patLast - p);
  }

  // |windowEnd| is the text index aligned with the last pattern character.
  uint32_t windowEnd = patLast;
  while (windowEnd < textLen) {
    // Compare the window against the pattern from right to left.
    uint32_t t = windowEnd;
    uint32_t p = patLast;
    while (text[t] == pat[p]) {
      if (p == 0) {
        return static_cast<int>(t); /* safe: max string size */
      }
      --t;
      --p;
    }

    // Shift according to the text character under the window's last slot.
    const char16_t last = text[windowEnd];
    windowEnd += (last < sBMHCharSetSize) ? skip[last] : patLen;
  }
  return -1;
}
// Inner comparison policy for Matcher that compares the pattern tail with
// memcmp. Its extent is a byte count covering the pattern minus its first two
// characters (Matcher compares those itself).
//
// The equal-width requirement is a runtime assertion because Matcher is also
// instantiated with this policy for mixed character widths, even though that
// instantiation is never selected at runtime.
template <typename TextChar, typename PatChar>
struct MemCmp {
  using Extent = uint32_t;

  static MOZ_ALWAYS_INLINE Extent computeExtent(const PatChar*,
                                                uint32_t patLen) {
    const uint32_t tailChars = patLen - 2;
    return tailChars * sizeof(PatChar);
  }

  static MOZ_ALWAYS_INLINE bool match(const PatChar* p, const TextChar* t,
                                      Extent extent) {
    MOZ_ASSERT(sizeof(TextChar) == sizeof(PatChar));
    const int difference = memcmp(p, t, extent);
    return difference == 0;
  }
};
// Inner comparison policy for Matcher that compares character by character,
// which also works when text and pattern use different character widths.
// Its extent is the one-past-the-end pointer of the pattern.
template <typename TextChar, typename PatChar>
struct ManualCmp {
  using Extent = const PatChar*;

  static MOZ_ALWAYS_INLINE Extent computeExtent(const PatChar* pat,
                                                uint32_t patLen) {
    return pat + patLen;
  }

  static MOZ_ALWAYS_INLINE bool match(const PatChar* p, const TextChar* t,
                                      Extent extent) {
    // Walk both sequences in lockstep until the pattern is exhausted.
    while (p != extent) {
      if (*p++ != *t++) {
        return false;
      }
    }
    return true;
  }
};
// Find the first occurrence of pat[0..patlen) in text[0..textlen) and return
// its index, or -1 if there is none. |patlen| must be at least two.
//
// Candidate positions are located with a SIMD scan for the first two pattern
// characters; |InnerMatch| then compares the rest of the pattern.
template <class InnerMatch, typename TextChar, typename PatChar>
static int Matcher(const TextChar* text, uint32_t textlen, const PatChar* pat,
                   uint32_t patlen) {
  MOZ_ASSERT(patlen > 1);

  const typename InnerMatch::Extent extent =
      InnerMatch::computeExtent(pat, patlen);

  // The scan itself already matches this many leading pattern characters.
  const uint32_t inlineLookaheadChars = 2;

  // Number of text positions at which a match could start.
  const uint32_t startCount = textlen - patlen + 1;

  for (uint32_t start = 0; start < startCount; start++) {
    // The memchr2xN functions look for two adjacent characters, so the
    // scanned range has to reach one character past the last possible start
    // position. Example: when searching "abcdef" for "def", startCount is 4
    // ("d" is the last valid start), but scanning only "abcd" could never
    // find the pair "de". The extra character stays within the text because
    // the pattern has at least two characters.
    size_t searchLen = startCount - start + 1;

    const TextChar* hit;
    if (sizeof(TextChar) == 1) {
      MOZ_ASSERT(pat[0] <= 0xff);
      hit = (TextChar*)SIMD::memchr2x8((char*)text + start, pat[0], pat[1],
                                       searchLen);
    } else {
      hit = (TextChar*)SIMD::memchr2x16((char16_t*)(text + start),
                                        char16_t(pat[0]), char16_t(pat[1]),
                                        searchLen);
    }

    if (hit == nullptr) {
      return -1;
    }

    // Jump to the candidate and verify the remainder of the pattern. On a
    // mismatch the loop increment resumes the scan right after the candidate.
    start = static_cast<uint32_t>(hit - text);
    if (InnerMatch::match(pat + inlineLookaheadChars,
                          text + start + inlineLookaheadChars, extent)) {
      return start;
    }
  }
  return -1;
}
// Return the index of the first occurrence of pat[0..patLen) in
// text[0..textLen), or -1 if there is none. An empty pattern matches at 0.
template <typename TextChar, typename PatChar>
static MOZ_ALWAYS_INLINE int StringMatch(const TextChar* text, uint32_t textLen,
                                         const PatChar* pat, uint32_t patLen) {
  if (patLen == 0) {
    return 0;
  }
  if (textLen < patLen) {
    return -1;
  }

  // True when the pattern's characters are wider than the text's, i.e. the
  // pattern may contain characters the text cannot possibly hold.
  constexpr bool narrowTextWidePat =
      sizeof(TextChar) == 1 && sizeof(PatChar) > 1;

  if (narrowTextWidePat && pat[0] > 0xff) {
    return -1;
  }

  // Single-character patterns are a plain character scan.
  if (patLen == 1) {
    const TextChar* hit;
    if (sizeof(TextChar) == 1) {
      MOZ_ASSERT(pat[0] <= 0xff);
      hit = (TextChar*)SIMD::memchr8((char*)text, pat[0], textLen);
    } else {
      hit = (TextChar*)SIMD::memchr16((char16_t*)text, char16_t(pat[0]),
                                      textLen);
    }

    if (hit == nullptr) {
      return -1;
    }
    return hit - text;
  }

  // We use a fast two-character-wide search in Matcher below, so we need to
  // validate that pat[1] isn't outside the latin1 range up front if the
  // sizes are different.
  if (narrowTextWidePat && pat[1] > 0xff) {
    return -1;
  }

  /*
   * If the text or pattern string is short, BMH will be more expensive than
   * the basic linear scan due to initialization cost and a more complex loop
   * body. While the correct threshold is input-dependent, we can make a few
   * conservative observations:
   *  - When |textLen| is "big enough", the initialization time will be
   *    proportionally small, so the worst-case slowdown is minimized.
   *  - When |patLen| is "too small", even the best case for BMH will be
   *    slower than a simple scan for large |textLen| due to the more complex
   *    loop body of BMH.
   * From this, the values for "big enough" and "too small" are determined
   * empirically. See bug 526348.
   */
  if (textLen >= 512 && patLen >= 11 && patLen <= sBMHPatLenMax) {
    int bmhIndex = BoyerMooreHorspool(text, textLen, pat, patLen);
    if (bmhIndex != sBMHBadPattern) {
      return bmhIndex;
    }
    // Otherwise BMH could not handle this pattern; fall through.
  }

  /*
   * For big patterns with large potential overlap we want the SIMD-optimized
   * speed of memcmp. For small patterns, a simple loop is faster. We also can't
   * use memcmp if one of the strings is TwoByte and the other is Latin-1.
   */
  if (patLen > 128 && std::is_same_v<TextChar, PatChar>) {
    return Matcher<MemCmp<TextChar, PatChar>, TextChar, PatChar>(
        text, textLen, pat, patLen);
  }
  return Matcher<ManualCmp<TextChar, PatChar>, TextChar, PatChar>(
      text, textLen, pat, patLen);
}
static int32_t StringMatch(
const JSLinearString* text,
const JSLinearString* pat, uint32_t start = 0) {
MOZ_ASSERT(start <= text->length());
uint32_t textLen = text->length() - start;
uint32_t patLen = pat->length();
int match;
AutoCheckCannotGC nogc;
if (text->hasLatin1Chars()) {
const Latin1Char* textChars = text->latin1Chars(nogc) + start;
if (pat->hasLatin1Chars()) {
match = StringMatch(textChars, textLen, pat->latin1Chars(nogc), patLen);
}
else {
match = StringMatch(textChars, textLen, pat->twoByteChars(nogc), patLen);
}
}
else {
const char16_t* textChars = text->twoByteChars(nogc) + start;
if (pat->hasLatin1Chars()) {
match = StringMatch(textChars, textLen, pat->latin1Chars(nogc), patLen);
}
else {
match = StringMatch(textChars, textLen, pat->twoByteChars(nogc), patLen);
}
}
return (match == -1) ? -1 : start + match;
}
// NOTE(review): not referenced in the code visible here; going by its name it
// is the log2 of a ratio threshold used by the rope-matching code -- confirm
// at its use site.
static const size_t sRopeMatchThresholdRatioLog2 = 4;
// Public entry point for searching |pat| in |text| starting at |start|;
// forwards to the file-local StringMatch and returns its result.
int js::StringFindPattern(const JSLinearString* text, const JSLinearString* pat,
                          size_t start) {
  const int matchIndex = StringMatch(text, pat, start);
  return matchIndex;
}
// Vector of linear-string pointers using SystemAllocPolicy; RopeMatchImpl
// walks such a vector as the consecutive pieces of the text being searched.
// The 16 is presumably the inline capacity -- confirm against Vector.
using LinearStringVector = Vector<JSLinearString*, 16, SystemAllocPolicy>;
template <
typename TextChar,
typename PatChar>
static int RopeMatchImpl(
const AutoCheckCannotGC& nogc,
LinearStringVector& strings,
const PatChar* pat,
size_t patLen) {
/* Absolute offset from the beginning of the logical text string. */
int pos = 0;
for (JSLinearString** outerp = strings.begin(); outerp != strings.end();
++outerp) {
/* Try to find a match within 'outer'. */
JSLinearString* outer = *outerp;
const TextChar* chars = outer->chars<TextChar>(nogc);
size_t len = outer->length();
int matchResult = StringMatch(chars, len, pat, patLen);
--> --------------------
--> maximum size reached
--> --------------------