// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Iterate through all chars in BMP except surrogates. for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { if (i >= static_cast<UChar32>(kSurrogateStart) &&
i <= static_cast<UChar32>(kSurrogateEnd)) { continue; // Ignore surrogate range
}
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Check to see if all characters in the case-folding equivalence // class as defined by UnicodeSet::closeOver all map to the same // canonical value.
UChar32 canonical = RegExpCaseFolding::Canonicalize(i); bool class_has_matching_canonical_char = false; bool class_has_non_matching_canonical_char = false; for (int32_t j = 0; j < current.getRangeCount(); j++) { for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
c++) { if (c == i) { continue;
}
UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); if (canonical == other_canonical) {
class_has_matching_canonical_char = true;
} else {
class_has_non_matching_canonical_char = true;
}
}
} // If any other character in i's equivalence class has a // different canonical value, then i needs special handling. If // no other character shares a canonical value with i, we can // ignore i when adding alternatives for case-independent // comparison. If at least one other character shares a // canonical value, then i needs special handling. if (class_has_non_matching_canonical_char) { if (class_has_matching_canonical_char) {
special_add.add(i);
} else {
ignore.add(i);
}
}
}
// Verify that no Unicode equivalence class contains two non-trivial // JS equivalence classes. Every character in SpecialAddSet has the // same canonical value as every other non-IgnoreSet character in // its Unicode equivalence class. Therefore, if we call closeOver on // a set containing no IgnoreSet characters, the only characters // that must be removed from the result are in IgnoreSet. This fact // is used in CharacterRange::AddCaseEquivalents. for (int32_t i = 0; i < special_add.getRangeCount(); i++) { for (UChar32 c = special_add.getRangeStart(i);
c <= special_add.getRangeEnd(i); c++) {
UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
current.set(c, c);
current.closeOver(USET_CASE_INSENSITIVE);
current.removeAll(ignore); for (int32_t j = 0; j < current.getRangeCount(); j++) { for (UChar32 c2 = current.getRangeStart(j);
c2 <= current.getRangeEnd(j); c2++) {
CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
}
}
}
}
void WriteHeader(constchar* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
<< "// Use of this source code is governed by a BSD-style license that\n"
<< "// can be found in the LICENSE file.\n\n"
<< "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
<< "// The following functions are used to build UnicodeSets\n"
<< "// for special cases where the case-folding algorithm used by\n"
<< "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
<< "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
<< "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n"
<< "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n"
<< "namespace internal {\n\n";
Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.