// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// # Table generated by this Python code (bit 0x02 is currently unused):
// TODO(mbar) Move Python code for generation of table to BUILD and link here.
// NOTE: The kAsciiPropertyBits table used within this code was generated by
// Python code of the following form. (Bit 0x02 is currently unused and
// available.)
//
// def Hex2(n):
//   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
// def IsPunct(ch):
//   return (ord(ch) >= 32 and ord(ch) < 127 and
//           not ch.isspace() and not ch.isalnum())
// def IsBlank(ch):
//   return ch in ' \t'
// def IsCntrl(ch):
//   return ord(ch) < 32 or ord(ch) == 127
// def IsXDigit(ch):
//   return ch.isdigit() or ch.lower() in 'abcdef'
// for i in range(128):
//   ch = chr(i)
//   mask = ((ch.isalpha() and 0x01 or 0) |
//           (ch.isalnum() and 0x04 or 0) |
//           (ch.isspace() and 0x08 or 0) |
//           (IsPunct(ch) and 0x10 or 0) |
//           (IsBlank(ch) and 0x20 or 0) |
//           (IsCntrl(ch) and 0x40 or 0) |
//           (IsXDigit(ch) and 0x80 or 0))
//   print Hex2(mask) + ',',
//   if i % 16 == 7:
//     print ' //', Hex2(i & 0x78)
//   elif i % 16 == 15:
//     print
// clang-format off
// Array of bitfields holding character information. Each bit value corresponds
// to a particular character feature. For readability, and because the value
// of these bits is tightly coupled to this implementation, the individual bits
// are not named. Note that bitfields for all characters above ASCII 127 are
// zero-initialized.
//
// Bit layout, as produced by the generator script quoted in the comment that
// precedes this table:
//   0x01 alpha   0x04 alnum   0x08 space   0x10 punct
//   0x20 blank   0x40 cntrl   0x80 xdigit  (0x02 is unused)
//
// Only the first 128 entries are listed explicitly; the remaining 128 entries
// of the 256-element array are value-initialized to zero by aggregate
// initialization.
ABSL_DLL const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
};
// Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
//
// With `ToUpper == true` the function tests for 'a'..'z' (the characters that
// an upper-casing pass must change); with `ToUpper == false` it tests for
// 'A'..'Z'.
//
// Implemented by:
//  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
//  2. Comparing to SCHAR_MIN + 26.
template <bool ToUpper>
constexpr bool AsciiInAZRange(unsigned char c) {
  // Offset that maps the first letter of the tested range onto SCHAR_MIN.
  constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
  constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
  // Using unsigned arithmetic as overflows/underflows are well defined.
  unsigned char u = static_cast<unsigned char>(c - sub);
  // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
  return static_cast<signed char>(u) < threshold;
}
// Copies `size` characters from `src` to `dst`, flipping the case of every
// character for which `AsciiInAZRange<ToUpper>` holds and copying all other
// characters unchanged.
//
// Force-inline so the compiler won't merge the short and long implementations.
// `src` may be null iff `size` is zero.
template <bool ToUpper>
ABSL_ATTRIBUTE_ALWAYS_INLINE inline constexpr void AsciiStrCaseFoldImpl(
    absl::Nonnull<char*> dst, absl::Nullable<const char*> src, size_t size) {
  // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
  // When we need to flip the case, we can xor with this bit to achieve the
  // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
  // could have chosen 'z' and 'Z', or any other pair of characters as they all
  // have the same single bit difference.
  constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';

  for (size_t i = 0; i < size; ++i) {
    unsigned char v = static_cast<unsigned char>(src[i]);
    // Branch-free form: xor with the case bit for letters, with 0 otherwise.
    v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
    dst[i] = static_cast<char>(v);
  }
}
// Minimum input size, in characters, at which case folding switches from the
// short-string code path to the long-string one.
constexpr size_t kCaseFoldThreshold{16};
// Long-string entry point for case folding: tells the optimizer that `size` is
// at least `kCaseFoldThreshold` and then delegates to `AsciiStrCaseFoldImpl`.
//
// No-inline so the compiler won't merge the short and long implementations.
// `src` may be null iff `size` is zero.
template <bool ToUpper>
ABSL_ATTRIBUTE_NOINLINE constexpr void AsciiStrCaseFoldLong(
    absl::Nonnull<char*> dst, absl::Nullable<const char*> src, size_t size) {
  // Callers must only reach this function with size >= kCaseFoldThreshold.
  ABSL_ASSUME(size >= kCaseFoldThreshold);
  AsciiStrCaseFoldImpl<ToUpper>(dst, src, size);
}
// Splitting to short and long strings to allow vectorization decisions // to be made separately in the long and short cases. // `src` may be null iff `size` is zero. template <bool ToUpper>
constexpr void AsciiStrCaseFold(absl::Nonnull<char*> dst,
absl::Nullable<constchar*> src, size_t size) {
size < kCaseFoldThreshold ? AsciiStrCaseFoldImpl<ToUpper>(dst, src, size)
: AsciiStrCaseFoldLong<ToUpper>(dst, src, size);
}
void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) { auto stripped = StripAsciiWhitespace(*str);
if (stripped.empty()) {
str->clear(); return;
}
auto input_it = stripped.begin(); auto input_end = stripped.end(); auto output_it = &(*str)[0]; bool is_ws = false;
for (; input_it < input_end; ++input_it) { if (is_ws) { // Consecutive whitespace? Keep only the last.
is_ws = absl::ascii_isspace(static_cast<unsignedchar>(*input_it)); if (is_ws) --output_it;
} else {
is_ws = absl::ascii_isspace(static_cast<unsignedchar>(*input_it));
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.