/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <ctype.h> // for |EOF|, |WEOF| #include <stdint.h> // for |uint32_t| #include <string.h> // for |memcpy|, et al #include"mozilla/MemoryChecking.h"
// This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in // particular the standalone software updater. In that case stub out // the macros provided by nsDebug.h which are only usable when linking XPCOM
#ifdef NS_NO_XPCOM # define NS_WARNING(msg) # define NS_ASSERTION(cond, msg) # define NS_ERROR(msg) #else # include "nsDebug.h"// for NS_ASSERTION #endif
/* * Some macros for converting char16_t (UTF-16) to and from Unicode scalar * values. * * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by * using "surrogate pairs". These consist of a high surrogate, i.e. a code * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point * in the range U+DC00 - U+DFFF, like this: * * U+D800 U+DC00 = U+10000 * U+D800 U+DC01 = U+10001 * ... * U+DBFF U+DFFE = U+10FFFE * U+DBFF U+DFFF = U+10FFFF * * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode * scalar values and are not well-formed UTF-16 except as high-surrogate / * low-surrogate pairs.
*/
#define PLANE1_BASE uint32_t(0x00010000) // High surrogates are in the range 0xD800 -- OxDBFF #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800) // Low surrogates are in the range 0xDC00 -- 0xDFFF #define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00) // Easier to type than NS_IS_HIGH_SURROGATE && NS_IS_LOW_SURROGATE #define NS_IS_SURROGATE_PAIR(h, l) \
(NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l)) // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE #define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800)
// Everything else is not a surrogate: 0x000 -- 0xD7FF, 0xE000 -- 0xFFFF
// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00) // I wonder whether we could somehow assert that H is a high surrogate // and L is a low surrogate #define SURROGATE_TO_UCS4(h, l) \
(((uint32_t(h) & 0x03FF) << 10) + (uint32_t(l) & 0x03FF) + PLANE1_BASE)
// Extract surrogates from a UCS4 char // Reference: the Unicode standard 4.0, section 3.9 // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and // 0xD7C0 == 0xD800 - 0x0080, // ((c - 0x10000) >> 10) + 0xD800 can be simplified to #define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + char16_t(0xD7C0)) // where it's to be noted that 0xD7C0 is not bitwise-OR'd // but added.
// Since 0x10000 & 0x03FF == 0, // (c - 0x10000) & 0x03FF == c & 0x03FF so that // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to #define L_SURROGATE(c) \
char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | char16_t(0xDC00))
staticbool equalsLatin1(const char_type* aStr1, constchar* aStr2, const size_t aN) { for (size_t i = aN; i > 0; --i, ++aStr1, ++aStr2) { if (*aStr1 != static_cast<char_type>(*aStr2)) { returnfalse;
}
}
returntrue;
}
// this version assumes that s2 is null-terminated and s1 has length n. // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, // we return 1. staticint compareASCIINullTerminated(const char_type* aStr1, size_t aN, constchar* aStr2) { for (; aN--; ++aStr1, ++aStr2) { if (!*aStr2) { return 1;
}
NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); if (!eq_int_type(to_int_type(*aStr1),
to_int_type(static_cast<char_type>(*aStr2)))) { return to_int_type(*aStr1) -
to_int_type(static_cast<char_type>(*aStr2));
}
}
if (*aStr2) { return -1;
}
return 0;
}
/** * Convert c to its lower-case form, but only if c is in the ASCII * range. Otherwise leave it alone.
*/ static char_type ASCIIToLower(char_type aChar) { if (aChar >= 'A' && aChar <= 'Z') { return char_type(aChar + ('a' - 'A'));
}
// this version assumes that s2 is null-terminated and s1 has length n. // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, // we return 1. staticint compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
size_t aN, constchar* aStr2) { for (; aN--; ++aStr1, ++aStr2) { if (!*aStr2) { return 1;
}
NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), "Unexpected uppercase character");
char_type lower_s1 = ASCIIToLower(*aStr1); if (lower_s1 != static_cast<char_type>(*aStr2)) { return to_int_type(lower_s1) -
to_int_type(static_cast<char_type>(*aStr2));
}
}
if (*aStr2) { return -1;
}
return 0;
}
static size_t length(const char_type* aStr) {
size_t result = 0; while (!eq(*aStr++, char_type(0))) {
++result;
} return result;
}
staticconst char_type* find(const char_type* aStr, size_t aN,
char_type aChar) { while (aN--) { if (eq(*aStr, aChar)) { return aStr;
}
++aStr;
}
// this version assumes that s2 is null-terminated and s1 has length n. // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, // we return 1. staticint compareASCIINullTerminated(const char_type* aStr1, size_t aN, constchar* aStr2) { // can't use strcmp here because we don't want to stop when aStr1 // contains a null for (; aN--; ++aStr1, ++aStr2) { if (!*aStr2) { return 1;
}
NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); if (*aStr1 != *aStr2) { return to_int_type(*aStr1) - to_int_type(*aStr2);
}
}
if (*aStr2) { return -1;
}
return 0;
}
/** * Convert c to its lower-case form, but only if c is ASCII.
*/ static char_type ASCIIToLower(char_type aChar) { if (aChar >= 'A' && aChar <= 'Z') { return char_type(aChar + ('a' - 'A'));
}
// this version assumes that s2 is null-terminated and s1 has length n. // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, // we return 1. staticint compareLowerCaseToASCIINullTerminated(const char_type* aStr1,
size_t aN, constchar* aStr2) { for (; aN--; ++aStr1, ++aStr2) { if (!*aStr2) { return 1;
}
NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character");
NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), "Unexpected uppercase character");
char_type lower_s1 = ASCIIToLower(*aStr1); if (lower_s1 != *aStr2) { return to_int_type(lower_s1) - to_int_type(*aStr2);
}
}