/**
 * One entry of a locale-ID canonicalization table: pairs an input ID with
 * the ID it is rewritten to.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
/**
 * A map to canonicalize locale IDs. This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each entry is { id, canonicalID }: the first string is the input ID to
 * match and the second is the ID it is rewritten to.
 */
constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
{ "art__LOJBAN", "jbo" }, /* registered name */
{ "hy__AREVELA", "hy" }, /* Registered IANA variant */
{ "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
{ "zh__GUOYU", "zh" }, /* registered name */
{ "zh__HAKKA", "hak" }, /* registered name */
{ "zh__XIANG", "hsn" }, /* registered name */
// subtags with 3 chars won't be treated as variants.
{ "zh_GAN", "gan" }, /* registered name */
{ "zh_MIN_NAN", "nan" }, /* registered name */
{ "zh_WUU", "wuu" }, /* registered name */
{ "zh_YUE", "yue" }, /* registered name */
};
/* ### BCP47 Conversion *******************************************/ /* Gets the size of the shortest subtag in the given localeID. */
int32_t getShortestSubtagLength(constchar *localeID) {
int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
int32_t length = localeIDLength;
int32_t tmpLength = 0;
int32_t i; bool reset = true;
for (i = 0; i < localeIDLength; i++) { if (localeID[i] != '_' && localeID[i] != '-') { if (reset) {
tmpLength = 0;
reset = false;
}
tmpLength++;
} else { if (tmpLength != 0 && tmpLength < length) {
length = tmpLength;
}
reset = true;
}
}
return length;
} /* Test if the locale id has BCP47 u extension and does not have '@' */ inlinebool _hasBCP47Extension(constchar *id) { return id != nullptr && uprv_strstr(id, "@") == nullptr && getShortestSubtagLength(id) == 1;
}
/* ### Keywords **************************************************/ inlinebool UPRV_ISDIGIT(char c) { return c >= '0' && c <= '9'; } inlinebool UPRV_ISALPHANUM(char c) { return uprv_isASCIILetter(c) || UPRV_ISDIGIT(c); } /* Punctuation/symbols allowed in legacy key values */ inlinebool UPRV_OK_VALUE_PUNCTUATION(char c) { return c == '_' || c == '-' || c == '+' || c == '/'; }
/**
 * Finds where the keyword section of a locale ID begins.
 *
 * @param localeID NUL-terminated locale ID; must not be nullptr.
 * @return pointer to the first '@' in localeID, or nullptr if there is none.
 *         On EBCDIC builds, the first occurrence of any of the alternate
 *         '@' byte values listed below is returned when the compiled '@'
 *         is not found.
 */
U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char *localeID) {
    const char *result = nullptr;
    if ((result = uprv_strchr(localeID, '@')) != nullptr) {
        return result;
    }
#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    else {
        /* We do this because the @ sign is variant, and the @ sign used on one
        EBCDIC machine won't be compiled the same way on other EBCDIC based
        machines. */
        static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
        const uint8_t *charToFind = ebcdicSigns;
        while (*charToFind) {
            if ((result = uprv_strchr(localeID, *charToFind)) != nullptr) {
                return result;
            }
            charToFind++;
        }
    }
#endif
    return nullptr;
}
namespace {
/** * @param keywordName incoming name to be canonicalized * @param status return status (keyword too long) * @return the keyword name
*/
CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status)
{ if (U_FAILURE(status)) { return {}; }
CharString result;
for (char c : keywordName) { if (!UPRV_ISALPHANUM(c)) {
status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */ return {};
}
result.append(uprv_tolower(c), status);
} if (result.isEmpty()) {
status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */ return {};
}
if(prev == '@') { /* start of keyword definition */ /* we will grab pairs, trim spaces, lowercase keywords, sort and return */ do { bool duplicate = false; /* skip leading spaces */ while(*pos == ' ') {
pos++;
} if (!*pos) { /* handle trailing "; " */ break;
} if(numKeywords == maxKeywords) {
status = U_INTERNAL_PROGRAM_ERROR; return;
}
equalSign = uprv_strchr(pos, '=');
semicolon = uprv_strchr(pos, ';'); /* lack of '=' [foo@currency] is illegal */ /* ';' before '=' [foo@currency;collation=pinyin] is illegal */ if(!equalSign || (semicolon && semicolon<equalSign)) {
status = U_INVALID_FORMAT_ERROR; return;
} /* need to normalize both keyword and keyword name */ if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) { /* keyword name too long for internal buffer */
status = U_INTERNAL_PROGRAM_ERROR; return;
} for(i = 0, n = 0; i < equalSign - pos; ++i) { if (pos[i] != ' ') {
keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
}
}
/* zero-length keyword is an error. */ if (n == 0) {
status = U_INVALID_FORMAT_ERROR; return;
}
keywordList[numKeywords].keyword[n] = 0;
keywordList[numKeywords].keywordLen = n; /* now grab the value part. First we skip the '=' */
equalSign++; /* then we leading spaces */ while(*equalSign == ' ') {
equalSign++;
}
/* Premature end or zero-length value */ if (!*equalSign || equalSign == semicolon) {
status = U_INVALID_FORMAT_ERROR; return;
}
keywordList[numKeywords].valueStart = equalSign;
pos = semicolon;
i = 0; if(pos) { while(*(pos - i - 1) == ' ') {
i++;
}
keywordList[numKeywords].valueLen = static_cast<int32_t>(pos - equalSign - i);
pos++;
} else {
i = static_cast<int32_t>(uprv_strlen(equalSign)); while(i && equalSign[i-1] == ' ') {
i--;
}
keywordList[numKeywords].valueLen = i;
} /* If this is a duplicate keyword, then ignore it */ for (j=0; j<numKeywords; ++j) { if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
duplicate = true; break;
}
} if (!duplicate) {
++numKeywords;
}
} while(pos);
/* now we have a list of keywords */ /* we need to sort it */
uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);
int32_t bufLen = (int32_t)uprv_strlen(buffer); if(bufferCapacity<bufLen) { /* The capacity is less than the length?! Is this NUL terminated? */
*status = U_ILLEGAL_ARGUMENT_ERROR; return 0;
}
char* keywords = const_cast<char*>(locale_getKeywordsStart(buffer));
int32_t baseLen = keywords == nullptr ? bufLen : keywords - buffer; // Remove -1 from the capacity so that this function can guarantee NUL termination.
CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
bufferCapacity - baseLen - 1);
int32_t reslen = ulocimp_setKeywordValue(
keywords == nullptr ? std::string_view() : keywords,
keywordName,
keywordValue == nullptr ? std::string_view() : keywordValue,
sink,
*status);
// See the documentation for this function, it's guaranteed to never // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR. // In this case, nothing has been written to the sink, so it cannot have Overflowed().
U_ASSERT(!sink.Overflowed());
U_ASSERT(reslen >= 0); return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
}
/**
 * Sets, replaces or removes a keyword in a locale ID held in a CharString.
 *
 * Splits localeID at the start of its keyword section (as found by
 * locale_getKeywordsStart()), truncates localeID to the part before it, and
 * delegates to the ByteSink overload of ulocimp_setKeywordValue() with a sink
 * that appends to localeID.
 *
 * @param keywordName  name of the keyword to set.
 * @param keywordValue value for the keyword.
 * @param localeID     locale ID to modify in place.
 * @param status       error code; nothing is done if it already indicates failure.
 */
U_EXPORT void
ulocimp_setKeywordValue(std::string_view keywordName,
                        std::string_view keywordValue,
                        CharString& localeID,
                        UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }

    std::string_view keywords;
    if (const char* start = locale_getKeywordsStart(localeID.data()); start != nullptr) {
        // This is safe because CharString::truncate() doesn't actually erase any
        // data, but simply sets the position for where new data will be written.
        int32_t size = static_cast<int32_t>(start - localeID.data());
        keywords = localeID.toStringPiece();
        keywords.remove_prefix(size);
        localeID.truncate(size);
    }

    CharStringByteSink sink(&localeID);
    ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
}
if (status == U_STRING_NOT_TERMINATED_WARNING) {
status = U_ZERO_ERROR;
} if (keywordName.empty()) {
status = U_ILLEGAL_ARGUMENT_ERROR; return 0;
}
CharString canonKeywordName = locale_canonKeywordName(keywordName, status); if (U_FAILURE(status)) { return 0;
}
CharString canonKeywordValue; for (char c : keywordValue) { if (!UPRV_ISALPHANUM(c) && !UPRV_OK_VALUE_PUNCTUATION(c)) {
status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */ return 0;
} /* Should we force lowercase in value to set? */
canonKeywordValue.append(c, status);
} if (U_FAILURE(status)) { return 0;
}
if (keywords.size() <= 1) { if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */
U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); return 0;
}
/* start processing the value part */
nextEqualsign++; /* skip '=' */ /* First strip leading & trailing spaces (TC decided to tolerate these) */ while (nextEqualsign < keywords.size() && keywords[nextEqualsign] == ' ') {
nextEqualsign++;
}
keyValueTail = nextSeparator == std::string_view::npos ? keywords.size() : nextSeparator; while (keyValueTail > nextEqualsign && keywords[keyValueTail - 1] == ' ') {
keyValueTail--;
} if (nextEqualsign == keyValueTail) {
status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */ return 0;
}
rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data()); if(rc == 0) { /* Current entry matches the input keyword. Update the entry */ if (!canonKeywordValue.isEmpty()) { /* updating a value */
updatedKeysAndValues.append(keyValuePrefix, status);
keyValuePrefix = ';'; /* for any subsequent key-value pair */
updatedKeysAndValues.append(canonKeywordName, status);
updatedKeysAndValues.append('=', status);
updatedKeysAndValues.append(canonKeywordValue, status);
} /* else removing this entry, don't emit anything */
handledInputKeyAndValue = true;
} else { /* input keyword sorts earlier than current entry, add before current entry */ if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) { /* insert new entry at this location */
updatedKeysAndValues.append(keyValuePrefix, status);
keyValuePrefix = ';'; /* for any subsequent key-value pair */
updatedKeysAndValues.append(canonKeywordName, status);
updatedKeysAndValues.append('=', status);
updatedKeysAndValues.append(canonKeywordValue, status);
handledInputKeyAndValue = true;
} /* copy the current entry */
updatedKeysAndValues.append(keyValuePrefix, status);
keyValuePrefix = ';'; /* for any subsequent key-value pair */
updatedKeysAndValues.append(localeKeywordName, status);
updatedKeysAndValues.append('=', status);
updatedKeysAndValues.append(keywords.data() + nextEqualsign, static_cast<int32_t>(keyValueTail - nextEqualsign), status);
} if (nextSeparator == std::string_view::npos && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) { /* append new entry at the end, it sorts later than existing entries */
updatedKeysAndValues.append(keyValuePrefix, status); /* skip keyValuePrefix update, no subsequent key-value pair */
updatedKeysAndValues.append(canonKeywordName, status);
updatedKeysAndValues.append('=', status);
updatedKeysAndValues.append(canonKeywordValue, status);
handledInputKeyAndValue = true;
}
keywordStart = nextSeparator;
} /* end loop searching */
/* Any error from updatedKeysAndValues.append above would be internal and not due to * problems with the passed-in locale. So if we did encounter problems with the * passed-in locale above, those errors took precedence and overrode any error * status from updatedKeysAndValues.append, and also caused a return of 0. If there * are errors here they are from updatedKeysAndValues.append; they do cause an * error return but the passed-in locale is unmodified and the original bufLen is * returned.
*/ if (!handledInputKeyAndValue || U_FAILURE(status)) { /* if input key/value specified removal of a keyword not present in locale, or
* there was an error in CharString.append, leave original locale alone. */
U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); returnstatic_cast<int32_t>(keywords.size());
}
needLen = updatedKeysAndValues.length(); // Check to see can we fit the updatedKeysAndValues, if not, return // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it. // We do this because this API function does not behave like most others: // It promises never to set a U_STRING_NOT_TERMINATED_WARNING. // When the contents fits but without the terminating NUL, in this case we need to not change // the buffer contents and return with a buffer overflow error. if (needLen > 0) {
int32_t capacity = 0; char* buffer = sink.GetAppendBuffer(
needLen, needLen, nullptr, needLen, &capacity); if (capacity < needLen || buffer == nullptr) {
status = U_BUFFER_OVERFLOW_ERROR; return needLen;
}
uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen);
sink.Append(buffer, needLen);
}
U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); return needLen;
}
/* ### ID parsing implementation **************************************************/
namespace {
/** Returns true if a is one of the special prefix letters 'x', 'X', 'i' or 'I'. */
inline bool _isPrefixLetter(char a) { return a == 'x' || a == 'X' || a == 'i' || a == 'I'; }
/*returns true if one of the special prefixes is here (s=string)
'x-' or 'i-' */ inlinebool _isIDPrefix(constchar *s) { return _isPrefixLetter(s[0]) && _isIDSeparator(s[1]); }
/* Returns true if a ends an ID piece: NUL, '.' or '@'.
 * Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant
 */
inline bool _isTerminator(char a) { return a == 0 || a == '.' || a == '@'; }
/** * Lookup 'key' in the array 'list'. The array 'list' should contain * a nullptr entry, followed by more entries, and a second nullptr entry. * * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or * COUNTRIES_3.
*/
std::optional<int16_t> _findIndex(constchar* const* list, constchar* key)
{ constchar* const* anchor = list;
int32_t pass = 0;
/* Make two passes through two nullptr-terminated arrays at 'list' */ while (pass++ < 2) { while (*list) { if (uprv_strcmp(key, *list) == 0) { returnstatic_cast<int16_t>(list - anchor);
}
list++;
}
++list; /* skip final nullptr *CWB*/
} return std::nullopt;
}
/* * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant() * avoid duplicating code to handle the earlier locale ID pieces * in the functions for the later ones by * setting the *pEnd pointer to where they stopped parsing * * TODO try to use this in Locale
*/
constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1; // Minus NUL.
/* if it starts with i- or x- then copy that prefix */
int32_t len = _isIDPrefix(localeID) ? 2 : 0; while (!_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) { if (len == MAXLEN) {
status = U_ILLEGAL_ARGUMENT_ERROR; return;
}
len++;
}
*pEnd = localeID + len; if (sink == nullptr || len == 0) { return; }
for (int32_t i = 0; i < len; ++i) {
buffer[i] = uprv_toupper(localeID[i]);
}
if (len == 3) { /* convert 3 character code to 2 character code if possible *CWB*/
U_ASSERT(capacity >= 4);
buffer[3] = '\0';
std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer); if (offset.has_value()) { constchar* const alias = COUNTRIES[*offset];
sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias))); return;
}
}
sink->Append(buffer, len);
}
/** * @param needSeparator if true, then add leading '_' if any variants * are added to 'variant'
*/ void
_getVariant(constchar* localeID, char prev,
ByteSink* sink, constchar** pEnd, bool needSeparator,
UErrorCode& status) { if (U_FAILURE(status)) return; if (pEnd != nullptr) { *pEnd = localeID; }
// Reasonable upper limit for variants // There are no strict limitation of the syntax of variant in the legacy // locale format. If the locale is constructed from unicode_locale_id // as defined in UTS35, then we know each unicode_variant_subtag // could have max length of 8 ((alphanum{5,8} | digit alphanum{3}) // 179 would allow 20 unicode_variant_subtag with sep in the // unicode_locale_id // 8*20 + 1*(20-1) = 179
constexpr int32_t MAX_VARIANTS_LENGTH = 179;
/* get one or more variant tags and separate them with '_' */
int32_t index = 0; if (_isIDSeparator(prev)) { /* get a variant string after a '-' or '_' */ for (index=0; !_isTerminator(localeID[index]); index++) { if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
status = U_ILLEGAL_ARGUMENT_ERROR; return;
} if (needSeparator) { if (sink != nullptr) {
sink->Append("_", 1);
}
needSeparator = false;
} if (sink != nullptr) { char c = uprv_toupper(localeID[index]); if (c == '-') c = '_';
sink->Append(&c, 1);
}
} if (pEnd != nullptr) { *pEnd = localeID+index; }
}
/* if there is no variant tag after a '-' or '_' then look for '@' */ if (index == 0) { if (prev=='@') { /* keep localeID */
} elseif((localeID=locale_getKeywordsStart(localeID))!=nullptr) {
++localeID; /* point after the '@' */
} else { return;
} for(; !_isTerminator(localeID[index]); index++) { if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
status = U_ILLEGAL_ARGUMENT_ERROR; return;
} if (needSeparator) { if (sink != nullptr) {
sink->Append("_", 1);
}
needSeparator = false;
} if (sink != nullptr) { char c = uprv_toupper(localeID[index]); if (c == '-' || c == ',') c = '_';
sink->Append(&c, 1);
}
} if (pEnd != nullptr) { *pEnd = localeID + index; }
}
}
if (_isIDSeparator(*localeID) && !_isBCP47Extension(localeID)) { /* If there was no country ID, skip a possible extra IDSeparator */ if (!hasRegion && _isIDSeparator(localeID[1])) {
localeID++;
} constchar* begin = localeID + 1; constchar* end = nullptr;
_getVariant(begin, *localeID, variant, &end, false, status); if (U_FAILURE(status)) { return; }
U_ASSERT(end != nullptr); if (end != begin && pEnd != nullptr) { *pEnd = end; }
}
}
/* keywords are located after '@' */ if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
CharString keywords = ulocimp_getKeywords(tmpLocaleID + 1, '@', false, *status); if (U_FAILURE(*status)) { return nullptr;
} return uloc_openKeywordList(keywords.data(), keywords.length(), status);
} return nullptr;
}
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1
namespace {
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert. 0.76 Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.