using icu::CharString; using icu::LocalMemory; using icu::LocalPointer; using icu::LocalUCHARBUFPointer; using icu::StringPiece; using icu::UnicodeString;
/* keep in sync with token defines in read.h */ constchar *tokenNames[TOK_TOKEN_COUNT] =
{ "string", /* A string token, such as "MonthNames" */ "'{'", /* An opening brace character */ "'}'", /* A closing brace character */ "','", /* A comma */ "':'", /* A colon */
"", /* End of the file has been reached successfully */ ""
};
/* Just to store "TRUE" */ //static const char16_t trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000};
/* The nature of the lookahead buffer: There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value. When getToken is called, the current pointer is moved to the next slot and the old slot is filled with the next token from the reader by calling getNextToken. The token values are stored in the slot, which means that token values don't survive a call to getToken, i.e.:
UString *value;
getToken(&value, nullptr, status); getToken(nullptr, nullptr, status); bad - value is now a different string
*/ staticvoid
initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status)
{ static uint32_t initTypeStrings = 0;
uint32_t i;
if(isVerbose()){
printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
}
if (U_FAILURE(*status))
{ return nullptr;
} /* make the filename including the directory */ if (state->inputdir != nullptr)
{
uprv_strcat(filename, state->inputdir);
if (U_FAILURE(*status)) {
error(line, "An error occurred while opening the input file %s\n", filename); return nullptr;
}
/* We allocate more space than actually required * since the actual size needed for storing UChars * is not known in UTF-8 byte stream
*/
size = ucbuf_size(ucbuf) + 1;
pTarget = static_cast<char16_t*>(uprv_malloc(U_SIZEOF_UCHAR * size));
uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
target = pTarget;
targetLimit = pTarget+size;
/* read the rules into the buffer */ while (target < targetLimit)
{
c = ucbuf_getc(ucbuf, status); if(c == QUOTE) {
quoted = static_cast<UBool>(!quoted);
} /* weiv (06/26/2002): adding the following: * - preserving spaces in commands [...] * - # comments until the end of line
*/ if (c == STARTCOMMAND && !quoted)
{ /* preserve commands * closing bracket will be handled by the * append at the end of the loop
*/ while(c != ENDCOMMAND) {
U_APPEND_CHAR32_ONLY(c, target);
c = ucbuf_getc(ucbuf, status);
}
} elseif (c == HASH && !quoted) { /* skip comments */ while(c != CR && c != LF) {
c = ucbuf_getc(ucbuf, status);
} continue;
} elseif (c == ESCAPE)
{
c = unescape(ucbuf, status);
if (c == static_cast<UChar32>(U_ERR))
{
uprv_free(pTarget);
T_FileStream_close(file); return nullptr;
}
} elseif (!quoted && (c == SPACE || c == TAB || c == CR || c == LF))
{ /* ignore spaces carriage returns * and line feed unless in the form \uXXXX
*/ continue;
}
/* Append char16_t * after dissembling if c > 0xffff*/ if (c != static_cast<UChar32>(U_EOF))
{
U_APPEND_CHAR32_ONLY(c, target);
} else
{ break;
}
}
if(isVerbose()){
printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
}
if (U_FAILURE(*status))
{ return nullptr;
} /* make the filename including the directory */ if (state->inputdir != nullptr)
{
uprv_strcat(filename, state->inputdir);
if (U_FAILURE(*status)) {
error(line, "An error occurred while opening the input file %s\n", filename); return nullptr;
}
/* We allocate more space than actually required * since the actual size needed for storing UChars * is not known in UTF-8 byte stream
*/
pSource = ucbuf_getBuffer(ucbuf, &size, status);
pTarget = static_cast<char16_t*>(uprv_malloc(U_SIZEOF_UCHAR * (size + 1)));
uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
if(isVerbose()){
printf(" %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
}
if (U_FAILURE(*status))
{ return nullptr;
} /* make the filename including the directory */ if (state->outputdir != nullptr)
{
uprv_strcat(filename, state->outputdir);
void
GenrbImporter::getRules( constchar *localeID, constchar *collationType,
UnicodeString &rules, constchar *& /*errorReason*/, UErrorCode &errorCode) {
CharString filename(localeID, errorCode); for(int32_t i = 0; i < filename.length(); i++){ if(filename[i] == '-'){
filename.data()[i] = '_';
}
}
filename.append(".txt", errorCode); if (U_FAILURE(errorCode)) { return;
}
CharString inputDirBuf;
CharString openFileName; if(inputDir == nullptr) { constchar *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR); if (filenameBegin != nullptr) { /* * When a filename ../../../data/root.txt is specified, * we presume that the input directory is ../../../data * This is very important when the resource file includes * another file, like UCARules.txt or thaidict.brk.
*/
StringPiece dir = filename.toStringPiece(); constchar *filenameLimit = filename.data() + filename.length();
dir.remove_suffix(static_cast<int32_t>(filenameLimit - filenameBegin));
inputDirBuf.append(dir, errorCode);
inputDir = inputDirBuf.data();
}
}else{
int32_t dirlen = static_cast<int32_t>(uprv_strlen(inputDir));
if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) { /* * append the input dir to openFileName if the first char in * filename is not file separator char and the last char input directory is not '.'. * This is to support : * genrb -s. /home/icu/data * genrb -s. icu/data * The user cannot mix notations like * genrb -s. /icu/data --- the absolute path specified. -s redundant * user should use * genrb -s. icu/data --- start from CWD and look in icu/data dir
*/
openFileName.append(inputDir, dirlen, errorCode); if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
openFileName.append(U_FILE_SEP_CHAR, errorCode);
}
}
}
openFileName.append(filename, errorCode); if(U_FAILURE(errorCode)) { return;
} // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data()); constchar* cp = "";
LocalUCHARBUFPointer ucbuf(
ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode)); if(errorCode == U_FILE_ACCESS_ERROR) {
fprintf(stderr, "couldn't open file %s\n", openFileName.data()); return;
} if (ucbuf.isNull() || U_FAILURE(errorCode)) {
fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode)); return;
}
/* Parse the data into an SRBRoot */
LocalPointer<SRBRoot> data(
parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode)); if (U_FAILURE(errorCode)) { return;
}
struct SResource *root = data->fRoot; struct SResource *collations = resLookup(root, "collations"); if (collations != nullptr) { struct SResource *collation = resLookup(collations, collationType); if (collation != nullptr) { struct SResource *sequence = resLookup(collation, "Sequence"); if (sequence != nullptr && sequence->isString()) { // No string pointer aliasing so that we need not hold onto the resource bundle.
StringResource *sr = static_cast<StringResource *>(sequence);
rules = sr->fString;
}
}
}
}
// Quick-and-dirty escaping function.
// Assumes that we are on an ASCII-based platform.
//
// Writes the NUL-terminated UTF-16 string s into buffer as NUL-terminated text:
// printable ASCII (0x20..0x7e) is copied as is, every other code point is
// written as \u followed by at least four uppercase hex digits.
//
// At most n bytes of buffer are written, including the terminating NUL. If the
// escaped text does not fit, it is cut off after the last character or escape
// sequence that fits completely. Nothing is written if buffer is null or n is 0.
void
escape(const char16_t *s, char *buffer, size_t n) {
    if (buffer == nullptr || n == 0) {
        return;
    }
    // Bytes still available at buffer, including the one reserved for the NUL.
    // Invariant: remaining >= 1, so the final NUL store is always in bounds.
    size_t remaining = n;
    int32_t length = u_strlen(s);
    int32_t i = 0;
    for (;;) {
        UChar32 c;
        U16_NEXT(s, i, length, c);
        if (c == 0) {
            break;
        }
        if (0x20 <= c && c <= 0x7e) {
            // printable ASCII
            if (remaining < 2) {
                break;  // no room for this character plus the NUL
            }
            *buffer++ = static_cast<char>(c);  // assumes ASCII-based platform
            --remaining;
        } else {
            int written = snprintf(buffer, remaining, "\\u%04X", static_cast<int>(c));
            // snprintf returns the untruncated length; >= remaining means the
            // escape did not fit. The NUL store below discards the partial output.
            if (written < 0 || static_cast<size_t>(written) >= remaining) {
                break;
            }
            buffer += written;
            remaining -= static_cast<size_t>(written);
        }
    }
    *buffer = 0;
}
staticvoid
writeCollationJamoTOML(constchar* outputdir, constchar* name, constchar* collationType, const icu::CollationData* data, UErrorCode *status) {
FILE* f = openTOML(outputdir, name, collationType, "jamo", status); if (!f) {
printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType); return;
}
uint32_t jamo[0x1200-0x1100]; for (UChar32 c = 0x1100; c < 0x1200; ++c) {
uint32_t ce32 = data->getCE32(c); if (ce32 == icu::Collation::FALLBACK_CE32) {
ce32 = data->base->getCE32(c);
} // Can't reject complex CE32s, because search collations have expansions. // These expansions refer to the tailoring, which foils the reuse of the // these jamo tables. // XXX Figure out what to do. Perhaps instead of having Latin mini expansions, // there should be Hangul mini expansions. // XXX in any case, validate that modern jamo are self-contained.
jamo[c - 0x1100] = ce32;
if (data->base) {
tailoringSet.addAll(*(data->unsafeBackwardSet));
tailoringSet.removeAll(*(data->base->unsafeBackwardSet));
} else {
tailoringSet.addAll(*(data->unsafeBackwardSet));
}
// Use the same value for out-of-range and default in the hope of not having to allocate // different blocks, since ICU4X never does out-of-range queries.
uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
// If the diacritic table was cut short, copy CE32s between the lowered // limit and the max limit from the root to the tailoring. As of June 2022, // no collation in CLDR needs this. for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) { if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { // These never occur in NFD data. continue;
}
uint32_t ce32 = data->getCE32(c); if (ce32 == icu::Collation::FALLBACK_CE32) {
ce32 = data->base->getCE32(c);
umutablecptrie_set(builder.getAlias(), c, ce32, status);
}
}
// Ensure that the range covered by the diacritic table isn't duplicated // in the trie. for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) { if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
}
}
uint16_t lastPrimaries[4]; for (int32_t i = 0; i < 4; ++i) { // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one // back to get a value that fits in 16 bits.
lastPrimaries[i] = static_cast<uint16_t>((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
}
uint32_t numericPrimary = data->numericPrimary; if (numericPrimary & 0xFFFFFF) {
printf("Lower 24 bits set in numeric primary");
*status = U_INTERNAL_PROGRAM_ERROR; return;
}
staticvoid
writeCollationTOML(constchar* outputdir, constchar* name, constchar* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
UBool tailored = false;
UBool tailoredDiacritics = false;
UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
UBool reordering = false;
UBool isRoot = uprv_strcmp(name, "root") == 0;
UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT; if (!data->base && isRoot) {
diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); if (U_FAILURE(*status)) { return;
}
writeCollationJamoTOML(outputdir, name, collationType, data, status); if (U_FAILURE(*status)) { return;
}
writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status); if (U_FAILURE(*status)) { return;
}
} elseif (data->base && !lithuanianDotAbove) { for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { // These never occur in NFD data. continue;
}
uint32_t ce32 = data->getCE32(c); if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
tailoredDiacritics = true;
diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); if (U_FAILURE(*status)) { return;
} break;
}
}
}
if (settings->hasReordering()) {
reordering = true; // Note: There are duplicate reorderings. Expecting the ICU4X provider // to take care of deduplication.
writeCollationReorderingTOML(outputdir, name, collationType, settings, status); if (U_FAILURE(*status)) { return;
}
}
// Write collation data if either base is non-null or the name is root. // Languages that only reorder scripts are otherwise root-like and have // null base. if (data->base || isRoot) {
tailored = !isRoot;
writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status); if (U_FAILURE(*status)) { return;
}
}
uint32_t maxVariable = static_cast<uint32_t>(settings->getMaxVariable()); if (maxVariable >= 4) {
printf("Max variable out of range");
*status = U_INTERNAL_PROGRAM_ERROR; return;
}
result->add(member, line, *status);
member = nullptr;
} elseif(uprv_strcmp(subtag, "%%CollationBin")==0)
{ /* discard duplicate %%CollationBin if any*/
} elseif (uprv_strcmp(subtag, "Sequence") == 0 && member->isString())
{
StringResource *sr = static_cast<StringResource *>(member);
rules = sr->fString;
haveRules = true; // Defer building the collator until we have seen // all sub-elements of the collation table, including the Version. /* in order to achieve smaller data files, we can direct genrb */ /* to omit collation rules */ if(!state->omitCollationRules) {
result->add(member, line, *status);
member = nullptr;
}
} else// Just copy non-special items.
{
result->add(member, line, *status);
member = nullptr;
}
res_close(member); // TODO: use LocalPointer if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
}
if (!haveRules) { return result; }
#if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO
warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h");
(void)collationType; #else // CLDR ticket #3949, ICU ticket #8082: // Do not build collation binary data for for-import-only "private" collation rule strings. if (uprv_strncmp(collationType, "private-", 8) == 0) { if(isVerbose()) {
printf("Not building %s~%s collation binary\n", state->filename, collationType);
} return result;
}
if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
if (uprv_strcmp(subtag, "default") == 0)
{
member = parseResource(state, subtag, nullptr, status);
if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
result->add(member, line, *status);
} else
{
token = peekToken(state, 0, &tokenValue, &line, &comment, status); /* this probably needs to be refactored or recursively use the parser */ /* first we assume that our collation table won't have the explicit type */ /* then, we cannot handle aliases */ if(token == TOK_OPEN_BRACE) {
token = getToken(state, &tokenValue, &comment, &line, status);
TableResource *collationRes; if (keepCollationType(subtag)) {
collationRes = table_open(state->bundle, subtag, nullptr, status);
} else {
collationRes = nullptr;
} // need to parse the collation data regardless
collationRes = addCollation(state, collationRes, subtag, startline, status); if (collationRes != nullptr) {
result->add(collationRes, startline, *status);
}
} elseif(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */ /* we could have a table too */
token = peekToken(state, 1, &tokenValue, &line, &comment, status);
u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1); if(uprv_strcmp(typeKeyword, "alias") == 0) {
member = parseResource(state, subtag, nullptr, status); if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
}
}
}
/* Necessary, because CollationElements requires the bundle->fRoot member to be present which,
if this weren't special-cased, wouldn't be set until the entire file had been processed. */ staticstruct SResource *
realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status)
{ struct SResource *member = nullptr; struct UString *tokenValue=nullptr; struct UString comment; enum ETokenType token; char subtag[1024];
uint32_t line;
UBool readToken = false;
/* '{' . (name resource)* '}' */
if(isVerbose()){
printf(" parsing table %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
} for (;;)
{
ustr_init(&comment);
token = getToken(state, &tokenValue, &comment, &line, status);
if (token == TOK_CLOSE_BRACE)
{ if (!readToken && isVerbose()) {
warning(startline, "Encountered empty table");
} return table;
}
if (token != TOK_STRING)
{
*status = U_INVALID_FORMAT_ERROR;
if (U_FAILURE(*status))
{
error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status)); return nullptr;
}
member = parseResource(state, subtag, &comment, status);
if (member == nullptr || U_FAILURE(*status))
{
error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status)); return nullptr;
}
table->add(member, line, *status);
if (U_FAILURE(*status))
{
error(line, "parse error. Stopped parsing table with %s", u_errorName(*status)); return nullptr;
}
readToken = true;
ustr_deinit(&comment);
}
/* not reached */ /* A compiler warning will appear if all paths don't contain a return statement. */ /* *status = U_INTERNAL_PROGRAM_ERROR;
return nullptr;*/
}
/* check for end of array, but don't consume next token unless it really is the end */
token = peekToken(state, 0, &tokenValue, nullptr, &memberComments, status);
if (token == TOK_CLOSE_BRACE)
{
getToken(state, nullptr, nullptr, nullptr, status); if (!readToken) {
warning(startline, "Encountered empty array");
} break;
}
/* string arrays are a special case */ if (token == TOK_STRING)
{
getToken(state, &tokenValue, &memberComments, nullptr, status);
member = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, &memberComments, status);
} else
{
member = parseResource(state, nullptr, &memberComments, status);
}
if (result == nullptr || U_FAILURE(*status))
{ return nullptr;
}
if(isVerbose()){
printf(" vector %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
}
ustr_init(&memberComments); /* '{' . string [','] '}' */ for (;;)
{
ustr_setlen(&memberComments, 0, status);
/* check for end of array, but don't consume next token unless it really is the end */
token = peekToken(state, 0, nullptr, nullptr,&memberComments, status);
if (token == TOK_CLOSE_BRACE)
{ /* it's the end, consume the close brace */
getToken(state, nullptr, nullptr, nullptr, status); if (!readToken) {
warning(startline, "Encountered empty int vector");
}
ustr_deinit(&memberComments); return result;
}
if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
/* For handling illegal char in the Intvector */
value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/
int32_t len = static_cast<int32_t>(stopstring - string);
if (U_FAILURE(*status))
{
res_close(result); return nullptr;
}
/* the comma is optional (even though it is required to prevent the reader from concatenating
consecutive entries) so that a missing comma on the last entry isn't an error */ if (token == TOK_COMMA)
{
getToken(state, nullptr, nullptr, nullptr, status);
}
readToken = true;
}
/* not reached */ /* A compiler warning will appear if all paths don't contain a return statement. */ /* intvector_close(result, status); *status = U_INTERNAL_PROGRAM_ERROR;
return nullptr;*/
}
char toConv[3] = {'\0', '\0', '\0'}; for (int32_t i = 0; i < stringLength;)
{ // Skip spaces (which may have been line endings). char c0 = string[i++]; if (c0 == ' ') { continue; } if (i == stringLength) {
*status=U_INVALID_CHAR_FOUND;
error(line, "Encountered invalid binary value (odd number of hex digits)"); return nullptr;
}
toConv[0] = c0;
toConv[1] = string[i++];
if (U_FAILURE(*status))
{
uprv_free(string); return nullptr;
}
if(isVerbose()){
printf(" integer %s at line %i \n", tag == nullptr ? "(null)" : tag, static_cast<int>(startline));
}
if (stringLength == 0)
{
warning(startline, "Encountered empty integer. Default value is 0.");
}
/* Allow integer support for hexdecimal, octal digit and decimal*/ /* and handle illegal char in the integer*/
value = uprv_strtoul(string, &stopstring, 0);
int32_t len = static_cast<int32_t>(stopstring - string); if(len==stringLength)
{
result = int_open(state->bundle, tag, value, comment, status);
} else
{
*status=U_INVALID_CHAR_FOUND;
}
uprv_free(string);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.