/*
 * Historical note: a Group used to be declared as the struct shown below.
 * It was replaced by explicitly accessing the equivalent fields from
 * triples of uint16_t.
 *
 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
 * would advance by 6 bytes (3 uint16_t).
 *
 * The data structure itself cannot simply be changed because it is loaded
 * from a data file, and it should not become less compact, so the access
 * code was changed instead.
 *
 * For details see ICU tickets 6331 and 6008.
 *
 * typedef struct {
 *     uint16_t groupMSB, offsetHigh, offsetLow; / * avoid padding * /
 * } Group;
 */
enum {
    GROUP_MSB = 0,          /* index of the former groupMSB field */
    GROUP_OFFSET_HIGH = 1,  /* index of the former offsetHigh field */
    GROUP_OFFSET_LOW = 2,   /* index of the former offsetLow field */
    GROUP_LENGTH = 3        /* number of uint16_t units per group triple */
};
/*
 * Get the 32-bit group offset.
 *
 * The two 16-bit halves are combined in unsigned arithmetic and only then
 * converted to int32_t, so that a high half with bit 15 set does not
 * left-shift a signed value into the sign bit.
 *
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16)|(group)[GROUP_OFFSET_LOW]))
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument is evaluated twice; do not pass an expression with side effects.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
/* * Important: expandName() and compareName() are almost the same - * apply fixes to both. * * UnicodeData.txt uses ';' as a field separator, so no * field can contain ';' as part of its contents. * In unames.dat, it is marked as token[';']==-1 only if the * semicolon is used in the data file - which is iff we * have Unicode 1.0 names or ISO comments or aliases. * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases * although we know that it will never be part of a name.
*/ static uint16_t
expandName(UCharNames *names, const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) {
uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
uint16_t token, tokenCount=*tokens++, bufferPos=0;
uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
uint8_t c;
if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { /* * skip the modern name if it is not requested _and_ * if the semicolon byte value is a character, not a token number
*/ if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) { int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; do { while(nameLength>0) {
--nameLength; if(*name++==';') { break;
}
}
} while(--fieldIndex>0);
} else { /* * the semicolon byte value is a token number, therefore * only modern names are stored in unames.dat and there is no * such requested alternate name here
*/
nameLength=0;
}
}
/* write each letter directly, and write a token word per token */ while(nameLength>0) {
--nameLength;
c=*name++;
if(c>=tokenCount) { if(c!=';') { /* implicit letter */
WRITE_CHAR(buffer, bufferLength, bufferPos, c);
} else { /* finished */ break;
}
} else {
token=tokens[c]; if (token == static_cast<uint16_t>(-2)) { /* this is a lead byte for a double-byte token */
token=tokens[c<<8|*name++];
--nameLength;
} if (token == static_cast<uint16_t>(-1)) { if(c!=';') { /* explicit letter */
WRITE_CHAR(buffer, bufferLength, bufferPos, c);
} else { /* stop, but skip the semicolon if we are seeking extended names and there was no 2.0 name but there
is a 1.0 name. */ if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) { continue;
}
} /* finished */ break;
}
} else { /* write token word */
uint8_t *tokenString=tokenStrings+token; while((c=*tokenString++)!=0) {
WRITE_CHAR(buffer, bufferLength, bufferPos, c);
}
}
}
}
/* * compareName() is almost the same as expandName() except that it compares * the currently expanded name to an input name. * It returns the match/no match result as soon as possible.
*/ static UBool
compareName(UCharNames *names, const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, constchar *otherName) {
uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
uint16_t token, tokenCount=*tokens++;
uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
uint8_t c; constchar *origOtherName = otherName;
if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { /* * skip the modern name if it is not requested _and_ * if the semicolon byte value is a character, not a token number
*/ if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) { int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; do { while(nameLength>0) {
--nameLength; if(*name++==';') { break;
}
}
} while(--fieldIndex>0);
} else { /* * the semicolon byte value is a token number, therefore * only modern names are stored in unames.dat and there is no * such requested alternate name here
*/
nameLength=0;
}
}
/* compare each letter directly, and compare a token word per token */ while(nameLength>0) {
--nameLength;
c=*name++;
if(c>=tokenCount) { if(c!=';') { /* implicit letter */ if (static_cast<char>(c) != *otherName++) { returnfalse;
}
} else { /* finished */ break;
}
} else {
token=tokens[c]; if (token == static_cast<uint16_t>(-2)) { /* this is a lead byte for a double-byte token */
token=tokens[c<<8|*name++];
--nameLength;
} if (token == static_cast<uint16_t>(-1)) { if(c!=';') { /* explicit letter */ if (static_cast<char>(c) != *otherName++) { returnfalse;
}
} else { /* stop, but skip the semicolon if we are seeking extended names and there was no 2.0 name but there
is a 1.0 name. */ if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) { continue;
}
} /* finished */ break;
}
} else { /* write token word */
uint8_t *tokenString=tokenStrings+token; while((c=*tokenString++)!=0) { if (static_cast<char>(c) != *otherName++) { returnfalse;
}
}
}
}
}
WRITE_CHAR(buffer, bufferLength, length, '<'); while (catname[length - 1]) {
WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
}
WRITE_CHAR(buffer, bufferLength, length, '-'); for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
; if (ndigits < 4)
ndigits = 4; for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
uint8_t v = static_cast<uint8_t>(cp & 0xf);
buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
}
buffer += ndigits;
length += static_cast<uint16_t>(ndigits);
WRITE_CHAR(buffer, bufferLength, length, '>');
return length;
}
/* * getGroup() does a binary search for the group that contains the * Unicode code point "code". * The return value is always a valid Group* that may contain "code" * or else is the highest group before "code". * If the lowest group is after "code", then that one is returned.
*/ staticconst uint16_t *
getGroup(UCharNames *names, uint32_t code) { const uint16_t *groups=GET_GROUPS(names);
uint16_t groupMSB = static_cast<uint16_t>(code >> GROUP_SHIFT),
start=0,
limit=*groups++,
number;
/* binary search for the group of names that contains the one for code */ while(start<limit-1) {
number = static_cast<uint16_t>((start + limit) / 2); if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
limit=number;
} else {
start=number;
}
}
/* return this regardless of whether it is an exact match */ return groups+start*GROUP_LENGTH;
}
/* * expandGroupLengths() reads a block of compressed lengths of 32 strings and * expands them into offsets and lengths for each string. * Lengths are stored with a variable-width encoding in consecutive nibbles: * If a nibble<0xc, then it is the length itself (0=empty string). * If a nibble>=0xc, then it forms a length value with the following nibble. * Calculation see below. * The offsets and lengths arrays must be at least 33 (one more) long because * there is no check here at the end if the last nibble is still used.
*/ staticconst uint8_t *
expandGroupLengths(const uint8_t *s,
uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { /* read the lengths of the 32 strings in this group and get each string's offset */
uint16_t i=0, offset=0, length=0;
uint8_t lengthByte;
/* all 32 lengths must be read to get the offset of the first group string */ while(i<LINES_PER_GROUP) {
lengthByte=*s++;
/* read even nibble - MSBs of lengthByte */ if(length>=12) { /* double-nibble length spread across two bytes */
length = static_cast<uint16_t>(((length & 0x3) << 4 | lengthByte >> 4) + 12);
lengthByte&=0xf;
} elseif((lengthByte /* &0xf0 */)>=0xc0) { /* double-nibble length spread across this one byte */
length = static_cast<uint16_t>((lengthByte & 0x3f) + 12);
} else { /* single-nibble length in MSBs */
length = static_cast<uint16_t>(lengthByte >> 4);
lengthByte&=0xf;
}
*offsets++=offset;
*lengths++=length;
offset+=length;
++i;
/* read odd nibble - LSBs of lengthByte */ if((lengthByte&0xf0)==0) { /* this nibble was not consumed for a double-nibble length above */
length=lengthByte; if(length<12) { /* single-nibble length in LSBs */
*offsets++=offset;
*lengths++=length;
offset+=length;
++i;
}
} else {
length=0; /* prevent double-nibble detection in the next iteration */
}
}
/* now, s is at the first group string */ return s;
}
/* * enumGroupNames() enumerates all the names in a 32-group * and either calls the enumerator function or finds a given input name.
*/ static UBool
enumGroupNames(UCharNames *names, const uint16_t *group,
UChar32 start, UChar32 end,
UEnumCharNamesFn *fn, void *context,
UCharNameChoice nameChoice) {
uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
/* * enumExtNames enumerate extended names. * It only needs to do it if it is called with a real function and not * with the dummy DO_FIND_NAME, because u_charFromName() does a check * for extended names by itself.
*/ static UBool
enumExtNames(UChar32 start, UChar32 end,
UEnumCharNamesFn *fn, void *context)
{ if(fn!=DO_FIND_NAME) { char buffer[200];
uint16_t length;
while(start<=end) {
buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; /* here, we assume that the buffer is large enough */ if(length>0) { if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { returnfalse;
}
}
++start;
}
}
/* find the group that contains start, or the highest before it */
group=getGroup(names, start);
if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { /* enumerate synthetic names between start and the group start */
UChar32 extLimit = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT; if(extLimit>limit) {
extLimit=limit;
} if(!enumExtNames(start, extLimit-1, fn, context)) { returnfalse;
}
start=extLimit;
}
if(startGroupMSB==endGroupMSB) { if(startGroupMSB==group[GROUP_MSB]) { /* if start and limit-1 are in the same group, then enumerate only in that one */ return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
}
} else { const uint16_t *groups=GET_GROUPS(names);
groupCount=*groups++;
groupLimit=groups+groupCount*GROUP_LENGTH;
if(startGroupMSB==group[GROUP_MSB]) { /* enumerate characters in the partial start group */ if((start&GROUP_MASK)!=0) { if(!enumGroupNames(names, group,
start, (static_cast<UChar32>(startGroupMSB) << GROUP_SHIFT) + LINES_PER_GROUP - 1,
fn, context, nameChoice)) { returnfalse;
}
group=NEXT_GROUP(group); /* continue with the next group */
}
} elseif(startGroupMSB>group[GROUP_MSB]) { /* make sure that we start enumerating with the first group after start */ const uint16_t *nextGroup=NEXT_GROUP(group); if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; if (end > limit) {
end = limit;
} if (!enumExtNames(start, end - 1, fn, context)) { returnfalse;
}
}
group=nextGroup;
}
/* enumerate entire groups between the start- and end-groups */ while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { const uint16_t *nextGroup;
start = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT; if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { returnfalse;
}
nextGroup=NEXT_GROUP(group); if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; if (end > limit) {
end = limit;
} if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { returnfalse;
}
}
group=nextGroup;
}
/* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
} elseif (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; if (next > start) {
start = next;
}
} else { returntrue;
}
}
/* we have not found a group, which means everything is made of
extended names. */ if (nameChoice == U_EXTENDED_CHAR_NAME) { if (limit > UCHAR_MAX_VALUE + 1) {
limit = UCHAR_MAX_VALUE + 1;
} return enumExtNames(start, limit - 1, fn, context);
}
returntrue;
}
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count, constchar *s, /* suffix elements */
uint32_t code,
uint16_t indexes[8], /* output fields from here */ constchar *elementBases[8], constchar *elements[8], char *buffer, uint16_t bufferLength) {
uint16_t i, factor, bufferPos=0; char c;
/* write elements according to the factors */
/* * the factorized elements are determined by modulo arithmetic * with the factors of this algorithm * * note that for fewer operations, count is decremented here
*/
--count; for(i=count; i>0; --i) {
factor=factors[i];
indexes[i] = static_cast<uint16_t>(code % factor);
code/=factor;
} /* * we don't need to calculate the last modulus because start<=code<=end * guarantees here that code<=factors[0]
*/
indexes[0] = static_cast<uint16_t>(code);
/* write each element */ for(;;) { if(elementBases!=nullptr) {
*elementBases++=s;
}
/* we do not need to perform the rest of this loop for i==count - break here */ if(i>=count) { break;
}
/* skip the rest of the strings for this factors[i] */
factor = static_cast<uint16_t>(factors[i] - indexes[i] - 1); while(factor>0) { while(*s++!=0) {}
--factor;
}
/* * Important: * Parts of findAlgName() are almost the same as some of getAlgName(). * Fixes must be applied to both.
*/ static uint16_t
getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) {
uint16_t bufferPos=0;
/* Only the normative character name can be algorithmic. */ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { /* zero-terminate */ if(bufferLength>0) {
*buffer=0;
} return 0;
}
switch(range->type) { case 0: { /* name = prefix hex-digits */ constchar* s = reinterpret_cast<constchar*>(range + 1); char c;
/* * Important: enumAlgNames() and findAlgName() are almost the same. * Any fix must be applied to both.
*/ static UBool
enumAlgNames(AlgorithmicRange *range,
UChar32 start, UChar32 limit,
UEnumCharNamesFn *fn, void *context,
UCharNameChoice nameChoice) { char buffer[200];
uint16_t length;
switch(range->type) { case 0: { char *s, *end; char c;
/* get the full name of the start character */
length = getAlgName(range, static_cast<uint32_t>(start), nameChoice, buffer, sizeof(buffer)); if(length<=0) { returntrue;
}
/* call the enumerator function with this first character */ if(!fn(context, start, nameChoice, buffer, length)) { returnfalse;
}
/* go to the end of the name; all these names have the same length */
end=buffer; while(*end!=0) {
++end;
}
/* enumerate the rest of the names */ while(++start<limit) { /* increment the hexadecimal number on a character-basis */
s=end; for (;;) {
c=*--s; if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
*s = static_cast<char>(c + 1); break;
} elseif(c=='9') {
*s='A'; break;
} elseif(c=='F') {
*s='0';
}
}
/* append the suffix of the start character */
length = static_cast<uint16_t>(prefixLength + writeFactorSuffix(factors, count,
s, static_cast<uint32_t>(start) - range->start,
indexes, elementBases, elements,
suffix, static_cast<uint16_t>(sizeof(buffer) - prefixLength)));
/* call the enumerator function with this first character */ if(!fn(context, start, nameChoice, buffer, length)) { returnfalse;
}
/* enumerate the rest of the names */ while(++start<limit) { /* increment the indexes in lexical order bound by the factors */
i=count; for (;;) {
idx = static_cast<uint16_t>(indexes[--i] + 1); if(idx<factors[i]) { /* skip one index and its element string */
indexes[i]=idx;
s=elements[i]; while(*s++!=0) {
}
elements[i]=s; break;
} else { /* reset this index to 0 and its element string to the first one */
indexes[i]=0;
elements[i]=elementBases[i];
}
}
/* to make matters a little easier, just append all elements to the suffix */
t=suffix;
length=prefixLength; for(i=0; i<count; ++i) {
s=elements[i]; while((c=*s++)!=0) {
*t++=c;
++length;
}
} /* zero-terminate */
*t=0;
/* * findAlgName() is almost the same as enumAlgNames() except that it * returns the code point for a name if it fits into the range. * It returns 0xffff otherwise.
*/ static UChar32
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, constchar *otherName) {
UChar32 code;
/* initialize the suffix elements for enumeration; indexes should all be set to 0 */
writeFactorSuffix(factors, count, s, 0,
indexes, elementBases, elements, buffer, sizeof(buffer));
/* compare the first suffix */ if(0==uprv_strcmp(otherName, buffer)) { return start;
}
/* enumerate and compare the rest of the suffixes */ while(++start<limit) { /* increment the indexes in lexical order bound by the factors */
i=count; for (;;) {
idx = static_cast<uint16_t>(indexes[--i] + 1); if(idx<factors[i]) { /* skip one index and its element string */
indexes[i]=idx;
s=elements[i]; while(*s++!=0) {}
elements[i]=s; break;
} else { /* reset this index to 0 and its element string to the first one */
indexes[i]=0;
elements[i]=elementBases[i];
}
}
/* to make matters a little easier, just compare all elements of the suffix */
t=otherName; for(i=0; i<count; ++i) {
s=elements[i]; while((c=*s++)!=0) { if(c!=*t++) {
s=""; /* does not match */
i=99;
}
}
} if(i<99 && *t==0) { return start;
}
} break;
} default: /* undefined type */ break;
}
return 0xffff;
}
/* sets of name characters, maximum name lengths ---------------------------- */
/*
 * A set of byte values is stored as 256 bit flags in eight uint32_t words:
 * the top 3 bits of the byte select the word, the low 5 bits select the bit.
 * The argument c is converted to uint8_t and is evaluated twice.
 */

/* Add the byte value c to the set. */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))

/* Test whether the byte value c is in the set; yields 1 (true) or 0 (false). */
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
/* prefix length */
s = reinterpret_cast<constchar*>(factors + count);
length=calcStringSetLength(gNameSet, s);
s+=length+1; /* start of factor suffixes */
/* get the set and maximum factor suffix length for each factor */ for(i=0; i<count; ++i) {
maxFactorLength=0; for(factor=factors[i]; factor>0; --factor) {
factorLength=calcStringSetLength(gNameSet, s);
s+=factorLength+1; if(factorLength>maxFactorLength) {
maxFactorLength=factorLength;
}
}
length+=maxFactorLength;
}
range = reinterpret_cast<AlgorithmicRange*>(reinterpret_cast<uint8_t*>(range) + range->size);
--rangeCount;
} return maxNameLength;
}
/*
 * Fold the lengths of the extended ("<category-HHHHHH>") names into the
 * running maximum name length.
 *
 * Each entry of charCatNames is passed to calcStringSetLength() together with
 * gNameSet; the value it returns is used as the category name's length.
 *
 * @param maxNameLength the maximum name length found so far
 * @return the larger of maxNameLength and the longest extended-name length
 */
static int32_t
calcExtNameSetsLengths(int32_t maxNameLength) {
    int32_t longest = maxNameLength;

    for (int32_t cat = 0; cat < UPRV_LENGTHOF(charCatNames); ++cat) {
        /*
         * length of the category name plus 9 extra characters:
         *   2 for <>
         *   1 for -
         *   6 for most hex digits per code point
         */
        const int32_t candidate = 9 + calcStringSetLength(gNameSet, charCatNames[cat]);
        if (longest < candidate) {
            longest = candidate;
        }
    }

    return longest;
}
/* enumerate all groups */ while(groupCount>0) {
s = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->groupStringOffset + GET_GROUP_OFFSET(group);
s=expandGroupLengths(s, offsets, lengths);
/* enumerate all lines in each group */ for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
line=s+offsets[lineNumber];
length=lengths[lineNumber]; if(length==0) { continue;
}
/* set gMax... - name length last for threading */
gMaxNameLength=maxNameLength;
}
/*
 * Compute the set of characters used in names and the maximum name length,
 * unless that was already done.
 *
 * A non-zero gMaxNameLength is taken to mean the values are already available.
 * Otherwise the data is loaded, the hex digits and "<>-" are added to
 * gNameSet, and the algorithmic, extended and group names are processed in
 * that order.
 *
 * @param pErrorCode in/out ICU error code, passed on to isDataLoaded()
 * @return true if the values are available, false if the data could not be loaded
 */
static UBool
calcNameSetsLengths(UErrorCode *pErrorCode) {
    static const char extChars[]="0123456789ABCDEF<>-";
    int32_t i, maxNameLength;

    if(gMaxNameLength!=0) {
        return true;
    }

    if(!isDataLoaded(pErrorCode)) {
        return false;
    }

    /* set hex digits, used in various names, and <>-, used in extended names */
    for (i = 0; i < static_cast<int32_t>(sizeof(extChars)) - 1; ++i) {
        SET_ADD(gNameSet, extChars[i]);
    }

    /* set sets and lengths from algorithmic names */
    maxNameLength=calcAlgNameSetsLengths(0);

    /* set sets and lengths from extended names */
    maxNameLength=calcExtNameSetsLengths(maxNameLength);

    /* set sets and lengths from group names, set global maximum values */
    calcGroupNameSetsLengths(maxNameLength);

    return true;
}
U_NAMESPACE_END
/* public API --------------------------------------------------------------- */
/* construct the uppercase and lowercase of the name first */ for(i=0; i<sizeof(upper); ++i) { if((c0=*name++)!=0) {
upper[i]=uprv_toupper(c0);
lower[i]=uprv_tolower(c0);
} else {
upper[i]=lower[i]=0; break;
}
} if(i==sizeof(upper)) { /* name too long, there is no such character */
*pErrorCode = U_ILLEGAL_CHAR_FOUND; return error;
} // i==strlen(name)==strlen(lower)==strlen(upper)
/* try extended names first */ if (lower[0] == '<') { if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') { // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
uint32_t limit = i; while (i >= 3 && lower[--i] != '-') {}
// There should be 1 to 8 hex digits.
int32_t hexLength = limit - (i + 1); if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
uint32_t cIdx;
/* Now validate the category name. We could use a binary search, or a trie, if
we really wanted to. */
uint8_t cat = getCharCat(cp); for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { if (cat == cIdx) { return cp;
} break;
}
}
}
}
/* interleave the data-driven ones with the algorithmic ones */ /* iterate over all algorithmic ranges; assume that they are in ascending order */
p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
i=*p;
algRange=(AlgorithmicRange *)(p+1); while(i>0) { /* enumerate the character names before the current algorithmic range */ /* here: start<limit */ if((uint32_t)start<algRange->start) { if((uint32_t)limit<=algRange->start) {
enumNames(uCharNames, start, limit, fn, context, nameChoice); return;
} if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { return;
}
start=(UChar32)algRange->start;
} /* enumerate the character names in the current algorithmic range */ /* here: algRange->start<=start<limit */ if((uint32_t)start<=algRange->end) { if((uint32_t)limit<=(algRange->end+1)) {
enumAlgNames(algRange, start, limit, fn, context, nameChoice); return;
} if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { return;
}
start=(UChar32)algRange->end+1;
} /* continue to the next algorithmic range (here: start<limit) */
algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
--i;
} /* enumerate the character names after the last algorithmic range */
enumNames(uCharNames, start, limit, fn, context, nameChoice);
}
/** * Converts the char set cset into a Unicode set uset. * @param cset Set of 256 bit flags corresponding to a set of chars. * @param uset USet to receive characters. Existing contents are deleted.
*/ staticvoid
charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
char16_t us[256]; char cs[256];
int32_t i, length;
UErrorCode errorCode;
errorCode=U_ZERO_ERROR;
if(!calcNameSetsLengths(&errorCode)) { return;
}
/* build a char string with all chars that are used in character names */
length=0; for(i=0; i<256; ++i) { if(SET_CONTAINS(cset, i)) {
cs[length++] = static_cast<char>(i);
}
}
/* convert the char string to a char16_t string */
u_charsToUChars(cs, us, length);
/* add each char16_t to the USet */ for(i=0; i<length; ++i) { if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (char16_t)0 */
sa->add(sa->set, us[i]);
}
}
}
/**
 * Adds the characters that are used in Unicode character names.
 * Forwards the file-level name character set gNameSet to charSetToUSet().
 *
 * @param sa USetAdder that receives the characters.
 */
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(const USetAdder *sa)
{
    charSetToUSet(gNameSet, sa);
}
/* data swapping ------------------------------------------------------------ */
/* * The token table contains non-negative entries for token bytes, * and -1 for bytes that represent themselves in the data file's charset. * -2 entries are used for lead bytes. * * Direct bytes (-1 entries) must be translated from the input charset family * to the output charset family. * makeTokenMap() writes a permutation mapping for this. * Use it once for single-/lead-byte tokens and once more for all trail byte * tokens. (';' is an unused trail byte marked with -1.)
*/ staticvoid
makeTokenMap(const UDataSwapper *ds,
int16_t tokens[], uint16_t tokenCount,
uint8_t map[256],
UErrorCode *pErrorCode) {
UBool usedOutChar[256];
uint16_t i, j;
uint8_t c1, c2;
/* set the direct bytes (byte 0 always maps to itself) */ for(i=1; i<tokenCount; ++i) { if(tokens[i]==-1) { /* convert the direct byte character */
c1 = static_cast<uint8_t>(i);
ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); if(U_FAILURE(*pErrorCode)) {
udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
i, ds->inCharset); return;
}
/* enter the converted character into the map and mark it used */
map[c1]=c2;
usedOutChar[c2]=true;
}
}
/* set the mappings for the rest of the permutation */ for(i=j=1; i<tokenCount; ++i) { /* set mappings that were not set for direct bytes */ if(map[i]==0) { /* set an output byte value that was not used as an output byte above */ while(usedOutChar[j]) {
++j;
}
map[i] = static_cast<uint8_t>(j++);
}
}
/* * leave mappings at tokenCount and above unset if tokenCount<256 * because they won't be used
*/
}
}
/* check data format and format version */
pInfo=(const UDataInfo *)((constchar *)inData+4); if(!(
pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
pInfo->dataFormat[1]==0x6e &&
pInfo->dataFormat[2]==0x61 &&
pInfo->dataFormat[3]==0x6d &&
pInfo->formatVersion[0]==1
)) {
udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
pInfo->dataFormat[2], pInfo->dataFormat[3],
pInfo->formatVersion[0]);
*pErrorCode=U_UNSUPPORTED_ERROR; return 0;
}
/* * now the tokens table * it needs to be permutated along with the compressed name strings
*/
p=(const uint16_t *)(inBytes+16);
q=(uint16_t *)(outBytes+16);
/* read and swap the tokenCount */
tokenCount=ds->readUInt16(*p);
ds->swapArray16(ds, p, 2, q, pErrorCode);
++p;
++q;
/* read the first 512 tokens and make the token maps */ if(tokenCount<=512) {
count=tokenCount;
} else {
count=512;
} for(i=0; i<count; ++i) {
tokens[i]=udata_readInt16(ds, p[i]);
} for(; i<512; ++i) {
tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
}
makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); if(U_FAILURE(*pErrorCode)) { return 0;
}
/* * swap and permutate the tokens * go through a temporary array to support in-place swapping
*/
temp=(uint16_t *)uprv_malloc(tokenCount*2); if(temp==nullptr) {
udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
tokenCount);
*pErrorCode=U_MEMORY_ALLOCATION_ERROR; return 0;
}
/* copy the result into the output and free the temporary array */
uprv_memcpy(q, temp, tokenCount*2);
uprv_free(temp);
/* * swap the token strings but not a possible padding byte after * the terminating NUL of the last string
*/
udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
outBytes+tokenStringOffset, pErrorCode); if(U_FAILURE(*pErrorCode)) {
udata_printError(ds, "uchar_swapNames(token strings) failed\n"); return 0;
}
/* swap the group table */
count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
outBytes+groupsOffset, pErrorCode);
/* * swap the group strings * swap the string bytes but not the nibble-encoded string lengths
*/ if(ds->inCharset!=ds->outCharset) {
uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
/* iterate through string groups until only a few padding bytes are left */ while(stringsCount>32) {
nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
/* move past the length bytes */
stringsCount-=(uint32_t)(nextInStrings-inStrings);
outStrings+=nextInStrings-inStrings;
inStrings=nextInStrings;
count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
stringsCount-=count;
/* swap the string bytes using map[] and trailMap[] */ while(count>0) {
c=*inStrings++;
*outStrings++=map[c]; if(tokens[c]!=-2) {
--count;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.34 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.