/* * A lexical comparison is used for sorting in the builder, to allow * an efficient search for a byte sequence that could be a prefix * of a previously entered byte sequence. * * Comparing by lengths first is for compatibility with old .ucm tools * like canonucm and rptp2ucm.
*/ if(lexical) { /* get the minimum length and continue */ if(l->bLen<=r->bLen) {
length=l->bLen;
} else {
length=r->bLen;
}
} else { /* compare lengths first */
result=l->bLen-r->bLen; if(result!=0) { return result;
} else {
length=l->bLen;
}
}
/* get pointers to the byte sequences */
lb=UCM_GET_BYTES(lTable, l);
rb=UCM_GET_BYTES(rTable, r);
/* 1. sort by Unicode first */
uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
compareMappingsUnicodeFirst, t, false, &errorCode);
/* build the reverseMap */ if(t->reverseMap==nullptr) { /* * allocate mappingsCapacity instead of mappingsLength so that * if mappings are added, the reverseMap need not be * reallocated each time * (see ucm_moveMappings() and ucm_addMapping())
*/
t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); if(t->reverseMap==nullptr) {
fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); exit(U_MEMORY_ALLOCATION_ERROR);
}
} for(i=0; i<t->mappingsLength; ++i) {
t->reverseMap[i]=i;
}
/* 2. sort reverseMap by mappings bytes first */
uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
compareMappingsBytesFirst, t, false, &errorCode);
/*
 * Remove mappings with their move flag set from the base table
 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
 *
 * base: table whose mappings are scanned; every mapping with a non-zero
 *       moveFlag is removed from it and its moveFlag is reset to 0.
 * ext:  table that receives the removed mappings that had UCM_MOVE_TO_EXT
 *       set; may be nullptr, in which case flagged mappings are only removed.
 *
 * Removal overwrites the current entry with the last entry of the base
 * table, so the relative order of the remaining mappings is not preserved;
 * base->isSorted is therefore cleared whenever a mapping is removed.
 */
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    UCMapping *mb, *mbLimit;

    /* iterate over all mappings currently in the base table */
    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    while(mb<mbLimit) {
        int8_t flag=mb->moveFlag;

        if(flag==0) {
            /* not marked: keep this mapping and look at the next one */
            ++mb;
            continue;
        }

        /* reset the move flag */
        mb->moveFlag=0;

        if(ext!=nullptr && (flag&UCM_MOVE_TO_EXT)) {
            /* add the mapping to the extension table */
            ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
        }

        /* remove this mapping: move the last base mapping down and overwrite the current one */
        if(mb<(mbLimit-1)) {
            uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
        }
        --mbLimit;
        --base->mappingsLength;
        base->isSorted=false;

        /*
         * mb is deliberately not advanced here: the entry that was just
         * moved down into this slot still has to be examined.
         */
    }
}
for(;;) { /* skip irrelevant mappings on both sides */ for(;;) { if(mb==mbLimit) { return result;
}
if((0<=mb->f && mb->f<=2) || mb->f==4) { break;
}
++mb;
}
for(;;) { if(me==meLimit) { return result;
}
if((0<=me->f && me->f<=2) || me->f==4) { break;
}
++me;
}
/* compare the base and extension mappings */
cmp=compareUnicode(base, mb, ext, me); if(cmp<0) { if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { /* * mapping in base but not in ext, move it * * if ext is DBCS, move DBCS mappings here * and check SBCS ones for Unicode prefix below
*/
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
/* does mb map from an input sequence that is a prefix of me's? */
} elseif( mb->uLen<me->uLen &&
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
) { if(moveToExt) { /* mark this mapping to be moved to the extension table */
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr, "ucm error: the base table contains a mapping whose input sequence\n" " is a prefix of the input sequence of an extension mapping\n");
ucm_printMapping(base, mb, stderr);
ucm_printMapping(ext, me, stderr);
result|=HAS_ERRORS;
}
}
++mb;
} elseif(cmp==0) { /* * same output: remove the extension mapping, * otherwise treat as an error
*/ if( mb->f==me->f && mb->bLen==me->bLen &&
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
) {
me->moveFlag|=UCM_REMOVE_MAPPING;
result|=NEEDS_MOVE;
} elseif(intersectBase) { /* mapping in base but not in ext, move it */
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr, "ucm error: the base table contains a mapping whose input sequence\n" " is the same as the input sequence of an extension mapping\n" " but it maps differently\n");
ucm_printMapping(base, mb, stderr);
ucm_printMapping(ext, me, stderr);
result|=HAS_ERRORS;
}
/* compare the base and extension mappings */
cmp=compareBytes(base, mb, ext, me, true); if(cmp<0) { if(intersectBase) { /* mapping in base but not in ext, move it */
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
/* * does mb map from an input sequence that is a prefix of me's? * for SI/SO tables, a single byte is never a prefix because it * occurs in a separate single-byte state
*/
} elseif( mb->bLen<me->bLen &&
(!isSISO || mb->bLen>1) &&
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
) { if(moveToExt) { /* mark this mapping to be moved to the extension table */
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr, "ucm error: the base table contains a mapping whose input sequence\n" " is a prefix of the input sequence of an extension mapping\n");
ucm_printMapping(base, mb, stderr);
ucm_printMapping(ext, me, stderr);
result|=HAS_ERRORS;
}
}
++b;
} elseif(cmp==0) { /* * same output: remove the extension mapping, * otherwise treat as an error
*/ if( mb->f==me->f && mb->uLen==me->uLen &&
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
) {
me->moveFlag|=UCM_REMOVE_MAPPING;
result|=NEEDS_MOVE;
} elseif(intersectBase) { /* mapping in base but not in ext, move it */
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr, "ucm error: the base table contains a mapping whose input sequence\n" " is the same as the input sequence of an extension mapping\n" " but it maps differently\n");
ucm_printMapping(base, mb, stderr);
ucm_printMapping(ext, me, stderr);
result|=HAS_ERRORS;
}
/* if we have an extension table, we must always use precision flags */ if(base->flagsType&UCM_FLAGS_IMPLICIT) {
fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); returnfalse;
} if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); returnfalse;
}
/* checking requires both tables to be sorted */
ucm_sortTable(base);
ucm_sortTable(ext);
while(fromUIndex<fromUTop && toUIndex<toUTop) {
cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true); if(cmp==0) { /* equal: roundtrip, nothing to do (flags are initially 0) */
++fromUMapping;
++toUMapping;
++fromUIndex;
++toUIndex;
} elseif(cmp<0) { /* * the fromU mapping does not have a toU counterpart: * fallback Unicode->codepage
*/ if( (fromUMapping->bLen==subcharLength &&
0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
(subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
) {
fromUMapping->f=2; /* SUB mapping */
} else {
fromUMapping->f=1; /* normal fallback */
}
++fromUMapping;
++fromUIndex;
} else { /* * the toU mapping does not have a fromU counterpart: * (reverse) fallback codepage->Unicode, copy it to the fromU table
*/
if(m->uLen>1) {
idx=table->codePointsLength;
table->codePointsLength+=m->uLen; if(table->codePointsLength>table->codePointsCapacity) {
fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); exit(U_MEMORY_ALLOCATION_ERROR);
}
if(m->bLen>4) {
idx=table->bytesLength;
table->bytesLength+=m->bLen; if(table->bytesLength>table->bytesCapacity) {
fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); exit(U_MEMORY_ALLOCATION_ERROR);
}
/*
 * Classify a mapping as suitable for a base table or not.
 *
 * baseStates: state table used to validate the byte sequence
 * m:          the mapping to classify (uLen, bLen and f are read)
 * codePoints: unused
 * bytes:      the mapping's byte sequence (m->bLen bytes)
 *
 * Returns
 *   -1  if ucm_countChars() reports fewer than 1 character
 *       (illegal byte sequence),
 *    0  if the mapping is suitable for a base table,
 *    1  if the mapping needs to go into an extension table.
 *
 * Suitable for an ICU conversion base table means:
 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
 * - precision flag 0..3
 * - SBCS: any 1:1 mapping
 *   (the table stores additional bits to distinguish mapping types)
 * - MBCS: not a |2 SUB mapping for <subchar1>
 * - MBCS: not a |1 fallback to 0x00
 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
 *
 * Further restrictions for fromUnicode tables
 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
 *
 * All of the MBCS fromUnicode specific tests could be removed from here,
 * but the ones above are for unusual mappings, and removing the tests
 * from here would change canonucm output which seems gratuitous.
 * (Markus Scherer 2006-nov-28)
 *
 * Exception: All implicit mappings (f<0) that need to be moved
 * because of fromUnicode restrictions _must_ be moved here because
 * makeconv uses a hack for moving mappings only for the fromUnicode table
 * that only works with non-negative values of f.
 */
U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates *baseStates,
                UCMapping *m,
                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    (void)codePoints;

    /* check validity of the bytes and count the characters in them */
    int32_t charCount=ucm_countChars(baseStates, bytes, m->bLen);
    if(charCount<1) {
        /* illegal byte sequence */
        return -1;
    }

    /* anything other than a 1:1 mapping with precision flag <=3 goes to the extension table */
    if(m->uLen!=1 || charCount!=1 || m->f>3) {
        return 1;
    }

    /* SBCS: any 1:1 mapping is suitable for a base table */
    if(baseStates->maxCharLength==1) {
        return 0;
    }

    /* MBCS: a |2 SUB mapping for <subchar1> */
    if(m->f==2 && m->bLen==1) {
        return 1;
    }

    /* MBCS: a |1 fallback to 0x00 */
    if(m->f==1 && bytes[0]==0) {
        return 1;
    }

    /* MBCS: a multi-byte mapping with leading 0x00 bytes */
    if(m->f<=1 && m->bLen>1 && bytes[0]==0) {
        return 1;
    }

    /* suitable for a base table */
    return 0;
}
if(baseStates!=nullptr) { /* check validity of the bytes and count the characters in them */
type=ucm_mappingType(baseStates, m, codePoints, bytes); if(type<0) { /* illegal byte sequence */
printMapping(m, codePoints, bytes, stderr); returnfalse;
}
} else { /* not used - adding a mapping for an extension-only table before its base table is read */
type=1;
}
/* * Add the mapping to the base table if this is requested and suitable. * Otherwise, add it to the extension table.
*/ if(forBase && type==0) {
ucm_addMapping(ucm->base, m, codePoints, bytes);
} else {
ucm_addMapping(ucm->ext, m, codePoints, bytes);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.