/* BOCU-1 constants and macros ---------------------------------------------- */
/* * BOCU-1 encodes the code points of a Unicode string as * a sequence of byte-encoded differences (slope detection), * preserving lexical order. * * Optimize the difference-taking for runs of Unicode text within * small scripts: * * Most small scripts are allocated within aligned 128-blocks of Unicode * code points. Lexical order is preserved if the "previous code point" state * is always moved into the middle of such a block. * * Additionally, "prev" is moved from anywhere in the Unihan and Hangul * areas into the middle of those areas. * * C0 control codes and space are encoded with their US-ASCII bytes. * "prev" is reset for C0 controls but not for space.
*/
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The lead byte ranges nest symmetrically, so each test widens the
 * accepted interval by one sequence length.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry the length (1..3) in the top byte;
 * anything else is a 4-byte sequence whose top byte is the lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
/* * Byte value map for control codes, * from external byte values 0x00..0x20 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. * External byte values that are illegal as trail bytes are mapped to -1.
*/ staticconst int8_t
bocu1ByteToTrail[BOCU1_MIN]={ /* 0 1 2 3 4 5 6 7 */
-1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
/* 8 9 a b c d e f */
-1, -1, -1, -1, -1, -1, -1, -1,
/* * Byte value map for control codes, * from trail byte values 0..19 (0..0x13) as used in the difference calculation * to external byte values 0x00..0x20.
*/ staticconst int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ /* 0 1 2 3 4 5 6 7 */
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
/* 8 9 a b c d e f */
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
/* 10 11 12 13 */
0x1c, 0x1d, 0x1e, 0x1f
};
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0
 * (i.e. it produces a floored quotient for a positive divisor).
 *
 * For positive n, the if() condition is always false.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} UPRV_BLOCK_MACRO_END
/*
 * Shortcut macros for the common short encodings, so that packDiff()
 * is only needed for the longer ones.
 */

/** True if delta lies within the single-byte difference range. */
#define DIFF_IS_SINGLE(delta) \
    ((delta)>=BOCU1_REACH_NEG_1 && (delta)<=BOCU1_REACH_POS_1)

/** The single byte that encodes delta: an offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(delta) \
    ((delta)+BOCU1_MIDDLE)

/** True if delta lies within the two-byte difference range. */
#define DIFF_IS_DOUBLE(delta) \
    ((delta)>=BOCU1_REACH_NEG_2 && (delta)<=BOCU1_REACH_POS_2)
/** * Compute the next "previous" value for differencing * from the current code point. * * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) * @return "previous code point" state value
*/ staticinline int32_t
bocu1Prev(int32_t c) { /* compute new prev */ if(/* 0x3040<=c && */ c<=0x309f) { /* Hiragana is not 128-aligned */ return 0x3070;
} elseif(0x4e00<=c && c<=0x9fa5) { /* CJK Unihan */ return 0x4e00-BOCU1_REACH_NEG_2;
} elseif(0xac00<=c /* && c<=0xd7a3 */) { /* Korean Hangul */ return (0xd7a3+0xac00)/2;
} else { /* mostly small scripts */ return BOCU1_SIMPLE_PREV(c);
}
}
/**
 * Next "prev" state for code point cp.
 * Only the range 0x3040..0xd7a3 needs the bocu1Prev() function;
 * everything else takes the inline BOCU1_SIMPLE_PREV() path.
 */
#define BOCU1_PREV(cp) \
    ((cp)>=0x3040 && (cp)<=0xd7a3 ? bocu1Prev(cp) : BOCU1_SIMPLE_PREV(cp))
/* * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. * The UConverter fields are used as follows: * * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) * * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
*/
/** * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes * and return a packed integer with them. * * The encoding favors small absolute differences with short encodings * to compress runs of same-script characters. * * Optimized version with unrolled loops and fewer floating-point operations * than the standard packDiff(). * * @param diff difference value -0x10ffff..0x10ffff * @return * 0x010000zz for 1-byte sequence zz * 0x0200yyzz for 2-byte sequence yy zz * 0x03xxyyzz for 3-byte sequence xx yy zz * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
*/ static int32_t
packDiff(int32_t diff) {
int32_t result, m;
U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ if(diff>=BOCU1_REACH_NEG_1) { /* mostly positive differences, and single-byte negative ones */ #if 0 /* single-byte case handled in macros, see below */ if(diff<=BOCU1_REACH_POS_1) { /* single byte */ return 0x01000000|(BOCU1_MIDDLE+diff);
} else #endif if(diff<=BOCU1_REACH_POS_2) { /* two bytes */
diff-=BOCU1_REACH_POS_1+1;
result=0x02000000;
/* * We know that / and % would deliver quotient 0 and rest=diff. * Avoid division and modulo for performance.
*/
result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
result |= static_cast<uint32_t>(BOCU1_START_POS_4) << 24;
}
} else { /* two- to four-byte negative differences */ if(diff>=BOCU1_REACH_NEG_2) { /* two bytes */
diff-=BOCU1_REACH_NEG_1;
result=0x02000000;
/* * We know that NEGDIVMOD would deliver * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. * Avoid division and modulo for performance.
*/
m=diff+BOCU1_TRAIL_COUNT;
result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
/* set up the local pointers */
cnv=pArgs->converter;
source=pArgs->source;
sourceLimit=pArgs->sourceLimit;
target = reinterpret_cast<uint8_t*>(pArgs->target);
targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);
offsets=pArgs->offsets;
/* get the converter state from UConverter */
c=cnv->fromUChar32;
prev = static_cast<int32_t>(cnv->fromUnicodeStatus); if(prev==0) {
prev=BOCU1_ASCII_PREV;
}
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex= c==0 ? 0 : -1;
nextSourceIndex=0;
fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */
diff = static_cast<int32_t>(sourceLimit - source); if(targetCapacity>diff) {
targetCapacity=diff;
} while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<uint8_t>(c);
*offsets++=nextSourceIndex++;
++source;
--targetCapacity;
} else {
diff=c-prev; if(DIFF_IS_SINGLE(diff)) {
prev=BOCU1_SIMPLE_PREV(c);
*target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
*offsets++=nextSourceIndex++;
++source;
--targetCapacity;
} else { break;
}
}
} /* restore real values */
targetCapacity = static_cast<int32_t>(reinterpret_cast<const uint8_t*>(pArgs->targetLimit) - target);
sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
/* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) {
c=*source++;
++nextSourceIndex;
if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression.
*/ if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<uint8_t>(c);
*offsets++=sourceIndex;
--targetCapacity;
sourceIndex=nextSourceIndex; continue;
}
if(U16_IS_LEAD(c)) {
getTrail: if(source<sourceLimit) { /* test the following code unit */
char16_t trail=*source; if(U16_IS_TRAIL(trail)) {
++source;
++nextSourceIndex;
c=U16_GET_SUPPLEMENTARY(c, trail);
}
} else { /* no more input */
c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break;
}
}
/* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference
*/
diff=c-prev;
prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) {
*target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
*offsets++=sourceIndex;
--targetCapacity;
sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle;
}
} elseif(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */
int32_t m;
/* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4:
*target++ = static_cast<uint8_t>(diff >> 24);
*offsets++=sourceIndex;
U_FALLTHROUGH; case 3:
*target++ = static_cast<uint8_t>(diff >> 16);
*offsets++=sourceIndex;
U_FALLTHROUGH; case 2:
*target++ = static_cast<uint8_t>(diff >> 8);
*offsets++=sourceIndex; /* case 1: handled above */
*target++ = static_cast<uint8_t>(diff);
*offsets++=sourceIndex;
U_FALLTHROUGH; default: /* will never occur */ break;
}
targetCapacity-=length;
sourceIndex=nextSourceIndex;
} else {
uint8_t *charErrorBuffer;
/* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target.
*/ /* we know that 1<=targetCapacity<length<=4 */
length-=targetCapacity;
charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3:
*charErrorBuffer++ = static_cast<uint8_t>(diff >> 16);
U_FALLTHROUGH; case 2:
*charErrorBuffer++ = static_cast<uint8_t>(diff >> 8);
U_FALLTHROUGH; case 1:
*charErrorBuffer = static_cast<uint8_t>(diff);
U_FALLTHROUGH; default: /* will never occur */ break;
}
cnv->charErrorBufferLength = static_cast<int8_t>(length);
/* now output what fits into the regular target */
diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3:
*target++ = static_cast<uint8_t>(diff >> 16);
*offsets++=sourceIndex;
U_FALLTHROUGH; case 2:
*target++ = static_cast<uint8_t>(diff >> 8);
*offsets++=sourceIndex;
U_FALLTHROUGH; case 1:
*target++ = static_cast<uint8_t>(diff);
*offsets++=sourceIndex;
U_FALLTHROUGH; default: /* will never occur */ break;
}
/* set the converter state back into UConverter */
cnv->fromUChar32= c<0 ? -c : 0;
cnv->fromUnicodeStatus = static_cast<uint32_t>(prev);
/* write back the updated pointers */
pArgs->source=source;
pArgs->target = reinterpret_cast<char*>(target);
pArgs->offsets=offsets;
}
/* * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. * If a change is made in the original function, then either * change this function the same way or * re-copy the original function and remove the variables * offsets, sourceIndex, and nextSourceIndex.
*/ staticvoid U_CALLCONV
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv; const char16_t *source, *sourceLimit;
uint8_t *target;
int32_t targetCapacity;
int32_t prev, c, diff;
/* set up the local pointers */
cnv=pArgs->converter;
source=pArgs->source;
sourceLimit=pArgs->sourceLimit;
target = reinterpret_cast<uint8_t*>(pArgs->target);
targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);
/* get the converter state from UConverter */
c=cnv->fromUChar32;
prev = static_cast<int32_t>(cnv->fromUnicodeStatus); if(prev==0) {
prev=BOCU1_ASCII_PREV;
}
fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */
diff = static_cast<int32_t>(sourceLimit - source); if(targetCapacity>diff) {
targetCapacity=diff;
} while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<uint8_t>(c);
} else {
diff=c-prev; if(DIFF_IS_SINGLE(diff)) {
prev=BOCU1_SIMPLE_PREV(c);
*target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
} else { break;
}
}
++source;
--targetCapacity;
} /* restore real values */
targetCapacity = static_cast<int32_t>(reinterpret_cast<const uint8_t*>(pArgs->targetLimit) - target);
/* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) {
c=*source++;
if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression.
*/ if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<uint8_t>(c);
--targetCapacity; continue;
}
if(U16_IS_LEAD(c)) {
getTrail: if(source<sourceLimit) { /* test the following code unit */
char16_t trail=*source; if(U16_IS_TRAIL(trail)) {
++source;
c=U16_GET_SUPPLEMENTARY(c, trail);
}
} else { /* no more input */
c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break;
}
}
/* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference
*/
diff=c-prev;
prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) {
*target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
--targetCapacity; if(c<0x3000) { goto fastSingle;
}
} elseif(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */
int32_t m;
/* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4:
*target++ = static_cast<uint8_t>(diff >> 24);
U_FALLTHROUGH; case 3:
*target++ = static_cast<uint8_t>(diff >> 16); /* case 2: handled above */
*target++ = static_cast<uint8_t>(diff >> 8); /* case 1: handled above */
*target++ = static_cast<uint8_t>(diff);
U_FALLTHROUGH; default: /* will never occur */ break;
}
targetCapacity-=length;
} else {
uint8_t *charErrorBuffer;
/* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target.
*/ /* we know that 1<=targetCapacity<length<=4 */
length-=targetCapacity;
charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3:
*charErrorBuffer++ = static_cast<uint8_t>(diff >> 16);
U_FALLTHROUGH; case 2:
*charErrorBuffer++ = static_cast<uint8_t>(diff >> 8);
U_FALLTHROUGH; case 1:
*charErrorBuffer = static_cast<uint8_t>(diff);
U_FALLTHROUGH; default: /* will never occur */ break;
}
cnv->charErrorBufferLength = static_cast<int8_t>(length);
/* now output what fits into the regular target */
diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3:
*target++ = static_cast<uint8_t>(diff >> 16);
U_FALLTHROUGH; case 2:
*target++ = static_cast<uint8_t>(diff >> 8);
U_FALLTHROUGH; case 1:
*target++ = static_cast<uint8_t>(diff);
U_FALLTHROUGH; default: /* will never occur */ break;
}
/** * Function for BOCU-1 decoder; handles multi-byte lead bytes. * * @param b lead byte; * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD * @return (diff<<2)|count
*/ staticinline int32_t
decodeBocu1LeadByte(int32_t b) {
int32_t diff, count;
/* return the state for decoding the trail byte(s) */ return (static_cast<uint32_t>(diff) << 2) | count;
}
/** * Function for BOCU-1 decoder; handles multi-byte trail bytes. * * @param count number of remaining trail bytes including this one * @param b trail byte * @return new delta for diff including b - <0 indicates an error * * @see decodeBocu1
*/ staticinline int32_t
decodeBocu1TrailByte(int32_t count, int32_t b) { if(b<=0x20) { /* skip some C0 controls and make the trail byte range contiguous */
b=bocu1ByteToTrail[b]; /* b<0 for an illegal trail byte value will result in return<0 below */ #if BOCU1_MAX_TRAIL<0xff
} elseif(b>BOCU1_MAX_TRAIL) { return -99; #endif
} else {
b-=BOCU1_TRAIL_BYTE_OFFSET;
}
/* set up the local pointers */
cnv=pArgs->converter;
source = reinterpret_cast<const uint8_t*>(pArgs->source);
sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit);
target=pArgs->target;
targetLimit=pArgs->targetLimit;
offsets=pArgs->offsets;
/* get the converter state from UConverter */
prev = static_cast<int32_t>(cnv->toUnicodeStatus); if(prev==0) {
prev=BOCU1_ASCII_PREV;
}
diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
count=diff&3;
diff>>=2;
byteIndex=cnv->toULength;
bytes=cnv->toUBytes;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ if(count>0 && byteIndex>0 && target<targetLimit) { goto getTrail;
}
fastSingle: /* fast loop for single-byte differences */ /* use count as the only loop counter variable */
diff = static_cast<int32_t>(sourceLimit - source);
count = static_cast<int32_t>(pArgs->targetLimit - target); if(count>diff) {
count=diff;
} while(count>0) { if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
c=prev+(c-BOCU1_MIDDLE); if(c<0x3000) {
*target++ = static_cast<char16_t>(c);
*offsets++=nextSourceIndex++;
prev=BOCU1_SIMPLE_PREV(c);
} else { break;
}
} elseif(c<=0x20) { if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<char16_t>(c);
*offsets++=nextSourceIndex++;
} else { break;
}
++source;
--count;
}
sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
/* decode a sequence of single and lead bytes */ while(source<sourceLimit) { if(target>=targetLimit) { /* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; break;
}
++nextSourceIndex;
c=*source++; if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { /* Write a code point directly from a single-byte difference. */
c=prev+(c-BOCU1_MIDDLE); if(c<0x3000) {
*target++ = static_cast<char16_t>(c);
*offsets++=sourceIndex;
prev=BOCU1_SIMPLE_PREV(c);
sourceIndex=nextSourceIndex; goto fastSingle;
}
} elseif(c<=0x20) { /* * Direct-encoded C0 control code or space. * Reset prev for C0 control codes but not for space.
*/ if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<char16_t>(c);
*offsets++=sourceIndex;
sourceIndex=nextSourceIndex; continue;
} elseif(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { /* Optimize two-byte case. */ if(c>=BOCU1_MIDDLE) {
diff = (c - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
} else {
diff = (c - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
}
/* trail byte */
++nextSourceIndex;
c=decodeBocu1TrailByte(1, *source++); if (c < 0 || static_cast<uint32_t>(c = prev + diff + c) > 0x10ffff) {
bytes[0]=source[-2];
bytes[1]=source[-1];
byteIndex=2;
*pErrorCode=U_ILLEGAL_CHAR_FOUND; break;
}
} elseif(c==BOCU1_RESET) { /* only reset the state, no code point */
prev=BOCU1_ASCII_PREV;
sourceIndex=nextSourceIndex; continue;
} else { /* * For multi-byte difference lead bytes, set the decoder state * with the partial difference value from the lead byte and * with the number of trail bytes.
*/
bytes[0] = static_cast<uint8_t>(c);
byteIndex=1;
if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { /* set the converter state in UConverter to deal with the next character */
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
cnv->mode=0;
} else { /* set the converter state back into UConverter */
cnv->toUnicodeStatus = static_cast<uint32_t>(prev);
cnv->mode = static_cast<int32_t>(static_cast<uint32_t>(diff) << 2) | count;
}
cnv->toULength=byteIndex;
/* write back the updated pointers */
pArgs->source = reinterpret_cast<constchar*>(source);
pArgs->target=target;
pArgs->offsets=offsets;
}
/* * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. * If a change is made in the original function, then either * change this function the same way or * re-copy the original function and remove the variables * offsets, sourceIndex, and nextSourceIndex.
*/ staticvoid U_CALLCONV
_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv; const uint8_t *source, *sourceLimit;
char16_t *target; const char16_t *targetLimit;
int32_t prev, count, diff, c;
int8_t byteIndex;
uint8_t *bytes;
/* set up the local pointers */
cnv=pArgs->converter;
source = reinterpret_cast<const uint8_t*>(pArgs->source);
sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit);
target=pArgs->target;
targetLimit=pArgs->targetLimit;
/* get the converter state from UConverter */
prev = static_cast<int32_t>(cnv->toUnicodeStatus); if(prev==0) {
prev=BOCU1_ASCII_PREV;
}
diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
count=diff&3;
diff>>=2;
byteIndex=cnv->toULength;
bytes=cnv->toUBytes;
/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ if(count>0 && byteIndex>0 && target<targetLimit) { goto getTrail;
}
/* decode a sequence of single and lead bytes */ while(source<sourceLimit) { if(target>=targetLimit) { /* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; break;
}
c=*source++; if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { /* Write a code point directly from a single-byte difference. */
c=prev+(c-BOCU1_MIDDLE); if(c<0x3000) {
*target++ = static_cast<char16_t>(c);
prev=BOCU1_SIMPLE_PREV(c); goto fastSingle;
}
} elseif(c<=0x20) { /* * Direct-encoded C0 control code or space. * Reset prev for C0 control codes but not for space.
*/ if(c!=0x20) {
prev=BOCU1_ASCII_PREV;
}
*target++ = static_cast<char16_t>(c); continue;
} elseif(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { /* Optimize two-byte case. */ if(c>=BOCU1_MIDDLE) {
diff = (c - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
} else {
diff = (c - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
}
/* trail byte */
c=decodeBocu1TrailByte(1, *source++); if (c < 0 || static_cast<uint32_t>(c = prev + diff + c) > 0x10ffff) {
bytes[0]=source[-2];
bytes[1]=source[-1];
byteIndex=2;
*pErrorCode=U_ILLEGAL_CHAR_FOUND; break;
}
} elseif(c==BOCU1_RESET) { /* only reset the state, no code point */
prev=BOCU1_ASCII_PREV; continue;
} else { /* * For multi-byte difference lead bytes, set the decoder state * with the partial difference value from the lead byte and * with the number of trail bytes.
*/
bytes[0] = static_cast<uint8_t>(c);
byteIndex=1;
if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { /* set the converter state in UConverter to deal with the next character */
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
cnv->mode=0;
} else { /* set the converter state back into UConverter */
cnv->toUnicodeStatus = static_cast<uint32_t>(prev);
cnv->mode = (static_cast<uint32_t>(diff) << 2) | count;
}
cnv->toULength=byteIndex;
/* write back the updated pointers */
pArgs->source = reinterpret_cast<constchar*>(source);
pArgs->target=target;
}
/*
 * Website notice (not part of the source code):
 * The information on this web page was compiled carefully and to the best
 * of our knowledge. However, neither completeness, nor correctness,
 * nor quality of the information provided is guaranteed.
 * Note:
 * The colored syntax highlighting and the measurement are still experimental.
 */