#ifdef U_ENABLE_GENERIC_ISO_2022 /* * I am disabling the generic ISO-2022 converter after proposing to do so on * the icu mailing list two days ago. * * Reasons: * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of * its designation sequences, single shifts with return to the previous state, * switch-with-no-return to UTF-16BE or similar, etc. * This is unlike the language-specific variants like ISO-2022-JP which * require a much smaller repertoire of ISO-2022 features. * These variants continue to be supported. * 2. I believe that no one is really using the generic ISO-2022 converter * but rather always one of the language-specific variants. * Note that ICU's generic ISO-2022 converter has always output one escape * sequence followed by UTF-8 for the whole stream. * 3. Switching between subcharsets is extremely slow, because each time * the previous converter is closed and a new one opened, * without any kind of caching, least-recently-used list, etc. * 4. The code is currently buggy, and given the above it does not seem * reasonable to spend the time on maintenance. * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. * This means, for example, that when ISO-8859-7 is designated, the following * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. * The ICU ISO-2022 converter does not handle this - and has no information * about which subconverter would have to be shifted vs. which is designed * for 7-bit ISO-2022. * * Markus Scherer 2003-dec-03
*/ #endif
/*
 * Native (GR) byte ranges of the graphic character sets.
 *
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START = 0xA1,  /* first native byte of a 94-character set */
    GR94_END   = 0xFE,  /* last native byte of a 94-character set */
    GR96_START = 0xA0,  /* first native byte of a 96-character set */
    GR96_END   = 0xFF   /* last native byte of a 96-character set */
};
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is performed on the unsigned value of c, so that a negative
 * argument yields false instead of reaching the shift with a negative
 * shift count (which would be undefined behavior).
 *
 * Note: c is evaluated more than once; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
/* Charset/state identifiers for the ISO-2022-JP and -CN implementations. */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII         = 0,

    /* CN */
    /*
     * The first few enum constants must keep their values
     * because they correspond to myConverterArray[].
     */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * These are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
/*
 * Is the StateEnum charset value for a DBCS charset?
 * With UCONFIG_ONLY_HTML_CONVERSION only JISX208 qualifies;
 * otherwise every value in the range JISX208..KSC5601 does.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) ((cs)==JISX208)
#else
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
#endif

/* Charset mask: a value with only the bit for charset number cs set. */
#define CSM(cs) ((uint16_t)1<<(cs))
/* * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x * * Note: The converter uses some leniency: * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in * all versions, not just JIS7 and JIS8. * - ICU does not distinguish between different versions of JIS X 0208.
*/ #if UCONFIG_ONLY_HTML_CONVERSION enum { MAX_JA_VERSION=0 }; #else enum { MAX_JA_VERSION=4 }; #endif staticconst uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), #if !UCONFIG_ONLY_HTML_CONVERSION
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) #endif
};
/* Designation and invocation state of the four graphic sets G0..G3. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
/* Result of matching the bytes seen so far against the escape sequence table. */
typedef enum {
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /*
     * so far matches one iso 2022 escape sequence, but by adding
     * more characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
/*
 * The way these state transition arrays work is:
 * ex : ESC$B is the sequence for JISX208
 *      a) First Iteration: char is ESC
 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[0]
 *        iii) Save this index as offset
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      b) Switch on this state and continue to next char
 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 *             which is normalize_esq_chars_2022[36] == 4
 *         ii) x is currently 1 (from above)
 *               x<<=5 -- x is now 32
 *               x+=normalize_esq_chars_2022[36]
 *               now x is 36
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      c) Switch on this state and continue to next char
 *          i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 *         ii) x is currently 36 (from above)
 *               x<<=5 -- x is now 1152
 *               x+=normalize_esq_chars_2022[66]
 *               now x is 1161
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[21]
 *             escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *          v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
 */
/*Below are the 3 arrays depicting a state transition table*/ staticconst int8_t normalize_esq_chars_2022[256] = { /* 0 1 2 3 4 5 6 7 8 9 */
#ifdef U_ENABLE_GENERIC_ISO_2022 /* * When the generic ISO-2022 converter is completely removed, not just disabled * per #ifdef, then the following state table and the associated tables that are * dimensioned with MAX_STATES_2022 should be trimmed. * * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of * the associated escape sequences starting with ESC ( B should be removed. * This includes the ones with key values 1097 and all of the ones above 1000000. * * For the latter, the tables can simply be truncated. * For the former, since the tables must be kept parallel, it is probably best * to simply duplicate an adjacent table cell, parallel in all tables. * * It may make sense to restructure the tables, especially by using small search * tables for the variants instead of indexing them parallel to the table here.
*/ #endif
staticinlinevoid
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ /* in ISO-2022-KR the designator sequence appears only once * in a file so we append it only once
*/ if( converter->charErrorBufferLength==0){
uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
myConverterData->currentType = ASCII1;
cnv->fromUnicodeStatus =false; if(pArgs->locale){
uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
}
version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
myConverterData->version = version; if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{ /* open the required converters and cache them */ if(version>MAX_JA_VERSION) { // ICU 55 fails to open a converter for an unsupported version. // Previously, it fell back to version 0, but that would yield // unexpected behavior.
*errorCode = U_MISSING_RESOURCE_ERROR; return;
} if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
myConverterData->myConverterArray[ISO8859_7] =
ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
}
myConverterData->myConverterArray[JISX208] =
ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); if(jpCharsetMasks[version]&CSM(JISX212)) {
myConverterData->myConverterArray[JISX212] =
ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
} if(jpCharsetMasks[version]&CSM(GB2312)) {
myConverterData->myConverterArray[GB2312] =
ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
} if(jpCharsetMasks[version]&CSM(KSC5601)) {
myConverterData->myConverterArray[KSC5601] =
ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
}
/* set the function pointers to appropriate functions */
cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022JPData);
uprv_strcpy(myConverterData->locale,"ja");
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
size_t len = uprv_strlen(myConverterData->name);
myConverterData->name[len] = static_cast<char>(myConverterData->version + static_cast<int>('0'));
myConverterData->name[len+1]='\0';
} #if !UCONFIG_ONLY_HTML_CONVERSION elseif(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{ if(version>1) { // ICU 55 fails to open a converter for an unsupported version. // Previously, it fell back to version 0, but that would yield // unexpected behavior.
*errorCode = U_MISSING_RESOURCE_ERROR; return;
} constchar *cnvName; if(version==1) {
cnvName="icu-internal-25546";
} else {
cnvName="ibm-949";
myConverterData->version=version=0;
} if(pArgs->onlyTestIsLoadable) {
ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
uprv_free(cnv->extraInfo);
cnv->extraInfo=nullptr; return;
} else {
myConverterData->currentConverter=ucnv_open(cnvName, errorCode); if (U_FAILURE(*errorCode)) {
_ISO2022Close(cnv); return;
}
/* initialize the state variables */
setInitialStateToUnicodeKR(cnv, myConverterData);
setInitialStateFromUnicodeKR(cnv, myConverterData);
/* set the function pointers to appropriate functions */
cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022KRData);
uprv_strcpy(myConverterData->locale,"ko");
}
} elseif(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{ if(version>2) { // ICU 55 fails to open a converter for an unsupported version. // Previously, it fell back to version 0, but that would yield // unexpected behavior.
*errorCode = U_MISSING_RESOURCE_ERROR; return;
}
/* open the required converters and cache them */
myConverterData->myConverterArray[GB2312_1] =
ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); if(version==1) {
myConverterData->myConverterArray[ISO_IR_165] =
ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
}
myConverterData->myConverterArray[CNS_11643] =
ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
/* set the function pointers to appropriate functions */
cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022CNData);
uprv_strcpy(myConverterData->locale,"cn");
cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; /* initialize the state variables */
uprv_strcpy(myConverterData->name,"ISO_2022"); #else
*errorCode = U_MISSING_RESOURCE_ERROR; // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard // data loading error code. return; #endif
}
if (converter->extraInfo != nullptr) { /*close the array of converter pointers and free the memory*/ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { if(array[i]!=nullptr) {
ucnv_unloadSharedDataIfReady(array[i]);
}
}
togo = normalize_esq_chars_2022[static_cast<uint8_t>(c)]; if(togo == 0) { /* not a valid character anywhere in an escape sequence */
*key = 0;
*offset = 0; return INVALID_2022;
}
togo = (*key << 5) + togo;
while (hi != low) /*binary search*/{
int32_t mid = (hi+low) >> 1; /*Finds median*/
if (mid == oldmid) break;
if (escSeqStateTable_Key_2022[mid] > togo){
hi = mid;
} elseif (escSeqStateTable_Key_2022[mid] < togo){
low = mid;
} else/*we found it*/{
*key = togo;
*offset = mid; returnstatic_cast<UCNV_TableStates_2022>(escSeqStateTable_Value_2022[mid]);
}
oldmid = mid;
}
*key = 0;
*offset = 0; return INVALID_2022;
}
/*runs through a state machine to determine the escape sequence - codepage correspondence
*/ staticvoid
changeState_2022(UConverter* _this, constchar** source, constchar* sourceLimit,
Variant2022 var,
UErrorCode* err){
UCNV_TableStates_2022 value;
UConverterDataISO2022* myData2022 = static_cast<UConverterDataISO2022*>(_this->extraInfo);
uint32_t key = myData2022->key;
int32_t offset = 0;
int8_t initialToULength = _this->toULength; char c;
value = VALID_NON_TERMINAL_2022; while (*source < sourceLimit) {
c = *(*source)++;
_this->toUBytes[_this->toULength++] = static_cast<uint8_t>(c);
value = getKey_2022(c, reinterpret_cast<int32_t*>(&key), &offset);
switch (value){
case VALID_NON_TERMINAL_2022 : /* continue with the loop */ break;
case VALID_TERMINAL_2022:
key = 0; goto DONE;
case INVALID_2022: goto DONE;
case VALID_MAYBE_TERMINAL_2022: #ifdef U_ENABLE_GENERIC_ISO_2022 /* ESC ( B is ambiguous only for ISO_2022 itself */ if(var == ISO_2022) { /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
_this->toULength = 0;
/* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
/* continue with the loop */
value = VALID_NON_TERMINAL_2022; break;
} else #endif
{ /* not ISO_2022 itself, finish here */
value = VALID_TERMINAL_2022;
key = 0; goto DONE;
}
}
}
DONE:
myData2022->key = key;
if (value == VALID_NON_TERMINAL_2022) { /* indicate that the escape sequence is incomplete: key!=0 */ return;
} elseif (value == INVALID_2022 ) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} else/* value == VALID_TERMINAL_2022 */ { switch(var){ #ifdef U_ENABLE_GENERIC_ISO_2022 case ISO_2022:
{ constchar *chosenConverterName = escSeqStateTable_Result_2022[offset]; if(chosenConverterName == nullptr) { /* SS2 or SS3 */
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
_this->toUCallbackReason = UCNV_UNASSIGNED; return;
}
_this->mode = UCNV_SI;
ucnv_close(myData2022->currentConverter);
myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); if(U_SUCCESS(*err)) {
myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
_this->mode = UCNV_SO;
} break;
} #endif case ISO_2022_JP:
{
StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeJP[offset]); switch(tempState) { case INVALID_STATE:
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE; break; case SS2_STATE: if(myData2022->toU2022State.cs[2]!=0) { if(myData2022->toU2022State.g<2) {
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
}
myData2022->toU2022State.g=2;
} else { /* illegal to have SS2 before a matching designator */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} break; /* case SS3_STATE: not used in ISO-2022-JP-x */ case ISO8859_1: case ISO8859_7: if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} else { /* G2 charset for SS2 */
myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
} break; default: if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} else { /* G0 charset */
myData2022->toU2022State.cs[0] = static_cast<int8_t>(tempState);
} break;
}
} break; #if !UCONFIG_ONLY_HTML_CONVERSION case ISO_2022_CN:
{
StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeCN[offset]); switch(tempState) { case INVALID_STATE:
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE; break; case SS2_STATE: if(myData2022->toU2022State.cs[2]!=0) { if(myData2022->toU2022State.g<2) {
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
}
myData2022->toU2022State.g=2;
} else { /* illegal to have SS2 before a matching designator */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} break; case SS3_STATE: if(myData2022->toU2022State.cs[3]!=0) { if(myData2022->toU2022State.g<2) {
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
}
myData2022->toU2022State.g=3;
} else { /* illegal to have SS3 before a matching designator */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} break; case ISO_IR_165: if(myData2022->version==0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE; break;
}
U_FALLTHROUGH; case GB2312_1:
U_FALLTHROUGH; case CNS_11643_1:
myData2022->toU2022State.cs[1] = static_cast<int8_t>(tempState); break; case CNS_11643_2:
myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState); break; default: /* other CNS 11643 planes */ if(myData2022->version==0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} else {
myData2022->toU2022State.cs[3] = static_cast<int8_t>(tempState);
} break;
}
} break; case ISO_2022_KR: if(offset==0x30){ /* nothing to be done, just accept this one escape sequence */
} else {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} break; #endif// !UCONFIG_ONLY_HTML_CONVERSION
default:
*err = U_ILLEGAL_ESCAPE_SEQUENCE; break;
}
} if(U_SUCCESS(*err)) {
_this->toULength = 0;
} elseif(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { if(_this->toULength>1) { /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte (ESC) in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. * In escape sequences, all following bytes are "printable", that is, * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), * they are valid single/lead bytes. * For simplicity, we always only report the initial ESC byte as the * illegal sequence and back out all other bytes we looked at.
*/ /* Back out some bytes. */
int8_t backOutDistance=_this->toULength-1;
int8_t bytesFromThisBuffer=_this->toULength-initialToULength; if(backOutDistance<=bytesFromThisBuffer) { /* same as initialToULength<=1 */
*source-=backOutDistance;
} else { /* Back out bytes from the previous buffer: Need to replay them. */
_this->preToULength = static_cast<int8_t>(bytesFromThisBuffer - backOutDistance); /* same as -(initialToULength-1) */ /* preToULength is negative! */
uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
*source-=bytesFromThisBuffer;
}
_this->toULength=1;
}
} elseif(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
_this->toUCallbackReason = UCNV_UNASSIGNED;
}
}
#if !UCONFIG_ONLY_HTML_CONVERSION /*Checks the characters of the buffer against valid 2022 escape sequences *if the match we return a pointer to the initial start of the sequence otherwise *we return sourceLimit
*/ /*for 2022 looks ahead in the stream *to determine the longest possible convertible *data stream
*/ staticinlineconstchar*
getEndOfBuffer_2022(constchar** source, constchar* sourceLimit,
UBool /*flush*/){
constchar* mySource = *source;
#ifdef U_ENABLE_GENERIC_ISO_2022 if (*source >= sourceLimit) return sourceLimit;
do{
if (*mySource == ESC_2022){
int8_t i;
int32_t key = 0;
int32_t offset;
UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
/* Kludge: I could not * figure out the reason for validating an escape sequence * twice - once here and once in changeState_2022(). * is it possible to have an ESC character in a ISO2022 * byte stream which is valid in a code page? Is it legal?
*/ for (i=0;
(mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
i++) {
value = getKey_2022(*(mySource+i), &key, &offset);
} if (value > 0 || *mySource==ESC_2022) return mySource;
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c * any future change in _MBCSFromUChar32() function should be reflected here. * @return number of bytes in *value; negative number if fallback; 0 if no mapping
*/ staticinline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
UChar32 c,
uint32_t* value,
UBool useFallback, int outputType)
{ const int32_t *cx; const uint16_t *table;
uint32_t stage2Entry;
uint32_t myValue;
int32_t length; const uint8_t *p; /* * TODO(markus): Use and require new, faster MBCS conversion table structures. * Use internal version of ucnv_open() that verifies that the new structures are available, * else U_INTERNAL_PROGRAM_ERROR.
*/ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
table=sharedData->mbcs.fromUnicodeTable;
stage2Entry=MBCS_STAGE_2_FROM_U(table, c); /* get the bytes and the length for the output */ if(outputType==MBCS_OUTPUT_2){
myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); if(myValue<=0xff) {
length=1;
} else {
length=2;
}
} else/* outputType==MBCS_OUTPUT_3 */ {
p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
myValue = (static_cast<uint32_t>(*p) << 16) | (static_cast<uint32_t>(p[1]) << 8) | p[2]; if(myValue<=0xff) {
length=1;
} elseif(myValue<=0xffff) {
length=2;
} else {
length=3;
}
} /* is this code point assigned, or do we use fallbacks? */ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { /* assigned */
*value=myValue; return length;
} elseif(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { /* * We allow a 0 byte output if the "assigned" bit is set for this entry. * There is no way with this data structure for fallback output * to be a zero byte.
*/
*value=myValue; return -length;
}
}
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c * any future change in _MBCSSingleFromUChar32() function should be reflected here. * @param retval pointer to output byte * @return 1 roundtrip byte 0 no mapping -1 fallback byte
*/ staticinline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
UChar32 c,
uint32_t* retval,
UBool useFallback)
{ const uint16_t *table;
int32_t value; /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { return 0;
} /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
table=sharedData->mbcs.fromUnicodeTable; /* get the byte for the output */
value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); /* is this code point assigned, or do we use fallbacks? */
*retval = static_cast<uint32_t>(value & 0xff); if(value>=0xf00) { return 1; /* roundtrip */
} elseif(useFallback ? value>=0x800 : value>=0xc00) { return -1; /* fallback taken */
} else { return 0; /* no mapping */
}
}
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * Values that do not fit into two bytes (>0xFFFF) are rejected explicitly
 * rather than being truncated to their low 16 bits for the range test.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    uint32_t lead = value >> 8;     /* for a 2-byte value: the first byte */
    uint32_t trail = value & 0xff;  /* the second byte */

    if(value <= 0xffff &&
        0xa1 <= lead && lead <= 0xfe &&
        0xa1 <= trail && trail <= 0xfe
    ) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    } else {
        return 0; /* not valid for ISO 2022 */
    }
}
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t shifted = value + 0x8080;

    /* accept the shifted value only if both of its bytes land in A1..FE */
    return ((uint16_t)(shifted - 0xa1a1) <= (0xfefe - 0xa1a1) &&
            (uint8_t)(shifted - 0xa1) <= (0xfe - 0xa1))
        ? shifted
        : value;
}
#endif
realSourceLimit = args->sourceLimit; while (args->source < realSourceLimit) { if(myData->key == 0) { /* are we in the middle of an escape sequence? */ /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
/* convert to before the ESC or until the end of the buffer */
myData->isFirstBuffer=false;
sourceStart = args->source;
myTargetStart = args->target;
args->converter = myData->currentConverter;
ucnv_toUnicode(args->converter,
&args->target,
args->targetLimit,
&args->source,
mySourceLimit,
args->offsets,
(UBool)(args->flush && mySourceLimit == realSourceLimit),
err);
args->converter = saveThis;
/************************************** IMPORTANT ************************************************** * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). * The converter iterates over each Unicode codepoint * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is * processed one char at a time it would make sense to reduce the extra processing a canned converter * would do as far as possible. * * If the implementation of these macros or structure of sharedData struct change in the future, make * sure that ISO-2022 is also changed. ***************************************************************************************************
*/
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1, ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
/* preference order of JP charsets */ staticconst StateEnum jpCharsetPref[]={
ASCII,
JISX201,
ISO8859_1,
JISX208,
ISO8859_7,
JISX212,
GB2312,
KSC5601,
HWKANA_7BIT
};
/*
 * Designator escape sequences, one per charset.
 *
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};

/* Byte lengths of the sequences above; parallel to escSeqChars[]. */
static const int8_t escSeqCharsLen[] ={
    3,      /* length of <ESC>(B  ASCII       */
    3,      /* length of <ESC>.A  ISO-8859-1  */
    3,      /* length of <ESC>.F  ISO-8859-7  */
    3,      /* length of <ESC>(J  JISX-201    */
    3,      /* length of <ESC>$B  JISX-208    */
    4,      /* length of <ESC>$(D JISX-212    */
    3,      /* length of <ESC>$A  GB2312      */
    4,      /* length of <ESC>$(C KSC5601     */
    3       /* length of <ESC>(I  HWKANA_7BIT */
};
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings
*              break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only 5C and 7E differ from the identity mapping; every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;    /* U+00A5 YEN SIGN */
    case 0x7e:
        return 0x203e;  /* U+203E OVERLINE */
    default:
        return value;
    }
}
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * Return U+FFFE if unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte values are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}
/* * Take a valid Shift-JIS byte pair, check that it is in the range corresponding * to JIS X 0208, and convert it to a pair of 21..7E bytes. * Return 0 if the byte pair is out of range.
*/ staticinline uint32_t
_2022FromSJIS(uint32_t value) {
uint8_t trail;
if(value > 0xEFFC) { return 0; /* beyond JIS X 0208 */
}
trail = static_cast<uint8_t>(value);
value &= 0xff00; /* lead byte */ if(value <= 0x9f00) {
value -= 0x7000;
} else/* 0xe000 <= value <= 0xef00 */ {
value -= 0xb000;
}
value <<= 1;
/* set up the state */
converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
pFromU2022State = &converterData->fromU2022State;
choiceCount = 0;
/* check if the last codepoint of previous buffer was a lead surrogate*/ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { goto getTrail;
}
sourceChar = *(source++); /*check if the char is a First surrogate*/ if(U16_IS_SURROGATE(sourceChar)) { if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { /* test the following code unit */
char16_t trail = *source; if(U16_IS_TRAIL(trail)) {
source++;
sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
cnv->fromUChar32=0x00; /* convert this supplementary code point */ /* exit this condition tree */
} else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
} else { /* no more input */
cnv->fromUChar32=sourceChar; break;
}
} else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
}
/* do not convert SO/SI/ESC */ if(IS_2022_CONTROL(sourceChar)) { /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
/* do the conversion */
if(choiceCount == 0) {
uint16_t csm;
/* * The csm variable keeps track of which charsets are allowed * and not used yet while building the choices[].
*/
csm = jpCharsetMasks[converterData->version];
choiceCount = 0;
/* JIS7/8: try single-byte half-width Katakana before JISX208 */ if(converterData->version == 3 || converterData->version == 4) {
choices[choiceCount++] = static_cast<int8_t>(HWKANA_7BIT);
} /* Do not try single-byte half-width Katakana for other versions. */
csm &= ~CSM(HWKANA_7BIT);
/* try the current G0 charset */
choices[choiceCount++] = cs = pFromU2022State->cs[0];
csm &= ~CSM(cs);
/* try the current G2 charset */ if((cs = pFromU2022State->cs[2]) != 0) {
choices[choiceCount++] = cs;
csm &= ~CSM(cs);
}
/* try all the other possible charsets */ for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
cs = static_cast<int8_t>(jpCharsetPref[i]); if(CSM(cs) & csm) {
choices[choiceCount++] = cs;
csm &= ~CSM(cs);
}
}
}
cs = g = 0; /* * len==0: no mapping found yet * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks * len>0: found a roundtrip result, done
*/
len = 0; /* * We will turn off useFallback after finding a fallback, * but we still get fallbacks from PUA code points as usual. * Therefore, we will also need to check that we don't overwrite * an early fallback with a later one.
*/
useFallback = cnv->useFallback;
for(i = 0; i < choiceCount && len <= 0; ++i) {
uint32_t value;
int32_t len2;
int8_t cs0 = choices[i]; switch(cs0) { case ASCII: if(sourceChar <= 0x7f) {
targetValue = static_cast<uint32_t>(sourceChar);
len = 1;
cs = cs0;
g = 0;
} break; case ISO8859_1: if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
targetValue = static_cast<uint32_t>(sourceChar) - 0x80;
len = 1;
cs = cs0;
g = 2;
} break; case HWKANA_7BIT: if (static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { if(converterData->version==3) { /* JIS7: use G1 (SO) */ /* Shift U+FF61..U+FF9F to bytes 21..5F. */
targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0x21));
len = 1;
pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
g = 1;
} elseif(converterData->version==4) { /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ /* Shift U+FF61..U+FF9F to bytes A1..DF. */
targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0xa1));
len = 1;
cs = pFromU2022State->cs[0]; if(IS_JP_DBCS(cs)) { /* switch from a DBCS charset to JISX201 */
cs = static_cast<int8_t>(JISX201);
} /* else stay in the current G0 charset */
g = 0;
} /* else do not use HWKANA_7BIT with other versions */
} break; case JISX201: /* G0 SBCS */
value = jisx201FromU(sourceChar); if(value <= 0x7f) {
targetValue = value;
len = 1;
cs = cs0;
g = 0;
useFallback = false;
} break; case JISX208: /* G0 DBCS from Shift-JIS table */
len2 = MBCS_FROM_UCHAR32_ISO2022(
converterData->myConverterArray[cs0],
sourceChar, &value,
useFallback, MBCS_OUTPUT_2); if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
value = _2022FromSJIS(value); if(value != 0) {
targetValue = value;
len = len2;
cs = cs0;
g = 0;
useFallback = false;
}
} elseif(len == 0 && useFallback && static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
targetValue = hwkana_fb[sourceChar - HWKANA_START];
len = -2;
cs = cs0;
g = 0;
useFallback = false;
} break; case ISO8859_7: /* G0 SBCS forced to 7-bit output */
len2 = MBCS_SINGLE_FROM_UCHAR32(
converterData->myConverterArray[cs0],
sourceChar, &value,
useFallback); if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
targetValue = value - 0x80;
len = len2;
cs = cs0;
g = 2;
useFallback = false;
} break; default: /* G0 DBCS */
len2 = MBCS_FROM_UCHAR32_ISO2022(
converterData->myConverterArray[cs0],
sourceChar, &value,
useFallback, MBCS_OUTPUT_2); if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ if(cs0 == KSC5601) { /* * Check for valid bytes for the encoding scheme. * This is necessary because the sub-converter (windows-949) * has a broader encoding scheme than is valid for 2022.
*/
value = _2022FromGR94DBCS(value); if(value == 0) { break;
}
}
targetValue = value;
len = len2;
cs = cs0;
g = 0;
useFallback = false;
} break;
}
}
/* write the shift sequence if necessary */ if(g != pFromU2022State->g) { switch(g) { /* case 0 handled before writing escapes */ case 1:
buffer[outLen++] = UCNV_SO;
pFromU2022State->g = 1; break; default: /* case 2 */
buffer[outLen++] = 0x1b;
buffer[outLen++] = 0x4e; break; /* no case 3: no SS3 in ISO-2022-JP-x */
}
}
/* write the output bytes */ if(len == 1) {
buffer[outLen++] = static_cast<char>(targetValue);
} else/* len == 2 */ {
buffer[outLen++] = static_cast<char>(targetValue >> 8);
buffer[outLen++] = static_cast<char>(targetValue);
}
} else { /* * if we cannot find the character after checking all codepages * then this is an error
*/
*err = U_INVALID_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
if(sourceChar == CR || sourceChar == LF) { /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
pFromU2022State->cs[2] = 0;
choiceCount = 0;
}
/* * the end of the input stream and detection of truncated input * are handled by the framework, but for ISO-2022-JP conversion * we need to be in ASCII mode at the very end * * conditions: * successful * in SO mode or not in ASCII mode * end of input and no truncated input
*/ if( U_SUCCESS(*err) &&
(pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
args->flush && source>=sourceLimit && cnv->fromUChar32==0
) {
int32_t sourceIndex;
/* get the source index of the last input character */ /* * TODO this would be simpler and more reliable if we used a pair * of sourceIndex/prevSourceIndex like in ucnvmbcs.c * so that we could simply use the prevSourceIndex here; * this code gives an incorrect result for the rare case of an unmatched * trail surrogate that is alone in the last buffer of the text stream
*/
sourceIndex = static_cast<int32_t>(source - args->source); if(sourceIndex>0) {
--sourceIndex; if( U16_IS_TRAIL(args->source[sourceIndex]) &&
(sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
) {
--sourceIndex;
}
} else {
sourceIndex=-1;
}
switch(mySourceChar) { case UCNV_SI: if(myData->version==3) {
pToU2022State->g=0; continue;
} else { /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
myData->isEmptySegment = false; /* reset this, we have a different error */ break;
}
case UCNV_SO: if(myData->version==3) { /* JIS7: switch to G1 half-width Katakana */
pToU2022State->cs[1] = static_cast<int8_t>(HWKANA_7BIT);
pToU2022State->g=1; continue;
} else { /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
myData->isEmptySegment = false; /* reset this, we have a different error */ break;
}
/* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore));
}
}
/* invalid or illegal escape sequence */ if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
myData->isEmptySegment = false; /* Reset to avoid future spurious errors */ return;
} /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ if(myData->key==0) {
myData->isEmptySegment = true;
} continue;
/* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
case CR: case LF: /* automatically reset to single-byte mode */ if (static_cast<StateEnum>(pToU2022State->cs[0]) != ASCII && static_cast<StateEnum>(pToU2022State->cs[0]) != JISX201) {
pToU2022State->cs[0] = static_cast<int8_t>(ASCII);
}
pToU2022State->cs[2] = 0;
pToU2022State->g = 0;
U_FALLTHROUGH; default: /* convert one or two bytes */
myData->isEmptySegment = false;
cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]); if (static_cast<uint8_t>(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version == 4 &&
!IS_JP_DBCS(cs)
) { /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
/* return from a single-shift state to the previous one */ if(pToU2022State->g >= 2) {
pToU2022State->g=pToU2022State->prevG;
}
} elseswitch(cs) { case ASCII: if(mySourceChar <= 0x7f) {
targetUniChar = mySourceChar;
} break; case ISO8859_1: if(mySourceChar <= 0x7f) {
targetUniChar = mySourceChar + 0x80;
} /* return from a single-shift state to the previous one */
pToU2022State->g=pToU2022State->prevG; break; case ISO8859_7: if(mySourceChar <= 0x7f) { /* convert mySourceChar+0x80 to use a normal 8-bit table */
targetUniChar =
_MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
myData->myConverterArray[cs],
mySourceChar + 0x80);
} /* return from a single-shift state to the previous one */
pToU2022State->g=pToU2022State->prevG; break; case JISX201: if(mySourceChar <= 0x7f) {
targetUniChar = jisx201ToU(mySourceChar);
} break; case HWKANA_7BIT: if (static_cast<uint8_t>(mySourceChar - 0x21) <= (0x5f - 0x21)) { /* 7-bit halfwidth Katakana */
targetUniChar = mySourceChar + (HWKANA_START - 0x21);
} break; default: /* G0 DBCS */ if(mySource < mySourceLimit) { int leadIsOk, trailIsOk;
uint8_t trailByte;
getTrailByte:
trailByte = static_cast<uint8_t>(*mySource); /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. * * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is * an ESC/SO/SI, we report only the first byte as the illegal sequence. * Otherwise we convert or report the pair of bytes.
*/
leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21); if (leadIsOk && trailIsOk) {
++mySource;
tmpSourceChar = (mySourceChar << 8) | trailByte; if(cs == JISX208) {
_2022ToSJIS(static_cast<uint8_t>(mySourceChar), trailByte, tempBuf);
mySourceChar = tmpSourceChar;
} else { /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
mySourceChar = tmpSourceChar; if (cs == KSC5601) {
tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
}
tempBuf[0] = static_cast<char>(tmpSourceChar >> 8);
tempBuf[1] = static_cast<char>(tmpSourceChar);
}
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
} elseif (!(trailIsOk || IS_2022_CONTROL(trailByte))) { /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++mySource; /* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
}
} else {
args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 1; goto endloop;
}
} /* End of inner switch */ break;
} /* End of outer switch */ if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ if(args->offsets){
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
*(myTarget++) = static_cast<char16_t>(targetUniChar);
} elseif(targetUniChar > missingCharMarker){ /* disassemble the surrogate pair and write to output*/
targetUniChar-=0x0010000;
*myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10)); if(args->offsets){
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
++myTarget; if(myTarget< args->targetLimit){
*myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff)); if(args->offsets){
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
++myTarget;
}else{
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
}
} else{ /* Call the callback function*/
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); break;
}
} else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
*err =U_BUFFER_OVERFLOW_ERROR; break;
}
}
endloop:
args->target = myTarget;
args->source = mySource;
}
#if !UCONFIG_ONLY_HTML_CONVERSION /*************************************************************** * Rules for ISO-2022-KR encoding * i) The KSC5601 designator sequence should appear only once in a file, * at the beginning of a line before any KSC5601 characters. This usually * means that it appears by itself on the first line of the file * ii) There are only 2 shifting sequences SO to shift into double byte mode * and SI to shift into single byte mode
*/ staticvoid U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
converterData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo); /* if the version is 1 then the user is requesting * conversion with ibm-25546 pass the arguments to * MBCS converter and return
*/ if(converterData->version==1){
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); return;
}
} else{ /* oops.. the code point is unassingned * set the error and reason
*/
/*check if the char is a First surrogate*/ if(U16_IS_SURROGATE(sourceChar)) { if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { /* test the following code unit */
char16_t trail = *source; if(U16_IS_TRAIL(trail)) {
source++;
sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
*err = U_INVALID_CHAR_FOUND; /* convert this surrogate code point */ /* exit this condition tree */
} else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
}
} else { /* no more input */
*err = U_ZERO_ERROR;
}
} else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
}
} else { /* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
/* * the end of the input stream and detection of truncated input * are handled by the framework, but for ISO-2022-KR conversion * we need to be in ASCII mode at the very end * * conditions: * successful * not in ASCII mode * end of input and no truncated input
*/ if( U_SUCCESS(*err) &&
isTargetByteDBCS &&
args->flush && source>=sourceLimit && args->converter->fromUChar32==0
) {
int32_t sourceIndex;
/* we are switching to ASCII */
isTargetByteDBCS=false;
/* get the source index of the last input character */ /* * TODO this would be simpler and more reliable if we used a pair * of sourceIndex/prevSourceIndex like in ucnvmbcs.c * so that we could simply use the prevSourceIndex here; * this code gives an incorrect result for the rare case of an unmatched * trail surrogate that is alone in the last buffer of the text stream
*/
sourceIndex = static_cast<int32_t>(source - args->source); if(sourceIndex>0) {
--sourceIndex; if( U16_IS_TRAIL(args->source[sourceIndex]) &&
(sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
) {
--sourceIndex;
}
} else {
sourceIndex=-1;
}
/* remember the original start of the input for offsets */
sourceStart = args->source;
if(myData->key != 0) { /* continue with a partial escape sequence */ goto escape;
}
while(U_SUCCESS(*err) && args->source < args->sourceLimit) { /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
subArgs.source = args->source;
subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); if(subArgs.source != subArgs.sourceLimit) { /* * get the current partial byte sequence * * it needs to be moved between the public and the subconverter * so that the conversion framework, which only sees the public * converter, can handle truncated and illegal input etc.
*/ if(args->converter->toULength > 0) {
uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
}
subArgs.converter->toULength = args->converter->toULength;
/* * Convert up to the end of the input, or to before the next escape character. * Does not handle conversion extensions because the preToU[] state etc. * is not copied.
*/
ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
if(args->offsets != nullptr && sourceStart != args->source) { /* update offsets to base them on the actual start of the input */
int32_t *offsets = args->offsets;
char16_t *target = args->target;
int32_t delta = static_cast<int32_t>(args->source - sourceStart); while(target < subArgs.target) { if(*offsets >= 0) {
*offsets += delta;
}
++offsets;
++target;
}
}
args->source = subArgs.source;
args->target = subArgs.target;
args->offsets = subArgs.offsets;
if(mySourceChar==UCNV_SI){
myData->toU2022State.g = 0; if (myData->isEmptySegment) {
myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 1;
args->target = myTarget;
args->source = mySource; return;
} /*consume the source */ continue;
}elseif(mySourceChar==UCNV_SO){
myData->toU2022State.g = 1;
myData->isEmptySegment = true; /* Begin a new segment, empty so far */ /*consume the source */ continue;
}elseif(mySourceChar==ESC_2022){
mySource--;
escape:
myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_KR, err); if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource; return;
} continue;
}
myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */ if(myData->toU2022State.g == 1) { if(mySource < mySourceLimit) { int leadIsOk, trailIsOk;
uint8_t trailByte;
getTrailByte:
targetUniChar = missingCharMarker;
trailByte = static_cast<uint8_t>(*mySource); /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. * * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is * an ESC/SO/SI, we report only the first byte as the illegal sequence. * Otherwise we convert or report the pair of bytes.
*/
leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21); if (leadIsOk && trailIsOk) {
++mySource;
tempBuf[0] = static_cast<char>(mySourceChar + 0x80);
tempBuf[1] = static_cast<char>(trailByte + 0x80);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
mySourceChar = (mySourceChar << 8) | trailByte;
} elseif (!(trailIsOk || IS_2022_CONTROL(trailByte))) { /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++mySource; /* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
}
} else {
args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 1; break;
}
} elseif(mySourceChar <= 0x7f) {
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
} else {
targetUniChar = 0xffff;
} if(targetUniChar < 0xfffe){ if(args->offsets) {
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
*(myTarget++) = static_cast<char16_t>(targetUniChar);
} else { /* Call the callback function*/
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); break;
}
} else{
*err =U_BUFFER_OVERFLOW_ERROR; break;
}
}
args->target = myTarget;
args->source = mySource;
}
/*************************** END ISO2022-KR *********************************/
/*************************** ISO-2022-CN ********************************* * * Rules for ISO-2022-CN Encoding: * i) The designator sequence must appear once on a line before any instance * of character set it designates. * ii) If two lines contain characters from the same character set, both lines * must include the designator sequence. * iii) Once the designator sequence is known, a shifting sequence has to be found * to invoke the shifting * iv) All lines start in ASCII and end in ASCII. * v) Four shifting sequences are employed for this purpose: * * Sequence ASCII Eq Charsets * ---------- ------- --------- * SI <SI> US-ASCII * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 * SS2 <ESC>N CNS-11643-1992 Plane 2 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 * * vi) * SOdesignator : ESC "$" ")" finalchar_for_SO * SS2designator : ESC "$" "*" finalchar_for_SS2 * SS3designator : ESC "$" "+" finalchar_for_SS3 * * ESC $ ) A Indicates the bytes following SO are Chinese * characters as defined in GB 2312-80, until * another SOdesignation appears * * * ESC $ ) E Indicates the bytes following SO are as defined * in ISO-IR-165 (for details, see section 2.1), * until another SOdesignation appears * * ESC $ ) G Indicates the bytes following SO are as defined * in CNS 11643-plane-1, until another * SOdesignation appears * * ESC $ * H Indicates the two bytes immediately following * SS2 is a Chinese character as defined in CNS * 11643-plane-2, until another SS2designation * appears * (Meaning <ESC>N must precede every 2 byte * sequence.) * * ESC $ + I Indicates the immediate two bytes following SS3 * is a Chinese character as defined in CNS * 11643-plane-3, until another SS3designation * appears * (Meaning <ESC>O must precede every 2 byte * sequence.) * * ESC $ + J Indicates the immediate two bytes following SS3 * is a Chinese character as defined in CNS * 11643-plane-4, until another SS3designation * appears * (In English: <ESC>O must precede every 2 byte * sequence.) 
* * ESC $ + K Indicates the immediate two bytes following SS3 * is a Chinese character as defined in CNS * 11643-plane-5, until another SS3designation * appears * * ESC $ + L Indicates the immediate two bytes following SS3 * is a Chinese character as defined in CNS * 11643-plane-6, until another SS3designation * appears * * ESC $ + M Indicates the immediate two bytes following SS3 * is a Chinese character as defined in CNS * 11643-plane-7, until another SS3designation * appears * * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and * has its own designation information before any Chinese characters * appear *
*/
/*
 * ISO-2022-CN designator escape sequences.
 *
 * Each sequence is the four bytes ESC (0x1B), '$' (0x24), an intermediate
 * byte and a final byte, followed by the implicit NUL terminator:
 *   intermediate ')' (0x29) is used for the first three sequences,
 *   intermediate '*' (0x2A) for CNS 11643 plane 2,
 *   intermediate '+' (0x2B) for CNS 11643 planes 3..7.
 *
 * They are defined as file-scope const arrays (rather than as macros or
 * pointers to literals) to make the strings truly read-only.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
/* set up the state */
converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
pFromU2022State = &converterData->fromU2022State;
choiceCount = 0;
/* check if the last codepoint of previous buffer was a lead surrogate*/ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { goto getTrail;
}
sourceChar = *(source++); /*check if the char is a First surrogate*/ if(U16_IS_SURROGATE(sourceChar)) { if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { /* test the following code unit */
char16_t trail = *source; if(U16_IS_TRAIL(trail)) {
source++;
sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
cnv->fromUChar32=0x00; /* convert this supplementary code point */ /* exit this condition tree */
} else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
} else { /* no more input */
cnv->fromUChar32=sourceChar; break;
}
} else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
}
/* do the conversion */ if(sourceChar <= 0x007f ){ /* do not convert SO/SI/ESC */ if(IS_2022_CONTROL(sourceChar)) { /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
/* US-ASCII */ if(pFromU2022State->g == 0) {
buffer[0] = static_cast<char>(sourceChar);
len = 1;
} else {
buffer[0] = UCNV_SI;
buffer[1] = static_cast<char>(sourceChar);
len = 2;
pFromU2022State->g = 0;
choiceCount = 0;
} if(sourceChar == CR || sourceChar == LF) { /* reset the state at the end of a line */
uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
choiceCount = 0;
}
} else{ /* convert U+0080..U+10ffff */
int32_t i;
int8_t cs, g;
if(choiceCount == 0) { /* try the current SO/G1 converter first */
choices[0] = pFromU2022State->cs[1];
/* default to GB2312_1 if none is designated yet */ if(choices[0] == 0) {
choices[0] = GB2312_1;
}
/* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ if(choices[0] == GB2312_1) {
choices[1] = static_cast<int8_t>(CNS_11643_1);
} else {
choices[1] = static_cast<int8_t>(GB2312_1);
}
cs = g = 0; /* * len==0: no mapping found yet * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks * len>0: found a roundtrip result, done
*/
len = 0; /* * We will turn off useFallback after finding a fallback, * but we still get fallbacks from PUA code points as usual. * Therefore, we will also need to check that we don't overwrite * an early fallback with a later one.
*/
useFallback = cnv->useFallback;
/* write the shift sequence if necessary */ if(g != pFromU2022State->g) { switch(g) { case 1:
buffer[len++] = UCNV_SO;
/* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
pFromU2022State->g = 1; break; case 2:
buffer[len++] = 0x1b;
buffer[len++] = 0x4e; break; default: /* case 3 */
buffer[len++] = 0x1b;
buffer[len++] = 0x4f; break;
}
}
/* write the two output bytes */
buffer[len++] = static_cast<char>(targetValue >> 8);
buffer[len++] = static_cast<char>(targetValue);
} else { /* if we cannot find the character after checking all codepages * then this is an error
*/
*err = U_INVALID_CHAR_FOUND;
cnv->fromUChar32=sourceChar; break;
}
}
/* * the end of the input stream and detection of truncated input * are handled by the framework, but for ISO-2022-CN conversion * we need to be in ASCII mode at the very end * * conditions: * successful * not in ASCII mode * end of input and no truncated input
*/ if( U_SUCCESS(*err) &&
pFromU2022State->g!=0 &&
args->flush && source>=sourceLimit && cnv->fromUChar32==0
) {
int32_t sourceIndex;
/* we are switching to ASCII */
pFromU2022State->g=0;
/* get the source index of the last input character */ /* * TODO this would be simpler and more reliable if we used a pair * of sourceIndex/prevSourceIndex like in ucnvmbcs.c * so that we could simply use the prevSourceIndex here; * this code gives an incorrect result for the rare case of an unmatched * trail surrogate that is alone in the last buffer of the text stream
*/
sourceIndex = static_cast<int32_t>(source - args->source); if(sourceIndex>0) {
--sourceIndex; if( U16_IS_TRAIL(args->source[sourceIndex]) &&
(sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
) {
--sourceIndex;
}
} else {
sourceIndex=-1;
}
switch(mySourceChar){ case UCNV_SI:
pToU2022State->g=0; if (myData->isEmptySegment) {
myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 1;
args->target = myTarget;
args->source = mySource; return;
} continue;
case UCNV_SO: if(pToU2022State->cs[1] != 0) {
pToU2022State->g=1;
myData->isEmptySegment = true; /* Begin a new segment, empty so far */ continue;
} else { /* illegal to have SO before a matching designator */
myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */ break;
}
/* After SO there must be at least one character before a designator (designator error handled separately) */ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore));
}
}
/* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
case CR: case LF:
uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
U_FALLTHROUGH; default: /* convert one or two bytes */
myData->isEmptySegment = false; if(pToU2022State->g != 0) { if(mySource < mySourceLimit) {
UConverterSharedData *cnv;
StateEnum tempState;
int32_t tempBufLen; int leadIsOk, trailIsOk;
uint8_t trailByte;
getTrailByte:
trailByte = static_cast<uint8_t>(*mySource); /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. * * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is * an ESC/SO/SI, we report only the first byte as the illegal sequence. * Otherwise we convert or report the pair of bytes.
*/
leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21); if (leadIsOk && trailIsOk) {
++mySource;
tempState = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]); if(tempState >= CNS_11643_0) {
cnv = myData->myConverterArray[CNS_11643];
tempBuf[0] = static_cast<char>(0x80 + (tempState - CNS_11643_0));
tempBuf[1] = static_cast<char>(mySourceChar);
tempBuf[2] = static_cast<char>(trailByte);
tempBufLen = 3;
}else{
U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
cnv = myData->myConverterArray[tempState];
tempBuf[0] = static_cast<char>(mySourceChar);
tempBuf[1] = static_cast<char>(trailByte);
tempBufLen = 2;
}
targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
mySourceChar = (mySourceChar << 8) | trailByte;
} elseif (!(trailIsOk || IS_2022_CONTROL(trailByte))) { /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++mySource; /* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
} if(pToU2022State->g>=2) { /* return from a single-shift state to the previous one */
pToU2022State->g=pToU2022State->prevG;
}
} else {
args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 1; goto endloop;
}
} else{ if(mySourceChar <= 0x7f) {
targetUniChar = static_cast<char16_t>(mySourceChar);
}
} break;
} if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ if(args->offsets){
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
*(myTarget++) = static_cast<char16_t>(targetUniChar);
} elseif(targetUniChar > missingCharMarker){ /* disassemble the surrogate pair and write to output*/
targetUniChar-=0x0010000;
*myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10)); if(args->offsets){
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
++myTarget; if(myTarget< args->targetLimit){
*myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff)); if(args->offsets){
args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
}
++myTarget;
}else{
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
}
/* set our substitution string into the subconverter */
myConverterData->currentConverter->subChars = reinterpret_cast<uint8_t*>(subchar);
myConverterData->currentConverter->subCharLen = static_cast<int8_t>(length);
/* let the subconverter write the subchar, set/retrieve fromUChar32 state */
args->converter = myConverterData->currentConverter;
myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
ucnv_cbFromUWriteSub(args, 0, err);
cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
args->converter = cnv;
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 *
 * The three members are laid out back to back so that one allocation (or one
 * caller-provided buffer) holds everything the clone needs.
 * The member order is part of the layout; do not reorder.
 */ struct cloneStruct
{
/* The converter object itself; the clone code points its extraInfo at mydata below. */
UConverter cnv;
/* NOTE(review): presumably storage for the clone's copy of the current sub-converter - confirm against the clone function. */
UConverter currentConverter;
/* The clone's own copy of the ISO-2022-specific converter data (filled via memcpy by the clone code). */
UConverterDataISO2022 mydata;
};
/* ucnv.c/ucnv_safeClone() copied the main UConverter already */
uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
localClone->cnv.isExtraLocal = true;
if (U_FAILURE(*pErrorCode)) { return;
} #ifdef U_ENABLE_GENERIC_ISO_2022 if (cnv->sharedData == &_ISO2022Data) { /* We use UTF-8 in this case */
sa->addRange(sa->set, 0, 0xd7FF);
sa->addRange(sa->set, 0xE000, 0x10FFFF); return;
} #endif
/* open a set and initialize it with code points that are algorithmically round-tripped */ switch(cnvData->locale[0]){ case'j': /* include JIS X 0201 which is hardcoded */
sa->add(sa->set, 0xa5);
sa->add(sa->set, 0x203e); if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { /* include Latin-1 for some variants of JP */
sa->addRange(sa->set, 0, 0xff);
} else { /* include ASCII for JP */
sa->addRange(sa->set, 0, 0x7f);
} if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { /* * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) * use half-width Katakana. * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) * half-width Katakana via the ESC ( I sequence. * However, we only emit (fromUnicode) half-width Katakana according to the * definition of each variant. * * When including fallbacks, * we need to include half-width Katakana Unicode code points for all JP variants because * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
*/ /* include half-width Katakana for JP */
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
} break; #if !UCONFIG_ONLY_HTML_CONVERSION case'c': case'z': /* include ASCII for CN */
sa->addRange(sa->set, 0, 0x7f); break; case'k': /* there is only one converter for KR, and it is not in the myConverterArray[] */
cnvData->currentConverter->sharedData->impl->getUnicodeSet(
cnvData->currentConverter, sa, which, pErrorCode); /* the loop over myConverterArray[] will simply not find another converter */ break; #endif default: break;
}
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
cnvData->version==0 && i==CNS_11643
) { /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
ucnv_MBCSGetUnicodeSetForBytes(
cnvData->myConverterArray[i],
sa, UCNV_ROUNDTRIP_SET,
0, 0x81, 0x82,
pErrorCode);
} #endif
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
UConverterSetFilter filter; if(cnvData->myConverterArray[i]!=nullptr) { if(cnvData->locale[0]=='j' && i==JISX208) { /* * Only add code points that map to Shift-JIS codes * corresponding to JIS X 0208.
*/
filter=UCNV_SET_FILTER_SJIS; #if !UCONFIG_ONLY_HTML_CONVERSION
} elseif( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
cnvData->version==0 && i==CNS_11643) { /* * Version-specific for CN: * CN version 0 does not map CNS planes 3..7 although * they are all available in the CNS conversion table; * CN version 1 (-EXT) does map them all. * The two versions create different Unicode sets.
*/
filter=UCNV_SET_FILTER_2022_CN;
} elseif(i==KSC5601) { /* * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) * are broader than GR94.
*/
filter=UCNV_SET_FILTER_GR94DBCS; #endif
} else {
filter=UCNV_SET_FILTER_NONE;
}
ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
}
}
/* * ISO 2022 converters must not convert SO/SI/ESC despite what * sub-converters do by themselves. * Remove these characters from the set.
*/
sa->remove(sa->set, 0x0e);
sa->remove(sa->set, 0x0f);
sa->remove(sa->set, 0x1b);
/* ISO 2022 converters do not convert C1 controls either */
sa->removeRange(sa->set, 0x80, 0x9f);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.