/**************************************HZ Encoding************************************************* * Rules for HZ encoding * * In ASCII mode, a byte is interpreted as an ASCII character, unless a * '~' is encountered. The character '~' is an escape character. By * convention, it must be immediately followed ONLY by '~', '{' or '\n' * (<LF>), with the following special meaning.
* 1. The escape sequence '~~' is interpreted as a '~'. * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB. * 3. The escape sequence '~\n' is a line-continuation marker to be * consumed with no output produced. * In GB mode, characters are interpreted two bytes at a time as (pure) * GB codes until the escape-from-GB code '~}' is read. This code * switches the mode from GB back to ASCII. (Note that the escape- * from-GB code '~}' ($7E7D) is outside the defined GB range.) * * Source: RFC 1842 * * Note that the formal syntax in RFC 1842 is invalid. I assume that the * intended definition of single-byte-segment is as follows (pedberg): * single-byte-segment = single-byte-seq 1*single-byte-char
*/
if(args->converter->mode == UCNV_TILDE) { /* second byte after ~ */
args->converter->mode=0; switch(mySourceChar) { case 0x0A: /* no output for ~\n (line-continuation marker) */ continue; case UCNV_TILDE: if(args->offsets) {
args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
}
*(myTarget++)=(char16_t)mySourceChar;
myData->isEmptySegment = false; continue; case UCNV_OPEN_BRACE: case UCNV_CLOSE_BRACE:
myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); if (myData->isEmptySegment) {
myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toUBytes[0] = UCNV_TILDE;
args->converter->toUBytes[1] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 2;
args->target = myTarget;
args->source = mySource; return;
}
myData->isEmptySegment = true; continue; default: /* if the first byte is equal to TILDE and the trail byte * is not a valid byte then it is an error condition
*/ /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those.
*/
myData->isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUBytes[0] = UCNV_TILDE; if( myData->isStateDBCS ?
(0x21 <= mySourceChar && mySourceChar <= 0x7e) :
mySourceChar <= 0x7f
) { /* The current byte could be the start of a character: Back it out. */
args->converter->toULength = 1;
--mySource;
} else { /* Include the current byte in the illegal sequence. */
args->converter->toUBytes[1] = static_cast<uint8_t>(mySourceChar);
args->converter->toULength = 2;
}
args->target = myTarget;
args->source = mySource; return;
}
} elseif(myData->isStateDBCS) { if(args->converter->toUnicodeStatus == 0x00){ /* lead byte */ if(mySourceChar == UCNV_TILDE) {
args->converter->mode = UCNV_TILDE;
} else { /* add another bit to distinguish a 0 byte from not having seen a lead byte */
args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
myData->isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */
} continue;
} else{ /* trail byte */ int leadIsOk, trailIsOk;
uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
targetUniChar = 0xffff; /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. * * In HZ DBCS, if the second byte is in the 21..7e range, * we report only the first byte as the illegal sequence. * Otherwise we convert or report the pair of bytes.
*/
leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); if (leadIsOk && trailIsOk) {
tempBuf[0] = (char) (leadByte+0x80) ;
tempBuf[1] = (char) (mySourceChar+0x80);
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
tempBuf, 2, args->converter->useFallback);
mySourceChar= (leadByte << 8) | mySourceChar;
} elseif (trailIsOk) { /* report a single illegal byte and continue with the following DBCS starter byte */
--mySource;
mySourceChar = (int32_t)leadByte;
} else { /* report a pair of illegal bytes if the second byte is not a DBCS starter */ /* add another bit so that the code below writes 2 bytes in case of error */
mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
}
args->converter->toUnicodeStatus =0x00;
}
} else{ if(mySourceChar == UCNV_TILDE) {
args->converter->mode = UCNV_TILDE; continue;
} elseif(mySourceChar <= 0x7f) {
targetUniChar = (char16_t)mySourceChar; /* ASCII */
myData->isEmptySegment = false; /* the segment has something valid */
} else {
targetUniChar = 0xffff;
myData->isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
}
} if(targetUniChar < 0xfffe){ if(args->offsets) {
args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
}
} else{ /* oops.. the code point is unassigned */ /*Handle surrogates */ /*check if the char is a First surrogate*/ if(U16_IS_SURROGATE(mySourceChar)) { if(U16_IS_SURROGATE_LEAD(mySourceChar)) {
args->converter->fromUChar32=mySourceChar;
getTrail: /*look ahead to find the trail surrogate*/ if(mySourceIndex < mySourceLength) { /* test the following code unit */
char16_t trail=(char16_t) args->source[mySourceIndex]; if(U16_IS_TRAIL(trail)) {
++mySourceIndex;
mySourceChar=U16_GET_SUPPLEMENTARY(args->converter->fromUChar32, trail);
args->converter->fromUChar32=0x00; /* there are no surrogates in GB2312*/
*err = U_INVALID_CHAR_FOUND; /* exit this condition tree */
} else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
}
} else { /* no more input */
*err = U_ZERO_ERROR;
}
} else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
}
} else { /* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
/*
 * Structure for cloning an HZ converter into a single memory block.
 *
 * It bundles two UConverter objects and one UConverterDataHZ object so
 * that all three can live in one contiguous block.
 *
 * NOTE(review): the member order defines the layout of that block; the
 * clone routine that fills it is not visible in this part of the file, so
 * confirm there before reordering or inserting members.
 */ struct cloneHZStruct
{
/* presumably the copy of the HZ converter itself - TODO confirm against the clone routine */
UConverter cnv;
/* presumably the copy of the GB sub-converter (see UConverterDataHZ::gbConverter) - TODO confirm */
UConverter subCnv;
/* HZ-specific state; elsewhere in this file a UConverterDataHZ is reached through a converter's extraInfo pointer */
UConverterDataHZ mydata;
};
/* add all of the code points that the sub-converter handles */
ucnv_MBCSGetFilteredUnicodeSetForUnicode(
((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
sa, which, UCNV_SET_FILTER_HZ,
pErrorCode);
}
U_CDECL_END staticconst UConverterImpl _HZImpl={
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.