"AZaz09 \u007F " + // U+000000 to U+00007F "\u0080 \u0398 \u03BB \u0725 " + // U+000080 to U+0007FF "\u0964 \u0F5F \u20AC \uFFFB", // U+000800 to U+00FFFF
// there would be strings containing non-BMP code points here, but // unfortunately JS strings are UCS-2 (and worse yet are treated as // 16-bit values by the spec), so we have to do gymnastics to work // with non-BMP -- manual surrogate decoding doesn't work because // String.prototype.charCodeAt() ignores surrogate pairs and only // returns 16-bit values
];
// Test conversion equality: each key names a file containing equivalent
// Unicode data, and each value is that file's encoding in the format expected
// by nsIConverter(In|Out)putStream.init.
const UNICODE_FILES = {
  "unicode-conversion.utf8.txt": "UTF-8",
  "unicode-conversion.utf16.txt": "UTF-16",
  "unicode-conversion.utf16le.txt": "UTF-16LE",
  "unicode-conversion.utf16be.txt": "UTF-16BE",
};
/**
 * For every entry of UNICODE_STRINGS: writes the string to a COS created over
 * a fresh pipe's output stream with the "UTF-8" charset argument, closes it,
 * and then uses equalStreams to compare a UTF8 reader over the pipe's input
 * stream against stringToCodePoints of the same string.  A mismatch is
 * reported through do_throw with the index of the offending entry.
 */
function test_utf8_1() {
  for (const [index, text] of UNICODE_STRINGS.entries()) {
    const channel = Pipe();
    const writer = new COS(channel.outputStream, "UTF-8");
    Assert.ok(writer.writeString(text));
    writer.close();

    // Build the reader before the expected code points, matching the order in
    // which the two arguments were originally evaluated.
    const decoded = new UTF8(channel.inputStream);
    const expected = stringToCodePoints(text);
    if (equalStreams(decoded, expected)) {
      continue;
    }
    do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
  }
}
function test_cross_conversion() { for (var fn1 in UNICODE_FILES) { var fin = getBinaryInputStream(fn1); var ss = StorageStream();
var bos = new BOS(ss.getOutputStream(0)); var av; while ((av = fin.available()) > 0) { var data = fin.readByteArray(av);
bos.writeByteArray(data);
}
fin.close();
bos.close();
for (var fn2 in UNICODE_FILES) { var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]); var unichar = new CIS(
ss.newInputStream(0),
UNICODE_FILES[fn1],
8192,
0x0
);
/**
 * Computes 2**n - 1, i.e. for a non-negative integer n the value whose n
 * least-significant bits are all set.
 *
 * @param n
 *        the exponent (number of low bits)
 * @returns 2**n - 1
 */
function lowbits(n) {
  const powerOfTwo = 2 ** n;
  return powerOfTwo - 1;
}
/**
 * Creates a new pipe through the _Pipe constructor.
 *
 * @returns the object constructed by
 *          new _Pipe(false, false, 1024, 10, null)
 */
function Pipe() {
  // NOTE(review): the arguments presumably follow nsIPipe.init (non-blocking
  // input, non-blocking output, segment size, segment count, allocator) --
  // confirm against the definition of _Pipe.
  return new _Pipe(false, false, 1024, 10, null);
}
// complex charset readers
/**
 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *        the stream to wrap
 */
function UTF8(stream) {
  this._stream = new BIS(stream);
}
UTF8.prototype = {
  /**
   * Decodes the code point encoded in UTF-8 at the front of the stream.
   *
   * @returns the numeric code point, or -1 if reading the very first byte
   *          throws (which is treated as end of stream)
   * @throws NS_ERROR_ILLEGAL_VALUE
   *         if the bytes are not a valid, properly encoded code point: a
   *         stray continuation byte, a bad continuation byte, an overlong
   *         form, a surrogate, a value above U+10FFFF, or a lead byte of
   *         0xF8 or greater.  An exception raised by read8() for any byte
   *         after the first is not caught and propagates to the caller.
   */
  readUnit() {
    const str = this._stream;

    // Reads one byte, requires it to look like 10xxxxxx, and returns its six
    // payload bits.
    const readContinuation = () => {
      const b = str.read8();
      if (b >= 0xc0 || b < 0x80) {
        throw NS_ERROR_ILLEGAL_VALUE;
      }
      return b & 0x3f;
    };

    // If at end of stream, must distinguish failure to read any bytes
    // (correct behavior) from failure to read some byte after the first in
    // the character, so only this first read is guarded.
    let c;
    try {
      c = str.read8();
    } catch (e) {
      return -1;
    }

    if (c < 0x80) {
      return c;
    }

    if (c < 0xc0) {
      // c < 11000000
      // byte doesn't have enough leading ones (must be at least two)
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    const bits2 = readContinuation();
    if (c < 0xe0) {
      // c < 11100000
      // two-byte between U+000080 and U+0007FF
      const rv = ((c & 0x1f) << 6) | bits2;
      // no upper bounds-check needed, by previous lines; the lower bound
      // rejects overlong encodings
      if (rv >= 0x80) {
        return rv;
      }
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    const bits3 = readContinuation();
    if (c < 0xf0) {
      // c < 11110000
      // three-byte between U+000800 and U+00FFFF
      const rv = ((c & 0x0f) << 12) | (bits2 << 6) | bits3;
      // no upper bounds-check needed, by previous lines; overlong forms
      // (< 0x800) and the surrogate range U+D800..U+DFFF are rejected
      if (rv >= 0xe000 || (rv >= 0x800 && rv <= 0xd7ff)) {
        return rv;
      }
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    const bits4 = readContinuation();
    if (c < 0xf8) {
      // c < 11111000
      // four-byte between U+010000 and U+10FFFF
      const rv = ((c & 0x07) << 18) | (bits2 << 12) | (bits3 << 6) | bits4;
      // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
      if (rv >= 0x10000 && rv <= 0x10ffff) {
        return rv;
      }
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    // 11111000 or greater -- no UTF-8 mapping
    throw NS_ERROR_ILLEGAL_VALUE;
  },
};
/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * When bigEndian is not passed, a 16-bit byte-order mark is read from the
 * front of the wrapped stream to decide the byte order; if the value read is
 * neither 0xFEFF nor 0xFFFE, do_throw is called.
 *
 * @param stream
 *        the stream to wrap
 * @param bigEndian
 *        true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16
 *        with a byte-order mark
 */
function UTF16(stream, bigEndian) {
  this._stream = new BIS(stream);

  // arguments.length is checked (rather than comparing against undefined) so
  // that only a genuinely omitted argument triggers BOM detection.
  if (arguments.length > 1) {
    this._bigEndian = bigEndian;
    return;
  }

  const bom = this._stream.read16();
  if (bom === 0xfeff) {
    this._bigEndian = true;
  } else if (bom === 0xfffe) {
    this._bigEndian = false;
  } else {
    do_throw("missing BOM: " + bom.toString(16).toUpperCase());
  }
}
UTF16.prototype = { // returns numeric code point at front of stream encoded in UTF-16, // -1 if at end of stream, or throws if UTF-16 code point not found
readUnit() { var str = this._stream;
// if at end of stream, must distinguish failure to read any bytes // (correct behavior) from failure to read some byte after the first // in the character try { var b1 = str.read8();
} catch (e) { return -1;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.