# Copyright Mozilla Foundation. See the COPYRIGHT # file at the top-level directory of this distribution. # # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or # https://www.apache.org/licenses/LICENSE-2.0> or the MIT license # <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your # option. This file may not be copied, modified, or distributed # except according to those terms.
# Preflight: this script reads the WHATWG encoding data and writes generated
# sources into sibling checkouts; bail out early if any of them is missing.
if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
    sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision 1d519bf8e5555cef64cf3a712485f41cd1a6a990 ) next to the encoding_rs directory.\n")
    sys.exit(-1)
if not os.path.isfile("../encoding_c/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n")
    sys.exit(-1)
if not os.path.isfile("../codepage/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n")
    sys.exit(-1)
def cmp_from_end(one, other):
    # Three-way comparison of two sequences: shorter sorts first; equal-length
    # sequences are compared element-wise starting from the END.
    # Returns -1, 0 or 1.
    # (Written without the Python 2-only builtin `cmp` so it also runs on
    # Python 3; `(a > b) - (a < b)` is the standard equivalent.)
    c = (len(one) > len(other)) - (len(one) < len(other))
    if c != 0:
        return c
    i = len(one) - 1
    while i >= 0:
        c = (one[i] > other[i]) - (one[i] < other[i])
        if c != 0:
            return c
        i -= 1
    return 0
# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
encodings_by_code_page_frequency = [
    "UTF-8",
    "UTF-16LE",
    "windows-1252",
    "windows-1251",
    "GBK",
    "Shift_JIS",
    "EUC-KR",
    "windows-1250",
    "windows-1256",
    "windows-1254",
    "Big5",
    "windows-874",
    "windows-1255",
    "windows-1253",
    "windows-1257",
    "windows-1258",
    "EUC-JP",
    "ISO-8859-2",
    "ISO-8859-15",
    "ISO-8859-7",
    "KOI8-R",
    "gb18030",
    "ISO-8859-5",
    "ISO-8859-8-I",
    "ISO-8859-4",
    "ISO-8859-6",
    "ISO-2022-JP",
    "KOI8-U",
    "ISO-8859-13",
    "ISO-8859-3",
    "UTF-16BE",
    "IBM866",
    "ISO-8859-10",
    "ISO-8859-8",
    "macintosh",
    "x-mac-cyrillic",
    "ISO-8859-14",
    "ISO-8859-16",
]
# Append every known code page number (sorted) that is not already in the
# frequency-ordered code_pages list. (Python 2: dict.keys() returns a list,
# so in-place .sort() works.)
temp_keys = encodings_by_code_page.keys()
temp_keys.sort()
for code_page in temp_keys:
    if code_page not in code_pages:
        code_pages.append(code_page)
# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
#
# BUG FIX: in the previous (line-mangled) form the trailing comment after
# "IBM866" swallowed every other entry; the dict must contain all 27 entries.
start_of_longest_run_in_single_byte = {
    "IBM866": 96,  # 0 would be longest, but we don't want to start in the first quadrant
    "windows-874": 33,
    "windows-1250": 92,
    "windows-1251": 64,
    "windows-1252": 32,
    "windows-1253": 83,
    "windows-1254": 95,
    "windows-1255": 96,
    "windows-1256": 65,
    "windows-1257": 95,  # not actually longest
    "windows-1258": 95,  # not actually longest
    "macintosh": 106,  # useless
    "x-mac-cyrillic": 96,
    "KOI8-R": 64,  # not actually longest
    "KOI8-U": 64,  # not actually longest
    "ISO-8859-2": 95,  # not actually longest
    "ISO-8859-3": 95,  # not actually longest
    "ISO-8859-4": 95,  # not actually longest
    "ISO-8859-5": 46,
    "ISO-8859-6": 65,
    "ISO-8859-7": 83,
    "ISO-8859-8": 96,
    "ISO-8859-10": 90,  # not actually longest
    "ISO-8859-13": 95,  # not actually longest
    "ISO-8859-14": 95,
    "ISO-8859-15": 63,
    "ISO-8859-16": 95,  # not actually longest
}
# Partition the WHATWG encoding groups into single-byte vs. multi-byte,
# and collect preferred names, labels, and DOM names.
for group in data:
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = group["encodings"]
    else:
        multi_byte.extend(group["encodings"])
    for encoding in group["encodings"]:
        preferred.append(encoding["name"])
        for label in encoding["labels"]:
            labels.append(Label(label, encoding["name"]))
for name in preferred:
    dom.append(to_dom_name(name))
# NOTE(review): mangled fragment — this write() opens a '''…''' template whose
# closing quotes and '%'-argument tuple are not visible in this chunk, and some
# words inside the emitted Rust doc text appear fused ("asin", "Ifin", "orif").
# The template emits, per encoding, a `FOO_INIT: Encoding` static and a
# `FOO: &'static Encoding` reference static. Restore the original closing
# `''' % (…)` before running — TODO confirm against upstream script.
label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such asin initializers of other `static`s. Ifin doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate orif Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
name: "%s",
variant: VariantEncoding::%s,
};
/// The %s encoding.
///
%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;
# Generate src/test_labels_names.rs: one assert per label, checking that
# Encoding::for_label maps the label to its preferred encoding.
label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
use super::*;
#[test]
fn test_all_labels() { ''')
for label in labels:
    label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))
# Emit the u16 entries of one encoding's index into the data file.
# NOTE(review): `name` and `data_file` come from an enclosing loop/context not
# visible in this chunk — confirm indentation against the full script.
for code_point in indexes[name.lower()]:
    data_file.write('0x%04X,\n' % null_to_zero(code_point))
data_file.write('''], ''')
data_file.write('''};
''')
# Big5
index = indexes["big5"]

# Split the Big5 index into a 16-bit low-bits table plus a packed bitmap
# recording which entries are astral (> U+FFFF); unmapped cells become 0.
astralness = []
low_bits = []
for code_point in index[942:19782]:
    if code_point:
        astralness.append(1 if code_point > 0xFFFF else 0)
        low_bits.append(code_point & 0xFFFF)
    else:
        astralness.append(0)
        low_bits.append(0)

# pad length to multiple of 32
for j in xrange(32 - (len(astralness) % 32)):
    astralness.append(0)

# Pack the astralness bits into u32 words, LSB first.
i = 0
while i < len(astralness):
    accu = 0
    for j in xrange(32):
        accu |= astralness[i + j] << j
    data_file.write('0x%08X,\n' % accu)
    i += 32

data_file.write('''];
''')

static_u16_table("BIG5_LOW_BITS", low_bits)
# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    # Python 2 integer division intended here.
    hanzi_lead = (i / 157) + 0xA4
    hanzi_trail = (i % 157)
    hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
# Hand-picked extra pairs not covered by the slice above.
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
if index[8272:8632] != index[10744:11104]:
    raise Error()
# JIS 0208 symbols (all non-Kanji, non-range items)
# Scan the given pointer ranges for runs of consecutive mapped cells; each
# run is recorded as a (start pointer, length, start array index) triple.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 188),
    (658, 691),
    (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            else:
                # Run ended: record its triple.
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    # Close a run that extends to the end of a scanned range.
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    raise Error()

# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)
# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
    pointer = 1410 + i
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])

# Fast encoder table for Kanji
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
        (lead, trail) = divmod(pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        # unset the high bit of lead if IBM Kanji
        if pointer >= 8272:
            lead = lead & 0x7F
        kanji_bytes[code_point - 0x4E00] = (lead, trail)
# KS X 1001 Hangul
# The Hangul block must be strictly increasing; Error() guards that invariant.
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
    for column in xrange(94):
        code_point = index[9026 + column + (row * 190)]
        if previous_code_point >= code_point:
            raise Error()
        hangul_index.append(code_point)
        previous_code_point = code_point
static_u16_table("KSX1001_HANGUL", hangul_index)

# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
    for column in xrange(94):
        hanja_index.append(index[13966 + column + (row * 190)])
static_u16_table("KSX1001_HANJA", hanja_index)
# KS X 1001 symbols
symbol_index = []
for i in range(6176, 6270):
    symbol_index.append(index[i])
for i in range(6366, 6437):
    symbol_index.append(index[i])
static_u16_table("KSX1001_SYMBOLS", symbol_index)

# KS X 1001 Uppercase Latin (may contain unmapped cells; map null to 0)
subindex = []
for i in range(7506, 7521):
    subindex.append(null_to_zero(index[i]))
static_u16_table("KSX1001_UPPERCASE", subindex)

# KS X 1001 Lowercase Latin
subindex = []
for i in range(7696, 7712):
    subindex.append(index[i])
static_u16_table("KSX1001_LOWERCASE", subindex)

# KS X 1001 Box drawing
subindex = []
for i in range(7126, 7194):
    subindex.append(index[i])
static_u16_table("KSX1001_BOX", subindex)
# KS X 1001 other
# Delta-compress the remaining area: record a pointer/offset pair at every
# position where the code point sequence is not consecutive.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
    for column in xrange(94):
        i = 6556 + column + (row * 190)
        code_point = index[i]
        # Exclude ranges that were processed as lookup tables
        # or that contain unmapped cells by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 6946 and i <= 6950):
            code_point = i - 6946
        elif (i >= 6961 and i <= 6967):
            code_point = i - 6961
        elif (i >= 6992 and i <= 6999):
            code_point = i - 6992
        elif (i >= 7024 and i <= 7029):
            code_point = i - 7024
        elif (i >= 7126 and i <= 7219):
            code_point = i - 7126
        elif (i >= 7395 and i <= 7409):
            code_point = i - 7395
        elif (i >= 7506 and i <= 7521):
            code_point = i - 7506
        elif (i >= 7696 and i <= 7711):
            code_point = i - 7696
        elif (i >= 7969 and i <= 7979):
            code_point = i - 7969
        elif (i >= 8162 and i <= 8169):
            code_point = i - 8162
        elif (i >= 8299 and i <= 8313):
            code_point = i - 8299
        elif (i >= 8347 and i <= 8359):
            code_point = i - 8347
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point

static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
# Fast Hangul and Hanja encode
# Direct-index tables from code point to (lead, trail) byte pair.
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point:
            lead = 0x81 + row
            trail = 0x41 + column
            if code_point >= 0xAC00 and code_point < 0xD7A4:
                hangul_bytes[code_point - 0xAC00] = (lead, trail)
            elif code_point >= 0x4E00 and code_point < 0x9F9D:
                hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
            elif code_point >= 0xF900 and code_point < 0xFA0C:
                hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for i in xrange(6080):
    code_point = index[i]
    # Code points must be non-decreasing in this area.
    if previous_code_point > code_point:
        raise Error()
    if code_point - previous_code_point != 1:
        pointers.append(i)
        offsets.append(code_point)
    previous_code_point = code_point
# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
    for column in xrange(190 - 94):
        i = 7790 + column + (row * 190)
        if i > 23650:
            # Exclude compatibility ideographs at the end
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * (190 - 94)))
            offsets.append(code_point)
        previous_code_point = code_point
# GBK bottom: Compatibility ideographs, Ext A and PUA
bottom_index = []
# 5 compat following Unified Ideographs
for i in range(23651, 23656):
    bottom_index.append(index[i])
# Last row
for i in range(23750, 23846):
    bottom_index.append(index[i])
static_u16_table("GBK_BOTTOM", bottom_index)
# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
hanzi_index = []
for row in xrange(0x77 - 0x2F):
    for column in xrange(94):
        hanzi_index.append(index[9026 + column + (row * 190)])
static_u16_table("GB2312_HANZI", hanzi_index)

# GB2312 symbols
symbol_index = []
for i in xrange(94):
    symbol_index.append(index[6176 + i])
static_u16_table("GB2312_SYMBOLS", symbol_index)

# GB2312 symbols on Greek row (incl. PUA)
symbol_index = []
for i in xrange(22):
    symbol_index.append(index[7189 + i])

# GB2312 Pinyin
pinyin_index = []
for i in xrange(32):
    pinyin_index.append(index[7506 + i])
static_u16_table("GB2312_PINYIN", pinyin_index)
# GB2312 other (excl. bottom PUA)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(14):
    for column in xrange(94):
        i = 6366 + column + (row * 190)
        code_point = index[i]
        # Exclude the two ranges that were processed as
        # lookup tables above by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 7189 and i < 7189 + 22):
            code_point = i - 7189
        elif (i >= 7506 and i < 7506 + 32):
            code_point = i - 7506
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point
# Non-gbk code points
pointers = []
offsets = []
for pair in indexes["gb18030-ranges"]:
    if pair[1] == 0x10000:
        break  # the last entry doesn't fit in u16
    pointers.append(pair[0])
    offsets.append(pair[1])

# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    # Python 2 integer division intended here.
    hanzi_lead = (i / 94) + 0xB0
    hanzi_trail = (i % 94) + 0xA1
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
# Generate src/variant.rs (dispatch enums for decoders/encoders).
# Fixed mangled emitted text: "orhttps" -> "or https".
variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.
''')
# Build the variant name lists. UTF-16LE/BE share one "UTF-16" variant;
# GBK shares gb18030's machinery; replacement/UTF-16 have no encoder.
encoding_variants = [u"single-byte", ]
for encoding in multi_byte:
    if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
        continue
    else:
        encoding_variants.append(encoding["name"])
encoding_variants.append(u"UTF-16")

decoder_variants = []
for variant in encoding_variants:
    if variant == u"GBK":
        continue
    decoder_variants.append(variant)

encoder_variants = []
for variant in encoding_variants:
    if variant in [u"replacement", u"GBK", u"UTF-16"]:
        continue
    encoder_variants.append(variant)
# Emit the `use` lines and the VariantDecoder enum skeleton.
for variant in decoder_variants:
    variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;
pub enum VariantDecoder { ''')
for variant in decoder_variants:
    variant_file.write(" %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
variant_file.write('''}
impl VariantDecoder { ''')
def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
    # Emit one dispatch method on a Variant{Decoder,Encoder} enum.
    # name:     Rust method name to generate and to forward to.
    # mut:      whether the receiver (and match bindings) are `mut`.
    # arg_list: list of (arg_name, arg_type) pairs.
    # ret:      Rust return type, or a falsy value for no return type.
    # variants: variant names to match on.
    # excludes: variants that get a no-op `()` arm instead of forwarding.
    # kind:     "Decoder" or "Encoder" (completes the enum type name).
    variant_file.write('''pub fn %s(&''' % name)
    if mut:
        variant_file.write('''mut ''')
    variant_file.write('''self''')
    for arg in arg_list:
        variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
    variant_file.write(''')''')
    if ret:
        variant_file.write(''' -> %s''' % ret)
    variant_file.write(''' {\nmatch *self {\n''')
    for variant in variants:
        variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
        if mut:
            variant_file.write('''mut ''')
        if variant in excludes:
            variant_file.write('''v) => (),''')
            continue
        variant_file.write('''v) => v.%s(''' % name)
        first = True
        for arg in arg_list:
            if not first:
                variant_file.write(''', ''')
            first = False
            variant_file.write(arg[0])
        variant_file.write('''),\n''')
    variant_file.write('''}\n}\n\n''')
# Generate the single-byte decode/encode tests. ISO-8859-8-I shares data
# with ISO-8859-8, so it is skipped; under Miri only the first two encodings
# are exercised because Miri is too slow.
single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_single_byte_decode() {""")
idx = 0  # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write(""" if cfg!(miri) {
// Miri is too slow return;
}""")
single_byte_file.write("""
}
#[test]
fn test_single_byte_encode() {""")
idx = 0  # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write(""" if cfg!(miri) {
// Miri is too slow return;
}""")
# C header preamble. Fixed mangled emitted text ("orhttps" -> "or https",
# "isnot" -> "is not"); the closing quotes of this write were lost in the
# mangled source and are restored here.
static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.
""")
# UTF-8 byte-classification bitmask table: one entry per byte value, with a
# bit set for each trail-byte class the value does NOT belong to.
for i in range(256):
    combined = (1 << 2)  # invalid lead
    if i < 0x80 or i > 0xBF:
        combined |= (1 << 3)  # normal trail
    if i < 0xA0 or i > 0xBF:
        combined |= (1 << 4)  # three-byte special lower bound
    if i < 0x80 or i > 0x9F:
        combined |= (1 << 5)  # three-byte special upper bound
    if i < 0x90 or i > 0xBF:
        combined |= (1 << 6)  # four-byte special lower bound
    if i < 0x80 or i > 0x8F:
        combined |= (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % combined)

# Lead-byte lane table: exactly one class bit per lead byte value.
for i in range(128, 256):
    lane = (1 << 2)  # invalid lead
    if i >= 0xC2 and i <= 0xDF:
        lane = (1 << 3)  # normal trail
    elif i == 0xE0:
        lane = (1 << 4)  # three-byte special lower bound
    elif i >= 0xE1 and i <= 0xEC:
        lane = (1 << 3)  # normal trail
    elif i == 0xED:
        lane = (1 << 5)  # three-byte special upper bound
    elif i >= 0xEE and i <= 0xEF:
        lane = (1 << 3)  # normal trail
    elif i == 0xF0:
        lane = (1 << 6)  # four-byte special lower bound
    elif i >= 0xF1 and i <= 0xF3:
        lane = (1 << 3)  # normal trail
    elif i == 0xF4:
        lane = (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % lane)

utf_8_file.write("""
],
};
""")
utf_8_file.write(utf_8_rs_end)
utf_8_file.close()
# Unit tests
# Header prepended to every generated test-data file below; the text is part
# of the emitted files, so it must not be altered.
TEST_HEADER = '''Generated from WHATWG indexes.json; see LICENSE-WHATWG.
This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py '''
index = indexes["jis0208"]

# JIS 0208 (EUC-JP) decode test input: every lead/trail pair in order.
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
jis0208_in_file.close()

# Expected decode output: the mapped code point, or U+FFFD for unmapped cells.
jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()
# JIS 0208 encode test: for duplicated code points the encoder must produce
# the FIRST pointer, so pointers in the duplicate ranges are revised.
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0xA1
        trail += 0xA1
        jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()
# Shift_JIS decode test input.
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()

# Expected output: pointers 8836..10715 decode to the EUC PUA range; on
# error, an ASCII trail byte is emitted after U+FFFD.
shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
    if code_point:
        shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 188
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()
# Shift_JIS encode test: skip the PUA gap (8272..8835); duplicates encode to
# the first pointer except when that pointer falls inside the skipped range.
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer >= 1207 and revised_pointer < 1220:
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
    code_point = index[pointer]
    if code_point:
        revised_pointer = index.index(code_point)
        if revised_pointer >= 8272 and revised_pointer < 8836:
            revised_pointer = pointer
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()
# ISO-2022-JP decode test input: each pair wrapped in escape sequences that
# switch to JIS X 0208 and back to ASCII.
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()

iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()
# ISO-2022-JP encode test; afterwards, half-width katakana are normalized to
# their full-width forms and encoded via the full-width pointer.
iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0x21
        trail += 0x21
        iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
        iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for i in xrange(len(half_width_index)):
    code_point = i + 0xFF61
    normalized_code_point = half_width_index[i]
    pointer = index.index(normalized_code_point)
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()
index = indexes["euc-kr"]

# EUC-KR decode test input.
euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
euc_kr_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
euc_kr_in_file.close()

# Expected decode output; on error, an ASCII trail byte follows U+FFFD.
euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x41
        if trail < 0x80:
            euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()

# EUC-KR encode test.
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x41
        euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()
index = indexes["gb18030"]

# gb18030 decode test input.
gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x41
    gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
gb18030_in_file.close()

# Expected decode output; on error, an ASCII trail byte follows U+FFFD.
gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
gb18030_in_ref_file.close()

# gb18030 encode test; pointer 6555 is excluded.
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    if pointer == 6555:
        continue
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x41
        gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()
index = indexes["big5"]

# Big5 decode test input.
big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 157)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x62
    big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
big5_in_file.close()

# For the hand-picked prefer_last code points, Big5 encode must use the LAST
# matching pointer; find it by scanning backwards.
for code_point in prefer_last:
    # Python lists don't have .rindex() :-(
    for i in xrange(len(index) - 1, -1, -1):
        candidate = index[i]
        if candidate == code_point:
            pointer_for_prefer_last.append(i)
            break

# Big5 encode test: start after the HKSCS area; duplicates resolve to the
# first pointer unless the code point is in prefer_last.
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
    code_point = index[pointer]
    if code_point:
        if code_point in prefer_last:
            if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
                continue
        else:
            if pointer != index.index(code_point):
                continue
        (lead, trail) = divmod(pointer, 157)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x62
        big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()
index = indexes["jis0212"]

# JIS 0212 (EUC-JP) decode test input: 0x8F shift byte plus lead/trail pair.
jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
jis0212_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
jis0212_in_file.close()

jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()
# Generate the codepage crate tests: code page -> encoding and back.
# (`name in dict` replaces Python 2-only dict.has_key with an equivalent
# that also works on Python 3.)
for code_page in code_pages:
    codepage_test_file.write(" assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))
codepage_test_file.write("""}
#[test]
fn test_from_encoding() { """)
for name in preferred:
    if name in code_pages_by_encoding:
        codepage_test_file.write(" assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
    else:
        codepage_test_file.write(" assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.