# Copyright Mozilla Foundation. See the COPYRIGHT # file at the top-level directory of this distribution. # # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or # https://www.apache.org/licenses/LICENSE-2.0> or the MIT license # <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your # option. This file may not be copied, modified, or distributed # except according to those terms.
# Preflight: this script reads the WHATWG encoding data and writes generated
# sources into sibling checkouts; bail out early if any of them is missing.
if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
    sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision 1d519bf8e5555cef64cf3a712485f41cd1a6a990 ) next to the encoding_rs directory.\n")
    sys.exit(-1)
if not os.path.isfile("../encoding_c/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n")
    sys.exit(-1)
if not os.path.isfile("../codepage/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n")
    sys.exit(-1)
def cmp_from_end(one, other):
    # Three-way comparison of two sequences: shorter sorts first; equal-length
    # sequences are compared element-wise starting from the END.
    # Returns -1, 0 or 1.
    # (Written without the Python 2-only builtin `cmp` so it also runs on
    # Python 3; `(a > b) - (a < b)` is the standard equivalent.)
    c = (len(one) > len(other)) - (len(one) < len(other))
    if c != 0:
        return c
    i = len(one) - 1
    while i >= 0:
        c = (one[i] > other[i]) - (one[i] < other[i])
        if c != 0:
            return c
        i -= 1
    return 0
# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
encodings_by_code_page_frequency = [
    "UTF-8",
    "UTF-16LE",
    "windows-1252",
    "windows-1251",
    "GBK",
    "Shift_JIS",
    "EUC-KR",
    "windows-1250",
    "windows-1256",
    "windows-1254",
    "Big5",
    "windows-874",
    "windows-1255",
    "windows-1253",
    "windows-1257",
    "windows-1258",
    "EUC-JP",
    "ISO-8859-2",
    "ISO-8859-15",
    "ISO-8859-7",
    "KOI8-R",
    "gb18030",
    "ISO-8859-5",
    "ISO-8859-8-I",
    "ISO-8859-4",
    "ISO-8859-6",
    "ISO-2022-JP",
    "KOI8-U",
    "ISO-8859-13",
    "ISO-8859-3",
    "UTF-16BE",
    "IBM866",
    "ISO-8859-10",
    "ISO-8859-8",
    "macintosh",
    "x-mac-cyrillic",
    "ISO-8859-14",
    "ISO-8859-16",
]
# Append every known code page number (sorted) that is not already in the
# frequency-ordered code_pages list. (Python 2: dict.keys() returns a list,
# so in-place .sort() works.)
temp_keys = encodings_by_code_page.keys()
temp_keys.sort()
for code_page in temp_keys:
    if code_page not in code_pages:
        code_pages.append(code_page)
# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
#
# BUG FIX: in the previous (line-mangled) form the trailing comment after
# "IBM866" swallowed every other entry; the dict must contain all 27 entries.
start_of_longest_run_in_single_byte = {
    "IBM866": 96,  # 0 would be longest, but we don't want to start in the first quadrant
    "windows-874": 33,
    "windows-1250": 92,
    "windows-1251": 64,
    "windows-1252": 32,
    "windows-1253": 83,
    "windows-1254": 95,
    "windows-1255": 96,
    "windows-1256": 65,
    "windows-1257": 95,  # not actually longest
    "windows-1258": 95,  # not actually longest
    "macintosh": 106,  # useless
    "x-mac-cyrillic": 96,
    "KOI8-R": 64,  # not actually longest
    "KOI8-U": 64,  # not actually longest
    "ISO-8859-2": 95,  # not actually longest
    "ISO-8859-3": 95,  # not actually longest
    "ISO-8859-4": 95,  # not actually longest
    "ISO-8859-5": 46,
    "ISO-8859-6": 65,
    "ISO-8859-7": 83,
    "ISO-8859-8": 96,
    "ISO-8859-10": 90,  # not actually longest
    "ISO-8859-13": 95,  # not actually longest
    "ISO-8859-14": 95,
    "ISO-8859-15": 63,
    "ISO-8859-16": 95,  # not actually longest
}
# Partition the WHATWG encoding groups into single-byte vs. multi-byte,
# and collect preferred names, labels, and DOM names.
for group in data:
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = group["encodings"]
    else:
        multi_byte.extend(group["encodings"])
    for encoding in group["encodings"]:
        preferred.append(encoding["name"])
        for label in encoding["labels"]:
            labels.append(Label(label, encoding["name"]))
for name in preferred:
    dom.append(to_dom_name(name))
# NOTE(review): mangled fragment — this write() opens a '''…''' template whose
# closing quotes and '%'-argument tuple are not visible in this chunk, and some
# words inside the emitted Rust doc text appear fused ("asin", "Ifin", "orif").
# The template emits, per encoding, a `FOO_INIT: Encoding` static and a
# `FOO: &'static Encoding` reference static. Restore the original closing
# `''' % (…)` before running — TODO confirm against upstream script.
label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such asin initializers of other `static`s. Ifin doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate orif Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
name: "%s",
variant: VariantEncoding::%s,
};
/// The %s encoding.
///
%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;
# Generate src/test_labels_names.rs: one assert per label, checking that
# Encoding::for_label maps the label to its preferred encoding.
label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
use super::*;
#[test]
fn test_all_labels() { ''')
for label in labels:
    label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))
# Emit the u16 entries of one encoding's index into the data file.
# NOTE(review): `name` and `data_file` come from an enclosing loop/context not
# visible in this chunk — confirm indentation against the full script.
for code_point in indexes[name.lower()]:
    data_file.write('0x%04X,\n' % null_to_zero(code_point))
data_file.write('''], ''')
data_file.write('''};
''')
# Big5
index = indexes["big5"]

# Split the Big5 index into a 16-bit low-bits table plus a packed bitmap
# recording which entries are astral (> U+FFFF); unmapped cells become 0.
astralness = []
low_bits = []
for code_point in index[942:19782]:
    if code_point:
        astralness.append(1 if code_point > 0xFFFF else 0)
        low_bits.append(code_point & 0xFFFF)
    else:
        astralness.append(0)
        low_bits.append(0)

# pad length to multiple of 32
for j in xrange(32 - (len(astralness) % 32)):
    astralness.append(0)

# Pack the astralness bits into u32 words, LSB first.
i = 0
while i < len(astralness):
    accu = 0
    for j in xrange(32):
        accu |= astralness[i + j] << j
    data_file.write('0x%08X,\n' % accu)
    i += 32

data_file.write('''];
''')

static_u16_table("BIG5_LOW_BITS", low_bits)
# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    # Python 2 integer division intended here.
    hanzi_lead = (i / 157) + 0xA4
    hanzi_trail = (i % 157)
    hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
# Hand-picked extra pairs not covered by the slice above.
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
if index[8272:8632] != index[10744:11104]:
    raise Error()
# JIS 0208 symbols (all non-Kanji, non-range items)
# Scan the given pointer ranges for runs of consecutive mapped cells; each
# run is recorded as a (start pointer, length, start array index) triple.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 188),
    (658, 691),
    (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            else:
                # Run ended: record its triple.
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    # Close a run that extends to the end of a scanned range.
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    raise Error()

# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)
# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
    pointer = 1410 + i
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])

# Fast encoder table for Kanji
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
        (lead, trail) = divmod(pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        # unset the high bit of lead if IBM Kanji
        if pointer >= 8272:
            lead = lead & 0x7F
        kanji_bytes[code_point - 0x4E00] = (lead, trail)
# KS X 1001 Hangul
# The Hangul block must be strictly increasing; Error() guards that invariant.
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
    for column in xrange(94):
        code_point = index[9026 + column + (row * 190)]
        if previous_code_point >= code_point:
            raise Error()
        hangul_index.append(code_point)
        previous_code_point = code_point
static_u16_table("KSX1001_HANGUL", hangul_index)

# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
    for column in xrange(94):
        hanja_index.append(index[13966 + column + (row * 190)])
static_u16_table("KSX1001_HANJA", hanja_index)
# KS X 1001 symbols
symbol_index = []
for i in range(6176, 6270):
    symbol_index.append(index[i])
for i in range(6366, 6437):
    symbol_index.append(index[i])
static_u16_table("KSX1001_SYMBOLS", symbol_index)

# KS X 1001 Uppercase Latin (may contain unmapped cells; map null to 0)
subindex = []
for i in range(7506, 7521):
    subindex.append(null_to_zero(index[i]))
static_u16_table("KSX1001_UPPERCASE", subindex)

# KS X 1001 Lowercase Latin
subindex = []
for i in range(7696, 7712):
    subindex.append(index[i])
static_u16_table("KSX1001_LOWERCASE", subindex)

# KS X 1001 Box drawing
subindex = []
for i in range(7126, 7194):
    subindex.append(index[i])
static_u16_table("KSX1001_BOX", subindex)
# KS X 1001 other
# Delta-compress the remaining area: record a pointer/offset pair at every
# position where the code point sequence is not consecutive.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
    for column in xrange(94):
        i = 6556 + column + (row * 190)
        code_point = index[i]
        # Exclude ranges that were processed as lookup tables
        # or that contain unmapped cells by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 6946 and i <= 6950):
            code_point = i - 6946
        elif (i >= 6961 and i <= 6967):
            code_point = i - 6961
        elif (i >= 6992 and i <= 6999):
            code_point = i - 6992
        elif (i >= 7024 and i <= 7029):
            code_point = i - 7024
        elif (i >= 7126 and i <= 7219):
            code_point = i - 7126
        elif (i >= 7395 and i <= 7409):
            code_point = i - 7395
        elif (i >= 7506 and i <= 7521):
            code_point = i - 7506
        elif (i >= 7696 and i <= 7711):
            code_point = i - 7696
        elif (i >= 7969 and i <= 7979):
            code_point = i - 7969
        elif (i >= 8162 and i <= 8169):
            code_point = i - 8162
        elif (i >= 8299 and i <= 8313):
            code_point = i - 8299
        elif (i >= 8347 and i <= 8359):
            code_point = i - 8347
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point

static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
# Fast Hangul and Hanja encode
# Direct-index tables from code point to (lead, trail) byte pair.
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point:
            lead = 0x81 + row
            trail = 0x41 + column
            if code_point >= 0xAC00 and code_point < 0xD7A4:
                hangul_bytes[code_point - 0xAC00] = (lead, trail)
            elif code_point >= 0x4E00 and code_point < 0x9F9D:
                hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
            elif code_point >= 0xF900 and code_point < 0xFA0C:
                hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for i in xrange(6080):
    code_point = index[i]
    # Code points must be non-decreasing in this area.
    if previous_code_point > code_point:
        raise Error()
    if code_point - previous_code_point != 1:
        pointers.append(i)
        offsets.append(code_point)
    previous_code_point = code_point
# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
    for column in xrange(190 - 94):
        i = 7790 + column + (row * 190)
        if i > 23650:
            # Exclude compatibility ideographs at the end
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * (190 - 94)))
            offsets.append(code_point)
        previous_code_point = code_point
# GBK bottom: Compatibility ideographs, Ext A and PUA
bottom_index = []
# 5 compat following Unified Ideographs
for i in range(23651, 23656):
    bottom_index.append(index[i])
# Last row
for i in range(23750, 23846):
    bottom_index.append(index[i])
static_u16_table("GBK_BOTTOM", bottom_index)
# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
hanzi_index = []
for row in xrange(0x77 - 0x2F):
    for column in xrange(94):
        hanzi_index.append(index[9026 + column + (row * 190)])
static_u16_table("GB2312_HANZI", hanzi_index)

# GB2312 symbols
symbol_index = []
for i in xrange(94):
    symbol_index.append(index[6176 + i])
static_u16_table("GB2312_SYMBOLS", symbol_index)

# GB2312 symbols on Greek row (incl. PUA)
symbol_index = []
for i in xrange(22):
    symbol_index.append(index[7189 + i])

# GB2312 Pinyin
pinyin_index = []
for i in xrange(32):
    pinyin_index.append(index[7506 + i])
static_u16_table("GB2312_PINYIN", pinyin_index)
# GB2312 other (excl. bottom PUA)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(14):
    for column in xrange(94):
        i = 6366 + column + (row * 190)
        code_point = index[i]
        # Exclude the two ranges that were processed as
        # lookup tables above by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 7189 and i < 7189 + 22):
            code_point = i - 7189
        elif (i >= 7506 and i < 7506 + 32):
            code_point = i - 7506
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point
# Non-gbk code points
pointers = []
offsets = []
for pair in indexes["gb18030-ranges"]:
    if pair[1] == 0x10000:
        break  # the last entry doesn't fit in u16
    pointers.append(pair[0])
    offsets.append(pair[1])

# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    # Python 2 integer division intended here.
    hanzi_lead = (i / 94) + 0xB0
    hanzi_trail = (i % 94) + 0xA1
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
# Generate src/variant.rs (dispatch enums for decoders/encoders).
# Fixed mangled emitted text: "orhttps" -> "or https".
variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.
''')
# Build the variant name lists. UTF-16LE/BE share one "UTF-16" variant;
# GBK shares gb18030's machinery; replacement/UTF-16 have no encoder.
encoding_variants = [u"single-byte", ]
for encoding in multi_byte:
    if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
        continue
    else:
        encoding_variants.append(encoding["name"])
encoding_variants.append(u"UTF-16")

decoder_variants = []
for variant in encoding_variants:
    if variant == u"GBK":
        continue
    decoder_variants.append(variant)

encoder_variants = []
for variant in encoding_variants:
    if variant in [u"replacement", u"GBK", u"UTF-16"]:
        continue
    encoder_variants.append(variant)
# Emit the `use` lines and the VariantDecoder enum skeleton.
for variant in decoder_variants:
    variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;
pub enum VariantDecoder { ''')
for variant in decoder_variants:
    variant_file.write(" %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))
variant_file.write('''}
impl VariantDecoder { ''')
def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
    # Emit one dispatch method on a Variant{Decoder,Encoder} enum.
    # name:     Rust method name to generate and to forward to.
    # mut:      whether the receiver (and match bindings) are `mut`.
    # arg_list: list of (arg_name, arg_type) pairs.
    # ret:      Rust return type, or a falsy value for no return type.
    # variants: variant names to match on.
    # excludes: variants that get a no-op `()` arm instead of forwarding.
    # kind:     "Decoder" or "Encoder" (completes the enum type name).
    variant_file.write('''pub fn %s(&''' % name)
    if mut:
        variant_file.write('''mut ''')
    variant_file.write('''self''')
    for arg in arg_list:
        variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
    variant_file.write(''')''')
    if ret:
        variant_file.write(''' -> %s''' % ret)
    variant_file.write(''' {\nmatch *self {\n''')
    for variant in variants:
        variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
        if mut:
            variant_file.write('''mut ''')
        if variant in excludes:
            variant_file.write('''v) => (),''')
            continue
        variant_file.write('''v) => v.%s(''' % name)
        first = True
        for arg in arg_list:
            if not first:
                variant_file.write(''', ''')
            first = False
            variant_file.write(arg[0])
        variant_file.write('''),\n''')
    variant_file.write('''}\n}\n\n''')
# Generate the single-byte decode/encode tests. ISO-8859-8-I shares data
# with ISO-8859-8, so it is skipped; under Miri only the first two encodings
# are exercised because Miri is too slow.
single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_single_byte_decode() {""")
idx = 0  # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write(""" if cfg!(miri) {
// Miri is too slow return;
}""")
single_byte_file.write("""
}
#[test]
fn test_single_byte_encode() {""")
idx = 0  # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write(""" if cfg!(miri) {
// Miri is too slow return;
}""")
# C header preamble. Fixed mangled emitted text ("orhttps" -> "or https",
# "isnot" -> "is not"); the closing quotes of this write were lost in the
# mangled source and are restored here.
static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.
""")
# UTF-8 byte-classification bitmask table: one entry per byte value, with a
# bit set for each trail-byte class the value does NOT belong to.
for i in range(256):
    combined = (1 << 2)  # invalid lead
    if i < 0x80 or i > 0xBF:
        combined |= (1 << 3)  # normal trail
    if i < 0xA0 or i > 0xBF:
        combined |= (1 << 4)  # three-byte special lower bound
    if i < 0x80 or i > 0x9F:
        combined |= (1 << 5)  # three-byte special upper bound
    if i < 0x90 or i > 0xBF:
        combined |= (1 << 6)  # four-byte special lower bound
    if i < 0x80 or i > 0x8F:
        combined |= (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % combined)

# Lead-byte lane table: exactly one class bit per lead byte value.
for i in range(128, 256):
    lane = (1 << 2)  # invalid lead
    if i >= 0xC2 and i <= 0xDF:
        lane = (1 << 3)  # normal trail
    elif i == 0xE0:
        lane = (1 << 4)  # three-byte special lower bound
    elif i >= 0xE1 and i <= 0xEC:
        lane = (1 << 3)  # normal trail
    elif i == 0xED:
        lane = (1 << 5)  # three-byte special upper bound
    elif i >= 0xEE and i <= 0xEF:
        lane = (1 << 3)  # normal trail
    elif i == 0xF0:
        lane = (1 << 6)  # four-byte special lower bound
    elif i >= 0xF1 and i <= 0xF3:
        lane = (1 << 3)  # normal trail
    elif i == 0xF4:
        lane = (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % lane)

utf_8_file.write("""
],
};
""")
utf_8_file.write(utf_8_rs_end)
utf_8_file.close()
# Unit tests
# Header prepended to every generated test-data file below; the text is part
# of the emitted files, so it must not be altered.
TEST_HEADER = '''Generated from WHATWG indexes.json; see LICENSE-WHATWG.
This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py '''
index = indexes["jis0208"]

# JIS 0208 (EUC-JP) decode test input: every lead/trail pair in order.
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
jis0208_in_file.close()

# Expected decode output: the mapped code point, or U+FFFD for unmapped cells.
jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()
# JIS 0208 encode test: for duplicated code points the encoder must produce
# the FIRST pointer, so pointers in the duplicate ranges are revised.
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0xA1
        trail += 0xA1
        jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()
# Shift_JIS decode test input.
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()

# Expected output: pointers 8836..10715 decode to the EUC PUA range; on
# error, an ASCII trail byte is emitted after U+FFFD.
shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
    if code_point:
        shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 188
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()
# Shift_JIS encode test: skip the PUA gap (8272..8835); duplicates encode to
# the first pointer except when that pointer falls inside the skipped range.
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer >= 1207 and revised_pointer < 1220:
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
    code_point = index[pointer]
    if code_point:
        revised_pointer = index.index(code_point)
        if revised_pointer >= 8272 and revised_pointer < 8836:
            revised_pointer = pointer
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()
# ISO-2022-JP decode test input: each pair wrapped in escape sequences that
# switch to JIS X 0208 and back to ASCII.
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()

iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()
# ISO-2022-JP encode test; afterwards, half-width katakana are normalized to
# their full-width forms and encoded via the full-width pointer.
iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0x21
        trail += 0x21
        iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
        iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for i in xrange(len(half_width_index)):
    code_point = i + 0xFF61
    normalized_code_point = half_width_index[i]
    pointer = index.index(normalized_code_point)
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()
index = indexes["euc-kr"]

# EUC-KR decode test input.
euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
euc_kr_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
euc_kr_in_file.close()

# Expected decode output; on error, an ASCII trail byte follows U+FFFD.
euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x41
        if trail < 0x80:
            euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()

# EUC-KR encode test.
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x41
        euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()
index = indexes["gb18030"]

# gb18030 decode test input.
gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x41
    gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
gb18030_in_file.close()

# Expected decode output; on error, an ASCII trail byte follows U+FFFD.
gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
gb18030_in_ref_file.close()

# gb18030 encode test; pointer 6555 is excluded.
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    if pointer == 6555:
        continue
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x41
        gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()
index = indexes["big5"]

# Big5 decode test input.
big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 157)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x62
    big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
big5_in_file.close()

# For the hand-picked prefer_last code points, Big5 encode must use the LAST
# matching pointer; find it by scanning backwards.
for code_point in prefer_last:
    # Python lists don't have .rindex() :-(
    for i in xrange(len(index) - 1, -1, -1):
        candidate = index[i]
        if candidate == code_point:
            pointer_for_prefer_last.append(i)
            break

# Big5 encode test: start after the HKSCS area; duplicates resolve to the
# first pointer unless the code point is in prefer_last.
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
    code_point = index[pointer]
    if code_point:
        if code_point in prefer_last:
            if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
                continue
        else:
            if pointer != index.index(code_point):
                continue
        (lead, trail) = divmod(pointer, 157)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x62
        big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()
index = indexes["jis0212"]

# JIS 0212 (EUC-JP) decode test input: 0x8F shift byte plus lead/trail pair.
jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
jis0212_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
jis0212_in_file.close()

jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()
# Generate the codepage crate tests: code page -> encoding and back.
# (`name in dict` replaces Python 2-only dict.has_key with an equivalent
# that also works on Python 3.)
for code_page in code_pages:
    codepage_test_file.write(" assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))
codepage_test_file.write("""}
#[test]
fn test_from_encoding() { """)
for name in preferred:
    if name in code_pages_by_encoding:
        codepage_test_file.write(" assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
    else:
        codepage_test_file.write(" assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.