Quelle base_chars.py

Sprache: Python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re
from collections import namedtuple
from unicodedata import category, combining, normalize

# Exclusive upper bound of the code point range scanned by main():
# one past U+10FFFF.
UNICODE_LIMIT = 0x10FFFF + 1

# Canonical combining class values that is_combining_diacritic() refers to
# by name.
UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9

# One entry of the generated mapping list: a code point and the code point of
# its base character (both stored as integers by main()).
BaseCharMapping = namedtuple("BaseCharMapping", "char base_char")
# One run of mappings sharing the same upper bits (code point >> 8): the low
# bytes of its first and last code points, plus the offset of its first entry
# in the mapping list.
BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", "first last offset")

# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
def is_combining_diacritic(char):
    """Return True unless the combining class of *char* is an excluded one.

    The excluded canonical combining classes are "not reordered" (0), kana
    voicing, virama, and the numeric classes 91, 129, 130 and 132.
    """
    excluded_classes = (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
        91,
        129,
        130,
        132,
    )
    combining_class = combining(char)
    if combining_class in excluded_classes:
        return False
    return True

# Keep this function in sync with IsMathOrMusicSymbol in nsUnicodeProperties.h.
def is_math_or_music_symbol(char):
    """Return True if the general category of *char* is "Sm" or "So"."""
    general_category = category(char)
    return general_category == "Sm" or general_category == "So"

def changes_plane(char, base_char):
    """Return True if *char* and *base_char* differ above their low 16 bits.

    Such mappings are not currently supported: the mapping table only records
    the last 16 bits of the base character, and moving into or out of the
    basic multilingual plane would change the length of a UTF-16 string.
    """
    char_plane = ord(char) >> 16
    base_plane = ord(base_char) >> 16
    return char_plane != base_plane

# Fallback-table lines of interest have the form "<char> → <replacement> ;".
_FALLBACK_LINE_RE = re.compile(r"^(.) → (.+?) ;")


def _decomposition_mappings():
    """Return a {char: base_char} dict gleaned from NFD decompositions."""
    mappings = {}
    for code_point in range(UNICODE_LIMIT):
        char = chr(code_point)
        if is_combining_diacritic(char) or is_math_or_music_symbol(char):
            continue
        decomposition = normalize("NFD", char)
        if len(decomposition) < 2:
            continue
        base_char = decomposition[0]
        if changes_plane(char, base_char):
            continue
        if not is_combining_diacritic(decomposition[1]):
            # Hangul syllables decompose but do not actually have diacritics.
            # This also excludes decompositions with the Japanese marks U+3099
            # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
            # MARK), which we should not ignore for searching (bug 1624244).
            continue
        mappings[char] = base_char
    return mappings


def _add_fallback_mappings(mappings, fallback_table):
    """Add single-character replacements from the fallback table file.

    Entries from the table overwrite any mapping already present for the same
    character. Lines that do not match the expected form, replacements longer
    than one character, and plane-changing replacements are skipped.
    """
    # The context manager closes the file even if parsing raises.
    with open(fallback_table, encoding="UTF-8") as table:
        for line in table:
            match = _FALLBACK_LINE_RE.match(line)
            if not match:
                continue
            char = match.group(1)
            replacement = match.group(2)
            # Strip surrounding single quotes, then one leading backslash.
            if (
                len(replacement) >= 3
                and replacement.startswith("'")
                and replacement.endswith("'")
            ):
                replacement = replacement[1:-1]
            if len(replacement) >= 2 and replacement.startswith("\\"):
                replacement = replacement[1:]
            if len(replacement) > 1:
                continue
            if changes_plane(char, replacement):
                continue
            mappings[char] = replacement


def _build_blocks(mappings):
    """Sort the mappings and group them into one block per 256-code-point page.

    Returns (entries, blocks). `entries` is the sorted list of BaseCharMapping
    tuples, with identity mappings inserted to fill gaps inside a page so each
    block covers a contiguous range. Each block records the low byte of its
    first and last code point and the offset of its first entry in `entries`.
    """
    entries = sorted(
        BaseCharMapping(ord(char), ord(base)) for char, base in mappings.items()
    )
    blocks = []
    i = 0
    # Iterate up to len(entries), not len(entries) - 1, so that a final
    # mapping sitting alone in its page still gets a block of its own.
    while i < len(entries):
        offset = i
        first = entries[i].char & 0xFF
        while (
            i < len(entries) - 1
            and entries[i].char >> 8 == entries[i + 1].char >> 8
        ):
            if entries[i + 1].char - entries[i].char > 1:
                # Fill the gap one code point at a time with identity mappings.
                filler = entries[i].char + 1
                entries.insert(i + 1, BaseCharMapping(filler, filler))
            i += 1
        last = entries[i].char & 0xFF
        blocks.append(BaseCharMappingBlock(first, last, offset))
        i += 1
    return entries, blocks


def _build_block_index(entries, blocks):
    """Return a list mapping each page (code point >> 8) to its block number.

    Pages below the last block's page that have no block are filled with 255.
    """
    indexes = []
    for block_number, block in enumerate(blocks):
        page = entries[block.offset].char >> 8
        indexes.extend([255] * (page - len(indexes)))
        indexes.append(block_number)
    return indexes


def _write_header(header, entries, blocks, indexes):
    """Write the struct definition and the three tables as C source text."""
    header.write("struct BaseCharMappingBlock {\n")
    header.write("  uint8_t mFirst;\n")
    header.write("  uint8_t mLast;\n")
    header.write("  uint16_t mMappingStartOffset;\n")
    header.write("};\n")
    header.write("\n")
    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
    for char, base_char in entries:
        # Only the low 16 bits of the base character are stored.
        header.write(
            "  /* {:#06x}".format(char)
            + " */ "
            + "{:#06x}".format(base_char & 0xFFFF)
            + ","
        )
        if char != base_char:
            header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
        header.write("\n")
    header.write("};\n")
    header.write("\n")
    header.write(
        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
    )
    for block in blocks:
        header.write(
            "  {"
            + "{:#04x}".format(block.first)
            + ", "
            + "{:#04x}".format(block.last)
            + ", "
            + str(block.offset).rjust(4)
            + "}, // "
            + "{:#04x}".format(entries[block.offset].char >> 8)
            + "xx\n"
        )
    header.write("};\n")
    header.write("\n")
    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[] = {\n")
    for page, index in enumerate(indexes):
        header.write(
            "  " + str(index).rjust(3) + ", // " + "{:#04x}".format(page) + "xx\n"
        )
    header.write("};\n")


def main(header, fallback_table):
    """Generate the base-character mapping tables as a C header.

    header: object with a write(str) method that receives the generated text.
    fallback_table: path of a UTF-8 text file whose relevant lines have the
        form "<char> → <replacement> ;".

    Mappings are first gleaned from NFD decompositions of every code point
    below UNICODE_LIMIT, then extended (and possibly overridden) by the
    fallback table, grouped into per-page blocks, and written out.
    """
    mappings = _decomposition_mappings()
    _add_fallback_mappings(mappings, fallback_table)
    entries, blocks = _build_blocks(mappings)
    indexes = _build_block_index(entries, blocks)
    _write_header(header, entries, blocks, indexes)

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.14 Sekunden (vorverarbeitet am 2026-04-27) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.