#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
""" Usage:
    make_intl_data.py langtags [cldr_common.zip]
    make_intl_data.py tzdata
    make_intl_data.py currency
    make_intl_data.py units
    make_intl_data.py numbering

    Target "langtags":
    This script extracts information about 1) mappings between deprecated and
    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
    BCP 47 Unicode extension values from CLDR, and converts it to C++ mapping
    code in intl/components/LocaleGenerated.cpp. The code is used in
    intl/components/Locale.cpp.

    Target "tzdata":
    This script computes which time zone information is not up-to-date in ICU
    and provides the necessary mappings to work around this problem.
    https://ssl.icu-project.org/trac/ticket/12044

    Target "currency":
    Generates the mapping from currency codes to decimal digits used for them.

    Target "units":
    Generate source and test files using the list of so-called "sanctioned unit
    identifiers" and verifies that the ICU data filter includes these units.

    Target "numbering":
    Generate source and test files using the list of numbering systems with
    simple digit mappings and verifies that it's in sync with ICU/CLDR.
"""
import io
import json
import os
import re
import tarfile
import tempfile
from contextlib
import closing
from functools
import partial, total_ordering
from itertools
import chain, filterfalse, groupby, tee, zip_longest
from operator
import attrgetter, itemgetter
from urllib.parse
import urlsplit
from urllib.request
import Request
as UrlRequest
from urllib.request
import urlopen
from zipfile
import ZipFile
import yaml
# From https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    The last chunk is padded with |fillvalue| when the input does not divide
    evenly, e.g. grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx.
    """
    # Every slot refers to the *same* iterator, so zip_longest consumes |n|
    # consecutive items for each tuple it yields.
    shared = iter(iterable)
    return zip_longest(*((shared,) * n), fillvalue=fillvalue)
def writeMappingHeader(println, description, source, url):
    """Emit the comment header that precedes a generated mapping.

    |description| is either a single value or a list of values; each entry is
    written as its own "// ..." line. The header ends with one line naming
    |source| and one line containing |url|. All output goes through |println|.
    """
    # isinstance() also accepts list subclasses. An exact type() comparison
    # would wrap such a list again and print it as one repr() line.
    lines = description if isinstance(description, list) else [description]
    for line in lines:
        println("// {0}".format(line))
    println("// Derived from {0}.".format(source))
    println("// {0}".format(url))
def writeMappingsVar(println, mapping, name, description, source, url):
    """Write a variable definition holding a mapping table.

    Emits an empty line, the comment header built from |description|,
    |source| and |url|, and then the entries of the dictionary |mapping|,
    ordered by key, as an object literal assigned to the variable |name|.
    Everything is written through the |println| function.
    """
    println("")
    writeMappingHeader(println, description, source, url)

    println("var {0} = {{".format(name))
    entries = sorted(mapping.items(), key=itemgetter(0))
    for entry in entries:
        # |entry| is a (key, value) pair.
        println('    "{0}": "{1}",'.format(*entry))
    println("};")
def writeMappingsBinarySearch(
    println,
    fn_name,
    type_name,
    name,
    validate_fn,
    validate_case_fn,
    mappings,
    tag_maxlength,
    description,
    source,
    url,
):
    """Emit code to perform a binary search on language tag subtags.

    Writes a complete `bool mozilla::intl::Locale::<fn_name>(<type_name>
    <name>)` definition through |println|: the comment header, the signature
    with two assertions calling |validate_fn| and |validate_case_fn|, the
    lookup body generated from |mappings| (a dictionary or a set), and the
    closing brace.
    """
    println("")
    writeMappingHeader(println, description, source, url)

    signature = """
bool mozilla::intl::Locale::{0}({1} {2}) {{
  MOZ_ASSERT({3}({2}.Span()));
  MOZ_ASSERT({4}({2}.Span()));
""".format(
        fn_name, type_name, name, validate_fn, validate_case_fn
    )
    println(signature.strip())

    # The same C++ variable is both the lookup key and the value that gets
    # replaced, so |name| is passed as source and as target.
    writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength)

    closing_brace = """
}
"""
    println(closing_brace.lstrip("\n"))
def writeMappingsBinarySearchBody(
    println, source_name, target_name, mappings, tag_maxlength
):
    """Emit the body of a C++ lookup function for subtag replacements.

    |mappings| is either a dictionary from subtag to replacement or a plain
    collection (e.g. a set) of subtags. For a dictionary the emitted code sets
    |target_name| to the replacement and returns true on a match; for a plain
    collection it only returns whether |source_name| is one of the subtags.

    The subtags are grouped by length. Each group is wrapped in a check of
    `<source_name>.Length()`, except the group whose length is
    |tag_maxlength|, which gets a bare block. Groups of up to four subtags are
    emitted as direct comparisons. Larger groups are emitted as static arrays
    together with a call to `SearchReplacement` or `HasReplacement`.
    """
    # Dict subclasses (OrderedDict, defaultdict, ...) carry replacement values
    # as well. An exact type() comparison would treat them as a plain set of
    # subtags and silently drop the replacements.
    has_replacements = isinstance(mappings, dict)

    def write_array(subtags, name, length, fixed):
        """Emit a static C++ array named |name| holding |subtags|."""
        if fixed:
            # One extra char per entry for the terminating NUL.
            println(
                "    static const char {}[{}][{}] = {{".format(
                    name, len(subtags), length + 1
                )
            )
        else:
            println("    static const char* {}[{}] = {{".format(name, len(subtags)))

        # Group in rows of ten to not exceed the 80 column line limit.
        for entries in grouper(subtags, 10):
            # grouper() pads the final row with None; skip the padding.
            entries = (
                '"{}"'.format(tag).rjust(length + 2)
                for tag in entries
                if tag is not None
            )
            println("      {},".format(", ".join(entries)))

        println("    };")

    def equals(subtag):
        """Return the C++ expression comparing |source_name| to |subtag|."""
        return """{}.EqualTo("{}")""".format(source_name, subtag)

    trailing_return = True

    # Sort the subtags by length. That enables using an optimized comparator
    # for the binary search, which only performs a single |memcmp| for multiple
    # of two subtag lengths.
    mappings_keys = mappings.keys() if has_replacements else mappings
    for length, subtags in groupby(sorted(mappings_keys, key=len), len):
        # Omit the length check if the current length is the maximum length.
        if length != tag_maxlength:
            println(
                """
  if ({}.Length() == {}) {{
""".format(
                    source_name, length
                ).rstrip(
                    "\n"
                )
            )
        else:
            # Every path of the unconditional block returns, so no fallback
            # `return false;` is needed after the last group.
            trailing_return = False
            println(
                """
  {
""".rstrip(
                    "\n"
                )
            )

        # The subtags need to be sorted for binary search to work.
        subtags = sorted(subtags)

        # Don't emit a binary search for short lists.
        if len(subtags) == 1:
            if has_replacements:
                println(
                    """
    if ({}) {{
      {}.Set(mozilla::MakeStringSpan("{}"));
      return true;
    }}
    return false;
""".format(
                        equals(subtags[0]), target_name, mappings[subtags[0]]
                    ).strip(
                        "\n"
                    )
                )
            else:
                println(
                    """
    return {};
""".format(
                        equals(subtags[0])
                    ).strip(
                        "\n"
                    )
                )
        elif len(subtags) <= 4:
            if has_replacements:
                for subtag in subtags:
                    println(
                        """
    if ({}) {{
      {}.Set("{}");
      return true;
    }}
""".format(
                            equals(subtag), target_name, mappings[subtag]
                        ).strip(
                            "\n"
                        )
                    )

                println(
                    """
    return false;
""".strip(
                        "\n"
                    )
                )
            else:
                # Align continuation lines with the text after "return ".
                cond = (equals(subtag) for subtag in subtags)
                cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
                println(
                    """
    return {};
""".format(
                        cond
                    ).strip(
                        "\n"
                    )
                )
        else:
            write_array(subtags, source_name + "s", length, True)

            if has_replacements:
                # The aliases array is parallel to the sorted subtags array.
                write_array([mappings[k] for k in subtags], "aliases", length, False)

                println(
                    """
    if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
      {1}.Set(mozilla::MakeStringSpan(replacement));
      return true;
    }}
    return false;
""".format(
                        source_name, target_name
                    ).rstrip()
                )
            else:
                println(
                    """
    return HasReplacement({0}s, {0});
""".format(
                        source_name
                    ).rstrip()
                )

        println(
            """
  }
""".strip(
                "\n"
            )
        )

    if trailing_return:
        println(
            """
  return false;
"""
        )
def writeComplexLanguageTagMappings(
    println, complex_language_mappings, description, source, url
):
    """Emit the C++ function Locale::PerformComplexLanguageMappings().

    |complex_language_mappings| maps a deprecated language subtag to a
    (language, script, region) triple. One `if`/`else if` branch is written
    per distinct triple. Deprecated subtags that share a triple are combined
    into a single condition. A branch always sets the language, and sets the
    script or region only when the replacement has one and the locale's own
    subtag is missing.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
void mozilla::intl::Locale::PerformComplexLanguageMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
""".lstrip()
    )

    # Merge duplicate language entries: collect, in alphabetical order, all
    # deprecated subtags that map to the same replacement. Dictionaries keep
    # insertion order, so the groups are ordered by their first member.
    sources_by_target = {}
    for deprecated_language, (language, script, region) in sorted(
        complex_language_mappings.items(), key=itemgetter(0)
    ):
        sources_by_target.setdefault((language, script, region), []).append(
            deprecated_language
        )

    for index, ((language, script, region), sources) in enumerate(
        sources_by_target.items()
    ):
        if_kind = "else if" if index else "if"

        # Continuation lines are aligned with the first operand of the test.
        separator = " ||\n" + " " * (2 + len(if_kind) + 2)
        cond = separator.join(
            'Language().EqualTo("{}")'.format(lang) for lang in sources
        )

        println(
            """
  {} ({}) {{
""".format(
                if_kind, cond
            ).strip(
                "\n"
            )
        )

        println(
            """
    SetLanguage("{}");
""".format(
                language
            ).strip(
                "\n"
            )
        )

        if script is not None:
            println(
                """
    if (Script().Missing()) {{
      SetScript("{}");
    }}
""".format(
                    script
                ).strip(
                    "\n"
                )
            )

        if region is not None:
            println(
                """
    if (Region().Missing()) {{
      SetRegion("{}");
    }}
""".format(
                    region
                ).strip(
                    "\n"
                )
            )

        println(
            """
  }
""".strip(
                "\n"
            )
        )

    println(
        """
}
""".strip(
            "\n"
        )
    )
def writeComplexRegionTagMappings(
    println, complex_region_mappings, description, source, url
):
    """Emit the C++ function Locale::PerformComplexRegionMappings().

    |complex_region_mappings| maps a deprecated region subtag to a pair
    (default, non_default_replacements), where the second element is a list
    of (language, script, region) triples. One outer branch is written per
    distinct pair. Inside it, the replacement region is picked by comparing
    the locale's language (and script, when given), falling back to |default|
    in a final `else`.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
void mozilla::intl::Locale::PerformComplexRegionMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
  MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span()));
""".lstrip()
    )

    # |non_default_replacements| is a list and hence not hashable. Convert it
    # to a string to get a proper hashable value.
    def hash_key(default, non_default_replacements):
        return (default, str(sorted(str(v) for v in non_default_replacements)))

    def compare_tags(language, script):
        """Return the C++ test for |language| and, if present, |script|."""
        if script is None:
            return 'Language().EqualTo("{}")'.format(language)
        return '(Language().EqualTo("{}") && Script().EqualTo("{}"))'.format(
            language, script
        )

    # Merge duplicate region entries. Each group remembers the replacement
    # data of its alphabetically first region plus all regions sharing it.
    # Dictionaries keep insertion order, so groups are ordered by that first
    # region.
    groups = {}
    for deprecated_region, (default, non_default_replacements) in sorted(
        complex_region_mappings.items(), key=itemgetter(0)
    ):
        key = hash_key(default, non_default_replacements)
        if key in groups:
            groups[key][2].append(deprecated_region)
        else:
            groups[key] = (default, non_default_replacements, [deprecated_region])

    for index, (default, non_default_replacements, regions) in enumerate(
        groups.values()
    ):
        if_kind = "else if" if index else "if"

        separator = " ||\n" + " " * (2 + len(if_kind) + 2)
        cond = separator.join(
            'Region().EqualTo("{}")'.format(region) for region in regions
        )

        println(
            """
  {} ({}) {{
""".format(
                if_kind, cond
            ).strip(
                "\n"
            )
        )

        replacement_regions = sorted(
            {region for (_, _, region) in non_default_replacements}
        )

        for case_index, replacement_region in enumerate(replacement_regions):
            # All (language, script) pairs selecting this replacement region.
            replacement_language_script = sorted(
                (language, script)
                for (language, script, region) in non_default_replacements
                if region == replacement_region
            )

            if_kind = "else if" if case_index else "if"

            separator = " ||\n" + " " * (4 + len(if_kind) + 2)
            cond = separator.join(
                compare_tags(language, script)
                for (language, script) in replacement_language_script
            )

            println(
                """
    {} ({}) {{
      SetRegion("{}");
    }}
""".format(
                    if_kind, cond, replacement_region
                )
                .rstrip()
                .strip("\n")
            )

        println(
            """
    else {{
      SetRegion("{}");
    }}
  }}
""".format(
                default
            )
            .rstrip()
            .strip("\n")
        )

    println(
        """
}
""".strip(
            "\n"
        )
    )
def writeVariantTagMappings(println, variant_mappings, description, source, url):
    """Writes a function definition that maps variant subtags.

    |variant_mappings| maps a deprecated variant subtag either to None (the
    variant is only removed) or to a (kind, replacement) pair, where kind is
    "language", "region" or "variant". The emitted C++ function
    Locale::PerformVariantMappings() iterates over the variants, removes each
    deprecated one and applies its replacement.
    """
    println(
        """
static const char* ToCharPointer(const char* str) {
  return str;
}
static const char* ToCharPointer(const mozilla::intl::UniqueChars& str) {
  return str.get();
}
template <typename T, typename U = T>
static bool IsLessThan(const T& a, const U& b) {
  return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
}
"""
    )
    writeMappingHeader(println, description, source, url)
    println(
        """
bool mozilla::intl::Locale::PerformVariantMappings() {
  // The variant subtags need to be sorted for binary search.
  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
                            IsLessThan<decltype(mVariants)::ElementType>));
  auto removeVariantAt = [&](size_t index) {
    mVariants.erase(mVariants.begin() + index);
  };
  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
    auto* p = std::lower_bound(
        mVariants.begin(), mVariants.end(), variant,
        IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>);
    // Don't insert the replacement when already present.
    if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
      return true;
    }
    // Insert the preferred variant in sort order.
    auto preferred = DuplicateStringToUniqueChars(variant);
    return !!mVariants.insert(p, std::move(preferred));
  };
  for (size_t i = 0; i < mVariants.length();) {
    const char* variant = mVariants[i].get();
    MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeStringSpan(variant)));
""".lstrip()
    )

    # Split the entries into those without and those with a replacement.
    (no_alias, with_alias) = partition(
        variant_mappings.items(), lambda item: item[1] is None
    )

    no_replacements = " ||\n        ".join(
        f"""strcmp(variant, "{deprecated_variant}") == 0"""
        for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0))
    )
    # Without any replacement-less variant the condition would be empty and
    # the generated `if ()` would not compile. Emit a branch that is never
    # taken instead, so the following `else if` chain stays well-formed.
    if not no_replacements:
        no_replacements = "false"

    println(
        f"""
    if ({no_replacements}) {{
      removeVariantAt(i);
    }}
""".strip(
            "\n"
        )
    )

    for deprecated_variant, (kind, replacement) in sorted(
        with_alias, key=itemgetter(0)
    ):
        println(
            f"""
    else if (strcmp(variant, "{deprecated_variant}") == 0) {{
      removeVariantAt(i);
""".strip(
                "\n"
            )
        )

        if kind == "language":
            println(
                f"""
      SetLanguage("{replacement}");
""".strip(
                    "\n"
                )
            )
        elif kind == "region":
            println(
                f"""
      SetRegion("{replacement}");
""".strip(
                    "\n"
                )
            )
        else:
            assert kind == "variant"
            println(
                f"""
      if (!insertVariantSortedIfNotPresent("{replacement}")) {{
        return false;
      }}
""".strip(
                    "\n"
                )
            )

        println(
            """
    }
""".strip(
                "\n"
            )
        )

    # Variants that match no rule are kept; only then advance the index.
    println(
        """
    else {
      i++;
    }
  }
  return true;
}
""".strip(
            "\n"
        )
    )
def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url):
    """Writes a function definition that maps legacy language tags.

    |legacy_mappings| maps (language, script, region, variants) tuples to
    replacement tuples of the same shape. The emitted C++ function
    Locale::UpdateLegacyMappings() handles three groups in order: the single
    language-less "hepburn-heploc" rule, the "sgn" sign language rules, and
    all remaining rules, which replace a language plus variant subtags with a
    new language. Assertions reject any input outside these shapes.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """\
bool mozilla::intl::Locale::UpdateLegacyMappings() {
  // We're mapping legacy tags to non-legacy form here.
  // Other tags remain unchanged.
  //
  // Legacy tags are either sign language tags ("sgn") or have one or multiple
  // variant subtags. Therefore we can quickly exclude most tags by checking
  // these two subtags.
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
  if (!Language().EqualTo("sgn") && mVariants.length() == 0) {
    return true;
  }
#ifdef DEBUG
  for (const auto& variant : Variants()) {
    MOZ_ASSERT(IsStructurallyValidVariantTag(variant));
    MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant));
  }
#endif
  // The variant subtags need to be sorted for binary search.
  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
                            IsLessThan<decltype(mVariants)::ElementType>));
  auto findVariant = [this](const char* variant) {
    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
                               IsLessThan<decltype(mVariants)::ElementType,
                                          decltype(variant)>);
    if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
      return p;
    }
    return static_cast<decltype(p)>(nullptr);
  };
  auto insertVariantSortedIfNotPresent = [&](const char* variant) {
    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
                               IsLessThan<decltype(mVariants)::ElementType,
                                          decltype(variant)>);
    // Don't insert the replacement when already present.
    if (p != mVariants.end() && strcmp(p->get(), variant) == 0) {
      return true;
    }
    // Insert the preferred variant in sort order.
    auto preferred = DuplicateStringToUniqueChars(variant);
    return !!mVariants.insert(p, std::move(preferred));
  };
  auto removeVariant = [&](auto* p) {
    size_t index = std::distance(mVariants.begin(), p);
    mVariants.erase(mVariants.begin() + index);
  };
  auto removeVariants = [&](auto* p, auto* q) {
    size_t pIndex = std::distance(mVariants.begin(), p);
    size_t qIndex = std::distance(mVariants.begin(), q);
    MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted");
    mVariants.erase(mVariants.begin() + qIndex);
    mVariants.erase(mVariants.begin() + pIndex);
  };
"""
    )

    # Helper class for pattern matching: compares equal to anything but None.
    class AnyClass:
        def __eq__(self, obj):
            return obj is not None

    Any = AnyClass()

    # Python can't hash dicts or lists, so use the string representation as the hash key.
    def hash_key(mappings):
        return str(sorted(mappings.items(), key=itemgetter(0)))

    # Count the variant subtags of a (variants, language) item to determine
    # the sort order.
    def variant_size(item):
        return len(item[0].split("-"))

    # Group the mappings by language.
    by_language = {}
    for source_tag, replacement in legacy_mappings.items():
        (language, _, _, _) = source_tag
        by_language.setdefault(language, {})[source_tag] = replacement

    # Handle the empty language case first.
    if None in by_language:
        # Get the mappings and remove them from the dict.
        mappings = by_language.pop(None)

        # This case only applies for the "hepburn-heploc" -> "alalc97"
        # mapping, so just inline it here.
        from_tag = (None, None, None, "hepburn-heploc")
        to_tag = (None, None, None, "alalc97")

        assert len(mappings) == 1
        assert mappings[from_tag] == to_tag

        println(
            """
  if (mVariants.length() >= 2) {
    if (auto* hepburn = findVariant("hepburn")) {
      if (auto* heploc = findVariant("heploc")) {
        removeVariants(hepburn, heploc);
        if (!insertVariantSortedIfNotPresent("alalc97")) {
          return false;
        }
      }
    }
  }
"""
        )

    # Handle sign languages next.
    if "sgn" in by_language:
        mappings = by_language.pop("sgn")

        # Legacy sign language mappings have the form "sgn-XX" where "XX" is
        # some region code.
        assert all(source_tag == ("sgn", None, Any, None) for source_tag in mappings)

        # Legacy sign languages are mapped to a single language subtag.
        assert all(
            replacement == (Any, None, None, None)
            for replacement in mappings.values()
        )

        println(
            """
  if (Language().EqualTo("sgn")) {
    if (Region().Present() && SignLanguageMapping(mLanguage, Region())) {
      mRegion.Set(mozilla::MakeStringSpan(""));
    }
  }
""".rstrip().lstrip(
                "\n"
            )
        )

    # Finally handle all remaining cases.

    # The remaining mappings have neither script nor region subtags in the source locale.
    assert all(
        source_tag == (Any, None, None, Any)
        for mappings in by_language.values()
        for source_tag in mappings
    )

    # And they have neither script nor region nor variant subtags in the target locale.
    assert all(
        replacement == (Any, None, None, None)
        for mappings in by_language.values()
        for replacement in mappings.values()
    )

    # Compact the mappings table by removing empty fields: per language, map
    # the variants string to the replacement language.
    variants_by_language = {
        lang: {
            variants: r_language
            for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items()
        }
        for (lang, mappings) in by_language.items()
    }

    # Try to combine the remaining cases: languages with identical rule sets
    # share one generated branch.
    languages_by_rules = {}
    for lang in sorted(variants_by_language):
        key = hash_key(variants_by_language[lang])
        languages_by_rules.setdefault(key, []).append(lang)

    for langs in languages_by_rules.values():
        # Continuation lines are aligned with the first operand.
        cond = f""" ||\n{" " * len("  else if (")}""".join(
            f"""Language().EqualTo("{lang}")""" for lang in sorted(langs)
        )

        println(
            f"""
  else if ({cond}) {{
""".rstrip().lstrip(
                "\n"
            )
        )

        # All languages of the group have the same rules; use the first.
        mappings = variants_by_language[langs[0]]

        # Alias rules are applied by largest union size first.
        rules_by_size = groupby(
            sorted(mappings.items(), key=variant_size, reverse=True),
            key=variant_size,
        )
        for size, mappings_by_size in rules_by_size:
            # Rules with a single variant are chained with `else`; rules with
            # several variants return from the generated function instead.
            chain_if = size == 1

            # Alias rules are applied in alphabetical order
            ordered_rules = sorted(mappings_by_size, key=itemgetter(0))
            for position, (variants, r_language) in enumerate(ordered_rules):
                sorted_variants = sorted(variants.split("-"))
                len_variants = len(sorted_variants)

                maybe_else = "else " if chain_if and position > 0 else ""

                # One nested `if` per variant subtag that must be present.
                for depth, variant in enumerate(sorted_variants):
                    nesting = "  " * depth
                    println(
                        f"""
    {nesting}{maybe_else}if (auto* {variant} = findVariant("{variant}")) {{
""".rstrip().lstrip(
                            "\n"
                        )
                    )

                indent = "  " * len_variants
                remove_suffix = "s" if len_variants > 1 else ""
                remove_args = ", ".join(sorted_variants)
                maybe_return = "return true;" if not chain_if else ""

                println(
                    f"""
    {indent}removeVariant{remove_suffix}({remove_args});
    {indent}SetLanguage("{r_language}");
    {indent}{maybe_return}
""".rstrip().lstrip(
                        "\n"
                    )
                )

                # Close the nested blocks from the innermost outwards.
                for depth in range(len_variants, 0, -1):
                    nesting = "  " * (depth - 1)
                    println(
                        f"""
    {nesting}}}
""".rstrip().lstrip(
                            "\n"
                        )
                    )

        println(
            """
  }
""".rstrip().lstrip(
                "\n"
            )
        )

    println(
        """
  return true;
}
"""
    )
def writeSignLanguageMappingsFunction(
    println, legacy_mappings, description, source, url
):
    """Writes a function definition that maps legacy sign language tags.

    From |legacy_mappings| only the entries whose source language is "sgn"
    are used. Their region subtag is mapped to the replacement language, and
    the lookup is emitted as the body of the C++ function
    Locale::SignLanguageMapping().
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """\
bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language,
                                                const RegionSubtag& region) {
  MOZ_ASSERT(language.EqualTo("sgn"));
  MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
""".rstrip()
    )

    # Region subtag of each "sgn-XX" tag -> replacement language subtag.
    region_mappings = {}
    for (lang, _, rg, _), (lg, _, _, _) in legacy_mappings.items():
        if lang == "sgn":
            region_mappings[rg] = lg

    # The generated code looks up |region| and overwrites |language|; 3 is
    # passed as the maximum subtag length.
    writeMappingsBinarySearchBody(println, "region", "language", region_mappings, 3)

    println(
        """
}
""".lstrip()
    )
def readSupplementalData(core_file):
"""Reads CLDR Supplemental Data and extracts information for Intl.js.
Information extracted:
- legacyMappings: mappings
from legacy tags to preferred complete language tags
- languageMappings: mappings
from language subtags to preferred subtags
- complexLanguageMappings: mappings
from language subtags
with complex rules
- regionMappings: mappings
from region subtags to preferred subtags
- complexRegionMappings: mappings
from region subtags
with complex rules
- variantMappings: mappings
from variant subtags to preferred subtags
- likelySubtags: likely subtags used
for generating test data only
Returns these mappings
as dictionaries.
"""
import xml.etree.ElementTree
as ET
# From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
re_unicode_language_id = re.compile(
r
"""
^
# unicode_language_id = unicode_language_subtag
# unicode_language_subtag = alpha{2,3} | alpha{5,8}
(?P<language>[a-z]{2,3}|[a-z]{5,8})
# (sep unicode_script_subtag)?
# unicode_script_subtag = alpha{4}
(?:-(?P<script>[a-z]{4}))?
# (sep unicode_region_subtag)?
# unicode_region_subtag = (alpha{2} | digit{3})
(?:-(?P<region>([a-z]{2}|[0-9]{3})))?
# (sep unicode_variant_subtag)*
# unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
(?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
$
""",
re.IGNORECASE | re.VERBOSE,
)
# CLDR uses "_" as the separator for some elements. Replace it with "-".
def bcp47_id(cldr_id):
return cldr_id.replace(
"_",
"-")
# Return the tuple (language, script, region, variants) and assert all
# subtags are in canonical case.
def bcp47_canonical(language, script, region, variants):
# Canonical case for language subtags is lower case.
assert language
is None or language.lower() == language
# Canonical case for script subtags is title case.
assert script
is None or script.title() == script
# Canonical case for region subtags is upper case.
assert region
is None or region.upper() == region
# Canonical case for variant subtags is lower case.
assert variants
is None or variants.lower() == variants
return (language, script, region, variants[1:]
if variants
else None)
# Language ids are interpreted as multi-maps in
# <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>.
#
# See UTS35, §Annex C, Definitions - 1. Multimap interpretation.
def language_id_to_multimap(language_id):
match = re_unicode_language_id.match(language_id)
assert (
match
is not None
), f
"{language_id} invalid Unicode BCP 47 locale identifier"
canonical_language_id = bcp47_canonical(
*match.group(
"language",
"script",
"region",
"variants")
)
(language, _, _, _) = canonical_language_id
# Normalize "und" language to None, but keep the rest as is.
return (language
if language !=
"und" else None,) + canonical_language_id[1:]
rules = {}
territory_exception_rules = {}
tree = ET.parse(core_file.open(
"common/supplemental/supplementalMetadata.xml"))
# Load the rules from supplementalMetadata.xml.
#
# See UTS35, §Annex C, Definitions - 2. Alias elements.
# See UTS35, §Annex C, Preprocessing.
for alias_name
in [
"languageAlias",
"scriptAlias",
"territoryAlias",
"variantAlias",
]:
for alias
in tree.iterfind(
".//" + alias_name):
# Replace '_' by '-'.
type = bcp47_id(alias.get(
"type"))
replacement = bcp47_id(alias.get(
"replacement"))
# Prefix with "und-".
if alias_name !=
"languageAlias":
type =
"und-" + type
# Discard all rules where the type is an invalid languageId.
if re_unicode_language_id.match(type)
is None:
continue
type = language_id_to_multimap(type)
# Multiple, whitespace-separated territory replacements may be present.
if alias_name ==
"territoryAlias" and " " in replacement:
replacements = replacement.split(
" ")
replacement_list = [
language_id_to_multimap(
"und-" + r)
for r
in replacements
]
assert (
type
not in territory_exception_rules
), f
"Duplicate alias rule: {type}"
territory_exception_rules[type] = replacement_list
# The first element is the default territory replacement.
replacement = replacements[0]
# Prefix with "und-".
if alias_name !=
"languageAlias":
replacement =
"und-" + replacement
replacement = language_id_to_multimap(replacement)
assert type
not in rules, f
"Duplicate alias rule: {type}"
rules[type] = replacement
# Helper class for pattern matching.
class AnyClass:
def __eq__(self, obj):
return obj
is not None
Any = AnyClass()
modified_rules =
True
loop_count = 0
while modified_rules:
modified_rules =
False
loop_count += 1
# UTS 35 defines that canonicalization is applied until a fixed point has
# been reached. This iterative application of the canonicalization algorithm
# is only needed for a relatively small set of rules, so we can precompute
# the transitive closure of all rules here and then perform a single pass
# when canonicalizing language tags at runtime.
transitive_rules = {}
# Compute the transitive closure.
# Any case which currently doesn't occur in the CLDR sources isn't supported
# and will lead to throwing an error.
for type, replacement
in rules.items():
(language, script, region, variants) = type
(r_language, r_script, r_region, r_variants) = replacement
for i_type, i_replacement
in rules.items():
(i_language, i_script, i_region, i_variants) = i_type
(i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement
if i_language
is not None and i_language == r_language:
# This case currently only occurs when neither script nor region
# subtags are present. A single variant subtags may be present
# in |type|. And |i_type| definitely has a single variant subtag.
# Should this ever change, update this code accordingly.
assert type == (Any,
None,
None,
None)
or type == (
Any,
None,
None,
Any,
)
assert replacement == (Any,
None,
None,
None)
assert i_type == (Any,
None,
None, Any)
assert i_replacement == (Any,
None,
None,
None)
# This case happens for the rules
# "zh-guoyu -> zh",
# "zh-hakka -> hak", and
# "und-hakka -> und".
# Given the possible input "zh-guoyu-hakka", the first rule will
# change it to "zh-hakka", and then the second rule can be
# applied. (The third rule isn't applied ever.)
#
# Let's assume there's a hypothetical rule
# "zh-aaaaa" -> "en"
# And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en"
# is applied before "zh-hakka -> hak", because rules are sorted
# alphabetically. That means the overall result is "en":
# "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then
# "hakka" is removed through the third rule.
#
# No current rule requires to handle this special case, so we
# don't yet support it.
assert variants
is None or variants <= i_variants
# Combine all variants and remove duplicates.
vars = set(
i_variants.split(
"-")
+ (variants.split(
"-")
if variants
else [])
)
# Add the variants alphabetically sorted.
n_type = (language,
None,
None,
"-".join(sorted(vars)))
assert (
n_type
not in transitive_rules
or transitive_rules[n_type] == i_replacement
)
transitive_rules[n_type] = i_replacement
continue
if i_script
is not None and i_script == r_script:
# This case currently doesn't occur, so we don't yet support it.
raise ValueError(
f
"{type} -> {replacement} :: {i_type} -> {i_replacement}"
)
if i_region
is not None and i_region == r_region:
# This case currently only applies for sign language
# replacements. Similar to the language subtag case any other
# combination isn't currently supported.
assert type == (
None,
None, Any,
None)
assert replacement == (
None,
None, Any,
None)
assert i_type == (
"sgn",
None, Any,
None)
assert i_replacement == (Any,
None,
None,
None)
n_type = (
"sgn",
None, region,
None)
assert n_type
not in transitive_rules
transitive_rules[n_type] = i_replacement
continue
if i_variants
is not None and i_variants == r_variants:
# This case currently doesn't occur, so we don't yet support it.
raise ValueError(
f
"{type} -> {replacement} :: {i_type} -> {i_replacement}"
)
# Ensure there are no contradicting rules.
assert all(
rules[type] == replacement
for (type, replacement)
in transitive_rules.items()
if type
in rules
)
# If |transitive_rules| is not a subset of |rules|, new rules will be added.
modified_rules =
not (transitive_rules.keys() <= rules.keys())
# Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}"
# case. Failing this assertion means either there's a bug when computing the
# stop condition of this loop or a new kind of legacy language tags was added.
if modified_rules
and loop_count > 1:
new_rules = {k
for k
in transitive_rules.keys()
if k
not in rules}
for k
in new_rules:
assert k == (Any,
None,
None,
"guoyu-hakka")
or k == (
Any,
None,
None,
"guoyu-xiang",
)
# Merge the transitive rules.
rules.update(transitive_rules)
# Computes the size of the union of all field value sets.
def multi_map_size(locale_id):
    """Count the subtags present in a locale id tuple.

    ``locale_id`` is a ``(language, script, region, variants)`` tuple.
    Language, script and region each count once when they are not None;
    the variants field counts once per "-"-separated variant subtag.
    """
    language, script, region, variants = locale_id

    # One for every single-valued field that is set.
    total = sum(1 for subtag in (language, script, region) if subtag is not None)

    # The variants field may hold several subtags joined by "-".
    if variants is not None:
        total += len(variants.split("-"))

    return total
# Dictionary of legacy mappings, contains raw rules, e.g.
# (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97").
legacy_mappings = {}
# Dictionary of simple language subtag mappings, e.g. "in" -> "id".
language_mappings = {}
# Dictionary of complex language subtag mappings, modifying more than one
# subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
complex_language_mappings = {}
# Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh".
script_mappings = {}
# Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
region_mappings = {}
# Dictionary of complex region subtag mappings, containing more than one
# replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
complex_region_mappings = {}
# Dictionary of aliased variant subtags to a tuple of preferred replacement
# type and replacement, e.g. "arevela" -> ("language", "hy") or
# "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
variant_mappings = {}
# Preprocess all rules so we can perform a single lookup per subtag at runtime.
for type, replacement
in rules.items():
(language, script, region, variants) = type
(r_language, r_script, r_region, r_variants) = replacement
type_map_size = multi_map_size(type)
# Most mappings are one-to-one and can be encoded through lookup tables.
if type_map_size == 1:
if language
is not None:
assert r_language
is not None,
"Can't remove a language subtag"
# We don't yet support this case.
assert (
r_variants
is None
), f
"Unhandled variant replacement in language alias: {replacement}"
if replacement == (Any,
None,
None,
None):
language_mappings[language] = r_language
else:
complex_language_mappings[language] = replacement[:-1]
elif script
is not None:
# We don't support removing script subtags.
assert (
r_script
is not None
), f
"Can't remove a script subtag: {replacement}"
# We only support one-to-one script mappings for now.
assert replacement == (
None,
Any,
None,
None,
), f
"Unhandled replacement in script alias: {replacement}"
script_mappings[script] = r_script
elif region
is not None:
# We don't support removing region subtags.
assert (
r_region
is not None
), f
"Can't remove a region subtag: {replacement}"
# We only support one-to-one region mappings for now.
assert replacement == (
None,
None,
Any,
None,
), f
"Unhandled replacement in region alias: {replacement}"
if type
not in territory_exception_rules:
region_mappings[region] = r_region
else:
complex_region_mappings[region] = [
r_region
for (_, _, r_region, _)
in territory_exception_rules[type]
]
else:
assert variants
is not None
assert len(variants.split(
"-")) == 1
# We only support one-to-one variant mappings for now.
assert (
multi_map_size(replacement) <= 1
), f
"Unhandled replacement in variant alias: {replacement}"
if r_language
is not None:
variant_mappings[variants] = (
"language", r_language)
elif r_script
is not None:
variant_mappings[variants] = (
"script", r_script)
elif r_region
is not None:
variant_mappings[variants] = (
"region", r_region)
elif r_variants
is not None:
assert len(r_variants.split(
"-")) == 1
variant_mappings[variants] = (
"variant", r_variants)
else:
variant_mappings[variants] =
None
else:
# Alias rules which have multiple input fields must be processed
# first. This applies only to a handful of rules, so our generated
# code adds fast paths to skip these rules in the common case.
# Case 1: Language and at least one variant subtag.
if language
is not None and variants
is not None:
pass
# Case 2: Sign language and a region subtag.
elif language ==
"sgn" and region
is not None:
pass
# Case 3: "hepburn-heploc" to "alalc97" canonicalization.
elif (
language
is None
and variants
is not None
and len(variants.split(
"-")) == 2
):
pass
# Any other combination is currently unsupported.
else:
raise ValueError(f
"{type} -> {replacement}")
legacy_mappings[type] = replacement
tree = ET.parse(core_file.open(
"common/supplemental/likelySubtags.xml"))
likely_subtags = {}
for likely_subtag
in tree.iterfind(
".//likelySubtag"):
from_tag = bcp47_id(likely_subtag.get(
"from"))
from_match = re_unicode_language_id.match(from_tag)
assert (
from_match
is not None
), f
"{from_tag} invalid Unicode BCP 47 locale identifier"
assert (
from_match.group(
"variants")
is None
), f
"unexpected variant subtags in {from_tag}"
to_tag = bcp47_id(likely_subtag.get(
"to"))
to_match = re_unicode_language_id.match(to_tag)
assert (
to_match
is not None
), f
"{to_tag} invalid Unicode BCP 47 locale identifier"
assert (
to_match.group(
"variants")
is None
), f
"unexpected variant subtags in {to_tag}"
from_canonical = bcp47_canonical(
*from_match.group(
"language",
"script",
"region",
"variants")
)
to_canonical = bcp47_canonical(
*to_match.group(
"language",
"script",
"region",
"variants")
)
# Remove the empty variant subtags.
from_canonical = from_canonical[:-1]
to_canonical = to_canonical[:-1]
likely_subtags[from_canonical] = to_canonical
complex_region_mappings_final = {}
for deprecated_region, replacements
in complex_region_mappings.items():
# Find all likely subtag entries which don't already contain a region
# subtag and whose target region is in the list of replacement regions.
region_likely_subtags = [
(from_language, from_script, to_region)
for (
(from_language, from_script, from_region),
(_, _, to_region),
)
in likely_subtags.items()
if from_region
is None and to_region
in replacements
]
# The first replacement entry is the default region.
default = replacements[0]
# Find all likely subtag entries whose region matches the default region.
default_replacements = {
(language, script)
for (language, script, region)
in region_likely_subtags
if region == default
}
# And finally find those entries which don't use the default region.
# These are the entries we're actually interested in, because those need
# to be handled specially when selecting the correct preferred region.
non_default_replacements = [
(language, script, region)
for (language, script, region)
in region_likely_subtags
if (language, script)
not in default_replacements
]
# Remove redundant mappings.
#
# For example starting with CLDR 43, the deprecated region "SU" has the
# following non-default replacement entries for "GE":
# - ('sva', None, 'GE')
# - ('sva', 'Cyrl', 'GE')
# - ('sva', 'Latn', 'GE')
#
# The latter two entries are redundant, because they're already handled
# by the first entry.
non_default_replacements = [
(language, script, region)
for (language, script, region)
in non_default_replacements
if script
is None
or (language,
None, region)
not in non_default_replacements
]
# If there are no non-default replacements, we can handle the region as
# part of the simple region mapping.
if non_default_replacements:
complex_region_mappings_final[deprecated_region] = (
default,
non_default_replacements,
)
else:
region_mappings[deprecated_region] = default
return {
"legacyMappings": legacy_mappings,
"languageMappings": language_mappings,
"complexLanguageMappings": complex_language_mappings,
"scriptMappings": script_mappings,
"regionMappings": region_mappings,
"complexRegionMappings": complex_region_mappings_final,
"variantMappings": variant_mappings,
"likelySubtags": likely_subtags,
}
def readUnicodeExtensions(core_file):
    """Collect deprecated-to-preferred Unicode extension value mappings.

    Parses every archive member matching ``^common/bcp47/.+\\.xml$`` as well
    as ``common/supplemental/supplementalMetadata.xml``.

    Args:
        core_file: Archive object exposing ``namelist()`` and ``open(name)``
            (presumably the CLDR core ``ZipFile`` -- confirm with callers).

    Returns:
        A dict with two entries, "unicodeMappings" (the "u" extension) and
        "transformMappings" (the "t" extension). Each maps an extension key
        name to a dict of deprecated value -> preferred value.

    Raises:
        AssertionError: If an extension type, type name, preferred value or
            subdivision replacement is not of the expected form.
    """
    import xml.etree.ElementTree as ET

    # Match all xml-files in the BCP 47 directory.
    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")

    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
    #
    # type = alphanum{3,8} (sep alphanum{3,8})* ;
    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")

    # https://www.unicode.org/reports/tr35/#Unicode_language_identifier
    #
    # unicode_region_subtag = alpha{2} ;
    alphaRegionRE = re.compile(r"^[A-Z]{2}$", re.IGNORECASE)

    # Mapping from Unicode extension types to dict of deprecated to
    # preferred values.
    mapping = {
        # Unicode BCP 47 U Extension
        "u": {},
        # Unicode BCP 47 T Extension
        "t": {},
    }

    def readBCP47File(file):
        """Record the 'preferred' and 'alias' replacements of one BCP 47 file."""
        tree = ET.parse(file)
        for keyword in tree.iterfind(".//keyword/key"):
            # Keys without an explicit extension attribute belong to "u".
            extension = keyword.get("extension", "u")
            assert (
                extension == "u" or extension == "t"
            ), "unknown extension type: {}".format(extension)

            extension_name = keyword.get("name")

            for type_element in keyword.iterfind("type"):
                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The key or type name used by Unicode locale extension with 'u' extension
                # syntax or the 't' extensions syntax. When alias below is absent, this name
                # can be also used with the old style "@key=type" syntax.
                name = type_element.get("name")

                # Ignore the special name:
                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
                # - <https://unicode.org/reports/tr35/#SCRIPT_CODE>
                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
                if name in (
                    "CODEPOINTS",
                    "REORDER_CODE",
                    "RG_KEY_VALUE",
                    "SCRIPT_CODE",
                    "SUBDIVISION_CODE",
                    "PRIVATE_USE",
                ):
                    continue

                # All other names should match the 'type' production.
                assert (
                    typeRE.match(name) is not None
                ), "{} matches the 'type' production".format(name)

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The preferred value of the deprecated key, type or attribute element.
                # When a key, type or attribute element is deprecated, this attribute is
                # used for specifying a new canonical form if available.
                preferred = type_element.get("preferred")

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The BCP 47 form is the canonical form, and recommended. Other aliases are
                # included only for backwards compatibility.
                alias = type_element.get("alias")

                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
                #
                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
                # canonical forms. See Section 3.6.4 U Extension Data Files and Section
                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
                # value, while the canonical is in the name attribute value.

                # 'preferred' contains the new preferred name, 'alias' the compatibility
                # name, but then there's this entry where 'preferred' and 'alias' are the
                # same. So which one to choose? Assume 'preferred' is the actual canonical
                # name.
                #
                # <type name="islamicc"
                #       description="Civil (algorithmic) Arabic calendar"
                #       deprecated="true"
                #       preferred="islamic-civil"
                #       alias="islamic-civil"/>
                if preferred is not None:
                    assert typeRE.match(preferred), preferred
                    mapping[extension].setdefault(extension_name, {})[name] = preferred
                    # |name| has just been recorded as deprecated, so none of
                    # its aliases may map back to it.
                    continue

                if alias is None:
                    continue

                for alias_name in alias.lower().split(" "):
                    # Ignore alias entries which don't match the 'type' production.
                    if typeRE.match(alias_name) is None:
                        continue

                    # Skip over entries where 'name' and 'alias' are equal.
                    #
                    # <type name="pst8pdt"
                    #       description="POSIX style time zone for US Pacific Time"
                    #       alias="PST8PDT"
                    #       since="1.8"/>
                    if name == alias_name:
                        continue

                    mapping[extension].setdefault(extension_name, {})[alias_name] = name

    def readSupplementalMetadata(file):
        """Record subdivision alias replacements for the 'rg' and 'sd' keys."""
        # Find subdivision and region replacements.
        #
        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
        #
        # Replace aliases in special key values:
        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
        #     in its value in the same way, using subdivisionAlias data.
        tree = ET.parse(file)
        for alias in tree.iterfind(".//subdivisionAlias"):
            alias_type = alias.get("type")
            assert (
                typeRE.match(alias_type) is not None
            ), "{} matches the 'type' production".format(alias_type)

            # Take the first replacement when multiple ones are present.
            replacement = alias.get("replacement").split(" ")[0].lower()

            # Append "zzzz" if the replacement is a two-letter region code.
            if alphaRegionRE.match(replacement) is not None:
                replacement += "zzzz"

            # Assert the replacement is syntactically correct.
            assert (
                typeRE.match(replacement) is not None
            ), "replacement {} matches the 'type' production".format(replacement)

            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
            mapping["u"].setdefault("rg", {})[alias_type] = replacement
            mapping["u"].setdefault("sd", {})[alias_type] = replacement

    # ET.parse() does not close file objects handed to it, so every archive
    # member is opened in a |with| block to release its handle promptly.
    for name in core_file.namelist():
        if bcpFileRE.match(name):
            with core_file.open(name) as bcp47_file:
                readBCP47File(bcp47_file)

    with core_file.open(
        "common/supplemental/supplementalMetadata.xml"
    ) as metadata_file:
        readSupplementalMetadata(metadata_file)

    return {
        "unicodeMappings": mapping["u"],
        "transformMappings": mapping["t"],
    }
def writeCLDRLanguageTagData(println, data, url):
    """Emit the generated C++ source for the CLDR language tag mappings.

    All output goes through ``println``: first the generated-file warning,
    the CLDR version taken from ``data["version"]`` and ``url``, then a
    fixed C++ prelude, and finally one section per mapping table found in
    ``data``, each produced by the corresponding ``write*`` helper.
    """
    # Fixed C++ prelude: includes, lookup templates and DEBUG-only helpers.
    cpp_prelude = """
#include "mozilla/Assertions.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <string>
#include <type_traits>
#include "mozilla/intl/Locale.h"
using namespace mozilla::intl::LanguageTagLimits;
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
const char (&subtags)[Length][TagLength],
const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
MOZ_ASSERT(subtag.Length() == TagLength - 1,
"subtag must have the same length as the list of subtags");
const char* ptr = subtag.Span().data();
return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
[](const char* a, const char* b) {
return memcmp(a, b, TagLength - 1) < 0;
});
}
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
const char (&subtags)[Length][TagLength], const char* (&aliases)[Length],
const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
MOZ_ASSERT(subtag.Length() == TagLength - 1,
"subtag must have the same length as the list of subtags");
const char* ptr = subtag.Span().data();
auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
[](const char* a, const char* b) {
return memcmp(a, b, TagLength - 1) < 0;
});
if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
return aliases[std::distance(std::begin(subtags), p)];
}
return nullptr;
}
#ifdef DEBUG
static bool IsAsciiLowercaseAlphanumeric(char c) {
return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
}
static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
return IsAsciiLowercaseAlphanumeric(c) || c ==
'-';
}
static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
return std::all_of(span.begin(), span.end(),
mozilla::IsAsciiLowercaseAlpha<char>);
}
static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
std::all_of(span.begin() + 1, span.end(),
mozilla::IsAsciiLowercaseAlpha<char>);
}
static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
return std::all_of(span.begin(), span.end(),
mozilla::IsAsciiUppercaseAlpha<char>) ||
std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
}
static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
}
static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}
static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
return std::all_of(type.begin(), type.end(),
IsAsciiLowercaseAlphanumericOrDash);
}
static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}
static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
return std::all_of(type.begin(), type.end(),
IsAsciiLowercaseAlphanumericOrDash);
}
#endif
"""

    # File header.
    println(generatedFileWarning)
    println("// Version: CLDR-{}".format(data["version"]))
    println("// URL: {}".format(url))
    println(cpp_prelude.rstrip())

    source = "CLDR Supplemental Data, version {}".format(data["version"])

    # Pull every table out of |data| up front.
    legacy_mappings = data["legacyMappings"]
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    variant_mappings = data["variantMappings"]
    unicode_mappings = data["unicodeMappings"]
    transform_mappings = data["transformMappings"]

    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    language_maxlength = 8

    # unicode_script_subtag = alpha{4} ;
    script_maxlength = 4

    # unicode_region_subtag = (alpha{2} | digit{3}) ;
    region_maxlength = 3

    # Arguments for each writeMappingsBinarySearch() call, in output order;
    # |println| is prepended and |source|, |url| are appended below.
    binary_search_tables = (
        (
            "LanguageMapping",
            "LanguageSubtag&",
            "language",
            "IsStructurallyValidLanguageTag",
            "IsCanonicallyCasedLanguageTag",
            language_mappings,
            language_maxlength,
            "Mappings from language subtags to preferred values.",
        ),
        (
            "ComplexLanguageMapping",
            "const LanguageSubtag&",
            "language",
            "IsStructurallyValidLanguageTag",
            "IsCanonicallyCasedLanguageTag",
            complex_language_mappings.keys(),
            language_maxlength,
            "Language subtags with complex mappings.",
        ),
        (
            "ScriptMapping",
            "ScriptSubtag&",
            "script",
            "IsStructurallyValidScriptTag",
            "IsCanonicallyCasedScriptTag",
            script_mappings,
            script_maxlength,
            "Mappings from script subtags to preferred values.",
        ),
        (
            "RegionMapping",
            "RegionSubtag&",
            "region",
            "IsStructurallyValidRegionTag",
            "IsCanonicallyCasedRegionTag",
            region_mappings,
            region_maxlength,
            "Mappings from region subtags to preferred values.",
        ),
        (
            "ComplexRegionMapping",
            "const RegionSubtag&",
            "region",
            "IsStructurallyValidRegionTag",
            "IsCanonicallyCasedRegionTag",
            complex_region_mappings.keys(),
            region_maxlength,
            "Region subtags with complex mappings.",
        ),
    )
    for table_args in binary_search_tables:
        writeMappingsBinarySearch(println, *table_args, source, url)

    # Tables that need dedicated writers.
    writeComplexLanguageTagMappings(
        println,
        complex_language_mappings,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeComplexRegionTagMappings(
        println,
        complex_region_mappings,
        "Region subtags with complex mappings.",
        source,
        url,
    )
    writeVariantTagMappings(
        println,
        variant_mappings,
        "Mappings from variant subtags to preferred values.",
        source,
        url,
    )

    # Both legacy writers consume the same legacy mapping table.
    writeLegacyMappingsFunction(
        println,
        legacy_mappings,
        "Canonicalize legacy locale identifiers.",
        source,
        url,
    )
    writeSignLanguageMappingsFunction(
        println,
        legacy_mappings,
        "Mappings from legacy sign languages.",
        source,
        url,
    )

    # Unicode ("u") and transform ("t") extension tables.
    for extension_mappings, extension_label in (
        (unicode_mappings, "Unicode"),
        (transform_mappings, "Transform"),
    ):
        writeUnicodeExtensionsMappings(println, extension_mappings, extension_label)
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
"""Writes the likely-subtags test file."""
println(generatedFileWarning)
source =
"CLDR Supplemental Data, version {}".format(data[
"version"])
language_mappings = data[
"languageMappings"]
complex_language_mappings = data[
"complexLanguageMappings"]
script_mappings = data[
"scriptMappings"]
region_mappings = data[
"regionMappings"]
complex_region_mappings = data[
"complexRegionMappings"]
likely_subtags = data[
"likelySubtags"]
def bcp47(tag):
    """Format a (language, script, region) tuple as a "-"-joined tag string.

    Script and region are appended, each preceded by "-", only when truthy.
    """
    language, script, region = tag

    script_suffix = ""
    if script:
        script_suffix = "-" + script

    region_suffix = ""
    if region:
        region_suffix = "-" + region

    return "{}{}{}".format(language, script_suffix, region_suffix)
def canonical(tag):
    """Replace deprecated subtags of a (language, script, region) tuple.

    Uses the language, complex-language, script and region mapping tables of
    the enclosing scope and returns the updated three-element tuple.
    """
    (language, script, region) = tag

    # Map deprecated language subtags.
    if language in language_mappings:
        language = language_mappings[language]
    elif language in complex_language_mappings:
        # A complex mapping may also supply a script and region; those are
        # only used when the tag doesn't already carry its own.
        (new_language, fallback_script, fallback_region) = complex_language_mappings[
            language
        ]
        language = new_language
        script = script or fallback_script
        region = region or fallback_region

    # Map deprecated script subtags.
    script = script_mappings[script] if script in script_mappings else script

    # Map deprecated region subtags.
    if region not in region_mappings:
        # Assume no complex region mappings are needed for now.
        assert (
            region not in complex_region_mappings
        ), "unexpected region with complex mappings: {}".format(region)
    else:
        region = region_mappings[region]

    return (language, script, region)
# https://unicode.org/reports/tr35/#Likely_Subtags
def addLikelySubtags(tag):
    """Fill in subtags of a (language, script, region) tuple from likely_subtags.

    The tag is canonicalized, "Zzzz" / "ZZ" placeholders are treated as
    absent, and the first candidate key found in ``likely_subtags`` supplies
    replacements for the fields that equal that key's fields. Raises
    StopIteration when no candidate key is present.
    """
    # Step 1: Canonicalize.
    (language, script, region) = canonical(tag)
    script = None if script == "Zzzz" else script
    region = None if region == "ZZ" else region

    # Step 2: Lookup. Candidates are tried in this order.
    candidates = (
        (language, script, region),
        (language, script, None),
        (language, None, region),
        (language, None, None),
    )
    matched_key = next(filter(likely_subtags.__contains__, candidates))

    (key_language, key_script, key_region) = matched_key
    (likely_language, likely_script, likely_region) = likely_subtags[matched_key]

    # Step 3: Return. A field that equals the matched key's field takes the
    # likely value; any other field is kept as-is.
    result_language = likely_language if language == key_language else language
    result_script = likely_script if script == key_script else script
    result_region = likely_region if region == key_region else region
    return (result_language, result_script, result_region)
# https://unicode.org/reports/tr35/#Likely_Subtags
def removeLikelySubtags(tag):
# Step 1: Add likely subtags.
--> --------------------
--> maximum size reached
--> --------------------