/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
/**
 * Translate the ICU general category of a code point into the matching
 * css::i18n::UnicodeType constant.
 *
 * The lookup is stateless: every call asks ICU (u_charType) for the category
 * and maps it through the switch below. A one-entry static cache is
 * deliberately not used here, because a cache seeded with (0, 0) would answer
 * the very first query for U+0000 with the placeholder value instead of the
 * ICU-derived one, would keep a stale result whenever ICU reported a category
 * without a case label, and would make the function non-reentrant.
 *
 * @param ch  UTF-32 code point to classify.
 * @return    the css::i18n::UnicodeType constant for ch; UNASSIGNED when ICU
 *            reports a category that has no mapping in the switch.
 */
sal_Int16
unicode::getUnicodeType(const sal_uInt32 ch)
{
    switch (u_charType(ch))
    {
    case U_UNASSIGNED:
        return css::i18n::UnicodeType::UNASSIGNED;
    case U_UPPERCASE_LETTER:
        return css::i18n::UnicodeType::UPPERCASE_LETTER;
    case U_LOWERCASE_LETTER:
        return css::i18n::UnicodeType::LOWERCASE_LETTER;
    case U_TITLECASE_LETTER:
        return css::i18n::UnicodeType::TITLECASE_LETTER;
    case U_MODIFIER_LETTER:
        return css::i18n::UnicodeType::MODIFIER_LETTER;
    case U_OTHER_LETTER:
        return css::i18n::UnicodeType::OTHER_LETTER;
    case U_NON_SPACING_MARK:
        return css::i18n::UnicodeType::NON_SPACING_MARK;
    case U_ENCLOSING_MARK:
        return css::i18n::UnicodeType::ENCLOSING_MARK;
    case U_COMBINING_SPACING_MARK:
        return css::i18n::UnicodeType::COMBINING_SPACING_MARK;
    case U_DECIMAL_DIGIT_NUMBER:
        return css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER;
    case U_LETTER_NUMBER:
        return css::i18n::UnicodeType::LETTER_NUMBER;
    case U_OTHER_NUMBER:
        return css::i18n::UnicodeType::OTHER_NUMBER;
    case U_SPACE_SEPARATOR:
        return css::i18n::UnicodeType::SPACE_SEPARATOR;
    case U_LINE_SEPARATOR:
        return css::i18n::UnicodeType::LINE_SEPARATOR;
    case U_PARAGRAPH_SEPARATOR:
        return css::i18n::UnicodeType::PARAGRAPH_SEPARATOR;
    case U_CONTROL_CHAR:
        return css::i18n::UnicodeType::CONTROL;
    case U_FORMAT_CHAR:
        return css::i18n::UnicodeType::FORMAT;
    case U_PRIVATE_USE_CHAR:
        return css::i18n::UnicodeType::PRIVATE_USE;
    case U_SURROGATE:
        return css::i18n::UnicodeType::SURROGATE;
    case U_DASH_PUNCTUATION:
        return css::i18n::UnicodeType::DASH_PUNCTUATION;
    case U_INITIAL_PUNCTUATION:
        return css::i18n::UnicodeType::INITIAL_PUNCTUATION;
    case U_FINAL_PUNCTUATION:
        return css::i18n::UnicodeType::FINAL_PUNCTUATION;
    case U_CONNECTOR_PUNCTUATION:
        return css::i18n::UnicodeType::CONNECTOR_PUNCTUATION;
    case U_OTHER_PUNCTUATION:
        return css::i18n::UnicodeType::OTHER_PUNCTUATION;
    case U_MATH_SYMBOL:
        return css::i18n::UnicodeType::MATH_SYMBOL;
    case U_CURRENCY_SYMBOL:
        return css::i18n::UnicodeType::CURRENCY_SYMBOL;
    case U_MODIFIER_SYMBOL:
        return css::i18n::UnicodeType::MODIFIER_SYMBOL;
    case U_OTHER_SYMBOL:
        return css::i18n::UnicodeType::OTHER_SYMBOL;
    case U_START_PUNCTUATION:
        return css::i18n::UnicodeType::START_PUNCTUATION;
    case U_END_PUNCTUATION:
        return css::i18n::UnicodeType::END_PUNCTUATION;
    default:
        // A category this table does not know about: report it as
        // unassigned rather than returning an unrelated, stale value.
        return css::i18n::UnicodeType::UNASSIGNED;
    }
}
sal_uInt8
unicode::getUnicodeDirection( const sal_Unicode ch ) { static sal_Unicode c = 0x00; static sal_uInt8 r = 0x00;
/**
 * Classify an ICU script code into one of the ScriptType classes
 * (WEAK, LATIN, ASIAN or COMPLEX).
 *
 * Scripts are listed explicitly per class; any script code that is not
 * listed ends up in the COMPLEX class via the default label.
 *
 * @param eScript  ICU script code, see unicode/uscript.h
 * @return         the ScriptType constant for eScript
 */
sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
{
    //See unicode/uscript.h
    switch (eScript)
    {
        // Codes that do not identify a concrete script
        case USCRIPT_INVALID_CODE:
        case USCRIPT_COMMON:
        case USCRIPT_INHERITED:
        case USCRIPT_UNWRITTEN_LANGUAGES:
        case USCRIPT_UNKNOWN:
        case USCRIPT_MATHEMATICAL_NOTATION:
        case USCRIPT_SYMBOLS:
        case USCRIPT_CODE_LIMIT:
            return ScriptType::WEAK;

        // Scripts handled as the LATIN class
        case USCRIPT_ARMENIAN:
        case USCRIPT_CHEROKEE:
        case USCRIPT_COPTIC:
        case USCRIPT_CYRILLIC:
        case USCRIPT_GEORGIAN:
        case USCRIPT_GOTHIC:
        case USCRIPT_GREEK:
        case USCRIPT_LATIN:
        case USCRIPT_OGHAM:
        case USCRIPT_OLD_ITALIC:
        case USCRIPT_RUNIC:
        case USCRIPT_CANADIAN_ABORIGINAL:
        case USCRIPT_BRAILLE:
        case USCRIPT_CYPRIOT:
        case USCRIPT_OSMANYA:
        case USCRIPT_SHAVIAN:
        case USCRIPT_KATAKANA_OR_HIRAGANA:
        case USCRIPT_GLAGOLITIC:
        case USCRIPT_CIRTH:
        case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
        case USCRIPT_OLD_HUNGARIAN:
        case USCRIPT_LATIN_FRAKTUR:
        case USCRIPT_LATIN_GAELIC:
            return ScriptType::LATIN;

        // Scripts handled as the ASIAN class
        case USCRIPT_BOPOMOFO:
        case USCRIPT_HAN:
        case USCRIPT_HANGUL:
        case USCRIPT_HIRAGANA:
        case USCRIPT_KATAKANA:
        case USCRIPT_YI:
        case USCRIPT_SIMPLIFIED_HAN:
        case USCRIPT_TRADITIONAL_HAN:
        case USCRIPT_JAPANESE:
        case USCRIPT_KOREAN:
        case USCRIPT_TANGUT:
        case USCRIPT_KHITAN_SMALL_SCRIPT:
            return ScriptType::ASIAN;

        // Scripts handled as the COMPLEX class
        case USCRIPT_ARABIC:
        case USCRIPT_BENGALI:
        case USCRIPT_DESERET:
        case USCRIPT_DEVANAGARI:
        case USCRIPT_ETHIOPIC:
        case USCRIPT_GUJARATI:
        case USCRIPT_GURMUKHI:
        case USCRIPT_HEBREW:
        case USCRIPT_KANNADA:
        case USCRIPT_KHMER:
        case USCRIPT_LAO:
        case USCRIPT_MALAYALAM:
        case USCRIPT_MONGOLIAN:
        case USCRIPT_MYANMAR:
        case USCRIPT_ORIYA:
        case USCRIPT_SINHALA:
        case USCRIPT_SYRIAC:
        case USCRIPT_TAMIL:
        case USCRIPT_TELUGU:
        case USCRIPT_THAANA:
        case USCRIPT_THAI:
        case USCRIPT_TIBETAN:
        case USCRIPT_TAGALOG:
        case USCRIPT_HANUNOO:
        case USCRIPT_BUHID:
        case USCRIPT_TAGBANWA:
        case USCRIPT_LIMBU:
        case USCRIPT_LINEAR_B:
        case USCRIPT_TAI_LE:
        case USCRIPT_UGARITIC:
        case USCRIPT_BUGINESE:
        case USCRIPT_KHAROSHTHI:
        case USCRIPT_SYLOTI_NAGRI:
        case USCRIPT_NEW_TAI_LUE:
        case USCRIPT_TIFINAGH:
        case USCRIPT_OLD_PERSIAN:
        case USCRIPT_BALINESE:
        case USCRIPT_BATAK:
        case USCRIPT_BLISSYMBOLS:
        case USCRIPT_BRAHMI:
        case USCRIPT_CHAM:
        case USCRIPT_DEMOTIC_EGYPTIAN:
        case USCRIPT_HIERATIC_EGYPTIAN:
        case USCRIPT_EGYPTIAN_HIEROGLYPHS:
        case USCRIPT_KHUTSURI:
        case USCRIPT_PAHAWH_HMONG:
        case USCRIPT_HARAPPAN_INDUS:
        case USCRIPT_JAVANESE:
        case USCRIPT_KAYAH_LI:
        case USCRIPT_LEPCHA:
        case USCRIPT_LINEAR_A:
        case USCRIPT_MANDAEAN:
        case USCRIPT_MAYAN_HIEROGLYPHS:
        case USCRIPT_MEROITIC:
        case USCRIPT_NKO:
        case USCRIPT_ORKHON:
        case USCRIPT_OLD_PERMIC:
        case USCRIPT_PHAGS_PA:
        case USCRIPT_PHOENICIAN:
        case USCRIPT_PHONETIC_POLLARD:
        case USCRIPT_RONGORONGO:
        case USCRIPT_SARATI:
        case USCRIPT_ESTRANGELO_SYRIAC:
        case USCRIPT_WESTERN_SYRIAC:
        case USCRIPT_EASTERN_SYRIAC:
        case USCRIPT_TENGWAR:
        case USCRIPT_VAI:
        case USCRIPT_VISIBLE_SPEECH:
        case USCRIPT_CUNEIFORM:
        case USCRIPT_CARIAN:
        case USCRIPT_LANNA:
        case USCRIPT_LYCIAN:
        case USCRIPT_LYDIAN:
        case USCRIPT_OL_CHIKI:
        case USCRIPT_REJANG:
        case USCRIPT_SAURASHTRA:
        case USCRIPT_SIGN_WRITING:
        case USCRIPT_SUNDANESE:
        case USCRIPT_MOON:
        case USCRIPT_MEITEI_MAYEK:
        case USCRIPT_IMPERIAL_ARAMAIC:
        case USCRIPT_AVESTAN:
        case USCRIPT_CHAKMA:
        case USCRIPT_KAITHI:
        case USCRIPT_MANICHAEAN:
        case USCRIPT_INSCRIPTIONAL_PAHLAVI:
        case USCRIPT_PSALTER_PAHLAVI:
        case USCRIPT_BOOK_PAHLAVI:
        case USCRIPT_INSCRIPTIONAL_PARTHIAN:
        case USCRIPT_SAMARITAN:
        case USCRIPT_TAI_VIET:
        case USCRIPT_BAMUM:
        case USCRIPT_LISU:
        case USCRIPT_NAKHI_GEBA:
        case USCRIPT_OLD_SOUTH_ARABIAN:
        case USCRIPT_BASSA_VAH:
        case USCRIPT_DUPLOYAN_SHORTAND:
        case USCRIPT_ELBASAN:
        case USCRIPT_GRANTHA:
        case USCRIPT_KPELLE:
        case USCRIPT_LOMA:
        case USCRIPT_MENDE:
        case USCRIPT_MEROITIC_CURSIVE:
        case USCRIPT_OLD_NORTH_ARABIAN:
        case USCRIPT_NABATAEAN:
        case USCRIPT_PALMYRENE:
        case USCRIPT_SINDHI:
        case USCRIPT_WARANG_CITI:
        default: // anything new is going to be pretty wild
            return ScriptType::COMPLEX;
    }
}
/**
 * Determine the ScriptType class for a language tag.
 *
 * If the tag carries a script subtag, that script name is resolved with
 * u_getPropertyValueEnum(UCHAR_SCRIPT, ...). Otherwise the language, or
 * "language-country" when a country is present, is handed to
 * uscript_getCode() and the first script it reports is used.
 *
 * @param rLanguageTag  the language tag to classify
 * @return  the class computed by getScriptClassFromUScriptCode() for the
 *          resolved script; css::i18n::ScriptType::LATIN when
 *          uscript_getCode() reports an error or no script at all
 */
sal_Int16 unicode::getScriptClassFromLanguageTag( const LanguageTag& rLanguageTag )
{
    constexpr int32_t nBuf = 42;
    UScriptCode aBuf[nBuf];
    if (rLanguageTag.hasScript())
    {
        aBuf[0] = static_cast<UScriptCode>(u_getPropertyValueEnum( UCHAR_SCRIPT,
                    OUStringToOString( rLanguageTag.getScript(), RTL_TEXTENCODING_ASCII_US).getStr()));
    }
    else
    {
        OUString aName;
        if (rLanguageTag.getCountry().isEmpty())
            aName = rLanguageTag.getLanguage();
        else
            aName = rLanguageTag.getLanguage() + "-" + rLanguageTag.getCountry();
        UErrorCode status = U_ZERO_ERROR;
        const int32_t nScripts = uscript_getCode(
                OUStringToOString( aName, RTL_TEXTENCODING_ASCII_US).getStr(),
                aBuf, nBuf, &status);
        // U_BUFFER_OVERFLOW_ERROR would be set with too many scripts for buffer
        // and required capacity returned, but really..
        // This guard must stay live code: without it aBuf[0] would be read
        // uninitialized whenever the lookup fails or yields nothing.
        if (nScripts == 0 || !U_SUCCESS(status))
            return css::i18n::ScriptType::LATIN;
    }
    return getScriptClassFromUScriptCode( aBuf[0]);
}
/**
 * Return an exemplar language tag for an ICU script code, i.e. a language
 * (optionally with a script subtag) that is written in that script.
 *
 * "zxx" is returned for codes that denote no linguistic content, "und" for
 * codes that denote an undetermined script, and "mis-Xxxx" style tags where
 * the mapping below has no language code for the script.
 *
 * @param eScript  ICU script code, see unicode/uscript.h
 * @return  the exemplar language tag; an empty string for a script code that
 *          has no case label below (the switch deliberately has no default)
 */
OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
{
    OString sRet;
    switch (eScript)
    {
        case USCRIPT_CODE_LIMIT:
        case USCRIPT_INVALID_CODE:
        case USCRIPT_MATHEMATICAL_NOTATION:
        case USCRIPT_SYMBOLS:
            sRet = "zxx"_ostr;
            break;
        case USCRIPT_COMMON:
        case USCRIPT_INHERITED:
        case USCRIPT_UNWRITTEN_LANGUAGES:
        case USCRIPT_UNKNOWN:
            sRet = "und"_ostr;
            break;
        case USCRIPT_ARABIC:
            sRet = "ar"_ostr;
            break;
        case USCRIPT_ARMENIAN:
            sRet = "hy"_ostr;
            break;
        case USCRIPT_BENGALI:
            sRet = "bn"_ostr;
            break;
        case USCRIPT_BOPOMOFO:
            sRet = "zh"_ostr;
            break;
        case USCRIPT_CHEROKEE:
            sRet = "chr"_ostr;
            break;
        case USCRIPT_COPTIC:
            sRet = "cop"_ostr;
            break;
        case USCRIPT_CYRILLIC:
            sRet = "ru"_ostr;
            break;
        case USCRIPT_DESERET:
            sRet = "en"_ostr;
            break;
        case USCRIPT_DEVANAGARI:
            sRet = "hi"_ostr;
            break;
        case USCRIPT_ETHIOPIC:
            sRet = "am"_ostr;
            break;
        case USCRIPT_GEORGIAN:
        case USCRIPT_KHUTSURI:
            sRet = "ka"_ostr;
            break;
        case USCRIPT_GOTHIC:
            sRet = "got"_ostr;
            break;
        case USCRIPT_GREEK:
            sRet = "el"_ostr;
            break;
        case USCRIPT_GUJARATI:
        case USCRIPT_KHOJKI:
            sRet = "gu"_ostr;
            break;
        case USCRIPT_GURMUKHI:
            sRet = "pa"_ostr;
            break;
        case USCRIPT_HAN:
            sRet = "zh"_ostr;
            break;
        case USCRIPT_HANGUL:
        case USCRIPT_KOREAN:
        case USCRIPT_JAMO:
            sRet = "ko"_ostr; // Jamo - elements of Hangul Syllables
            break;
        case USCRIPT_HEBREW:
            sRet = "he"_ostr; // ISO 639-1 Hebrew ("hr" would be Croatian)
            break;
        case USCRIPT_HIRAGANA:
            sRet = "ja"_ostr;
            break;
        case USCRIPT_KANNADA:
            sRet = "kn"_ostr;
            break;
        case USCRIPT_KATAKANA:
            sRet = "ja"_ostr;
            break;
        case USCRIPT_KHMER:
            sRet = "km"_ostr;
            break;
        case USCRIPT_LAO:
            sRet = "lo"_ostr;
            break;
        case USCRIPT_LATIN:
            sRet = "en"_ostr;
            break;
        case USCRIPT_MALAYALAM:
            sRet = "ml"_ostr;
            break;
        case USCRIPT_MONGOLIAN:
            sRet = "mn"_ostr;
            break;
        case USCRIPT_MYANMAR:
            sRet = "my"_ostr;
            break;
        case USCRIPT_OGHAM:
            sRet = "pgl"_ostr;
            break;
        case USCRIPT_OLD_ITALIC:
            sRet = "osc"_ostr;
            break;
        case USCRIPT_ORIYA:
            sRet = "or"_ostr;
            break;
        case USCRIPT_RUNIC:
            sRet = "ang"_ostr;
            break;
        case USCRIPT_SINHALA:
            sRet = "si"_ostr;
            break;
        case USCRIPT_SYRIAC:
        case USCRIPT_ESTRANGELO_SYRIAC:
            sRet = "syr"_ostr;
            break;
        case USCRIPT_TAMIL:
        case USCRIPT_GRANTHA:
            sRet = "ta"_ostr;
            break;
        case USCRIPT_TELUGU:
            sRet = "te"_ostr;
            break;
        case USCRIPT_THAANA:
            sRet = "dv"_ostr;
            break;
        case USCRIPT_THAI:
            sRet = "th"_ostr;
            break;
        case USCRIPT_TIBETAN:
            sRet = "bo"_ostr;
            break;
        case USCRIPT_CANADIAN_ABORIGINAL:
            sRet = "iu"_ostr;
            break;
        case USCRIPT_YI:
            sRet = "ii"_ostr;
            break;
        case USCRIPT_TAGALOG:
            sRet = "tl"_ostr;
            break;
        case USCRIPT_HANUNOO:
            sRet = "hnn"_ostr;
            break;
        case USCRIPT_BUHID:
            sRet = "bku"_ostr;
            break;
        case USCRIPT_TAGBANWA:
            sRet = "tbw"_ostr;
            break;
        case USCRIPT_BRAILLE:
            sRet = "en"_ostr;
            break;
        case USCRIPT_CYPRIOT:
            sRet = "ecy"_ostr;
            break;
        case USCRIPT_LIMBU:
            sRet = "lif"_ostr;
            break;
        case USCRIPT_LINEAR_B:
            sRet = "gmy"_ostr;
            break;
        case USCRIPT_OSMANYA:
            sRet = "so"_ostr;
            break;
        case USCRIPT_SHAVIAN:
            sRet = "en"_ostr;
            break;
        case USCRIPT_TAI_LE:
            sRet = "tdd"_ostr;
            break;
        case USCRIPT_UGARITIC:
            sRet = "uga"_ostr;
            break;
        case USCRIPT_KATAKANA_OR_HIRAGANA:
            sRet = "ja"_ostr;
            break;
        case USCRIPT_BUGINESE:
            sRet = "bug"_ostr;
            break;
        case USCRIPT_GLAGOLITIC:
            sRet = "cu"_ostr; // ISO 639-1 Church Slavic ("ch" would be Chamorro)
            break;
        case USCRIPT_KHAROSHTHI:
        case USCRIPT_BRAHMI:
            sRet = "pra"_ostr;
            break;
        case USCRIPT_SYLOTI_NAGRI:
            sRet = "syl"_ostr;
            break;
        case USCRIPT_NEW_TAI_LUE:
            sRet = "khb"_ostr;
            break;
        case USCRIPT_TIFINAGH:
            sRet = "tmh"_ostr;
            break;
        case USCRIPT_OLD_PERSIAN:
            sRet = "peo"_ostr;
            break;
        case USCRIPT_BALINESE:
            sRet = "ban"_ostr;
            break;
        case USCRIPT_BATAK:
            sRet = "btk"_ostr;
            break;
        case USCRIPT_BLISSYMBOLS:
            sRet = "en"_ostr;
            break;
        case USCRIPT_CHAM:
            sRet = "cja"_ostr;
            break;
        case USCRIPT_CIRTH:
        case USCRIPT_TENGWAR:
            sRet = "sjn"_ostr;
            break;
        case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
            sRet = "cu"_ostr;
            break;
        case USCRIPT_DEMOTIC_EGYPTIAN:
        case USCRIPT_HIERATIC_EGYPTIAN:
        case USCRIPT_EGYPTIAN_HIEROGLYPHS:
            sRet = "egy"_ostr;
            break;
        case USCRIPT_SIMPLIFIED_HAN:
            sRet = "zh"_ostr;
            break;
        case USCRIPT_TRADITIONAL_HAN:
            sRet = "zh"_ostr;
            break;
        case USCRIPT_PAHAWH_HMONG:
            sRet = "blu"_ostr;
            break;
        case USCRIPT_OLD_HUNGARIAN:
            sRet = "ohu"_ostr;
            break;
        case USCRIPT_HARAPPAN_INDUS:
            sRet = "xiv"_ostr;
            break;
        case USCRIPT_JAVANESE:
            sRet = "kaw"_ostr;
            break;
        case USCRIPT_KAYAH_LI:
            sRet = "eky"_ostr;
            break;
        case USCRIPT_LATIN_FRAKTUR:
            sRet = "de"_ostr;
            break;
        case USCRIPT_LATIN_GAELIC:
            sRet = "ga"_ostr;
            break;
        case USCRIPT_LEPCHA:
            sRet = "lep"_ostr;
            break;
        case USCRIPT_LINEAR_A:
            sRet = "ecr"_ostr;
            break;
        case USCRIPT_MAYAN_HIEROGLYPHS:
            sRet = "myn"_ostr;
            break;
        case USCRIPT_MEROITIC_CURSIVE:
        case USCRIPT_MEROITIC:
            sRet = "xmr"_ostr;
            break;
        case USCRIPT_NKO:
            sRet = "nqo"_ostr;
            break;
        case USCRIPT_ORKHON:
            sRet = "otk"_ostr;
            break;
        case USCRIPT_OLD_PERMIC:
            sRet = "kv"_ostr;
            break;
        case USCRIPT_PHAGS_PA:
            sRet = "xng"_ostr;
            break;
        case USCRIPT_PHOENICIAN:
            sRet = "phn"_ostr;
            break;
        case USCRIPT_PHONETIC_POLLARD:
            sRet = "hmd"_ostr;
            break;
        case USCRIPT_RONGORONGO:
            sRet = "rap"_ostr;
            break;
        case USCRIPT_SARATI:
            sRet = "qya"_ostr;
            break;
        case USCRIPT_WESTERN_SYRIAC:
            sRet = "tru"_ostr;
            break;
        case USCRIPT_EASTERN_SYRIAC:
            sRet = "aii"_ostr;
            break;
        case USCRIPT_VAI:
            sRet = "vai"_ostr;
            break;
        case USCRIPT_VISIBLE_SPEECH:
            sRet = "en"_ostr;
            break;
        case USCRIPT_CUNEIFORM:
            sRet = "akk"_ostr;
            break;
        case USCRIPT_CARIAN:
            sRet = "xcr"_ostr;
            break;
        case USCRIPT_JAPANESE:
            sRet = "ja"_ostr;
            break;
        case USCRIPT_LANNA:
            sRet = "nod"_ostr;
            break;
        case USCRIPT_LYCIAN:
            sRet = "xlc"_ostr;
            break;
        case USCRIPT_LYDIAN:
            sRet = "xld"_ostr;
            break;
        case USCRIPT_OL_CHIKI:
            sRet = "sat"_ostr;
            break;
        case USCRIPT_REJANG:
            sRet = "rej"_ostr;
            break;
        case USCRIPT_SAURASHTRA:
            sRet = "saz"_ostr;
            break;
        case USCRIPT_SIGN_WRITING:
            sRet = "en"_ostr;
            break;
        case USCRIPT_SUNDANESE:
            sRet = "su"_ostr;
            break;
        case USCRIPT_MOON:
            sRet = "en"_ostr;
            break;
        case USCRIPT_MEITEI_MAYEK:
            sRet = "mni"_ostr;
            break;
        case USCRIPT_IMPERIAL_ARAMAIC:
            sRet = "arc"_ostr;
            break;
        case USCRIPT_AVESTAN:
            sRet = "ae"_ostr;
            break;
        case USCRIPT_CHAKMA:
            sRet = "ccp"_ostr;
            break;
        case USCRIPT_KAITHI:
            sRet = "awa"_ostr;
            break;
        case USCRIPT_MANICHAEAN:
            sRet = "xmn"_ostr;
            break;
        case USCRIPT_INSCRIPTIONAL_PAHLAVI:
        case USCRIPT_PSALTER_PAHLAVI:
        case USCRIPT_BOOK_PAHLAVI:
        case USCRIPT_INSCRIPTIONAL_PARTHIAN:
            sRet = "xpr"_ostr;
            break;
        case USCRIPT_SAMARITAN:
            sRet = "heb"_ostr;
            break;
        case USCRIPT_TAI_VIET:
            sRet = "blt"_ostr;
            break;
        case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
            sRet = "mid"_ostr; // ISO 639-3 Mandaic ("mic" would be Mi'kmaq)
            break;
        case USCRIPT_NABATAEAN:
            sRet = "mis-Nbat"_ostr; // Uncoded with script
            break;
        case USCRIPT_PALMYRENE:
            sRet = "mis-Palm"_ostr; // Uncoded with script
            break;
        case USCRIPT_BAMUM:
            sRet = "bax"_ostr;
            break;
        case USCRIPT_LISU:
            sRet = "lis"_ostr;
            break;
        case USCRIPT_NAKHI_GEBA:
            sRet = "nxq"_ostr;
            break;
        case USCRIPT_OLD_SOUTH_ARABIAN:
            sRet = "xsa"_ostr;
            break;
        case USCRIPT_BASSA_VAH:
            sRet = "bsq"_ostr;
            break;
        case USCRIPT_DUPLOYAN_SHORTAND:
            sRet = "fr"_ostr;
            break;
        case USCRIPT_ELBASAN:
            sRet = "sq"_ostr;
            break;
        case USCRIPT_KPELLE:
            sRet = "kpe"_ostr;
            break;
        case USCRIPT_LOMA:
            sRet = "lom"_ostr;
            break;
        case USCRIPT_MENDE:
            sRet = "men"_ostr;
            break;
        case USCRIPT_OLD_NORTH_ARABIAN:
            sRet = "xna"_ostr;
            break;
        case USCRIPT_SINDHI:
            sRet = "sd"_ostr;
            break;
        case USCRIPT_WARANG_CITI:
            sRet = "hoc"_ostr;
            break;
        case USCRIPT_AFAKA:
            sRet = "djk"_ostr;
            break;
        case USCRIPT_JURCHEN:
            sRet = "juc"_ostr;
            break;
        case USCRIPT_MRO:
            sRet = "cmr"_ostr;
            break;
        case USCRIPT_NUSHU:
            sRet = "mis-Nshu"_ostr; // Uncoded with script
            break;
        case USCRIPT_SHARADA:
            sRet = "sa"_ostr;
            break;
        case USCRIPT_SORA_SOMPENG:
            sRet = "srb"_ostr;
            break;
        case USCRIPT_TAKRI:
            sRet = "doi"_ostr;
            break;
        case USCRIPT_TANGUT:
            sRet = "txg"_ostr;
            break;
        case USCRIPT_WOLEAI:
            sRet = "woe"_ostr;
            break;
        case USCRIPT_ANATOLIAN_HIEROGLYPHS:
            sRet = "hlu"_ostr;
            break;
        case USCRIPT_TIRHUTA:
            sRet = "mai"_ostr;
            break;
        case USCRIPT_CAUCASIAN_ALBANIAN:
            sRet = "xag"_ostr;
            break;
        case USCRIPT_MAHAJANI:
            sRet = "mwr"_ostr;
            break;
        case USCRIPT_AHOM:
            sRet = "aho"_ostr;
            break;
        case USCRIPT_HATRAN:
            sRet = "qly-Hatr"_ostr;
            break;
        case USCRIPT_MODI:
            sRet = "mr-Modi"_ostr;
            break;
        case USCRIPT_MULTANI:
            sRet = "skr-Mult"_ostr; // ISO 15924 code for Multani is "Mult"
            break;
        case USCRIPT_PAU_CIN_HAU:
            sRet = "ctd-Pauc"_ostr;
            break;
        case USCRIPT_SIDDHAM:
            sRet = "sa-Sidd"_ostr;
            break;
        case USCRIPT_ADLAM:
            sRet = "mis-Adlm"_ostr; // Adlam for Fulani, no language code
            break;
        case USCRIPT_BHAIKSUKI:
            sRet = "mis-Bhks"_ostr; // Bhaiksuki for some Buddhist texts, no language code
            break;
        case USCRIPT_MARCHEN:
            sRet = "bo-Marc"_ostr;
            break;
        case USCRIPT_NEWA:
            sRet = "new-Newa"_ostr;
            break;
        case USCRIPT_OSAGE:
            sRet = "osa-Osge"_ostr;
            break;
        case USCRIPT_HAN_WITH_BOPOMOFO:
            sRet = "mis-Hanb"_ostr; // Han with Bopomofo, zh-Hanb ?
            break;
        case USCRIPT_SYMBOLS_EMOJI:
            sRet = "mis-Zsye"_ostr; // Emoji variant
            break;
        case USCRIPT_MASARAM_GONDI:
            sRet = "gon-Gonm"_ostr; // macro language code, could be wsg,esg,gno
            break;
        case USCRIPT_SOYOMBO:
            sRet = "mn-Soyo"_ostr; // abugida to write Mongolian, also Tibetan and Sanskrit
            break;
        case USCRIPT_ZANABAZAR_SQUARE:
            sRet = "mn-Zanb"_ostr; // abugida to write Mongolian
            break;
        case USCRIPT_DOGRA:
            sRet = "dgo"_ostr; // Dogri proper
            break;
        case USCRIPT_GUNJALA_GONDI:
            sRet = "wsg"_ostr; // Adilabad Gondi
            break;
        case USCRIPT_MAKASAR:
            sRet = "mak"_ostr;
            break;
        case USCRIPT_MEDEFAIDRIN:
            sRet = "dmf-Medf"_ostr;
            break;
        case USCRIPT_HANIFI_ROHINGYA:
            sRet = "rhg"_ostr;
            break;
        case USCRIPT_SOGDIAN:
        case USCRIPT_OLD_SOGDIAN:
            sRet = "sog"_ostr;
            break;
        case USCRIPT_ELYMAIC:
            sRet = "arc-Elym"_ostr;
            break;
        case USCRIPT_NYIAKENG_PUACHUE_HMONG:
            sRet = "hmn-Hmnp"_ostr; // macrolanguage code
            break;
        case USCRIPT_NANDINAGARI:
            sRet = "sa-Nand"_ostr;
            break;
        case USCRIPT_WANCHO:
            sRet = "nnp-Wcho"_ostr;
            break;
        case USCRIPT_CHORASMIAN:
            sRet = "xco-Chrs"_ostr;
            break;
        case USCRIPT_DIVES_AKURU:
            sRet = "dv-Diak"_ostr;
            break;
        case USCRIPT_KHITAN_SMALL_SCRIPT:
            sRet = "zkt-Kits"_ostr;
            break;
        case USCRIPT_YEZIDI:
            sRet = "kmr-Yezi"_ostr;
            break;
#if (U_ICU_VERSION_MAJOR_NUM >= 70)
        case USCRIPT_CYPRO_MINOAN:
            sRet = "mis-Cpmn"_ostr; // Uncoded with script
            break;
        case USCRIPT_OLD_UYGHUR:
            sRet = "oui-Ougr"_ostr;
            break;
        case USCRIPT_TANGSA:
            sRet = "nst-Tnsa"_ostr;
            break;
        case USCRIPT_TOTO:
            sRet = "txo-Toto"_ostr;
            break;
        case USCRIPT_VITHKUQI:
            sRet = "sq-Vith"_ostr; // macrolanguage code
            break;
#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 72)
        case USCRIPT_KAWI:
            sRet = "mis-Kawi"_ostr; // Uncoded with script
            break;
        case USCRIPT_NAG_MUNDARI:
            sRet = "unr-Nagm"_ostr;
            break;
#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 75)
        case USCRIPT_ARABIC_NASTALIQ:
            sRet = "fa-Aran"_ostr;
            break;
#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 76)
        case USCRIPT_GARAY:
            sRet = "wo-Gara"_ostr;
            break;
        case USCRIPT_GURUNG_KHEMA:
            sRet = "gvr-Gukh"_ostr;
            break;
        case USCRIPT_KIRAT_RAI:
            sRet = "bap-Krai"_ostr;
            break;
        case USCRIPT_OL_ONAL:
            sRet = "unr-Onao"_ostr;
            break;
        case USCRIPT_SUNUWAR:
            sRet = "suz-Sunu"_ostr;
            break;
        case USCRIPT_TODHRI:
            sRet = "sq-Todr"_ostr;
            break;
        case USCRIPT_TULU_TIGALARI:
            sRet = "sa-Tutg"_ostr;
            break;
#endif
    }
    return sRet;
}
//Format a number as a percentage according to the rules of the given //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
OUString unicode::formatPercent(double dNumber, const LanguageTag &rLangTag)
{ // get a currency formatter for this locale ID
UErrorCode errorCode=U_ZERO_ERROR;
LanguageTag aLangTag(rLangTag);
// As of CLDR Version 24 these languages were not listed as using spacing // between number and % but are reported as such by our l10n groups // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html // so format using French which has the desired rules if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
aLangTag.reset(u"fr-FR"_ustr);
switch ( unicode::getUnicodeType(uChar) )
{ case css::i18n::UnicodeType::SURROGATE: if (bPreventNonHex || mbIsHexString)
{ returnfalse;
}
if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty() )
{
maUtf16.append(sal_Unicode(uChar)); returntrue;
} if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
maUtf16.insert(0, sal_Unicode(uChar)); if (maUtf16.getLength() == 2)
{
assert(rtl::isHighSurrogate(maUtf16[0]) && rtl::isLowSurrogate(maUtf16[1])); // The resulting codepoint may itself be combining, so may allow more
sal_uInt32 nUCS4 = rtl::combineSurrogates(maUtf16[0], maUtf16[1]);
maUtf16.setLength(0); return AllowMoreInput(nUCS4);
} // unexpected order of high/low, so don't accept more if( !maUtf16.isEmpty() )
maInput.append(maUtf16); if( !maCombining.isEmpty() )
maInput.append(maCombining); returnfalse;
case css::i18n::UnicodeType::NON_SPACING_MARK: case css::i18n::UnicodeType::COMBINING_SPACING_MARK: if (bPreventNonHex || mbIsHexString)
{ returnfalse;
}
//extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark. if( !maUtf16.isEmpty() )
{
maInput = maUtf16; if( !maCombining.isEmpty() )
maInput.append(maCombining); returnfalse;
}
maCombining.insertUtf32(0, uChar); break;
default: //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character. if( !maUtf16.isEmpty() )
{
maInput = maUtf16; if( !maCombining.isEmpty() )
maInput.append(maCombining); returnfalse;
}
// 0 - 1f are control characters. Do not process those. if( uChar < 0x20 )
{ returnfalse;
}
switch( uChar )
{ case'u': case'U': // U+ notation found. Continue looking for another one. if( mbRequiresU )
{
mbRequiresU = false;
maInput.insert(0,"U+");
} // treat as a normal character else
{ if( !bPreventNonHex )
maInput.insertUtf32(0, uChar); returnfalse;
} break; case'+': // + already found: skip when not U, or edge case of +U+xxxx if( mbRequiresU || (maInput.indexOf("U+") == 0) ) returnfalse; // hex chars followed by '+' - now require a 'U' elseif ( !maInput.isEmpty() )
mbRequiresU = true; // treat as a normal character else
{ if( !bPreventNonHex )
maInput.insertUtf32(0, uChar); returnfalse;
} break; default: // + already found. Since not U, cancel further input if( mbRequiresU ) returnfalse; // maximum digits per notation is 8: only one notation elseif( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 ) returnfalse; // maximum digits per notation is 8: previous notation found elseif( maInput.indexOf("U+") == 8 ) returnfalse; // a hex character. Add to string. elseif( rtl::isAsciiHexDigit(uChar) )
{
mbIsHexString = true;
maInput.insertUtf32(0, uChar);
} // not a hex character: stop input. keep if it is the first input provided else
{ if( maInput.isEmpty() )
maInput.insertUtf32(0, uChar); returnfalse;
}
}
} returntrue;
}
OUString ToggleUnicodeCodepoint::StringToReplace()
{ // this function potentially modifies the input string. No more addition of characters #ifndef NDEBUG
mbInputEnded = true; #endif
if( maInput.isEmpty() )
{ //edge case - input finished with incomplete low surrogate or combining characters without a base if (!maUtf16.isEmpty())
maInput = maUtf16; if (!maCombining.isEmpty())
maInput.append(maCombining); return maInput.toString();
}
if( !mbIsHexString ) return maInput.toString();
//validate unicode notation.
OUString sIn;
sal_uInt32 nUnicode = 0;
sal_Int32 nUPlus = maInput.indexOf("U+"); //if U+ notation used, strip off all extra chars added not in U+ notation if( nUPlus != -1 )
{
maInput.remove(0, nUPlus);
sIn = maInput.copy(2).makeStringAndClear();
nUPlus = sIn.indexOf("U+");
} else
sIn = maInput.toString(); while( nUPlus != -1 )
{
nUnicode = o3tl::toUInt32(sIn.subView(0, nUPlus), 16); //prevent creating control characters or invalid Unicode values if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
maInput = sIn.subView(nUPlus);
sIn = sIn.copy(nUPlus+2);
nUPlus = sIn.indexOf("U+");
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.