// Registers a round-trip mapping from code point c to the string m.
//
// The input is validated first; each failed check prints a diagnostic to
// stderr and terminates the tool with exit(U_INVALID_FORMAT_ERROR):
//   - c must not be a surrogate code point,
//   - m must be well-formed according to isWellFormed(),
//   - m must consist of exactly two code points.
//
// On success the Norm record for c (created if necessary and vetted by
// checkNormForMapping()) receives a heap-allocated copy of m, is marked
// ROUND_TRIP with mappingCP=U_SENTINEL, and c is added to norms.mappingSet.
// NOTE(review): the copy is allocated with plain `new`; presumably the Norm
// record owns and releases it -- confirm against the Norm/Norms implementation.
void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
    // Values shared by all diagnostics below.
    const int phaseNumber=static_cast<int>(phase);
    const long codePoint=static_cast<long>(c);

    if(U_IS_SURROGATE(c)) {
        fprintf(stderr,
                "error in gennorm2 phase %d: "
                "illegal round-trip mapping from surrogate code point U+%04lX\n",
                phaseNumber, codePoint);
        exit(U_INVALID_FORMAT_ERROR);
    }

    if(!isWellFormed(m)) {
        fprintf(stderr,
                "error in gennorm2 phase %d: "
                "illegal round-trip mapping from U+%04lX to malformed string\n",
                phaseNumber, codePoint);
        exit(U_INVALID_FORMAT_ERROR);
    }

    // A round-trip mapping must decompose into exactly one pair of code points.
    const int32_t codePointCount=u_countChar32(toUCharPtr(m.getBuffer()), m.length());
    if(codePointCount!=2) {
        fprintf(stderr,
                "error in gennorm2 phase %d: "
                "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
                phaseNumber, codePoint, static_cast<int>(codePointCount));
        exit(U_INVALID_FORMAT_ERROR);
    }

    Norm *norm=checkNormForMapping(norms.createNorm(c), c);
    norm->mapping=new UnicodeString(m);
    norm->mappingType=Norm::ROUND_TRIP;
    norm->mappingCP=U_SENTINEL;
    norms.mappingSet.add(c);
}
// Marks code point c as having its mapping explicitly removed.
//
// The Norm record for c is obtained via createNorm() and vetted by
// checkNormForMapping(); its mappingType is set to Norm::REMOVED and c is
// added to norms.mappingSet.
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    // Deliberately createNorm(c) instead of getNorm(c): the record has to exist
    // so that the non-mapping is recorded and conflicting data can be detected.
    Norm *created=norms.createNorm(c);
    Norm *norm=checkNormForMapping(created, c);
    norm->mappingType=Norm::REMOVED;
    norms.mappingSet.add(c);
}
// Determines whether there is a composition boundary after a mapping.
//
// @param buffer       The mapping's code points together with their combining
//                     classes, as held by a BuilderReorderingBuffer.
// @param mappingType  The kind of mapping (e.g. Norm::ONE_WAY); one-way
//                     mappings are subject to an additional trailing-mark check.
// @return true if nothing at the end of the mapping can still combine with
//         following text; false if the mapping is empty, has no starter, or
//         its last starter (or a composite built from it) may combine further.
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
                                                          Norm::MappingType mappingType) const {
    if(buffer.isEmpty()) {
        return false;  // Maps-to-empty-string is no boundary of any kind.
    }
    int32_t lastStarterIndex=buffer.lastStarterIndex();
    if(lastStarterIndex<0) {
        return false;  // no starter
    }
    const int32_t lastIndex=buffer.length()-1;
    if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
        // One-way mapping where after the last starter is at least one combining mark
        // with a combining class greater than 1,
        // which means that another combining mark can reorder before it.
        // By contrast, in a round-trip mapping this does not prevent a boundary as long as
        // the starter or composite does not combine-forward with a following combining mark.
        return false;
    }
    UChar32 starter=buffer.charAt(lastStarterIndex);
    if(lastStarterIndex==0 && norms.combinesBack(starter)) {
        // The last starter is at the beginning of the mapping and combines backward.
        return false;
    }
    if(Hangul::isJamoL(starter) ||
            (Hangul::isJamoV(starter) &&
            0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
        // otherwise it is blocked.
        return lastStarterIndex!=lastIndex;
    }
    // Note: There can be no Hangul syllable in the fully decomposed mapping.

    // Multiple starters can combine into one.
    // Look for the first of the last sequence of starters, excluding Jamos.
    int32_t i=lastStarterIndex;
    UChar32 previous;
    while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(previous=buffer.charAt(i-1))) {
        starter=previous;
        --i;
    }
    // Compose as far as possible, and see if further compositions with
    // characters following this mapping are possible.
    const Norm *starterNorm=norms.getNorm(starter);
    if(i==lastStarterIndex &&
            (starterNorm==nullptr || !starterNorm->combinesFwd())) {
        return true;  // The last starter does not combine forward.
    }
    uint8_t prevCC=0;
    while(++i<buffer.length()) {
        uint8_t cc=buffer.ccAt(i);  // !=0 if after last starter
        // starterNorm is non-null once i>lastStarterIndex: every path that leaves it
        // null at or past the last starter has already returned true.
        if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
            // The starter combines with a mark that reorders before the current one.
            return false;
        }
        UChar32 c=buffer.charAt(i);
        if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
                norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
            // The starter combines with c into a composite replacement starter.
            starterNorm=norms.getNorm(starter);
            if(i>=lastStarterIndex &&
                    (starterNorm==nullptr || !starterNorm->combinesFwd())) {
                return true;  // The composite does not combine further.
            }
            // Keep prevCC because we "removed" the combining mark.
        } else if(cc==0) {
            starterNorm=norms.getNorm(c);
            if(i==lastStarterIndex &&
                    (starterNorm==nullptr || !starterNorm->combinesFwd())) {
                return true;  // The new starter does not combine forward.
            }
            prevCC=0;
        } else {
            prevCC=cc;
        }
    }
    if(prevCC==0) {
        return false;  // forward-combining starter at the very end
    }
    if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
        // The starter combines with another mark.
        return false;
    }
    return true;
}
void Normalizer2DataBuilder::postProcess(Norm &norm) { // Prerequisites: Compositions are built, mappings are recursively decomposed. // Mappings are not yet in canonical order. // // This function works on a Norm struct. We do not know which code point(s) map(s) to it. // Therefore, we cannot compute algorithmic mapping deltas here. // Error conditions are checked, but printed later when we do know the offending code point. if(norm.hasMapping()) { if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
norm.error="mapping longer than maximum of 31"; return;
} // Ensure canonical order.
BuilderReorderingBuffer buffer; if(norm.rawMapping!=nullptr) {
norms.reorder(*norm.rawMapping, buffer);
buffer.reset();
}
norms.reorder(*norm.mapping, buffer); if(buffer.isEmpty()) { // A character that is deleted (maps to an empty string) must // get the worst-case lccc and tccc values because arbitrary // characters on both sides will become adjacent.
norm.leadCC=1;
norm.trailCC=0xff;
} else {
norm.leadCC=buffer.ccAt(0);
norm.trailCC=buffer.ccAt(buffer.length()-1);
}
norm.hasCompBoundaryBefore=
!buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0)); // No comp-boundary-after when norm.combinesBack: // MaybeNo character whose first mapping character may combine-back, // in which case we would not recompose to this character, // and may need more context.
norm.hasCompBoundaryAfter=
!norm.combinesBack && !norm.combinesFwd() &&
mappingHasCompBoundaryAfter(buffer, norm.mappingType);
if(norm.combinesBack) { if(norm.mappingType!=Norm::ROUND_TRIP) { // One-way mappings don't get NFC_QC=Maybe, and // should not have gotten combinesBack set.
norm.error="combines-back and has a one-way mapping, " "not possible in Unicode normalization";
} elseif(norm.combinesFwd()) { // Earlier code checked ccc=0.
norm.type=Norm::MAYBE_NO_COMBINES_FWD;
} elseif(norm.cc==0) {
norm.type=Norm::MAYBE_NO_MAPPING_ONLY;
} else {
norm.error="combines-back and decomposes with ccc!=0, " "not possible in Unicode normalization"; // ... because we don't reorder again after composition.
}
} elseif(norm.mappingType==Norm::ROUND_TRIP) { if(norm.combinesFwd()) {
norm.type=Norm::YES_NO_COMBINES_FWD;
} else {
norm.type=Norm::YES_NO_MAPPING_ONLY;
}
} else { // one-way mapping if(norm.combinesFwd()) {
norm.error="combines-forward and has a one-way mapping, " "not possible in Unicode normalization";
} elseif(buffer.isEmpty()) {
norm.type=Norm::NO_NO_EMPTY;
} elseif(!norm.hasCompBoundaryBefore) {
norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
} elseif(mappingRecomposes(buffer)) {
norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
} else { // The mapping is comp-normalized.
norm.type=Norm::NO_NO_COMP_YES;
}
}
} else { // no mapping
norm.leadCC=norm.trailCC=norm.cc;
// Hangul LV encoded as minYesNo
uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]|
Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
} // Set the first LV, then write all other Hangul syllables as LVT, // then overwrite the remaining LV.
umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode);
UChar32 c=Hangul::HANGUL_BASE; while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
umutablecptrie_set(norm16Trie, c, lv, errorCode);
}
errorCode.assertSuccess();
}
LocalUCPTriePointer Normalizer2DataBuilder::processData() { // Build composition lists before recursive decomposition, // so that we still have the raw, pair-wise mappings.
CompositionBuilder compBuilder(norms);
norms.enumRanges(compBuilder);
// Recursively decompose all mappings.
Decomposer decomposer(norms); do {
decomposer.didDecompose=false;
norms.enumRanges(decomposer);
} while(decomposer.didDecompose);
// Set the Norm::Type and other properties.
int32_t normsLength=norms.length(); for(int32_t i=1; i<normsLength; ++i) {
postProcess(norms.getNormRefByIndex(i));
}
// Write the properties, mappings and composition lists to // appropriate parts of the "extra data" array.
ExtraData extra(norms, optimization==OPTIMIZE_FAST);
norms.enumRanges(extra);
int32_t maybeDataLength=
extra.maybeNoMappingsOnly.length()+
extra.maybeNoMappingsAndCompositions.length()+
extra.maybeYesCompositions.length();
int32_t minMaybeNo=Normalizer2Impl::MIN_NORMAL_MAYBE_YES-maybeDataLength*2; // Adjust minMaybeNo down to 8-align it, // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
minMaybeNo&=~7;
// Pad the extraData to even length for 4-byte alignment of following data. if(extraData.length()&1) {
extraData.append(static_cast<char16_t>(0));
}
int32_t minNoNoDelta=getMinNoNoDelta();
U_ASSERT((minNoNoDelta&7)==0); if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
fprintf(stderr, "gennorm2 error: " "data structure overflow, too much mapping composition data\n"); exit(U_BUFFER_OVERFLOW_ERROR);
}
// writeNorm16() and setHangulData() reduce these as needed.
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
// Map each code point to its norm16 value, // including the properties that fit directly, // and the offset to the "extra data" if necessary.
Norm16Writer norm16Writer(norm16Trie, norms, *this);
norms.enumRanges(norm16Writer); // TODO: iterate via getRange() instead of callback?
setHangulData(norm16Trie);
// Look for the "worst" norm16 value of any supplementary code point // corresponding to a lead surrogate, and set it as that surrogate's value. // Enables UTF-16 quick check inner loops to look at only code units. // // We could be more sophisticated: // We could collect a bit set for whether there are values in the different // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) // and select the best value that only breaks the composition and/or decomposition // inner loops if necessary. // However, that seems like overkill for an optimization for supplementary characters. // // First check that surrogate code *points* are inert. // The parser should have rejected values/mappings for them.
uint32_t value;
UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &value); if (value != Normalizer2Impl::INERT || end < 0xdfff) {
fprintf(stderr, "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n", static_cast<int>(end), static_cast<long>(value)); exit(U_INTERNAL_PROGRAM_ERROR);
}
uint32_t maxNorm16 = 0; // ANDing values yields 0 bits where any value has a 0. // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
uint32_t andedNorm16 = 0;
end = 0; for (UChar32 start = 0x10000;;) { if (start > end) {
end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0,
nullptr, nullptr, &value); if (end < 0) { break; }
} if ((start & 0x3ff) == 0) { // Data for a new lead surrogate.
maxNorm16 = andedNorm16 = value;
} else { if (value > maxNorm16) {
maxNorm16 = value;
}
andedNorm16 &= value;
} // Intersect each range with the code points for one lead surrogate.
UChar32 leadEnd = start | 0x3ff; if (leadEnd <= end) { // End of the supplementary block for a lead surrogate. if (maxNorm16 >= static_cast<uint32_t>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO])) { // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. // Otherwise it might end up at something like JAMO_VT which stays in // the inner decomposition quick check loop.
maxNorm16 = static_cast<uint32_t>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
}
maxNorm16 =
(maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
(andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER); if (maxNorm16 != Normalizer2Impl::INERT) {
umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode);
} if (value == Normalizer2Impl::INERT) { // Potentially skip inert supplementary blocks for several lead surrogates.
start = (end + 1) & ~0x3ff;
} else {
start = leadEnd + 1;
}
} else {
start = end + 1;
}
}
// Adjust supplementary minimum code points to break quick check loops at their lead surrogates. // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) // which is harmless. // As a result, the minimum code points are always BMP code points.
int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; if(minCP>=0x10000) {
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
}
minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; if(minCP>=0x10000) {
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
}
minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP]; if(minCP>=0x10000) {
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.