/*
 * StringPrep profile file format ------------------------------------
 *
 * The file format prepared and written here contains a 16-bit trie and a mapping table.
 *
 * Before the data contents described below, there are the headers required by
 * the udata API for loading ICU data. In particular, a UDataInfo structure
 * precedes the actual data. It contains platform property values and the
 * file format version.
 *
 * The following is a description of format version 2.
 *
 * Data contents:
 *
 * The contents are a parsed, binary form of RFC3454 and possibly
 * NormalizationCorrections.txt, depending on the options specified on the profile.
 *
 * Any Unicode code point from 0 to 0x10ffff can be looked up to get
 * the trie-word, if any, for that code point. This means that the inputs
 * to the lookup are 21-bit unsigned integers, with not all of the
 * 21-bit range used.
 *
 * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c.
 * After that there are the following structures:
 *
 * int32_t indexes[_SPREP_INDEX_TOP];  -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file
 *
 * UTrie stringPrepTrie;               -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE]
 *
 * uint16_t mappingTable[];            -- Contains the sequence of code units that the code point maps to
 *                                        size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]
 *
 * The indexes array contains the following values:
 *  indexes[_SPREP_INDEX_TRIE_SIZE]                  -- The size of the StringPrep trie in bytes
 *  indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]          -- The size of the mappingTable in bytes
 *  indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION]  -- The index of Unicode version of last entry in NormalizationCorrections.txt
 *  indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START]    -- The starting index of 1 UChar mapping index in the mapping table
 *  indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]   -- The starting index of 2 UChars mapping index in the mapping table
 *  indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table
 *  indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]  -- The starting index of 4 UChars mapping index in the mapping table
 *  indexes[_SPREP_OPTIONS]                          -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON
 *
 *
 * StringPrep Trie :
 *
 * The StringPrep trie is a 16-bit trie that contains data for the profile.
 * Each code point is associated with a value (trie-word) in the trie.
 *
 * - structure of data words from the trie
 *
 *  i)  A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0)
 *      represents the type associated with the code point
 *      if(trieWord >= _SPREP_TYPE_THRESHOLD){
 *          type = trieWord - 0xFFF0;
 *      }
 *      The type can be :
 *          USPREP_UNASSIGNED
 *          USPREP_PROHIBITED
 *          USPREP_DELETE
 *
 *  ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and
 *      contains the bit distribution described below
 *
 *      0       - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped.
 *      1       - ON : The value in the next 14 bits is an index into the mapping table
 *                OFF: The value in the next 14 bits is a delta value from the code point
 *      2..15   - Contains data as described by bit 1. If all bits are set
 *                (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE
 *
 *
 * Mapping Table:
 * The data in the mapping table is sorted according to the length of the mapping sequence.
 * If the type of the code point is USPREP_MAP and the value in the trie word is an index, the index
 * is compared with the start indexes of each sequence length to figure out the length according to
 * the following algorithm:
 *
 *      if(       index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
 *                index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
 *          length = 1;
 *      }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
 *                index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
 *          length = 2;
 *      }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
 *                index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
 *          length = 3;
 *      }else{
 *          // The first position in the mapping table contains the length
 *          // of the sequence
 *          length = mappingTable[index++];
 *
 *      }
 *
 */
/* file data ---------------------------------------------------------------- */ /* indexes[] value names */
/*
 * File-scope state for building the mapping table.
 * NOTE(review): these comments describe only the uses visible in this part
 * of the file; confirm against the remaining functions.
 */
/* Array of 16-bit units that the mapping sequences (and, for long sequences, their length prefix) are written into */
static uint16_t* mappingData= NULL;
/* Bound that currentIndex is checked against after each sequence is copied into mappingData */
static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */
/* Next write position in mappingData; its value is also shifted into the trie word of index-type mappings */
static int16_t currentIndex = 0; /* the current index into the data trie */
static int32_t maxLength = 0; /* maximum length of mapping string */
/*
 * Callback for deleting a value from the hashtable.
 *
 * Releases the entry's mapping buffer first and then the entry itself,
 * both through uprv_free(). A NULL entry is ignored so the callback never
 * dereferences a null pointer.
 *
 * @param obj  pointer to the ValueStruct to release; may be NULL.
 */
static void U_CALLCONV valueDeleter(void* obj){
    ValueStruct* value = (ValueStruct*) obj;
    if(value == NULL){
        /* nothing was stored for this slot */
        return;
    }
    uprv_free(value->mapping);
    uprv_free(value);
}
/*
 * Hash callback for the hashtable.
 *
 * The hash code is simply the integer member of the token passed in;
 * no further mixing is applied.
 *
 * @param parm  the token to hash.
 * @return      the token's integer member.
 */
static int32_t U_CALLCONV hashEntry(const UHashTok parm) {
    const int32_t hashCode = parm.integer;
    return hashCode;
}
codepoint = element->key.integer;
value = (ValueStruct*)element->value.pointer;
/* store the start of indexes */ if(oldMappingLength != mappingLength){ /* Assume that index[] is used according to the enums defined */ if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){
indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex;
} if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH &&
mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){
limitIndex = currentIndex;
}
oldMappingLength = mappingLength;
}
if(value->length == mappingLength){
uint32_t savedTrieWord = 0;
trieWord = currentIndex << 2; /* turn on the 2nd bit to signal that the following bits contain an index */
trieWord += 0x02;
if(trieWord > _SPREP_TYPE_THRESHOLD){
fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); exit(U_ILLEGAL_CHAR_FOUND);
} /* figure out if the code point has type already stored */
savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); if(savedTrieWord!=0){ if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ /* turn on the first bit in trie word */
trieWord += 0x01;
}else{ /* * the codepoint has value something other than prohibited * and a mapping .. error!
*/
fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); exit(U_ILLEGAL_ARGUMENT_ERROR);
}
}
/* now set the value in the trie */ if(!utrie_set32(sprepTrie,codepoint,trieWord)){
fprintf(stderr,"Could not set the value for code point.\n"); exit(U_ILLEGAL_ARGUMENT_ERROR);
}
/* written the trie word for the codepoint... increment the count*/
writtenElementCount++;
/* sanity check are we exceeding the max number allowed */ if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){
fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n",
currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); exit(U_INDEX_OUTOFBOUNDS_ERROR);
}
/* copy the mapping data */ /* write the length */ if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ /* the cast here is safe since we donot expect the length to be > 65535 */
mappingData[currentIndex++] = (uint16_t) mappingLength;
} /* copy the contents to mappindData array */
u_memmove(mappingData+currentIndex, value->mapping, value->length);
currentIndex += value->length; if (currentIndex > mappingDataCapacity) { /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */
fprintf(stderr, "gensprep, fatal error at %s, %d. Aborting.\n", __FILE__, __LINE__); exit(U_INTERNAL_PROGRAM_ERROR);
}
}
}
mappingLength++;
pos = -1;
} /* set the last length for range check */ if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){
indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1;
}else{
indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex;
}
/* figure out if the code point has type already stored */
savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); if(savedTrieWord!=0){ if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ /* turn on the first bit in trie word */
trieWord += 0x01;
}else{ /* * the codepoint has value something other than prohibited * and a mapping .. error!
*/
fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); exit(U_ILLEGAL_ARGUMENT_ERROR);
}
}
/* figure out the real length */ for(i=0; i<length; i++){
adjustedLen += U16_LENGTH(mapping[i]);
}
if(adjustedLen == 0){
trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); /* make sure that the value of trieWord is less than the threshold */ if(trieWord < _SPREP_TYPE_THRESHOLD){ /* now set the value in the trie */ if(!utrie_set32(sprepTrie,codepoint,trieWord)){
fprintf(stderr,"Could not set the value for code point.\n"); exit(U_ILLEGAL_ARGUMENT_ERROR);
} /* value is set so just return */ return;
}else{
fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); exit(U_ILLEGAL_CHAR_FOUND);
}
}
/* make sure that the second bit is OFF */ if((trieWord & 0x02) != 0 ){
fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); exit(U_INTERNAL_PROGRAM_ERROR);
} /* make sure that the value of trieWord is less than the threshold */ if(trieWord < _SPREP_TYPE_THRESHOLD){ /* now set the value in the trie */ if(!utrie_set32(sprepTrie,codepoint,trieWord)){
fprintf(stderr,"Could not set the value for code point.\n"); exit(U_ILLEGAL_ARGUMENT_ERROR);
} /* value is set so just return */ return;
}
} /* * if the delta is not in the given range or if the trieWord is larger than the threshold * just fall through for storing the mapping in the mapping table
*/
}
if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){
fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); exit(U_ILLEGAL_CHAR_FOUND);
}
trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ if(start == end){
uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); if(savedTrieWord>0){ if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ /* * A mapping is stored in the trie word * and the only other possible type that a * code point can have is USPREP_PROHIBITED *
*/
/* turn on the 0th bit in the savedTrieWord */
savedTrieWord += 0x01;
/* the downcast is safe since we only save 16 bit values */
trieWord = (uint16_t)savedTrieWord;
/* make sure that the value of trieWord is less than the threshold */ if(trieWord < _SPREP_TYPE_THRESHOLD){ /* now set the value in the trie */ if(!utrie_set32(sprepTrie,start,trieWord)){
fprintf(stderr,"Could not set the value for code point.\n"); exit(U_ILLEGAL_ARGUMENT_ERROR);
} /* value is set so just return */ return;
}else{
fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); exit(U_ILLEGAL_CHAR_FOUND);
}
}elseif(savedTrieWord != trieWord){
fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start); exit(U_ILLEGAL_ARGUMENT_ERROR);
} /* if savedTrieWord == trieWord .. fall through and set the value */
} if(!utrie_set32(sprepTrie,start,trieWord)){
fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); exit(U_ILLEGAL_ARGUMENT_ERROR);
}
}else{ if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, false)){
fprintf(stderr,"Value for certain codepoint already set.\n"); exit(U_ILLEGAL_CHAR_FOUND);
}
}
}
/* folding value: just store the offset (16 bits) if there is any non-0 entry */ static uint32_t U_CALLCONV
getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
uint32_t value;
UChar32 limit=0;
UBool inBlockZero;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.