// prints out the standard usage method describing command line arguments, // then bails out with the desired exit code staticvoid usageAndDie(UErrorCode retCode) {
fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
fprintf((U_SUCCESS(retCode) ? stdout : stderr), "\tRead in a word list and write out a string trie dictionary\n" "options:\n" "\t-h or -? or --help this usage text\n" "\t-V or --version show a version message\n" "\t-c or --copyright include a copyright notice\n" "\t-v or --verbose turn on verbose output\n" "\t-q or --quiet do not display warnings and progress\n" "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"// TODO: figure out if we need this option "\t followed by path, defaults to %s\n" "\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n" "\t--bytes output a BytesTrie (mutually exclusive with -u!)\n" "\t--transform the kind of transform to use (eg --transform offset-40A3,\n" "\t which specifies an offset transform with constant 0x40A3)\n" "\t--toml output the trie in toml format (default is binary),\n",
u_getDataDirectory()); exit(retCode);
}
{ 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */
{ 1, 0, 0, 0 }, /* format version */
{ 0, 0, 0, 0 } /* data version */
};
#if !UCONFIG_NO_BREAK_ITERATION
// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder. // may want to put this somewhere in ICU, as it could be useful outside // of this tool? class DataDict { private:
BytesTrieBuilder *bt;
UCharsTrieBuilder *ut;
UChar32 transformConstant;
int32_t transformType;

public:
// Constructs a new data dictionary that wraps exactly one trie builder.
//   isBytesTrie != 0 creates a BytesTrieBuilder,
//   isBytesTrie == 0 creates a UCharsTrieBuilder.
// No transform is configured initially (TRANSFORM_NONE, constant 0).
// Errors are reported through status: whatever the builder constructor
// sets, plus U_MEMORY_ALLOCATION_ERROR when no builder could be allocated.
DataDict(UBool isBytesTrie, UErrorCode &status)
        : bt(nullptr), ut(nullptr),
          transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
    if (isBytesTrie) {
        bt = new BytesTrieBuilder(status);
    } else {
        ut = new UCharsTrieBuilder(status);
    }
    // NOTE(review): ICU's UMemory-based operator new presumably returns
    // nullptr on failure instead of throwing. Without this check the object
    // would silently hold no builder while status still reported success.
    if (bt == nullptr && ut == nullptr && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
~DataDict() { delete bt; delete ut;
}
private:
// Maps a single code point to the byte used for it in the bytes trie.
//
// With the offset transform:
//   U+200D maps to 0xFF and U+200C maps to 0xFE;
//   every other code point maps to (c - transformConstant), which must lie
//   in 0..0xFD. Otherwise an error is printed and the process exits with
//   U_ILLEGAL_ARGUMENT_ERROR.
// With any other transform type, status is set to U_INTERNAL_PROGRAM_ERROR
// and c is returned truncated to a char.
char transform(UChar32 c, UErrorCode &status) {
    if (transformType != DictionaryData::TRANSFORM_TYPE_OFFSET) {
        // no such transform type
        status = U_INTERNAL_PROGRAM_ERROR;
        return static_cast<char>(c); // it should be noted this transform type will not generally work
    }

    // The two code points with dedicated byte values bypass the offset.
    switch (c) {
    case 0x200D:
        return static_cast<char>(0xFF);
    case 0x200C:
        return static_cast<char>(0xFE);
    default:
        break;
    }

    const int32_t offset = c - transformConstant;
    const bool inRange = (offset >= 0 && offset <= 0xFD);
    if (!inRange) {
        fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
                static_cast<long>(c), static_cast<long>(transformConstant));
        exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
    }
    return static_cast<char>(offset);
}
void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
UChar32 c = 0;
int32_t len = word.length(); for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
c = word.char32At(i);
buf.append(transform(c, errorCode), errorCode);
}
}
public:
// Sets the transformation applied to code points before they are stored
// in a bytes trie. The argument is expected to come from the command line.
//
// So far the only accepted format is "offset-<hex constant>"; other kinds
// (mask-<hex constant>?) may be enabled eventually, and more complex
// functions may be more difficult.
//
// On an unknown transform name, or an offset that is empty, has trailing
// characters, or exceeds 0x10FF80, a message is printed to stderr and
// usageAndDie(U_ILLEGAL_ARGUMENT_ERROR) is called.
void setTransform(const char *t) {
    if (strncmp(t, "offset-", 7) != 0) {
        fprintf(stderr, "Invalid transform specified: %s\n", t);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
        return;
    }

    // Everything after the "offset-" prefix must be one hexadecimal number.
    const char *hexText = t + 7;
    char *parseEnd;
    unsigned long offsetValue = uprv_strtoul(hexText, &parseEnd, 16);
    if (parseEnd == hexText || *parseEnd != 0 || offsetValue > 0x10FF80) {
        fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", hexText);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
    transformConstant = static_cast<UChar32>(offsetValue);
}
// add a word to the trie void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) { if (bt) {
CharString buf;
transform(word, buf, status);
bt->add(buf.toStringPiece(), value, status);
} if (ut) { ut->add(word, value, status); }
}
// If this dictionary is a bytes trie, returns the StringPiece for its
// serialized form, built with USTRINGTRIE_BUILD_SMALL.
// If there is no bytes trie builder (the dictionary was created as a uchars
// trie), nothing is dereferenced: status is set to U_INVALID_STATE_ERROR,
// unless it already holds a failure, and an empty StringPiece is returned.
StringPiece serializeBytes(UErrorCode &status) {
    if (bt == nullptr) {
        if (U_SUCCESS(status)) {
            status = U_INVALID_STATE_ERROR;
        }
        return StringPiece();
    }
    return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
}
// if we are a ucharstrie, produce the UnicodeString representing the serialized version of us void serializeUChars(UnicodeString &s, UErrorCode &status) {
ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
}
//---------------------------------------------------------------------------- // // main for gendict // //---------------------------------------------------------------------------- int main(int argc, char **argv) { // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. //
U_MAIN_INIT_ARGS(argc, argv);
progName = argv[0];
argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); if(argc<0) { // Unrecognized option
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { // -? or -h for help.
usageAndDie(U_ZERO_ERROR);
}
if (argc < 3) {
fprintf(stderr, "input and output file must both be specified.\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
} constchar *outFileName = argv[2]; constchar *wordFileName = argv[1];
if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
fprintf(stderr, "you must specify exactly one type of trie to output!\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
UBool isBytesTrie = options[ARG_BYTES].doesOccur; if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
}
/* write message with just the name */ // potential for a buffer overflow here...
snprintf(msg, sizeof(msg), "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
fprintf(stderr, "%s\n", msg);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.