Quelle gendict.cpp Sprache: C

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File gendict.cpp
*/

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "unicode/ucharstriebuilder.h"
#include "unicode/bytestriebuilder.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/ucnv.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"

#include "charstr.h"
#include "dictionarydata.h"
#include "uoptions.h"
#include "unewdata.h"
#include "cmemory.h"
#include "uassert.h"
#include "ucbuf.h"
#include "toolutil.h"
#include "cstring.h"
#include "writesrc.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "putilimp.h"
UDate startTime;

static int elapsedTime() {
  return static_cast<int>(uprv_floor((uprv_getRawUTCtime() - startTime) / 1000.0));
}

U_NAMESPACE_USE

static char *progName;
static UOption options[]={
    UOPTION_HELP_H,             /* 0 */
    UOPTION_HELP_QUESTION_MARK, /* 1 */
    UOPTION_VERBOSE,            /* 2 */
    UOPTION_ICUDATADIR,         /* 4 */
    UOPTION_COPYRIGHT,          /* 5 */
    { "uchars", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 6 */
    { "bytes", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 7 */
    { "transform", nullptr, nullptr, nullptr, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */
    { "toml", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 9 */
    UOPTION_QUIET,              /* 10 */
};

enum arguments {
    ARG_HELP = 0,
    ARG_QMARK,
    ARG_VERBOSE,
    ARG_ICUDATADIR,
    ARG_COPYRIGHT,
    ARG_UCHARS,
    ARG_BYTES,
    ARG_TRANSFORM,
    ARG_TOML,
    ARG_QUIET
};

// prints out the standard usage method describing command line arguments,
// then bails out with the desired exit code
static void usageAndDie(UErrorCode retCode) {
    fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName);
    fprintf((U_SUCCESS(retCode) ? stdout : stderr),
           "\tRead in a word list and write out a string trie dictionary\n"
           "options:\n"
           "\t-h or -? or --help this usage text\n"
           "\t-V or --version show a version message\n"
           "\t-c or --copyright include a copyright notice\n"
           "\t-v or --verbose turn on verbose output\n"
           "\t-q or --quiet do not display warnings and progress\n"
           "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option
           "\t followed by path, defaults to %s\n"
           "\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n"
           "\t--bytes output a BytesTrie (mutually exclusive with -u!)\n"
           "\t--transform the kind of transform to use (eg --transform offset-40A3,\n"
           "\t which specifies an offset transform with constant 0x40A3)\n"
           "\t--toml output the trie in toml format (default is binary),\n",
            u_getDataDirectory());
    exit(retCode);
}

/* UDataInfo cf. udata.h */
static UDataInfo dataInfo = {
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0x44, 0x69, 0x63, 0x74 },     /* "Dict" */
    { 1, 0, 0, 0 },                 /* format version */
    { 0, 0, 0, 0 }                  /* data version */
};

#if !UCONFIG_NO_BREAK_ITERATION

// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder.
// may want to put this somewhere in ICU, as it could be useful outside
// of this tool?
class DataDict {
private:
    BytesTrieBuilder *bt;
    UCharsTrieBuilder *ut;
    UChar32 transformConstant;
    int32_t transformType;
public:
    // constructs a new data dictionary. if there is an error,
    // it will be returned in status
    // isBytesTrie != 0 will produce a BytesTrieBuilder,
    // isBytesTrie == 0 will produce a UCharsTrieBuilder
    DataDict(UBool isBytesTrie, UErrorCode &status) : bt(nullptr), ut(nullptr),
        transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) {
        if (isBytesTrie) {
            bt = new BytesTrieBuilder(status);
        } else {
            ut = new UCharsTrieBuilder(status);
        }
    }

    ~DataDict() {
        delete bt;
        delete ut;
    }

private:
    char transform(UChar32 c, UErrorCode &status) {
        if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) {
            if (c == 0x200D) { return static_cast<char>(0xFF); }
            else if (c == 0x200C) { return static_cast<char>(0xFE); }
            int32_t delta = c - transformConstant;
            if (delta < 0 || 0xFD < delta) {
                fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n",
                        static_cast<long>(c), static_cast<long>(transformConstant));
                exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number
            }
            return static_cast<char>(delta);
        } else { // no such transform type
            status = U_INTERNAL_PROGRAM_ERROR;
            return static_cast<char>(c); // it should be noted this transform type will not generally work
        }
    }

    void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) {
        UChar32 c = 0;
        int32_t len = word.length();
        for (int32_t i = 0; i < len; i += U16_LENGTH(c)) {
            c = word.char32At(i);
            buf.append(transform(c, errorCode), errorCode);
        }
    }

public:
    // sets the desired transformation data.
    // should be populated from a command line argument
    // so far the only acceptable format is offset-<hex constant>
    // eventually others (mask-<hex constant>?) may be enabled
    // more complex functions may be more difficult
    void setTransform(const char *t) {
        if (strncmp(t, "offset-", 7) == 0) {
            char *end;
            unsigned long base = uprv_strtoul(t + 7, &end, 16);
            if (end == (t + 7) || *end != 0 || base > 0x10FF80) {
                fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7);
                usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
            }
            transformType = DictionaryData::TRANSFORM_TYPE_OFFSET;
            transformConstant = static_cast<UChar32>(base);
        }
        else {
            fprintf(stderr, "Invalid transform specified: %s\n", t);
            usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
        }
    }

    // add a word to the trie
    void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) {
        if (bt) {
            CharString buf;
            transform(word, buf, status);
            bt->add(buf.toStringPiece(), value, status);
        }
        if (ut) { ut->add(word, value, status); }
    }

    // if we are a bytestrie, give back the StringPiece representing the serialized version of us
    StringPiece serializeBytes(UErrorCode &status) {
        return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status);
    }

    // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us
    void serializeUChars(UnicodeString &s, UErrorCode &status) {
        ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status);
    }

    int32_t getTransform() {
        return (transformType | transformConstant);
    }
};
#endif

static const char16_t LINEFEED_CHARACTER = 0x000A;
static const char16_t CARRIAGE_RETURN_CHARACTER = 0x000D;

static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
    int32_t lineLength;
    const char16_t *line = ucbuf_readline(f, &lineLength, errorCode);
    if(line == nullptr || errorCode.isFailure()) { return false; }
    // Strip trailing CR/LF, comments, and spaces.
    const char16_t *comment = u_memchr(line, 0x23, lineLength);  // '#'
    if(comment != nullptr) {
        lineLength = static_cast<int32_t>(comment - line);
    } else {
        while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; }
    }
    while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; }
    fileLine.setTo(false, line, lineLength);
    return true;
}

//----------------------------------------------------------------------------
//
//  main      for gendict
//
//----------------------------------------------------------------------------
int  main(int argc, char **argv) {
    //
    // Pick up and check the command line arguments,
    //    using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) {
        //  -? or -h for help.
        usageAndDie(U_ZERO_ERROR);
    }

    UBool verbose = options[ARG_VERBOSE].doesOccur;
    UBool quiet = options[ARG_QUIET].doesOccur;

    if (argc < 3) {
        fprintf(stderr, "input and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    const char *outFileName  = argv[2];
    const char *wordFileName = argv[1];

    startTime = uprv_getRawUTCtime(); // initialize start timer

if (options[ARG_ICUDATADIR].doesOccur) {
        u_setDataDirectory(options[ARG_ICUDATADIR].value);
    }

    const char *copyright = nullptr;
    if (options[ARG_COPYRIGHT].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

    if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) {
        fprintf(stderr, "you must specify exactly one type of trie to output!\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    UBool isBytesTrie = options[ARG_BYTES].doesOccur;
    if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) {
        fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    IcuToolErrorCode status("gendict/main()");

    UBool isToml = options[ARG_TOML].doesOccur;

#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    const char* outDir=nullptr;

    UNewDataMemory *pData;
    char msg[1024];
    UErrorCode tempstatus = U_ZERO_ERROR;

    /* write message with just the name */ // potential for a buffer overflow here...
    snprintf(msg, sizeof(msg), "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, nullptr, outFileName, &dataInfo, nullptr, &tempstatus);
    udata_writeBlock(pData, msg, strlen(msg));
    udata_finish(pData, &tempstatus);
    return (int)tempstatus;

#else
    //  Read in the dictionary source file
    if (verbose) { printf("Opening file %s...\n", wordFileName); }
    const char *codepage = "UTF-8";
    LocalUCHARBUFPointer f(ucbuf_open(wordFileName, &codepage, true, false, status));
    if (status.isFailure()) {
        fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); }
    DataDict dict(isBytesTrie, status);
    if (status.isFailure()) {
        fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName());
        exit(status.reset());
    }
    if (options[ARG_TRANSFORM].doesOccur) {
        dict.setTransform(options[ARG_TRANSFORM].value);
    }

    UnicodeString fileLine;
    if (verbose) { puts("Adding words to dictionary..."); }
    UBool hasValues = false;
    UBool hasValuelessContents = false;
    int lineCount = 0;
    int wordCount = 0;
    int minlen = 255;
    int maxlen = 0;
    UBool isOk = true;
    while (readLine(f.getAlias(), fileLine, status)) {
        lineCount++;
        if (fileLine.isEmpty()) continue;

        // Parse word [spaces value].
        int32_t keyLen;
        for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {}
        if (keyLen == 0) {
            fprintf(stderr, "Error: no word on line %i!\n", lineCount);
            isOk = false;
            continue;
        }
        int32_t valueStart;
        for (valueStart = keyLen;
            valueStart < fileLine.length() && u_isspace(fileLine[valueStart]);
            ++valueStart) {}

        if (keyLen < valueStart) {
            int32_t valueLength = fileLine.length() - valueStart;
            if (valueLength > 15) {
                fprintf(stderr, "Error: value too long on line %i!\n", lineCount);
                isOk = false;
                continue;
            }
            char s[16];
            fileLine.extract(valueStart, valueLength, s, 16, US_INV);
            char *end;
            unsigned long value = uprv_strtoul(s, &end, 0);
            if (end == s || *end != 0 || static_cast<int32_t>(uprv_strlen(s)) != valueLength || value > 0xffffffff) {
                fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount);
                isOk = false;
                continue;
            }
            dict.addWord(fileLine.tempSubString(0, keyLen), static_cast<int32_t>(value), status);
            hasValues = true;
            wordCount++;
            if (keyLen < minlen) minlen = keyLen;
            if (keyLen > maxlen) maxlen = keyLen;
        } else {
            dict.addWord(fileLine.tempSubString(0, keyLen), 0, status);
            hasValuelessContents = true;
            wordCount++;
            if (keyLen < minlen) minlen = keyLen;
            if (keyLen > maxlen) maxlen = keyLen;
        }

        if (status.isFailure()) {
            fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n",
                status.errorName(), lineCount);
            exit(status.reset());
        }
    }
    if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); }

    if (!isOk && status.isSuccess()) {
        status.set(U_ILLEGAL_ARGUMENT_ERROR);
    }
    if (hasValues && hasValuelessContents) {
        fprintf(stderr, "warning: file contained both valued and unvalued strings!\n");
    }

    if (verbose) { printf("Serializing data...isBytesTrie? %d\n", isBytesTrie); }
    int32_t outDataSize;
    const void *outData;
    UnicodeString usp;
    if (isBytesTrie) {
        StringPiece sp = dict.serializeBytes(status);
        outDataSize = sp.size();
        outData = sp.data();
    } else {
        dict.serializeUChars(usp, status);
        outDataSize = usp.length() * U_SIZEOF_UCHAR;
        outData = usp.getBuffer();
    }
    if (status.isFailure()) {
        fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName());
        exit(status.reset());
    }
    if (verbose) { puts("Opening output file..."); }

    if (isToml) {
        FILE* f = fopen(outFileName, "w");
        if (f == nullptr) {
            fprintf(stderr, "gendict: could not open output file \"%s\"\n", outFileName);
            exit(status.reset());
        }
        fprintf(f, "trie_type = \"%s\"\n", isBytesTrie ? "bytes" : "uchars");
        fprintf(f, "has_values = %s\n", hasValues ? "true" : "false");
        int32_t transform = dict.getTransform();
        bool isOffset = (transform & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET;
        int32_t offset = transform & DictionaryData::TRANSFORM_OFFSET_MASK;
        fprintf(f, "transform_type = \"%s\"\n", isOffset ? "offset" : "none");
        fprintf(f, "transform_offset = %d\n", offset);

        int32_t outDataWidth = isBytesTrie ? 8 : 16;
        int32_t outDataLength = isBytesTrie ? outDataSize : outDataSize / U_SIZEOF_UCHAR;
        usrc_writeArray(f, "trie_data = [\n ",  outData, outDataWidth, outDataLength, " ", "\n]\n");

        fclose(f);
    } else {
        UNewDataMemory *pData = udata_create(nullptr, nullptr, outFileName, &dataInfo, copyright, status);
        if (status.isFailure()) {
            fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName());
            exit(status.reset());
        }

        if (verbose) { puts("Writing to output file..."); }
        int32_t indexes[DictionaryData::IX_COUNT] = {
            DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0
        };
        int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
        indexes[DictionaryData::IX_RESERVED1_OFFSET] = size;
        indexes[DictionaryData::IX_RESERVED2_OFFSET] = size;
        indexes[DictionaryData::IX_TOTAL_SIZE] = size;

        indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS;
        if (hasValues) {
            indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES;
        }

        indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform();
        udata_writeBlock(pData, indexes, sizeof(indexes));
        udata_writeBlock(pData, outData, outDataSize);
        size_t bytesWritten = udata_finish(pData, status);
        if (status.isFailure()) {
            fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName());
            exit(status.reset());
        }

        if (bytesWritten != static_cast<size_t>(size)) {
            fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
    }

    if (!quiet) { printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); }

#ifdef TEST_GENDICT
    if (isBytesTrie) {
        BytesTrie::Iterator it(outData, outDataSize, status);
        while (it.hasNext()) {
            it.next(status);
            const StringPiece s = it.getString();
            int32_t val = it.getValue();
            printf("%s -> %i\n", s.data(), val);
        }
    } else {
        UCharsTrie::Iterator it((const char16_t *)outData, outDataSize, status);
        while (it.hasNext()) {
            it.next(status);
            const UnicodeString s = it.getString();
            int32_t val = it.getValue();
            char tmp[1024];
            s.extract(0, s.length(), tmp, 1024);
            printf("%s -> %i\n", tmp, val);
        }
    }
#endif

    return 0;
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}

Messung V0.5

¤ Dauer der Verarbeitung: 0.14 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.