Quelle rbt_pars.h Sprache: C

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION
#ifdef __cplusplus

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"
#include "hash.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;
class UnicodeFunctor;
class ParseData;
class RuleHalf;
class ParsePosition;
class StringMatcher;

class TransliteratorParser : public UMemory {

public:

    /**
     * A Vector of TransliterationRuleData objects, one for each discrete group
     * of rules in the rule set
     */
    UVector dataVector;

    /**
     * PUBLIC data member.
     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
     */
    UVector idBlockVector;

    /**
     * PUBLIC data member containing the parsed compound filter, if any.
     */
    UnicodeSet* compoundFilter;

private:

    /**
     * The current data object for which we are parsing rules
     */
    TransliterationRuleData* curData;

    UTransDirection direction;

    /**
     * Parse error information.
     */
    UParseError parseError;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of matcher variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
    UVector variablesVector;

    /**
     * Temporary table of variable names.  When parsing is complete, this is
     * copied into data.variableNames.
     */
    Hashtable variableNames;

    /**
     * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UnicodeString segmentStandins;

    /**
     * Vector of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UVector segmentObjects;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    char16_t variableNext;

    /**
     * The last available stand-in for variables.  This is discovered
     * dynamically.  At any point during parsing, available variables are
     * <code>variableNext..variableLimit-1</code>.
     */
    char16_t variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    UnicodeString undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in
     * patterns.  This is allocated the first time it is needed, and
     * reused thereafter.
     */
    char16_t dotStandIn;

public:

    /**
     * Constructor.
     */
    TransliteratorParser(UErrorCode &statusReturn);

    /**
     * Destructor.
     */
    ~TransliteratorParser();

    /**
     * Parse the given string as a sequence of rules, separated by newline
     * characters ('\n'), and cause this object to implement those rules.  Any
     * previous rules are discarded.  Typically this method is called exactly
     * once after construction.
     *
     * Parse the given rules, in the given direction.  After this call
     * returns, query the public data members for results.  The caller
     * owns the 'data' and 'compoundFilter' data members after this
     * call returns.
     * @param rules      rules, separated by ';'
     * @param direction  either FORWARD or REVERSE.
     * @param pe         Struct to receive information on position
     *                   of error if an error is encountered
     * @param ec         Output param set to success/failure code.
     */
    void parse(const UnicodeString& rules,
               UTransDirection direction,
               UParseError& pe,
               UErrorCode& ec);

    /**
     * Return the compound filter parsed by parse().  Caller owns result.
     * @return the compound filter parsed by parse().
     */
    UnicodeSet* orphanCompoundFilter();

private:

    /**
     * Return a representation of this transliterator as source rules.
     * @param rules      Output param to receive the rules.
     * @param direction  either FORWARD or REVERSE.
     */
    void parseRules(const UnicodeString& rules,
                    UTransDirection direction,
                    UErrorCode& status);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     * @param rules      Output param to receive the rules.
     * @param pos        the starting position.
     * @param limit      pointer past the last character of the rule.
     * @return           the index after the last character parsed.
     */
    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Set the variable range to [start, end] (inclusive).
     * @param start    the start value of the range.
     * @param end      the end value of the range.
     */
    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

    /**
     * Assert that the given character is NOT within the variable range.
     * If it is, return false.  This is necessary to ensure that the
     * variable range does not overlap characters used in a rule.
     * @param ch     the given character.
     * @return       True, if the given character is NOT within the variable range.
     */
    UBool checkVariableRange(UChar32 ch) const;

    /**
     * Set the maximum backup to 'backup', in response to a pragma
     * statement.
     * @param backup    the new value to be set.
     */
    void pragmaMaximumBackup(int32_t backup);

    /**
     * Begin normalizing all rules using the given mode, in response
     * to a pragma statement.
     * @param mode    the given mode.
     */
    void pragmaNormalizeRules(UNormalizationMode mode);

    /**
     * Return true if the given rule looks like a pragma.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit pointer past the last character of the rule.
     * @return true if the given rule looks like a pragma.
     */
    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Parse a pragma.  This method assumes resemblesPragma() has
     * already returned true.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit pointer past the last character of the rule.
     * @return the position index after the final ';' of the pragma,
     * or -1 on failure.
     */
    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     * @param parseErrorCode error code.
     * @param msg error description.
     * @param start position of first character of current rule.
     * @return start position of first character of current rule.
     */
    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
                        UErrorCode& status);

    /**
     * Parse a UnicodeSet out, store it, and return the stand-in character
     * used to represent it.
     *
     * @param rule    the rule for UnicodeSet.
     * @param pos     the position in pattern at which to start parsing.
     * @return        the stand-in character used to represent it.
     */
    char16_t parseSet(const UnicodeString& rule,
                      ParsePosition& pos,
                      UErrorCode& status);

    /**
     * Generate and return a stand-in for a new UnicodeFunctor.  Store
     * the matcher (adopt it).
     * @param adopted the UnicodeFunctor to be adopted.
     * @return        a stand-in for a new UnicodeFunctor.
     */
    char16_t generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

    /**
     * Return the standin for segment seg (1-based).
     * @param seg    the given segment.
     * @return       the standIn character for the given segment.
     */
    char16_t getSegmentStandin(int32_t seg, UErrorCode& status);

    /**
     * Set the object for segment seg (1-based).
     * @param seg      the given segment.
     * @param adopted  the StringMatcher to be adopted.
     */
    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

    /**
     * Return the stand-in for the dot set.  It is allocated the first
     * time and reused thereafter.
     * @return    the stand-in for the dot set.
     */
    char16_t getDotStandIn(UErrorCode& status);

    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
     * @param name    the variable name to be appended.
     * @param buf     the given UnicodeString to append to.
     */
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf,
                           UErrorCode& status);

    /**
     * Glue method to get around access restrictions in C++.
     */
    /*static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);*/

    friend class RuleHalf;

    // Disallowed methods; no impl.
    /**
     * Copy constructor
     */
    TransliteratorParser(const TransliteratorParser&);

    /**
     * Assignment operator
     */
    TransliteratorParser& operator=(const TransliteratorParser&);
};

U_NAMESPACE_END

#endif /* #ifdef __cplusplus */

/**
* Strip/convert the following from the transliterator rules:
* comments
* newlines
* white space at the beginning and end of a line
* unescape \u notation
*
* The target must be equal in size as the source.
* @internal
*/
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

Messung V0.5

¤ Dauer der Verarbeitung: 0.15 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.