Quelle parse_esgrammar.py Sprache: Python

"""Parse a grammar written in ECMArkup."""

from __future__ import annotations
# mypy: no-implicit-optional

import os
import collections
from typing import Dict, Iterable, Optional, Tuple

from jsparagus import parse_pgen, gen, grammar, extension, types
from jsparagus.lexer import LexicalGrammar
from jsparagus.ordered import OrderedSet, OrderedFrozenSet

ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    NL="\n",

    # any number of colons together
    EQ=r':+',

    # terminals of the ES grammar, quoted with backticks
    T=r'`[^` \n]+`|```',

    # also terminals, denoting control characters
    CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)
    NT=r'[A-Za-z]\w*',

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives a few productions names
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # the spec also gives a few productions names
    RUSTCOMMENT=r'//.*\n',
)

ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))

SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '': grammar.Literal('\u200c'),
    '': grammar.Literal('\u200d'),
    '': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '': grammar.Literal('\t'),
    '': grammar.Literal('\u000b'),
    '': grammar.Literal('\u000c'),
    '': grammar.Literal(' '),
    '': grammar.Literal('\u00a0'),
    # <ZWNBSP> already defined above
    '': grammar.UnicodeCategory('Zs'),

    # From <https://tc39.es/ecma262/#table-33>
    '': grammar.Literal('\u000a'),
    '': grammar.Literal('\u000d'),
    '': grammar.Literal('\u2028'),
    '': grammar.Literal('\u2028'),
}

class ESGrammarBuilder:
    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        # syntactic grammar.
        if terminal_names is None:
            terminal_names = frozenset()
        self.terminal_names = frozenset(terminal_names)
        self.reset()

    def reset(self):
        self.lexer = None
        # This is how full-parsing and lazy-parsing are implemented, using
        # different traits.
        #
        # This field contains the Rust's trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends which are implementing different set of traits.
        # Having the trait on the function call is useful as a way to filter
        # functions calls at code-generation time.
        #
        # This field is updated by the `rust_param_impl`, which is used in
        # grammar extensions, and visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, only focus on the syntactic grammar, and assume that all
        # rules are patching existing grammar production by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        assert isinstance(expr, grammar.CallMethod)
        return expr

    def empty(self):
        return []

    def single(self, x):
        return [x]

    def append(self, x, y):
        return x + [y]

    def concat(self, x, y):
        return x + y

    def blank_line(self):
        return []

    def nt_def_to_list(self, nt_def):
        return [nt_def]

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        if reducer is None:
            reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        assert isinstance(lhs, grammar.Nt)
        nt_name = lhs.name

        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
        else:
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)

    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have a
        # copy of LexicalDeclaration that does not trigger ASI.
        #
        # Two productions have body == [";"] -- one for EmptyStatement and one
        # for ClassMember. Neither should trigger ASI.
        #
        # The only other semicolons that should not trigger ASI are the ones in
        # `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and len(p.body) > 1
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,
                                     p.reducer.args[:-1],
                                     None)
        else:
            reducer = p.reducer

        # Except for do-while loops, check at runtime that ASI occurs only at
        # the end of a line.
        if (len(p.body) == 7
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[3] == '('
                and p.body[5] == ')'
                and p.body[6] == ';'):
            code = "do_while_asi"
        else:
            code = "asi"

        return [
            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
                        reducer=reducer),
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
                        reducer=reducer),
        ]

    def expand_lexical_rhs(self, rhs):
        body, reducer, condition = rhs
        out = []
        for e in body:
            if isinstance(e, str):
                # The terminal symbols of the lexical grammar are characters, so
                # add each character of this string as a separate element.
                out += [grammar.Literal(ch) for ch in e]
            else:
                out.append(e)
        return [out, reducer, condition]

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        has_sole_production = (len(rhs_list) == 1)
        production_list = []
        for i, rhs in enumerate(rhs_list):
            if eq == ':':
                # Syntactic grammar. A hack is needed for ASI.
                reducer_was_autogenerated = rhs[1] is None
                p = self.to_production(lhs, i, rhs, has_sole_production)
                if self.needs_asi(lhs, p):
                    production_list += self.apply_asi(p, reducer_was_autogenerated)
                else:
                    production_list.append(p)
            elif eq == '::':
                # Lexical grammar. A hack is needed to replace multicharacter
                # terminals like `!==` into sequences of character terminals.
                rhs = self.expand_lexical_rhs(rhs)
                p = self.to_production(lhs, i, rhs, has_sole_production)
                production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        return self.nt_def(nt_type, nt_lhs, eq, [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        return types.Type(name)

    def lifetime_type(self, name):
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        return terminals

    def terminal(self, t):
        assert t[0] == "`"
        assert t[-1] == "`"
        return t[1:-1]

    def terminal_chr(self, chr):
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        return ([prose], None, None)

    def empty_rhs(self):
        return []

    def expr_match_ref(self, token):
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently "AstBuilder" functions are made fallible using the
        # fallible_methods taken from some Rust code which extract this
        # information to produce a JSON file.
        if self.method_trait == "AstBuilder":
            fallible = None
        return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        return grammar.Some(expr)

    def expr_none(self):
        return None

    def ifdef(self, value, nt):
        return nt, value

    def optional(self, nt):
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        if name in self.terminal_names:
            return name
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        if name in self.terminal_names:
            raise ValueError("parameters applied to terminal {!r}".format(name))
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        if sigil == '?':
            return (argname, grammar.Var(argname))
        else:
            return (argname, sigil)

    def sigil_false(self):
        return False

    def sigil_true(self):
        return True

    def exclusion_terminal(self, t):
        return ("t", t)

    def exclusion_nonterminal(self, nt):
        return ("nt", nt)

    def exclusion_chr_range(self, c1, c2):
        return ("range", c1, c2)

    def la_eq(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

    def la_ne(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
                False)
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

    def chr(self, t):
        assert t[0] == "<" or t[0] == 'U'
        if t[0] == "<":
            assert t[-1] == ">"
            if t not in ECMASCRIPT_CODE_POINTS:
                raise ValueError("unrecognized character abbreviation {!r}".format(t))
            return ECMASCRIPT_CODE_POINTS[t]
        else:
            assert t[1] == "+"
            return grammar.Literal(chr(int(t[2:], base=16)))

def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    nt_grammars = {}
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            raise ValueError(
                "duplicate definitions for nonterminal {!r}"
                .format(nt_name))
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
    goals = list(goals)
    if len(goals) == 0:
        raise ValueError("no goal nonterminals specified")
    if single_grammar:
        selected_grammars = set(nt_grammars[goal] for goal in goals)
        assert len(selected_grammars) != 0
        if len(selected_grammars) > 1:
            raise ValueError(
                "all goal nonterminals must be part of the same grammar; "
                "got {!r} (matching these grammars: {!r})"
                .format(set(goals), set(selected_grammars)))
        [selected_grammar] = selected_grammars

    terminal_set = set()

    def hack_production(p):
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    raise ValueError(
                        "Unrecognized grammar symbol: {!r} (in {!r})"
                        .format(e, p))
                p[i] = token = e[1:-1]
                terminal_set.add(token)

    nonterminals = {}
    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        if single_grammar and eq != selected_grammar:
            continue

        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
        else:
            rhs_list = rhs_list_or_lambda
            for p in rhs_list:
                if not isinstance(p, grammar.Production):
                    raise ValueError(
                        "invalid grammar: ifdef in non-function-call context")
                hack_production(p)
            if nt_name in nonterminals:
                raise ValueError(
                    "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            raise ValueError(
                "grammar contains both a terminal `{}` and nonterminal {}"
                .format(t, t))

    # Add execution modes to generate the various functions needed to handle
    # syntax parsing and full parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes token as input and build an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes token as input but skip building the AST.
    # TODO: The syntax parser is commented out for now, as we need something to
    # be produced when we cannot call the AstBuilder for producing the values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input and lookup if a state
    # can accept some kind of tokens.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions are using an equivalent of Rust types to define the kind of
    # parsers to be used, this map is used to convert these type names to the
    # various execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    type_to_modes = {
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],
    }

    result = grammar.Grammar(
        nonterminals,
        goal_nts=goals,
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    result.patch(extensions)
    return result

def parse_esgrammar(
        text: str,
        *,
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        text += "\n"

    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    lexer.write(text)
    nt_defs = lexer.close()
    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        builder.reset()
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        lexer.write(content)
        result = lexer.close()
        grammar_extensions.append(result)

    if goals is None:
        # Default to the first nonterminal in the input.
        goals = [nt_defs[0][0]]

    return finish_grammar(
        nt_defs,
        goals=goals,
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)

Messung V0.5

¤ Dauer der Verarbeitung: 0.12 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.