如何检查 token 是否为 EOF,以及如何获取 EOF 之前的 token

问题描述 投票:0回答:1

我有一个

CustomErrorListener
覆盖了
syntaxError
。当最后一个标记(在
EOF
之前)存在语法错误时,
syntaxError
方法会将
EOF
报告为有问题的符号。

这是我的顶级语法规则:

program: statementSeparator* version statements statementSeparator* EOF;

某些程序的结尾如下:

h q[0, 4

错误会在 `4` 之后(即 `EOF` 所在的位置)立即被报告。尽管在我看来,真正有问题的符号是 `q[0, 4`,它缺少结尾的 `]`。

  1. 语法是否有问题,导致

    EOF
    之前的标记不会被报告为违规符号?

  2. 如果语法没问题:

    a) 是否有正确的方法来检查

    EOF
    令牌?以及

    b) 是否可以在

    EOF
    之前访问令牌?

词法分析器

lexer grammar CqasmLexer;

// White spaces and comments are skipped, i.e. not passed to the parser
WHITE_SPACE: [ \t]+ -> skip;
SINGLE_LINE_COMMENT: '//' ~[\r\n]* -> skip;
MULTI_LINE_COMMENT: '/*' .*? '*/' -> skip;

// Signs
// NEW_LINE is deliberately NOT skipped: the parser uses it as a statement separator.
NEW_LINE: '\r'?'\n';
SEMICOLON: ';';
COLON: ':';
COMMA: ',';
DOT: '.';
EQUALS: '=';
OPEN_BRACKET: '[';
CLOSE_BRACKET: ']';
OPEN_BRACE: '{';
CLOSE_BRACE: '}';
OPEN_PARENS: '(';
CLOSE_PARENS: ')';
PLUS: '+';  // this token is shared by UNARY_PLUS_OP and PLUS_OP
MINUS: '-';  // this token is shared by UNARY_MINUS_OP and MINUS_OP

// Operators
// Multi-character operators (e.g. '**', '>=', '&&') win over their single-character
// prefixes because the lexer always prefers the longest match, regardless of rule order.
// UNARY_PLUS_OP: '+';
// UNARY_MINUS_OP: '-';
BITWISE_NOT_OP: '~';
LOGICAL_NOT_OP: '!';
POWER_OP: '**';
PRODUCT_OP: '*';
DIVISION_OP: '/';
MODULO_OP: '%';
// PLUS_OP: '+';
// MINUS_OP: '-';
SHL_OP: '<<';
SHR_OP: '>>';
CMP_GT_OP: '>';
CMP_LT_OP: '<';
CMP_GE_OP: '>=';
CMP_LE_OP: '<=';
CMP_EQ_OP: '==';
CMP_NE_OP: '!=';
BITWISE_AND_OP: '&';
BITWISE_XOR_OP: '^';
BITWISE_OR_OP: '|';
LOGICAL_AND_OP: '&&';
LOGICAL_XOR_OP: '^^';
LOGICAL_OR_OP: '||';
TERNARY_CONDITIONAL_OP: '?';

// Keywords
// Keywords are declared before IDENTIFIER so exact keyword text is not lexed
// as an identifier (same length, earlier rule wins).
VERSION: 'version' -> pushMode(VERSION_STATEMENT);
MEASURE: 'measure';
QUBIT_TYPE: 'qubit';
BIT_TYPE: 'bit';
AXIS_TYPE: 'axis';
BOOL_TYPE: 'bool';
INT_TYPE: 'int';
FLOAT_TYPE: 'float';

// Numeric literals
BOOLEAN_LITERAL: 'true' | 'false';
INTEGER_LITERAL: Digit+;
FLOAT_LITERAL:
    Digit+ '.' Digit+ Exponent?
    | Digit+ '.' Exponent?  // float literals can end with a dot
    | '.' Digit+ Exponent?;  // or just start with a dot
fragment Digit: [0-9];
fragment Exponent: [eE][-+]?Digit+;

// Identifier
IDENTIFIER: Letter (Letter | Digit)*;
fragment Letter: [a-zA-Z_];

// Version mode
//
// Whenever we encounter a 'version' token, we enter the Version mode
// Within the version mode, a sequence such as '3.0' will be treated as a version number, and not as a float literal
mode VERSION_STATEMENT;
VERSION_WHITESPACE: [ \t]+ -> skip;
VERSION_NUMBER: Digit+ ('.' Digit+)? -> popMode;

解析器

parser grammar CqasmParser;

options {
    tokenVocab = CqasmLexer;
}

// Entry rule: a version line followed by statements, anchored at EOF so the
// entire input must be consumed for a successful parse.
program: statementSeparator* version statements statementSeparator* EOF;

version: VERSION VERSION_NUMBER;

// Every statement is preceded by at least one separator (newline or ';').
statements: (statementSeparator+ statement)*;

statementSeparator: NEW_LINE | SEMICOLON;

statement:
    QUBIT_TYPE arraySizeDeclaration? IDENTIFIER  # qubitTypeDeclaration
    | BIT_TYPE arraySizeDeclaration? IDENTIFIER  # bitTypeDeclaration
    | AXIS_TYPE IDENTIFIER (EQUALS expression)?  # axisTypeDeclaration
    | BOOL_TYPE arraySizeDeclaration? IDENTIFIER (EQUALS expression)?  # boolTypeDeclaration
    | INT_TYPE arraySizeDeclaration? IDENTIFIER (EQUALS expression)?  # intTypeDeclaration
    | FLOAT_TYPE arraySizeDeclaration? IDENTIFIER (EQUALS expression)?  # floatTypeDeclaration
    | expression EQUALS MEASURE expression  # measureInstruction
    | IDENTIFIER expressionList  # instruction
    ;

arraySizeDeclaration: OPEN_BRACKET INTEGER_LITERAL CLOSE_BRACKET;

expressionList: expression (COMMA expression)*;

indexList: indexEntry (COMMA indexEntry)*;

indexEntry:
    expression  # indexItem
    | expression COLON expression  # indexRange
    ;

// Alternatives are ordered from highest to lowest precedence; ANTLR rewrites
// this left-recursive rule using exactly this ordering. <assoc=right> marks
// right-associative operators (unary, power, ternary).
expression:
    OPEN_PARENS expression CLOSE_PARENS  # parensExpression
    | <assoc=right> (PLUS | MINUS) expression  # unaryPlusMinusExpression
    | <assoc=right> BITWISE_NOT_OP expression  # bitwiseNotExpression
    | <assoc=right> LOGICAL_NOT_OP expression  # logicalNotExpression
    | <assoc=right> expression POWER_OP expression  # powerExpression
    | expression (PRODUCT_OP | DIVISION_OP | MODULO_OP) expression  # productExpression
    | expression (PLUS | MINUS) expression  # additionExpression
    | expression (SHL_OP | SHR_OP) expression  # shiftExpression
    | expression (CMP_GT_OP | CMP_LT_OP | CMP_GE_OP | CMP_LE_OP) expression  # comparisonExpression
    | expression (CMP_EQ_OP | CMP_NE_OP) expression  # equalityExpression
    | expression BITWISE_AND_OP expression  # bitwiseAndExpression
    | expression BITWISE_XOR_OP expression  # bitwiseXorExpression
    | expression BITWISE_OR_OP expression  # bitwiseOrExpression
    | expression LOGICAL_AND_OP expression  # logicalAndExpression
    | expression LOGICAL_XOR_OP expression  # logicalXorExpression
    | expression LOGICAL_OR_OP expression  # logicalOrExpression
    | <assoc=right> expression TERNARY_CONDITIONAL_OP expression COLON expression  # ternaryConditionalExpression
    | IDENTIFIER OPEN_PARENS expressionList? CLOSE_PARENS  # functionCall
    // NOTE(review): a missing CLOSE_BRACKET here is reported at the *next* token
    // (EOF when the input ends), which is the behavior asked about in the question.
    | IDENTIFIER OPEN_BRACKET indexList CLOSE_BRACKET  # index
    | IDENTIFIER  # identifier
    | OPEN_BRACKET expression COMMA expression COMMA expression CLOSE_BRACKET  # axisInitializationList
    | OPEN_BRACE expressionList CLOSE_BRACE  # initializationList
    | BOOLEAN_LITERAL  # booleanLiteral
    | INTEGER_LITERAL  # integerLiteral
    | FLOAT_LITERAL  # floatLiteral
    ;
antlr antlr4
1个回答
0
投票

规则部分

   | IDENTIFIER OPEN_BRACKET indexList CLOSE_BRACKET  # index

因缺少

CLOSE_BRACKET
令牌而失败。您的输入文本与该点匹配,因此 Antlr 的解析器标记索引在断定它不是
CLOSE_BRACKET
标记(在您的情况下为
EOF
标记)时指向下一个标记。因此,这就是已识别错误的位置(如 @Joachim Sauer 所指出)。

识别实际的错误源,无论是在词法分析器还是解析器中,都可以通过向解析器添加错误策略处理程序来完成

/**
 * Error strategy that reroutes parser errors to the registered error listeners,
 * tagging each report with a typed exception so listeners can distinguish the
 * failure mode (extraneous token vs. missing token).
 */
public class ParserErrorStrategy extends DefaultErrorStrategy {

    private static final String MISSING = "Missing %s at %s";
    private static final String EXPECT = "Extraneous input %s; expecting %s";

    /** Reports a token that should not be present at the current position. */
    @Override
    protected void reportUnwantedToken(Parser recognizer) {
        if (inErrorRecoveryMode(recognizer)) return;
        beginErrorCondition(recognizer);

        Token offending = recognizer.getCurrentToken();
        IntervalSet expected = getExpectedTokens(recognizer);
        String msg = String.format(EXPECT,
                getTokenErrorDisplay(offending),
                expected.toString(recognizer.getVocabulary()));
        recognizer.notifyErrorListeners(offending, msg, new UnwantedTokenException(recognizer));
    }

    /** Reports a token the parser expected but did not find in the input. */
    @Override
    protected void reportMissingToken(Parser recognizer) {
        if (inErrorRecoveryMode(recognizer)) return;
        beginErrorCondition(recognizer);

        Token offending = recognizer.getCurrentToken();
        IntervalSet expected = getExpectedTokens(recognizer);
        String msg = String.format(MISSING,
                expected.toString(recognizer.getVocabulary()),
                getTokenErrorDisplay(offending));
        recognizer.notifyErrorListeners(offending, msg, new MissingTokenException(recognizer));
    }
}

以及解析器和词法分析器的识别器错误侦听器

/**
 * Recognizer error listener for both the lexer and the parser: counts errors,
 * forwards each syntax error to the tool's problem reporter and, in debug mode,
 * dumps the rule invocation stack plus the tokens leading up to the error.
 *
 * <p>NOTE(review): the class name misspells "Recognizer"; renaming would break
 * external references, so it is only flagged here.
 */
public class RecongizerErrorListener extends BaseErrorListener {

    private final Tool tool;
    private final ParseRecord rec;
    // Index of the last token already dumped in a debug report; prevents
    // re-printing the same token window for consecutive errors.
    private int lastErrorIdx = -1;

    public RecongizerErrorListener(Tool tool, ParseRecord rec) {
        this.tool = tool;
        this.rec = rec;
    }

    @Override
    public void syntaxError(Recognizer<?, ?> recognizer, Object symbol, int line, int charPositionInLine,
            String msg, RecognitionException e) {

        // Running error count for this parse.
        rec.errs++;

        // Parser errors pass a Token as the offending symbol; otherwise null.
        Token offendingToken = symbol instanceof Token ? (Token) symbol : null;
        String cause = GrammarUtil.evalError(recognizer, offendingToken, line, charPositionInLine, msg, e);

        tool.syntaxProblem(ErrorDesc.SYNTAX_ERROR, recognizer.getGrammarFileName(), offendingToken, e, cause);

        if (tool.debug() && offendingToken != null) {
            int thisErrorIdx = offendingToken.getTokenIndex();
            int type = offendingToken.getType();
            // Flag a typeless/EOF-like offending token (Token.EOF is -1) sitting at
            // the very end of the token stream as an unexpected error location.
            if (type <= -1 && thisErrorIdx >= rec.ts.size() - 1) {
                tool.toolProblem("Unexpected syntax error token type '%d' error: %s ", type, cause);
            }

            // Limit the dump window to at most the 10 tokens preceding this error.
            if (thisErrorIdx > lastErrorIdx + 10) {
                lastErrorIdx = thisErrorIdx - 10;
            }
            List<String> tokenStack = new ArrayList<>();
            for (int idx = lastErrorIdx + 1; idx <= thisErrorIdx; idx++) {
                Token token = rec.ts.get(idx);
                String name = recognizer.getVocabulary().getDisplayName(token.getType());
                String text = Strings.ellipsize(token.getText(), 12);
                tokenStack.add(String.format("@%s %s[%s] %s:%s", token.getTokenIndex(), name, text,
                        token.getLine(), token.getCharPositionInLine() + 1));
            }
            lastErrorIdx = thisErrorIdx;

            // NOTE(review): this cast assumes every error with a non-null offending
            // token comes from the parser — confirm; a lexer recognizer here would
            // throw ClassCastException.
            Parser parser = (Parser) recognizer;
            List<String> ruleStack = parser.getRuleInvocationStack();
            Collections.reverse(ruleStack);  // outermost rule first

            String rules = String.join("->", ruleStack);
            String tokens = String.join("=>", Strings.encode(tokenStack));

            tool.toolProblem("%s: %s\n\tRules  : %s\n\tTokens : %s\n", msg,
                    ((CommonToken) offendingToken).toString(parser), rules, tokens);
        }
    }

    @Override
    public void reportAmbiguity(Parser parser, DFA dfa, int startIndex, int stopIndex, boolean exact,
            BitSet ambigAlts, ATNConfigSet configs) {

        if (tool.debug()) {
            // NOTE(review): reportAmbiguity's startIndex is a token index, while
            // GrammarUtil.find documents a *character* offset — verify which is intended.
            Token token = GrammarUtil.find(rec.ts, startIndex);
            if (token != null) {
                String cause = GrammarUtil.evalAmbiguity(parser, dfa, startIndex, stopIndex, exact, ambigAlts,
                        configs);
                tool.syntaxProblem(ErrorDesc.AMBIG_WARN, parser.getSourceName(), token, null, cause);
            }
        }
    }
}

辅助类(Helpers)

/**
 * Static helpers for extracting text from parse trees/token streams and for
 * formatting recognition errors and ambiguity reports.
 */
public class GrammarUtil {

private static final String AmbMsg = "Ambiguity %s: for alts %s at '%s'";

// Orders tokens by their character start offset within the input stream.
private static final Comparator<Token> TokenComparator =
        Comparator.comparingInt(Token::getStartIndex);

private GrammarUtil() {}

// Renders a token as 'text' <NAME>; the %S conversion upper-cases the
// vocabulary display name.
private static String getTokenText(Vocabulary vocab, Token token) {
    String name = vocab.getDisplayName(token.getType());
    return String.format("'%s' <%S>", token.getText(), name);
}

/**
 * Returns the full text contained between the end-points of the given nodes.
 *
 * @param nodes a spanning list of nodes
 * @return the contained text, or the empty string for a {@code null}/empty list
 */
public static String getText(List<TerminalNode> nodes) {
    if (nodes == null || nodes.isEmpty()) return Strings.EMPTY;

    Token beg = nodes.get(0).getSymbol();
    Token end = nodes.get(nodes.size() - 1).getSymbol();
    CharStream cs = beg.getInputStream();
    return cs.getText(Interval.of(beg.getStartIndex(), end.getStopIndex()));
}

/**
 * Return the combined text of all leaf nodes. Does not get any off-channel tokens (if
 * any) so won't return whitespace and comments if they are sent to parser on hidden
 * channel. Returns the default value if the node is {@code null}.
 */
public static String getText(ParseTree node, String def) {
    return node != null ? node.getText() : def;
}

/**
 * Returns the underlying text delimited by the given list of consecutive parser rule
 * context nodes.
 *
 * @param nodes consecutive parser nodes
 * @return the underlying text, or the empty string for a {@code null}/empty list
 */
public static String getRuleText(List<? extends ParserRuleContext> nodes) {
    if (nodes == null || nodes.isEmpty()) return Strings.EMPTY;

    Token beg = nodes.get(0).getStart();
    Token end = nodes.get(nodes.size() - 1).getStop();
    CharStream cs = beg.getInputStream();
    return cs.getText(Interval.of(beg.getStartIndex(), end.getStopIndex()));
}

/**
 * Returns the text of the given values joined using the given separator. The element
 * type is sniffed from the first value only; lists are assumed to be homogeneous.
 */
public static String join(CharSequence sep, List<? extends Object> values) {
    if (values == null || values.isEmpty()) return Strings.EMPTY;

    Object first = values.get(0);
    if (first instanceof String) return String.join(sep, values.toArray(new String[0]));
    if (first instanceof Token)
        return values.stream().map(v -> ((Token) v).getText()).collect(Collectors.joining(sep));
    if (first instanceof ParseTree)
        return values.stream().map(v -> ((ParseTree) v).getText()).collect(Collectors.joining(sep));
    return values.stream().map(Object::toString).collect(Collectors.joining(sep));
}

/**
 * Returns the token containing the given {@code startIndex} character offset or
 * {@code null}.
 *
 * <p>Fixed: the previous implementation returned a token only when its start offset
 * was exactly {@code startIndex}; an offset falling *inside* a token's span wrongly
 * returned {@code null}, contradicting this method's documented contract.
 *
 * @param stream     the token stream
 * @param startIndex character offset of a token
 * @return the token containing the character offset or {@code null}
 */
public static Token find(TokenStream stream, int startIndex) {
    if (stream == null || startIndex < 0) return null;

    List<Token> tokens = ((CommonTokenStream) stream).getTokens();
    CommonToken key = new CommonToken(0);
    key.setStartIndex(startIndex);
    int idx = Collections.binarySearch(tokens, key, TokenComparator);
    if (idx >= 0) return tokens.get(idx);  // a token starts exactly at the offset

    // binarySearch returned -(insertionPoint) - 1, so the token starting just
    // before the offset sits at insertionPoint - 1. It contains the offset iff
    // its [start, stop] span covers it.
    int prev = -idx - 2;
    if (prev >= 0 && prev < tokens.size()) {
        Token candidate = tokens.get(prev);
        if (candidate.getStartIndex() <= startIndex && startIndex <= candidate.getStopIndex()) {
            return candidate;
        }
    }
    return null;
}

/** Builds a human-readable description of an ambiguity report. */
public static String evalAmbiguity(Parser recognizer, DFA dfa, int startIndex, int stopIndex,
        boolean exact, BitSet ambigAlts, ATNConfigSet configs) {

    String decision = getDecisionDescription(recognizer, dfa);
    BitSet conflictingAlts = getConflictingAlts(ambigAlts, configs);

    String text = recognizer.getInputStream().getText(Interval.of(startIndex, stopIndex));
    text = TxtUtil.wrap(64, text);
    text = Strings.ellipsize(text, 256);
    text = Strings.encode(text);
    text = Strings.formatEscape(text);
    return String.format(AmbMsg, decision, conflictingAlts, text);
}

/**
 * Builds a human-readable cause message for a syntax error, dispatching on the
 * concrete {@link RecognitionException} subtype. The returned string contains a
 * literal {@code %s:%s} placeholder pair that the caller later fills with the
 * line/column position.
 *
 * <p>NOTE(review): {@code token} may be {@code null} (lexer errors carry a null
 * offending symbol). The lexer branch does not dereference it, but the
 * {@code e == null} branch does — confirm that branch is only reached for
 * parser-side errors.
 */
public static String evalError(Recognizer<?, ?> recognizer, Token token, int line, int charPos,
        String msg, RecognitionException e) {

    String expected = getExpected(e);
    Vocabulary vocab = recognizer.getVocabulary();

    String cause;
    if (e == null) {
        // No exception: report the listener-provided message verbatim.
        cause = Strings.capitalize(msg) + " at %s:%s " + getTokenText(vocab, token);

    } else if (e instanceof InputMismatchException) {
        cause = "Mismatched input " + getTokenText(vocab, token) + " at %s:%s " + expected;

    } else if (e instanceof NoViableAltException) {
        // Show the input span from the decision start token to the offending token.
        String input = "<unknown>";
        TokenStream ts = ((Parser) recognizer).getInputStream();
        if (ts != null) {
            NoViableAltException ne = (NoViableAltException) e;
            if (ne.getStartToken().getType() == Token.EOF) {
                input = "<EOF>";
            } else {
                input = ts.getText(ne.getStartToken(), ne.getOffendingToken());
                input = Strings.encode(input);
            }
        }
        cause = "No viable alternative for input '" + input + "' at %s:%s " + getTokenText(vocab, token);

    } else if (e instanceof LexerNoViableAltException) {
        // Lexer failure: show the single character the lexer choked on, if valid.
        LexerNoViableAltException le = (LexerNoViableAltException) e;
        int start = le.getStartIndex();
        String txt = "<?>";
        if (start >= 0 && start < le.getInputStream().size()) {
            txt = le.getInputStream().getText(Interval.of(start, start));
            txt = Strings.encode(txt);
        }
        cause = "Lexer: no viable alternative for input '" + txt + "' at %s:%s";

    } else if (e instanceof FailedPredicateException) {
        FailedPredicateException fe = (FailedPredicateException) e;
        cause = String.format("Failed predicate '{%s}?'", fe.getPredicate());
        cause += " at %s:%s " + getTokenText(vocab, token);

    } else if (e instanceof UnwantedTokenException) {
        cause = "Extraneous input " + getTokenText(vocab, token) + " at %s:%s " + expected;

    } else if (e instanceof MissingTokenException) {
        cause = "Missing input " + expected + " at %s:%s " + getTokenText(vocab, token);

    } else {
        cause = String.format("Unknown recognition error of type '%s'", e.getClass().getSimpleName())
                + " at %s:%s " + expected;
    }

    cause = TxtUtil.wrap(64, cause);
    cause = Strings.ellipsize(cause, 256);
    return cause;
}

// Returns a description of the expected tokens at the error site, or the empty
// string if the exception carries none.
private static String getExpected(RecognitionException e) {
    if (e == null) return Strings.EMPTY;
    IntervalSet expected = null;
    try {
        expected = e.getExpectedTokens();
    } catch (Exception ignored) {
        // Best effort: some exception states cannot produce an expected-token set.
    }
    if (expected == null || expected.isNil()) return Strings.EMPTY;

    StringBuilder sb = new StringBuilder("; expected {");
    Vocabulary vocab = e.getRecognizer().getVocabulary();
    for (int ttype : expected.toList()) {
        String typename = vocab.getDisplayName(ttype);
        sb.append(String.format("'%s', ", typename));
    }
    // Drop the trailing ", " before closing the brace.
    if (sb.length() > 2) sb.setLength(sb.length() - 2);
    sb.append("}");
    return sb.toString();
}

/** Returns "decision (ruleName)" for the given DFA, or just the decision number. */
public static String getDecisionDescription(Parser recognizer, DFA dfa) {
    int decision = dfa.decision;
    int ruleIndex = dfa.atnStartState.ruleIndex;

    String[] ruleNames = recognizer.getRuleNames();
    if (ruleIndex < 0 || ruleIndex >= ruleNames.length) {
        return String.valueOf(decision);
    }

    String ruleName = ruleNames[ruleIndex];
    if (ruleName == null || ruleName.isEmpty()) {
        return String.valueOf(decision);
    }

    return String.format("%d (%s)", decision, ruleName);
}

/**
 * Computes the set of conflicting or ambiguous alternatives from a configuration set,
 * if that information was not already provided by the parser.
 *
 * @param reportedAlts The set of conflicting or ambiguous alternatives, as reported
 *                     by the parser.
 * @param configs      The conflicting or ambiguous configuration set.
 * @return Returns {@code reportedAlts} if it is not {@code null}, otherwise returns
 *         the set of alternatives represented in {@code configs}.
 */
public static BitSet getConflictingAlts(BitSet reportedAlts, ATNConfigSet configs) {
    if (reportedAlts != null) return reportedAlts;

    BitSet result = new BitSet();
    for (ATNConfig config : configs) {
        result.set(config.alt);
    }
    return result;
}
}

(此代码中缺少一些次要的辅助方法,但它们的功能应该是显而易见的。)

© www.soinside.com 2019 - 2024. All rights reserved.