当文本文件中各部分的顺序与语法规则一致(即先出现 Introduction、后出现 Information)时,以下代码可以正常工作;但如果 Information 出现在 Introduction 之前,就会报错。使用 lex / yacc(PLY)该如何解决这个问题?预先感谢。
import ply.lex as lex
# Token names known to this lexer/parser pair. PLY requires this list.
tokens = [
    # keywords
    'CheckupInformation',
    'Introduction',
    'Information',
    'perfect',
    'sick',
    # punctuation
    'LPAREN',
    'RPAREN',
    # generic lexemes
    'CHAR',
    'NUMBER',
]
# Keyword rules. Each function's docstring is the pattern PLY matches for
# that token; the token is returned unchanged.
def t_CheckupInformation(t):
    'CheckupInformation'
    return t


def t_Introduction(t):
    'Introduction'
    return t


def t_Information(t):
    'Information'
    return t


def t_perfect(t):
    'perfect'
    return t


def t_sick(t):
    'sick'
    return t
# Simple tokens given as regular-expression strings: the two parentheses.
t_LPAREN = r'\('
t_RPAREN = r'\)'
# A letter or underscore followed by letters, digits, underscores or hyphens.
t_CHAR = r'[a-zA-Z_][a-zA-Z0-9_\-]*'
# Characters skipped between tokens: space and tab.
t_ignore = " \t"
# Integer token: the matched text is converted to an int; text that int()
# rejects is reported and replaced by 0.
def t_NUMBER(t):
    r'[+\-0-9_][0-9_]*'
    lexeme = t.value
    # The lexer's lineno counter is advanced by the length of the lexeme.
    t.lexer.lineno = t.lexer.lineno + len(lexeme)
    try:
        number = int(lexeme)
    except ValueError:
        print("Integer value too large %s" % lexeme)
        number = 0
    t.value = number
    return t
def t_SEMICOLON(t):
    r'\;.*'
    # A semicolon and the rest of its line; nothing is returned, so no token
    # is produced for this text.
    consumed = len(t.value)
    t.lexer.lineno = t.lexer.lineno + consumed
def t_newline(t):
    r'\n+'
    # One or more newlines: bump the line counter by how many were matched.
    newline_count = len(t.value)
    t.lexer.lineno = t.lexer.lineno + newline_count
# Lexing error handler: report the offending character and step past it.
def t_error(t):
    offending = t.value[0]
    print("Illegal character '%s'" % offending)
    t.lexer.skip(1)
# Build the lexer from the token rules (t_* names) defined in this module
lexer = lex.lex()
# Container for what is collected while parsing one CheckupInformation.
class stat:
    def __init__(self):
        # An empty statement string plus two independent, initially empty lists.
        self.statement, self.intro, self.body = "", [], []


# Module-level instance that the rest of the script reads and appends to.
P = stat()
def p_stat(p):
    'Stat : LPAREN CheckupInformation statIntro statBody RPAREN'
    # The result is a 5-tuple of every right-hand-side value, in order.
    p[0] = tuple(p[i] for i in range(1, 6))
def p_Intro(p):
    '''statIntro : LPAREN Introduction Name RPAREN
                 | statIntro LPAREN Introduction Name RPAREN
                 | empty'''
    # Where the Name value sits depends on which alternative matched
    # (length 5 -> slot 3, length 6 -> slot 4); any other length yields None.
    name_slot = {5: 3, 6: 4}.get(len(p))
    p[0] = None if name_slot is None else p[name_slot]
    # Every reduction, including the None case, is recorded on the global P.
    P.intro.append(p[0])
def p_Name(p):
    'Name : CHAR'
    # A Name is just the text of its CHAR token.
    char_value = p[1]
    p[0] = char_value
def p_Body(p):
    '''statBody : LPAREN Information bodyinfo RPAREN
                | statBody LPAREN Information bodyinfo RPAREN'''
    # bodyinfo is in slot 4 for the recursive alternative, slot 3 otherwise.
    size = len(p)
    if size == 6:
        p[0] = p[4]
    elif size == 5:
        p[0] = p[3]
    # Record the value on the global P.
    P.body.append(p[0])
def p_bodyinfo(p):
    '''bodyinfo : LPAREN CHAR perfect RPAREN
                | LPAREN CHAR sick RPAREN'''
    # Result is the pair (CHAR text, status keyword).
    identifier, status = p[2], p[3]
    p[0] = (identifier, status)
def p_empty(p):
    'empty : '
    # Trace output only; no value is assigned for the empty production.
    print("This function is called")
# Syntax error handler called by the parser.
#   p -- the offending token, or None when the error is an unexpected end
#        of input (there is no token to report in that case).
def p_error(p):
    if p is None:
        # Without this guard, p.value raises AttributeError at end of input.
        print("Syntax error at end of input!")
        return
    # Wrap in a 1-tuple so a tuple-valued token cannot break the formatting.
    print("Syntax error in input '%s'!" % (p.value,))
import ply.yacc as yacc
# Build the parser from the grammar rules (p_* functions) defined in this module
parser = yacc.yacc()
import sys
# Command-line driver: parse the file named on the command line, then print
# what the grammar actions collected on the global P.
if len(sys.argv) < 2:
    sys.exit("Usage: %s <filename>" % sys.argv[0])
# The context manager closes the file even if reading fails.
with open(sys.argv[1]) as fp:
    contents = fp.read()
result = parser.parse(contents)
print("(CheckupInformation")
if P.intro is not None:
    for name in P.intro:
        print(" (Introduction %s)" % (name))
# Each body entry is expected to be a 2-tuple filling both %s slots.
for info in P.body:
    print(" (Information( %s %s))" % (info))
print(")")
该代码可以正确处理 File1,但无法处理 File2。
错误:Syntax error in input '(Introduction'! 输出:(CheckupInformation (Introduction None) (Information( Anonymous1 perfect)))
File1:
(CheckupInformation
(Introduction John)
(Introduction Patt)
(Information(Anonymous1 perfect))
(Information(Anonymous2 sick))
)
File2:
(CheckupInformation
(Information(Anonymous1 perfect))
(Information(Anonymous2 sick))
(Introduction John)
(Introduction Patt)
)
这可能不是您想要的答案,但我发现仅修改代码中的一两行无法解决问题。下面的代码还远谈不上完美,但我认为它针对您的问题采用了合理的方法。我尽量为它加上了有用的注释。请仔细阅读,尝试理解我为什么这样做,并在必要时参考 Ply 手册(代码注释中给出了部分链接,但文档中还有许多我没有具体引用的有用背景信息)。
祝你好运。
import ply.lex as lex
# Reserved words and the token type each one maps to (keyword-table approach
# from the Ply manual, https://www.dabeaz.com/ply/ply.html#ply_nn6).
reserved = {
    'CheckupInformation': 'TK_CheckupInformation',
    'Introduction':       'TK_Introduction',
    'Information':        'TK_Information',
    'perfect':            'TK_perfect',
    'sick':               'TK_sick',
}

# Generic tokens first, then one token type per reserved word.
# (WORD replaces the earlier name CHAR, which suggested a single character.)
tokens = ['NUMBER', 'WORD']
tokens.extend(reserved.values())
def t_WORD(t):
    r'[a-zA-Z_][a-zA-Z0-9_-]*'
    # Reserved words get their own token type; everything else stays a WORD.
    token_type = reserved.get(t.value, 'WORD')
    t.type = token_type
    return t
# Single-character literal tokens; the grammar refers to them as '(' and ')'.
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn11
literals = '()'
# Characters skipped between tokens: space, tab and newline.
# See the Ply manual: https://www.dabeaz.com/ply/ply.html#ply_nn8
t_ignore = ' \t\n'
# Discarded text: a semicolon and the rest of its line.
t_ignore_COMMENT = r'\;.*'
# Integer literal: an optional leading sign, then digits. Single underscores
# may separate digit groups (e.g. 1_000); int() accepts that form on
# Python 3.6+.
def t_NUMBER(t):
    r'[+-]?[0-9]+(?:_[0-9]+)*'
    # The pattern admits only text that int() can parse. A lone sign or a
    # stray, doubled or trailing underscore is no longer lexed as a NUMBER
    # and then reported as "too large".
    try:
        t.value = int(t.value)
    except ValueError:
        # Should now be reachable only when the interpreter refuses the
        # conversion, e.g. a literal beyond its integer-string digit limit.
        print("Integer value too large %s" % t.value)
        t.value = 0
    return t
# Lexing error handler: report the offending character with its line number,
# then skip that single character so lexing can continue.
# lineno_for_token is looked up on the lexer at call time; it is attached to
# lex.Lexer further down in this file.
def t_error(t):
    # The format string previously ended with a stray quote after %d.
    print("Illegal character '%s' at line %d" % (
        t.value[0], t.lexer.lineno_for_token(t)))
    t.lexer.skip(1)
# Build the lexer from the token rules (t_* names, tokens, literals) defined in this module
lexer = lex.lex()
# Ply already records the character offset of each token as its lexpos
# attribute (and the lexer's own position as lexer.lexpos), so lineno does
# not need to be repurposed for that; lineno should mean the line number.
# Line numbers are not used elsewhere in this program except in error
# messages, so they are computed on demand by counting the newlines that
# precede the token's offset rather than being tracked during lexing.
# Perhaps this should be attached only to the lexer built above.
def _lineno_for_token(self, t):
    """Return the 1-based line number of token t within self.lexdata."""
    newlines_before = self.lexdata.count('\n', 0, t.lexpos)
    return 1 + newlines_before


lex.Lexer.lineno_for_token = _lineno_for_token
# Object to hold a top-level description.
# (Class name capitalised, and derived from object.)
class Stat(object):
    """Parsed CheckupInformation, with one list attribute per component.

    component_dict maps component names to lists of values. Every name in
    ``components`` becomes an attribute; names missing from the dict get a
    fresh empty list. Keys not listed in ``components`` are reported and
    ignored.
    """

    # Attributes used for components
    components = {'intro', 'body'}

    def __init__(self, component_dict):
        self.statement = ""  # I don't know what this is used for
        # Copy the components dictionary as attributes. The default is an
        # empty list (it used to be an empty tuple), so an attribute has the
        # same type whether or not the component appeared in the input.
        for k in self.components:
            setattr(self, k, component_dict.get(k, []))
        # Verify that we used every key in the dict.
        for k in component_dict:
            if k not in self.components:
                # %-formatting also copes with non-string keys.
                print("Warning! Ignoring %s because it is not in Stat.components"
                      % (k,))

    # Arrange for the object to print as expected
    def __repr__(self):
        return '(CheckupInformation %r %r)' % (self.intro, self.body)
# The parse result is a Stat object built from the collected components,
# rather than state accumulated on a module-level global.
def p_stat(p):
    """ stat : '(' TK_CheckupInformation components ')' """
    collected = p[3]
    p[0] = Stat(collected)
# Components are all optional, may come in any order, and may repeat; this
# could be made stricter. The value of `components` is a dictionary whose
# values are lists. The empty alternative starts with an empty dictionary.
def p_components_empty(p):
    """ components : """
    p[0] = dict()
def p_components_append(p):
    """ components : components component """
    # Reuse the dictionary built so far as this production's value.
    collected = p[1]
    p[0] = collected
    # The component is a two-element (key, value) tuple; add the value to
    # the key's list, creating the list on first use.
    key, value = p[2]
    collected.setdefault(key, []).append(value)
# One component of any kind (a single element, not a list). The value is
# passed through unchanged from whichever alternative matched.
def p_component(p):
    """ component : statIntro
                  | statBody
    """
    pair = p[1]
    p[0] = pair
def p_statIntro(p):
    """statIntro : '(' TK_Introduction WORD ')' """
    # Tag the WORD with the 'intro' key.
    name = p[3]
    p[0] = ('intro', name)
def p_statBody(p):
    """statBody : '(' TK_Information bodyinfo ')' """
    # Tag the bodyinfo value with the 'body' key.
    info = p[3]
    p[0] = ('body', info)
# The value of bodyinfo is the pair (identifier, status).
def p_bodyinfo(p):
    """bodyinfo : '(' WORD TK_perfect ')'
                | '(' WORD TK_sick ')'
    """
    identifier, status = p[2], p[3]
    p[0] = (identifier, status)
# Syntax error handler called by the parser.
#   p -- the offending token, or None when the error is an unexpected end
#        of input (there is no token, hence no value or line to report).
def p_error(p):
    if p is None:
        # Without this guard, p.value raises AttributeError at end of input.
        print("Syntax error at end of input!")
        return
    print("Syntax error in input '%s'! at line %d" % (
        p.value, p.lexer.lineno_for_token(p)))
import ply.yacc as yacc
# Build the parser from the grammar rules (p_* functions) defined in this module
parser = yacc.yacc()
# Command-line entry point; skipped when this module is imported.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <filename>" % sys.argv[0])
    with open(sys.argv[1]) as fp:
        stat = parser.parse(fp.read())
    # Print only when parsing produced a result object.
    if stat is not None:
        print("(CheckupInformation")
        for name in stat.intro:
            print(" (Introduction %s)" % (name))
        # Each body entry is a 2-tuple filling both %s slots.
        for info in stat.body:
            print(" (Information( %s %s))" % (info))
        print(")")