from spacy.matcher import Matcher
# Token matcher bound to the pipeline vocabulary; validate=True asks spaCy to
# check every added pattern against its pattern schema.
matcher = Matcher(nlp.vocab, validate=True)

# Pattern from the question: "play", then a wildcard token repeated zero or
# more times, then one token that is not "store", then "game" or "pacman".
# All comparisons use the lowercase form of the token.
pattern = [
    {'LOWER': 'play'},
    {'OP': '*'},
    {'OP': '!', 'LOWER': 'store'},
    {'LOWER': {'IN': ["game", "pacman"]}},
]

# Patterns are passed as a list: this form is accepted by spaCy v2.2.2+ and is
# the only one accepted by v3, which rejects the old positional callback slot
# (`matcher.add(key, None, pattern)`).
matcher.add('HUNTING', [pattern])
def extract_patterns(nlp_doc, matcher):
    """Return the slices of ``nlp_doc`` that ``matcher`` reports as matches.

    Args:
        nlp_doc: Document to search. It is handed to ``matcher`` and then
            sliced with the offsets the matcher returns.
        matcher: Callable that, given ``nlp_doc``, returns a sized collection
            of ``(match_id, start, end)`` triples.

    Returns:
        A list holding one ``nlp_doc[start:end]`` slice per match, in the
        order the matcher reported them. Overlapping and duplicate matches
        are all kept.
    """
    matches = matcher(nlp_doc)
    # Diagnostic output: how many raw matches the matcher produced.
    print("matches:", len(matches))
    # The match id is not needed; only the token offsets are used.
    return [nlp_doc[start:end] for _match_id, start, end in matches]
# Sample input: two short sentences separated by a newline.
text = 'play store game. \n play with pacman'
doc = nlp(text)
# Print the returned spans; a bare call would silently discard them when this
# snippet is run as a script instead of in an interactive session.
print(extract_patterns(doc, matcher=matcher))
以上代码的返回结果如下:[play with pacman, play store game. \n play with pacman]
但预期的结果是 [play with pacman]。
能否用 spaCy 的 Matcher 实现这一点?
您的模式(pattern)应该写成:
# Exactly three consecutive tokens, each compared on its lowercase form.
pattern = [
    # 1. the word "play"
    {"LOWER": "play"},
    # 2. one token that is anything except "store"
    {"LOWER": "store", "OP": "!"},
    # 3. either "game" or "pacman"
    {"LOWER": {"IN": ["game", "pacman"]}},
]
这样只会得到一个匹配结果 ['play with pacman']:该模式先不区分大小写地匹配词元 play,接着匹配一个不是 store 的词元(同样不区分大小写),最后不区分大小写地匹配 game 或 pacman。原模式中的 {'OP': '*'} 通配词元可以匹配任意数量的词元(包括 store),因此必须去掉。
完整测试代码段:
import spacy
from spacy.matcher import Matcher
# Load the small English pipeline; it supplies the tokenizer and vocabulary.
nlp = spacy.load("en_core_web_sm")

# Token matcher bound to the pipeline vocabulary; validate=True asks spaCy to
# check every added pattern against its pattern schema.
matcher = Matcher(nlp.vocab, validate=True)

# Exactly three consecutive tokens, each compared on its lowercase form:
# "play", one token that is not "store", then "game" or "pacman".
pattern = [
    {'LOWER': 'play'},
    {'OP': '!', 'LOWER': 'store'},
    {'LOWER': {'IN': ["game", "pacman"]}},
]

# Patterns are passed as a list: this form is accepted by spaCy v2.2.2+ and is
# the only one accepted by v3, which rejects the old positional callback slot
# (`matcher.add(key, None, pattern)`).
matcher.add('HUNTING', [pattern])
def extract_patterns(nlp_doc, matcher):
    """Return the distinct texts of the spans ``matcher`` finds in ``nlp_doc``.

    Args:
        nlp_doc: Document to search. It is handed to ``matcher`` and then
            sliced with the offsets the matcher returns; each slice must
            expose a ``text`` attribute.
        matcher: Callable that, given ``nlp_doc``, returns a sized collection
            of ``(match_id, start, end)`` triples.

    Returns:
        A list of unique matched strings, ordered by first occurrence.
    """
    matches = matcher(nlp_doc)
    # Diagnostic output: raw match count, before de-duplication.
    print("matches:", len(matches))
    matched_texts = (
        nlp_doc[start:end].text for _match_id, start, end in matches
    )
    # dict.fromkeys drops duplicates while keeping first-seen order; going
    # through a set would return the texts in arbitrary order.
    return list(dict.fromkeys(matched_texts))
# Sample input: two short sentences separated by a newline.
doc = nlp("play store game. \n play with pacman")
# Print the returned texts; a bare call would silently discard them when this
# snippet is run as a script instead of in an interactive session.
print(extract_patterns(doc, matcher=matcher))