我想在跨度(发送)列表中使用 Spacy 的 Matcher
class Chunker:
def __init__(self, nlp, matcher):
self.nlp = nlp
self.matcher = matcher
self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
def on_match_callback(self, matcher, doc, i, matches):
match_id, start, end = matches[i]
string_id = self.nlp.vocab.strings[match_id]
span = doc[start:end]
print("(", span, ")")
self.phrase[string_id].append(span)
def chunk(self, text):
self.phrases = []
doc = self.nlp(text)
sents = list(doc.sents)
for sent in sents:
self.phrase = {
"NP": [],
"VP": [],
"VVP": []
}
self.phrases.append(self.phrase)
print("[", sent, "]")
self.matcher(sent)
for phrase in self.phrase.values():
phrase.sort(key=lambda x: x.start)
return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)
phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)
但它似乎很困惑,给我这个回应
[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]
第一个元素很好但第二个元素不好
{'NP': [built], 'VP': [is, of], 'VVP': []}
如果我们用不同的文本多次使用匹配器会有问题吗?
我没有使用多个句子,而是在回调函数上检查句子ID,它有效但看起来有点恶心
class Chunker:
def __init__(self, nlp, matcher):
self.nlp = nlp
self.matcher = matcher
self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
def on_match_callback(self, matcher, doc, i, matches):
match_id, start, end = matches[i]
string_id = self.nlp.vocab.strings[match_id]
span = doc[start:end]
sents = list(doc.sents)
sent_id = sents.index(span.sent)
print("(", span, ")")
print("Sentence number: ", sent_id)
self.phrases[sent_id][string_id].append(span)
def chunk(self, text):
self.phrases = []
doc = self.nlp(text)
self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
self.matcher(doc)
for phrases in self.phrases:
for phrase in phrases.values():
phrase.sort(key=lambda x: x.start)
return self.phrases