多次使用 Spacy 的 Matcher 时出现奇怪的行为

问题描述 投票:0回答:1

我想在跨度(发送)列表中使用 Spacy 的 Matcher

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)

            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)

phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)

但它似乎很困惑,给我这个回应

[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
 ]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]

第一个元素很好但第二个元素不好

{'NP': [built], 'VP': [is, of], 'VVP': []}
如果我们用不同的文本多次使用匹配器会有问题吗?

python nlp spacy
1个回答
0
投票

我没有使用多个句子,而是在回调函数上检查句子ID,它有效但看起来有点恶心

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)

        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)

        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases
© www.soinside.com 2019 - 2024. All rights reserved.