使这个经过训练的命名实体代码一次使用一个句子

问题描述 投票:0回答:0
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import codecs

#f = codecs.open(dir+location, 'r', encoding='utf-8')
#data = f.read() load the ner dataset
data = pd.read_csv(r"C:\python\python3.7.8\tfchatenv\chatbotgui.py\data\ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

data.tail(10)
words = list(set(data["Word"].values))
n_words = len(words); n_words

class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
# get all the sentences
getter = SentenceGetter(data)

sent = getter.get_next()
print(sent)

sentences = getter.sentences
# addd the features 
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
#make a prediction
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
    all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
    calibration_candidates=None, calibration_eta=None,
    calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None,
    num_memories=None, pa_type=None, period=None, trainer_cls=None,
    variance=None, verbose=False)

import eli5

#get the weights
crf = CRF(algorithm='lbfgs',
c1=10,
c2=0.1,
max_iterations=100,
all_possible_transitions=False)

pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
    all_possible_transitions=False, averaging=None, c=None, c1=10, c2=0.1,
    calibration_candidates=None, calibration_eta=None,
    calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None,
    num_memories=None, pa_type=None, period=None, trainer_cls=None,
    variance=None, verbose=False)

eli5.show_weights(crf, top=30)

如何使用这段代码提取单个句子的 NER?

我真的不知道你如何通过写一个句子将它用于 NER。不知道拿什么做参考。我希望它能用适当的命名实体标签解析一个句子。

named-entity-recognition crf
© www.soinside.com 2019 - 2024. All rights reserved.