在 spaCy 2.0.11 中训练自定义 NER 模型时，Jupyter 内核崩溃 / Spyder 控制台停止响应

问题描述 投票：1 回答：1





CarryBag,09038820815c.txt
Stopperneedle,0903882080f4.txt
Foilbags,09038820819.txt


"""Build PRODUCT training examples with a PhraseMatcher and train spaCy's NER.

Each row of the CSV is read as ``product, filename``: the product name(s) are
added to a PhraseMatcher, the named text file is cleaned and parsed, and every
match becomes a character-offset entity annotation used for training.
"""
import spacy
# import en_core_web_sm
import re
import csv
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random


def str_index_conversion(lbl, doc, matchitem):
    """Convert one PhraseMatcher match into a ``(start, end, label)`` triple.

    Args:
        lbl: Entity label to attach to the span.
        doc: The parsed ``Doc`` the match was found in.
        matchitem: A ``(match_id, start_token, end_token)`` tuple as returned
            by the matcher.

    Returns:
        ``(start_char, end_char, lbl)`` with character offsets into the
        document text.
    """
    # Use the span's own character offsets. Measuring len(str(doc[0:start]))
    # undercounts, because a span's text omits the trailing whitespace of its
    # last token, which shifts every entity that follows a space.
    span = doc[matchitem[1]:matchitem[2]]
    return (span.start_char, span.end_char, lbl)


def _clean_text(text):
    """Normalise raw file text: collapse whitespace, strip URLs, punctuation and digits."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[-\"#/@;:<>?{}*`• ?+=~|$.!‘?“”?,_]", " ", text)
    text = re.sub(r'\d+', '', text)  # removing all numbers
    text = re.sub(' +', ' ', text)
    return text.lower()


# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('PRODUCT')

DIR = 'D:/Docs/'
matcher = PhraseMatcher(nlp.vocab)

list_str_index = []
to_train_ents = []
with open(r'D:\ner_dummy_pack.csv', newline='', encoding='utf-8') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        try:
            product = row[0].lower()
            filename = row[1]
            # Close each document as soon as it has been read.
            with open(DIR + filename, "r", encoding='utf-8') as file:
                print(file)
                filecontents = file.read()
            if not filecontents:
                # Nothing to annotate in an empty file.
                continue
            filecontents = _clean_text(filecontents)

            if "," in product:
                # Skip empty fragments such as those left by a trailing comma.
                product_patterns = [i.strip() for i in product.split(',') if i.strip()]
                for elem in product_patterns:
                    matcher.add('PRODUCT', None, nlp(elem))
            else:
                matcher.add('PRODUCT', None, nlp(product))

            print(filecontents)
            doc = nlp(filecontents)
            matches = matcher(doc)
            list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
            to_train_ents.append((filecontents, dict(entities=list_str_index)))
        except Exception as e:
            # Best effort: report the failing row and carry on with the next one.
            print(e)

to_train_entsfinal = to_train_ents


def main(model=None, output_dir=None, n_iter=100):
    """Train the NER pipe on ``to_train_entsfinal`` and save the pipeline.

    Args:
        model: Unused; the module-level ``nlp`` object is trained.
        output_dir: Directory to save the pipeline to. Defaults to
            ``C:\\Users\\APRIL``.
        n_iter: Accepted for compatibility; the loop below runs a fixed
            10 passes over the data.
    """
    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update([item[0]], [item[1]], sgd=optimizer, drop=0.50, losses=losses)
            print(losses)
    print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    # Create missing parent directories too instead of failing on them.
    noutput_dir.mkdir(parents=True, exist_ok=True)
    #nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)


if __name__ == '__main__':
    main()


Blister,abc.txt
Blisterpack,abc.txt
Blisters,abc.txt



def main(model=None, output_dir=None, n_iter=100):
    """Train the NER pipe on ``to_train_entsfinal`` and save the pipeline.

    Two nested helpers sketch a memory safeguard: ``handle_memory`` dumps the
    given object's patterns and re-initialises training once system memory use
    passes a limit. The training loop below does not call them.

    Args:
        model: Unused; the module-level ``nlp`` object is trained.
        output_dir: Directory to save the pipeline to. Defaults to
            ``C:\\Users\\APRIL``.
        n_iter: Accepted for compatibility; the loop below runs a fixed
            10 passes over the data.
    """
    top_memory_precentage_use = 75  # or whatever number you choose

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)

    def handle_memory(ruler):
        """Dump ``ruler`` and restart training when memory use exceeds the limit."""
        import psutil  # imported lazily: only needed when the safeguard is used

        # Act only when usage is ABOVE the limit; the previous `<` comparison
        # dumped and reset whenever memory was still plentiful.
        if psutil.virtual_memory().percent > top_memory_precentage_use:
            dump_ruler_nonascii(ruler)
            ruler = nlp.begin_training()  # or just init the nlp object again
        return ruler

    # This fitted for my use case
    def dump_ruler_nonascii(ruler):
        """Append ``ruler.patterns`` to ``config.jsonl`` as one JSON object per line."""
        import json

        # There is no `self` inside this nested function, so the patterns are
        # written under the output directory.
        noutput_dir.mkdir(parents=True, exist_ok=True)
        path = noutput_dir / 'config.jsonl'
        pattern = ruler.patterns
        with open(path, "a", encoding="utf-8") as f:
            for line in pattern:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
        return ruler

    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update([item[0]], [item[1]], sgd=optimizer, drop=0.50, losses=losses)
            print(losses)
    print("OUTTTTT")

    # Create missing parent directories too instead of failing on them.
    noutput_dir.mkdir(parents=True, exist_ok=True)
    #nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)


if __name__ == '__main__':
    main()

python-3.x machine-learning nlp spacy valueerror


© www.soinside.com 2019 - 2024. All rights reserved.