下面的一段代码实现了我想要实现的结果。有一个名为 lemmas 的字符串列表,其中包含特定类别单词的可接受形式。另一个名为 forms 的列表包含大量单词的拼写变体,这些单词存在于来自不同时期和特定语言的不同方言的大量文本中。对于 forms 中的每个单词,我想获取 lemmas 中最接近匹配的字符串。
正如我所说,该脚本似乎适用于我构建的一些测试列表。但是,我遇到的问题是,当我使用相当大的真实列表时,需要很长时间才能生成结果。事实上,我不得不停止执行程序,因为它已经花了两个多小时,而且电脑变得非常慢,我无能为力。
我可以做些什么来提高效率?我将如何使用其他 Python 工具或库修改代码以使其更快?提前致谢。
import textdistance
from textdistance import hamming
from textdistance import cosine
from textdistance import jaro_winkler
import heapq

# 'lemmas' is a list containing a huge amount of words, basically dictionary entries
# 'forms' is a huge list of spelling variations of words found in hundreds of texts

# form -> list of (similarity, lemma) candidates above every threshold.
distances = {}
processed_pairs = set()  # keep track of (lemma, form) pairs already scored

for lemma in lemmas:
    if lemma is None:
        continue
    lemma_lower = lemma.lower()
    for form in forms:
        if form is None:
            continue
        form_lower = form.lower()
        pair = (lemma_lower, form_lower)
        if pair in processed_pairs:
            continue
        processed_pairs.add(pair)
        # BUGFIX: compute the expensive Hamming similarity ONCE and reuse it;
        # the original recomputed it after the threshold test.  The `and`
        # chain still short-circuits, so jaro_winkler/cosine only run when
        # the Hamming gate passes.
        dist = hamming.normalized_similarity(lemma_lower, form_lower)
        if (dist > 0.34
                and jaro_winkler(lemma_lower, form_lower) > 0.7
                and cosine(lemma_lower, form_lower) > 0.5):
            distances.setdefault(form_lower, []).append((dist, lemma_lower))

# Find the closest pairs.  `dist` is a *similarity* (higher == closer), so the
# two best lemmas per form are the LARGEST values -- the original nsmallest
# returned the two WORST candidates instead.
closest_pairs = {}
for form, dist_lemmas in distances.items():
    closest_pairs[form] = heapq.nlargest(2, dist_lemmas)

with open(ROOT / 'potential_lemmas.txt', 'w') as f:
    for form, pairs in closest_pairs.items():
        for dist, lemma in pairs:
            f.write(f"{form} ➝ {lemma}: {dist}\n")
编辑:
第二次尝试合并@Jamie_B的建议:
def compute_distances(lemma, forms):
    """Score one lemma against every form with Jaro-Winkler.

    Returns a list of (score, form, lemma) tuples, all lowercased,
    keeping only candidates whose similarity exceeds 0.7.
    """
    matches = []
    lowered_lemma = lemma.lower()
    for candidate in forms:
        if candidate is None:
            continue
        lowered_form = candidate.lower()
        score = textdistance.jaro_winkler(lowered_lemma, lowered_form)
        if score > 0.7:
            matches.append((score, lowered_form, lowered_lemma))
    return matches
def find_closest_pairs(lemmas, forms):
    """Score every lemma against every form in parallel, then keep the
    two best-scoring lemmas for each form."""
    per_lemma = Parallel(n_jobs=-1)(delayed(compute_distances)(lemma, forms) for lemma in lemmas)
    flattened = [entry for chunk in per_lemma for entry in chunk]

    closest_pairs = {}
    seen = set()
    # Highest scores first, so each (lemma, form) pair is recorded once,
    # at its best score.
    for score, form, lemma in sorted(flattened, reverse=True):
        if (lemma, form) in seen:
            continue
        seen.add((lemma, form))
        closest_pairs.setdefault(form, []).append((score, lemma))
    return {form: heapq.nlargest(2, scored) for form, scored in closest_pairs.items()}
closest_pairs = find_closest_pairs(lemmas, forms)

# Dump every "form -> candidate lemma" line in one batched write.
with open(ROOT / 'potential_lemmas_parallel.txt', 'w') as f:
    f.writelines(
        f"{form} ➝ {lemma}: {dist}\n"
        for form, candidates in closest_pairs.items()
        for dist, lemma in candidates
    )
你可以直接使用
rapidfuzz
。
https://maxbachmann.github.io/RapidFuzz/Usage/process.html#cdist
import rapidfuzz.process

# Pairwise score matrix: one row per entry of `forms`, one column per `lemmas`.
# workers=-1 spreads the computation across all CPU cores.
scores = rapidfuzz.process.cdist(forms, lemmas, workers=-1)
# Row-wise argmax: for each form, the column index of its best-scoring lemma.
nearest = scores.argmax(axis=1)
# nearest now contains the indexes of `lemmas` with highest closest score
基于一个小的基准测试:你的代码花了 1m58.226s,而使用 .cdist() 只需 0m11.481s。
您可以更改默认的评分函数(scorer),例如
scorer=rapidfuzz.distance.JaroWinkler.distance
以下解决方案基于您的原始代码(汉明距离),它提供(几乎)一个数量级的加速(〜89.41%),根据line-profiler测量的每五次运行的平均值。使用此解决方案作为并行处理的基础可能会让您更接近您所追求的总处理时间。
要使用 line-profiler:先 pip install line-profiler,给要分析的函数加上 @profile 装饰器并在 __main__ 中调用它,然后运行 kernprof -l -v test.py。
from itertools import zip_longest
from bisect import insort
# Toy benchmark data: dictionary entries (lemmas) and observed spellings (forms),
# with mixed case to exercise the lowercasing step.
lemmas = ["Do", "dOne", "PUrpose", "can", "be", "use", "for", "cannon", "amuse", "useful", "user", "become", "downtown", "develop", "fulminate", "deduce", "de", "bezant"]
forms = ["doNE", "donE", "doIng", "purposeful", "canonical", "becareful", "being", "berate", "best", "bezant", "full", "fulmination", "predict", "downgrade", "down", "developing", "deduct", "deducing"]
# form -> ascending-sorted list of (similarity, lemma), filled by the profiled run.
distances = {}
@profile
def profile_distance_calcs():
    """Hand-rolled normalized-Hamming pass over all (form, lemma) pairs.

    Each form's candidate list is kept sorted as entries arrive (bisect.insort),
    so the two best matches are simply the last two elements at write time.
    """
    lemmas_low = [lemma.lower() for lemma in lemmas]
    forms_low = [form.lower() for form in forms]
    for form in forms_low:
        candidates = []
        for lemma in lemmas_low:
            # zip_longest pads the shorter word with None, and None never
            # equals a character -- identical to padding-as-mismatch Hamming.
            mismatches = [a != b for a, b in zip_longest(lemma, form)]
            similarity = 1 - (sum(mismatches) / len(mismatches))
            if similarity > 0.25:
                insort(candidates, (similarity, lemma))
        distances[form] = candidates

    with open("potential_lemmas_hamming.txt", "w") as f:
        for form, form_distances in distances.items():
            # Ascending list, so the last two entries are the best matches;
            # reversed() emits best-first.
            for dist, lemma in reversed(form_distances[-2:]):
                f.write(f"{form} ➝ {lemma}: {dist}\n")

if __name__ == "__main__":
    profile_distance_calcs()
从下面的时间配置文件细分(总时间:0.00122992 秒),您可以了解减速的来源。
罪魁祸首(显然)是距离计算,这就是为什么我把 textdistance.hamming.normalized_similarity 换成了参照 textdistance 的 hamming 和 hamming.normalized_similarity 源码、手动实现的更高效(精简)的同等计算。我也相信,使用 bisect.insort 在插入时维护有序列表,比先追加所有元素再运行 heapq.nlargest 更快。
Line # Hits Time Per Hit % Time Line Contents
==============================================================
10 @profile
11 def profile_distance_calcs():
12 1 7.9 7.9 0.6 lemmas_low = [lemma.lower() for lemma in lemmas]
13 1 7.0 7.0 0.6 forms_low = [form.lower() for form in forms]
14 18 1.8 0.1 0.1 for form in forms_low:
15 18 2.0 0.1 0.2 form_distances = []
16 324 33.4 0.1 2.7 for lemma in lemmas_low:
17 324 844.5 2.6 68.7 char_matches = [c1 != c2 for c1, c2 in zip_longest(lemma, form)]
18 324 155.6 0.5 12.7 dist = 1 - (sum(char_matches)/len(char_matches))
19 285 44.4 0.2 3.6 if dist > 0.25:
20 39 12.3 0.3 1.0 insort(form_distances, (dist, lemma))
21 18 4.7 0.3 0.4 distances[form] = form_distances
22
23 1 52.5 52.5 4.3 with open("potential_lemmas_hamming.txt", "w") as f:
24 17 4.2 0.2 0.3 for form, form_distances in distances.items():
25 26 11.5 0.4 0.9 for dist, lemma in reversed(form_distances[-2:]):
26 26 48.3 1.9 3.9 f.write(f"{form} ➝ {lemma}: {dist}\n")
这里是你的原始代码用于比较。我修改了它的某些方面,主要区别在于使用
heapq.nlargest
因为我相信你是在为每种形式寻找 2 个最相似的引理,而不是 heapq.nsmallest
提供的 2 个最不相似的引理。
from textdistance import hamming, cosine, jaro_winkler
import heapq
# Benchmark inputs mirroring the hand-rolled version's data (already lowercase here).
lemmas = ["do", "done", "purpose", "can", "be", "use", "for", "cannon", "amuse", "useful", "user", "become", "downtown", "develop", "fulminate", "deduce", "de", "bezant"]
forms = ["done", "done", "doing", "purposeful", "canonical", "becareful", "being", "berate", "best", "bezant", "full", "fulmination", "predict", "downgrade", "down", "developing", "deduct", "deducing"]
# form -> list of (similarity, lemma) pairs that cleared the 0.25 threshold.
distances = {}
processed_pairs = set() # keep track of processed pairs
@profile
def profile_distance_calcs():
    """Baseline approach: textdistance's normalized Hamming similarity,
    with (lemma, form) pair de-duplication, for timing comparison."""
    for lemma in lemmas:
        if lemma is None:
            continue
        lemma_lower = lemma.lower()
        for form in forms:
            if form is None:
                continue
            form_lower = form.lower()
            pair = (lemma_lower, form_lower)
            if pair in processed_pairs:
                continue
            processed_pairs.add(pair)
            similarity = hamming.normalized_similarity(lemma_lower, form_lower)
            if similarity > 0.25:
                distances.setdefault(form_lower, []).append((similarity, lemma_lower))

    # Keep only the two most similar lemmas per form (similarity: higher is closer).
    closest_pairs = {form: heapq.nlargest(2, scored) for form, scored in distances.items()}

    with open("potential_lemmas_orig.txt", "w") as f:
        for form, pairs in closest_pairs.items():
            for dist, lemma in pairs:
                f.write(f"{form} ➝ {lemma}: {dist}\n")

if __name__ == "__main__":
    profile_distance_calcs()
原始代码的时间剖析(总时间:0.0114992 s):
Line # Hits Time Per Hit % Time Line Contents
==============================================================
11 @profile
12 def profile_distance_calcs():
13 18 2.4 0.1 0.0 for lemma in lemmas:
14 18 1.9 0.1 0.0 if lemma is None:
15 continue
16 18 6.4 0.4 0.1 lemma_lower = lemma.lower()
17 324 38.8 0.1 0.3 for form in forms:
18 324 32.6 0.1 0.3 if form is None:
19 continue
20 324 108.2 0.3 0.9 form_lower = form.lower()
21 324 46.9 0.1 0.4 pair = (lemma_lower, form_lower)
22 306 60.2 0.2 0.5 if pair not in processed_pairs:
23 306 92.0 0.3 0.8 processed_pairs.add(pair)
24 306 10828.9 35.4 94.2 dist = hamming.normalized_similarity(lemma_lower, form_lower)
25 270 47.5 0.2 0.4 if dist > 0.25:
26 36 24.1 0.7 0.2 distances.setdefault(form_lower, []).append((dist, lemma_lower))
27
28 # Find the closest pairs
29 1 0.2 0.2 0.0 closest_pairs = {}
30 16 4.3 0.3 0.0 for form, dist_lemmas in distances.items():
31 16 72.7 4.5 0.6 closest_pairs[form] = heapq.nlargest(2, dist_lemmas)
32
33 1 72.3 72.3 0.6 with open("potential_lemmas_orig.txt", "w") as f:
34 16 4.2 0.3 0.0 for form, pairs in closest_pairs.items():
35 26 6.5 0.3 0.1 for dist, lemma in pairs:
36 26 49.0 1.9 0.4 f.write(f"{form} ➝ {lemma}: {dist}\n")