ranking 是一个大型嵌套字典，结构如下：
# {
# "query_1": {
# "doc_1": 0.27525704173232723,
# "doc_2": 0.05115370517104112,
# "doc_3": 0.684077731862703,
# "doc_4": 0.8593173782232113,
# "doc_5": 0.9992261057642471,
# "doc_6": 0.8927443913799646,
# "doc_7": 0.21853726164833975,
# "doc_8": 0.20343449721373175,
# "doc_9": 0.3671854317362193,
# "doc_10": 0.5517113596197711,
# "doc_11": 0.1882548003666461,
# "doc_12": 0.044734783865907124,
# "doc_13": 0.22633588060935117,
# "doc_14": 0.2578317547900737,
# "doc_15": 0.5659508960653867
# },
# ...
# "query_1587658": {
# "doc_1": 0.10856609385238025,
# "doc_2": 0.016464788372524297,
# "doc_3": 0.3566483096640577,
# "doc_4": 0.5400913407418702,
# "doc_5": 0.17848721844101811,
# "doc_6": 0.42536942094534014,
# "doc_7": 0.5080529426125224,
# "doc_8": 0.3461002807821024,
# "doc_9": 0.24761391133009103,
# "doc_10": 0.6286192869283476,
# "doc_11": 0.08574801929277642,
# "doc_12": 0.5384879539476498,
# "doc_13": 0.44215582705669476,
# "doc_14": 0.05529319806340971,
# "doc_15": 0.10878796076603192
# }
# }
代表排名。有必要对每个内部字典（检索到的文档）按分数排序，并为每个查询选取得分最高的
top_k
项。以下算法按顺序（串行）执行此排序：
import json
import heapq

# Keep only the top_k highest-scoring documents for each query,
# replacing every inner dict in place (descending score order).
top_k = 5
for query_id, scores in ranking.items():
    best = heapq.nlargest(top_k, scores.items(), key=lambda pair: pair[1])
    ranking[query_id] = dict(best)
print(json.dumps(ranking, indent=4))
# {
# "query_1": {
# "doc_5": 0.9992261057642471,
# "doc_6": 0.8927443913799646,
# "doc_4": 0.8593173782232113,
# "doc_3": 0.684077731862703,
# "doc_15": 0.5659508960653867
# },
#     ...
# "query_1587658": {
# "doc_10": 0.6286192869283476,
# "doc_4": 0.5400913407418702,
# "doc_12": 0.5384879539476498,
# "doc_7": 0.5080529426125224,
# "doc_13": 0.44215582705669476
# }
# }
由于一个查询的顺序独立于另一个查询,您能否提供一种并行化上述算法的方法?
我不确定 numba 是否是完成该任务的正确工具（也许先把 Python 字典转换为 numpy 数组可以加快速度）。
不过，您可以简单地通过以下方式加快脚本速度：
def process(ranking, top_k=5):
    """Trim each query's document-score dict in place to its top_k entries.

    Args:
        ranking: mapping of query id -> {doc id -> score}; mutated in place.
            Each inner dict is replaced by one holding only the ``top_k``
            highest-scoring documents, in descending score order.
        top_k: number of documents to keep per query.  Defaults to 5,
            matching the value that was previously hard-coded.

    Returns:
        None; ``ranking`` is modified in place.
    """
    for query_id in ranking:
        ranking[query_id] = dict(
            sorted(ranking[query_id].items(), key=lambda kv: kv[1], reverse=True)[:top_k]
        )
简单基准:
import heapq
from random import random, seed
from timeit import timeit
def create_dict(n=1587658, _seed=42):
    """Build a reproducible synthetic ranking for benchmarking.

    Args:
        n: number of queries ("query_1" .. "query_n").
        _seed: PRNG seed so repeated calls produce identical data.

    Returns:
        dict mapping each query id to a dict of 15 doc ids
        ("doc_1" .. "doc_15") with random scores in [0, 1).
    """
    seed(_seed)
    # Inner loop variable renamed to ``j``: the original reused ``n``,
    # which only worked because comprehensions have their own scope and
    # silently shadowed the parameter.
    return {
        f"query_{i}": {f"doc_{j}": random() for j in range(1, 16)}
        for i in range(1, n + 1)
    }
def variant_1(ranking):
    """In place, replace each query's score dict with its five
    highest-scoring documents, selected via heapq.nlargest
    (descending score order is preserved by insertion order)."""
    for query_id, scores in ranking.items():
        top_five = heapq.nlargest(5, scores.items(), key=lambda pair: pair[1])
        ranking[query_id] = dict(top_five)
def variant_2(ranking):
    """Same contract as variant_1, but uses a full descending sort
    and slices off the first five entries instead of using a heap."""
    for query_id in ranking:
        ranked = sorted(ranking[query_id].items(), key=lambda kv: kv[1], reverse=True)
        ranking[query_id] = dict(ranked[:5])
# One-shot timing of each variant; ``setup`` rebuilds a fresh dict per run
# so neither variant benefits from the other's in-place trimming.
time_heap = timeit("variant_1(d)", setup="d=create_dict()", globals=globals(), number=1)
time_sort = timeit("variant_2(d)", setup="d=create_dict()", globals=globals(), number=1)
print(time_heap)
print(time_sort)
在我的计算机(Python 3.11/AMD 5700x)上打印:
5.448211211245507
3.5935627752915025