def searchSimilarDocumentsByPhrases(corpus, Ids, contractIds,count,phrases=None):
tfidf = TfidfVectorizer(vocabulary = phrases, ngram_range=(1, 6))
tfs = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names_out()
rows, cols = tfs.nonzero()
phrase_counts = defaultdict(list)
for row, col in zip(rows, cols):
phraseCount = corpus[row].count(feature_names[col])
phrase_counts[feature_names[col]].append({Ids[row]:{contractIds[row]: phraseCount}})
counter=count
phraselist=[]
for phrase, counts in phrase_counts.items():
counts.sort(key=lambda x: list(x.items()), reverse=True)
counts=counts[:counter]
phraselist.append(phrase)
phraselist.append(counts)
return phraselist
我提供的输入是
{ "phrases":
[
"test and evaluation"
],
"count":"3"
}
我得到输出为
[
"test and evaluation",
[
{
"64": {
"LMLB_C-41": 2
}
},
{
"24": {
"LMLB_C-2": 1
}
},
{
"1180": {
"LMLB_C-157": 4
}
}
],
]
虽然我希望输出像下面的值降序排列
[
"test and evaluation",
[
{
"1180": {
"LMLB_C-157": 4
}
},
{
"64": {
"LMLB_C-41": 2
}
},
{
"24": {
"LMLB_C-2": 1
}
}
],
]