我有一个CSV文件,其中有一个IT事件的记录,文件中有一个"摘要"列和一个"类别"列,我已经为"摘要"列中的每一行生成了 tokenize 后的词(token)向量。
你并没有具体说明你想如何比较你的向量,但一种常见的方法是使用欧氏距离(L2 Norm)。我建议把你的 token 列提取到 numpy 数组中,然后使用 numpy.linalg.norm 来逐行计算距离。
import numpy as np

# `a` and `b` stand in for the two token/vector columns extracted from the
# DataFrame. The question does not state the vector length, so 5 is used
# purely for illustration (4 rows of 5-dimensional vectors each).
# Seed the generator so the example output is reproducible across runs —
# with an unseeded RNG the printed OUT values could never be matched.
rng = np.random.default_rng(0)
a = rng.random((4, 5))
b = rng.random((4, 5))
a, b  # bare expression: displays both arrays when run in a REPL/notebook
OUT:
(array([[0.39435087, 0.06389897, 0.66712442, 0.5442628 , 0.29284329],
[0.10868951, 0.61121235, 0.24025041, 0.57043359, 0.1375542 ],
[0.56818288, 0.74752492, 0.16356138, 0.79570418, 0.01905405],
[0.44776656, 0.31403308, 0.29965215, 0.21162856, 0.49277446]]),
array([[0.6300318 , 0.47202827, 0.19513324, 0.38156414, 0.85896642],
[0.69856134, 0.33403423, 0.17599279, 0.62404711, 0.10093772],
[0.18569367, 0.9487905 , 0.76287508, 0.30532111, 0.66589667],
[0.88249761, 0.32604273, 0.95195868, 0.89162121, 0.03382068]]))
# Per-row Euclidean (L2) distance between the paired vectors in `a` and `b`;
# axis=-1 reduces over each row's components, yielding one distance per row.
# The resulting 1-D array can be assigned as a new column of your DataFrame.
np.linalg.norm(b - a, axis=-1)
OUT:
array([0.88986344, 0.65811907, 1.09766282, 1.13475447])