我有一个 BigQuery 表,其中有一列是 REPEATED(重复)类型的字段,用来存放 512 维的浮点数向量。
我想运行一个查询,查找N个最相似的向量。
就我而言,相似性可以简单地定义为目标向量与数据库中每个向量的内积。
我找到并运行了下面的查询,它会对表中所有向量的两两组合计算相似度(注意:这里计算的是余弦相似度,而不是单纯的内积):
#standardSQL
-- Materialize the cosine similarity of every unordered pair of rows,
-- identified by their `url` values.
--
-- The table is self-joined directly on a.url < b.url, so each unordered pair
-- is produced once and no row is paired with itself. (An intermediate CTE that
-- only lists URL pairs and is then joined back to the table twice would scan
-- the table four times instead of two.)
--
-- NOTE(review): elements are matched by array offset; if two vectors differ in
-- length, the unmatched trailing elements are silently ignored by the inner
-- join below -- confirm all vectors have the same dimension.
CREATE TABLE ml.url_cosine_similarity AS
SELECT
  a.url AS id_1,
  b.url AS id_2,
  (
    -- dot(a, b) / |a| / |b|, evaluated in that order.
    -- SAFE_DIVIDE yields NULL instead of a division-by-zero error when either
    -- vector has zero norm, so one degenerate row cannot abort the whole job.
    SELECT
      SAFE_DIVIDE(
        SAFE_DIVIDE(SUM(value1 * value2), SQRT(SUM(value1 * value1))),
        SQRT(SUM(value2 * value2))
      )
    FROM UNNEST(a.page_vector) AS value1 WITH OFFSET AS pos1
    INNER JOIN UNNEST(b.page_vector) AS value2 WITH OFFSET AS pos2
      ON pos1 = pos2
  ) AS cosine_similarity
FROM `project.dataset.table` AS a
INNER JOIN `project.dataset.table` AS b
  ON a.url < b.url
但是,由于我不太了解 BigQuery 中数组的工作方式,我不确定该如何修改这个查询,使其接受一个目标向量,并返回与之最相似的 N 个邻居。
请参见下面的简化示例——它返回表中最相似的前 3 个向量对:
#standardSQL
-- Simplified, self-contained example: return the 3 most similar vector pairs.
-- The inline CTE below stands in for a real table of (id, page_vector) rows.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, [1, 2, 3, 4, 5] AS page_vector UNION ALL
  SELECT 2, [1, 3, 4, 5, 16] UNION ALL
  SELECT 3, [2, 3, 4, 5, 6] UNION ALL
  SELECT 4, [2, 4, 6, 8, 9] UNION ALL
  SELECT 5, [1, 3, 4, 5, 16] UNION ALL
  SELECT 6, [11, 12, 13, 14, 15]
)
SELECT
  a.id AS id1,
  b.id AS id2,
  (
    -- dot(a, b) / |a| / |b|, with elements paired by array offset.
    -- SAFE_DIVIDE returns NULL rather than raising a division-by-zero error
    -- when a vector has zero norm.
    SELECT
      SAFE_DIVIDE(
        SAFE_DIVIDE(SUM(value1 * value2), SQRT(SUM(value1 * value1))),
        SQRT(SUM(value2 * value2))
      )
    FROM UNNEST(a.page_vector) AS value1 WITH OFFSET AS pos1
    INNER JOIN UNNEST(b.page_vector) AS value2 WITH OFFSET AS pos2
      ON pos1 = pos2
  ) AS cosine_similarity
FROM `project.dataset.table` AS a
INNER JOIN `project.dataset.table` AS b
  ON a.id < b.id  -- each unordered pair once, never a row with itself
ORDER BY
  cosine_similarity DESC,
  -- Tie-breakers: without them, which of several equally similar pairs
  -- survives the LIMIT is not deterministic.
  id1,
  id2
LIMIT 3
输出如下:
Row id1 id2 cosine_similarity
1 2 5 1.0
2 1 4 0.9986422261219272
3 3 4 0.9962894120648842
如果要为表中的每个向量输出与其最相似的若干个向量(比方说两个),请参见下面的示例:
#standardSQL
-- Self-contained example: for every vector, return its 2 most similar
-- neighbours as an array of (id, page_vector, cosine_similarity) structs.
-- The inline CTE below stands in for a real table of (id, page_vector) rows.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, [1, 2, 3, 4, 5] AS page_vector UNION ALL
  SELECT 2, [1, 3, 4, 5, 16] UNION ALL
  SELECT 3, [2, 3, 4, 5, 6] UNION ALL
  SELECT 4, [2, 4, 6, 8, 9] UNION ALL
  SELECT 5, [1, 3, 4, 5, 16] UNION ALL
  SELECT 6, [11, 12, 13, 14, 15]
)
SELECT
  id,
  -- page_vector is the same on every row of a given id group, so any value works.
  ANY_VALUE(page_vector) AS page_vector,
  ARRAY_AGG(
    STRUCT(
      id2 AS id,
      page_vector2 AS page_vector,
      cosine_similarity AS cosine_similarity
    )
    -- id2 breaks ties so the neighbours kept by LIMIT are deterministic
    -- (the sample data contains identical vectors, ids 2 and 5).
    ORDER BY cosine_similarity DESC, id2
    LIMIT 2
  ) AS similar_vectors
FROM (
  -- Every ordered pair (a, b) with a != b: each row must see all other rows as
  -- candidate neighbours, so both (a, b) and (b, a) are needed here.
  SELECT
    a.id,
    a.page_vector,
    b.id AS id2,
    b.page_vector AS page_vector2,
    (
      -- dot(a, b) / |a| / |b|, with elements paired by array offset.
      -- SAFE_DIVIDE returns NULL rather than raising a division-by-zero error
      -- when a vector has zero norm.
      SELECT
        SAFE_DIVIDE(
          SAFE_DIVIDE(SUM(value1 * value2), SQRT(SUM(value1 * value1))),
          SQRT(SUM(value2 * value2))
        )
      FROM UNNEST(a.page_vector) AS value1 WITH OFFSET AS pos1
      INNER JOIN UNNEST(b.page_vector) AS value2 WITH OFFSET AS pos2
        ON pos1 = pos2
    ) AS cosine_similarity
  FROM `project.dataset.table` AS a
  INNER JOIN `project.dataset.table` AS b
    ON a.id != b.id
)
GROUP BY id
ORDER BY id
这将产生下面的输出