在BigQuery中删除数组之间的重复值

问题描述 投票:0回答:2

假设我有以下数组:

-- Sample input for the question: two arrays, returned as two rows.
-- With UNION ALL the single output column takes its name from the first
-- SELECT (origin_array); the second alias only labels the array for the reader.
SELECT ['A', 'B', 'C', 'A', 'A', 'A'] AS origin_array
UNION ALL 
SELECT ['A', 'A', 'B'] AS secondary_array

我想删除两个数组之间的重复值(而不是单个数组内部的重复值),也就是让两个数组中相同的元素一对一地相互抵消,这样最终结果将是:

SELECT ['C', 'A', 'A'] AS result_array

任何想法如何完成?

google-bigquery
2个回答
0
投票

如果只使用UNION(在BigQuery标准SQL中须写作UNION DISTINCT)而不是UNION ALL,结果中就不会出现重复的行。


0
投票

下面是适用于BigQuery标准SQL的写法:

#standardSQL
-- DEDUP_ARRAYS(arr1, arr2)
--   Cancels out, one-for-one, the elements that the two arrays have in common
--   and returns whatever is left over from either side.
--
--   For every distinct element the result contains
--   ABS(occurrences in arr1 - occurrences in arr2) copies of it, e.g.
--   ['A','B','C','A','A','A'] vs ['A','A','B']  ->  two 'A' and one 'C'.
--
--   arr1, arr2: arrays of the same groupable element type. NULL elements are
--               ignored; a NULL array behaves like an empty one.
--   Returns:    an array of the leftover elements, or NULL when nothing is
--               left. Element order is not guaranteed.
--
--   Elements are repeated with GENERATE_ARRAY rather than by building and
--   splitting a comma-separated string, so values that contain commas or are
--   empty strings survive intact and non-STRING element types work as well.
CREATE TEMP FUNCTION DEDUP_ARRAYS(arr1 ANY TYPE, arr2 ANY TYPE) AS ((
  SELECT ARRAY_AGG(leftover.item) AS result_array
  FROM (
    -- One row per distinct element with the size of its surplus.
    SELECT
      item,
      ABS(COUNTIF(src = 1) - COUNTIF(src = 2)) AS cnt
    FROM (
      -- Tag every element with the array it came from.
      SELECT item, 1 AS src FROM UNNEST(arr1) AS item
      UNION ALL
      SELECT item, 2 AS src FROM UNNEST(arr2) AS item
    )
    WHERE item IS NOT NULL  -- ARRAY_AGG cannot emit NULL elements
    GROUP BY item
  ) AS leftover
  -- Emit each element cnt times; GENERATE_ARRAY(1, 0) is empty, so elements
  -- whose counts match on both sides produce no rows at all.
  CROSS JOIN UNNEST(GENERATE_ARRAY(1, leftover.cnt)) AS copy_no
));
WITH `project.dataset.table` AS (
  SELECT ['A', 'B', 'C', 'A', 'A', 'A'] AS origin_array, ['A', 'A', 'B'] AS secondary_array
)
SELECT DEDUP_ARRAYS(origin_array, secondary_array) AS result_array
FROM `project.dataset.table`

有输出

Row result_array     
1   A    
    A    
    C     

这正是 SELECT ['C', 'A', 'A'] AS result_array 将返回的内容(不考虑元素顺序)

注意:以上代码很可能还可以进一步重构/优化——这就留给您了 :o)

第二选项

它采用了略有不同的方法,在我看来更优化,也更易读、更易于维护

#standardSQL
-- DEDUP_ARRAYS(arr1, arr2)
--   Returns the elements left over after matching equal elements of the two
--   arrays against each other one-for-one.
--
--   Each element is numbered within its own array among equal values
--   (1st 'A', 2nd 'A', ...). An (item, pos) pair that shows up in both arrays
--   is a matched pair and is discarded; a pair that shows up only once is an
--   unmatched occurrence and is kept. Element order is not guaranteed.
CREATE TEMP FUNCTION DEDUP_ARRAYS(arr1 ANY TYPE, arr2 ANY TYPE) AS (
  ARRAY(
    SELECT numbered.item
    FROM (
      -- No ORDER BY is needed in the window: rows within a partition are equal.
      SELECT item, ROW_NUMBER() OVER (PARTITION BY item) AS pos
      FROM UNNEST(arr1) AS item
      UNION ALL
      SELECT item, ROW_NUMBER() OVER (PARTITION BY item) AS pos
      FROM UNNEST(arr2) AS item
    ) AS numbered
    GROUP BY numbered.item, numbered.pos
    HAVING COUNT(*) = 1
  )
);
WITH `project.dataset.table` AS (
  SELECT
    ['A', 'B', 'C', 'A', 'A', 'A'] AS origin_array,
    ['A', 'A', 'B'] AS secondary_array
)
SELECT
  DEDUP_ARRAYS(t.origin_array, t.secondary_array) AS result_array
FROM `project.dataset.table` AS t

显然具有相同的结果

Row result_array     
1   A    
    A    
    C    
© www.soinside.com 2019 - 2024. All rights reserved.