Bigquery:根据亲戚表查找家族氏族

问题描述 投票:0回答:1

我有家庭关系数据库:

-- Sample input: each row links two relatives by id.
WITH example_data AS (
  SELECT
    pair.relative_1,
    pair.relative_2
  FROM UNNEST(
    ARRAY<STRUCT<relative_1 STRING, relative_2 STRING>>[
      ('abc', 'def'),
      ('abc', '123'),
      ('def', '334'),
      ('fdc', '123'),
      ('fgl', '342')
    ]
  ) AS pair
)

如何基于此数据创建完整的家族,以便获得这种结果:

enter image description here

我编写的用于创建所需输出的代码似乎根本不实际,实际上,对于一个有10万行输入的表,我的查询在第5个自联接后达到了6小时的限制。我不担心最终会把整个表连接成一个长串-我知道一个家族中只有这么多家庭成员。

[此外,如果可以将结果作为嵌套表返回,则嵌套表以list_all_relatives作为重复字段,并且在relative_1和各个远亲之间的步数最少。

我在图像中返回结果的低效率代码:

  -- Expands each (relative_1, relative_2) pair into a comma-separated list of
  -- related ids by self-joining and merging lists, alternating the join key
  -- between relative_2 and relative_1. The final SELECT returns, per
  -- relative_1, the longest merged list(s).
  -- NOTE(review): the number of merge rounds is hard-coded (step_1 .. step_4),
  -- so families that need more rounds presumably come back incomplete - confirm
  -- against real data before relying on the result.
  WITH raw_data AS (
    -- Sample pairs; list_0 seeds every row's list with its own relative_1.
    SELECT 'abc' AS relative_1, 'def' AS relative_2, 'abc' AS list_0
    UNION ALL
    SELECT 'abc' AS relative_1, '123' AS relative_2, 'abc' AS list_0
    UNION ALL
    SELECT 'def' AS relative_1, '334' AS relative_2, 'def' AS list_0
    UNION ALL
    SELECT 'fdc' AS relative_1, '123' AS relative_2, 'fdc' AS list_0
    UNION ALL
    SELECT 'fgl' AS relative_1, '342' AS relative_2, 'fgl' AS list_0
  ),
  step_0 AS (
    -- Seed list per pair: both ids plus list_0, de-duplicated and sorted.
    SELECT
      relative_1,
      relative_2,
      ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(CONCAT(relative_1, ',', relative_2, ',', list_0), ',')) AS x ORDER BY x), ',') AS combined_list
    FROM raw_data
  ),
  step_1 AS (
    -- Round 1: pick up the seed list of every row that shares this row's relative_2.
    SELECT
      relative_1,
      relative_2,
      list_1,
      ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(CONCAT(relative_1, ',', relative_2, ',', list_1), ',')) AS x ORDER BY x), ',') AS combined_list
    FROM step_0
    LEFT JOIN (
      SELECT relative_2, combined_list AS list_1
      FROM step_0
    )
      USING (relative_2)
  ),
  step_2 AS (
    -- Round 2: merge with the lists of every row that shares this row's relative_1.
    -- DISTINCT collapses the duplicate rows the self-join produces.
    SELECT DISTINCT
      relative_1,
      relative_2,
      list_2,
      ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(CONCAT(combined_list, ',', list_2), ',')) AS x ORDER BY x), ',') AS combined_list
    FROM step_1
    LEFT JOIN (
      SELECT relative_1, combined_list AS list_2
      FROM step_1
    )
      USING (relative_1)
  ),
  step_3 AS (
    -- Round 3: same merge as round 1, keyed on relative_2 again.
    SELECT DISTINCT
      relative_1,
      relative_2,
      list_3,
      ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(CONCAT(combined_list, ',', list_3), ',')) AS x ORDER BY x), ',') AS combined_list
    FROM step_2
    LEFT JOIN (
      SELECT relative_2, combined_list AS list_3
      FROM step_2
    )
      USING (relative_2)
  ),
  step_4 AS (
    -- Round 4: same merge as round 2, keyed on relative_1 again.
    SELECT DISTINCT
      relative_1,
      relative_2,
      list_4,
      ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(CONCAT(combined_list, ',', list_4), ',')) AS x ORDER BY x), ',') AS combined_list
    FROM step_3
    LEFT JOIN (
      SELECT relative_1, combined_list AS list_4
      FROM step_3
    )
      USING (relative_1)
  ),
  step_N AS (
    -- Alias for the last merge round, so later CTEs do not name a specific step.
    SELECT
      relative_1,
      relative_2,
      list_4,
      combined_list
    FROM step_4
  ),
  step_prefinal AS (
    -- Size of every candidate list and the largest size seen per relative_1.
    -- Entries are counted by splitting on the delimiter, so ids may contain
    -- any character other than the comma itself.
    SELECT DISTINCT
      relative_1,
      list_4,
      combined_list,
      ARRAY_LENGTH(SPLIT(combined_list, ',')) AS n_elements_in_list,
      MAX(ARRAY_LENGTH(SPLIT(combined_list, ','))) OVER (PARTITION BY relative_1) AS longest_list_relatives
    FROM step_N
  ),
  step_final AS (
    -- Keep only the longest list(s) per relative_1. The window functions run
    -- over the grouped rows, so cnt_lists_per_relative counts distinct lists.
    SELECT
      relative_1,
      combined_list,
      n_elements_in_list,
      COUNT(*) OVER (PARTITION BY relative_1) AS cnt_lists_per_relative,
      MAX(n_elements_in_list) OVER (PARTITION BY relative_1) AS max_elements
    FROM step_prefinal
    WHERE longest_list_relatives = n_elements_in_list
    GROUP BY
      relative_1,
      combined_list,
      n_elements_in_list
  ),
  stats AS (
    -- Diagnostic only; not referenced by the final SELECT. Shows how many
    -- relatives ended up with 1, 2, ... longest lists.
    SELECT
      cnt_lists_per_relative,
      COUNT(DISTINCT relative_1) AS cnt,
      MAX(max_elements) AS max_elements
    FROM step_final
    GROUP BY cnt_lists_per_relative
  )

  SELECT
    relative_1,
    combined_list AS list_all_relatives
  FROM step_final
google-bigquery recursive-query
1个回答
0
投票

我能够创建代码以更简单的方式重现您的输出。

© www.soinside.com 2019 - 2024. All rights reserved.