提高此窗口密集型查询的性能

问题描述 投票:0回答:1

我在 HQL 中有以下代码。它从包含超过 25 亿行和大约 334 列的表中提取数据。运行需要一段时间,因此我正在寻找任何可能的潜在性能提升。这是查询的主要块,后续的 CTE 会执行一些进一步的处理,但下面是最昂贵的查询。

我明白窗口函数在这种情况下开销可能很大,但它们是必需的,因为需要进行非常特定的时间戳计算。

非常感谢任何帮助!

-- Leading CTE of a longer statement: derives time_to_next_trans, the number of seconds
-- from each row to the following row of the same id, for a single dt value.
-- It cannot run on its own - a further CTE or a final SELECT must follow the closing parenthesis.
-- Timestamps are built by concatenating the date and time columns (a dot followed by digits
-- is stripped from time first) and parsing the result with the pattern dd/MM/yyyy HH:mm:ss.
WITH t1 AS (
SELECT
*,
CASE
-- The next row of this id is more than one hour value ahead: no result.
WHEN (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) > 1
    THEN NULL
-- The next row of this id is in the following hour: seconds from this row to hour + 1 on the same date.
WHEN (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) = 1
    THEN UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss') - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
-- Previous row has the same s_id and is one hour behind, next row is in the same hour:
-- seconds to the next row plus hour_overlap_add carried over from the previous row.
WHEN (LAG(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) = TRUE
AND (hour - LAG(hour) OVER (PARTITION BY id ORDER BY `time`)) = 1
AND (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) = 0
    THEN (LEAD(UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`) -
    UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') + LAG(hour_overlap_add) OVER (PARTITION BY id ORDER BY `time`))
-- Every other row: seconds to the next row inside the same (id, hour) partition.
-- LEAD yields NULL when the partition has no following row.
ELSE LEAD(unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id, hour ORDER BY `time`)
- unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
END AS time_to_next_trans
FROM(
    SELECT
        *,
        -- Part of the gap to the next row that lies beyond the hour + 1 boundary.
        (overlap_time_to_next_trans - sec_between_current_trans_and_next_hr) as hour_overlap_add
    FROM(
        SELECT
            c_id,
            s_id,
            id,
            rat,
            dt,
            `date`,
            `time`,
            hour,
            -- TRUE when the next row of the same id has the same s_id (despite the column name).
            (LEAD(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change,
            -- Hour value of the next row minus the hour value of this row.
            LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour AS difference_hour,
            -- Seconds from this row to hour + 1 on the same date.
            UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss')
            - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS sec_between_current_trans_and_next_hr,
            -- Seconds from this row to the next row of the same id, regardless of hour.
            LEAD(UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`) -
            UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS overlap_time_to_next_trans
        FROM database.tablename
        -- The dt literal must be closed with a plain ASCII double quote.
        -- A typographic closing quote leaves the string unterminated and the query fails to parse.
        -- NOTE(review): rundate is presumably supplied through Hive variable substitution - confirm.
        WHERE dt = "${rundate}" AND ID <> 0
        )j
    )o
)
sql hadoop hive hql
1个回答
0
投票

在第一步(CTE j)中一次性计算出大部分派生列,之后的步骤直接复用这些列,而不必重复计算。

-- Computes time_to_next_trans (seconds from each row to the following row of the same id)
-- in three steps, so that each LEAD/LAG-derived value is written once and then reused by name.
-- Timestamps are built by concatenating the date and time columns (a dot followed by digits
-- is stripped from time first) and parsing the result with the pattern dd/MM/yyyy HH:mm:ss.

-- Step 1 (j): per-row derivations over the rows of one dt value, windowed per id in time order.
with j as
(
  SELECT
    c_id
    ,s_id
    ,id
    ,rat
    ,dt
    ,`date`
    ,`time`
    ,hour
    -- TRUE when the next / previous row of the same id has the same s_id.
    ,(LEAD(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change_lead
    ,(LAG(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change_lag
    -- Hour value of the next row minus this row, and this row minus the previous row.
    ,LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour AS difference_hour
    ,hour - LAG(hour) OVER (PARTITION BY id ORDER BY `time`) as new_hour_diff
    -- Seconds from this row to hour + 1 on the same date.
    -- This item must not end with a comma: the next item already starts with one,
    -- and an empty select-list entry is a syntax error.
    ,UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss') - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS sec_between_current_trans_and_next_hr
    -- Seconds from this row to the next row of the same id, regardless of hour.
    ,LEAD(UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`) - UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS overlap_time_to_next_trans
  FROM database.tablename
  -- NOTE(review): rundate is presumably supplied through Hive variable substitution - confirm.
  WHERE dt = "${rundate}" AND ID <> 0
),

-- Step 2 (o): part of the gap to the next row that lies beyond the hour + 1 boundary.
o as
(
  SELECT
  *,
  overlap_time_to_next_trans - sec_between_current_trans_and_next_hr as hour_overlap_add
  FROM j
),

-- Step 3 (t1): choose the time_to_next_trans value from the columns derived above.
t1 as
(
SELECT
  *
  ,CASE
    -- The next row of this id is more than one hour value ahead: no result.
    WHEN difference_hour > 1
      THEN NULL
    -- The next row is in the following hour: count only up to hour + 1.
    WHEN difference_hour = 1
      THEN sec_between_current_trans_and_next_hr
    -- Previous row has the same s_id and is one hour behind, next row is in the same hour:
    -- gap to the next row plus hour_overlap_add carried over from the previous row.
    WHEN s_id_change_lag = TRUE AND new_hour_diff=1 AND difference_hour = 0
      THEN overlap_time_to_next_trans + LAG(hour_overlap_add) OVER (PARTITION BY id ORDER BY `time`)
    -- Every other row: gap to the next row inside the same (id, hour) partition.
    -- This uses a different window from step 1, so it is still computed here.
    ELSE LEAD(unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id, hour ORDER BY `time`) - unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
  END AS time_to_next_trans
FROM o
)

select * from t1;

希望这有帮助。

© www.soinside.com 2019 - 2024. All rights reserved.