我有以下 HQL(Hive 查询)代码。它从一张超过 25 亿行、约 334 列的表中提取数据。运行耗时很长,因此我想了解有哪些可能的性能优化手段。下面是查询的主体部分;后续的 CTE 还会做一些进一步处理,但开销最大的就是下面这一段。
我明白窗口函数在这种情况下开销可能很大,但它们是必需的,因为这里需要做非常特定的时间戳计算。
非常感谢任何帮助!
-- t1: for each row of database.tablename on the run date (ID <> 0), derive
-- time_to_next_trans, a number of seconds measured from this row towards the
-- next row of the same id (rows are ordered by `time` within each id).
--
-- Built as three nested layers:
--   j  : base columns plus LEAD-based deltas to the next row of the same id
--   o  : adds hour_overlap_add
--   t1 : adds time_to_next_trans via the CASE expression
--
-- Timestamps are parsed from `date` + `time` with the pattern
-- 'dd/MM/yyyy HH:mm:ss'; REGEXP_REPLACE(`time`, '\\.\\d+', '') strips any
-- ".<digits>" fractional part before parsing.
--
-- NOTE(review): the same LEAD/LAG windows and timestamp parses are evaluated
-- several times in the outer CASE even though layer j already exposes them as
-- difference_hour, sec_between_current_trans_and_next_hr and
-- overlap_time_to_next_trans.
-- NOTE(review): this CTE is expected to be followed by further CTEs and/or a
-- final SELECT that are not part of this block.
WITH t1 AS (
    SELECT
        *,
        CASE
            -- Next row of this id is more than one hour-of-day later: no value.
            WHEN (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) > 1
                THEN NULL
            -- Next row is in the following hour: seconds from this row up to
            -- the start of hour + 1 on the same `date`.
            WHEN (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) = 1
                THEN UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss')
                    - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
            -- Previous row has the same s_id and sits in the preceding hour,
            -- and the next row is in this row's hour: seconds to the next row
            -- plus the previous row's hour_overlap_add.
            WHEN (LAG(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) = TRUE
                AND (hour - LAG(hour) OVER (PARTITION BY id ORDER BY `time`)) = 1
                AND (LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour) = 0
                THEN (
                    LEAD(UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`)
                    - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
                    + LAG(hour_overlap_add) OVER (PARTITION BY id ORDER BY `time`)
                )
            -- Otherwise: seconds to the next row within the same id AND hour
            -- (NULL when this is the last row of that id/hour partition).
            ELSE LEAD(unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id, hour ORDER BY `time`)
                - unix_timestamp(CONCAT(`date`, " ", regexp_replace(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
        END AS time_to_next_trans
    FROM (
        SELECT
            *,
            -- Gap to the next row minus the gap to the next hour boundary,
            -- i.e. next row's timestamp minus the start of hour + 1.
            (overlap_time_to_next_trans - sec_between_current_trans_and_next_hr) AS hour_overlap_add
        FROM (
            SELECT
                c_id,
                s_id,
                id,
                rat,
                dt,
                `date`,
                `time`,
                hour,
                -- TRUE when the NEXT row of this id carries the same s_id.
                (LEAD(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change,
                -- Hour-of-day of the next row minus this row's hour.
                LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour AS difference_hour,
                -- Seconds from this row to the start of hour + 1 on `date`.
                UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss')
                    - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS sec_between_current_trans_and_next_hr,
                -- Seconds from this row to the next row of the same id.
                LEAD(UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`)
                    - UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS overlap_time_to_next_trans
            FROM database.tablename
            -- The closing quote must be a plain ASCII double quote; a
            -- typographic quote leaves the string literal unterminated.
            WHERE dt = "${rundate}" AND ID <> 0
        ) j
    ) o
)
在第一步(CTE j)中一次性计算出大部分派生列,后续的 CTE 直接复用这些派生列,而无需重复计算。
-- Computes time_to_next_trans for each row of database.tablename on the run
-- date (ID <> 0), deriving the window-based columns once in j and reusing them
-- by name in the later CTEs.
--
-- Timestamps are parsed from `date` + `time` with the pattern
-- 'dd/MM/yyyy HH:mm:ss'; REGEXP_REPLACE(`time`, '\\.\\d+', '') strips any
-- ".<digits>" fractional part before parsing.

-- j: base columns plus every LEAD/LAG delta over (PARTITION BY id ORDER BY `time`).
WITH j AS
(
    SELECT
        c_id
        ,s_id
        ,id
        ,rat
        ,dt
        ,`date`
        ,`time`
        ,hour
        -- TRUE when the next / previous row of this id carries the same s_id.
        ,(LEAD(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change_lead
        ,(LAG(s_id) OVER (PARTITION BY id ORDER BY `time`) = s_id) AS s_id_change_lag
        -- Hour-of-day of the next row minus this row's hour.
        ,LEAD(hour) OVER (PARTITION BY id ORDER BY `time`) - hour AS difference_hour
        -- This row's hour minus the hour-of-day of the previous row.
        ,hour - LAG(hour) OVER (PARTITION BY id ORDER BY `time`) AS new_hour_diff
        -- Seconds from this row to the start of hour + 1 on `date`.
        ,UNIX_TIMESTAMP(CONCAT(`date`, " ", LPAD((hour + 1), 2, 0), ":00:00"), 'dd/MM/yyyy HH:mm:ss')
            - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`time`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS sec_between_current_trans_and_next_hr
        -- Seconds from this row to the next row of the same id.
        ,LEAD(UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id ORDER BY `time`)
            - UNIX_TIMESTAMP(CONCAT(`DATE`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss') AS overlap_time_to_next_trans
    FROM database.tablename
    WHERE dt = "${rundate}" AND ID <> 0
),
-- o: adds hour_overlap_add, the gap to the next row minus the gap to the next
-- hour boundary (next row's timestamp minus the start of hour + 1).
o AS
(
    SELECT
        *,
        overlap_time_to_next_trans - sec_between_current_trans_and_next_hr AS hour_overlap_add
    FROM j
),
-- t1: adds time_to_next_trans, choosing a formula by how this row relates to
-- its neighbours within the same id.
t1 AS
(
    SELECT
        *
        ,CASE
            -- Next row is more than one hour-of-day later: no value.
            WHEN difference_hour > 1
                THEN NULL
            -- Next row is in the following hour: seconds up to the hour boundary.
            WHEN difference_hour = 1
                THEN sec_between_current_trans_and_next_hr
            -- Previous row has the same s_id and sits in the preceding hour,
            -- and the next row is in this row's hour: seconds to the next row
            -- plus the previous row's hour_overlap_add.
            WHEN s_id_change_lag = TRUE AND new_hour_diff = 1 AND difference_hour = 0
                THEN overlap_time_to_next_trans + LAG(hour_overlap_add) OVER (PARTITION BY id ORDER BY `time`)
            -- Otherwise: seconds to the next row within the same id AND hour.
            -- This window is partitioned by (id, hour), so it cannot reuse
            -- overlap_time_to_next_trans, which is partitioned by id only.
            ELSE LEAD(UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss'), 1) OVER (PARTITION BY id, hour ORDER BY `time`)
                - UNIX_TIMESTAMP(CONCAT(`date`, " ", REGEXP_REPLACE(`TIME`, '\\.\\d+', '')), 'dd/MM/yyyy HH:mm:ss')
        END AS time_to_next_trans
    FROM o
)
SELECT * FROM t1;
希望这有帮助。