我想计算一个新列,该列根据每组和该组内的序列变化分配 ID。
考虑下表:
# | user_id | my_date | color |
# |---------|------------|--------|
# | a | 2023-02-01 | red | ----
# | a | 2023-03-22 | red | |
# | a | 2023-03-30 | red | | this is period *1* for user_id = a
# | a | 2023-06-10 | red | |
# | a | 2023-06-11 | red | ----
# | a | 2023-07-03 | green |
# | a | 2023-07-09 | green |
# | a | 2024-01-11 | green |
# | a | 2024-02-11 | yellow |
# | a | 2024-02-12 | yellow |
# | a | 2024-02-13 | yellow |
# | a | 2024-02-14 | yellow |
# | b | 2022-10-20 | blue |
# | b | 2022-10-21 | blue |
# | b | 2022-10-22 | blue |
# | b | 2022-10-23 | brown | ----
# | b | 2022-10-24 | brown | | this is period *2* for user_id = b
# | b | 2022-10-25 | brown | ----
# | b | 2022-10-26 | blue |
# | b | 2022-10-27 | blue |
对于每个 `user_id`,如果按 `my_date` 升序排列,可以看到存在若干 `color` 相同的连续“周期”(序列)。
我想创建一个新列 `period_number`,用来标识每一行在其 `user_id` 内所属的周期/序列编号(从 1 开始,每当 `color` 发生变化时加 1)。
# | user_id | my_date | color | period_number |
# |---------|------------|--------|---------------|
# | a | 2023-02-01 | red | 1 |
# | a | 2023-03-22 | red | 1 |
# | a | 2023-03-30 | red | 1 |
# | a | 2023-06-10 | red | 1 |
# | a | 2023-06-11 | red | 1 |
# | a | 2023-07-03 | green | 2 |
# | a | 2023-07-09 | green | 2 |
# | a | 2024-01-11 | green | 2 |
# | a | 2024-02-11 | yellow | 3 |
# | a | 2024-02-12 | yellow | 3 |
# | a | 2024-02-13 | yellow | 3 |
# | a | 2024-02-14 | yellow | 3 |
# | b | 2022-10-20 | blue | 1 |
# | b | 2022-10-21 | blue | 1 |
# | b | 2022-10-22 | blue | 1 |
# | b | 2022-10-23 | brown | 2 |
# | b | 2022-10-24 | brown | 2 |
# | b | 2022-10-25 | brown | 2 |
# | b | 2022-10-26 | blue | 3 |
# | b | 2022-10-27 | blue | 3 |
我使用基于 Trino SQL 的 AWS Athena。
-- Inline sample data for the question: (user_id, my_date, color) rows for
-- users 'a' and 'b', exposed as the CTE my_table and returned unchanged.
WITH my_table AS (
    SELECT
        t.user_id,
        t.my_date,
        t.color
    FROM (
        VALUES
            ('a', DATE '2023-02-01', 'red'),
            ('a', DATE '2023-03-22', 'red'),
            ('a', DATE '2023-03-30', 'red'),
            ('a', DATE '2023-06-10', 'red'),
            ('a', DATE '2023-06-11', 'red'),
            ('a', DATE '2023-07-03', 'green'),
            ('a', DATE '2023-07-09', 'green'),
            ('a', DATE '2024-01-11', 'green'),
            ('a', DATE '2024-02-11', 'yellow'),
            ('a', DATE '2024-02-12', 'yellow'),
            ('a', DATE '2024-02-13', 'yellow'),
            ('a', DATE '2024-02-14', 'yellow'),
            ('b', DATE '2022-10-20', 'blue'),
            ('b', DATE '2022-10-21', 'blue'),
            ('b', DATE '2022-10-22', 'blue'),
            ('b', DATE '2022-10-23', 'brown'),
            ('b', DATE '2022-10-24', 'brown'),
            ('b', DATE '2022-10-25', 'brown'),
            ('b', DATE '2022-10-26', 'blue'),
            ('b', DATE '2022-10-27', 'blue')
    ) AS t (user_id, my_date, color)
)
SELECT
    user_id,
    my_date,
    color
FROM my_table;
参见示例
-- Sample data mirroring the question's table. my_date is a DATE (not a string)
-- and the ('a', 2023-03-22, 'red') row from the question is included.
-- This block only opens the WITH list; further CTEs and the main query follow.
WITH my_table AS (
    SELECT
        t.user_id,
        t.my_date,
        t.color
    FROM (
        VALUES
            ('a', DATE '2023-02-01', 'red'),
            ('a', DATE '2023-03-22', 'red'),
            ('a', DATE '2023-03-30', 'red'),
            ('a', DATE '2023-06-10', 'red'),
            ('a', DATE '2023-06-11', 'red'),
            ('a', DATE '2023-07-03', 'green'),
            ('a', DATE '2023-07-09', 'green'),
            ('a', DATE '2024-01-11', 'green'),
            ('a', DATE '2024-02-11', 'yellow'),
            ('a', DATE '2024-02-12', 'yellow'),
            ('a', DATE '2024-02-13', 'yellow'),
            ('a', DATE '2024-02-14', 'yellow'),
            ('b', DATE '2022-10-20', 'blue'),
            ('b', DATE '2022-10-21', 'blue'),
            ('b', DATE '2022-10-22', 'blue'),
            ('b', DATE '2022-10-23', 'brown'),
            ('b', DATE '2022-10-24', 'brown'),
            ('b', DATE '2022-10-25', 'brown'),
            ('b', DATE '2022-10-26', 'blue'),
            ('b', DATE '2022-10-27', 'blue')
    ) AS t (user_id, my_date, color)
)
查询
-- Gaps-and-islands numbering: within each user_id, walk the rows in my_date
-- order and start a new period whenever color differs from the previous row.
-- Continues the WITH list opened by the my_table CTE, hence the leading comma.
, flagged AS (
    -- is_period_start is 1 on the first row of every run of equal colors.
    -- LAG returns NULL on a user's first row, so the comparison is not true
    -- and that row is flagged 1. A NULL color never compares equal either,
    -- so such a row always opens its own period.
    SELECT
        user_id,
        color,
        my_date,
        CASE
            WHEN color = LAG(color) OVER (PARTITION BY user_id ORDER BY my_date) THEN 0
            ELSE 1
        END AS is_period_start
    FROM my_table
)
-- The running total of period starts is the period number. The explicit ROWS
-- frame keeps the sum strictly row-by-row even when my_date values tie.
-- NOTE: if one user has several colors on the same my_date, the row order
-- (and therefore the numbering) is not deterministic; add a tie-breaker
-- column to both ORDER BY clauses in that case.
SELECT
    user_id,
    color,
    my_date,
    SUM(is_period_start) OVER (
        PARTITION BY user_id
        ORDER BY my_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS period_number
FROM flagged
ORDER BY user_id, my_date;