我在 SQL 中有这个表(student_table):
-- Sample table used throughout this file: several dated rows per student,
-- each carrying a single-letter var value.
CREATE TABLE student_table (
    student INT,      -- student number; repeats across that student's rows
    var     CHAR(1),  -- single-letter value being tracked (e.g. 'a')
    d1      DATE,     -- first date of the row (presumably the period start -- confirm)
    d2      DATE      -- second date of the row (presumably the period end -- confirm)
);
-- Load the sample rows for students 1 through 8 in a single statement.
-- Rows are listed per student in ascending d1 order.
INSERT INTO student_table
    (student, var, d1, d2)
VALUES
    (1, 'd', '2008-09-27', '2008-10-02'),
    (1, 'd', '2008-11-14', '2008-11-21'),
    (2, 'd', '2007-01-15', '2007-01-20'),
    (2, 'd', '2010-03-04', '2010-03-10'),
    (3, 'a', '2011-03-24', '2011-04-02'),
    (3, 'a', '2015-01-12', '2015-01-14'),
    (4, 'e', '2009-07-18', '2009-07-23'),
    (4, 'a', '2020-06-19', '2020-06-27'),
    (5, 'c', '2009-11-26', '2009-11-28'),
    (5, 'a', '2015-12-24', '2015-12-25'),
    (6, 'c', '2009-01-09', '2009-01-18'),
    (6, 'a', '2018-11-21', '2018-11-30'),
    (7, 'b', '2003-07-08', '2003-07-14'),
    (7, 'a', '2006-06-01', '2006-06-06'),
    (7, 'a', '2010-02-26', '2010-03-07'),
    (8, 'b', '2004-11-11', '2004-11-21'),
    (8, 'a', '2014-03-26', '2014-03-30'),
    (8, 'a', '2021-05-06', '2021-05-12'),
    (8, 'c', '2023-04-28', '2023-05-06');
student var d1 d2
1 d 2008-09-27 2008-10-02
1 d 2008-11-14 2008-11-21
2 d 2007-01-15 2007-01-20
2 d 2010-03-04 2010-03-10
3 a 2011-03-24 2011-04-02
3 a 2015-01-12 2015-01-14
4 e 2009-07-18 2009-07-23
4 a 2020-06-19 2020-06-27
5 c 2009-11-26 2009-11-28
5 a 2015-12-24 2015-12-25
6 c 2009-01-09 2009-01-18
6 a 2018-11-21 2018-11-30
7 b 2003-07-08 2003-07-14
7 a 2006-06-01 2006-06-06
7 a 2010-02-26 2010-03-07
8 b 2004-11-11 2004-11-21
8 a 2014-03-26 2014-03-30
8 a 2021-05-06 2021-05-12
8 c 2023-04-28 2023-05-06
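我正在尝试使用 CASE WHEN 语句来识别 4 组不同的学生:(1)从未有过 var = 'a' 的学生;(2)只有 var = 'a' 的学生;(3)曾经有过 var = 'a',但最新一行不是 'a' 的学生;(4)有过 var = 'a',之后 var 不是 'a',后来又至少一次回到 var = 'a' 的学生。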
我很难弄清楚如何使用 CASE WHEN 语句来确保没有学生被重复计入两个组。我尝试了几种不同的 CASE WHEN 写法,最后写出了以下查询:
-- First attempt: tag every row of student_table with a per-student group,
-- derived from where var = 'a' falls in the student's date-ordered rows.
-- Output columns: student, var, new_var (one output row per input row).
--
-- NOTE(review): the CASE logic is kept exactly as posted because the text
-- that follows this query shows and discusses its output. Known flaws:
--   * 'Only have a' fires whenever the latest row is 'a', even if earlier
--     rows hold other values.
--   * The 'Had a, not a, then a again' branch repeats the predicate of the
--     branch before it, so it can never be selected.
WITH student_var_sequence AS (
    -- Number each student's rows chronologically (d1, then d2).
    SELECT
        student,
        var,
        ROW_NUMBER() OVER (PARTITION BY student ORDER BY d1, d2) AS row_num
    FROM student_table  -- the only table this file defines; no sample_table exists
),
student_var_groups AS (
    -- Per student: position of the last 'a' row (0 if none) and of the last row.
    SELECT
        student,
        MAX(CASE WHEN var = 'a' THEN row_num ELSE 0 END) AS last_a_row,
        MAX(row_num) AS last_row
    FROM student_var_sequence
    GROUP BY student
),
student_var_cases AS (
    -- Attach the per-student label to each of that student's rows.
    SELECT
        svs.student,
        svs.var,
        CASE
            WHEN svg.last_a_row = 0 THEN 'Never had a'
            WHEN svg.last_a_row = svg.last_row THEN 'Only have a'
            WHEN svg.last_a_row < svg.last_row THEN 'Had a but not now'
            -- MAX(row_num) over the student's rows equals last_row, so this
            -- is the same test as the branch above.
            WHEN svg.last_a_row < MAX(svs.row_num) OVER (PARTITION BY svs.student)
                THEN 'Had a, not a, then a again'
            ELSE 'Other'
        END AS new_var
    FROM student_var_sequence AS svs
    INNER JOIN student_var_groups AS svg
        ON svs.student = svg.student
)
SELECT
    student,
    var,
    new_var
FROM student_var_cases
ORDER BY student, var;  -- deterministic output order
结果如下所示:
student var new_var
1 d Never had a
1 d Never had a
2 d Never had a
2 d Never had a
3 a Only have a
3 a Only have a
4 a Only have a
4 e Only have a
5 a Only have a
5 c Only have a
6 a Only have a
6 c Only have a
7 a Only have a
7 a Only have a
7 b Only have a
8 a Had a but not now
8 a Had a but not now
8 b Had a but not now
8 c Had a but not now
我可以看出这里的结果有错误——例如,学生 5 被归为“Only have a”(只有 a),但从数据可以看出事实并非如此。
有没有办法可以简化我的 SQL 代码来修复这些错误?
尝试2:
-- Second attempt: one output row per student (student_id, student_group).
-- Each student falls into at most one group because the CASE branches are
-- evaluated in order; student_group is NULL when none of the four groups
-- applies (e.g. a student who starts without 'a' and ends on 'a').
WITH
student_sequence AS (
    -- Number each student's rows chronologically. The table column is named
    -- student; it is exposed as student_id to keep the output column name.
    SELECT
        student AS student_id,
        var,
        ROW_NUMBER() OVER (PARTITION BY student ORDER BY d1, d2) AS row_num
    FROM student_table
),
student_summary AS (
    -- Per-student facts gathered in a single pass, replacing the chained
    -- NOT IN lookups (which return no rows if any student value is NULL).
    SELECT
        student_id,
        COUNT(CASE WHEN var = 'a' THEN 1 END) AS a_rows,
        COUNT(CASE WHEN var <> 'a' THEN 1 END) AS non_a_rows,
        MIN(CASE WHEN var = 'a' THEN row_num END) AS first_a_row,
        MAX(CASE WHEN var = 'a' THEN row_num END) AS last_a_row,
        MAX(CASE WHEN var <> 'a' THEN row_num END) AS last_non_a_row,
        MAX(row_num) AS last_row
    FROM student_sequence
    GROUP BY student_id
)
SELECT
    student_id,
    CASE
        -- Students that never had var = 'a'
        WHEN a_rows = 0 THEN 'Never had a'
        -- Students that only have var = 'a'
        WHEN non_a_rows = 0 THEN 'Only have a'
        -- Students that had var = 'a' at some point, but whose latest row is not 'a'
        WHEN last_a_row < last_row THEN 'Had a but not now'
        -- Latest row is 'a' here; require an 'a' row before some non-'a' row,
        -- i.e. the student really went a -> not a -> a.
        WHEN first_a_row < last_non_a_row THEN 'Had a, not a, then a again'
        ELSE NULL
    END AS student_group
FROM student_summary
ORDER BY student_id;
是的,您确实可以简化 SQL 查询以获得您想要的结果——但是这些标准并没有完全覆盖您的数据。以 4、5、6 号学生为例:
(4, 'e', '2009-07-18', '2009-07-23'),
(4, 'a', '2020-06-19', '2020-06-27'),
(5, 'c', '2009-11-26', '2009-11-28'),
(5, 'a', '2015-12-24', '2015-12-25'),
(6, 'c', '2009-01-09', '2009-01-18'),
(6, 'a', '2018-11-21', '2018-11-30'),
他们以前没有 'a',现在有了——那么应该如何按照您的 4 个标准对他们进行分类?是应该把他们排除在外,还是可以增加第 5 个标准“只有现在才有 a”?
假设您想把 4、5、6 号这类学生与其他标准一并纳入分类,您可以使用子查询、条件语句和聚合函数,根据指定的条件对学生进行分类。例如:
-- Classify each student by how var = 'a' appears across their rows.
-- Output: one row per student (student, group_status), ordered by student.
-- "Latest row" means the row with the highest d2 (d1 breaks ties).
--
-- Groups, checked in this order so a student lands in exactly one:
--   'Never had a'                no row has var = 'a'
--   'Only have a'                no row has a var other than 'a'
--   'Had a dont have latest a'   has 'a' rows, but the latest row is not 'a'
--   'Had a, not a, then a again' latest row is 'a' and an earlier 'a' row
--                                precedes some non-'a' row
--   'Had a only now'             latest row is 'a' and every non-'a' row
--                                comes before the first 'a' row
WITH ordered_rows AS (
    -- Number each student's rows from oldest to latest.
    SELECT
        student,
        var,
        ROW_NUMBER() OVER (PARTITION BY student ORDER BY d2, d1) AS row_num
    FROM student_table
),
student_summary AS (
    -- Per-student counts and positions of 'a' and non-'a' rows.
    SELECT
        student,
        COUNT(CASE WHEN var = 'a' THEN 1 END) AS a_rows,
        COUNT(CASE WHEN var <> 'a' THEN 1 END) AS non_a_rows,
        MIN(CASE WHEN var = 'a' THEN row_num END) AS first_a_row,
        MAX(CASE WHEN var = 'a' THEN row_num END) AS last_a_row,
        MAX(CASE WHEN var <> 'a' THEN row_num END) AS last_non_a_row,
        MAX(row_num) AS last_row
    FROM ordered_rows
    GROUP BY student
)
SELECT
    student,
    CASE
        -- The 'a' filter matters: without it this test only asks whether the
        -- student has any row at all, which is always true.
        WHEN a_rows = 0 THEN 'Never had a'
        WHEN non_a_rows = 0 THEN 'Only have a'
        WHEN last_a_row < last_row THEN 'Had a dont have latest a'
        -- From here on the latest row is 'a' and non-'a' rows exist.
        WHEN first_a_row < last_non_a_row THEN 'Had a, not a, then a again'
        ELSE 'Had a only now'
    END AS group_status
FROM student_summary
ORDER BY student;
根据您的数据查询的结果将是
"1" "Never had a"
"2" "Never had a"
"3" "Only have a"
"4" "Had a only now"
"5" "Had a only now"
"6" "Had a only now"
"7" "Had a only now"
"8" "Had a dont have latest a"
以上就是根据您的数据和标准得出的分类结果。
我认为这是正确的,但如果我误解了,请原谅我:)