一个简化的(有点傻的)例子可以把我的需求说得更清楚一些。这里的 Person(Bob/Tim)应视为人员的唯一标识符。
-- Rebuild the sample table from scratch so the script can be re-run.
DROP TABLE IF EXISTS #People;

-- Sample data for the grouping-set queries that follow.
CREATE TABLE #People
(
    Person     varchar(10),
    City       varchar(20),
    Department varchar(1),
    PersonAge  int
);

-- Explicit column list so the insert does not depend on column order.
-- Tim is listed once per department (A and B), so he occupies two rows.
-- No trailing ';' here: the statement that follows already begins with one.
INSERT INTO #People (Person, City, Department, PersonAge)
VALUES
    ('Bob', 'New York', 'A', 40),
    ('Tim', 'New York', 'A', 30),
    ('Tim', 'New York', 'B', 30)
-- Two-step aggregation.
-- Step 1 (CTE): collapse #People to one row per (Person, City, Department).
-- Step 2: roll those rows up by City, by Department, and by the
--         (City, Department) pair.
-- Every average is taken over the step-1 rows, so a person who appears under
-- several departments contributes one age value per department to the
-- City-level average.
;WITH PersonCityDepartment AS
(
    SELECT
        Person,
        City,
        Department,
        MAX(PersonAge) AS PersonAge,
        COUNT(*)       AS NumRows
    FROM #People
    GROUP BY
        Person,
        City,
        Department
)
-- To inspect the intermediate rows, select from PersonCityDepartment directly.
SELECT
    City,
    Department,
    AVG(PersonAge)         AS AveragePersonAge,
    COUNT(DISTINCT Person) AS PersonCount
FROM PersonCityDepartment
GROUP BY
    GROUPING SETS
    (
        City,
        Department,
        (City, Department)
    )
ORDER BY
    City,
    Department
-- Variant that applies the grouping sets inside the first aggregation step.
-- Step 1 (CTE): for each grouping set (City, Department, and the pair), emit
--               one row per Person, tagged with a GROUPING_ID value so rows
--               from different grouping sets can be told apart.
-- Step 2: regroup by that tag plus City and Department, so each person is
--         represented by a single row within every output group.
;WITH PersonPerGroupingSet AS
(
    SELECT
        GROUPING_ID(Person, City, Department) AS GROUPID,
        Person,
        City,
        Department,
        MAX(PersonAge) AS PersonAge,
        COUNT(*)       AS NumRows
    FROM #People
    GROUP BY
        Person,
        GROUPING SETS
        (
            City,
            Department,
            (City, Department)
        )
)
-- To inspect the intermediate rows, select from PersonPerGroupingSet directly.
SELECT
    City,
    Department,
    AVG(PersonAge)         AS AveragePersonAge,
    COUNT(DISTINCT Person) AS PersonCount
FROM PersonPerGroupingSet
GROUP BY
    GROUPID,
    City,
    Department
ORDER BY
    City,
    Department
这两个查询在功能上并不等价。查看 New York 的汇总行可以发现:第一个查询得到的平均年龄 (33) 是错误的——Tim 在两个部门各有一行,因此被计算了两次,即 (40+30+30)/3 的整数结果;而第二个查询得到的平均年龄 (35) 是正确的,即 (40+30)/2。第二个查询在功能上是正确的,但是对于具有大量分组集的更复杂的场景,它会在查询计划的串联 (Concatenation) 步骤中使基数急剧膨胀,从而导致性能无法接受。在这个简单的示例中,可以看到第一个查询把行数保持在 5 行,而第二个查询膨胀到了 8 行。在具有大量分组集的更复杂的情况下,我曾见过行数成倍膨胀,达到数百万行的规模。
有没有办法解决这个问题,避免中间结果的行数膨胀到超出最终输出的目标粒度?
我采用第一种方式,但调整了聚合的顺序,并改用窗口函数:
-- Alternative approach: compute the average age and row count for each
-- (City, Department) pair with window functions first, then roll up with
-- GROUPING SETS, taking MAX of those precomputed values.
-- NOTE(review): for the City-only and Department-only grouping sets, MAX
-- returns the largest per-(City, Department) value, which is not in general
-- the average age of that wider group. PersonCount is likewise derived from
-- COUNT(*) over rows, not from distinct persons. Confirm the results on data
-- beyond the three sample rows before relying on this query.
;WITH CityDepartmentStats AS
(
    SELECT
        City,
        Department,
        AVG(PersonAge) OVER (PARTITION BY City, Department) AS AvgAgeInCityDept,
        COUNT(*)       OVER (PARTITION BY City, Department) AS RowsInCityDept
    FROM #People
)
SELECT
    City,
    Department,
    MAX(AvgAgeInCityDept) AS AveragePersonAge,
    MAX(RowsInCityDept)   AS PersonCount
FROM CityDepartmentStats
GROUP BY
    GROUPING SETS
    (
        Department,
        City,
        (Department, City)
    )
ORDER BY
    City,
    Department