TMP 表定义(我们插入所有数据的地方)
-- Staging (TMP) table: every raw row is inserted here first.
-- Rows are partitioned by calendar day and expire one hour after start_time.
CREATE TABLE guardicore.mv_test_data_tmp
(
    `some_id`    UUID,
    `count`      Nullable(UInt32),
    `aggr_id`    FixedString(64),
    `id`         UUID,
    `start_time` DateTime64(6)
)
ENGINE = MergeTree
PARTITION BY toDate(start_time)
ORDER BY start_time
TTL toDateTime(start_time) + INTERVAL 1 HOUR
SETTINGS
    allow_nullable_key = 1,
    index_granularity = 8192
最终表定义(用于聚合数据)
-- Final table that holds the aggregated data.
-- SummingMergeTree keyed on (start_time, aggr_id), partitioned by calendar day.
CREATE TABLE guardicore.mv_test_data
(
    `some_id`    UUID,
    `count`      Nullable(UInt32),
    `aggr_id`    FixedString(64),
    `id`         UUID,
    `start_time` DateTime64(6)
)
ENGINE = SummingMergeTree
PARTITION BY toDate(start_time)
ORDER BY (start_time, aggr_id)
SETTINGS
    allow_nullable_key = 1,
    index_granularity = 8192
最终表的物化视图(Materialized View):
-- Materialized view that rolls rows from the TMP table up into 15-minute
-- buckets per aggr_id and writes the result to guardicore.mv_test_data.
-- NOTE(review): `id` is declared in the column list but the SELECT does not
-- produce it; `start_time` / `count` are declared with different types than
-- the target table (DateTime vs DateTime64(6), UInt64 vs UInt32) - confirm
-- whether this is intended.
CREATE MATERIALIZED VIEW guardicore.mv_test_data_mv TO guardicore.mv_test_data
(
    `aggr_id`    FixedString(64),
    `start_time` DateTime,
    `count`      Nullable(UInt64),
    `some_id`    UUID,
    `id`         UUID
) AS
SELECT
    aggr_id,
    toStartOfFifteenMinutes(start_time) AS start_time,  -- 15-minute bucket
    sum(count)                          AS count,
    anyLast(some_id)                    AS some_id
FROM guardicore.mv_test_data_tmp
GROUP BY
    aggr_id,
    start_time
ORDER BY
    aggr_id,
    start_time
问题是:启用 MV 并进行聚合后,ClickHouse 消耗的内存和 CPU 大约是原来的 2-3 倍。我是否在定义中遗漏了什么,或者应该检查某些配置?目前所有配置均为 ClickHouse 的默认配置。测试内容是在 4 小时内向 TMP 表进行 300,000,000 次插入。
对 `count` 字段使用 Nullable(可空)类型没有意义,直接插入 0 值即可。
Nullable 字段的性能很差。
对于 guardicore.mv_test_data,最好使用 `ORDER BY (aggr_id, start_time)`,以便日后的 SELECT 查询。
内存和 CPU 的使用量取决于每条 INSERT 语句中 `(aggr_id, start_time)` 组合值的基数:组合越多,处理 MATERIALIZED VIEW 触发器时 GROUP BY 和 ORDER BY 消耗的内存和 CPU 就越多。
能否分享一下以下查询的结果:
-- Compare the total row count with the number of distinct
-- (aggr_id, 15-minute bucket) pairs in the TMP table.
SELECT
    count(),
    uniq((aggr_id, toStartOfFifteenMinutes(start_time))) AS uniq_combinations
FROM guardicore.mv_test_data_tmp
以便了解聚合的效率?
此外,我建议在 MV 和目标表中使用一个不同的字段名,而不是沿用 `start_time`。
可以尝试类似下面的定义:
-- Revised staging (TMP) table: `count` is a plain UInt32 (no Nullable).
-- Rows are partitioned by calendar day and expire one hour after start_time.
CREATE TABLE guardicore.mv_test_data_tmp
(
    `id`         UUID,
    `some_id`    UUID,
    `count`      UInt32,
    `aggr_id`    FixedString(64),
    `start_time` DateTime64(6)
)
ENGINE = MergeTree
PARTITION BY toDate(start_time)
ORDER BY start_time
TTL toDateTime(start_time) + INTERVAL 1 HOUR;
-- Revised aggregation target table, keyed on (aggr_id, start_15m).
-- `start_15m` holds the 15-minute bucket start, deliberately named differently
-- from the source column `start_time`.
-- `count` is a non-nullable UInt64: sum() over a UInt32 column yields UInt64,
-- so a narrower or Nullable column here would only add overflow risk and
-- Nullable overhead.
CREATE TABLE guardicore.mv_test_data
(
    `id`        UUID,
    `some_id`   UUID,
    `count`     UInt64,
    `aggr_id`   FixedString(64),
    `start_15m` DateTime
)
ENGINE = SummingMergeTree
PARTITION BY toDate(start_15m)
ORDER BY (aggr_id, start_15m);
-- Revised materialized view: rolls rows from the TMP table up into 15-minute
-- buckets per aggr_id and writes them to guardicore.mv_test_data.
-- The declared column list matches what the SELECT produces: the bucket
-- column is `start_15m` (not `start_time`), and `count` is the non-nullable
-- UInt64 returned by sum() over the UInt32 source column.
-- No ORDER BY in the SELECT: the MergeTree-family target table orders each
-- inserted block by its own sorting key, so sorting here is redundant work.
CREATE MATERIALIZED VIEW guardicore.mv_test_data_mv TO guardicore.mv_test_data
(
    `aggr_id`   FixedString(64),
    `start_15m` DateTime,
    `count`     UInt64,
    `some_id`   UUID,
    `id`        UUID
) AS
SELECT
    aggr_id,
    anyLast(id)                         AS id,
    anyLast(some_id)                    AS some_id,
    toStartOfFifteenMinutes(start_time) AS start_15m,  -- 15-minute bucket
    sum(count)                          AS count
FROM guardicore.mv_test_data_tmp
GROUP BY
    aggr_id,
    start_15m