我想在 ClickHouse 上按每 5 分钟的间隔存储 p99(百分位 99)数据。但如果我想计算 10 分钟窗口的 p99,直接对两个 5 分钟间隔的 p99 取平均可能是不准确的。
我知道有“T-digest”方法,可以把每个区间的 t-digest 合并起来,估计整个窗口的百分位数。但我仍不清楚如何把 T-digest 的中间状态存储在 ClickHouse 中,或者是否还有其他更合适的做法。
任何人都可以建议我这个场景。感谢所有回复。
非常感谢。
致以诚挚的问候,
https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest/
使用 t-digest 算法计算数值数据序列的近似分位数。
在许多情况下,quantilesTDigestState 状态占用的存储空间明显小于 quantilesExactState,且近似误差通常小于 1%(实际效果可能因数据分布而异)。
使用 quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(1)) 的示例
-- Rebuild the demo database from scratch.
DROP DATABASE IF EXISTS db_test;
CREATE DATABASE db_test;

-- Raw measurements: one row per sampled CDN response.
CREATE TABLE db_test.raw_local
(
    time                 DateTime('Europe/Warsaw'),
    cdn_id               UInt32,
    continent            UInt32,
    cdn_response_time_ms UInt32
)
ENGINE = MergeTree
-- Daily partitions so time-range queries prune whole days.
PARTITION BY toYYYYMMDD(time)
ORDER BY (cdn_id, time);
-- Hourly rollup holding the full exact-quantile state.
-- Lossless but large: the state keeps every distinct value seen.
CREATE TABLE db_test.hour_local
(
    cdn_id          UInt32,
    continent       UInt32,
    grouped         DateTime,
    quantiles_exact AggregateFunction(quantilesExact(0.75, 0.9, 0.95, 0.99), UInt32)
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);

-- Fills the exact-quantile rollup on every insert into raw_local.
CREATE MATERIALIZED VIEW db_test.hour_mv TO db_test.hour_local AS
SELECT
    cdn_id,
    continent,
    toStartOfHour(time) AS grouped,
    quantilesExactState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_exact
FROM db_test.raw_local
GROUP BY cdn_id, continent, grouped;
-- Hourly rollup holding the compact t-digest state instead of the exact one.
-- Mergeable across intervals; ZSTD(1) codec shrinks it further on disk.
CREATE TABLE db_test.hour_local_optimized
(
    cdn_id            UInt32,
    continent         UInt32,
    grouped           DateTime,
    quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(1))
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);

-- Fills the t-digest rollup on every insert into raw_local.
CREATE MATERIALIZED VIEW db_test.hour_local_optimized_mv TO db_test.hour_local_optimized AS
SELECT
    cdn_id,
    continent,
    toStartOfHour(time) AS grouped,
    quantilesTDigestState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_tdigest
FROM db_test.raw_local
GROUP BY cdn_id, continent, grouped;
-- Variant of hour_local_optimized that only changes the codec level (ZSTD(2))
-- to compare on-disk compression of the same t-digest state.
CREATE TABLE db_test.hour_local_optimized_2
(
    cdn_id            UInt32,
    continent         UInt32,
    grouped           DateTime,
    quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(2))
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);

-- Fills the ZSTD(2) rollup on every insert into raw_local.
CREATE MATERIALIZED VIEW db_test.hour_local_optimized_2_mv TO db_test.hour_local_optimized_2 AS
SELECT
    cdn_id,
    continent,
    toStartOfHour(time) AS grouped,
    quantilesTDigestState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_tdigest
FROM db_test.raw_local
GROUP BY cdn_id, continent, grouped;
-- Each insert block spans many daily partitions; disable the safety limit.
SET max_partitions_per_insert_block = 0;

-- Load 800M synthetic samples in 8 batches of 100M rows.
-- Timestamps advance at 100 rows/second from 2021-01-01; cdn_id and the
-- response time come from rand(), continent cycles through 0..4.
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(0, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(100000000, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(200000000, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(300000000, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(400000000, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(500000000, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(600000000, 100000000);
INSERT INTO db_test.raw_local SELECT toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) FROM numbers(700000000, 100000000);
-- Compare on-disk footprint of the raw table vs. the three rollups.
SELECT
    database,
    table,
    formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_rate,
    sum(rows) AS rows,
    count() AS part_count
FROM system.parts
WHERE (active = 1) AND (table LIKE '%') AND (database LIKE 'db_test')
GROUP BY database, table
ORDER BY size DESC;
┌─database─┬─table──────────────────┬─compressed─┬─uncompressed─┬─compr_rate─┬──────rows─┬─part_count─┐
│ db_test │ raw_local │ 4.04 GiB │ 11.92 GiB │ 2.95 │ 800000000 │ 395 │
│ db_test │ hour_local │ 1.50 GiB │ 2.98 GiB │ 1.98 │ 33450 │ 100 │
│ db_test │ hour_local_optimized │ 41.76 MiB │ 126.26 MiB │ 3.02 │ 33450 │ 100 │
│ db_test │ hour_local_optimized_2 │ 38.99 MiB │ 126.26 MiB │ 3.24 │ 33450 │ 100 │
└──────────┴────────────────────────┴────────────┴──────────────┴────────────┴───────────┴────────────┘
-- Baseline: exact quantiles computed straight from the raw samples
-- for one day / one CDN. toYYYYMMDD(time) matches the partition key,
-- so this predicate prunes to a single partition.
SELECT
    continent,
    quantilesExact(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms)
FROM db_test.raw_local
WHERE (toYYYYMMDD(time) = 20210102)
  AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─quantilesExact(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms)─┐
│ 0 │ [218,262,276,288] │
│ 4 │ [218,261,276,288] │
│ 3 │ [218,262,276,288] │
│ 2 │ [218,261,276,288] │
│ 1 │ [218,262,276,288] │
└───────────┴─────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.036 sec. Processed 2.90 million rows, 46.36 MB (80.28 million rows/s., 1.28 GB/s.)
-- Same day-level quantiles, but obtained by merging the stored
-- per-hour exact states — must match the raw-data baseline exactly.
SELECT
    continent,
    quantilesExactMerge(0.75, 0.9, 0.95, 0.99)(quantiles_exact)
FROM db_test.hour_local
WHERE (toYYYYMMDD(grouped) = 20210102)
  AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─quantilesExactMerge(0.75, 0.9, 0.95, 0.99)(quantiles_exact)─┐
│ 0 │ [218,262,276,288] │
│ 4 │ [218,261,276,288] │
│ 3 │ [218,262,276,288] │
│ 2 │ [218,261,276,288] │
│ 1 │ [218,262,276,288] │
└───────────┴─────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.041 sec.
-- Day-level quantiles by merging the per-hour t-digest states.
-- TDigest returns Float32 approximations; round() makes the output
-- comparable to the exact integer results above.
SELECT
    continent,
    arrayMap(x -> round(x), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))
FROM db_test.hour_local_optimized
WHERE (toYYYYMMDD(grouped) = 20210102) AND (cdn_id = 2)
GROUP BY continent;  -- fixed: terminating semicolon was missing, unlike every other statement in this script
┌─continent─┬─arrayMap(lambda(tuple(x), round(x)), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))─┐
│ 0 │ [218,262,276,288] │
│ 4 │ [218,261,276,288] │
│ 3 │ [218,262,276,288] │
│ 2 │ [218,261,276,288] │
│ 1 │ [218,262,276,288] │
└───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.005 sec.
┌─continent─┬─arrayMap(lambda(tuple(x), round(x)), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))─┐
│ 0 │ [218,262,276,288] │
│ 4 │ [218,261,276,288] │
│ 3 │ [218,262,276,288] │
│ 2 │ [218,261,276,288] │
│ 1 │ [218,262,276,288] │
└───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.006 sec.
@丹尼·克莱恩 知道了。非常感谢,但如果数据在摄取到 ClickHouse 之前已经计算出每 5 分钟的 pct99(不是原始数据),如下所示。
时间  | 百分位 99
10.00 | 10
10.05 | 12
10.10 | 10
是否可以找到 10.00-10.10 之间的百分位。 感谢您的帮助。