Aggregation - percentiles in ClickHouse

Question

I want to store 99th-percentile (p99) values computed over 5-minute intervals in ClickHouse. But if I later want the p99 over 10 minutes, averaging the p99 of the two 5-minute intervals may be inaccurate.

I know about the t-digest approach, where the t-digests of the individual intervals can be merged to estimate the percentile over the whole range. However, I am still confused about how to store the t-digest results in ClickHouse, or whether there is another suitable way to do this.

Can anyone suggest an approach for this scenario? All replies are appreciated.

Many thanks.

Best regards,

python hash aggregation clickhouse percentile
2 Answers
0 votes

https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest/

Computes an approximate quantile of a numeric data sequence using the t-digest algorithm.

In many cases the quantilesTDigest state uses dramatically less storage than the quantilesExact state, and the approximation error is below 1% (your mileage may vary).
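Before the full schema below, a minimal standalone sketch of the function itself; it uses only the built-in numbers() table function, so no schema is assumed:

-- Approximate p99 of the integers 0..999999 (should land close to 990000)
SELECT quantileTDigest(0.99)(number) FROM numbers(1000000);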

An example using quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(1)):

drop database if exists db_test;

create database db_test;


-- Raw, non-aggregated measurements
CREATE TABLE db_test.raw_local (
  time DateTime('Europe/Warsaw'),
  cdn_id UInt32,
  continent UInt32,
  cdn_response_time_ms UInt32
)
ENGINE = MergeTree PARTITION BY toYYYYMMDD(time)
ORDER BY (cdn_id,time);


-- Hourly rollup storing exact-quantile states (accurate, but the states are large)
CREATE TABLE db_test.hour_local (
   cdn_id UInt32,
   continent UInt32,
   grouped DateTime,
   quantiles_exact AggregateFunction(quantilesExact(0.75, 0.9, 0.95, 0.99), UInt32)
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);


-- Keeps hour_local up to date on every insert into raw_local
CREATE MATERIALIZED VIEW db_test.hour_mv TO db_test.hour_local AS
SELECT cdn_id,
       continent,
       toStartOfHour(time) AS grouped,
       quantilesExactState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_exact
FROM db_test.raw_local
GROUP BY cdn_id, continent, grouped;


-- Hourly rollup storing compact t-digest states instead of exact states
CREATE TABLE db_test.hour_local_optimized (
   cdn_id UInt32,
   continent UInt32,
   grouped DateTime,
   quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(1))
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);

CREATE MATERIALIZED VIEW db_test.hour_local_optimized_mv TO db_test.hour_local_optimized AS
SELECT cdn_id,
       continent,
       toStartOfHour(time) AS grouped,
       quantilesTDigestState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_tdigest
FROM db_test.raw_local
GROUP BY cdn_id, continent, grouped;


-- Same as hour_local_optimized, but with ZSTD level 2 compression
CREATE TABLE db_test.hour_local_optimized_2 (
    cdn_id UInt32,
    continent UInt32,
    grouped DateTime,
    quantiles_tdigest AggregateFunction(quantilesTDigest(0.75, 0.9, 0.95, 0.99), UInt32) CODEC (ZSTD(2))
)
ENGINE = AggregatingMergeTree
PARTITION BY toYYYYMMDD(grouped)
ORDER BY (cdn_id, continent, grouped);



CREATE MATERIALIZED VIEW db_test.hour_local_optimized_2_mv TO db_test.hour_local_optimized_2 AS
SELECT cdn_id,
       continent,
       toStartOfHour(time) AS grouped,
       quantilesTDigestState(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms) AS quantiles_tdigest
FROM db_test.raw_local
GROUP BY cdn_id, continent, grouped;


-- Allow a single INSERT to create parts in any number of partitions
SET max_partitions_per_insert_block = 0;
-- 8 batches of 100M synthetic rows: ~100 rows per simulated second starting 2021-01-01
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(0,100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(100000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(200000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(300000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(400000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(500000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(600000000, 100000000);
insert into db_test.raw_local select toDateTime('2021-01-01 00:00:00') + (number / 100), rand() % 3, number % 5, intDiv(rand() % 32323, 111) from numbers(700000000, 100000000);


-- Compare on-disk footprint of the raw table and the three hourly rollups
SELECT
    database,
    table,
    formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_rate,
    sum(rows) AS rows,
    count() AS part_count
FROM system.parts
WHERE (active = 1) AND (table LIKE '%') AND (database LIKE 'db_test')
GROUP BY
    database,
    table
ORDER BY size DESC;


┌─database─┬─table──────────────────┬─compressed─┬─uncompressed─┬─compr_rate─┬──────rows─┬─part_count─┐
│ db_test  │ raw_local              │ 4.04 GiB   │ 11.92 GiB    │       2.95 │ 800000000 │        395 │
│ db_test  │ hour_local             │ 1.50 GiB   │ 2.98 GiB     │       1.98 │     33450 │        100 │
│ db_test  │ hour_local_optimized   │ 41.76 MiB  │ 126.26 MiB   │       3.02 │     33450 │        100 │
│ db_test  │ hour_local_optimized_2 │ 38.99 MiB  │ 126.26 MiB   │       3.24 │     33450 │        100 │
└──────────┴────────────────────────┴────────────┴──────────────┴────────────┴───────────┴────────────┘


-- Baseline: exact quantiles computed directly from the raw data
SELECT
    continent,
    quantilesExact(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms)
FROM db_test.raw_local
WHERE (toYYYYMMDD(time) = 20210102) AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─quantilesExact(0.75, 0.9, 0.95, 0.99)(cdn_response_time_ms)─┐
│         0 │ [218,262,276,288]                                           │
│         4 │ [218,261,276,288]                                           │
│         3 │ [218,262,276,288]                                           │
│         2 │ [218,261,276,288]                                           │
│         1 │ [218,262,276,288]                                           │
└───────────┴─────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.036 sec. Processed 2.90 million rows, 46.36 MB (80.28 million rows/s., 1.28 GB/s.)

-- Same values from the exact-state hourly rollup
SELECT
    continent,
    quantilesExactMerge(0.75, 0.9, 0.95, 0.99)(quantiles_exact)
FROM db_test.hour_local
WHERE (toYYYYMMDD(grouped) = 20210102) AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─quantilesExactMerge(0.75, 0.9, 0.95, 0.99)(quantiles_exact)─┐
│         0 │ [218,262,276,288]                                           │
│         4 │ [218,261,276,288]                                           │
│         3 │ [218,262,276,288]                                           │
│         2 │ [218,261,276,288]                                           │
│         1 │ [218,262,276,288]                                           │
└───────────┴─────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.041 sec.

-- Same values from the t-digest rollup (far less data read)
SELECT
    continent,
    arrayMap(x -> round(x), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))
FROM db_test.hour_local_optimized
WHERE (toYYYYMMDD(grouped) = 20210102) AND (cdn_id = 2)
GROUP BY continent;
┌─continent─┬─arrayMap(lambda(tuple(x), round(x)), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))─┐
│         0 │ [218,262,276,288]                                                                                     │
│         4 │ [218,261,276,288]                                                                                     │
│         3 │ [218,262,276,288]                                                                                     │
│         2 │ [218,261,276,288]                                                                                     │
│         1 │ [218,262,276,288]                                                                                     │
└───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.005 sec.


The same query against db_test.hour_local_optimized_2 returns identical values:

┌─continent─┬─arrayMap(lambda(tuple(x), round(x)), quantilesTDigestMerge(0.75, 0.9, 0.95, 0.99)(quantiles_tdigest))─┐
│         0 │ [218,262,276,288]                                                                                     │
│         4 │ [218,261,276,288]                                                                                     │
│         3 │ [218,262,276,288]                                                                                     │
│         2 │ [218,261,276,288]                                                                                     │
│         1 │ [218,262,276,288]                                                                                     │
└───────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────┘
5 rows in set. Elapsed: 0.006 sec.
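Mapping this back to the original 5-minute question: the same pattern works with 5-minute buckets, and a 10-minute (or any longer) p99 falls out by merging the stored states. A minimal sketch building on the raw_local table above; the five_min_local table, its MV, and the window boundaries are made up for illustration:

-- Assumed 5-minute rollup, analogous to the hourly tables above
CREATE TABLE db_test.five_min_local (
    cdn_id UInt32,
    grouped DateTime,
    p99_tdigest AggregateFunction(quantileTDigest(0.99), UInt32)
)
ENGINE = AggregatingMergeTree
ORDER BY (cdn_id, grouped);

CREATE MATERIALIZED VIEW db_test.five_min_mv TO db_test.five_min_local AS
SELECT cdn_id,
       toStartOfFiveMinutes(time) AS grouped,
       quantileTDigestState(0.99)(cdn_response_time_ms) AS p99_tdigest
FROM db_test.raw_local
GROUP BY cdn_id, grouped;

-- p99 over a 10-minute window: merge the two 5-minute t-digest states
SELECT quantileTDigestMerge(0.99)(p99_tdigest)
FROM db_test.five_min_local
WHERE cdn_id = 2
  AND grouped >= toDateTime('2021-01-02 10:00:00')
  AND grouped <  toDateTime('2021-01-02 10:10:00');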

0 votes

@Denny Crane Got it, thank you very much. But what if, before ingestion into ClickHouse, the data has already been aggregated to a p99 per 5 minutes (i.e. it is not raw data), like below:

time  | percentile 99
10.00 | 10
10.05 | 12
10.10 | 10

Is it possible to find the percentile over 10.00-10.10? Thanks for your help.
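For illustration: once only the scalar p99 values are stored, the underlying digest is lost, so the 10.00-10.10 percentile can only be approximated, e.g. by a plain average, which is exactly the inaccuracy raised at the top of this thread. A sketch with a hypothetical pre_agg table standing in for the pre-aggregated feed:

-- Hypothetical table holding the pre-computed 5-minute p99 values
CREATE TABLE db_test.pre_agg (
    time DateTime,
    p99 Float64
)
ENGINE = MergeTree
ORDER BY time;

INSERT INTO db_test.pre_agg VALUES
    ('2024-01-01 10:00:00', 10),
    ('2024-01-01 10:05:00', 12),
    ('2024-01-01 10:10:00', 10);

-- Only an approximation of the true 10-minute p99, not an exact merge
SELECT avg(p99)
FROM db_test.pre_agg
WHERE time BETWEEN toDateTime('2024-01-01 10:00:00') AND toDateTime('2024-01-01 10:10:00');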
