我们通过 Kafka 连接器将数据提取到 Snowflake 中。为了提高数据读取性能(扫描更少的分区),我们决定基于存储在 RECORD_CONTENT 变体(VARIANT)字段中的某个键或键组合来定义集群键(clustering key)。
RECORD_CONTENT 字段中的数据如下所示:
{
"jsonSrc": {
"Integerfield": 1,
"SourceDateTime": "2020-06-30 05:33:08:345",
*REST_OF_THE_KEY_VALUE_PAIRS*
}
}
现在的问题是,像 SourceDateTime 这样的日期时间列上的集群不工作:
CLUSTER BY (to_date(RECORD_CONTENT:jsonSrc:loadDts::datetime))
……而在像 Integerfield 这样的字段上进行集群(clustering)却确实有效:
CLUSTER BY (RECORD_CONTENT:jsonSrc:Integerfield::int )
不工作意味着:在RECORD_CONTENT:jsonSrc:loadDts::datetime上使用过滤器时,它对扫描的分区没有影响,而在RECORD_CONTENT:jsonSrc:Integerfield::int上过滤确实执行分区修剪。
这里出了什么问题?这是bug吗?
注意:
为了获得更好的修剪效果并减少存储消耗,如果您的半结构化数据包含以下内容,我们建议将对象和关键数据展平为单独的关系列:
- 以字符串值形式存储的日期和时间戳,特别是非 ISO 8601 格式的日期和时间戳
- 字符串中的数字
- 数组
非本机值(例如日期和时间戳)在加载到 VARIANT 列时会存储为字符串,因此对这些值的操作可能比存储在具有相应数据类型的关系列中时更慢,并且占用更多空间。请参阅此链接:
https://docs.snowflake.com/en/user-guide/semistructured-considerations.html#storing-semi-structured-data-in-a-variant-column-vs-flattening-the-nested-structure
下面是一个约 3 分钟即可完成的实验,用来证明这一点:
-- Build the base test table used to evaluate partition pruning / clustering for
-- every sub-field data type stored in the VARIANT column "data".
CREATE OR REPLACE TABLE GET_PROPOSALS AS
SELECT
    ROW_NUMBER() OVER (ORDER BY SEQ8()) AS pk,
    MOD(pk, 12876381) + 1 AS city_id,  -- NDV 12876381
    CHR(65 + UNIFORM(0, 15, RANDOM()))
        || CHR(65 + UNIFORM(0, 15, RANDOM())) AS message,  -- NDV 256
    DATEADD(DAY, MOD(pk, 1095), '2020-01-01')::DATE AS transactionDate,  -- NDV 1095 days, i.e. 3 years
    DATEADD(SECOND, MOD(pk, 3600), '2020-01-01 00:00:00')::TIMESTAMP AS message_timestamp,  -- offsets of 0..3599 seconds
    -- The same date / timestamp values are stored in several encodings so that
    -- pruning can be compared per encoding.
    OBJECT_CONSTRUCT(
        'city_id', city_id,
        'message', message,
        'transactionDate', transactionDate,
        'transactionDate2', TO_CHAR(transactionDate, 'yyyy-mm-dd'),
        'transactionDate3', TO_CHAR(transactionDate, 'yymmdd'),
        'message_timestamp', message_timestamp,
        'message_timestamp2', TO_CHAR(message_timestamp, 'yyyy-mm-dd-hh24:mi:ss'),
        'message_timestamp3', DATE_PART(EPOCH_SECOND, message_timestamp)
    )::VARIANT AS data,
    RANDSTR(120, RANDOM()) AS any_other_columns  -- padding to simulate a realistic customer row width
FROM TABLE(GENERATOR(ROWCOUNT => 12960000))
ORDER BY RANDOM();  -- shuffle the rows so the table is not clustered on any column
-- Sanity check of the epoch-seconds encoding.
-- Expected result of the first query: 1577836801
SELECT DATE_PART(EPOCH_SECOND, CAST('2020-01-01 00:00:01' AS DATETIME));
-- Converts the same epoch value, passed as a string, back to a date.
SELECT TO_DATE('1577836801');
-- Show the column layout of the base table.
DESCRIBE TABLE GET_PROPOSALS;
/*
Sample content of the "data" column for one row.
Note: your values will differ because the data is randomly generated, but the
shape is the same.
{
"city_id": 586422,
"message": "NM",
"message_timestamp": "2020-01-01 00:53:41.000",
"message_timestamp2": "2020-01-01-00:53:41",
"message_timestamp3": 1577840021,
"transactionDate": "2021-08-19",
"transactionDate2": "2021-08-19",
"transactionDate3": "210819"
}
*/
-- Fetch one arbitrary row to inspect it.
SELECT *
FROM GET_PROPOSALS
LIMIT 1;
-- Experiment: pruning on a NUMBER sub-field.
-- Copy the base table, physically sorted by the city_id sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_CITY_ID AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:city_id;

-- Report clustering statistics for the city_id expression.
SELECT SYSTEM$CLUSTERING_INFORMATION('GET_PROPOSALS_CITY_ID', '(data:city_id::number)');

-- Disable the result cache so each query below really scans the table.
ALTER SESSION SET USE_CACHED_RESULT = FALSE;

-- Result reported by the original experiment: only 1 partition is scanned,
-- i.e. a number sub-field can be properly clustered.
SELECT *
FROM GET_PROPOSALS_CITY_ID
WHERE data:city_id = 586422;
-- Experiment: pruning on a character sub-field.
-- Copy the base table, physically sorted by the message sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_MESSAGE AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:message;

-- Report clustering statistics for the message expression.
SELECT SYSTEM$CLUSTERING_INFORMATION('GET_PROPOSALS_MESSAGE', '(data:message::varchar)');

-- Result reported by the original experiment: only 2 partitions are scanned,
-- i.e. a char sub-field can be properly clustered.
SELECT *
FROM GET_PROPOSALS_MESSAGE
WHERE data:message = 'NM';
-- Experiment: pruning on a DATE sub-field stored directly in the VARIANT.
-- Copy the base table, physically sorted by the transactionDate sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_TRANSACTIONDATE AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:transactionDate;

-- Result reported by the original experiment: all 256 partitions are scanned,
-- i.e. a full table scan with no pruning when sorting on a date-typed
-- sub-field directly.
SELECT *
FROM GET_PROPOSALS_TRANSACTIONDATE
WHERE data:transactionDate = '2021-08-19';
-- Experiment: DATE sub-field workaround, date stored as a 'yyyy-mm-dd' string.
-- Copy the base table, physically sorted by the transactionDate2 sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_TRANSACTIONDATE2 AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:transactionDate2;

-- Result reported by the original experiment: only 1 partition is scanned,
-- i.e. the char-encoded date can be properly clustered.
SELECT *
FROM GET_PROPOSALS_TRANSACTIONDATE2
WHERE data:transactionDate2 = '2021-08-19';
-- Experiment: DATE sub-field workaround, date stored as a 'yymmdd' string.
-- Copy the base table, physically sorted by the transactionDate3 sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_TRANSACTIONDATE3 AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:transactionDate3;

-- Result reported by the original experiment: only 1 partition is scanned,
-- i.e. the char-encoded date can be properly clustered.
SELECT *
FROM GET_PROPOSALS_TRANSACTIONDATE3
WHERE data:transactionDate3 = '210819';
-- Experiment: pruning on a TIMESTAMP sub-field stored directly in the VARIANT.
-- Copy the base table, physically sorted by the message_timestamp sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_MESSAGETIMESTAMP AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:message_timestamp;

-- Result reported by the original experiment: all 256 partitions are scanned,
-- i.e. a full table scan with no pruning when sorting on a timestamp-typed
-- sub-field directly.
SELECT *
FROM GET_PROPOSALS_MESSAGETIMESTAMP
WHERE data:message_timestamp = '2020-01-01 00:53:41.000';
-- Experiment: TIMESTAMP sub-field workaround, timestamp stored as a
-- 'yyyy-mm-dd-hh24:mi:ss' string.
-- Copy the base table, physically sorted by the message_timestamp2 sub-field.
CREATE OR REPLACE TABLE GET_PROPOSALS_MESSAGETIMESTAMP2 AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:message_timestamp2;

-- Result reported by the original experiment: only 1 partition is scanned,
-- i.e. the char-encoded timestamp can be properly clustered.
SELECT *
FROM GET_PROPOSALS_MESSAGETIMESTAMP2
WHERE data:message_timestamp2 = '2020-01-01-00:53:41';
-- Experiment: TIMESTAMP sub-field workaround, timestamp stored as an
-- epoch-seconds number (message_timestamp3).
-- Sort on the numeric epoch value itself. A DATE cast must not be used as the
-- sort key here: every generated timestamp is 2020-01-01 00:00:00 plus
-- MOD(pk, 3600) seconds, so all rows fall on the same calendar day and a
-- date-level key cannot order them by second.
CREATE OR REPLACE TABLE GET_PROPOSALS_MESSAGETIMESTAMP3 AS
    SELECT *
    FROM GET_PROPOSALS
    ORDER BY data:message_timestamp3::number;

-- Filter with an epoch-seconds number so the predicate compares the stored
-- number with a number (2020-01-01 00:53:41 corresponds to 1577840021).
-- NOTE(review): expected to prune like the numeric city_id case; confirm the
-- scanned-partition count in the query profile.
SELECT *
FROM GET_PROPOSALS_MESSAGETIMESTAMP3
WHERE data:message_timestamp3 = DATE_PART(EPOCH_SECOND, '2020-01-01 00:53:41'::timestamp);
请注意:按某个键执行 ORDER BY 会使表在该键上达到 100% 集群(clustered)。因此我只是利用这个技巧立即得到 100% 集群的表,而不必为了测试去等待自动集群(auto-clustering)完成。