我正在尝试转换存储在 hive 表中的以下 json 字符串。
输入:
COL1 JSON_STRING
1 {"COL2": {"REFA": "9", "REFB": "9"}, "COL3": {"REFA": "80000.0000", "REFB": "80001.0000"}, "COL4": {"REFA": "0.0000", "REFB": "0.0000"}}
预期输出:
COL1 REF COL2 COL3 COL4
1 REFA 9 80000 0
1 REFB 9 80001 0
1 DIFFERENCE 0 -1 0
我编写了以下查询,只得到了部分结果:
-- Flatten the nested JSON held in JSON_STRING into one wide row per COL1:
-- the REFA and REFB values of COL2 / COL3 / COL4 plus their differences.
-- The values are extracted from JSON text, so each subtraction relies on the
-- engine converting them to numbers implicitly.
SELECT
    COL1,
    COL2_REFA, COL2_REFB, (COL2_REFA - COL2_REFB) AS COL2_DIFFERENCE,
    COL3_REFA, COL3_REFB, (COL3_REFA - COL3_REFB) AS COL3_DIFFERENCE,
    -- Last select item: it must not be followed by a comma before FROM.
    COL4_REFA, COL4_REFB, (COL4_REFA - COL4_REFB) AS COL4_DIFFERENCE
FROM
    TABLE T1
-- Level 1: pull the three nested objects out of the top-level JSON document.
LATERAL VIEW json_tuple(T1.JSON_STRING,'COL2','COL3','COL4') T2 AS `COL2`,`COL3`,`COL4`
-- Level 2: split each nested object into its REFA / REFB members.
LATERAL VIEW json_tuple(T2.COL2,'REFA','REFB') T3 AS `COL2_REFA`,`COL2_REFB`
LATERAL VIEW json_tuple(T2.COL3,'REFA','REFB') T4 AS `COL3_REFA`,`COL3_REFB`
LATERAL VIEW json_tuple(T2.COL4,'REFA','REFB') T5 AS `COL4_REFA`,`COL4_REFB`
以上查询结果:
COL1 COL2_REFA COL2_REFB COL2_DIFFERENCE COL3_REFA COL3_REFB COL3_DIFFERENCE COL4_REFA COL4_REFB COL4_DIFFERENCE
1 9 9 0 80000 80001 -1 0 0 0
我不确定这是否是展平嵌套 JSON 字符串的有效方法。如何高效地得到预期输出?查询还需要兼容 PySpark SQL(2.4)。
谢谢
请参考以下解决方案:
-- Unpivot the nested JSON into three rows per COL1: one for REFA, one for
-- REFB and one holding REFA - REFB for each of COL2 / COL3 / COL4.
SELECT
    COL1,
    -- inline() explodes the array of structs into rows; the struct field
    -- names (REF, COL2, COL3, COL4) become the output column names.
    inline(array(
        named_struct('REF', 'REFA', 'COL2', col2_refa, 'COL3', col3_refa, 'COL4', col4_refa),
        named_struct('REF', 'REFB', 'COL2', col2_refb, 'COL3', col3_refb, 'COL4', col4_refb),
        named_struct(
            'REF', 'DIFFERENCE',
            'COL2', (col2_refa - col2_refb),
            'COL3', (col3_refa - col3_refb),
            'COL4', (col4_refa - col4_refb)
        )
    ))
FROM (
    -- Extract each JSON leaf once so the structs above can reuse the values.
    SELECT
        COL1,
        get_json_object(data,'$.COL2.REFA') AS col2_refa,
        get_json_object(data,'$.COL2.REFB') AS col2_refb,
        get_json_object(data,'$.COL3.REFA') AS col3_refa,
        get_json_object(data,'$.COL3.REFB') AS col3_refb,
        get_json_object(data,'$.COL4.REFA') AS col4_refa,
        get_json_object(data,'$.COL4.REFB') AS col4_refb
    FROM source
) extracted
输出
+----+----------+----+----------+------+
|COL1|REF |COL2|COL3 |COL4 |
+----+----------+----+----------+------+
|1 |REFA |9 |80000.0000|0.0000|
|1 |REFB |9 |80001.0000|0.0000|
|1 |DIFFERENCE|0.0 |-1.0 |0.0 |
+----+----------+----+----------+------+
感谢斯里尼瓦斯为我指明了正确的方向。不过在您的查询中,除了由 JSON 字符串展开得到的列之外,无法同时选择其他列,会抛出以下错误:
UDTF's are not supported outside the SELECT clause
改为将横向视图(LATERAL VIEW)与 inline UDTF 结合使用后,查询可以正常运行。我把这个查询发布出来,供有类似使用场景的其他用户参考。
-- Self-contained example: unpivot the nested JSON into three rows per COL1
-- (REFA, REFB, DIFFERENCE). inline() is used through LATERAL VIEW so that
-- COL1 can be selected next to the generated columns.
SELECT
    T1.COL1,
    T2.*
FROM (
    -- One literal sample row standing in for the real table.
    SELECT
        1 COL1,
        '{"COL2": {"REFA": "9", "REFB": "9"}, "COL3": {"REFA": "80000.0000", "REFB": "80001.0000"}, "COL4": {"REFA": "0.0000", "REFB": "0.0000"}}' AS JSON_STRING
) AS T1
LATERAL VIEW inline(array(
    -- Row 1: the REFA values as stored in the JSON.
    named_struct(
        'SYSTEM', 'REFA',
        'COL2', get_json_object(T1.JSON_STRING,'$.COL2.REFA'),
        'COL3', get_json_object(T1.JSON_STRING,'$.COL3.REFA'),
        'COL4', get_json_object(T1.JSON_STRING,'$.COL4.REFA')
    ),
    -- Row 2: the REFB values as stored in the JSON.
    named_struct(
        'SYSTEM', 'REFB',
        'COL2', get_json_object(T1.JSON_STRING,'$.COL2.REFB'),
        'COL3', get_json_object(T1.JSON_STRING,'$.COL3.REFB'),
        'COL4', get_json_object(T1.JSON_STRING,'$.COL4.REFB')
    ),
    -- Row 3: REFA minus REFB for every column.
    named_struct(
        'SYSTEM', 'DIFFERENCE',
        'COL2', (get_json_object(T1.JSON_STRING,'$.COL2.REFA')-get_json_object(T1.JSON_STRING,'$.COL2.REFB')),
        'COL3', (get_json_object(T1.JSON_STRING,'$.COL3.REFA')-get_json_object(T1.JSON_STRING,'$.COL3.REFB')),
        'COL4', (get_json_object(T1.JSON_STRING,'$.COL4.REFA')-get_json_object(T1.JSON_STRING,'$.COL4.REFB'))
    )
)) T2 AS SYSTEM, COL2, COL3, COL4
结果:
+----------+-------------+----------+-------------+----------+
| t1.col1 | t2.system | t2.col2 | t2.col3 | t2.col4 |
+----------+-------------+----------+-------------+----------+
| 1 | REFA | 9 | 80000.0000 | 0.0000 |
| 1 | REFB | 9 | 80001.0000 | 0.0000 |
| 1 | DIFFERENCE | 0.0 | -1.0 | 0.0 |
+----------+-------------+----------+-------------+----------+