我有一个具有以下架构的
df
,
g_hut: string
date: date
arr_data:array
element:struct
Id:string
Q_Id:string
Q_Type:string
我想将
arr_data
列从 Array(Struct)
转换为 Array(Map)
。
g_hut: string
date: date
arr_data:array
element:map
key:string
value:string
原始
arr_data
列的行看起来像这样,
arr_data:
[
{'Id': '12a', 'Q_Id': 'uac', 'Q_Type': 'action'},
{'Id': '', 'Q_Id': '', 'Q_Type': ''},
{'Id': '76v', 'Q_Id': '', 'Q_Type': 'form'}
]
我尝试了以下方法,
# NOTE(review): this is the failing attempt quoted in the question.
# `f.col("arr_data.Id")` resolves the `Id` field across the WHOLE array,
# producing one array<string> per row; create_map is then called once per
# row, so the result is a single map of column-wise arrays (the output
# shown below) instead of one map per array element. Converting each
# element needs a higher-order function such as `transform` (see the
# answers further down).
df = df.withColumn("arr_data_map", f.array(f.create_map(
f.lit("Id"), f.col("arr_data.Id"),
f.lit("Q_Id"), f.col("arr_data.Q_Id"),
f.lit("Q_Type"), f.col("arr_data.Q_Type")
)))
我得到以下结果,
[
{'Id': ['12a', '', '76v']},
{'Q_Id': ['uac', '','']},
{'Q_Type': ['action', '', 'form']}
]
这不是我想要的。我想要原始的
arr_data
与上面提到的 Map
模式。我怎样才能实现这个目标?
下面创建一个示例
df
(原始),其模式具有数组(结构),
# Sample rows: (g_hut, date, arr_data), where arr_data is a list of
# {'Id', 'Q_Id', 'Q_Type'} records.
row_a = ('A', datetime.date(2022, 1, 1), [
    {'Id': '12a', 'Q_Id': 'uac', 'Q_Type': 'action'},
    {'Id': '', 'Q_Id': '', 'Q_Type': ''},
    {'Id': '76v', 'Q_Id': '', 'Q_Type': 'form'},
])
row_b = ('B', datetime.date(2022, 1, 2), [
    {'Id': '34b', 'Q_Id': 'abc', 'Q_Type': 'action'},
    {'Id': '56c', 'Q_Id': 'def', 'Q_Type': 'form'},
    {'Id': '78d', 'Q_Id': 'ghi', 'Q_Type': 'action'},
])
data = [row_a, row_b]

# Schema: g_hut string, date date, arr_data array<struct<Id,Q_Id,Q_Type>>.
element = (t.StructType()
           .add("Id", t.StringType())
           .add("Q_Id", t.StringType())
           .add("Q_Type", t.StringType()))
schema = (t.StructType()
          .add("g_hut", t.StringType())
          .add("date", t.DateType())
          .add("arr_data", t.ArrayType(element)))

# Build the DataFrame with the explicit array-of-struct schema.
df = spark.createDataFrame(data, schema=schema)
应用高阶转换函数将数组内的每个结构转换为相应的映射表示
# Rebuild each struct element of arr_data as a map<string,string> using the
# SQL higher-order `transform` function; `map(k1, v1, ...)` is the SQL
# equivalent of F.create_map. Column names and ordering are unchanged.
out = df.withColumn(
    'arr_data',
    F.expr(
        "transform(arr_data, "
        "x -> map('Id', x.Id, 'Q_Id', x.Q_Id, 'Q_Type', x.Q_Type))"
    ),
)
out.show()
+-----+----------+----------------------------------------------------------------------------------------------------------------------------------+
|g_hut|date |arr_data |
+-----+----------+----------------------------------------------------------------------------------------------------------------------------------+
|A |2022-01-01|[{Id -> 12a, Q_Id -> uac, Q_Type -> action}, {Id -> , Q_Id -> , Q_Type -> }, {Id -> 76v, Q_Id -> , Q_Type -> form}] |
|B |2022-01-02|[{Id -> 34b, Q_Id -> abc, Q_Type -> action}, {Id -> 56c, Q_Id -> def, Q_Type -> form}, {Id -> 78d, Q_Id -> ghi, Q_Type -> action}]|
+-----+----------+----------------------------------------------------------------------------------------------------------------------------------+
为了达到预期的结果,您可以使用transform函数和map函数将结构体数组转换为映射数组。以下是如何做到这一点的示例:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Create a Spark session
spark = SparkSession.builder.appName("example").getOrCreate()

# Sample data: (g_hut, date, arr_data) rows.
data = [
    ('A', '2022-01-01', [{'Id': '12a', 'Q_Id': 'uac', 'Q_Type': 'action'},
                         {'Id': '', 'Q_Id': '', 'Q_Type': ''},
                         {'Id': '76v', 'Q_Id': '', 'Q_Type': 'form'}]),
    ('B', '2022-01-02', [{'Id': '34b', 'Q_Id': 'abc', 'Q_Type': 'action'},
                         {'Id': '56c', 'Q_Id': 'def', 'Q_Type': 'form'},
                         {'Id': '78d', 'Q_Id': 'ghi', 'Q_Type': 'action'}])
]

# Define an explicit array<struct> schema. With only column names
# (schema=["g_hut", "date", "arr_data"]) Spark would infer the dict
# elements as map<string,string>, leaving nothing to convert.
schema = T.StructType([
    T.StructField("g_hut", T.StringType()),
    T.StructField("date", T.StringType()),
    T.StructField("arr_data", T.ArrayType(T.StructType([
        T.StructField("Id", T.StringType()),
        T.StructField("Q_Id", T.StringType()),
        T.StructField("Q_Type", T.StringType()),
    ]))),
])

# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

# Define a function to convert struct to map
def struct_to_map(struct_column):
    """Return one arr_data struct element as a map<string,string>.

    create_map expects a flat key1, value1, key2, value2, ... argument
    list, so interleave each literal field name with that field's value.
    (The original comprehension placed two expressions before the `for`,
    which is a SyntaxError, and read the fields via F.col("arr_data.…"),
    which collapses the whole array column instead of one element.)
    """
    fields = ["Id", "Q_Id", "Q_Type"]
    pairs = [part for name in fields
             for part in (F.lit(name), struct_column[name])]
    return F.create_map(*pairs)

# Apply the conversion per array element with the higher-order transform.
# The original expr nested `transform(x, y -> ...)`, but `x` is a struct,
# not an array, so that expression does not parse/resolve — and it built
# named_structs rather than the requested maps.
df_transformed = df.withColumn("arr_data_map",
                               F.transform("arr_data", struct_to_map))

# Show the result
df_transformed.select("g_hut", "date", "arr_data_map").show(truncate=False)