这是源数据框
id | 名字 | 层 | 家长 | 孩子 |
---|---|---|---|---|
1 | A | 1 | | 3 |
2 | 1 | 1 | | 5 |
3 | B | 2 | 1 | 4 |
4 | C | 3 | 3 | 6 |
5 | 2 | 2 | 2 | 7 |
6 | D | 4 | 4 | |
7 | 3 | 3 | 5 | |
需要以动态的方式得到如下结果,因为有时子级的层数会更多(pyspark)
id | 名字 | 孩子1 | 孩子2 | 孩子3 |
---|---|---|---|---|
1 | A | B | C | D |
2 | 1 | 2 | 3 | 空 |
请在pyspark中提供动态代码
我有下面的代码,但我觉得这是静态的
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) the SparkSession.
spark = SparkSession.builder \
    .appName("Hierarchy DataFrame") \
    .getOrCreate()

# Sample hierarchy rows: (id, name, layer, parent id, child id).
data = [
    (1, 'A', 1, None, '3'),
    (2, '1', 1, None, '5'),
    (3, 'B', 2, '1', '4'),
    (4, 'C', 3, '3', '6'),
    (5, '2', 2, '2', '7'),
    (6, 'D', 4, '4', None),
    (7, '3', 3, '5', None),
]
hie_df = spark.createDataFrame(data, ['id', 'name', 'layer', 'parent', 'child'])

# The deepest layer present in the data decides how many childN columns are
# produced, so nothing below is tied to a fixed number of levels.
# The aggregate is None for an empty frame; fall back to "roots only".
max_layer = hie_df.agg(F.max('layer')).first()[0] or 1

# Layer-1 rows are the roots of each chain. `_link` carries the id of the node
# most recently attached to a chain; the next layer is joined against it.
parent_layer1 = hie_df.filter(F.col('layer') == 1)
result_df = parent_layer1.select('id', 'name', F.col('id').alias('_link'))

child_cols = []
for layer in range(2, max_layer + 1):
    child_col = 'child{}'.format(layer - 1)

    # Every frame here derives from hie_df, so joining on hie_df-style column
    # handles (df['id'] == other['parent']) is an ambiguous self-join.
    # Re-project each layer under fresh, unique column names and join by name.
    level_df = (
        hie_df
        .filter(F.col('parent').isNotNull() & (F.col('layer') == layer))
        .select(
            F.col('parent').alias('_parent'),
            F.col('id').alias('_next_link'),
            F.col('name').alias(child_col),
        )
    )

    # Left join keeps chains that stop before this layer; their new column
    # (and the link used by deeper layers) simply stays null.
    result_df = (
        result_df
        .join(level_df, F.col('_link') == F.col('_parent'), 'left')
        .select(
            'id',
            'name',
            *child_cols,
            child_col,
            F.col('_next_link').alias('_link'),
        )
    )
    child_cols.append(child_col)

# The link column is bookkeeping only.
result_df = result_df.drop('_link')

# Replace nulls in the string columns with the literal text 'NULL'.
result_df = result_df.na.fill('NULL')
result_df.show()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


def _flatten_hierarchy(nodes_df):
    """Return one row per root-to-descendant chain with child1..childN columns.

    ``nodes_df`` must have the columns ``id``, ``name``, ``layer`` and
    ``parent``. Rows with ``layer == 1`` are the roots; each deeper layer is
    attached to the previous one where ``parent`` equals the previous node's
    ``id``. The number of ``childN`` columns follows the largest ``layer``
    value found in ``nodes_df`` instead of being hard-coded.
    """
    # The aggregate is None for an empty frame; treat that as "roots only".
    deepest = nodes_df.agg(F.max('layer')).first()[0] or 1

    # `_link` holds the id of the last node attached to each chain.
    flat_df = nodes_df.filter(F.col('layer') == 1) \
        .select('id', 'name', F.col('id').alias('_link'))

    attached = []
    for depth in range(2, deepest + 1):
        label = 'child{}'.format(depth - 1)
        # All frames derive from nodes_df, so the layer is re-projected under
        # unique column names; joining by those names avoids the ambiguous
        # column references of a self-join.
        layer_df = nodes_df \
            .filter(F.col('parent').isNotNull() & (F.col('layer') == depth)) \
            .select(
                F.col('parent').alias('_parent'),
                F.col('id').alias('_next_link'),
                F.col('name').alias(label),
            )
        # Left join: chains that end early keep null in the deeper columns.
        joined_df = flat_df.join(
            layer_df, F.col('_link') == F.col('_parent'), 'left'
        )
        flat_df = joined_df.select(
            'id',
            'name',
            *attached,
            label,
            F.col('_next_link').alias('_link'),
        )
        attached.append(label)

    return flat_df.drop('_link')


# Create (or reuse) the SparkSession.
spark = SparkSession.builder \
    .appName("Hierarchy DataFrame") \
    .getOrCreate()

# Sample hierarchy rows: (id, name, layer, parent id, child id).
data = [
    (1, 'A', 1, None, '3'),
    (2, '1', 1, None, '5'),
    (3, 'B', 2, '1', '4'),
    (4, 'C', 3, '3', '6'),
    (5, '2', 2, '2', '7'),
    (6, 'D', 4, '4', None),
    (7, '3', 3, '5', None),
]
hie_df = spark.createDataFrame(data, ['id', 'name', 'layer', 'parent', 'child'])

# Root rows, kept under the original name for inspection.
parent_layer1 = hie_df.filter(F.col('layer') == 1)

# Build the wide result; the column count adapts to the data's depth.
result_df = _flatten_hierarchy(hie_df)

# Replace nulls in the string columns with the literal text 'NULL'.
result_df = result_df.na.fill('NULL')
result_df.show()