我有一个以下格式的数据集。
样本数据集:
现在我需要将以下格式的信息按 (Test, Opt_Marks, TotalMarks) 进行转置。
期望输出:
我在下面尝试过,但没有得到所需的输出。
from pyspark.sql.functions import explode,create_map, lit
from functools import reduce
# Flatten [lit(name), column-ref] pairs into one argument list for
# create_map, then explode the resulting map into (key, value) rows.
# NOTE: the original fold (lambda y, x: x + y) prepends each pair, so we
# walk the columns in reverse to reproduce the exact same pair ordering.
map_args = []
for column_name in reversed(df_output.columns):
    map_args.extend([lit(column_name), column_name])
transpose_df = df_output.select(explode(create_map(*map_args)))
任何提示或参考程序都会有所帮助。谢谢
我将测试名称添加到
Opt_Marks
和 TotalMarks
以区分列名称:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this example.
session_builder = SparkSession.builder.appName("pyspark_playground")
spark = session_builder.getOrCreate()
def test_columns_adder(df, test_name):
    """Append three per-test columns to *df* for the given test name.

    Adds:
      * ``<test_name>``            -- 'Yes'/'No' flag telling whether the
        student took this test (membership check on the collected
        ``Test`` array)
      * ``Opt_Marks_<test_name>``  -- marks obtained, looked up by key in
        the ``Opt_Marks`` map (NULL when the test was not taken)
      * ``TotalMarks_<test_name>`` -- total marks, looked up by key in
        the ``TotalMarks`` map (NULL when the test was not taken)

    Returns the DataFrame with the three columns appended.
    """
    taken_flag = f.when(
        f.array_contains(f.col('Test'), test_name), f.lit('Yes')
    ).otherwise(f.lit('No'))
    df = df.withColumn(test_name, taken_flag)
    # element_at with a plain Python string treats it as a literal map key.
    df = df.withColumn(f'Opt_Marks_{test_name}',
                       f.element_at(f.col('Opt_Marks'), test_name))
    df = df.withColumn(f'TotalMarks_{test_name}',
                       f.element_at(f.col('TotalMarks'), test_name))
    return df
# Sample long-format data: one row per (student, test) attempt.
df = spark.createDataFrame([
    (5014970, 'IMO', 68, 100),
    (5014970, 'SMO', 14, 50),
    (5014974, 'NSO', 12, 35),
    (5014974, 'IMO', 59, 100)
], ['student_no', 'Test', 'Opt_Marks', 'TotalMarks'])

# Each distinct test name drives one trio of output columns.
list_of_tests = [row.Test for row in df.select('Test').distinct().collect()]

# Collapse to one row per student; keep per-test marks inside maps keyed
# by test name so each value can be looked up per output column later.
aggregated = df.groupBy('student_no').agg(
    f.collect_list(f.col('Test')).alias('Test'),
    f.map_from_arrays(
        f.collect_list(f.col('Test')),
        f.collect_list(f.col('Opt_Marks')),
    ).alias('Opt_Marks'),
    f.map_from_arrays(
        f.collect_list(f.col('Test')),
        f.collect_list(f.col('TotalMarks')),
    ).alias('TotalMarks'),
)
df = aggregated

# Add the Yes/No flag plus the two marks columns for every test.
for test_name in list_of_tests:
    df = test_columns_adder(df, test_name)

# The intermediate array/map columns are no longer needed.
output = df.drop('Test', 'Opt_Marks', 'TotalMarks')
output.show()
输出:
+----------+---+-------------+--------------+---+-------------+--------------+---+-------------+--------------+
|student_no|IMO|Opt_Marks_IMO|TotalMarks_IMO|SMO|Opt_Marks_SMO|TotalMarks_SMO|NSO|Opt_Marks_NSO|TotalMarks_NSO|
+----------+---+-------------+--------------+---+-------------+--------------+---+-------------+--------------+
| 5014970|Yes| 68| 100|Yes| 14| 50| No| NULL| NULL|
| 5014974|Yes| 59| 100| No| NULL| NULL|Yes| 12| 35|
+----------+---+-------------+--------------+---+-------------+--------------+---+-------------+--------------+