我有一个使用 pyspark 的 printSchema() 函数生成的 JSON 模式,我想根据此 JSON 创建 Hive DDL。
我确实进行了谷歌搜索,但没有找到解决方案。有人有主意吗?
谢谢,
百合
下面是一个 Spark Python 示例函数,可以从 DataFrame 模式生成用于创建 Hive 表的 DDL。请根据需要相应调整。
def sparkDataFrameCreateTable(df, T = ''):
    """Build and execute a Hive ``CREATE TABLE`` DDL statement derived
    from a Spark DataFrame's schema, stored as Parquet.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame whose column names and types define the table schema
        (read via ``df.dtypes``, a list of ``(name, type)`` pairs).
    T : str
        Fully qualified target table name, e.g. ``"db.table"``.
        NOTE(review): the default ``''`` would generate invalid SQL --
        callers should always supply a table name.

    Returns
    -------
    The DataFrame returned by ``spark.sql()`` for the executed DDL.
    """
    # Join "name type" pairs in one pass -- replaces the original
    # manual count/trailing-comma bookkeeping and drops the unused
    # `cols`/`kv` duplicates of df.dtypes.
    columns = ", ".join("{} {}".format(name, dtype) for name, dtype in df.dtypes)
    ddl = "CREATE TABLE IF NOT EXISTS {} ({}) STORED AS PARQUET".format(T, columns)
    # Keep the original's echo of the generated statement.
    print(ddl)
    # `spark` must be an active SparkSession in the enclosing scope.
    return spark.sql(ddl)
df = spark.range(10)  # sample DataFrame with a single bigint column `id` (values 0-9)
spark.sql("create database if not exists junk")  # scratch database for the demo
spark.sql("show databases").show()
sparkDataFrameCreateTable(df, "junk.test")  # generate and run the CREATE TABLE DDL
spark.sql("use junk")
spark.sql("show tables").show()  # verify that junk.test now exists
+------------+
|databaseName|
+------------+
| default|
| junk|
+------------+
1 1 ('id', 'bigint')
CREATE TABLE IF NOT EXISTS junk.test (id bigint) STORED AS PARQUET
+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| junk| test| false|
+--------+---------+-----------+