从Spark的文档中,我知道可以从libsvm格式的文件中加载数据集。
但是,我想在远程spark集群中运行代码,所以我将虹膜数据集硬编码到我的代码中,并且我想直接从此String对象加载。
但是,当查看DataFrameReader对象时,我发现没有API支持从String
直接加载数据集。
我尝试过这种方式-
// Hard-coded iris sample as CSV text (header + 4 rows).
// Note: the literal deliberately contains no leading blank line and no
// trailing newline, so irisData.split("\n") yields exactly the 5 CSV
// lines — the original version produced an empty first element and
// relied on Spark's CSV parser silently skipping blank lines.
val irisData =
  """|"sepal_length","sepal_width","petal_length","petal_width","label"
     |5.1,3.5,1.4,0.2,Iris-setosa
     |4.9,3.0,1.4,0.2,Iris-setosa
     |4.7,3.2,1.3,0.2,Iris-setosa
     |4.6,3.1,1.5,0.2,Iris-setosa""".stripMargin
println(irisData)
"sepal_length","sepal_width","petal_length","petal_width","label"
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
// Expose each CSV line as one element of a Dataset[String] so the
// DataFrameReader can consume the in-memory text like a file source.
val csvLines: Seq[String] = irisData.split("\n").toSeq
val stringDS = spark.createDataset(csvLines)(Encoders.STRING)

// header=true takes the first line as column names;
// inferSchema=true samples the values to choose column types.
val irisDatasetDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv(stringDS)

irisDatasetDF.show(false)
+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|label |
+------------+-----------+------------+-----------+-----------+
|5.1 |3.5 |1.4 |0.2 |Iris-setosa|
|4.9 |3.0 |1.4 |0.2 |Iris-setosa|
|4.7 |3.2 |1.3 |0.2 |Iris-setosa|
|4.6 |3.1 |1.5 |0.2 |Iris-setosa|
+------------+-----------+------------+-----------+-----------+