IllegalArgumentException:'字段“标签”不存在。在PYSPARK

问题描述 投票:0回答:1

我一直在 pyspark 中开发线性回归函数,并使用交叉验证来验证准确性。但它会抛出错误“llegalArgumentException:”字段“标签”不存在。”但我已将响应变量分配给它。在谷歌中看到了一些例子,并观察到它对其他人有效,而不是对我有效,除非我做错了什么。

Herewith, I have given the code and it would of great help if u can point out my mistake .Thanks in Advance 
# Linear regression - SPARK

# Libraries to be loaded
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


def spark_linear_regression(dsname, target, ratio):
    spark = SparkSession.builder.appName('ml-bank').getOrCreate()
    data = spark.read.csv(dsname, header=True, inferSchema=True)
    data = data.dropna()
    print("Completed : 10 % ")

    cat_col = [t[0] for t in data.dtypes if t[1] == 'string']
    indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(data) for column in
                cat_col]
    index_colname = [t[0] + "_index" for t in data.dtypes if t[1] == 'string']
    selected_column = list(set(data.columns) - set(cat_col))
    selected_column.extend(index_colname)

    pipeline = Pipeline(stages=indexers)
    df_r = pipeline.fit(data).transform(data)
    df_r = df_r.select(selected_column)

    print("Completed : 30 %")

    selected_column.remove(target)
    assembler = VectorAssembler(inputCols=selected_column, outputCol="features")
    final_data = assembler.transform(df_r)
    final_data = final_data.select(['features', target])

    train_ratio = ratio / 100
    train_ratio = round(train_ratio, 2)
    val_ratio = round(1 - train_ratio, 2)

    train_data, test_data = final_data.randomSplit([train_ratio, val_ratio], seed=500)
    print("Completed : 50 %")
    linear_regression = LinearRegression(featuresCol='features', labelCol=target, maxIter=10)
    param_grid = ParamGridBuilder() \
        .addGrid(linear_regression.regParam, [0.1, 0.01]) \
        .addGrid(linear_regression.fitIntercept, [False, True]) \
        .addGrid(linear_regression.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()

    linear_regression = CrossValidator(estimator=linear_regression,
                                       estimatorParamMaps=param_grid,
                                       evaluator=RegressionEvaluator(),
                                       numFolds=5)

    print("Completed : 60 %")
    linear_regression_model = linear_regression.fit(train_data)
    linear_regression_summary = linear_regression_model.summary

    print("Completed : 80 %")
    print("Training RMSE: %f" % linear_regression_summary.rootMeanSquaredError)
    print("Training R^2: %f" % linear_regression_summary.r2)

    linear_pred = linear_regression_model.transform(test_data)
    linear_pred.select("prediction", target, "features").show(5)
    linear_evaluator = RegressionEvaluator(predictionCol="prediction",
                                           labelCol=target, metricName="r2")
    print("Completed : 95 %")
    print("R Squared (R2) on test data = %g" % linear_evaluator.evaluate(linear_pred))


spark_linear_regression(Mart_Sales.csv", "Item_MRP",
                        80)
pyspark linear-regression apache-spark-mllib
1个回答
0
投票

添加

RegressionEvaluator(metricName="rmse", labelCol = 'resp')

给您的交叉评估员。

© www.soinside.com 2019 - 2024. All rights reserved.