我有一个数据框。如果任何列具有空值,我想引发异常。我可以通过硬编码列名并逐列检查条件来做到这一点。我怎样才能让它变成动态的,同时保持良好的性能?
有多种方法可以完成此任务,以下是使用
RAISE_ERROR
函数的一种解决方案。
df.show(false)
+---+----+
|id |name|
+---+----+
|1 |A |
|2 |B |
|3 |NULL|
+---+----+
// One expression per column: fail fast via RAISE_ERROR the moment a NULL
// is evaluated, otherwise pass the column value through unchanged.
val checkNullExprs = df
  .columns
  .map { c =>
    // Backtick-quote the identifier so column names containing spaces,
    // dots or reserved words still parse inside the SQL expression.
    expr(s"IF(`${c}` IS NULL, RAISE_ERROR('[ ${c} IS NULL ]'), `${c}`)")
      .as(c)
  }
// Trigger evaluation: the first NULL encountered raises a RuntimeException.
val validated = df.select(checkNullExprs: _*)
validated.show(false)
java.lang.RuntimeException: [ name IS NULL ]
at org.apache.spark.sql.catalyst.expressions.RaiseError.eval(misc.scala:90)
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:543)
at org.apache.spark.sql.catalyst.expressions.If.eval(conditionalExpressions.scala:90)
at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:158)
at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:89)
at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$47.$anonfun$applyOrElse$82(Optimizer.scala:2149)
at scala.collection.immutable.List.map(List.scala:297)
at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$47.applyOrElse(Optimizer.scala:2149)
at org.a
或
df.show(false)
+----+----+
|id |name|
+----+----+
|1 |A |
|NULL|B |
|3 |NULL|
+----+----+
// Single aggregate pass over the whole frame: COUNT(col) skips NULLs while
// COUNT(*) counts every row, so a mismatch means the column contains NULLs.
// Per-column findings are filtered and joined into one "[ ... ]" message.
val messageExpr = concat(
  lit("[ "),
  concat_ws(", ", filter(array(df.columns.map(c =>
    // Backtick-quote the identifier so special characters in column names parse.
    expr(s"""IF(COUNT(`${c}`) != COUNT(*), "`${c}` has nulls", NULL)""").as(c)
  ):_*), f => f.isNotNull)),
  lit(" ]")
).as("message")
df
  .select(messageExpr)
  // When no column has nulls, concat_ws over the empty filtered array yields
  // "", so message collapses to "[  ]" (length 4). The guard must therefore
  // be > 4 — the original > 2 raised RAISE_ERROR even on clean data.
  .withColumn("message", expr("IF(LENGTH(message) > 4, RAISE_ERROR(message), CAST(NULL AS STRING))"))
  .select("message")
  .show(false)
java.lang.RuntimeException: [ `id` has nulls, `name` has nulls ]
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.project_doConsume_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)