我有一个在 AWS EMR(emr-5.31.0) 中运行的 Spark(2.4.6) Scala 作业随机失败并出现错误
org.xml.sax.SAXParseException; Premature end of file
。该作业始终覆盖 S3 中的镶木地板文件,并且大部分成功,但偶尔我们会看到这些错误。在 Spark UI 中,一切看起来都很好,而且我没有看到任何其他错误,所以我不确定如何找到问题的根源。还有其他人遇到过这个问题吗?
这里是我们经常看到的stack trace:
2023-04-19 01:04:50 ERROR FileFormatWriter:91 - Aborting job.
java.io.IOException: Failed publishing one or more staging directories
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.lambda$publishOrDelete$0(ExternalStagedFileCommitter.java:69)
at com.amazon.ws.emr.hadoop.fs.util.ExceptionCollector.throwIfNotEmpty(ExceptionCollector.java:89)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.publishOrDelete(ExternalStagedFileCommitter.java:68)
at com.amazon.ws.emr.hadoop.fs.staging.DefaultStagingMechanism.publishOrDeleteExternalStagingDirectories(DefaultStagingMechanism.java:106)
at org.apache.spark.internal.io.StagingServiceOptimizedCommitProtocol.commitJob(StagingServiceOptimizedCommitProtocol.scala:172)
at org.apache.spark.internal.io.CompositeCommitProtocol.commitJob(CompositeCommitProtocol.scala:108)
at org.apache.spark.sql.execution.datasources.SQLEmrOptimizedCommitProtocol.commitJob(SQLEmrOptimizedCommitProtocol.scala:121)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:187)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:173)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:173)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:169)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:197)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:194)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:169)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:114)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:112)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:677)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:677)
at org.apache.spark.sql.execution.SQLExecution$.org$apache$spark$sql$execution$SQLExecution$$executeQuery$1(SQLExecution.scala:83)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1$$anonfun$apply$1.apply(SQLExecution.scala:94)
at org.apache.spark.sql.execution.QueryExecutionMetrics$.withMetrics(QueryExecutionMetrics.scala:141)
at org.apache.spark.sql.execution.SQLExecution$.org$apache$spark$sql$execution$SQLExecution$$withMetrics(SQLExecution.scala:178)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:93)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:200)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:92)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:677)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:286)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:272)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:230)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:567)
at ...write.parquet...
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:685)
Caused by: com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.SdkClientException: Failed to parse XML document with handler class com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser$DeleteObjectsHandler
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.parseXmlInputStream(XmlResponsesSaxParser.java:166)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.parseDeletedObjectsResult(XmlResponsesSaxParser.java:472)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.transform.Unmarshallers$DeleteObjectsResultUnmarshaller.unmarshall(Unmarshallers.java:340)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.transform.Unmarshallers$DeleteObjectsResultUnmarshaller.unmarshall(Unmarshallers.java:336)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.internal.S3XmlResponseHandler.handle(S3XmlResponseHandler.java:62)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.internal.ResponseHeaderHandlerChain.handle(ResponseHeaderHandlerChain.java:44)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.internal.ResponseHeaderHandlerChain.handle(ResponseHeaderHandlerChain.java:30)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.response.AwsResponseHandlerAdapter.handle(AwsResponseHandlerAdapter.java:69)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleResponse(AmazonHttpClient.java:1726)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleSuccessResponse(AmazonHttpClient.java:1446)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1368)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1145)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:802)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:770)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:744)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:704)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:686)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:550)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:530)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5140)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5086)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:2277)
at com.amazon.ws.emr.hadoop.fs.s3.lite.call.DeleteObjectsCall.perform(DeleteObjectsCall.java:24)
at com.amazon.ws.emr.hadoop.fs.s3.lite.call.DeleteObjectsCall.perform(DeleteObjectsCall.java:10)
at com.amazon.ws.emr.hadoop.fs.s3.lite.executor.GlobalS3Executor.execute(GlobalS3Executor.java:114)
at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.invoke(AmazonS3LiteClient.java:191)
at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.invoke(AmazonS3LiteClient.java:186)
at com.amazon.ws.emr.hadoop.fs.s3.lite.AmazonS3LiteClient.deleteObjects(AmazonS3LiteClient.java:128)
at com.amazon.ws.emr.hadoop.fs.s3n.Jets3tNativeFileSystemStore.deleteAll(Jets3tNativeFileSystemStore.java:350)
at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.doSingleThreadedBatchDelete(S3NativeFileSystem.java:1042)
at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.delete(S3NativeFileSystem.java:343)
at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.lambda$newFileCreationSubsystem$2(S3NativeFileSystem.java:217)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.deleteDir(ExternalStagedFileCommitter.java:341)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.access$000(ExternalStagedFileCommitter.java:38)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter$1.lambda$newDirectoryTask$2(ExternalStagedFileCommitter.java:290)
at com.amazon.ws.emr.hadoop.fs.staging.Task$1.run(Task.java:37)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagingTaskCoordinator$TaskWithCallback.run(ExternalStagingTaskCoordinator.java:152)
at com.amazon.ws.emr.hadoop.fs.staging.StagedFilesExecutor.lambda$submitOrRun$0(StagedFilesExecutor.java:96)
at com.amazon.ws.emr.hadoop.fs.staging.StagedFilesExecutor.submitOrRun(StagedFilesExecutor.java:104)
at com.amazon.ws.emr.hadoop.fs.staging.StagedFilesExecutor.submitOrRunFirstBatch(StagedFilesExecutor.java:82)
at com.amazon.ws.emr.hadoop.fs.staging.StagedFilesExecutor.run(StagedFilesExecutor.java:54)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.parallelExecute(ExternalStagedFileCommitter.java:266)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.parallelPublishOrDelete(ExternalStagedFileCommitter.java:247)
at com.amazon.ws.emr.hadoop.fs.staging.ExternalStagedFileCommitter.publishOrDelete(ExternalStagedFileCommitter.java:63)
... 40 more
Caused by: org.xml.sax.SAXParseException; Premature end of file.
at org.apache.xerces.util.ErrorHandlerWrapper.createSAXParseException(Unknown Source)
at org.apache.xerces.util.ErrorHandlerWrapper.fatalError(Unknown Source)
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
at org.apache.xerces.impl.XMLScanner.reportFatalError(Unknown Source)
at org.apache.xerces.impl.XMLDocumentScannerImpl$PrologDispatcher.dispatch(Unknown Source)
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
at com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.parseXmlInputStream(XmlResponsesSaxParser.java:152)
... 83 more