我是 Spark 新手。当我尝试通过我的 Hadoop 主节点上的 Jupyter 笔记本运行 PySpark 时,出现此错误。
使用 Apache Spark = 3.4.0,Python 3.11
请检查下面给出的代码
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Build (or reuse) a Hive-enabled SparkSession attached to the standalone master.
# NOTE(review): the {IP}/{PORT}/{SCHEMA}/{USER}/{PWD} placeholders are literal
# text — these are not f-strings, so they must be filled in before running.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .config("spark.sql.hive.hiveserver2.jdbc.url", "jdbc:hive2://{IP}:{PORT}/{SCHEMA};user={USER};password={PWD}")
    .config("spark.master", "spark://{IP}:{PORT}")
    .enableHiveSupport()
    .getOrCreate()
)

# List the tables in the current Hive database to verify the connection works.
spark.sql("SHOW TABLES").show()
这是我得到的错误。
Py4JError Traceback (most recent call last)
Cell In[3], line 1
----> 1 spark.sql("SHOW TABLES").show()
File ~/anaconda3/lib/python3.11/site-packages/pyspark/sql/session.py:1631, in SparkSession.sql(self, sqlQuery, args, **kwargs)
1627 assert self._jvm is not None
1628 litArgs = self._jvm.PythonUtils.toArray(
1629 [_to_java_column(lit(v)) for v in (args or [])]
1630 )
-> 1631 return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
1632 finally:
1633 if len(kwargs) > 0:
File ~/anaconda3/lib/python3.11/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File ~/anaconda3/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py:179, in capture_sql_exception.<locals>.deco(*a, **kw)
177 def deco(*a: Any, **kw: Any) -> Any:
178 try:
--> 179 return f(*a, **kw)
180 except Py4JJavaError as e:
181 converted = convert_exception(e.java_exception)
File ~/anaconda3/lib/python3.11/site-packages/py4j/protocol.py:330, in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
--> 330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
333 else:
334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o40.sql. Trace:
py4j.Py4JException: Method sql([class java.lang.String, class [Ljava.lang.Object;]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:321)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:329)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
我能够使用另一个虚拟机连接 Hive 表,但需要使用主节点
我的 PySpark 和集群上 Spark 的版本之间存在差异。我不得不将我的 PySpark 降级到与集群 Spark 相同的版本,问题就解决了。