我试图在 PySpark 的 Pandas UDF 中对一个类实例进行 pickle 和 unpickle(序列化与反序列化)。在 UDF 之外,pickle 工作正常:
class ExampleModel:
    """Minimal placeholder model used to demonstrate the pickling round trip."""
    pass


# ExampleModel defines no __init__, so it is constructed without arguments.
clf = ExampleModel()
# Pickle the instance and base64-encode the payload.
pickled_val = base64.b64encode(pickle.dumps(clf))
# Outside of a UDF, decoding and unpickling the payload succeeds.
clf2 = pickle.loads(base64.b64decode(pickled_val))
print(clf2)
# <__main__.ExampleModel instance at 0x7f04d7444780>
然而,在 Pandas UDF 内部,我可以访问 ExampleModel 类,但无法对字符串列中的值进行 unpickle(反序列化):
# One-row DataFrame: an id, the base64-encoded pickle payload, and an
# initially empty column that the UDF fills with any error message.
df = spark_session.createDataFrame(
    [(1, pickled_val, '')],
    ['id', 'txt', 'error'],
)
@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)
def example_unpickle(pdf):
    """Try to unpickle the model stored in the first row of a group.

    Args:
        pdf: pandas DataFrame for one group; the second column (position 1)
            of the first row is read as the base64-encoded pickle payload.

    Returns:
        The same DataFrame. If a step fails, the ``error`` column of every
        row is set to the exception message, prefixed with ``"1:"``
        (instantiating ``ExampleModel``) or ``"2: "`` (decoding/unpickling).
    """
    # Step 1: check that the class itself can be instantiated inside the UDF.
    try:
        clf_obj = ExampleModel()
    except Exception as e:
        pdf.loc[:, 'error'] = "1:" + str(e)
        return pdf
    # Step 2: decode and unpickle the payload; a failure here is reported in
    # the output rather than raised, so it shows up in the result DataFrame.
    try:
        clf3 = pickle.loads(base64.b64decode(pdf.iloc[0, 1]))
    except Exception as e:
        pdf.loc[:, 'error'] = "2: " + str(e)
    return pdf
# Apply the UDF once per 'id' group and print the full, untruncated result.
df_clf = (
    df.groupby('id')
    .apply(example_unpickle)
)
df_clf.show(truncate=False)
运行后给出如下错误:
AttributeError: 'module' object has no attribute 'ExampleModel'
+---+------------------------------------------------+--------------------------------------------------+
|id |txt |error |
+---+------------------------------------------------+--------------------------------------------------+
|1 |KGlfX21haW5fXwpFeGFtcGxlTW9kZWwKcDAKKGRwMQpiLg==|2: 'module' object has no attribute 'ExampleModel'|
+---+------------------------------------------------+--------------------------------------------------+
解决的办法是把这个类放到一个单独的文件中,并在同一目录下创建一个名为 `__init__.py` 的文件。
然后按如下方式导入该类:
from ExampleFileName import ExampleModel