“BrokenProcessPool:任务未能反序列化。”运行 pyLDAvis.gensim 时出错

问题描述 投票:0回答:1

我正在尝试使用 pyLDAvis.gensim 可视化 LDA 的结果。但是,我在运行 pyLDAvis.gensim 包时遇到了错误。当我执行代码时,我收到了“BrokenProcessPool:任务未能反序列化”的错误信息。

我正在使用 Windows 11 并在 Google Colab 上运行代码。

这是我的代码片段:

import pyLDAvis
import pyLDAvis.gensim

model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamodel)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
vis

我收到的报错信息如下:

/usr/local/lib/python3.9/dist-packages/pandas/core/internals/blocks.py:351: RuntimeWarning: divide by zero encountered in log
  def _split_op_result(self, result: ArrayLike) -> list[Block]:
/usr/local/lib/python3.9/dist-packages/pandas/core/internals/blocks.py:351: RuntimeWarning: divide by zero encountered in log
  def _split_op_result(self, result: ArrayLike) -> list[Block]:
ERROR:concurrent.futures:exception calling callback for <Future at 0x7f1102c265b0 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/_base.py", line 26, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 385, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 834, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.9/dist-packages/joblib/parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py", line 556, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/reusable_executor.py", line 176, in submit
    return super().submit(fn, *args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/process_executor.py", line 1129, in submit
    raise self._flags.broken
joblib.externals.loky.process_executor.BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'
"""

The above exception was the direct cause of the following exception:

BrokenProcessPool                         Traceback (most recent call last)
<ipython-input-62-f1b7220187d9> in <cell line: 6>()
      4 model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamodel)
      5 pyLDAvis.enable_notebook()
----> 6 vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
      7 vis

15 frames
/usr/local/lib/python3.9/dist-packages/pyLDAvis/gensim.py in prepare(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)
    121     """
    122     opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
--> 123     return vis_prepare(**opts)

/usr/local/lib/python3.9/dist-packages/pyLDAvis/_prepare.py in prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, R, lambda_step, mds, n_jobs, plot_opts, sort_topics, start_index)
    430     term_frequency = np.sum(term_topic_freq, axis=0)
    431 
--> 432     topic_info = _topic_info(topic_term_dists, topic_proportion,
    433                              term_frequency, term_topic_freq, vocab, lambda_step, R,
    434                              n_jobs, start_index)

/usr/local/lib/python3.9/dist-packages/pyLDAvis/_prepare.py in _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs, start_index)
    271         ])
    272 
--> 273     top_terms = pd.concat(Parallel(n_jobs=n_jobs)
    274                           (delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls)
    275                           for ls in _job_chunks(lambda_seq, n_jobs)))

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1096 
   1097             with self._backend.retrieval_context():
-> 1098                 self.retrieve()
   1099             # Make sure that we get a last message telling us we are done
   1100             elapsed_time = time.time() - self._start_time

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in retrieve(self)
    973             try:
    974                 if getattr(self._backend, 'supports_timeout', False):
--> 975                     self._output.extend(job.get(timeout=self.timeout))
    976                 else:
    977                     self._output.extend(job.get())

/usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    565         AsyncResults.get from multiprocessing."""
    566         try:
--> 567             return future.result(timeout=timeout)
    568         except CfTimeoutError as e:
    569             raise TimeoutError from e

/usr/lib/python3.9/concurrent/futures/_base.py in result(self, timeout)
    444                     raise CancelledError()
    445                 elif self._state == FINISHED:
--> 446                     return self.__get_result()
    447                 else:
    448                     raise TimeoutError()

/usr/lib/python3.9/concurrent/futures/_base.py in __get_result(self)
    389         if self._exception:
    390             try:
--> 391                 raise self._exception
    392             finally:
    393                 # Break a reference cycle with the exception in self._exception

/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/_base.py in _invoke_callbacks(self)
     24         for callback in self._done_callbacks:
     25             try:
---> 26                 callback(self)
     27             except BaseException:
     28                 LOGGER.exception(f'exception calling callback for {self!r}')

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in __call__(self, out)
    383         with self.parallel._lock:
    384             if self.parallel._original_iterator is not None:
--> 385                 self.parallel.dispatch_next()
    386 
    387 

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in dispatch_next(self)
    832 
    833         """
--> 834         if not self.dispatch_one_batch(self._original_iterator):
    835             self._iterating = False
    836             self._original_iterator = None

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    899                 return False
    900             else:
--> 901                 self._dispatch(tasks)
    902                 return True
    903 

/usr/local/lib/python3.9/dist-packages/joblib/parallel.py in _dispatch(self, batch)
    817         with self._lock:
    818             job_idx = len(self._jobs)
--> 819             job = self._backend.apply_async(batch, callback=cb)
    820             # A job can complete so quickly than its callback is
    821             # called before we get here, causing self._jobs to

/usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    554     def apply_async(self, func, callback=None):
    555         """Schedule a func to be run"""
--> 556         future = self._workers.submit(SafeFunction(func))
    557         future.get = functools.partial(self.wrap_future_result, future)
    558         if callback is not None:

/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/reusable_executor.py in submit(self, fn, *args, **kwargs)
    174     def submit(self, fn, *args, **kwargs):
    175         with self._submit_resize_lock:
--> 176             return super().submit(fn, *args, **kwargs)
    177 
    178     def _resize(self, max_workers):

/usr/local/lib/python3.9/dist-packages/joblib/externals/loky/process_executor.py in submit(self, fn, *args, **kwargs)
   1127         with self._flags.shutdown_lock:
   1128             if self._flags.broken is not None:
-> 1129                 raise self._flags.broken
   1130             if self._flags.shutdown:
   1131                 raise ShutdownExecutorError(

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

我不确定为什么会出现此错误以及如何解决它。谁能提供解决此问题的指导或建议?

谢谢。

python google-colaboratory pickle lda pyldavis
1个回答
0
投票

我注意到 Google Colab 只支持到 pandas 1.5.3,而当你安装 pyLDAvis 时,它会安装一个不同的 pandas 版本(参见:On installing pyLDAvis in Google Colab)。在安装 pyLDAvis 之后将 pandas 版本恢复到 1.5.3(例如运行 `!pip install pandas==1.5.3`),为我解决了这个问题(参见:Install pandas 1.5.3 after pyLDAvis installation)。

© www.soinside.com 2019 - 2024. All rights reserved.