import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import wordcloud
from wordcloud import WordCloud,STOPWORDS
# Read the whole text and normalise its whitespace.
# str.split() returns a list of tokens, but WordCloud.generate() runs a regex
# over its argument and therefore needs ONE string; passing the list raises
# "TypeError: expected string or bytes-like object". The tokens are joined
# back together with single spaces before being handed to WordCloud.
with open(r'C:\Users\marmar\Remarks.txt') as remarks_file:
    remarks = ' '.join(remarks_file.read().split())
# Load the image that is used as the mask the words are drawn over.
mask = np.array(Image.open(r'C:\users\marmar\Documents\cloud.png'))
# Start from the stopword list shipped with wordcloud.
stopwords = set(STOPWORDS)
# Extend it with the whitespace-separated words found in comments.txt.
with open(r'C:\Users\marmar\Documents\comments.txt') as extra_words_file:
    new_words = extra_words_file.read().split()
new_stopwords = stopwords.union(new_words)
# Configure the word cloud; the call is kept inside one parenthesised
# expression so the assignment is a single valid statement.
wc = WordCloud(
    background_color="white",
    max_words=2000,
    mask=mask,
    min_font_size=15,
    max_font_size=40,
    relative_scaling=0.5,
    stopwords=new_stopwords,
    normalize_plurals=True,
)
wc.generate(remarks)
plt.figure(figsize=(25, 25))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
# Show the word cloud.
plt.show()
因此,如果我在读取备注文本文件的那一行中删除 `.read()` `.split()`,它实际上可以运行并返回词云。但是,我希望能够对单词进行分词,让词云真正准确(目前它没有把单词分开显示)。然而,每次我这样做时,都会收到下面的错误:
TypeError Traceback (most recent call last)
<ipython-input-7-76f0df420fc2> in <module>()
19 wc = WordCloud(background_color="white", max_words=2000,
mask=mask,min_font_size =15, max_font_size=40, relative_scaling = 0.5,
stopwords=new_stopwords,
20 normalize_plurals= True)
---> 21 wc.generate(remarks)
22 plt.figure(figsize=(25,25))
23 plt.imshow(wc, interpolation="bilinear")
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\wordcloud\wordcloud.py in generate(self, text)
563 self
564 """
--> 565 return self.generate_from_text(text)
566
567 def _check_generated(self):
~\AppData\Local\Continuum\anaconda3\lib\site-
packages\wordcloud\wordcloud.py in generate_from_text(self, text)
544 self
545 """
--> 546 words = self.process_text(text)
547 self.generate_from_frequencies(words)
548 return self
~\AppData\Local\Continuum\anaconda3\lib\site-packages\wordcloud\wordcloud.py
in process_text(self, text)
511 regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
512
--> 513 words = re.findall(regexp, text, flags)
514 # remove stopwords
515 words = [word for word in words if word.lower() not in
stopwords]
~\AppData\Local\Continuum\anaconda3\lib\re.py in findall(pattern, string,
flags)
220
221 Empty matches are included in the result."""
--> 222 return _compile(pattern, flags).findall(string)
223
224 def finditer(pattern, string, flags=0):
TypeError: expected string or bytes-like object
我试图做的实际上是从我的备注文本文件中删除所有标点符号,并将文本文件转换为 unicode。评论文本文件工作正常,不知道为什么我的备注文件不能。
谢谢!
`str.split` 的输出是一个 `list`。回溯中的默认正则表达式(`r"\w[\w']+"`)表明 wordcloud 需要的是一个由单词组成的字符串,因此在传递给 wordcloud 之前,必须先把列表重新拼接成字符串。
这应该有效:
...
# Read the whole text into a list of strings.
# (Presumably to remove end-of-line characters.)
# The context manager closes the file as soon as it has been read.
with open(r'C:\Users\marmar\Remarks.txt') as remarks_file:
    remarks = remarks_file.read().split()
# Convert the list back into a single string, because WordCloud.generate()
# expects a string rather than a list.
remarks = ' '.join(remarks)
...