I am trying to normalize some text, but the code removes the spaces between the words. When I run this code:
import re
import numpy as np
import pandas as pd
from pandas import DataFrame
import nltk

nltk.download('stopwords')
nltk.download('punkt')
corpus =['The sky is blue and beautiful.','Love this blue and beautiful sky!','The quick brown fox jumps over the lazy dog.']
labels=['weather','weather','animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df = corpus_df[["Document","Category"]]
print(corpus_df)
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words("english")
def normalize_document(doc):
    # lowercase and remove special characters/whitespace
    doc = re.sub(r'[^a-zA-Z]', ' ', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenise document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of the document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # recreate document from filtered tokens
    doc = ''.join(filtered_tokens)
    return doc
normalize_corpus=np.vectorize(normalize_document)
norm_corpus=normalize_corpus(corpus_df)
I am getting
norm_corpus= array([['skybluebeautiful', 'weather'],
['lovebluebeautifulsky', 'weather'],
['quickbrownfoxjumpslazydog', 'animals']], dtype='<U25')
instead of an array with spaces between the words:
norm_corpus= array([['sky blue beautiful', 'weather'],
['love blue beautiful sky', 'weather'],
['quick brown fox jumps lazy dog', 'animals']], dtype='<U25')
Thanks for your help!
You can keep the spaces by using doc = ' '.join(filtered_tokens). str.join uses the string it is called on as the separator between the tokens, and with ''.join you have not defined any separator, so the tokens are concatenated with nothing between them.
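For reference, here is a minimal sketch of the corrected normalisation (the same code as in the question, with only the join separator changed):

import re
import numpy as np
import pandas as pd
import nltk

nltk.download('stopwords')
nltk.download('punkt')

corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.']
labels = ['weather', 'weather', 'animals']
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})[['Document', 'Category']]

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words("english")

def normalize_document(doc):
    # lowercase and replace non-letter characters with spaces
    doc = re.sub(r'[^a-zA-Z]', ' ', doc).lower().strip()
    # tokenise, drop stopwords, and rejoin with a single space as the separator
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(corpus_df)
print(norm_corpus)

With the space as the separator, the first document comes back as 'sky blue beautiful' instead of 'skybluebeautiful', matching the expected output shown in the question.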