import re

import nltk
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_md")
class fileread:
    """Read a text file and hand back its contents as a single string.

    Args:
        file_path: Path of the file to read. Defaults to ``DEFAULT_PATH``,
            the location this class previously had hard-coded, so
            ``fileread()`` keeps working unchanged.
    """

    DEFAULT_PATH = 'C:\\Users\\Documents\\Emails\\DEP72303-SYSOUT.txt'

    def __init__(self, file_path=DEFAULT_PATH):
        self.file_path = file_path

    def readfile(self):
        """Return the entire contents of ``self.file_path``.

        Returns:
            The file's text as one string.

        Raises:
            OSError: If the file cannot be opened (for example, it does
                not exist).
        """
        # Text mode with the platform's default encoding, exactly as before.
        with open(self.file_path, 'r') as text:
            return text.read()
class preprocess:
    """Clean, filter and lemmatize the text supplied by a file reader.

    The reader is stored in ``fi``; any object exposing a ``readfile()``
    method that returns a string can be assigned in its place.
    """

    # Class-level reader shared by all instances unless overridden.
    fi = fileread()

    def remove(self):
        """Return the reader's text with special characters stripped.

        Every character that is not an ASCII letter, a digit or whitespace
        is deleted; whitespace (including newlines) is kept.

        Returns:
            The cleaned text as a string.
        """
        # The earlier pattern '^[\sA-Za-z0-9]' was anchored to the start of
        # the string and its character class was not negated, so it deleted
        # only the very first character (turning "Sample" into "ample").
        # The negation belongs inside the brackets and no anchor is wanted.
        return re.sub(r'[^A-Za-z0-9\s]', '', self.fi.readfile())

    def preprocess(self):
        """Return the lemmas of all non-stop-word, non-punctuation tokens.

        The cleaned text from ``remove()`` is run through the module-level
        ``nlp`` pipeline, and the surviving tokens' lemmas are joined with
        single spaces.

        Returns:
            A space-separated string of lemmas.
        """
        doc = nlp(self.remove())
        return " ".join(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct)
        )
if __name__ == "__main__":
    # Run the pipeline only when this file is executed as a script.
    # Importing the module (for instance from a test) must not read the
    # input file or print anything.
    pre = preprocess()
    output = pre.preprocess()
    # Break the processed text apart on newline characters before printing.
    tokens = output.split("\n")
    print(tokens)
这段代码读取一个包含原始数据的文件,并使用正则表达式进行预处理,删除停用词和标点符号,最后完成词形还原。
import unittest
import os # Make sure this line is present
from preprocessing import preprocess
class MockFileRead:
    """Stand-in file reader that serves the temporary test file."""

    def readfile(self):
        """Return the full contents of ``test_input.txt``.

        The file is looked up relative to the current working directory.
        """
        with open('test_input.txt', 'r') as handle:
            contents = handle.read()
        return contents
class TestPreprocess(unittest.TestCase):
    """Exercise ``preprocess.preprocess()`` against a small known input."""

    def test_preprocess(self):
        """Stop words and punctuation are dropped; other words are lemmatized."""
        sample_input = "Sample text content for testing purposes."

        # MockFileRead reads this exact file name from the working directory.
        with open('test_input.txt', 'w') as temp_file:
            temp_file.write(sample_input)
        # Register the cleanup immediately so the file is removed even when
        # the assertion below fails; a trailing os.remove() would be skipped.
        self.addCleanup(os.remove, 'test_input.txt')

        # Swap the real reader for the mock on this instance only.
        preprocess_instance = preprocess()
        preprocess_instance.fi = MockFileRead()

        expected_output = "sample text content test purpose"
        self.assertEqual(preprocess_instance.preprocess(), expected_output)
if __name__ == '__main__':
    # Run the test suite when this file is executed directly. The previous
    # comparison against 'main' was never true, so no tests ever ran.
    unittest.main()
在上面的程序中,我期望的输出是:"sample text content test purpose",但实际得到的输出是:"ample text content test purpose"(开头的字母 S 被删掉了)。
我该如何修改代码才能得到正确的输出?
当前的正则表达式 ^[\sA-Za-z0-9] 只会删除文本最开头的一个字母、数字或空白字符(未指定 re.MULTILINE 时,^ 只匹配整个字符串的开头,而不是每一行的开头)。要删除所有非字母数字字符,您应该把取反符号 ^ 放在方括号内,使用类似 [^a-zA-Z0-9\s]
的模式。