预处理原始数据的单元测试错误

问题描述 投票:0回答:1
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk

nlp = spacy.load("en_core_web_md")

class fileread:
    def readfile(self):
        file_path = 'C:\\Users\\Documents\\Emails\\DEP72303-SYSOUT.txt'

        with open(file_path, 'r') as text:
            return text.read()
class preprocess:
    fi=fileread()
    def remove(self):
        return re.sub('^[\sA-Za-z0-9]', '',self.fi.readfile())

    def preprocess(self):
        doc = nlp(self.remove())
        filtered = []
        for token in doc:
            if token.is_stop or token.is_punct:
                continue

            filtered.append(token.lemma_)

        return " ".join(filtered)
pre=preprocess()
output=pre.preprocess()
tokens=output.split("\n")
print(tokens)

这段代码读取一个包含原始数据的文件,并使用正则表达式进行预处理,删除停用词和标点符号,最后完成词形还原。

上述程序的单元测试是:

import unittest
import os  # Make sure this line is present
from preprocessing import preprocess

class MockFileRead:
    # Mocking the file read class to use the temporary file
    def readfile(self):
        with open('test_input.txt', 'r') as text:
            return text.read()

class TestPreprocess(unittest.TestCase):
    def test_preprocess(self):
        # Assuming you have a sample input file with known content for testing
        sample_input = "Sample text content for testing purposes."

        # Create a temporary file for testing
        with open('test_input.txt', 'w') as temp_file:
            temp_file.write(sample_input)

        preprocess_instance = preprocess()
        preprocess_instance.fi = MockFileRead()

        # Expected output after preprocessing
        expected_output = "sample text content test purpose"

        self.assertEqual(preprocess_instance.preprocess(), expected_output)

        # Clean up: Remove the temporary file
        os.remove('test_input.txt')

if __name__ == 'main':
    unittest.main()

在上面的程序中我的预期输出是:示例文本内容测试目的 但我得到的输出是:充足的文本内容测试目的

如何解析代码并获得正确的输出。

python pandas nlp data-science
1个回答
0
投票

目前正在尝试仅删除每行开头的单个字符或数字。要删除所有非字母数字字符,您应该使用类似 [^a-zA-Z0-9\s]

的模式
© www.soinside.com 2019 - 2024. All rights reserved.