我需要比较 2 个文件,一个 Pdf 和一个 Excel 文件,以发现数据的差异

问题描述 投票:0回答:1

我的代码似乎对于添加的状态可以正常工作,每个状态都在自己的单独行中,但是删除的状态被连接成一个长的不可读的字符串。我需要将每个差异放在其自己的单独行中,并希望将其导出到 Excel 文件和 Word 文件中。另外,为了将来的目的,我想动态添加文件而不是硬编码

import pandas as pd
from PyPDF2 import PdfReader
import difflib
import re
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom.minidom import parseString
from docx import Document

# Function to extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# Function to read Excel file and return a list of rows
def read_excel(excel_path):
    try:
        df = pd.read_excel(excel_path)
        return [row.tolist() for _, row in df.iterrows()]
    except FileNotFoundError:
        print("Excel file not found.")
        return []

# Function to clean text data
def clean_text(text):
    # Remove any characters that are not printable ASCII or illegal in Excel
    cleaned_text = ''.join(filter(lambda x: x.isprintable() or x.isspace() or ord(x) > 31, text))
    # Replace any problematic characters with an empty string
    cleaned_text = re.sub(r'[^\x20-\x7E]', '', cleaned_text)
    return cleaned_text

# Function to find differences between two texts
def find_differences(text1, text2):
    diff = difflib.ndiff(text1.splitlines(), text2.splitlines())
    diff_data = []
    current_deleted_text = ""
    for line in diff:
        if line.startswith('+'):
            if current_deleted_text:
                diff_data.append({"Text": current_deleted_text.strip(), "Status": "Deleted"})
                current_deleted_text = ""
            diff_data.append({"Text": line[2:], "Status": "Added"})
        elif line.startswith('-'):
            current_deleted_text += line[2:] + "\n"
        else:
            if current_deleted_text:
                diff_data.append({"Text": current_deleted_text.strip(), "Status": "Deleted"})
                current_deleted_text = ""
            diff_data.append({"Text": line[2:], "Status": "Matched"})
    if current_deleted_text:
        diff_data.append({"Text": current_deleted_text.strip(), "Status": "Deleted"})
    return diff_data


# Define the PDF and Excel file paths
pdf_path = "pdf_name.pdf"
excel_path = 'excel_name'

# Extract text from Excel
excel_rows = read_excel(excel_path)

# Extract text from PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Clean the PDF text
pdf_text_cleaned = clean_text(pdf_text)

# Find differences between PDF and Excel files
diff_data = find_differences(pdf_text_cleaned, '\n'.join([' '.join(row) for row in excel_rows]))

# Create a DataFrame to store the differences
diff_df = pd.DataFrame(diff_data, columns=["Text", "Status"])

# Define colors for highlighting
highlight_colors = {'Added': '#00FF00', 'Deleted': '#FF0000', 'Matched': '#FFFFFF'}  # Green for Added, Red for Deleted, White for Matched

# Export to Excel with highlighting
excel_writer = pd.ExcelWriter("differences.xlsx", engine='xlsxwriter')
diff_df.to_excel(excel_writer, index=False)

# Get the xlsxwriter workbook and worksheet objects
workbook = excel_writer.book
worksheet = excel_writer.sheets['Sheet1']

# Add cell formats for highlighting
for status, color in highlight_colors.items():
    cell_format = workbook.add_format({'bg_color': color})
    # Find cells with the specified status and apply the cell format
    for row_idx, row in diff_df.iterrows():
        if row['Status'] == status:
            worksheet.set_row(row_idx + 1, None, cell_format)

# Save the Excel file
excel_writer._save()

# Export to CSV
diff_df.to_csv("differences.csv", index=False)

# Function to convert CSV to XML
def csv_to_xml(csv_file, xml_file):
    df = pd.read_csv(csv_file)
    root = Element('Differences')
    for idx, row in df.iterrows():
        diff = SubElement(root, 'Difference')
        text = SubElement(diff, 'Text')
        text.text = row['Text']
        status = SubElement(diff, 'Status')
        status.text = row['Status']
    xml_data = tostring(root)
    xml_str = parseString(xml_data).toprettyxml()
    with open(xml_file, 'w') as f:
        f.write(xml_str)

# Function to generate Word document from XML
def xml_to_docx(xml_file, docx_file):
    doc = Document()
    tree = ElementTree.parse(xml_file)
    root = tree.getroot()
    for diff in root.findall('Difference'):
        text = diff.find('Text').text
        status = diff.find('Status').text
        if status == 'Added':
            doc.add_paragraph(text).bold = True
        elif status == 'Deleted':
            doc.add_paragraph(text).italic = True
        else:
            doc.add_paragraph(text)
    doc.save(docx_file)

# Convert CSV to XML
csv_file = 'differences.csv'
xml_file = 'differences.xml'
csv_to_xml(csv_file, xml_file)

# Read CSV file into a DataFrame
df = pd.read_csv('differences.csv')

# Create a new Word document
doc = Document()

# Add a table to the document
table = doc.add_table(rows=1, cols=2)
table.style = 'Table Grid'

# Add header row to the table
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Text'
hdr_cells[1].text = 'Status'

# Add each difference as a row in the table
for idx, row in df.iterrows():
    row_cells = table.add_row().cells
    row_cells[0].text = row['Text']
    row_cells[1].text = row['Status']

# Save the document using a context manager
with open('differences.docx', 'wb') as f:
    doc.save(f)

print("Word document created successfully.")

我希望差异位于其自己的单独行中,并以其状态对应的颜色突出显示

python pandas compare pypdf
1个回答
0
投票

您的问题可能来自于处理已删除文本的方式,只需使用

current_deleted_text += line[2:] + "\n"
将文本和换行符添加到单个字符串,然后使用
diff_data
 添加一行到 
diff_data.append({"Text": current_deleted_text.strip(), "Status": "Deleted"})

 

尝试这样做:

if current_deleted_text:
    for s in current_deleted_text.split('\n')[:-1]:
        diff_data.append({"Text": s.strip(), "Status": "Deleted"})

而不是这个:

if current_deleted_text:
    diff_data.append({"Text": current_deleted_text.strip(), "Status": "Deleted"})

[:-1]
确保您不会因为添加到
Deleted
的最后一个字符串而得到空的
current_deleted_text
行。

这有帮助吗?

© www.soinside.com 2019 - 2024. All rights reserved.