使用 Python 脚本按文本拆分 PDF 时,拆分出的 PDF 文件出现重复

问题描述 投票:0回答:1

我正在尝试使用 Python 按特定关键字拆分 100 个 PDF:如果 PDF 中的某一页包含该关键字,就把该页拆分为一个新的 PDF。 我遇到的问题是,生成的文件出现了重复。我已尝试各种方法来阻止这种重复,但都没有效果。

import os
import fitz  # PyMuPDF

def split_pdf_by_text(pdf_path, keyword, output_folder):
    """Save every page of a PDF whose text contains ``keyword`` as a one-page PDF.

    Each matching page is written to ``output_folder`` as
    ``<source base name>_page_<1-based page number>.pdf``.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Substring looked up (case-sensitively) in each page's
            extracted text.
        output_folder: Directory receiving the one-page PDFs; created if it
            does not exist yet.

    Returns:
        None. If ``pdf_path`` is not an existing file, an error message is
        printed and nothing else happens (no folder is created).
    """
    # Check if the provided path is a valid file
    if not os.path.isfile(pdf_path):
        print(f"Error: '{pdf_path}' is not a valid file.")
        return

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_folder, exist_ok=True)

    print(f"Processing PDF: {pdf_path}")

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Context managers close both documents even if text extraction,
    # page insertion or saving raises.
    with fitz.open(pdf_path) as pdf_document:
        # Every page is visited exactly once, so no "already processed"
        # bookkeeping is needed here.
        for page_number, page in enumerate(pdf_document):
            if keyword not in page.get_text():
                continue

            output_file_name = f"{base_name}_page_{page_number + 1}.pdf"
            output_path = os.path.join(output_folder, output_file_name)

            # Copy just this page into a fresh, empty document and save it
            with fitz.open() as new_pdf:
                new_pdf.insert_pdf(pdf_document, from_page=page_number, to_page=page_number)
                new_pdf.save(output_path)

            print(f"Page {page_number + 1} saved to: {output_path}")

# Define the function to process all PDF files in a directory
def process_all_pdfs(input_folder, keyword, output_folder):
    """Run ``split_pdf_by_text`` on every PDF found beneath ``input_folder``.

    The walk never descends into ``output_folder``. Without that guard, an
    output folder located inside the input tree is scanned as well, and the
    one-page PDFs written earlier (which still contain the keyword) are
    split again, producing duplicate files.

    If ``input_folder`` and ``output_folder`` are the very same directory the
    guard cannot apply, and results of a previous run are processed again;
    use a separate output folder in that case.

    Args:
        input_folder: Root directory searched recursively for PDF files.
        keyword: Text passed through to ``split_pdf_by_text``.
        output_folder: Directory passed through to ``split_pdf_by_text``.
    """
    output_real = os.path.realpath(output_folder)

    for root, dirs, files in os.walk(input_folder):
        # Prune in place: os.walk (top-down) only visits what remains in dirs
        dirs[:] = [
            d for d in dirs
            if os.path.realpath(os.path.join(root, d)) != output_real
        ]

        for file in files:
            # Compare case-insensitively so "*.PDF" files are not skipped
            if file.lower().endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                split_pdf_by_text(pdf_path, keyword, output_folder)

我尝试过在页面已被处理时跳过它并继续处理下一页,但这并没有解决问题。

python pdf split
1个回答
0
投票
import fitz
hundred_filenames = [...]

# For each input PDF, write every page that contains the keyword to its own
# one-page file named "<source name>-<0-based page number>.pdf".
for pdf_name in hundred_filenames:
    source = fitz.open(pdf_name)
    for page in source:
        if page.search_for(keyword) == []:
            continue  # keyword does not occur on this page
        pno = page.number
        single = fitz.open()  # new, empty PDF
        single.insert_pdf(source, from_page=pno, to_page=pno)
        single.save(f"{source.name}-{pno}.pdf")
        single.close()
    source.close()

注意:我是PyMuPDF的维护者和原始创建者。

© www.soinside.com 2019 - 2024. All rights reserved.