我正在尝试使用 Python 按特定关键字拆分 100 个 PDF：如果 PDF 中的某一页包含该关键字，就将该页单独保存为一个新的 PDF。我遇到的问题是输出文件会重复生成。我已经尝试了各种方法来阻止这种重复，但问题依然存在。
import os
import fitz # PyMuPDF
def split_pdf_by_text(pdf_path, keyword, output_folder):
    """Save every page of a PDF that contains ``keyword`` as its own PDF.

    Each matching page is written to ``output_folder`` as
    ``<source file stem>_page_<1-based page number>.pdf``.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Text to look for. Matching is a case-sensitive substring
            test against the text extracted from each page.
        output_folder: Directory that receives the single-page PDFs. It is
            created when missing.

    Returns:
        None. When ``pdf_path`` is not an existing file, an error message is
        printed and nothing is created.
    """
    # Check if the provided path is a valid file
    if not os.path.isfile(pdf_path):
        print(f"Error: '{pdf_path}' is not a valid file.")
        return

    # exist_ok replaces the separate exists() test and its check-then-create
    # race.
    os.makedirs(output_folder, exist_ok=True)

    print(f"Processing PDF: {pdf_path}")
    source_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    # Context managers close both documents even when text extraction,
    # page insertion or saving raises.
    with fitz.open(pdf_path) as pdf_document:
        # One pass over the document visits each page exactly once, so no
        # "already processed" bookkeeping is required here.
        for page in pdf_document:
            page_number = page.number  # 0-based index within the source
            if keyword not in page.get_text():
                continue

            output_file_name = f"{source_stem}_page_{page_number + 1}.pdf"
            output_path = os.path.join(output_folder, output_file_name)

            # Copy just this page into a fresh, empty document and save it.
            with fitz.open() as new_pdf:
                new_pdf.insert_pdf(
                    pdf_document,
                    from_page=page_number,
                    to_page=page_number,
                )
                new_pdf.save(output_path)
            print(f"Page {page_number + 1} saved to: {output_path}")
# Define the function to process all PDF files in a directory
def process_all_pdfs(input_folder, keyword, output_folder):
    """Split every ``.pdf`` file found under ``input_folder`` by keyword.

    The directory tree is walked recursively and each file whose name ends
    with ``.pdf`` is handed to ``split_pdf_by_text``.

    Args:
        input_folder: Root directory to search for PDF files.
        keyword: Text passed through to ``split_pdf_by_text``.
        output_folder: Destination directory passed through to
            ``split_pdf_by_text``.

    Returns:
        None.
    """
    output_abs = os.path.abspath(output_folder)

    for root, dirs, files in os.walk(input_folder):
        # Prune the output folder in place so os.walk never descends into it.
        # Otherwise single-page PDFs written by an earlier run (or earlier in
        # this run) are picked up as inputs and split again, producing
        # duplicate files.
        dirs[:] = [
            name
            for name in dirs
            if os.path.abspath(os.path.join(root, name)) != output_abs
        ]

        for file in files:
            if file.endswith(".pdf"):
                # Get the full path of the PDF file and process it
                pdf_path = os.path.join(root, file)
                split_pdf_by_text(pdf_path, keyword, output_folder)
我尝试在页面已被处理时跳过它并继续处理下一页，但这并没有解决问题。
import fitz
# Placeholder: replace with the actual list of PDF file paths.
hundred_filenames = [...]
# NOTE(review): `keyword` is not defined in this snippet; it must be bound
# before the loop runs.
for filename in hundred_filenames:
    doc = fitz.open(filename)
    for page in doc:
        # An empty search result means the keyword is not on this page.
        if not page.search_for(keyword):
            continue
        out_name = f"{doc.name}-{page.number}.pdf"
        # Copy only the matching page into a new, empty document.
        new = fitz.open()
        new.insert_pdf(doc, from_page=page.number, to_page=page.number)
        new.save(out_name)
        new.close()
    doc.close()
注意：我是 PyMuPDF 的维护者和原作者。