我正在尝试只提取多个 PDF 文件的第一页，并将它们合并为一个文件。（我每天收到 150 个 PDF 文件，第一页是我需要的发票，后面的 3 到 12 页只是我不需要的备份。）所以输入是 150 个大小不一的 PDF 文件，我想要的输出是 1 个 PDF 文件，其中仅包含这 150 个文件各自的第一页。
但我的代码实际做的似乎是：合并了除第一页（我唯一需要的那一页）之外的所有页面。
# Original attempt from the question, kept exactly as posted. It does not
# produce the wanted output; the comments below point out why.
# NOTE(review): indentation was lost in this listing, so the loop nesting is
# inferred from the statements themselves -- confirm against the original post.
# Get all PDF documents in current directory
import os
pdf_files = []
for filename in os.listdir("."):
# Case-sensitive test: only names ending in lowercase ".pdf" are collected.
if filename.endswith(".pdf"):
pdf_files.append(filename)
# Order the names alphabetically, ignoring case.
pdf_files.sort(key=str.lower)
# Take first page from each PDF
# (that is the stated intent; the loop below actually does the opposite)
from PyPDF2 import PdfFileWriter, PdfFileReader
for filename in pdf_files:
reader = PdfFileReader(filename)
# NOTE(review): the writer appears to be re-created for every input file, so
# pages added for earlier files would not be carried over -- confirm nesting.
writer = PdfFileWriter()
# range(1, ...) starts at index 1, so page index 0 (the first page) is never
# read; every page after it is added instead.
for pageNum in range(1, reader.numPages):
page = reader.getPage(pageNum)
writer.addPage(page)
# Write whatever the current writer holds to the combined output file.
with open("CombinedFirstPages.pdf", "wb") as fp:
writer.write(fp)
试试这个:
# Merge the first page of every PDF found under the target folder
# (sub-folders included) into a single output PDF.
import os

from PyPDF2 import PdfFileReader, PdfFileWriter

your_target_folder = "."

# Absolute path of the merged file. It is written into the current directory,
# which may lie inside the folder being walked, so it has to be excluded from
# the inputs; otherwise a second run would merge the previous result as well.
output_path = os.path.abspath("CombinedFirstPages.pdf")

# Get all PDF documents in the target folder and its sub-folders
pdf_files = []
for dirpath, _, filenames in os.walk(your_target_folder):
    for items in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        if file_full_path == output_path:
            continue
        # Lower-casing the path makes the extension test case-insensitive.
        if file_full_path.lower().endswith(".pdf"):
            pdf_files.append(file_full_path)

# Case-insensitive ordering by full path.
pdf_files.sort(key=str.lower)

# Take first page from each PDF
writer = PdfFileWriter()  # a single writer collects the pages of all inputs
for file_path in pdf_files:
    reader = PdfFileReader(file_path)
    writer.addPage(reader.getPage(0))  # index 0 is the first page

with open(output_path, "wb") as output:
    writer.write(output)
做了一些改变。以下代码对我有用。
import os
from PyPDF2 import PdfWriter, PdfReader
# Name of the merged file. It is written into the same directory that is
# scanned for inputs, so it must be left out of the input list; otherwise
# running the script twice would merge the previous result as well.
output_name = "CombinedFirstPages.pdf"

# Get all PDF documents in current directory
# (the extension test is case-insensitive, so "*.PDF" is picked up too)
pdf_files = [
    filename
    for filename in os.listdir(".")
    if filename.lower().endswith(".pdf") and filename != output_name
]
# Case-insensitive alphabetical order.
pdf_files.sort(key=str.lower)

# Take first page from each PDF
pdf_writer = PdfWriter()  # a single writer collects the pages of all inputs
for filename in pdf_files:
    reader = PdfReader(filename)
    pdf_writer.add_page(reader.pages[0])  # index 0 is the first page

with open(output_name, "wb") as fp:
    pdf_writer.write(fp)
如果 PDF 不含文本、只包含图像，其他答案将不起作用。以下代码适用于任何类型的 PDF（参见相关的 pypdf 文档）。
from pypdf import PdfWriter
from pathlib import Path
# Collect every PDF below the folder (sub-folders included), in sorted order.
source_dir = Path("<path-to-folder-with-files>")
pdf_files = list(source_dir.glob('**/*.pdf'))
pdf_files.sort()

# Take first page from each PDF: pages=(0, 1) selects only page index 0.
pdf_writer = PdfWriter()
for pdf_path in pdf_files:
    pdf_writer.append(pdf_path, pages=(0, 1))

# Save the merged document.
with open("CombinedFirstPages.pdf", "wb") as merged:
    pdf_writer.write(merged)
此脚本会获取当前执行目录下的所有 PDF 文件，并将每个文件的第一页转换为 PNG。
#pip install pdf2image
import os
import tempfile
from pdf2image import convert_from_path
output_folder = os.getcwd()  # current working directory


def pdf_to_png(pdf_name, source, destino):
    """Convert one PDF to a single PNG image via pdf2image.

    Args:
        pdf_name: File name of the PDF, including its extension.
        source: Directory that contains ``pdf_name``.
        destino: Directory passed to pdf2image as the output folder.
    """
    # splitext removes whatever extension is present instead of assuming it
    # is exactly four characters long.
    base_name = os.path.splitext(pdf_name)[0]
    convert_from_path(
        pdf_path=os.path.join(source, pdf_name),
        dpi=100,
        output_folder=destino,
        fmt="png",
        output_file=base_name,
        # One output file per PDF; presumably this is what limits the
        # conversion to the first page -- see the pdf2image documentation.
        single_file=True,
    )


# Convert every PDF in the working directory; the extension test is
# case-insensitive so "*.PDF" files are handled as well.
for filename in os.listdir(output_folder):
    if filename.lower().endswith(".pdf"):
        pdf_to_png(filename, output_folder, output_folder)
print("ok!")