我最近在做一个任务:修改一段 Python 代码。该代码读取 PDF,并按要求将其拆分后输出到一个文件夹中。代码能够读取 PDF,并给出创建文件所需的页数,但没有在输出路径下生成任何文档。
为了让它更好地读取 PDF,我做了一些修改,之后就再也得不到任何输出了。
import re
import time
import os
from PyPDF2 import PdfReader, PdfWriter
import fitz
import parameters
# Folder holding the certificate PDFs to split.  Built with os.path.join
# instead of a hard-coded '\' so the literal no longer relies on the invalid
# escape sequences '\C' / '\S' and the path is valid on every platform.
input_dir = os.path.join(parameters.inputs_foldername, 'Certificates')
os.makedirs(input_dir, exist_ok=True)

# Folder receiving one PDF per certificate id ("parid").
output_dir = os.path.join('Outputs_' + parameters.batch_name, 'Split Certificates')
os.makedirs(output_dir, exist_ok=True)

# A 7-digit id that directly follows a "* * *" line.  Compiled once because it
# is applied to many pages.
_PARID_RE = re.compile(r"(?<=\* \* \*\n)\d{7}")


def _save_certificate(pdf_writer, parid):
    """Write the pages collected in `pdf_writer` to ``<output_dir>/<parid>.pdf``."""
    with open(os.path.join(output_dir, str(parid) + '.pdf'), "wb") as out:
        pdf_writer.write(out)


for path in os.listdir(input_dir):
    full_path = os.path.join(input_dir, path)
    t0 = time.time()
    # `pdf_writer` is None while no certificate is being collected; otherwise
    # it holds the pages gathered so far for `parid`.
    pdf_writer = None
    parid = None
    # fitz is used for text extraction, PyPDF2 for copying the page objects.
    with fitz.open(full_path) as doc:
        pdf = PdfReader(full_path)
        n_pages = len(pdf.pages)
        for i in range(n_pages):
            text = doc[i].get_text()
            if "Page 1 of" in text:
                # A "Page 1 of" page closes the certificate in progress.  The
                # page itself is not copied anywhere (same as before).
                # NOTE(review): presumably these pages belong to a different
                # kind of document -- confirm that dropping them is intended.
                if pdf_writer is not None:
                    _save_certificate(pdf_writer, parid)
                    pdf_writer = None
            elif pdf_writer is not None:
                # Continuation page of the certificate being collected.
                pdf_writer.add_page(pdf.pages[i])
            else:
                # Candidate first page of a new certificate: it must carry an id.
                match = _PARID_RE.search(text)
                if match is None:
                    print(f"Pattern not found in text: {text}")
                    continue
                # The previous certificate (if any) was already saved under its
                # own id when it was closed, so nothing is flushed here; doing
                # it after `parid` changes would write it under the wrong name.
                parid = match.group()
                pdf_writer = PdfWriter()
                pdf_writer.add_page(pdf.pages[i])
    # Save the certificate that was still open when the file ended.
    if pdf_writer is not None:
        _save_certificate(pdf_writer, parid)
    t1 = time.time()
    # Report the real page count rather than the last loop index.
    print(str(n_pages) + " pages processed in " + str(int(t1 - t0)) + " seconds.")
正如我在评论中提到的,在不了解您的 PDF 或其他细节的情况下很难提供帮助,但我认为您要找的可能是类似下面这样的东西……也许吧。思路是:用一个函数处理 PDF,并 `yield` 出 parid(不管它具体是什么)及其对应的页面;另一个函数使用 `itertools.groupby` 将它们整理成组(假设每个 parid 的页面是连续的;我认为原始代码也是这样假设的),再把每组页面复制到一个 writer 中。
import glob
import itertools
import os
import re
import time
import fitz
import parameters
from PyPDF2 import PdfReader, PdfWriter
# Folder searched for the "*.pdf" files to split; created if it is missing.
input_dir = f'{parameters.inputs_foldername}/Certificates'
os.makedirs(input_dir, exist_ok=True)
# Folder that receives the "<parid>.pdf" files; exist_ok=True makes re-runs
# against an already existing folder succeed.
output_dir = f'Outputs_{parameters.batch_name}/Split Certificates'
os.makedirs(output_dir, exist_ok=True)
def get_pages_and_parids(pdf_path):
    """
    Yield ``(parid, page)`` pairs for the PDF stored at `pdf_path`.

    Page text is read through ``fitz``; the page object that is yielded
    comes from ``PdfReader``.  A page whose text lacks "Page 1 of" is
    searched for a 7-digit id right after a ``* * *`` line: a hit becomes
    the current parid, a miss drops the page.  Pages that do contain
    "Page 1 of" are yielded with the most recently seen parid, which is
    ``None`` if no id has been found yet.
    """
    current_parid = None
    with fitz.open(pdf_path) as text_doc:
        reader = PdfReader(pdf_path)
        for index, page in enumerate(reader.pages):
            page_text = text_doc[index].get_text()
            if "Page 1 of" not in page_text:
                match = re.search(r"(?<=\* \* \*\n)\d{7}", page_text)
                if match is None:
                    # No id on a page that should carry one: skip the page.
                    continue
                current_parid = match.group()
            yield current_parid, page
def process_pdf(full_path):
    """
    Split the PDF at `full_path` into one output file per parid.

    Runs of consecutive pages that share a parid (as produced by
    `get_pages_and_parids`) are copied into a fresh `PdfWriter` and saved
    as ``<output_dir>/<parid>.pdf``.  Pages that arrive without a parid are
    counted but not written.  A one-line summary is printed at the end.

    Because grouping only merges consecutive pages, a parid that shows up
    again later in the same file starts a new group and overwrites the
    file written for the earlier one.
    """
    t0 = time.time()
    n_pages = 0
    n_pages_written = 0
    groups = itertools.groupby(get_pages_and_parids(full_path), key=lambda pair: pair[0])
    for parid, pairs in groups:
        if not parid:
            # No id known for these pages: consume the group so the pages
            # are still counted, but write nothing.
            n_pages += sum(1 for _ in pairs)
            continue
        pdf_writer = PdfWriter()
        n_group = 0
        for _, page in pairs:
            pdf_writer.add_page(page)
            n_group += 1
        with open(f'{output_dir}/{parid}.pdf', "wb") as out:
            pdf_writer.write(out)
        # Count pages, not groups, so the summary below is accurate.
        n_pages += n_group
        n_pages_written += n_group
    t1 = time.time()
    print(f"{n_pages} pages processed, {n_pages_written} written in {full_path} in {t1 - t0} seconds.")
def main():
    """Run `process_pdf` on every ``*.pdf`` file found directly in `input_dir`."""
    pattern = input_dir + "/*.pdf"
    pdf_paths = glob.glob(pattern)
    for pdf_path in pdf_paths:
        process_pdf(pdf_path)


# Only start processing when executed as a script, not when imported.
if __name__ == '__main__':
    main()