PDF分割代码运行后不输出pdf

问题描述 投票:0回答:1

我最近在做一个任务:修改一段 Python 代码,该代码读取 PDF,按要求将其拆分,并把拆分后的文件输出到一个文件夹中。代码能够读取 PDF,也能给出创建各个文件所需的页数,但运行后输出目录中并没有生成任何文档。

在我为了让它能更好地读取 PDF 而做了修改之后,就再也得不到任何输出了。

import re
import time
import os
from PyPDF2 import PdfReader, PdfWriter
import fitz
import parameters

# Folder holding the certificate PDFs to split. os.path.join avoids the
# invalid "\C" escape sequence of the original literal and works on any OS.
input_dir = os.path.join(parameters.inputs_foldername, 'Certificates')
os.makedirs(input_dir, exist_ok=True)

# Folder receiving one PDF per detected parid.
output_dir = os.path.join('Outputs_' + parameters.batch_name, 'Split Certificates')
os.makedirs(output_dir, exist_ok=True)

# A 7-digit id that directly follows a "*  *  *" line; compiled once
# instead of once per page.
PARID_PATTERN = re.compile(r"(?<=\*  \*  \*\n)\d{7}")


def _write_split_pdf(pdf_writer, parid):
    """Write the pages collected in `pdf_writer` to '<output_dir>/<parid>.pdf'."""
    with open(os.path.join(output_dir, f"{parid}.pdf"), "wb") as out:
        pdf_writer.write(out)


for path in os.listdir(input_dir):
    full_path = os.path.join(input_dir, path)

    t0 = time.time()

    new = True          # True while waiting for the first page of a document
    pdf_writer = None   # writer for the document currently being collected
    parid = None        # id of the document currently being collected
    n_pages = 0

    # Text is extracted with fitz; the page objects that get copied come
    # from PyPDF2, so the same file is opened by both libraries.
    with fitz.open(full_path) as doc:
        pdf = PdfReader(full_path)
        n_pages = len(pdf.pages)
        for page_index in range(n_pages):
            text = doc[page_index].get_text()
            has_marker = "Page 1 of" in text

            if not has_marker and new:
                # Candidate first page of a new document: it must carry a parid.
                match = PARID_PATTERN.search(text)
                if match is None:
                    print(f"Pattern not found in text: {text}")
                    continue
                parid = match.group()
                pdf_writer = PdfWriter()
                pdf_writer.add_page(pdf.pages[page_index])
                new = False
            elif not has_marker:
                # Continuation page of the document being collected.
                pdf_writer.add_page(pdf.pages[page_index])
            elif not new:
                # A "Page 1 of" page closes the current document.
                # NOTE(review): this page itself is not added to any output,
                # exactly as in the original logic - confirm that is intended.
                _write_split_pdf(pdf_writer, parid)
                # Reset so the finished document is not written a second time
                # under the next document's parid.
                pdf_writer = None
                new = True

    # Save the document still being collected when the file ends.
    if pdf_writer is not None:
        _write_split_pdf(pdf_writer, parid)

    t1 = time.time()
    # Report the real page count; the loop index was off by one whenever the
    # last page started a new document.
    print(str(n_pages)+" pages processed in " + str(int(t1-t0)) + " seconds.")
python python-3.x pdf output filesplitting
1个回答
0
投票

正如我在评论中提到的,在不了解您的 PDF 或其他细节的情况下很难提供帮助,但我认为您要找的可能是类似下面这样的代码。思路是:用一个函数处理 PDF,并逐页 `yield` 出 `parid`(无论它代表什么)及其对应的页面;另一个函数使用 `itertools.groupby` 将这些页面按 `parid` 分组(假设每个 `parid` 的页面是连续的;我认为原始代码也是这样假设的),并将每组页面复制到一个写入器(`PdfWriter`)中。

import glob
import itertools
import os
import re
import time

import fitz
import parameters
from PyPDF2 import PdfReader, PdfWriter

# Source folder scanned for certificate PDFs; created if it does not exist.
input_dir = '{}/Certificates'.format(parameters.inputs_foldername)
os.makedirs(input_dir, exist_ok=True)

# Destination folder for the split PDFs; created if it does not exist.
output_dir = 'Outputs_{}/Split Certificates'.format(parameters.batch_name)
os.makedirs(output_dir, exist_ok=True)


def get_pages_and_parids(pdf_path):
    """
    Yield ``(parid, page)`` pairs for the PDF at `pdf_path`.

    Page text is extracted with fitz, while the yielded page objects come
    from PyPDF2's ``PdfReader``. For a page whose text lacks "Page 1 of",
    a 7-digit id following a "*  *  *" line is looked up: if it is found it
    becomes the current parid, otherwise the page is skipped entirely.
    A page containing "Page 1 of" is yielded with the most recently found
    parid (``None`` if none has been found yet).
    """
    current_parid = None
    parid_regex = r"(?<=\*  \*  \*\n)\d{7}"
    with fitz.open(pdf_path) as doc:
        reader = PdfReader(pdf_path)
        for index, page in enumerate(reader.pages):
            page_text = doc[index].get_text()
            if "Page 1 of" not in page_text:
                match = re.search(parid_regex, page_text)
                if match is None:
                    # No id on this page: drop it without yielding.
                    continue
                current_parid = match.group()
            yield current_parid, page


def process_pdf(full_path):
    """
    Split the PDF at `full_path` into one output file per parid.

    Consecutive ``(parid, page)`` pairs from ``get_pages_and_parids`` that
    share a parid are copied into a single ``PdfWriter`` and saved as
    ``<output_dir>/<parid>.pdf``. Pairs whose parid is falsy are counted but
    not written. A summary line with page counts and elapsed time is printed.

    ``itertools.groupby`` only merges *adjacent* equal keys, so a parid that
    reappears later in the file produces a second group that overwrites the
    first output file.
    """
    t0 = time.time()
    n_pages = 0
    n_pages_written = 0
    for parid, pages in itertools.groupby(get_pages_and_parids(full_path), key=lambda x: x[0]):
        if not parid:
            # Nothing to write, but these pages still count as processed.
            n_pages += sum(1 for _ in pages)
            continue
        pdf_writer = PdfWriter()
        group_size = 0
        for _, page in pages:
            pdf_writer.add_page(page)
            group_size += 1
        with open(f'{output_dir}/{parid}.pdf', "wb") as out:
            pdf_writer.write(out)
        # Count pages, not groups: the original incremented once per group.
        n_pages += group_size
        n_pages_written += group_size
    t1 = time.time()
    print(f"{n_pages} pages processed, {n_pages_written} written in {full_path} in {t1 - t0} seconds.")


def main():
    """Run ``process_pdf`` on every ``*.pdf`` file directly inside `input_dir`."""
    pdf_pattern = input_dir + "/*.pdf"
    matching_paths = glob.glob(pdf_pattern)
    for pdf_file in matching_paths:
        process_pdf(pdf_file)


# Only run the batch when executed as a script, not when imported.
if __name__ == '__main__':
    main()
最新问题
© www.soinside.com 2019 - 2024. All rights reserved.