我想使用 PyPDF2 从一个目录中的 PDF 文件提取文本,并保存为另一个目录中的文本文件(PDF => .txt)。
我在这里阅读了这些信息:https://automatetheboringstuff.com/chapter13/
但我没有找到有关批量转换文件的信息
import PyPDF2
# Open the sample PDF in binary read mode and hand the file object to PyPDF2.
pdfFileObj = open('meetingminutes.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# Bare expression: the page count is evaluated but not stored or printed
# (its value is only shown when typed in an interactive session).
pdfReader.numPages
# Fetch the page at index 0 (presumably the first page) and extract its text;
# the extracted text is likewise discarded here.
pageObj = pdfReader.getPage(0)
pageObj.extractText()
# NOTE(review): pdfFileObj is never closed in this snippet.
我正在寻找一种解决方案,把一个目录中的所有 PDF 文件批量转换为另一个目录中同名的 .txt 文件。
您可以查看以下代码
import os
import PyPDF2
# Folder scanned for input PDF files; placeholder value, replace before running.
PDFS_FOLDER = '/absolute/path/of/your/pdf/folder'
# Folder that receives the generated .txt files; as the placeholder says, it is
# expected to exist already.
TEXTS_FOLDER = '/absolute/path/of/your/txt/folder/which/is/already/created'
def get_all_pdfs(folder_path):
    """
    List the PDF files stored directly inside a folder.

    Only regular files whose name ends with ``.pdf`` (case-insensitive) are
    kept, so sub-directories and unrelated files are never handed to the PDF
    reader. The names are sorted so that successive runs process the files in
    the same order.

    :param folder_path: path of the folder containing the pdfs
    :return: a sorted list with the file names (not full paths) of the pdfs
    """
    return sorted(
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, entry))
    )
def create_absolute_path(root_path, file_name):
    """
    Build the path of a file located inside a folder.

    :param root_path: path of the folder, with or without a trailing separator
    :param file_name: name of the file inside that folder
    :return: ``root_path`` and ``file_name`` joined with the platform separator
    """
    # os.path.join adds a separator only when one is missing and, unlike
    # indexing root_path[-1], does not fail when root_path is an empty string.
    return os.path.join(root_path, file_name)
def convert_pdf_to_text(pdf_path):
    """
    Extract the text of every page of a PDF file.

    :param pdf_path: path of the pdf file to read
    :return: bytearray with the UTF-8 encoded text of all pages, in page order
    """
    # NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
    # legacy camelCase PyPDF2 names; newer releases rename them -- confirm
    # against the installed PyPDF2 version.
    text_byte_array = bytearray()
    # The context manager closes the file even if PyPDF2 raises while parsing;
    # the original left the handle open for every converted file.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(pdf_reader.getNumPages()):
            page_text = pdf_reader.getPage(page_number).extractText()
            text_byte_array.extend(page_text.encode('utf-8'))
    return text_byte_array
def convert_pdf_extension_to_text(pdf_file_name):
    """
    Derive the name of the text file that corresponds to a pdf file name.

    Only a trailing ``.pdf`` extension (matched case-insensitively) is
    replaced, so a ``.pdf`` occurring earlier in the name is left untouched.
    A name without a ``.pdf`` extension simply gets ``.txt`` appended.

    :param pdf_file_name: string which contains a pdf file name
    :return: string with the filename but with .txt extension instead of .pdf
    """
    base_name, extension = os.path.splitext(pdf_file_name)
    if extension.lower() != '.pdf':
        # Not a .pdf name: keep it whole and just append the new extension.
        base_name = pdf_file_name
    return "%s.txt" % base_name
def save_text_to_folder(text, target_path):
    """
    Write binary content to a file, replacing whatever the file held before.

    :param text: bytes or bytearray to store
    :param target_path: path of the file to write
    :return: None
    """
    output_file = open(target_path, 'wb')
    try:
        output_file.write(text)
    finally:
        # Always release the handle, whether or not the write succeeded.
        output_file.close()
if __name__ == '__main__':
    # Convert every entry found in PDFS_FOLDER and store the extracted text
    # under the matching .txt name in TEXTS_FOLDER.
    for source_name in get_all_pdfs(PDFS_FOLDER):
        source_path = create_absolute_path(PDFS_FOLDER, source_name)
        extracted = convert_pdf_to_text(source_path)
        destination_path = create_absolute_path(
            TEXTS_FOLDER, convert_pdf_extension_to_text(source_name)
        )
        save_text_to_folder(extracted, destination_path)