我正在尝试使用Python提取这个PDF文件(this)中包含的文本。
我正在使用PyPDF2模块,并具有以下脚本:
import PyPDF2
# PDF is a binary format: open it in 'rb' so the bytes reach the reader
# unmodified, and let the context manager close the handle even on errors.
with open('sample.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    # Only the first page (index 0) is extracted.
    page = read_pdf.getPage(0)
    page_content = page.extractText()
# print() call form works on both Python 2 and Python 3.
print(page_content)
当我运行代码时,我得到以下输出,该输出与PDF文档中包含的输出不同:
!"#$%#$%&%$&'()*%+,-%./01'*23%4
5'%1$#26%3/%7/))/8%&)/26%8#3"%3"*%313/9#&)
%
如何在PDF文档中提取文本?
这是用于提取文本的最简单的代码
代码:
# importing required modules
import PyPDF2
# creating a pdf file object; the context manager closes it on exit,
# including when the reader raises part-way through
with open('filename.pdf', 'rb') as pdfFileObj:
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # printing number of pages in pdf file
    print(pdfReader.numPages)
    # creating a page object (index 5 is the sixth page)
    pageObj = pdfReader.getPage(5)
    # extracting text from page
    print(pageObj.extractText())
我尝试了很多Python PDF转换器,Tika是最好的。
from tika import parser
# from_file returns a dict; the extracted text lives under the 'content' key.
raw = parser.from_file("///Users/Documents/Textos/Texto1.pdf")
# Use the text itself rather than str() of the whole dict, which would yield
# the dict's repr (escaped newlines, metadata, quotes). Fall back to an empty
# string when no text was extracted.
raw = raw.get('content') or ''
# Round-trip through UTF-8, dropping anything that cannot be encoded, so the
# result is still text and not the repr of a bytes object.
safe_text = raw.encode('utf-8', errors='ignore').decode('utf-8')
# Strip line breaks and backslashes from the extracted text.
safe_text = safe_text.replace("\n", "").replace("\\", "")
print('--- safe text ---')
print(safe_text)
pdftotext是最好最简单的! pdftotext也保留了结构。
我尝试了PyPDF2,PDFMiner和其他一些,但没有一个给出了令人满意的结果。
我在这里找到了一个解决方案PDFLayoutTextStripper
这很好,因为它可以保持原始PDF的布局。
它是用Java编写的,但我添加了一个支持Python的网关。
示例代码:
from py4j.java_gateway import JavaGateway
# Connect to the already-running Java gateway and ask it to strip the PDF.
gw = JavaGateway()
result = gw.entry_point.strip('samples/bus.pdf')
# result is a dict of {
# 'success': 'true' or 'false',
# 'payload': pdf file content if 'success' is 'true'
# 'error': error message if 'success' is 'false'
# }
# print() call form is valid on both Python 2 and Python 3.
print(result['payload'])
PDFLayoutTextStripper的样本输出:
你可以在这里看到更多细节Stripper with Python
我正在添加代码来完成此任务:它对我来说工作正常:
# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720
import os
import shutil
import struct
import warnings
from io import StringIO
import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
# Install an "ignore" filter with no category or module restriction, so every
# warning is suppressed from here on (presumably to keep the printed output clean).
warnings.filterwarnings("ignore")
def download_file(url):
    """Download *url* into the current directory and return the local file name.

    The local name is the last "/"-separated segment of the URL with every
    "%20" replaced by "_".

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved under a ``.pdf`` name.
    """
    local_filename = url.split('/')[-1].replace("%20", "_")
    # Stream the body instead of loading it into memory; the context manager
    # releases the connection even if the copy fails.
    with requests.get(url, stream=True) as r:
        print(r)
        r.raise_for_status()
        # Ask the raw stream to undo gzip/deflate content encoding; otherwise
        # the compressed bytes would be written to disk as-is.
        r.raw.decode_content = True
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename
class PDFExtractor():
    """Download a PDF from a URL and print its text, images and tables page by page.

    Page numbers accepted by the public methods are 1-based and inclusive;
    ``-1`` means "from the first page" / "through the last page".
    """

    def __init__(self, url):
        """Remember the URL of the PDF; nothing is downloaded until read_pages()."""
        self.url = url

    @staticmethod
    def _resolve_page_range(total_pages, start_page=-1, end_page=-1):
        """Translate a 1-based inclusive page selection into ``range()`` bounds.

        Args:
            total_pages: number of pages in the document.
            start_page: first page to include (1-based), or -1 for the first page.
            end_page: last page to include (1-based), or -1 for the last page.

        Returns:
            ``(start_index, stop_index)`` suitable for ``range(start_index, stop_index)``.

        Raises:
            ValueError: with the user-facing message when a bound lies outside
                ``1..total_pages``.
        """
        if start_page == -1:
            start_index = 0
        elif start_page < 1 or start_page > total_pages:
            raise ValueError("Start Page Selection Is Wrong")
        else:
            start_index = start_page - 1
        if end_page == -1:
            stop_index = total_pages
        # The last page is a valid selection, so only values beyond it are rejected.
        elif end_page < 1 or end_page > total_pages:
            raise ValueError("End Page Selection Is Wrong")
        else:
            stop_index = end_page
        return start_index, stop_index

    def break_pdf(self, filename, start_page=-1, end_page=-1):
        """Write each selected page of *filename* to its own ``<page>_<filename>`` PDF.

        Returns:
            None on success, or the error message string when the page
            selection is invalid (in which case nothing is written).
        """
        with open(filename, "rb") as source:
            pdf_reader = PdfFileReader(source)
            try:
                first, stop = self._resolve_page_range(
                    pdf_reader.numPages, start_page, end_page)
            except ValueError as err:
                return str(err)
            for i in range(first, stop):
                output = PdfFileWriter()
                output.addPage(pdf_reader.getPage(i))
                # Output files are numbered with the 1-based page number.
                with open(str(i + 1) + "_" + filename, "wb") as outputStream:
                    output.write(outputStream)

    def extract_text_algo_1(self, file):
        """Return the text of the first page via PyPDF2, with newlines and tabs removed."""
        with open(file, 'rb') as fp:
            # Extraction has to happen while the file is still open.
            text = PdfFileReader(fp).getPage(0).extractText()
        return text.replace("\n", "").replace("\t", "")

    def extract_text_algo_2(self, file):
        """Return the text of every page via pdfminer, with tabs and newlines removed."""
        resource_manager = PDFResourceManager()
        buffer = StringIO()
        device = TextConverter(resource_manager, buffer, codec='utf-8',
                               laparams=LAParams())
        try:
            with open(file, 'rb') as fp:
                interpreter = PDFPageInterpreter(resource_manager, device)
                # Empty page-number set and maxpages=0 are passed through
                # unchanged from the original call.
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
            text = buffer.getvalue()
        finally:
            # Release the converter and buffer even when a page fails to parse.
            device.close()
            buffer.close()
        return text.replace("\t", "").replace("\n", "")

    def extract_text(self, file):
        """Run both extraction algorithms and return the longer result.

        The PyPDF2 result is returned when the pdfminer result is not
        strictly longer.
        """
        text1 = self.extract_text_algo_1(file)
        text2 = self.extract_text_algo_2(file)
        if len(text2) > len(str(text1)):
            return text2
        return text1

    def extarct_table(self, file):
        """Print the tables tabula finds in *file*; print an error line on failure.

        The misspelled name is kept for existing callers; ``extract_table``
        is an alias.
        """
        # Read pdf into DataFrame
        try:
            df = tabula.read_pdf(file, output_format="csv")
        except Exception:
            # Best effort: a page without a readable table must not stop the run.
            print("Error Reading Table")
            return
        print("\nPrinting Table Content: \n", df)
        print("\nDone Printing Table Content\n")

    # Correctly spelled alias for the method above.
    extract_table = extarct_table

    def tiff_header_for_CCITT(self, width, height, img_size, CCITT_group=4):
        """Build the TIFF header that is prepended to raw CCITT fax data.

        Args:
            width: image width written to the ImageWidth tag.
            height: image height written to ImageLength and RowsPerStrip.
            img_size: byte length of the image data (StripByteCounts).
            CCITT_group: value written to the Compression tag.

        Returns:
            The packed little-endian header as bytes.
        """
        tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
        return struct.pack(
            tiff_header_struct,
            b'II',  # Byte order indication: little-endian
            42,  # Version number (always 42)
            8,  # Offset to first IFD
            8,  # Number of tags in IFD
            256, 4, 1, width,  # ImageWidth, LONG, 1, width
            257, 4, 1, height,  # ImageLength, LONG, 1, length
            258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
            259, 3, 1, CCITT_group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
            262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
            # The image data starts right after this header.
            273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, len of header
            278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
            279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image data
            0  # last IFD
        )

    def _save_image(self, image, image_name):
        """Write one XObject to disk when it is an image with a supported filter.

        Returns:
            True when a file was written, False when the object is not an
            image or uses a filter that is not handled here.
        """
        if image['/Subtype'] != '/Image':
            return False
        size = (image['/Width'], image['/Height'])
        data = image._data
        mode = "RGB" if image['/ColorSpace'] == '/DeviceRGB' else "P"
        image_filter = image['/Filter']
        print(image_filter)
        if image_filter == '/FlateDecode':
            # Flate data must be decoded before Pillow can interpret the pixels.
            img = Image.frombytes(mode, size, image.getData())
            img.save(image_name + "_Flate.png")
            print("Image_Saved")
            return True
        if image_filter == '/DCTDecode':
            # The stored stream is written unchanged as a .jpg file.
            with open(image_name + "_DCT.jpg", "wb") as img_file:
                img_file.write(data)
            return True
        if image_filter == '/JPXDecode':
            with open(image_name + "_JPX.jp2", "wb") as img_file:
                img_file.write(data)
            return True
        if image_filter == '/CCITTFaxDecode':
            CCITT_group = 4 if image['/DecodeParms']['/K'] == -1 else 3
            # getData() does not work for CCITTFaxDecode, so the raw stream
            # is wrapped in a TIFF header instead.
            tiff_header = self.tiff_header_for_CCITT(
                image['/Width'], image['/Height'], len(data), CCITT_group)
            with open(image_name + '_CCITT.tiff', 'wb') as img_file:
                img_file.write(tiff_header + data)
            return True
        return False

    def extract_image(self, filename):
        """Save every supported image found in *filename* next to it.

        Files are named ``<stem><n>_<filter>.<ext>`` where ``<stem>`` is the
        part of *filename* before the first "." and ``<n>`` counts saved images.

        Returns:
            One more than the number of images saved.
        """
        number = 1
        stem = filename.split(".")[0]
        with open(filename, 'rb') as fp:
            pdf_reader = PdfFileReader(fp)
            for i in range(pdf_reader.numPages):
                page = pdf_reader.getPage(i)
                try:
                    xObject = page['/Resources']['/XObject'].getObject()
                except Exception:
                    # A page without images must not stop the scan of later pages.
                    print("No XObject Found")
                    continue
                for obj in xObject:
                    try:
                        if self._save_image(xObject[obj], stem + str(number)):
                            number += 1
                    except Exception as err:
                        # Best effort per image, but report the failure
                        # instead of hiding it.
                        print("Skipping image", obj, "-", err)
                        continue
        return number

    def read_pages(self, start_page=-1, end_page=-1):
        """Download the PDF and print text, images and tables for the selected pages.

        Every temporary file created here (the download and the per-page
        PDFs) is removed before returning, also when an error occurs.

        Returns:
            None on success, or the error message string when the page
            selection is invalid.
        """
        # Downloading file locally
        downloaded_file = download_file(self.url)
        print(downloaded_file)
        try:
            with open(downloaded_file, 'rb') as fp:
                total_pages = PdfFileReader(fp).numPages
            try:
                first, stop = self._resolve_page_range(
                    total_pages, start_page, end_page)
            except ValueError as err:
                return str(err)
            # breaking PDF into number of pages in diff pdf files
            self.break_pdf(downloaded_file, start_page, end_page)
            for i in range(first, stop):
                # creating a page based filename
                file = str(i + 1) + "_" + downloaded_file
                print("\nStarting to Read Page: ", i + 1, "\n -----------===-------------")
                try:
                    file_text = self.extract_text(file)
                    print(file_text)
                    self.extract_image(file)
                    self.extarct_table(file)
                finally:
                    os.remove(file)
                print("Stopped Reading Page: ", i + 1, "\n -----------===-------------")
        finally:
            os.remove(downloaded_file)
# Sample documents this script was tried against (one active at a time):
# url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Healthcare-January-2017.pdf"
url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sample_Test.pdf"
# url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sazerac_FS_2017_06_30%20Annual.pdf"
# Build the extractor for the chosen document
pdf_extractor = PDFExtractor(url=url)
# Process pages 15 through 23
pdf_extractor.read_pages(start_page=15, end_page=23)
您可以从Here下载tika-app-xxx.jar(最新版)。
然后将此.jar文件放在python脚本文件的同一文件夹中。
然后在脚本中插入以下代码:
import os
import os.path
# Location of the Tika app jar, expected in the same folder as this script.
tika_dir=os.path.join(os.path.dirname(__file__),'<tika-app-xxx>.jar')
def extract_pdf(source_pdf:str,target_txt:str) -> None:
    """Run the Tika app with ``-t`` on *source_pdf* and write its output to *target_txt*.

    The command is passed as an argument list instead of a shell string, so
    paths containing spaces or shell metacharacters are handled safely. As
    before, the exit status of ``java`` is not checked.
    """
    import subprocess  # local import: the snippet's import block only brings in os
    # Opening with 'wb' truncates the target first, like the shell's '>' redirect.
    with open(target_txt, 'wb') as out:
        subprocess.run(['java', '-jar', tika_dir, '-t', source_pdf], stdout=out)
依赖更少。单个.jar文件比python包更易于管理。
多格式支持。参数source_pdf
可以是任何类型文档的路径。 (.doc,.html,.odt等)
最新。 tika-app.jar总是早于相关版本的tika python包发布。
稳定。它比PyPDF更稳定,维护良好(由Apache提供支持)。
缺点:需要安装无头(headless)版的JRE。
如果你在Windows上的Anaconda中试用它,PyPDF2可能无法处理一些具有非标准结构或unicode字符的PDF。如果您需要打开并阅读大量pdf文件,我建议使用以下代码 - 相对路径为.//pdfs//
的文件夹中的所有pdf文件的文本将存储在列表pdf_text_list
中。
from tika import parser
import glob
def read_pdf(filename):
    """Parse *filename* with Tika and hand back whatever ``parser.from_file`` returns."""
    return parser.from_file(filename)
# Forward slashes are accepted on Windows and POSIX alike, whereas the
# backslash form only matches on Windows.
all_files = glob.glob("./pdfs/*.pdf")
# Text content of every PDF found, in the order glob returned the files.
pdf_text_list = []
for file in all_files:
    text = read_pdf(file)
    pdf_text_list.append(text['content'])
print(pdf_text_list)
PyPDF2确实有效,但结果可能会有所不同。我发现它的提取结果非常不一致。
# Collect the extracted text of every page, echoing each page as it is read.
reader = PyPDF2.pdf.PdfFileReader(self._path)
eachPageText = []
for page_index in range(reader.getNumPages()):
    current_page = reader.getPage(page_index)
    pageText = current_page.extractText()
    print(pageText)
    eachPageText.append(pageText)
使用textract。
它支持许多类型的文件,包括PDF
import textract
# Extract the text of the given file. NOTE(review): textract presumably picks
# its parser from the file extension — confirm against the textract docs.
text = textract.process("path/to/file.extension")
看看这段代码:
import PyPDF2
# Binary mode is required for PDF data; the context manager closes the file
# even if parsing fails.
with open('sample.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    # Only the first page (index 0) is extracted.
    page = read_pdf.getPage(0)
    page_content = page.extractText()
# On Python 3 the text is printed directly; printing the UTF-8 encoded bytes
# would show a b'...' literal instead of the text.
print(page_content)
输出是:
!"#$%#$%&%$&'()*%+,-%./01'*23%4
5'%1$#26%3/%7/))/8%&)/26%8#3"%3"*%313/9#&)
%
使用相同的代码从201308FCR.pdf读取pdf。输出正常。
它的documentation解释了原因:
def extractText(self):
"""
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF
files, but poorly for others, depending on the generator used. This will
be refined in the future. Do not rely on the order of text coming out of
this function, as it will change if this function is made more
sophisticated.
:return: a unicode string object.
"""
在尝试textract(似乎有太多依赖项)和pypdf2(无法从我测试的pdfs中提取文本)和tika(这太慢了)之后我最终使用了xpdf中的pdftotext
(正如另一个答案中已经提到的那样)并直接从python中调用二进制文件(您可能需要调整pdftotext的路径):
import os, subprocess
# Directory that contains this script; the PDF is looked up inside it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
pdf_path = "{}/my-pdf.pdf".format(SCRIPT_DIR)
# pdftotext command line: UTF-8 output encoding, input PDF, and '-' as the
# output argument (the text is read back from the process's stdout below).
args = [
    "/usr/local/bin/pdftotext",
    '-enc', 'UTF-8',
    pdf_path,
    '-',
]
# Capture both streams so nothing is echoed to the console.
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = res.stdout.decode('utf-8')
另有一个名为pdftotext的Python包,功能基本相同,但它假定pdftotext位于/usr/local/bin中,而我是在AWS lambda中使用它,并希望从当前目录调用它。
顺便说一句:要在lambda上使用它,你需要把该二进制文件及其依赖的libstdc++.so
一并放入你的lambda函数中。我个人需要自行编译xpdf。由于相关说明会让这个答案过于冗长,我把它们放在了我的个人博客上(on my personal blog)。
您可能希望使用久经考验的xPDF及其衍生工具来提取文本,因为pyPDF2的文本提取似乎仍然存在各种问题(various issues)。
长期的答案是,文本在PDF中编码的方式有很多变化,可能需要解码PDF字符串本身,然后可能需要用CMAP进行映射,然后可能需要分析单词和字母之间的距离等。
如果PDF损坏(即显示的文本正确,但复制时会产生乱码)并且您确实需要提取文本,那么您可能需要考虑先将PDF转换为图像(使用ImageMagick),然后使用Tesseract通过OCR从图像中获取文本。
下面的代码是Python 3中问题的解决方案。在运行代码之前,请确保在您的环境中安装了PyPDF2
库。如果未安装,请打开命令提示符并运行以下命令:
pip3 install PyPDF2
解决方案代码
import PyPDF2
# The context manager closes the PDF when the loop finishes or an error occurs.
with open('sample.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    # Print the extracted text of every page in document order.
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())
使用下面的代码,可以一次性提取多页PDF的全部文本,而不必把单个页码作为参数传入
import PyPDF2
import collections
# The context manager closes the PDF even if a page fails to parse.
with open('samples.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    # Iterate the page indices directly; wrapping them in a Counter added
    # nothing, since iteration yields the same indices in the same order.
    for i in range(number_of_pages):
        page = read_pdf.getPage(i)
        page_content = page.extractText()
        # print() call form; on Python 3 the text is printed as-is.
        print(page_content)
您可以使用PDFtoText https://github.com/jalan/pdftotext
pdftotext会保留文本的格式和缩进,即使文档中包含表格也没有问题。
PyPDF2在某些情况下会忽略空格,使结果文本变得混乱;而我使用的是PyMuPDF,对它非常满意。你可以通过这个link获取更多信息。