下面的代码报错:Error processing image: 'dict' object has no attribute 'width'。这段代码读取 PDF 中的图像;如果 PDF 含有图像,就应从图像中提取文本,并对照给定的关键字列表进行检查(为简洁起见,这里省略了关键字检查的方法)。
import os
import pytesseract
import fitz # PyMuPDF
from PIL import Image
def extract_text_from_image(image):
    """Run Tesseract OCR on one image and return the recognized text.

    Args:
        image: One of:
            * the dict returned by ``Document.extract_image()``, which
              carries the encoded image bytes under the ``"image"`` key;
            * raw encoded image bytes (PNG, JPEG, ...);
            * a Pixmap-like object exposing ``width``, ``height`` and
              ``samples``.

    Returns:
        The OCR text, or ``""`` when the image cannot be decoded or
        recognized. Errors are printed and never raised, so one bad image
        does not stop the caller's loop.
    """
    import io  # local import: only needed to wrap encoded bytes for PIL

    try:
        if isinstance(image, dict):
            # extract_image() yields a dict, not a Pixmap, so it has no
            # .width/.height/.samples attributes. Let PIL decode the
            # encoded bytes instead of rebuilding the image from samples.
            picture = Image.open(io.BytesIO(image["image"]))
        elif isinstance(image, (bytes, bytearray)):
            picture = Image.open(io.BytesIO(image))
        else:
            # Pixmap-like input; assumes 3-channel RGB samples without an
            # alpha channel -- TODO confirm for CMYK / transparent images.
            picture = Image.frombytes(
                "RGB", (image.width, image.height), image.samples
            )
        grayscale = picture.convert("L")  # Convert to grayscale
        return pytesseract.image_to_string(grayscale, config="--oem 3 --psm 6")
    except Exception as e:
        # Log the error and continue processing
        print(f"Error processing image: {str(e)}")
        return ""
def process_pdf(pdf_path):
    """OCR every embedded image of a PDF and print the text page by page.

    For each page, the embedded images are extracted and passed to
    ``extract_text_from_image``; the recognized text, a progress bar and,
    at the end, a file-size notice are printed.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. Any error is printed rather than raised, so a caller looping
        over many files can continue with the next one.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count
            for page_number, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)
                if image_list:
                    for img in image_list:
                        xref = img[0]  # first entry of an image item is its xref
                        base_image = doc.extract_image(xref)
                        extracted_text = extract_text_from_image(base_image)
                        # Process the extracted text (e.g., check for keywords)
                        # Log relevant information
                        print(f"Page {page_number}/{total_pages}: Extracted text: {extracted_text}")
                else:
                    print(f"Page {page_number}/{total_pages}: No images found")
                # Show progress to the user
                progress_percent = page_number / total_pages * 100
                print(f"Processing progress: [{'#' * int(progress_percent / 2):50s}] {progress_percent:.2f}%")
        # Check file size. The size does not change between pages, so it is
        # evaluated once, after the page loop.
        # NOTE(review): the message suggests the file should be skipped;
        # confirm whether this check was meant to run before processing.
        file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
        if file_size_mb > 10:  # Adjust the threshold as needed
            print(f"File size ({file_size_mb:.2f} MB) exceeds threshold. Moving to the next file.")
    except Exception as e:
        # Log the error and continue processing other files
        print(f"Error processing PDF {pdf_path}: {str(e)}")
# Example invocation at module level: runs as soon as the file is executed.
# The path looks like a placeholder -- substitute a real PDF path.
process_pdf(r"your_pdf.pdf")
import fitz # PyMuPDF - nothing else is needed
# OCR every image referenced by the pages of "input.pdf" using PyMuPDF only.
# The context manager closes the input document once all pages are done.
with fitz.open("input.pdf") as doc:
    for page in doc:
        for item in page.get_images():
            xref = item[0]  # first entry of an image item is its xref
            pix = fitz.Pixmap(doc, xref)  # make Pixmap from image
            # OCR the image, make a 1-page PDF from it
            pdfdata = pix.pdfocr_tobytes()
            # Close the temporary OCR document as soon as its text is read,
            # so one open document per image does not accumulate.
            with fitz.open("pdf", pdfdata) as ocrpdf:
                ocrtext = ocrpdf[0].get_text()  # extract OCR-ed text
            # ... do something with the text
            # note: text details / metadata / positions etc. are available too
注意:我是PyMuPDF的维护者和原始创建者。