我正在编写代码:如果 PDF 中包含图像,则使用 pytesseract 和 PyMuPDF 从这些图像中提取文本

问题描述 投票:0回答:1

下面的代码报错:“Error processing image: 'dict' object has no attribute 'width'”('dict' 对象没有属性 'width')。这段代码读取 PDF 中的图像;如果 PDF 含有图像,它应当从图像中提取文本,并对照给定的关键字列表进行检查(为了简洁,这里省略了关键字检查的方法)。

import os
import pytesseract
import fitz  # PyMuPDF
from PIL import Image

def extract_text_from_image(image):
    """Run Tesseract OCR on one image taken from a PDF and return its text.

    Args:
        image: Either the dict returned by ``Document.extract_image(xref)``
            (the encoded image file is stored as bytes under the ``"image"``
            key), or a Pixmap-like object exposing ``width``, ``height`` and
            ``samples``.

    Returns:
        The recognised text, or ``""`` when the image cannot be processed.
        Failures are printed and never propagated, so one bad image does
        not stop the caller from handling the remaining ones.
    """
    import io  # local import: only needed to wrap the raw image bytes

    try:
        if isinstance(image, dict):
            # extract_image() hands back the embedded image *file*
            # (PNG, JPEG, ...), not raw pixels, so let Pillow decode it.
            pil_image = Image.open(io.BytesIO(image["image"]))
        else:
            # Raw pixel buffer: the size must be a (width, height) tuple.
            # NOTE(review): assumes 3-channel RGB samples without alpha --
            # confirm for pixmaps using other colorspaces.
            pil_image = Image.frombytes(
                "RGB", (image.width, image.height), image.samples
            )
        pil_image = pil_image.convert("L")  # Convert to grayscale
        return pytesseract.image_to_string(pil_image, config='--oem 3 --psm 6')
    except Exception as e:
        # Best-effort boundary: log the error and continue processing
        print(f"Error processing image: {str(e)}")
        return ""

def process_pdf(pdf_path):
    """OCR every embedded image of a PDF and print the text page by page.

    For each page the embedded images are listed; each one is extracted and
    passed to ``extract_text_from_image``, and the resulting text is printed.
    A textual progress bar is printed after every page, and once all pages
    are done the file size is reported if it exceeds 10 MB.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. Any exception is printed and swallowed so that a caller
        looping over several files can carry on with the next one.
    """
    try:
        # The context manager guarantees the document is closed even when
        # processing a page fails part-way through.
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count
            for page_no, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)
                if image_list:
                    for img in image_list:
                        xref = img[0]  # first entry of an image record is its xref
                        base_image = doc.extract_image(xref)
                        extracted_text = extract_text_from_image(base_image)
                        # Process the extracted text (e.g., check for keywords)
                        # Log relevant information
                        print(f"Page {page_no}/{total_pages}: Extracted text: {extracted_text}")
                else:
                    print(f"Page {page_no}/{total_pages}: No images found")
                # Show progress to the user: 50-character bar, one '#' per 2%
                progress_percent = page_no / total_pages * 100
                print(f"Processing progress: [{'#' * int(progress_percent / 2):50s}] {progress_percent:.2f}%")
        # Check file size (informational only: nothing is skipped here)
        file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
        if file_size_mb > 10:  # Adjust the threshold as needed
            print(f"File size ({file_size_mb:.2f} MB) exceeds threshold. Moving to the next file.")
    except Exception as e:
        # Log the error and continue processing other files
        print(f"Error processing PDF {pdf_path}: {str(e)}")


# Example invocation; "your_pdf.pdf" looks like a placeholder -- replace it with a real PDF path.
process_pdf(r"your_pdf.pdf")
python-3.x python-tesseract pymupdf
1个回答
0
投票
import fitz  # PyMuPDF - nothing else is needed
# Open the source PDF; the context manager closes it once all pages are done.
with fitz.open("input.pdf") as doc:
    for page in doc:
        for item in page.get_images():
            xref = item[0]  # first entry of an image record is its xref
            pix = fitz.Pixmap(doc, xref)  # make Pixmap from image
            # OCR the image, make a 1-page PDF from it
            pdfdata = pix.pdfocr_tobytes()
            # Close each temporary OCR document instead of leaking one per image.
            with fitz.open("pdf", pdfdata) as ocrpdf:
                ocrtext = ocrpdf[0].get_text()  # extract OCR-ed text
            # ... do something with the text
            # note: text details / metadata / positions etc. are available too

注意:我是PyMuPDF的维护者和原始创建者。

© www.soinside.com 2019 - 2024. All rights reserved.