我有一个项目，需要将即时通讯应用的屏幕截图转换为机器可读的格式（可能是 JSON）。你能否帮我概述一下这个算法的基本思路？我打算用 Python 来实现这个算法。
如何保留一来一回的对话结构？我是否应该将源图像切分成单独的块，每个蓝色/白色聊天气泡对应一个块？随后，我会把这些气泡逐个输入 OCR 引擎，并保持它们原有的顺序。
哪种 OCR 引擎最适合处理屏幕截图？我的源图像显然不是手写的，其中的文本是以固定字体和字号呈现的机器印刷体。得益于如今的“视网膜”显示屏，屏幕截图的分辨率很高，但 DPI 仍然较低。我是否应该在识别之前对图像进行缩放或调整大小？
我编写了一个算法，它将聊天屏幕截图转换为 JSON 风格的消息列表，并用 "sender" 键标记每条消息来自“我”（Me）还是“对方”（Partner）。
import tesserocr
from nostril import nonsense
import cv2
from PIL import Image, ImageEnhance
from google.colab.patches import cv2_imshow
import pytesseract
import random
import string
import re
import numpy as np
from gibberish_detector import detector
import requests
from io import BytesIO
import urllib.request
import nltk
from flask import Flask
# Fixed UI strings (input placeholder, app name, delivery status) that OCR
# picks up from the screenshot but that are not conversation content.
_UI_NOISE_RE = re.compile(
    r"Type a message Send|Snapchat|Ne Sent|Delivered|Message\n\n"
)
# Clock-style "H:MM" / "HH:MM" text; any line containing one is discarded.
_CLOCK_TIME_RE = re.compile(r"(1[0-2]|0?[1-9]):([0-5][0-9])")


def _is_ui_noise(text):
    """Return True when an OCR'd line is UI/timestamp text, not a message."""
    return (
        text in ('0 ', '\x0c')
        or _UI_NOISE_RE.search(text) is not None
        or _CLOCK_TIME_RE.search(text) is not None
    )


def imageToText(url, *, partner_max_x=200):
    """Extract chat messages from a messaging-app screenshot at *url*.

    The image is downloaded, Tesseract locates the text lines on a
    desaturated copy, and each line is cropped (with a small margin) from
    the OpenCV-decoded image and recognised with pytesseract. Lines that the
    gibberish detector rejects, or that match known UI/timestamp patterns,
    are dropped.

    Args:
        url: Address of the screenshot image.
        partner_max_x: Horizontal pixel threshold. A text line whose left
            edge is strictly below this value is attributed to ``'Partner'``;
            every other line is attributed to ``'Me'``. Defaults to 200.

    Returns:
        A list of ``{"body": <text>, "sender": 'Partner' | 'Me'}`` dicts in
        the order Tesseract reported the text lines.

    Raises:
        ValueError: If OpenCV cannot decode the downloaded bytes.
    """
    import logging

    log = logging.getLogger(__name__)

    # gibberish_detector model used to reject OCR noise.
    gibberish = detector.create_from_model('big.model')

    # Fetch the screenshot once and decode the same bytes with both PIL
    # (for layout analysis) and OpenCV (for cropping).
    image_bytes = requests.get(url).content

    # Saturation 0 gives Tesseract a grayscale-looking RGB image.
    rgb_image = Image.open(BytesIO(image_bytes)).convert('RGB')
    desaturated = ImageEnhance.Color(rgb_image).enhance(0)

    frame = cv2.imdecode(
        np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_UNCHANGED
    )
    if frame is None:
        raise ValueError(f"OpenCV could not decode the image at {url!r}")

    # IMREAD_UNCHANGED may yield a 2-D (grayscale) array, so only the first
    # two dimensions are read instead of unpacking three.
    height, width = frame.shape[:2]
    margin = int(0.01 * width)

    # The context manager releases the native Tesseract handle; the boxes it
    # returns are plain Python objects and remain usable afterwards.
    with tesserocr.PyTessBaseAPI(path='tessdata') as api:
        api.SetImage(desaturated)
        line_boxes = api.GetComponentImages(tesserocr.RIL.TEXTLINE, True)

    messages = []
    for _, box, _, _ in line_boxes:
        x, y, w, h = box['x'], box['y'], box['w'], box['h']
        crop = frame[
            max(0, y - margin):min(y + h + margin, height),
            max(0, x - margin):min(x + w + margin, width),
        ]
        raw_text = pytesseract.image_to_string(crop)
        text = raw_text.replace('|', 'I').replace('\n\x0c', ' ')

        # Stay best-effort per line: a detector failure skips the line
        # instead of aborting the whole screenshot, but it is now logged.
        try:
            rejected = gibberish.is_gibberish(text)
        except Exception:
            log.warning("Gibberish check failed for %r; skipping line",
                        text, exc_info=True)
            continue

        if rejected or _is_ui_noise(text):
            continue

        sender = 'Partner' if x < partner_max_x else 'Me'
        messages.append({"body": text, "sender": sender})

    return messages
# Demo run: parse a sample chat screenshot and print the extracted messages.
_SAMPLE_SCREENSHOT_URL = 'https://b2358461.smushcdn.com/2358461/wp-content/uploads/2021/08/IMG_9559-710x1536.jpeg?lossy=1&strip=1&webp=1'
print(imageToText(_SAMPLE_SCREENSHOT_URL))