Python-tesseract OCR 问题:分词不正确

问题描述 投票:0回答:1

Stack Overflow 社区您好,

在尝试识别“ADRIEL”一词时,我遇到了 Python-tesseract OCR 工具的问题。OCR 输出错误地将其分割为两个单独的单词:“[ADRI”和“EL”。

我使用的是这张图片:

我使用以下代码片段:

import pytesseract
import cv2

# Tell pytesseract where the Tesseract executable lives before any OCR call.
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\..."

# Read the picture from disk and collapse it to a single grayscale channel
# (cv2.imread yields BGR channel ordering, hence COLOR_BGR2GRAY).
source_bgr = cv2.imread("cnh_nome2.png")
grayscale = cv2.cvtColor(source_bgr, cv2.COLOR_BGR2GRAY)

# Run OCR with the "por" language data and print the recognized text.
print(pytesseract.image_to_string(grayscale, lang="por"))

输出为: [ADRI EL

python ocr tesseract python-tesseract
1个回答
0
投票

我用下面这个设置得到了正确的结果:

import pytesseract
import cv2

image_file = "Adriel.png"

# Load the input image as stored on disk (IMREAD_UNCHANGED keeps any extra
# channel such as alpha), then convert it from BGR to RGB channel ordering.
image = cv2.imread(image_file, cv2.IMREAD_UNCHANGED)
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None rather
    # than raising, which would otherwise surface as a cryptic cvtColor error.
    raise FileNotFoundError(f"Could not read image: {image_file}")
rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
print('Original Dimensions : ',image.shape)

scale_percent = 80  # percent of original size
width = int(rgb.shape[1] * scale_percent / 100)
height = int(rgb.shape[0] * scale_percent / 100)
dim = (width, height)  # cv2.resize takes the target size as (width, height)

# resize image
resized = cv2.resize(rgb, dim, interpolation=cv2.INTER_AREA)
print('Resized Dimensions : ',resized.shape)

# `resized` is in RGB order at this point, so the matching conversion code is
# COLOR_RGB2GRAY (COLOR_BGR2GRAY would swap the red and blue weights).
grayImage = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
# Binarize: pixels brighter than 175 become 240, everything else becomes 0.
(thresh, blackAndWhiteImage) = cv2.threshold(grayImage, 175, 240, cv2.THRESH_BINARY)  # 0...255
cv2.imshow('Black white image', blackAndWhiteImage)

# Configuration: page segmentation mode 6, OCR engine mode 3, and a character
# whitelist limited to the characters expected in this particular image.
options = r'--psm 6 --oem 3 -c tessedit_char_whitelist=21eNOMESBRADIL'

# OCR the input image using Tesseract
text_bw = pytesseract.image_to_string(blackAndWhiteImage, config=options)
print(text_bw)

# Persist the recognized text; write() stores the string in one call instead
# of iterating over it character by character.
with open("numbers.txt", 'w', encoding="utf-8") as f:
    f.write(text_bw)

# cv2.imshow interprets 3-channel images as BGR, so convert back for display.
cv2.imshow('Resized', cv2.cvtColor(resized, cv2.COLOR_RGB2BGR))
cv2.waitKey(0)
cv2.destroyAllWindows()

输出:

Original Dimensions :  (71, 513, 4)
Resized Dimensions :  (56, 410, 3)
2e1NOMEESOBRENOME
ADRIEL
© www.soinside.com 2019 - 2024. All rights reserved.