Detecting text regions in an image using OpenCV

Question (Votes: 0, Answers: 4)

I have an image and want to detect the text regions in it.

I tried the TiRG_RAW_20110219 project, but its results are not satisfactory. Given this input image: input image

it produces the following output: output image

Can anyone suggest an alternative? I want to improve tesseract's output by sending it only the text regions as input.

python image opencv image-processing python-tesseract
4 Answers

67 votes

import cv2


def captch_ex(file_name):
    img = cv2.imread(file_name)

    img_final = cv2.imread(file_name)
    img2gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, mask = cv2.threshold(img2gray, 180, 255, cv2.THRESH_BINARY)
    image_final = cv2.bitwise_and(img2gray, img2gray, mask=mask)
    ret, new_img = cv2.threshold(image_final, 180, 255, cv2.THRESH_BINARY)  # for black text, use cv2.THRESH_BINARY_INV
    # the thresholding above removes the noisy portions and keeps the bright text regions
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))  # kernel shape steers the dilation: a larger x dilates more horizontally, a larger y more vertically
    dilated = cv2.dilate(new_img, kernel, iterations=9)  # more iterations mean more dilation


    contours, hierarchy = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)  # OpenCV 4.x returns (contours, hierarchy); OpenCV 3.x returned (image, contours, hierarchy)

    for contour in contours:
        # get rectangle bounding contour
        [x, y, w, h] = cv2.boundingRect(contour)

        # Don't plot small false positives that aren't text
        if w < 35 and h < 35:
            continue

        # draw rectangle around contour on original image
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 255), 2)

        '''
        # you can crop each region and send it to OCR; false detections will simply return no text :)
        # (initialize index = 0 before the loop if you enable this)
        cropped = img_final[y:y + h, x:x + w]

        s = 'crop_' + str(index) + '.jpg'
        cv2.imwrite(s, cropped)
        index = index + 1
        '''
    # write original image with added contours to disk
    cv2.imshow('captcha_result', img)
    cv2.waitKey()


file_name = 'your_image.jpg'
captch_ex(file_name)
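
Note: the unpacking above matches OpenCV 4.x, where cv2.findContours returns two values; OpenCV 3.x returned three. A minimal version-agnostic sketch:

# Works on both OpenCV 3.x (image, contours, hierarchy) and
# OpenCV 4.x (contours, hierarchy): the contours are always second-to-last
ret = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
contours = ret[-2]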

Click to see result

Click to see result


31 votes

Since no one has posted a complete solution, here's an approach. Observing that the desired text is white and that the words are arranged horizontally, we can use color segmentation to extract and OCR the letters.

  1. Perform color segmentation. We load the image, convert it to HSV format, define lower/upper ranges, and perform color segmentation with cv2.inRange() to obtain a binary mask.

  2. Dilate to connect text characters. We create a horizontally shaped kernel with cv2.getStructuringElement(), then dilate with cv2.dilate() to combine individual letters into a single contour.

  3. Remove non-text contours. We find contours with cv2.findContours() and filter using the aspect ratio to remove non-text characters. Since the text is horizontal, any contour that falls below a predefined aspect-ratio threshold is removed by filling it in with cv2.drawContours().

  4. Perform OCR. We bitwise-and the dilated image with the initial mask to isolate only the text characters, and invert the image so the text is black on a white background. Finally, we feed the image into Pytesseract OCR.

Here is a visualization of each step:

Input image

Mask generated by color segmentation

# Load image, convert to HSV format, define lower/upper ranges, and perform
# color segmentation to create a binary mask
image = cv2.imread('1.jpg')
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
lower = np.array([0, 0, 218])
upper = np.array([157, 54, 255])
mask = cv2.inRange(hsv, lower, upper)

Dilated image connecting the text contours, with non-text contours removed using aspect-ratio filtering

# Create horizontal kernel and dilate to connect text characters
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,3))
dilate = cv2.dilate(mask, kernel, iterations=5)

# Find contours and filter using aspect ratio
# Remove non-text contours by filling in the contour
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    ar = w / float(h)
    if ar < 5:
        cv2.drawContours(dilate, [c], -1, (0,0,0), -1)

Bitwise-and the two masks and invert to get the image ready for OCR

# Bitwise dilated image with mask, invert, then OCR
result = 255 - cv2.bitwise_and(dilate, mask)
data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
print(data)

Result from Pytesseract OCR using the --psm 6 configuration setting to assume a uniform block of text. See here for more configuration options; a quick sketch comparing a few modes follows the sample output below.

All women become
like their mothers.
That is their tragedy.
No man does.

That's his.

OSCAR WILDE
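
If --psm 6 doesn't match your layout, an illustrative sketch (not part of the original answer) for comparing a few page segmentation modes on the same result image:

# Compare a few Tesseract page segmentation modes; pick the one that reads best
for psm in (3, 6, 11):
    text = pytesseract.image_to_string(result, lang='eng', config='--psm {}'.format(psm))
    print('psm', psm, '->', repr(text))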

Full code

import cv2
import numpy as np
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Load image, convert to HSV format, define lower/upper ranges, and perform
# color segmentation to create a binary mask
image = cv2.imread('1.jpg')
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
lower = np.array([0, 0, 218])
upper = np.array([157, 54, 255])
mask = cv2.inRange(hsv, lower, upper)

# Create horizontal kernel and dilate to connect text characters
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,3))
dilate = cv2.dilate(mask, kernel, iterations=5)

# Find contours and filter using aspect ratio
# Remove non-text contours by filling in the contour
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    ar = w / float(h)
    if ar < 5:
        cv2.drawContours(dilate, [c], -1, (0,0,0), -1)

# Bitwise dilated image with mask, invert, then OCR
result = 255 - cv2.bitwise_and(dilate, mask)
data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
print(data)

cv2.imshow('mask', mask)
cv2.imshow('dilate', dilate)
cv2.imshow('result', result)
cv2.waitKey()

Use this HSV color thresholding script to determine the lower/upper HSV color ranges

import cv2
import numpy as np

def nothing(x):
    pass

# Load image
image = cv2.imread('1.jpg')

# Create a window
cv2.namedWindow('image')

# Create trackbars for color change
# Hue is from 0-179 for Opencv
cv2.createTrackbar('HMin', 'image', 0, 179, nothing)
cv2.createTrackbar('SMin', 'image', 0, 255, nothing)
cv2.createTrackbar('VMin', 'image', 0, 255, nothing)
cv2.createTrackbar('HMax', 'image', 0, 179, nothing)
cv2.createTrackbar('SMax', 'image', 0, 255, nothing)
cv2.createTrackbar('VMax', 'image', 0, 255, nothing)

# Set default value for Max HSV trackbars
cv2.setTrackbarPos('HMax', 'image', 179)
cv2.setTrackbarPos('SMax', 'image', 255)
cv2.setTrackbarPos('VMax', 'image', 255)

# Initialize HSV min/max values
hMin = sMin = vMin = hMax = sMax = vMax = 0
phMin = psMin = pvMin = phMax = psMax = pvMax = 0

while(1):
    # Get current positions of all trackbars
    hMin = cv2.getTrackbarPos('HMin', 'image')
    sMin = cv2.getTrackbarPos('SMin', 'image')
    vMin = cv2.getTrackbarPos('VMin', 'image')
    hMax = cv2.getTrackbarPos('HMax', 'image')
    sMax = cv2.getTrackbarPos('SMax', 'image')
    vMax = cv2.getTrackbarPos('VMax', 'image')

    # Set minimum and maximum HSV values to display
    lower = np.array([hMin, sMin, vMin])
    upper = np.array([hMax, sMax, vMax])

    # Convert to HSV format and color threshold
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, lower, upper)
    result = cv2.bitwise_and(image, image, mask=mask)

    # Print if there is a change in HSV value
    if((phMin != hMin) | (psMin != sMin) | (pvMin != vMin) | (phMax != hMax) | (psMax != sMax) | (pvMax != vMax) ):
        print("(hMin = %d , sMin = %d, vMin = %d), (hMax = %d , sMax = %d, vMax = %d)" % (hMin , sMin , vMin, hMax, sMax , vMax))
        phMin = hMin
        psMin = sMin
        pvMin = vMin
        phMax = hMax
        psMax = sMax
        pvMax = vMax

    # Display result image
    cv2.imshow('image', result)
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

cv2.destroyAllWindows()
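
Run the script, drag the sliders until only the text remains visible in the preview window, then copy the printed (hMin, sMin, vMin) / (hMax, sMax, vMax) values into the lower/upper arrays above.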

6 votes

If you don't mind getting your hands dirty, you could try growing those text regions into one bigger rectangular region and feeding that to tesseract all at once.

I'd also suggest thresholding the image several times and feeding each result to tesseract separately to see if that helps. You can compare the output against dictionary words to automatically determine whether a particular OCR result is good; a rough sketch of that idea follows.
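
A minimal sketch of the multi-threshold idea, assuming pytesseract is installed and a word list exists at /usr/share/dict/words (both are assumptions, not part of this answer):

import cv2
import pytesseract

# load a word list to score OCR output against (path is an assumption)
with open('/usr/share/dict/words') as f:
    dictionary = set(word.strip().lower() for word in f)

def score(text):
    # fraction of OCR'd words that appear in the dictionary
    words = [w.lower() for w in text.split() if w.isalpha()]
    return sum(w in dictionary for w in words) / len(words) if words else 0.0

gray = cv2.cvtColor(cv2.imread('your_image.jpg'), cv2.COLOR_BGR2GRAY)

# try several thresholds and keep the OCR result that scores best
best = max(
    (pytesseract.image_to_string(cv2.threshold(gray, t, 255, cv2.THRESH_BINARY)[1])
     for t in range(120, 221, 20)),
    key=score)
print(best)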


0 votes

You can use a deep-learning-based text detector called Efficient and Accurate Scene Text detector (EAST). It can be used with OpenCV functions, but first you need to download the trained model frozen_east_text_detection.pb. The following code and its comments are borrowed in full from here - text_detection.py. Remember to pass the downloaded .pb file into cv2.dnn.readNet().

Highlights:

  • The trained model is passed into cv2.dnn.readNet() as a .pb file.
  • The model only accepts images whose dimensions are multiples of 32. (Here the width and height of the input image are set to 320 by default.)
  • Two output layers are defined in layerNames, one for the probability of a region containing text and one for the bounding-box coordinates.
  • We cannot pass the image to the model the way we usually do for an OpenCV function. Every image is passed through cv2.dnn.blobFromImage(), where the image is treated as a blob. It undergoes mean subtraction, scaling, and channel swapping. More details here.
  • The input blob is passed to net.setInput() along with the output layers.
  • The output is a tuple of scores containing:
    • the probability of a region being text or not
    • the bounding-box coordinates of the text region
  • We filter out predictions below a certain probability.
  • For the remaining predictions, we perform non-maximum suppression to remove overlapping boxes.

For more code explanation, please refer here.

Code:

import cv2
import numpy as np
# non_max_suppression comes from the imutils package used by the tutorial
from imutils.object_detection import non_max_suppression

image = cv2.imread('path_to_image')
orig = image.copy()
(H, W) = image.shape[:2]

# set the new width and height and then determine the ratio in change
# for both the width and height
(newW, newH) = (320, 320)
rW = W / float(newW)
rH = H / float(newH)

# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H, W) = image.shape[:2]

# define the two output layer names for the EAST detector model that
# we are interested -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet('path_containing_frozen_east_text_detection.pb')

# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image, 1.0, (W, H), (123.68, 116.78, 103.94), swapRB=True, crop=False)
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)

# grab the number of rows and columns from the scores volume, then
# initialize our set of bounding box rectangles and corresponding
# confidence scores
(numRows, numCols) = scores.shape[2:4]
rects = []
confidences = []

# loop over the number of rows
for y in range(0, numRows):
    # extract the scores (probabilities), followed by the geometrical
    # data used to derive potential bounding box coordinates that
    # surround text
    scoresData = scores[0, 0, y]
    xData0 = geometry[0, 0, y]
    xData1 = geometry[0, 1, y]
    xData2 = geometry[0, 2, y]
    xData3 = geometry[0, 3, y]
    anglesData = geometry[0, 4, y]

    for x in range(0, numCols):
        # ignore probability values below 0.75
        if scoresData[x] < 0.75:
            continue

        # compute the offset factor as our resulting feature maps will
        # be 4x smaller than the input image
        (offsetX, offsetY) = (x * 4.0, y * 4.0)

        # extract the rotation angle for the prediction and then
        # compute the sin and cosine
        angle = anglesData[x]
        cos = np.cos(angle)
        sin = np.sin(angle)

        # use the geometry volume to derive the width and height of
        # the bounding box
        h = xData0[x] + xData2[x]
        w = xData1[x] + xData3[x]

        # compute both the starting and ending (x, y)-coordinates for
        # the text prediction bounding box
        endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
        endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
        startX = int(endX - w)
        startY = int(endY - h)

        # add the bounding box coordinates and probability score to
        # our respective lists
        rects.append((startX, startY, endX, endY))
        confidences.append(scoresData[x])

# apply non-maxima suppression to suppress weak, overlapping bounding
# boxes
boxes = non_max_suppression(np.array(rects), probs=confidences)

# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # scale the bounding box coordinates based on the respective
    # ratios
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)

    # draw the bounding box on the image
    cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

cv2.imwrite('path_to_save', orig)

Result:

Although the result is not quite as expected, it is pretty close.

Update:

To crop each individual bounding box and save it as an image, do the following:

# take a copy of the original image (note: orig already has the green
# rectangles drawn on it; copy before drawing if you need clean crops)
image2 = orig.copy()
for i, (startX, startY, endX, endY) in enumerate(boxes):
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)
    cropped = image2[startY:endY, startX:endX]
    cv2.imwrite(r'Cropped_result\crop_img_{}.jpg'.format(i), cropped)
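
To tie this back to the original question, a hedged follow-up (pytesseract is an assumption here, not part of the borrowed tutorial) that runs OCR on each saved crop:

import pytesseract

# OCR each saved crop; --psm 7 treats the crop as a single line of text
for i in range(len(boxes)):
    crop = cv2.imread(r'Cropped_result\crop_img_{}.jpg'.format(i))
    print(i, pytesseract.image_to_string(crop, config='--psm 7').strip())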
