OpenCV: Finding columns in an Arabic journal (Python)


I am new to OpenCV and also new to Python. I have tried to stitch together code found online to solve my research problem. I have an Arabic journal from 1870 with hundreds of pages; each page contains two columns and has a thick black border. I would like to extract the two columns as image files so I can run OCR on them separately, while ignoring the header and footer. Here is an example page:

Page 3

I have ten pages of the original print as separate png files, and I wrote the following script to process each of them. It works as expected on 2 of the 10 pages, but fails to produce the columns on the other 8. I don't understand the functions well enough to know where I could tweak the values, or whether my whole approach is misguided. I figured the best way to learn is to ask the community how you would solve this problem.

import cv2

def cutpage(fname, pnum):
    image = cv2.imread(fname)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    # Inverted binary threshold; Otsu picks the threshold value automatically
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # Dilate with a tall kernel so lines of text merge into solid blocks
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 13))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    dilatename = "temp/dilate" + str(pnum) + ".png"
    cv2.imwrite(dilatename, dilate)
    # findContours returns 2 or 3 values depending on the OpenCV version
    cnts = cv2.findContours(dilate, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    # Sort contours left to right by the x coordinate of their bounding box
    cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[0])

    fullpage = 1
    column = 1
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if h > 300 and w > 20:  # keep only large regions
            if (h / w) < 2.5:
                print("Found full page: ", x, y, w, h)
                filename = "temp/p" + str(pnum) + "-full" + str(fullpage) + ".png"
                fullpage += 1
            else:
                print("Found column: ", x, y, w, h)
                filename = "temp/p" + str(pnum) + "-col" + str(column) + ".png"
                column += 1
            roi = image[y:y+h, x:x+w]
            cv2.imwrite(filename, roi)
    return column - 1

for nr in range(10):
    filename = "p" + str(nr) + ".png"
    print("Checking page", nr)
    diditwork = cutpage(filename, nr)
    print("Found", diditwork, "columns")

Following a tutorial, I create a blurred and dilated binary inverse so that the different rectangular areas can be identified by their large white regions. I also save a copy of each dilated version so I can see what it looks like; here is the page above after processing:

Page 3 dilated

The "for c in cnts" loop is supposed to find the large rectangular areas in the image. If the height-to-width ratio is less than 2.5, I get a full page (without header and footer, which works well); if the ratio is greater than that, I know it is a column, and it gets saved, e.g. as temp/p2-col2.png.

I do get some nice full pages without header and footer, that is, just the content inside the large black border, but not cut into columns. On 2 of the 10 pages I get what I want, namely:

Successful column from page 2

Since I sometimes get the desired result, something must be working, but I don't know how to improve it further.
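One hedged debugging sketch (not part of my script above, just something to see why pages fail): drawing every candidate bounding rectangle inside cutpage(), right after the contours are sorted, shows whether the two columns merge into one block after dilation.

# Hedged debugging sketch: inside cutpage(), after cnts is computed, draw
# every bounding box on a copy of the page to see what the contours cover.
debug = image.copy()
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    cv2.rectangle(debug, (x, y), (x + w, y + h), (0, 0, 255), 3)  # red boxes
cv2.imwrite("temp/debug" + str(pnum) + ".png", debug)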

Edit:

Here are some more example pages:

P0

P1

P5

opencv image-processing ocr
1 Answer

I tried something without any dilation, because I wanted to see if I could just use the middle line as a "separator". Here is the code:

im = cv2.cvtColor(cv2.imread("arabic.png"), cv2.COLOR_BGR2RGB) # read im as rgb for better plots
gray = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) # convert to gray
_, threshold = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV) # inverse thresholding
contours, _ = cv2.findContours(threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) # find contours
sortedContours = sorted(contours, key = cv2.contourArea, reverse=True) # sort according to area, descending
bigBox = sortedContours[0] # get the contour of the big box
middleLine = sortedContours[1] # get the contour of the vertical line
xMiddleLine, _, _, _ = cv2.boundingRect(middleLine) # get x coordinate of middleLine
leftBoxContour = np.array([point for point in bigBox if point[0, 0] < xMiddleLine]) # assign left of line as points from the big contour
rightBoxContour = np.array([point for point in bigBox if point[0, 0] >= xMiddleLine]) # assign right of line as points from the big contour
leftBoxX, leftBoxY, leftBoxW, leftBoxH = cv2.boundingRect(leftBoxContour) # get properties of box on left
rightBoxX, rightBoxY, rightBoxW, rightBoxH = cv2.boundingRect(rightBoxContour) # get properties of box on right
leftBoxCrop = im[leftBoxY:leftBoxY + leftBoxH, leftBoxX:leftBoxX + leftBoxW] # crop left 
rightBoxCrop = im[rightBoxY:rightBoxY + rightBoxH, rightBoxX:rightBoxX + rightBoxW] # crop right
# maybe add your assertions about aspect ratio here?
cv2.imwrite("right.png", rightBoxCrop) # save image
cv2.imwrite("left.png", leftBoxCrop) # save image

I did not use any assertions about the aspect ratio, so maybe that is still something you need to do...
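For instance, a minimal sketch of such an assertion, reusing the h/w > 2.5 column heuristic from your question (the threshold is an assumption to tune):

# Hedged sketch: warn when a crop doesn't look like a column, reusing the
# h/w > 2.5 heuristic from the question; the threshold is an assumption.
for name, crop in [("left", leftBoxCrop), ("right", rightBoxCrop)]:
    h, w = crop.shape[:2]
    if h / w < 2.5:
        print(f"Warning: {name} crop has aspect ratio {h / w:.2f}, more like a full page than a column")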

Basically, the most important lines in this approach are the ones that generate the left and right contours based on the x coordinate. This is the final result I got:

There are still some black parts on the edges, but that shouldn't be a problem for OCR.
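If those black edges ever do confuse the OCR, a hedged sketch that shaves a fixed margin off each crop (the 15 pixel margin is an assumption, not measured from your scans):

# Hedged sketch: trim a fixed margin to drop the residual black border.
# The margin value is an assumption; adjust it to your scans.
margin = 15
leftBoxCrop = leftBoxCrop[margin:-margin, margin:-margin]
rightBoxCrop = rightBoxCrop[margin:-margin, margin:-margin]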

FYI: I use the following packages in jupyter:

import cv2
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt

V2.0: done using only the detection of the big box:

So I did some dilating, and the big box is easily detected. I used a horizontal kernel to make sure that the vertical line of the big box is always thick enough to be detected. However, I couldn't solve the problem of the middle line, because it is very thin... Nevertheless, here is the code for the approach described above:

im = cv2.cvtColor(cv2.imread("1.png"), cv2.COLOR_BGR2RGB) # read im as rgb for better plots
gray = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) # convert to gray
gray[gray<255] = 0 # add some contrast to make it either completely black or white
_, threshold = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV) # inverse thresholding
thresholdDilated = cv2.dilate(threshold, np.ones((1,10)), iterations = 1) # dilate horizontally
contours, _ = cv2.findContours(thresholdDilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) # find contours
sortedContours = sorted(contours, key = cv2.contourArea, reverse=True) # sort according to area, descending
x, y, w, h = cv2.boundingRect(sortedContours[0]) # get the bounding rect properties of the contour
left = im[y:y+h, x:x+int(w/2)+10].copy() # generate left, I included 10 pix from the right just in case
right = im[y:y+h, x+int(w/2)-10:x+w].copy() # and right, offset by x like the left crop; 10 pix overlap just in case
fig, ax = plt.subplots(nrows = 2, ncols = 3) # plotting...
ax[0,0].axis("off")
ax[0,1].imshow(im)
ax[0,1].axis("off")
ax[0,2].axis("off")
ax[1,0].imshow(left)
ax[1,0].axis("off")
ax[1,1].axis("off")
ax[1,2].imshow(right)
ax[1,2].axis("off")

These are the results, and you can notice it's not perfect, but again, since your aim is OCR, this shouldn't be a problem.

Let me know if this is OK; if not, I'll rack my brain for a better solution...
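If you do want to recover the thin middle line itself instead of splitting at half the width, one hedged option (a sketch, not something I tested on your scans) is a morphological opening with a tall, thin vertical kernel, which keeps long vertical strokes and erases the text:

# Hedged sketch: isolate long vertical strokes (the box borders and, with
# luck, the middle line) by opening with a tall, 1-pixel-wide kernel.
# "threshold" is the inverse-thresholded image from the code above.
verticalKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 100))
verticalLines = cv2.morphologyEx(threshold, cv2.MORPH_OPEN, verticalKernel)
lineContours, _ = cv2.findContours(verticalLines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
# The x coordinate of the detected line closest to the centre of the big box
# would then be a better split point than w/2.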

V3.0: a better way to get straighter images, which will improve the quality of the OCR.

Inspired by another answer of mine here: answer. It makes sense to straighten the images so that the OCR gets better results. Therefore, I used a four point transform on the detected outer box. This will straighten the image slightly and make the text more horizontal. Here is the code:

im = cv2.cvtColor(cv2.imread("2.png"), cv2.COLOR_BGR2RGB) # read im as rgb for better plots
gray = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) # convert to gray
gray[gray<255] = 0 # add some contrast to make it either completely black or white
_, threshold = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV) # inverse thresholding
thresholdDilated = cv2.dilate(threshold, np.ones((1,10)), iterations = 1) # dilate horizontally
contours, _ = cv2.findContours(thresholdDilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) # find contours
largest_contour = max(contours, key = cv2.contourArea) # get largest contour
hull = cv2.convexHull(largest_contour) # get the hull
epsilon = 0.02 * cv2.arcLength(largest_contour, True) # epsilon
pts1 = np.float32(cv2.approxPolyDP(hull, epsilon, True).reshape(-1, 2)) # get the points
result = four_point_transform(im, pts1) # using imutils
height, width = result.shape[:2] # get the dimensions of the transformed image
left = result[:, 0:int(width/2)].copy() # from the beginning to half the width
right = result[:, int(width/2): width].copy() # from half the width till the end
fig, ax = plt.subplots(nrows = 2, ncols = 3) # plotting...
ax[0,0].axis("off")
ax[0,1].imshow(result)
ax[0,1].axvline(width/2)
ax[0,1].axis("off")
ax[0,2].axis("off")
ax[1,0].imshow(left)
ax[1,0].axis("off")
ax[1,1].axis("off")
ax[1,2].imshow(right)
ax[1,2].axis("off") 

With the following packages included:

import cv2
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
from imutils.perspective import four_point_transform

As you can see from the code, this is a better approach: thanks to the four point transform, you can force the image to be centered and horizontal. Moreover, there is no need to include any overlap, because the images are separated well. Here is an example:
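And since the end goal is OCR, for completeness, a hedged sketch of feeding the two crops to Tesseract via pytesseract (assuming Tesseract is installed with the Arabic language data; pytesseract is not used anywhere above):

import cv2
import pytesseract  # assumption: pytesseract and Tesseract's "ara" data are installed

left = cv2.imread("left.png")
right = cv2.imread("right.png")
# Arabic reads right to left, so the right column usually comes first.
print(pytesseract.image_to_string(right, lang="ara"))
print(pytesseract.image_to_string(left, lang="ara"))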
