我正在用 Python 做一个项目,需要扫描一张图像,并在保留输入图像原始布局的前提下把其中的内容写入 Word 文件。对于扫描图像,我使用
pytesseract
并在原始图像上绘制轮廓,然后根据轮廓的 x、y 坐标从左到右对轮廓进行排序,再把文本写入 Word 文档。但结果很差(如果两个轮廓在水平方向上相邻,我的方法会把它们按垂直顺序依次输出)。有没有更好的方法可以做到这一点?
这是我的代码:
# Create the output Word document and load the scanned page as a grayscale image.
document = Document()
path = "C:/xampp/htdocs/implementation/"
image = 'detecttable.jpg'
img = cv2.imread(path + image, 0)
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None, not by raising.
    raise FileNotFoundError(path + image)

# OCR the whole page once and pass the text to nlp (defined elsewhere in the project).
entire_tesseract_response = image_to_string(img)
entireText = nlp(entire_tesseract_response)

# done pre-processing and results are assigned here
dilation, enhanceimage = preprocessing(img)
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3))
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
# findContours returns a tuple in recent OpenCV releases, so build a sorted list
# instead of calling .sort() in place.
contours = sorted(contours, key=lambda c: get_contour_precedence(c, img.shape[1]))

# White single-channel canvas with the same height/width as the page, used to draw the contours.
bgr = np.ones((img.shape[0], img.shape[1]), dtype='uint8') * 255
# Draw the contours that were just found (the original referenced an undefined name `cnts`).
cv2.drawContours(bgr, contours, -1, (0, 0, 255), 1)

areaThr = 61000  # let suppose if any countour is greater than 61000 then it will be a table.

# State carried across iterations of the contour loop that follows.
i = 0
number_of_tables = 0
data1 = []      # every OCR'd text fragment, in processing order
ypre = 0        # y of the previously handled text contour
xpre = 0        # x of the previously handled text contour
datapre = ""    # text of the previously handled text contour
p = ""          # placeholder; replaced by a paragraph object once one is added
# Walk the sorted contours: a contour whose area exceeds areaThr is treated as a
# table and written as a Word table; every other contour is OCR'd as plain text
# and appended to the document as a paragraph or run.
# NOTE(review): all indentation was lost in this listing; the nesting of the
# if/else/for bodies below must be restored before the script can run.
for cnt in contours:
x, y, width, height = cv2.boundingRect(cnt)
area = cv2.contourArea(cnt)
# --- table branch ---
if (area > areaThr):
number_of_tables=number_of_tables+1
i = i + 1
# Crop the table region out of the page (the slice drops the last row and column).
table = img[y:y+height-1, x:x+width-1]
# table_processing is defined elsewhere; its second result is used as the column count.
data,vertical=table_processing(table)
cols= vertical
# NOTE(review): `data` is divided like a number here and then passed to imageReader
# on the next line — confirm what table_processing actually returns.
rows1 = int(data/vertical)
data_images=imageReader(data)
# Start with zero rows; rows are appended one at a time in the loop below.
# `table` is rebound from the cropped image to the Word table object.
table = document.add_table(0, cols)
table.style = 'TableGrid'
# itera indexes data_images across all rows and columns.
itera=0
# NOTE(review): this loop variable overwrites the `i` incremented above.
for i in range(rows1):
tableimg=1
row_cells = table.add_row().cells
for j in range(cols):
# Writes each cell image to "<itera>.png" in the current working directory.
cv2.imwrite(str(itera)+".png",data_images[itera])
# Upscale the cell 4x with cubic interpolation before running OCR on it.
roi = cv2.resize(data_images[itera],None,fx=4, fy=4, interpolation = cv2.INTER_CUBIC)
text = image_to_string(roi)
row_cells[j].text = text
data1.append(text)
itera+=1
# --- text branch ---
else:
table = img[y:y+height-1, x:x+width-1]
roi = cv2.resize(table,None,fx=4, fy=4, interpolation = cv2.INTER_CUBIC)
text = image_to_string(roi)
# When OCR returns nothing, charsegment is called, but its result `segment` is
# not used: the line that consumed it is commented out.
if not text:
segment=charsegment(roi)
# text = " "+image_to_string(segment)
data1.append(text)
# Same y as the previous contour: treat it as a continuation of the same line and
# pad with one space per 7 pixels of x - xpre (measured between left edges).
# NOTE(review): y==ypre is an exact comparison; contours on the same visual line
# with slightly different y will not match — presumably a tolerance is wanted.
if ypre!=0 and y==ypre:
space = x-xpre
space = int(space/7)
datapace=""
for i in range(space):
datapace=datapace+" "
text = datapace+text
# Appends to the paragraph created in an earlier iteration.
p.add_run(text)
else:
# Otherwise choose paragraph alignment from the contour's x position:
# x < 100 -> 0, 100..300 -> 1, above 300 -> 2
# (presumably left / center / right in python-docx — confirm).
if x<100:
# A previous fragment longer than 70 characters is continued in the same
# paragraph; otherwise a new paragraph is started.
if len(datapre)<=70:
p=document.add_paragraph(text)
else:
# NOTE(review): as listed, this condition is always true here (it is the
# negation of the one above), so the innermost else looks unreachable.
if(len(datapre))>70:
p.add_run(text)
else:
p=document.add_paragraph(text)
p.alignment = 0
elif x>=100 and x<=300:
p = document.add_paragraph(text)
p.alignment = 1
else:
p = document.add_paragraph(text)
p.alignment = 2
# Remember this contour's position and text for the next iteration.
ypre= y
xpre = x
datapre=text
# Build the output path from the image name and save the document.
# NOTE(review): with image set to 'detecttable.jpg', name[0].split("/") yields a
# single element, so filePath[2] raises IndexError; the expression appears to
# expect an image path with at least three "/"-separated components.
name = image.split(".")
filePath = name[0].split("/")
fileName = filePath[0]+"/wordFiles/"+filePath[2]+".docx"
# Re-runs OCR on the whole page and rebinds entireText; the value is not used
# again in the visible code.
entireText = image_to_string(img)
document.save(fileName)
# Reopen the file that was just saved.
doc = docx.Document(fileName)
为此可以使用 Tesseract OCR 的 hOCR 输出格式(其中包含每个词和每一行的边界框坐标)。
你找到解决这个问题的方法了吗?我无法在 docx 文件中得到正确的排版。