我正在尝试对这个 .png 图像进行 OCR 识别,但在把识别结果拆分成列时遇到了问题。具体来说,在怀俄明州的生产数据中,我想把租约(lease)、公司和县分成不同的列。这是一份历史文档,格式有些特殊:缩进在每一页都不一样。除了为每一页单独编写代码并裁剪特定区域之外,我不知道如何编写能够适应公司和租约缩进变化的代码。以下是我在 RStudio 中编写的代码。
library(tesseract)
library(tidyverse)
# Create the English Tesseract engine once so it can be handed to the OCR call.
eng <- tesseract("eng")
# Run OCR on the PNG with the engine built above; previously `eng` was created
# but never used, so changing the engine had no effect on the result.
text <- tesseract::ocr("test_page_75_crop_table.png", engine = eng)
# Print the raw recognised text for visual inspection.
cat(text)
# Split the OCR output on newlines and store one line per row in column `data`.
dataframe_test <- text %>%
  str_split(pattern = "\n") %>%
  unlist() %>%
  tibble(data = .)
我已经能够借助 Word 软件的 OCR 功能,用以下代码提取出表格:
library(RDCOMClient)
library(magick)
################################################
#### Step 1 : We convert the image to a PDF ####
################################################
# Hard-coded locations of the intermediate PDF, the source image and the
# Word document written in a later step.
path_PDF <- "D:\\temp2.pdf"
path_PNG <- "D:\\A09Yb.png"
path_Word <- "D:\\temp2.docx"
# Read the image before opening the device, so a failed read cannot leave a
# PDF device open.
im <- image_read(path_PNG)
pdf(path_PDF, height = 16, width = 12)
# Always close the device, even when plotting fails; otherwise the PDF stays
# open and unfinished on disk.
invisible(tryCatch(
  plot(im),
  finally = dev.off()
))
####################################################################
#### Step 2 : We use the OCR of Word to convert the PDF to word ####
####################################################################
# Start Word through COM, shown on screen and with alert dialogs disabled.
wordApp <- COMCreate("Word.Application")
wordApp[["Visible"]] <- TRUE
wordApp[["DisplayAlerts"]] <- FALSE
# Open the PDF via the application's Documents collection without the
# conversion confirmation prompt, then save the opened document to path_Word.
word_documents <- wordApp[["Documents"]]
pdf_full_path <- normalizePath(path_PDF)
doc <- word_documents$Open(pdf_full_path, ConfirmConversions = FALSE)
doc$SaveAs2(path_Word)
##############################################################
#### Step 3 : We extract the table from the Word document ####
##############################################################
# Number of tables Word reports for the opened document.
nb_Tables <- doc$tables()$count()
# Preallocate one slot per table; each slot receives a character matrix.
list_Table <- vector("list", nb_Tables)
# seq_len() skips the loop when a count is 0; `1 : 0` would have iterated
# over c(1, 0) and queried a table that does not exist.
for (l in seq_len(nb_Tables)) {
  print(l)
  # Fetch the table object once instead of on every cell access.
  current_table <- doc$tables(l)
  nb_Row <- current_table$Rows()$Count()
  nb_Col <- current_table$Columns()$Count()
  mat_Temp <- matrix(NA_character_, nrow = nb_Row, ncol = nb_Col)
  for (i in seq_len(nb_Row)) {
    for (j in seq_len(nb_Col)) {
      # A cell that cannot be read raises an error (presumably merged or
      # missing cells -- TODO confirm); record it as NA and keep going.
      mat_Temp[i, j] <- tryCatch(
        current_table$cell(i, j)$range()$text(),
        error = function(e) NA_character_
      )
    }
  }
  list_Table[[l]] <- mat_Temp
}
# Print the extracted tables.
list_Table
[[1]]
[,1] [,2]
[1,] "FIELD\r\a" "COUNTr\r\a"
[2,] "HESI URIC.N OIL\r\a" "Na TRONa FRCNT\r\a"
[3,] "UNICN on.\rLEESES\rPc1SG\". SPRING CREEK\rSKELLY OIL\rTRUE\rLEZSES\rFCLECzr\rYULE CREEK O IL\rL \"SES\r\a" "NAIRONA\rC27311\rLANKER\rTER\r\a"
[4,] NA NA
[5,] "CREEK\r\a" "CROOK\r\a"
[6,] "\tIN \tSHELL-SCHL'R ICHI\r\a" NA
[7,] "OCR\r\a" ""
[8,] "\r\a" ""
[9,] "GREET NCRTFERN\rLEASES\rPCM ELL neaNLcNEc LEZSES\r\a" "ICHT\rCONVERSE\r\a"
[10,] "GIL \tGAS FUTURES LECSES\r\a" "CEMPBELL\r\a"
[11,] "" "CRCOK\r\a"
[12,] "" "SkEETbATER\r\a"
[,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "\r\a" "\r\a" "\r\a" "\r\a" "\r\a" "\r\a" "\r\a"
[2,] "\r\a" "\r\a" "33N\r\a" "IRKS\r\a" "30\r\a" "938\r\a" "\r\a"
[3,] "\r\a" "11\r21\r\a" "3LN E4k CCYE\r\a" "\r\a" "388\r439\r\a" "12, 040\r422\r502\r\a" "9. 632\r9,632\r\a"
[4,] NA NA "c3H\r\a" "PDCY\r\a" NA NA NA
[5,] "20\r\a" "\r\a" "IG5s\r\a" "FhLS\rFN'LS\r\a" "EFPL\rBFPL\r\a" "135\r109\r\a" "543 les 400\r3E6\r\a"
[6,] "e scN 67k 5CN\r50N SES\r\a" NA NA NA NA NA NA
[7,] NA NA NA NA NA NA NA
[8,] "\r\a" "51N SEB\r\a" "YhLS\r\a" "EFPL\r\a" "\r\a" "2, 779\r\a" "\r\a"
[9,] "\r\a" "" "PNLS\r\a" "eFPt\r\a" "231\r692\r\a" "7, 153\r21, 450\r\a" "\r\a"
[10,] "\r\a" "Lc;ec\r12 52N\r\a" "\r\a" "TRKS\r\a" "\r\a" "561\r561\r\a" "\r\a"
[11,] "\r\a" "1960\r6EH\r\a" "\r\a" "\r\a" "\r\a" "\r\a" "697\r\a"
[12,] "\r\a" "54B 6EH\r7 53N\r11\r\a" "BCSL hCSL\rFAIR\r\a" "\r\a" "107\r\a" "913\r3,325\r\a" "522\r1. 219\r\a"
[,10] [,11] [,12]
[1,] "\r\a" "\r\a" NA
[2,] "183,274\r\a" "12.910\r\a" NA
[3,] "12,040 eco,\rfie,950\r21,01E\r122,004\r7.159\r\a" "9 ,632\r39 ,940\r20 ,449\r391 ,660\r\a" NA
[4,] NA NA NA
[5,] "\r\a" "zye,03e\r206,219\r\a" "\r\a"
[6,] NA NA NA
[7,] NA NA NA
[8,] "103 ,ose\r\a" "\r\a" NA
[9,] "241,272\r1 .225\r1 ,225\r\a" "\r\a" NA
[10,] "43,739\rEE,E03\r\a" "1 ,300\r1 ,300\r\a" NA
[11,] "2CC,E24\r\a" "71 *590\r\a" NA
[12,] "24,E97\r252,718\r\a" "56 ,973\r44,557\r173,120\r615 .77B\r615,778\r\a" NA