使用 iOS Vision Framework,我可以执行 OCR,并通过 VNRecognizedTextObservation 从图像中获取已识别的文本。
现在假设我有一张图像,其中有一些文本段落和一个表格。该表有许多列和与之关联的行(请参阅下图)。是否可以使用 Vision 识别表中特定列的键和值?
例如,我想使用 Vision 从下图中单独获取 2014 年零售销售数据。这个怎么做?我们可以同时使用 Vision 和 CoreML 来做到这一点吗?
是的,可以使用 Vision 来实现。
// The request handler below is built from a CGImage; bail out if `image` has none.
guard let cgImage = image.cgImage else { return }
// The orientation handed to the handler is hard-coded to `.right`.
// NOTE(review): confirm `.right` matches how `image` is actually oriented.
let imageRequestHandler = VNImageRequestHandler(cgImage: cgImage, orientation: .right)
let size = CGSize(width: cgImage.width, height: cgImage.height) // Note: in pixels, taken from `cgImage`; this assumes the image has already been rotated, too.
// Rectangle of `size` anchored at the origin.
let bounds = CGRect(origin: .zero, size: size)
// Create a new request to recognize text.
let request = VNRecognizeTextRequest { [self] request, error in
// Continue only when there is no error and the results are text observations.
guard
let results = request.results as? [VNRecognizedTextObservation],
error == nil
else { return }
// Convert every observation's bounding box into the coordinate space of a rect of `size` at the origin.
let rects = results.map {
convert(boundingBox: $0.boundingBox, to: CGRect(origin: .zero, size: size))
}
// NOTE(review): this excerpt shows neither the end of the completion closure nor the request
// being performed with `imageRequestHandler`; both must exist elsewhere for this to run.
/// Maps a normalized rectangle into the coordinate space of `bounds`, flipping the vertical axis.
///
/// The origin and size of `boundingBox` are scaled by the width and height of `bounds` and
/// offset by its origin. The vertical position is measured from the opposite edge
/// (`1 - maxY`), so a box sitting at the bottom of the normalized space lands at the
/// bottom of `bounds` in a top-left-origin coordinate system.
///
/// - Parameters:
///   - boundingBox: Rectangle expressed as fractions of the target area.
///   - bounds: Target area the rectangle is projected into.
/// - Returns: The rectangle expressed in the coordinates of `bounds`.
func convert(boundingBox: CGRect, to bounds: CGRect) -> CGRect {
    let targetWidth = bounds.width
    let targetHeight = bounds.height

    // Horizontal position: scale, then shift by the target's origin.
    let x = boundingBox.origin.x * targetWidth + bounds.minX
    // Vertical position: flip using the still-normalized top edge, then scale and shift.
    let y = (1 - boundingBox.maxY) * targetHeight + bounds.minY

    return CGRect(
        x: x,
        y: y,
        width: boundingBox.size.width * targetWidth,
        height: boundingBox.size.height * targetHeight
    )
}
您可以先检测该列标题(键)的边界框,然后将这个边界框的高度向下扩展,使其覆盖整列。
// Find the observation holding the column's header (key) and stretch its box downward
// so that it covers the whole column beneath it.
var targetBoundingBox: CGRect?
let targetWord = "Retailer"
// Matching is case-insensitive; lowercase the search term once instead of on every iteration.
let loweredTargetWord = targetWord.lowercased()
for result in results {
    guard
        let candidate = result.topCandidates(1).first,
        candidate.string.lowercased().contains(loweredTargetWord)
    else { continue }
    var columnBox = convert(boundingBox: result.boundingBox, to: bounds)
    // Extend exactly to the bottom edge of `bounds` (not past it) so the box stays inside the image.
    columnBox.size.height = bounds.maxY - columnBox.minY
    targetBoundingBox = columnBox
    break
}
if let targetBoundingBox = targetBoundingBox {
    print("Bounding box of '\(targetWord)': \(targetBoundingBox)")
    // Gather the top candidate string of every observation whose box overlaps the column box.
    var textInsideTargetBox = ""
    for result in results {
        let boundingBox = convert(boundingBox: result.boundingBox, to: bounds)
        if targetBoundingBox.intersects(boundingBox), let text = result.topCandidates(1).first?.string {
            textInsideTargetBox += "\(text) "
        }
    }
    print(textInsideTargetBox,"string")
    // Render at scale 1 so that drawing coordinates correspond one-to-one with `bounds`.
    let format = UIGraphicsImageRendererFormat()
    format.scale = 1
    let final = UIGraphicsImageRenderer(bounds: bounds, format: format).image { _ in
        image.draw(in: bounds)
        // Outline the column box in green on top of the source image.
        UIColor.green.setStroke()
        let path = UIBezierPath(rect: targetBoundingBox)
        path.lineWidth = 9
        path.stroke()
    }
    // Hand the annotated image to the image view on the main queue.
    DispatchQueue.main.async { [self] in
        resultImage.image = final
    }
} else {
    print("Bounding box of '\(targetWord)' not found")
}