我正在使用 Apple 的 Vision 框架开发一个文本识别项目,并尝试利用
VNRecognizeTextRequest
来提取图像中文本的边界框。我的目标是准确识别和获取单个单词的边界框,而不是整个句子或文本块。
我遇到的大多数资源和指南都侧重于获取句子或较大文本块的边界框,而很少专门讨论单个单词的情况。此外,我知道
VNRecognizeTextRequest
提供两种识别精度:.fast
和 .accurate
。根据我的测试,.fast
模式的识别质量明显低于.accurate
,因此我更喜欢使用后者来确保最高的文本识别质量。
但是,我遇到了挑战:尽管将识别设置为
.accurate
,我仍然无法获取单个单词的边界框 - 结果仍然包含较大的文本块。
有谁知道如何配置
VNRecognizeTextRequest
或使用任何其他方法来专门检测单个单词的边界框?我是否缺少任何可以将检测细化到这种详细程度的特定设置或参数?
我非常感谢有关此事的任何建议或指导。谢谢!
这是一个获取句子 boundingBox 的简单示例:
import SwiftUI
import Vision
/// Demo view: runs Vision OCR on a bundled image and outlines each
/// recognized text observation with a red rectangle.
struct OCR: View {
    /// Image to run OCR on, loaded from the asset catalog.
    @State var image: UIImage? = UIImage(named: "test")
    /// Recognized strings, parallel to `positions`.
    @State var texts: [String] = []
    /// Vision bounding boxes, normalized (0...1, bottom-left origin), parallel to `texts`.
    @State var positions: [CGRect] = []

    /// Converts a Vision normalized rect (bottom-left origin, 0...1) into a rect
    /// in a top-left-origin coordinate space of the given size.
    /// NOTE(review): this method shadows Vision's global
    /// `VNImageRectForNormalizedRect`; the name is kept for source compatibility.
    func VNImageRectForNormalizedRect(rect: CGRect, imageSize: CGSize) -> CGRect {
        let width = imageSize.width
        let height = imageSize.height
        let x = rect.minX * width
        // Vision's y axis grows upward; SwiftUI's grows downward, so flip y.
        let y = (1 - rect.maxY) * height
        let rectWidth = rect.width * width
        let rectHeight = rect.height * height
        return CGRect(x: x, y: y, width: rectWidth, height: rectHeight)
    }

    var body: some View {
        ZStack {
            if let image = image {
                Image(uiImage: image)
                    .resizable()
                    .aspectRatio(contentMode: .fit)
                    .overlay(Canvas { context, size in
                        for position in positions {
                            // BUG FIX: scale into the Canvas's own coordinate
                            // space (`size`), not the bitmap's pixel size.
                            // With .fit the displayed frame usually differs from
                            // `image.size`, so the old code drew the boxes at the
                            // wrong scale and position.
                            let normalizedRect = VNImageRectForNormalizedRect(rect: position, imageSize: size)
                            context.stroke(Path(normalizedRect), with: .color(.red), lineWidth: 1)
                        }
                    })
                    .onAppear {
                        recognizeText(image: image) { t, p in
                            texts = t
                            positions = p
                        }
                    }
            } else {
                Text("There is no picture") // fixed typo: "Their" -> "There"
            }
        }
    }
}
extension OCR {
    /// Runs Vision text recognition on `image` and delivers the recognized
    /// strings with their bounding boxes.
    /// - Parameters:
    ///   - image: Source image; the method returns silently if it has no `cgImage`.
    ///   - completion: Invoked on the main queue with parallel arrays of
    ///     recognized strings and their normalized (0...1, bottom-left origin)
    ///     bounding boxes. Not invoked if recognition fails.
    func recognizeText(image: UIImage, completion: @escaping ([String], [CGRect]) -> Void) {
        guard let cgImage = image.cgImage else { return }
        let request = VNRecognizeTextRequest { request, error in
            guard let observations = request.results as? [VNRecognizedTextObservation], error == nil else {
                print("Text recognition error: \(error?.localizedDescription ?? "Unknown error")")
                return
            }
            var texts: [String] = []
            var positions: [CGRect] = []
            for observation in observations {
                // Each observation covers a line/block of text; keep its best candidate.
                guard let topCandidate = observation.topCandidates(1).first else { continue }
                texts.append(topCandidate.string)
                positions.append(observation.boundingBox)
            }
            DispatchQueue.main.async {
                print(texts)
                print(positions)
                completion(texts, positions)
            }
        }
        // .accurate gives markedly better results than .fast for this use case.
        request.recognitionLevel = .accurate
        let handler = VNImageRequestHandler(cgImage: cgImage)
        do {
            try handler.perform([request])
        } catch {
            // BUG FIX: `try?` silently swallowed failures; report them instead.
            print("Failed to perform text recognition: \(error.localizedDescription)")
        }
    }
}
// Xcode canvas preview for the OCR demo view.
#Preview {
OCR()
}
您可以在得到 topCandidate 之后,使用以下方式通过 `topCandidate.boundingBox(for:)` 获取更细粒度(逐词)的边界框:
// Obtain per-WORD bounding boxes (which is what the question asks for)
// by enumerating word ranges instead of single characters.
// The original loop produced one box per character and rebuilt each
// String.Index with index(offsetBy:) inside the loop, which is O(n^2).
let text = topCandidate.string
text.enumerateSubstrings(in: text.startIndex..<text.endIndex, options: .byWords) { _, wordRange, _, _ in
    // boundingBox(for:) returns the normalized rectangle observation
    // covering exactly this substring of the candidate.
    if let wordBox = try? topCandidate.boundingBox(for: wordRange) {
        // wordBox.boundingBox is the normalized (0...1) rect of this word.
        print(text[wordRange], wordBox.boundingBox)
    }
}