我有一个带有文本框(矩形)的 MS Word 文档,并已成功使用 LibreOffice 将其转换为 PDF。我应该如何找到 PDF 中的所有文本框(矩形),以及如何解释这些矩形的坐标?
// Path-construction callback: only the RECT operation is inspected here.
// NOTE(review): the snippet is truncated in the original post (closing braces and
// any use of the computed corners are not shown).
@Override
public void modifyPath(PathConstructionRenderInfo renderInfo) {
if (renderInfo.getOperation() == PathConstructionRenderInfo.RECT) {
// The first four segment-data entries are read as x, y, width and height.
float x = renderInfo.getSegmentData().get(0);
float y = renderInfo.getSegmentData().get(1);
float w = renderInfo.getSegmentData().get(2);
float h = renderInfo.getSegmentData().get(3);
// Two opposite corners, (x, y) and (x + w, y + h), combined with the current
// transformation matrix; presumably this maps them from path space into page
// space - TODO confirm against the Vector.cross(Matrix) documentation.
Vector a = new Vector(x, y, 1).cross(renderInfo.getCtm());
Vector c = new Vector(x + w, y + h, 1).cross(renderInfo.getCtm());
实现 ExtRenderListener 之后,我只能找到页面(A4)本身的矩形,却找不到包含页面中所有内容的(文本框)矩形。
正如 Bruno 指出的,问题在于您可能会遇到仅由 line-to(画线)或 move-to(移动)操作定义的矩形。
您需要跟踪所有画线操作,并在它们相交时立即将它们“聚合”在一起(也就是说,每当新绘制的线的起点/终点与某条已知线的起点/终点重合时,就把它们归入同一组)。
/**
 * Event listener that looks for black, filled rectangles on a page.
 *
 * <p>Every straight line segment of a filled black path is recorded. Segments that
 * share an end point are merged into the same cluster (a small union-find structure),
 * and the bounding box of each cluster is reported as a rectangle.
 *
 * <p>Only clusters made of at least two connected segments are reported; a segment
 * that touches no other segment is ignored.
 */
public class RectangleFinder implements IEventListener {

    /** Every segment seen so far, mapped to the sequential ID assigned to it. */
    private Map<Line, Integer> knownLines = new HashMap<>();

    /**
     * Union-find parent links between segment IDs. An ID without an entry is the
     * representative (root) of its cluster.
     */
    private Map<Integer, Integer> clusters = new HashMap<>();

    /**
     * Records the line segments of every path that is painted with a pure FILL
     * operation in black. All other events are ignored.
     *
     * @param data the event payload; only {@code PathRenderInfo} is handled
     * @param type the event type (not used)
     */
    public void eventOccurred(IEventData data, EventType type) {
        if (!(data instanceof PathRenderInfo)) {
            return;
        }
        PathRenderInfo pathRenderInfo = (PathRenderInfo) data;
        pathRenderInfo.preserveGraphicsState();

        // Anything that is not exactly FILL (including NO_OP) is skipped.
        if (pathRenderInfo.getOperation() != PathRenderInfo.FILL) {
            return;
        }
        if (!isBlack(pathRenderInfo.getFillColor())) {
            return;
        }

        for (Subpath subpath : pathRenderInfo.getPath().getSubpaths()) {
            for (IShape segment : subpath.getSegments()) {
                if (segment instanceof Line) {
                    lineOccurred((Line) segment);
                }
            }
        }
    }

    /**
     * Tells whether a colour is single-component black.
     *
     * @param c the colour to test; may be {@code null}
     * @return {@code true} only for an IccBased or DeviceGray colour with exactly one
     *         component whose value is 0
     */
    private boolean isBlack(Color c) {
        if (c instanceof IccBased || c instanceof DeviceGray) {
            return c.getNumberOfComponents() == 1 && c.getColorValue()[0] == 0.0f;
        }
        return false;
    }

    /**
     * Registers a segment and merges its cluster with the cluster of every known
     * segment that shares an end point with it.
     *
     * @param line the segment that was drawn
     */
    private void lineOccurred(Line line) {
        Integer knownId = knownLines.get(line);
        int id;
        if (knownId == null) {
            id = knownLines.size();
            knownLines.put(line, id);
        } else {
            id = knownId;
        }

        Point start = line.getBasePoints().get(0);
        Point end = line.getBasePoints().get(1);

        // Visit every neighbour (no early exit) so that a segment bridging two
        // previously separate clusters joins them into one.
        for (Map.Entry<Line, Integer> entry : knownLines.entrySet()) {
            Line other = entry.getKey();
            if (line.equals(other)) {
                continue;
            }
            Point otherStart = other.getBasePoints().get(0);
            Point otherEnd = other.getBasePoints().get(1);
            boolean touching = otherStart.equals(start)
                    || otherEnd.equals(end)
                    || otherStart.equals(end)
                    || otherEnd.equals(start);
            if (!touching) {
                continue;
            }
            // Link root to root; never link a root to itself, which would make
            // find() loop forever.
            int rootA = find(id);
            int rootB = find(entry.getValue());
            if (rootA != rootB) {
                clusters.put(rootA, rootB);
            }
        }
    }

    /**
     * Follows the parent links until the representative of a cluster is reached.
     *
     * @param ID a segment ID
     * @return the ID of the cluster representative
     */
    private int find(int ID) {
        int root = ID;
        while (clusters.containsKey(root)) {
            root = clusters.get(root);
        }
        return root;
    }

    /**
     * @return {@code null}, which this listener uses to request every event type
     */
    public Set<EventType> getSupportedEvents() {
        return null;
    }

    /**
     * Groups the recorded segments by cluster.
     *
     * @return one set of segments per cluster that contains at least two segments;
     *         the representative segment of a cluster is included in its set
     */
    public Collection<Set<Line>> getClusters() {
        Map<Integer, Set<Line>> grouped = new HashMap<>();
        for (Map.Entry<Line, Integer> entry : knownLines.entrySet()) {
            // Resolve through find() so the root segment lands in its own cluster.
            int root = find(entry.getValue());
            Set<Line> members = grouped.get(root);
            if (members == null) {
                members = new HashSet<Line>();
                grouped.put(root, members);
            }
            members.add(entry.getKey());
        }

        // Segments that never touched another segment do not form a shape.
        Collection<Set<Line>> result = new ArrayList<>();
        for (Set<Line> members : grouped.values()) {
            if (members.size() > 1) {
                result.add(members);
            }
        }
        return result;
    }

    /**
     * Computes the axis-aligned bounding box of every cluster.
     *
     * @return one rectangle (x, y, width, height) per cluster returned by
     *         {@link #getClusters()}
     */
    public Collection<Rectangle> getBoundingBoxes() {
        Set<Rectangle> rectangles = new HashSet<>();
        for (Set<Line> cluster : getClusters()) {
            double minX = Double.MAX_VALUE;
            double minY = Double.MAX_VALUE;
            double maxX = -Double.MAX_VALUE;
            double maxY = -Double.MAX_VALUE;
            for (Line l : cluster) {
                for (Point p : l.getBasePoints()) {
                    minX = Math.min(minX, p.x);
                    minY = Math.min(minY, p.y);
                    maxX = Math.max(maxX, p.x);
                    maxY = Math.max(maxY, p.y);
                }
            }
            double w = maxX - minX;
            double h = maxY - minY;
            rectangles.add(new Rectangle((float) minX, (float) minY, (float) w, (float) h));
        }
        return rectangles;
    }
}
这是我编写的一个类,用于在页面上查找黑色(填充)矩形。 通过细微的调整,它也可以找到其他矩形。
这是 C# 12 版本:
using System;
using System.Collections.Generic;
using iText.Kernel.Colors;
using iText.Kernel.Geom;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using Color = iText.Kernel.Colors.Color;
/// <summary>
/// Event listener that looks for black, filled rectangles on a page.
/// Every straight line segment of a filled black path is recorded, segments that
/// share an end point are merged into one cluster (a small union-find structure),
/// and the bounding box of each cluster is reported as a rectangle.
/// </summary>
/// <remarks>
/// Only clusters made of at least two connected segments are reported; a segment
/// that touches no other segment is ignored.
/// </remarks>
public class RectangleFinder : IEventListener
{
    /// <summary>Every segment seen so far, mapped to the sequential ID assigned to it.</summary>
    private readonly Dictionary<Line, int> knownLines = [];

    /// <summary>
    /// Union-find parent links between segment IDs. An ID without an entry is the
    /// representative (root) of its cluster.
    /// </summary>
    private readonly Dictionary<int, int> clusters = [];

    /// <summary>
    /// Records the line segments of every path painted with a pure FILL operation
    /// in black. All other events are ignored.
    /// </summary>
    /// <param name="data">The event payload; only <c>PathRenderInfo</c> is handled.</param>
    /// <param name="type">The event type (not used).</param>
    public void EventOccurred(IEventData data, EventType type)
    {
        if (data is not PathRenderInfo pathRenderInfo)
        {
            return;
        }

        pathRenderInfo.PreserveGraphicsState();

        // Anything that is not exactly FILL (including NO_OP) is skipped.
        if (pathRenderInfo.GetOperation() != PathRenderInfo.FILL)
        {
            return;
        }

        if (!IsBlack(pathRenderInfo.GetFillColor()))
        {
            return;
        }

        // The path is consumed inline so the type name "Path" never has to be
        // spelled out (it can clash with System.IO.Path under implicit usings).
        foreach (Subpath subpath in pathRenderInfo.GetPath().GetSubpaths())
        {
            foreach (IShape segment in subpath.GetSegments())
            {
                if (segment is Line line)
                {
                    LineOccurred(line);
                }
            }
        }
    }

    /// <summary>Tells whether a colour is single-component black.</summary>
    /// <param name="c">The colour to test; may be null.</param>
    /// <returns>
    /// True only for an IccBased or DeviceGray colour with exactly one component
    /// whose value is 0.
    /// </returns>
    private static bool IsBlack(Color c)
    {
        if (c is IccBased or DeviceGray)
        {
            return c.GetNumberOfComponents() == 1 && c.GetColorValue()[0] == 0.0f;
        }

        return false;
    }

    /// <summary>
    /// Registers a segment and merges its cluster with the cluster of every known
    /// segment that shares an end point with it.
    /// </summary>
    /// <param name="line">The segment that was drawn.</param>
    private void LineOccurred(Line line)
    {
        if (!knownLines.TryGetValue(line, out int id))
        {
            id = knownLines.Count;
            knownLines.Add(line, id);
        }

        var start = line.GetBasePoints()[0];
        var end = line.GetBasePoints()[1];

        // Visit every neighbour (no early exit) so that a segment bridging two
        // previously separate clusters joins them into one.
        foreach (KeyValuePair<Line, int> entry in knownLines)
        {
            Line other = entry.Key;
            if (line.Equals(other))
            {
                continue;
            }

            var otherStart = other.GetBasePoints()[0];
            var otherEnd = other.GetBasePoints()[1];
            bool touching = otherStart.Equals(start)
                || otherEnd.Equals(end)
                || otherStart.Equals(end)
                || otherEnd.Equals(start);
            if (!touching)
            {
                continue;
            }

            // Link root to root; never link a root to itself, which would make
            // Find loop forever.
            int rootA = Find(id);
            int rootB = Find(entry.Value);
            if (rootA != rootB)
            {
                clusters[rootA] = rootB;
            }
        }
    }

    /// <summary>Follows the parent links until a cluster representative is reached.</summary>
    /// <param name="ID">A segment ID.</param>
    /// <returns>The ID of the cluster representative.</returns>
    private int Find(int ID)
    {
        int root = ID;
        while (clusters.TryGetValue(root, out int parent))
        {
            root = parent;
        }

        return root;
    }

    /// <summary>Returns null, which this listener uses to request every event type.</summary>
    public ICollection<EventType> GetSupportedEvents()
    {
        return null;
    }

    /// <summary>Groups the recorded segments by cluster.</summary>
    /// <returns>
    /// One set of segments per cluster that contains at least two segments; the
    /// representative segment of a cluster is included in its set.
    /// </returns>
    public IEnumerable<HashSet<Line>> GetClusters()
    {
        Dictionary<int, HashSet<Line>> grouped = [];
        foreach (KeyValuePair<Line, int> entry in knownLines)
        {
            // Resolve through Find so the root segment lands in its own cluster.
            int root = Find(entry.Value);
            if (!grouped.TryGetValue(root, out var members))
            {
                members = [];
                grouped[root] = members;
            }

            members.Add(entry.Key);
        }

        // Segments that never touched another segment do not form a shape.
        List<HashSet<Line>> result = [];
        foreach (HashSet<Line> members in grouped.Values)
        {
            if (members.Count > 1)
            {
                result.Add(members);
            }
        }

        return result;
    }

    /// <summary>Computes the axis-aligned bounding box of every cluster.</summary>
    /// <returns>
    /// One rectangle (x, y, width, height) per cluster returned by
    /// <see cref="GetClusters"/>.
    /// </returns>
    public IEnumerable<Rectangle> GetBoundingBoxes()
    {
        HashSet<Rectangle> rectangles = [];
        foreach (HashSet<Line> cluster in GetClusters())
        {
            double minX = double.MaxValue;
            double minY = double.MaxValue;
            double maxX = -double.MaxValue;
            double maxY = -double.MaxValue;
            foreach (Line l in cluster)
            {
                foreach (var p in l.GetBasePoints())
                {
                    minX = Math.Min(minX, p.x);
                    minY = Math.Min(minY, p.y);
                    maxX = Math.Max(maxX, p.x);
                    maxY = Math.Max(maxY, p.y);
                }
            }

            double w = maxX - minX;
            double h = maxY - minY;

            // Cast to float, not int: PDF coordinates are fractional and an int
            // cast would truncate both the position and the size of the box.
            rectangles.Add(new Rectangle((float)minX, (float)minY, (float)w, (float)h));
        }

        return rectangles;
    }
}