根据使用PDFBOX的PDF输出识别文本

问题描述 投票:1回答:1

我使用PDFBox获取PDF文本的颜色信息,并且可以通过以下代码得到输出。但我的疑问是:stroking颜色(StrokingColor)代表什么,non-stroking颜色(NonStrokingColor)又代表什么?基于这些信息,我该如何确定哪段文本使用了哪种颜色?有人能给我一些建议吗?我目前的输出依次是:DeviceRGB、DeviceCMYK、java.awt.Color[r=63,g=240,b=0]、java.awt.Color[r=35,g=31,b=32]、34.934998、31.11、31.875

// Replays one page's content stream through a PDFStreamEngine and prints the
// stroking / non-stroking colour information read from the engine afterwards.
PDDocument document = null;
        try {
            document = PDDocument.load(strFilepath);
            PDFStreamEngine streamEngine = new PDFStreamEngine(
                    ResourceLoader.loadProperties("org/apache/pdfbox/resources/PageDrawer.properties"));

            // Index 1 picks the second entry of the list returned by getAllPages().
            PDPage selectedPage = (PDPage) document.getDocumentCatalog().getAllPages().get(1);
            streamEngine.processStream(
                    selectedPage,
                    selectedPage.findResources(),
                    selectedPage.getContents().getStream());

            // The graphics state is read only after processStream has returned.
            PDGraphicsState state = streamEngine.getGraphicsState();

            // Colour space names: stroking first, then non-stroking.
            System.out.println(state.getStrokingColor().getColorSpace().getName());
            System.out.println(state.getNonStrokingColor().getColorSpace().getName());

            // java.awt.Color views: non-stroking first, then stroking.
            System.out.println(state.getNonStrokingColor().getJavaColor());
            System.out.println(state.getStrokingColor().getJavaColor());

            // Raw stroking components, each multiplied by 255 before printing.
            float[] strokingComponents = state.getStrokingColor().getColorSpaceValue();
            for (int i = 0; i < strokingComponents.length; i++) {
                System.out.println(strokingComponents[i] * 255);
            }
        }
        finally {
            // Close the document whether or not processing succeeded.
            if (document != null) {
                document.close();
            }
        }
pdfbox
1个回答
5
投票

根据OP在评论中的澄清,他希望:

我想把一个PDF页面的字体颜色与另一个PDF页面的字体颜色进行比较。例如,如果文本“Sample”是黑色,另一段文本“sample1”是灰色……我需要得到类似“Sample -> 黑色,sample1 -> 灰色”这样的结果……我想要全部文本及其对应的颜色。

PDFBox具有文本提取引擎PDFTextStripper。但是,将其用于手头的任务仍存在一些挑战:

  • 它最初并非设计用于在提取文本的同时提取颜色信息;它使用的TextPosition对象甚至没有颜色属性。因此,我们必须对其进行一些扩展。

    • 我们将首先为颜色操作注册侦听器,以完全跟踪颜色。

    • 此外,我们把各TextPosition对象的颜色信息存储在单独的结构中(我本来更愿意直接扩展TextPosition类,但由于其若干私有成员无法访问,这样做会相当麻烦)。

    • 这一点已在另一个回答(this answer)中详细展示;背景信息请参见该回答。

  • PDF允许以多种方式绘制文本。字母可以用一种颜色填充(fill),而其轮廓可以用另一种颜色描边(stroke)。字母的轮廓甚至可以用作后续绘制操作的剪切路径。这里我们只考虑填充颜色和描边颜色。

  • 绘制的文本以后可能会被其他图形覆盖,或者完全隐藏它或更改其外观颜色。我们暂时将其忽略。

  • 对于PDFBox 1.8.x

    如上所述,我们像这样扩展PDFTextStripper:

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.pdfbox.util.PDFTextStripper;
    import org.apache.pdfbox.util.TextPosition;
    
    /**
     * A {@link PDFTextStripper} that records, for every {@link TextPosition} it
     * processes, the text rendering mode and the stroking / non-stroking colour
     * components of the current graphics state, and then writes each glyph
     * followed by a brace-enclosed summary such as
     * <code>P{FILL:RGB 0.803 0.076 0.086;}</code>.
     */
    public class ColorTextStripper extends PDFTextStripper
    {
        /** Rendering mode numbers for which a FILL entry is written. */
        final static List<Integer> FILLING_MODES = Arrays.asList(0, 2, 4, 6);
        /** Rendering mode numbers for which a STROKE entry is written. */
        final static List<Integer> STROKING_MODES = Arrays.asList(1, 2, 5, 6);
        /** Rendering mode numbers for which a CLIP entry is written. */
        final static List<Integer> CLIPPING_MODES = Arrays.asList(4, 5, 6, 7);

        /** Rendering mode captured per text position in processTextPosition. */
        Map<TextPosition, Integer> renderingMode = new HashMap<TextPosition, Integer>();
        /** Stroking colour components captured per text position. */
        Map<TextPosition, float[]> strokingColor = new HashMap<TextPosition, float[]>();
        /** Non-stroking colour components captured per text position. */
        Map<TextPosition, float[]> nonStrokingColor = new HashMap<TextPosition, float[]>();

        /**
         * Creates the stripper, switches off suppression of duplicate overlapping
         * text and registers processors for the colour operators so that the
         * graphics state carries colour information while text is extracted.
         *
         * @throws IOException if the superclass constructor fails
         */
        public ColorTextStripper() throws IOException
        {
            super();
            setSuppressDuplicateOverlappingText(false);

            // Colour space selection.
            registerOperatorProcessor("CS", new org.apache.pdfbox.util.operator.SetStrokingColorSpace());
            registerOperatorProcessor("cs", new org.apache.pdfbox.util.operator.SetNonStrokingColorSpace());

            // Colour in the currently selected colour space.
            registerOperatorProcessor("SC", new org.apache.pdfbox.util.operator.SetStrokingColor());
            registerOperatorProcessor("sc", new org.apache.pdfbox.util.operator.SetNonStrokingColor());
            registerOperatorProcessor("SCN", new org.apache.pdfbox.util.operator.SetStrokingColor());
            registerOperatorProcessor("scn", new org.apache.pdfbox.util.operator.SetNonStrokingColor());

            // Gray, RGB and CMYK shorthand operators.
            registerOperatorProcessor("G", new org.apache.pdfbox.util.operator.SetStrokingGrayColor());
            registerOperatorProcessor("g", new org.apache.pdfbox.util.operator.SetNonStrokingGrayColor());
            registerOperatorProcessor("RG", new org.apache.pdfbox.util.operator.SetStrokingRGBColor());
            registerOperatorProcessor("rg", new org.apache.pdfbox.util.operator.SetNonStrokingRGBColor());
            registerOperatorProcessor("K", new org.apache.pdfbox.util.operator.SetStrokingCMYKColor());
            registerOperatorProcessor("k", new org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor());
        }

        /**
         * Stores the current rendering mode and both colour component arrays for
         * the given text position, then hands the position on to the superclass.
         *
         * @param text the text position being processed
         */
        @Override
        protected void processTextPosition(TextPosition text)
        {
            renderingMode.put(
                    text, getGraphicsState().getTextState().getRenderingMode());
            strokingColor.put(
                    text, getGraphicsState().getStrokingColor().getColorSpaceValue());
            nonStrokingColor.put(
                    text, getGraphicsState().getNonStrokingColor().getColorSpaceValue());

            super.processTextPosition(text);
        }

        /**
         * Writes one annotated entry per text position; the {@code text}
         * argument itself is not used.
         *
         * @param text          the string assembled by the superclass (ignored)
         * @param textPositions the text positions to write out
         * @throws IOException if writing to the output fails
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            for (TextPosition position : textPositions)
            {
                writeString(colorAnnotatedGlyph(position));
            }
        }

        /**
         * Builds the "character{FILL:...;STROKE:...;CLIP;}" entry for a single
         * text position from the values stored in the three maps.
         */
        private String colorAnnotatedGlyph(TextPosition position)
        {
            Integer mode = renderingMode.get(position);

            StringBuilder entry = new StringBuilder();
            entry.append(position.getCharacter());
            entry.append("{");

            if (FILLING_MODES.contains(mode))
            {
                entry.append("FILL:");
                entry.append(toString(nonStrokingColor.get(position)));
                entry.append(';');
            }
            if (STROKING_MODES.contains(mode))
            {
                entry.append("STROKE:");
                entry.append(toString(strokingColor.get(position)));
                entry.append(';');
            }
            if (CLIPPING_MODES.contains(mode))
            {
                entry.append("CLIP;");
            }

            entry.append("}");
            return entry.toString();
        }

        /**
         * Formats colour components as a label chosen by the array length
         * (1 = GRAY, 3 = RGB, 4 = CMYK, anything else = UNKNOWN) followed by the
         * space-separated component values.
         *
         * @param values the colour components, may be {@code null}
         * @return the formatted text, or the string "null" for a null array
         */
        String toString(float[] values)
        {
            if (values == null)
            {
                return "null";
            }

            final String label;
            if (values.length == 1)
            {
                label = "GRAY";
            }
            else if (values.length == 3)
            {
                label = "RGB";
            }
            else if (values.length == 4)
            {
                label = "CMYK";
            }
            else
            {
                label = "UNKNOWN";
            }

            StringBuilder formatted = new StringBuilder(label);
            for (int i = 0; i < values.length; i++)
            {
                formatted.append(' ');
                formatted.append(values[i]);
            }
            return formatted.toString();
        }
    }
    

    您可以这样调用它:

    PDFTextStripper stripper = new ColorTextStripper();
    
    PDDocument document = PDDocument.load(SOURCE_FILE);
    
    String text = stripper.getText(document);
    

    结果文本包含以下内容:

    P{FILL:RGB 0.803 0.076 0.086;}e{FILL:RGB 0.803 0.076 0.086;}l{FILL:RGB 0.803 0.076 0.086;}l{FILL:RGB 0.803 0.076 0.086;}e{FILL:RGB 0.803 0.076 0.086;}
    

    G{FILL:RGB 0.102 0.101 0.095;}r{FILL:RGB 0.102 0.101 0.095;}a{FILL:RGB 0.102 0.101 0.095;}z{FILL:RGB 0.102 0.101 0.095;}i{FILL:RGB 0.102 0.101 0.095;}e{FILL:RGB 0.102 0.101 0.095;}
    

    以上是文本“Pelle Grazie”对应的输出(原回答此处附有该文本的截图及其来源链接)。

    K{FILL:RGB 0.0 0.322 0.573;}E{FILL:RGB 0.0 0.322 0.573;}Y{FILL:RGB 0.0 0.322 0.573;}
    

    C{FILL:GRAY 0.0;}o{FILL:GRAY 0.0;}m{FILL:GRAY 0.0;}b{FILL:GRAY 0.0;}i{FILL:GRAY 0.0;}n{FILL:GRAY 0.0;}e{FILL:GRAY 0.0;}d{FILL:GRAY 0.0;}
    

    以上是文本“KEY Combined”对应的输出,该文本来自下图:

    <img src="https://image.soinside.com/eyJ1cmwiOiAiaHR0cHM6Ly9pLnN0YWNrLmltZ3VyLmNvbS80RVcyWC5wbmcifQ==" alt="KEY and Combined">

    除了将所有信息序列化为String结果之外,您当然也可以创建一个同时包含颜色和字符信息的类,以结构化的方式保存结果。就像现在在writeString中构建String结果一样,您可以修改该方法,把该类的实例添加到某个列表中。

    要求

    至少需要PDFBox 1.8.4版本才能使其正常工作。我使用2.0.0-SNAPSHOT进行了测试,但1.8.4应该就足够了。另一方面,1.8.3存在一个错误,有时会把错误的TextPosition对象传递给writeString(参见PDFBOX-1804);更早的版本则根本不向writeString提供TextPosition集合。

    对于PDFBox 2.x

    PDFBox 2.x中有多处重构和其他更改,其中一些也影响到上面的代码。

    移植到PDFBox 2.x可能看起来像这样:

    /**
     * PDFBox 2.x variant of the colour-aware text stripper. For every
     * {@link TextPosition} it records the text rendering mode and the stroking /
     * non-stroking colour components of the current graphics state, and writes
     * each glyph followed by a brace-enclosed summary such as
     * <code>K{FILL:RGB 0.0 0.322 0.573;}</code>.
     */
    public class ColorTextStripper extends PDFTextStripper {
        /** Rendering modes for which a FILL entry is written. */
        final static List<RenderingMode> FILLING_MODES = Arrays.asList(
                RenderingMode.FILL,
                RenderingMode.FILL_STROKE,
                RenderingMode.FILL_CLIP,
                RenderingMode.FILL_STROKE_CLIP);
        /** Rendering modes for which a STROKE entry is written. */
        final static List<RenderingMode> STROKING_MODES = Arrays.asList(
                RenderingMode.STROKE,
                RenderingMode.FILL_STROKE,
                RenderingMode.STROKE_CLIP,
                RenderingMode.FILL_STROKE_CLIP);
        /** Rendering modes for which a CLIP entry is written. */
        final static List<RenderingMode> CLIPPING_MODES = Arrays.asList(
                RenderingMode.FILL_CLIP,
                RenderingMode.STROKE_CLIP,
                RenderingMode.FILL_STROKE_CLIP,
                RenderingMode.NEITHER_CLIP);

        /** Rendering mode captured per text position in processTextPosition. */
        Map<TextPosition, RenderingMode> renderingMode = new HashMap<TextPosition, RenderingMode>();
        /** Stroking colour components captured per text position. */
        Map<TextPosition, float[]> strokingColor = new HashMap<TextPosition, float[]>();
        /** Non-stroking colour components captured per text position. */
        Map<TextPosition, float[]> nonStrokingColor = new HashMap<TextPosition, float[]>();

        /**
         * Creates the stripper, switches off suppression of duplicate overlapping
         * text and adds the colour operators so that the graphics state carries
         * colour information while text is extracted.
         *
         * @throws IOException if the superclass constructor fails
         */
        public ColorTextStripper() throws IOException {
            super();
            setSuppressDuplicateOverlappingText(false);

            // Colour space selection.
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace());

            // Colour in the currently selected colour space.
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN());

            // Device gray, RGB and CMYK shorthand operators.
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor());
            addOperator(new org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor());
        }

        /**
         * Stores the current rendering mode and both colour component arrays for
         * the given text position, then hands the position on to the superclass.
         *
         * @param text the text position being processed
         */
        @Override
        protected void processTextPosition(TextPosition text) {
            renderingMode.put(
                    text, getGraphicsState().getTextState().getRenderingMode());
            strokingColor.put(
                    text, getGraphicsState().getStrokingColor().getComponents());
            nonStrokingColor.put(
                    text, getGraphicsState().getNonStrokingColor().getComponents());

            super.processTextPosition(text);
        }

        /**
         * Writes one annotated entry per text position; the {@code text}
         * argument itself is not used.
         *
         * @param text          the string assembled by the superclass (ignored)
         * @param textPositions the text positions to write out
         * @throws IOException if writing to the output fails
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
            for (TextPosition position : textPositions) {
                writeString(colorAnnotatedGlyph(position));
            }
        }

        /**
         * Builds the "unicode{FILL:...;STROKE:...;CLIP;}" entry for a single
         * text position from the values stored in the three maps.
         */
        private String colorAnnotatedGlyph(TextPosition position) {
            RenderingMode mode = renderingMode.get(position);

            StringBuilder entry = new StringBuilder();
            entry.append(position.getUnicode());
            entry.append("{");

            if (FILLING_MODES.contains(mode)) {
                entry.append("FILL:");
                entry.append(toString(nonStrokingColor.get(position)));
                entry.append(';');
            }
            if (STROKING_MODES.contains(mode)) {
                entry.append("STROKE:");
                entry.append(toString(strokingColor.get(position)));
                entry.append(';');
            }
            if (CLIPPING_MODES.contains(mode)) {
                entry.append("CLIP;");
            }

            entry.append("}");
            return entry.toString();
        }

        /**
         * Formats colour components as a label chosen by the array length
         * (1 = GRAY, 3 = RGB, 4 = CMYK, anything else = UNKNOWN) followed by the
         * space-separated component values.
         *
         * @param values the colour components, may be {@code null}
         * @return the formatted text, or the string "null" for a null array
         */
        String toString(float[] values) {
            if (values == null) {
                return "null";
            }

            final String label;
            if (values.length == 1) {
                label = "GRAY";
            } else if (values.length == 3) {
                label = "RGB";
            } else if (values.length == 4) {
                label = "CMYK";
            } else {
                label = "UNKNOWN";
            }

            StringBuilder formatted = new StringBuilder(label);
            for (int i = 0; i < values.length; i++) {
                formatted.append(' ');
                formatted.append(values[i]);
            }
            return formatted.toString();
        }
    }
    

    ([ColorTextStripper

    © www.soinside.com 2019 - 2024. All rights reserved.