使用Python和Google Vision检测PDF文件上的文本时,我收到JSON解码错误

问题描述 投票:0回答:1

我正在尝试使用Google Vision和Python。我正在使用示例文件,但我不断收到相同的错误消息:

Traceback (most recent call last):
  File "C:\Program Files (x86)\Python37-32\lib\site-packages\google\protobuf\jso
n_format.py", line 416, in Parse
    js = json.loads(text, object_pairs_hook=_DuplicateChecker)
  File "C:\Program Files (x86)\Python37-32\lib\json\__init__.py", line 361, in l
oads
    return cls(**kw).decode(s)
  File "C:\Program Files (x86)\Python37-32\lib\json\decoder.py", line 338, in de
code
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Program Files (x86)\Python37-32\lib\json\decoder.py", line 356, in ra
w_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "sample.py", line 72, in <module>
    async_detect_document('gs://matr/file_1035.pdf','gs://matr/output/')
  File "sample.py", line 59, in async_detect_document
    json_string, vision.types.AnnotateFileResponse())
  File "C:\Program Files (x86)\Python37-32\lib\site-packages\google\protobuf\jso
n_format.py", line 418, in Parse
    raise ParseError('Failed to load JSON: {0}.'.format(str(e)))
google.protobuf.json_format.ParseError: Failed to load JSON: Expecting value: li
ne 1 column 1 (char 0).

我猜它与生成的JSON文件有关。它确实生成一个JSON文件,但我想它应该打印到命令行。以下是JSON文件的前几行:

{
    "inputConfig": {
        "gcsSource": {
            "uri": "gs://python-docs-samples-tests/HodgeConj.pdf"
        },
        "mimeType": "application/pdf"
    },

我生成的文件可以通过以下方式加载为JSON对象:

data = json.load(jsonfile)

我试过print (json_string)但我只得到b'placeholder'

我怎样才能让它发挥作用?我使用的是Python 3.7.2

我的代码如下:

def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """Run asynchronous document OCR on a PDF/TIFF stored in GCS.

    The file at ``gcs_source_uri`` is submitted for document text
    detection, with JSON results written under ``gcs_destination_uri``.
    After waiting for the operation, every blob listed under the
    destination prefix is printed, the first listed blob is parsed as an
    ``AnnotateFileResponse``, and the full text of its first page is
    printed.
    """
    from google.cloud import vision
    from google.cloud import storage
    from google.protobuf import json_format
    import re

    annotator = vision.ImageAnnotatorClient()

    ocr_feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # Accepted mime types are 'application/pdf' and 'image/tiff'.
    source_config = vision.types.InputConfig(
        gcs_source=vision.types.GcsSource(uri=gcs_source_uri),
        mime_type='application/pdf')

    # batch_size is the number of pages grouped into each JSON output file.
    destination_config = vision.types.OutputConfig(
        gcs_destination=vision.types.GcsDestination(uri=gcs_destination_uri),
        batch_size=2)

    request = vision.types.AsyncAnnotateFileRequest(
        features=[ocr_feature],
        input_config=source_config,
        output_config=destination_config)

    pending = annotator.async_batch_annotate_files(requests=[request])

    print('Waiting for the operation to finish.')
    pending.result(timeout=180)

    # The results now live in GCS; split the destination URI into the
    # bucket name and the object prefix so the output can be listed.
    gcs_client = storage.Client()
    bucket_name, prefix = re.match(
        r'gs://([^/]+)/(.+)', gcs_destination_uri).group(1, 2)
    bucket = gcs_client.get_bucket(bucket_name=bucket_name)

    output_blobs = [blob for blob in bucket.list_blobs(prefix=prefix)]
    print('Output files:')
    for listed in output_blobs:
        print(listed.name)

    # Only the first listed blob is processed. With two pages per file it
    # is expected to hold the first two pages of the input document.
    first_blob = output_blobs[0]
    response = json_format.Parse(
        first_blob.download_as_string(),
        vision.types.AnnotateFileResponse())

    # responses[0] is the first page. Besides the text, the annotation
    # also carries pages/blocks/paragraphs/words/symbols with confidence
    # scores and bounding boxes.
    page_text = response.responses[0].full_text_annotation.text
    print(u'Full text:\n{}'.format(page_text))
# Script entry: OCR the sample PDF and write results under 'output/'.
async_detect_document(
    gcs_source_uri='gs://my_bucket/file_1035.pdf',
    gcs_destination_uri='gs://my_bucket/output/')
python json google-vision
1个回答
0
投票

我在github页面上收到了用户的回复。 https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2086#issuecomment-487635159

我也遇到过这个问题,并确定原因是:前缀(文件夹)本身也被列入了blob_list并参与迭代。我可以看到“output/”在输出中被当作文件列出,随后对它进行解析就导致了该错误。

尝试将前缀硬编码为类似 prefix = 'output/out' 的形式,这样该文件夹就不会包含在列表中。

The demo code should probably be modified to handle this simple case a little better.




import re


def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR a PDF/TIFF stored on GCS and print the text of its first page.

    The Vision API writes its results as JSON files under
    ``gcs_destination_uri``. Those files are listed, the first one is
    parsed, and the full text of its first page is printed.

    Args:
        gcs_source_uri: ``gs://<bucket>/<path>`` URI of the input file.
        gcs_destination_uri: ``gs://<bucket>/<prefix>`` URI under which
            the JSON output files are written.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``.
        RuntimeError: If no output file is found under the destination
            prefix after the operation has finished.
    """
    from google.cloud import vision
    from google.cloud import storage
    from google.protobuf import json_format

    # Fail fast on a malformed destination instead of hitting an opaque
    # AttributeError after the (slow) OCR operation has already run.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'Expected a gs://<bucket>/<prefix> URI, got: {!r}'.format(
                gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=180)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # Passed positionally: the keyword name of this parameter is not the
    # same in every google-cloud-storage release.
    bucket = storage_client.get_bucket(bucket_name)

    # Listing by prefix can also return the placeholder object for the
    # "folder" itself (e.g. 'output/'). It is not JSON, and parsing it is
    # what raises "Expecting value: line 1 column 1", so skip such entries
    # rather than hard-coding a prefix that avoids them.
    blob_list = [
        blob for blob in bucket.list_blobs(prefix=prefix)
        if not blob.name.endswith('/')
    ]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {!r}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))


# Script entry: OCR the sample PDF with 'output/out' as the destination
# prefix inside the bucket.
async_detect_document(
    gcs_source_uri='gs://my_bucket/my_file.pdf',
    gcs_destination_uri='gs://my_bucket/output/out')
© www.soinside.com 2019 - 2024. All rights reserved.