如何使用python定位PDF文件中的数字签名

Question

我有一个相当简单的挑战，至少一开始看起来是这样。我选择了 python 来做这件事。

我有多个 PDF 文件，我必须检测这些文件的数字签名（如果存在或根本不存在），并从中获取一些详细信息（如果存在）。例如，在我的 PDF 文件中，签名上始终有签名者的电子邮件地址。没有电子邮件的签名属于例外情况。

我的设想是，我应该能够找到文档中的所有签名对象，并将它们放入数组或列表中，然后检查它们的详细信息。这可能吗？如何找到它们？

我在网上能找到的都是有关验证签名或签署 PDF 文件的内容，虽然这很重要，但我不需要。

编辑

我现在已成功在 PDF 文件中找到签名并获取其内容。然而，我现在确实很难解码内容。

当前查找签名的解决方案只是临时的，因为我必须使用签名对象的索引来定位它。不同的PDF文档会有不同的签名索引。然而，类型是“Sig”，因此可以实现某种循环来检查对象类型，如果它是“Sig”，那么您就找到了一个签名对象。我稍后将使用最终脚本编辑我的帖子。

签名内容类型为

pdfreader.types.native.HexString

。我相信如果我能够解码该字符串，我就能够获得我正在寻找的数据。我一直在尝试将其更改为字节，然后从字节解码为 utf-8 或 ascii，但如果是 UTF-8，我会收到错误

'utf-8' codec can't decode byte 0x82 in position 1: invalid start byte

，如果是 ascii，我会收到错误

'ascii' codec can't decode byte 0x82 in position 1: ordinal not in range(128)

任何解决方法将不胜感激。

解决方案

正如评论中所指出的，也正如我开始怀疑的那样，签名的“Contents”并不包含我需要的任何属性。我注意到，在输出签名对象时，控制台中的输出并不完整（可能是因为我使用的是 Visual Studio 的调试控制台），而数字签名实际上还有更多属性。我发现，对于我的签名，电子邮件位于“Name”属性下。现在唯一的挑战是找出我的 PDF 文件中有多少个对象。为此，我使用了字符串操作：我发现最后一个 trailer（文件尾部字典）有一个“Size”属性，其中包含对象的数量。我没能在 PDFTrailer 对象中找到它，因此才使用了字符串操作。如果还有其他方法，欢迎分享。请原谅最后对象循环中的 try-except，更好的做法是在读取对象之前先检查对象是否存在以及它是否具有 Type。请不要直接照搬这段代码，而应自行找出更合适的解决方案。

from pdfreader import PDFDocument

# Scan a PDF with pdfreader, count the objects whose Type is 'Sig' and
# collect their Name attribute (the signer's e-mail in the author's files).
filename = r"<filepath.pdf>"
nr_of_obj = 0
nr_of_signatures = 0
signature_names = []  # in my case - emails

# The context manager guarantees the handle is released. PDFDocument is
# handed the open handle, so every call on `doc` stays inside this block.
with open(filename, "rb") as fd:
    doc = PDFDocument(fd)

    # Read the raw bytes from the start of the file, independent of where
    # the parser left the file position.
    fd.seek(0)
    pdf_content = fd.read()

    # The last trailer sits between the final "trailer" keyword and the
    # final "startxref" keyword. Working on bytes avoids parsing the
    # repr() text of the file (escaped newlines, quoting, ...).
    last_trailer_start = pdf_content.rfind(b"trailer")
    last_trailer_end = pdf_content.rfind(b"startxref")
    if 0 <= last_trailer_start < last_trailer_end:
        trailer = pdf_content[last_trailer_start:last_trailer_end]
        # Split the trailer attributes. Locate and read Size.
        for attribute in trailer.split(b"/"):
            if attribute.startswith(b"Size"):
                # Keep only the first token after the key; a closing ">>"
                # may directly follow the number.
                tokens = attribute[len(b"Size"):].replace(b">", b" ").split()
                if tokens and tokens[0].isdigit():
                    nr_of_obj = int(tokens[0])
                    break

    # Use the size to loop through objects. Not every object number exists
    # and not every object has a Type, hence the try/except. Catching
    # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
    for i in range(1, nr_of_obj):
        try:
            raw_obj = doc.locate_object(i, 0)
            obj = doc.build(raw_obj)
            if obj.Type == 'Sig':
                nr_of_signatures += 1
                signature_names.append(obj.Name)
        except Exception:
            continue

print(nr_of_signatures)
print(signature_names)

Answer 1

截至 2023 年 11 月，从 PDF 文件中提取签名和签名者信息似乎仍然不是一件容易的事。几年前我曾就此写过一篇回答（见原文链接）。另请参阅 Adobe Reader 和 Acrobat 的 PDF 签名说明。

所以这里又进行了一次尝试，现在我使用最新版本的 pypdf 来提取字段，并使用 asn1crypto （除了 python-dateutil）来解析 PKCS7 签名。（似乎PyOpenSSL和cryptography已弃用加载pkcs7数据。）可能还有其他库/包可以简化过程。

签名验证超出范围。

import datetime

from asn1crypto import cms
from dateutil.parser import parse
from pypdf import PdfReader


class AttrClass:
    """Attribute-style, read-only view over a mapping.

    Keys of the wrapped mapping are exposed as attributes; nested ``dict``
    values are wrapped in another ``AttrClass`` on access.

    Args:
        data: the mapping whose keys become attributes.
        cls_name: optional display name used by ``str`` / ``repr``; the
            real class name is used when it is not given.
    """

    def __init__(self, data, cls_name=None):
        self._data = data
        self._cls_name = cls_name

    def __getattr__(self, name):
        """Look up ``name`` in the wrapped mapping.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: if the mapping has no such key. Raising
                AttributeError rather than leaking KeyError keeps
                ``hasattr``, ``getattr(obj, name, default)`` and ``copy``
                working.
        """
        try:
            # Read _data through __dict__: going through ``self._data``
            # would re-enter __getattr__ and recurse without bound when the
            # instance has not been initialised yet (e.g. while copying).
            value = self.__dict__["_data"][name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        if isinstance(value, dict):
            return AttrClass(value, cls_name=name.capitalize() or self._cls_name)
        return value

    def __values_for_str__(self):
        """Return the (key, value) pairs shown by ``str`` and ``repr``.

        Only str, int and datetime values are listed.
        """
        return [
            (k, v) for k, v in self._data.items()
            if isinstance(v, (str, int, datetime.datetime))
        ]

    def __str__(self):
        """Return ``Name(key=value, ...)`` for the displayable values."""
        values = ", ".join(f"{k}={v}" for k, v in self.__values_for_str__())
        return f"{self._cls_name or self.__class__.__name__}({values})"

    def __repr__(self):
        return f"<{self}>"


class Signature(AttrClass):
    """Signature helper class

    Attributes:
        signing_time (datetime): when the user has signed
            (user HW's clock)
        signer_name (str): the signer's common name
        signer_contact_info (str, None): the signer's email / contact info
        signer_location (str, None): the signer's location
        signature_type (str): ETSI.cades.detached, adbe.pkcs7.detached, ...
        certificate (Certificate): the signer's certificate
        digest_algorithm (str): the digest algorithm used
        message_digest (bytes): the digest
        signature_algorithm (str): the signature algorithm used
        signature_bytes (bytes): the raw signature
    """


class Subject(AttrClass):
    """Attribute-style view of a certificate's subject.

    Attributes:
        common_name (str): common name of the subject
        given_name (str): first name of the subject
        surname (str): surname of the subject
        serial_number (str): identifier of the subject (may not exist)
        country (str): country of the subject
    """


class Certificate(AttrClass):
    """Signer's certificate helper class

    Wraps the certificate mapping and exposes its ``subject`` entry as a
    ``Subject`` instance.

    Attributes:
        version (str): v3 (= X509v3)
        serial_number (int): the certificate's serial number
        subject (Subject): signer's subject details
        issuer (object): certificate issuer's details
        signature (object): certificate signature
        extensions (list[OrderedDict]): certificate extensions
        validity (object): validity (not_before, not_after)
        subject_public_key_info (object): public key info
        issuer_unique_id (object, None): issuer unique id
        subject_unique_id (object, None): subject unique id
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set as a real instance attribute so ``subject`` is a Subject
        # rather than the generic wrapper produced by attribute lookup.
        self.subject = Subject(self._data['subject'])

    def __values_for_str__(self):
        """Return the displayable values plus the subject's common name."""
        # Read the key with .get(): a subject without a common name must
        # not make str() / repr() of the certificate raise.
        common_name = self.subject._data.get('common_name')
        return (
            super().__values_for_str__() +
            [('common_name', common_name)]
        )


def parse_pkcs7_signatures(signature_data: bytes):
    """Parse a PKCS7 / CMS / CADES signature.

    This is a generator: it yields one dict per signer info found in the
    package and yields nothing when the content type is not
    ``signed_data``.

    Args:
        signature_data: the encoded signature package.

    Yields:
        dict: ``sid``, ``certificate`` (a ``Certificate``),
        ``digest_algorithm``, ``signature_algorithm``, ``signature_bytes``,
        ``signer_info`` and one entry per signed attribute, keyed by the
        attribute type.

    Raises:
        RuntimeError: if no certificate in the package matches the signer's
            issuer and serial number.
    """
    content_info = cms.ContentInfo.load(signature_data).native
    if content_info['content_type'] != 'signed_data':
        return
    content = content_info['content']
    # The certificate set is optional; treat a missing one as empty so the
    # lookup below ends in the RuntimeError instead of a TypeError.
    certificates = content['certificates'] or []
    # each PKCS7 / CMS / CADES could have several signatures
    for signer_info in content['signer_infos']:
        # the sid key should point to the certificates collection
        sid = signer_info['sid']
        digest_algorithm = signer_info['digest_algorithm']['algorithm']
        signature_algorithm = signer_info['signature_algorithm']['algorithm']
        signature_bytes = signer_info['signature']
        # signed attributes is a list of key, value pairs
        # oversimplification: normally we have no repeated attributes
        # (signed attributes are optional, hence the empty fallback)
        signed_attrs = {
            sa['type']: sa['values'][0]
            for sa in (signer_info['signed_attrs'] or [])
        }
        # find matching certificate, only for issuer / serial number
        for cert in certificates:
            cert = cert['tbs_certificate']
            if (
                sid['serial_number'] == cert['serial_number'] and
                sid['issuer'] == cert['issuer']
            ):
                break
        else:
            raise RuntimeError(
                f"Couldn't find certificate in certificates collection: {sid}")
        yield dict(
            sid=sid,
            certificate=Certificate(cert),
            digest_algorithm=digest_algorithm,
            signature_algorithm=signature_algorithm,
            signature_bytes=signature_bytes,
            signer_info=signer_info,
            **signed_attrs,
        )


def get_pdf_signatures(filename):
    """Parse PDF signatures.

    This is a generator: it yields one ``Signature`` per signer of every
    signed signature field and yields nothing for a document without
    signatures.

    Args:
        filename: the PDF to read, passed straight to ``PdfReader``.

    Yields:
        Signature: the parsed PKCS7 data combined with the entries of the
        PDF signature dictionary (name, contact info, location, time, ...).
    """
    reader = PdfReader(filename)
    # get_fields() returns None when the document has no form fields
    fields = (reader.get_fields() or {}).values()
    for field in fields:
        if field.field_type != '/Sig':
            continue
        v = field.value
        if v is None:
            # signature field that has not been signed
            continue
        # - signature datetime (not included in pkcs7) in format:
        #   D:YYYYMMDDHHmmss[offset]
        #   where offset is +/-HH'mm' difference to UTC.
        #   The entry is optional, so it is only parsed when present.
        modified = v.get('/M')
        signing_time = None
        if modified is not None:
            signing_time = parse(modified[2:].strip("'").replace("'", ":"))
        # - get PKCS7/CMS/CADES signature package encoded in ASN.1 / DER format
        raw_signature_data = v['/Contents']
        for attrdict in parse_pkcs7_signatures(raw_signature_data):
            attrdict.update(dict(
                signer_name=v.get('/Name'),
                signer_contact_info=v.get('/ContactInfo'),
                signer_location=v.get('/Location'),
                signature_type=v['/SubFilter'][1:],  # ETSI.CAdES.detached, ...
                signature_handler=v['/Filter'][1:],
                raw=raw_signature_data,
            ))
            if signing_time is not None:
                attrdict['signing_time'] = signing_time
            else:
                # keep a signing_time signed attribute if the package
                # carried one; otherwise expose None
                attrdict.setdefault('signing_time', None)
            yield Signature(attrdict)


# Example usage: print the details of every signature in one PDF.
# Replace the placeholder below with the path of a signed PDF file.
filename = "<my signed pdf file>"
for signature in get_pdf_signatures(filename):
    # str(signature) lists only its str / int / datetime values
    print(f"Signature: {signature}")
    print(f"Signer: {signature.signer_name}")
    print(f"Signing time: {signature.signing_time}")
    # the certificate line also includes the subject's common name
    print(f"Signer's certificate: {signature.certificate}")
    print(f"Certificate's subject: {signature.certificate.subject}")

如何使用python定位PDF文件中的数字签名

问题描述投票：0回答：1

1个回答

最新问题

如何使用python定位PDF文件中的数字签名

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1