在python中计算文件的crc

Question

我想计算文件的 CRC 并获得如下输出：

E45A12AC

。这是我的代码：

#!/usr/bin/env python 
import os, sys
import zlib

def crc(fileName):
    """Read ``fileName`` in binary mode and call ``zlib.crc32`` on every line.

    This is the code the question is asking about: each ``zlib.crc32`` call
    checksums one line on its own, the value it returns is not stored, and
    the function itself returns nothing.
    """
    fd = open(fileName,"rb")
    content = fd.readlines()  # whole file is loaded as a list of lines
    fd.close()
    for eachLine in content:
        # No running value is passed in and the result is discarded.
        zlib.crc32(eachLine)

# Call crc() once for every path given on the command line.
for eachFile in sys.argv[1:]:
    crc(eachFile)

这会计算每行的 CRC，但其输出（例如

-1767935985

）不是我想要的。

Hashlib 按照我想要的方式工作，但它计算 md5：

import hashlib
m = hashlib.md5()
# Feed the file to the hash object one line at a time; update() accumulates.
for line in open('data.txt', 'rb'):
    m.update(line)
# Python 2 print statement: shows the digest as a hex string.
print m.hexdigest()

是否可以使用

zlib.crc32

得到类似的东西？

Answer 1

更紧凑和优化的代码

def crc(fileName):
    """Return the CRC-32 of a file as an uppercase hexadecimal string.

    The file is read line by line in binary mode and every line is folded
    into a running checksum by passing the previous value back to
    ``zlib.crc32``.

    Args:
        fileName: Path of the file to checksum.

    Returns:
        The checksum formatted with ``%X``. The string is not zero-padded,
        so it can be shorter than 8 characters.
    """
    prev = 0
    # The context manager closes the file even if reading fails part-way.
    with open(fileName, "rb") as fh:
        for eachLine in fh:
            prev = zlib.crc32(eachLine, prev)
    # Mask to 32 bits so a signed result is never formatted as negative.
    return "%X" % (prev & 0xFFFFFFFF)

PS2：根据评论中的建议，旧的 PS 已经过时，因此已被删除。谢谢。我不明白自己当初怎么会漏掉这一点，不过这个建议确实很好。

Answer 2

kobor42 答案的修改版本，通过读取固定大小的块而不是“行”，性能提高了 2-3 倍：

import zlib

def crc32(fileName):
    """Return the CRC-32 of a file as an 8-digit uppercase hex string.

    The file is consumed in 65536-byte blocks instead of lines; each block
    is folded into the running checksum. Leading zeros are kept.
    """
    checksum = 0
    with open(fileName, 'rb') as stream:
        # iter() with a sentinel stops as soon as read() returns no data.
        for block in iter(lambda: stream.read(65536), b''):
            checksum = zlib.crc32(block, checksum)
    return "%08X" % (checksum & 0xFFFFFFFF)

返回的字符串中还包含前导零。

Answer 3

hashlib兼容接口，支持 CRC-32：

import zlib

class crc32(object):
    """CRC-32 accumulator with a hashlib-style interface, built on zlib.crc32.

    Data is fed with ``update()``; the running checksum is kept masked to
    32 bits.
    """

    name = 'crc32'
    digest_size = 4
    block_size = 1

    def __init__(self, arg=b''):
        """Start a new checksum, optionally seeded with initial bytes.

        The default is a bytes literal: ``zlib.crc32`` rejects ``str`` on
        Python 3, so a text default would make ``crc32()`` raise.
        """
        self.__digest = 0
        self.update(arg)

    def copy(self):
        """Return an independent object carrying the same running checksum."""
        # Allocate without running __init__, then copy the state over.
        duplicate = object.__new__(type(self))
        duplicate.__digest = self.__digest
        return duplicate

    def digest(self):
        """Return the running checksum as an unsigned integer.

        NOTE(review): hashlib objects return bytes from digest(); this
        returns an int and is left that way so existing callers keep working.
        """
        return self.__digest

    def hexdigest(self):
        """Return the checksum as 8 lowercase hexadecimal digits."""
        return '{:08x}'.format(self.__digest)

    def update(self, arg):
        """Fold the bytes in ``arg`` into the running checksum."""
        self.__digest = zlib.crc32(arg, self.__digest) & 0xffffffff

# Attach the class to the hashlib module so it can be reached as hashlib.crc32.
# This assigns a new attribute on the imported module object.
import hashlib
hashlib.crc32 = crc32

# Optionally register the name as well:
# Python > 2.7: hashlib.algorithms += ('crc32',)
# Python > 3.2: hashlib.algorithms_available.add('crc32')

Answer 4

要将任何整数的最低 32 位显示为 8 个十六进制数字（不带符号），您可以通过使用由 32 位组成的掩码（全部为值 1）来“掩码”该值，然后应用格式设置。即：

>>> x = -1767935985
>>> format(x & 0xFFFFFFFF, '08x')
'969f700f'

您要格式化的整数是否来自

zlib.crc32

还是来自任何其他计算，都完全无关紧要。

Answer 5

Python 3.8+（使用海象运算符）：

import zlib

def crc32(filename, chunksize=65536):
    """Return the CRC-32 checksum of a file's contents as an integer.

    The file is read in binary mode, ``chunksize`` bytes per read, and each
    piece is folded into the running value until a read returns nothing.
    """
    running = 0
    with open(filename, "rb") as stream:
        while (piece := stream.read(chunksize)):
            running = zlib.crc32(piece, running)
    return running

chunksize

是一次从文件中读取的字节数。无论您将

chunksize

设置为什么（必须是

> 0

），同一个文件都会获得相同的 CRC，但设置太低可能会使代码变慢，太高可能会使用太多内存。

结果是一个 32 位整数。空文件的 CRC-32 校验和为

0

。

Answer 6

编辑后包括下面的Altren 解决方案。

CrouZ 答案的修改版和更紧凑的版本，使用 for 循环和文件缓冲，性能略有提高：

def forLoopCrc(fpath):
    """With for loop and buffer.

    Return the CRC-32 of the file at ``fpath`` as an 8-digit uppercase hex
    string, reading 65536 bytes at a time through a 65536-byte buffer.

    The loop runs until a read returns no data instead of pre-computing the
    number of reads from the size reported by ``os.stat``. The checksum is
    therefore still complete when the reported size does not match what can
    actually be read (for example a file that grows while being read).
    """
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        # iter() with a sentinel keeps this a for loop; it stops at EOF.
        for chunk in iter(lambda: ins.read(65536), b''):
            crc = zlib.crc32(chunk, crc)
    return '%08X' % (crc & 0xFFFFFFFF)

测试结果（测试机器：6700K，HDD 硬盘）：

（注：重新测试多次，速度始终更快。）

Warming up the machine...
Finished.

Beginning tests...
File size: 90288KB
Test cycles: 500

With for loop and buffer.
Result 45.24728019630359 

CrouZ solution
Result 45.433838356097894 

kobor42 solution
Result 104.16215688703986 

Altren solution
Result 101.7247863946586

使用以下脚本在 Python 3.6.4 x64 中进行测试：

import os, timeit, zlib, random, binascii

def forLoopCrc(fpath):
    """With for loop and buffer.

    Benchmark candidate: returns the CRC-32 of ``fpath`` as an 8-digit
    uppercase hex string.
    """
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        # The number of 65536-byte reads is computed up front from the size
        # reported by os.stat (float division, truncated, plus one).
        # NOTE(review): data beyond that reported size would not be read.
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    # Mask to 32 bits before formatting.
    return '%08X' % (crc & 0xFFFFFFFF)

def crc32(fileName):
    """CrouZ solution

    Benchmark candidate: reads the file in 65536-byte chunks until a read
    returns nothing, and returns the CRC-32 as an 8-digit uppercase hex
    string.
    """
    with open(fileName, 'rb') as fh:
        hash = 0  # running checksum; note this local shadows the builtin hash()
        while True:
            s = fh.read(65536)
            if not s:
                # An empty read means end of file.
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)

def crc(fileName):
    """kobor42 solution

    Benchmark candidate: iterates over the file line by line and returns the
    CRC-32 formatted with ``%X`` (uppercase hex, no zero padding).
    """
    prev = 0
    # NOTE(review): the file object is never closed explicitly here.
    for eachLine in open(fileName,"rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X"%(prev & 0xFFFFFFFF)

def crc32altren(filename):
    """Altren solution

    Benchmark candidate: reads the whole file into memory with one read()
    and returns its CRC-32 as an 8-digit uppercase hex string.
    """
    # NOTE(review): the file object is never closed explicitly here.
    buf = open(filename,'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF  # local shadows the builtin hash()
    return "%08X" % hash

# File that every candidate checksums, and the label printed for each one.
fpath = r'D:\test\test.dat'
tests = {forLoopCrc: 'With for loop and buffer.', 
     crc32: 'CrouZ solution', crc: 'kobor42 solution',
         crc32altren: 'Altren solution'}
count = 500  # iterations for the warm-up and for each timed candidate

# CPU, HDD warmup
# Shuffle the candidates and run whichever one ends up first `count` times.
randomItm = [x for x in tests.keys()]
random.shuffle(randomItm)
print('\nWarming up the machine...')
for c in range(count):
    randomItm[0](fpath)
print('Finished.\n')

# Begin test
print('Beginning tests...\nFile size: %dKB\nTest cycles: %d\n' % (
    os.stat(fpath).st_size/1024, count))
for x in tests:
    print(tests[x])
    # Wall-clock time for `count` consecutive calls of this candidate.
    start_time = timeit.default_timer()
    for c in range(count):
        x(fpath)
    print('Result', timeit.default_timer() - start_time, '\n')

它更快，因为 for 循环比 while 循环更快（来源：here 和 here）。

Answer 7

合并以上2段代码如下：

try:
    fd = open(decompressedFile,"rb")
except IOError:
    logging.error("Unable to open the file in readmode:" + decompressedFile)
    return 4
eachLine = fd.readline()
prev = 0
while eachLine:
    prev = zlib.crc32(eachLine, prev)
    eachLine = fd.readline()
fd.close()

Answer 8

有一种更快、更紧凑的方法来使用 binascii 计算 CRC：

import binascii

def crc32(filename):
    """Return the CRC-32 of a file as an 8-digit uppercase hex string.

    The whole file is read into memory in a single call and checksummed
    with ``binascii.crc32``.

    Args:
        filename: Path of the file to checksum.

    Returns:
        The checksum as uppercase hexadecimal, zero-padded to 8 digits.
    """
    # Close the handle deterministically instead of leaving it to the GC.
    with open(filename, 'rb') as fh:
        buf = fh.read()
    # Mask to 32 bits so a signed result is never formatted as negative.
    checksum = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % checksum

Answer 9

您可以使用 base64 得到类似 [ERD45FTR] 的输出。另外，zlib.crc32 支持增量更新：可以把上一次的校验值作为第二个参数传入。

import os, sys
import zlib
import base64

def crc(fileName):
    """Return the CRC-32 of a file as returned by ``zlib.crc32``.

    The file is read line by line in binary mode and each line is folded
    into a running checksum.

    Args:
        fileName: Path of the file to checksum.

    Returns:
        The integer checksum, not masked or formatted. An empty file yields
        0, the CRC-32 of no data, rather than None.
    """
    # Starting from 0 is the same as calling zlib.crc32 without a start
    # value, so no special case is needed for the first line.
    prev = 0
    with open(fileName, "rb") as fd:
        for eachLine in fd:
            prev = zlib.crc32(eachLine, prev)
    return prev

# For every path on the command line, print the base64 encoding of the
# checksum's decimal string form.
# NOTE(review): Python 2 syntax (print statement, b64encode given a str).
for eachFile in sys.argv[1:]:
  print base64.b64encode(str(crc(eachFile)))

Answer 10

解决方案：

import os, sys
import zlib

def crc(fileName, excludeLine="", includeLine=""):
    """Return the CRC-32 of a file as 8 lowercase hex digits.

    The file is read line by line in binary mode. Lines that start with
    ``excludeLine`` are left out of the checksum.

    Args:
        fileName: Path of the file to checksum.
        excludeLine: Optional prefix; lines starting with it are skipped.
            A text prefix is encoded so it can be compared with the bytes
            read from the file.
        includeLine: Accepted for interface compatibility; not used.

    Returns:
        The zero-padded lowercase hex checksum ("00000000" when no line
        contributed), or None if the file could not be opened.
    """
    if excludeLine and not isinstance(excludeLine, bytes):
        excludeLine = excludeLine.encode()
    try:
        fd = open(fileName, "rb")
    except IOError:
        # Single-argument form prints the same text on Python 2 and 3.
        print("Unable to open the file in readmode: %s" % fileName)
        return
    prev = 0
    with fd:
        # Iterating the file always advances, so skipping a line with
        # `continue` cannot spin forever on the same line.
        for eachLine in fd:
            if excludeLine and eachLine.startswith(excludeLine):
                continue
            prev = zlib.crc32(eachLine, prev)
    return format(prev & 0xFFFFFFFF, '08x') #returns 8 digits crc

# Print the value returned by crc() for every path on the command line
# (Python 2 print statement).
for eachFile in sys.argv[1:]:
    print crc(eachFile)

我不太清楚参数 (excludeLine="", includeLine="") 的用途……

在python中计算文件的crc

问题描述投票：0回答：10

10个回答

最新问题

在python中计算文件的crc

问题描述 投票：0回答：10

10个回答

最新问题

问题描述投票：0回答：10