我想计算文件的 CRC 并获得如下输出:
E45A12AC
。这是我的代码:
#!/usr/bin/env python
import os, sys
import zlib
def crc(fileName):
fd = open(fileName,"rb")
content = fd.readlines()
fd.close()
for eachLine in content:
zlib.crc32(eachLine)
for eachFile in sys.argv[1:]:
crc(eachFile)
这会计算每行的 CRC,但其输出(例如
-1767935985
)不是我想要的。
Hashlib 按照我想要的方式工作,但它计算 md5:
import hashlib
m = hashlib.md5()
for line in open('data.txt', 'rb'):
m.update(line)
print m.hexdigest()
是否可以使用
zlib.crc32
得到类似的东西?
更紧凑和优化的代码
def crc(fileName):
prev = 0
for eachLine in open(fileName,"rb"):
prev = zlib.crc32(eachLine, prev)
return "%X"%(prev & 0xFFFFFFFF)
PS2:由于评论中的建议,旧 PS 已被弃用 - 因此被删除。谢谢。我不明白,我怎么错过了这个,但它真的很好。
kobor42 答案的修改版本,通过读取固定大小的块而不是“行”,性能提高了 2-3 倍:
import zlib
def crc32(fileName):
with open(fileName, 'rb') as fh:
hash = 0
while True:
s = fh.read(65536)
if not s:
break
hash = zlib.crc32(s, hash)
return "%08X" % (hash & 0xFFFFFFFF)
返回的字符串中还包含前导零。
hashlib兼容接口,支持 CRC-32:
import zlib
class crc32(object):
name = 'crc32'
digest_size = 4
block_size = 1
def __init__(self, arg=''):
self.__digest = 0
self.update(arg)
def copy(self):
copy = super(self.__class__, self).__new__(self.__class__)
copy.__digest = self.__digest
return copy
def digest(self):
return self.__digest
def hexdigest(self):
return '{:08x}'.format(self.__digest)
def update(self, arg):
self.__digest = zlib.crc32(arg, self.__digest) & 0xffffffff
# Now you can define hashlib.crc32 = crc32
import hashlib
hashlib.crc32 = crc32
# Python > 2.7: hashlib.algorithms += ('crc32',)
# Python > 3.2: hashlib.algorithms_available.add('crc32')
要将任何整数的最低 32 位显示为 8 个十六进制数字(不带符号),您可以通过使用由 32 位组成的掩码(全部为值 1)来“掩码”该值,然后应用格式设置。即:
>>> x = -1767935985
>>> format(x & 0xFFFFFFFF, '08x')
'969f700f'
您要格式化的整数是否来自
zlib.crc32
或任何其他计算都完全无关。
Python 3.8+(使用海象运算符):
import zlib
def crc32(filename, chunksize=65536):
"""Compute the CRC-32 checksum of the contents of the given filename"""
with open(filename, "rb") as f:
checksum = 0
while (chunk := f.read(chunksize)) :
checksum = zlib.crc32(chunk, checksum)
return checksum
chunksize
是一次从文件中读取的字节数。无论您将 chunksize
设置为什么(必须是 > 0
),同一个文件都会获得相同的 CRC,但设置太低可能会使代码变慢,太高可能会使用太多内存。
结果是一个 32 位整数。空文件的 CRC-32 校验和为
0
。
编辑后包括下面的Altren 解决方案。
CrouZ 答案的修改版和更紧凑的版本,使用 for 循环和文件缓冲,性能略有提高:
def forLoopCrc(fpath):
"""With for loop and buffer."""
crc = 0
with open(fpath, 'rb', 65536) as ins:
for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
crc = zlib.crc32(ins.read(65536), crc)
return '%08X' % (crc & 0xFFFFFFFF)
结果,在 6700k HDD 中:
(注:重新测试多次,速度始终更快。)
Warming up the machine...
Finished.
Beginning tests...
File size: 90288KB
Test cycles: 500
With for loop and buffer.
Result 45.24728019630359
CrouZ solution
Result 45.433838356097894
kobor42 solution
Result 104.16215688703986
Altren solution
Result 101.7247863946586
使用以下脚本在 Python 3.6.4 x64 中进行测试:
import os, timeit, zlib, random, binascii
def forLoopCrc(fpath):
"""With for loop and buffer."""
crc = 0
with open(fpath, 'rb', 65536) as ins:
for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
crc = zlib.crc32(ins.read(65536), crc)
return '%08X' % (crc & 0xFFFFFFFF)
def crc32(fileName):
"""CrouZ solution"""
with open(fileName, 'rb') as fh:
hash = 0
while True:
s = fh.read(65536)
if not s:
break
hash = zlib.crc32(s, hash)
return "%08X" % (hash & 0xFFFFFFFF)
def crc(fileName):
"""kobor42 solution"""
prev = 0
for eachLine in open(fileName,"rb"):
prev = zlib.crc32(eachLine, prev)
return "%X"%(prev & 0xFFFFFFFF)
def crc32altren(filename):
"""Altren solution"""
buf = open(filename,'rb').read()
hash = binascii.crc32(buf) & 0xFFFFFFFF
return "%08X" % hash
fpath = r'D:\test\test.dat'
tests = {forLoopCrc: 'With for loop and buffer.',
crc32: 'CrouZ solution', crc: 'kobor42 solution',
crc32altren: 'Altren solution'}
count = 500
# CPU, HDD warmup
randomItm = [x for x in tests.keys()]
random.shuffle(randomItm)
print('\nWarming up the machine...')
for c in range(count):
randomItm[0](fpath)
print('Finished.\n')
# Begin test
print('Beginning tests...\nFile size: %dKB\nTest cycles: %d\n' % (
os.stat(fpath).st_size/1024, count))
for x in tests:
print(tests[x])
start_time = timeit.default_timer()
for c in range(count):
x(fpath)
print('Result', timeit.default_timer() - start_time, '\n')
合并以上2段代码如下:
try:
fd = open(decompressedFile,"rb")
except IOError:
logging.error("Unable to open the file in readmode:" + decompressedFile)
return 4
eachLine = fd.readline()
prev = 0
while eachLine:
prev = zlib.crc32(eachLine, prev)
eachLine = fd.readline()
fd.close()
有一种更快、更紧凑的方法来使用 binascii 计算 CRC:
import binascii
def crc32(filename):
buf = open(filename,'rb').read()
hash = binascii.crc32(buf) & 0xFFFFFFFF
return "%08X" % hash
您可以使用base64来退出,例如[ERD45FTR]。并且 zlib.crc32 提供更新选项。
import os, sys
import zlib
import base64
def crc(fileName):
fd = open(fileName,"rb")
content = fd.readlines()
fd.close()
prev = None
for eachLine in content:
if not prev:
prev = zlib.crc32(eachLine)
else:
prev = zlib.crc32(eachLine, prev)
return prev
for eachFile in sys.argv[1:]:
print base64.b64encode(str(crc(eachFile)))
解决方案:
import os, sys
import zlib
def crc(fileName, excludeLine="", includeLine=""):
try:
fd = open(fileName,"rb")
except IOError:
print "Unable to open the file in readmode:", filename
return
eachLine = fd.readline()
prev = None
while eachLine:
if excludeLine and eachLine.startswith(excludeLine):
continue
if not prev:
prev = zlib.crc32(eachLine)
else:
prev = zlib.crc32(eachLine, prev)
eachLine = fd.readline()
fd.close()
return format(prev & 0xFFFFFFFF, '08x') #returns 8 digits crc
for eachFile in sys.argv[1:]:
print crc(eachFile)
不太清楚什么是 (excludeLine="", includeLine="")...