python - bz2 未正确重新压缩

Question

以下代码能够读取用 bzip2 压缩的文件：

# Byte position in the file at which block parsing starts
offset = 24

# Read the whole file; the context manager closes the handle even if
# reading raises.
with open(filey, 'rb') as fobj:
    buffer = fobj.read()

# Decompress every bz2 block found from `offset` onwards
buffer_unbzip, places_to_bzip = bzip_blocks_decompress_all(buffer, offset)

其中 bzip_blocks_decompress_all 函数定义如下：

def bzip_blocks_decompress_all(data, offset):
    """Decompress consecutive length-prefixed bz2 blocks found in ``data``.

    Starting at ``offset``, each block is a 4-byte big-endian signed size
    (its absolute value is used) followed by that many bytes of bz2 data.

    Returns a 2-tuple:
      * a bytearray with the decompressed blocks concatenated in order;
      * a list of ``[start, end]`` pairs giving where each block's
        compressed payload sits inside ``data`` (not inside the
        decompressed result).
    """
    import bz2

    decompressed = bytearray()
    compressed_spans = []
    total = len(data)
    cursor = offset
    while cursor < total:
        payload_start = cursor + 4
        # The size field is read as signed, then its magnitude is taken.
        size = abs(int.from_bytes(data[cursor:payload_start], 'big', signed=True))
        payload_end = payload_start + size
        decompressed.extend(bz2.decompress(data[payload_start:payload_end]))
        compressed_spans.append([payload_start, payload_end])
        cursor = payload_end

    return decompressed, compressed_spans

这样我就得到了数据中各个 bzip 压缩块的位置（places_to_bzip）。因此我的想法是，应该可以像下面这样做：

# Try to compress using bz2 just based on some of the places_to_bzip
# places_to_bzip[0] is the [start, end] pair recorded for the first block;
# bzip_blocks_decompress_all measures those offsets in the compressed input.
a1 = buffer[places_to_bzip[0][0]:places_to_bzip[0][1]]
# The same compressed-input offsets are reused here to slice the
# decompressed output.
a2 = buffer_unbzip[places_to_bzip[0][0]:places_to_bzip[0][1]]

# Convert a2 back to a1 with a bzip compression
# NOTE(review): bz2.compress defaults to compresslevel=9, while the a1 dump
# shown on this page begins with b'BZh5' - presumably the original stream was
# written with a different level; confirm before expecting identical bytes.
a3 = bz2.compress(a2)
# Print the three lengths side by side for comparison
print(len(a1))
print(len(a2))
print(len(a3))

104
104
70

为什么不能正确重新压缩？以下是用于测试的 a1 和 a2 的输出：

print(a1)
b'BZh51AY&SY\xe6\xb1\xacS\x00\x00\x02_\xab\xfe(@\x00\x10\x00@\x04\x00@\x00@\x800\x02\x00\x00\x01\x00@\x08\x00\x00\x18 \x00T4\x8d\x004\x01\xa0\x91(\x01\x90\xd3\xd2\x14\xac\xd6v\x85\xf0\x0fD\x85\xc3A}\xe09\xbc\xe1\x8b\x04Y\xbfb$"\xcc\x13\xc0B\r\x99\xf1Qa%S\x00|]\xc9\x14\xe1BC\x9a\xc6\xb1L'

print(a2)
bytearray(b'\x00\x0b\x00\x02\x05z\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00X\x00\x00\x00\x00\x002\x04@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01h\x00\x00\x00\x00\x002\x04@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')

Answer 1

正如我在评论中所说，`buffer_unbzip` 仅包含解压缩后的数据，而 `places_to_bzip` 中的偏移量是各切片在原始压缩数据中的开始/结束偏移量。各解压缩帧在 `buffer_unbzip` 中的偏移量并没有被记录下来。

下面我对输入文件的格式进行了逆向工程，并生成了一个示例文件，然后使用 OP 的代码来提取数据。代码经过修改，还会返回每个解压帧在解压结果中的开始/结束偏移量；随后遍历这些偏移量，逐帧重新压缩，并与原始的压缩数据进行比较：

import bz2
import struct

### Reproducible input file example ###
def write_frame(f, data):
    """Write ``data`` to ``f`` as one length-prefixed bz2 frame.

    The frame is a 4-byte big-endian unsigned integer holding the size of
    the compressed payload, immediately followed by the payload itself.
    Both parts go out in a single ``f.write`` call.
    """
    payload = bz2.compress(data)
    size_prefix = struct.pack('>L', len(payload))
    f.write(size_prefix + payload)

# Build the sample input: 24 filler bytes, then four length-prefixed bz2
# frames, each holding a run of a single repeated byte.
with open('file.bin', 'wb') as f:
    f.write(b'A' * 24)  # stand-in for a header in the original data?
    for fill, count in ((b'B', 50), (b'C', 25), (b'D', 30), (b'E', 12)):
        write_frame(f, fill * count)
### END ###

# Load the entire sample file into memory; the handle is closed on exit
with open('file.bin', 'rb') as fobj:
    buffer = fobj.read()

# Frame parsing starts past the first 24 bytes of the file
offset = 24

def bzip_blocks_decompress_all(data, offset):
    """Decompress consecutive length-prefixed bz2 frames found in ``data``.

    Starting at ``offset``, each frame is a 4-byte big-endian unsigned
    size followed by that many bytes of bz2 data.

    Returns a 3-tuple:
      * a bytearray with the decompressed frames concatenated in order;
      * ``[start, end]`` pairs locating each frame inside that bytearray;
      * ``[start, end]`` pairs locating each frame's compressed payload
        inside ``data``.
    """
    # '>L' is a big-endian 4-byte unsigned integer; a length is never
    # negative, so no signed read or abs() is used here.
    size_field = struct.Struct('>L')

    decoded = bytearray()
    plain_spans = []
    packed_spans = []
    cursor = offset
    while cursor < len(data):
        # unpack_from returns a tuple of the struct's fields (one here).
        (packed_len,) = size_field.unpack_from(data, cursor)
        body_start = cursor + size_field.size
        body_end = body_start + packed_len

        plain_start = len(decoded)
        decoded += bz2.decompress(data[body_start:body_end])

        plain_spans.append([plain_start, len(decoded)])
        packed_spans.append([body_start, body_end])
        cursor = body_end

    return decoded, plain_spans, packed_spans

# Decompress every frame, keeping the offsets of each frame in both buffers
buffer_unbzip, places_to_bzip, places_to_unbzip = bzip_blocks_decompress_all(buffer, offset)
print(f'{buffer=}')
print(f'{buffer_unbzip=}')

# Round-trip check, one frame at a time: re-compress the decompressed
# bytes and compare them with the compressed bytes stored in the file.
for plain_span, packed_span in zip(places_to_bzip, places_to_unbzip):
    a1 = buffer[slice(*packed_span)]        # compressed bytes as stored
    a2 = buffer_unbzip[slice(*plain_span)]  # matching decompressed bytes

    a3 = bz2.compress(a2)
    print(a1 == a3, a2)

输出：

buffer=b"AAAAAAAAAAAAAAAAAAAAAAAA\x00\x00\x00'BZh91AY&SY?\xbf\xc2\x8b\x00\x00\x02\x14\x00\x00\x01\x10\x00 \x00!\x00\x82\x0b\x17rE8P\x90?\xbf\xc2\x8b\x00\x00\x00'BZh91AY&SY\x0b\xc7\x94'\x00\x00\x02$\x00\x02\x00\x08\x00 \x00!\x00\x82\x0b\x17rE8P\x90\x0b\xc7\x94'\x00\x00\x00'BZh91AY&SYX\xf3\xe3\x91\x00\x00\x02$\x00\x00\x10\x04\x00 \x00!\x00\x82\x0b\x17rE8P\x90X\xf3\xe3\x91\x00\x00\x00'BZh91AY&SY\xb6\xa1w{\x00\x00\x02D\x00\x00@\x02\x00 \x00!\x00\x82\x0b\x17rE8P\x90\xb6\xa1w{"
buffer_unbzip=bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDEEEEEEEEEEEE')
True bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB')
True bytearray(b'CCCCCCCCCCCCCCCCCCCCCCCCC')
True bytearray(b'DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD')
True bytearray(b'EEEEEEEEEEEE')

python - bz2 未正确重新压缩

问题描述投票：0回答：1

1个回答

最新问题

python - bz2 未正确重新压缩

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1