I am trying to read a very large zip file from an S3 bucket and extract its data into another S3 bucket, using the code below as a Lambda function:
import json
import boto3
from io import BytesIO
import zipfile

def lambda_handler(event, context):
    s3_resource = boto3.resource('s3')
    source_bucket = 'bucket1'
    target_bucket = 'bucket2'
    my_bucket = s3_resource.Bucket(source_bucket)

    for file in my_bucket.objects.all():
        if str(file.key).endswith('.zip'):
            zip_obj = s3_resource.Object(bucket_name=source_bucket, key=file.key)
            # This reads the whole zip object into memory
            buffer = BytesIO(zip_obj.get()["Body"].read())
            z = zipfile.ZipFile(buffer)
            for filename in z.namelist():
                file_info = z.getinfo(filename)
                try:
                    response = s3_resource.meta.client.upload_fileobj(
                        z.open(filename),
                        Bucket=target_bucket,
                        Key=f'{filename}'
                    )
                except Exception as e:
                    print(e)
        else:
            print(file.key + ' is not a zip file.')
The problem is that this code reads the entire file into memory, and I get a MemoryError.
I was wondering whether there is a more efficient way to do this, like reading the file in chunks?
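(For context, I know the S3 body itself can be iterated in chunks, since botocore's StreamingBody has iter_chunks(); a rough sketch is below, with placeholder bucket and key names. But zipfile.ZipFile needs a seekable file, so I don't see how to combine that with the extraction.)

import boto3

# Rough sketch of chunked reading; 'bucket1' and the key are placeholders.
s3_client = boto3.client('s3')
body = s3_client.get_object(Bucket='bucket1', Key='path/to/archive.zip')['Body']
for chunk in body.iter_chunks(chunk_size=64 * 1024):
    # each chunk is a bytes object of at most 64 KiB, but zipfile.ZipFile
    # cannot consume these directly because it needs random access
    pass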
Thanks.
Following John Rotenstein's comment, I found a way myself using httpx and stream-unzip:
import boto3
from stream_unzip import stream_unzip
import httpx
from io import BytesIO

s3_client = boto3.client('s3')
bucket_name = 'bucket_name'

def zipped_chunks():
    # Stream the zip straight from S3 over HTTP in 64 KiB chunks
    with httpx.stream('GET', 'https://bucket.s3.amazonaws.com/path/file.zip') as r:
        yield from r.iter_bytes(chunk_size=65536)

for file_name, file_size, unzipped_chunks in stream_unzip(zipped_chunks()):
    # stream_unzip yields the member name as bytes, so decode it for the key
    s3_key = f'unzipped/{file_name.decode()}'
    buffer = BytesIO()
    for chunk in unzipped_chunks:
        buffer.write(chunk)
    buffer.seek(0)
    s3_client.upload_fileobj(buffer, bucket_name, s3_key)
    buffer.close()
    print(f"File '{file_name.decode()}' has been uploaded to S3 bucket '{bucket_name}' with key '{s3_key}'")