Python - Download an entire directory from Google Cloud Storage

Question (votes: 0, answers: 7)

The following page

https://googlecloudplatform.github.io/google-cloud-python/latest/storage/blobs.html

lists all the API calls available for Python and Google Cloud Storage. Yet even among the "official" samples on GitHub,

https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/storage/cloud-client/snippets.py

there is no example for this.

Finally, downloading a directory with the same method used for downloading a single file raises an error:

Error:  [Errno 21] Is a directory:
python google-cloud-storage
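
For context, the error usually comes from the local side: download_to_filename() opens its argument with open(path, 'wb'), so passing a path that is an existing local directory fails. A minimal sketch of how this happens (the bucket and object names here are hypothetical):

from google.cloud import storage

client = storage.Client()
bucket = client.get_bucket('some-bucket')    # hypothetical bucket
blob = bucket.blob('some/dir/file.txt')      # hypothetical object

# '/tmp' is a directory, so the underlying open(..., 'wb') raises
# IsADirectoryError: [Errno 21] Is a directory
blob.download_to_filename('/tmp')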
7 Answers
36 votes

You just need to list all the files under the directory first, then download them one by one:

from google.cloud import storage

bucket_name = 'your-bucket-name'
prefix = 'your-bucket-directory/'
dl_dir = 'your-local-directory/'

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # List all objects under the prefix
for blob in blobs:
    filename = blob.name.replace('/', '_')  # Flatten the object path into a single filename
    blob.download_to_filename(dl_dir + filename)  # Download each object

blob.name contains the whole directory structure plus the filename, so if you want the same filename as in the bucket, you may need to extract it first (instead of replacing / with _).

9 votes

If you want to keep the same directory structure without renaming, and have nested folders created for you: for Python 3.5+, here is a solution based on @ksbg's answer:

from pathlib import Path
from google.cloud import storage

bucket_name = 'your-bucket-name'
prefix = 'your-bucket-directory/'
dl_dir = 'your-local-directory/'

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # List all objects under the prefix
for blob in blobs:
    if blob.name.endswith("/"):  # Skip zero-byte "folder" placeholder objects
        continue
    file_split = blob.name.split("/")
    directory = dl_dir + "/".join(file_split[0:-1])
    Path(directory).mkdir(parents=True, exist_ok=True)  # Create nested folders as needed
    blob.download_to_filename(dl_dir + blob.name)  # Keep the bucket's directory structure

2 votes

Say we want to download FINALFOLDER from the storage path gs://TEST_BUCKET_NAME/FOLDER1/FOLDER2/FINALFOLDER. After the download completes, the final local path will be D:\my_blob_data\FINALFOLDER:

from os import makedirs
from os.path import isdir, basename
from google.cloud import storage

# If your environment is already authenticated, the default config is picked up
storage_client = storage.Client()  # comment this line out if you use a service account

# Uncomment the line below if you have a service account JSON key
# storage_client = storage.Client.from_service_account_json('creds/sa.json')

bucket_name = 'TEST_BUCKET_NAME'
prefix = 'FOLDER1/FOLDER2'  # The prefix must match the object path down to FINALFOLDER
dst_path = 'D:\\my_blob_data'

if not isdir(dst_path):
    makedirs(dst_path)

bucket = storage_client.bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # List all objects under the prefix
for blob in blobs:
    blob_name = blob.name
    dst_file_name = blob_name.replace('FOLDER1/FOLDER2', dst_path)  # e.g. 'D:\\my_blob_data/FINALFOLDER/...'
    # Extract the final directory and create it under the destination path if it does not exist
    dst_dir = dst_file_name.replace('/' + basename(dst_file_name), '')
    if not isdir(dst_dir):
        makedirs(dst_dir)
    # Download the blob object
    blob.download_to_filename(dst_file_name)

0 votes

Using the TensorFlow gfile package, here is a recursive function:

  • root_dir is the GCS parent folder.
  • local_base_dir is the parent folder created locally.

import os
import tensorflow as tf

def copy_recursively(root_dir, local_base_dir):
    # Start from a clean local directory
    if tf.io.gfile.exists(local_base_dir):
        tf.io.gfile.rmtree(local_base_dir)
    tf.io.gfile.mkdir(local_base_dir)

    file_list = tf.io.gfile.glob(root_dir + '/**')

    for item in file_list:
        if not tf.io.gfile.isdir(item):
            fname = item.rsplit('/', 1)[-1]
            if not fname.startswith('.'):  # Skip hidden files
                tf.io.gfile.copy(item,
                                 os.path.join(local_base_dir, fname),
                                 overwrite=False)
        else:
            # Recurse into each child directory
            child_dir = item.rsplit('/', 1)[-1]
            full_dir_path = os.path.join(local_base_dir, child_dir)
            print(f"Setting up child directory: {full_dir_path}")
            copy_recursively(item, full_dir_path)

root_dir = 'gs://.../.../..'  # GCS parent folder
local_base_dir = root_dir.rsplit('/', 1)[-1]

copy_recursively(root_dir, local_base_dir)

0 votes

Download all files and subdirectories under a parent directory locally as a single zip, and upload it to any GCS bucket.

Hope this code helps you as well.

import io
from google.cloud import storage
from zipfile import ZipFile, ZipInfo
from datetime import datetime

# The ID of your GCS bucket
bucket_name = "SOURCE_BUCKET"

# The prefix ("folder") of your GCS objects
prefix = 'Fold1/'

archive = io.BytesIO()
with ZipFile(archive, 'w') as zipf:
    storage_client = storage.Client()
    source_bucket = storage_client.get_bucket(bucket_name)
    blobs = source_bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        if blob.name.endswith("/"):  # Skip zero-byte "folder" placeholder objects
            continue
        data = blob.download_as_bytes()
        zipf.writestr(ZipInfo(blob.name), data)
archive.seek(0)  # Rewind the buffer before reading it back

now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H:%M:%S")
object_name = "Fold1_" + dt_string + ".zip"


##### download to local
with open(object_name, 'wb') as f:
    f.write(archive.getvalue())

##### upload to any bucket
target_bucket = "TARGET_BUCKET"
bucket = storage_client.get_bucket(target_bucket)
blob = storage.Blob(object_name, bucket)
blob.upload_from_file(archive, content_type='application/zip')
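
If the folder is large, holding the entire archive in a BytesIO buffer may be a concern. A possible variation, sketched under the same bucket/prefix/object_name assumptions as above, spools the zip through a temporary file instead and uploads it with upload_from_filename():

import tempfile
from zipfile import ZipFile

with tempfile.NamedTemporaryFile(suffix='.zip') as tmp:
    with ZipFile(tmp, 'w') as zipf:
        for blob in source_bucket.list_blobs(prefix=prefix):
            if blob.name.endswith('/'):
                continue
            zipf.writestr(blob.name, blob.download_as_bytes())
    tmp.flush()
    target = storage_client.get_bucket(target_bucket)
    target.blob(object_name).upload_from_filename(tmp.name)

(On Windows, a NamedTemporaryFile cannot be reopened by name while it is still open, so a delete=False temporary file would be needed there.)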

0 votes

Recursively download all folders, preserving the same relative layout as the source GCS directory:

import os
import shutil
from google.cloud import storage

def download_gcs_folder_recursively_to_local(blob_folder_path, destination_folder_path, gcs_project_name, gcs_bucket_name):

    if not blob_folder_path.endswith("/"):
        blob_folder_path = blob_folder_path + "/"
    if not destination_folder_path.endswith("/"):
        destination_folder_path = destination_folder_path + "/"

    storage_client = storage.Client(gcs_project_name)
    bucket = storage_client.get_bucket(gcs_bucket_name)
    blobs = bucket.list_blobs(prefix=blob_folder_path)
    os.makedirs("tmp_cp_folder", exist_ok=True)  # Scratch folder for in-flight downloads

    for blob in blobs:
        if blob.name.endswith("/"):  # Skip zero-byte "folder" placeholder objects
            continue
        tmp_filename = blob.name.replace('/', '_')
        relative_file_path = blob.name[len(blob_folder_path):]
        relative_file_parent_folder = "" if len(relative_file_path.split("/")) == 1 else relative_file_path.rsplit('/', 1)[0]
        os.makedirs(f"{destination_folder_path}{relative_file_parent_folder}", exist_ok=True)
        blob.download_to_filename(f"tmp_cp_folder/{tmp_filename}")
        # shutil.move is portable and handles paths with spaces, unlike os.system("mv ...")
        shutil.move(f"tmp_cp_folder/{tmp_filename}", f"{destination_folder_path}{relative_file_path}")

    os.removedirs("tmp_cp_folder")

Based on one of the earlier solutions: https://stackoverflow.com/a/49749281


-2 votes

See this link - https://medium.com/@sandeepsinh/multiple-file-download-form-google-cloud-storage-using-python-and-gcs-api-1dbcab23c44

1 - Add your credentials JSON
2 - List the bucket items
3 - Download

import logging
import os
from google.cloud import storage

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
bucket_name = 'mybucket'
table_id = 'shakespeare'
storage_client = storage.Client.from_service_account_json('/google-cloud/keyfile/service_account.json')
# The "folder" where the files you want to download are
folder = '/google-cloud/download/{}'.format(table_id)
delimiter = '/'
bucket = storage_client.get_bucket(bucket_name)
blobs = bucket.list_blobs(prefix=table_id, delimiter=delimiter)  # List all objects that satisfy the filter

# Download the files to a destination
def download_to_local():
    logging.info('File download started... Wait for the job to complete.')
    # Create the destination folder locally if it does not exist
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Iterate through the blobs one by one
    for blob in blobs:
        logging.info('Blobs: {}'.format(blob.name))
        destination_uri = '{}/{}'.format(folder, blob.name)
        blob.download_to_filename(destination_uri)
        logging.info('Exported {} to {}'.format(blob.name, destination_uri))

if __name__ == '__main__':
    download_to_local()