解压缩文件并在 python 中重命名它们

问题描述 投票:0回答:0

在 Azure Synapse 笔记本中解压缩后尝试重命名文件,但在重命名之前我一直成功(代码在 rename_file 方法的最后一行失败)


from azure.storage.filedatalake import DataLakeServiceClient 

import os 
import shutil
import io
import zipfile

class FileTransformer:
    def __init__(self,sourcePath,sourceName,solutionFolder):
        #class variables
        self.source_path = sourcePath
        self.source_name = sourceName
        self.sink_path = f"0_unzipped/{sourcePath}/{sourceName}"
        self.solution_folder = solutionFolder

    def access_source(self,credentials):
        dl_service_client_source = DataLakeServiceClient(f"https://{'lake'}.dfs.core.windows.net", credential=credential)
        dl_file_system_client_source = dl_service_client_source.get_file_system_client("raw")
        dl_directory_client_source = dl_file_system_client_source.get_directory_client(self.source_path)
        dl_source_file_client = dl_directory_client_source.get_file_client(self.source_name) 
        return dl_source_file_client

    
    def extract_data(self,credentials):
        compressed_data = io.BytesIO()
       # self.access_sink(credentials)
        self.access_source(credentials).download_file().readinto(compressed_data)

        mounts = []

        for mount in mssparkutils.fs.mounts():
            mounts.append(mount.mountPoint)
        
        if "/sink" not in mounts:
            mssparkutils.fs.mount(
                f"abfss://{'refined'}@{'lake'}.dfs.core.windows.net/{self.solution_folder}",
                "/sink",
                {"linkedService": "ls_lake"}
            )
        
        
        extraction_path = f"/synfs/{mssparkutils.env.getJobId()}/sink/{self.sink_path}"
        with zipfile.ZipFile(compressed_data) as zip_file:
            zip_file.extractall(extraction_path)

        dl_service_client_source = DataLakeServiceClient(f"https://{'lake'}.dfs.core.windows.net", credential=credential)
        dl_file_system_client_source = dl_service_client_source.get_file_system_client("refined")
        dl_directory_client_source = dl_file_system_client_source.get_file_client(self.solution_folder)  

        files1 = (self.solution_folder + '/' + self.sink_path)
        #.replace(" ", "\\ ")
        print(files1)
      #  files = dl_file_system_client_source.get_paths(path=files1)
        files = dl_file_system_client_source.get_paths(path=files1)
      #  print(self.solution_folder)
       # print(self.source_path)
       # print(self.source_name)
       # print(files)
      #  print(f"files: {files}")

        for file in files:
            if not file.is_directory:
                file_client = dl_file_system_client_source.get_file_client(file.name)
               # file_client = dl_directory_client_source.get_file_client(file_system=dl_file_system_client_source,file_path=file.name)
               # file_client = dl_directory_client_source.get_file_client(file.name)
                dir_path, file_name1 = os.path.split(file.name)
                file_name, file_extension = os.path.splitext(file.name)
                print(f"file_name: {file_name}")
                print(f"files.name: {file.name}")
              #  print(f"file.name: {file.name}")
             #   print(f"file_name: {file_name}")
             #   print(f"file_extension: {file_extension}")
                new_name = f'test{file_extension}'
             #   print(f"new_name: {new_name}")
                new_file_path = os.path.join(dir_path, new_name)
                print(f"new_file_path: {new_file_path}")
              #  print(f"solution_folder: {self.solution_folder}")
                file_client.rename_file(new_file_path)


######

client_id = get_secret(secret_loc="ls_keyvault", secret_name="client-id")
client_secret = get_secret(secret_loc="ls_keyvault", secret_name="app-secret")

credential = ClientSecretCredential(
                tenant_id="test.onmicrosoft.com",
                client_id=client_id,
                client_secret=client_secret
            )

file = FileTransformer("Path3/Path4/Path5/Path6","FolderName","Path1/Path2")
file.extract_data(credential)

一切都在正确打印:new_file_path、new_name、extension 等,但在类的最后一行代码中仍然收到错误 “file_client.rename_file(new_file_path)”

完整引用:

HttpResponseError                         Traceback (most recent call last)
/tmp/ipykernel_20528/2509103003.py in <module>
    107 
    108 file = FileTransformerFileTransformer("Path3/Path4/Path5/Path6","FolderName","Path1/Path2")
--> 109 file.extract_data(credential)

/tmp/ipykernel_20528/2509103003.py in extract_data(self, credentials)
     78                 print(f"new_file_path: {new_file_path}")
     79               #  print(f"solution_folder: {self.solution_folder}")
---> 80                 file_client.rename_file(new_file_path)
     81 
     82 

~/cluster-env/clonedenv/lib/python3.8/site-packages/azure/storage/filedatalake/_data_lake_file_client.py in rename_file(self, new_name, **kwargs)
    840             _location_mode=self._location_mode
    841         )
--> 842         new_file_client._rename_path(  # pylint: disable=protected-access
    843             '/{}/{}{}'.format(quote(unquote(self.file_system_name)),
    844                               quote(unquote(self.path_name)),

~/cluster-env/clonedenv/lib/python3.8/site-packages/azure/storage/filedatalake/_path_client.py in _rename_path(self, rename_source, **kwargs)
    860             return self._client.path.create(**options)
    861         except HttpResponseError as error:
--> 862             process_storage_error(error)
    863 
    864     def _get_path_properties(self, **kwargs):

~/cluster-env/clonedenv/lib/python3.8/site-packages/azure/storage/filedatalake/_deserialize.py in process_storage_error(storage_error)
    206     try:
    207         # `from None` prevents us from double printing the exception (suppresses generated layer error context)
--> 208         exec("raise error from None")   # pylint: disable=exec-used # nosec
    209     except SyntaxError:
    210         raise error

~/cluster-env/clonedenv/lib/python3.8/site-packages/azure/storage/filedatalake/_deserialize.py in <module>

HttpResponseError: (OutOfRangeInput) The specified resource name length is not within the permissible limits.
RequestId:6ef9e79f-a01f-0067-5cb0-62b105000000
Time:2023-03-30T02:34:49.8798540Z
Code: OutOfRangeInput
Message: The specified resource name length is not within the permissible limits.
RequestId:6ef9e79f-a01f-0067-5cb0-62b105000000
Time:2023-03-30T02:34:49.8798540Z
 
python azure azure-synapse
© www.soinside.com 2019 - 2024. All rights reserved.