我已将数据存储在 Azure 数据湖（ADLS）中的不同文件夹和子文件夹中，想统计所存数据的大小。下面是一个可以在 Azure Databricks 中运行的函数，它以递归方式遍历目录，计算出数据的总大小。
有效的代码 -
%python
# Specify the root path to your ADLS Gen2 container
root_path = “abfss://<container-name>@<storage-account>.dfs.core.windows.net/<Path>”
# Function to calculate the size of a directory and its subdirectories recursively
def calculate_directory_size(directory_path):
total_size = 0
for file_info in dbutils.fs.ls(directory_path):
if file_info.isDir():
total_size += calculate_directory_size(file_info.path)
else:
total_size += file_info.size
return total_size
# List all directories within the root path
directories = [f.path for f in dbutils.fs.ls(root_path) if f.isDir()]
# Calculate and print the size of each directory and its subdirectories
for directory in directories:
directory_name = directory.split("/")[-1]
print("directory", directory)
directory_size = calculate_directory_size(directory)
print(f"Data volume in {directory_name}: {directory_size} bytes")
# Convert bytes to gigabytes (GB) for readability
directory_size_gb = directory_size / (1024 ** 3)
print(f"Data volume in {directory_name}: {directory_size_gb:.5f} GB")
我最初尝试的代码因为路径字符串使用了全角（弯）引号等问题而报错。下面的代码可以正常工作 -
%python
# Specify the root path to your ADLS Gen2 container
root_path = “abfss://<container-name>@<storage-account>.dfs.core.windows.net/<Path>”
# Function to calculate the size of a directory and its subdirectories recursively
def calculate_directory_size(directory_path):
total_size = 0
for file_info in dbutils.fs.ls(directory_path):
if file_info.isDir():
total_size += calculate_directory_size(file_info.path)
else:
total_size += file_info.size
return total_size
# List all directories within the root path
directories = [f.path for f in dbutils.fs.ls(root_path) if f.isDir()]
# Calculate and print the size of each directory and its subdirectories
for directory in directories:
directory_name = directory.split("/")[-1]
print("directory", directory)
directory_size = calculate_directory_size(directory)
print(f"Data volume in {directory_name}: {directory_size} bytes")
# Convert bytes to gigabytes (GB) for readability
directory_size_gb = directory_size / (1024 ** 3)
print(f"Data volume in {directory_name}: {directory_size_gb:.5f} GB")