如果文件中有类似目录结构的列表,如何使用 pandas 从 blob 读取文件?

问题描述 投票:0回答:2

尝试使用 pandas 的 read_csv 方法访问存储在 Azure Data Lake Storage 中的文件,但由于文件夹路径中的 ['D'] 被解释为列表而出错。有什么方法可以明确告诉 pandas 这是路径的一部分而不是列表?问题出在路径中的 "Type - ['D']" 这一段。

import pandas as pd

datalake_connection_string = "<connection_string_for_the_container>"

data=pd.read_csv(f"abfs://container_name@storage_account_name.blob.core.windows.net/OutputFiles/CodeOutputs/Type - ['D']/SOLUTIONS/summary.csv",storage_options={"connection_string": datalake_connection_string})

我面临的错误如下:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Input In [14], in <cell line: 1>()
----> 1 data=pd.read_csv(f"abfs://container_name@storage_account_name.blob.core.windows.net/OutputFiles/CodeOutputs/Type - ['D']/SOLUTIONS/summary.csv",storage_options={"connection_string": ADLSConnectionString})

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    946     defaults={"delimiter": ","},
    947 )
    948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
    602 _validate_names(kwds.get("names", None))
    604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
    607 if chunksize or iterator:
    608     return parser

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
   1439     self.options["has_index_names"] = kwds["has_index_names"]
   1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
   1733     if "b" not in mode:
   1734         mode += "b"
-> 1735 self.handles = get_handle(
   1736     f,
   1737     mode,
   1738     encoding=self.options.get("encoding", None),
   1739     compression=self.options.get("compression", None),
   1740     memory_map=self.options.get("memory_map", False),
   1741     is_text=is_text,
   1742     errors=self.options.get("encoding_errors", "strict"),
   1743     storage_options=self.options.get("storage_options", None),
   1744 )
   1745 assert self.handles is not None
   1746 f = self.handles.handle

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    710     codecs.lookup_error(errors)
    712 # open URLs
--> 713 ioargs = _get_filepath_or_buffer(
    714     path_or_buf,
    715     encoding=encoding,
    716     compression=compression,
    717     mode=mode,
    718     storage_options=storage_options,
    719 )
    721 handle = ioargs.filepath_or_buffer
    722 handles: list[BaseBuffer]

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/common.py:409, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    406     pass
    408 try:
--> 409     file_obj = fsspec.open(
    410         filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
    411     ).open()
    412 # GH 34626 Reads from Public Buckets without Credentials needs anon=True
    413 except tuple(err_types_to_retry_with_anon):

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/fsspec/core.py:441, in open(urlpath, mode, compression, encoding, errors, protocol, newline, **kwargs)
    391 def open(
    392     urlpath,
    393     mode="rb",
   (...)
    399     **kwargs,
    400 ):
    401     """Given a path or paths, return one ``OpenFile`` object.
    402 
    403     Parameters
   (...)
    439     ``OpenFile`` object.
    440     """
--> 441     return open_files(
    442         urlpath=[urlpath],
    443         mode=mode,
    444         compression=compression,
    445         encoding=encoding,
    446         errors=errors,
    447         protocol=protocol,
    448         newline=newline,
    449         expand=False,
    450         **kwargs,
    451     )[0]

File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/fsspec/core.py:195, in OpenFiles.__getitem__(self, item)
    194 def __getitem__(self, item):
--> 195     out = super().__getitem__(item)
    196     if isinstance(item, slice):
    197         return OpenFiles(out, mode=self.mode, fs=self.fs)

IndexError: list index out of range

任何人都可以为此提出解决方法吗?

python pandas azure blob azure-data-lake-gen2
2个回答
0
投票

我会尝试使用 os.path.join 之类的东西将其分解。这对我有用(在 Linux 上):

import pandas as pd
import os
srcFile = r"file.csv"
srcFolder = r"/projects/proj_42_python/Type - ['D']/"
srcPath = os.path.join(srcFolder,srcFile)
pd.read_csv(srcPath)

0
投票

我在我的环境中尝试并得到以下结果:

如果文件路径中的目录名包含类似列表的字符(例如 ['D']),您可以使用下面的代码通过 pandas 从 blob 中读取文件。

在该路径中,Type - ['D'] 里的每个引号前都应加一个反斜杠进行转义(并对方括号等特殊字符做 URL 编码),这样才能解析到正确的路径。您可以从 Azure 门户获取 Blob SAS URL:

代码:

import pandas as pd 
source ='https://storage6780.blob.core.windows.net/test/OutputFiles/CodeOutputs/Type%20-%20%5B\'D\'%5D/SOLUTIONS/sample.csv?<Your SAS Token>'
df = pd.read_csv(source,encoding="ISO-8859-1")
print(df)

输出:

(此处原文为输出结果的截图,抓取时图片未能保留)

参考:

命名和引用容器、Blob 和元数据 - Azure 存储 |微软学习

© www.soinside.com 2019 - 2024. All rights reserved.