从 HDF5 文件提取数据到 Pandas Dataframe

问题描述 投票:0回答:1

首先,感谢您提供的空间。

我来这里寻求帮助,因为我需要从一个文件夹中的多个 .HDF5 文件中提取数据,并将它们放入一个唯一的 Pandas 数据框中。列是 ['H1', 'L1', 'Frequency_Hz']。

我在 Google 云端硬盘上共享了这些文件,以便轻松访问: https://drive.google.com/drive/folders/1GwocMZeqZGyikZYgwvGvnnsVNn0B7aNW?usp=sharing

我的代码如下,包含我的编程逻辑,但它返回以下错误。我感谢任何帮助!

# Imports
import glob
import h5py
import numpy as np
import pandas as pd

# Create a list to store the DataFrames of each HDF5 file:
dfs = []

# Get the list of training HDF5 files using glob:
# NOTE(review): the listing below shows Windows-style backslash separators in
# the returned paths; h5py.File opens them as-is, so no normalization is done.
arquivos_hdf5_treino = glob.glob('../Data/OriginalDatasets/train/*.hdf5')

# View the file list:
# (bare expression — relies on notebook/REPL auto-echo to display the list)
arquivos_hdf5_treino
['../Data/OriginalDatasets/train\\001121a05.hdf5',
 '../Data/OriginalDatasets/train\\00a6db666.hdf5',
 '../Data/OriginalDatasets/train\\00f36a6ac.hdf5',
 '../Data/OriginalDatasets/train\\0197bacf8.hdf5',
 '../Data/OriginalDatasets/train\\01b8b67f3.hdf5',
 '../Data/OriginalDatasets/train\\01dba9731.hdf5',
 '../Data/OriginalDatasets/train\\02887d232.hdf5',
 '../Data/OriginalDatasets/train\\02c8f43f3.hdf5',
 '../Data/OriginalDatasets/train\\0367dc82c.hdf5',
 '../Data/OriginalDatasets/train\\0517ef7fe.hdf5',
 '../Data/OriginalDatasets/train\\05c0675fe.hdf5',
 '../Data/OriginalDatasets/train\\05cdc0769.hdf5',
 '../Data/OriginalDatasets/train\\05f0aef12.hdf5',
 '../Data/OriginalDatasets/train\\067b3fb4b.hdf5',
 '../Data/OriginalDatasets/train\\06e321c6e.hdf5',
 '../Data/OriginalDatasets/train\\08a060dad.hdf5',
 '../Data/OriginalDatasets/train\\08c444d66.hdf5',
 '../Data/OriginalDatasets/train\\0920a4276.hdf5',
 '../Data/OriginalDatasets/train\\09531cde3.hdf5',
 '../Data/OriginalDatasets/train\\097370861.hdf5',
 '../Data/OriginalDatasets/train\\09e55aeba.hdf5',
 '../Data/OriginalDatasets/train\\09ecddbba.hdf5',
 '../Data/OriginalDatasets/train\\0ba188c57.hdf5',
 '../Data/OriginalDatasets/train\\0bc8216f2.hdf5',
 '../Data/OriginalDatasets/train\\0c55d030c.hdf5',
 '../Data/OriginalDatasets/train\\0d0ad0b19.hdf5',
 '../Data/OriginalDatasets/train\\0dc4c8ed0.hdf5',
 '../Data/OriginalDatasets/train\\0e39a18bf.hdf5',
 '../Data/OriginalDatasets/train\\0e60d4893.hdf5',
 '../Data/OriginalDatasets/train\\0e66d0460.hdf5',
 '../Data/OriginalDatasets/train\\0eb30f7c4.hdf5',
 '../Data/OriginalDatasets/train\\0ebe28dd5.hdf5',
 '../Data/OriginalDatasets/train\\0f53d8b96.hdf5',
 '../Data/OriginalDatasets/train\\10dfa2ed6.hdf5',
 '../Data/OriginalDatasets/train\\10eaa1cb2.hdf5',
 '../Data/OriginalDatasets/train\\1185806d8.hdf5',
 '../Data/OriginalDatasets/train\\119610501.hdf5',
 '../Data/OriginalDatasets/train\\123594dc7.hdf5',
 '../Data/OriginalDatasets/train\\1282f6c1f.hdf5',
 '../Data/OriginalDatasets/train\\12f0fd6fd.hdf5',
 '../Data/OriginalDatasets/train\\12f9824fa.hdf5',
 '../Data/OriginalDatasets/train\\13a23148f.hdf5',
 '../Data/OriginalDatasets/train\\13df1746e.hdf5',
 '../Data/OriginalDatasets/train\\147cc5f92.hdf5',
 '../Data/OriginalDatasets/train\\1510f75f9.hdf5',
 '../Data/OriginalDatasets/train\\1523dcd0c.hdf5',
 '../Data/OriginalDatasets/train\\1607fd753.hdf5',
 '../Data/OriginalDatasets/train\\1748ad051.hdf5',
 '../Data/OriginalDatasets/train\\177d1a100.hdf5',
 '../Data/OriginalDatasets/train\\1796d0836.hdf5']


# Initializing the count of the number of hdf5 files:
numArquivo = 1

# Iterating over Training hdf5 files and extracting data:
# NOTE(review): per the printed output below, each file's root level contains a
# single GROUP named after the file (e.g. '001121a05'), not datasets.
for arquivo_hdf5 in arquivos_hdf5_treino:
    with h5py.File(arquivo_hdf5, 'r') as arquivo:
       
        # Printing the count of the number of hdf5 files on the screen:
        print(f'Arquivo {numArquivo}')

        # Creating the key list in HDF5 files:
        # (actually only the FIRST root-level key — a string such as '001121a05')
        keyList = list(arquivo.keys())[0]

        # Creating the list of variables in HDF5 files:
        # (the names of the objects stored inside the root group)
        varList = list(arquivo[keyList])

        # Printing all datasets, also known as "keys":
        print(f'Chave em {arquivo_hdf5}: {keyList}')
           
        # Iterating over the datasets in the file:
        # NOTE(review): this iterates the ROOT-level keys — i.e. the group name
        # itself — not the datasets inside the group.
        for key in arquivo.keys():
           
            # Printing the variables in the keys on the screen:
            print(f'Variáveis na chave {key}: {varList}')

            # Extracting the datasets:
            # BUG: arquivo[key] is an h5py Group here, and slicing a Group raises
            # "TypeError: Accessing a group is done with bytes or str, not
            # <class 'slice'>" (see the traceback below). Index into the group's
            # members instead, e.g. arquivo[key]['frequency_Hz'][:].
            dados = arquivo[key][:]

            # Printing the dataset to the screen:
            print(f'Dados no conjunto de dados {key}: {dados}')

            # Converting data to a Pandas DataFrame:
            df = pd.DataFrame(dados)
                               
            # Adding the DataFrame to the list:
            dfs.append(df)
       
        # Printing a blank line on the screen:
        print()
       
        # Incrementing the number of files:
        numArquivo += 1

# Concatenating DataFrames into a single DataFrame:
resultado_final = pd.concat(dfs, ignore_index=True)

# Viewing the first lines:
print(resultado_final.head())

Arquivo 1
Chave em ../Data/OriginalDatasets/train\001121a05.hdf5: 001121a05
Variáveis na chave 001121a05: ['H1', 'L1', 'frequency_Hz']

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[7], line 27
     24 print(f'Variáveis na chave {key}: {varList}')
     26 # Extraindo os conjuntos de dados:
---> 27 dados = arquivo[key][:]
     29 # Imprimindo na tela o conjunto de dados:
     30 print(f'Dados no conjunto de dados {key}: {dados}')

File h5py\_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py\_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File c:\Opt\Anaconda3\Lib\site-packages\h5py\_hl\group.py:330, in Group.__getitem__(self, name)
    328     oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    329 else:
--> 330     raise TypeError("Accessing a group is done with bytes or str, "
    331                     " not {}".format(type(name)))
    333 otype = h5i.get_type(oid)
    334 if otype == h5i.GROUP:

TypeError: Accessing a group is done with bytes or str,  not <class 'slice'> 

python pandas dataframe hdf5
1个回答
0
投票

让我们从文件架构开始。 (你必须首先理解模式。然后你才能正确读取数据。)要小心键。 h5py 对 H5 对象使用字典语法。因此键可以是数据集或组。每个键都是对象名称,其值是对象。如果您不知道对象类型,可以使用

isinstance()
进行测试。

您对文件架构有轻微的误解。我检查了 3 个文件,每个文件都有这个模式:

  • 与文件同名的顶级组(例如
    001121a05
    表示
    001121a05.hdf5)
  • 该组下还有 3 个对象:
    H1
    是一个组,
    L1
    也是一个组,而
    frequency_Hz
    是一个数据集。
  • H1
    和
    L1
    组各有 2 个数据集,名为
    SFTs
    和
    timestamps_GPS
    。

有 2-3 个问题需要解决:

  1. 当您在
    arquivo.keys()
    上循环时,您是在根级别的对象上循环(其中只有名为
    001121a05
    的组)。这就是您收到
    TypeError: Accessing a group
    错误的原因。
  2. 一旦你解决了这个问题,改为在第二级对象
    arquivo[keyList].keys()
    上循环,当你尝试将
    H1
    和
    L1
    作为数据集读取时,你会收到另一个错误(因为它们是组)。
  3. 您将需要更多逻辑来读取
    H1
    和
    L1
    下数据集中的数据。
我修改了您的代码以读取

frequency_Hz
中的数据并将其加载到数据帧。它应该为您指明正确的方向。如果您想要
H1
和
L1
的数据,则需要根据需要加载
[H1][SFTs]
和
[L1][SFTs]
。
此外,我还做了一些其他小的更改来简化代码(例如,我使用了

glob.iglob()
和
enumerate()
)。另外,我修改了一些变量名称以阐明它们的含义。

# Accumulates one DataFrame per dataset read from the files:
dfs = []

# Walk every training HDF5 file, extracting its dataset contents:
for file_idx, hdf5_path in enumerate(glob.iglob('*.hdf5')):
    with h5py.File(hdf5_path, 'r') as arquivo:

        # Show which file (1-based) is being processed:
        print(f'Arquivo {file_idx + 1}')

        # The single root-level key names the top group; print it:
        rootkey = list(arquivo.keys())[0]
        print(f'Chave em {hdf5_path}: {rootkey}')

        # List and print the object names stored under the root group:
        varList = list(arquivo[rootkey].keys())
        print(f'Variáveis na chave {rootkey}: {varList}')

        # Visit every object (group or dataset) below the root group:
        for key in arquivo[rootkey].keys():
            obj = arquivo[rootkey][key]
            print(f'For object name {key}, object type: {type(obj)}')

            # Only datasets are read; groups (H1 / L1) are skipped:
            if isinstance(obj, h5py.Dataset):

                # Pull the dataset into an np.array and show it:
                dados = obj[:]
                print(f'Dados no conjunto de dados {key}: \n{dados}')

                # Wrap in a DataFrame and collect it:
                df = pd.DataFrame(dados)
                dfs.append(df)

            # Blank separator line after each object:
            print()       

# Merge all collected DataFrames into one:
resultado_final = pd.concat(dfs, ignore_index=True)

# Show the first rows:
print(resultado_final.head())
© www.soinside.com 2019 - 2024. All rights reserved.