如何使用 Pandas 处理不同时间范围(5m、15m、30m、1 小时、4 小时、1 天)的 OHLCV 数据?

问题描述 投票:0回答:1

这是我目前编写的代码:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import csv

def load_data(file_path: str) -> pd.DataFrame:
    """Load data from a CSV file."""
    df = pd.read_csv(file_path, parse_dates=['time'], index_col='time')
    names = ['time','open','close','high','low','volume']
    return df

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the data."""

    # Forward fill missing values in the 'time' column with hourly increments
    time_diff = df.index.to_series().diff()
    time_diff = time_diff.dt.total_seconds() / 3600  # Convert to hours
    time_diff = time_diff.fillna(0).cumsum()
    df.index = df.index + pd.to_timedelta(time_diff, unit='H')

    # Fill gaps in the price data
    df = df.resample('T').ffill()

    # Sort the DataFrame by the 'time' index
    df = df.sort_index()

    # Drop duplicate time slots, keeping the first occurrence
    df = df.drop_duplicates(keep='first')

    return df

def resample_data(df: pd.DataFrame) -> pd.DataFrame:
    """Resample the data to create new data points."""

    # Create new columns
    time_frames = ['5T', '15T', '30T', '60T', '240T', '1440T']  # in minutes
    new_columns = ['5_min', '15_min', '30_min', '1_hour', '4_hour', '1_day']

    # Select only the necessary columns for resampling
    columns_to_resample = ['open', 'high', 'low', 'close', 'volume']

    # Create new columns with sub-columns for opening and closing prices
    for col in new_columns:
        df[col, 'open'] = pd.Series(dtype=float)
        df[col, 'close'] = pd.Series(dtype=float)

    # Assign resampled values to new columns and sub-columns
    for time_frame, col in zip(time_frames, new_columns):
        df_resampled = df[columns_to_resample].resample(time_frame).agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        })

        # Apply linear interpolation to fill NaN values
        df_resampled = df_resampled.interpolate()

        # Assign resampled values to sub-columns
        df[col, 'open'] = df_resampled['open']
        df[col, 'close'] = df_resampled['close']

    # Check for NaN values
    nan_values = df.isna().sum()
    print(nan_values)

    # Return the modified DataFrame
    return df

# Load the data
file_path = r'C:\Users\Shadow\.cursor-tutor\projects\Machine Learning Modules\btcusd_ISO8601.csv'
df = load_data(file_path)

# Clean the data
df = clean_data(df)

# Resample the data
df = resample_data(df)

# Check the head of the DataFrame
print(df.head())

# Plot the data
plt.figure(figsize=(15, 5))
plt.plot(df['close'])
plt.title('Bitcoin Close price.', fontsize=15)
plt.ylabel('Price in dollars.')
plt.savefig('bitcoin_close_price.png')
What am I missing?

我遇到的问题是脚本在输出中抛出了很多 nan 值。列和表已正确创建,但未填充正确的推断数据。

感谢任何帮助/调试,因为我遇到了一些障碍,并且没有超过几个月的 Python 经验可供调用。

非常感谢

python python-3.x pandas finance
1个回答
0
投票

在浏览了 pandas 文档之后,我设法弄清楚了。这是已完成的脚本,现在可以在没有 nan 值的情况下运行。当然,它可能完全偏离,但在最后绘制了一个快速图表来检查数据的整体形状,与 BTCUSD 的同一时间范围相比,似乎有一个很好的匹配。需要进行更多测试才能获得我满意的证明,但初步迹象良好。

将 pandas 导入为 pd 将 numpy 导入为 np 将 matplotlib.pyplot 导入为 plt 从输入导入列表 导入 csv

从 CSV 文件加载数据

def load_data(file_path: str) -> pd.DataFrame: """从 CSV 文件加载数据。""" df = pd.read_csv(file_path, parse_dates=['时间'], index_col='时间') 名称 = ['时间','开盘','收盘','最高价','最低价','交易量']

print(df)
return df

清理数据 - 原始数据集的初始 resample_data,使用 reindex 和 ffill 函数以最小化丢失数据后

def clean_data(df: pd.DataFrame) -> pd.DataFrame: “”“清理数据。”“”

# Create a complete time range from the start to the end of your data at 1-minute intervals
full_time_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='1T')

# Reindex the DataFrame to have a row for each minute in the full time range
df = df.reindex(full_time_range)

# Forward fill to handle the missing data after reindexing
df.ffill(inplace=True)

# Aggregate the data as before
df = df.resample('1T').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum'
})

# Sort the DataFrame by the 'time' index just in case
df = df.sort_index()

print('df.isna().sum()')
    
return df
print(df)

通过 multi_index 函数重采样的第二阶段,在 5、15、30、60、240 和 1440 分钟创建新数据点

我们还使用 ffill 和 bfill 删除 nan 值

def resample_data(df: pd.DataFrame) -> pd.DataFrame: """对数据重新采样以创建新的数据点。"""

print(df)

# Define the time frames and corresponding new column names
time_frames = ['5T', '15T', '30T', '60T', '240T', '1440T']  # in minutes
new_columns = ['5_min', '15_min', '30_min', '1_hour', '4_hour', '1_day']
columns_to_resample = ['open', 'close', 'high', 'low', 'volume']

# Initialise a multi_Index for the new columns
tuples = [(col, sub_col) for col in new_columns for sub_col in columns_to_resample]
multi_index = pd.MultiIndex.from_tuples(tuples)
resampled_df = pd.DataFrame(index=df.index, columns=multi_index)

# Assigning resampled values to new columns and sub-columns
for time_frame, col in zip(time_frames, new_columns):
    df_resampled = df.resample(time_frame).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    })

    # Forward fill the resampled data to avoid NaN values
    df_resampled.ffill(inplace=True)

    print(df_resampled)
    
    # Assign resampled values to sub-columns
    for sub_col in columns_to_resample:
        resampled_df[(col, sub_col)] = df_resampled[sub_col]

# Combine the original df with the resampled_df
df_combined = pd.concat([df, resampled_df], axis=1)

# Reorder the columns to match the expected order
column_order = ['open', 'close', 'high', 'low', 'volume'] + list(resampled_df.columns)
df_combined = df_combined[column_order]

# Forward fill again to handle any NaN values after reindexing
df_combined.ffill(inplace=True)

# Back fill to complete the missing data
df_combined.bfill(inplace=True)

# Check for NaN values
nan_values = df_combined.isna().sum()
print(nan_values)

# Return the modified DataFrame
return df_combined

加载数据

file_path = r'C:\Users\Shadow.cursor-tutor\projects\机器学习模块 tcusd_ISO8601.csv' df = load_data(文件路径)

清理数据

df = clean_data(df)

重新采样数据

df = 重新采样数据(df)

© www.soinside.com 2019 - 2024. All rights reserved.