这是我目前编写的代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import csv
def load_data(file_path: str) -> pd.DataFrame:
"""Load data from a CSV file."""
df = pd.read_csv(file_path, parse_dates=['time'], index_col='time')
names = ['time','open','close','high','low','volume']
return df
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
"""Clean the data."""
# Forward fill missing values in the 'time' column with hourly increments
time_diff = df.index.to_series().diff()
time_diff = time_diff.dt.total_seconds() / 3600 # Convert to hours
time_diff = time_diff.fillna(0).cumsum()
df.index = df.index + pd.to_timedelta(time_diff, unit='H')
# Fill gaps in the price data
df = df.resample('T').ffill()
# Sort the DataFrame by the 'time' index
df = df.sort_index()
# Drop duplicate time slots, keeping the first occurrence
df = df.drop_duplicates(keep='first')
return df
def resample_data(df: pd.DataFrame) -> pd.DataFrame:
"""Resample the data to create new data points."""
# Create new columns
time_frames = ['5T', '15T', '30T', '60T', '240T', '1440T'] # in minutes
new_columns = ['5_min', '15_min', '30_min', '1_hour', '4_hour', '1_day']
# Select only the necessary columns for resampling
columns_to_resample = ['open', 'high', 'low', 'close', 'volume']
# Create new columns with sub-columns for opening and closing prices
for col in new_columns:
df[col, 'open'] = pd.Series(dtype=float)
df[col, 'close'] = pd.Series(dtype=float)
# Assign resampled values to new columns and sub-columns
for time_frame, col in zip(time_frames, new_columns):
df_resampled = df[columns_to_resample].resample(time_frame).agg({
'open': 'first',
'high': 'max',
'low': 'min',
'close': 'last',
'volume': 'sum'
})
# Apply linear interpolation to fill NaN values
df_resampled = df_resampled.interpolate()
# Assign resampled values to sub-columns
df[col, 'open'] = df_resampled['open']
df[col, 'close'] = df_resampled['close']
# Check for NaN values
nan_values = df.isna().sum()
print(nan_values)
# Return the modified DataFrame
return df
# Load the data
file_path = r'C:\Users\Shadow\.cursor-tutor\projects\Machine Learning Modules\btcusd_ISO8601.csv'
df = load_data(file_path)
# Clean the data
df = clean_data(df)
# Resample the data
df = resample_data(df)
# Check the head of the DataFrame
print(df.head())
# Plot the data
plt.figure(figsize=(15, 5))
plt.plot(df['close'])
plt.title('Bitcoin Close price.', fontsize=15)
plt.ylabel('Price in dollars.')
plt.savefig('bitcoin_close_price.png')
What am I missing?
我遇到的问题是脚本在输出中抛出了很多 nan 值。列和表已正确创建,但未填充正确的推断数据。
感谢任何帮助/调试,因为我遇到了一些障碍,并且没有超过几个月的 Python 经验可供调用。
非常感谢
在浏览了 pandas 文档之后,我设法弄清楚了。这是已完成的脚本,现在可以在没有 nan 值的情况下运行。当然,它可能完全偏离,但在最后绘制了一个快速图表来检查数据的整体形状,与 BTCUSD 的同一时间范围相比,似乎有一个很好的匹配。需要进行更多测试才能获得我满意的证明,但初步迹象良好。
将 pandas 导入为 pd 将 numpy 导入为 np 将 matplotlib.pyplot 导入为 plt 从输入导入列表 导入 csv
def load_data(file_path: str) -> pd.DataFrame: """从 CSV 文件加载数据。""" df = pd.read_csv(file_path, parse_dates=['时间'], index_col='时间') 名称 = ['时间','开盘','收盘','最高价','最低价','交易量']
print(df)
return df
def clean_data(df: pd.DataFrame) -> pd.DataFrame: “”“清理数据。”“”
# Create a complete time range from the start to the end of your data at 1-minute intervals
full_time_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='1T')
# Reindex the DataFrame to have a row for each minute in the full time range
df = df.reindex(full_time_range)
# Forward fill to handle the missing data after reindexing
df.ffill(inplace=True)
# Aggregate the data as before
df = df.resample('1T').agg({
'open': 'first',
'high': 'max',
'low': 'min',
'close': 'last',
'volume': 'sum'
})
# Sort the DataFrame by the 'time' index just in case
df = df.sort_index()
print('df.isna().sum()')
return df
print(df)
def resample_data(df: pd.DataFrame) -> pd.DataFrame: """对数据重新采样以创建新的数据点。"""
print(df)
# Define the time frames and corresponding new column names
time_frames = ['5T', '15T', '30T', '60T', '240T', '1440T'] # in minutes
new_columns = ['5_min', '15_min', '30_min', '1_hour', '4_hour', '1_day']
columns_to_resample = ['open', 'close', 'high', 'low', 'volume']
# Initialise a multi_Index for the new columns
tuples = [(col, sub_col) for col in new_columns for sub_col in columns_to_resample]
multi_index = pd.MultiIndex.from_tuples(tuples)
resampled_df = pd.DataFrame(index=df.index, columns=multi_index)
# Assigning resampled values to new columns and sub-columns
for time_frame, col in zip(time_frames, new_columns):
df_resampled = df.resample(time_frame).agg({
'open': 'first',
'high': 'max',
'low': 'min',
'close': 'last',
'volume': 'sum'
})
# Forward fill the resampled data to avoid NaN values
df_resampled.ffill(inplace=True)
print(df_resampled)
# Assign resampled values to sub-columns
for sub_col in columns_to_resample:
resampled_df[(col, sub_col)] = df_resampled[sub_col]
# Combine the original df with the resampled_df
df_combined = pd.concat([df, resampled_df], axis=1)
# Reorder the columns to match the expected order
column_order = ['open', 'close', 'high', 'low', 'volume'] + list(resampled_df.columns)
df_combined = df_combined[column_order]
# Forward fill again to handle any NaN values after reindexing
df_combined.ffill(inplace=True)
# Back fill to complete the missing data
df_combined.bfill(inplace=True)
# Check for NaN values
nan_values = df_combined.isna().sum()
print(nan_values)
# Return the modified DataFrame
return df_combined
file_path = r'C:\Users\Shadow.cursor-tutor\projects\机器学习模块 tcusd_ISO8601.csv' df = load_data(文件路径)
df = clean_data(df)
df = 重新采样数据(df)