我是机器学习和预测的新手。目前,我正在做一个关于预测的项目。我尝试使用 here 中的代码来运行我的数据集。总体上一切都很好,但最后当我想使用测试值预测未来值并将其绘制在图表中以比较误差时。我无法解决它,我需要大家的帮助。例如,我的测试数据是从 3 月 1 日到 3 月 31 日,我想预测 4 月 1 日的值。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
import warnings
from keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, TimeDistributed, Flatten, Dropout
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, kpss
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from math import sqrt
from pandas import read_csv
# tf.random.set_seed(42)
# np.random.seed(42)
# tf.keras.backend.set_floatx('float32')
# warnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))
# sns.set()
df = read_csv('selected_data_ISONE.csv')
print(df.describe().round(2))
print(df.dtypes)
df['hour'] = pd.to_timedelta(df['hour'], unit='h')
df['date'] = pd.to_datetime(df['date'])
print(df['date'])
df['date'] = df['date']+df['hour']
print(df['date'])
df = df.set_index('date')
print(df.info())
# Find NaNs and duplicates in df_energy
print('There are {} missing values or NaNs in df.'.format(df.isnull().values.sum()))
temp_energy = df.duplicated(keep='first').sum()
print('There are {} duplicate rows in df (except first occurrence) based on all columns.'.format(temp_energy))
#Find the number of NaNs in each column
print(df.isnull().sum(axis=0))
# Define a function to plot different types of time-series
def plot_series(df=None, column=None, series=pd.Series([]), label=None, ylabel=None, title=None, start=0, end=None):
"""
Plots a certain time-series which has either been loaded in a dataframe and which
constitutes one of its columns or it a custom pandas series created by the user.
The user can define either the 'df' and the 'column' or the 'series' and additionally,
can also define the 'label', the 'ylabel', the 'title', the 'start' and the 'end' of the plot.
"""
sns.set()
fig, ax = plt.subplots(figsize=(30, 12))
ax.set_xlabel('Time', fontsize=16)
if column:
ax.plot(df[column][start:end], label=label)
ax.set_ylabel(ylabel, fontsize=16)
if series.any():
ax.plot(series, label=label)
ax.set_ylabel(ylabel, fontsize=16)
if label:
ax.legend(fontsize=16)
if title:
ax.set_title(title, fontsize=24)
ax.grid(True)
return ax
# # Zoom into the plot of the hourly (actual) total load
# ax = plot_series(df=df, column='demand', ylabel='Temperature (C)', title='Actual Total Load (First 2 weeks - Original)', end=24*7*2)
# plt.show()
# Plot the hourly actual electricity price, along with the weekly rolling mean
rolling = df['demand'].rolling(24*7, center=True).mean()
ax = plot_series(df, 'demand', label='Hourly', ylabel='Actual Load (€/MWh)',
title='Actual Hourly Electricity Demand and Weekly rolling mean')
ax.plot(rolling, linestyle='-', linewidth=2, label='Weekly rolling mean')
plt.show()
# Plot the electricity price (monthly frequence) along with its 1-year lagged series
monthly_price = df['demand'].asfreq('D')
ax = plot_series(series=monthly_price, ylabel='Actual Load',
title='Actual electricity demand (Monthly frequency) and 1-year lagged price')
shifted = df['demand'].asfreq('M').shift(12)
ax.plot(shifted, label='Hourly')
ax.legend(['Actual Load', '1-year Lagged Actual Load'])
plt.show()
# Plot the actual electricity price at a daily/weekly scale
ax = plot_series(df, 'demand', label='Hourly', ylabel='Actual Load',
start=1 + 24 * 500, end=1 + 24 * 515,
title='Actual Hourly Electricity Demand (Zoomed - 2 Weeks)')
plt.show()
y = df['demand']
adf_test = adfuller(y, regression='c')
print('ADF Statistic: {:.6f}\np-value: {:.6f}\n#Lags used: {}'
.format(adf_test[0], adf_test[1], adf_test[2]))
for key, value in adf_test[4].items():
print('Critical Value ({}): {:.6f}'.format(key, value))
kpss_test = kpss(y, regression='c', nlags='legacy')
print('KPSS Statistic: {:.6f}\np-value: {:.6f}\n#Lags used: {}'
.format(kpss_test[0], kpss_test[1], kpss_test[2]))
for key, value in kpss_test[3].items():
print('Critical Value ({}): {:.6f}'.format(key, value))
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10, 6))
plot_acf(df['demand'], lags=50, ax=ax1)
plot_pacf(df['demand'], lags=50, ax=ax2)
plt.tight_layout()
plt.show()
# Find the correlations between the energy price and the rest of the features
correlations = df.corr(method='pearson')
print(correlations['demand'].sort_values(ascending=False).to_string())
# Plot Pearson correlation matrix
correlations = df.corr(method='pearson')
fig = plt.figure(figsize=(24, 24))
sns.heatmap(correlations, annot=True, fmt='.2f')
plt.title('Pearson Correlation Matrix')
plt.show()
highly_correlated = abs(correlations[correlations > 0.70])
print(highly_correlated[highly_correlated < 1.0].stack().to_string())
# Generate 'hour', 'weekday' and 'month' features
for i in range(len(df)):
position = df.index[i]
weekday = position.weekday()
month = position.month
df.loc[position, 'weekday'] = weekday
df.loc[position, 'month'] = month
df['hour'] = df['hour'] / pd.Timedelta(hours=1)
# Generate 'business hour' feature
for i in range(len(df)):
position = df.index[i]
hour = position.hour
if ((hour > 8 and hour < 14) or (hour > 16 and hour < 21)):
df.loc[position, 'business hour'] = 2
elif (hour >= 14 and hour <= 16):
df.loc[position, 'business hour'] = 1
else:
df.loc[position, 'business hour'] = 0
# Generate 'weekend' feature
for i in range(len(df)):
position = df.index[i]
weekday = position.weekday()
if (weekday == 6):
df.loc[position, 'weekday'] = 2
elif (weekday == 5):
df.loc[position, 'weekday'] = 1
else:
df.loc[position, 'weekday'] = 0
num_data = len(df)
train_end_idx = df[:int(0.7*num_data)]
cv_end_idx = df[int(0.7*num_data):int(0.8*num_data)]
test_end_idx = df[int(0.8*num_data):int(0.9*num_data)]
print(train_end_idx)
print(cv_end_idx)
print(test_end_idx)
df_train = df[:72643]
df_cv = df[72643:83020]
y_train = df_train['demand'].values
y_cv = df_cv['demand'].values
y_train = y_train.reshape(-1, 1)
y_cv = y_cv.reshape(-1, 1)
X_train = df_train.drop(['demand'], axis=1)
X_cv = df_cv.drop(['demand'], axis=1)
names = X_train.columns.values
scaler_price = MinMaxScaler(feature_range=(0, 1))
scaler_mult = MinMaxScaler(feature_range=(0, 1))
scaler_price.fit(y_train)
scaler_price.transform(y_train)
scaler_price.transform(y_cv)
scaler_mult.fit(X_train)
scaler_mult.transform(X_train)
scaler_mult.transform(X_cv)
param = {'eta': 0.03, 'max_depth': 2,
'subsample': 1.0, 'colsample_bytree': 0.80,
'alpha': 1.5, 'lambda': 1.5, 'gamma': 1.5,
'objective': 'reg:linear', 'eval_metric': 'rmse',
'silent': 1, 'min_child_weight': 5, 'n_jobs': -1}
dtrain = xgb.DMatrix(X_train, y_train, feature_names=X_train.columns.values)
dtest = xgb.DMatrix(X_cv, y_cv, feature_names=X_cv.columns.values)
eval_list = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(param, dtrain, 200, eval_list)
fig, ax = plt.subplots(figsize=(12, 20))
xgb.plot_importance(xgb_model, max_num_features=70, height=0.8, ax=ax)
plt.show()
correlations = df_train.corr(method='pearson')
correlations_price = abs(correlations['demand'])
print(correlations_price[correlations_price > 0.20]
.sort_values(ascending=False).to_string())
considered_features = ['hour', 'weekday', 'month', 'business hour', 'temperature']
print(len(considered_features))
values = df[considered_features].values
# Plot time-series of considered features
plt.figure(figsize=(20, 20))
for i in range(values.shape[1]):
plt.subplot(values.shape[1], 1, i+1)
plt.plot(values[:, i])
plt.show()
# Plot distributions of considered features
plt.figure(figsize=(8, 16))
for i in range(values.shape[1]):
plt.subplot(values.shape[1], 1, i+1)
plt.hist(values[:, i], bins=24)
plt.show()
# Automatic Outlier Removal
for feature in considered_features:
mean_feat = df[feature].mean()
stdev_feat = df[feature].std()
upper_limit = mean_feat + 4 * stdev_feat
lower_limit = mean_feat - 4 * stdev_feat
df.loc[df[feature] > upper_limit, feature] = np.nan
df.loc[df[feature] < lower_limit, feature] = np.nan
# Number of removed values per column
print(df[considered_features].isnull().sum())
# Fill outlier rows using interpolation
df.interpolate(method='linear', limit_direction='forward', inplace=True, axis=0)
def multivariate_data(dataset, target, start_index, end_index, history_size, target_size, step, single_step=False):
data = []
labels = []
start_index = start_index + history_size
if end_index is None:
end_index = len(dataset) - target_size
for i in range(start_index, end_index):
indices = range(i - history_size, i, step)
data.append(dataset[indices])
if single_step:
labels.append(target[i + target_size])
else:
labels.append(target[i: i + target_size])
return np.array(data), np.array(labels)
train_end_idx = 72643
cv_end_idx = 83020
test_end_idx = 93398
dataset = df[considered_features]
scaler_mult = MinMaxScaler(feature_range=(0, 1))
scaler_mult.fit(dataset[:train_end_idx])
y = df['demand'].values
scaler = MinMaxScaler(feature_range=(0, 1))
y_reshaped = y.reshape(-1, 1)
scaler.fit(y_reshaped[:train_end_idx])
scaled_price = scaler.transform(y_reshaped)
scaled_dataset = scaler_mult.transform(dataset)
scaled_dataset = np.concatenate((scaled_dataset, scaled_price), axis=1)
# Set the number of previous time-lags that will be used
#multivariate_past_history = 3
#multivariate_past_history = 10
multivariate_past_history = 25
multivariate_future_target = 0
X_train_mult, y_train_mult = multivariate_data(scaled_dataset, scaled_dataset[:, -1],
0, train_end_idx,
multivariate_past_history,
multivariate_future_target,
step=1, single_step=True)
X_val_mult, y_val_mult = multivariate_data(scaled_dataset, scaled_dataset[:, -1],
train_end_idx, cv_end_idx,
multivariate_past_history,
multivariate_future_target,
step=1, single_step=True)
X_test_mult, y_test_mult = multivariate_data(scaled_dataset, scaled_dataset[:, -1],
cv_end_idx, test_end_idx,
multivariate_past_history,
multivariate_future_target,
step=1, single_step=True)
batch_size = 32
buffer_size = 1000
train_mult = tf.data.Dataset.from_tensor_slices((X_train_mult, y_train_mult))
train_mult = train_mult.cache().shuffle(buffer_size).batch(batch_size).prefetch(1)
val_mult = tf.data.Dataset.from_tensor_slices((X_val_mult, y_val_mult))
val_mult = val_mult.batch(batch_size).prefetch(1)
# Define some common parameters
input_shape_mult = X_train_mult.shape[-2:]
loss = tf.keras.losses.MeanSquaredError()
metric = [tf.keras.metrics.RootMeanSquaredError()]
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
lambda epoch: 1e-4 * 10**(epoch / 10))
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10)
y_test_mult_reshaped = y_test_mult.reshape(-1, 1)
y_test_mult_inv = scaler.inverse_transform(y_test_mult_reshaped)
tf.keras.backend.clear_session()
multivariate_stacked_lstm = tf.keras.models.Sequential([
LSTM(250, input_shape=input_shape_mult,
return_sequences=True),
LSTM(150, return_sequences=True),
Flatten(),
Dense(150, activation='relu'),
Dropout(0.05),
Dense(1)
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, amsgrad=True)
multivariate_stacked_lstm.compile(loss=loss, optimizer=optimizer, metrics=metric)
history_lr = multivariate_stacked_lstm.fit(train_mult, epochs=2, validation_data=val_mult, callbacks=[lr_schedule])
tf.keras.backend.clear_session()
multivariate_stacked_lstm = tf.keras.models.Sequential([
LSTM(250, input_shape=input_shape_mult, return_sequences=True),
LSTM(150, return_sequences=True),
Flatten(),
Dense(150, activation='relu'),
Dropout(0.05),
Dense(1)
])
#Callback to save the Keras model or model weights at some frequency
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('multivariate_stacked_lstm.h5', save_best_only=True)
#Optimizer that implements Adam algorithm
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-3, amsgrad=True)
multivariate_stacked_lstm.compile(loss=loss, optimizer=optimizer, metrics=metric)
history = multivariate_stacked_lstm.fit(train_mult, epochs=2, validation_data=val_mult, callbacks=[early_stopping,model_checkpoint])
loss_train = history.history['loss']
loss_val = history.history['val_loss']
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
multivariate_stacked_lstm = tf.keras.models.load_model('multivariate_stacked_lstm.h5')
forecast = multivariate_stacked_lstm.predict(X_test_mult)
multivariate_stacked_lstm_forecast = scaler.inverse_transform(forecast)
fig, ax = plt.subplots(figsize=(30, 10))
ax.set_xlabel('Time sample', fontsize=22)
ax.set_ylabel('Electricity demand', fontsize=22)
ax.plot(y_test_mult_inv[3263:], label='Real values')
ax.plot(multivariate_stacked_lstm_forecast[3263:], label='Multivariate Stacked LSTM forecast')
ax.legend(prop={'size':22})
plt.show()
rmse_mult_stacked_lstm = sqrt(mean_squared_error(y_test_mult_inv, multivariate_stacked_lstm_forecast))
print('RMSE of hour-ahead electricity price multivariate Stacked LSTM forecast: {}'.format(round(rmse_mult_stacked_lstm, 3)))