Predicting future values using test values

Problem description

I am new to machine learning and forecasting. I am currently working on a forecasting project, and I tried running my dataset with the code from here. Everything works fine overall, but at the end, when I want to use the test values to predict future values and plot them to compare the error, I can't get it to work and I need your help. For example, my test data runs from March 1 to March 31, and I want to predict the value for April 1.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
import warnings

from tensorflow.keras.layers import Dense, LSTM, Flatten, Dropout
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, kpss
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from math import sqrt
from pandas import read_csv

# tf.random.set_seed(42)
# np.random.seed(42)
# tf.keras.backend.set_floatx('float32')
# warnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))
# sns.set()

df = read_csv('selected_data_ISONE.csv')
print(df.describe().round(2))
print(df.dtypes)
df['hour'] = pd.to_timedelta(df['hour'], unit='h')
df['date'] = pd.to_datetime(df['date'])
print(df['date'])
df['date'] = df['date']+df['hour']
print(df['date'])
df = df.set_index('date')
print(df.info())

# Find NaNs and duplicates in df_energy
print('There are {} missing values or NaNs in df.'.format(df.isnull().values.sum()))
temp_energy = df.duplicated(keep='first').sum()
print('There are {} duplicate rows in df (except first occurrence) based on all columns.'.format(temp_energy))

#Find the number of NaNs in each column
print(df.isnull().sum(axis=0))

# Define a function to plot different types of time-series
def plot_series(df=None, column=None, series=pd.Series(dtype=float), label=None, ylabel=None, title=None, start=0, end=None):
    """
    Plots a time-series that is either a column of a dataframe loaded by the user
    or a custom pandas series created by the user.
    The user can define either the 'df' and the 'column' or the 'series', and additionally
    the 'label', the 'ylabel', the 'title', the 'start' and the 'end' of the plot.
    """
    sns.set()
    fig, ax = plt.subplots(figsize=(30, 12))
    ax.set_xlabel('Time', fontsize=16)
    if column:
        ax.plot(df[column][start:end], label=label)
        ax.set_ylabel(ylabel, fontsize=16)
    if series.any():
        ax.plot(series, label=label)
        ax.set_ylabel(ylabel, fontsize=16)
    if label:
        ax.legend(fontsize=16)
    if title:
        ax.set_title(title, fontsize=24)
    ax.grid(True)
    return ax

# # Zoom into the plot of the hourly (actual) total load
# ax = plot_series(df=df, column='demand', ylabel='Temperature (C)', title='Actual Total Load (First 2 weeks - Original)', end=24*7*2)
# plt.show()

# Plot the hourly actual electricity demand, along with the weekly rolling mean
rolling = df['demand'].rolling(24*7, center=True).mean()
ax = plot_series(df, 'demand', label='Hourly', ylabel='Actual Load',
                 title='Actual Hourly Electricity Demand and Weekly rolling mean')
ax.plot(rolling, linestyle='-', linewidth=2, label='Weekly rolling mean')
plt.show()

# Plot the electricity demand (daily frequency) along with its 1-year lagged series
daily_demand = df['demand'].asfreq('D')
ax = plot_series(series=daily_demand, ylabel='Actual Load',
                 title='Actual electricity demand (daily frequency) and 1-year lagged demand')
shifted = df['demand'].asfreq('M').shift(12)
ax.plot(shifted, label='1-year lag')
ax.legend(['Actual Load', '1-year Lagged Actual Load'])
plt.show()

# Plot the actual electricity price at a daily/weekly scale
ax = plot_series(df, 'demand', label='Hourly', ylabel='Actual Load',
                 start=1 + 24 * 500, end=1 + 24 * 515,
                 title='Actual Hourly Electricity Demand (Zoomed - 2 Weeks)')
plt.show()

y = df['demand']
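# Stationarity checks: the ADF test's null hypothesis is a unit root
# (non-stationary), while the KPSS null is stationarity, so the two tests
# complement each other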
adf_test = adfuller(y, regression='c')
print('ADF Statistic: {:.6f}\np-value: {:.6f}\n#Lags used: {}'
      .format(adf_test[0], adf_test[1], adf_test[2]))
for key, value in adf_test[4].items():
    print('Critical Value ({}): {:.6f}'.format(key, value))

kpss_test = kpss(y, regression='c', nlags='legacy')
print('KPSS Statistic: {:.6f}\np-value: {:.6f}\n#Lags used: {}'
      .format(kpss_test[0], kpss_test[1], kpss_test[2]))
for key, value in kpss_test[3].items():
    print('Critical Value ({}): {:.6f}'.format(key, value))

fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10, 6))
plot_acf(df['demand'], lags=50, ax=ax1)
plot_pacf(df['demand'], lags=50, ax=ax2)
plt.tight_layout()
plt.show()

# Find the correlations between the demand and the rest of the features

correlations = df.corr(method='pearson', numeric_only=True)  # skip non-numeric columns such as the timedelta 'hour' (pandas >= 1.5)
print(correlations['demand'].sort_values(ascending=False).to_string())

# Plot Pearson correlation matrix (reusing the matrix computed above)

fig = plt.figure(figsize=(24, 24))
sns.heatmap(correlations, annot=True, fmt='.2f')
plt.title('Pearson Correlation Matrix')
plt.show()

highly_correlated = abs(correlations[correlations > 0.70])
print(highly_correlated[highly_correlated < 1.0].stack().to_string())

# Generate 'weekday' and 'month' features (vectorized instead of a row-by-row loop)

df['weekday'] = df.index.weekday
df['month'] = df.index.month

# Convert the timedelta 'hour' column back to a plain float hour-of-day
df['hour'] = df['hour'] / pd.Timedelta(hours=1)
# Generate 'business hour' feature (2 = business hours, 1 = mid-afternoon, 0 = off hours)

hour_of_day = df.index.hour
df['business hour'] = np.select(
    [((hour_of_day > 8) & (hour_of_day < 14)) | ((hour_of_day > 16) & (hour_of_day < 21)),
     (hour_of_day >= 14) & (hour_of_day <= 16)],
    [2, 1], default=0)

# Generate 'weekend' feature (note: this recodes 'weekday' in place:
# 2 = Sunday, 1 = Saturday, 0 = working day)

df['weekday'] = np.select([df.index.weekday == 6, df.index.weekday == 5], [2, 1], default=0)

num_data = len(df)
train_end_idx = int(0.7 * num_data)
cv_end_idx = int(0.8 * num_data)
test_end_idx = int(0.9 * num_data)
print(train_end_idx, cv_end_idx, test_end_idx)

df_train = df[:train_end_idx]
df_cv = df[train_end_idx:cv_end_idx]

y_train = df_train['demand'].values
y_cv = df_cv['demand'].values
y_train = y_train.reshape(-1, 1)
y_cv = y_cv.reshape(-1, 1)

X_train = df_train.drop(['demand'], axis=1)
X_cv = df_cv.drop(['demand'], axis=1)
names = X_train.columns.values

scaler_price = MinMaxScaler(feature_range=(0, 1))
scaler_mult = MinMaxScaler(feature_range=(0, 1))

# Fit the scalers on the training split only and keep the transformed values
# (transform() returns a new array; it does not scale in place)
y_train = scaler_price.fit_transform(y_train)
y_cv = scaler_price.transform(y_cv)

X_train = pd.DataFrame(scaler_mult.fit_transform(X_train), columns=names, index=df_train.index)
X_cv = pd.DataFrame(scaler_mult.transform(X_cv), columns=names, index=df_cv.index)

# Quick XGBoost model, used below mainly to rank feature importance
param = {'eta': 0.03, 'max_depth': 2,
         'subsample': 1.0, 'colsample_bytree': 0.80,
         'alpha': 1.5, 'lambda': 1.5, 'gamma': 1.5,
         'objective': 'reg:squarederror',  # 'reg:linear' is deprecated
         'eval_metric': 'rmse', 'verbosity': 0,
         'min_child_weight': 5, 'nthread': -1}

dtrain = xgb.DMatrix(X_train, y_train, feature_names=X_train.columns.values)
dtest = xgb.DMatrix(X_cv, y_cv, feature_names=X_cv.columns.values)
eval_list = [(dtrain, 'train'), (dtest, 'eval')]

xgb_model = xgb.train(param, dtrain, num_boost_round=200, evals=eval_list)
fig, ax = plt.subplots(figsize=(12, 20))
xgb.plot_importance(xgb_model, max_num_features=70, height=0.8, ax=ax)
plt.show()
correlations = df_train.corr(method='pearson')
correlations_price = abs(correlations['demand'])
print(correlations_price[correlations_price > 0.20]
      .sort_values(ascending=False).to_string())
considered_features = ['hour', 'weekday', 'month', 'business hour', 'temperature']
print(len(considered_features))
values = df[considered_features].values
# Plot time-series of considered features

plt.figure(figsize=(20, 20))
for i in range(values.shape[1]):
    plt.subplot(values.shape[1], 1, i+1)
    plt.plot(values[:, i])
plt.show()
# Plot distributions of considered features

plt.figure(figsize=(8, 16))
for i in range(values.shape[1]):
    plt.subplot(values.shape[1], 1, i+1)
    plt.hist(values[:, i], bins=24)
plt.show()
# Automatic Outlier Removal
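# Values more than 4 standard deviations from a feature's mean are set to
# NaN here and filled in by interpolation further below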

for feature in considered_features:
    mean_feat = df[feature].mean()
    stdev_feat = df[feature].std()
    upper_limit = mean_feat + 4 * stdev_feat
    lower_limit = mean_feat - 4 * stdev_feat
    df.loc[df[feature] > upper_limit, feature] = np.nan
    df.loc[df[feature] < lower_limit, feature] = np.nan
# Number of removed values per column

print(df[considered_features].isnull().sum())
# Fill outlier rows using interpolation

df.interpolate(method='linear', limit_direction='forward', inplace=True, axis=0)

def multivariate_data(dataset, target, start_index, end_index, history_size, target_size, step, single_step=False):
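    """
    Builds sliding-window samples: each sample holds the `history_size` rows of
    `dataset` preceding index i (stride `step`); its label is `target` at
    i + target_size when `single_step` is True, otherwise the slice
    target[i : i + target_size].
    """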
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i - history_size, i, step)
        data.append(dataset[indices])

        if single_step:
            labels.append(target[i + target_size])
        else:
            labels.append(target[i: i + target_size])

    return np.array(data), np.array(labels)

# Split indices (train_end_idx = 72643, cv_end_idx = 83020, test_end_idx = 93398)
# were already computed above
dataset = df[considered_features]
scaler_mult = MinMaxScaler(feature_range=(0, 1))

scaler_mult.fit(dataset[:train_end_idx])
y = df['demand'].values

scaler = MinMaxScaler(feature_range=(0, 1))

y_reshaped = y.reshape(-1, 1)
scaler.fit(y_reshaped[:train_end_idx])

scaled_price = scaler.transform(y_reshaped)
scaled_dataset = scaler_mult.transform(dataset)
scaled_dataset = np.concatenate((scaled_dataset, scaled_price), axis=1)
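# The scaled demand is appended as the last column, so it serves both as an
# input feature (its lagged values) and, via scaled_dataset[:, -1], as the
# prediction target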

# Set the number of previous time-lags that will be used
#multivariate_past_history = 3
#multivariate_past_history = 10
multivariate_past_history = 25
multivariate_future_target = 0
X_train_mult, y_train_mult = multivariate_data(scaled_dataset, scaled_dataset[:, -1],
                                               0, train_end_idx,
                                               multivariate_past_history,
                                               multivariate_future_target,
                                               step=1, single_step=True)
X_val_mult, y_val_mult = multivariate_data(scaled_dataset, scaled_dataset[:, -1],
                                           train_end_idx, cv_end_idx,
                                           multivariate_past_history,
                                           multivariate_future_target,
                                           step=1, single_step=True)
X_test_mult, y_test_mult = multivariate_data(scaled_dataset, scaled_dataset[:, -1],
                                             cv_end_idx, test_end_idx,
                                             multivariate_past_history,
                                             multivariate_future_target,
                                             step=1, single_step=True)
batch_size = 32
buffer_size = 1000
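# Build tf.data input pipelines: cache the windows, shuffle only the
# training set, batch, and prefetch one batch ahead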
train_mult = tf.data.Dataset.from_tensor_slices((X_train_mult, y_train_mult))
train_mult = train_mult.cache().shuffle(buffer_size).batch(batch_size).prefetch(1)
val_mult = tf.data.Dataset.from_tensor_slices((X_val_mult, y_val_mult))
val_mult = val_mult.batch(batch_size).prefetch(1)

# Define some common parameters
input_shape_mult = X_train_mult.shape[-2:]
loss = tf.keras.losses.MeanSquaredError()
metric = [tf.keras.metrics.RootMeanSquaredError()]
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
              lambda epoch: 1e-4 * 10**(epoch / 10))
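# The scheduler above raises the learning rate tenfold every 10 epochs (a
# learning-rate range test): the first fit below only probes learning rates,
# and the second fit does the actual training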
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10)
y_test_mult_reshaped = y_test_mult.reshape(-1, 1)
y_test_mult_inv = scaler.inverse_transform(y_test_mult_reshaped)

tf.keras.backend.clear_session()

multivariate_stacked_lstm = tf.keras.models.Sequential([
    LSTM(250, input_shape=input_shape_mult,
         return_sequences=True),
    LSTM(150, return_sequences=True),
    Flatten(),
    Dense(150, activation='relu'),
    Dropout(0.05),
    Dense(1)
])

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, amsgrad=True)
multivariate_stacked_lstm.compile(loss=loss, optimizer=optimizer, metrics=metric)
history_lr = multivariate_stacked_lstm.fit(train_mult, epochs=2, validation_data=val_mult, callbacks=[lr_schedule])

tf.keras.backend.clear_session()

multivariate_stacked_lstm = tf.keras.models.Sequential([
    LSTM(250, input_shape=input_shape_mult, return_sequences=True),
    LSTM(150, return_sequences=True),
    Flatten(),
    Dense(150, activation='relu'),
    Dropout(0.05),
    Dense(1)
])

#Callback to save the Keras model or model weights at some frequency
model_checkpoint = tf.keras.callbacks.ModelCheckpoint('multivariate_stacked_lstm.h5', save_best_only=True)
#Optimizer that implements Adam algorithm
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-3, amsgrad=True)

multivariate_stacked_lstm.compile(loss=loss, optimizer=optimizer, metrics=metric)
history = multivariate_stacked_lstm.fit(train_mult, epochs=2, validation_data=val_mult, callbacks=[early_stopping,model_checkpoint])
loss_train = history.history['loss']
loss_val = history.history['val_loss']
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()

multivariate_stacked_lstm = tf.keras.models.load_model('multivariate_stacked_lstm.h5')
forecast = multivariate_stacked_lstm.predict(X_test_mult)
multivariate_stacked_lstm_forecast = scaler.inverse_transform(forecast)

fig, ax = plt.subplots(figsize=(30, 10))
ax.set_xlabel('Time sample', fontsize=22)
ax.set_ylabel('Electricity demand', fontsize=22)
ax.plot(y_test_mult_inv[3263:], label='Real values')
ax.plot(multivariate_stacked_lstm_forecast[3263:], label='Multivariate Stacked LSTM forecast')
ax.legend(prop={'size':22})
plt.show()

rmse_mult_stacked_lstm = sqrt(mean_squared_error(y_test_mult_inv, multivariate_stacked_lstm_forecast))
print('RMSE of hour-ahead electricity demand multivariate Stacked LSTM forecast: {}'.format(round(rmse_mult_stacked_lstm, 3)))
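
To forecast past the end of the data (for example April 1 after a test set that ends on March 31), one possible approach is sketched below; it is a sketch under stated assumptions, not a definitive solution. The model maps the last multivariate_past_history rows of scaled_dataset to the next demand value, so a single step ahead only needs the final window. For further steps the exogenous features are not actually known in advance: the calendar features can be computed from the future timestamps, but 'temperature' would have to come from a weather forecast. The placeholder below naively reuses the last observed feature row.

# One-step-ahead forecast for the first point after the data ends
last_window = scaled_dataset[-multivariate_past_history:]            # shape: (25, 6)
next_scaled = multivariate_stacked_lstm.predict(last_window[np.newaxis, ...])
next_demand = scaler.inverse_transform(next_scaled)
print('Forecast one step past the end of the data: {:.2f}'.format(next_demand[0, 0]))

# Recursive multi-step forecast (e.g. the 24 hours of April 1): append each
# prediction to the window and slide it forward one step at a time. The new
# row's feature values are a placeholder (the last observed row reused);
# substitute real calendar values and a temperature forecast for a serious run.
horizon = 24
window = last_window.copy()
recursive_scaled = []
for _ in range(horizon):
    pred = multivariate_stacked_lstm.predict(window[np.newaxis, ...], verbose=0)[0, 0]
    new_row = window[-1].copy()   # placeholder feature values
    new_row[-1] = pred            # last column holds the scaled demand
    window = np.vstack([window[1:], new_row])
    recursive_scaled.append(pred)
recursive_forecast = scaler.inverse_transform(np.array(recursive_scaled).reshape(-1, 1))
print(recursive_forecast)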
Tags: python, deep-learning, regression, forecasting