1D CNN prediction plot does not match the actual time series plot

Problem description (votes: 0, answers: 1)

I want to use a 1D CNN model to predict the time-series counts of dengue cases from the given covariates. The loss and the metrics such as MSE and MAE look satisfactory, yet the prediction plots for both the training and the test set do not match the actual data, and I am not sure what is going wrong. The full code is below:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Load the data
file_path = 'https://gist.githubusercontent.com/JishanAhmed2019/a7666a3651d27bf03dc93e63aac896b0/raw/f93f28aeaa41418689744a4fbb5bde29114f9872/Dengue.csv'
data = pd.read_csv(file_path, index_col='Date', sep='\t', parse_dates=True)

# Split the data into training and testing sets before scaling
split_fraction = 0.85
split_point = int(len(data) * split_fraction)

train_data = data.iloc[:split_point]
test_data = data.iloc[split_point:]

# Extract features and targets from both sets
X_train, y_train = train_data.drop('Dhaka_Dengue', axis=1), train_data['Dhaka_Dengue']
X_test, y_test = test_data.drop('Dhaka_Dengue', axis=1), test_data['Dhaka_Dengue']

# Apply scaling separately to avoid data leakage
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

X_train_scaled = scaler_features.fit_transform(X_train)
X_test_scaled = scaler_features.transform(X_test)

y_train_scaled = scaler_target.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_target.transform(y_test.values.reshape(-1, 1)).flatten()

# Manually split the training data to create a validation set
val_fraction = 0.15
val_split_point = int(len(X_train_scaled) * (1 - val_fraction))

X_train_final = X_train_scaled[:val_split_point]
y_train_final = y_train_scaled[:val_split_point]
X_val = X_train_scaled[val_split_point:]
y_val = y_train_scaled[val_split_point:]

# Reshape for 1D CNN input
X_train_final_reshaped = X_train_final.reshape((X_train_final.shape[0], X_train_final.shape[1], 1))
X_val_reshaped = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# Define the 1D CNN model
model = Sequential([
    Conv1D(32, 5, padding='same', activation=LeakyReLU(alpha=0.1), input_shape=(X_train_final_reshaped.shape[1], 1)),
    MaxPooling1D(2, padding="same"),
   # Conv1D(16, 5, padding='same', activation=LeakyReLU(alpha=0.1)),
   # MaxPooling1D(2, padding="same"),
    Flatten(),
    #Dense(32, activation='relu'),
    Dropout(0.20),
    Dense(1)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mae'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')

# Train the model
history = model.fit(
    X_train_final_reshaped, y_train_final,
    epochs=500,
    batch_size=32,
    verbose=1,
    validation_data=(X_val_reshaped, y_val),
    callbacks=[early_stopping]
)

# Make predictions
train_predictions = model.predict(X_train_final_reshaped)
val_predictions = model.predict(X_val_reshaped)
test_predictions = model.predict(X_test_reshaped)

# Inverse transform predictions and actual values to original scale
train_predictions_inverse = scaler_target.inverse_transform(train_predictions).flatten()
val_predictions_inverse = scaler_target.inverse_transform(val_predictions).flatten()
test_predictions_inverse = scaler_target.inverse_transform(test_predictions).flatten()

y_train_inverse = scaler_target.inverse_transform(y_train_final.reshape(-1, 1)).flatten()
y_val_inverse = scaler_target.inverse_transform(y_val.reshape(-1, 1)).flatten()
y_test_inverse = scaler_target.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

# Calculate and print RMSE and MAE for training, validation, and test sets
train_rmse = np.sqrt(mean_squared_error(y_train_inverse, train_predictions_inverse))
val_rmse = np.sqrt(mean_squared_error(y_val_inverse, val_predictions_inverse))
test_rmse = np.sqrt(mean_squared_error(y_test_inverse, test_predictions_inverse))

train_mae = mean_absolute_error(y_train_inverse, train_predictions_inverse)
val_mae = mean_absolute_error(y_val_inverse, val_predictions_inverse)
test_mae = mean_absolute_error(y_test_inverse, test_predictions_inverse)

print("Training RMSE:", train_rmse, "MAE:", train_mae)
print("Validation RMSE:", val_rmse, "MAE:", val_mae)
print("Testing RMSE:", test_rmse, "MAE:", test_mae)

# Plot training and validation loss
plt.figure(figsize=(10, 4))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Plot actual vs. predicted for the test set
plt.figure(figsize=(10, 4))
plt.plot(test_data.index, y_test_inverse, label='Actual')
plt.plot(test_data.index, test_predictions_inverse, label='Predicted')
plt.title('Test Set Actual vs. Predicted')
plt.legend()
plt.show()


# Plot the actual vs. predicted values for the training set
# Dates for the training portion actually used for fitting (before the validation split)
train_dates = train_data.index[:val_split_point]
plt.figure(figsize=(14, 5))
plt.plot(train_dates, y_train_inverse, label='Train Actual')
plt.plot(train_dates, train_predictions_inverse, label='Train Predictions')
plt.title('Training Predictions vs Actual')
plt.ylabel('Dengue Incidents')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

# Plot the actual vs. predicted values for the testing set
test_dates = test_data.index
plt.figure(figsize=(14, 5))
plt.plot(test_dates, y_test_inverse, label='Test Actual')
plt.plot(test_dates, test_predictions_inverse, label='Test Predictions')
plt.title('Testing Predictions vs Actual')
plt.ylabel('Dengue Incidents')
plt.xlabel('Date')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

plt.figure(figsize=(14, 7))  # Set the figure size for better readability
plt.plot(data.index, data['Dhaka_Dengue'], label='Actual Dengue Count', color='blue')
plt.title('Actual Dengue Count Time Series')
plt.xlabel('Date')
plt.ylabel('Dengue Count')
plt.legend()
plt.xticks(rotation=45)  # Rotate date labels for better readability
plt.tight_layout()
plt.show()

machine-learning deep-learning time-series conv-neural-network sequence-modeling
1 Answer

0 votes

You are not passing the dates to your model. I am not sure what the best approach is, but something like converting the dates to Unix time, or just adding a simple index column, seems to work:

from datetime import timezone
from sklearn.model_selection import train_test_split

data = pd.read_csv(file_path, index_col='Date', sep='\t', parse_dates=True)
data.reset_index(level=0, inplace=True)
data["Date"] = data["Date"].map(lambda x: x.replace(tzinfo=timezone.utc).timestamp())

# Split the data into training and testing sets before scaling
features = data.drop("Dhaka_Dengue", axis=1).values
target = data["Dhaka_Dengue"].values

# Scaling features and target (note: the same scaler object is re-fitted on the
# target, so it can no longer be used to inverse-transform the features afterwards)
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)
target_scaled = scaler.fit_transform(target.reshape(-1, 1)).flatten()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features_scaled, target_scaled, test_size=0.2, random_state=42
)

# Sort the actual and predicted values by the (scaled) Unix timestamp in the first
# feature column so the plot runs in chronological order; test_predictions are the
# model's predictions on X_test, reshaped for the CNN as in the question.
_, y_test_sorted = zip(*sorted(zip(X_test[:, 0], y_test)))
_, y_pred_sorted = zip(*sorted(zip(X_test[:, 0], test_predictions.flatten())))

plt.plot(y_test_sorted)
plt.plot(y_pred_sorted)
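
If you prefer the simple index column mentioned above instead of Unix timestamps, a minimal sketch could look like the following (the column name t is just an illustrative choice; file_path is the same URL as in the question):

import numpy as np
import pandas as pd

# Keep Date as the index and add a monotonically increasing integer column that
# encodes temporal order, instead of converting Date to a Unix timestamp.
data = pd.read_csv(file_path, index_col='Date', sep='\t', parse_dates=True)
data = data.sort_index()
data['t'] = np.arange(len(data))

features = data.drop('Dhaka_Dengue', axis=1).values
target = data['Dhaka_Dengue'].values

The scaled t column then plays the same role as the timestamp above when sorting the test points back into chronological order for plotting.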
