为了掌握 PyTorch 神经网络,我尝试用它复现一个已有的 TensorFlow 架构。然而,两者之间出现了巨大的性能差距:TensorFlow 在 25 个 epoch 内就能很好地完成学习,而 PyTorch 至少需要 250 个 epoch 才能达到类似的泛化效果。尽管我仔细检查了代码,仍然找不出还有哪里可以改进;两个神经网络的架构也已经仔细对齐,但差异依然存在。谁能指出这里还有什么问题吗?
在后续部分中,我将介绍这两种实现的完整 Python 代码,以及 CLI 输出和图形可视化。
可重复性:由于我不想共享原始数据集,所以我附上了一段模拟数据集的代码。生成的
data_inverter.csv
可用于重现观察到的行为。
PyTorch 代码:
# Standard library imports
import pandas as pd
import matplotlib.pyplot as plt
# External library imports
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error
# Loading dataset (column names are passed explicitly via `names`)
df_data = pd.read_csv("./data_inverter.csv", names=["pvt", "edge", "slew", "load", "delay"])

# Selecting subset of data based on specific conditions
df_select = df_data[(df_data["pvt"] == "PtypV1500T027") & (df_data["edge"] == "rise")]

# Splitting features and target variable
X = df_select.drop(["pvt", "edge", "delay"], axis='columns')
y = df_select["delay"]

# Scaling input features using Min-Max scaling (one scaler per feature column).
# NOTE: the scalers are fitted on the whole selection, i.e. before the train/test split.
slew_scaler = MinMaxScaler()
load_scaler = MinMaxScaler()
X_scaled = X.copy()
X_scaled["slew"] = slew_scaler.fit_transform(X_scaled.slew.values.reshape(-1, 1))
X_scaled["load"] = load_scaler.fit_transform(X_scaled.load.values.reshape(-1, 1))

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

# Converting data to float32 PyTorch tensors. The targets are reshaped to column
# vectors (n, 1) so that they line up with the (n, 1) output of the network.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# NumPy versions of the targets, used by the scikit-learn metrics below
y_train_np = y_train_tensor.numpy()
y_test_np = y_test_tensor.numpy()

# Setting random seed for reproducibility
torch.manual_seed(42)

# Defining neural network architecture: five ReLU hidden layers
# (128-128-64-32-16) followed by a single ELU output unit.
model = torch.nn.Sequential(
    torch.nn.Linear(X_train_tensor.shape[1], 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 1),
    torch.nn.ELU()
)

# Loss function and optimizer (Adam with its default hyper-parameters)
criterion = torch.nn.MSELoss()
criterion_val = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Training the model.
# NOTE: this is full-batch training: every epoch runs one forward/backward pass
# over the entire training set and therefore performs exactly one optimizer step.
num_epochs = 25
progress = {'loss': [], 'mae': [], 'mse': [], 'val_loss': [], 'val_mae': [], 'val_mse': []}
for epoch in range(num_epochs):
    # Forward pass
    y_predict = model(X_train_tensor)
    loss = criterion(y_predict, y_train_tensor)

    # Backward and optimize
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Validation (no gradients needed; switch back to training mode afterwards)
    with torch.no_grad():
        model.eval()
        y_test_predict = model(X_test_tensor)
        loss_val = criterion_val(y_test_predict, y_test_tensor)
        model.train()

    # Record progress; convert the predictions to NumPy once per epoch
    train_pred_np = y_predict.detach().numpy()
    val_pred_np = y_test_predict.numpy()
    progress['loss'].append(loss.item())
    progress['mae'].append(mean_absolute_error(y_train_np, train_pred_np))
    progress['mse'].append(mean_squared_error(y_train_np, train_pred_np))
    progress['val_loss'].append(loss_val.item())
    progress['val_mae'].append(mean_absolute_error(y_test_np, val_pred_np))
    progress['val_mse'].append(mean_squared_error(y_test_np, val_pred_np))

    # Epochs are reported 1-based so the last line reads "Epoch 25/25"
    print(f"Epoch {epoch + 1}/{num_epochs} - loss: {loss.item():.5f}")

# Displaying model summary
print(model)

# Plotting training progress
df_progress = pd.DataFrame(progress)
df_progress.plot()
plt.title("Model training progress: DNN PyTorch")
plt.tight_layout()
plt.show()

# Making predictions on the testing set
with torch.no_grad():
    model.eval()
    y_predict_tensor = model(X_test_tensor)
y_predict = y_predict_tensor.numpy()

# Displaying model performance metrics
print("Model performance metrics: DNN PyTorch")
print("MAX error:", max_error(y_test_np, y_predict))
print("MAE error:", mean_absolute_error(y_test_np, y_predict))
# NOTE: despite the label, the value printed here is the square root of the MSE
# (RMSE); the label is left unchanged so the output stays comparable with earlier
# runs. `** 0.5` replaces `squared=False`, which recent scikit-learn releases
# no longer accept.
print("MSE error:", mean_squared_error(y_test_np, y_predict) ** 0.5)

# Blue: target vs. prediction; orange dots: target vs. itself (ideal diagonal)
plt.scatter(y_test, y_predict)
plt.scatter(y_test, y_test, marker='.')
plt.title("Model predictions: DNN PyTorch")
plt.tight_layout()
plt.show()
TensorFlow 代码:
# Standard library imports
import pandas as pd
import matplotlib.pyplot as plt
# External library imports
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error
# Loading dataset (column names are passed explicitly via `names`)
df_data = pd.read_csv("./data_inverter.csv", names=["pvt", "edge", "slew", "load", "delay"])

# Selecting subset of data based on specific conditions
df_select = df_data[(df_data["pvt"] == "PtypV1500T027") & (df_data["edge"] == "rise")]

# Splitting features and target variable
X = df_select.drop(["pvt", "edge", "delay"], axis='columns')
y = df_select["delay"]

# Scaling input features using Min-Max scaling (one scaler per feature column).
# NOTE: the scalers are fitted on the whole selection, i.e. before the train/test split.
slew_scaler = MinMaxScaler()
load_scaler = MinMaxScaler()
X_scaled = X.copy()
X_scaled["slew"] = slew_scaler.fit_transform(X_scaled.slew.values.reshape(-1, 1))
X_scaled["load"] = load_scaler.fit_transform(X_scaled.load.values.reshape(-1, 1))

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

# Converting data to float32 TensorFlow tensors
X_train_tensor = tf.constant(X_train.values, dtype=tf.float32)
y_train_tensor = tf.constant(y_train.values, dtype=tf.float32)
X_test_tensor = tf.constant(X_test.values, dtype=tf.float32)
y_test_tensor = tf.constant(y_test.values, dtype=tf.float32)

# Setting random seed for reproducibility
tf.keras.utils.set_random_seed(42)

# Defining neural network architecture: five ReLU hidden layers
# (128-128-64-32-16) followed by a single ELU output unit.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_dim=X_train_tensor.shape[1]),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='elu')
])

# Compiling the model
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),  # Using Mean Squared Error loss function
    optimizer=tf.keras.optimizers.Adam(),  # Using Adam optimizer
    metrics=['mae', 'mse']  # Using Mean Absolute Error and Mean Squared Error as metrics
)

# Training the model. No batch_size is passed, so model.fit uses its own default
# batching; the test split doubles as validation data.
progress = model.fit(X_train_tensor, y_train_tensor, validation_data=(X_test_tensor, y_test_tensor), epochs=25)

# Evaluating model performance on the testing set
model.evaluate(X_test_tensor, y_test_tensor, verbose=2)

# Displaying model summary. summary() prints the table itself and returns None,
# which is why the recorded output ends with a "None" line.
print(model.summary())

# Plotting training progress
pd.DataFrame(progress.history).plot()
plt.title("Model training progress: DNN TensorFlow")
plt.tight_layout()
plt.show()

# Making predictions on the testing set
y_predict = model.predict(X_test_tensor)

# Displaying model performance metrics
print("Model performance metrics: DNN TensorFlow")
print("MAX error:", max_error(y_test_tensor, y_predict))
print("MAE error:", mean_absolute_error(y_test_tensor, y_predict))
# NOTE: despite the label, the value printed here is the square root of the MSE
# (RMSE); the label is left unchanged so the output stays comparable with earlier
# runs. `** 0.5` replaces `squared=False`, which recent scikit-learn releases
# no longer accept.
print("MSE error:", mean_squared_error(y_test_tensor, y_predict) ** 0.5)

# Blue: target vs. prediction; orange dots: target vs. itself (ideal diagonal)
plt.scatter(y_test, y_predict)
plt.scatter(y_test, y_test, marker='.')
plt.title("Model predictions: DNN TensorFlow")
plt.tight_layout()
plt.show()
25 个周期后 PyTorch 模型性能指标的 CLI 输出:
Sequential(
(0): Linear(in_features=2, out_features=128, bias=True)
(1): ReLU()
(2): Linear(in_features=128, out_features=128, bias=True)
(3): ReLU()
(4): Linear(in_features=128, out_features=64, bias=True)
(5): ReLU()
(6): Linear(in_features=64, out_features=32, bias=True)
(7): ReLU()
(8): Linear(in_features=32, out_features=16, bias=True)
(9): ReLU()
(10): Linear(in_features=16, out_features=1, bias=True)
(11): ELU(alpha=1.0)
)
Model performance metrics: DNN PyTorch
MAX error: 1.2864852
MAE error: 0.3353702
MSE error: 0.42874745
25 个周期后 TensorFlow 模型性能指标的 CLI 输出:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 128) 384
dense_1 (Dense) (None, 128) 16512
dense_2 (Dense) (None, 64) 8256
dense_3 (Dense) (None, 32) 2080
dense_4 (Dense) (None, 16) 528
dense_5 (Dense) (None, 1) 17
=================================================================
Total params: 27777 (108.50 KB)
Trainable params: 27777 (108.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
6/6 [==============================] - 0s 750us/step
Model performance metrics: DNN TensorFlow
MAX error: 0.013849139
MAE error: 0.0029576812
MSE error: 0.0036013061
PyTorch 散点图(橙色 = 目标与自身的对比,蓝色 = 目标与预测的对比):
TensorFlow 散点图(橙色 = 目标与自身的对比,蓝色 = 目标与预测的对比):
.
................................................ ...................................................... ................................................
附加信息(对问题和评论的回应):
torch.optim.Adam
- 默认学习率设置为 0.001。
tf.keras.optimizers.Adam
- 默认学习率设置为 0.001
.
................................................ ...................................................... ................................................
这是 250 个 epoch 后的 PyTorch 模型性能:
Sequential(
(0): Linear(in_features=2, out_features=128, bias=True)
(1): ReLU()
(2): Linear(in_features=128, out_features=128, bias=True)
(3): ReLU()
(4): Linear(in_features=128, out_features=64, bias=True)
(5): ReLU()
(6): Linear(in_features=64, out_features=32, bias=True)
(7): ReLU()
(8): Linear(in_features=32, out_features=16, bias=True)
(9): ReLU()
(10): Linear(in_features=16, out_features=1, bias=True)
(11): ELU(alpha=1.0)
)
Model performance metrics: DNN PyTorch
MAX error: 0.025619686
MAE error: 0.006687804
MSE error: 0.008531998
.
................................................ ...................................................... ................................................
如果您想运行重现问题,您可以使用此代码来模拟数据集:
import csv
import math
# Sample points of the first input; written to the third CSV column
# (read back as "slew" by the training scripts).
x_values = [0.003, 0.00354604, 0.00546274, 0.00912297, 0.0148254, 0.0228266, 0.0333551, 0.0466191, 0.0628111, 0.0821111, 0.104689, 0.130705, 0.160313, 0.193659, 0.230886, 0.272128, 0.317517, 0.36718, 0.42124, 0.479818, 0.54303, 0.61099, 0.683809, 0.761595, 0.844455, 0.932492, 1.02581, 1.1245, 1.22868, 1.33842, 1.45383, 1.57501, 1.70203, 1.835, 1.974]
# Sample points of the second input; written to the fourth CSV column
# (read back as "load" by the training scripts).
y_values = [0.001, 0.00102008, 0.00109058, 0.0012252, 0.00143494, 0.00172922, 0.00211646, 0.0026043, 0.00319984, 0.0039097, 0.0047401, 0.00569697, 0.00678594, 0.00801243, 0.00938161, 0.0108985, 0.0125679, 0.0143945, 0.0163828, 0.0185373, 0.0208622, 0.0233618, 0.0260401, 0.028901, 0.0319486, 0.0351866, 0.0386187, 0.0422487, 0.0460802, 0.0501166, 0.0543615, 0.0588182, 0.0634902, 0.0683808, 0.0734931, 0.0788305, 0.0843961, 0.0901929, 0.0962242, 0.102493, 0.109002, 0.115755, 0.122753, 0.130001, 0.137502, 0.145257, 0.153269, 0.161543, 0.170079, 0.178881]
# Synthetic target surface sqrt(5 * (x + 0.25)) * sqrt(3 * (y + 0.005)), evaluated on
# the full grid: z_values[i][j] belongs to x_values[i] and y_values[j].
z_values = [[math.sqrt(5*(x+0.25)) * math.sqrt(3*(y+0.005)) for y in y_values] for x in x_values]
# newline='' is required by the csv module for files handed to csv.writer; without
# it, platforms that translate "\n" to "\r\n" end up with an empty row after each record.
with open("./data_inverter.csv", 'w', newline='') as fid:
    writer = csv.writer(fid)
    # One row per grid point: fixed "pvt" and "edge" labels, then x, y and the target z
    for x, z_row in zip(x_values, z_values):
        for y, z in zip(y_values, z_row):
            writer.writerow(["PtypV1500T027", "rise", x, y, z])
区别在于,TensorFlow 的
model.fit
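默认使用小批量(mini-batch)训练*(批大小为 32,请参阅 model.fit
的文档),而您的 PyTorch 训练循环使用的是全批量(batch)训练*,即每个 epoch 只用全部训练样本做一次权重更新。因此,在 25 个 epoch 中,您的 PyTorch 模型仅执行 25 次权重更新,而 TensorFlow 模型执行约 (N/32)*25
(其中 N
是您的训练样本数;对于上面的模拟数据集,N = 1575,即每个 epoch 50 次、总共约 1250 次更新)。理论上,您可以通过在 PyTorch 中使用更大的学习率来弥补这一点,但如果您使用带有内部状态的优化器(例如 Adam),这样做的效果并不好。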
通过实现小批量处理,您可以在 PyTorch 中获得类似的结果:
batch_size = 32
for epoch in range(num_epochs):
    # Predictions of all mini-batches of this epoch; concatenated below so that
    # `y_predict` again covers the whole training set.
    batches = list()
    # mini-batching: split the training tensors into consecutive chunks of
    # `batch_size` rows and do one optimizer step per chunk
    for x_batch, y_true in zip(
        torch.split(X_train_tensor, batch_size, dim=0),
        torch.split(y_train_tensor, batch_size, dim=0),
    ):
        # Forward pass
        y_predict_batch = model(x_batch)
        loss = criterion(y_predict_batch, y_true)
        # Store a detached copy so the autograd graph of each batch is not kept alive
        batches.append(y_predict_batch.detach())
        # Backward and optimize
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # NOTE: after the inner loop `loss` holds the loss of the last mini-batch only
    y_predict = torch.concat(batches, dim=0)
    # Validation
    with torch.no_grad():
        model.eval()
        y_test_predict = model(X_test_tensor)
        loss_val = criterion_val(y_test_predict, y_test_tensor)
        model.train()
请注意,在实际使用中,建议采用 PyTorch 自带的
torch.utils.data
模块(例如 DataLoader)来进行小批量处理,而不是我的这个简易实现。
*:有关批处理和小批量的区别,请参阅这个问题:深度学习中“小批量”的含义是什么?