我尝试实现一个基于自动编码器的异常检测器,以查找数据集中的异常情况
KDDTrain+
。这实际上是一个非常简单的实现。不幸的是,我未能以可重复的方式实施培训程序。我仅在单个 CPU 上训练网络,并播种了所有涉及的包 - 特别是 TensorFlow - 但我没有得到所需的结果。
我实现的代码如下:
import pandas as pd
import numpy as np
import tensorflow as tf
SEED = 42
tf.config.experimental.enable_op_determinism()
tf.random.set_seed(SEED)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
def load_and_prepare_data():
# load data from file
col_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment",
"urgent", "hot", "num_failed_logins", "logged_in",
"num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
"num_access_files", "num_outbound_cmds",
"is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
"rerror_rate", "srv_rerror_rate", "same_srv_rate",
"diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
"dst_host_same_srv_rate", "dst_host_diff_srv_rate",
"dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
"dst_host_srv_serror_rate", "dst_host_rerror_rate",
"dst_host_srv_rerror_rate", "label"]
df = pd.read_csv("../data/KDDTrain+_20Percent.txt", header=None, names=col_names, index_col=False)
# produce numerical labels for categorical data
categorical_variables = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
categorical_data = pd.get_dummies(df[categorical_variables])
numerical_variables = list(set(df.columns.values.tolist()) - set(categorical_variables))
numerical_variables.remove('label')
numerical_data = df[numerical_variables].copy()
df_preprocessed = pd.concat([numerical_data, categorical_data], axis=1)
# create data split
labels = df['label'].copy()
label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(labels)
n = int(len(df_preprocessed) * 0.75)
x_train = df_preprocessed.loc[0:n, :]
y_train = integer_labels[0:n]
x_test = df_preprocessed.loc[n:len(df_preprocessed), :]
y_test = integer_labels[n:len(df_preprocessed)]
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_train = x_train.astype(np.float32)
x_test = scaler.transform(x_test)
x_test = x_test.astype(np.float32)
return x_train, y_train, x_test, y_test
def build_model(input_dim, latent_space_dim, num_neurons_per_layer_list, activation_func):
# input layer
input_data = tf.keras.layers.Input(shape=(input_dim,), name='encoder_input')
# hidden layers of encoder
num_hidden_layers = len(num_neurons_per_layer_list)
encoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[0], activation=activation_func, name='encoder_0',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(input_data)
for i in range(1, num_hidden_layers):
encoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[i], activation=activation_func,
name='encoder_{:d}'.format(i),
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(encoder)
# bottleneck layer
latent_encoding = tf.keras.layers.Dense(latent_space_dim, activation='linear', name='latent_encoding',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(encoder)
# hidden layers of decoder
decoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[num_hidden_layers - 1], activation=activation_func,
name='decoder_0',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(latent_encoding)
for i in range(1, num_hidden_layers):
decoder = tf.keras.layers.Dense(units=num_neurons_per_layer_list[num_hidden_layers - 1 - i],
activation=activation_func,
name='decoder_{:d}'.format(i),
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoder)
# output layer
reconstructed_data = tf.keras.layers.Dense(units=input_dim, activation='linear', name='reconstructed_data',
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
bias_initializer=tf.keras.initializers.GlorotUniform(seed=SEED))(decoder)
autoencoder_model = tf.keras.models.Model(input_data, reconstructed_data)
return autoencoder_model
def main():
x_train, y_train, x_test, y_test = load_and_prepare_data()
input_dim = x_train.shape[1]
latent_space_dim = 4
num_neurons_per_layer_list = [16, 48, 64, 96]
activation_func = 'relu'
autoencoder_model = build_model(input_dim, latent_space_dim, num_neurons_per_layer_list, activation_func)
learning_rate = 1e-4
loss_function = 'mse'
# opt = optimizers.Adam(learning_rate=learning_rate)
autoencoder_model.compile(optimizer='adam', loss=loss_function)
history = autoencoder_model.fit(x_train, x_train, shuffle=False, epochs=10, batch_size=512,
validation_data=(x_test, x_test))
main()
我希望每次运行都能得到相同的损失、网络中相同的权重和偏差以及相同的评估结果。
运行相同的代码两次,我得到以下结果:
第一次运行:
纪元1/5 79/79 [=======================] - 1s 7ms/步 - 损失:0.0779 - val_loss:0.0628
纪元2/5 79/79 [=======================] - 0s 5ms/步 - 损耗:0.0514 - val_loss:0.0397
纪元3/5 79/79 [=======================] - 0s 4ms/步 - 损失:0.0311 - val_loss:0.0236
纪元4/5 79/79 [=======================] - 0s 5ms/步 - 损耗:0.0193 - val_loss:0.0157
纪元5/5 79/79 [=======================] - 0s 5ms/步 - 损耗:0.0146 - val_loss:0.0130
第二次运行:
纪元1/5 79/79 [=======================] - 1s 7ms/步 - 损失:0.0726 - val_loss:0.0589
纪元2/5 79/79 [=======================] - 0s 5ms/步 - 损失:0.0475 - val_loss:0.0363
纪元3/5 79/79 [=======================] - 0s 5ms/步 - 损失:0.0297 - val_loss:0.0233
纪元4/5 79/79 [=======================] - 0s 5ms/步 - 损耗:0.0185 - val_loss:0.0144
纪元5/5 79/79 [=======================] - 0s 5ms/步 - 损耗:0.0131 - val_loss:0.0115
终于我找到了解决我的问题的方法。首先,有必要为所有涉及的随机数生成器提供种子。使用方法很方便
tensorflow.keras.utils.set_random_seed(seed_val)
——调用这个方法就相当于调用
确定性地再现训练结果的一个重要要求是放弃使用 GPU。为此,请使用调用
第一个命令仅在训练时用于针对 CPU。后面的命令用于放弃 CPU 端的并行执行。请注意,执行这些命令可能会增加代码的运行时间。
最后,在我发布的示例中,问题与数据相关。在某一时刻,我通过
提取了所有数值变量/特征numerical_variables = list(set(df.columns.values.tolist()) - set(categorical_variables))
由于将列表转换为集合,这行代码不会给出可重现的结果。通过
numerical_variables.sort()
按字母顺序对结果列表进行排序(将集合转换回列表后)可以解决问题。
注释
我使用tensorflow==2.12.0。据我所知,上述所有陈述和解释对于tensorflow>=2.0.0都有效。对于使用以前的张量流版本的种子应用程序,请参阅here - 无论如何,我认为这给出了对该主题的非常好的解释。
据我所知可以使用这些命令
在 GPU 上运行训练时也能实现确定性。但我没有测试过。