我有以下代码,用于基于以下链接将深度学习模型应用于 LunarLander-v2 环境: LunarLander-v2
代码在这里:
# import gymnasium as gym
import tensorflow as tf
import gym
import os
import random
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
import numpy as np
import scipy
import uuid
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import keras.backend as K
input_shape =(9,)
outputs =4
def masked_huber_loss(mask_value,clip_delta):
def f(y_true,y_pred):
error =y_true -y_pred
cond =K.abs(error)
mask_true =K.cast(K.not_equal(y_true,mask_value),K.floatx)
masked_squared_error = 0.5 * K.square(mask_true * (y_true - y_pred))
linear_loss = mask_true * (clip_delta * K.abs(error) - 0.5 * (clip_delta ** 2))
huber_loss = tf.where(cond, masked_squared_error, linear_loss)
return K.sum(huber_loss) / K.sum(mask_true)
f.__name__ = 'masked_huber_loss'
return f
def create_model(learning_rate, regularization_factor):
model = Sequential([
Dense(64, input_shape=input_shape, activation="relu", kernel_regularizer=l2(regularization_factor)),
Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor)),
Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor)),
Dense(outputs, activation='linear', kernel_regularizer=l2(regularization_factor))
])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=masked_huber_loss(0.0, 1.0))
return model
def get_q_values(model,state):
input =state
print(model.predict(input))
return model.predict(input)[0]
def get_multiple_q_values(model,states):
print(model.predict(states))
return model.predict(states)
def select_action_epsilon_greedy(q_values, epsilon):
random_value = random.uniform(0, 1)
if random_value < epsilon:
return random.randint(0, len(q_values) - 1)
else:
return np.argmax(q_values)
def select_best_action(q_values):
return np.argmax(q_values)
class StateTransition():
def __init__(self, old_state, action, reward, new_state, done):
self.old_state = old_state
self.action = action
self.reward = reward
self.new_state = new_state
self.done = done
class ReplayBuffer():
current_index = 0
def __init__(self, size = 10000):
self.size = size
self.transitions = []
def add(self, transition):
if len(self.transitions) < self.size:
self.transitions.append(transition)
else:
self.transitions[self.current_index] = transition
self.__increment_current_index()
def length(self):
return len(self.transitions)
def get_batch(self, batch_size):
return random.sample(self.transitions, batch_size)
def __increment_current_index(self):
self.current_index += 1
if self.current_index >= self.size - 1:
self.current_index = 0
def calculate_target_values(model, target_model, state_transitions, discount_factor):
states = []
new_states = []
for transition in state_transitions:
states.append(transition.old_state)
new_states.append(transition.new_state)
new_states = np.array(new_states)
q_values_new_state = get_multiple_q_values(model, new_states)
q_values_new_state_target_model = get_multiple_q_values(target_model, new_states)
targets = []
for index, state_transition in enumerate(state_transitions):
best_action = select_best_action(q_values_new_state[index])
best_action_next_state_q_value = q_values_new_state_target_model[index][best_action]
if state_transition.done:
target_value = state_transition.reward
else:
target_value = state_transition.reward + discount_factor * best_action_next_state_q_value
target_vector = [0] * outputs
target_vector[state_transition.action] = target_value
targets.append(target_vector)
return np.array(targets)
def train_model(model, states, targets):
model.fit(states, targets, epochs=1, batch_size=len(targets), verbose=0)
def copy_model(model):
backup_file = 'backup_'+str(uuid.uuid4())
model.save(backup_file)
new_model = load_model(backup_file, custom_objects={ 'masked_huber_loss': masked_huber_loss(0.0, 1.0) })
shutil.rmtree(backup_file)
return new_model
class AverageRewardTracker():
current_index = 0
def __init__(self, num_rewards_for_average=100):
self.num_rewards_for_average = num_rewards_for_average
self.last_x_rewards = []
def add(self, reward):
if len(self.last_x_rewards) < self.num_rewards_for_average:
self.last_x_rewards.append(reward)
else:
self.last_x_rewards[self.current_index] = reward
self.__increment_current_index()
def __increment_current_index(self):
self.current_index += 1
if self.current_index >= self.num_rewards_for_average:
self.current_index = 0
def get_average(self):
return np.average(self.last_x_rewards)
class FileLogger():
def __init__(self, file_name='progress.log'):
self.file_name = file_name
self.clean_progress_file()
def log(self, episode, steps, reward, average_reward):
f = open(self.file_name, 'a+')
f.write(f"{episode};{steps};{reward};{average_reward}\n")
f.close()
def clean_progress_file(self):
if os.path.exists(self.file_name):
os.remove(self.file_name)
f = open(self.file_name, 'a+')
f.write("episode;steps;reward;average\n")
f.close()
replay_buffer_size = 200000
learning_rate = 0.001
regularization_factor = 0.001
training_batch_size = 128
training_start = 256
max_episodes = 10000
max_steps = 1000
target_network_replace_frequency_steps = 1000
model_backup_frequency_episodes = 100
starting_epsilon = 1.0
minimum_epsilon = 0.01
epsilon_decay_factor_per_episode = 0.995
discount_factor = 0.99
train_every_x_steps = 4
env =gym.make("LunarLander-v2",render_mode ="human")
# observation,info =env.reset(seed=42)
# for _ in range(1000):
# action =env.action_space.sample()
# observation,reward,terminated,truncated,info =env.step(action)
# if terminated or truncated:
# observation,info =env.reset()
# env.close()
# print(f"Input: {env.observation_space.shape}")
# print(f"Output: {env.action_space}")
replay_buffer = ReplayBuffer(replay_buffer_size)
model = create_model(learning_rate, regularization_factor)
target_model = copy_model(model)
epsilon = starting_epsilon
step_count = 0
average_reward_tracker = AverageRewardTracker(100)
file_logger = FileLogger()
for episode in range(max_episodes):
print(f"Starting episode {episode} with epsilon {epsilon}")
episode_reward = 0
state = env.reset()
# print(state)
fraction_finished = 0.0
state = np.append(state[0], fraction_finished)
state =state.reshape(1,state.shape[0])
first_q_values = get_q_values(model, state)
print(f"Q values: {first_q_values}")
print(f"Max Q: {max(first_q_values)}")
for step in range(1, max_steps + 1):
step_count += 1
q_values = get_q_values(model, state)
action = select_action_epsilon_greedy(q_values, epsilon)
new_state, reward, terminated, truncated, info = env.step(action)
done = truncated or terminated
fraction_finished = (step + 1) / max_steps
new_state = np.append(new_state, fraction_finished)
episode_reward += reward
if step == max_steps:
print(f"Episode reached the maximum number of steps. {max_steps}")
done = True
state_transition = StateTransition(state, action, reward, new_state, done)
replay_buffer.add(state_transition)
state = new_state
if step_count % target_network_replace_frequency_steps == 0:
print("Updating target model")
target_model = copy_model(model)
if replay_buffer.length() >= training_start and step_count % train_every_x_steps == 0:
batch = replay_buffer.get_batch(batch_size=training_batch_size)
targets = calculate_target_values(model, target_model, batch, discount_factor)
states = np.array([state_transition.old_state for state_transition in batch])
train_model(model, states, targets)
if done:
break
average_reward_tracker.add(episode_reward)
average = average_reward_tracker.get_average()
print(
f"episode {episode} finished in {step} steps with reward {episode_reward}. "
f"Average reward over last 100: {average}")
if episode != 0 and episode % model_backup_frequency_episodes == 0:
backup_file = f"model_{episode}.h5"
print(f"Backing up model to {backup_file}")
model.save(backup_file)
epsilon *= epsilon_decay_factor_per_episode
epsilon = max(minimum_epsilon, epsilon)
data = pd.read_csv(file_logger.file_name, sep=';')
plt.figure(figsize=(20,10))
plt.plot(data['average'])
plt.plot(data['reward'])
plt.title('Reward')
plt.ylabel('Reward')
plt.xlabel('Episode')
plt.legend(['Average reward', 'Reward'], loc='upper right')
plt.show()
当我运行此代码时,首先打印以下值:
Starting episode 0 with epsilon 1.0
1/1 [==============================] - 0s 80ms/step
[[ 0.06509031 -0.02552683 0.03243302 0.1490265 ]]
1/1 [==============================] - 0s 19ms/step
Q values: [ 0.06509031 -0.02552683 0.03243302 0.1490265 ]
Max Q: 0.14902649819850922
1/1 [==============================] - 0s 18ms/step
[[ 0.06509031 -0.02552683 0.03243302 0.1490265 ]]
1/1 [==============================] - 0s 20ms/step
但突然产生错误:
In[0] ndims must be >= 2: 1
[[{{node sequential/dense/MatMul}}]] [Op:__inference_predict_function_1841]
它还显示几行:例如
Traceback (most recent call last):
File "C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py", line 220, in <module>
q_values = get_q_values(model, state)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py", line 46, in get_q_values
print(model.predict(input))
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\PycharmProjects\Machine_Learning_2023\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
我已经尝试了几次代码更改,但仍然存在这个问题,您能帮我看看这个错误的原因是什么吗?我哪里出错了?
In[0] ndims 必须 >= 2: 1 [[{{节点顺序/密集/MatMul}}]] [操作:__inference_predict_function_1841]
您的输入数据的维度存在问题。模型期望输入至少具有二维,但您传递的输入仅具有一维。
回溯(最近一次调用最后一次):文件 “C:\用户\用户\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py”, 第 220 行,在 q_values = get_q_values(模型,状态)
错误发生在名为“gymnasium_Example.py”的文件的第 220 行。 文件
“C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py”, 第 46 行,在 get_q_values 中 打印(模型.预测(输入)
错误出现在同一文件第 46 行的 model.predict(input) 调用中 文件“C:\ Users \ User \ PycharmProjects \ Machine_Learning_2023 env \ Lib \ site-packages \ keras \ src \ utils raceback_utils.py”,第70行,在error_handler
因此,您遇到此错误有一些原因:
您传递给模型预测函数的输入可能不具有模型期望的正确形状。请仔细检查模型的构造和初始化代码。
注意:- 检查 keras 是否正确安装或最新,并完全满足与其他库的所有兼容性。