I copied this code from a deep Q-learning tutorial to try to learn from it, but the source is fairly old, so there have been a lot of problems with both Keras and OpenAI Gym. I've spent hours working through endless errors, and as soon as I resolve one, another appears; at this point I have no idea what is going on. If anyone could take a look and help, it would be greatly appreciated.
import random
from collections import deque

import gym
import numpy as np
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

# Create Environment
env_name = 'CartPole-v1'
env = gym.make(env_name)
num_observations = env.observation_space.shape[0]
num_actions = env.action_space.n
# Create Neural Network
model = Sequential()
model.add(Dense(4, input_shape=(4,)))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(num_actions))
model.add(Activation('linear'))
# Create Target Network
target_model = clone_model(model)
target_model.set_weights(model.get_weights())  # clone_model copies the architecture but not the weights
# Define Parameters
EPOCHS = 1000
BATCH_SIZE = 32
epsilon = 1.0
EPSILON_REDUCE = 0.995
LEARNING_RATE = 0.001
GAMMA = 0.95
def epsilon_greedy_action_selection(model, epsilon, observation):
    if np.random.random() > epsilon:
        # Exploit: choose the action with the highest predicted Q-value
        prediction = model.predict(observation)
        action = np.argmax(prediction)
    else:
        # Explore: choose a random action from the action space
        action = np.random.randint(0, env.action_space.n)
    return action
replay_buffer = deque(maxlen=20000)
update_target_model = 10
def replay(replay_buffer, batch_size, model, target_model):
    # Only train once the buffer holds at least one full batch
    if len(replay_buffer) < batch_size:
        return
    samples = random.sample(replay_buffer, batch_size)
    target_batch = []
    zipped_samples = list(zip(*samples))
    states, actions, rewards, new_states, terminateds, truncateds = zipped_samples
    targets = target_model.predict(np.array(states))
    q_values = model.predict(np.array(new_states))
    for i in range(batch_size):
        q_value = max(q_values[i][0])
        target = targets[i].copy()
        if terminateds[i] or truncateds[i]:
            target[0][actions[i]] = rewards[i]
        else:
            # Standard Q-learning target: reward plus discounted max future Q-value
            target[0][actions[i]] = rewards[i] + q_value * GAMMA
        target_batch.append(target)
    model.fit(np.array(states), np.array(target_batch), epochs=1, verbose=1)
def update_model_handler(epoch, update_target_model, model, target_model):
    # Copy the online network's weights into the target network every few epochs
    if epoch > 0 and epoch % update_target_model == 0:
        target_model.set_weights(model.get_weights())
# Train the model
model.compile(loss='mse', optimizer=Adam(learning_rate=LEARNING_RATE))
best_so_far = 0
for epoch in range(EPOCHS):
    observation = env.reset()[0]
    observation = np.asarray(observation)
    observation = observation.reshape([1, 4])
    terminated = False
    truncated = False
    points = 0
    while True:
        action = epsilon_greedy_action_selection(model, epsilon, observation)
        next_observation, reward, terminated, truncated, info = env.step(action)
        next_observation = next_observation.reshape([1, 4])
        replay_buffer.append((observation, action, reward, next_observation, terminated, truncated))
        observation = next_observation
        points += 1
        replay(replay_buffer, BATCH_SIZE, model, target_model)
        if terminated or truncated:
            break
    epsilon *= EPSILON_REDUCE
    update_model_handler(epoch, update_target_model, model, target_model)
    if points > best_so_far:
        best_so_far = points
    if epoch % 25 == 0:
        print(f"{epoch}: POINTS: {points} eps: {epsilon} BSF: {best_so_far}")
The code currently throws the following error: ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 4), found shape=(32, 1, 4). And besides that one, plenty of other errors keep popping up as well.
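If I'm reading the traceback correctly, the mismatch comes from how replay() assembles the batch: each stored observation keeps its (1, 4) shape, so stacking 32 of them with np.array produces a (32, 1, 4) array rather than the (32, 4) batch the Dense layer expects. Here is a minimal sketch of what I think is happening; the reshape at the end is my own guess at a fix, not something from the original tutorial:

import numpy as np

# Observations are stored with shape (1, 4), exactly as in my loop above
batch = [np.zeros((1, 4)) for _ in range(32)]

stacked = np.array(batch)
print(stacked.shape)      # (32, 1, 4) -> the shape Keras complains about

flattened = stacked.reshape(32, 4)
print(flattened.shape)    # (32, 4)    -> matches input_shape=(4,)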
Like I said, I've tried for hours and haven't made any progress in understanding what the problem is. I believe the Keras API has changed somewhat since I copied this source code, and that this is what is causing the errors.
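One concrete change I do know of: the tutorial was presumably written against the old Gym API, while I have already adapted my code to the newer one (Gym 0.26+), where reset() returns an (observation, info) tuple and step() returns five values instead of four. A small sketch of the difference as I understand it, in case it matters:

import gym

env = gym.make('CartPole-v1')

# Old API (pre-0.26), which I believe the tutorial used:
#   observation = env.reset()
#   observation, reward, done, info = env.step(action)

# New API (0.26+), which my code above already follows:
observation, info = env.reset()
action = env.action_space.sample()
observation, reward, terminated, truncated, info = env.step(action)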