"ndims must be >= 2: 1" error in a reinforcement learning problem

Problem description

I have the following code, which applies a deep learning model to the LunarLander-v2 environment, based on this link: LunarLander-v2

Here is the code:

# import gymnasium as gym
import tensorflow as tf
import gym
import os
import random
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
import numpy as np
import scipy
import uuid
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import keras.backend as K
input_shape = (9,)
outputs = 4
def masked_huber_loss(mask_value, clip_delta):
    def f(y_true, y_pred):
        error = y_true - y_pred
        cond = K.abs(error) < clip_delta
        mask_true = K.cast(K.not_equal(y_true, mask_value), K.floatx())
        masked_squared_error = 0.5 * K.square(mask_true * (y_true - y_pred))
        linear_loss = mask_true * (clip_delta * K.abs(error) - 0.5 * (clip_delta ** 2))
        huber_loss = tf.where(cond, masked_squared_error, linear_loss)
        return K.sum(huber_loss) / K.sum(mask_true)

    f.__name__ = 'masked_huber_loss'
    return f
def create_model(learning_rate, regularization_factor):
    model = Sequential([
        Dense(64, input_shape=input_shape, activation="relu", kernel_regularizer=l2(regularization_factor)),
        Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor)),
        Dense(64, activation="relu", kernel_regularizer=l2(regularization_factor)),
        Dense(outputs, activation='linear', kernel_regularizer=l2(regularization_factor))
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=masked_huber_loss(0.0, 1.0))

    return model


def get_q_values(model,state):
    input = state
    print(model.predict(input))
    return model.predict(input)[0]
def get_multiple_q_values(model,states):
    print(model.predict(states))
    return model.predict(states)
def select_action_epsilon_greedy(q_values, epsilon):
  random_value = random.uniform(0, 1)
  if random_value < epsilon:
    return random.randint(0, len(q_values) - 1)
  else:
    return np.argmax(q_values)
def select_best_action(q_values):
  return np.argmax(q_values)
class StateTransition():

  def __init__(self, old_state, action, reward, new_state, done):
    self.old_state = old_state
    self.action = action
    self.reward = reward
    self.new_state = new_state
    self.done = done

class ReplayBuffer():
  current_index = 0

  def __init__(self, size = 10000):
    self.size = size
    self.transitions = []

  def add(self, transition):
    if len(self.transitions) < self.size:
      self.transitions.append(transition)
    else:
      self.transitions[self.current_index] = transition
      self.__increment_current_index()

  def length(self):
    return len(self.transitions)

  def get_batch(self, batch_size):
    return random.sample(self.transitions, batch_size)

  def __increment_current_index(self):
    self.current_index += 1
    if self.current_index >= self.size - 1:
      self.current_index = 0


def calculate_target_values(model, target_model, state_transitions, discount_factor):
    states = []
    new_states = []
    for transition in state_transitions:
        states.append(transition.old_state)
        new_states.append(transition.new_state)

    new_states = np.array(new_states)

    q_values_new_state = get_multiple_q_values(model, new_states)
    q_values_new_state_target_model = get_multiple_q_values(target_model, new_states)

    targets = []
    for index, state_transition in enumerate(state_transitions):
        best_action = select_best_action(q_values_new_state[index])
        best_action_next_state_q_value = q_values_new_state_target_model[index][best_action]

        if state_transition.done:
            target_value = state_transition.reward
        else:
            target_value = state_transition.reward + discount_factor * best_action_next_state_q_value

        target_vector = [0] * outputs
        target_vector[state_transition.action] = target_value
        targets.append(target_vector)

    return np.array(targets)
def train_model(model, states, targets):
  model.fit(states, targets, epochs=1, batch_size=len(targets), verbose=0)
def copy_model(model):
  backup_file = 'backup_'+str(uuid.uuid4())
  model.save(backup_file)
  new_model = load_model(backup_file, custom_objects={ 'masked_huber_loss': masked_huber_loss(0.0, 1.0) })
  shutil.rmtree(backup_file)
  return new_model
class AverageRewardTracker():
  current_index = 0

  def __init__(self, num_rewards_for_average=100):
    self.num_rewards_for_average = num_rewards_for_average
    self.last_x_rewards = []

  def add(self, reward):
    if len(self.last_x_rewards) < self.num_rewards_for_average:
      self.last_x_rewards.append(reward)
    else:
      self.last_x_rewards[self.current_index] = reward
      self.__increment_current_index()

  def __increment_current_index(self):
    self.current_index += 1
    if self.current_index >= self.num_rewards_for_average:
      self.current_index = 0

  def get_average(self):
    return np.average(self.last_x_rewards)


class FileLogger():

  def __init__(self, file_name='progress.log'):
    self.file_name = file_name
    self.clean_progress_file()

  def log(self, episode, steps, reward, average_reward):
    f = open(self.file_name, 'a+')
    f.write(f"{episode};{steps};{reward};{average_reward}\n")
    f.close()

  def clean_progress_file(self):
    if os.path.exists(self.file_name):
      os.remove(self.file_name)
    f = open(self.file_name, 'a+')
    f.write("episode;steps;reward;average\n")
    f.close()

replay_buffer_size = 200000
learning_rate = 0.001
regularization_factor = 0.001
training_batch_size = 128
training_start = 256
max_episodes = 10000
max_steps = 1000
target_network_replace_frequency_steps = 1000
model_backup_frequency_episodes = 100
starting_epsilon = 1.0
minimum_epsilon = 0.01
epsilon_decay_factor_per_episode = 0.995
discount_factor = 0.99
train_every_x_steps = 4


env = gym.make("LunarLander-v2", render_mode="human")
# observation,info =env.reset(seed=42)
# for  _ in range(1000):
#     action =env.action_space.sample()
#     observation,reward,terminated,truncated,info =env.step(action)
#     if terminated or truncated:
#         observation,info =env.reset()
# env.close()
# print(f"Input: {env.observation_space.shape}")
# print(f"Output: {env.action_space}")
replay_buffer = ReplayBuffer(replay_buffer_size)
model = create_model(learning_rate, regularization_factor)
target_model = copy_model(model)
epsilon = starting_epsilon
step_count = 0
average_reward_tracker = AverageRewardTracker(100)
file_logger = FileLogger()

for episode in range(max_episodes):
    print(f"Starting episode {episode} with epsilon {epsilon}")

    episode_reward = 0
    state = env.reset()
    # print(state)
    fraction_finished = 0.0
    state = np.append(state[0], fraction_finished)
    state = state.reshape(1, state.shape[0])

    first_q_values = get_q_values(model, state)
    print(f"Q values: {first_q_values}")
    print(f"Max Q: {max(first_q_values)}")

    for step in range(1, max_steps + 1):
        step_count += 1
        q_values = get_q_values(model, state)
        action = select_action_epsilon_greedy(q_values, epsilon)
        new_state, reward, terminated, truncated, info = env.step(action)
        done = truncated or terminated

        fraction_finished = (step + 1) / max_steps
        new_state = np.append(new_state, fraction_finished)

        episode_reward += reward

        if step == max_steps:
            print(f"Episode reached the maximum number of steps. {max_steps}")
            done = True

        state_transition = StateTransition(state, action, reward, new_state, done)
        replay_buffer.add(state_transition)

        state = new_state

        if step_count % target_network_replace_frequency_steps == 0:
            print("Updating target model")
            target_model = copy_model(model)

        if replay_buffer.length() >= training_start and step_count % train_every_x_steps == 0:
            batch = replay_buffer.get_batch(batch_size=training_batch_size)
            targets = calculate_target_values(model, target_model, batch, discount_factor)
            states = np.array([state_transition.old_state for state_transition in batch])
            train_model(model, states, targets)

        if done:
            break

    average_reward_tracker.add(episode_reward)
    average = average_reward_tracker.get_average()

    print(
        f"episode {episode} finished in {step} steps with reward {episode_reward}. "
        f"Average reward over last 100: {average}")

    if episode != 0 and episode % model_backup_frequency_episodes == 0:
        backup_file = f"model_{episode}.h5"
        print(f"Backing up model to {backup_file}")
        model.save(backup_file)

    epsilon *= epsilon_decay_factor_per_episode
    epsilon = max(minimum_epsilon, epsilon)
data = pd.read_csv(file_logger.file_name, sep=';')

plt.figure(figsize=(20,10))
plt.plot(data['average'])
plt.plot(data['reward'])
plt.title('Reward')
plt.ylabel('Reward')
plt.xlabel('Episode')
plt.legend(['Average reward', 'Reward'], loc='upper right')
plt.show()

When I run this code, it first prints the following values:

Starting episode 0 with epsilon 1.0
1/1 [==============================] - 0s 80ms/step
[[ 0.06509031 -0.02552683  0.03243302  0.1490265 ]]
1/1 [==============================] - 0s 19ms/step
Q values: [ 0.06509031 -0.02552683  0.03243302  0.1490265 ]
Max Q: 0.14902649819850922
1/1 [==============================] - 0s 18ms/step
[[ 0.06509031 -0.02552683  0.03243302  0.1490265 ]]
1/1 [==============================] - 0s 20ms/step

but then it suddenly raises an error:

In[0] ndims must be >= 2: 1
     [[{{node sequential/dense/MatMul}}]] [Op:__inference_predict_function_1841]

It also prints several more lines, for example:

Traceback (most recent call last):
  File "C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py", line 220, in <module>
    q_values = get_q_values(model, state)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py", line 46, in get_q_values
    print(model.predict(input))
          ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\PycharmProjects\Machine_Learning_2023\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler

I have tried several code changes, but the problem persists. Could you help me figure out what is causing this error? Where am I going wrong?

python tensorflow keras reinforcement-learning
1 Answer

In[0] ndims must be >= 2: 1 [[{{node sequential/dense/MatMul}}]] [Op:__inference_predict_function_1841]

There is a problem with the dimensionality of your input data. The model expects an input with at least two dimensions, a batch dimension plus the feature dimension, but the input you are passing has only one dimension.
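You can see the difference just by looking at the array shapes (a minimal, standalone sketch; the names here are only illustrative):

import numpy as np

state = np.zeros(9)               # 1-D, shape (9,): the Dense layer's MatMul rejects this ("ndims must be >= 2")
batched = state[np.newaxis, ...]  # 2-D, shape (1, 9): a batch of one sample, which is what model.predict expects

print(state.shape)    # (9,)
print(batched.shape)  # (1, 9)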

Traceback (most recent call last):
  File "C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py", line 220, in <module>
    q_values = get_q_values(model, state)

The error is triggered at line 220 of gymnasium_Example.py, the call to get_q_values inside the training loop.

  File "C:\Users\User\PycharmProjects\Machine_Learning_2023\gymnasium_Example.py", line 46, in get_q_values
    print(model.predict(input))

It is raised by the model.predict(input) call at line 46 of the same file, and surfaces through Keras' error handler in venv\Lib\site-packages\keras\src\utils\traceback_utils.py (line 70, error_handler).

So, here is why you are hitting this error:

The input you pass to model.predict does not have the shape the model expects. In your training loop you reshape the state to (1, 9) only once, right after env.reset(). After the first step you assign state = new_state, and new_state is the raw 1-D observation of shape (9,) (the eight LunarLander values plus the appended fraction_finished), so the next call to get_q_values feeds a 1-D array into the first Dense layer's MatMul, which is exactly what "ndims must be >= 2: 1" is complaining about.
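One way to fix it (a sketch of the idea, not a drop-in tested patch): add the batch dimension inside get_q_values itself, so every state, including the ones coming back from env.step(), is expanded right before prediction. The name model_input below is just to avoid shadowing the built-in input:

def get_q_values(model, state):
    # state arrives as a 1-D array of shape (9,); add a batch dimension -> (1, 9)
    model_input = state[np.newaxis, ...]
    prediction = model.predict(model_input)
    print(prediction)
    return prediction[0]

If you do that, also drop the state = state.reshape(1, state.shape[0]) line after env.reset(), so that every state stored in the replay buffer stays 1-D and np.array([...]) in the training step keeps a consistent (batch_size, 9) shape. Alternatively, keep get_q_values as it is and reshape new_state to (1, 9) before assigning it to state inside the loop; the important thing is that the state has the same shape on every call.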

Note: also check that Keras is installed correctly and is up to date, and that it is fully compatible with the other libraries you are using.
