I'm implementing DQN on Atari and I have some shape-related issues


If you have a basic Atari DQN written from scratch (apart from the wrappers), please share the file.

I randomly get one of two errors:

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part.
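
For context, this NumPy error means np.array was asked to build an array from elements whose shapes don't match. A minimal, hypothetical reproduction (not taken from the code below) that triggers the same message on recent NumPy versions:

import numpy as np

# 31 frame stacks of shape (4, 84, 84) plus one 2-element tuple cannot be
# packed into a single homogeneous array of shape (32, 4, 84, 84)
obs = np.zeros((4, 84, 84), dtype=np.uint8)
states = [obs] * 31 + [(obs, {})]   # one ragged entry in a batch of 32
batch = np.array(states)
# ValueError: setting an array element with a sequence. The requested array
# has an inhomogeneous shape after 1 dimensions. The detected shape was
# (32,) + inhomogeneous part.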

Here is the code as text:

import random
import torch
from torch import nn
import numpy as np
import PIL
from torch.utils.tensorboard import SummaryWriter
import gym
from stable_baselines3.common.atari_wrappers import (
    FireResetEnv,
    MaxAndSkipEnv
)
import time
import itertools
import collections

params = {
    'env_name':         "PongNoFrameskip-v4",
    'stop_reward':      18.0,
    'run_name':         'pong',
    'replay_size':      100000,
    'replay_initial':   10000,
    'target_net_sync':  1000,
    'epsilon_frames':   10**5,
    'epsilon_start':    1.0,
    'epsilon_final':    0.02,
    'learning_rate':    0.0001,
    'gamma':            0.99,
    'batch_size':       32
}

env = gym.make(params['env_name'])
env = MaxAndSkipEnv(env, skip=4)
if "FIRE" in env.unwrapped.get_action_meanings():
    env = FireResetEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)

Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class DQNetwork(nn.Module):
  def __init__(self, env):
    super().__init__()
    self.env = env
    self.input_size = env.observation_space.shape[0]
    self.output_size = env.action_space.n

    self.conv_net = nn.Sequential(
        nn.Conv2d(self.input_size, 32, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, 4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3, stride=1),
        nn.ReLU(),
        nn.Flatten()
    )

    conv_out_size = self._n_conv_out()

    self.fc_out = nn.Sequential(
        nn.Linear(conv_out_size, 512),
        nn.ReLU(),
        nn.Linear(512, self.output_size)
    )

  def _n_conv_out(self):
    inputs = torch.rand(self.env.observation_space.shape)

    size = self.conv_net(inputs).shape
    return np.prod(size)

  def forward(self, x):
    out = self.conv_net(x / 255.0).view(x.size()[0], -1)
    return self.fc_out(out)
  
def calc_loss(batch, net, tgt_net, device="cpu", GAMMA = params['gamma']):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.from_numpy(states).to(device)
    next_states_v = torch.from_numpy(next_states).to(device)
    actions_v = torch.from_numpy(actions).to(device)
    rewards_v = torch.from_numpy(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    state_action_values = net(states_v).gather(1, index = actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None

        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, ter, trunc, _ = self.env.step(action)
        is_done = ter or trunc
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

writer = SummaryWriter(f"runs/DqnPong-v1")

device = torch.device("cpu")
    
dqn_network = DQNetwork(env).to(device)
optimizer = torch.optim.Adam(dqn_network.parameters(), lr=params['learning_rate'])
tgt_net = DQNetwork(env).to(device)
tgt_net.load_state_dict(dqn_network.state_dict())

class ExperienceBuffer:
  def __init__(self, capacity):
    self.experienceBuffer = collections.deque(maxlen = capacity)
  
  def __len__(self):
    return len(self.experienceBuffer)
  
  def append(self, exp):
    self.experienceBuffer.append(exp)
  
  def sample(self, batch_size):
    indices = np.random.choice(len(self.experienceBuffer), batch_size, replace=False)
    states, actions, rewards, dones, next_states = zip(*[self.experienceBuffer[idx] for idx in indices])
    
    return ( np.array(states),
             np.array(actions, dtype=np.int64),
             np.array(rewards, dtype=np.float32),
             np.array(dones, dtype=np.uint8),
             np.array(next_states) )



replay_buffer = ExperienceBuffer(capacity = params['replay_size'])
total_rewards = []

ts = time.time()

episode_reward = 0

agent = Agent(env, replay_buffer)

print(device)

best_mean_reward = None
ts_frame = 0

#Main Training Loop
obs = env.reset
for step in itertools.count():
  epsilon = np.interp(step, [0, params['epsilon_frames']], [params['epsilon_start'], params['epsilon_final']])

  reward = agent.play_step(dqn_network, epsilon=epsilon, device=device)

  if reward is not None:
    total_rewards.append(reward)
    speed = (step - ts_frame) / (time.time() - ts)
    ts_frame = step
    ts = time.time()
    mean_reward = np.mean(total_rewards[-100:])
    print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
        step, len(total_rewards), mean_reward, epsilon,
        speed
    ))
    writer.add_scalar("epsilon", epsilon, step)
    writer.add_scalar("speed", speed, step)
    writer.add_scalar("reward_100", mean_reward, step)
    writer.add_scalar("reward", reward, step)
    if best_mean_reward is None or best_mean_reward < mean_reward:
        torch.save(dqn_network.state_dict(), params['env_name'] + "-best.pt")
        if best_mean_reward is not None:
            print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
        best_mean_reward = mean_reward
    if mean_reward > params['stop_reward']:
        print(f"Solved in {step} frames!")
        break

  if len(replay_buffer) < params['replay_initial']:
    continue
  
  if step % params['target_net_sync'] == 0:
      tgt_net.load_state_dict(dqn_network.state_dict())

  """obses = np.asarray([t[0] for t in transitions])
  actions = np.asarray([t[1] for t in transitions])
  rews = np.asarray([t[2] for t in transitions])
  dones = np.asarray([t[3] for t in transitions])
  new_obses = np.asarray([t[4] for t in transitions])

  obses_t = torch.as_tensor(obses, dtype=torch.float32)
  actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
  rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
  dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
  new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)"""

  optimizer.zero_grad()
  batch = replay_buffer.sample(params['batch_size'])
  loss_t = calc_loss(batch, dqn_network, tgt_net, device=device)
  loss_t.backward()
  optimizer.step()
writer.close()

If you have a basic Atari (Pong) DQN written from scratch (apart from the wrappers), please share the file.

I tried changing the DQN forward function and the wrappers, but it still isn't fixed.

I also couldn't find up-to-date code online; in Clean-RL, Tianshou, etc., they use their own modules.

pytorch shapes reinforcement-learning dqn atari-2600
1 Answer

I managed to fix it.

Here is the updated code:

import random
import torch
from torch import nn
import numpy as np
import PIL
from torch.utils.tensorboard import SummaryWriter
import gym
from stable_baselines3.common.atari_wrappers import (
    FireResetEnv,
    MaxAndSkipEnv
)
import time
import itertools
import collections

params = {
    'env_name':         "PongNoFrameskip-v4",
    'stop_reward':      18.0,
    'run_name':         'pong',
    'replay_size':      100000,
    'replay_initial':   10000,
    'target_net_sync':  1000,
    'epsilon_frames':   10**5,
    'epsilon_start':    1.0,
    'epsilon_final':    0.02,
    'learning_rate':    0.0001,
    'gamma':            0.99,
    'batch_size':       32
}

env = gym.make(params['env_name'])
env = MaxAndSkipEnv(env, skip=4)
if "FIRE" in env.unwrapped.get_action_meanings():
    env = FireResetEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)

Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class DQNetwork(nn.Module):
  def __init__(self, env):
    super().__init__()
    self.env = env
    self.input_size = env.observation_space.shape[0]
    self.output_size = env.action_space.n

    self.conv_net = nn.Sequential(
        nn.Conv2d(self.input_size, 32, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, 4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3, stride=1),
        nn.ReLU(),
        nn.Flatten()
    )

    conv_out_size = self._n_conv_out()

    self.fc_out = nn.Sequential(
        nn.Linear(conv_out_size, 512),
        nn.ReLU(),
        nn.Linear(512, self.output_size)
    )

  def _n_conv_out(self):
    inputs = torch.rand(self.env.observation_space.shape)

    size = self.conv_net(inputs).shape
    return np.prod(size)

  def forward(self, x):
    out = self.conv_net(x / 255.0).view(x.size()[0], -1)
    return self.fc_out(out)
  
def calc_loss(batch, net, tgt_net, device="cpu", GAMMA = params['gamma']):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.from_numpy(states).to(device)
    next_states_v = torch.from_numpy(next_states).to(device)
    actions_v = torch.from_numpy(actions).to(device)
    rewards_v = torch.from_numpy(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    state_action_values = net(states_v).gather(1, index = actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state, _ = env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        done_reward = None

        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, ter, trunc, _ = self.env.step(action)
        is_done = ter or trunc
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

writer = SummaryWriter(f"runs/DqnPong-v1")

device = torch.device("cpu")
    
dqn_network = DQNetwork(env).to(device)
optimizer = torch.optim.Adam(dqn_network.parameters(), lr=params['learning_rate'])
tgt_net = DQNetwork(env).to(device)
tgt_net.load_state_dict(dqn_network.state_dict())

class ExperienceBuffer:
  def __init__(self, capacity):
    self.experienceBuffer = collections.deque(maxlen = capacity)
  
  def __len__(self):
    return len(self.experienceBuffer)
  
  def append(self, exp):
    self.experienceBuffer.append(exp)
  
  def sample(self, batch_size):
    indices = np.random.choice(len(self.experienceBuffer), batch_size, replace=False)
    states, actions, rewards, dones, next_states = zip(*[self.experienceBuffer[idx] for idx in indices])
    
    return ( np.array(states),
             np.array(actions, dtype=np.int64),
             np.array(rewards, dtype=np.float32),
             np.array(dones, dtype=np.uint8),
             np.array(next_states) )



replay_buffer = ExperienceBuffer(capacity = params['replay_size'])
total_rewards = []

ts = time.time()

episode_reward = 0

agent = Agent(env, replay_buffer)

print(device)

best_mean_reward = None
ts_frame = 0

#Main Training Loop
obs, _ = env.reset()
for step in itertools.count():
  epsilon = np.interp(step, [0, params['epsilon_frames']], [params['epsilon_start'], params['epsilon_final']])

  reward = agent.play_step(dqn_network, epsilon=epsilon, device=device)

  if reward is not None:
    total_rewards.append(reward)
    speed = (step - ts_frame) / (time.time() - ts)
    ts_frame = step
    ts = time.time()
    mean_reward = np.mean(total_rewards[-100:])
    print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
        step, len(total_rewards), mean_reward, epsilon,
        speed
    ))
    writer.add_scalar("epsilon", epsilon, step)
    writer.add_scalar("speed", speed, step)
    writer.add_scalar("reward_100", mean_reward, step)
    writer.add_scalar("reward", reward, step)
    if best_mean_reward is None or best_mean_reward < mean_reward:
        torch.save(dqn_network.state_dict(), params['env_name'] + "-best.pt")
        if best_mean_reward is not None:
            print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
        best_mean_reward = mean_reward
    if mean_reward > params['stop_reward']:
        print(f"Solved in {step} frames!")
        break

  if len(replay_buffer) < params['replay_initial']:
    continue
  
  if step % params['target_net_sync'] == 0:
      tgt_net.load_state_dict(dqn_network.state_dict())

  """total_rewards.append(reward)
  replay_buffer.append(())

  mean_reward = np.mean(total_rewards[-100:])
  transitions = replay_buffer.sample(params['batch_size'])"""

  """obses = np.asarray([t[0] for t in transitions])
  actions = np.asarray([t[1] for t in transitions])
  rews = np.asarray([t[2] for t in transitions])
  dones = np.asarray([t[3] for t in transitions])
  new_obses = np.asarray([t[4] for t in transitions])

  obses_t = torch.as_tensor(obses, dtype=torch.float32)
  actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
  rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
  dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
  new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)"""

  optimizer.zero_grad()
  batch = replay_buffer.sample(params['batch_size'])
  loss_t = calc_loss(batch, dqn_network, tgt_net, device=device)
  loss_t.backward()
  optimizer.step()
writer.close()

I fixed the error by updating the .reset() calls; with newer gym versions, reset returns

observation, info

So I changed it to:

obs, _ = env.reset()
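
For completeness, a short sketch of the API difference that caused this (assuming gym >= 0.26, where reset() returns (observation, info) and step() returns a five-element tuple):

import gym

env = gym.make("PongNoFrameskip-v4")

obs, info = env.reset()                                       # new API: (observation, info)
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)   # new API: five-element tuple
done = terminated or truncated                                # combine both episode-end flags

Storing only obs in the replay buffer (rather than the whole tuple) keeps every entry the same shape, which is what np.array in ExperienceBuffer.sample expects.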
