If you could build a basic Atari DQN from scratch (apart from the wrappers), please share that file.
I randomly get 2 errors:
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part.
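(For reference, both messages are NumPy refusing to build a ragged array. The tiny NumPy-only sketch below reproduces the same shapes, assuming one entry in a batch is the (obs, info) tuple that newer Gym's reset() returns instead of a plain observation array; the names are illustrative and not from the script below.)

import numpy as np

# Illustrative repro (assumed cause, recent NumPy): one element is the
# (observation, info) tuple from newer Gym's reset(), the rest are arrays.
obs = np.zeros((4, 84, 84), dtype=np.uint8)   # a normal stacked-frame observation
reset_result = (obs, {})                      # (observation, info) tuple

# "... after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part."
try:
    np.array([reset_result])
except ValueError as e:
    print(e)

# "... after 1 dimensions. The detected shape was (32,) + inhomogeneous part."
try:
    np.array([obs] * 31 + [reset_result])
except ValueError as e:
    print(e)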
Here is the text version:
import random
import torch
from torch import nn
import numpy as np
import PIL
from torch.utils.tensorboard import SummaryWriter
import gym
from stable_baselines3.common.atari_wrappers import (
FireResetEnv,
MaxAndSkipEnv
)
import time
import itertools
import collections
params = {
'env_name': "PongNoFrameskip-v4",
'stop_reward': 18.0,
'run_name': 'pong',
'replay_size': 100000,
'replay_initial': 10000,
'target_net_sync': 1000,
'epsilon_frames': 10**5,
'epsilon_start': 1.0,
'epsilon_final': 0.02,
'learning_rate': 0.0001,
'gamma': 0.99,
'batch_size': 32
}
env = gym.make(params['env_name'])
env = MaxAndSkipEnv(env, skip=4)
if "FIRE" in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])
class DQNetwork(nn.Module):
def __init__(self, env):
super().__init__()
self.env = env
self.input_size = env.observation_space.shape[0]
self.output_size = env.action_space.n
self.conv_net = nn.Sequential(
nn.Conv2d(self.input_size, 32, 8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, 4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, 3, stride=1),
nn.ReLU(),
nn.Flatten()
)
conv_out_size = self._n_conv_out()
self.fc_out = nn.Sequential(
nn.Linear(conv_out_size, 512),
nn.ReLU(),
nn.Linear(512, self.output_size)
)
def _n_conv_out(self):
inputs = torch.rand(self.env.observation_space.shape)
size = self.conv_net(inputs).shape
return np.prod(size)
def forward(self, x):
out = self.conv_net(x / 255.0).view(x.size()[0], -1)
return self.fc_out(out)
def calc_loss(batch, net, tgt_net, device="cpu", GAMMA = params['gamma']):
states, actions, rewards, dones, next_states = batch
states_v = torch.from_numpy(states).to(device)
next_states_v = torch.from_numpy(next_states).to(device)
actions_v = torch.from_numpy(actions).to(device)
rewards_v = torch.from_numpy(rewards).to(device)
done_mask = torch.BoolTensor(dones).to(device)
state_action_values = net(states_v).gather(1, index = actions_v.unsqueeze(-1)).squeeze(-1)
next_state_values = tgt_net(next_states_v).max(1)[0]
next_state_values[done_mask] = 0.0
next_state_values = next_state_values.detach()
expected_state_action_values = next_state_values * GAMMA + rewards_v
return nn.MSELoss()(state_action_values, expected_state_action_values)
class Agent:
def __init__(self, env, exp_buffer):
self.env = env
self.exp_buffer = exp_buffer
self._reset()
def _reset(self):
self.state = env.reset()
self.total_reward = 0.0
def play_step(self, net, epsilon=0.0, device="cpu"):
done_reward = None
if np.random.random() < epsilon:
action = env.action_space.sample()
else:
state_a = np.array([self.state], copy=False)
state_v = torch.tensor(state_a).to(device)
q_vals_v = net(state_v)
_, act_v = torch.max(q_vals_v, dim=1)
action = int(act_v.item())
# do step in the environment
new_state, reward, ter, trunc, _ = self.env.step(action)
is_done = ter or trunc
self.total_reward += reward
exp = Experience(self.state, action, reward, is_done, new_state)
self.exp_buffer.append(exp)
self.state = new_state
if is_done:
done_reward = self.total_reward
self._reset()
return done_reward
writer = SummaryWriter(f"runs/DqnPong-v1")
device = torch.device("cpu")
dqn_network = DQNetwork(env).to(device)
optimizer = torch.optim.Adam(dqn_network.parameters(), lr=params['learning_rate'])
tgt_net = DQNetwork(env).to(device)
tgt_net.load_state_dict(dqn_network.state_dict())
class ExperienceBuffer:
def __init__(self, capacity):
self.experienceBuffer = collections.deque(maxlen = capacity)
def __len__(self):
return len(self.experienceBuffer)
def append(self, exp):
self.experienceBuffer.append(exp)
def sample(self, batch_size):
indices = np.random.choice(len(self.experienceBuffer), batch_size, replace=False)
states, actions, rewards, dones, next_states = zip(*[self.experienceBuffer[idx] for idx in indices])
return ( np.array(states),
np.array(actions, dtype=np.int64),
np.array(rewards, dtype=np.float32),
np.array(dones, dtype=np.uint8),
np.array(next_states) )
replay_buffer = ExperienceBuffer(capacity = params['replay_size'])
total_rewards = []
ts = time.time()
episode_reward = 0
agent = Agent(env, replay_buffer)
print(device)
best_mean_reward = None
ts_frame = 0
#Main Training Loop
obs = env.reset
for step in itertools.count():
epsilon = np.interp(step, [0, params['epsilon_frames']], [params['epsilon_start'], params['epsilon_final']])
reward = agent.play_step(dqn_network, epsilon=epsilon, device=device)
if reward is not None:
total_rewards.append(reward)
speed = (step - ts_frame) / (time.time() - ts)
ts_frame = step
ts = time.time()
mean_reward = np.mean(total_rewards[-100:])
print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
step, len(total_rewards), mean_reward, epsilon,
speed
))
writer.add_scalar("epsilon", epsilon, step)
writer.add_scalar("speed", speed, step)
writer.add_scalar("reward_100", mean_reward, step)
writer.add_scalar("reward", reward, step)
if best_mean_reward is None or best_mean_reward < mean_reward:
torch.save(dqn_network.state_dict(), params['env_name'] + "-best.pt")
if best_mean_reward is not None:
print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
best_mean_reward = mean_reward
if mean_reward > params['stop_reward']:
print(f"Solved in {step} frames!")
break
if len(replay_buffer) < params['replay_initial']:
continue
if step % params['target_net_sync'] == 0:
tgt_net.load_state_dict(dqn_network.state_dict())
"""obses = np.asarray([t[0] for t in transitions])
actions = np.asarray([t[1] for t in transitions])
rews = np.asarray([t[2] for t in transitions])
dones = np.asarray([t[3] for t in transitions])
new_obses = np.asarray([t[4] for t in transitions])
obses_t = torch.as_tensor(obses, dtype=torch.float32)
actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)"""
optimizer.zero_grad()
batch = replay_buffer.sample(params['batch_size'])
loss_t = calc_loss(batch, dqn_network, tgt_net, device=device)
loss_t.backward()
optimizer.step()
writer.close()
If you could build a basic Atari (Pong) DQN from scratch (apart from the wrappers), please share that file.
I tried changing the DQN forward function and the wrappers, but that still did not fix it.
There is also no up-to-date code for this online; on Clean-RL, Tianshou, etc. they use modules.
I managed to fix it.
Here is the updated code:
import random
import torch
from torch import nn
import numpy as np
import PIL
from torch.utils.tensorboard import SummaryWriter
import gym
from stable_baselines3.common.atari_wrappers import (
FireResetEnv,
MaxAndSkipEnv
)
import time
import itertools
import collections
params = {
'env_name': "PongNoFrameskip-v4",
'stop_reward': 18.0,
'run_name': 'pong',
'replay_size': 100000,
'replay_initial': 10000,
'target_net_sync': 1000,
'epsilon_frames': 10**5,
'epsilon_start': 1.0,
'epsilon_final': 0.02,
'learning_rate': 0.0001,
'gamma': 0.99,
'batch_size': 32
}
env = gym.make(params['env_name'])
env = MaxAndSkipEnv(env, skip=4)
if "FIRE" in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayScaleObservation(env)
env = gym.wrappers.FrameStack(env, 4)
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])
class DQNetwork(nn.Module):
def __init__(self, env):
super().__init__()
self.env = env
self.input_size = env.observation_space.shape[0]
self.output_size = env.action_space.n
self.conv_net = nn.Sequential(
nn.Conv2d(self.input_size, 32, 8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, 4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, 3, stride=1),
nn.ReLU(),
nn.Flatten()
)
conv_out_size = self._n_conv_out()
self.fc_out = nn.Sequential(
nn.Linear(conv_out_size, 512),
nn.ReLU(),
nn.Linear(512, self.output_size)
)
def _n_conv_out(self):
inputs = torch.rand(self.env.observation_space.shape)
size = self.conv_net(inputs).shape
return np.prod(size)
def forward(self, x):
out = self.conv_net(x / 255.0).view(x.size()[0], -1)
return self.fc_out(out)
def calc_loss(batch, net, tgt_net, device="cpu", GAMMA = params['gamma']):
states, actions, rewards, dones, next_states = batch
states_v = torch.from_numpy(states).to(device)
next_states_v = torch.from_numpy(next_states).to(device)
actions_v = torch.from_numpy(actions).to(device)
rewards_v = torch.from_numpy(rewards).to(device)
done_mask = torch.BoolTensor(dones).to(device)
state_action_values = net(states_v).gather(1, index = actions_v.unsqueeze(-1)).squeeze(-1)
next_state_values = tgt_net(next_states_v).max(1)[0]
next_state_values[done_mask] = 0.0
next_state_values = next_state_values.detach()
expected_state_action_values = next_state_values * GAMMA + rewards_v
return nn.MSELoss()(state_action_values, expected_state_action_values)
class Agent:
def __init__(self, env, exp_buffer):
self.env = env
self.exp_buffer = exp_buffer
self._reset()
def _reset(self):
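# newer Gym API: reset() returns (observation, info), so unpack the tuple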
self.state, _ = env.reset()
self.total_reward = 0.0
def play_step(self, net, epsilon=0.0, device="cpu"):
done_reward = None
if np.random.random() < epsilon:
action = env.action_space.sample()
else:
state_a = np.array([self.state], copy=False)
state_v = torch.tensor(state_a).to(device)
q_vals_v = net(state_v)
_, act_v = torch.max(q_vals_v, dim=1)
action = int(act_v.item())
# do step in the environment
new_state, reward, ter, trunc, _ = self.env.step(action)
is_done = ter or trunc
self.total_reward += reward
exp = Experience(self.state, action, reward, is_done, new_state)
self.exp_buffer.append(exp)
self.state = new_state
if is_done:
done_reward = self.total_reward
self._reset()
return done_reward
writer = SummaryWriter(f"runs/DqnPong-v1")
device = torch.device("cpu")
dqn_network = DQNetwork(env).to(device)
optimizer = torch.optim.Adam(dqn_network.parameters(), lr=params['learning_rate'])
tgt_net = DQNetwork(env).to(device)
tgt_net.load_state_dict(dqn_network.state_dict())
class ExperienceBuffer:
def __init__(self, capacity):
self.experienceBuffer = collections.deque(maxlen = capacity)
def __len__(self):
return len(self.experienceBuffer)
def append(self, exp):
self.experienceBuffer.append(exp)
def sample(self, batch_size):
indices = np.random.choice(len(self.experienceBuffer), batch_size, replace=False)
states, actions, rewards, dones, next_states = zip(*[self.experienceBuffer[idx] for idx in indices])
return ( np.array(states),
np.array(actions, dtype=np.int64),
np.array(rewards, dtype=np.float32),
np.array(dones, dtype=np.uint8),
np.array(next_states) )
replay_buffer = ExperienceBuffer(capacity = params['replay_size'])
total_rewards = []
ts = time.time()
episode_reward = 0
agent = Agent(env, replay_buffer)
print(device)
best_mean_reward = None
ts_frame = 0
#Main Training Loop
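# newer Gym API: reset() returns (observation, info)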
obs, _ = env.reset()
for step in itertools.count():
epsilon = np.interp(step, [0, params['epsilon_frames']], [params['epsilon_start'], params['epsilon_final']])
reward = agent.play_step(dqn_network, epsilon=epsilon, device=device)
if reward is not None:
total_rewards.append(reward)
speed = (step - ts_frame) / (time.time() - ts)
ts_frame = step
ts = time.time()
mean_reward = np.mean(total_rewards[-100:])
print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
step, len(total_rewards), mean_reward, epsilon,
speed
))
writer.add_scalar("epsilon", epsilon, step)
writer.add_scalar("speed", speed, step)
writer.add_scalar("reward_100", mean_reward, step)
writer.add_scalar("reward", reward, step)
if best_mean_reward is None or best_mean_reward < mean_reward:
torch.save(dqn_network.state_dict(), params['env_name'] + "-best.pt")
if best_mean_reward is not None:
print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
best_mean_reward = mean_reward
if mean_reward > params['stop_reward']:
print(f"Solved in {step} frames!")
break
if len(replay_buffer) < params['replay_initial']:
continue
if step % params['target_net_sync'] == 0:
tgt_net.load_state_dict(dqn_network.state_dict())
"""total_rewards.append(reward)
replay_buffer.append(())
mean_reward = np.mean(total_rewards[-100:])
transitions = replay_buffer.sample(params['batch_size'])"""
"""obses = np.asarray([t[0] for t in transitions])
actions = np.asarray([t[1] for t in transitions])
rews = np.asarray([t[2] for t in transitions])
dones = np.asarray([t[3] for t in transitions])
new_obses = np.asarray([t[4] for t in transitions])
obses_t = torch.as_tensor(obses, dtype=torch.float32)
actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)"""
optimizer.zero_grad()
batch = replay_buffer.sample(params['batch_size'])
loss_t = calc_loss(batch, dqn_network, tgt_net, device=device)
loss_t.backward()
optimizer.step()
writer.close()
I fixed the error by updating the .reset() calls: newer Gym versions return
observation, info
so I changed it to
obs, _ = env.reset()