I am attaching my code here:
import os
import shutil
import datetime

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.multiprocessing as _mp
from torch.distributions import Categorical

# MultipleEnvironments, PPO, eval and get_args are defined elsewhere in the
# project (MultipleEnvironments lives in src\env.py, per the traceback below)
# and are imported/defined above this snippet.


def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    mp = _mp.get_context("spawn")
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes)
    model = PPO(envs.num_states, envs.num_actions)
    if torch.cuda.is_available():
        model.cuda()
    model.share_memory()
    process = mp.Process(target=eval, args=(opt, model, envs.num_states, envs.num_actions))
    process.start()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    [agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns]
    curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns]
    curr_states = torch.from_numpy(np.concatenate(curr_states, 0))
    if torch.cuda.is_available():
        curr_states = curr_states.cuda()
    curr_episode = 0
    episode_plot = []
    R_plot = []
    ep_reward_plot = []
    start_datetime = datetime.datetime.now().strftime("%m-%d_%H-%M")
    while True:
        if curr_episode % opt.save_interval == 0 and curr_episode > 0:
            # torch.save(model.state_dict(),
            #            "{}/ppo_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))
            torch.save(model.state_dict(),
                       "{}/ppo_super_mario_bros_{}_{}_{}".format(opt.saved_path, opt.world, opt.stage, curr_episode))
        curr_episode += 1
        episode_plot.append(int(curr_episode))
        old_log_policies = []
        actions = []
        values = []
        states = []
        rewards = []
        dones = []
        for _ in range(opt.num_local_steps):
            states.append(curr_states)
            logits, value = model(curr_states)
            values.append(value.squeeze())
            policy = F.softmax(logits, dim=1)
            old_m = Categorical(policy)
            action = old_m.sample()
            actions.append(action)
            old_log_policy = old_m.log_prob(action)
            old_log_policies.append(old_log_policy)
            if torch.cuda.is_available():
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.cpu())]
            else:
                [agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action)]
            state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
            state = torch.from_numpy(np.concatenate(state, 0))
            if torch.cuda.is_available():
                state = state.cuda()
                reward = torch.cuda.FloatTensor(reward)
                done = torch.cuda.FloatTensor(done)
            else:
                reward = torch.FloatTensor(reward)
                done = torch.FloatTensor(done)
            rewards.append(reward)
            dones.append(done)
            curr_states = state
        _, next_value = model(curr_states)
        next_value = next_value.squeeze()
        old_log_policies = torch.cat(old_log_policies).detach()
        actions = torch.cat(actions)
        values = torch.cat(values).detach()
        states = torch.cat(states)
        gae = 0
        R = []
        for value, reward, done in list(zip(values, rewards, dones))[::-1]:
            gae = gae * opt.gamma * opt.tau
            gae = gae + reward + opt.gamma * next_value.detach() * (1 - done) - value.detach()
            next_value = value
            R.append(gae + value)
        R = R[::-1]
        R = torch.cat(R).detach()
        advantages = R - values
print("mean big R:", torch.mean(R).item())
episode_reward_mean = torch.stack(rewards).mean(dim=1, keepdim=True).sum().item()
print("mean reward", episode_reward_mean)
R_plot.append(torch.mean(R).item())
ep_reward_plot.append(episode_reward_mean)
plt.plot(episode_plot,R_plot,"r-")
plt.xlabel('Episode')
plt.ylabel('Mean R (PPO)')
plt.savefig("ppo_R_episode_{}.pdf".format(start_datetime))
plt.close()
plt.plot(episode_plot,ep_reward_plot,"r-")
plt.xlabel('Episode')
plt.ylabel('Mean Reward (PPO)')
plt.savefig("ppo_reward_episode_{}.pdf".format(start_datetime))
plt.close()
np.savetxt("ppo_R_episode_{}.csv".format(start_datetime), np.array(R_plot), delimiter=",")
np.savetxt("ppo_reward_episode_{}.csv".format(start_datetime), np.array(ep_reward_plot), delimiter=",")
        for i in range(opt.num_epochs):
            indice = torch.randperm(opt.num_local_steps * opt.num_processes)
            for j in range(opt.batch_size):
                batch_indices = indice[
                    int(j * (opt.num_local_steps * opt.num_processes / opt.batch_size)):
                    int((j + 1) * (opt.num_local_steps * opt.num_processes / opt.batch_size))]
                logits, value = model(states[batch_indices])
                new_policy = F.softmax(logits, dim=1)
                new_m = Categorical(new_policy)
                new_log_policy = new_m.log_prob(actions[batch_indices])
                ratio = torch.exp(new_log_policy - old_log_policies[batch_indices])
                actor_loss = -torch.mean(torch.min(ratio * advantages[batch_indices],
                                                   torch.clamp(ratio, 1.0 - opt.epsilon, 1.0 + opt.epsilon) *
                                                   advantages[batch_indices]))
                # critic_loss = torch.mean((R[batch_indices] - value) ** 2) / 2
                critic_loss = F.smooth_l1_loss(R[batch_indices], value.squeeze())
                entropy_loss = torch.mean(new_m.entropy())
                total_loss = actor_loss + critic_loss - opt.beta * entropy_loss
                optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
        print("Episode: {}. Total loss: {}".format(curr_episode, total_loss))


if __name__ == "__main__":
    opt = get_args()
    train(opt)
The error I actually receive is the following:

  File "d:/Mini Project2/train.py", line 173, in <module>
    train(opt)
  File "d:/Mini Project2/train.py", line 52, in train
    envs = MultipleEnvironments(opt.world, opt.stage, opt.action_type, opt.num_processes)
  File "d:\Mini Project2\src\env.py", line 125, in __init__
    process.start()
  File "C:\Users\hk598\AppData\Local\Programs\Python\Python38\lib\multiprocessing\process.py", line 121, in start
    self._popen = self._Popen(self)
  File "C:\Users\hk598\AppData\Local\Programs\Python\Python38\lib\multiprocessin
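
For context on what the traceback points at: with the "spawn" start method on Windows, each child process re-imports the main module, so any code that creates worker processes (here MultipleEnvironments.__init__, which calls process.start()) must only be reached from under the if __name__ == "__main__": guard, and everything handed to a Process has to be picklable. Below is a minimal, self-contained sketch of that pattern only; the worker function is a hypothetical placeholder, not the project's actual environment code.

import torch.multiprocessing as _mp


def worker(conn):
    # hypothetical placeholder worker; the real project runs a Mario env here
    conn.send("ready")
    conn.close()


if __name__ == "__main__":
    # process creation happens only under the __main__ guard, so spawned
    # children can safely re-import this module without starting new processes
    mp = _mp.get_context("spawn")
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=worker, args=(child_conn,))
    p.start()
    print(parent_conn.recv())  # prints "ready"
    p.join()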