我现在正在实施PPO。正在实施一种具有一个输入和一个输出变量的算法,并且这些错误不断发生。
第 89 行,在 train_net 中 ratio = torch.exp(pi_a.squeeze(dim=1)) / torch.exp(torch.log(prob_a.squeeze(dim=1))) IndexError: Dimension out of range(预期在[-1, 0]范围内,但得到了1)
your text
进口健身房
your text
进口手电筒
your text
将 torch.nn 导入为 nn
your text
导入 torch.nn.functional 作为 F
your text
导入 torch.optim 作为优化
your text
从 torch.distributions 导入分类
your text
learning_rate = 0.0005
your text
伽玛 = 0.98
your text
lmbda = 0.95
your text
eps_clip = 0.1
your text
K_epoch = 3
your text
T_horizon = 20
your text
PPO类(nn.Module):
定义init(自我):
超级(PPO,自我)。init()
self.data = []
self.fc1 = nn.Linear(2, 64)
self.fc_out = nn.Linear(64, 3)
self.fc_value = nn.Linear(64, 1)
self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
def pi(self, x):
x = F.relu(self.fc1(x))
x = self.fc_out(x)
return x
def v(self, x):
x = F.relu(self.fc1(x))
v = self.fc_value(x)
return v
def put_data(self, transition):
self.data.append(transition)
def make_batch(self):
s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
for transition in self.data:
s, a, r, s_prime, prob_a, done = transition
s_lst.append(s)
a_lst.append(a)
r_lst.append(r)
s_prime_lst.append(s_prime)
prob_a_lst.append(prob_a)
done_mask = 0 if done else 1
done_lst.append([done_mask])
s, a, r, s_prime, done_mask, prob_a = (
torch.tensor(s_lst, dtype=torch.float),
torch.tensor(a_lst, dtype=torch.long),
torch.tensor(r_lst, dtype=torch.float),
torch.tensor(s_prime_lst, dtype=torch.float),
torch.tensor(done_lst, dtype=torch.float),
torch.tensor(prob_a_lst, dtype=torch.float)
)
self.data = []
return s, a, r, s_prime, done_mask, prob_a
def train_net(self):
s, a, r, s_prime, done_mask, prob_a = self.make_batch()
for i in range(K_epoch):
td_target = r + gamma * self.v(s_prime) * done_mask
delta = td_target - self.v(s)
delta = delta.detach()
advantage_lst = []
advantage = torch.tensor(0.0, dtype=torch.float)
for delta_t in reversed(delta):
advantage = gamma * lmbda * advantage + delta_t
advantage_lst.append(advantage)
advantage_lst.reverse()
advantage = torch.cat(advantage_lst).unsqueeze(1)
pi = self.pi(s)
m = Categorical(F.softmax(pi, dim=1))
pi_a = m.log_prob(a)
prob_a = F.softmax(pi, dim=1)
ratio = torch.exp(pi_a.squeeze(dim=1)) / torch.exp(torch.log(prob_a.squeeze(dim=1)))
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target.detach())
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
your text
定义主要():
env = gym.make('MountainCar-v0')
模型 = PPO()
score = 0.0
print_interval = 20
for n_epi in range(10000):
s = env.reset()
done = False
while not done:
for t in range(T_horizon):
s_tensor = torch.from_numpy(s).float().unsqueeze(0)
action = model.pi(s_tensor)
m = Categorical(F.softmax(action, dim=1))
a = m.sample().item()
s_prime, r, done, info = env.step(a)
model.put_data((s, a, r / 100.0, s_prime, action.squeeze().detach().numpy(), done))
s = s_prime
score += r
if done:
break
model.train_net()
if n_epi % print_interval == 0 and n_epi != 0:
print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval))
score = 0.0
env.close()
your text
如果name =='main':
主要()
我试着改变那一行:.
ratio = torch.exp(pi_a.squeeze(dim=0)) / torch.exp(torch.log(prob_a.squeeze(dim=0)))
但我意识到它并没有解决它。 运行代码后,我想在执行窗口中显示剧集数和平均分数,但这太难了。帮帮我……