PPO algorithm does not learn


I have written code in which a cart pushes an object to a specific goal position on a 2D surface.

import math
import numpy as np
import matplotlib.pyplot as plt

import gym
from gym import spaces
from stable_baselines3 import PPO

THRESHOLD = 5 # Defines boundaries of environment
GOAL = np.random.uniform(-THRESHOLD, THRESHOLD, 2) # Coordinates

class CartPushEnv2D(gym.Env):

    metadata = {
        "render_modes": ["human", "rgb_array"]
    }

    def __init__(self):
        self.current_time = 0
        self.action_space = spaces.Discrete(4) # Left, right, up and down
        self.observation_space = spaces.Box(low = -THRESHOLD, high = THRESHOLD, shape = (4,), dtype=np.float32)

    def step(self, action):
        x_c_o, y_c_o, x_o_g, y_o_g = self.observation # Vectors between the three points: Cart (c), Object (o), Goal (g)

        x_o = GOAL[0] + x_o_g
        y_o = GOAL[1] + y_o_g
        x_c = x_o + x_c_o
        y_c = y_o + y_c_o

        if (action == 0) and (x_c > -THRESHOLD):
            x_c -= 0.1
        elif (action == 1) and (x_c < THRESHOLD):
            x_c += 0.1
        elif (action == 2) and (y_c > -THRESHOLD):
            y_c -= 0.1
        elif (action == 3) and (y_c < THRESHOLD):
            y_c += 0.1
        
        if np.linalg.norm(np.array([x_c, y_c]) - np.array([x_o,y_o])) <= math.sqrt(2): # If the cart moves into the object...
            
            if action == 0:
                x_o -= 0.1
            elif action == 1:
                x_o += 0.1
            elif action == 2:
                y_o -= 0.1
            elif action == 3:
                y_o += 0.1

        x_o_g = x_o - GOAL[0]
        y_o_g = y_o - GOAL[1]
        x_c_o = x_c - x_o
        y_c_o = y_c - y_o

        self.observation = x_c_o, y_c_o, x_o_g, y_o_g

        if self.current_time >= 1000: # episode length: 1000
            done = True
        else:
            done = bool(np.linalg.norm([x_o_g, y_o_g]) <= math.sqrt(2))
        
        reward = - np.linalg.norm([x_c_o, y_c_o]) - np.linalg.norm([x_o_g, y_o_g]) # Reward is a combination of cart-object distance and object-goal distance

        info = {}
        self.current_time += 1

        return np.array(self.observation).astype(np.float32), reward, done, info

    def reset(self):
        self.observation = np.random.uniform(-THRESHOLD, THRESHOLD, (4,))
        self.current_time = 0
        return np.array(self.observation).astype(np.float32)

env = CartPushEnv2D()

model = PPO('MlpPolicy', env, verbose=1)
model.learn(100000)
obs = env.reset()
episodes = 0
reward_list = []
done = False
dist_c_g = []
dist_o_g = []

while not done:
    action, states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    reward_list.append(reward)
    episodes += 1
    dist_c_g.append(np.linalg.norm(obs[0:2] + obs[2:4]))
    dist_o_g.append(np.linalg.norm(obs[2:4]))

def graph_plot():

    x = list(range(episodes))
    y = dist_c_g
    z = dist_o_g

    plt.plot(x, y, label = "Cart distance")
    plt.plot(x, z, label = "Object distance")
    plt.xlabel('timestep')
    plt.ylabel('distance to goal')
    plt.title('2D translation task')
    plt.legend()
    plt.show()

graph_plot()

However, when I run it, I get a graph like this:

[Figure: plot of cart distance and object distance to the goal over timesteps]

The problem is that during training the mean episode reward and the losses just fluctuate. I don't know why, and tuning hyperparameters (e.g. learning_rate, ent_coef, etc.) has made no difference.
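
For reference, the tuning amounted to passing different values into the PPO constructor, roughly like this (the values below are placeholders, not my exact settings):

model = PPO(
    'MlpPolicy',
    env,
    learning_rate=1e-4,  # illustrative value; I tried several settings
    ent_coef=0.01,       # illustrative value; I tried several settings
    verbose=1,
)
model.learn(100000)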

Training time is not the issue; I have run for millions of timesteps with the same result. Neither is episode length. I have also played with the reward function (e.g. weighting the cart-object or object-goal distance differently, trying sparse instead of dense rewards, etc.) to no avail.
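
For illustration, the reward variants I tried inside step() looked roughly like the following (the weights and the sparse bonus are placeholders, not my exact settings):

# Weighted dense reward: emphasise the object-goal term over the cart-object term
reward = -0.5 * np.linalg.norm([x_c_o, y_c_o]) - 1.0 * np.linalg.norm([x_o_g, y_o_g])

# Sparse alternative: reward only once the object is within reach of the goal
reward = 1.0 if np.linalg.norm([x_o_g, y_o_g]) <= math.sqrt(2) else 0.0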

It should be noted that I have successfully trained the cart to reach a target on its own. The problem is getting it to reach the object and then push the object to the goal.

Any suggestions would be greatly appreciated.

Thanks!

python machine-learning reinforcement-learning