我编写了一段代码,用于把目标物体移动到二维平面上的指定目标位置。
import math
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import spaces
from stable_baselines3 import PPO
from gym import spaces
# Half-width of the square arena: coordinates are meant to stay within
# [-THRESHOLD, THRESHOLD] on both axes.
THRESHOLD = 5
# Goal position (x, y). Drawn once when the module is loaded, so it stays the
# same for every episode run in this process.
GOAL = np.random.uniform(low=-THRESHOLD, high=THRESHOLD, size=2)
class CartPushEnv2D(gym.Env):
    """2D task in which a cart has to bring an object to a fixed goal.

    Three points live in the square arena ``[-THRESHOLD, THRESHOLD]^2``: the
    cart (c), the object (o) and the goal (g, the module-level ``GOAL``).

    Observation (``float32``, shape ``(4,)``):
        ``(x_c_o, y_c_o, x_o_g, y_o_g)`` where ``(x_c_o, y_c_o)`` is the cart
        position minus the object position and ``(x_o_g, y_o_g)`` is the
        object position minus the goal position.

    Actions (``Discrete(4)``):
        0: x - 0.1, 1: x + 0.1, 2: y - 0.1, 3: y + 0.1 (applied to the cart).

    Dynamics:
        The cart is clamped to the arena. If, after moving, the cart is within
        ``sqrt(2)`` of the object, the object is translated by the same
        displacement the cart actually made (and is clamped to the arena too).
        A move that is fully blocked by a wall therefore does not move the
        object either.

    Reward:
        ``-(cart-object distance) - (object-goal distance)`` after the move.

    Episode end:
        The object is within ``sqrt(2)`` of the goal, or 1000 steps have been
        taken. In the latter case ``info["TimeLimit.truncated"]`` is set to
        ``True`` so callers can tell a timeout from a success.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"]
    }

    # Distance travelled by the cart per step.
    _STEP_SIZE = 0.1
    # Cart-object distance at or below which the object moves with the cart.
    _CONTACT_RADIUS = math.sqrt(2)
    # Object-goal distance at or below which the task counts as solved.
    _GOAL_RADIUS = math.sqrt(2)
    # Maximum number of steps per episode.
    _MAX_EPISODE_STEPS = 1000
    # Unit direction of the cart for each discrete action.
    _ACTION_DIRECTIONS = {
        0: (-1.0, 0.0),
        1: (1.0, 0.0),
        2: (0.0, -1.0),
        3: (0.0, 1.0),
    }

    def __init__(self):
        super().__init__()
        self.current_time = 0
        self.action_space = spaces.Discrete(4)
        # Cart, object and goal all lie in [-THRESHOLD, THRESHOLD], so the
        # difference of any two of them lies in [-2 * THRESHOLD, 2 * THRESHOLD].
        self.observation_space = spaces.Box(
            low=-2 * THRESHOLD,
            high=2 * THRESHOLD,
            shape=(4,),
            dtype=np.float32,
        )

    def step(self, action):
        """Apply ``action`` to the cart and advance the episode by one step.

        Args:
            action: Integer-like action index. Values outside ``0..3`` leave
                the cart (and the object) where they are.

        Returns:
            ``(observation, reward, done, info)`` in the classic Gym 4-tuple
            format; ``observation`` is a ``float32`` array of shape ``(4,)``.
        """
        x_c_o, y_c_o, x_o_g, y_o_g = self.observation
        goal = np.asarray(GOAL, dtype=np.float64)

        # Rebuild absolute positions from the goal-relative state.
        obj = goal + np.array([x_o_g, y_o_g], dtype=np.float64)
        cart = obj + np.array([x_c_o, y_c_o], dtype=np.float64)

        direction = self._ACTION_DIRECTIONS.get(int(action))
        new_cart = cart
        if direction is not None:
            # Clamp instead of "skip when already past the wall" so the cart
            # can never overshoot the arena boundary.
            new_cart = np.clip(
                cart + self._STEP_SIZE * np.asarray(direction),
                -THRESHOLD,
                THRESHOLD,
            )
        displacement = new_cart - cart

        # Only a cart that actually moved can move the object; the object gets
        # exactly the displacement the cart made.
        in_contact = np.linalg.norm(new_cart - obj) <= self._CONTACT_RADIUS
        if np.any(displacement) and in_contact:
            obj = np.clip(obj + displacement, -THRESHOLD, THRESHOLD)

        cart_offset = new_cart - obj
        obj_offset = obj - goal
        self.observation = (
            float(cart_offset[0]),
            float(cart_offset[1]),
            float(obj_offset[0]),
            float(obj_offset[1]),
        )

        # Count the step before testing the limit so an episode lasts exactly
        # _MAX_EPISODE_STEPS steps.
        self.current_time += 1
        obj_goal_dist = float(np.linalg.norm(obj_offset))
        reached_goal = obj_goal_dist <= self._GOAL_RADIUS
        timed_out = self.current_time >= self._MAX_EPISODE_STEPS
        done = bool(reached_goal or timed_out)

        reward = -float(np.linalg.norm(cart_offset)) - obj_goal_dist

        info = {}
        if timed_out and not reached_goal:
            # Same key the Gym TimeLimit wrapper uses for time-limit endings.
            info["TimeLimit.truncated"] = True

        return np.array(self.observation, dtype=np.float32), reward, done, info

    def reset(self):
        """Start a new episode with the cart and the object inside the arena.

        Absolute positions are sampled uniformly in the arena and then
        converted to the goal-relative observation, so both bodies always
        start within the boundaries.

        Returns:
            The initial observation as a ``float32`` array of shape ``(4,)``.
        """
        goal = np.asarray(GOAL, dtype=np.float64)
        cart = np.random.uniform(-THRESHOLD, THRESHOLD, 2)
        obj = np.random.uniform(-THRESHOLD, THRESHOLD, 2)

        cart_offset = cart - obj
        obj_offset = obj - goal
        self.observation = (
            float(cart_offset[0]),
            float(cart_offset[1]),
            float(obj_offset[0]),
            float(obj_offset[1]),
        )
        self.current_time = 0
        return np.array(self.observation, dtype=np.float32)
# Train a PPO agent on the pushing environment.
env = CartPushEnv2D()
model = PPO('MlpPolicy', env, verbose=1)
model.learn(100000)

# Roll out ONE evaluation episode and record, per timestep, the reward and the
# distances of the cart and of the object to the goal.
obs = env.reset()
episodes = 0  # NOTE: despite its name, this counts timesteps of the episode.
reward_list = []
done = False
dist_c_g = []
dist_o_g = []
while not done:
    # Use the policy's deterministic action so the plotted trajectory reflects
    # the learned behaviour rather than exploration noise.
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(int(action))
    reward_list.append(reward)
    episodes += 1
    # obs = (cart - object, object - goal); their sum is (cart - goal).
    dist_c_g.append(np.linalg.norm(obs[0:2] + obs[2:4]))
    dist_o_g.append(np.linalg.norm(obs[2:4]))
def graph_plot():
    """Plot the recorded cart and object distances to the goal per timestep.

    Reads the module-level ``episodes`` (number of recorded timesteps),
    ``dist_c_g`` and ``dist_o_g`` and shows the figure with matplotlib.
    """
    timesteps = list(range(episodes))
    curves = (
        (dist_c_g, "Cart distance"),
        (dist_o_g, "Object distance"),
    )
    for distances, curve_label in curves:
        plt.plot(timesteps, distances, label=curve_label)
    plt.xlabel('timestep')
    plt.ylabel('distance to goal')
    plt.title('2D translation task')
    plt.legend()
    plt.show()


graph_plot()
然而,当我运行它时,我得到这样的图表......
问题是:在训练过程中,平均回合奖励和损失一直在波动。我不知道为什么会这样,但调整超参数(例如 learning_rate、ent_coef 等)没有任何效果。
训练时间不是问题:我已经训练了数百万个时间步,结果仍然一样。回合长度也不是问题。我还尝试过修改奖励函数(例如,对小车-物体距离和物体-目标距离采用不同的权重,用稀疏奖励代替密集奖励等),但都没有效果。
需要指出的是,我已经成功地训练小车到达物体。问题在于让它先到达物体,然后再把物体推向目标。
任何建议将不胜感激。
谢谢!