I have been using a DDPG agent in a custom Gym environment that has different constraints for each action channel. The action space is defined as follows:
self.action_space = spaces.Box(
    low=np.array([self.constraints[channel][0] for channel in self.channels]),
    high=np.array([self.constraints[channel][1] for channel in self.channels]),
    dtype=np.float64,
)
An example legal action from this space looks like array([0.44, 0.58, 1.05, 0.12]): four different channels, each constrained to within 20% of its value.
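Concretely, self.constraints maps each channel to a (low, high) pair built from its target value. A stripped-down illustration of how the bounds are put together (the channel names and target values below are just placeholders, not my real config):

import numpy as np
from gym import spaces

# placeholder channel targets; the real names and values are not shown here
targets = {"ch0": 0.44, "ch1": 0.58, "ch2": 1.05, "ch3": 0.12}

# each channel is constrained to within 20% of its value
constraints = {ch: (0.8 * v, 1.2 * v) for ch, v in targets.items()}
channels = list(constraints)

action_space = spaces.Box(
    low=np.array([constraints[ch][0] for ch in channels]),
    high=np.array([constraints[ch][1] for ch in channels]),
    dtype=np.float64,
)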
The agent I am using looks like this (partial code, trimmed for tidiness):
class ActorNetwork(nn.Module):
    def __init__(self, alpha, *args, **kwargs):  # other params omitted
        super(ActorNetwork, self).__init__()
        # actor architecture (fc1, bn1, fc2, bn2, mu) defined here
    def forward(self, state):
        x = self.fc1(state)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = T.sigmoid(self.mu(x)) * 1.5  # 1.5 is the max value across the 4 channels
        return x
class Agent(object):
    def __init__(self, alpha, *args, **kwargs):  # other params omitted
        # Actor-Critic (and target) networks initialized here
    def choose_action(self, observation):
        self.actor.eval()
        observation = T.tensor(observation, dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # exploration noise; sigma 0.07 hard-coded to test
        mu_prime = mu + T.tensor(np.random.normal(0.0, 0.07, size=4),
                                 dtype=T.float).to(self.actor.device)
        mu_prime = T.clamp(mu_prime, min=0.0, max=1.5)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()
    def predict_next_state(self, observation):
        self.actor.eval()
        observation = T.tensor(observation, dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)
        reward = T.tensor(reward, dtype=T.float).to(self.critic.device)
        done = T.tensor(done).to(self.critic.device)
        new_state = T.tensor(new_state, dtype=T.float).to(self.critic.device)
        action = T.tensor(action, dtype=T.float).to(self.critic.device)
        state = T.tensor(state, dtype=T.float).to(self.critic.device)
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)
        # networks updated with the losses here
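The omitted part is the usual DDPG update: compute the critic target from the target networks, regress the critic on it, then update the actor to maximize the critic value. A simplified sketch of that step (self.gamma, self.tau, and the per-network .optimizer attributes are written out purely for illustration):

        # sketch of the standard DDPG losses; self.gamma, self.tau and the
        # .optimizer attributes are illustrative names only
        # assuming done == 1 at terminal states
        target = reward + self.gamma * critic_value_.view(-1) * (1 - done.float())
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.train()
        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(state, self.actor.forward(state)).mean()
        actor_loss.backward()
        self.actor.optimizer.step()
        # finally, soft-update the target networks with factor self.tau (omitted)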
However, during training the agent outputs actions that violate the constraints defined in the environment's Box action space. How can I set this up so that the agent only selects actions from within the action space? Currently, I penalize every illegal action with the sum of squared differences between the action and the legal space. This does not seem to work once the action space gets larger: the agent just converges early and stops learning afterwards.
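To make the penalty concrete, it is roughly the squared distance from the action to the legal range, subtracted from the reward, along these lines (variable names here are just illustrative):

# rough sketch of the current penalty: squared distance to the legal range
clipped = np.clip(action, env.action_space.low, env.action_space.high)
penalty = np.sum((action - clipped) ** 2)
reward = reward - penalty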
I am facing a similar problem. Were you able to find a solution?