I am working on an LSTM RecurrentPPO agent, and it needs a behavior cloning implementation.
The imitation library provided for Stable Baselines 3 (see here: https://imitation.readthedocs.io/en/latest/) does not seem to be built for SB3-contrib's RecurrentPPO.
I found this approach, which could be adapted for RecurrentPPO: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/pretraining.ipynb
I think this part of the code has to be modified to account for lstm_states and episode_starts, but I don't know how to implement it.
import gym
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from stable_baselines3 import A2C, PPO

# `env`, `train_expert_dataset` and `test_expert_dataset` are globals
# defined in earlier cells of the notebook.

def pretrain_agent(
    student,
    batch_size=64,
    epochs=1000,
    scheduler_gamma=0.7,
    learning_rate=1.0,
    log_interval=100,
    no_cuda=True,
    seed=1,
    test_batch_size=64,
):
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (A2C, PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                dist = model.get_distribution(data)
                action_prediction = dist.distribution.logits
                target = target.long()
            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (A2C, PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    dist = model.get_distribution(data)
                    action_prediction = dist.distribution.logits
                    target = target.long()
                # Accumulate the per-batch losses (the notebook overwrote
                # test_loss here, so only the last batch was reported)
                test_loss += criterion(action_prediction, target).item()
        # Average over batches, since criterion already averages within a batch
        test_loss /= len(test_loader)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch `DataLoader` to load our previously created
    # `ExpertDataset` for training and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    # (the notebook assigned to the global `a2c_student` here; using the
    # `student` argument keeps the function self-contained)
    student.policy = model
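To make the question concrete, here is a minimal sketch of how the discrete-action branch might be adapted for a RecurrentPPO student. It is only a sketch under two assumptions: that sb3_contrib's RecurrentActorCriticPolicy.get_distribution takes (obs, lstm_states, episode_starts) and returns the distribution together with the updated states, and that every expert sample is treated as the start of a fresh episode (zeroed LSTM states, episode_starts of ones).

import torch as th
from sb3_contrib import RecurrentPPO

def recurrent_logits(model, data, device):
    # Hypothetical helper: query the recurrent policy as if it were
    # feed-forward by resetting its memory for every sample in the batch.
    lstm = model.lstm_actor
    n = data.shape[0]
    # Actor hidden/cell states of shape (num_layers, batch_size, hidden_size)
    h = th.zeros(lstm.num_layers, n, lstm.hidden_size, device=device)
    c = th.zeros(lstm.num_layers, n, lstm.hidden_size, device=device)
    # episode_starts = 1 tells the policy to use the (zeroed) initial states
    episode_starts = th.ones(n, device=device)
    dist, _ = model.get_distribution(data, (h, c), episode_starts)
    return dist.distribution.logits

# The discrete branch in train()/test() would then become:
#     if isinstance(student, RecurrentPPO):
#         action_prediction = recurrent_logits(model, data, device)
#     else:
#         dist = model.get_distribution(data)
#         action_prediction = dist.distribution.logits
#     target = target.long()

The obvious limitation: with episode_starts always 1, the LSTM never carries context between samples, so this only clones the reactive part of the expert's behavior. A faithful adaptation would feed contiguous expert sequences and propagate the returned lstm_states across time steps.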
Does anyone have a solution?
Best regards.
I also stumbled upon this problem.
Traceback (most recent call last):
  File "my_imitate.py", line 49, in <module>
    bc_trainer.train(n_epochs=1)
  File "python3.8/site-packages/imitation/algorithms/bc.py", line 470, in train
    training_metrics = self.loss_calculator(self.policy, obs, acts)
  File "python3.8/site-packages/imitation/algorithms/bc.py", line 119, in __call__
    _, log_prob, entropy = policy.evaluate_actions(obs, acts)
TypeError: evaluate_actions() missing 2 required positional arguments: 'lstm_states' and 'episode_starts'
The problem is apparently that RecurrentActorCriticPolicy has a different signature for evaluate_actions: it additionally requires lstm_states and episode_starts. This means that this information also has to be stored during rollout collection (I assumed it would be, but it is not).
The proper solution would be to store the missing information during rollout collection and to handle it during BC (if it is present and compatible with the current policy).
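As a stop-gap, the extra arguments can be hidden from the imitation library's BC loop by wrapping evaluate_actions with freshly zeroed states. This is only a sketch: make_bc_compatible is a hypothetical helper (it is not part of imitation or sb3-contrib), and resetting the memory on every batch means the LSTM contributes no temporal context during BC.

import torch as th
from sb3_contrib.common.recurrent.type_aliases import RNNStates

def make_bc_compatible(policy):
    # Hypothetical wrapper: let a RecurrentActorCriticPolicy answer the
    # feed-forward evaluate_actions(obs, acts) call that bc.BC makes.
    recurrent_evaluate_actions = policy.evaluate_actions

    def evaluate_actions(obs, actions):
        n = obs.shape[0]
        shape = (policy.lstm_actor.num_layers, n, policy.lstm_actor.hidden_size)
        # Fresh (zeroed) hidden/cell states for both the actor and the critic
        lstm_states = RNNStates(
            pi=(th.zeros(shape, device=obs.device), th.zeros(shape, device=obs.device)),
            vf=(th.zeros(shape, device=obs.device), th.zeros(shape, device=obs.device)),
        )
        episode_starts = th.ones(n, device=obs.device)
        return recurrent_evaluate_actions(obs, actions, lstm_states, episode_starts)

    policy.evaluate_actions = evaluate_actions
    return policy

Usage would then look like bc_trainer = bc.BC(..., policy=make_bc_compatible(student.policy), ...). For true recurrent BC, the expert data would also need lstm_states and episode_starts recorded at collection time, as described above.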