This experiment trains a Proximal Policy Optimization (PPO) agent on the Atari Breakout game from OpenAI Gym. It runs the game environments on multiple processes to sample efficiently.
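The environment workers live in labml_nn.rl.game; each Worker runs one preprocessed Breakout environment in a child process and talks to the trainer over a pipe, answering the ("reset", None), ("step", action) and ("close", None) messages used below. The following is only a minimal sketch of that pattern, not the actual implementation; env_fn stands in for a hypothetical factory that builds the preprocessed game.

import multiprocessing
import multiprocessing.connection


def worker_process(remote: multiprocessing.connection.Connection, env_fn, seed: int):
    # Run a single environment in this child process, driven by pipe messages.
    env = env_fn(seed)  # `env_fn` is a hypothetical factory for the preprocessed game
    while True:
        cmd, data = remote.recv()
        if cmd == "step":
            remote.send(env.step(data))   # -> (observation, reward, done, info)
        elif cmd == "reset":
            remote.send(env.reset())      # -> initial observation
        elif cmd == "close":
            remote.close()
            break


class Worker:
    # Parent-side handle: `child` is the pipe end the trainer sends to and receives from.
    def __init__(self, env_fn, seed: int):
        self.child, parent = multiprocessing.Pipe()
        self.process = multiprocessing.Process(target=worker_process,
                                               args=(parent, env_fn, seed))
        self.process.start()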
from typing import Dict

import numpy as np
import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical

from labml import monit, tracker, logger, experiment
from labml.configs import FloatDynamicHyperParam, IntDynamicHyperParam
from labml_helpers.module import Module
from labml_nn.rl.game import Worker
from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
from labml_nn.rl.ppo.gae import GAE

Select device

if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Model

class Model(Module):
    def __init__(self):
        super().__init__()

The first convolution layer takes an 84x84 frame and produces a 20x20 frame

        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)

The second convolution layer takes a 20x20 frame and produces a 9x9 frame

        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)

The third convolution layer takes a 9x9 frame and produces a 7x7 frame

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)

A fully connected layer takes the flattened frame from the third convolution layer and outputs 512 features

        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)

A fully connected layer to get the policy logits

        self.pi_logits = nn.Linear(in_features=512, out_features=4)

A fully connected layer to get the value function

        self.value = nn.Linear(in_features=512, out_features=1)

        self.activation = nn.ReLU()

    def forward(self, obs: torch.Tensor):
        h = self.activation(self.conv1(obs))
        h = self.activation(self.conv2(h))
        h = self.activation(self.conv3(h))
        h = h.reshape((-1, 7 * 7 * 64))

        h = self.activation(self.lin(h))

        pi = Categorical(logits=self.pi_logits(h))
        value = self.value(h).reshape(-1)

        return pi, value
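The frame sizes quoted above follow the usual convolution arithmetic, output = (input - kernel) // stride + 1: (84 - 8) // 4 + 1 = 20, (20 - 4) // 2 + 1 = 9 and (9 - 3) // 1 + 1 = 7, which is why the flattened feature size is 7 * 7 * 64. A quick standalone shape check of the same stack (a sketch, separate from the Model class above):

import torch
from torch import nn

# The same convolution stack as in Model, applied to a dummy batch of stacked frames.
conv_stack = nn.Sequential(
    nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
)
h = conv_stack(torch.zeros(1, 4, 84, 84))
assert h.shape == (1, 64, 7, 7)  # flattens to 7 * 7 * 64 = 3136 features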
Scale observations from [0, 255] to [0, 1]

def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.

Trainer

class Trainer:
    def __init__(self, *,
                 updates: int, epochs: IntDynamicHyperParam,
                 n_workers: int, worker_steps: int, batches: int,
                 value_loss_coef: FloatDynamicHyperParam,
                 entropy_bonus_coef: FloatDynamicHyperParam,
                 clip_range: FloatDynamicHyperParam,
                 learning_rate: FloatDynamicHyperParam,
                 ):

Number of updates

        self.updates = updates

Number of epochs to train the model with sampled data

        self.epochs = epochs

Number of worker processes

        self.n_workers = n_workers

Number of steps to run on each process for a single update

        self.worker_steps = worker_steps

Number of mini batches

        self.batches = batches

Total number of samples for a single update

        self.batch_size = self.n_workers * self.worker_steps

Size of a mini batch; with the default configuration below (8 workers, 128 steps, 4 mini batches) that is 8 * 128 / 4 = 256 samples

        self.mini_batch_size = self.batch_size // self.batches
        assert (self.batch_size % self.batches == 0)

Value loss coefficient

        self.value_loss_coef = value_loss_coef

Entropy bonus coefficient

        self.entropy_bonus_coef = entropy_bonus_coef

Clipping range

        self.clip_range = clip_range

Learning rate

        self.learning_rate = learning_rate

Create workers

        self.workers = [Worker(47 + i) for i in range(self.n_workers)]

Initialize a tensor for the observations

        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
        for worker in self.workers:
            worker.child.send(("reset", None))
        for i, worker in enumerate(self.workers):
            self.obs[i] = worker.child.recv()

Model

        self.model = Model().to(device)

Optimizer (the initial learning rate is overwritten every mini batch from the dynamic learning_rate hyperparameter in train())

        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

GAE with γ = 0.99 and λ = 0.95

        self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)

PPO loss

        self.ppo_loss = ClippedPPOLoss()

Value loss

        self.value_loss = ClippedValueFunctionLoss()
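The GAE module is constructed with γ = 0.99 and λ = 0.95 and is applied to the done, rewards and values arrays collected in sample() below. It computes the standard generalized advantage estimate, δ_t = r_t + γ V(s_{t+1}) − V(s_t) and Â_t = δ_t + γλ Â_{t+1}, with the bootstrap masked at episode boundaries. A minimal NumPy sketch of that computation (the annotated implementation lives in labml_nn.rl.ppo.gae):

import numpy as np

def gae_sketch(done, rewards, values, gamma=0.99, lam=0.95):
    # done, rewards: [workers, steps]; values: [workers, steps + 1]
    n_workers, steps = rewards.shape
    advantages = np.zeros((n_workers, steps), dtype=np.float32)
    last_advantage = np.zeros(n_workers, dtype=np.float32)
    for t in reversed(range(steps)):
        mask = 1.0 - done[:, t]  # zero the bootstrap across episode ends
        delta = rewards[:, t] + gamma * values[:, t + 1] * mask - values[:, t]
        last_advantage = delta + gamma * lam * last_advantage * mask
        advantages[:, t] = last_advantage
    return advantages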
Sample data with the current policy

    def sample(self) -> Dict[str, torch.Tensor]:
        rewards = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        actions = np.zeros((self.n_workers, self.worker_steps), dtype=np.int32)
        done = np.zeros((self.n_workers, self.worker_steps), dtype=bool)
        obs = np.zeros((self.n_workers, self.worker_steps, 4, 84, 84), dtype=np.uint8)
        log_pis = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        values = np.zeros((self.n_workers, self.worker_steps + 1), dtype=np.float32)

        with torch.no_grad():

Sample worker_steps from each worker

            for t in range(self.worker_steps):

self.obs keeps track of the last observation from each worker, which is the input for the model to sample the next action

                obs[:, t] = self.obs

Sample actions from π for each worker; this returns arrays of size n_workers

                pi, v = self.model(obs_to_torch(self.obs))
                values[:, t] = v.cpu().numpy()
                a = pi.sample()
                actions[:, t] = a.cpu().numpy()
                log_pis[:, t] = pi.log_prob(a).cpu().numpy()

Run the sampled actions on each worker

                for w, worker in enumerate(self.workers):
                    worker.child.send(("step", actions[w, t]))

                for w, worker in enumerate(self.workers):

Get the results after executing the actions

                    self.obs[w], rewards[w, t], done[w, t], info = worker.child.recv()

Collect episode info, which is available when an episode finishes; this includes the total reward and episode length. See Game for how it works.

                    if info:
                        tracker.add('reward', info['reward'])
                        tracker.add('length', info['length'])

Get the value of the state after the final step

            _, v = self.model(obs_to_torch(self.obs))
            values[:, self.worker_steps] = v.cpu().numpy()

Calculate advantages

        advantages = self.gae(done, rewards, values)
        samples = {
            'obs': obs,
            'actions': actions,
            'values': values[:, :-1],
            'log_pis': log_pis,
            'advantages': advantages
        }

Samples are currently in [workers, time_step] layout; flatten them for training

        samples_flat = {}
        for k, v in samples.items():
            v = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
            if k == 'obs':
                samples_flat[k] = obs_to_torch(v)
            else:
                samples_flat[k] = torch.tensor(v, device=device)

        return samples_flat
Train the model based on samples

    def train(self, samples: Dict[str, torch.Tensor]):

It learns faster with a larger number of epochs, but becomes a little unstable; that is, the average episode reward does not increase monotonically over time. Reducing the clipping range might solve this.

        for _ in range(self.epochs()):

Shuffle for each epoch

            indexes = torch.randperm(self.batch_size)

For each mini batch

            for start in range(0, self.batch_size, self.mini_batch_size):

Get the mini batch

                end = start + self.mini_batch_size
                mini_batch_indexes = indexes[start: end]
                mini_batch = {}
                for k, v in samples.items():
                    mini_batch[k] = v[mini_batch_indexes]

Train

                loss = self._calc_loss(mini_batch)

Set the learning rate

                for pg in self.optimizer.param_groups:
                    pg['lr'] = self.learning_rate()

Zero out previously calculated gradients

                self.optimizer.zero_grad()

Calculate gradients

                loss.backward()

Clip gradients

                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)

Update parameters based on gradients

                self.optimizer.step()
Normalize advantage function

    @staticmethod
    def _normalize(adv: torch.Tensor):
        return (adv - adv.mean()) / (adv.std() + 1e-8)

Calculate the total loss

    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:

Returns sampled from the old policy

        sampled_return = samples['values'] + samples['advantages']

Normalized advantages sampled from the old policy

        sampled_normalized_advantage = self._normalize(samples['advantages'])

Sampled observations are fed into the model to get π(a_t|s_t) and the value V(s_t); we treat the observations as the state

        pi, value = self.model(samples['obs'])

log π(a_t|s_t), where a_t are the actions sampled from the old policy

        log_pi = pi.log_prob(samples['actions'])

Calculate the policy loss

        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())

Calculate the entropy bonus

        entropy_bonus = pi.entropy()
        entropy_bonus = entropy_bonus.mean()

Calculate the value function loss

        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())

Total loss: the policy loss, plus the value loss weighted by the value loss coefficient, minus the entropy bonus weighted by the entropy bonus coefficient

        loss = (policy_loss
                + self.value_loss_coef() * value_loss
                - self.entropy_bonus_coef() * entropy_bonus)

Approximate KL divergence between the old and new policies, for monitoring

        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()

Add to tracker

        tracker.add({'policy_reward': -policy_loss,
                     'value_loss': value_loss,
                     'entropy_bonus': entropy_bonus,
                     'kl_div': approx_kl_divergence,
                     'clip_fraction': self.ppo_loss.clip_fraction})

        return loss
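ClippedPPOLoss and ClippedValueFunctionLoss used above implement the usual PPO clipping with the probability ratio r_t = π(a_t|s_t) / π_old(a_t|s_t). The following is only a rough sketch of those objectives, not a drop-in replacement; the annotated implementations are in labml_nn.rl.ppo.

import torch

def clipped_policy_loss(log_pi, sampled_log_pi, advantage, clip: float):
    # Probability ratio, computed in log space for numerical stability
    ratio = torch.exp(log_pi - sampled_log_pi)
    clipped = ratio.clamp(1.0 - clip, 1.0 + clip)
    # Maximize the clipped surrogate, i.e. minimize its negation
    return -torch.min(ratio * advantage, clipped * advantage).mean()

def clipped_value_loss(value, sampled_value, sampled_return, clip: float):
    # Keep the new value prediction close to the old one (PPO2-style clipping)
    clipped_value = sampled_value + (value - sampled_value).clamp(-clip, clip)
    loss = torch.max((value - sampled_return) ** 2,
                     (clipped_value - sampled_return) ** 2)
    return 0.5 * loss.mean()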
Run the training loop

    def run_training_loop(self):

Last 100 episode information

        tracker.set_queue('reward', 100, True)
        tracker.set_queue('length', 100, True)

        for update in monit.loop(self.updates):

Sample with the current policy

            samples = self.sample()

Train the model

            self.train(samples)

Save tracked indicators

            tracker.save()

Add a new line to the screen periodically

            if (update + 1) % 1_000 == 0:
                logger.log()

Stop the workers

    def destroy(self):
        for worker in self.workers:
            worker.child.send(("close", None))
def main():

Create the experiment

    experiment.create(name='ppo')

Configurations

    configs = {

Number of updates

        'updates': 10000,

⚙️ Number of epochs to train the model with sampled data. You can change this while the experiment is running.

        'epochs': IntDynamicHyperParam(8),

Number of worker processes

        'n_workers': 8,

Number of steps to run on each process for a single update

        'worker_steps': 128,

Number of mini batches

        'batches': 4,

⚙️ Value loss coefficient. You can change this while the experiment is running.

        'value_loss_coef': FloatDynamicHyperParam(0.5),

⚙️ Entropy bonus coefficient. You can change this while the experiment is running.

        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),

⚙️ Clip range.

        'clip_range': FloatDynamicHyperParam(0.1),

⚙️ Learning rate.

        'learning_rate': FloatDynamicHyperParam(1e-3, (0, 1e-3)),
    }

    experiment.configs(configs)

Initialize the trainer

    m = Trainer(**configs)

Run and monitor the experiment

    with experiment.start():
        m.run_training_loop()

Stop the workers

    m.destroy()

if __name__ == "__main__":
    main()
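A note on the dynamic hyperparameters: IntDynamicHyperParam and FloatDynamicHyperParam values are callable and return their current value, which is why the trainer reads them as self.epochs(), self.clip_range() and self.learning_rate() inside the update loop, and why they can be adjusted while the experiment is running. A small illustration, relying only on the constructor forms used in this file:

from labml.configs import FloatDynamicHyperParam, IntDynamicHyperParam

# Default value 2.5e-4, with an allowed range of (0, 1e-3)
lr = FloatDynamicHyperParam(2.5e-4, (0, 1e-3))
epochs = IntDynamicHyperParam(8)

# Calling the hyperparameter returns its current value
print(lr(), epochs())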