PPO Experiment with Atari Breakout

This experiment trains a Proximal Policy Optimization (PPO) agent to play Atari Breakout on OpenAI Gym. It runs the game environments on multiple processes for efficient sampling.


from typing import Dict

import numpy as np
import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical

from labml import monit, tracker, logger, experiment
from labml.configs import FloatDynamicHyperParam, IntDynamicHyperParam
from labml_helpers.module import Module
from labml_nn.rl.game import Worker
from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
from labml_nn.rl.ppo.gae import GAE

Select device

if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Model

class Model(Module):
    def __init__(self):
        super().__init__()

The first convolution layer takes an 84x84 frame and produces a 20x20 frame

        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)

The second convolution layer takes a 20x20 frame and produces a 9x9 frame

        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)

The third convolution layer takes a 9x9 frame and produces a 7x7 frame

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)

A fully connected layer takes the flattened frame from the third convolution layer and outputs 512 features

        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)

A fully connected layer to get the policy logits

        self.pi_logits = nn.Linear(in_features=512, out_features=4)

A fully connected layer to get the value function

        self.value = nn.Linear(in_features=512, out_features=1)

        self.activation = nn.ReLU()

    def forward(self, obs: torch.Tensor):
        h = self.activation(self.conv1(obs))
        h = self.activation(self.conv2(h))
        h = self.activation(self.conv3(h))
        h = h.reshape((-1, 7 * 7 * 64))

        h = self.activation(self.lin(h))

        pi = Categorical(logits=self.pi_logits(h))
        value = self.value(h).reshape(-1)

        return pi, value
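As a quick sanity check on the frame sizes quoted above, the output size of each convolution (no padding) is (input - kernel) // stride + 1, and a dummy forward pass confirms the shapes. The snippet below is illustrative only and is not part of the training code:

# conv output sizes: 84 -> 20 -> 9 -> 7, matching the comments above
assert (84 - 8) // 4 + 1 == 20
assert (20 - 4) // 2 + 1 == 9
assert (9 - 3) // 1 + 1 == 7

# a dummy batch with four stacked 84x84 frames
_model = Model().to(device)
_pi, _value = _model(torch.zeros((1, 4, 84, 84), device=device))
print(_pi.logits.shape, _value.shape)  # torch.Size([1, 4]) torch.Size([1])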

Scale observations from [0, 255] to [0, 1]

def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.

Trainer

class Trainer:
    def __init__(self, *,
                 updates: int, epochs: IntDynamicHyperParam,
                 n_workers: int, worker_steps: int, batches: int,
                 value_loss_coef: FloatDynamicHyperParam,
                 entropy_bonus_coef: FloatDynamicHyperParam,
                 clip_range: FloatDynamicHyperParam,
                 learning_rate: FloatDynamicHyperParam,
                 ):

Configurations

Number of updates

        self.updates = updates

Number of epochs to train the model with sampled data

        self.epochs = epochs

Number of worker processes

        self.n_workers = n_workers

Number of steps to run on each process for a single update

        self.worker_steps = worker_steps

Number of mini batches

        self.batches = batches

Total number of samples for a single update

        self.batch_size = self.n_workers * self.worker_steps

Size of a mini batch

        self.mini_batch_size = self.batch_size // self.batches
        assert (self.batch_size % self.batches == 0)
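For concreteness, with the defaults used in main() below (8 workers, 128 steps per worker, 4 mini batches) the sizes work out as follows:

n_workers, worker_steps, batches = 8, 128, 4
batch_size = n_workers * worker_steps      # 1024 samples collected per update
mini_batch_size = batch_size // batches    # 256 samples per gradient step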

Value loss coefficient

        self.value_loss_coef = value_loss_coef

Entropy bonus coefficient

        self.entropy_bonus_coef = entropy_bonus_coef

Clipping range

        self.clip_range = clip_range

Learning rate

        self.learning_rate = learning_rate

Initialize

Create workers

        self.workers = [Worker(47 + i) for i in range(self.n_workers)]

Initialize tensors for observations

        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
        for worker in self.workers:
            worker.child.send(("reset", None))
        for i, worker in enumerate(self.workers):
            self.obs[i] = worker.child.recv()
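The trainer only relies on the small message protocol shown above: it sends ("reset", None), ("step", action) or ("close", None) down the pipe and receives either an initial observation or an (observation, reward, done, info) tuple back. A hypothetical worker loop sketching that protocol might look like the following (make_env and the process setup are placeholders, not the actual labml_nn.rl.game.Worker implementation):

def worker_process(remote, make_env):
    # runs in a child process; `remote` is one end of a multiprocessing.Pipe
    env = make_env()
    while True:
        cmd, data = remote.recv()
        if cmd == "reset":
            remote.send(env.reset())
        elif cmd == "step":
            remote.send(env.step(data))  # (observation, reward, done, info)
        elif cmd == "close":
            remote.close()
            break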

Model

        self.model = Model().to(device)

Optimizer

        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

GAE with γ = 0.99 and λ = 0.95

        self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)
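The GAE helper is used in sample() below to turn rewards, value estimates and episode-termination flags into advantages. A minimal sketch of generalized advantage estimation with the same calling convention (not the labml_nn.rl.ppo.gae implementation itself; values carries one extra time step for bootstrapping):

def gae_sketch(done, rewards, values, gamma=0.99, lam=0.95):
    # done, rewards: [workers, steps]; values: [workers, steps + 1]
    n_workers, steps = rewards.shape
    advantages = np.zeros((n_workers, steps), dtype=np.float32)
    last_advantage = np.zeros(n_workers, dtype=np.float32)
    for t in reversed(range(steps)):
        mask = 1.0 - done[:, t]  # stop bootstrapping across episode boundaries
        delta = rewards[:, t] + gamma * values[:, t + 1] * mask - values[:, t]
        last_advantage = delta + gamma * lam * last_advantage * mask
        advantages[:, t] = last_advantage
    return advantages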

PPO loss

        self.ppo_loss = ClippedPPOLoss()

Value loss

        self.value_loss = ClippedValueFunctionLoss()
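For reference, these two losses follow the clipped objectives from the PPO paper. A minimal sketch of what they compute (not the ClippedPPOLoss / ClippedValueFunctionLoss implementations themselves, which also track statistics such as the clip fraction):

def clipped_policy_loss_sketch(log_pi, sampled_log_pi, advantage, clip):
    ratio = torch.exp(log_pi - sampled_log_pi)            # pi(a|s) / pi_old(a|s)
    clipped = ratio.clamp(min=1.0 - clip, max=1.0 + clip)
    # maximize the clipped surrogate objective, i.e. minimize its negation
    return -torch.min(ratio * advantage, clipped * advantage).mean()

def clipped_value_loss_sketch(value, sampled_value, sampled_return, clip):
    # keep the new value estimate within `clip` of the old estimate
    clipped_value = sampled_value + (value - sampled_value).clamp(min=-clip, max=clip)
    loss = torch.max((value - sampled_return) ** 2, (clipped_value - sampled_return) ** 2)
    return 0.5 * loss.mean()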

Sample data with the current policy

    def sample(self) -> Dict[str, torch.Tensor]:
        rewards = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        actions = np.zeros((self.n_workers, self.worker_steps), dtype=np.int32)
        done = np.zeros((self.n_workers, self.worker_steps), dtype=bool)
        obs = np.zeros((self.n_workers, self.worker_steps, 4, 84, 84), dtype=np.uint8)
        log_pis = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        values = np.zeros((self.n_workers, self.worker_steps + 1), dtype=np.float32)

        with torch.no_grad():

Sample worker_steps from each worker

            for t in range(self.worker_steps):

self.obs keeps track of the last observation from each worker, which is the input for the model to sample the next action

                obs[:, t] = self.obs

Sample actions from π for each worker; this returns arrays of size n_workers

                pi, v = self.model(obs_to_torch(self.obs))
                values[:, t] = v.cpu().numpy()
                a = pi.sample()
                actions[:, t] = a.cpu().numpy()
                log_pis[:, t] = pi.log_prob(a).cpu().numpy()

Run the sampled actions on each worker

                for w, worker in enumerate(self.workers):
                    worker.child.send(("step", actions[w, t]))

                for w, worker in enumerate(self.workers):

Get results after executing the actions

                    self.obs[w], rewards[w, t], done[w, t], info = worker.child.recv()

Collect episode info, which is available when an episode finishes; this includes the total reward and the length of the episode. Look at Game to see how it works.

                    if info:
                        tracker.add('reward', info['reward'])
                        tracker.add('length', info['length'])

Get the value of the state after the final step

            _, v = self.model(obs_to_torch(self.obs))
            values[:, self.worker_steps] = v.cpu().numpy()

Calculate advantages

        advantages = self.gae(done, rewards, values)

        samples = {
            'obs': obs,
            'actions': actions,
            'values': values[:, :-1],
            'log_pis': log_pis,
            'advantages': advantages
        }

The samples are currently in [workers, time_step] tables; we flatten them for training

        samples_flat = {}
        for k, v in samples.items():
            v = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
            if k == 'obs':
                samples_flat[k] = obs_to_torch(v)
            else:
                samples_flat[k] = torch.tensor(v, device=device)

        return samples_flat
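To make the flattening concrete: with 8 workers and 128 steps each (the defaults in main() below), observations go from [8, 128, 4, 84, 84] to [1024, 4, 84, 84], and per-step scalars such as actions, log-probabilities and advantages go from [8, 128] to [1024]. For example:

v = np.zeros((8, 128, 4, 84, 84), dtype=np.uint8)
flat = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
print(flat.shape)  # (1024, 4, 84, 84)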

Train the model based on samples

    def train(self, samples: Dict[str, torch.Tensor]):

It learns faster with a higher number of epochs, but becomes a little unstable; that is, the average episode reward does not increase monotonically over time. Reducing the clipping range might solve this.

        for _ in range(self.epochs()):

Shuffle for each epoch

            indexes = torch.randperm(self.batch_size)

For each mini batch

            for start in range(0, self.batch_size, self.mini_batch_size):

Get the mini batch

                end = start + self.mini_batch_size
                mini_batch_indexes = indexes[start: end]
                mini_batch = {}
                for k, v in samples.items():
                    mini_batch[k] = v[mini_batch_indexes]

Train

                loss = self._calc_loss(mini_batch)

Set the learning rate

                for pg in self.optimizer.param_groups:
                    pg['lr'] = self.learning_rate()

Zero out previously computed gradients

                self.optimizer.zero_grad()

Compute gradients

                loss.backward()

Clip gradients

                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)

Update parameters based on gradients

                self.optimizer.step()

Normalize the advantage function

    @staticmethod
    def _normalize(adv: torch.Tensor):
        return (adv - adv.mean()) / (adv.std() + 1e-8)

Calculate the total loss

    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:

Returns sampled with the old policy; the sampled return is the value estimate plus the advantage

        sampled_return = samples['values'] + samples['advantages']

Normalized advantages, where the advantages were sampled with the old policy. Refer to the sample function in the Trainer class above for how they are computed.

        sampled_normalized_advantage = self._normalize(samples['advantages'])

The sampled observations are fed into the model to get the policy distribution and the value; we treat the observations as the state

        pi, value = self.model(samples['obs'])

Log-probabilities of the sampled actions under the current policy; the actions themselves were sampled with the old policy

        log_pi = pi.log_prob(samples['actions'])

Calculate the policy loss

        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())

Calculate the entropy bonus

        entropy_bonus = pi.entropy()
        entropy_bonus = entropy_bonus.mean()

Calculate the value function loss

        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())

The total loss is the policy loss, plus the weighted value function loss, minus the weighted entropy bonus

        loss = (policy_loss
                + self.value_loss_coef() * value_loss
                - self.entropy_bonus_coef() * entropy_bonus)

Approximate KL divergence, for monitoring

        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()

Add to tracker

        tracker.add({'policy_reward': -policy_loss,
                     'value_loss': value_loss,
                     'entropy_bonus': entropy_bonus,
                     'kl_div': approx_kl_divergence,
                     'clip_fraction': self.ppo_loss.clip_fraction})

        return loss
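The kl_div metric tracked above uses the simple estimator 0.5 * mean((log π_old - log π)^2), which approximates the KL divergence between the old and new policies when they are close. A small, self-contained check of that behaviour (the distributions and numbers here are made up for illustration):

p = Categorical(logits=torch.tensor([1.0, 0.5, 0.2, -0.3]))    # "old" policy
q = Categorical(logits=torch.tensor([1.1, 0.4, 0.25, -0.35]))  # "new" policy
a = p.sample((100_000,))
approx_kl = 0.5 * ((p.log_prob(a) - q.log_prob(a)) ** 2).mean()
exact_kl = torch.distributions.kl_divergence(p, q)
print(approx_kl.item(), exact_kl.item())  # both should be small and of similar size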

Run the training loop

    def run_training_loop(self):

Information from the last 100 episodes

        tracker.set_queue('reward', 100, True)
        tracker.set_queue('length', 100, True)

        for update in monit.loop(self.updates):

Sample with the current policy

            samples = self.sample()

Train the model

            self.train(samples)

Save tracked indicators

            tracker.save()

Periodically add a new line to the screen

            if (update + 1) % 1_000 == 0:
                logger.log()

Destroy

Stop the workers

    def destroy(self):
        for worker in self.workers:
            worker.child.send(("close", None))


def main():

Create the experiment

    experiment.create(name='ppo')

Configurations

    configs = {

Number of updates

        'updates': 10000,

⚙️ Number of epochs to train the model with sampled data. You can change this while the experiment is running.

        'epochs': IntDynamicHyperParam(8),

Number of worker processes

        'n_workers': 8,

Number of steps to run on each process for a single update

        'worker_steps': 128,

Number of mini batches

        'batches': 4,

⚙️ Value loss coefficient. You can change this while the experiment is running.

        'value_loss_coef': FloatDynamicHyperParam(0.5),

⚙️ Entropy bonus coefficient. You can change this while the experiment is running.

        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),

⚙️ Clipping range. You can change this while the experiment is running.

        'clip_range': FloatDynamicHyperParam(0.1),

⚙️ Learning rate. You can change this while the experiment is running.

        'learning_rate': FloatDynamicHyperParam(1e-3, (0, 1e-3)),
    }

    experiment.configs(configs)
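The IntDynamicHyperParam and FloatDynamicHyperParam entries are read by calling them, as the Trainer does with self.epochs(), self.clip_range() and self.learning_rate(); this is what makes them adjustable while the experiment is running. A hypothetical stand-in illustrating that calling convention (not labml's implementation):

class DynamicValueSketch:
    def __init__(self, value, value_range=None):
        self.value, self.value_range = value, value_range

    def set_value(self, value):
        # in labml this update would come from outside while the run is live
        self.value = value

    def __call__(self):
        return self.value

lr = DynamicValueSketch(1e-3, (0, 1e-3))
print(lr())            # 0.001, read the same way as self.learning_rate()
lr.set_value(2.5e-4)
print(lr())            # 0.00025, picked up on the next mini batch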

Initialize the trainer

    m = Trainer(**configs)

Run and monitor the experiment

    with experiment.start():
        m.run_training_loop()

Stop the workers

    m.destroy()

Run it

if __name__ == "__main__":
    main()