#

使用 Atari Breakout 进行 DQN 实

这个实验训练 Deep Q Network（DQN）在 OpenAI Gym 上玩 Atari Breakout 游戏。它在多个进程上运行游戏环境以进行高效采样。

16import numpy as np
17import torch
18
19from labml import tracker, experiment, logger, monit
20from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam
21from labml_helpers.schedule import Piecewise
22from labml_nn.rl.dqn import QFuncLoss
23from labml_nn.rl.dqn.model import Model
24from labml_nn.rl.dqn.replay_buffer import ReplayBuffer
25from labml_nn.rl.game import Worker

#

选择设备

28if torch.cuda.is_available():
29    device = torch.device("cuda:0")
30else:
31    device = torch.device("cpu")

#

将观测值从缩放[0, 255] 到[0, 1]

34def obs_to_torch(obs: np.ndarray) -> torch.Tensor:

#

36    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.

#

训练师

39class Trainer:

#

44    def __init__(self, *,
45                 updates: int, epochs: int,
46                 n_workers: int, worker_steps: int, mini_batch_size: int,
47                 update_target_model: int,
48                 learning_rate: FloatDynamicHyperParam,
49                 ):

#

工作人员人数

51        self.n_workers = n_workers

#

每次更新时采样的步骤

53        self.worker_steps = worker_steps

#

训练迭代次数

55        self.train_epochs = epochs

#

更新次数

58        self.updates = updates

#

用于训练的微型批次的大小

60        self.mini_batch_size = mini_batch_size

#

每 250 次更新一次目标网络

63        self.update_target_model = update_target_model

#

学习率

66        self.learning_rate = learning_rate

#

作为更新函数的探索

69        self.exploration_coefficient = Piecewise(
70            [
71                (0, 1.0),
72                (25_000, 0.1),
73                (self.updates / 2, 0.01)
74            ], outside_value=0.01)

#

$β$ 作为更新函数的重播缓冲区

77        self.prioritized_replay_beta = Piecewise(
78            [
79                (0, 0.4),
80                (self.updates, 1)
81            ], outside_value=1)

#

使用@@

重播缓冲区 $α = 0.6$ 。重播缓冲区的容量必须是 2 的幂。

84        self.replay_buffer = ReplayBuffer(2 ** 14, 0.6)

#

采样和训练模型

87        self.model = Model().to(device)

#

要获取的目标模型 $Q (s^{'}; θ_{i}^{-})$

89        self.target_model = Model().to(device)

#

创建工作人员

92        self.workers = [Worker(47 + i) for i in range(self.n_workers)]

#

初始化观测值的张量

95        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)

#

重置工作人员

98        for worker in self.workers:
99            worker.child.send(("reset", None))

#

获得初步观测值

102        for i, worker in enumerate(self.workers):
103            self.obs[i] = worker.child.recv()

#

损失函数

106        self.loss_func = QFuncLoss(0.99)

#

优化者

108        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4)

#

$ϵ$ -贪婪采样

在对@@

动作进行抽样时，我们使用 $ϵ$ -greedy策略，其中我们采取概率的贪婪动作， $1 - ϵ$ 并随机采取概率动作 $ϵ$ 。我们称之 $ϵ$ 为exploration_coefficient 。

110    def _sample_action(self, q_value: torch.Tensor, exploration_coefficient: float):

#

采样不需要渐变

120        with torch.no_grad():

#

采样具有最高 Q 值的动作。这是贪婪的行动。

122            greedy_action = torch.argmax(q_value, dim=-1)

#

统一采样和行动

124            random_action = torch.randint(q_value.shape[-1], greedy_action.shape, device=q_value.device)

#

选择贪婪动作还是随机动作

126            is_choose_rand = torch.rand(greedy_action.shape, device=q_value.device) < exploration_coefficient

#

根据以下内容选择操作is_choose_rand

128            return torch.where(is_choose_rand, random_action, greedy_action).cpu().numpy()

#

样本数据

130    def sample(self, exploration_coefficient: float):

#

这不需要渐变

134        with torch.no_grad():

#

样本worker_steps

136            for t in range(self.worker_steps):

#

获取当前观测值的 Q_Values

138                q_value = self.model(obs_to_torch(self.obs))

#

操作示例

140                actions = self._sample_action(q_value, exploration_coefficient)

#

对每个工作器运行采样操作

143                for w, worker in enumerate(self.workers):
144                    worker.child.send(("step", actions[w]))

#

收集每位员工的信息

147                for w, worker in enumerate(self.workers):

#

执行操作后获取结果

149                    next_obs, reward, done, info = worker.child.recv()

#

将过渡添加到重播缓冲区

152                    self.replay_buffer.add(self.obs[w], actions[w], reward, next_obs, done)

#

更新剧集信息。收集剧集信息，如果剧集结束则可用；这包括总奖励和剧集时长——看看Game 它是如何运作的。

158                    if info:
159                        tracker.add('reward', info['reward'])
160                        tracker.add('length', info['length'])

#

更新当前观测值

163                    self.obs[w] = next_obs

#

训练模型

165    def train(self, beta: float):

#

169        for _ in range(self.train_epochs):

#

来自优先级重播缓冲区的样本

171            samples = self.replay_buffer.sample(self.mini_batch_size, beta)

#

获取预测的 Q 值

173            q_value = self.model(obs_to_torch(samples['obs']))

#

获取 “双 Q 学习” 的下一个状态的 Q 值。梯度不应该为这些传播

177            with torch.no_grad():

#

得到 $Q (s^{'}; θ_{i})$

179                double_q_value = self.model(obs_to_torch(samples['next_obs']))

#

得到 $Q (s^{'}; θ_{i}^{-})$

181                target_q_value = self.target_model(obs_to_torch(samples['next_obs']))

#

计算时差 (TD) 误差和损失 $L (θ)$ 。 $δ$

184            td_errors, loss = self.loss_func(q_value,
185                                             q_value.new_tensor(samples['action']),
186                                             double_q_value, target_q_value,
187                                             q_value.new_tensor(samples['done']),
188                                             q_value.new_tensor(samples['reward']),
189                                             q_value.new_tensor(samples['weights']))

#

计算重播缓冲区的优先级 $p_{i} = ∣ δ_{i} ∣ + ϵ$

192            new_priorities = np.abs(td_errors.cpu().numpy()) + 1e-6

#

更新重播缓冲区优先级

194            self.replay_buffer.update_priorities(samples['indexes'], new_priorities)

#

设置学习速率

197            for pg in self.optimizer.param_groups:
198                pg['lr'] = self.learning_rate()

#

将先前计算的梯度归零

200            self.optimizer.zero_grad()

#

计算梯度

202            loss.backward()

#

剪辑渐变

204            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)

#

根据渐变更新参数

206            self.optimizer.step()

#

跑步训练循环

208    def run_training_loop(self):

#

最近 100 集信息

214        tracker.set_queue('reward', 100, True)
215        tracker.set_queue('length', 100, True)

#

最初复制到目标网络

218        self.target_model.load_state_dict(self.model.state_dict())
219
220        for update in monit.loop(self.updates):

#

$ϵ$ ，勘探分数

222            exploration = self.exploration_coefficient(update)
223            tracker.add('exploration', exploration)

#

$β$ 用于优先重播

225            beta = self.prioritized_replay_beta(update)
226            tracker.add('beta', beta)

#

当前政策的示例

229            self.sample(exploration)

#

缓冲区满后开始训练

232            if self.replay_buffer.is_full():

#

训练模型

234                self.train(beta)

#

定期更新目标网络

237                if update % self.update_target_model == 0:
238                    self.target_model.load_state_dict(self.model.state_dict())

#

保存跟踪的指标。

241            tracker.save()

#

定期在屏幕上添加新行

243            if (update + 1) % 1_000 == 0:
244                logger.log()

#

摧毁

阻止工人

246    def destroy(self):

#

251        for worker in self.workers:
252            worker.child.send(("close", None))

#

255def main():

#

创建实验

257    experiment.create(name='dqn')

#

配置

260    configs = {

#

更新次数

262        'updates': 1_000_000,

#

使用@@

采样数据训练模型的周期数。

264        'epochs': 8,

#

工作进程数

266        'n_workers': 8,

#

单次更新的每个进程要运行的步骤数

268        'worker_steps': 4,

#

小批量

270        'mini_batch_size': 32,

#

目标模型更新间隔

272        'update_target_model': 250,

#

学习率。

274        'learning_rate': FloatDynamicHyperParam(1e-4, (0, 1e-3)),
275    }

#

配置

278    experiment.configs(configs)

#

初始化训练器

281    m = Trainer(**configs)

#

运行并监控实验

283    with experiment.start():
284        m.run_training_loop()

#

阻止工人

286    m.destroy()

#

运行它

290if __name__ == "__main__":
291    main()

使用 Atari Breakout 进行 DQN 实

训练师

ϵ-贪婪采样

样本数据

训练模型

跑步训练循环

摧毁

运行它

$ϵ$ -贪婪采样