diff --git a/labml_nn/rl/__init__.py b/labml_nn/rl/__init__.py
index 199030ae..c428bd1a 100644
--- a/labml_nn/rl/__init__.py
+++ b/labml_nn/rl/__init__.py
@@ -3,7 +3,8 @@
 * [Proximal Policy Optimization](ppo)
     * [This is an experiment](ppo/experiment.html) that runs a PPO agent on Atari Breakout.
-* [Generalized advantage estimation](ppo/gae.html)
+    * [Generalized advantage estimation](ppo/gae.html)
+* [Deep Q Networks](dqn)
 
 [This is the implementation for OpenAI game wrapper](game.html) that uses `multiprocessing`.
 """
\ No newline at end of file
diff --git a/labml_nn/rl/dqn/__init__.py b/labml_nn/rl/dqn/__init__.py
index d4d1c522..dd4956d5 100644
--- a/labml_nn/rl/dqn/__init__.py
+++ b/labml_nn/rl/dqn/__init__.py
@@ -1,13 +1,12 @@
 """
 # Deep Q Networks
 
-This is a Deep Q Learning implementation that uses:
+This is an implementation of the paper
+ [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602)
+ along with [Dueling Network](model.html), [Prioritized Replay](replay_buffer.html)
+ and Double Q Network.
 
-* [Dueling Network](model.html)
-* [Prioritized Replay](replay_buffer.html)
-* Double Q Network
-
-Here's the [experiment](experiment.html) and [model](model.html).
+Here are the [experiment](experiment.html) and [model](model.html) implementations.
 
 \(
     \def\green#1{{\color{yellowgreen}{#1}}}
diff --git a/labml_nn/rl/dqn/experiment.py b/labml_nn/rl/dqn/experiment.py
index bc7078fb..6a4cd36f 100644
--- a/labml_nn/rl/dqn/experiment.py
+++ b/labml_nn/rl/dqn/experiment.py
@@ -1,9 +1,8 @@
 """
-\(
-    \def\hl1#1{{\color{orange}{#1}}}
-    \def\blue#1{{\color{cyan}{#1}}}
-    \def\green#1{{\color{yellowgreen}{#1}}}
-\)
+# DQN Experiment with Atari Breakout
+
+This experiment trains a Deep Q Network (DQN) to play the Atari Breakout game on OpenAI Gym.
+It runs the [game environments on multiple processes](../game.html) to sample efficiently.
 """
 
 import numpy as np
@@ -16,6 +15,7 @@ from labml_nn.rl.dqn.model import Model
 from labml_nn.rl.dqn.replay_buffer import ReplayBuffer
 from labml_nn.rl.game import Worker
 
+# Select device
 if torch.cuda.is_available():
     device = torch.device("cuda:0")
 else:
@@ -29,17 +29,10 @@ def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
 
 
 class Trainer:
     """
-    ## Main class
-    This class runs the training loop.
-    It initializes TensorFlow, handles logging and monitoring,
-    and runs workers as multiple processes.
+    ## Trainer
     """
 
     def __init__(self):
-        """
-        ### Initialize
-        """
-
         # #### Configurations
 
         # number of workers
@@ -54,7 +47,7 @@ class Trainer:
         # size of mini batch for training
         self.mini_batch_size = 32
 
-        # exploration as a function of time step
+        # exploration as a function of updates
         self.exploration_coefficient = Piecewise(
             [
                 (0, 1.0),
@@ -65,20 +58,21 @@ class Trainer:
         # update target network every 250 update
         self.update_target_model = 250
 
-        # $\beta$ for replay buffer as a function of time steps
+        # $\beta$ for replay buffer as a function of updates
        self.prioritized_replay_beta = Piecewise(
             [
                 (0, 0.4),
                 (self.updates, 1)
             ], outside_value=1)
 
-        # replay buffer
+        # replay buffer with $\alpha = 0.6$
         self.replay_buffer = ReplayBuffer(2 ** 14, 0.6)
 
+        # Model for sampling and training
         self.model = Model().to(device)
+        # target model to get $\color{orange}Q(s';\color{orange}{\theta_i^{-}})$
         self.target_model = Model().to(device)
 
-        # last observation for each worker
         # create workers
         self.workers = [Worker(47 + i) for i in range(self.n_workers)]
 
@@ -89,6 +83,7 @@ class Trainer:
         for i, worker in enumerate(self.workers):
             self.obs[i] = worker.child.recv()
 
+        # loss function
         self.loss_func = QFuncLoss(0.99)
         # optimizer
         self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4)
@@ -99,44 +94,48 @@ class Trainer:
         When sampling actions we use a $\epsilon$-greedy strategy, where we take
         a greedy action with probabiliy $1 - \epsilon$ and take a random
         action with probability $\epsilon$.
-        We refer to $\epsilon$ as *exploration*.
+        We refer to $\epsilon$ as `exploration_coefficient`.
         """
 
+        # Sampling doesn't need gradients
         with torch.no_grad():
+            # Sample the action with the highest Q-value. This is the greedy action.
             greedy_action = torch.argmax(q_value, dim=-1)
+            # Uniformly sample an action
             random_action = torch.randint(q_value.shape[-1], greedy_action.shape, device=q_value.device)
-
+            # Whether to choose the greedy action or the random action
             is_choose_rand = torch.rand(greedy_action.shape, device=q_value.device) < exploration_coefficient
-
+            # Pick the action based on `is_choose_rand`
             return torch.where(is_choose_rand, random_action, greedy_action).cpu().numpy()
 
     def sample(self, exploration_coefficient: float):
         """### Sample data"""
 
+        # This doesn't need gradients
         with torch.no_grad():
-            # sample `SAMPLE_STEPS`
+            # Sample `worker_steps`
             for t in range(self.worker_steps):
-                # sample actions
+                # Get Q-values for the current observation
                 q_value = self.model(obs_to_torch(self.obs))
+                # Sample actions
                 actions = self._sample_action(q_value, exploration_coefficient)
 
-                # run sampled actions on each worker
+                # Run sampled actions on each worker
                 for w, worker in enumerate(self.workers):
                     worker.child.send(("step", actions[w]))
 
-                # collect information from each worker
+                # Collect information from each worker
                 for w, worker in enumerate(self.workers):
-                    # get results after executing the actions
+                    # Get results after executing the actions
                     next_obs, reward, done, info = worker.child.recv()
 
-                    # add transition to replay buffer
+                    # Add transition to replay buffer
                     self.replay_buffer.add(self.obs[w], actions[w], reward, next_obs, done)
 
                     # update episode information
                     # collect episode info, which is available if an episode finished;
                     # this includes total reward and length of the episode
-                    # look at `Game` to see how it works.
-                    # We also add a game frame to it for monitoring.
                     if info:
                         tracker.add('reward', info['reward'])
                         tracker.add('length', info['length'])
@@ -145,16 +144,24 @@ class Trainer:
                     self.obs[w] = next_obs
 
     def train(self, beta: float):
+        """
+        ### Train the model
+        """
         for _ in range(self.train_epochs):
-            # sample from priority replay buffer
+            # Sample from priority replay buffer
             samples = self.replay_buffer.sample(self.mini_batch_size, beta)
-            # train network
+            # Get the predicted Q-value
             q_value = self.model(obs_to_torch(samples['obs']))
+            # Get the Q-values of the next state for [Double Q-learning](index.html).
+            # Gradients shouldn't propagate for these
             with torch.no_grad():
+                # Get $\color{cyan}Q(s';\color{cyan}{\theta_i})$
                 double_q_value = self.model(obs_to_torch(samples['next_obs']))
+                # Get $\color{orange}Q(s';\color{orange}{\theta_i^{-}})$
                 target_q_value = self.target_model(obs_to_torch(samples['next_obs']))
 
+            # Compute Temporal Difference (TD) errors, $\delta$, and the loss, $\mathcal{L}(\theta)$.
             td_errors, loss = self.loss_func(q_value,
                                              q_value.new_tensor(samples['action']),
                                              double_q_value, target_q_value,
@@ -162,15 +169,18 @@ class Trainer:
                                              q_value.new_tensor(samples['reward']),
                                              q_value.new_tensor(samples['weights']))
 
-            # $p_i = |\delta_i| + \epsilon$
+            # Calculate priorities for replay buffer $p_i = |\delta_i| + \epsilon$
             new_priorities = np.abs(td_errors.cpu().numpy()) + 1e-6
-            # update replay buffer
+            # Update replay buffer priorities
             self.replay_buffer.update_priorities(samples['indexes'], new_priorities)
 
-            # compute gradients
+            # Zero out the previously calculated gradients
             self.optimizer.zero_grad()
+            # Calculate gradients
             loss.backward()
+            # Clip gradients
             torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
+            # Update parameters based on gradients
             self.optimizer.step()
 
     def run_training_loop(self):
@@ -178,33 +188,36 @@ class Trainer:
         ### Run training loop
         """
 
-        # copy to target network initially
-        self.target_model.load_state_dict(self.model.state_dict())
-
-        # last 100 episode information
+        # Last 100 episode information
         tracker.set_queue('reward', 100, True)
         tracker.set_queue('length', 100, True)
 
+        # Copy to target network initially
+        self.target_model.load_state_dict(self.model.state_dict())
+
         for update in monit.loop(self.updates):
             # $\epsilon$, exploration fraction
             exploration = self.exploration_coefficient(update)
             tracker.add('exploration', exploration)
-            # $\beta$ for priority replay
+            # $\beta$ for prioritized replay
             beta = self.prioritized_replay_beta(update)
             tracker.add('beta', beta)
 
-            # sample with current policy
+            # Sample with current policy
             self.sample(exploration)
 
+            # Start training after the buffer is full
             if self.replay_buffer.is_full():
-                # train the model
+                # Train the model
                 self.train(beta)
 
-                # periodically update target network
+                # Periodically update target network
                 if update % self.update_target_model == 0:
                     self.target_model.load_state_dict(self.model.state_dict())
 
+            # Save tracked indicators.
             tracker.save()
+            # Add a new line to the screen periodically
             if (update + 1) % 1_000 == 0:
                 logger.log()
@@ -217,10 +230,18 @@ class Trainer:
             worker.child.send(("close", None))
 
 
-# ## Run it
-if __name__ == "__main__":
+def main():
+    # Create the experiment
     experiment.create(name='dqn')
+    # Initialize the trainer
     m = Trainer()
+    # Run and monitor the experiment
     with experiment.start():
         m.run_training_loop()
+    # Stop the workers
     m.destroy()
+
+
+# ## Run it
+if __name__ == "__main__":
+    main()
diff --git a/labml_nn/rl/ppo/experiment.py b/labml_nn/rl/ppo/experiment.py
index e8d40b55..2fa60dc3 100644
--- a/labml_nn/rl/ppo/experiment.py
+++ b/labml_nn/rl/ppo/experiment.py
@@ -1,11 +1,11 @@
 """
 # PPO Experiment with Atari Breakout
 
-This experiment runs PPO Atari Breakout game on OpenAI Gym.
-It runs the [game environments on multiple processes](game.html) to sample efficiently.
+This experiment trains a Proximal Policy Optimization (PPO) agent on the Atari Breakout game on OpenAI Gym.
+It runs the [game environments on multiple processes](../game.html) to sample efficiently.
 """
 
-from typing import Dict, List
+from typing import Dict
 
 import numpy as np
 import torch
@@ -15,10 +15,11 @@ from torch.distributions import Categorical
 from labml import monit, tracker, logger, experiment
 from labml_helpers.module import Module
+from labml_nn.rl.game import Worker
 from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
 from labml_nn.rl.ppo.gae import GAE
-from labml_nn.rl.game import Worker
 
+# Select device
 if torch.cuda.is_available():
     device = torch.device("cuda:0")
 else:
@@ -82,6 +83,7 @@ class Trainer:
     """
     ## Trainer
     """
+
     def __init__(self):
         # #### Configurations
@@ -165,7 +167,6 @@ class Trainer:
                     # collect episode info, which is available if an episode finished;
                     # this includes total reward and length of the episode
-                    # look at `Game` to see how it works.
-                    # We also add a game frame to it for monitoring.
                     if info:
                         tracker.add('reward', info['reward'])
                         tracker.add('length', info['length'])
@@ -225,12 +226,16 @@ class Trainer:
                 loss = self._calc_loss(clip_range=clip_range,
                                        samples=mini_batch)
 
-                # compute gradients
+                # Set learning rate
                 for pg in self.optimizer.param_groups:
                     pg['lr'] = learning_rate
+                # Zero out the previously calculated gradients
                 self.optimizer.zero_grad()
+                # Calculate gradients
                 loss.backward()
+                # Clip gradients
                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
+                # Update parameters based on gradients
                 self.optimizer.step()
 
     @staticmethod
@@ -311,8 +316,9 @@ class Trainer:
             # train the model
             self.train(samples, learning_rate, clip_range)
 
-            # write summary info to the writer, and log to the screen
+            # Save tracked indicators.
             tracker.save()
+            # Add a new line to the screen periodically
             if (update + 1) % 1_000 == 0:
                 logger.log()
@@ -325,10 +331,18 @@ class Trainer:
             worker.child.send(("close", None))
 
 
-# ## Run it
-if __name__ == "__main__":
+def main():
+    # Create the experiment
     experiment.create(name='ppo')
+    # Initialize the trainer
     m = Trainer()
+    # Run and monitor the experiment
     with experiment.start():
         m.run_training_loop()
+    # Stop the workers
     m.destroy()
+
+
+# ## Run it
+if __name__ == "__main__":
+    main()
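
Reviewer note: the $\epsilon$-greedy sampling that this patch documents in `_sample_action` boils down to the standalone sketch below. The helper name `epsilon_greedy` and the example shapes are illustrative only; the patch itself keeps this logic inside `Trainer`.

```python
import torch


def epsilon_greedy(q_value: torch.Tensor, exploration_coefficient: float) -> torch.Tensor:
    """Pick one action per row of `q_value` with an epsilon-greedy policy."""
    # Greedy action: the action with the highest Q-value
    greedy_action = torch.argmax(q_value, dim=-1)
    # A uniformly sampled random action
    random_action = torch.randint(q_value.shape[-1], greedy_action.shape, device=q_value.device)
    # Take the random action with probability `exploration_coefficient`
    is_random = torch.rand(greedy_action.shape, device=q_value.device) < exploration_coefficient
    return torch.where(is_random, random_action, greedy_action)


# Example: 4 workers, 4 actions, epsilon = 0.1
actions = epsilon_greedy(torch.randn(4, 4), 0.1)
```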
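
The new comments in `train()` distinguish $Q(s';\theta_i)$ from the online model and $Q(s';\theta_i^{-})$ from the target model. As a reminder of how these produce the TD errors whose magnitudes become the priorities $p_i = |\delta_i| + \epsilon$, here is a simplified sketch. It is a hypothetical stand-in for `QFuncLoss`: it assumes `done` is a 0/1 float tensor and omits the Huber loss and the importance-sampling weights.

```python
import torch


def double_q_td_error(q_value: torch.Tensor,         # Q(s; θ) for the sampled states
                      action: torch.Tensor,          # actions stored in the replay buffer
                      double_q_value: torch.Tensor,  # Q(s'; θ) from the online model
                      target_q_value: torch.Tensor,  # Q(s'; θ⁻) from the target model
                      done: torch.Tensor,
                      reward: torch.Tensor,
                      gamma: float = 0.99) -> torch.Tensor:
    # The online network picks the best next action ...
    best_next_action = torch.argmax(double_q_value, dim=-1, keepdim=True)
    # ... and the target network evaluates it (Double Q-learning)
    next_q = target_q_value.gather(-1, best_next_action).squeeze(-1)
    # Bootstrapped target; terminal transitions do not bootstrap
    target = reward + gamma * next_q * (1.0 - done.float())
    # Q-value of the action that was actually taken
    q_sampled = q_value.gather(-1, action.long().unsqueeze(-1)).squeeze(-1)
    # TD error δ; the priorities are then |δ| plus a small constant
    return target.detach() - q_sampled
```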
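
Both the exploration coefficient and the prioritized-replay $\beta$ are now described as functions of updates built with `Piecewise`. The intended behaviour is, roughly, linear interpolation between `(update, value)` endpoints; the sketch below illustrates that assumption and is not the library's `Piecewise` class. `total_updates` is a placeholder, since the actual value of `self.updates` is outside this diff.

```python
def piecewise_linear(endpoints, outside_value):
    """Linearly interpolate between (step, value) endpoints; use `outside_value` elsewhere."""
    def schedule(step: int) -> float:
        for (x0, y0), (x1, y1) in zip(endpoints, endpoints[1:]):
            if x0 <= step < x1:
                fraction = (step - x0) / (x1 - x0)
                return y0 + fraction * (y1 - y0)
        return outside_value
    return schedule


# Mirrors the beta schedule above: 0.4 at update 0, annealed linearly to 1 over all updates
total_updates = 1_000_000  # placeholder; the trainer uses `self.updates`
beta = piecewise_linear([(0, 0.4), (total_updates, 1.0)], outside_value=1.0)
assert abs(beta(0) - 0.4) < 1e-9 and abs(beta(total_updates // 2) - 0.7) < 1e-9
```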