This commit is contained in:
Varuna Jayasiri
2020-10-24 13:34:26 +05:30
parent b4d35f871e
commit 7cc9f300e9
4 changed files with 672 additions and 1 deletion


@@ -0,0 +1,385 @@
"""
\(
\def\hl1#1{{\color{orange}{#1}}}
\def\blue#1{{\color{cyan}{#1}}}
\def\green#1{{\color{yellowgreen}{#1}}}
\)
"""
import numpy as np
import torch
from torch import nn
from labml import tracker, experiment, logger, monit
from labml_helpers.module import Module
from labml_helpers.schedule import Piecewise
from labml_nn.rl.dqn import QFuncLoss
from labml_nn.rl.dqn.replay_buffer import ReplayBuffer
from labml_nn.rl.game import Worker
if torch.cuda.is_available():
device = torch.device("cuda:0")
else:
device = torch.device("cpu")
class Model(Module):
"""
## <a name="model"></a>Neural Network Model for $Q$ Values
#### Dueling Network ⚔️
We are using a [dueling network](https://arxiv.org/abs/1511.06581)
to calculate Q-values.
The intuition behind the dueling network architecture is that in most states
the action doesn't matter,
while in some states the action is significant. The dueling network allows
this to be represented very well.
\begin{align}
Q^\pi(s,a) &= V^\pi(s) + A^\pi(s, a)
\\
\mathop{\mathbb{E}}_{a \sim \pi(s)}
\Big[
A^\pi(s, a)
\Big]
&= 0
\end{align}
So we create two networks for $V$ and $A$ and get $Q$ from them.
$$
Q(s, a) = V(s) +
\Big(
A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')
\Big)
$$
Subtracting the mean advantage keeps the decomposition identifiable;
otherwise a constant could be shifted between $V$ and $A$ without changing $Q$.
We share the initial layers of the $V$ and $A$ networks.
"""
def __init__(self):
"""
### Initialize
Two instances of this model are created, one for the training network
and one for the target network (see the `Main` class below).
"""
super().__init__()
self.conv = nn.Sequential(
# The first convolution layer takes an
# 84x84 frame and produces a 20x20 frame
nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
nn.ReLU(),
# The second convolution layer takes a
# 20x20 frame and produces a 9x9 frame
nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
nn.ReLU(),
# The third convolution layer takes a
# 9x9 frame and produces a 7x7 frame
nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
nn.ReLU(),
)
# A fully connected layer takes the flattened
# frame from the third convolution layer, and outputs
# 512 features
self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
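# This head gives the state value $V$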
self.state_score = nn.Sequential(
nn.Linear(in_features=512, out_features=256),
nn.ReLU(),
nn.Linear(in_features=256, out_features=1),
)
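# This head gives the advantage $A$ for each of the 4 actions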
self.action_score = nn.Sequential(
nn.Linear(in_features=512, out_features=256),
nn.ReLU(),
nn.Linear(in_features=256, out_features=4),
)
#
self.activation = nn.ReLU()
def __call__(self, obs: torch.Tensor):
h = self.conv(obs)
h = h.reshape((-1, 7 * 7 * 64))
h = self.activation(self.lin(h))
action_score = self.action_score(h)
state_score = self.state_score(h)
# $Q(s, a) =V(s) + \Big(A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')\Big)$
action_score_centered = action_score - action_score.mean(dim=-1, keepdim=True)
q = state_score + action_score_centered
return q
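# A quick sanity check of the model output (a minimal sketch, not part of the
# training code): a batch of two stacked 84x84 observations maps to one $Q$
# value per action.
#
#     model = Model()
#     q = model(torch.zeros(2, 4, 84, 84))
#     assert q.shape == (2, 4)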
def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
"""Scale observations from `[0, 255]` to `[0, 1]`"""
return torch.tensor(obs, dtype=torch.float32, device=device) / 255.
class Main(object):
"""
## <a name="main"></a>Main class
This class runs the training loop.
It initializes the model, the target model and the replay buffer,
handles logging and monitoring, and runs the workers as multiple processes.
"""
def __init__(self):
"""
### Initialize
"""
# #### Configurations
# number of workers
self.n_workers = 8
# steps sampled on each update
self.worker_steps = 4
# number of training iterations per update
self.train_epochs = 8
# number of updates
self.updates = 1_000_000
# size of mini batch for training
self.mini_batch_size = 32
# exploration coefficient $\epsilon$ as a function of the number of updates
self.exploration_coefficient = Piecewise(
[
(0, 1.0),
(25_000, 0.1),
(self.updates / 2, 0.01)
], outside_value=0.01)
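# For example, assuming `Piecewise` interpolates linearly between the given
# points and returns `outside_value` beyond them, $\epsilon$ starts at 1.0,
# decays to 0.1 by update 25,000, then to 0.01 by update 500,000
# (`self.updates / 2`), and stays at 0.01 after that.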
# copy the training network to the target network every 250 updates
self.update_target_model = 250
# $\beta$ for prioritized replay, as a function of the number of updates
self.prioritized_replay_beta = Piecewise(
[
(0, 0.4),
(self.updates, 1)
], outside_value=1)
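# Under the same interpolation assumption, $\beta$ is annealed linearly from
# 0.4 at the first update to 1.0 at the last update, so the importance
# sampling correction becomes exact towards the end of training.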
# prioritized replay buffer with capacity $2^{14}$ and $\alpha = 0.6$
self.replay_buffer = ReplayBuffer(2 ** 14, 0.6)
# model to train
self.model = Model().to(device)
# target model used to compute the target $Q$ values
self.target_model = Model().to(device)
# create workers, each initialized with a different seed
self.workers = [Worker(47 + i) for i in range(self.n_workers)]
# initialize the tensor that stores the last observation from each worker
self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
# reset each worker
for worker in self.workers:
worker.child.send(("reset", None))
# get the initial observation from each worker
for i, worker in enumerate(self.workers):
self.obs[i] = worker.child.recv()
# loss function with a discount factor $\gamma = 0.99$
self.loss_func = QFuncLoss(0.99)
# optimizer
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4)
def _sample_action(self, q_value: torch.Tensor, exploration_coefficient: float):
"""
#### $\epsilon$-greedy Sampling
When sampling actions we use an $\epsilon$-greedy strategy, where we
take a greedy action with probability $1 - \epsilon$ and
take a random action with probability $\epsilon$.
We refer to $\epsilon$ as *exploration*.
"""
with torch.no_grad():
greedy_action = torch.argmax(q_value, dim=-1)
random_action = torch.randint(q_value.shape[-1], greedy_action.shape, device=q_value.device)
is_choose_rand = torch.rand(greedy_action.shape, device=q_value.device) < exploration_coefficient
return torch.where(is_choose_rand, random_action, greedy_action).cpu().numpy()
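# For example, with an exploration coefficient of 0.1, each worker follows the
# greedy action about 90% of the time and picks one of the actions uniformly
# at random otherwise.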
def sample(self, exploration_coefficient: float):
"""### Sample data"""
with torch.no_grad():
# sample `worker_steps` from each worker
for t in range(self.worker_steps):
# sample actions
q_value = self.model(obs_to_torch(self.obs))
actions = self._sample_action(q_value, exploration_coefficient)
# run sampled actions on each worker
for w, worker in enumerate(self.workers):
worker.child.send(("step", actions[w]))
# collect information from each worker
for w, worker in enumerate(self.workers):
# get results after executing the actions
next_obs, reward, done, info = worker.child.recv()
# add transition to replay buffer
self.replay_buffer.add(self.obs[w], actions[w], reward, next_obs, done)
# collect episode info, which is available if an episode finished;
# this includes the total reward and the length of the episode -
# look at `Game` to see how it works.
if info:
tracker.add('reward', info['reward'])
tracker.add('length', info['length'])
# update current observation
self.obs[w] = next_obs
def train(self, beta: float):
"""
### Train the model
We want to find the optimal action-value function.
\begin{align}
Q^*(s,a) &= \max_\pi \mathbb{E} \Big[
r_t + \gamma r_{t + 1} + \gamma^2 r_{t + 2} + ... | s_t = s, a_t = a, \pi
\Big]
\\
Q^*(s,a) &= \mathop{\mathbb{E}}_{s' \sim \large{\varepsilon}} \Big[
r + \gamma \max_{a'} Q^* (s', a') | s, a
\Big]
\end{align}
#### Target network 🎯
In order to improve stability we use an experience replay buffer that randomly samples
from previous experience $U(D)$. We also use a Q network
with a separate set of parameters $\hl1{\theta_i^{-}}$ to calculate the target.
$\hl1{\theta_i^{-}}$ is updated periodically.
This is according to the [paper by DeepMind](https://deepmind.com/research/dqn/).
So the loss function is,
$$
\mathcal{L}_i(\theta_i) = \mathop{\mathbb{E}}_{(s,a,r,s') \sim U(D)}
\bigg[
\Big(
r + \gamma \max_{a'} Q(s', a'; \hl1{\theta_i^{-}}) - Q(s,a;\theta_i)
\Big) ^ 2
\bigg]
$$
#### Double $Q$-Learning
The max operator in the above calculation uses the same network both for
selecting the best action and for evaluating its value.
That is,
$$
\max_{a'} Q(s', a'; \theta) = \blue{Q}
\Big(
s', \mathop{\operatorname{argmax}}_{a'}
\blue{Q}(s', a'; \blue{\theta}); \blue{\theta}
\Big)
$$
We use [double Q-learning](https://arxiv.org/abs/1509.06461), where
the $\operatorname{argmax}$ is taken from $\theta_i$ and
the value is taken from $\theta_i^{-}$.
And the loss function becomes,
\begin{align}
\mathcal{L}_i(\theta_i) = \mathop{\mathbb{E}}_{(s,a,r,s') \sim U(D)}
\Bigg[
\bigg(
&r + \gamma \blue{Q}
\Big(
s',
\mathop{\operatorname{argmax}}_{a'}
\green{Q}(s', a'; \green{\theta_i}); \blue{\theta_i^{-}}
\Big)
\\
- &Q(s,a;\theta_i)
\bigg) ^ 2
\Bigg]
\end{align}
"""
for _ in range(self.train_epochs):
# sample from the prioritized replay buffer
samples = self.replay_buffer.sample(self.mini_batch_size, beta)
# train the network
q_value = self.model(obs_to_torch(samples['obs']))
with torch.no_grad():
# $Q$ values of the next state from the training network, used to pick $\operatorname{argmax}_{a'}$
double_q_value = self.model(obs_to_torch(samples['next_obs']))
# $Q$ values of the next state from the target network, used to evaluate the chosen action
target_q_value = self.target_model(obs_to_torch(samples['next_obs']))
td_errors, loss = self.loss_func(q_value,
q_value.new_tensor(samples['action']),
double_q_value, target_q_value,
q_value.new_tensor(samples['done']),
q_value.new_tensor(samples['reward']),
q_value.new_tensor(samples['weights']))
# $p_i = |\delta_i| + \epsilon$
new_priorities = np.abs(td_errors.cpu().numpy()) + 1e-6
# update replay buffer
self.replay_buffer.update_priorities(samples['indexes'], new_priorities)
# zero out previously computed gradients
self.optimizer.zero_grad()
# compute gradients
loss.backward()
# clip gradients to stabilize training
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
# update network parameters
self.optimizer.step()
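# As a rough sketch (the actual implementation is `QFuncLoss` in
# `labml_nn.rl.dqn`), the double DQN target used above is expected to be
# computed along these lines, where `done` is a float tensor of 0s and 1s:
#
#     def double_dqn_target(reward, done, double_q, target_q, gamma=0.99):
#         # pick the best next action with the training network
#         best_next_action = double_q.argmax(dim=-1, keepdim=True)
#         # evaluate that action with the target network
#         best_next_q = target_q.gather(-1, best_next_action).squeeze(-1)
#         # do not bootstrap past terminal transitions
#         return reward + gamma * best_next_q * (1 - done)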
def run_training_loop(self):
"""
### Run training loop
"""
# copy to target network initially
self.target_model.load_state_dict(self.model.state_dict())
# keep the reward and length of the last 100 episodes for monitoring
tracker.set_queue('reward', 100, True)
tracker.set_queue('length', 100, True)
for update in monit.loop(self.updates):
# $\epsilon$, exploration fraction
exploration = self.exploration_coefficient(update)
tracker.add('exploration', exploration)
# $\beta$ for priority replay
beta = self.prioritized_replay_beta(update)
tracker.add('beta', beta)
# sample with current policy
self.sample(exploration)
# train only once the replay buffer is full
if self.replay_buffer.is_full():
# train the model
self.train(beta)
# periodically update target network
if update % self.update_target_model == 0:
self.target_model.load_state_dict(self.model.state_dict())
tracker.save()
if (update + 1) % 1_000 == 0:
logger.log()
def destroy(self):
"""
### Destroy
Stop the workers
"""
for worker in self.workers:
worker.child.send(("close", None))
# ## Run it
if __name__ == "__main__":
experiment.create(name='dqn')
m = Main()
with experiment.start():
m.run_training_loop()
m.destroy()