📚 ppo intro
@@ -10,6 +10,15 @@ summary: >
This is a [PyTorch](https://pytorch.org) implementation of
[Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347).

PPO is a policy gradient method for reinforcement learning.
Simple policy gradient methods do a single gradient update per sample (or a set of samples).
Doing multiple gradient steps for a single sample causes problems
because the policy deviates too much, producing a bad policy.
PPO lets us do multiple gradient updates per sample by trying to keep the
policy close to the policy that was used to sample the data.
It does so by clipping the gradient flow if the updated policy
is not close to the policy used to sample the data.

You can find an experiment that uses it [here](experiment.html).
The experiment uses [Generalized Advantage Estimation](gae.html).
"""
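For reference, the clipped surrogate objective from the PPO paper that implements this idea, written with the probability ratio $r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{OLD}}(a_t \mid s_t)}$ and an advantage estimate $\hat{A}_t$ (notation taken from the paper):

$$\mathcal{L}^{CLIP}(\theta) =
\mathbb{E}_t \Bigl[\min\bigl(r_t(\theta)\,\hat{A}_t,\;
\mathrm{clip}\bigl(r_t(\theta), 1 - \epsilon, 1 + \epsilon\bigr)\,\hat{A}_t\bigr)\Bigr]$$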
@@ -24,6 +33,8 @@ class ClippedPPOLoss(Module):
"""
## PPO Loss

Here's how the PPO update rule is derived.

We want to maximize policy reward
$$\max_\theta J(\pi_\theta) =
 \mathop{\mathbb{E}}_{\tau \sim \pi_\theta}\Biggl[\sum_{t=0}^\infty \gamma^t r_t \Biggr]$$
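As a concrete illustration of the discounted return inside this expectation, here is a minimal standalone sketch; the function name and the use of a plain Python list are illustrative assumptions, not code from the repository:

```python
# Discounted return sum_t gamma^t * r_t for one sampled trajectory.
def discounted_return(rewards, gamma=0.99):
    total = 0.0
    for t, r in enumerate(rewards):
        total += (gamma ** t) * r
    return total

# Example: rewards collected over three time steps
print(discounted_return([1.0, 0.0, 2.0]))  # 1.0 + 0.99**2 * 2.0 = 2.9602
```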
@@ -128,6 +139,8 @@ class ClippedPPOLoss(Module):
# *this is different from rewards* $r_t$.
ratio = torch.exp(log_pi - sampled_log_pi)

# ### Clipping the policy ratio
#
# \begin{align}
# \mathcal{L}^{CLIP}(\theta) =
#  \mathbb{E}_{a_t, s_t \sim \pi_{\theta_{OLD}}} \biggl[
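To make the clipping concrete, here is a minimal self-contained sketch of a clipped surrogate loss in PyTorch. The function name, argument names, and the default clip value of 0.2 are assumptions that mirror the snippet above rather than the module's exact code:

```python
import torch

def clipped_policy_loss(log_pi: torch.Tensor,
                        sampled_log_pi: torch.Tensor,
                        advantage: torch.Tensor,
                        clip: float = 0.2) -> torch.Tensor:
    # Probability ratio r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t)
    ratio = torch.exp(log_pi - sampled_log_pi)
    # Unclipped and clipped surrogate terms
    unclipped = ratio * advantage
    clipped = ratio.clamp(1.0 - clip, 1.0 + clip) * advantage
    # Take the pessimistic minimum, then negate it because optimizers minimize
    return -torch.min(unclipped, clipped).mean()
```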
@@ -167,6 +180,8 @@ class ClippedValueFunctionLoss(Module):
"""
## Clipped Value Function Loss

Similarly, we clip the value function update as well.

\begin{align}
V^{\pi_\theta}_{CLIP}(s_t)
 &= clip\Bigl(V^{\pi_\theta}(s_t) - \hat{V_t}, -\epsilon, +\epsilon\Bigr)
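A minimal standalone sketch of such a clipped value loss in PyTorch, assuming `value` is the current estimate $V^{\pi_\theta}(s_t)$, `sampled_value` is the estimate $\hat{V_t}$ recorded when the data was sampled, and `sampled_return` is the return target; the names and the 0.5 scaling are illustrative assumptions:

```python
import torch

def clipped_value_loss(value: torch.Tensor,
                       sampled_value: torch.Tensor,
                       sampled_return: torch.Tensor,
                       clip: float = 0.2) -> torch.Tensor:
    # Keep the new value prediction within +/- clip of the old prediction
    clipped_value = sampled_value + (value - sampled_value).clamp(-clip, clip)
    # Take the worse (larger) of the clipped and unclipped squared errors
    loss = torch.max((value - sampled_return) ** 2,
                     (clipped_value - sampled_return) ** 2)
    return 0.5 * loss.mean()
```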
labml_nn/rl/ppo/readme.md (new file, 16 lines)
@@ -0,0 +1,16 @@
# [Proximal Policy Optimization (PPO)](https://nn.labml.ai/rl/ppo/index.html)

This is a [PyTorch](https://pytorch.org) implementation of
[Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347).

PPO is a policy gradient method for reinforcement learning.
Simple policy gradient methods do a single gradient update per sample (or a set of samples).
Doing multiple gradient steps for a single sample causes problems
because the policy deviates too much, producing a bad policy.
PPO lets us do multiple gradient updates per sample by trying to keep the
policy close to the policy that was used to sample the data.
It does so by clipping the gradient flow if the updated policy
is not close to the policy used to sample the data.

You can find an experiment that uses it [here](https://nn.labml.ai/rl/ppo/experiment.html).
The experiment uses [Generalized Advantage Estimation](https://nn.labml.ai/rl/ppo/gae.html).
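Since the experiment relies on Generalized Advantage Estimation, here is a minimal self-contained sketch of how GAE advantages are typically computed over a sampled trajectory. The function name, tensor shapes, and the omission of episode-termination handling are simplifying assumptions, not the repository's implementation:

```python
import torch

def gae_advantages(rewards: torch.Tensor,   # r_t, shape [T]
                   values: torch.Tensor,    # V(s_t), shape [T + 1], includes a bootstrap value
                   gamma: float = 0.99,
                   lam: float = 0.95) -> torch.Tensor:
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    last_advantage = torch.tensor(0.0)
    # Work backwards: A_t = delta_t + gamma * lambda * A_{t+1},
    # where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        last_advantage = delta + gamma * lam * last_advantage
        advantages[t] = last_advantage
    return advantages
```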