Mirror of https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
Synced 2025-08-26 16:50:39 +08:00

Commit: papers list
@@ -68,7 +68,7 @@
 </div>
 <h1>Capsule Networks</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation/tutorial of
-<a href="https://arxiv.org/abs/1710.09829">Dynamic Routing Between Capsules</a>.</p>
+<a href="https://papers.labml.ai/paper/1710.09829">Dynamic Routing Between Capsules</a>.</p>
 <p>Capsule network is a neural network architecture that embeds features
 as capsules and routes them with a voting mechanism to next layer of capsules.</p>
 <p>Unlike in other implementations of models, we’ve included a sample, because
@@ -69,7 +69,7 @@
 <h1>Classify MNIST digits with Capsule Networks</h1>
 <p>This is an annotated PyTorch code to classify MNIST digits with PyTorch.</p>
 <p>This paper implements the experiment described in paper
-<a href="https://arxiv.org/abs/1710.09829">Dynamic Routing Between Capsules</a>.</p>
+<a href="https://papers.labml.ai/paper/1710.09829">Dynamic Routing Between Capsules</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">14</span><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span>
@@ -68,7 +68,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/capsule_networks/index.html">Capsule Networks</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation/tutorial of
-<a href="https://arxiv.org/abs/1710.09829">Dynamic Routing Between Capsules</a>.</p>
+<a href="https://papers.labml.ai/paper/1710.09829">Dynamic Routing Between Capsules</a>.</p>
 <p>Capsule network is a neural network architecture that embeds features
 as capsules and routes them with a voting mechanism to next layer of capsules.</p>
 <p>Unlike in other implementations of models, we’ve included a sample, because
@@ -69,7 +69,7 @@
 </div>
 <h1>Cycle GAN</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation/tutorial of the paper
-<a href="https://arxiv.org/abs/1703.10593">Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1703.10593">Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks</a>.</p>
 <p>I’ve taken pieces of code from <a href="https://github.com/eriklindernoren/PyTorch-GAN">eriklindernoren/PyTorch-GAN</a>.
 It is a very good resource if you want to checkout other GAN variations too.</p>
 <p>Cycle GAN does image-to-image translation.
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/gan/cycle_gan/index.html">Cycle GAN</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation/tutorial of the paper
-<a href="https://arxiv.org/abs/1703.10593">Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1703.10593">Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks</a>.</p>
 </div>
 <div class='code'>
@@ -69,7 +69,7 @@
 </div>
 <h1>Deep Convolutional Generative Adversarial Networks (DCGAN)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of paper
-<a href="https://arxiv.org/abs/1511.06434">Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1511.06434">Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks</a>.</p>
 <p>This implementation is based on the <a href="https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html">PyTorch DCGAN Tutorial</a>.</p>
 </div>
 <div class='code'>
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/gan/dcgan/index.html">Deep Convolutional Generative Adversarial Networks - DCGAN</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of paper
-<a href="https://arxiv.org/abs/1511.06434">Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1511.06434">Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks</a>.</p>
 </div>
 <div class='code'>
@@ -69,7 +69,7 @@
 </div>
 <h1>Generative Adversarial Networks (GAN)</h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1406.2661">Generative Adversarial Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1406.2661">Generative Adversarial Networks</a>.</p>
 <p>The generator, $G(\pmb{z}; \theta_g)$ generates samples that match the
 distribution of data, while the discriminator, $D(\pmb{x}; \theta_g)$
 gives the probability that $\pmb{x}$ came from data rather than $G$.</p>
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/gan/original/index.html">Generative Adversarial Networks - GAN</a></h1>
 <p>This is an annotated implementation of
-<a href="https://arxiv.org/abs/1406.2661">Generative Adversarial Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1406.2661">Generative Adversarial Networks</a>.</p>
 </div>
 <div class='code'>
@@ -69,12 +69,12 @@
 </div>
 <h1>StyleGAN 2</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1912.04958">Analyzing and Improving the Image Quality of StyleGAN</a>
+<a href="https://papers.labml.ai/paper/1912.04958">Analyzing and Improving the Image Quality of StyleGAN</a>
 which introduces <strong>StyleGAN 2</strong>.
 StyleGAN 2 is an improvement over <strong>StyleGAN</strong> from the paper
-<a href="https://arxiv.org/abs/1812.04948">A Style-Based Generator Architecture for Generative Adversarial Networks</a>.
+<a href="https://papers.labml.ai/paper/1812.04948">A Style-Based Generator Architecture for Generative Adversarial Networks</a>.
 And StyleGAN is based on <strong>Progressive GAN</strong> from the paper
-<a href="https://arxiv.org/abs/1710.10196">Progressive Growing of GANs for Improved Quality, Stability, and Variation</a>.
+<a href="https://papers.labml.ai/paper/1710.10196">Progressive Growing of GANs for Improved Quality, Stability, and Variation</a>.
 All three papers are from the same authors from <a href="https://twitter.com/NVIDIAAI">NVIDIA AI</a>.</p>
 <p><em>Our implementation is a minimalistic StyleGAN 2 model training code.
 Only single GPU training is supported to keep the implementation simple.
@@ -1695,7 +1695,7 @@ since we want to calculate the standard deviation for each feature.</p>
 <p>The down-sample operation <a href="#smooth">smoothens</a> each feature channel and
 scale $2 \times$ using bilinear interpolation.
 This is based on the paper
-<a href="https://arxiv.org/abs/1904.11486">Making Convolutional Networks Shift-Invariant Again</a>.</p>
+<a href="https://papers.labml.ai/paper/1904.11486">Making Convolutional Networks Shift-Invariant Again</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">645</span><span class="k">class</span> <span class="nc">DownSample</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
@@ -1766,7 +1766,7 @@ This is based on the paper
 <h3>Up-sample</h3>
 <p>The up-sample operation scales the image up by $2 \times$ and <a href="#smooth">smoothens</a> each feature channel.
 This is based on the paper
-<a href="https://arxiv.org/abs/1904.11486">Making Convolutional Networks Shift-Invariant Again</a>.</p>
+<a href="https://papers.labml.ai/paper/1904.11486">Making Convolutional Networks Shift-Invariant Again</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">668</span><span class="k">class</span> <span class="nc">UpSample</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
@@ -2265,7 +2265,7 @@ Without equalized learning rate, the effective weights will get updated proporti
 <p><a id="gradient_penalty"></a></p>
 <h2>Gradient Penalty</h2>
 <p>This is the $R_1$ regularization penality from the paper
-<a href="https://arxiv.org/abs/1801.04406">Which Training Methods for GANs do actually Converge?</a>.</p>
+<a href="https://papers.labml.ai/paper/1801.04406">Which Training Methods for GANs do actually Converge?</a>.</p>
 <p>
 <script type="math/tex; mode=display">R_1(\psi) = \frac{\gamma}{2} \mathbb{E}_{p_\mathcal{D}(x)}
 \Big[\Vert \nabla_x D_\psi(x)^2 \Vert\Big]</script>
@@ -69,12 +69,12 @@
 </div>
 <h1><a href="https://nn.labml.ai/gan/stylegan/index.html">StyleGAN 2</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1912.04958">Analyzing and Improving the Image Quality of StyleGAN</a>
+<a href="https://papers.labml.ai/paper/1912.04958">Analyzing and Improving the Image Quality of StyleGAN</a>
 which introduces <strong>StyleGAN2</strong>.
 StyleGAN 2 is an improvement over <strong>StyleGAN</strong> from the paper
-<a href="https://arxiv.org/abs/1812.04948">A Style-Based Generator Architecture for Generative Adversarial Networks</a>.
+<a href="https://papers.labml.ai/paper/1812.04948">A Style-Based Generator Architecture for Generative Adversarial Networks</a>.
 And StyleGAN is based on <strong>Progressive GAN</strong> from the paper
-<a href="https://arxiv.org/abs/1710.10196">Progressive Growing of GANs for Improved Quality, Stability, and Variation</a>.
+<a href="https://papers.labml.ai/paper/1710.10196">Progressive Growing of GANs for Improved Quality, Stability, and Variation</a>.
 All three papers are from the same authors from <a href="https://twitter.com/NVIDIAAI">NVIDIA AI</a>.</p>
 </div>
 <div class='code'>
@@ -73,7 +73,7 @@
 </div>
 <h1>Gradient Penalty for Wasserstein GAN (WGAN-GP)</h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1704.00028">Improved Training of Wasserstein GANs</a>.</p>
+<a href="https://papers.labml.ai/paper/1704.00028">Improved Training of Wasserstein GANs</a>.</p>
 <p><a href="../index.html">WGAN</a> suggests clipping weights to enforce Lipschitz constraint
 on the discriminator network (critic).
 This and other weight constraints like L2 norm clipping, weight normalization,
@@ -82,7 +82,7 @@ L1, L2 weight decay have problems:</p>
 <li>Limiting the capacity of the discriminator</li>
 <li>Exploding and vanishing gradients (without <a href="../../../normalization/batch_norm/index.html">Batch Normalization</a>).</li>
 </ol>
-<p>The paper <a href="https://arxiv.org/abs/1704.00028">Improved Training of Wasserstein GANs</a>
+<p>The paper <a href="https://papers.labml.ai/paper/1704.00028">Improved Training of Wasserstein GANs</a>
 proposal a better way to improve Lipschitz constraint, a gradient penalty.</p>
 <p>
 <script type="math/tex; mode=display">\mathcal{L}_{GP} = \lambda \underset{\hat{x} \sim \mathbb{P}_{\hat{x}}}{\mathbb{E}}
@@ -70,7 +70,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html">Gradient Penalty for Wasserstein GAN (WGAN-GP)</a></h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1704.00028">Improved Training of Wasserstein GANs</a>.</p>
+<a href="https://papers.labml.ai/paper/1704.00028">Improved Training of Wasserstein GANs</a>.</p>
 <p><a href="https://nn.labml.ai/gan/wasserstein/index.html">WGAN</a> suggests
 clipping weights to enforce Lipschitz constraint
 on the discriminator network (critic).
@@ -80,7 +80,7 @@ L1, L2 weight decay have problems:</p>
 <li>Limiting the capacity of the discriminator</li>
 <li>Exploding and vanishing gradients (without <a href="https://nn.labml.ai/normalization/batch_norm/index.html">Batch Normalization</a>).</li>
 </ol>
-<p>The paper <a href="https://arxiv.org/abs/1704.00028">Improved Training of Wasserstein GANs</a>
+<p>The paper <a href="https://papers.labml.ai/paper/1704.00028">Improved Training of Wasserstein GANs</a>
 proposal a better way to improve Lipschitz constraint, a gradient penalty.</p>
 </div>
 <div class='code'>
@@ -69,7 +69,7 @@
 </div>
 <h1>Wasserstein GAN (WGAN)</h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1701.07875">Wasserstein GAN</a>.</p>
+<a href="https://papers.labml.ai/paper/1701.07875">Wasserstein GAN</a>.</p>
 <p>The original GAN loss is based on Jensen-Shannon (JS) divergence
 between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$.
 The Wasserstein GAN is based on Earth Mover distance between these distributions.</p>
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/gan/wasserstein/index.html">Wasserstein GAN - WGAN</a></h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1701.07875">Wasserstein GAN</a>.</p>
+<a href="https://papers.labml.ai/paper/1701.07875">Wasserstein GAN</a>.</p>
 </div>
 <div class='code'>
@@ -69,7 +69,7 @@
 </div>
 <h1>Graph Attention Networks (GAT)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1710.10903">Graph Attention Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1710.10903">Graph Attention Networks</a>.</p>
 <p>GATs work on graph data.
 A graph consists of nodes and edges connecting nodes.
 For example, in Cora dataset the nodes are research papers and the edges are citations that
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/graphs/gat/index.html">Graph Attention Networks (GAT)</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1710.10903">Graph Attention Networks</a>.</p>
+<a href="https://papers.labml.ai/paper/1710.10903">Graph Attention Networks</a>.</p>
 <p>GATs work on graph data.
 A graph consists of nodes and edges connecting nodes.
 For example, in Cora dataset the nodes are research papers and the edges are citations that
@@ -69,7 +69,7 @@
 </div>
 <h1>Graph Attention Networks v2 (GATv2)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the GATv2 operator from the paper
-<a href="https://arxiv.org/abs/2105.14491">How Attentive are Graph Attention Networks?</a>.</p>
+<a href="https://papers.labml.ai/paper/2105.14491">How Attentive are Graph Attention Networks?</a>.</p>
 <p>GATv2s work on graph data similar to <a href="../gat/index.html">GAT</a>.
 A graph consists of nodes and edges connecting nodes.
 For example, in Cora dataset the nodes are research papers and the edges are citations that
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/graphs/gatv2/index.html">Graph Attention Networks v2 (GATv2)</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the GATv2 operator from the paper
-<a href="https://arxiv.org/abs/2105.14491">How Attentive are Graph Attention Networks?</a>.</p>
+<a href="https://papers.labml.ai/paper/2105.14491">How Attentive are Graph Attention Networks?</a>.</p>
 <p>GATv2s work on graph data.
 A graph consists of nodes and edges connecting nodes.
 For example, in Cora dataset the nodes are research papers and the edges are citations that
@@ -68,7 +68,7 @@
 </div>
 <h1>HyperNetworks - HyperLSTM</h1>
 <p>We have implemented HyperLSTM introduced in paper
-<a href="https://arxiv.org/abs/1609.09106">HyperNetworks</a>, with annotations
+<a href="https://papers.labml.ai/paper/1609.09106">HyperNetworks</a>, with annotations
 using <a href="https://pytorch.org">PyTorch</a>.
 <a href="https://blog.otoro.net/2016/09/28/hyper-networks/">This blog post</a>
 by David Ha gives a good explanation of HyperNetworks.</p>
@@ -69,7 +69,7 @@
 </div>
 <h1>Batch-Channel Normalization</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of Batch-Channel Normalization from the paper
-<a href="https://arxiv.org/abs/1903.10520">Micro-Batch Training with Batch-Channel Normalization and Weight Standardization</a>.
+<a href="https://papers.labml.ai/paper/1903.10520">Micro-Batch Training with Batch-Channel Normalization and Weight Standardization</a>.
 We also have an <a href="../weight_standardization/index.html">annotated implementation of Weight Standardization</a>.</p>
 <p>Batch-Channel Normalization performs batch normalization followed
 by a channel normalization (similar to a <a href="../group_norm/index.html">Group Normalization</a>.
@@ -69,7 +69,7 @@
 </div>
 <h1>Batch Normalization</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of Batch Normalization from paper
-<a href="https://arxiv.org/abs/1502.03167">Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift</a>.</p>
+<a href="https://papers.labml.ai/paper/1502.03167">Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift</a>.</p>
 <h3>Internal Covariate Shift</h3>
 <p>The paper defines <em>Internal Covariate Shift</em> as the change in the
 distribution of network activations due to the change in
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/normalization/batch_norm/index.html">Batch Normalization</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of Batch Normalization from paper
-<a href="https://arxiv.org/abs/1502.03167">Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift</a>.</p>
+<a href="https://papers.labml.ai/paper/1502.03167">Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift</a>.</p>
 <h3>Internal Covariate Shift</h3>
 <p>The paper defines <em>Internal Covariate Shift</em> as the change in the
 distribution of network activations due to the change in
@@ -69,7 +69,7 @@
 </div>
 <h1>Group Normalization</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-the <a href="https://arxiv.org/abs/1803.08494">Group Normalization</a> paper.</p>
+the <a href="https://papers.labml.ai/paper/1803.08494">Group Normalization</a> paper.</p>
 <p><a href="../batch_norm/index.html">Batch Normalization</a> works well for large enough batch sizes
 but not well for small batch sizes, because it normalizes over the batch.
 Training large models with large batch sizes is not possible due to the memory capacity of the
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/normalization/group_norm/index.html">Group Normalization</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-the <a href="https://arxiv.org/abs/1803.08494">Group Normalization</a> paper.</p>
+the <a href="https://papers.labml.ai/paper/1803.08494">Group Normalization</a> paper.</p>
 <p><a href="https://nn.labml.ai/normalization/batch_norm/index.html">Batch Normalization</a> works well for large enough batch sizes
 but not well for small batch sizes, because it normalizes over the batch.
 Training large models with large batch sizes is not possible due to the memory capacity of the
@@ -69,7 +69,7 @@
 </div>
 <h1>Instance Normalization</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-<a href="https://arxiv.org/abs/1607.08022">Instance Normalization: The Missing Ingredient for Fast Stylization</a>.</p>
+<a href="https://papers.labml.ai/paper/1607.08022">Instance Normalization: The Missing Ingredient for Fast Stylization</a>.</p>
 <p>Instance normalization was introduced to improve <a href="https://paperswithcode.com/task/style-transfer">style transfer</a>.
 It is based on the observation that stylization should not depend on the contrast of the content image.
 The “contrast normalization” is</p>
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/normalization/instance_norm/index.html">Instance Normalization</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-<a href="https://arxiv.org/abs/1607.08022">Instance Normalization: The Missing Ingredient for Fast Stylization</a>.</p>
+<a href="https://papers.labml.ai/paper/1607.08022">Instance Normalization: The Missing Ingredient for Fast Stylization</a>.</p>
 <p>Instance normalization was introduced to improve <a href="https://paperswithcode.com/task/style-transfer">style transfer</a>.
 It is based on the observation that stylization should not depend on the contrast of the content image.
 Since it’s hard for a convolutional network to learn “contrast normalization”, this paper
@@ -69,7 +69,7 @@
 </div>
 <h1>Layer Normalization</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-<a href="https://arxiv.org/abs/1607.06450">Layer Normalization</a>.</p>
+<a href="https://papers.labml.ai/paper/1607.06450">Layer Normalization</a>.</p>
 <h3>Limitations of <a href="../batch_norm/index.html">Batch Normalization</a></h3>
 <ul>
 <li>You need to maintain running means.</li>
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/normalization/layer_norm/index.html">Layer Normalization</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-<a href="https://arxiv.org/abs/1607.06450">Layer Normalization</a>.</p>
+<a href="https://papers.labml.ai/paper/1607.06450">Layer Normalization</a>.</p>
 <h3>Limitations of <a href="https://nn.labml.ai/normalization/batch_norm/index.html">Batch Normalization</a></h3>
 <ul>
 <li>You need to maintain running means.</li>
@@ -69,7 +69,7 @@
 </div>
 <h1>Weight Standardization</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of Weight Standardization from the paper
-<a href="https://arxiv.org/abs/1903.10520">Micro-Batch Training with Batch-Channel Normalization and Weight Standardization</a>.
+<a href="https://papers.labml.ai/paper/1903.10520">Micro-Batch Training with Batch-Channel Normalization and Weight Standardization</a>.
 We also have an <a href="../batch_channel_norm/index.html">annotated implementation of Batch-Channel Normalization</a>.</p>
 <p>Batch normalization <strong>gives a smooth loss landscape</strong> and
 <strong>avoids elimination singularities</strong>.
@@ -91,7 +91,7 @@ where $f: A \rightarrow \mathbb{R}^m, A \in \mathbb{R}^n$.</p>
 inputs. So as long as the inputs are normally distributed the outputs remain close to normal.
 This avoids outputs of nodes from always falling beyond the active range of the activation function
 (e.g. always negative input for a ReLU).</p>
-<p><em><a href="https://arxiv.org/abs/1903.10520">Refer to the paper for proofs</a></em>.</p>
+<p><em><a href="https://papers.labml.ai/paper/1903.10520">Refer to the paper for proofs</a></em>.</p>
 <p>Here is <a href="experiment.html">the training code</a> for training
 a VGG network that uses weight standardization to classify CIFAR-10 data.
 This uses a <a href="conv2d.html">2D-Convolution Layer with Weight Standardization</a>.</p>
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/normalization/weight_standardization/index.html">Weight Standardization</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of Weight Standardization from the paper
-<a href="https://arxiv.org/abs/1903.10520">Micro-Batch Training with Batch-Channel Normalization and Weight Standardization</a>.
+<a href="https://papers.labml.ai/paper/1903.10520">Micro-Batch Training with Batch-Channel Normalization and Weight Standardization</a>.
 We also have an
 <a href="https://nn.labml.ai/normalization/batch_channel_norm/index.html">annotated implementation of Batch-Channel Normalization</a>.</p>
 </div>
@@ -70,7 +70,7 @@
 <p>This is based from AdaBelief
 <a href="https://github.com/juntang-zhuang/Adabelief-Optimizer">official implementation</a>
 of the paper
-<a href="https://arxiv.org/abs/2010.07468">AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients</a>.</p>
+<a href="https://papers.labml.ai/paper/2010.07468">AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients</a>.</p>
 <p>This is implemented in <a href="https://pytorch.org">PyTorch</a> as an extension to <a href="radam.html">RAdam</a>.</p>
 <p>The main difference between Adam optimizer and AdaBelief is that,
 how it calculates the adaptive learning rate;
@@ -68,7 +68,7 @@
 </div>
 <h1>Adam Optimizer</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of popular optimizer <em>Adam</em> from paper
-<a href="https://arxiv.org/abs/1412.6980v9">Adam: A Method for Stochastic Optimization</a>.</p>
+<a href="https://papers.labml.ai/paper/1412.6980v9">Adam: A Method for Stochastic Optimization</a>.</p>
 <p><em>Adam</em> update is,</p>
 <p>
 <script type="math/tex; mode=display">\begin{align}
@@ -68,7 +68,7 @@
 </div>
 <h1>AMSGrad</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1904.09237">On the Convergence of Adam and Beyond</a>.</p>
+<a href="https://papers.labml.ai/paper/1904.09237">On the Convergence of Adam and Beyond</a>.</p>
 <p>We implement this as an extension to our <a href="adam.html">Adam optimizer implementation</a>.
 The implementation it self is really small since it’s very similar to Adam.</p>
 <p>We also have an implementation of the synthetic example described in the paper where Adam fails to converge.</p>
@@ -68,7 +68,7 @@
 </div>
 <h1>Noam Optimizer</h1>
 <p>This is the <a href="https://pytorch.org">PyTorch</a> implementation of optimizer introduced in the paper
-<a href="https://arxiv.org/abs/1706.03762">Attention Is All You Need</a>.</p>
+<a href="https://papers.labml.ai/paper/1706.03762">Attention Is All You Need</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">14</span><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span>
@@ -70,7 +70,7 @@
 <p>This implementation is based on
 <a href="https://github.com/LiyuanLucasLiu/RAdam">the official implementation</a>
 of the paper
-<a href="https://arxiv.org/abs/1908.03265">On the Variance of the Adaptive Learning Rate and Beyond</a>.</p>
+<a href="https://papers.labml.ai/paper/1908.03265">On the Variance of the Adaptive Learning Rate and Beyond</a>.</p>
 <p>We have implemented it in <a href="https://pytorch.org">PyTorch</a>
 as an extension to <a href="amsgrad.html">our AMSGrad implementation</a>
 thus requiring only the modifications to be implemented.</p>
docs/papers.json (new file, 137 lines)
@@ -0,0 +1,137 @@
+{
+"1701.07875": [
+"https://nn.labml.ai/gan/wasserstein/index.html"
+],
+"1704.00028": [
+"https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html"
+],
+"1406.2661": [
+"https://nn.labml.ai/gan/original/index.html"
+],
+"1511.06434": [
+"https://nn.labml.ai/gan/dcgan/index.html"
+],
+"1912.04958": [
+"https://nn.labml.ai/gan/stylegan/index.html"
+],
+"1703.10593": [
+"https://nn.labml.ai/gan/cycle_gan/index.html"
+],
+"1609.09106": [
+"https://nn.labml.ai/hypernetworks/hyper_lstm.html"
+],
+"1903.10520": [
+"https://nn.labml.ai/normalization/weight_standardization/index.html",
+"https://nn.labml.ai/normalization/batch_channel_norm/index.html"
+],
+"1607.08022": [
+"https://nn.labml.ai/normalization/instance_norm/index.html"
+],
+"1607.06450": [
+"https://nn.labml.ai/normalization/layer_norm/index.html"
+],
+"1803.08494": [
+"https://nn.labml.ai/normalization/group_norm/index.html"
+],
+"1502.03167": [
+"https://nn.labml.ai/normalization/batch_norm/index.html"
+],
+"1512.03385": [
+"https://nn.labml.ai/resnet/index.html"
+],
+"1503.02531": [
+"https://nn.labml.ai/distillation/index.html"
+],
+"2010.07468": [
+"https://nn.labml.ai/optimizers/ada_belief.html"
+],
+"1908.03265": [
+"https://nn.labml.ai/optimizers/radam.html"
+],
+"1904.09237": [
+"https://nn.labml.ai/optimizers/amsgrad.html"
+],
+"2105.08050": [
+"https://nn.labml.ai/transformers/gmlp/index.html"
+],
+"1911.00172": [
+"https://nn.labml.ai/transformers/knn/index.html"
+],
+"2002.05202": [
+"https://nn.labml.ai/transformers/feed_forward.html"
+],
+"2102.11174": [
+"https://nn.labml.ai/transformers/fast_weights/index.html"
+],
+"2002.09402": [
+"https://nn.labml.ai/transformers/feedback/index.html"
+],
+"2105.01601": [
+"https://nn.labml.ai/transformers/mlp_mixer/index.html"
+],
+"2010.11929": [
+"https://nn.labml.ai/transformers/vit/index.html"
+],
+"2101.03961": [
+"https://nn.labml.ai/transformers/switch/index.html"
+],
+"1810.04805": [
+"https://nn.labml.ai/transformers/mlm/index.html"
+],
+"2105.14103": [
+"https://nn.labml.ai/transformers/aft/index.html"
+],
+"1706.03762": [
+"https://nn.labml.ai/transformers/mha.html"
+],
+"1911.05507": [
+"https://nn.labml.ai/transformers/compressive/index.html"
+],
+"2105.03824": [
+"https://nn.labml.ai/transformers/fnet/index.html"
+],
+"1901.02860": [
+"https://nn.labml.ai/transformers/xl/index.html",
+"https://nn.labml.ai/transformers/xl/relative_mha.html"
+],
+"1710.09829": [
+"https://nn.labml.ai/capsule_networks/index.html",
+"https://nn.labml.ai/capsule_networks/mnist.html"
+],
+"1607.03474": [
+"https://nn.labml.ai/recurrent_highway_networks/index.html"
+],
+"1710.10903": [
+"https://nn.labml.ai/graphs/gat/index.html"
+],
+"2105.14491": [
+"https://nn.labml.ai/graphs/gatv2/index.html"
+],
+"1603.08983": [
+"https://nn.labml.ai/adaptive_computation/parity.html"
+],
+"2107.05407": [
+"https://nn.labml.ai/adaptive_computation/ponder_net/index.html"
+],
+"1704.03477": [
+"https://nn.labml.ai/sketch_rnn/index.html"
+],
+"1312.5602": [
+"https://nn.labml.ai/rl/dqn/index.html"
+],
+"1509.06461": [
+"https://nn.labml.ai/rl/dqn/index.html"
+],
+"1511.06581": [
+"https://nn.labml.ai/rl/dqn/model.html"
+],
+"1511.05952": [
+"https://nn.labml.ai/rl/dqn/replay_buffer.html"
+],
+"1707.06347": [
+"https://nn.labml.ai/rl/ppo/index.html"
+],
+"1506.02438": [
+"https://nn.labml.ai/rl/ppo/gae.html"
+]
+}
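The new docs/papers.json file above is an index from arXiv paper identifiers to the annotated implementation pages that cover them. As a rough illustration of how such an index could be consumed (a minimal sketch, not part of the commit; it only assumes the file path and structure shown in the diff above):

import json

# Load the paper-to-implementation index added in this commit.
with open("docs/papers.json") as f:
    papers = json.load(f)

# Look up the annotated implementations for "Dynamic Routing Between Capsules" (1710.09829).
for url in papers.get("1710.09829", []):
    print(url)
# Prints, per the file contents above:
#   https://nn.labml.ai/capsule_networks/index.html
#   https://nn.labml.ai/capsule_networks/mnist.html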
@@ -67,7 +67,7 @@
 <a href='#section-0'>#</a>
 </div>
 <h1>Recurrent Highway Networks</h1>
-<p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of <a href="https://arxiv.org/abs/1607.03474">Recurrent Highway Networks</a>.</p>
+<p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of <a href="https://papers.labml.ai/paper/1607.03474">Recurrent Highway Networks</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">11</span><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span>
@@ -68,7 +68,7 @@
 </div>
 <h1>Deep Residual Learning for Image Recognition (ResNet)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1512.03385">Deep Residual Learning for Image Recognition</a>.</p>
+<a href="https://papers.labml.ai/paper/1512.03385">Deep Residual Learning for Image Recognition</a>.</p>
 <p>ResNets train layers as residual functions to overcome the
 <em>degradation problem</em>.
 The degradation problem is the accuracy of deep neural networks degrading when
@@ -68,7 +68,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/resnet/index.html">Deep Residual Learning for Image Recognition (ResNet)</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1512.03385">Deep Residual Learning for Image Recognition</a>.</p>
+<a href="https://papers.labml.ai/paper/1512.03385">Deep Residual Learning for Image Recognition</a>.</p>
 <p>ResNets train layers as residual functions to overcome the
 <em>degradation problem</em>.
 The degradation problem is the accuracy of deep neural networks degrading when
@@ -69,7 +69,7 @@
 </div>
 <h1>Deep Q Networks (DQN)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of paper
-<a href="https://arxiv.org/abs/1312.5602">Playing Atari with Deep Reinforcement Learning</a>
+<a href="https://papers.labml.ai/paper/1312.5602">Playing Atari with Deep Reinforcement Learning</a>
 along with <a href="model.html">Dueling Network</a>, <a href="replay_buffer.html">Prioritized Replay</a>
 and Double Q Network.</p>
 <p>Here is the <a href="experiment.html">experiment</a> and <a href="model.html">model</a> implementation.</p>
@@ -136,7 +136,7 @@ That is,
 \color{cyan}{Q}(s', a'; \color{cyan}{\theta}); \color{cyan}{\theta}
 \Big)
 </script>
-We use <a href="https://arxiv.org/abs/1509.06461">double Q-learning</a>, where
+We use <a href="https://papers.labml.ai/paper/1509.06461">double Q-learning</a>, where
 the $\operatorname{argmax}$ is taken from $\color{cyan}{\theta_i}$ and
 the value is taken from $\color{orange}{\theta_i^{-}}$.</p>
 <p>And the loss function becomes,
@@ -82,7 +82,7 @@
 <a href='#section-1'>#</a>
 </div>
 <h2>Dueling Network ⚔️ Model for $Q$ Values</h2>
-<p>We are using a <a href="https://arxiv.org/abs/1511.06581">dueling network</a>
+<p>We are using a <a href="https://papers.labml.ai/paper/1511.06581">dueling network</a>
 to calculate Q-values.
 Intuition behind dueling network architecture is that in most states
 the action doesn’t matter,
@@ -68,7 +68,7 @@
 <a href='#section-0'>#</a>
 </div>
 <h1>Prioritized Experience Replay Buffer</h1>
-<p>This implements paper <a href="https://arxiv.org/abs/1511.05952">Prioritized experience replay</a>,
+<p>This implements paper <a href="https://papers.labml.ai/paper/1511.05952">Prioritized experience replay</a>,
 using a binary segment tree.</p>
 </div>
 <div class='code'>
@@ -83,7 +83,7 @@ using a binary segment tree.</p>
 <a href='#section-1'>#</a>
 </div>
 <h2>Buffer for Prioritized Experience Replay</h2>
-<p><a href="https://arxiv.org/abs/1511.05952">Prioritized experience replay</a>
+<p><a href="https://papers.labml.ai/paper/1511.05952">Prioritized experience replay</a>
 samples important transitions more frequently.
 The transitions are prioritized by the Temporal Difference error (td error), $\delta$.</p>
 <p>We sample transition $i$ with probability,
@@ -69,7 +69,7 @@
 </div>
 <h1>Generalized Advantage Estimation (GAE)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of paper
-<a href="https://arxiv.org/abs/1506.02438">Generalized Advantage Estimation</a>.</p>
+<a href="https://papers.labml.ai/paper/1506.02438">Generalized Advantage Estimation</a>.</p>
 <p>You can find an experiment that uses it <a href="experiment.html">here</a>.</p>
 </div>
 <div class='code'>
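A minimal sketch of the GAE recursion the paper above defines; the interfaces and array layout are assumptions, not the repository's code.

import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, lam=0.95):
    # `values` has one extra entry: the bootstrap value for the state after the last step
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        mask = 1.0 - dones[t]  # stop bootstrapping at episode ends
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        last_adv = delta + gamma * lam * mask * last_adv
        advantages[t] = last_adv
    return advantages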
@@ -69,7 +69,7 @@
 </div>
 <h1>Proximal Policy Optimization - PPO</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-<a href="https://arxiv.org/abs/1707.06347">Proximal Policy Optimization - PPO</a>.</p>
+<a href="https://papers.labml.ai/paper/1707.06347">Proximal Policy Optimization - PPO</a>.</p>
 <p>PPO is a policy gradient method for reinforcement learning.
 Simple policy gradient methods do a single gradient update per sample (or a set of samples).
 Doing multiple gradient steps for a single sample causes problems

@@ -171,7 +171,7 @@ J(\pi_\theta) - J(\pi_{\theta_{OLD}})
 The error we introduce to $J(\pi_\theta) - J(\pi_{\theta_{OLD}})$
 by this assumption is bound by the KL divergence between
 $\pi_\theta$ and $\pi_{\theta_{OLD}}$.
-<a href="https://arxiv.org/abs/1705.10528">Constrained Policy Optimization</a>
+<a href="https://papers.labml.ai/paper/1705.10528">Constrained Policy Optimization</a>
 shows the proof of this. I haven’t read it.</p>
 <p>
 <script type="math/tex; mode=display">\begin{align}
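A minimal sketch of the clipped surrogate loss that lets PPO take several gradient steps on the same samples; the names are illustrative, not the repository's API.

import torch

def ppo_policy_loss(log_pi, log_pi_old, advantage, clip=0.2):
    ratio = torch.exp(log_pi - log_pi_old)                 # pi_theta / pi_theta_old
    clipped = torch.clamp(ratio, 1.0 - clip, 1.0 + clip)
    # maximizing the clipped surrogate == minimizing its negation
    return -torch.min(ratio * advantage, clipped * advantage).mean()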
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/rl/ppo/index.html">Proximal Policy Optimization - PPO</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of
-<a href="https://arxiv.org/abs/1707.06347">Proximal Policy Optimization - PPO</a>.</p>
+<a href="https://papers.labml.ai/paper/1707.06347">Proximal Policy Optimization - PPO</a>.</p>
 <p>PPO is a policy gradient method for reinforcement learning.
 Simple policy gradient methods do a single gradient update per sample (or a set of samples).
 Doing multiple gradient steps for a single sample causes problems
@@ -8,14 +8,14 @@

 <url>
 <loc>https://nn.labml.ai/gan/wasserstein/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html</loc>
-<lastmod>2021-05-09T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -36,7 +36,7 @@

 <url>
 <loc>https://nn.labml.ai/gan/original/index.html</loc>
-<lastmod>2021-05-07T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -50,14 +50,14 @@

 <url>
 <loc>https://nn.labml.ai/gan/dcgan/index.html</loc>
-<lastmod>2021-05-07T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/gan/stylegan/index.html</loc>
-<lastmod>2021-06-21T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -71,7 +71,7 @@

 <url>
 <loc>https://nn.labml.ai/gan/cycle_gan/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -113,14 +113,14 @@

 <url>
 <loc>https://nn.labml.ai/hypernetworks/hyper_lstm.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/normalization/weight_standardization/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -141,7 +141,7 @@

 <url>
 <loc>https://nn.labml.ai/normalization/instance_norm/index.html</loc>
-<lastmod>2021-04-23T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -155,7 +155,7 @@

 <url>
 <loc>https://nn.labml.ai/normalization/layer_norm/index.html</loc>
-<lastmod>2021-04-20T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -169,14 +169,14 @@

 <url>
 <loc>https://nn.labml.ai/normalization/batch_channel_norm/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/normalization/group_norm/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -190,7 +190,7 @@

 <url>
 <loc>https://nn.labml.ai/normalization/batch_norm/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -246,7 +246,7 @@

 <url>
 <loc>https://nn.labml.ai/resnet/index.html</loc>
-<lastmod>2021-07-16T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -351,7 +351,7 @@

 <url>
 <loc>https://nn.labml.ai/optimizers/ada_belief.html</loc>
-<lastmod>2021-01-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -365,7 +365,7 @@

 <url>
 <loc>https://nn.labml.ai/optimizers/noam.html</loc>
-<lastmod>2021-01-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -386,21 +386,21 @@

 <url>
 <loc>https://nn.labml.ai/optimizers/radam.html</loc>
-<lastmod>2021-01-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/optimizers/adam.html</loc>
-<lastmod>2021-01-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/optimizers/amsgrad.html</loc>
-<lastmod>2021-02-27T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -435,7 +435,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/knn/index.html</loc>
-<lastmod>2021-01-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -449,14 +449,14 @@

 <url>
 <loc>https://nn.labml.ai/transformers/configs.html</loc>
-<lastmod>2021-02-05T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/transformers/models.html</loc>
-<lastmod>2021-06-02T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -470,7 +470,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/feed_forward.html</loc>
-<lastmod>2021-02-02T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -484,7 +484,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/basic/autoregressive_experiment.html</loc>
-<lastmod>2021-06-07T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -498,7 +498,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/fast_weights/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -512,14 +512,14 @@

 <url>
 <loc>https://nn.labml.ai/transformers/index.html</loc>
-<lastmod>2021-07-17T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/transformers/feedback/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -568,7 +568,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/vit/index.html</loc>
-<lastmod>2021-07-17T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -589,7 +589,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/switch/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -617,7 +617,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/mlm/index.html</loc>
-<lastmod>2021-06-06T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -645,14 +645,14 @@

 <url>
 <loc>https://nn.labml.ai/transformers/mha.html</loc>
-<lastmod>2021-06-07T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/transformers/compressive/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -673,7 +673,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/fnet/index.html</loc>
-<lastmod>2021-06-21T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -687,7 +687,7 @@

 <url>
 <loc>https://nn.labml.ai/transformers/xl/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -701,35 +701,35 @@

 <url>
 <loc>https://nn.labml.ai/transformers/xl/relative_mha.html</loc>
-<lastmod>2021-02-07T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/capsule_networks/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/capsule_networks/mnist.html</loc>
-<lastmod>2021-02-14T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/recurrent_highway_networks/index.html</loc>
-<lastmod>2021-02-11T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/graphs/gat/index.html</loc>
-<lastmod>2021-07-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -743,7 +743,7 @@

 <url>
 <loc>https://nn.labml.ai/graphs/gatv2/index.html</loc>
-<lastmod>2021-07-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -792,7 +792,7 @@

 <url>
 <loc>https://nn.labml.ai/sketch_rnn/index.html</loc>
-<lastmod>2021-03-04T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -813,14 +813,14 @@

 <url>
 <loc>https://nn.labml.ai/rl/dqn/index.html</loc>
-<lastmod>2021-04-04T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/rl/dqn/model.html</loc>
-<lastmod>2020-12-10T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

@@ -834,21 +834,21 @@

 <url>
 <loc>https://nn.labml.ai/rl/dqn/replay_buffer.html</loc>
-<lastmod>2020-12-10T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/rl/ppo/index.html</loc>
-<lastmod>2021-08-08T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>

 <url>
 <loc>https://nn.labml.ai/rl/ppo/gae.html</loc>
-<lastmod>2021-03-30T16:30:00+00:00</lastmod>
+<lastmod>2021-08-17T16:30:00+00:00</lastmod>
 <priority>1.00</priority>
 </url>
@@ -68,7 +68,7 @@
 </div>
 <h1>Sketch RNN</h1>
 <p>This is an annotated <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1704.03477">A Neural Representation of Sketch Drawings</a>.</p>
+<a href="https://papers.labml.ai/paper/1704.03477">A Neural Representation of Sketch Drawings</a>.</p>
 <p>Sketch RNN is a sequence-to-sequence variational auto-encoder.
 Both encoder and decoder are recurrent neural network models.
 It learns to reconstruct stroke based simple drawings, by predicting

@@ -68,7 +68,7 @@
 <a href='#section-0'>#</a>
 </div>
 <h1>Transformer Auto-Regression Experiment</h1>
-<p>This trains a simple transformer introduced in <a href="https://arxiv.org/abs/1706.03762">Attention Is All You Need</a>
+<p>This trains a simple transformer introduced in <a href="https://papers.labml.ai/paper/1706.03762">Attention Is All You Need</a>
 on an NLP auto-regression task (with Tiny Shakespeare dataset).</p>
 </div>
 <div class='code'>

@@ -69,7 +69,7 @@
 </div>
 <h1>Compressive Transformer</h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1911.05507">Compressive Transformers for Long-Range Sequence Modelling</a>
+<a href="https://papers.labml.ai/paper/1911.05507">Compressive Transformers for Long-Range Sequence Modelling</a>
 in <a href="https://pytorch.org">PyTorch</a>.</p>
 <p>This is an extension of <a href="../xl/index.html">Transformer XL</a> where past memories
 are compressed to give a longer attention range.

@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformers/compressive/index.html">Compressive Transformer</a></h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1911.05507">Compressive Transformers for Long-Range Sequence Modelling</a>
+<a href="https://papers.labml.ai/paper/1911.05507">Compressive Transformers for Long-Range Sequence Modelling</a>
 in <a href="https://pytorch.org">PyTorch</a>.</p>
 <p>This is an extension of <a href="https://nn.labml.ai/transformers/xl/index.html">Transformer XL</a> where past memories
 are compressed to give a longer attention range.

@@ -240,7 +240,7 @@
 <h3>GELU activation</h3>
 <p>
 <script type="math/tex; mode=display">x \Phi(x)</script> where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$</p>
-<p>It was introduced in paper <a href="https://arxiv.org/abs/1606.08415">Gaussian Error Linear Units</a>.</p>
+<p>It was introduced in paper <a href="https://papers.labml.ai/paper/1606.08415">Gaussian Error Linear Units</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">62</span><span class="nd">@option</span><span class="p">(</span><span class="n">FeedForwardConfigs</span><span class="o">.</span><span class="n">activation</span><span class="p">,</span> <span class="s1">'GELU'</span><span class="p">)</span>

@@ -294,7 +294,7 @@
 </div>
 <h2>GLU Variants</h2>
 <p>These are variants with gated hidden layers for the FFN
-as introduced in paper <a href="https://arxiv.org/abs/2002.05202">GLU Variants Improve Transformer</a>.
+as introduced in paper <a href="https://papers.labml.ai/paper/2002.05202">GLU Variants Improve Transformer</a>.
 We have omitted the bias terms as specified in the paper.</p>
 </div>
 <div class='code'>
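A minimal sketch of one of the GLU variants the hunk above refers to (GEGLU), with bias terms omitted as the paper suggests; this is illustrative code, not the repository's FeedForward module.

import torch
import torch.nn as nn
import torch.nn.functional as F

class GEGLU(nn.Module):
    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.gate = nn.Linear(d_model, d_ff, bias=False)    # gated path, passed through GELU
        self.linear = nn.Linear(d_model, d_ff, bias=False)  # linear path
        self.out = nn.Linear(d_ff, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # FFN_GEGLU(x) = (GELU(x W1) * x V) W2
        return self.out(F.gelu(self.gate(x)) * self.linear(x))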
@@ -69,7 +69,7 @@
 </div>
 <h1>Fast weights transformer</h1>
 <p>The paper
-<a href="https://arxiv.org/abs/2102.11174">Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch</a>
+<a href="https://papers.labml.ai/paper/2102.11174">Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch</a>
 finds similarities between linear self-attention and fast weight systems
 and makes modifications to self-attention update rule based on that.
 It also introduces a simpler, yet effective kernel function.</p>

@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformers/fast_weights/index.html">Fast weights transformer</a></h1>
 <p>This is an annotated implementation of the paper
-<a href="https://arxiv.org/abs/2102.11174">Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch</a>.</p>
+<a href="https://papers.labml.ai/paper/2102.11174">Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch</a>.</p>
 <p>Here is the <a href="https://nn.labml.ai/transformers/fast_weights/index.html">annotated implementation</a>.
 Here are <a href="https://nn.labml.ai/transformers/fast_weights/experiment.html">the training code</a>
 and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset.</p>

@@ -84,7 +84,7 @@ GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU.
 <script type="math/tex; mode=display">x \Phi(x)</script> where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$</p>
 <h3>Gated Linear Units</h3>
 <p>This is a generic implementation that supports different variants including
-<a href="https://arxiv.org/abs/2002.05202">Gated Linear Units</a> (GLU).
+<a href="https://papers.labml.ai/paper/2002.05202">Gated Linear Units</a> (GLU).
 We have also implemented experiments on these:</p>
 <ul>
 <li><a href="glu_variants/experiment.html">experiment that uses <code>labml.configs</code></a></li>

@@ -69,7 +69,7 @@
 </div>
 <h1>Feedback Transformer</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2002.09402">Accessing Higher-level Representations in Sequential Transformers with Feedback Memory</a>.</p>
+<a href="https://papers.labml.ai/paper/2002.09402">Accessing Higher-level Representations in Sequential Transformers with Feedback Memory</a>.</p>
 <p>Normal transformers process tokens in parallel. Each transformer layer pays attention
 to the outputs of the previous layer.
 Feedback transformer pays attention to the output of all layers in previous steps.

@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformers/feedback/index.html">Feedback Transformer</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2002.09402">Accessing Higher-level Representations in Sequential Transformers with Feedback Memory</a>.</p>
+<a href="https://papers.labml.ai/paper/2002.09402">Accessing Higher-level Representations in Sequential Transformers with Feedback Memory</a>.</p>
 <p>Normal transformers process tokens in parallel. Each transformer layer pays attention
 to the outputs of the previous layer.
 Feedback transformer pays attention to the output of all layers in previous steps.

@@ -69,7 +69,7 @@
 </div>
 <h1>FNet: Mixing Tokens with Fourier Transforms</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2105.03824">FNet: Mixing Tokens with Fourier Transforms</a>.</p>
+<a href="https://papers.labml.ai/paper/2105.03824">FNet: Mixing Tokens with Fourier Transforms</a>.</p>
 <p>This paper replaces the <a href="../mha.html">self-attention layer</a> with two
 <a href="https://en.wikipedia.org/wiki/Discrete_Fourier_transform">Fourier transforms</a> to
 <em>mix</em> tokens.
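A minimal sketch of the token mixing step described above: one discrete Fourier transform over the hidden dimension, another over the sequence dimension, keeping only the real part. The tensor layout is an assumption for illustration.

import torch

def fnet_mix(x: torch.Tensor) -> torch.Tensor:
    # x: [seq_len, batch_size, d_model]
    return torch.fft.fft(torch.fft.fft(x, dim=-1), dim=0).real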
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformers/fnet/index.html">FNet: Mixing Tokens with Fourier Transforms</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2105.03824">FNet: Mixing Tokens with Fourier Transforms</a>.</p>
+<a href="https://papers.labml.ai/paper/2105.03824">FNet: Mixing Tokens with Fourier Transforms</a>.</p>
 <p>This paper replaces the <a href="https://nn.labml.ai/transformers//mha.html">self-attention layer</a> with two
 <a href="https://en.wikipedia.org/wiki/Discrete_Fourier_transform">Fourier transforms</a> to
 <em>mix</em> tokens.

@@ -69,7 +69,7 @@
 <h1>Transformers</h1>
 <p>This module contains <a href="https://pytorch.org/">PyTorch</a>
 implementations and explanations of original transformer
-from paper <a href="https://arxiv.org/abs/1706.03762">Attention Is All You Need</a>,
+from paper <a href="https://papers.labml.ai/paper/1706.03762">Attention Is All You Need</a>,
 and derivatives and enhancements of it.</p>
 <ul>
 <li><a href="mha.html">Multi-head attention</a></li>

@@ -87,30 +87,30 @@ oldest memories to give a longer attention span.</p>
 <p>This is an implementation of GPT-2 architecture.</p>
 <h2><a href="glu_variants/simple.html">GLU Variants</a></h2>
 <p>This is an implementation of the paper
-<a href="https://arxiv.org/abs/2002.05202">GLU Variants Improve Transformer</a>.</p>
+<a href="https://papers.labml.ai/paper/2002.05202">GLU Variants Improve Transformer</a>.</p>
 <h2><a href="knn/index.html">kNN-LM</a></h2>
 <p>This is an implementation of the paper
-<a href="https://arxiv.org/abs/1911.00172">Generalization through Memorization: Nearest Neighbor Language Models</a>.</p>
+<a href="https://papers.labml.ai/paper/1911.00172">Generalization through Memorization: Nearest Neighbor Language Models</a>.</p>
 <h2><a href="feedback/index.html">Feedback Transformer</a></h2>
 <p>This is an implementation of the paper
-<a href="https://arxiv.org/abs/2002.09402">Accessing Higher-level Representations in Sequential Transformers with Feedback Memory</a>.</p>
+<a href="https://papers.labml.ai/paper/2002.09402">Accessing Higher-level Representations in Sequential Transformers with Feedback Memory</a>.</p>
 <h2><a href="switch/index.html">Switch Transformer</a></h2>
 <p>This is a miniature implementation of the paper
-<a href="https://arxiv.org/abs/2101.03961">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a>.
+<a href="https://papers.labml.ai/paper/2101.03961">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a>.
 Our implementation only has a few million parameters and doesn’t do model parallel distributed training.
 It does single GPU training but we implement the concept of switching as described in the paper.</p>
 <h2><a href="fast_weights/index.html">Fast Weights Transformer</a></h2>
 <p>This is an implementation of the paper
-<a href="https://arxiv.org/abs/2102.11174">Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch</a>.</p>
+<a href="https://papers.labml.ai/paper/2102.11174">Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch</a>.</p>
 <h2><a href="fnet/index.html">FNet: Mixing Tokens with Fourier Transforms</a></h2>
 <p>This is an implementation of the paper
-<a href="https://arxiv.org/abs/2105.03824">FNet: Mixing Tokens with Fourier Transforms</a>.</p>
+<a href="https://papers.labml.ai/paper/2105.03824">FNet: Mixing Tokens with Fourier Transforms</a>.</p>
 <h2><a href="aft/index.html">Attention Free Transformer</a></h2>
 <p>This is an implementation of the paper
 <a href="https://papers.labml.ai/paper/2105.14103">An Attention Free Transformer</a>.</p>
 <h2><a href="mlm/index.html">Masked Language Model</a></h2>
 <p>This is an implementation of Masked Language Model used for pre-training in paper
-<a href="https://arxiv.org/abs/1810.04805">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</a>.</p>
+<a href="https://papers.labml.ai/paper/1810.04805">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</a>.</p>
 <h2><a href="mlp_mixer/index.html">MLP-Mixer: An all-MLP Architecture for Vision</a></h2>
 <p>This is an implementation of the paper
 <a href="https://papers.labml.ai/paper/2105.01601">MLP-Mixer: An all-MLP Architecture for Vision</a>.</p>

@@ -119,7 +119,7 @@ It does single GPU training but we implement the concept of switching as describ
 <a href="https://papers.labml.ai/paper/2105.08050">Pay Attention to MLPs</a>.</p>
 <h2><a href="vit/index.html">Vision Transformer (ViT)</a></h2>
 <p>This is an implementation of the paper
-<a href="https://arxiv.org/abs/2010.11929">An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale</a>.</p>
+<a href="https://papers.labml.ai/paper/2010.11929">An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">92</span><span></span><span class="kn">from</span> <span class="nn">.configs</span> <span class="kn">import</span> <span class="n">TransformerConfigs</span>

@@ -69,7 +69,7 @@
 </div>
 <h1>k-Nearest Neighbor Language Models</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/1911.00172">Generalization through Memorization: Nearest Neighbor Language Models</a>.
+<a href="https://papers.labml.ai/paper/1911.00172">Generalization through Memorization: Nearest Neighbor Language Models</a>.
 It uses k-nearest neighbors to improve perplexity of autoregressive transformer models.</p>
 <p>An autoregressive language model estimates $p(w_t | \color{yellowgreen}{c_t})$,
 where $w_t$ is the token at step $t$

@@ -68,7 +68,7 @@
 </div>
 <h1>Multi-Headed Attention (MHA)</h1>
 <p>This is a tutorial/implementation of multi-headed attention
-from paper <a href="https://arxiv.org/abs/1706.03762">Attention Is All You Need</a>
+from paper <a href="https://papers.labml.ai/paper/1706.03762">Attention Is All You Need</a>
 in <a href="https://pytorch.org/">PyTorch</a>.
 The implementation is inspired from <a href="https://nlp.seas.harvard.edu/2018/04/03/attention.html">Annotated Transformer</a>.</p>
 <p>Here is the <a href="basic/autoregressive_experiment.html">training code</a> that uses a basic transformer

@@ -70,7 +70,7 @@
 <h1>Masked Language Model (MLM)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the Masked Language Model (MLM)
 used to pre-train the BERT model introduced in the paper
-<a href="https://arxiv.org/abs/1810.04805">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</a>.</p>
+<a href="https://papers.labml.ai/paper/1810.04805">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</a>.</p>
 <h2>BERT Pretraining</h2>
 <p>BERT model is a transformer model.
 The paper pre-trains the model using MLM and with next sentence prediction.

@@ -70,7 +70,7 @@
 <h1><a href="https://nn.labml.ai/transformers/mlm/index.html">Masked Language Model (MLM)</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of Masked Language Model (MLM)
 used to pre-train the BERT model introduced in the paper
-<a href="https://arxiv.org/abs/1810.04805">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</a>.</p>
+<a href="https://papers.labml.ai/paper/1810.04805">BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding</a>.</p>
 <h2>BERT Pretraining</h2>
 <p>BERT model is a transformer model.
 The paper pre-trains the model using MLM and with next sentence prediction.

@@ -179,7 +179,7 @@ and add the original residual vectors.
 Alternative is to do a layer normalization after adding the residuals.
 But we found this to be less stable when training.
 We found a detailed discussion about this in the paper
-<a href="https://arxiv.org/abs/2002.04745">On Layer Normalization in the Transformer Architecture</a>.</p>
+<a href="https://papers.labml.ai/paper/2002.04745">On Layer Normalization in the Transformer Architecture</a>.</p>
 </div>
 <div class='code'>
 <div class="highlight"><pre><span class="lineno">59</span><span class="k">class</span> <span class="nc">TransformerLayer</span><span class="p">(</span><span class="n">Module</span><span class="p">):</span></pre></div>

@@ -69,7 +69,7 @@
 </div>
 <h1>Switch Transformer</h1>
 <p>This is a miniature <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2101.03961">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a>.
+<a href="https://papers.labml.ai/paper/2101.03961">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a>.
 Our implementation only has a few million parameters and doesn’t do model parallel distributed training.
 It does single GPU training, but we implement the concept of switching as described in the paper.</p>
 <p>The Switch Transformer uses different parameters for each token by switching among parameters
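A minimal sketch of the switching idea in the hunk above: a learned router sends each token to exactly one expert FFN (top-1 routing) and scales the output by the router probability. Shapes are assumptions and the load-balancing loss is omitted; this is not the repository's model.

import torch
import torch.nn as nn

class SwitchFFN(nn.Module):
    def __init__(self, d_model: int, d_ff: int, n_experts: int):
        super().__init__()
        self.router = nn.Linear(d_model, n_experts)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
            for _ in range(n_experts)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [n_tokens, d_model]
        probs = torch.softmax(self.router(x), dim=-1)
        gate, expert_idx = probs.max(dim=-1)      # top-1 expert per token
        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            mask = expert_idx == i
            if mask.any():
                out[mask] = expert(x[mask])       # only the chosen expert processes the token
        return out * gate.unsqueeze(-1)           # scale by the router probability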
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformers/switch/index.html">Switch Transformer</a></h1>
 <p>This is a miniature <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2101.03961">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a>.
+<a href="https://papers.labml.ai/paper/2101.03961">Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity</a>.
 Our implementation only has a few million parameters and doesn’t do model parallel distributed training.
 It does single GPU training, but we implement the concept of switching as described in the paper.</p>
 <p>The Switch Transformer uses different parameters for each token by switching among parameters

@@ -69,7 +69,7 @@
 </div>
 <h1>Vision Transformer (ViT)</h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2010.11929">An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale</a>.</p>
+<a href="https://papers.labml.ai/paper/2010.11929">An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale</a>.</p>
 <p>Vision transformer applies a pure transformer to images
 without any convolution layers.
 They split the image into patches and apply a transformer on patch embeddings.
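A minimal sketch of the patch-embedding step described above, using a strided convolution to split an image into 16x16 patches; the sizes are assumptions for illustration, not the repository's configuration.

import torch
import torch.nn as nn

patch_embed = nn.Conv2d(3, 768, kernel_size=16, stride=16)  # one embedding per 16x16 patch
img = torch.randn(1, 3, 224, 224)
x = patch_embed(img)                  # [1, 768, 14, 14]
x = x.flatten(2).transpose(1, 2)      # [1, 196, 768]: a sequence of patch embeddings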
@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformer/vit/index.html">Vision Transformer (ViT)</a></h1>
 <p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper
-<a href="https://arxiv.org/abs/2010.11929">An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale</a>.</p>
+<a href="https://papers.labml.ai/paper/2010.11929">An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale</a>.</p>
 <p>Vision transformer applies a pure transformer to images
 without any convolution layers.
 They split the image into patches and apply a transformer on patch embeddings.

@@ -69,7 +69,7 @@
 </div>
 <h1>Transformer XL</h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1901.02860">Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context</a>
+<a href="https://papers.labml.ai/paper/1901.02860">Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context</a>
 in <a href="https://pytorch.org">PyTorch</a>.</p>
 <p>Transformer has a limited attention span,
 equal to the length of the sequence trained in parallel.

@@ -69,7 +69,7 @@
 </div>
 <h1><a href="https://nn.labml.ai/transformers/xl/index.html">Transformer XL</a></h1>
 <p>This is an implementation of
-<a href="https://arxiv.org/abs/1901.02860">Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context</a>
+<a href="https://papers.labml.ai/paper/1901.02860">Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context</a>
 in <a href="https://pytorch.org">PyTorch</a>.</p>
 <p>Transformer has a limited attention span,
 equal to the length of the sequence trained in parallel.

@@ -69,7 +69,7 @@
 </div>
 <h1>Relative Multi-Headed Attention</h1>
 <p>This is an implementation of relative multi-headed attention from paper
-<a href="https://arxiv.org/abs/1901.02860">Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context</a>
+<a href="https://papers.labml.ai/paper/1901.02860">Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context</a>
 in <a href="https://pytorch.org">PyTorch</a>.</p>
 </div>
 <div class='code'>
utils/papers_list.py (84 lines, new file)
@@ -0,0 +1,84 @@
import json
import re
from pathlib import Path

from labml import logger
from labml.logger import Text

HOME = Path('./labml_nn')

REGEX = re.compile(r"""
\(
https://papers\.labml\.ai/paper/  # Start of a numeric entity reference
(?P<id>[0-9\.]+)  # Paper ID
\)
""", re.VERBOSE)

IGNORE = {
    'transformers/index.html',
    'transformers/configs.html',
    'optimizers/noam.html',
    'transformers/basic/autoregressive_experiment.html',
}

IGNORE_PAPERS = {
    '2002.04745',  # On Layer Normalization in the Transformer Architecture
    '1606.08415',  # Gaussian Error Linear Units (GELUs)
    '1710.10196',  # Progressive Growing of GANs for Improved Quality, Stability, and Variation
    '1904.11486',  # Making Convolutional Networks Shift-Invariant Again
    '1801.04406',  # Which Training Methods for GANs do actually Converge?
    '1812.04948',  # A Style-Based Generator Architecture for Generative Adversarial Networks
    '1705.10528',  # Constrained Policy Optimization
}


def collect(path: Path):
    if path.is_file():
        html = path.relative_to(HOME)
        if html.suffix not in {'.py'}:
            return []

        if html.stem == '__init__':
            html = html.parent / 'index.html'
        else:
            html = html.parent / f'{html.stem}.html'

        if str(html) in IGNORE:
            return []

        with open(str(path), 'r') as f:
            contents = f.read()
        papers = set()
        for m in REGEX.finditer(contents):
            if m.group('id') in IGNORE_PAPERS:
                continue
            papers.add(m.group('id'))

        if len(papers) > 1:
            logger.log([(str(html), Text.key), ': ', str(papers)])
        return [{'url': str(html), 'arxiv_id': p} for p in papers]

    urls = []
    for f in path.iterdir():
        urls += collect(f)

    return urls


def main():
    papers = []
    for f in HOME.iterdir():
        papers += collect(f)

    by_id = {}
    for p in papers:
        if p['arxiv_id'] not in by_id:
            by_id[p['arxiv_id']] = []
        by_id[p['arxiv_id']].append(f'''https://nn.labml.ai/{p['url']}''')

    with open(str(HOME.parent / 'docs' / 'papers.json'), 'w') as f:
        f.write(json.dumps(by_id, indent=1))


if __name__ == '__main__':
    main()
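A quick way to exercise the new script; the run location and output path are assumptions inferred from the code above (HOME is relative, so it should be invoked from the repository root), not documented usage.

# Assumed invocation from the repository root:
#   python utils/papers_list.py
import json

with open('docs/papers.json') as f:
    by_id = json.load(f)  # maps an arXiv id to the nn.labml.ai pages that reference it
print(len(by_id), 'papers collected')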