diff --git a/docs/capsule_networks/index.html b/docs/capsule_networks/index.html
index e563ece7..334c1d50 100644
--- a/docs/capsule_networks/index.html
+++ b/docs/capsule_networks/index.html
@@ -68,7 +68,7 @@
This is a PyTorch implementation/tutorial of
-Dynamic Routing Between Capsules.
+Dynamic Routing Between Capsules.
A capsule network is a neural network architecture that embeds features as capsules and routes them with a voting mechanism to the next layer of capsules.
Unlike in other implementations of models, we’ve included a sample, because
diff --git a/docs/capsule_networks/mnist.html b/docs/capsule_networks/mnist.html
index e2feb1fb..8b64caf0 100644
--- a/docs/capsule_networks/mnist.html
+++ b/docs/capsule_networks/mnist.html
@@ -69,7 +69,7 @@
This is annotated PyTorch code to classify MNIST digits.
It implements the experiment described in the paper
-Dynamic Routing Between Capsules.
+Dynamic Routing Between Capsules.
from typing import Any
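To make the capsule routing described above concrete, here is a minimal, hypothetical sketch of the squash non-linearity and routing-by-agreement; tensor shapes and names are assumptions, not the annotated implementation itself.

```python
import torch
import torch.nn.functional as F

def squash(s: torch.Tensor, dim: int = -1, eps: float = 1e-8) -> torch.Tensor:
    # Squash non-linearity: keeps the vector's direction, maps its length into [0, 1)
    sq_norm = (s ** 2).sum(dim=dim, keepdim=True)
    return (sq_norm / (1.0 + sq_norm)) * s / torch.sqrt(sq_norm + eps)

def dynamic_routing(u_hat: torch.Tensor, iterations: int = 3) -> torch.Tensor:
    # u_hat: votes from lower capsules for higher capsules, [batch, n_lower, n_higher, d_higher]
    b = torch.zeros(u_hat.shape[:-1], device=u_hat.device)  # routing logits
    for _ in range(iterations):
        c = F.softmax(b, dim=-1)                      # how much each lower capsule sends to each higher one
        s = (c.unsqueeze(-1) * u_hat).sum(dim=1)      # weighted sum of votes -> [batch, n_higher, d_higher]
        v = squash(s)                                 # higher-capsule outputs
        b = b + (u_hat * v.unsqueeze(1)).sum(dim=-1)  # agreement between votes and outputs updates the logits
    return v
```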
diff --git a/docs/capsule_networks/readme.html b/docs/capsule_networks/readme.html
index 4d6e991f..215aa730 100644
--- a/docs/capsule_networks/readme.html
+++ b/docs/capsule_networks/readme.html
@@ -68,7 +68,7 @@
This is a PyTorch implementation/tutorial of
-Dynamic Routing Between Capsules.
+Dynamic Routing Between Capsules.
A capsule network is a neural network architecture that embeds features as capsules and routes them with a voting mechanism to the next layer of capsules.
Unlike in other implementations of models, we’ve included a sample, because
diff --git a/docs/gan/cycle_gan/index.html b/docs/gan/cycle_gan/index.html
index 9ae58422..5f706732 100644
--- a/docs/gan/cycle_gan/index.html
+++ b/docs/gan/cycle_gan/index.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation/tutorial of the paper
-Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks.
+Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks.
I’ve taken pieces of code from eriklindernoren/PyTorch-GAN. It is a very good resource if you want to check out other GAN variations too.
Cycle GAN does image-to-image translation.
diff --git a/docs/gan/cycle_gan/readme.html b/docs/gan/cycle_gan/readme.html
index 79637c15..56ebf408 100644
--- a/docs/gan/cycle_gan/readme.html
+++ b/docs/gan/cycle_gan/readme.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation/tutorial of the paper
-Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks.
+Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks.
This is a PyTorch implementation of paper
-Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks.
+Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks.
This implementation is based on the PyTorch DCGAN Tutorial.
This is a PyTorch implementation of paper
-Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks.
+Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks.
This is an implementation of
-Generative Adversarial Networks.
+Generative Adversarial Networks.
The generator, $G(\pmb{z}; \theta_g)$, generates samples that match the distribution of the data, while the discriminator, $D(\pmb{x}; \theta_d)$, gives the probability that $\pmb{x}$ came from the data rather than $G$.
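As a rough illustration of that two-player objective, here is a hedged sketch of the standard (non-saturating) GAN losses; the module and variable names are hypothetical, not the annotated implementation:

```python
import torch
import torch.nn.functional as F

def gan_losses(discriminator, generator, x_real, z):
    # The discriminator outputs a logit; label 1 means "came from data", 0 means "came from G"
    x_fake = generator(z)
    d_real = discriminator(x_real)
    d_fake = discriminator(x_fake.detach())
    ones, zeros = torch.ones_like(d_real), torch.zeros_like(d_fake)
    # Discriminator ascends log D(x) + log(1 - D(G(z)))
    loss_d = F.binary_cross_entropy_with_logits(d_real, ones) + \
             F.binary_cross_entropy_with_logits(d_fake, zeros)
    # Generator (non-saturating form) ascends log D(G(z))
    loss_g = F.binary_cross_entropy_with_logits(discriminator(x_fake), ones)
    return loss_d, loss_g
```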
diff --git a/docs/gan/original/readme.html b/docs/gan/original/readme.html
index 0bc677c1..152d29e8 100644
--- a/docs/gan/original/readme.html
+++ b/docs/gan/original/readme.html
@@ -69,7 +69,7 @@
This is an annotated implementation of
-Generative Adversarial Networks.
+Generative Adversarial Networks.
This is a PyTorch implementation of the paper
- Analyzing and Improving the Image Quality of StyleGAN
+ Analyzing and Improving the Image Quality of StyleGAN
which introduces StyleGAN 2. StyleGAN 2 is an improvement over StyleGAN from the paper
- A Style-Based Generator Architecture for Generative Adversarial Networks.
+ A Style-Based Generator Architecture for Generative Adversarial Networks.
And StyleGAN is based on Progressive GAN from the paper
- Progressive Growing of GANs for Improved Quality, Stability, and Variation.
+ Progressive Growing of GANs for Improved Quality, Stability, and Variation.
All three papers are from the same authors from NVIDIA AI.
Our implementation is minimalistic StyleGAN 2 model training code. Only single GPU training is supported to keep the implementation simple.
@@ -1695,7 +1695,7 @@ since we want to calculate the standard deviation for each feature.
The down-sample operation smoothens each feature channel and scales down by $2 \times$ using bilinear interpolation. This is based on the paper
- Making Convolutional Networks Shift-Invariant Again.
+ Making Convolutional Networks Shift-Invariant Again.
class DownSample(nn.Module):
The up-sample operation scales the image up by $2 \times$ and smoothens each feature channel. This is based on the paper
- Making Convolutional Networks Shift-Invariant Again.
+ Making Convolutional Networks Shift-Invariant Again.
class UpSample(nn.Module):
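A minimal sketch of such smoothed resampling, assuming a fixed 3x3 blur kernel and bilinear interpolation; the actual DownSample/UpSample modules differ in their details:

```python
import torch
import torch.nn.functional as F

def smooth(x: torch.Tensor) -> torch.Tensor:
    # Blur every channel with a fixed [1, 2, 1] x [1, 2, 1] kernel (depth-wise convolution)
    k = torch.tensor([1., 2., 1.], device=x.device, dtype=x.dtype)
    k = (k[:, None] * k[None, :]) / 16.0
    k = k.view(1, 1, 3, 3).repeat(x.shape[1], 1, 1, 1)
    return F.conv2d(x, k, padding=1, groups=x.shape[1])

def down_sample(x: torch.Tensor) -> torch.Tensor:
    # Smooth first, then scale down by 2x with bilinear interpolation
    return F.interpolate(smooth(x), scale_factor=0.5, mode='bilinear', align_corners=False)

def up_sample(x: torch.Tensor) -> torch.Tensor:
    # Scale up by 2x with bilinear interpolation, then smooth
    return smooth(F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=False))
```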
This is the $R_1$ regularization penalty from the paper
-Which Training Methods for GANs do actually Converge?.
+Which Training Methods for GANs do actually Converge?.
diff --git a/docs/gan/stylegan/readme.html b/docs/gan/stylegan/readme.html
index 6c946567..999171ae 100644
--- a/docs/gan/stylegan/readme.html
+++ b/docs/gan/stylegan/readme.html
@@ -69,12 +69,12 @@
This is a PyTorch implementation of the paper
- Analyzing and Improving the Image Quality of StyleGAN
+ Analyzing and Improving the Image Quality of StyleGAN
which introduces StyleGAN 2. StyleGAN 2 is an improvement over StyleGAN from the paper
- A Style-Based Generator Architecture for Generative Adversarial Networks.
+ A Style-Based Generator Architecture for Generative Adversarial Networks.
And StyleGAN is based on Progressive GAN from the paper
- Progressive Growing of GANs for Improved Quality, Stability, and Variation.
+ Progressive Growing of GANs for Improved Quality, Stability, and Variation.
All three papers are from the same authors from NVIDIA AI.
This is an implementation of
-Improved Training of Wasserstein GANs.
+Improved Training of Wasserstein GANs.
WGAN suggests clipping weights to enforce the Lipschitz constraint on the discriminator network (critic). This and other weight constraints like L2 norm clipping, weight normalization,
@@ -82,7 +82,7 @@ L1, L2 weight decay have problems:
-The paper Improved Training of Wasserstein GANs
+The paper Improved Training of Wasserstein GANs
proposes a better way to enforce the Lipschitz constraint, a gradient penalty.
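A hedged sketch of that gradient penalty term (interpolate between real and generated samples, then push the critic's gradient norm towards 1); the names here are hypothetical, not the annotated implementation:

```python
import torch

def gradient_penalty(critic, x_real, x_fake, lambda_gp: float = 10.0):
    # Sample points on straight lines between real and generated samples
    eps = torch.rand(x_real.shape[0], 1, 1, 1, device=x_real.device)
    x_hat = (eps * x_real + (1 - eps) * x_fake).requires_grad_(True)
    d_hat = critic(x_hat)
    # Gradient of the critic's output with respect to the interpolated input
    grad, = torch.autograd.grad(outputs=d_hat.sum(), inputs=x_hat, create_graph=True)
    grad_norm = grad.reshape(grad.shape[0], -1).norm(2, dim=1)
    # Penalize the gradient norm for deviating from 1
    return lambda_gp * ((grad_norm - 1) ** 2).mean()
```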
-We use double Q-learning, where
+We use double Q-learning, where
the $\operatorname{argmax}$ is taken from $\color{cyan}{\theta_i}$ and the value is taken from $\color{orange}{\theta_i^{-}}$.
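Before the loss, a small sketch of how that double-Q target could be computed, assuming hypothetical q_online/q_target networks; this is not the repository's code:

```python
import torch

@torch.no_grad()
def double_q_target(q_online, q_target, next_obs, reward, done, gamma: float = 0.99):
    # done: 1.0 where the episode ended, 0.0 otherwise
    # Choose the greedy action with the online parameters (theta_i) ...
    best_action = q_online(next_obs).argmax(dim=-1, keepdim=True)
    # ... but take its value from the target parameters (theta_i^-)
    next_value = q_target(next_obs).gather(-1, best_action).squeeze(-1)
    return reward + gamma * (1.0 - done) * next_value
```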
And the loss function becomes,
diff --git a/docs/rl/dqn/model.html b/docs/rl/dqn/model.html
index 71e92fba..306d814a 100644
--- a/docs/rl/dqn/model.html
+++ b/docs/rl/dqn/model.html
@@ -82,7 +82,7 @@ #
-We are using a dueling network
+We are using a dueling network
to calculate Q-values. The intuition behind the dueling network architecture is that in most states the action doesn’t matter,
diff --git a/docs/rl/dqn/replay_buffer.html b/docs/rl/dqn/replay_buffer.html
index 03693cf8..18c310db 100644
--- a/docs/rl/dqn/replay_buffer.html
+++ b/docs/rl/dqn/replay_buffer.html
@@ -68,7 +68,7 @@ #
-This implements paper Prioritized experience replay,
+This implements paper Prioritized experience replay,
using a binary segment tree.
-Prioritized experience replay
+Prioritized experience replay
samples important transitions more frequently. The transitions are prioritized by the Temporal Difference error (td error), $\delta$.
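For intuition, here is a tiny sketch of the proportional prioritization this implies, assuming $p_i = |\delta_i| + \epsilon$ and exponent $\alpha$; the actual implementation uses a binary segment tree instead of materializing all probabilities:

```python
import numpy as np

def sampling_probabilities(td_errors: np.ndarray, alpha: float = 0.6, eps: float = 1e-6) -> np.ndarray:
    # Proportional prioritization: p_i = |delta_i| + eps and P(i) = p_i^alpha / sum_k p_k^alpha
    p = (np.abs(td_errors) + eps) ** alpha
    return p / p.sum()

# e.g. indices = np.random.choice(len(td_errors), size=32, p=sampling_probabilities(td_errors))
```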
We sample transition $i$ with probability,
diff --git a/docs/rl/ppo/gae.html b/docs/rl/ppo/gae.html
index db2f96a6..f8f78c27 100644
--- a/docs/rl/ppo/gae.html
+++ b/docs/rl/ppo/gae.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of paper
-Generalized Advantage Estimation.
+Generalized Advantage Estimation.
You can find an experiment that uses it here.
This is a PyTorch implementation of
-Proximal Policy Optimization - PPO.
+Proximal Policy Optimization - PPO.
PPO is a policy gradient method for reinforcement learning. Simple policy gradient methods do a single gradient update per sample (or a set of samples). Doing multiple gradient steps for a single sample causes problems
@@ -171,7 +171,7 @@ J(\pi_\theta) - J(\pi_{\theta_{OLD}})
The error we introduce to $J(\pi_\theta) - J(\pi_{\theta_{OLD}})$ by this assumption is bound by the KL divergence between $\pi_\theta$ and $\pi_{\theta_{OLD}}$.
-Constrained Policy Optimization
+Constrained Policy Optimization
shows the proof of this. I haven’t read it.
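The clipped surrogate objective PPO uses to keep the new policy close to the old one can be sketched as below; this is a simplified sketch, not the annotated implementation:

```python
import torch

def ppo_clip_loss(log_pi, log_pi_old, advantage, clip_eps: float = 0.2):
    # Probability ratio between the new and old policies for the sampled actions
    ratio = torch.exp(log_pi - log_pi_old)
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    # Take the pessimistic (minimum) of the clipped and unclipped surrogate objectives
    return -torch.min(ratio * advantage, clipped * advantage).mean()
```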
where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
-It was introduced in paper Gaussian Error Linear Units.
+It was introduced in paper Gaussian Error Linear Units.
@option(FeedForwardConfigs.activation, 'GELU')
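For reference, the activation itself is just $x \Phi(x)$; a one-line sketch using the error function (PyTorch also provides nn.GELU):

```python
import torch

def gelu(x: torch.Tensor) -> torch.Tensor:
    # GELU(x) = x * Phi(x), with Phi the standard normal CDF expressed through erf
    return x * 0.5 * (1.0 + torch.erf(x / 2.0 ** 0.5))
```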
@@ -294,7 +294,7 @@
These are variants with gated hidden layers for the FFN
-as introduced in paper GLU Variants Improve Transformer.
+as introduced in paper GLU Variants Improve Transformer.
We have omitted the bias terms as specified in the paper.
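One of those gated variants (GEGLU) might look roughly like this; the class and attribute names are hypothetical, not the FeedForward module in this repository:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedFeedForward(nn.Module):
    # GEGLU variant: FFN(x) = (GELU(x W1) * (x V)) W2, with the bias terms omitted
    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff, bias=False)  # gate path
        self.v = nn.Linear(d_model, d_ff, bias=False)   # linear path
        self.w2 = nn.Linear(d_ff, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w2(F.gelu(self.w1(x)) * self.v(x))
```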
The paper
-Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch
+Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch
finds similarities between linear self-attention and fast weight systems and makes modifications to the self-attention update rule based on that. It also introduces a simpler, yet effective kernel function.
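A per-step sketch of the delta-rule update described in the paper, ignoring the kernel feature map and normalization details; names and shapes here are assumptions:

```python
import torch

def fast_weight_step(W, k_phi, q_phi, v, beta):
    # Delta-rule update: read what is currently stored for this key, write back an
    # interpolation towards the new value, then answer the query from the updated memory.
    # W: [d_v, d_k]; k_phi, q_phi: [d_k] (key/query after the kernel feature map); v: [d_v]
    v_old = W @ k_phi
    W = W + beta * torch.outer(v - v_old, k_phi)
    y = W @ q_phi
    return W, y
```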
diff --git a/docs/transformers/fast_weights/readme.html b/docs/transformers/fast_weights/readme.html
index 48763408..271418c9 100644
--- a/docs/transformers/fast_weights/readme.html
+++ b/docs/transformers/fast_weights/readme.html
@@ -69,7 +69,7 @@
This is an annotated implementation of the paper
-Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch.
+Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch.
Here is the annotated implementation. Here are the training code and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset.
diff --git a/docs/transformers/feed_forward.html b/docs/transformers/feed_forward.html
index 392c315d..e3ae3e1e 100644
--- a/docs/transformers/feed_forward.html
+++ b/docs/transformers/feed_forward.html
@@ -84,7 +84,7 @@ GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU.
where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
This is a generic implementation that supports different variants including
-Gated Linear Units (GLU).
+Gated Linear Units (GLU).
We have also implemented experiments on these:
labml.configs
This is a PyTorch implementation of the paper
-Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.
+Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.
Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. Feedback transformer pays attention to the output of all layers in previous steps.
diff --git a/docs/transformers/feedback/readme.html b/docs/transformers/feedback/readme.html
index 077924c0..996466b1 100644
--- a/docs/transformers/feedback/readme.html
+++ b/docs/transformers/feedback/readme.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of the paper
-Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.
+Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.
Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. Feedback transformer pays attention to the output of all layers in previous steps.
diff --git a/docs/transformers/fnet/index.html b/docs/transformers/fnet/index.html
index d39ec5c8..62d5e070 100644
--- a/docs/transformers/fnet/index.html
+++ b/docs/transformers/fnet/index.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of the paper
-FNet: Mixing Tokens with Fourier Transforms.
+FNet: Mixing Tokens with Fourier Transforms.
This paper replaces the self-attention layer with two Fourier transforms to mix tokens.
diff --git a/docs/transformers/fnet/readme.html b/docs/transformers/fnet/readme.html
index 40317673..f7c89ad3 100644
--- a/docs/transformers/fnet/readme.html
+++ b/docs/transformers/fnet/readme.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of the paper
-FNet: Mixing Tokens with Fourier Transforms.
+FNet: Mixing Tokens with Fourier Transforms.
This paper replaces the self-attention layer with two Fourier transforms to mix tokens.
diff --git a/docs/transformers/index.html b/docs/transformers/index.html
index a6051832..f92cd215 100644
--- a/docs/transformers/index.html
+++ b/docs/transformers/index.html
@@ -69,7 +69,7 @@
This module contains PyTorch implementations and explanations of the original transformer
-from paper Attention Is All You Need,
+from paper Attention Is All You Need,
and derivatives and enhancements of it.
This is an implementation of GPT-2 architecture.
This is an implementation of the paper
-GLU Variants Improve Transformer.
+GLU Variants Improve Transformer.
This is an implementation of the paper
-Generalization through Memorization: Nearest Neighbor Language Models.
+Generalization through Memorization: Nearest Neighbor Language Models.
This is an implementation of the paper
-Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.
+Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.
This is a miniature implementation of the paper
-Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
+Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
Our implementation only has a few million parameters and doesn’t do model parallel distributed training. It does single GPU training but we implement the concept of switching as described in the paper.
This is an implementation of the paper
-Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch.
+Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch.
This is an implementation of the paper
-FNet: Mixing Tokens with Fourier Transforms.
+FNet: Mixing Tokens with Fourier Transforms.
This is an implementation of the paper An Attention Free Transformer.
This is an implementation of the Masked Language Model used for pre-training in the paper
-BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
+BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
This is an implementation of the paper MLP-Mixer: An all-MLP Architecture for Vision.
@@ -119,7 +119,7 @@ It does single GPU training but we implement the concept of switching as describ
Pay Attention to MLPs.
This is an implementation of the paper
-An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
+An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
from .configs import TransformerConfigs
diff --git a/docs/transformers/knn/index.html b/docs/transformers/knn/index.html
index 6db8ee6f..df4ae1e0 100644
--- a/docs/transformers/knn/index.html
+++ b/docs/transformers/knn/index.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of the paper
- Generalization through Memorization: Nearest Neighbor Language Models.
+ Generalization through Memorization: Nearest Neighbor Language Models.
It uses k-nearest neighbors to improve perplexity of autoregressive transformer models.
An autoregressive language model estimates $p(w_t | \color{yellowgreen}{c_t})$, where $w_t$ is the token at step $t$
diff --git a/docs/transformers/mha.html b/docs/transformers/mha.html
index 4fe146a6..73c5d512 100644
--- a/docs/transformers/mha.html
+++ b/docs/transformers/mha.html
@@ -68,7 +68,7 @@
This is a tutorial/implementation of multi-headed attention
-from paper Attention Is All You Need
+from paper Attention Is All You Need
in PyTorch. The implementation is inspired by the Annotated Transformer.
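The core of multi-headed attention is scaled dot-product attention, $\mathrm{softmax}(QK^\top/\sqrt{d_k})V$; a compact sketch (the full module adds the linear projections and head reshaping):

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: [batch, heads, seq_len, d_k]; every head attends independently
    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    return F.softmax(scores, dim=-1) @ v
```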
Here is the training code that uses a basic transformer
diff --git a/docs/transformers/mlm/index.html b/docs/transformers/mlm/index.html
index 143546de..228ccc13 100644
--- a/docs/transformers/mlm/index.html
+++ b/docs/transformers/mlm/index.html
@@ -70,7 +70,7 @@
This is a PyTorch implementation of the Masked Language Model (MLM) used to pre-train the BERT model introduced in the paper
-BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
+BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
The BERT model is a transformer model. The paper pre-trains the model using MLM and next sentence prediction.
diff --git a/docs/transformers/mlm/readme.html b/docs/transformers/mlm/readme.html
index 76667336..552b5f9b 100644
--- a/docs/transformers/mlm/readme.html
+++ b/docs/transformers/mlm/readme.html
@@ -70,7 +70,7 @@
This is a PyTorch implementation of the Masked Language Model (MLM) used to pre-train the BERT model introduced in the paper
-BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
+BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
The BERT model is a transformer model. The paper pre-trains the model using MLM and next sentence prediction.
diff --git a/docs/transformers/models.html b/docs/transformers/models.html
index edfaf04c..27e2560b 100644
--- a/docs/transformers/models.html
+++ b/docs/transformers/models.html
@@ -179,7 +179,7 @@ and add the original residual vectors.
An alternative is to do a layer normalization after adding the residuals. But we found this to be less stable when training. We found a detailed discussion about this in the paper
- On Layer Normalization in the Transformer Architecture.
+ On Layer Normalization in the Transformer Architecture.
class TransformerLayer(Module):
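A minimal sketch of the pre-layer-norm arrangement discussed above (normalize the sub-layer input, then add the residual); this is a hypothetical wrapper, not the TransformerLayer class itself:

```python
import torch.nn as nn

class PreNormResidual(nn.Module):
    # Pre-LN: normalize the input to the sub-layer, then add the residual.
    # The post-LN alternative, norm(x + sublayer(x)), is the one noted above as less stable.
    def __init__(self, d_model: int, sublayer: nn.Module):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.sublayer = sublayer

    def forward(self, x):
        return x + self.sublayer(self.norm(x))
```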
This is a miniature PyTorch implementation of the paper
-Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
+Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
Our implementation only has a few million parameters and doesn’t do model parallel distributed training. It does single GPU training, but we implement the concept of switching as described in the paper.
The Switch Transformer uses different parameters for each token by switching among parameters
diff --git a/docs/transformers/switch/readme.html b/docs/transformers/switch/readme.html
index 90892c2d..fd5a3384 100644
--- a/docs/transformers/switch/readme.html
+++ b/docs/transformers/switch/readme.html
@@ -69,7 +69,7 @@
This is a miniature PyTorch implementation of the paper
-Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
+Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.
Our implementation only has a few million parameters and doesn’t do model parallel distributed training. It does single GPU training, but we implement the concept of switching as described in the paper.
The Switch Transformer uses different parameters for each token by switching among parameters
diff --git a/docs/transformers/vit/index.html b/docs/transformers/vit/index.html
index 2bb20f87..b1ca1939 100644
--- a/docs/transformers/vit/index.html
+++ b/docs/transformers/vit/index.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of the paper
-An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
+An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
The vision transformer applies a pure transformer to images without any convolution layers. They split the image into patches and apply a transformer on patch embeddings.
diff --git a/docs/transformers/vit/readme.html b/docs/transformers/vit/readme.html
index dce47850..da2e721c 100644
--- a/docs/transformers/vit/readme.html
+++ b/docs/transformers/vit/readme.html
@@ -69,7 +69,7 @@
This is a PyTorch implementation of the paper
-An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
+An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
The vision transformer applies a pure transformer to images without any convolution layers. They split the image into patches and apply a transformer on patch embeddings.
diff --git a/docs/transformers/xl/index.html b/docs/transformers/xl/index.html
index dd4fc7ed..1de9266e 100644
--- a/docs/transformers/xl/index.html
+++ b/docs/transformers/xl/index.html
@@ -69,7 +69,7 @@
This is an implementation of
-Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
+Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
in PyTorch.
The transformer has a limited attention span, equal to the length of the sequence trained in parallel.
diff --git a/docs/transformers/xl/readme.html b/docs/transformers/xl/readme.html
index 0f8aa57c..05a21c90 100644
--- a/docs/transformers/xl/readme.html
+++ b/docs/transformers/xl/readme.html
@@ -69,7 +69,7 @@
This is an implementation of
-Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
+Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
in PyTorch.
The transformer has a limited attention span, equal to the length of the sequence trained in parallel.
diff --git a/docs/transformers/xl/relative_mha.html b/docs/transformers/xl/relative_mha.html
index dcd40d04..cf87181b 100644
--- a/docs/transformers/xl/relative_mha.html
+++ b/docs/transformers/xl/relative_mha.html
@@ -69,7 +69,7 @@
This is an implementation of relative multi-headed attention from the paper
-Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
+Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context
in PyTorch.
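To illustrate how Transformer-XL extends that limited span, here is a conceptual sketch where keys and values also cover detached states from the previous segment; the mha call signature is an assumption, and the relative positional encoding that relative_mha implements is omitted here:

```python
import torch

def attend_with_memory(mha, x, mem):
    # x, mem: [seq_len, batch, d_model]. Keys and values also cover the detached states of
    # the previous segment, so the effective attention span grows beyond the segment length.
    ctx = torch.cat([mem.detach(), x], dim=0) if mem is not None else x
    out = mha(query=x, key=ctx, value=ctx)  # assumed signature of a generic attention module
    new_mem = x.detach()                    # cached as the memory for the next segment
    return out, new_mem
```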