From 996b58be041ded110e0f5a27805565a88dfd95a5 Mon Sep 17 00:00:00 2001 From: Varuna Jayasiri Date: Tue, 17 Aug 2021 14:12:33 +0530 Subject: [PATCH] paper links --- labml_nn/capsule_networks/__init__.py | 2 +- labml_nn/capsule_networks/mnist.py | 2 +- labml_nn/capsule_networks/readme.md | 2 +- labml_nn/gan/cycle_gan/__init__.py | 2 +- labml_nn/gan/cycle_gan/readme.md | 2 +- labml_nn/gan/dcgan/__init__.py | 2 +- labml_nn/gan/dcgan/readme.md | 2 +- labml_nn/gan/original/__init__.py | 2 +- labml_nn/gan/original/readme.md | 2 +- labml_nn/gan/stylegan/__init__.py | 12 ++++++------ labml_nn/gan/stylegan/readme.md | 6 +++--- labml_nn/gan/wasserstein/__init__.py | 2 +- .../wasserstein/gradient_penalty/__init__.py | 4 ++-- .../gan/wasserstein/gradient_penalty/readme.md | 4 ++-- labml_nn/gan/wasserstein/readme.md | 2 +- labml_nn/graphs/gat/__init__.py | 2 +- labml_nn/graphs/gat/readme.md | 2 +- labml_nn/graphs/gatv2/__init__.py | 2 +- labml_nn/graphs/gatv2/readme.md | 2 +- labml_nn/hypernetworks/hyper_lstm.py | 2 +- .../batch_channel_norm/__init__.py | 2 +- labml_nn/normalization/batch_norm/__init__.py | 2 +- labml_nn/normalization/batch_norm/readme.md | 2 +- labml_nn/normalization/group_norm/__init__.py | 2 +- labml_nn/normalization/group_norm/readme.md | 2 +- .../normalization/instance_norm/__init__.py | 2 +- labml_nn/normalization/instance_norm/readme.md | 2 +- labml_nn/normalization/layer_norm/__init__.py | 2 +- labml_nn/normalization/layer_norm/readme.md | 2 +- .../weight_standardization/__init__.py | 4 ++-- .../weight_standardization/readme.md | 2 +- labml_nn/optimizers/ada_belief.py | 2 +- labml_nn/optimizers/adam.py | 2 +- labml_nn/optimizers/amsgrad.py | 2 +- labml_nn/optimizers/noam.py | 2 +- labml_nn/optimizers/radam.py | 2 +- .../recurrent_highway_networks/__init__.py | 2 +- labml_nn/resnet/__init__.py | 2 +- labml_nn/resnet/readme.md | 2 +- labml_nn/rl/dqn/__init__.py | 4 ++-- labml_nn/rl/dqn/model.py | 2 +- labml_nn/rl/dqn/replay_buffer.py | 4 ++-- labml_nn/rl/ppo/__init__.py | 4 ++-- labml_nn/rl/ppo/gae.py | 2 +- labml_nn/rl/ppo/readme.md | 2 +- labml_nn/sketch_rnn/__init__.py | 2 +- labml_nn/transformers/__init__.py | 18 +++++++++--------- .../basic/autoregressive_experiment.py | 2 +- labml_nn/transformers/compressive/__init__.py | 2 +- labml_nn/transformers/compressive/readme.md | 2 +- labml_nn/transformers/configs.py | 4 ++-- labml_nn/transformers/fast_weights/__init__.py | 2 +- labml_nn/transformers/fast_weights/readme.md | 2 +- labml_nn/transformers/feed_forward.py | 2 +- labml_nn/transformers/feedback/__init__.py | 2 +- labml_nn/transformers/feedback/readme.md | 2 +- labml_nn/transformers/fnet/__init__.py | 2 +- labml_nn/transformers/fnet/readme.md | 2 +- labml_nn/transformers/knn/__init__.py | 2 +- labml_nn/transformers/mha.py | 2 +- labml_nn/transformers/mlm/__init__.py | 2 +- labml_nn/transformers/mlm/readme.md | 2 +- labml_nn/transformers/models.py | 2 +- labml_nn/transformers/switch/__init__.py | 2 +- labml_nn/transformers/switch/readme.md | 2 +- labml_nn/transformers/vit/__init__.py | 2 +- labml_nn/transformers/vit/readme.md | 2 +- labml_nn/transformers/xl/__init__.py | 2 +- labml_nn/transformers/xl/readme.md | 2 +- labml_nn/transformers/xl/relative_mha.py | 2 +- 70 files changed, 92 insertions(+), 92 deletions(-) diff --git a/labml_nn/capsule_networks/__init__.py b/labml_nn/capsule_networks/__init__.py index ef90ab1c..70d30d0c 100644 --- a/labml_nn/capsule_networks/__init__.py +++ b/labml_nn/capsule_networks/__init__.py @@ -10,7 +10,7 @@ summary: > # Capsule Networks This 
is a [PyTorch](https://pytorch.org) implementation/tutorial of -[Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829). +[Dynamic Routing Between Capsules](https://papers.labml.ai/paper/1710.09829). Capsule network is a neural network architecture that embeds features as capsules and routes them with a voting mechanism to next layer of capsules. diff --git a/labml_nn/capsule_networks/mnist.py b/labml_nn/capsule_networks/mnist.py index 0b703689..96357547 100644 --- a/labml_nn/capsule_networks/mnist.py +++ b/labml_nn/capsule_networks/mnist.py @@ -9,7 +9,7 @@ summary: Code for training Capsule Networks on MNIST dataset This is an annotated PyTorch code to classify MNIST digits with PyTorch. This paper implements the experiment described in paper -[Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829). +[Dynamic Routing Between Capsules](https://papers.labml.ai/paper/1710.09829). """ from typing import Any diff --git a/labml_nn/capsule_networks/readme.md b/labml_nn/capsule_networks/readme.md index 637b75d7..8b2ba936 100644 --- a/labml_nn/capsule_networks/readme.md +++ b/labml_nn/capsule_networks/readme.md @@ -1,7 +1,7 @@ # [Capsule Networks](https://nn.labml.ai/capsule_networks/index.html) This is a [PyTorch](https://pytorch.org) implementation/tutorial of -[Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829). +[Dynamic Routing Between Capsules](https://papers.labml.ai/paper/1710.09829). Capsule network is a neural network architecture that embeds features as capsules and routes them with a voting mechanism to next layer of capsules. diff --git a/labml_nn/gan/cycle_gan/__init__.py b/labml_nn/gan/cycle_gan/__init__.py index d3b3630b..df1e3f55 100644 --- a/labml_nn/gan/cycle_gan/__init__.py +++ b/labml_nn/gan/cycle_gan/__init__.py @@ -9,7 +9,7 @@ summary: > # Cycle GAN This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper -[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593). +[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://papers.labml.ai/paper/1703.10593). I've taken pieces of code from [eriklindernoren/PyTorch-GAN](https://github.com/eriklindernoren/PyTorch-GAN). It is a very good resource if you want to checkout other GAN variations too. diff --git a/labml_nn/gan/cycle_gan/readme.md b/labml_nn/gan/cycle_gan/readme.md index e510b4a2..3eadffa4 100644 --- a/labml_nn/gan/cycle_gan/readme.md +++ b/labml_nn/gan/cycle_gan/readme.md @@ -1,4 +1,4 @@ # [Cycle GAN](https://nn.labml.ai/gan/cycle_gan/index.html) This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper -[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593). +[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://papers.labml.ai/paper/1703.10593). diff --git a/labml_nn/gan/dcgan/__init__.py b/labml_nn/gan/dcgan/__init__.py index b2e4620f..c65d02e3 100644 --- a/labml_nn/gan/dcgan/__init__.py +++ b/labml_nn/gan/dcgan/__init__.py @@ -7,7 +7,7 @@ summary: A simple PyTorch implementation/tutorial of Deep Convolutional Generati # Deep Convolutional Generative Adversarial Networks (DCGAN) This is a [PyTorch](https://pytorch.org) implementation of paper -[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). 
+[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://papers.labml.ai/paper/1511.06434). This implementation is based on the [PyTorch DCGAN Tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html). """ diff --git a/labml_nn/gan/dcgan/readme.md b/labml_nn/gan/dcgan/readme.md index c67f8ec3..20bf1855 100644 --- a/labml_nn/gan/dcgan/readme.md +++ b/labml_nn/gan/dcgan/readme.md @@ -1,4 +1,4 @@ # [Deep Convolutional Generative Adversarial Networks - DCGAN](https://nn.labml.ai/gan/dcgan/index.html) This is a [PyTorch](https://pytorch.org) implementation of paper -[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). +[Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://papers.labml.ai/paper/1511.06434). diff --git a/labml_nn/gan/original/__init__.py b/labml_nn/gan/original/__init__.py index 0fc3be44..987683c8 100644 --- a/labml_nn/gan/original/__init__.py +++ b/labml_nn/gan/original/__init__.py @@ -7,7 +7,7 @@ summary: A simple PyTorch implementation/tutorial of Generative Adversarial Netw # Generative Adversarial Networks (GAN) This is an implementation of -[Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +[Generative Adversarial Networks](https://papers.labml.ai/paper/1406.2661). The generator, $G(\pmb{z}; \theta_g)$ generates samples that match the distribution of data, while the discriminator, $D(\pmb{x}; \theta_g)$ diff --git a/labml_nn/gan/original/readme.md b/labml_nn/gan/original/readme.md index 0d204c2e..1e410e88 100644 --- a/labml_nn/gan/original/readme.md +++ b/labml_nn/gan/original/readme.md @@ -1,4 +1,4 @@ # [Generative Adversarial Networks - GAN](https://nn.labml.ai/gan/original/index.html) This is an annotated implementation of -[Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +[Generative Adversarial Networks](https://papers.labml.ai/paper/1406.2661). diff --git a/labml_nn/gan/stylegan/__init__.py b/labml_nn/gan/stylegan/__init__.py index e3c68c07..3c5f7b35 100644 --- a/labml_nn/gan/stylegan/__init__.py +++ b/labml_nn/gan/stylegan/__init__.py @@ -8,12 +8,12 @@ summary: > # StyleGAN 2 This is a [PyTorch](https://pytorch.org) implementation of the paper - [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958) + [Analyzing and Improving the Image Quality of StyleGAN](https://papers.labml.ai/paper/1912.04958) which introduces **StyleGAN 2**. StyleGAN 2 is an improvement over **StyleGAN** from the paper - [A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948). + [A Style-Based Generator Architecture for Generative Adversarial Networks](https://papers.labml.ai/paper/1812.04948). And StyleGAN is based on **Progressive GAN** from the paper - [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196). + [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://papers.labml.ai/paper/1710.10196). All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI). *Our implementation is a minimalistic StyleGAN 2 model training code. @@ -650,7 +650,7 @@ class DownSample(nn.Module): The down-sample operation [smoothens](#smooth) each feature channel and scale $2 \times$ using bilinear interpolation. 
This is based on the paper - [Making Convolutional Networks Shift-Invariant Again](https://arxiv.org/abs/1904.11486). + [Making Convolutional Networks Shift-Invariant Again](https://papers.labml.ai/paper/1904.11486). """ def __init__(self): @@ -672,7 +672,7 @@ class UpSample(nn.Module): The up-sample operation scales the image up by $2 \times$ and [smoothens](#smooth) each feature channel. This is based on the paper - [Making Convolutional Networks Shift-Invariant Again](https://arxiv.org/abs/1904.11486). + [Making Convolutional Networks Shift-Invariant Again](https://papers.labml.ai/paper/1904.11486). """ def __init__(self): @@ -824,7 +824,7 @@ class GradientPenalty(nn.Module): ## Gradient Penalty This is the $R_1$ regularization penality from the paper - [Which Training Methods for GANs do actually Converge?](https://arxiv.org/abs/1801.04406). + [Which Training Methods for GANs do actually Converge?](https://papers.labml.ai/paper/1801.04406). $$R_1(\psi) = \frac{\gamma}{2} \mathbb{E}_{p_\mathcal{D}(x)} \Big[\Vert \nabla_x D_\psi(x)^2 \Vert\Big]$$ diff --git a/labml_nn/gan/stylegan/readme.md b/labml_nn/gan/stylegan/readme.md index f4834363..eaadcbea 100644 --- a/labml_nn/gan/stylegan/readme.md +++ b/labml_nn/gan/stylegan/readme.md @@ -1,10 +1,10 @@ # [StyleGAN 2](https://nn.labml.ai/gan/stylegan/index.html) This is a [PyTorch](https://pytorch.org) implementation of the paper - [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958) + [Analyzing and Improving the Image Quality of StyleGAN](https://papers.labml.ai/paper/1912.04958) which introduces **StyleGAN2**. StyleGAN 2 is an improvement over **StyleGAN** from the paper - [A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948). + [A Style-Based Generator Architecture for Generative Adversarial Networks](https://papers.labml.ai/paper/1812.04948). And StyleGAN is based on **Progressive GAN** from the paper - [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196). + [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://papers.labml.ai/paper/1710.10196). All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI). diff --git a/labml_nn/gan/wasserstein/__init__.py b/labml_nn/gan/wasserstein/__init__.py index 496639d1..84128ece 100644 --- a/labml_nn/gan/wasserstein/__init__.py +++ b/labml_nn/gan/wasserstein/__init__.py @@ -7,7 +7,7 @@ summary: A simple PyTorch implementation/tutorial of Wasserstein Generative Adve # Wasserstein GAN (WGAN) This is an implementation of -[Wasserstein GAN](https://arxiv.org/abs/1701.07875). +[Wasserstein GAN](https://papers.labml.ai/paper/1701.07875). The original GAN loss is based on Jensen-Shannon (JS) divergence between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$. diff --git a/labml_nn/gan/wasserstein/gradient_penalty/__init__.py b/labml_nn/gan/wasserstein/gradient_penalty/__init__.py index e6848256..c68089b3 100644 --- a/labml_nn/gan/wasserstein/gradient_penalty/__init__.py +++ b/labml_nn/gan/wasserstein/gradient_penalty/__init__.py @@ -9,7 +9,7 @@ summary: > # Gradient Penalty for Wasserstein GAN (WGAN-GP) This is an implementation of -[Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028). +[Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028). 
[WGAN](../index.html) suggests clipping weights to enforce Lipschitz constraint on the discriminator network (critic). @@ -19,7 +19,7 @@ L1, L2 weight decay have problems: 1. Limiting the capacity of the discriminator 2. Exploding and vanishing gradients (without [Batch Normalization](../../../normalization/batch_norm/index.html)). -The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028) +The paper [Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028) proposal a better way to improve Lipschitz constraint, a gradient penalty. $$\mathcal{L}_{GP} = \lambda \underset{\hat{x} \sim \mathbb{P}_{\hat{x}}}{\mathbb{E}} diff --git a/labml_nn/gan/wasserstein/gradient_penalty/readme.md b/labml_nn/gan/wasserstein/gradient_penalty/readme.md index a5bee4f5..bafdb642 100644 --- a/labml_nn/gan/wasserstein/gradient_penalty/readme.md +++ b/labml_nn/gan/wasserstein/gradient_penalty/readme.md @@ -1,7 +1,7 @@ # [Gradient Penalty for Wasserstein GAN (WGAN-GP)](https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html) This is an implementation of -[Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028). +[Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028). [WGAN](https://nn.labml.ai/gan/wasserstein/index.html) suggests clipping weights to enforce Lipschitz constraint @@ -12,5 +12,5 @@ L1, L2 weight decay have problems: 1. Limiting the capacity of the discriminator 2. Exploding and vanishing gradients (without [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)). -The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028) +The paper [Improved Training of Wasserstein GANs](https://papers.labml.ai/paper/1704.00028) proposal a better way to improve Lipschitz constraint, a gradient penalty. diff --git a/labml_nn/gan/wasserstein/readme.md b/labml_nn/gan/wasserstein/readme.md index fab3e927..05596c60 100644 --- a/labml_nn/gan/wasserstein/readme.md +++ b/labml_nn/gan/wasserstein/readme.md @@ -1,4 +1,4 @@ # [Wasserstein GAN - WGAN](https://nn.labml.ai/gan/wasserstein/index.html) This is an implementation of -[Wasserstein GAN](https://arxiv.org/abs/1701.07875). +[Wasserstein GAN](https://papers.labml.ai/paper/1701.07875). diff --git a/labml_nn/graphs/gat/__init__.py b/labml_nn/graphs/gat/__init__.py index 16fb98b1..218faa4c 100644 --- a/labml_nn/graphs/gat/__init__.py +++ b/labml_nn/graphs/gat/__init__.py @@ -8,7 +8,7 @@ summary: > # Graph Attention Networks (GAT) This is a [PyTorch](https://pytorch.org) implementation of the paper -[Graph Attention Networks](https://arxiv.org/abs/1710.10903). +[Graph Attention Networks](https://papers.labml.ai/paper/1710.10903). GATs work on graph data. A graph consists of nodes and edges connecting nodes. diff --git a/labml_nn/graphs/gat/readme.md b/labml_nn/graphs/gat/readme.md index a79172af..247db87b 100644 --- a/labml_nn/graphs/gat/readme.md +++ b/labml_nn/graphs/gat/readme.md @@ -1,7 +1,7 @@ # [Graph Attention Networks (GAT)](https://nn.labml.ai/graphs/gat/index.html) This is a [PyTorch](https://pytorch.org) implementation of the paper -[Graph Attention Networks](https://arxiv.org/abs/1710.10903). +[Graph Attention Networks](https://papers.labml.ai/paper/1710.10903). GATs work on graph data. A graph consists of nodes and edges connecting nodes. 
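The gradient penalty term $\mathcal{L}_{GP}$ quoted in the WGAN-GP hunks above can be illustrated with a short sketch. This is a minimal illustration under assumed names (`critic`, `lambda_gp`, the interpolation factor `eps`), not the repository's `GradientPenalty` module: it penalizes the squared deviation of the critic's gradient norm from one, evaluated at points interpolated between real and generated samples.

```python
import torch

def gradient_penalty(critic, real, fake, lambda_gp=10.0):
    """Minimal sketch of the WGAN-GP penalty: penalize the critic's gradient
    norm at points interpolated between real and fake samples."""
    batch_size = real.shape[0]
    # Random interpolation factor per sample, broadcast over the remaining dims
    eps = torch.rand(batch_size, *([1] * (real.dim() - 1)), device=real.device)
    x_hat = (eps * real + (1 - eps) * fake).requires_grad_(True)
    # Critic score for the interpolated samples
    d_hat = critic(x_hat)
    # Gradient of the critic output w.r.t. the interpolated input
    grads, = torch.autograd.grad(outputs=d_hat.sum(), inputs=x_hat, create_graph=True)
    grad_norm = grads.reshape(batch_size, -1).norm(2, dim=1)
    # lambda * E[(||grad|| - 1)^2]
    return lambda_gp * ((grad_norm - 1) ** 2).mean()
```

In a training loop this term would simply be added to the critic loss before the backward pass.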
diff --git a/labml_nn/graphs/gatv2/__init__.py b/labml_nn/graphs/gatv2/__init__.py index c6d3841a..ae1befe7 100644 --- a/labml_nn/graphs/gatv2/__init__.py +++ b/labml_nn/graphs/gatv2/__init__.py @@ -6,7 +6,7 @@ summary: > --- # Graph Attention Networks v2 (GATv2) This is a [PyTorch](https://pytorch.org) implementation of the GATv2 operator from the paper -[How Attentive are Graph Attention Networks?](https://arxiv.org/abs/2105.14491). +[How Attentive are Graph Attention Networks?](https://papers.labml.ai/paper/2105.14491). GATv2s work on graph data similar to [GAT](../gat/index.html). A graph consists of nodes and edges connecting nodes. diff --git a/labml_nn/graphs/gatv2/readme.md b/labml_nn/graphs/gatv2/readme.md index d7a4692c..7b1d33ce 100644 --- a/labml_nn/graphs/gatv2/readme.md +++ b/labml_nn/graphs/gatv2/readme.md @@ -1,7 +1,7 @@ # [Graph Attention Networks v2 (GATv2)](https://nn.labml.ai/graphs/gatv2/index.html) This is a [PyTorch](https://pytorch.org) implementation of the GATv2 operator from the paper -[How Attentive are Graph Attention Networks?](https://arxiv.org/abs/2105.14491). +[How Attentive are Graph Attention Networks?](https://papers.labml.ai/paper/2105.14491). GATv2s work on graph data. A graph consists of nodes and edges connecting nodes. diff --git a/labml_nn/hypernetworks/hyper_lstm.py b/labml_nn/hypernetworks/hyper_lstm.py index d39137dd..ae911858 100644 --- a/labml_nn/hypernetworks/hyper_lstm.py +++ b/labml_nn/hypernetworks/hyper_lstm.py @@ -7,7 +7,7 @@ summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper Hype # HyperNetworks - HyperLSTM We have implemented HyperLSTM introduced in paper -[HyperNetworks](https://arxiv.org/abs/1609.09106), with annotations +[HyperNetworks](https://papers.labml.ai/paper/1609.09106), with annotations using [PyTorch](https://pytorch.org). [This blog post](https://blog.otoro.net/2016/09/28/hyper-networks/) by David Ha gives a good explanation of HyperNetworks. diff --git a/labml_nn/normalization/batch_channel_norm/__init__.py b/labml_nn/normalization/batch_channel_norm/__init__.py index 7617ea8a..e2d755c1 100644 --- a/labml_nn/normalization/batch_channel_norm/__init__.py +++ b/labml_nn/normalization/batch_channel_norm/__init__.py @@ -8,7 +8,7 @@ summary: > # Batch-Channel Normalization This is a [PyTorch](https://pytorch.org) implementation of Batch-Channel Normalization from the paper - [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). + [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://papers.labml.ai/paper/1903.10520). We also have an [annotated implementation of Weight Standardization](../weight_standardization/index.html). Batch-Channel Normalization performs batch normalization followed diff --git a/labml_nn/normalization/batch_norm/__init__.py b/labml_nn/normalization/batch_norm/__init__.py index 100c96cc..0a825a46 100644 --- a/labml_nn/normalization/batch_norm/__init__.py +++ b/labml_nn/normalization/batch_norm/__init__.py @@ -8,7 +8,7 @@ summary: > # Batch Normalization This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167). + [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://papers.labml.ai/paper/1502.03167). 
### Internal Covariate Shift diff --git a/labml_nn/normalization/batch_norm/readme.md b/labml_nn/normalization/batch_norm/readme.md index ece7ba8d..ef8e9849 100644 --- a/labml_nn/normalization/batch_norm/readme.md +++ b/labml_nn/normalization/batch_norm/readme.md @@ -1,7 +1,7 @@ # [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper - [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167). + [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://papers.labml.ai/paper/1502.03167). ### Internal Covariate Shift diff --git a/labml_nn/normalization/group_norm/__init__.py b/labml_nn/normalization/group_norm/__init__.py index ec48d5c6..be9ca455 100644 --- a/labml_nn/normalization/group_norm/__init__.py +++ b/labml_nn/normalization/group_norm/__init__.py @@ -8,7 +8,7 @@ summary: > # Group Normalization This is a [PyTorch](https://pytorch.org) implementation of -the [Group Normalization](https://arxiv.org/abs/1803.08494) paper. +the [Group Normalization](https://papers.labml.ai/paper/1803.08494) paper. [Batch Normalization](../batch_norm/index.html) works well for large enough batch sizes but not well for small batch sizes, because it normalizes over the batch. diff --git a/labml_nn/normalization/group_norm/readme.md b/labml_nn/normalization/group_norm/readme.md index f6f2c269..388f93d4 100644 --- a/labml_nn/normalization/group_norm/readme.md +++ b/labml_nn/normalization/group_norm/readme.md @@ -1,7 +1,7 @@ # [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html) This is a [PyTorch](https://pytorch.org) implementation of -the [Group Normalization](https://arxiv.org/abs/1803.08494) paper. +the [Group Normalization](https://papers.labml.ai/paper/1803.08494) paper. [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) works well for large enough batch sizes but not well for small batch sizes, because it normalizes over the batch. diff --git a/labml_nn/normalization/instance_norm/__init__.py b/labml_nn/normalization/instance_norm/__init__.py index a0d6bbda..61a4afe2 100644 --- a/labml_nn/normalization/instance_norm/__init__.py +++ b/labml_nn/normalization/instance_norm/__init__.py @@ -8,7 +8,7 @@ summary: > # Instance Normalization This is a [PyTorch](https://pytorch.org) implementation of -[Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). +[Instance Normalization: The Missing Ingredient for Fast Stylization](https://papers.labml.ai/paper/1607.08022). Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer). It is based on the observation that stylization should not depend on the contrast of the content image. diff --git a/labml_nn/normalization/instance_norm/readme.md b/labml_nn/normalization/instance_norm/readme.md index a67cd54b..5af6e61b 100644 --- a/labml_nn/normalization/instance_norm/readme.md +++ b/labml_nn/normalization/instance_norm/readme.md @@ -1,7 +1,7 @@ # [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html) This is a [PyTorch](https://pytorch.org) implementation of -[Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). 
+[Instance Normalization: The Missing Ingredient for Fast Stylization](https://papers.labml.ai/paper/1607.08022). Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer). It is based on the observation that stylization should not depend on the contrast of the content image. diff --git a/labml_nn/normalization/layer_norm/__init__.py b/labml_nn/normalization/layer_norm/__init__.py index e4bcabc6..6120ab19 100644 --- a/labml_nn/normalization/layer_norm/__init__.py +++ b/labml_nn/normalization/layer_norm/__init__.py @@ -8,7 +8,7 @@ summary: > # Layer Normalization This is a [PyTorch](https://pytorch.org) implementation of -[Layer Normalization](https://arxiv.org/abs/1607.06450). +[Layer Normalization](https://papers.labml.ai/paper/1607.06450). ### Limitations of [Batch Normalization](../batch_norm/index.html) diff --git a/labml_nn/normalization/layer_norm/readme.md b/labml_nn/normalization/layer_norm/readme.md index 456a2f8d..98122ff0 100644 --- a/labml_nn/normalization/layer_norm/readme.md +++ b/labml_nn/normalization/layer_norm/readme.md @@ -1,7 +1,7 @@ # [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html) This is a [PyTorch](https://pytorch.org) implementation of -[Layer Normalization](https://arxiv.org/abs/1607.06450). +[Layer Normalization](https://papers.labml.ai/paper/1607.06450). ### Limitations of [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) diff --git a/labml_nn/normalization/weight_standardization/__init__.py b/labml_nn/normalization/weight_standardization/__init__.py index 6756f50a..9f74dffd 100644 --- a/labml_nn/normalization/weight_standardization/__init__.py +++ b/labml_nn/normalization/weight_standardization/__init__.py @@ -8,7 +8,7 @@ summary: > # Weight Standardization This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper - [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). + [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://papers.labml.ai/paper/1903.10520). We also have an [annotated implementation of Batch-Channel Normalization](../batch_channel_norm/index.html). Batch normalization **gives a smooth loss landscape** and @@ -36,7 +36,7 @@ inputs. So as long as the inputs are normally distributed the outputs remain clo This avoids outputs of nodes from always falling beyond the active range of the activation function (e.g. always negative input for a ReLU). -*[Refer to the paper for proofs](https://arxiv.org/abs/1903.10520)*. +*[Refer to the paper for proofs](https://papers.labml.ai/paper/1903.10520)*. Here is [the training code](experiment.html) for training a VGG network that uses weight standardization to classify CIFAR-10 data. diff --git a/labml_nn/normalization/weight_standardization/readme.md b/labml_nn/normalization/weight_standardization/readme.md index 52eb2109..4609e832 100644 --- a/labml_nn/normalization/weight_standardization/readme.md +++ b/labml_nn/normalization/weight_standardization/readme.md @@ -1,6 +1,6 @@ # [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html) This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper - [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). 
+ [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://papers.labml.ai/paper/1903.10520). We also have an [annotated implementation of Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html). diff --git a/labml_nn/optimizers/ada_belief.py b/labml_nn/optimizers/ada_belief.py index 9aac87d7..11fd13e9 100644 --- a/labml_nn/optimizers/ada_belief.py +++ b/labml_nn/optimizers/ada_belief.py @@ -9,7 +9,7 @@ summary: A simple PyTorch implementation/tutorial of AdaBelief optimizer. This is based from AdaBelief [official implementation](https://github.com/juntang-zhuang/Adabelief-Optimizer) of the paper -[AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients](https://arxiv.org/abs/2010.07468). +[AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients](https://papers.labml.ai/paper/2010.07468). This is implemented in [PyTorch](https://pytorch.org) as an extension to [RAdam](radam.html). diff --git a/labml_nn/optimizers/adam.py b/labml_nn/optimizers/adam.py index 304d0612..4bf101c3 100644 --- a/labml_nn/optimizers/adam.py +++ b/labml_nn/optimizers/adam.py @@ -7,7 +7,7 @@ summary: A simple PyTorch implementation/tutorial of Adam optimizer # Adam Optimizer This is a [PyTorch](https://pytorch.org) implementation of popular optimizer *Adam* from paper - [Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980v9). + [Adam: A Method for Stochastic Optimization](https://papers.labml.ai/paper/1412.6980v9). *Adam* update is, diff --git a/labml_nn/optimizers/amsgrad.py b/labml_nn/optimizers/amsgrad.py index 2f4d2b49..724e3e4b 100644 --- a/labml_nn/optimizers/amsgrad.py +++ b/labml_nn/optimizers/amsgrad.py @@ -7,7 +7,7 @@ summary: A simple PyTorch implementation/tutorial of AMSGrad optimizer. # AMSGrad This is a [PyTorch](https://pytorch.org) implementation of the paper -[On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237). +[On the Convergence of Adam and Beyond](https://papers.labml.ai/paper/1904.09237). We implement this as an extension to our [Adam optimizer implementation](adam.html). The implementation it self is really small since it's very similar to Adam. diff --git a/labml_nn/optimizers/noam.py b/labml_nn/optimizers/noam.py index 8443f881..7ac750a3 100644 --- a/labml_nn/optimizers/noam.py +++ b/labml_nn/optimizers/noam.py @@ -9,7 +9,7 @@ summary: > # Noam Optimizer This is the [PyTorch](https://pytorch.org) implementation of optimizer introduced in the paper -[Attention Is All You Need](https://arxiv.org/abs/1706.03762). +[Attention Is All You Need](https://papers.labml.ai/paper/1706.03762). """ from typing import Dict diff --git a/labml_nn/optimizers/radam.py b/labml_nn/optimizers/radam.py index 74272264..97863746 100644 --- a/labml_nn/optimizers/radam.py +++ b/labml_nn/optimizers/radam.py @@ -9,7 +9,7 @@ summary: A simple PyTorch implementation/tutorial of RAdam optimizer. This implementation is based on [the official implementation](https://github.com/LiyuanLucasLiu/RAdam) of the paper -[On the Variance of the Adaptive Learning Rate and Beyond](https://arxiv.org/abs/1908.03265). +[On the Variance of the Adaptive Learning Rate and Beyond](https://papers.labml.ai/paper/1908.03265). 
We have implemented it in [PyTorch](https://pytorch.org) as an extension to [our AMSGrad implementation](amsgrad.html) diff --git a/labml_nn/recurrent_highway_networks/__init__.py b/labml_nn/recurrent_highway_networks/__init__.py index 8b9b6331..584741b5 100644 --- a/labml_nn/recurrent_highway_networks/__init__.py +++ b/labml_nn/recurrent_highway_networks/__init__.py @@ -6,7 +6,7 @@ summary: A simple PyTorch implementation/tutorial of Recurrent Highway Networks. # Recurrent Highway Networks -This is a [PyTorch](https://pytorch.org) implementation of [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474). +This is a [PyTorch](https://pytorch.org) implementation of [Recurrent Highway Networks](https://papers.labml.ai/paper/1607.03474). """ from typing import Optional diff --git a/labml_nn/resnet/__init__.py b/labml_nn/resnet/__init__.py index d5a00773..ed337bb6 100644 --- a/labml_nn/resnet/__init__.py +++ b/labml_nn/resnet/__init__.py @@ -8,7 +8,7 @@ summary: > # Deep Residual Learning for Image Recognition (ResNet) This is a [PyTorch](https://pytorch.org) implementation of the paper -[Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). +[Deep Residual Learning for Image Recognition](https://papers.labml.ai/paper/1512.03385). ResNets train layers as residual functions to overcome the *degradation problem*. diff --git a/labml_nn/resnet/readme.md b/labml_nn/resnet/readme.md index 1d93253b..41c94947 100644 --- a/labml_nn/resnet/readme.md +++ b/labml_nn/resnet/readme.md @@ -1,7 +1,7 @@ # [Deep Residual Learning for Image Recognition (ResNet)](https://nn.labml.ai/resnet/index.html) This is a [PyTorch](https://pytorch.org) implementation of the paper -[Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). +[Deep Residual Learning for Image Recognition](https://papers.labml.ai/paper/1512.03385). ResNets train layers as residual functions to overcome the *degradation problem*. diff --git a/labml_nn/rl/dqn/__init__.py b/labml_nn/rl/dqn/__init__.py index 2c2e449c..61b99817 100644 --- a/labml_nn/rl/dqn/__init__.py +++ b/labml_nn/rl/dqn/__init__.py @@ -12,7 +12,7 @@ summary: > # Deep Q Networks (DQN) This is a [PyTorch](https://pytorch.org) implementation of paper - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602) + [Playing Atari with Deep Reinforcement Learning](https://papers.labml.ai/paper/1312.5602) along with [Dueling Network](model.html), [Prioritized Replay](replay_buffer.html) and Double Q Network. @@ -79,7 +79,7 @@ class QFuncLoss(Module): \color{cyan}{Q}(s', a'; \color{cyan}{\theta}); \color{cyan}{\theta} \Big) $$ - We use [double Q-learning](https://arxiv.org/abs/1509.06461), where + We use [double Q-learning](https://papers.labml.ai/paper/1509.06461), where the $\operatorname{argmax}$ is taken from $\color{cyan}{\theta_i}$ and the value is taken from $\color{orange}{\theta_i^{-}}$. diff --git a/labml_nn/rl/dqn/model.py b/labml_nn/rl/dqn/model.py index 83858ad5..3901fe34 100644 --- a/labml_nn/rl/dqn/model.py +++ b/labml_nn/rl/dqn/model.py @@ -17,7 +17,7 @@ class Model(Module): """ ## Dueling Network ⚔️ Model for $Q$ Values - We are using a [dueling network](https://arxiv.org/abs/1511.06581) + We are using a [dueling network](https://papers.labml.ai/paper/1511.06581) to calculate Q-values. 
Intuition behind dueling network architecture is that in most states the action doesn't matter, diff --git a/labml_nn/rl/dqn/replay_buffer.py b/labml_nn/rl/dqn/replay_buffer.py index c0d4371a..f52e7e38 100644 --- a/labml_nn/rl/dqn/replay_buffer.py +++ b/labml_nn/rl/dqn/replay_buffer.py @@ -6,7 +6,7 @@ summary: Annotated implementation of prioritized experience replay using a binar # Prioritized Experience Replay Buffer -This implements paper [Prioritized experience replay](https://arxiv.org/abs/1511.05952), +This implements paper [Prioritized experience replay](https://papers.labml.ai/paper/1511.05952), using a binary segment tree. """ @@ -19,7 +19,7 @@ class ReplayBuffer: """ ## Buffer for Prioritized Experience Replay - [Prioritized experience replay](https://arxiv.org/abs/1511.05952) + [Prioritized experience replay](https://papers.labml.ai/paper/1511.05952) samples important transitions more frequently. The transitions are prioritized by the Temporal Difference error (td error), $\delta$. diff --git a/labml_nn/rl/ppo/__init__.py b/labml_nn/rl/ppo/__init__.py index 9a99c4cb..2f119eb3 100644 --- a/labml_nn/rl/ppo/__init__.py +++ b/labml_nn/rl/ppo/__init__.py @@ -8,7 +8,7 @@ summary: > # Proximal Policy Optimization - PPO This is a [PyTorch](https://pytorch.org) implementation of -[Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347). +[Proximal Policy Optimization - PPO](https://papers.labml.ai/paper/1707.06347). PPO is a policy gradient method for reinforcement learning. Simple policy gradient methods do a single gradient update per sample (or a set of samples). @@ -112,7 +112,7 @@ class ClippedPPOLoss(Module): The error we introduce to $J(\pi_\theta) - J(\pi_{\theta_{OLD}})$ by this assumption is bound by the KL divergence between $\pi_\theta$ and $\pi_{\theta_{OLD}}$. - [Constrained Policy Optimization](https://arxiv.org/abs/1705.10528) + [Constrained Policy Optimization](https://papers.labml.ai/paper/1705.10528) shows the proof of this. I haven't read it. diff --git a/labml_nn/rl/ppo/gae.py b/labml_nn/rl/ppo/gae.py index 1b2361b4..0da1c273 100644 --- a/labml_nn/rl/ppo/gae.py +++ b/labml_nn/rl/ppo/gae.py @@ -7,7 +7,7 @@ summary: A PyTorch implementation/tutorial of Generalized Advantage Estimation ( # Generalized Advantage Estimation (GAE) This is a [PyTorch](https://pytorch.org) implementation of paper -[Generalized Advantage Estimation](https://arxiv.org/abs/1506.02438). +[Generalized Advantage Estimation](https://papers.labml.ai/paper/1506.02438). You can find an experiment that uses it [here](experiment.html). """ diff --git a/labml_nn/rl/ppo/readme.md b/labml_nn/rl/ppo/readme.md index 63c219d3..2f51976d 100644 --- a/labml_nn/rl/ppo/readme.md +++ b/labml_nn/rl/ppo/readme.md @@ -1,7 +1,7 @@ # [Proximal Policy Optimization - PPO](https://nn.labml.ai/rl/ppo/index.html) This is a [PyTorch](https://pytorch.org) implementation of -[Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347). +[Proximal Policy Optimization - PPO](https://papers.labml.ai/paper/1707.06347). PPO is a policy gradient method for reinforcement learning. Simple policy gradient methods one do a single gradient update per sample (or a set of samples). 
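The clipped surrogate objective discussed in the `ClippedPPOLoss` hunk above can be sketched in a few lines. This is an illustrative snippet under assumed tensor shapes (per-sample log-probabilities and advantages) and an assumed hyperparameter name `clip_eps`, not the repository's implementation.

```python
import torch

def clipped_ppo_loss(log_pi: torch.Tensor, log_pi_old: torch.Tensor,
                     advantage: torch.Tensor, clip_eps: float = 0.2) -> torch.Tensor:
    """Minimal sketch of the PPO clipped surrogate objective (to be minimized)."""
    # Probability ratio r = pi_theta(a|s) / pi_theta_old(a|s)
    ratio = torch.exp(log_pi - log_pi_old)
    # Unclipped and clipped surrogate terms
    surr1 = ratio * advantage
    surr2 = ratio.clamp(1.0 - clip_eps, 1.0 + clip_eps) * advantage
    # Take the pessimistic (minimum) of the two and negate to get a loss
    return -torch.min(surr1, surr2).mean()
```

Taking the minimum keeps the update conservative: the ratio only helps the objective while it stays inside the clipping range.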
diff --git a/labml_nn/sketch_rnn/__init__.py b/labml_nn/sketch_rnn/__init__.py index ef153e53..c3caf948 100644 --- a/labml_nn/sketch_rnn/__init__.py +++ b/labml_nn/sketch_rnn/__init__.py @@ -9,7 +9,7 @@ summary: > # Sketch RNN This is an annotated [PyTorch](https://pytorch.org) implementation of the paper -[A Neural Representation of Sketch Drawings](https://arxiv.org/abs/1704.03477). +[A Neural Representation of Sketch Drawings](https://papers.labml.ai/paper/1704.03477). Sketch RNN is a sequence-to-sequence variational auto-encoder. Both encoder and decoder are recurrent neural network models. diff --git a/labml_nn/transformers/__init__.py b/labml_nn/transformers/__init__.py index 37c2ba99..d3bed6d4 100644 --- a/labml_nn/transformers/__init__.py +++ b/labml_nn/transformers/__init__.py @@ -10,7 +10,7 @@ summary: > This module contains [PyTorch](https://pytorch.org/) implementations and explanations of original transformer -from paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762), +from paper [Attention Is All You Need](https://papers.labml.ai/paper/1706.03762), and derivatives and enhancements of it. * [Multi-head attention](mha.html) @@ -34,34 +34,34 @@ This is an implementation of GPT-2 architecture. ## [GLU Variants](glu_variants/simple.html) This is an implementation of the paper -[GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202). +[GLU Variants Improve Transformer](https://papers.labml.ai/paper/2002.05202). ## [kNN-LM](knn/index.html) This is an implementation of the paper -[Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172). +[Generalization through Memorization: Nearest Neighbor Language Models](https://papers.labml.ai/paper/1911.00172). ## [Feedback Transformer](feedback/index.html) This is an implementation of the paper -[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). +[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://papers.labml.ai/paper/2002.09402). ## [Switch Transformer](switch/index.html) This is a miniature implementation of the paper -[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961). +[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://papers.labml.ai/paper/2101.03961). Our implementation only has a few million parameters and doesn't do model parallel distributed training. It does single GPU training but we implement the concept of switching as described in the paper. ## [Fast Weights Transformer](fast_weights/index.html) This is an implementation of the paper -[Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174). +[Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://papers.labml.ai/paper/2102.11174). ## [FNet: Mixing Tokens with Fourier Transforms](fnet/index.html) This is an implementation of the paper -[FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). +[FNet: Mixing Tokens with Fourier Transforms](https://papers.labml.ai/paper/2105.03824). 
## [Attention Free Transformer](aft/index.html) @@ -71,7 +71,7 @@ This is an implementation of the paper ## [Masked Language Model](mlm/index.html) This is an implementation of Masked Language Model used for pre-training in paper -[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). +[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://papers.labml.ai/paper/1810.04805). ## [MLP-Mixer: An all-MLP Architecture for Vision](mlp_mixer/index.html) @@ -86,7 +86,7 @@ This is an implementation of the paper ## [Vision Transformer (ViT)](vit/index.html) This is an implementation of the paper -[An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale](https://arxiv.org/abs/2010.11929). +[An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale](https://papers.labml.ai/paper/2010.11929). """ from .configs import TransformerConfigs diff --git a/labml_nn/transformers/basic/autoregressive_experiment.py b/labml_nn/transformers/basic/autoregressive_experiment.py index 6dbb92e1..0ebddf9b 100644 --- a/labml_nn/transformers/basic/autoregressive_experiment.py +++ b/labml_nn/transformers/basic/autoregressive_experiment.py @@ -7,7 +7,7 @@ summary: > # Transformer Auto-Regression Experiment -This trains a simple transformer introduced in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) +This trains a simple transformer introduced in [Attention Is All You Need](https://papers.labml.ai/paper/1706.03762) on an NLP auto-regression task (with Tiny Shakespeare dataset). """ diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py index 846e1755..78ca5330 100644 --- a/labml_nn/transformers/compressive/__init__.py +++ b/labml_nn/transformers/compressive/__init__.py @@ -9,7 +9,7 @@ summary: > # Compressive Transformer This is an implementation of -[Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/abs/1911.05507) +[Compressive Transformers for Long-Range Sequence Modelling](https://papers.labml.ai/paper/1911.05507) in [PyTorch](https://pytorch.org). This is an extension of [Transformer XL](../xl/index.html) where past memories diff --git a/labml_nn/transformers/compressive/readme.md b/labml_nn/transformers/compressive/readme.md index e0ea82e4..9be2989c 100644 --- a/labml_nn/transformers/compressive/readme.md +++ b/labml_nn/transformers/compressive/readme.md @@ -1,7 +1,7 @@ # [Compressive Transformer](https://nn.labml.ai/transformers/compressive/index.html) This is an implementation of -[Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/abs/1911.05507) +[Compressive Transformers for Long-Range Sequence Modelling](https://papers.labml.ai/paper/1911.05507) in [PyTorch](https://pytorch.org). This is an extension of [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) where past memories diff --git a/labml_nn/transformers/configs.py b/labml_nn/transformers/configs.py index a563c627..27788d18 100644 --- a/labml_nn/transformers/configs.py +++ b/labml_nn/transformers/configs.py @@ -66,7 +66,7 @@ def _ffn_activation_gelu(): $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ - It was introduced in paper [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). + It was introduced in paper [Gaussian Error Linear Units](https://papers.labml.ai/paper/1606.08415). 
""" return nn.GELU() @@ -86,7 +86,7 @@ def _feed_forward(c: FeedForwardConfigs): # ## GLU Variants # These are variants with gated hidden layers for the FFN -# as introduced in paper [GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202). +# as introduced in paper [GLU Variants Improve Transformer](https://papers.labml.ai/paper/2002.05202). # We have omitted the bias terms as specified in the paper. # ### FFN with Gated Linear Units diff --git a/labml_nn/transformers/fast_weights/__init__.py b/labml_nn/transformers/fast_weights/__init__.py index 8c539ab3..279cd42c 100644 --- a/labml_nn/transformers/fast_weights/__init__.py +++ b/labml_nn/transformers/fast_weights/__init__.py @@ -9,7 +9,7 @@ summary: > # Fast weights transformer The paper -[Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174) +[Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://papers.labml.ai/paper/2102.11174) finds similarities between linear self-attention and fast weight systems and makes modifications to self-attention update rule based on that. It also introduces a simpler, yet effective kernel function. diff --git a/labml_nn/transformers/fast_weights/readme.md b/labml_nn/transformers/fast_weights/readme.md index 0addfefc..3b345134 100644 --- a/labml_nn/transformers/fast_weights/readme.md +++ b/labml_nn/transformers/fast_weights/readme.md @@ -1,7 +1,7 @@ # [Fast weights transformer](https://nn.labml.ai/transformers/fast_weights/index.html) This is an annotated implementation of the paper -[Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174). +[Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://papers.labml.ai/paper/2102.11174). Here is the [annotated implementation](https://nn.labml.ai/transformers/fast_weights/index.html). Here are [the training code](https://nn.labml.ai/transformers/fast_weights/experiment.html) diff --git a/labml_nn/transformers/feed_forward.py b/labml_nn/transformers/feed_forward.py index fce34ed5..4d64df4c 100644 --- a/labml_nn/transformers/feed_forward.py +++ b/labml_nn/transformers/feed_forward.py @@ -28,7 +28,7 @@ $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ ### Gated Linear Units This is a generic implementation that supports different variants including -[Gated Linear Units](https://arxiv.org/abs/2002.05202) (GLU). +[Gated Linear Units](https://papers.labml.ai/paper/2002.05202) (GLU). We have also implemented experiments on these: * [experiment that uses `labml.configs`](glu_variants/experiment.html) diff --git a/labml_nn/transformers/feedback/__init__.py b/labml_nn/transformers/feedback/__init__.py index 6ec3a2b2..c7a22c5c 100644 --- a/labml_nn/transformers/feedback/__init__.py +++ b/labml_nn/transformers/feedback/__init__.py @@ -8,7 +8,7 @@ summary: > # Feedback Transformer This is a [PyTorch](https://pytorch.org) implementation of the paper -[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). +[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://papers.labml.ai/paper/2002.09402). Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. 
diff --git a/labml_nn/transformers/feedback/readme.md b/labml_nn/transformers/feedback/readme.md index f6b2dc78..f8d51478 100644 --- a/labml_nn/transformers/feedback/readme.md +++ b/labml_nn/transformers/feedback/readme.md @@ -1,7 +1,7 @@ # [Feedback Transformer](https://nn.labml.ai/transformers/feedback/index.html) This is a [PyTorch](https://pytorch.org) implementation of the paper -[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). +[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://papers.labml.ai/paper/2002.09402). Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. diff --git a/labml_nn/transformers/fnet/__init__.py b/labml_nn/transformers/fnet/__init__.py index 4b123f37..7c160c8a 100644 --- a/labml_nn/transformers/fnet/__init__.py +++ b/labml_nn/transformers/fnet/__init__.py @@ -8,7 +8,7 @@ summary: > # FNet: Mixing Tokens with Fourier Transforms This is a [PyTorch](https://pytorch.org) implementation of the paper -[FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). +[FNet: Mixing Tokens with Fourier Transforms](https://papers.labml.ai/paper/2105.03824). This paper replaces the [self-attention layer](../mha.html) with two [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to diff --git a/labml_nn/transformers/fnet/readme.md b/labml_nn/transformers/fnet/readme.md index 1ef7a74e..dc2d4465 100644 --- a/labml_nn/transformers/fnet/readme.md +++ b/labml_nn/transformers/fnet/readme.md @@ -1,7 +1,7 @@ # [FNet: Mixing Tokens with Fourier Transforms](https://nn.labml.ai/transformers/fnet/index.html) This is a [PyTorch](https://pytorch.org) implementation of the paper -[FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). +[FNet: Mixing Tokens with Fourier Transforms](https://papers.labml.ai/paper/2105.03824). This paper replaces the [self-attention layer](https://nn.labml.ai/transformers//mha.html) with two [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to diff --git a/labml_nn/transformers/knn/__init__.py b/labml_nn/transformers/knn/__init__.py index 3e38f66d..a854d9c9 100644 --- a/labml_nn/transformers/knn/__init__.py +++ b/labml_nn/transformers/knn/__init__.py @@ -12,7 +12,7 @@ summary: > # k-Nearest Neighbor Language Models This is a [PyTorch](https://pytorch.org) implementation of the paper - [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172). + [Generalization through Memorization: Nearest Neighbor Language Models](https://papers.labml.ai/paper/1911.00172). It uses k-nearest neighbors to improve perplexity of autoregressive transformer models. An autoregressive language model estimates $p(w_t | \color{yellowgreen}{c_t})$, diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py index 39346752..699ef548 100644 --- a/labml_nn/transformers/mha.py +++ b/labml_nn/transformers/mha.py @@ -9,7 +9,7 @@ summary: > # Multi-Headed Attention (MHA) This is a tutorial/implementation of multi-headed attention -from paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) +from paper [Attention Is All You Need](https://papers.labml.ai/paper/1706.03762) in [PyTorch](https://pytorch.org/). The implementation is inspired from [Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html). 
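As a companion to the `mha.py` hunk above, here is a hedged sketch of scaled dot-product multi-head attention. The tensor layout (`[seq, batch, d_model]`) and class name are assumptions for illustration; the repository's class additionally handles masking, dropout, and separate key/query/value inputs.

```python
import math
import torch
from torch import nn

class SimpleMHA(nn.Module):
    """Sketch of multi-head scaled dot-product self-attention (no masking)."""

    def __init__(self, d_model: int, heads: int):
        super().__init__()
        assert d_model % heads == 0
        self.heads = heads
        self.d_k = d_model // heads
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [seq_len, batch_size, d_model]
        seq_len, batch_size, _ = x.shape
        # Project and split into heads: [seq, batch, heads, d_k]
        q = self.q_proj(x).view(seq_len, batch_size, self.heads, self.d_k)
        k = self.k_proj(x).view(seq_len, batch_size, self.heads, self.d_k)
        v = self.v_proj(x).view(seq_len, batch_size, self.heads, self.d_k)
        # Attention weights: softmax(Q K^T / sqrt(d_k)) over the key positions
        scores = torch.einsum('ibhd,jbhd->ijbh', q, k) / math.sqrt(self.d_k)
        attn = scores.softmax(dim=1)
        # Weighted sum of values, then merge heads back to d_model
        out = torch.einsum('ijbh,jbhd->ibhd', attn, v)
        return self.out(out.reshape(seq_len, batch_size, -1))
```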
diff --git a/labml_nn/transformers/mlm/__init__.py b/labml_nn/transformers/mlm/__init__.py index d0b14e7f..a3423034 100644 --- a/labml_nn/transformers/mlm/__init__.py +++ b/labml_nn/transformers/mlm/__init__.py @@ -9,7 +9,7 @@ summary: > This is a [PyTorch](https://pytorch.org) implementation of the Masked Language Model (MLM) used to pre-train the BERT model introduced in the paper -[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). +[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://papers.labml.ai/paper/1810.04805). ## BERT Pretraining diff --git a/labml_nn/transformers/mlm/readme.md b/labml_nn/transformers/mlm/readme.md index cad08b8e..4b2b1312 100644 --- a/labml_nn/transformers/mlm/readme.md +++ b/labml_nn/transformers/mlm/readme.md @@ -2,7 +2,7 @@ This is a [PyTorch](https://pytorch.org) implementation of Masked Language Model (MLM) used to pre-train the BERT model introduced in the paper -[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). +[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://papers.labml.ai/paper/1810.04805). ## BERT Pretraining diff --git a/labml_nn/transformers/models.py b/labml_nn/transformers/models.py index fad7077d..cb63a374 100644 --- a/labml_nn/transformers/models.py +++ b/labml_nn/transformers/models.py @@ -71,7 +71,7 @@ class TransformerLayer(Module): Alternative is to do a layer normalization after adding the residuals. But we found this to be less stable when training. We found a detailed discussion about this in the paper - [On Layer Normalization in the Transformer Architecture](https://arxiv.org/abs/2002.04745). + [On Layer Normalization in the Transformer Architecture](https://papers.labml.ai/paper/2002.04745). """ def __init__(self, *, diff --git a/labml_nn/transformers/switch/__init__.py b/labml_nn/transformers/switch/__init__.py index fa74705b..33431129 100644 --- a/labml_nn/transformers/switch/__init__.py +++ b/labml_nn/transformers/switch/__init__.py @@ -8,7 +8,7 @@ summary: > # Switch Transformer This is a miniature [PyTorch](https://pytorch.org) implementation of the paper -[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961). +[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://papers.labml.ai/paper/2101.03961). Our implementation only has a few million parameters and doesn't do model parallel distributed training. It does single GPU training, but we implement the concept of switching as described in the paper. diff --git a/labml_nn/transformers/switch/readme.md b/labml_nn/transformers/switch/readme.md index 72780f30..e0ea0de5 100644 --- a/labml_nn/transformers/switch/readme.md +++ b/labml_nn/transformers/switch/readme.md @@ -1,7 +1,7 @@ # [Switch Transformer](https://nn.labml.ai/transformers/switch/index.html) This is a miniature [PyTorch](https://pytorch.org) implementation of the paper -[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961). +[Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://papers.labml.ai/paper/2101.03961). Our implementation only has a few million parameters and doesn't do model parallel distributed training. 
It does single GPU training, but we implement the concept of switching as described in the paper. diff --git a/labml_nn/transformers/vit/__init__.py b/labml_nn/transformers/vit/__init__.py index d45b8c05..3fd991c4 100644 --- a/labml_nn/transformers/vit/__init__.py +++ b/labml_nn/transformers/vit/__init__.py @@ -9,7 +9,7 @@ summary: > # Vision Transformer (ViT) This is a [PyTorch](https://pytorch.org) implementation of the paper -[An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale](https://arxiv.org/abs/2010.11929). +[An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale](https://papers.labml.ai/paper/2010.11929). Vision transformer applies a pure transformer to images without any convolution layers. diff --git a/labml_nn/transformers/vit/readme.md b/labml_nn/transformers/vit/readme.md index 636ddb0c..be411d75 100644 --- a/labml_nn/transformers/vit/readme.md +++ b/labml_nn/transformers/vit/readme.md @@ -1,7 +1,7 @@ # [Vision Transformer (ViT)](https://nn.labml.ai/transformer/vit/index.html) This is a [PyTorch](https://pytorch.org) implementation of the paper -[An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale](https://arxiv.org/abs/2010.11929). +[An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale](https://papers.labml.ai/paper/2010.11929). Vision transformer applies a pure transformer to images without any convolution layers. diff --git a/labml_nn/transformers/xl/__init__.py b/labml_nn/transformers/xl/__init__.py index b37ad7dd..75a2f866 100644 --- a/labml_nn/transformers/xl/__init__.py +++ b/labml_nn/transformers/xl/__init__.py @@ -9,7 +9,7 @@ summary: > # Transformer XL This is an implementation of -[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) +[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://papers.labml.ai/paper/1901.02860) in [PyTorch](https://pytorch.org). Transformer has a limited attention span, diff --git a/labml_nn/transformers/xl/readme.md b/labml_nn/transformers/xl/readme.md index 24d50a3a..ec291744 100644 --- a/labml_nn/transformers/xl/readme.md +++ b/labml_nn/transformers/xl/readme.md @@ -1,7 +1,7 @@ # [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) This is an implementation of -[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) +[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://papers.labml.ai/paper/1901.02860) in [PyTorch](https://pytorch.org). Transformer has a limited attention span, diff --git a/labml_nn/transformers/xl/relative_mha.py b/labml_nn/transformers/xl/relative_mha.py index 2a3e7609..ff990736 100644 --- a/labml_nn/transformers/xl/relative_mha.py +++ b/labml_nn/transformers/xl/relative_mha.py @@ -9,7 +9,7 @@ summary: > # Relative Multi-Headed Attention This is an implementation of relative multi-headed attention from paper -[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) +[Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://papers.labml.ai/paper/1901.02860) in [PyTorch](https://pytorch.org). """
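For the ViT hunks above, which describe applying a pure transformer to images without any convolution layers, here is a hedged sketch of the patch-embedding step: the image is cut into fixed-size patches and each flattened patch is linearly projected into a token. Names and shapes are assumptions for illustration, not the repository's API.

```python
import torch
from torch import nn

class PatchEmbedding(nn.Module):
    """Sketch: split an image into non-overlapping patches and project each
    flattened patch into a d_model-dimensional token."""

    def __init__(self, d_model: int, patch_size: int, in_channels: int = 3):
        super().__init__()
        self.patch_size = patch_size
        self.proj = nn.Linear(in_channels * patch_size * patch_size, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [batch, channels, height, width]; height and width are assumed
        # to be divisible by the patch size
        p = self.patch_size
        # Extract non-overlapping patches: [batch, channels * p * p, num_patches]
        patches = nn.functional.unfold(x, kernel_size=p, stride=p)
        # Rearrange to [batch, num_patches, channels * p * p] and project
        return self.proj(patches.transpose(1, 2))


# Usage sketch: a 32x32 RGB image with 8x8 patches gives 16 tokens
tokens = PatchEmbedding(d_model=128, patch_size=8)(torch.randn(1, 3, 32, 32))
assert tokens.shape == (1, 16, 128)
```

The resulting token sequence, plus a learned classification token and positional embeddings, is then fed to a standard transformer encoder.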