From e38f9af96805f24a60d9b1fb6ace14fe6a86c90c Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Date: Sun, 8 Aug 2021 08:32:39 +0530
Subject: [PATCH] repo name
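
The repository moved from lab-ml/nn to
labmlai/annotated_deep_learning_paper_implementations, so this patch
rewrites every GitHub, Colab, and local-path reference to the old name
across the docs, Python sources, and notebooks. The change is a plain
string substitution over the working tree; below is a minimal sketch of
how it could be generated (illustrative only, not part of the patch;
OLD, NEW, and SUFFIXES are assumed names):

    # Sketch: rewrite the old repository name across the working tree.
    # Run from the repository root; review `git diff` before committing.
    from pathlib import Path

    OLD = "lab-ml/nn"
    NEW = "labmlai/annotated_deep_learning_paper_implementations"
    SUFFIXES = {".html", ".py", ".md", ".ipynb"}

    for path in Path(".").rglob("*"):
        # Only touch text files of known types; skip .git internals.
        if path.is_file() and path.suffix in SUFFIXES and ".git" not in path.parts:
            text = path.read_text(encoding="utf-8")
            if OLD in text:
                path.write_text(text.replace(OLD, NEW), encoding="utf-8")
                print(f"rewrote {path}")
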
---
docs/capsule_networks/index.html | 2 +-
docs/capsule_networks/readme.html | 2 +-
docs/cfr/index.html | 2 +-
docs/cfr/kuhn/index.html | 2 +-
docs/gan/cycle_gan/index.html | 2 +-
docs/gan/wasserstein/index.html | 2 +-
docs/hypernetworks/hyper_lstm.html | 2 +-
docs/index.html | 2 +-
.../batch_channel_norm/index.html | 2 +-
docs/normalization/batch_norm/index.html | 2 +-
docs/normalization/batch_norm/readme.html | 2 +-
docs/normalization/group_norm/index.html | 2 +-
docs/normalization/group_norm/readme.html | 2 +-
.../weight_standardization/index.html | 2 +-
docs/rl/ppo/experiment.html | 2 +-
docs/rl/ppo/index.html | 2 +-
docs/rl/ppo/readme.html | 2 +-
docs/transformers/compressive/index.html | 2 +-
docs/transformers/compressive/readme.html | 2 +-
.../transformers/fast_weights/experiment.html | 2 +-
docs/transformers/fast_weights/index.html | 2 +-
docs/transformers/fast_weights/readme.html | 2 +-
docs/transformers/feedback/experiment.html | 2 +-
docs/transformers/feedback/index.html | 2 +-
docs/transformers/feedback/readme.html | 4 ++--
docs/transformers/glu_variants/simple.html | 2 +-
docs/transformers/gpt/index.html | 2 +-
docs/transformers/switch/index.html | 2 +-
docs/transformers/switch/readme.html | 2 +-
docs/transformers/xl/index.html | 2 +-
docs/transformers/xl/readme.html | 2 +-
labml_nn/__init__.py | 2 +-
labml_nn/capsule_networks/__init__.py | 2 +-
labml_nn/capsule_networks/mnist.ipynb | 4 ++--
labml_nn/capsule_networks/readme.md | 2 +-
labml_nn/cfr/__init__.py | 2 +-
labml_nn/cfr/kuhn/__init__.py | 2 +-
labml_nn/cfr/kuhn/experiment.ipynb | 4 ++--
labml_nn/gan/cycle_gan/__init__.py | 2 +-
labml_nn/gan/cycle_gan/experiment.ipynb | 4 ++--
labml_nn/gan/dcgan/experiment.ipynb | 24 +++++++++----------
labml_nn/gan/original/experiment.ipynb | 16 ++++++-------
labml_nn/gan/wasserstein/__init__.py | 2 +-
labml_nn/gan/wasserstein/experiment.ipynb | 14 +++++------
labml_nn/hypernetworks/experiment.ipynb | 4 ++--
labml_nn/hypernetworks/hyper_lstm.py | 2 +-
.../batch_channel_norm/__init__.py | 2 +-
labml_nn/normalization/batch_norm/__init__.py | 2 +-
labml_nn/normalization/batch_norm/mnist.ipynb | 4 ++--
labml_nn/normalization/batch_norm/readme.md | 2 +-
labml_nn/normalization/group_norm/__init__.py | 2 +-
.../normalization/group_norm/experiment.ipynb | 4 ++--
labml_nn/normalization/group_norm/readme.md | 2 +-
.../weight_standardization/__init__.py | 2 +-
.../weight_standardization/experiment.ipynb | 4 ++--
labml_nn/rl/ppo/__init__.py | 2 +-
labml_nn/rl/ppo/experiment.ipynb | 4 ++--
labml_nn/rl/ppo/experiment.py | 2 +-
labml_nn/rl/ppo/readme.md | 2 +-
labml_nn/transformers/compressive/__init__.py | 2 +-
.../transformers/compressive/experiment.ipynb | 4 ++--
labml_nn/transformers/compressive/readme.md | 2 +-
.../transformers/fast_weights/__init__.py | 2 +-
.../fast_weights/experiment.ipynb | 4 ++--
.../transformers/fast_weights/experiment.py | 2 +-
labml_nn/transformers/fast_weights/readme.md | 2 +-
labml_nn/transformers/feedback/__init__.py | 2 +-
.../transformers/feedback/experiment.ipynb | 4 ++--
labml_nn/transformers/feedback/experiment.py | 2 +-
labml_nn/transformers/feedback/readme.md | 4 ++--
.../transformers/glu_variants/simple.ipynb | 4 ++--
labml_nn/transformers/glu_variants/simple.py | 2 +-
labml_nn/transformers/gpt/__init__.py | 2 +-
labml_nn/transformers/gpt/experiment.ipynb | 4 ++--
labml_nn/transformers/switch/__init__.py | 2 +-
labml_nn/transformers/switch/experiment.ipynb | 4 ++--
labml_nn/transformers/switch/readme.md | 2 +-
labml_nn/transformers/xl/__init__.py | 2 +-
labml_nn/transformers/xl/experiment.ipynb | 4 ++--
labml_nn/transformers/xl/readme.md | 2 +-
80 files changed, 121 insertions(+), 121 deletions(-)
diff --git a/docs/capsule_networks/index.html b/docs/capsule_networks/index.html
index 2dc4c9bb..e563ece7 100644
--- a/docs/capsule_networks/index.html
+++ b/docs/capsule_networks/index.html
@@ -78,7 +78,7 @@ it is difficult to understand some concepts with just the modules.
I used jindongwang/Pytorch-CapsuleNet to clarify some
confusions I had with the paper.
Here’s a notebook for training a Capsule Network on MNIST dataset.
-
+

diff --git a/docs/capsule_networks/readme.html b/docs/capsule_networks/readme.html
index 319ce05b..4d6e991f 100644
--- a/docs/capsule_networks/readme.html
+++ b/docs/capsule_networks/readme.html
@@ -78,7 +78,7 @@ it is difficult to understand some concepts with just the modules.
I used jindongwang/Pytorch-CapsuleNet to clarify some
confusions I had with the paper.
Here’s a notebook for training a Capsule Network on MNIST dataset.
-
+

diff --git a/docs/cfr/index.html b/docs/cfr/index.html
index 9dde09ce..56ff1fbe 100644
--- a/docs/cfr/index.html
+++ b/docs/cfr/index.html
@@ -78,7 +78,7 @@ introduces Monte Carlo Counterfactual Regret Minimization (
MCCFR
We tried to keep our Python implementation easy to understand, like a tutorial.
We run it on a very simple imperfect information game called Kuhn poker.
-

+

Twitter thread
Introduction
diff --git a/docs/cfr/kuhn/index.html b/docs/cfr/kuhn/index.html
index 6585a61b..06b04c15 100644
--- a/docs/cfr/kuhn/index.html
+++ b/docs/cfr/kuhn/index.html
@@ -88,7 +88,7 @@ This game is played repeatedly and a good strategy will optimize for the long te
Here we extend the InfoSet
class and History
class defined in __init__.py
with Kuhn Poker specifics.
-
+

diff --git a/docs/gan/cycle_gan/index.html b/docs/gan/cycle_gan/index.html
index c41e5592..9ae58422 100644
--- a/docs/gan/cycle_gan/index.html
+++ b/docs/gan/cycle_gan/index.html
@@ -84,7 +84,7 @@ One generator translates images from A to B and the other from B to A.
The discriminators test whether the generated images look real.
This file contains the model code as well as the training code.
We also have a Google Colab notebook.
-
+

diff --git a/docs/gan/wasserstein/index.html b/docs/gan/wasserstein/index.html
index 537fd5b0..abe191ef 100644
--- a/docs/gan/wasserstein/index.html
+++ b/docs/gan/wasserstein/index.html
@@ -133,7 +133,7 @@ to minimize above formula.
while keeping $K$ bounded.
One way to keep $K$ bounded is to clip all weights in the neural
network that defines $f$ within a range.
Here is the code to try this on a simple MNIST generation experiment.
-

+

87import torch.utils.data
diff --git a/docs/hypernetworks/hyper_lstm.html b/docs/hypernetworks/hyper_lstm.html
index abee2723..0256286d 100644
--- a/docs/hypernetworks/hyper_lstm.html
+++ b/docs/hypernetworks/hyper_lstm.html
@@ -74,7 +74,7 @@ using PyTorch.
by David Ha gives a good explanation of HyperNetworks.
We have an experiment that trains a HyperLSTM to predict text on Shakespeare dataset.
Here’s the link to code: experiment.py
-
+

HyperNetworks use a smaller network to generate weights of a larger network.
There are two variants: static hyper-networks and dynamic hyper-networks.
diff --git a/docs/index.html b/docs/index.html
index 8a5a85ac..07fdb9c4 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -68,7 +68,7 @@
This is a collection of simple PyTorch implementations of
neural networks and related algorithms.
-These implementations are documented with explanations,
+These implementations are documented with explanations,
and the website
renders these as side-by-side formatted notes.
We believe these would help you understand these algorithms better.
diff --git a/docs/normalization/batch_channel_norm/index.html b/docs/normalization/batch_channel_norm/index.html
index f060a63b..d79f5882 100644
--- a/docs/normalization/batch_channel_norm/index.html
+++ b/docs/normalization/batch_channel_norm/index.html
@@ -77,7 +77,7 @@ When the batch size is small a running mean and variance is used for
batch normalization.
Here is the training code for training
a VGG network that uses weight standardization to classify CIFAR-10 data.
-
+

diff --git a/docs/normalization/batch_norm/index.html b/docs/normalization/batch_norm/index.html
index 69e2b5d1..f9ffef8e 100644
--- a/docs/normalization/batch_norm/index.html
+++ b/docs/normalization/batch_norm/index.html
@@ -132,7 +132,7 @@ The usual practice is to calculate an exponential moving average of
mean and variance during the training phase and use that for inference.
Here’s the training code and a notebook for training
a CNN classifier that uses batch normalization for MNIST dataset.
-
+

diff --git a/docs/normalization/batch_norm/readme.html b/docs/normalization/batch_norm/readme.html
index b5468b75..0e7aa7dc 100644
--- a/docs/normalization/batch_norm/readme.html
+++ b/docs/normalization/batch_norm/readme.html
@@ -132,7 +132,7 @@ The usual practice is to calculate an exponential moving average of
mean and variance during the training phase and use that for inference.
Here’s the training code and a notebook for training
a CNN classifier that uses batch normalization for MNIST dataset.
-
+

diff --git a/docs/normalization/group_norm/index.html b/docs/normalization/group_norm/index.html
index 2eb0f4dc..92f7d291 100644
--- a/docs/normalization/group_norm/index.html
+++ b/docs/normalization/group_norm/index.html
@@ -127,7 +127,7 @@ $m$ is the size of the set $\mathcal{S}_i$ which is the same for all $i$.
where $G$ is the number of groups and $C$ is the number of channels.
Group normalization normalizes values of the same sample and the same group of channels together.
Here’s a CIFAR 10 classification model that uses group normalization.
-
+

diff --git a/docs/normalization/group_norm/readme.html b/docs/normalization/group_norm/readme.html
index b0a956f2..b0f21981 100644
--- a/docs/normalization/group_norm/readme.html
+++ b/docs/normalization/group_norm/readme.html
@@ -81,7 +81,7 @@ This is based on the observation that classical features such as
The paper proposes dividing feature channels into groups and then separately normalizing
all channels within each group.
Here’s a CIFAR 10 classification model that uses group normalization.
-
+

diff --git a/docs/normalization/weight_standardization/index.html b/docs/normalization/weight_standardization/index.html
index 9889bd90..fd4d62b9 100644
--- a/docs/normalization/weight_standardization/index.html
+++ b/docs/normalization/weight_standardization/index.html
@@ -95,7 +95,7 @@ This avoids outputs of nodes from always falling beyond the active range of the
Here is the training code for training
a VGG network that uses weight standardization to classify CIFAR-10 data.
This uses a 2D-Convolution Layer with Weight Standardization.
-
+

diff --git a/docs/rl/ppo/experiment.html b/docs/rl/ppo/experiment.html
index 2662b965..1de56e36 100644
--- a/docs/rl/ppo/experiment.html
+++ b/docs/rl/ppo/experiment.html
@@ -70,7 +70,7 @@
PPO Experiment with Atari Breakout
This experiment trains a Proximal Policy Optimization (PPO) agent on the Atari Breakout game using OpenAI Gym.
It runs the game environments on multiple processes to sample efficiently.
-
+

diff --git a/docs/rl/ppo/index.html b/docs/rl/ppo/index.html
index adedd32d..36e8c4b2 100644
--- a/docs/rl/ppo/index.html
+++ b/docs/rl/ppo/index.html
@@ -80,7 +80,7 @@ It does so by clipping gradient flow if the updated policy
is not close to the policy used to sample the data.
You can find an experiment that uses it here.
The experiment uses Generalized Advantage Estimation.
-
+

diff --git a/docs/rl/ppo/readme.html b/docs/rl/ppo/readme.html
index 67453117..bd384de2 100644
--- a/docs/rl/ppo/readme.html
+++ b/docs/rl/ppo/readme.html
@@ -80,7 +80,7 @@ It does so by clipping gradient flow if the updated policy
is not close to the policy used to sample the data.
You can find an experiment that uses it here.
The experiment uses Generalized Advantage Estimation.
-
+

diff --git a/docs/transformers/compressive/index.html b/docs/transformers/compressive/index.html
index b24f46f4..892d00f1 100644
--- a/docs/transformers/compressive/index.html
+++ b/docs/transformers/compressive/index.html
@@ -99,7 +99,7 @@ self-attention, and the pass-through in the residual connection is not normalize
This is supposed to be more stable in standard transformer setups.
Here are the training code and a notebook for training a compressive transformer
model on the Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/compressive/readme.html b/docs/transformers/compressive/readme.html
index 298fa520..73075774 100644
--- a/docs/transformers/compressive/readme.html
+++ b/docs/transformers/compressive/readme.html
@@ -99,7 +99,7 @@ self-attention, and the pass-through in the residual connection is not normalize
This is supposed to be more stable in standard transformer setups.
Here are the training code and a notebook for training a compressive transformer
model on the Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/fast_weights/experiment.html b/docs/transformers/fast_weights/experiment.html
index ea4cdca0..378e7fb1 100644
--- a/docs/transformers/fast_weights/experiment.html
+++ b/docs/transformers/fast_weights/experiment.html
@@ -70,7 +70,7 @@
Train Fast Weights Transformer
This trains a fast weights transformer model for auto-regression.
Here’s a Colab notebook for training a fast weights transformer on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/fast_weights/index.html b/docs/transformers/fast_weights/index.html
index 15534e86..f78cf2c1 100644
--- a/docs/transformers/fast_weights/index.html
+++ b/docs/transformers/fast_weights/index.html
@@ -139,7 +139,7 @@ a new update rule for $\color{cyan}{W^{(i)}} = f(\color{cyan}{W^{(i-1)}})$ and c
$\frac{1}{z^{(i)} \cdot \color{lightgreen}{\phi(q^{(i)})}}$
Here are the training code and a notebook for training a fast weights
transformer on the Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/fast_weights/readme.html b/docs/transformers/fast_weights/readme.html
index a28207cd..48763408 100644
--- a/docs/transformers/fast_weights/readme.html
+++ b/docs/transformers/fast_weights/readme.html
@@ -73,7 +73,7 @@
Here is the annotated implementation.
Here are the training code
and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/feedback/experiment.html b/docs/transformers/feedback/experiment.html
index 764ddc2a..e718abd3 100644
--- a/docs/transformers/feedback/experiment.html
+++ b/docs/transformers/feedback/experiment.html
@@ -72,7 +72,7 @@
You can pick the original feedback transformer or the new version
where the keys and values are precalculated.
Here’s a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/feedback/index.html b/docs/transformers/feedback/index.html
index 04df6d3d..99516990 100644
--- a/docs/transformers/feedback/index.html
+++ b/docs/transformers/feedback/index.html
@@ -91,7 +91,7 @@ them cached.
The
second half of this file implements this.
We implemented a custom PyTorch function to improve performance.
Here’s the training code and a notebook for training a feedback transformer on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/feedback/readme.html b/docs/transformers/feedback/readme.html
index 39035b98..077924c0 100644
--- a/docs/transformers/feedback/readme.html
+++ b/docs/transformers/feedback/readme.html
@@ -91,8 +91,8 @@ them cached.
The
second half of this file implements this.
We implemented a custom PyTorch function to improve performance.
Here’s the training code and a notebook for training a feedback transformer on Tiny Shakespeare dataset.
-
Colab Notebook
-
+
Colab Notebook
+

diff --git a/docs/transformers/glu_variants/simple.html b/docs/transformers/glu_variants/simple.html
index b01943de..0df3839b 100644
--- a/docs/transformers/glu_variants/simple.html
+++ b/docs/transformers/glu_variants/simple.html
@@ -72,7 +72,7 @@
We try different variants for the
position-wise feedforward network.
This is a simpler implementation that doesn’t use labml.configs
module.
We decided to write a simpler implementation to make it easier for readers who are not familiar with it.
-
+

diff --git a/docs/transformers/gpt/index.html b/docs/transformers/gpt/index.html
index d805326b..12a84ed7 100644
--- a/docs/transformers/gpt/index.html
+++ b/docs/transformers/gpt/index.html
@@ -85,7 +85,7 @@ are the parameter initialization, weight decay, and learning rate schedule.
For the transformer we reuse the
existing labml/nn transformer implementation.
Here’s a notebook for training a GPT model on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/switch/index.html b/docs/transformers/switch/index.html
index ebe258c8..68196176 100644
--- a/docs/transformers/switch/index.html
+++ b/docs/transformers/switch/index.html
@@ -89,7 +89,7 @@ In a distributed setup you would have each FFN (each very large) on a different
The paper introduces another loss term to balance load among the experts (FFNs) and
discusses dropping tokens when routing is not balanced.
Here’s the training code and a notebook for training a switch transformer on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/switch/readme.html b/docs/transformers/switch/readme.html
index ec208925..90892c2d 100644
--- a/docs/transformers/switch/readme.html
+++ b/docs/transformers/switch/readme.html
@@ -89,7 +89,7 @@ In a distributed setup you would have each FFN (each very large) on a different
The paper introduces another loss term to balance load among the experts (FFNs) and
discusses dropping tokens when routing is not balanced.
Here’s the training code and a notebook for training a switch transformer on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/xl/index.html b/docs/transformers/xl/index.html
index afc987ef..dd4fc7ed 100644
--- a/docs/transformers/xl/index.html
+++ b/docs/transformers/xl/index.html
@@ -84,7 +84,7 @@ They introduce relative positional encoding, where the positional encodings
are introduced at the attention calculation.
Annotated implementation of relative multi-headed attention is in relative_mha.py
.
Here’s the training code and a notebook for training a transformer XL model on Tiny Shakespeare dataset.
-
+

diff --git a/docs/transformers/xl/readme.html b/docs/transformers/xl/readme.html
index 351a7355..0f8aa57c 100644
--- a/docs/transformers/xl/readme.html
+++ b/docs/transformers/xl/readme.html
@@ -84,7 +84,7 @@ They introduce relative positional encoding, where the positional encodings
are introduced at the attention calculation.
Annotated implementation of relative multi-headed attention is in relative_mha.py
.
Here’s the training code and a notebook for training a transformer XL model on Tiny Shakespeare dataset.
-
+

diff --git a/labml_nn/__init__.py b/labml_nn/__init__.py
index a371695d..bae8d843 100644
--- a/labml_nn/__init__.py
+++ b/labml_nn/__init__.py
@@ -3,7 +3,7 @@
This is a collection of simple PyTorch implementations of
neural networks and related algorithms.
-[These implementations](https://github.com/lab-ml/nn) are documented with explanations,
+[These implementations](https://github.com/labmlai/annotated_deep_learning_paper_implementations) are documented with explanations,
and the [website](index.html)
renders these as side-by-side formatted notes.
We believe these would help you understand these algorithms better.
diff --git a/labml_nn/capsule_networks/__init__.py b/labml_nn/capsule_networks/__init__.py
index 40b70fa9..ef90ab1c 100644
--- a/labml_nn/capsule_networks/__init__.py
+++ b/labml_nn/capsule_networks/__init__.py
@@ -26,7 +26,7 @@ confusions I had with the paper.
Here's a notebook for training a Capsule Network on MNIST dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/capsule_networks/mnist.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/capsule_networks/mnist.ipynb)
[](https://app.labml.ai/run/e7c08e08586711ebb3e30242ac1c0002)
"""
diff --git a/labml_nn/capsule_networks/mnist.ipynb b/labml_nn/capsule_networks/mnist.ipynb
index bf0502c0..bc6e70c9 100644
--- a/labml_nn/capsule_networks/mnist.ipynb
+++ b/labml_nn/capsule_networks/mnist.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/capsule_networks/mnist.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/capsule_networks/mnist.ipynb) \n",
"\n",
"## Training a Capsule Network to classify MNIST digits\n",
"\n",
diff --git a/labml_nn/capsule_networks/readme.md b/labml_nn/capsule_networks/readme.md
index f144f985..637b75d7 100644
--- a/labml_nn/capsule_networks/readme.md
+++ b/labml_nn/capsule_networks/readme.md
@@ -17,5 +17,5 @@ confusions I had with the paper.
Here's a notebook for training a Capsule Network on MNIST dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/capsule_networks/mnist.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/capsule_networks/mnist.ipynb)
[](https://app.labml.ai/run/e7c08e08586711ebb3e30242ac1c0002)
diff --git a/labml_nn/cfr/__init__.py b/labml_nn/cfr/__init__.py
index 578b375a..d599c02e 100644
--- a/labml_nn/cfr/__init__.py
+++ b/labml_nn/cfr/__init__.py
@@ -21,7 +21,7 @@ where we sample from the game tree and estimate the regrets.
We tried to keep our Python implementation easy to understand, like a tutorial.
We run it on [a very simple imperfect information game called Kuhn poker](kuhn/index.html).
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/cfr/kuhn/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/cfr/kuhn/experiment.ipynb)
[](https://twitter.com/labmlai/status/1407186002255380484)
Twitter thread
diff --git a/labml_nn/cfr/kuhn/__init__.py b/labml_nn/cfr/kuhn/__init__.py
index ab8886b0..fe94190d 100644
--- a/labml_nn/cfr/kuhn/__init__.py
+++ b/labml_nn/cfr/kuhn/__init__.py
@@ -31,7 +31,7 @@ Here's some example games:
Here we extend the `InfoSet` class and `History` class defined in [`__init__.py`](../index.html)
with Kuhn Poker specifics.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/cfr/kuhn/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/cfr/kuhn/experiment.ipynb)
[](https://app.labml.ai/run/7c35d3fad29711eba588acde48001122)
"""
diff --git a/labml_nn/cfr/kuhn/experiment.ipynb b/labml_nn/cfr/kuhn/experiment.ipynb
index 569d1c59..aa5d7a24 100644
--- a/labml_nn/cfr/kuhn/experiment.ipynb
+++ b/labml_nn/cfr/kuhn/experiment.ipynb
@@ -33,8 +33,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/cfr/kuhn/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/cfr/kuhn/experiment.ipynb) \n",
"\n",
"## [Counterfactual Regret Minimization (CFR)](https://nn.labml.ai/cfr/index.html) on Kuhn Poker\n",
"\n",
diff --git a/labml_nn/gan/cycle_gan/__init__.py b/labml_nn/gan/cycle_gan/__init__.py
index 22ab1085..d3b3630b 100644
--- a/labml_nn/gan/cycle_gan/__init__.py
+++ b/labml_nn/gan/cycle_gan/__init__.py
@@ -29,7 +29,7 @@ The discriminators test whether the generated images look real.
This file contains the model code as well as the training code.
We also have a Google Colab notebook.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/gan/cycle_gan/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/gan/cycle_gan/experiment.ipynb)
[](https://app.labml.ai/run/93b11a665d6811ebaac80242ac1c0002)
"""
diff --git a/labml_nn/gan/cycle_gan/experiment.ipynb b/labml_nn/gan/cycle_gan/experiment.ipynb
index b00b10ce..929615f7 100644
--- a/labml_nn/gan/cycle_gan/experiment.ipynb
+++ b/labml_nn/gan/cycle_gan/experiment.ipynb
@@ -21,8 +21,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/gan/cycle_gan/experiment.ipynb)\n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/gan/cycle_gan/experiment.ipynb)\n",
"\n",
"## Cycle GAN\n",
"\n",
diff --git a/labml_nn/gan/dcgan/experiment.ipynb b/labml_nn/gan/dcgan/experiment.ipynb
index 47907e34..f04cf5d2 100644
--- a/labml_nn/gan/dcgan/experiment.ipynb
+++ b/labml_nn/gan/dcgan/experiment.ipynb
@@ -22,8 +22,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/gan/dcgan/experiment.ipynb)\n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/gan/dcgan/experiment.ipynb)\n",
"\n",
"## DCGAN\n",
"\n",
@@ -256,16 +256,16 @@
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mexperiment\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstart\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 2\u001B[0;31m \u001B[0mconf\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 3\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 246\u001B[0m \u001B[0m_\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 247\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0m_\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_loop\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 248\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 249\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 250\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0msample\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun_step\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 234\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 235\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'train'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 236\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 237\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mvalidator\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 238\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'valid'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 136\u001B[0m \u001B[0msm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mon_epoch_start\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_grad_enabled\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 138\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 139\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompleted\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__iterate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0mbatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterable\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 150\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 151\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbatch\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 152\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_nn/gan/original/experiment.py\u001B[0m in \u001B[0;36mstep\u001B[0;34m(self, batch, batch_idx)\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0;31m# Log stuff\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 150\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'generated'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgenerated_images\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;36m5\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 151\u001B[0;31m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"loss.generator.\"\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mloss\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 152\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0;31m# Train\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml/tracker.py\u001B[0m in \u001B[0;36madd\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 131\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstr\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 132\u001B[0m \u001B[0;32mraise\u001B[0m \u001B[0mTypeError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'tracker.add should be called as add(name, value), add(dictionary) or add(k=v,k2=v2...)'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 133\u001B[0;31m \u001B[0m_internal\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstore\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0margs\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 134\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 135\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml/internal/tracker/__init__.py\u001B[0m in \u001B[0;36mstore\u001B[0;34m(self, key, value)\u001B[0m\n\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_create_indicator\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 167\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mindicators\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcollect_value\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 168\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 169\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mnew_line\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml/internal/tracker/indicators/numeric.py\u001B[0m in \u001B[0;36mcollect_value\u001B[0;34m(self, value)\u001B[0m\n\u001B[1;32m 79\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 80\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mcollect_value\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 81\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_values\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mappend\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mto_numpy\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mravel\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 82\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 83\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mclear\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml/internal/util/values.py\u001B[0m in \u001B[0;36mto_numpy\u001B[0;34m(value)\u001B[0m\n\u001B[1;32m 20\u001B[0m \u001B[0;32mreturn\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdata\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcpu\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnumpy\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 21\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mTensor\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 22\u001B[0;31m \u001B[0;32mreturn\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdata\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcpu\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnumpy\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 23\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 24\u001B[0m \u001B[0;32mraise\u001B[0m \u001B[0mValueError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34mf\"Unknown type {type(value)}\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/training_loop.py\u001B[0m in \u001B[0;36m__handler\u001B[0;34m(self, sig, frame)\u001B[0m\n\u001B[1;32m 162\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__finish\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 163\u001B[0m \u001B[0mlogger\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlog\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Killing loop...'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mText\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdanger\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 164\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mold_handler\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0msig\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mframe\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m__str__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 246\u001B[0m \u001B[0m_\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 247\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0m_\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_loop\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 248\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 249\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 250\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0msample\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun_step\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 234\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 235\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'train'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 236\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 237\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mvalidator\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 238\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'valid'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 136\u001B[0m \u001B[0msm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mon_epoch_start\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_grad_enabled\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 138\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 139\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompleted\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__iterate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0mbatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterable\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 150\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 151\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbatch\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 152\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_nn/gan/original/experiment.py\u001B[0m in \u001B[0;36mstep\u001B[0;34m(self, batch, batch_idx)\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0;31m# Log stuff\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 150\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'generated'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mgenerated_images\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;36m5\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 151\u001B[0;31m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"loss.generator.\"\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mloss\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 152\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0;31m# Train\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml/tracker.py\u001B[0m in \u001B[0;36madd\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 131\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstr\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 132\u001B[0m \u001B[0;32mraise\u001B[0m \u001B[0mTypeError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'tracker.add should be called as add(name, value), add(dictionary) or add(k=v,k2=v2...)'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 133\u001B[0;31m \u001B[0m_internal\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstore\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0margs\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;36m1\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 134\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 135\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml/internal/tracker/__init__.py\u001B[0m in \u001B[0;36mstore\u001B[0;34m(self, key, value)\u001B[0m\n\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_create_indicator\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 167\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mindicators\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcollect_value\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 168\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 169\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mnew_line\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml/internal/tracker/indicators/numeric.py\u001B[0m in \u001B[0;36mcollect_value\u001B[0;34m(self, value)\u001B[0m\n\u001B[1;32m 79\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 80\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mcollect_value\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 81\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_values\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mappend\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mto_numpy\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mravel\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 82\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 83\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mclear\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml/internal/util/values.py\u001B[0m in \u001B[0;36mto_numpy\u001B[0;34m(value)\u001B[0m\n\u001B[1;32m 20\u001B[0m \u001B[0;32mreturn\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdata\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcpu\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnumpy\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 21\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mvalue\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mTensor\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 22\u001B[0;31m \u001B[0;32mreturn\u001B[0m \u001B[0mvalue\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdata\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcpu\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnumpy\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 23\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 24\u001B[0m \u001B[0;32mraise\u001B[0m \u001B[0mValueError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34mf\"Unknown type {type(value)}\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/training_loop.py\u001B[0m in \u001B[0;36m__handler\u001B[0;34m(self, sig, frame)\u001B[0m\n\u001B[1;32m 162\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__finish\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 163\u001B[0m \u001B[0mlogger\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlog\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Killing loop...'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mText\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdanger\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 164\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mold_handler\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0msig\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mframe\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m__str__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
]
}
diff --git a/labml_nn/gan/original/experiment.ipynb b/labml_nn/gan/original/experiment.ipynb
index 424b4d23..c1034aa7 100644
--- a/labml_nn/gan/original/experiment.ipynb
+++ b/labml_nn/gan/original/experiment.ipynb
@@ -22,8 +22,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/gan/original/experiment.ipynb)\n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/gan/original/experiment.ipynb)\n",
"\n",
"## DCGAN\n",
"\n",
@@ -235,15 +235,15 @@
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mexperiment\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstart\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 2\u001B[0;31m \u001B[0mconf\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 3\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 246\u001B[0m \u001B[0m_\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 247\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0m_\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_loop\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 248\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 249\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 250\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0msample\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun_step\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 234\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 235\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'train'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 236\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 237\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mvalidator\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 238\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'valid'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 136\u001B[0m \u001B[0msm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mon_epoch_start\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_grad_enabled\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 138\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 139\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompleted\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__iterate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0mbatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterable\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 150\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 151\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbatch\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 152\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_nn/gan/original/experiment.py\u001B[0m in \u001B[0;36mstep\u001B[0;34m(self, batch, batch_idx)\u001B[0m\n\u001B[1;32m 157\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mbatch_idx\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_last\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 158\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'generator'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mgenerator\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 159\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mgenerator_optimizer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 160\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 161\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msave\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 246\u001B[0m \u001B[0m_\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 247\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0m_\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_loop\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 248\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 249\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 250\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0msample\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun_step\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 234\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 235\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'train'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 236\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 237\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mvalidator\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 238\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'valid'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 136\u001B[0m \u001B[0msm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mon_epoch_start\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_grad_enabled\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 138\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 139\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompleted\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__iterate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 149\u001B[0m \u001B[0mbatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterable\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 150\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 151\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbatch\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 152\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 153\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_nn/gan/original/experiment.py\u001B[0m in \u001B[0;36mstep\u001B[0;34m(self, batch, batch_idx)\u001B[0m\n\u001B[1;32m 157\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mbatch_idx\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_last\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 158\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'generator'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mgenerator\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 159\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mgenerator_optimizer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 160\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 161\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msave\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/autograd/grad_mode.py\u001B[0m in \u001B[0;36mdecorate_context\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 24\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mdecorate_context\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 25\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__class__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 26\u001B[0;31m \u001B[0;32mreturn\u001B[0m \u001B[0mfunc\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 27\u001B[0m \u001B[0;32mreturn\u001B[0m \u001B[0mcast\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mF\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdecorate_context\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 28\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/optim/adam.py\u001B[0m in \u001B[0;36mstep\u001B[0;34m(self, closure)\u001B[0m\n\u001B[1;32m 106\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 107\u001B[0m \u001B[0mbeta1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mbeta2\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mgroup\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;34m'betas'\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 108\u001B[0;31m F.adam(params_with_grad,\n\u001B[0m\u001B[1;32m 109\u001B[0m \u001B[0mgrads\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 110\u001B[0m \u001B[0mexp_avgs\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/optim/functional.py\u001B[0m in \u001B[0;36madam\u001B[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, amsgrad, beta1, beta2, lr, weight_decay, eps)\u001B[0m\n\u001B[1;32m 92\u001B[0m \u001B[0mdenom\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m(\u001B[0m\u001B[0mmax_exp_avg_sq\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msqrt\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;34m/\u001B[0m \u001B[0mmath\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msqrt\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbias_correction2\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd_\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0meps\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 93\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 94\u001B[0;31m \u001B[0mdenom\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m(\u001B[0m\u001B[0mexp_avg_sq\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msqrt\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;34m/\u001B[0m \u001B[0mmath\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0msqrt\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbias_correction2\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0madd_\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0meps\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 95\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 96\u001B[0m \u001B[0mstep_size\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mlr\u001B[0m \u001B[0;34m/\u001B[0m \u001B[0mbias_correction1\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/training_loop.py\u001B[0m in \u001B[0;36m__handler\u001B[0;34m(self, sig, frame)\u001B[0m\n\u001B[1;32m 162\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__finish\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 163\u001B[0m \u001B[0mlogger\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlog\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Killing loop...'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mText\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdanger\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 164\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mold_handler\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0msig\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mframe\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m__str__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/training_loop.py\u001B[0m in \u001B[0;36m__handler\u001B[0;34m(self, sig, frame)\u001B[0m\n\u001B[1;32m 162\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__finish\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 163\u001B[0m \u001B[0mlogger\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlog\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Killing loop...'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mText\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdanger\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 164\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mold_handler\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0msig\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mframe\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m__str__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
]
}
diff --git a/labml_nn/gan/wasserstein/__init__.py b/labml_nn/gan/wasserstein/__init__.py
index d51de134..496639d1 100644
--- a/labml_nn/gan/wasserstein/__init__.py
+++ b/labml_nn/gan/wasserstein/__init__.py
@@ -81,7 +81,7 @@ network that defines $f$ clipped within a range.*
Here is the code to try this on a [simple MNIST generation experiment](experiment.html).
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/gan/wasserstein/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/gan/wasserstein/experiment.ipynb)
"""
import torch.utils.data
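The docstring patched in this hunk refers to the critic — the network defining $f$ — being clipped within a range so that it stays approximately Lipschitz. As a rough illustration of that idea, here is a minimal weight-clipping sketch; the clip range `0.01`, layer sizes, and optimizer are assumptions, not this repository's implementation.

```python
import torch
import torch.nn as nn

# Minimal WGAN critic step with weight clipping (illustrative sketch).
critic = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 1))
opt = torch.optim.RMSprop(critic.parameters(), lr=5e-5)

def critic_step(real: torch.Tensor, fake: torch.Tensor) -> float:
    # Wasserstein critic loss: maximize f(real) - f(fake),
    # i.e. minimize its negative.
    loss = -(critic(real).mean() - critic(fake).mean())
    opt.zero_grad()
    loss.backward()
    opt.step()
    # Clip every parameter into [-c, c] so f stays within a family of
    # functions with a bounded Lipschitz constant.
    with torch.no_grad():
        for p in critic.parameters():
            p.clamp_(-0.01, 0.01)
    return loss.item()

print(critic_step(torch.randn(64, 784), torch.randn(64, 784)))
```

Clipping is the original paper's blunt way of enforcing the constraint; later work replaces it with a gradient penalty.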
diff --git a/labml_nn/gan/wasserstein/experiment.ipynb b/labml_nn/gan/wasserstein/experiment.ipynb
index 462905b2..fca837e9 100644
--- a/labml_nn/gan/wasserstein/experiment.ipynb
+++ b/labml_nn/gan/wasserstein/experiment.ipynb
@@ -22,8 +22,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/gan/wasserstein/experiment.ipynb)\n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/gan/wasserstein/experiment.ipynb)\n",
"\n",
"## DCGAN\n",
"\n",
@@ -251,10 +251,10 @@
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mexperiment\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstart\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 2\u001B[0;31m \u001B[0mconf\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 3\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 246\u001B[0m \u001B[0m_\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 247\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0m_\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_loop\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 248\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 249\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 250\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0msample\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun_step\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 234\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 235\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'train'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 236\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 237\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mvalidator\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 238\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'valid'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 136\u001B[0m \u001B[0msm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mon_epoch_start\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_grad_enabled\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 138\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 139\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompleted\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__iterate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 147\u001B[0m \u001B[0mmonit\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mprogress\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 148\u001B[0m \u001B[0;32mwhile\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0miteration_completed\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 149\u001B[0;31m \u001B[0mbatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterable\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 150\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 151\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbatch\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 246\u001B[0m \u001B[0m_\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 247\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0m_\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtraining_loop\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 248\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mrun_step\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 249\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 250\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0msample\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36mrun_step\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 234\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 235\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'train'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 236\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mtrainer\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 237\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mvalidator\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 238\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtracker\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mnamespace\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'valid'\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 136\u001B[0m \u001B[0msm\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mon_epoch_start\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0;32mwith\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mset_grad_enabled\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mis_train\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 138\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 139\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompleted\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/train_valid.py\u001B[0m in \u001B[0;36m__iterate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 147\u001B[0m \u001B[0mmonit\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mprogress\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;36m0\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 148\u001B[0m \u001B[0;32mwhile\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0miteration_completed\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 149\u001B[0;31m \u001B[0mbatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnext\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__iterable\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 150\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 151\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mstep\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mbatch\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_batch_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py\u001B[0m in \u001B[0;36m__next__\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 433\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_sampler_iter\u001B[0m \u001B[0;32mis\u001B[0m \u001B[0;32mNone\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 434\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_reset\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 435\u001B[0;31m \u001B[0mdata\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_next_data\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 436\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_num_yielded\u001B[0m \u001B[0;34m+=\u001B[0m \u001B[0;36m1\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 437\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_dataset_kind\u001B[0m \u001B[0;34m==\u001B[0m \u001B[0m_DatasetKind\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mIterable\u001B[0m \u001B[0;32mand\u001B[0m\u001B[0;31m \u001B[0m\u001B[0;31m\\\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/utils/data/dataloader.py\u001B[0m in \u001B[0;36m_next_data\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 473\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m_next_data\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 474\u001B[0m \u001B[0mindex\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_next_index\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;31m# may raise StopIteration\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 475\u001B[0;31m \u001B[0mdata\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_dataset_fetcher\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mfetch\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mindex\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;31m# may raise StopIteration\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 476\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_pin_memory\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 477\u001B[0m \u001B[0mdata\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0m_utils\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mpin_memory\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mpin_memory\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdata\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py\u001B[0m in \u001B[0;36mfetch\u001B[0;34m(self, possibly_batched_index)\u001B[0m\n\u001B[1;32m 42\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mfetch\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mpossibly_batched_index\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 43\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mauto_collation\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 44\u001B[0;31m \u001B[0mdata\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;34m[\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdataset\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0midx\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0midx\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mpossibly_batched_index\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 45\u001B[0m \u001B[0;32melse\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 46\u001B[0m \u001B[0mdata\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdataset\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mpossibly_batched_index\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
@@ -267,7 +267,7 @@
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/tensor.py\u001B[0m in \u001B[0;36mwrapped\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 22\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0mwrapped\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 23\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0moverrides\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mhas_torch_function\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mhandle_torch_function\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 24\u001B[0;31m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mall\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mtype\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mt\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mis\u001B[0m \u001B[0mTensor\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0mt\u001B[0m \u001B[0;32min\u001B[0m \u001B[0margs\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mand\u001B[0m \u001B[0mhas_torch_function\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 25\u001B[0m \u001B[0;32mreturn\u001B[0m \u001B[0mhandle_torch_function\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mwrapped\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 26\u001B[0m \u001B[0;32mtry\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/overrides.py\u001B[0m in \u001B[0;36mhas_torch_function\u001B[0;34m(relevant_args)\u001B[0m\n\u001B[1;32m 1081\u001B[0m \u001B[0mimplementations\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;32mFalse\u001B[0m \u001B[0motherwise\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1082\u001B[0m \"\"\"\n\u001B[0;32m-> 1083\u001B[0;31m return _is_torch_function_enabled() and any(\n\u001B[0m\u001B[1;32m 1084\u001B[0m \u001B[0mtype\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0ma\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mis\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mTensor\u001B[0m \u001B[0;32mand\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1085\u001B[0m \u001B[0mgetattr\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0ma\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m'__torch_function__'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0m_disabled_torch_function_impl\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/miniconda/envs/torch/lib/python3.8/site-packages/torch/overrides.py\u001B[0m in \u001B[0;36m\u001B[0;34m(.0)\u001B[0m\n\u001B[1;32m 1081\u001B[0m \u001B[0mimplementations\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;32mFalse\u001B[0m \u001B[0motherwise\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1082\u001B[0m \"\"\"\n\u001B[0;32m-> 1083\u001B[0;31m return _is_torch_function_enabled() and any(\n\u001B[0m\u001B[1;32m 1084\u001B[0m \u001B[0mtype\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0ma\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mis\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mtorch\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mTensor\u001B[0m \u001B[0;32mand\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1085\u001B[0m \u001B[0mgetattr\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0ma\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m'__torch_function__'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0m_disabled_torch_function_impl\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
- "\u001B[0;32m~/ml/lab-ml/nn/labml_helpers/training_loop.py\u001B[0m in \u001B[0;36m__handler\u001B[0;34m(self, sig, frame)\u001B[0m\n\u001B[1;32m 162\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__finish\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 163\u001B[0m \u001B[0mlogger\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlog\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Killing loop...'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mText\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdanger\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 164\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mold_handler\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0msig\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mframe\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m__str__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
+ "\u001B[0;32m~/ml/labmlai/annotated_deep_learning_paper_implementations/labml_helpers/training_loop.py\u001B[0m in \u001B[0;36m__handler\u001B[0;34m(self, sig, frame)\u001B[0m\n\u001B[1;32m 162\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m__finish\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 163\u001B[0m \u001B[0mlogger\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mlog\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m'Killing loop...'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mText\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdanger\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 164\u001B[0;31m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mold_handler\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0msig\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mframe\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 165\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 166\u001B[0m \u001B[0;32mdef\u001B[0m \u001B[0m__str__\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mself\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
]
}
diff --git a/labml_nn/hypernetworks/experiment.ipynb b/labml_nn/hypernetworks/experiment.ipynb
index bcdc6187..3dbcf743 100644
--- a/labml_nn/hypernetworks/experiment.ipynb
+++ b/labml_nn/hypernetworks/experiment.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/hypernetworks/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/hypernetworks/experiment.ipynb) \n",
"\n",
"## HyperLSTM\n",
"\n",
diff --git a/labml_nn/hypernetworks/hyper_lstm.py b/labml_nn/hypernetworks/hyper_lstm.py
index e7becb01..d39137dd 100644
--- a/labml_nn/hypernetworks/hyper_lstm.py
+++ b/labml_nn/hypernetworks/hyper_lstm.py
@@ -15,7 +15,7 @@ by David Ha gives a good explanation of HyperNetworks.
We have an experiment that trains a HyperLSTM to predict text on the Shakespeare dataset.
Here's the link to code: [`experiment.py`](experiment.html)
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/hypernetworks/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/hypernetworks/experiment.ipynb)
[](https://app.labml.ai/run/9e7f39e047e811ebbaff2b26e3148b3d)
HyperNetworks use a smaller network to generate weights of a larger network.
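The sentence above is the whole idea in one line: a smaller network generates the weights of a larger one. A minimal sketch of the pattern with a hypothetical `HyperLinear` layer (not the HyperLSTM implemented in this file):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class HyperLinear(nn.Module):
    """A linear layer whose weight matrix is produced by a smaller network."""
    def __init__(self, in_features: int, out_features: int, z_dim: int = 8):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        # Learned embedding that conditions the hypernetwork.
        self.z = nn.Parameter(torch.randn(z_dim))
        # The (much smaller) hypernetwork maps z to a full weight matrix.
        self.hyper = nn.Linear(z_dim, in_features * out_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.hyper(self.z).view(self.out_features, self.in_features)
        return F.linear(x, w)

layer = HyperLinear(16, 4)
print(layer(torch.randn(2, 16)).shape)  # torch.Size([2, 4])
```

HyperLSTM applies the same trick per time step, so the generated weights can vary with the input.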
diff --git a/labml_nn/normalization/batch_channel_norm/__init__.py b/labml_nn/normalization/batch_channel_norm/__init__.py
index 66714424..7617ea8a 100644
--- a/labml_nn/normalization/batch_channel_norm/__init__.py
+++ b/labml_nn/normalization/batch_channel_norm/__init__.py
@@ -19,7 +19,7 @@ batch normalization.
Here is [the training code](../weight_standardization/experiment.html) for training
a VGG network that uses weight standardization to classify CIFAR-10 data.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb)
[](https://app.labml.ai/run/f4a783a2a7df11eb921d0242ac1c0002)
[](https://wandb.ai/vpj/cifar10/runs/3flr4k8w)
"""
diff --git a/labml_nn/normalization/batch_norm/__init__.py b/labml_nn/normalization/batch_norm/__init__.py
index 746caef6..100c96cc 100644
--- a/labml_nn/normalization/batch_norm/__init__.py
+++ b/labml_nn/normalization/batch_norm/__init__.py
@@ -91,7 +91,7 @@ mean and variance during the training phase and use that for inference.
Here's [the training code](mnist.html) and a notebook for training
a CNN classifier that uses batch normalization for the MNIST dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
[](https://app.labml.ai/run/011254fe647011ebbb8e0242ac1c0002)
"""
diff --git a/labml_nn/normalization/batch_norm/mnist.ipynb b/labml_nn/normalization/batch_norm/mnist.ipynb
index b2009021..8b7976fb 100644
--- a/labml_nn/normalization/batch_norm/mnist.ipynb
+++ b/labml_nn/normalization/batch_norm/mnist.ipynb
@@ -1005,8 +1005,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb) \n",
"\n",
"## Batch Normaliztion\n",
"\n",
diff --git a/labml_nn/normalization/batch_norm/readme.md b/labml_nn/normalization/batch_norm/readme.md
index 3e2a5d12..ece7ba8d 100644
--- a/labml_nn/normalization/batch_norm/readme.md
+++ b/labml_nn/normalization/batch_norm/readme.md
@@ -84,5 +84,5 @@ mean and variance during the training phase and use that for inference.
Here's [the training code](mnist.html) and a notebook for training
a CNN classifier that uses batch normalization for the MNIST dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
[](https://app.labml.ai/run/011254fe647011ebbb8e0242ac1c0002)
diff --git a/labml_nn/normalization/group_norm/__init__.py b/labml_nn/normalization/group_norm/__init__.py
index df81b851..ec48d5c6 100644
--- a/labml_nn/normalization/group_norm/__init__.py
+++ b/labml_nn/normalization/group_norm/__init__.py
@@ -78,7 +78,7 @@ Group normalization normalizes values of the same sample and the same group of c
Here's a [CIFAR 10 classification model](experiment.html) that uses group normalization.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/group_norm/experiment.ipynb)
[](https://app.labml.ai/run/081d950aa4e011eb8f9f0242ac1c0002)
[](https://wandb.ai/vpj/cifar10/runs/310etthp)
"""
diff --git a/labml_nn/normalization/group_norm/experiment.ipynb b/labml_nn/normalization/group_norm/experiment.ipynb
index 0bec0875..fc8eac0a 100644
--- a/labml_nn/normalization/group_norm/experiment.ipynb
+++ b/labml_nn/normalization/group_norm/experiment.ipynb
@@ -269,8 +269,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/group_norm/experiment.ipynb) \n",
"\n",
"## Group Norm - CIFAR 10\n",
"\n",
diff --git a/labml_nn/normalization/group_norm/readme.md b/labml_nn/normalization/group_norm/readme.md
index 06afdb0d..f6f2c269 100644
--- a/labml_nn/normalization/group_norm/readme.md
+++ b/labml_nn/normalization/group_norm/readme.md
@@ -17,6 +17,6 @@ all channels within each group.
Here's a [CIFAR 10 classification model](https://nn.labml.ai/normalization/group_norm/experiment.html) that uses group normalization.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/group_norm/experiment.ipynb)
[](https://app.labml.ai/run/081d950aa4e011eb8f9f0242ac1c0002)
[](https://wandb.ai/vpj/cifar10/runs/310etthp)
\ No newline at end of file
diff --git a/labml_nn/normalization/weight_standardization/__init__.py b/labml_nn/normalization/weight_standardization/__init__.py
index 2cfbab4e..6756f50a 100644
--- a/labml_nn/normalization/weight_standardization/__init__.py
+++ b/labml_nn/normalization/weight_standardization/__init__.py
@@ -42,7 +42,7 @@ Here is [the training code](experiment.html) for training
a VGG network that uses weight standardization to classify CIFAR-10 data.
This uses a [2D-Convolution Layer with Weight Standardization](conv2d.html).
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb)
[](https://app.labml.ai/run/f4a783a2a7df11eb921d0242ac1c0002)
[](https://wandb.ai/vpj/cifar10/runs/3flr4k8w)
"""
diff --git a/labml_nn/normalization/weight_standardization/experiment.ipynb b/labml_nn/normalization/weight_standardization/experiment.ipynb
index 0f35ba28..d8e7f620 100644
--- a/labml_nn/normalization/weight_standardization/experiment.ipynb
+++ b/labml_nn/normalization/weight_standardization/experiment.ipynb
@@ -269,8 +269,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/group_norm/experiment.ipynb) \n",
"\n",
"## Weight Standardization & Batch-Channel Normalization - CIFAR 10\n",
"\n",
diff --git a/labml_nn/rl/ppo/__init__.py b/labml_nn/rl/ppo/__init__.py
index 462bbe03..9a99c4cb 100644
--- a/labml_nn/rl/ppo/__init__.py
+++ b/labml_nn/rl/ppo/__init__.py
@@ -22,7 +22,7 @@ is not close to the policy used to sample the data.
You can find an experiment that uses it [here](experiment.html).
The experiment uses [Generalized Advantage Estimation](gae.html).
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/rl/ppo/experiment.ipynb)
[](https://app.labml.ai/run/6eff28a0910e11eb9b008db315936e2f)
"""
diff --git a/labml_nn/rl/ppo/experiment.ipynb b/labml_nn/rl/ppo/experiment.ipynb
index 15d98731..c185a1fe 100644
--- a/labml_nn/rl/ppo/experiment.ipynb
+++ b/labml_nn/rl/ppo/experiment.ipynb
@@ -6,8 +6,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/rl/ppo/experiment.ipynb) \n",
"\n",
"## Proximal Policy Optimization - PPO\n",
"\n",
diff --git a/labml_nn/rl/ppo/experiment.py b/labml_nn/rl/ppo/experiment.py
index fce12631..f9c2ab55 100644
--- a/labml_nn/rl/ppo/experiment.py
+++ b/labml_nn/rl/ppo/experiment.py
@@ -9,7 +9,7 @@ summary: Annotated implementation to train a PPO agent on Atari Breakout game.
This experiment trains a Proximal Policy Optimization (PPO) agent on the Atari Breakout game using OpenAI Gym.
It runs the [game environments on multiple processes](../game.html) to sample efficiently.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/rl/ppo/experiment.ipynb)
[](https://app.labml.ai/run/6eff28a0910e11eb9b008db315936e2f)
"""
diff --git a/labml_nn/rl/ppo/readme.md b/labml_nn/rl/ppo/readme.md
index 759addfb..63c219d3 100644
--- a/labml_nn/rl/ppo/readme.md
+++ b/labml_nn/rl/ppo/readme.md
@@ -15,5 +15,5 @@ is not close to the policy used to sample the data.
You can find an experiment that uses it [here](https://nn.labml.ai/rl/ppo/experiment.html).
The experiment uses [Generalized Advantage Estimation](https://nn.labml.ai/rl/ppo/gae.html).
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/rl/ppo/experiment.ipynb)
[](https://app.labml.ai/run/6eff28a0910e11eb9b008db315936e2f)
diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py
index 1fe71056..846e1755 100644
--- a/labml_nn/transformers/compressive/__init__.py
+++ b/labml_nn/transformers/compressive/__init__.py
@@ -47,7 +47,7 @@ This is supposed to be more stable in standard transformer setups.
Here are [the training code](experiment.html) and a notebook for training a compressive transformer
model on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/compressive/experiment.ipynb)
[](https://app.labml.ai/run/0d9b5338726c11ebb7c80242ac1c0002)
"""
diff --git a/labml_nn/transformers/compressive/experiment.ipynb b/labml_nn/transformers/compressive/experiment.ipynb
index 6ce329d4..d99735e3 100644
--- a/labml_nn/transformers/compressive/experiment.ipynb
+++ b/labml_nn/transformers/compressive/experiment.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/compressive/experiment.ipynb) \n",
"\n",
"## Compressive Transformer\n",
"\n",
diff --git a/labml_nn/transformers/compressive/readme.md b/labml_nn/transformers/compressive/readme.md
index 0d154273..e0ea82e4 100644
--- a/labml_nn/transformers/compressive/readme.md
+++ b/labml_nn/transformers/compressive/readme.md
@@ -39,5 +39,5 @@ This is supposed to be more stable in standard transformer setups.
Here are [the training code](https://nn.labml.ai/transformers/compressive/experiment.html) and a notebook for training a compressive transformer
model on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/compressive/experiment.ipynb)
[](https://app.labml.ai/run/0d9b5338726c11ebb7c80242ac1c0002)
diff --git a/labml_nn/transformers/fast_weights/__init__.py b/labml_nn/transformers/fast_weights/__init__.py
index 851dbb62..8c539ab3 100644
--- a/labml_nn/transformers/fast_weights/__init__.py
+++ b/labml_nn/transformers/fast_weights/__init__.py
@@ -88,7 +88,7 @@ $\frac{1}{z^{(i)} \cdot \color{lightgreen}{\phi(q^{(i)})}}$
Here are [the training code](experiment.html) and a notebook for training a fast weights
transformer on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
[](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
"""
diff --git a/labml_nn/transformers/fast_weights/experiment.ipynb b/labml_nn/transformers/fast_weights/experiment.ipynb
index e6c517dd..85e53567 100644
--- a/labml_nn/transformers/fast_weights/experiment.ipynb
+++ b/labml_nn/transformers/fast_weights/experiment.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb) \n",
"\n",
"## Fast Weights Transformer\n",
"\n",
diff --git a/labml_nn/transformers/fast_weights/experiment.py b/labml_nn/transformers/fast_weights/experiment.py
index b2d4a727..eda909fe 100644
--- a/labml_nn/transformers/fast_weights/experiment.py
+++ b/labml_nn/transformers/fast_weights/experiment.py
@@ -10,7 +10,7 @@ This trains a fast weights transformer model for auto-regression.
Here’s a Colab notebook for training a fast weights transformer on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
[](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
"""
diff --git a/labml_nn/transformers/fast_weights/readme.md b/labml_nn/transformers/fast_weights/readme.md
index 3bf43ac9..0addfefc 100644
--- a/labml_nn/transformers/fast_weights/readme.md
+++ b/labml_nn/transformers/fast_weights/readme.md
@@ -7,5 +7,5 @@ Here is the [annotated implementation](https://nn.labml.ai/transformers/fast_wei
Here are [the training code](https://nn.labml.ai/transformers/fast_weights/experiment.html)
and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
[](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
diff --git a/labml_nn/transformers/feedback/__init__.py b/labml_nn/transformers/feedback/__init__.py
index ede9f8d9..6ec3a2b2 100644
--- a/labml_nn/transformers/feedback/__init__.py
+++ b/labml_nn/transformers/feedback/__init__.py
@@ -36,7 +36,7 @@ We implemented a custom PyTorch function to improve performance.
Here's [the training code](experiment.html) and a notebook for training a feedback transformer on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
[](https://app.labml.ai/run/d8eb9416530a11eb8fb50242ac1c0002)
"""
diff --git a/labml_nn/transformers/feedback/experiment.ipynb b/labml_nn/transformers/feedback/experiment.ipynb
index f606ff1d..5c9665b8 100644
--- a/labml_nn/transformers/feedback/experiment.ipynb
+++ b/labml_nn/transformers/feedback/experiment.ipynb
@@ -6,8 +6,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/feedback/experiment.ipynb) \n",
"\n",
"## Feedback Transformer\n",
"\n",
diff --git a/labml_nn/transformers/feedback/experiment.py b/labml_nn/transformers/feedback/experiment.py
index b2eeafa6..e9741b7f 100644
--- a/labml_nn/transformers/feedback/experiment.py
+++ b/labml_nn/transformers/feedback/experiment.py
@@ -12,7 +12,7 @@ where the keys and values are precalculated.
Here's a Colab notebook for training a feedback transformer on the Tiny Shakespeare dataset.
-[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
+[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
[](https://app.labml.ai/run/d8eb9416530a11eb8fb50242ac1c0002)
"""
diff --git a/labml_nn/transformers/feedback/readme.md b/labml_nn/transformers/feedback/readme.md
index 12d2ca85..f6b2dc78 100644
--- a/labml_nn/transformers/feedback/readme.md
+++ b/labml_nn/transformers/feedback/readme.md
@@ -29,7 +29,7 @@ We implemented a custom PyTorch function to improve performance.
Here's [the training code](experiment.html) and a notebook for training a feedback transformer on the Tiny Shakespeare dataset.
-[Colab Notebook](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
+[Colab Notebook](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d8eb9416530a11eb8fb50242ac1c0002)
diff --git a/labml_nn/transformers/glu_variants/simple.ipynb b/labml_nn/transformers/glu_variants/simple.ipynb
index b03fcf77..34e59e96 100644
--- a/labml_nn/transformers/glu_variants/simple.ipynb
+++ b/labml_nn/transformers/glu_variants/simple.ipynb
@@ -21,8 +21,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/glu_variants/simple.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/glu_variants/simple.ipynb) \n",
"\n",
"## Gated Linear Units and Variants\n",
"\n",
diff --git a/labml_nn/transformers/glu_variants/simple.py b/labml_nn/transformers/glu_variants/simple.py
index 10c801e9..302f9a3a 100644
--- a/labml_nn/transformers/glu_variants/simple.py
+++ b/labml_nn/transformers/glu_variants/simple.py
@@ -14,7 +14,7 @@ We try different variants for the [position-wise feedforward network](../feed_fo
*This is a simpler implementation that doesn't use the [`labml.configs`](experiment.html) module.
We decided to write a simpler implementation to make it easier for readers who are not familiar with that module.*
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/glu_variants/simple.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/glu_variants/simple.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/86b773f65fc911ebb2ac0242ac1c0002)
"""
import dataclasses
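For reference, the GLU variants that `simple.py` trains replace the position-wise feedforward network's first transformation with a gated product, as in Shazeer (2020). A minimal sketch under that assumption; the class name and constructor here are illustrative, not the `labml` implementation:

```python
import torch
import torch.nn as nn

class GatedFFN(nn.Module):
    """Position-wise FFN whose hidden layer is a gated product of two projections."""

    def __init__(self, d_model: int, d_ff: int, activation: nn.Module = nn.GELU()):
        super().__init__()
        self.activation = activation
        self.gate = nn.Linear(d_model, d_ff)  # half that receives the activation
        self.up = nn.Linear(d_model, d_ff)    # purely linear half
        self.down = nn.Linear(d_ff, d_model)  # project back to the model size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # e.g. GEGLU: GELU(x W1) * (x W2), then W3 projects back down
        return self.down(self.activation(self.gate(x)) * self.up(x))
```

Choosing `nn.GELU()` gives GEGLU, `nn.SiLU()` gives SwiGLU, `nn.ReLU()` gives ReGLU, and `nn.Sigmoid()` recovers the original GLU.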
diff --git a/labml_nn/transformers/gpt/__init__.py b/labml_nn/transformers/gpt/__init__.py
index 2e6a08d5..e9af05ee 100644
--- a/labml_nn/transformers/gpt/__init__.py
+++ b/labml_nn/transformers/gpt/__init__.py
@@ -28,7 +28,7 @@ For the transformer we reuse the
Here's a notebook for training a GPT model on the Tiny Shakespeare dataset.
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/gpt/experiment.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/gpt/experiment.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/0324c6d0562111eba65d0242ac1c0002)
"""
diff --git a/labml_nn/transformers/gpt/experiment.ipynb b/labml_nn/transformers/gpt/experiment.ipynb
index 04424825..3b705c22 100644
--- a/labml_nn/transformers/gpt/experiment.ipynb
+++ b/labml_nn/transformers/gpt/experiment.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/gpt/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/gpt/experiment.ipynb) \n",
"\n",
"## Training a model with GPT architecture\n",
"\n",
diff --git a/labml_nn/transformers/switch/__init__.py b/labml_nn/transformers/switch/__init__.py
index 4ebbd502..fa74705b 100644
--- a/labml_nn/transformers/switch/__init__.py
+++ b/labml_nn/transformers/switch/__init__.py
@@ -33,7 +33,7 @@ discusses dropping tokens when routing is not balanced.
Here's [the training code](experiment.html) and a notebook for training a switch transformer on the Tiny Shakespeare dataset.
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/switch/experiment.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/switch/experiment.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/c4656c605b9311eba13d0242ac1c0002)
"""
diff --git a/labml_nn/transformers/switch/experiment.ipynb b/labml_nn/transformers/switch/experiment.ipynb
index 095ea783..30d00409 100644
--- a/labml_nn/transformers/switch/experiment.ipynb
+++ b/labml_nn/transformers/switch/experiment.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/switch/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/switch/experiment.ipynb) \n",
"\n",
"## Switch Transformer\n",
"\n",
diff --git a/labml_nn/transformers/switch/readme.md b/labml_nn/transformers/switch/readme.md
index 2f47406e..72780f30 100644
--- a/labml_nn/transformers/switch/readme.md
+++ b/labml_nn/transformers/switch/readme.md
@@ -26,5 +26,5 @@ discusses dropping tokens when routing is not balanced.
Here's [the training code](experiment.html) and a notebook for training a switch transformer on the Tiny Shakespeare dataset.
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/switch/experiment.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/switch/experiment.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/c4656c605b9311eba13d0242ac1c0002)
diff --git a/labml_nn/transformers/xl/__init__.py b/labml_nn/transformers/xl/__init__.py
index b80b2cca..b37ad7dd 100644
--- a/labml_nn/transformers/xl/__init__.py
+++ b/labml_nn/transformers/xl/__init__.py
@@ -28,7 +28,7 @@ Annotated implementation of relative multi-headed attention is in [`relative_mha
Here's [the training code](experiment.html) and a notebook for training a Transformer XL model on the Tiny Shakespeare dataset.
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/xl/experiment.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/xl/experiment.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d3b6760c692e11ebb6a70242ac1c0002)
"""
diff --git a/labml_nn/transformers/xl/experiment.ipynb b/labml_nn/transformers/xl/experiment.ipynb
index b8ea66bd..9ca21fa8 100644
--- a/labml_nn/transformers/xl/experiment.ipynb
+++ b/labml_nn/transformers/xl/experiment.ipynb
@@ -20,8 +20,8 @@
"id": "AYV_dMVDxyc2"
},
"source": [
- "[](https://github.com/lab-ml/nn)\n",
- "[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/xl/experiment.ipynb) \n",
+ "[](https://github.com/labmlai/annotated_deep_learning_paper_implementations)\n",
+ "[](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/xl/experiment.ipynb) \n",
"\n",
"## Transformer XL\n",
"\n",
diff --git a/labml_nn/transformers/xl/readme.md b/labml_nn/transformers/xl/readme.md
index c10342c5..24d50a3a 100644
--- a/labml_nn/transformers/xl/readme.md
+++ b/labml_nn/transformers/xl/readme.md
@@ -20,5 +20,5 @@ Annotated implementation of relative multi-headed attention is in [`relative_mha
Here's [the training code](https://nn.labml.ai/transformers/xl/experiment.html) and a notebook for training a Transformer XL model on the Tiny Shakespeare dataset.
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/xl/experiment.ipynb)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/xl/experiment.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d3b6760c692e11ebb6a70242ac1c0002)