diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 2a46b7ad..c9abf985 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -337,7 +337,14 @@ https://nn.labml.ai/transformers/switch/index.html - 2021-01-30T16:30:00+00:00 + 2021-02-01T16:30:00+00:00 + 1.00 + + + + + https://nn.labml.ai/transformers/switch/readme.html + 2021-02-01T16:30:00+00:00 1.00 diff --git a/docs/transformers/feedback/README.html b/docs/transformers/feedback/README.html new file mode 100644 index 00000000..e77c8843 --- /dev/null +++ b/docs/transformers/feedback/README.html @@ -0,0 +1,141 @@ + + + + + + + + + + + + + + + + + + + + + + + Feedback Transformer + + + + + + + + +
+
+
+
+

+ home + transformers + feedback +

+

+ + + Github + + Join Slack + + Twitter +

+
+
+
+
+ +

Feedback Transformer

+

This is a PyTorch implementation of the paper Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.

+

Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. The Feedback Transformer pays attention to the outputs of all layers in previous steps. So this adds recurrence, and we need to process token-by-token. This slows down the training significantly (about 5X - 10X depending on the sequence length). However, when predicting, the Feedback Transformer is faster because you can predict the next token if you cache the memory vectors.
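To make the recurrence concrete, here is a minimal sketch of the token-by-token loop (illustrative only, not this repo's API; `embed`, `layers` and `aggregate` are hypothetical placeholders):

```python
import torch

def feedback_forward(embed, layers, aggregate, tokens):
    """Sketch of token-by-token processing with a shared memory of previous steps."""
    # tokens: [seq_len, batch_size] token ids
    memory = []   # one aggregated memory vector per previous step
    outputs = []
    for t in range(tokens.shape[0]):
        x = embed(tokens[t])                           # [batch_size, d_model], current token only
        mem = torch.stack(memory) if memory else None  # [t, batch_size, d_model]; None at the first step
        layer_outputs = [x]
        for layer in layers:
            x = layer(x, mem)          # every layer attends to the memories of *all* previous steps
            layer_outputs.append(x)
        memory.append(aggregate(layer_outputs))        # e.g. a weighted sum of all layer outputs
        outputs.append(x)
    return torch.stack(outputs)                        # [seq_len, batch_size, d_model]
```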

+

In order to speed up the training, the paper discusses starting with a short sequence length and gradually increasing it. They also discuss using a pretrained parallel transformer as the starting point.
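For instance, a simple linear warm-up of the sequence length could look like this (a sketch; the schedule shape and the names `start_len`, `final_len` and `warmup_steps` are assumptions, not taken from the paper or this repo):

```python
def seq_len_schedule(step: int, start_len: int = 32, final_len: int = 512,
                     warmup_steps: int = 10_000) -> int:
    """Linearly grow the training sequence length from start_len to final_len."""
    # e.g. cut each training batch to seq_len_schedule(step) tokens
    if step >= warmup_steps:
        return final_len
    return start_len + (final_len - start_len) * step // warmup_steps
```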

+

The original feedback transformer doesn’t keep the outputs of all layers. Instead, it keeps a weighted sum of the outputs of all layers. This reduces the memory used for caching during prediction. The first half of this file implements this.
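A minimal sketch of such a weighted sum, assuming softmax-normalized scalar weights (the class and parameter names here are illustrative, not the ones used in this implementation):

```python
import torch
import torch.nn as nn

class LayerWeightedSum(nn.Module):
    """Collapse the outputs of all layers into a single memory vector per step."""

    def __init__(self, n_layers: int):
        super().__init__()
        # one scalar weight per layer output (including the embedding layer)
        self.weights = nn.Parameter(torch.zeros(n_layers + 1))

    def forward(self, layer_outputs):
        # layer_outputs: list of [batch_size, d_model] tensors
        stacked = torch.stack(layer_outputs)          # [n_layers + 1, batch_size, d_model]
        w = torch.softmax(self.weights, dim=0)        # normalized mixing weights
        return torch.einsum('l,lbd->bd', w, stacked)  # single [batch_size, d_model] memory vector
```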

+

The updated feedback transformer shares the weights used to calculate keys and values among the layers. We then calculate the keys and values for each step only once and keep them cached. The second half of this file implements this. We implemented a custom PyTorch function to improve performance.
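A rough sketch of the shared key/value caching idea (class and method names are made up for illustration; the actual implementation and its custom PyTorch function are more involved):

```python
import torch
import torch.nn as nn

class SharedKVCache(nn.Module):
    """One key/value projection shared by every layer, computed once per step and cached."""

    def __init__(self, d_model: int, d_attn: int):
        super().__init__()
        self.key = nn.Linear(d_model, d_attn)    # shared across all layers
        self.value = nn.Linear(d_model, d_attn)  # shared across all layers
        self.keys, self.values = [], []

    def append(self, memory_t: torch.Tensor):
        # memory_t: [batch_size, d_model] aggregated memory for the current step
        self.keys.append(self.key(memory_t))
        self.values.append(self.value(memory_t))

    def get(self):
        # Every layer reuses these stacked projections instead of re-projecting the memory.
        return torch.stack(self.keys), torch.stack(self.values)
```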

+

Here’s the training code and a notebook for training a feedback transformer on the Tiny Shakespeare dataset.

+

Colab Notebook

+

Open In Colab
View Run

+
+
+ +
+
+
\ No newline at end of file
diff --git a/docs/transformers/feedback/index.html b/docs/transformers/feedback/index.html
index 3c0322be..0cb8209d 100644
--- a/docs/transformers/feedback/index.html
+++ b/docs/transformers/feedback/index.html
@@ -78,9 +78,9 @@

 Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. Feedback transformer pays attention to the output of all layers in previous steps.
-So this adds recurrence and we need to process token-by-token.
+So this adds recurrence, and we need to process token-by-token.
 This slows down the training significantly (about 5X - 10X depending on the sequence length).
-However when predicting Feedback Transformer is faster because you can predict the next token
+However, when predicting Feedback Transformer is faster because you can predict the next token
 if you cache the memory vectors.

 In order to speed up the training the paper discusses starting with a short sequence length and gradually increasing it.
diff --git a/labml_nn/transformers/feedback/README.md b/labml_nn/transformers/feedback/README.md
new file mode 100644
index 00000000..1f02d9ad
--- /dev/null
+++ b/labml_nn/transformers/feedback/README.md
@@ -0,0 +1,36 @@
+# [Feedback Transformer](https://nn.labml.ai/transformers/feedback/index.html)
+
+This is a [PyTorch](https://pytorch.org) implementation of the paper
+[Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402).
+
+Normal transformers process tokens in parallel. Each transformer layer pays attention
+to the outputs of the previous layer.
+Feedback transformer pays attention to the output of all layers in previous steps.
+So this adds recurrence, and we need to process token-by-token.
+This slows down the training significantly (about 5X - 10X depending on the sequence length).
+However, when predicting Feedback Transformer is faster because you can predict the next token
+if you cache the memory vectors.
+
+In order to speed up the training the paper discusses starting with a short sequence length and
+gradually increasing it.
+They also discuss using a pretrained parallel transformer as the starting point.
+
+The original feedback transformer doesn't keep the outputs of all layers.
+Instead it keeps weighted sum of the output of all layers.
+This reduces the memory used for caching during prediction.
+The first half of this file implements this.
+
+The updated feedback transformer shares weights used
+to calculate keys and values among the layers.
+We then calculate the keys and values for each step only once and keep
+them cached.
+The [second half](#shared_kv) of this file implements this.
+We implemented a custom PyTorch function to improve performance.
+
+Here's [the training code](experiment.html) and a notebook for training a feedback transformer on Tiny Shakespeare dataset.
+
+[Colab Notebook](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
+[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://web.lab-ml.com/run?uuid=d8eb9416530a11eb8fb50242ac1c0002)
+"""
\ No newline at end of file
diff --git a/labml_nn/transformers/feedback/__init__.py b/labml_nn/transformers/feedback/__init__.py
index cdddf7c4..29bef0ba 100644
--- a/labml_nn/transformers/feedback/__init__.py
+++ b/labml_nn/transformers/feedback/__init__.py
@@ -13,9 +13,9 @@ This is a [PyTorch](https://pytorch.org) implementation of the paper
 Normal transformers process tokens in parallel. Each transformer layer pays attention to the outputs of the previous layer. Feedback transformer pays attention to the output of all layers in previous steps.
-So this adds recurrence and we need to process token-by-token.
+So this adds recurrence, and we need to process token-by-token.
 This slows down the training significantly (about 5X - 10X depending on the sequence length).
-However when predicting Feedback Transformer is faster because you can predict the next token
+However, when predicting Feedback Transformer is faster because you can predict the next token
 if you cache the memory vectors.
In order to speed up the training the paper discusses starting with a short sequence length and