From 1536c6ec5eda7201d3bb25cbcdb57f0c28ba18fe Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Date: Sat, 12 Mar 2022 15:51:10 +0530
Subject: [PATCH] links
---
docs/graphs/gat/index.html | 6 +++---
docs/graphs/gatv2/index.html | 6 +++---
docs/index.html | 1 +
docs/sitemap.xml | 14 +++++++-------
docs/transformers/index.html | 10 ++++++----
labml_nn/__init__.py | 1 +
labml_nn/transformers/__init__.py | 3 +++
readme.md | 1 +
8 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/docs/graphs/gat/index.html b/docs/graphs/gat/index.html
index 673796e9..ef133506 100644
--- a/docs/graphs/gat/index.html
+++ b/docs/graphs/gat/index.html
@@ -544,7 +544,7 @@
- [removed: MathJax SVG path residue of the old, garbled rendering of this formula]
+ We then normalize attention scores (or coefficients): $\alpha_{ij} = \operatorname{softmax}_j(e_{ij}) = \frac{\exp(e_{ij})}{\sum_{k \in \mathcal{N}_i} \exp(e_{ik})}$,
where $\mathcal{N}_i$ is the set of nodes connected to $i$.
We do this by setting unconnected $e_{ij}$ to $-\infty$, which makes $\exp(e_{ij}) \sim 0$ for unconnected pairs.
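For reference, the normalization described in the replacement text is a masked softmax. Below is a minimal PyTorch sketch, not code from this patch: `masked_attention_softmax` and `adj` are hypothetical names, with `adj` assumed to be a boolean adjacency matrix.

```python
import torch

def masked_attention_softmax(e: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
    """Normalize raw attention scores over each node's neighbors.

    e:   [n_nodes, n_nodes] raw scores e_ij
    adj: [n_nodes, n_nodes] boolean adjacency; adj[i, j] is True
         when j is in N_i, the set of nodes connected to i
    """
    # Set scores of unconnected pairs to -inf, so exp(e_ij) ~ 0 for them
    e = e.masked_fill(~adj, float('-inf'))
    # alpha_ij = exp(e_ij) / sum_{k in N_i} exp(e_ik), softmax over j
    return torch.softmax(e, dim=-1)
```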
diff --git a/docs/graphs/gatv2/index.html b/docs/graphs/gatv2/index.html
index bd776238..fbd1f7b4 100644
--- a/docs/graphs/gatv2/index.html
+++ b/docs/graphs/gatv2/index.html
@@ -646,7 +646,7 @@
- [removed: MathJax SVG path residue of the old, garbled rendering of this formula]
+ We then normalize attention scores (or coefficients): $\alpha_{ij} = \operatorname{softmax}_j(e_{ij}) = \frac{\exp(e_{ij})}{\sum_{j' \in \mathcal{N}_i} \exp(e_{ij'})}$,
where $\mathcal{N}_i$ is the set of nodes connected to $i$.
We do this by setting unconnected $e_{ij}$ to $-\infty$, which makes $\exp(e_{ij}) \sim 0$ for unconnected pairs.
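The GATv2 hunk applies the same masking trick; only the summation index differs ($j'$ instead of $k$). A quick sanity check, reusing the import and hypothetical helper from the sketch above:

```python
e = torch.randn(4, 4)
adj = torch.tensor([[1, 1, 0, 0],
                    [1, 1, 1, 0],
                    [0, 1, 1, 1],
                    [0, 0, 1, 1]], dtype=torch.bool)
alpha = masked_attention_softmax(e, adj)
assert torch.allclose(alpha.sum(dim=-1), torch.ones(4))  # each row sums to 1
assert (alpha[~adj] == 0).all()  # unconnected pairs get zero weight
```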
diff --git a/docs/index.html b/docs/index.html
index 833d9c0f..223bce20 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -78,6 +78,7 @@
Transformer XL
Relative multi-headed attention
Rotary Positional Embeddings
+RETRO
Compressive Transformer
GPT Architecture
GLU Variants
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index f281c909..7ca8efb2 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -211,7 +211,7 @@
https://nn.labml.ai/experiments/nlp_autoregression.html
- 2022-03-06T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
@@ -267,7 +267,7 @@
https://nn.labml.ai/distillation/small.html
- 2022-03-06T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
@@ -582,7 +582,7 @@
https://nn.labml.ai/transformers/rope/experiment.html
- 2022-03-06T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
@@ -596,7 +596,7 @@
https://nn.labml.ai/transformers/basic/autoregressive_experiment.html
- 2022-03-06T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
@@ -722,7 +722,7 @@
https://nn.labml.ai/transformers/retro/index.html
- 2022-03-10T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
@@ -883,7 +883,7 @@
https://nn.labml.ai/graphs/gat/index.html
- 2021-08-19T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
@@ -897,7 +897,7 @@
https://nn.labml.ai/graphs/gatv2/index.html
- 2021-08-19T16:30:00+00:00
+ 2022-03-12T16:30:00+00:00
1.00
diff --git a/docs/transformers/index.html b/docs/transformers/index.html
index 6ce6789d..9527c397 100644
--- a/docs/transformers/index.html
+++ b/docs/transformers/index.html
@@ -78,6 +78,8 @@
This implements the Transformer XL model using relative multi-head attention
This implements Rotary Positional Embeddings (RoPE)
+
+This implements the Retrieval-Enhanced Transformer (RETRO).
This is an implementation of the Compressive Transformer, which extends Transformer XL by compressing the oldest memories to give a longer attention span.
@@ -111,10 +113,10 @@
-106 from .configs import TransformerConfigs
-107 from .models import TransformerLayer, Encoder, Decoder, Generator, EncoderDecoder
-108 from .mha import MultiHeadAttention
-109 from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention
+109 from .configs import TransformerConfigs
+110 from .models import TransformerLayer, Encoder, Decoder, Generator, EncoderDecoder
+111 from .mha import MultiHeadAttention
+112 from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention
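The renumbering above reflects the three lines added earlier in `labml_nn/transformers/__init__.py`; the re-exported names themselves are unchanged. A minimal usage sketch of one of them follows; the constructor and call signature shown are assumptions about this revision, not taken from the patch.

```python
import torch
from labml_nn.transformers import MultiHeadAttention

# Assumed constructor arguments; MultiHeadAttention is re-exported from .mha
mha = MultiHeadAttention(heads=8, d_model=512)
x = torch.randn(10, 2, 512)  # assumed shape [seq_len, batch_size, d_model]
out = mha(query=x, key=x, value=x)
print(out.shape)  # expected: torch.Size([10, 2, 512])
```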