diff --git a/.gitignore b/.gitignore index 748c72c7..35634031 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ dist/ build/ .idea/* !.idea/dictionaries +html/ diff --git a/labml_nn/transformers/__init__.py b/labml_nn/transformers/__init__.py index 52ac31d6..7bf6c9cf 100644 --- a/labml_nn/transformers/__init__.py +++ b/labml_nn/transformers/__init__.py @@ -1,4 +1,6 @@ """ +Star + # Transformers * [Multi-head attention](mha.html) diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py index 4f635e79..863ba351 100644 --- a/labml_nn/transformers/mha.py +++ b/labml_nn/transformers/mha.py @@ -1,4 +1,6 @@ """ +Star + # Multi-Headed Attention The implementation is inspired from [Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html) diff --git a/labml_nn/transformers/relative_mha.py b/labml_nn/transformers/relative_mha.py index 1f6f8bb6..43d61aef 100644 --- a/labml_nn/transformers/relative_mha.py +++ b/labml_nn/transformers/relative_mha.py @@ -1,4 +1,6 @@ """ +Star + # Relative Multi-head Attention This is an implementation of