diff --git a/docs/transformers/gmlp/experiment.html b/docs/transformers/gmlp/experiment.html
index 5371d757..10d9da11 100644
--- a/docs/transformers/gmlp/experiment.html
+++ b/docs/transformers/gmlp/experiment.html
@@ -68,17 +68,19 @@ #

Pay Attention to MLPs (gMLP) Experiment

-This is an annotated PyTorch experiment to train a gMLP model.
+This is an annotated PyTorch experiment to train a gMLP model.
+The paper also applies Stochastic Depth regularization, where some layers are removed at random during training.
+We have not implemented that here.

This is based on training loop and configurations for a simple transformer auto-regressive NLP task.

View Run

-16  from labml import experiment
-17  from labml.configs import option
-18  from labml_nn.transformers import TransformerConfigs
-19  from labml_nn.transformers.basic.autoregressive_experiment import Configs as BasicAutoRegressionConfigs
-20  from labml_nn.transformers.gmlp import GMLPBlock
+18  from labml import experiment
+19  from labml.configs import option
+20  from labml_nn.transformers import TransformerConfigs
+21  from labml_nn.transformers.basic.autoregressive_experiment import Configs as BasicAutoRegressionConfigs
+22  from labml_nn.transformers.gmlp import GMLPBlock
@@ -91,7 +93,7 @@ training loop and configurations for a simple transformer auto-regressive NLP task.

-23  class Configs(BasicAutoRegressionConfigs):
+25  class Configs(BasicAutoRegressionConfigs):
@@ -102,7 +104,7 @@

Transformer

-32      transformer: TransformerConfigs = 'gMLP'
+34      transformer: TransformerConfigs = 'gMLP'
@@ -113,7 +115,7 @@

gMLP Block

-34      gmlp: GMLPBlock
+36      gmlp: GMLPBlock
@@ -124,7 +126,7 @@

d_ffn for gMLP projection layer

-36      d_ffn: int = 2048
+38      d_ffn: int = 2048
@@ -135,8 +137,8 @@

Create a gMLP block

-39  @option(Configs.gmlp, 'gMLP')
-40  def _gmlp_configs(c: Configs):
+41  @option(Configs.gmlp, 'gMLP')
+42  def _gmlp_configs(c: Configs):
@@ -147,7 +149,7 @@
-44      return GMLPBlock(c.d_model, c.d_ffn, c.seq_len)
+46      return GMLPBlock(c.d_model, c.d_ffn, c.seq_len)
@@ -158,8 +160,8 @@

Transformer configurations

-47  @option(Configs.transformer, 'gMLP')
-48  def _transformer_configs(c: Configs):
+49  @option(Configs.transformer, 'gMLP')
+50  def _transformer_configs(c: Configs):
@@ -171,7 +173,7 @@ configurable transformer implementation

-55      conf = TransformerConfigs()
+57      conf = TransformerConfigs()
@@ -182,8 +184,8 @@

Set the vocabulary sizes for embeddings and generating logits

-57      conf.n_src_vocab = c.n_tokens
-58      conf.n_tgt_vocab = c.n_tokens
+59      conf.n_src_vocab = c.n_tokens
+60      conf.n_tgt_vocab = c.n_tokens
@@ -194,7 +196,7 @@

Set model size

-60      conf.d_model = c.d_model
+62      conf.d_model = c.d_model
@@ -205,9 +207,9 @@

Replace the encoder layer with a gMLP layer

-62      conf.encoder_layer = c.gmlp
-63
-64      return conf
+64      conf.encoder_layer = c.gmlp
+65
+66      return conf
@@ -218,7 +220,7 @@
-67  def main():
+69  def main():
@@ -229,7 +231,7 @@

Create experiment

-69      experiment.create(name="gMLP")
+71      experiment.create(name="gMLP")
@@ -240,7 +242,7 @@

Create configs

-71      conf = Configs()
+73      conf = Configs()
@@ -251,7 +253,7 @@

Override configurations

-73      experiment.configs(conf, {
+75      experiment.configs(conf, {
@@ -262,7 +264,7 @@

Use character level tokenizer

-75          'tokenizer': 'character',
+77          'tokenizer': 'character',
@@ -273,7 +275,7 @@

Prompt separator is blank

-77          'prompt_separator': '',
+79          'prompt_separator': '',
@@ -284,7 +286,7 @@

Starting prompt for sampling

-79          'prompt': 'It is ',
+81          'prompt': 'It is ',
@@ -295,7 +297,7 @@

Use Tiny Shakespeare dataset

-81          'text': 'tiny_shakespeare',
+83          'text': 'tiny_shakespeare',
@@ -306,7 +308,7 @@

Use a context size of $256$

-84          'seq_len': 256,
+86          'seq_len': 256,
@@ -317,7 +319,7 @@

Train for $128$ epochs

-86          'epochs': 128,
+88          'epochs': 128,
@@ -328,7 +330,7 @@

Batch size $32$

-88          'batch_size': 32,
+90          'batch_size': 32,
@@ -340,7 +342,7 @@ per epoch

-91          'inner_iterations': 10,
+93          'inner_iterations': 10,
@@ -351,8 +353,8 @@ per epoch

Model size

-94          'd_model': 512,
-95          'd_ffn': 2048,
+96          'd_model': 512,
+97          'd_ffn': 2048,
@@ -363,9 +365,9 @@ per epoch

Use Noam optimizer

-98          'optimizer.optimizer': 'Noam',
-99          'optimizer.learning_rate': 1.,
-100      })
+100          'optimizer.optimizer': 'Noam',
+101          'optimizer.learning_rate': 1.,
+102      })
@@ -376,7 +378,7 @@ per epoch

Set models for saving and loading

-103      experiment.add_pytorch_models({'model': conf.model})
+105      experiment.add_pytorch_models({'model': conf.model})
@@ -387,7 +389,7 @@ per epoch

Start the experiment

-106      with experiment.start():
+108      with experiment.start():
@@ -398,7 +400,7 @@ per epoch

Run training

-108          conf.run()
+110          conf.run()
@@ -409,8 +411,8 @@ per epoch

-112  if __name__ == '__main__':
-113      main()
+114  if __name__ == '__main__':
+115      main()
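For reference, the `'optimizer.optimizer': 'Noam'` and `'optimizer.learning_rate': 1.` settings above use the Noam schedule from *Attention Is All You Need*. The snippet below is only a plain restatement of that schedule, not this repository's optimizer code; treating the configured learning rate as a simple scale factor and using `warmup=4000` steps are assumptions for illustration.

```python
def noam_lr(step: int, d_model: int = 512, warmup: int = 4000, scale: float = 1.0) -> float:
    """Noam learning-rate schedule: warm up linearly, then decay as 1/sqrt(step)."""
    step = max(step, 1)  # avoid 0 ** -0.5 on the very first step
    return scale * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)


# The peak learning rate is reached at step == warmup.
print(noam_lr(4000))  # ~0.0007 for d_model=512, warmup=4000
```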
diff --git a/labml_nn/transformers/gmlp/experiment.py b/labml_nn/transformers/gmlp/experiment.py index 40bdabec..78753299 100644 --- a/labml_nn/transformers/gmlp/experiment.py +++ b/labml_nn/transformers/gmlp/experiment.py @@ -7,6 +7,8 @@ summary: This experiment trains a gMLP based model on Tiny Shakespeare dataset. # [Pay Attention to MLPs (gMLP)](index.html) Experiment This is an annotated PyTorch experiment to train a [gMLP model](index.html). +The paper also applies a Stochastic Depth regularization where some layers are removed randomly during training. +We have not implemented that here. This is based on [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_experiment.html).
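The docstring added above notes that the paper also uses Stochastic Depth, which this experiment does not implement. As a rough illustration only (not the paper's or this repository's code), here is a minimal per-sample drop-path sketch in PyTorch. The class names, the `drop_prob=0.1` default, the `[seq_len, batch_size, d_model]` tensor layout, and the assumption that the wrapped layer returns only its residual branch are all hypothetical; `GMLPBlock` in this repository already adds its own shortcut internally.

```python
import torch
import torch.nn as nn


class StochasticDepth(nn.Module):
    """Randomly drop the residual branch per sample during training.

    Kept branches are rescaled by 1 / (1 - drop_prob) so the expected
    output matches evaluation, where the branch is always kept.
    """

    def __init__(self, drop_prob: float = 0.1):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.training or self.drop_prob == 0.:
            return x
        keep_prob = 1. - self.drop_prob
        # One Bernoulli sample per batch element; assumes a
        # [seq_len, batch_size, d_model] layout.
        mask = x.new_empty((1, x.shape[1], 1)).bernoulli_(keep_prob)
        return x * mask / keep_prob


class ResidualWithStochasticDepth(nn.Module):
    """Hypothetical wrapper `x + drop_path(layer(x))` for a layer that
    returns only its residual branch (for illustration only)."""

    def __init__(self, layer: nn.Module, drop_prob: float = 0.1):
        super().__init__()
        self.layer = layer
        self.drop_path = StochasticDepth(drop_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.drop_path(self.layer(x))
```

In the original stochastic depth formulation the drop probability typically increases linearly with depth, so later blocks are skipped more often; whether to use a constant or depth-dependent schedule is a design choice.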