diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index b6b0ea1f..3239ed42 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -405,6 +405,20 @@
+
+    <url>
+        <loc>https://nn.labml.ai/transformers/alibi/index.html</loc>
+        <lastmod>2021-08-28T16:30:00+00:00</lastmod>
+        <priority>1.00</priority>
+    </url>
+
+
+    <url>
+        <loc>https://nn.labml.ai/transformers/alibi/experiment.html</loc>
+        <lastmod>2021-08-28T16:30:00+00:00</lastmod>
+        <priority>1.00</priority>
+    </url>
+
     <url>
         <loc>https://nn.labml.ai/transformers/gmlp/index.html</loc>
         <lastmod>2021-06-07T16:30:00+00:00</lastmod>

diff --git a/docs/transformers/knn/eval_knn.html b/docs/transformers/knn/eval_knn.html
index 530d2141..e06a21cf 100644
--- a/docs/transformers/knn/eval_knn.html
+++ b/docs/transformers/knn/eval_knn.html
@@ -87,7 +87,7 @@ #

$k$-NN to get $p(w_t, c_t)$

-Here we refer to $f($\color{yellowgreen}{c_t})$ as queries,
+Here we refer to $f(\color{yellowgreen}{c_t})$ as queries,
$f(c_i)$ as keys and $w_i$ as values.

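For reference, here is a minimal sketch of the datastore this query/key/value framing refers to, assuming a FAISS flat index; the names (`keys_store`, `vals_store`, `d_model`) and sizes are illustrative, not the repo's exact code. The keys are the context embeddings $f(c_i)$ and the values are the tokens $w_i$ that followed each context.

```python
import faiss
import numpy as np

# Illustrative shapes only; in practice the keys come from the trained
# transformer's representations f(c_i) saved while indexing the training set.
d_model = 512
keys_store = np.random.rand(100_000, d_model).astype('float32')  # keys: f(c_i)
vals_store = np.random.randint(0, 65, size=100_000)              # values: next tokens w_i (toy vocab)

index = faiss.IndexFlatL2(d_model)  # exact L2 index over the keys
index.add(keys_store)
```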
@@ -121,7 +121,7 @@ $f(c_i)$ as keys and $w_i$ as values.

-Find 10 nearest neighbors of $f($\color{yellowgreen}{c_t})$ among $f(c_i)$.
+Find 10 nearest neighbors of $f(\color{yellowgreen}{c_t})$ among $f(c_i)$.
distance is the distance given by FAISS and idx, $i$ is the index of it in keys_store.

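Building on the datastore sketched above (same illustrative names), this is roughly what the lookup looks like, plus one simple way to turn the 10 neighbors into a distribution over next tokens. Weighting neighbors by a softmax over negative FAISS distances is a common choice for $k$-NN language models, not necessarily the exact scheme used here.

```python
query = np.random.rand(1, d_model).astype('float32')  # query: f(c_t)
distance, idx = index.search(query, 10)               # FAISS distances and row indices into keys_store

# Softmax over negative distances, accumulating weight on each neighbor's value token w_i.
logits = -distance[0]
weights = np.exp(logits - logits.max())
weights /= weights.sum()

n_tokens = 65                                          # vocabulary size (illustrative)
p_knn = np.zeros(n_tokens)
for w, i in zip(weights, idx[0]):
    p_knn[vals_store[i]] += w                          # p_knn approximates p(w_t | c_t)
```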
diff --git a/docs/transformers/switch/experiment.html b/docs/transformers/switch/experiment.html
index 167e9e9e..c9d45783 100644
--- a/docs/transformers/switch/experiment.html
+++ b/docs/transformers/switch/experiment.html
@@ -477,10 +477,11 @@ $f_i$ is the count of tokens where the argmax of $p(x)$ is equal to $i$.

Load balancing loss
-
+$\mathscr{L}$ is the loss for a single layer and here we are
+taking the sum of losses across all layers.

-122        load_balancing_loss = self.n_experts * (route_frac * route_prob).sum()
+124        load_balancing_loss = self.n_experts * (route_frac * route_prob).sum()
@@ -491,12 +492,12 @@ $f_i$ is the count of tokens where the argmax of $p(x)$ is equal to $i$.

Track stats

-125        tracker.add('dropped.', total.new_tensor(n_dropped) / total)
-126        tracker.add('route.min.', route_frac.min())
-127        tracker.add('route.max.', route_frac.max())
-128        tracker.add('route.std.', route_frac.std())
-129        tracker.add("loss.", cross_entropy_loss)
-130        tracker.add("lb_loss.", load_balancing_loss)
+127        tracker.add('dropped.', total.new_tensor(n_dropped) / total)
+128        tracker.add('route.min.', route_frac.min())
+129        tracker.add('route.max.', route_frac.max())
+130        tracker.add('route.std.', route_frac.std())
+131        tracker.add("loss.", cross_entropy_loss)
+132        tracker.add("lb_loss.", load_balancing_loss)
@@ -509,7 +510,7 @@ The load balancing loss is multiplied by a coefficient $\alpha$ which is set to something small like $\alpha = 0.01$.

-135        loss = cross_entropy_loss + self.load_balancing_loss_ceof * load_balancing_loss
+137        loss = cross_entropy_loss + self.load_balancing_loss_ceof * load_balancing_loss
@@ -520,8 +521,8 @@ set to something small like $\alpha = 0.01$.

Calculate and log accuracy

-138        self.accuracy(output, target)
-139        self.accuracy.track()
+140        self.accuracy(output, target)
+141        self.accuracy.track()
@@ -532,7 +533,7 @@ set to something small like $\alpha = 0.01$.

Train the model

-142        if self.mode.is_train:
+144        if self.mode.is_train:
@@ -543,7 +544,7 @@ set to something small like $\alpha = 0.01$.

Calculate gradients

-144            loss.backward()
+146            loss.backward()
@@ -554,7 +555,7 @@ set to something small like $\alpha = 0.01$.

Clip gradients

-146            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+148            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
@@ -565,7 +566,7 @@ set to something small like $\alpha = 0.01$.

Take optimizer step

-148            self.optimizer.step()
+150            self.optimizer.step()
@@ -576,8 +577,8 @@ set to something small like $\alpha = 0.01$.

Log the model parameters and gradients on last batch of every epoch

-150            if batch_idx.is_last:
-151                tracker.add('model', self.model)
+152            if batch_idx.is_last:
+153                tracker.add('model', self.model)
@@ -588,7 +589,7 @@ set to something small like $\alpha = 0.01$.

Clear the gradients

-153            self.optimizer.zero_grad()
+155            self.optimizer.zero_grad()
@@ -599,7 +600,7 @@ set to something small like $\alpha = 0.01$.

Save the tracked metrics

-156        tracker.save()
+158        tracker.save()
@@ -610,8 +611,8 @@ set to something small like $\alpha = 0.01$.

Initialize the auto-regressive model

-159@option(Configs.model)
-160def autoregressive_model(c: Configs):
+161@option(Configs.model)
+162def autoregressive_model(c: Configs):
@@ -622,8 +623,8 @@ set to something small like $\alpha = 0.01$.

-164    m = AutoregressiveModel(c.n_tokens, c.d_model, c.transformer)
-165    return m.to(c.device)
+166    m = AutoregressiveModel(c.n_tokens, c.d_model, c.transformer)
+167    return m.to(c.device)
@@ -634,8 +635,8 @@ set to something small like $\alpha = 0.01$.

Initialize the switch transformer

-168@option(Configs.transformer)
-169def switch_transformer(c: Configs):
+170@option(Configs.transformer)
+171def switch_transformer(c: Configs):
@@ -646,21 +647,21 @@ set to something small like $\alpha = 0.01$.

-173    from labml_nn.transformers.switch import SwitchTransformer, SwitchTransformerLayer, SwitchFeedForward
-174    from labml_nn.transformers import MultiHeadAttention
-175    from labml_nn.transformers.feed_forward import FeedForward
-176
-177    return SwitchTransformer(
-178        SwitchTransformerLayer(d_model=c.d_model,
-179                               attn=MultiHeadAttention(c.heads, c.d_model, c.dropout),
-180                               feed_forward=SwitchFeedForward(capacity_factor=c.capacity_factor,
-181                                                              drop_tokens=c.drop_tokens,
-182                                                              is_scale_prob=c.is_scale_prob,
-183                                                              n_experts=c.n_experts,
-184                                                              expert=FeedForward(c.d_model, c.d_ff, c.dropout),
-185                                                              d_model=c.d_model),
-186                               dropout_prob=c.dropout),
-187        c.n_layers)
+175    from labml_nn.transformers.switch import SwitchTransformer, SwitchTransformerLayer, SwitchFeedForward
+176    from labml_nn.transformers import MultiHeadAttention
+177    from labml_nn.transformers.feed_forward import FeedForward
+178
+179    return SwitchTransformer(
+180        SwitchTransformerLayer(d_model=c.d_model,
+181                               attn=MultiHeadAttention(c.heads, c.d_model, c.dropout),
+182                               feed_forward=SwitchFeedForward(capacity_factor=c.capacity_factor,
+183                                                              drop_tokens=c.drop_tokens,
+184                                                              is_scale_prob=c.is_scale_prob,
+185                                                              n_experts=c.n_experts,
+186                                                              expert=FeedForward(c.d_model, c.d_ff, c.dropout),
+187                                                              d_model=c.d_model),
+188                               dropout_prob=c.dropout),
+189        c.n_layers)
@@ -671,7 +672,7 @@ set to something small like $\alpha = 0.01$.

Run the experiment

-190def main():
+192def main():
@@ -682,7 +683,7 @@ set to something small like $\alpha = 0.01$.

Create experiment

-195    experiment.create(name="switch_transformer", comment='')
+197    experiment.create(name="switch_transformer", comment='')
@@ -693,7 +694,7 @@ set to something small like $\alpha = 0.01$.

Create configs

-197    conf = Configs()
+199    conf = Configs()
@@ -704,7 +705,7 @@ set to something small like $\alpha = 0.01$.

Load configurations

-199    experiment.configs(conf,
+201    experiment.configs(conf,
@@ -715,28 +716,28 @@ set to something small like $\alpha = 0.01$.

A dictionary of configurations to override

-201                       {'tokenizer': 'character',
-202                        'text': 'tiny_shakespeare',
-203                        'optimizer.learning_rate': 1.,
-204                        'optimizer.optimizer': 'Noam',
-205                        'prompt': 'It is',
-206                        'prompt_separator': '',
-207
-208                        'transformer': 'switch_transformer',
-209                        'is_scale_prob': False,
-210                        'n_experts': 4,
-211
-212                        'drop_tokens': True,
-213                        'capacity_factor': 1.2,
-214
-215                        'train_loader': 'shuffled_train_loader',
-216                        'valid_loader': 'shuffled_valid_loader',
-217
-218                        'seq_len': 64,
-219                        'epochs': 128,
-220                        'batch_size': 32,
-221                        'inner_iterations': 25,
-222                        })
+203                       {'tokenizer': 'character',
+204                        'text': 'tiny_shakespeare',
+205                        'optimizer.learning_rate': 1.,
+206                        'optimizer.optimizer': 'Noam',
+207                        'prompt': 'It is',
+208                        'prompt_separator': '',
+209
+210                        'transformer': 'switch_transformer',
+211                        'is_scale_prob': False,
+212                        'n_experts': 4,
+213
+214                        'drop_tokens': True,
+215                        'capacity_factor': 1.2,
+216
+217                        'train_loader': 'shuffled_train_loader',
+218                        'valid_loader': 'shuffled_valid_loader',
+219
+220                        'seq_len': 64,
+221                        'epochs': 128,
+222                        'batch_size': 32,
+223                        'inner_iterations': 25,
+224                        })
@@ -747,7 +748,7 @@ set to something small like $\alpha = 0.01$.

Set models for saving and loading

-225    experiment.add_pytorch_models({'model': conf.model})
+227    experiment.add_pytorch_models({'model': conf.model})
@@ -758,7 +759,7 @@ set to something small like $\alpha = 0.01$.

Start the experiment

-228    with experiment.start():
+230    with experiment.start():
@@ -769,7 +770,7 @@ set to something small like $\alpha = 0.01$.

TrainValidConfigs.run

-230        conf.run()
+232        conf.run()
@@ -780,8 +781,8 @@ set to something small like $\alpha = 0.01$.

-234if __name__ == '__main__':
-235    main()
+236if __name__ == '__main__':
+237    main()