diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index b6b0ea1f..3239ed42 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -405,6 +405,20 @@
- Here we refer to $f($\color{yellowgreen}{c_t})$ as queries,
+ Here we refer to $f(\color{yellowgreen}{c_t})$ as queries,
$f(c_i)$ as keys and $w_i$ as values.
- Find 10 nearest neighbors of $f($\color{yellowgreen}{c_t})$ among $f(c_i)$.
+ Find 10 nearest neighbors of $f(\color{yellowgreen}{c_t})$ among $f(c_i)$.
-$k$-NN to get $p(w_t | c_t)$
-distance is the distance given by FAISS and idx, $i$, is the index of it in keys_store.
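As a rough sketch of this lookup, assuming keys_store holds the $f(c_i)$ key vectors and queries holds the $f(\color{yellowgreen}{c_t})$ query vectors as float32 NumPy arrays (sizes below are illustrative):

    import faiss
    import numpy as np

    d = 256                                                    # embedding size (illustrative)
    keys_store = np.random.rand(10_000, d).astype('float32')  # f(c_i) for each stored context
    queries = np.random.rand(8, d).astype('float32')          # f(c_t) for the current contexts

    index = faiss.IndexFlatL2(d)                 # exact L2 index over the keys
    index.add(keys_store)
    distance, idx = index.search(queries, 10)    # 10 nearest neighbors per query
    # distance[j, i] is the L2 distance to the i-th neighbor of query j;
    # idx[j, i] is its row in keys_store, which also indexes the stored values w_i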
Load balancing loss
-122 load_balancing_loss = self.n_experts * (route_frac * route_prob).sum()
+124 load_balancing_loss = self.n_experts * (route_frac * route_prob).sum()
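A minimal sketch of how this auxiliary loss is formed, assuming route_prob is the router's softmax output of shape [n_tokens, n_experts] (names and shapes here are illustrative, not the library's internals):

    import torch

    def switch_load_balancing_loss(route_prob: torch.Tensor, n_experts: int) -> torch.Tensor:
        # route_prob: [n_tokens, n_experts], softmax of the router logits
        chosen = route_prob.argmax(dim=-1)                             # top-1 expert per token
        counts = torch.bincount(chosen, minlength=n_experts).float()
        route_frac = counts / route_prob.shape[0]                      # fraction of tokens per expert
        mean_prob = route_prob.mean(dim=0)                             # mean routing probability per expert
        return n_experts * (route_frac * mean_prob).sum()

This term is added to the cross-entropy loss with a coefficient, which is what the loss line further down does.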
Track stats
-125 tracker.add('dropped.', total.new_tensor(n_dropped) / total)
-126 tracker.add('route.min.', route_frac.min())
-127 tracker.add('route.max.', route_frac.max())
-128 tracker.add('route.std.', route_frac.std())
-129 tracker.add("loss.", cross_entropy_loss)
-130 tracker.add("lb_loss.", load_balancing_loss)
+127 tracker.add('dropped.', total.new_tensor(n_dropped) / total)
+128 tracker.add('route.min.', route_frac.min())
+129 tracker.add('route.max.', route_frac.max())
+130 tracker.add('route.std.', route_frac.std())
+131 tracker.add("loss.", cross_entropy_loss)
+132 tracker.add("lb_loss.", load_balancing_loss)
-135 loss = cross_entropy_loss + self.load_balancing_loss_ceof * load_balancing_loss
+137 loss = cross_entropy_loss + self.load_balancing_loss_ceof * load_balancing_loss
Calculate and log accuracy
-138 self.accuracy(output, target)
-139 self.accuracy.track()
+140 self.accuracy(output, target)
+141 self.accuracy.track()
Train the model
-142 if self.mode.is_train:
+144 if self.mode.is_train:
Calculate gradients
-144 loss.backward()
+146 loss.backward()
Clip gradients
-146 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+148 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
Take optimizer step
-148 self.optimizer.step()
+150 self.optimizer.step()
Log the model parameters and gradients on last batch of every epoch
-150 if batch_idx.is_last:
-151 tracker.add('model', self.model)
+152 if batch_idx.is_last:
+153 tracker.add('model', self.model)
Clear the gradients
-153 self.optimizer.zero_grad()
+155 self.optimizer.zero_grad()
Save the tracked metrics
-156 tracker.save()
+158 tracker.save()
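Put together, the update above follows the standard PyTorch training pattern; a condensed sketch with illustrative names (tracker.add('model', ...) and tracker.save() are the labml calls shown above):

    import torch
    from labml import tracker

    def training_update(model, optimizer, loss, grad_norm_clip, is_last_batch):
        loss.backward()                                     # compute gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_norm_clip)
        optimizer.step()                                    # apply the update
        if is_last_batch:
            tracker.add('model', model)                     # log parameter and gradient stats
        optimizer.zero_grad()                               # clear gradients for the next batch
        tracker.save()                                      # write the tracked metrics for this step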
-159@option(Configs.model)
-160def autoregressive_model(c: Configs):
+161@option(Configs.model)
+162def autoregressive_model(c: Configs):
-164 m = AutoregressiveModel(c.n_tokens, c.d_model, c.transformer)
-165 return m.to(c.device)
+166 m = AutoregressiveModel(c.n_tokens, c.d_model, c.transformer)
+167 return m.to(c.device)
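The @option decorator registers a lazily evaluated default for a config item, so the model is only built (and moved to the configured device) when it is first needed. A small self-contained sketch of the pattern; DemoConfigs and its fields are made up for illustration:

    from labml.configs import BaseConfigs, option


    class DemoConfigs(BaseConfigs):
        d_model: int = 512
        description: str


    @option(DemoConfigs.description)
    def default_description(c: DemoConfigs):
        # computed only when c.description is first accessed,
        # so it can depend on other (possibly overridden) config values
        return f'model width is {c.d_model}'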
-168@option(Configs.transformer)
-169def switch_transformer(c: Configs):
+170@option(Configs.transformer)
+171def switch_transformer(c: Configs):
-173 from labml_nn.transformers.switch import SwitchTransformer, SwitchTransformerLayer, SwitchFeedForward
-174 from labml_nn.transformers import MultiHeadAttention
-175 from labml_nn.transformers.feed_forward import FeedForward
-176
-177 return SwitchTransformer(
-178 SwitchTransformerLayer(d_model=c.d_model,
-179 attn=MultiHeadAttention(c.heads, c.d_model, c.dropout),
-180 feed_forward=SwitchFeedForward(capacity_factor=c.capacity_factor,
-181 drop_tokens=c.drop_tokens,
-182 is_scale_prob=c.is_scale_prob,
-183 n_experts=c.n_experts,
-184 expert=FeedForward(c.d_model, c.d_ff, c.dropout),
-185 d_model=c.d_model),
-186 dropout_prob=c.dropout),
-187 c.n_layers)
+175 from labml_nn.transformers.switch import SwitchTransformer, SwitchTransformerLayer, SwitchFeedForward
+176 from labml_nn.transformers import MultiHeadAttention
+177 from labml_nn.transformers.feed_forward import FeedForward
+178
+179 return SwitchTransformer(
+180 SwitchTransformerLayer(d_model=c.d_model,
+181 attn=MultiHeadAttention(c.heads, c.d_model, c.dropout),
+182 feed_forward=SwitchFeedForward(capacity_factor=c.capacity_factor,
+183 drop_tokens=c.drop_tokens,
+184 is_scale_prob=c.is_scale_prob,
+185 n_experts=c.n_experts,
+186 expert=FeedForward(c.d_model, c.d_ff, c.dropout),
+187 d_model=c.d_model),
+188 dropout_prob=c.dropout),
+189 c.n_layers)
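For intuition about what SwitchFeedForward does with these arguments, here is a stripped-down top-1 routing layer. It is an illustration only; unlike the library implementation it has no capacity limit or token dropping:

    import torch
    import torch.nn as nn


    class TinySwitchFFN(nn.Module):
        """Top-1 mixture-of-experts FFN, simplified for illustration."""

        def __init__(self, d_model: int, d_ff: int, n_experts: int, is_scale_prob: bool = True):
            super().__init__()
            self.experts = nn.ModuleList([
                nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
                for _ in range(n_experts)
            ])
            self.router = nn.Linear(d_model, n_experts)
            self.is_scale_prob = is_scale_prob

        def forward(self, x: torch.Tensor):
            # x: [seq_len, batch_size, d_model]
            seq_len, batch_size, d_model = x.shape
            tokens = x.reshape(-1, d_model)
            route_prob = torch.softmax(self.router(tokens), dim=-1)
            prob, expert_idx = route_prob.max(dim=-1)        # top-1 routing decision
            out = torch.zeros_like(tokens)
            for i, expert in enumerate(self.experts):
                mask = expert_idx == i
                if mask.any():
                    out[mask] = expert(tokens[mask])
            # scale by the routing probability; with is_scale_prob=False the output is
            # multiplied by 1 but the gradient path to the router is kept
            scale = prob if self.is_scale_prob else prob / prob.detach()
            out = out * scale.unsqueeze(-1)
            return out.reshape(seq_len, batch_size, d_model), route_prob

For example, TinySwitchFFN(d_model=128, d_ff=512, n_experts=4)(torch.randn(64, 32, 128)) returns the routed output and the per-token routing probabilities.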
-190def main():
+192def main():
Create experiment
-195 experiment.create(name="switch_transformer", comment='')
+197 experiment.create(name="switch_transformer", comment='')
Create configs
-197 conf = Configs()
+199 conf = Configs()
Load configurations
-199 experiment.configs(conf,
+201 experiment.configs(conf,
A dictionary of configurations to override
-201 {'tokenizer': 'character',
-202 'text': 'tiny_shakespeare',
-203 'optimizer.learning_rate': 1.,
-204 'optimizer.optimizer': 'Noam',
-205 'prompt': 'It is',
-206 'prompt_separator': '',
-207
-208 'transformer': 'switch_transformer',
-209 'is_scale_prob': False,
-210 'n_experts': 4,
-211
-212 'drop_tokens': True,
-213 'capacity_factor': 1.2,
-214
-215 'train_loader': 'shuffled_train_loader',
-216 'valid_loader': 'shuffled_valid_loader',
-217
-218 'seq_len': 64,
-219 'epochs': 128,
-220 'batch_size': 32,
-221 'inner_iterations': 25,
-222 })
+203 {'tokenizer': 'character',
+204 'text': 'tiny_shakespeare',
+205 'optimizer.learning_rate': 1.,
+206 'optimizer.optimizer': 'Noam',
+207 'prompt': 'It is',
+208 'prompt_separator': '',
+209
+210 'transformer': 'switch_transformer',
+211 'is_scale_prob': False,
+212 'n_experts': 4,
+213
+214 'drop_tokens': True,
+215 'capacity_factor': 1.2,
+216
+217 'train_loader': 'shuffled_train_loader',
+218 'valid_loader': 'shuffled_valid_loader',
+219
+220 'seq_len': 64,
+221 'epochs': 128,
+222 'batch_size': 32,
+223 'inner_iterations': 25,
+224 })
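The 'Noam' optimizer setting refers to the warmup-then-decay learning-rate schedule from Attention Is All You Need; whether 'optimizer.learning_rate': 1. enters exactly as the overall factor below is an assumption here, but the shape of the schedule is:

    def noam_lr(step: int, d_model: int, warmup: int = 4000, factor: float = 1.0) -> float:
        # rises linearly for warmup steps, then decays proportionally to step ** -0.5
        step = max(step, 1)
        return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)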
Set models for saving and loading
-225 experiment.add_pytorch_models({'model': conf.model})
+227 experiment.add_pytorch_models({'model': conf.model})
Start the experiment
-228 with experiment.start():
+230 with experiment.start():
TrainValidConfigs.run
-230 conf.run()
+232 conf.run()
-234if __name__ == '__main__':
-235 main()
+236if __name__ == '__main__':
+237 main()