diff --git a/docs/experiments/nlp_autoregression.html b/docs/experiments/nlp_autoregression.html
index 9108f515..e35de1a6 100644
--- a/docs/experiments/nlp_autoregression.html
+++ b/docs/experiments/nlp_autoregression.html
@@ -357,15 +357,15 @@
Whether to log model parameters and gradients (once per epoch). These are summarized stats per layer, but they can still produce a large number of indicators for very deep networks.
94 def init(self):
97 is_log_model_params_grads: bool = False
Set tracker configurations
+Whether to log model activations (once per epoch). These are summarized stats per layer, but they can still produce a large number of indicators for very deep networks.
99 tracker.set_scalar("accuracy.*", True)
-100 tracker.set_scalar("loss.*", True)
102 is_log_model_activations: bool = False
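Both switches are plain boolean config fields, so they can be turned on from the configurations dictionary when launching an experiment, following the same override pattern the `main()` functions later in this diff use. A minimal sketch (the experiment name is made up, and a real run would also need the model and dataset options set):

```python
from labml import experiment
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs

# Hypothetical setup; the point is only the two optional logging flags.
conf = NLPAutoRegressionConfigs()
experiment.create(name='nlp_logging_demo')
experiment.configs(conf, {
    'is_log_model_params_grads': True,  # per-layer parameter/gradient stats
    'is_log_model_activations': True,   # per-layer activation stats
})
```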
102 hook_model_outputs(self.mode, self.model, 'model')
104 def init(self):
Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.
+Set tracker configurations
107 self.state_modules = [self.accuracy]
109 tracker.set_scalar("accuracy.*", True)
+110 tracker.set_scalar("loss.*", True)
109 def other_metrics(self, output: torch.Tensor, target: torch.Tensor):
112 hook_model_outputs(self.mode, self.model, 'model')
Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.
+111 pass
117 self.state_modules = [self.accuracy]
113 def step(self, batch: any, batch_idx: BatchIndex):
119 def other_metrics(self, output: torch.Tensor, target: torch.Tensor):
Set training/eval mode
-119 self.model.train(self.mode.is_train)
121 pass
122 data, target = batch[0].to(self.device), batch[1].to(self.device)
123 def step(self, batch: any, batch_idx: BatchIndex):
Update global step (number of tokens processed) when in training mode
+Set training/eval mode
125 if self.mode.is_train:
-126 tracker.add_global_step(data.shape[0] * data.shape[1])
129 self.model.train(self.mode.is_train)
129 with self.mode.update(is_log_activations=batch_idx.is_last):
132 data, target = batch[0].to(self.device), batch[1].to(self.device)
Get model outputs. It's returning a tuple for states when using RNNs. This is not implemented yet.
+Update global step (number of tokens processed) when in training mode
133 output, *_ = self.model(data)
135 if self.mode.is_train:
+136 tracker.add_global_step(data.shape[0] * data.shape[1])
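Since the batches are laid out sequence-first, `data.shape[0] * data.shape[1]` is just the total number of tokens in the batch, so the global step ends up counting tokens processed. A tiny sanity check with made-up shapes:

```python
import torch

# A made-up sequence-first batch: seq_len=256, batch_size=16.
data = torch.zeros(256, 16, dtype=torch.long)

# Tokens consumed in this training step.
tokens_this_step = data.shape[0] * data.shape[1]
assert tokens_this_step == 4096
```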
136 loss = self.loss_func(output, target)
-137 tracker.add("loss.", loss)
139 with self.mode.update(is_log_activations=batch_idx.is_last and self.is_log_model_activations):
Calculate and log accuracy
+Get model outputs. It's returning a tuple for states when using RNNs. This is not implemented yet.
140 self.accuracy(output, target)
-141 self.accuracy.track()
-142
-143 self.other_metrics(output, target)
143 output, *_ = self.model(data)
146 if self.mode.is_train:
146 loss = self.loss_func(output, target)
+147 tracker.add("loss.", loss)
148 loss.backward()
150 self.accuracy(output, target)
+151 self.accuracy.track()
+152
+153 self.other_metrics(output, target)
150 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
156 if self.mode.is_train:
152 self.optimizer.step()
158 loss.backward()
Log the model parameters and gradients on the last batch of every epoch
+Clip gradients
154 if batch_idx.is_last:
-155 tracker.add('model', self.model)
160 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
157 self.optimizer.zero_grad()
162 self.optimizer.step()
Save the tracked metrics
+Log the model parameters and gradients on the last batch of every epoch
160 tracker.save()
164 if batch_idx.is_last and self.is_log_model_params_grads:
+165 tracker.add('model', self.model)
162 def sample(self):
167 self.optimizer.zero_grad()
168 prompt = self.prompt
170 tracker.save()
170 log = [(prompt, Text.subtle)]
172 def sample(self):
172 for i in monit.iterate('Sample', 25):
178 prompt = self.prompt
174 data = self.text.text_to_i(prompt).unsqueeze(-1)
-175 data = data.to(self.device)
180 log = [(prompt, Text.subtle)]
177 output, *_ = self.model(data)
182 for i in monit.iterate('Sample', 25):
179 output = output.argmax(dim=-1).squeeze()
184 data = self.text.text_to_i(prompt).unsqueeze(-1)
+185 data = data.to(self.device)
181 prompt += self.prompt_separator + self.text.itos[output[-1]]
187 output, *_ = self.model(data)
183 log += [(self.prompt_separator + self.text.itos[output[-1]], Text.value)]
189 output = output.argmax(dim=-1).squeeze()
186 logger.log(log)
191 prompt += self.prompt_separator + self.text.itos[output[-1]]
189@option(NLPAutoRegressionConfigs.optimizer)
-190def _optimizer(c: NLPAutoRegressionConfigs):
193 log += [(self.prompt_separator + self.text.itos[output[-1]], Text.value)]
Print the sampled output
+195 optimizer = OptimizerConfigs()
-196 optimizer.parameters = c.model.parameters()
-197 optimizer.optimizer = 'Adam'
-198 optimizer.d_model = c.d_model
-199
-200 return optimizer
196 logger.log(log)
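The sampling loop above is plain greedy decoding: encode the prompt, run the model, take the arg-max at the last position, and append that character to the prompt. A condensed sketch of the same idea (the model and the `text_to_i`/`itos` helpers are assumed, mirroring the names used above; the prompt separator is left out):

```python
import torch

def greedy_sample(model, text_to_i, itos, prompt: str, steps: int = 25, device: str = 'cpu') -> str:
    """Greedy character-by-character decoding, mirroring `sample()` above."""
    with torch.no_grad():
        for _ in range(steps):
            # Shape the prompt as [seq_len, batch_size=1], as the models expect.
            data = text_to_i(prompt).unsqueeze(-1).to(device)
            # The model returns (logits, state); only the logits are needed here.
            output, *_ = model(data)
            # Most probable token at the last position of the single sequence.
            next_token = int(output[-1, 0].argmax())
            prompt += itos[next_token]
    return prompt
```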
203@option(NLPAutoRegressionConfigs.n_tokens)
-204def _n_tokens(c: NLPAutoRegressionConfigs):
199@option(NLPAutoRegressionConfigs.optimizer)
+200def _optimizer(c: NLPAutoRegressionConfigs):
208 return c.text.n_tokens
205 optimizer = OptimizerConfigs()
+206 optimizer.parameters = c.model.parameters()
+207 optimizer.optimizer = 'Adam'
+208 optimizer.d_model = c.d_model
+209
+210 return optimizer
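For reference, the option above is roughly the labml-configs way of writing the following plain PyTorch setup; the model and learning rate here are placeholders (the experiment further down uses 3e-4), and `d_model` is presumably passed so that `OptimizerConfigs` can drive learning-rate schedules that depend on it when such an optimizer is selected:

```python
import torch

# A rough plain-PyTorch stand-in for the option above.
model = torch.nn.Linear(64, 64)               # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
```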
We use a character-level tokenizer in this experiment. You can switch by setting,
-'tokenizer': 'basic_english',
-in the configurations dictionary when starting the experiment.
+Get number of tokens
211@option(NLPAutoRegressionConfigs.tokenizer)
-212def basic_english():
213@option(NLPAutoRegressionConfigs.n_tokens)
+214def _n_tokens(c: NLPAutoRegressionConfigs):
226 from torchtext.data import get_tokenizer
-227 return get_tokenizer('basic_english')
218 return c.text.n_tokens
We use a character-level tokenizer in this experiment. You can switch by setting,
+'tokenizer': 'basic_english',
+in the configurations dictionary when starting the experiment.
230def character_tokenizer(x: str):
221@option(NLPAutoRegressionConfigs.tokenizer)
+222def basic_english():
234 return list(x)
236 from torchtext.data import get_tokenizer
+237 return get_tokenizer('basic_english')
237@option(NLPAutoRegressionConfigs.tokenizer)
-238def character():
240def character_tokenizer(x: str):
242 return character_tokenizer
244 return list(x)
It will download from the URL if the file is not present.
+245@option(NLPAutoRegressionConfigs.text)
-246def tiny_shakespeare(c: NLPAutoRegressionConfigs):
247@option(NLPAutoRegressionConfigs.tokenizer)
+248def character():
252 return TextFileDataset(
-253 lab.get_data_path() / 'tiny_shakespeare.txt',
-254 c.tokenizer,
-255 url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
252 return character_tokenizer
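To make the two tokenizer options concrete, here is a small side-by-side comparison (the sample sentence is arbitrary; `basic_english` lower-cases and splits on words and punctuation, while the character tokenizer simply splits into characters):

```python
from torchtext.data import get_tokenizer

def character_tokenizer(x: str):
    return list(x)

basic_english = get_tokenizer('basic_english')

text = "It is a truth."
print(character_tokenizer(text))  # ['I', 't', ' ', 'i', 's', ' ', 'a', ...]
print(basic_english(text))        # ['it', 'is', 'a', 'truth', '.']
```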
It will download from the URL if the file is not present.
258@option(NLPAutoRegressionConfigs.train_loader)
-259def sequential_train_loader(c: NLPAutoRegressionConfigs):
255@option(NLPAutoRegressionConfigs.text)
+256def tiny_shakespeare(c: NLPAutoRegressionConfigs):
263 return SequentialDataLoader(text=c.text.train,
-264 dataset=c.text,
-265 batch_size=c.batch_size,
-266 seq_len=c.seq_len)
262 return TextFileDataset(
+263 lab.get_data_path() / 'tiny_shakespeare.txt',
+264 c.tokenizer,
+265 url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
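`TextFileDataset` handles the download itself; the download-if-absent step is roughly the following standard-library sketch (written only to illustrate that step, not the tokenizing and splitting the dataset class also does):

```python
from pathlib import Path
from urllib.request import urlretrieve

def fetch_text(path: Path, url: str) -> str:
    """Download the file to `path` if it is not already there, then read it."""
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        urlretrieve(url, path)
    return path.read_text()
```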
269@option(NLPAutoRegressionConfigs.valid_loader)
-270def sequential_valid_loader(c: NLPAutoRegressionConfigs):
268@option(NLPAutoRegressionConfigs.train_loader)
+269def sequential_train_loader(c: NLPAutoRegressionConfigs):
274 return SequentialDataLoader(text=c.text.valid,
-275 dataset=c.text,
-276 batch_size=c.batch_size,
-277 seq_len=c.seq_len)
273 return SequentialDataLoader(text=c.text.train,
+274 dataset=c.text,
+275 batch_size=c.batch_size,
+276 seq_len=c.seq_len)
DataLoader
- collects the batches along the first dimension. We need to transpose it so that the data is sequence-first.
280def transpose_batch(batch):
279@option(NLPAutoRegressionConfigs.valid_loader)
+280def sequential_valid_loader(c: NLPAutoRegressionConfigs):
288 transposed_data = list(zip(*batch))
284 return SequentialDataLoader(text=c.text.valid,
+285 dataset=c.text,
+286 batch_size=c.batch_size,
+287 seq_len=c.seq_len)
Stack the batch along the second dimension dim=1
-
DataLoader
+ collects the batches along the first dimension. We need to transpose it so that the data is sequence-first.
290 src = torch.stack(transposed_data[0], dim=1)
-291 tgt = torch.stack(transposed_data[1], dim=1)
-292
-293 return src, tgt
290def transpose_batch(batch):
296@option(NLPAutoRegressionConfigs.train_loader)
-297def shuffled_train_loader(c: NLPAutoRegressionConfigs):
298 transposed_data = list(zip(*batch))
Stack the batch along the second dimension dim=1
+
301 dataset = SequentialUnBatchedDataset(text=c.text.train,
-302 dataset=c.text,
-303 seq_len=c.seq_len)
-304 sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
-305
-306 return DataLoader(dataset,
-307 batch_size=c.batch_size,
-308 collate_fn=transpose_batch,
-309 sampler=sampler)
300 src = torch.stack(transposed_data[0], dim=1)
+301 tgt = torch.stack(transposed_data[1], dim=1)
+302
+303 return src, tgt
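A quick check of what `transpose_batch` does with a toy batch: the DataLoader hands the collate function a list of `(src, tgt)` pairs, and the stacked result comes out sequence-first:

```python
import torch

def transpose_batch(batch):
    # Same body as above: stack the samples along dim=1 -> [seq_len, batch_size].
    transposed_data = list(zip(*batch))
    src = torch.stack(transposed_data[0], dim=1)
    tgt = torch.stack(transposed_data[1], dim=1)
    return src, tgt

# Two (src, tgt) samples of length 4, as a DataLoader would pass them in.
batch = [(torch.arange(4), torch.arange(1, 5)),
         (torch.arange(10, 14), torch.arange(11, 15))]
src, tgt = transpose_batch(batch)
assert src.shape == (4, 2)  # sequence-first: [seq_len, batch_size]
assert tgt.shape == (4, 2)
```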
312@option(NLPAutoRegressionConfigs.valid_loader)
-313def shuffled_valid_loader(c: NLPAutoRegressionConfigs):
306@option(NLPAutoRegressionConfigs.train_loader)
+307def shuffled_train_loader(c: NLPAutoRegressionConfigs):
317 dataset = SequentialUnBatchedDataset(text=c.text.valid,
-318 dataset=c.text,
-319 seq_len=c.seq_len)
-320 sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
-321
-322 return DataLoader(dataset,
-323 batch_size=c.batch_size,
-324 collate_fn=transpose_batch,
-325 sampler=sampler)
311 dataset = SequentialUnBatchedDataset(text=c.text.train,
+312 dataset=c.text,
+313 seq_len=c.seq_len)
+314 sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
+315
+316 return DataLoader(dataset,
+317 batch_size=c.batch_size,
+318 collate_fn=transpose_batch,
+319 sampler=sampler)
322@option(NLPAutoRegressionConfigs.valid_loader)
+323def shuffled_valid_loader(c: NLPAutoRegressionConfigs):
327 dataset = SequentialUnBatchedDataset(text=c.text.valid,
+328 dataset=c.text,
+329 seq_len=c.seq_len)
+330 sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
+331
+332 return DataLoader(dataset,
+333 batch_size=c.batch_size,
+334 collate_fn=transpose_batch,
+335 sampler=sampler)
Whether to log model parameters and gradients (once per epoch). These are summarized stats per layer, but they can still produce a large number of indicators for very deep networks.
77 def init(self):
80 is_log_model_params_grads: bool = False
Set tracker configurations
+Whether to log model activations (once per epoch). These are summarized stats per layer, but they can still produce a large number of indicators for very deep networks.
82 tracker.set_scalar("accuracy.*", True)
-83 tracker.set_scalar("loss.*", True)
85 is_log_model_activations: bool = False
85 hook_model_outputs(self.mode, self.model, 'model')
87 def init(self):
Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.
+Set tracker configurations
90 self.state_modules = [self.accuracy]
92 tracker.set_scalar("accuracy.*", True)
+93 tracker.set_scalar("loss.*", True)
92 def step(self, batch: any, batch_idx: BatchIndex):
95 hook_model_outputs(self.mode, self.model, 'model')
Move data to the device
+Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.
98 data, target = batch[0].to(self.device), batch[1].to(self.device)
100 self.state_modules = [self.accuracy]
Update global step (number of tokens processed) when in training mode
+101 if self.mode.is_train:
-102 tracker.add_global_step(data.shape[1])
102 def step(self, batch: any, batch_idx: BatchIndex):
105 with self.mode.update(is_log_activations=batch_idx.is_last):
108 data, target = batch[0].to(self.device), batch[1].to(self.device)
Get model outputs. It's returning a tuple for states when using RNNs. This is not implemented yet.
+Update global step (number of tokens processed) when in training mode
109 output, *_ = self.model(data)
111 if self.mode.is_train:
+112 tracker.add_global_step(data.shape[1])
112 loss = self.loss_func(output, target)
-113 tracker.add("loss.", loss)
115 with self.mode.update(is_log_activations=batch_idx.is_last and self.is_log_model_activations):
Calculate and log accuracy
+Get model outputs. It's returning a tuple for states when using RNNs. This is not implemented yet.
116 self.accuracy(output, target)
-117 self.accuracy.track()
119 output, *_ = self.model(data)
120 if self.mode.is_train:
122 loss = self.loss_func(output, target)
+123 tracker.add("loss.", loss)
122 loss.backward()
126 self.accuracy(output, target)
+127 self.accuracy.track()
124 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
130 if self.mode.is_train:
126 self.optimizer.step()
132 loss.backward()
Log the model parameters and gradients on the last batch of every epoch
+Clip gradients
128 if batch_idx.is_last:
-129 tracker.add('model', self.model)
134 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
131 self.optimizer.zero_grad()
136 self.optimizer.step()
Save the tracked metrics
+Log the model parameters and gradients on the last batch of every epoch
134 tracker.save()
138 if batch_idx.is_last and self.is_log_model_params_grads:
+139 tracker.add('model', self.model)
137@option(NLPClassificationConfigs.optimizer)
-138def _optimizer(c: NLPClassificationConfigs):
141 self.optimizer.zero_grad()
Save the tracked metrics
+143 optimizer = OptimizerConfigs()
-144 optimizer.parameters = c.model.parameters()
-145 optimizer.optimizer = 'Adam'
-146 optimizer.d_model = c.d_model
-147
-148 return optimizer
144 tracker.save()
We use a character-level tokenizer in this experiment. You can switch by setting,
-'tokenizer': 'basic_english',
-in the configurations dictionary when starting the experiment.
+151@option(NLPClassificationConfigs.tokenizer)
-152def basic_english():
147@option(NLPClassificationConfigs.optimizer)
+148def _optimizer(c: NLPClassificationConfigs):
166 from torchtext.data import get_tokenizer
-167 return get_tokenizer('basic_english')
153 optimizer = OptimizerConfigs()
+154 optimizer.parameters = c.model.parameters()
+155 optimizer.optimizer = 'Adam'
+156 optimizer.d_model = c.d_model
+157
+158 return optimizer
We use a character-level tokenizer in this experiment. You can switch by setting,
+'tokenizer': 'basic_english',
+in the configurations dictionary when starting the experiment.
170def character_tokenizer(x: str):
161@option(NLPClassificationConfigs.tokenizer)
+162def basic_english():
174 return list(x)
176 from torchtext.data import get_tokenizer
+177 return get_tokenizer('basic_english')
177@option(NLPClassificationConfigs.tokenizer)
-178def character():
180def character_tokenizer(x: str):
182 return character_tokenizer
184 return list(x)
185@option(NLPClassificationConfigs.n_tokens)
-186def _n_tokens(c: NLPClassificationConfigs):
187@option(NLPClassificationConfigs.tokenizer)
+188def character():
190 return len(c.vocab) + 2
192 return character_tokenizer
Get number of tokens
+195@option(NLPClassificationConfigs.n_tokens)
+196def _n_tokens(c: NLPClassificationConfigs):
200 return len(c.vocab) + 2
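The `+ 2` accounts for the two extra token ids the collate function below relies on: `len(vocab)` is used as the padding token and `len(vocab) + 1` as the `[CLS]`-style classifier token. In other words:

```python
# With a hypothetical vocabulary of 1000 tokens:
vocab_size = 1000
padding_token = vocab_size          # id len(vocab)
classifier_token = vocab_size + 1   # id len(vocab) + 1
n_tokens = vocab_size + 2           # embedding table must cover both extras
```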
193class CollateFunc:
203class CollateFunc:
198 def __init__(self, tokenizer, vocab: Vocab, seq_len: int, padding_token: int, classifier_token: int):
206 self.classifier_token = classifier_token
-207 self.padding_token = padding_token
-208 self.seq_len = seq_len
-209 self.vocab = vocab
-210 self.tokenizer = tokenizer
batch
- is the batch of data collected by the DataLoader
-212 def __call__(self, batch):
208 def __init__(self, tokenizer, vocab: Vocab, seq_len: int, padding_token: int, classifier_token: int):
Input data tensor, initialized with padding_token
-
218 data = torch.full((self.seq_len, len(batch)), self.padding_token, dtype=torch.long)
216 self.classifier_token = classifier_token
+217 self.padding_token = padding_token
+218 self.seq_len = seq_len
+219 self.vocab = vocab
+220 self.tokenizer = tokenizer
220 labels = torch.zeros(len(batch), dtype=torch.long)
222 def __call__(self, batch):
Loop through the samples
+Input data tensor, initialized with padding_token
+
223 for (i, (_label, _text)) in enumerate(batch):
228 data = torch.full((self.seq_len, len(batch)), self.padding_token, dtype=torch.long)
225 labels[i] = int(_label) - 1
230 labels = torch.zeros(len(batch), dtype=torch.long)
227 _text = [self.vocab[token] for token in self.tokenizer(_text)]
233 for (i, (_label, _text)) in enumerate(batch):
229 _text = _text[:self.seq_len]
235 labels[i] = int(_label) - 1
231 data[:len(_text), i] = data.new_tensor(_text)
237 _text = [self.vocab[token] for token in self.tokenizer(_text)]
234 data[-1, :] = self.classifier_token
239 _text = _text[:self.seq_len]
Transpose and add to data
+241 data[:len(_text), i] = data.new_tensor(_text)
Set the final token in the sequence to [CLS]
+
244 data[-1, :] = self.classifier_token
237 return data, labels
247 return data, labels
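A self-contained toy run of the same collation logic, to make the output shape concrete (a plain dict stands in for torchtext's `Vocab` here, and the labels follow AG News' 1-based convention):

```python
import torch

tokenizer = list                                            # character tokenizer
vocab = {ch: i for i, ch in enumerate('abcdefghijklmnopqrstuvwxyz ')}
seq_len, padding_token, classifier_token = 8, len(vocab), len(vocab) + 1

batch = [('1', 'hello'), ('2', 'hi')]                       # (label, text) pairs

data = torch.full((seq_len, len(batch)), padding_token, dtype=torch.long)
labels = torch.zeros(len(batch), dtype=torch.long)
for i, (_label, _text) in enumerate(batch):
    labels[i] = int(_label) - 1                             # labels become 0-based
    ids = [vocab[token] for token in tokenizer(_text)][:seq_len]
    data[:len(ids), i] = data.new_tensor(ids)
data[-1, :] = classifier_token                              # [CLS] at the end

print(data.shape)  # torch.Size([8, 2]) -- sequence-first, padded
print(labels)      # tensor([0, 1])
```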
This loads the AG News dataset and sets the values for n_classes
@@ -835,36 +859,11 @@
240@option([NLPClassificationConfigs.n_classes,
-241 NLPClassificationConfigs.vocab,
-242 NLPClassificationConfigs.train_loader,
-243 NLPClassificationConfigs.valid_loader])
-244def ag_news(c: NLPClassificationConfigs):
Get training and validation datasets
-253 train, valid = torchtext.datasets.AG_NEWS(root=str(lab.get_data_path() / 'ag_news'), split=('train', 'test'))
Load data to memory
-256 with monit.section('Load data'):
-257 from labml_nn.utils import MapStyleDataset
250@option([NLPClassificationConfigs.n_classes,
+251 NLPClassificationConfigs.vocab,
+252 NLPClassificationConfigs.train_loader,
+253 NLPClassificationConfigs.valid_loader])
+254def ag_news(c: NLPClassificationConfigs):
260 train, valid = MapStyleDataset(train), MapStyleDataset(valid)
263 train, valid = torchtext.datasets.AG_NEWS(root=str(lab.get_data_path() / 'ag_news'), split=('train', 'test'))
263 tokenizer = c.tokenizer
266 with monit.section('Load data'):
+267 from labml_nn.utils import MapStyleDataset
266 counter = Counter()
270 train, valid = MapStyleDataset(train), MapStyleDataset(valid)
268 for (label, line) in train:
-269 counter.update(tokenizer(line))
273 tokenizer = c.tokenizer
271 for (label, line) in valid:
-272 counter.update(tokenizer(line))
276 counter = Counter()
274 vocab = torchtext.vocab.vocab(counter, min_freq=1)
278 for (label, line) in train:
+279 counter.update(tokenizer(line))
277 train_loader = DataLoader(train, batch_size=c.batch_size, shuffle=True,
-278 collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
281 for (label, line) in valid:
+282 counter.update(tokenizer(line))
280 valid_loader = DataLoader(valid, batch_size=c.batch_size, shuffle=True,
-281 collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
284 vocab = torchtext.vocab.vocab(counter, min_freq=1)
Create training data loader
+287 train_loader = DataLoader(train, batch_size=c.batch_size, shuffle=True,
+288 collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
Create validation data loader
+290 valid_loader = DataLoader(valid, batch_size=c.batch_size, shuffle=True,
+291 collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
Return n_classes, vocab, train_loader, and valid_loader
@@ -980,7 +1004,7 @@
284 return 4, vocab, train_loader, valid_loader
294 return 4, vocab, train_loader, valid_loader
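The vocabulary construction above is just token counting followed by torchtext's `vocab` factory; a tiny stand-alone version (with a made-up two-line corpus and whitespace splitting in place of `c.tokenizer`):

```python
from collections import Counter
from torchtext.vocab import vocab as build_vocab

counter = Counter()
for line in ['the cat sat', 'the dog ran']:   # stand-in corpus
    counter.update(line.split())              # the real code uses c.tokenizer

vocab = build_vocab(counter, min_freq=1)
print(len(vocab))     # 5 distinct tokens
print(vocab['cat'])   # integer id assigned to 'cat'
```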
15import copy
-16
-17import torch
-18import torch.nn as nn
-19
-20from labml import experiment
-21from labml.configs import option
-22from labml_helpers.module import Module
-23from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
-24from labml_nn.normalization.deep_norm import DeepNormTransformerLayer
-25from labml_nn.transformers import MultiHeadAttention
-26from labml_nn.transformers.feed_forward import FeedForward
14import copy
+15
+16import torch
+17import torch.nn as nn
+18
+19from labml import experiment
+20from labml.configs import option
+21from labml_helpers.module import Module
+22from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
+23from labml_nn.normalization.deep_norm import DeepNormTransformerLayer
+24from labml_nn.transformers import MultiHeadAttention
+25from labml_nn.transformers.feed_forward import FeedForward
29class AutoregressiveTransformer(Module):
28class AutoregressiveTransformer(Module):
36 def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: DeepNormTransformerLayer):
35 def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: DeepNormTransformerLayer):
43 super().__init__()
42 super().__init__()
45 self.transformer = nn.Sequential(*[copy.deepcopy(layer) for _ in range(n_layers)])
44 self.transformer = nn.Sequential(*[copy.deepcopy(layer) for _ in range(n_layers)])
48 self.emb = nn.Embedding(n_tokens, d_model)
47 self.emb = nn.Embedding(n_tokens, d_model)
50 self.readout = nn.Linear(d_model, n_tokens)
49 self.readout = nn.Linear(d_model, n_tokens)
52 def forward(self, x: torch.Tensor):
51 def forward(self, x: torch.Tensor):
57 x = self.emb(x)
56 x = self.emb(x)
59 x = self.transformer(x)
58 x = self.transformer(x)
61 x = self.readout(x)
60 x = self.readout(x)
64 return x, None
63 return x, None
67class Configs(NLPAutoRegressionConfigs):
66class Configs(NLPAutoRegressionConfigs):
76 model: AutoregressiveTransformer
75 model: AutoregressiveTransformer
79 n_layers: int = 64
78 n_layers: int = 64
82 deep_norm_alpha: float
-83 deep_norm_beta: float
81 deep_norm_alpha: float
+82 deep_norm_beta: float
86 n_heads: int = 4
85 n_heads: int = 4
88 d_model: int = 64
87 d_model: int = 64
90 d_k: int = 16
89 d_k: int = 16
93@option(Configs.deep_norm_alpha)
-94def _deep_norm_alpha(c: Configs):
92@option(Configs.deep_norm_alpha)
+93def _deep_norm_alpha(c: Configs):
100 return (2. * c.n_layers) ** (1. / 4.)
99 return (2. * c.n_layers) ** (1. / 4.)
103@option(Configs.deep_norm_beta)
-104def _deep_norm_beta(c: Configs):
102@option(Configs.deep_norm_beta)
+103def _deep_norm_beta(c: Configs):
110 return (8. * c.n_layers) ** -(1. / 4.)
109 return (8. * c.n_layers) ** -(1. / 4.)
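With the default n_layers = 64 these evaluate to roughly alpha = (2 * 64) ** (1/4) ≈ 3.36 and beta = (8 * 64) ** -(1/4) ≈ 0.21:

```python
# Quick check of the DeepNorm constants for the default n_layers = 64.
n_layers = 64
alpha = (2. * n_layers) ** (1. / 4.)     # ~= 3.364
beta = (8. * n_layers) ** -(1. / 4.)     # ~= 0.210
print(alpha, beta)
```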
113@option(Configs.model)
-114def _model(c: Configs):
112@option(Configs.model)
+113def _model(c: Configs):
118 m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
-119 DeepNormTransformerLayer(d_model=c.d_model,
-120 deep_norm_alpha=c.deep_norm_alpha,
-121 deep_norm_beta=c.deep_norm_beta,
-122 feed_forward=FeedForward(d_model=c.d_model,
-123 d_ff=c.d_model * 4),
-124 self_attn=MultiHeadAttention(c.n_heads, c.d_model,
-125 dropout_prob=0.0)))
-126
-127 return m.to(c.device)
117 m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
+118 DeepNormTransformerLayer(d_model=c.d_model,
+119 deep_norm_alpha=c.deep_norm_alpha,
+120 deep_norm_beta=c.deep_norm_beta,
+121 feed_forward=FeedForward(d_model=c.d_model,
+122 d_ff=c.d_model * 4),
+123 self_attn=MultiHeadAttention(c.n_heads, c.d_model,
+124 dropout_prob=0.0)))
+125
+126 return m.to(c.device)
130def main():
129def main():
135 experiment.create(name="deep_norm", writers={'screen', 'web_api'})
134 experiment.create(name="deep_norm", writers={'screen', 'web_api', 'comet'})
137 conf = Configs()
136 conf = Configs()
139 experiment.configs(conf, {
138 experiment.configs(conf, {
141 'tokenizer': 'character',
140 'tokenizer': 'character',
143 'prompt_separator': '',
142 'prompt_separator': '',
145 'prompt': 'It is ',
144 'prompt': 'It is ',
147 'text': 'tiny_shakespeare',
146 'text': 'tiny_shakespeare',
150 'seq_len': 256,
149 'seq_len': 256,
152 'epochs': 32,
151 'epochs': 32,
154 'batch_size': 16,
153 'batch_size': 16,
156 'inner_iterations': 10,
155 'inner_iterations': 10,
159 'optimizer.optimizer': 'Adam',
-160 'optimizer.learning_rate': 3e-4,
-161 })
158 'optimizer.optimizer': 'Adam',
+159 'optimizer.learning_rate': 3e-4,
+160 })
164 experiment.add_pytorch_models({'model': conf.model})
163 experiment.add_pytorch_models({'model': conf.model})
167 with experiment.start():
166 with experiment.start():
169 conf.run()
168 conf.run()
173if __name__ == '__main__':
-174 main()
172if __name__ == '__main__':
+173 main()