diff --git a/docs/experiments/nlp_autoregression.html b/docs/experiments/nlp_autoregression.html
index 9108f515..e35de1a6 100644
--- a/docs/experiments/nlp_autoregression.html
+++ b/docs/experiments/nlp_autoregression.html
@@ -357,15 +357,15 @@
-
+
-

Initialization

+

Whether to log model parameters and gradients (once per epoch). These are summarized stats per layer, but they can still add up to a lot of indicators for very deep networks.

-
94    def init(self):
+
97    is_log_model_params_grads: bool = False
@@ -373,24 +373,23 @@ -

Set tracker configurations

+

Whether to log model activations (once per epoch). These are summarized stats per layer, but they can still add up to a lot of indicators for very deep networks.

-
99        tracker.set_scalar("accuracy.*", True)
-100        tracker.set_scalar("loss.*", True)
+
102    is_log_model_activations: bool = False
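Both flags default to off, so existing experiments keep their current behaviour. A minimal sketch of turning them on, assuming the usual labml setup shown in the DeepNorm experiment later in this diff (the experiment name here is hypothetical):

from labml import experiment
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs

conf = NLPAutoRegressionConfigs()
experiment.create(name="nlp_autoregression_demo")  # hypothetical experiment name
experiment.configs(conf, {
    'tokenizer': 'character',
    'text': 'tiny_shakespeare',
    # The new flags: per-layer stats are collected on the last batch of each
    # epoch, and only when these are explicitly enabled.
    'is_log_model_params_grads': True,
    'is_log_model_activations': True,
})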
-
+
-

Add a hook to log module outputs

+

Initialization

-
102        hook_model_outputs(self.mode, self.model, 'model')
+
104    def init(self):
@@ -398,23 +397,24 @@ -

Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.

+

Set tracker configurations

-
107        self.state_modules = [self.accuracy]
+
109        tracker.set_scalar("accuracy.*", True)
+110        tracker.set_scalar("loss.*", True)
-
+
-

Override to calculate and log other metrics

+

Add a hook to log module outputs

-
109    def other_metrics(self, output: torch.Tensor, target: torch.Tensor):
+
112        hook_model_outputs(self.mode, self.model, 'model')
@@ -422,10 +422,11 @@ - +

Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.

+
-
111        pass
+
117        self.state_modules = [self.accuracy]
@@ -433,11 +434,11 @@ -

Training or validation step

+

Override to calculate and log other metrics

-
113    def step(self, batch: any, batch_idx: BatchIndex):
+
119    def other_metrics(self, output: torch.Tensor, target: torch.Tensor):
@@ -445,23 +446,22 @@ -

Set training/eval mode

- +
-
119        self.model.train(self.mode.is_train)
+
121        pass
-
+
-

Move data to the device

+

Training or validation step

-
122        data, target = batch[0].to(self.device), batch[1].to(self.device)
+
123    def step(self, batch: any, batch_idx: BatchIndex):
@@ -469,12 +469,11 @@ -

Update global step (number of tokens processed) when in training mode

+

Set training/eval mode

-
125        if self.mode.is_train:
-126            tracker.add_global_step(data.shape[0] * data.shape[1])
+
129        self.model.train(self.mode.is_train)
@@ -482,11 +481,11 @@ -

Whether to capture model outputs

+

Move data to the device

-
129        with self.mode.update(is_log_activations=batch_idx.is_last):
+
132        data, target = batch[0].to(self.device), batch[1].to(self.device)
@@ -494,11 +493,12 @@ -

Get model outputs. It returns a tuple with the states when using RNNs. This is not implemented yet. 😜

+

Update global step (number of tokens processed) when in training mode

-
133            output, *_ = self.model(data)
+
135        if self.mode.is_train:
+136            tracker.add_global_step(data.shape[0] * data.shape[1])
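Batches here are sequence-first, so the product of the two dimensions is the number of tokens in the batch; the global step therefore counts tokens processed rather than batches. A tiny self-contained check, using the seq_len and batch_size values from the DeepNorm experiment below:

import torch

seq_len, batch_size = 256, 16
data = torch.zeros(seq_len, batch_size, dtype=torch.long)

# tracker.add_global_step advances by this many tokens per training batch
tokens_in_batch = data.shape[0] * data.shape[1]
assert tokens_in_batch == seq_len * batch_size == 4096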
@@ -506,12 +506,11 @@ -

Calculate and log loss

+

Whether to capture model outputs

-
136        loss = self.loss_func(output, target)
-137        tracker.add("loss.", loss)
+
139        with self.mode.update(is_log_activations=batch_idx.is_last and self.is_log_model_activations):
@@ -519,14 +518,11 @@ -

Calculate and log accuracy

+

Get model outputs. It returns a tuple with the states when using RNNs. This is not implemented yet. 😜

-
140        self.accuracy(output, target)
-141        self.accuracy.track()
-142
-143        self.other_metrics(output, target)
+
143            output, *_ = self.model(data)
@@ -534,11 +530,12 @@ -

Train the model

+

Calculate and log loss

-
146        if self.mode.is_train:
+
146        loss = self.loss_func(output, target)
+147        tracker.add("loss.", loss)
@@ -546,11 +543,14 @@ -

Calculate gradients

+

Calculate and log accuracy

-
148            loss.backward()
+
150        self.accuracy(output, target)
+151        self.accuracy.track()
+152
+153        self.other_metrics(output, target)
@@ -558,11 +558,11 @@ -

Clip gradients

+

Train the model

-
150            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+
156        if self.mode.is_train:
@@ -570,11 +570,11 @@ -

Take optimizer step

+

Calculate gradients

-
152            self.optimizer.step()
+
158            loss.backward()
@@ -582,12 +582,11 @@ -

Log the model parameters and gradients on last batch of every epoch

+

Clip gradients

-
154            if batch_idx.is_last:
-155                tracker.add('model', self.model)
+
160            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
@@ -595,11 +594,11 @@ -

Clear the gradients

+

Take optimizer step

-
157            self.optimizer.zero_grad()
+
162            self.optimizer.step()
@@ -607,23 +606,24 @@ -

Save the tracked metrics

+

Log the model parameters and gradients on last batch of every epoch

-
160        tracker.save()
+
164            if batch_idx.is_last and self.is_log_model_params_grads:
+165                tracker.add('model', self.model)
-
+
-

Sampling function to generate samples periodically while training

+

Clear the gradients

-
162    def sample(self):
+
167            self.optimizer.zero_grad()
@@ -631,23 +631,23 @@ -

Starting prompt

+

Save the tracked metrics

-
168        prompt = self.prompt
+
170        tracker.save()
-
+
-

Collect output for printing

+

Sampling function to generate samples periodically while training

-
170        log = [(prompt, Text.subtle)]
+
172    def sample(self):
@@ -655,11 +655,11 @@ -

Sample 25 tokens

+

Starting prompt

-
172        for i in monit.iterate('Sample', 25):
+
178        prompt = self.prompt
@@ -667,12 +667,11 @@ -

Tokenize the prompt

+

Collect output for printing

-
174            data = self.text.text_to_i(prompt).unsqueeze(-1)
-175            data = data.to(self.device)
+
180        log = [(prompt, Text.subtle)]
@@ -680,11 +679,11 @@ -

Get the model output

+

Sample 25 tokens

-
177            output, *_ = self.model(data)
+
182        for i in monit.iterate('Sample', 25):
@@ -692,11 +691,12 @@ -

Get the model prediction (greedy)

+

Tokenize the prompt

-
179            output = output.argmax(dim=-1).squeeze()
+
184            data = self.text.text_to_i(prompt).unsqueeze(-1)
+185            data = data.to(self.device)
@@ -704,11 +704,11 @@ -

Add the prediction to prompt

+

Get the model output

-
181            prompt += self.prompt_separator + self.text.itos[output[-1]]
+
187            output, *_ = self.model(data)
@@ -716,11 +716,11 @@ -

Add the prediction for logging

+

Get the model prediction (greedy)

-
183            log += [(self.prompt_separator + self.text.itos[output[-1]], Text.value)]
+
189            output = output.argmax(dim=-1).squeeze()
@@ -728,24 +728,23 @@ -

Print the sampled output

+

Add the prediction to prompt

-
186        logger.log(log)
+
191            prompt += self.prompt_separator + self.text.itos[output[-1]]
-
+
-

Default optimizer configurations

+

Add the prediction for logging

-
189@option(NLPAutoRegressionConfigs.optimizer)
-190def _optimizer(c: NLPAutoRegressionConfigs):
+
193            log += [(self.prompt_separator + self.text.itos[output[-1]], Text.value)]
@@ -753,15 +752,11 @@ - +

Print the sampled output

+
-
195    optimizer = OptimizerConfigs()
-196    optimizer.parameters = c.model.parameters()
-197    optimizer.optimizer = 'Adam'
-198    optimizer.d_model = c.d_model
-199
-200    return optimizer
+
196        logger.log(log)
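A compact standalone version of this greedy sampling loop, with a stub model and a toy character vocabulary so it runs on its own; the real code uses the trained model together with the dataset's text_to_i and itos mappings:

import torch

vocab = list("abcdefghijklmnopqrstuvwxyz ")
stoi = {c: i for i, c in enumerate(vocab)}

def stub_model(x: torch.Tensor) -> torch.Tensor:
    # Fake logits of shape [seq_len, batch_size, n_tokens]
    return torch.randn(x.shape[0], x.shape[1], len(vocab))

prompt = "it is "
for _ in range(25):
    # Tokenize the prompt and add a batch dimension (sequence-first)
    data = torch.tensor([stoi[c] for c in prompt]).unsqueeze(-1)
    # Greedy pick: take the highest-probability token at the last position
    output = stub_model(data)
    next_token = output.argmax(dim=-1)[-1, 0].item()
    prompt += vocab[next_token]

print(prompt)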
@@ -769,12 +764,12 @@ -

Get number of tokens

+

Default optimizer configurations

-
203@option(NLPAutoRegressionConfigs.n_tokens)
-204def _n_tokens(c: NLPAutoRegressionConfigs):
+
199@option(NLPAutoRegressionConfigs.optimizer)
+200def _optimizer(c: NLPAutoRegressionConfigs):
@@ -785,7 +780,12 @@
-
208    return c.text.n_tokens
+
205    optimizer = OptimizerConfigs()
+206    optimizer.parameters = c.model.parameters()
+207    optimizer.optimizer = 'Adam'
+208    optimizer.d_model = c.d_model
+209
+210    return optimizer
@@ -793,15 +793,12 @@ -

Basic English tokenizer

-

We use a character-level tokenizer in this experiment. You can switch by setting,

-
'tokenizer': 'basic_english',
-

in the configurations dictionary when starting the experiment.

+

Get number of tokens

-
211@option(NLPAutoRegressionConfigs.tokenizer)
-212def basic_english():
+
213@option(NLPAutoRegressionConfigs.n_tokens)
+214def _n_tokens(c: NLPAutoRegressionConfigs):
@@ -812,8 +809,7 @@
-
226    from torchtext.data import get_tokenizer
-227    return get_tokenizer('basic_english')
+
218    return c.text.n_tokens
@@ -821,11 +817,15 @@ -

Character level tokenizer

+

Basic English tokenizer

+

We use a character-level tokenizer in this experiment. You can switch by setting,

+
'tokenizer': 'basic_english',
+

in the configurations dictionary when starting the experiment.

-
230def character_tokenizer(x: str):
+
221@option(NLPAutoRegressionConfigs.tokenizer)
+222def basic_english():
@@ -836,7 +836,8 @@
-
234    return list(x)
+
236    from torchtext.data import get_tokenizer
+237    return get_tokenizer('basic_english')
@@ -844,12 +845,11 @@ -

Character level tokenizer configuration

+

Character level tokenizer

-
237@option(NLPAutoRegressionConfigs.tokenizer)
-238def character():
+
240def character_tokenizer(x: str):
@@ -860,7 +860,7 @@
-
242    return character_tokenizer
+
244    return list(x)
@@ -868,13 +868,12 @@ -

Tiny Shakespeare dataset

-

It will be downloaded from the URL if it is not present

+

Character level tokenizer configuration

-
245@option(NLPAutoRegressionConfigs.text)
-246def tiny_shakespeare(c: NLPAutoRegressionConfigs):
+
247@option(NLPAutoRegressionConfigs.tokenizer)
+248def character():
@@ -885,10 +884,7 @@
-
252    return TextFileDataset(
-253        lab.get_data_path() / 'tiny_shakespeare.txt',
-254        c.tokenizer,
-255        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
+
252    return character_tokenizer
@@ -896,12 +892,13 @@ -

Sequential training data loader

+

Tiny Shakespeare dataset

+

It will be downloaded from the URL if it is not present

-
258@option(NLPAutoRegressionConfigs.train_loader)
-259def sequential_train_loader(c: NLPAutoRegressionConfigs):
+
255@option(NLPAutoRegressionConfigs.text)
+256def tiny_shakespeare(c: NLPAutoRegressionConfigs):
@@ -912,10 +909,10 @@
-
263    return SequentialDataLoader(text=c.text.train,
-264                                dataset=c.text,
-265                                batch_size=c.batch_size,
-266                                seq_len=c.seq_len)
+
262    return TextFileDataset(
+263        lab.get_data_path() / 'tiny_shakespeare.txt',
+264        c.tokenizer,
+265        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')
@@ -923,12 +920,12 @@ -

Sequential validation data loader

+

Sequential training data loader

-
269@option(NLPAutoRegressionConfigs.valid_loader)
-270def sequential_valid_loader(c: NLPAutoRegressionConfigs):
+
268@option(NLPAutoRegressionConfigs.train_loader)
+269def sequential_train_loader(c: NLPAutoRegressionConfigs):
@@ -939,10 +936,10 @@
-
274    return SequentialDataLoader(text=c.text.valid,
-275                                dataset=c.text,
-276                                batch_size=c.batch_size,
-277                                seq_len=c.seq_len)
+
273    return SequentialDataLoader(text=c.text.train,
+274                                dataset=c.text,
+275                                batch_size=c.batch_size,
+276                                seq_len=c.seq_len)
@@ -950,13 +947,12 @@ -

Transpose batch

-

DataLoader collects the batches on the first dimension. We need to transpose it to be sequence-first.

+

Sequential validation data loader

-
280def transpose_batch(batch):
+
279@option(NLPAutoRegressionConfigs.valid_loader)
+280def sequential_valid_loader(c: NLPAutoRegressionConfigs):
@@ -967,36 +963,35 @@
-
288    transposed_data = list(zip(*batch))
+
284    return SequentialDataLoader(text=c.text.valid,
+285                                dataset=c.text,
+286                                batch_size=c.batch_size,
+287                                seq_len=c.seq_len)
-
+
-

Stack the batch along the second dimension dim=1

+

Transpose batch

+

DataLoader collects the batches on the first dimension. We need to transpose it to be sequence-first.

-
290    src = torch.stack(transposed_data[0], dim=1)
-291    tgt = torch.stack(transposed_data[1], dim=1)
-292
-293    return src, tgt
+
290def transpose_batch(batch):
-
+
-

Shuffled training data loader

- +
-
296@option(NLPAutoRegressionConfigs.train_loader)
-297def shuffled_train_loader(c: NLPAutoRegressionConfigs):
+
298    transposed_data = list(zip(*batch))
@@ -1004,18 +999,15 @@ - +

Stack the batch along the second dimension dim=1

+
-
301    dataset = SequentialUnBatchedDataset(text=c.text.train,
-302                                         dataset=c.text,
-303                                         seq_len=c.seq_len)
-304    sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
-305
-306    return DataLoader(dataset,
-307                      batch_size=c.batch_size,
-308                      collate_fn=transpose_batch,
-309                      sampler=sampler)
+
300    src = torch.stack(transposed_data[0], dim=1)
+301    tgt = torch.stack(transposed_data[1], dim=1)
+302
+303    return src, tgt
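A small shape check of what transpose_batch produces: the DataLoader collates samples along the first (batch) dimension, and stacking along dim=1 instead gives sequence-first src and tgt tensors, which is the layout the rest of this code expects:

import torch

seq_len, batch_size = 8, 4
batch = [(torch.zeros(seq_len, dtype=torch.long),
          torch.zeros(seq_len, dtype=torch.long)) for _ in range(batch_size)]

transposed = list(zip(*batch))
src = torch.stack(transposed[0], dim=1)
tgt = torch.stack(transposed[1], dim=1)
assert src.shape == (seq_len, batch_size)
assert tgt.shape == (seq_len, batch_size)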
@@ -1023,12 +1015,12 @@ -

Shuffled validation data loader

+

Shuffled training data loader

-
312@option(NLPAutoRegressionConfigs.valid_loader)
-313def shuffled_valid_loader(c: NLPAutoRegressionConfigs):
+
306@option(NLPAutoRegressionConfigs.train_loader)
+307def shuffled_train_loader(c: NLPAutoRegressionConfigs):
@@ -1039,15 +1031,47 @@
-
317    dataset = SequentialUnBatchedDataset(text=c.text.valid,
-318                                         dataset=c.text,
-319                                         seq_len=c.seq_len)
-320    sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
-321
-322    return DataLoader(dataset,
-323                      batch_size=c.batch_size,
-324                      collate_fn=transpose_batch,
-325                      sampler=sampler)
+
311    dataset = SequentialUnBatchedDataset(text=c.text.train,
+312                                         dataset=c.text,
+313                                         seq_len=c.seq_len)
+314    sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
+315
+316    return DataLoader(dataset,
+317                      batch_size=c.batch_size,
+318                      collate_fn=transpose_batch,
+319                      sampler=sampler)
+
+ +
+
+ +

Shuffled validation data loader

+ +
+
+
322@option(NLPAutoRegressionConfigs.valid_loader)
+323def shuffled_valid_loader(c: NLPAutoRegressionConfigs):
+
+
+
+
+ + +
+
+
327    dataset = SequentialUnBatchedDataset(text=c.text.valid,
+328                                         dataset=c.text,
+329                                         seq_len=c.seq_len)
+330    sampler = RandomSampler(dataset, replacement=c.dataloader_shuffle_with_replacement)
+331
+332    return DataLoader(dataset,
+333                      batch_size=c.batch_size,
+334                      collate_fn=transpose_batch,
+335                      sampler=sampler)
-
+
-

Initialization

+

Whether to log model parameters and gradients (once per epoch). These are summarized stats per layer, but they can still add up to a lot of indicators for very deep networks.

-
77    def init(self):
+
80    is_log_model_params_grads: bool = False
@@ -314,24 +314,23 @@ -

Set tracker configurations

+

Whether to log model activations (once per epoch). These are summarized stats per layer, but they can still add up to a lot of indicators for very deep networks.

-
82        tracker.set_scalar("accuracy.*", True)
-83        tracker.set_scalar("loss.*", True)
+
85    is_log_model_activations: bool = False
-
+
-

Add a hook to log module outputs

+

Initialization

-
85        hook_model_outputs(self.mode, self.model, 'model')
+
87    def init(self):
@@ -339,23 +338,24 @@ -

Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.

+

Set tracker configurations

-
90        self.state_modules = [self.accuracy]
+
92        tracker.set_scalar("accuracy.*", True)
+93        tracker.set_scalar("loss.*", True)
-
+
-

Training or validation step

+

Add a hook to log module outputs

-
92    def step(self, batch: any, batch_idx: BatchIndex):
+
95        hook_model_outputs(self.mode, self.model, 'model')
@@ -363,24 +363,23 @@ -

Move data to the device

+

Add accuracy as a state module. The name is probably confusing, since it's meant to store states between training and validation for RNNs. This will keep the accuracy metric stats separate for training and validation.

-
98        data, target = batch[0].to(self.device), batch[1].to(self.device)
+
100        self.state_modules = [self.accuracy]
-
+
-

Update global step (number of tokens processed) when in training mode

+

Training or validation step

-
101        if self.mode.is_train:
-102            tracker.add_global_step(data.shape[1])
+
102    def step(self, batch: any, batch_idx: BatchIndex):
@@ -388,11 +387,11 @@ -

Whether to capture model outputs

+

Move data to the device

-
105        with self.mode.update(is_log_activations=batch_idx.is_last):
+
108        data, target = batch[0].to(self.device), batch[1].to(self.device)
@@ -400,11 +399,12 @@ -

Get model outputs. It returns a tuple with the states when using RNNs. This is not implemented yet. 😜

+

Update global step (number of tokens processed) when in training mode

-
109            output, *_ = self.model(data)
+
111        if self.mode.is_train:
+112            tracker.add_global_step(data.shape[1])
@@ -412,12 +412,11 @@ -

Calculate and log loss

+

Whether to capture model outputs

-
112        loss = self.loss_func(output, target)
-113        tracker.add("loss.", loss)
+
115        with self.mode.update(is_log_activations=batch_idx.is_last and self.is_log_model_activations):
@@ -425,12 +424,11 @@ -

Calculate and log accuracy

+

Get model outputs. It returns a tuple with the states when using RNNs. This is not implemented yet. 😜

-
116        self.accuracy(output, target)
-117        self.accuracy.track()
+
119            output, *_ = self.model(data)
@@ -438,11 +436,12 @@ -

Train the model

+

Calculate and log loss

-
120        if self.mode.is_train:
+
122        loss = self.loss_func(output, target)
+123        tracker.add("loss.", loss)
@@ -450,11 +449,12 @@ -

Calculate gradients

+

Calculate and log accuracy

-
122            loss.backward()
+
126        self.accuracy(output, target)
+127        self.accuracy.track()
@@ -462,11 +462,11 @@ -

Clip gradients

+

Train the model

-
124            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+
130        if self.mode.is_train:
@@ -474,11 +474,11 @@ -

Take optimizer step

+

Calculate gradients

-
126            self.optimizer.step()
+
132            loss.backward()
@@ -486,12 +486,11 @@ -

Log the model parameters and gradients on last batch of every epoch

+

Clip gradients

-
128            if batch_idx.is_last:
-129                tracker.add('model', self.model)
+
134            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
@@ -499,11 +498,11 @@ -

Clear the gradients

+

Take optimizer step

-
131            self.optimizer.zero_grad()
+
136            self.optimizer.step()
@@ -511,24 +510,24 @@ -

Save the tracked metrics

+

Log the model parameters and gradients on last batch of every epoch

-
134        tracker.save()
+
138            if batch_idx.is_last and self.is_log_model_params_grads:
+139                tracker.add('model', self.model)
-
+
-

Default optimizer configurations

+

Clear the gradients

-
137@option(NLPClassificationConfigs.optimizer)
-138def _optimizer(c: NLPClassificationConfigs):
+
141            self.optimizer.zero_grad()
@@ -536,15 +535,11 @@ - +

Save the tracked metrics

+
-
143    optimizer = OptimizerConfigs()
-144    optimizer.parameters = c.model.parameters()
-145    optimizer.optimizer = 'Adam'
-146    optimizer.d_model = c.d_model
-147
-148    return optimizer
+
144        tracker.save()
@@ -552,15 +547,12 @@ -

Basic English tokenizer

-

We use a character-level tokenizer in this experiment. You can switch by setting,

-
'tokenizer': 'basic_english',
-

in the configurations dictionary when starting the experiment.

+

Default optimizer configurations

-
151@option(NLPClassificationConfigs.tokenizer)
-152def basic_english():
+
147@option(NLPClassificationConfigs.optimizer)
+148def _optimizer(c: NLPClassificationConfigs):
@@ -571,8 +563,12 @@
-
166    from torchtext.data import get_tokenizer
-167    return get_tokenizer('basic_english')
+
153    optimizer = OptimizerConfigs()
+154    optimizer.parameters = c.model.parameters()
+155    optimizer.optimizer = 'Adam'
+156    optimizer.d_model = c.d_model
+157
+158    return optimizer
@@ -580,11 +576,15 @@ -

Character level tokenizer

+

Basic English tokenizer

+

We use a character-level tokenizer in this experiment. You can switch by setting,

+
'tokenizer': 'basic_english',
+

in the configurations dictionary when starting the experiment.

-
170def character_tokenizer(x: str):
+
161@option(NLPClassificationConfigs.tokenizer)
+162def basic_english():
@@ -595,7 +595,8 @@
-
174    return list(x)
+
176    from torchtext.data import get_tokenizer
+177    return get_tokenizer('basic_english')
@@ -603,12 +604,11 @@ -

Character level tokenizer configuration

+

Character level tokenizer

-
177@option(NLPClassificationConfigs.tokenizer)
-178def character():
+
180def character_tokenizer(x: str):
@@ -619,7 +619,7 @@
-
182    return character_tokenizer
+
184    return list(x)
@@ -627,12 +627,12 @@ -

Get number of tokens

+

Character level tokenizer configuration

-
185@option(NLPClassificationConfigs.n_tokens)
-186def _n_tokens(c: NLPClassificationConfigs):
+
187@option(NLPClassificationConfigs.tokenizer)
+188def character():
@@ -643,7 +643,7 @@
-
190    return len(c.vocab) + 2
+
192    return character_tokenizer
@@ -651,17 +651,41 @@ +

Get number of tokens

+ +
+
+
195@option(NLPClassificationConfigs.n_tokens)
+196def _n_tokens(c: NLPClassificationConfigs):
+
+ +
+
+ + +
+
+
200    return len(c.vocab) + 2
+
+
+
+
+

Function to load data into batches

-
193class CollateFunc:
+
203class CollateFunc:
-
+
  • tokenizer is the tokenizer function
@@ -678,36 +702,7 @@
-
198    def __init__(self, tokenizer, vocab: Vocab, seq_len: int, padding_token: int, classifier_token: int):
-
-
-
-
- - -
-
-
206        self.classifier_token = classifier_token
-207        self.padding_token = padding_token
-208        self.seq_len = seq_len
-209        self.vocab = vocab
-210        self.tokenizer = tokenizer
-
-
-
-
- -
• batch is the batch of data collected by the DataLoader
- -
-
-
212    def __call__(self, batch):
+
208    def __init__(self, tokenizer, vocab: Vocab, seq_len: int, padding_token: int, classifier_token: int):
@@ -715,24 +710,28 @@ -

Input data tensor, initialized with padding_token

- +
-
218        data = torch.full((self.seq_len, len(batch)), self.padding_token, dtype=torch.long)
+
216        self.classifier_token = classifier_token
+217        self.padding_token = padding_token
+218        self.seq_len = seq_len
+219        self.vocab = vocab
+220        self.tokenizer = tokenizer
-
+
-

Empty labels tensor

+
• batch is the batch of data collected by the DataLoader
-
220        labels = torch.zeros(len(batch), dtype=torch.long)
+
222    def __call__(self, batch):
@@ -740,11 +739,12 @@ -

Loop through the samples

+

Input data tensor, initialized with padding_token

-
223        for (i, (_label, _text)) in enumerate(batch):
+
228        data = torch.full((self.seq_len, len(batch)), self.padding_token, dtype=torch.long)
@@ -752,11 +752,11 @@ -

Set the label

+

Empty labels tensor

-
225            labels[i] = int(_label) - 1
+
230        labels = torch.zeros(len(batch), dtype=torch.long)
@@ -764,11 +764,11 @@ -

Tokenize the input text

+

Loop through the samples

-
227            _text = [self.vocab[token] for token in self.tokenizer(_text)]
+
233        for (i, (_label, _text)) in enumerate(batch):
@@ -776,12 +776,11 @@ -

Truncate up to seq_len

+

Set the label

-
229            _text = _text[:self.seq_len]
+
235            labels[i] = int(_label) - 1
@@ -789,11 +788,11 @@ -

Transpose and add to data

+

Tokenize the input text

-
231            data[:len(_text), i] = data.new_tensor(_text)
+
237            _text = [self.vocab[token] for token in self.tokenizer(_text)]
@@ -801,12 +800,12 @@ -

Set the final token in the sequence to [CLS]

Truncate up to seq_len

-
234        data[-1, :] = self.classifier_token
+
239            _text = _text[:self.seq_len]
@@ -814,17 +813,42 @@ +

Transpose and add to data

+ +
+
+
241            data[:len(_text), i] = data.new_tensor(_text)
+
+ +
+
+ +

Set the final token in the sequence to [CLS]

+ +
+
+
244        data[-1, :] = self.classifier_token
+
+
+
+
+

-
237        return data, labels
+
247        return data, labels
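A self-contained toy run of the same collate logic, with a hypothetical four-word vocabulary instead of a torchtext Vocab, to make the resulting shapes concrete:

import torch

vocab = {'i': 0, 'like': 1, 'this': 2, 'movie': 3}
padding_token, classifier_token = len(vocab), len(vocab) + 1
seq_len = 6

batch = [(1, 'i like this movie'), (2, 'this movie')]

# Pad to seq_len, fill with token ids, and reserve the last position for [CLS]
data = torch.full((seq_len, len(batch)), padding_token, dtype=torch.long)
labels = torch.zeros(len(batch), dtype=torch.long)
for i, (label, text) in enumerate(batch):
    labels[i] = int(label) - 1
    tokens = [vocab[t] for t in text.split()][:seq_len]
    data[:len(tokens), i] = torch.tensor(tokens)
data[-1, :] = classifier_token

print(data.shape, labels)   # torch.Size([6, 2]) tensor([0, 1])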
-
+

AG News dataset

This loads the AG News dataset and the set the values for n_classes @@ -835,36 +859,11 @@

-
240@option([NLPClassificationConfigs.n_classes,
-241         NLPClassificationConfigs.vocab,
-242         NLPClassificationConfigs.train_loader,
-243         NLPClassificationConfigs.valid_loader])
-244def ag_news(c: NLPClassificationConfigs):
-
-
-
-
- -

Get training and validation datasets

- -
-
-
253    train, valid = torchtext.datasets.AG_NEWS(root=str(lab.get_data_path() / 'ag_news'), split=('train', 'test'))
-
-
-
-
- -

Load data to memory

- -
-
-
256    with monit.section('Load data'):
-257        from labml_nn.utils import MapStyleDataset
+
250@option([NLPClassificationConfigs.n_classes,
+251         NLPClassificationConfigs.vocab,
+252         NLPClassificationConfigs.train_loader,
+253         NLPClassificationConfigs.valid_loader])
+254def ag_news(c: NLPClassificationConfigs):
@@ -872,11 +871,11 @@ -

Create map-style datasets

+

Get training and validation datasets

-
260        train, valid = MapStyleDataset(train), MapStyleDataset(valid)
+
263    train, valid = torchtext.datasets.AG_NEWS(root=str(lab.get_data_path() / 'ag_news'), split=('train', 'test'))
@@ -884,11 +883,12 @@ -

Get tokenizer

+

Load data to memory

-
263    tokenizer = c.tokenizer
+
266    with monit.section('Load data'):
+267        from labml_nn.utils import MapStyleDataset
@@ -896,11 +896,11 @@ -

Create a counter

+

Create map-style datasets

-
266    counter = Counter()
+
270        train, valid = MapStyleDataset(train), MapStyleDataset(valid)
@@ -908,12 +908,11 @@ -

Collect tokens from training dataset

+

Get tokenizer

-
268    for (label, line) in train:
-269        counter.update(tokenizer(line))
+
273    tokenizer = c.tokenizer
@@ -921,12 +920,11 @@ -

Collect tokens from validation dataset

+

Create a counter

-
271    for (label, line) in valid:
-272        counter.update(tokenizer(line))
+
276    counter = Counter()
@@ -934,11 +932,12 @@ -

Create vocabulary

+

Collect tokens from training dataset

-
274    vocab = torchtext.vocab.vocab(counter, min_freq=1)
+
278    for (label, line) in train:
+279        counter.update(tokenizer(line))
@@ -946,12 +945,12 @@ -

Create training data loader

+

Collect tokens from validation dataset

-
277    train_loader = DataLoader(train, batch_size=c.batch_size, shuffle=True,
-278                              collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
+
281    for (label, line) in valid:
+282        counter.update(tokenizer(line))
@@ -959,12 +958,11 @@ -

Create validation data loader

+

Create vocabulary

-
280    valid_loader = DataLoader(valid, batch_size=c.batch_size, shuffle=True,
-281                              collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
+
284    vocab = torchtext.vocab.vocab(counter, min_freq=1)
@@ -972,6 +970,32 @@ +

Create training data loader

+ +
+
+
287    train_loader = DataLoader(train, batch_size=c.batch_size, shuffle=True,
+288                              collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
+
+ +
+
+ +

Create validation data loader

+ +
+
+
290    valid_loader = DataLoader(valid, batch_size=c.batch_size, shuffle=True,
+291                              collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))
+
+
+
+
+

Return n_classes, vocab, train_loader and valid_loader
@@ -980,7 +1004,7 @@

-
284    return 4, vocab, train_loader, valid_loader
+
294    return 4, vocab, train_loader, valid_loader
-
15import copy
-16
-17import torch
-18import torch.nn as nn
-19
-20from labml import experiment
-21from labml.configs import option
-22from labml_helpers.module import Module
-23from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
-24from labml_nn.normalization.deep_norm import DeepNormTransformerLayer
-25from labml_nn.transformers import MultiHeadAttention
-26from labml_nn.transformers.feed_forward import FeedForward
+
14import copy
+15
+16import torch
+17import torch.nn as nn
+18
+19from labml import experiment
+20from labml.configs import option
+21from labml_helpers.module import Module
+22from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
+23from labml_nn.normalization.deep_norm import DeepNormTransformerLayer
+24from labml_nn.transformers import MultiHeadAttention
+25from labml_nn.transformers.feed_forward import FeedForward
@@ -98,7 +98,7 @@
-
29class AutoregressiveTransformer(Module):
+
28class AutoregressiveTransformer(Module):
@@ -114,7 +114,7 @@
-
36    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: DeepNormTransformerLayer):
+
35    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: DeepNormTransformerLayer):
@@ -122,10 +122,10 @@ - +
-
43        super().__init__()
+
42        super().__init__()
@@ -138,7 +138,7 @@
-
45        self.transformer = nn.Sequential(*[copy.deepcopy(layer) for _ in range(n_layers)])
+
44        self.transformer = nn.Sequential(*[copy.deepcopy(layer) for _ in range(n_layers)])
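copy.deepcopy matters here: each of the n_layers copies must own its parameters, whereas reusing the same module object would tie the weights across every layer. A small check of that distinction:

import copy
import torch.nn as nn

layer = nn.Linear(8, 8)
tied = nn.Sequential(*[layer for _ in range(3)])                  # same object three times
independent = nn.Sequential(*[copy.deepcopy(layer) for _ in range(3)])

assert tied[0].weight is tied[1].weight                    # shared parameters
assert independent[0].weight is not independent[1].weight  # separate parameters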
@@ -150,7 +150,7 @@
-
48        self.emb = nn.Embedding(n_tokens, d_model)
+
47        self.emb = nn.Embedding(n_tokens, d_model)
@@ -162,7 +162,7 @@
-
50        self.readout = nn.Linear(d_model, n_tokens)
+
49        self.readout = nn.Linear(d_model, n_tokens)
@@ -175,7 +175,7 @@
-
52    def forward(self, x: torch.Tensor):
+
51    def forward(self, x: torch.Tensor):
@@ -187,7 +187,7 @@
-
57        x = self.emb(x)
+
56        x = self.emb(x)
@@ -199,7 +199,7 @@
-
59        x = self.transformer(x)
+
58        x = self.transformer(x)
@@ -211,7 +211,7 @@
-
61        x = self.readout(x)
+
60        x = self.readout(x)
@@ -223,7 +223,7 @@
-
64        return x, None
+
63        return x, None
@@ -237,7 +237,7 @@
-
67class Configs(NLPAutoRegressionConfigs):
+
66class Configs(NLPAutoRegressionConfigs):
@@ -249,7 +249,7 @@
-
76    model: AutoregressiveTransformer
+
75    model: AutoregressiveTransformer
@@ -261,7 +261,7 @@
-
79    n_layers: int = 64
+
78    n_layers: int = 64
@@ -273,8 +273,8 @@
-
82    deep_norm_alpha: float
-83    deep_norm_beta: float
+
81    deep_norm_alpha: float
+82    deep_norm_beta: float
@@ -286,7 +286,7 @@
-
86    n_heads: int = 4
+
85    n_heads: int = 4
@@ -298,7 +298,7 @@
-
88    d_model: int = 64
+
87    d_model: int = 64
@@ -310,7 +310,7 @@
-
90    d_k: int = 16
+
89    d_k: int = 16
@@ -323,8 +323,8 @@
-
93@option(Configs.deep_norm_alpha)
-94def _deep_norm_alpha(c: Configs):
+
92@option(Configs.deep_norm_alpha)
+93def _deep_norm_alpha(c: Configs):
@@ -332,10 +332,10 @@ - +
-
100    return (2. * c.n_layers) ** (1. / 4.)
+
99    return (2. * c.n_layers) ** (1. / 4.)
@@ -348,8 +348,8 @@
-
103@option(Configs.deep_norm_beta)
-104def _deep_norm_beta(c: Configs):
+
102@option(Configs.deep_norm_beta)
+103def _deep_norm_beta(c: Configs):
@@ -357,10 +357,10 @@ - +
-
110    return (8. * c.n_layers) ** -(1. / 4.)
+
109    return (8. * c.n_layers) ** -(1. / 4.)
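For this decoder-only setup the two constants depend only on the layer count. A quick numeric check of the formulas above for the n_layers = 64 default used in this experiment (approximate values in the comments):

n_layers = 64

deep_norm_alpha = (2. * n_layers) ** (1. / 4.)     # ≈ 3.364, scales the residual branch
deep_norm_beta = (8. * n_layers) ** -(1. / 4.)     # ≈ 0.210, scales selected initial weights
print(deep_norm_alpha, deep_norm_beta)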
@@ -372,8 +372,8 @@
-
113@option(Configs.model)
-114def _model(c: Configs):
+
112@option(Configs.model)
+113def _model(c: Configs):
@@ -381,19 +381,19 @@ - +
-
118    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
-119                                  DeepNormTransformerLayer(d_model=c.d_model,
-120                                                           deep_norm_alpha=c.deep_norm_alpha,
-121                                                           deep_norm_beta=c.deep_norm_beta,
-122                                                           feed_forward=FeedForward(d_model=c.d_model,
-123                                                                                    d_ff=c.d_model * 4),
-124                                                           self_attn=MultiHeadAttention(c.n_heads, c.d_model,
-125                                                                                        dropout_prob=0.0)))
-126
-127    return m.to(c.device)
+
117    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
+118                                  DeepNormTransformerLayer(d_model=c.d_model,
+119                                                           deep_norm_alpha=c.deep_norm_alpha,
+120                                                           deep_norm_beta=c.deep_norm_beta,
+121                                                           feed_forward=FeedForward(d_model=c.d_model,
+122                                                                                    d_ff=c.d_model * 4),
+123                                                           self_attn=MultiHeadAttention(c.n_heads, c.d_model,
+124                                                                                        dropout_prob=0.0)))
+125
+126    return m.to(c.device)
@@ -405,7 +405,7 @@
-
130def main():
+
129def main():
@@ -417,7 +417,7 @@
-
135    experiment.create(name="deep_norm", writers={'screen', 'web_api'})
+
134    experiment.create(name="deep_norm", writers={'screen', 'web_api', 'comet'})
@@ -429,7 +429,7 @@
-
137    conf = Configs()
+
136    conf = Configs()
@@ -441,7 +441,7 @@
-
139    experiment.configs(conf, {
+
138    experiment.configs(conf, {
@@ -453,7 +453,7 @@
-
141        'tokenizer': 'character',
+
140        'tokenizer': 'character',
@@ -465,7 +465,7 @@
-
143        'prompt_separator': '',
+
142        'prompt_separator': '',
@@ -477,7 +477,7 @@
-
145        'prompt': 'It is ',
+
144        'prompt': 'It is ',
@@ -489,7 +489,7 @@
-
147        'text': 'tiny_shakespeare',
+
146        'text': 'tiny_shakespeare',
@@ -501,7 +501,7 @@
-
150        'seq_len': 256,
+
149        'seq_len': 256,
@@ -513,7 +513,7 @@
-
152        'epochs': 32,
+
151        'epochs': 32,
@@ -525,7 +525,7 @@
-
154        'batch_size': 16,
+
153        'batch_size': 16,
@@ -537,7 +537,7 @@
-
156        'inner_iterations': 10,
+
155        'inner_iterations': 10,
@@ -549,9 +549,9 @@
-
159        'optimizer.optimizer': 'Adam',
-160        'optimizer.learning_rate': 3e-4,
-161    })
+
158        'optimizer.optimizer': 'Adam',
+159        'optimizer.learning_rate': 3e-4,
+160    })
@@ -563,7 +563,7 @@
-
164    experiment.add_pytorch_models({'model': conf.model})
+
163    experiment.add_pytorch_models({'model': conf.model})
@@ -575,7 +575,7 @@
-
167    with experiment.start():
+
166    with experiment.start():
@@ -587,7 +587,7 @@
-
169        conf.run()
+
168        conf.run()
@@ -599,8 +599,8 @@
-
173if __name__ == '__main__':
-174    main()
+
172if __name__ == '__main__':
+173    main()
-
75from typing import Union, List
-76
-77import torch
-78from torch import nn, Size
-79
-80from labml_nn.normalization.layer_norm import LayerNorm
-81from labml_nn.transformers import MultiHeadAttention
-82from labml_nn.transformers.feed_forward import FeedForward
-83from labml_nn.transformers.utils import subsequent_mask
+
74from typing import Union, List
+75
+76import torch
+77from torch import nn, Size
+78
+79from labml_nn.normalization.layer_norm import LayerNorm
+80from labml_nn.transformers import MultiHeadAttention
+81from labml_nn.transformers.feed_forward import FeedForward
+82from labml_nn.transformers.utils import subsequent_mask
@@ -110,7 +110,7 @@
-
86class DeepNorm(nn.Module):
+
85class DeepNorm(nn.Module):
@@ -125,9 +125,9 @@
-
93    def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size], *,
-94                 eps: float = 1e-5,
-95                 elementwise_affine: bool = True):
+
92    def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size], *,
+93                 eps: float = 1e-5,
+94                 elementwise_affine: bool = True):
@@ -135,12 +135,12 @@ - +
-
102        super().__init__()
-103
-104        self.alpha = alpha
+
101        super().__init__()
+102
+103        self.alpha = alpha
@@ -152,7 +152,7 @@
-
106        self.layer_norm = LayerNorm(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
+
105        self.layer_norm = LayerNorm(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
@@ -165,7 +165,7 @@
-
108    def forward(self, x: torch.Tensor, gx: torch.Tensor):
+
107    def forward(self, x: torch.Tensor, gx: torch.Tensor):
@@ -177,7 +177,7 @@
-
114        return x + self.alpha * gx
+
113        return x + self.alpha * gx
@@ -190,7 +190,7 @@
-
117class DeepNormTransformerLayer(nn.Module):
+
116class DeepNormTransformerLayer(nn.Module):
@@ -206,13 +206,13 @@
-
124    def __init__(self, *,
-125                 d_model: int,
-126                 self_attn: MultiHeadAttention,
-127                 feed_forward: FeedForward,
-128                 deep_norm_alpha: float,
-129                 deep_norm_beta: float,
-130                 ):
+
123    def __init__(self, *,
+124                 d_model: int,
+125                 self_attn: MultiHeadAttention,
+126                 feed_forward: FeedForward,
+127                 deep_norm_alpha: float,
+128                 deep_norm_beta: float,
+129                 ):
@@ -220,13 +220,13 @@ - +
-
138        super().__init__()
-139
-140        self.self_attn = self_attn
-141        self.feed_forward = feed_forward
+
137        super().__init__()
+138
+139        self.self_attn = self_attn
+140        self.feed_forward = feed_forward
@@ -238,8 +238,8 @@
-
143        self.self_attn_norm = DeepNorm(deep_norm_alpha, [d_model])
-144        self.feed_forward_norm = DeepNorm(deep_norm_alpha, [d_model])
+
142        self.self_attn_norm = DeepNorm(deep_norm_alpha, [d_model])
+143        self.feed_forward_norm = DeepNorm(deep_norm_alpha, [d_model])
@@ -251,7 +251,7 @@
-
147        with torch.no_grad():
+
146        with torch.no_grad():
@@ -263,8 +263,8 @@
-
149            feed_forward.layer1.weight *= deep_norm_beta
-150            feed_forward.layer2.weight *= deep_norm_beta
+
148            feed_forward.layer1.weight *= deep_norm_beta
+149            feed_forward.layer2.weight *= deep_norm_beta
@@ -276,7 +276,7 @@
-
153            self_attn.value.linear.weight *= deep_norm_beta
+
152            self_attn.value.linear.weight *= deep_norm_beta
@@ -288,7 +288,7 @@
-
155            self_attn.output.weight *= deep_norm_beta
+
154            self_attn.output.weight *= deep_norm_beta
@@ -300,7 +300,7 @@
-
158        self.mask = None
+
157        self.mask = None
@@ -313,7 +313,7 @@
-
160    def forward(self, x: torch.Tensor):
+
159    def forward(self, x: torch.Tensor):
@@ -325,7 +325,7 @@
-
165        if self.mask is None or self.mask.size(0) != len(x):
+
164        if self.mask is None or self.mask.size(0) != len(x):
@@ -337,7 +337,7 @@
-
167            self.mask = subsequent_mask(len(x)).to(x.device)
+
166            self.mask = subsequent_mask(len(x)).to(x.device)
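The mask is built lazily and cached, so it is only recomputed when the sequence length changes. A standalone sketch of the causal masking idea (the exact shape and convention of the labml subsequent_mask helper may differ):

import torch

def causal_mask(seq_len: int) -> torch.Tensor:
    # True where attention is allowed: position i may attend to positions <= i
    return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

mask = causal_mask(4)
print(mask)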
@@ -349,7 +349,7 @@
-
170        x = self.self_attn_norm(x, self.self_attn(query=x, key=x, value=x, mask=self.mask))
+
169        x = self.self_attn_norm(x, self.self_attn(query=x, key=x, value=x, mask=self.mask))
@@ -361,7 +361,7 @@
-
172        x = self.feed_forward_norm(x, self.feed_forward(x))
+
171        x = self.feed_forward_norm(x, self.feed_forward(x))
@@ -373,7 +373,7 @@
-
175        return x
+
174        return x