From af233cfd73d8346059dc657cf120c2449450ea0a Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Date: Tue, 7 Jun 2022 14:00:12 +0530
Subject: [PATCH] copy multiple

---
 .../{copy_perm.py => copy_perm/__init__.py}  |   0
 labml_nn/experiments/copy_perm/continous.py  | 179 ++++++++++++++++++
 labml_nn/experiments/nlp_autoregression.py   |  13 +-
 .../rope/value_pe/experiments/__init__.py    |   0
 .../arithmetic_experiment.py                 |   0
 .../{ => experiments}/copy_experiment.py     |   0
 .../rope/value_pe/experiments/copy_repeat.py |  93 +++++++++
 .../value_pe/{ => experiments}/experiment.py |   0
 8 files changed, 277 insertions(+), 8 deletions(-)
 rename labml_nn/experiments/{copy_perm.py => copy_perm/__init__.py} (100%)
 create mode 100644 labml_nn/experiments/copy_perm/continous.py
 create mode 100644 labml_nn/transformers/rope/value_pe/experiments/__init__.py
 rename labml_nn/transformers/rope/value_pe/{ => experiments}/arithmetic_experiment.py (100%)
 rename labml_nn/transformers/rope/value_pe/{ => experiments}/copy_experiment.py (100%)
 create mode 100644 labml_nn/transformers/rope/value_pe/experiments/copy_repeat.py
 rename labml_nn/transformers/rope/value_pe/{ => experiments}/experiment.py (100%)

diff --git a/labml_nn/experiments/copy_perm.py b/labml_nn/experiments/copy_perm/__init__.py
similarity index 100%
rename from labml_nn/experiments/copy_perm.py
rename to labml_nn/experiments/copy_perm/__init__.py
diff --git a/labml_nn/experiments/copy_perm/continous.py b/labml_nn/experiments/copy_perm/continous.py
new file mode 100644
index 00000000..c462b16c
--- /dev/null
+++ b/labml_nn/experiments/copy_perm/continous.py
@@ -0,0 +1,179 @@
+import random
+from typing import List
+
+import torch
+from torch.utils.data import DataLoader, Dataset
+
+from labml import tracker
+from labml.configs import option
+from labml_helpers.train_valid import BatchIndex
+from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs, transpose_batch
+
+
+class CopyPermRepeatDataset(Dataset):
+    """Generates copy-repeat problems: random filler segments terminated by `>`
+    markers, interleaved with substrings copied from earlier in the sequence."""
+
+    def __init__(self, seq_len: int, substr_len: int, rnd_len: int, n_sequences: int):
+        """
+        :param seq_len: is the sequence length of the generated copy problems. We fit as many problems as possible up to this length.
+        """
+        self.rnd_len = rnd_len
+        self.substr_len = substr_len
+        self.n_sequences = n_sequences
+        self.seq_len = seq_len
+        self.letters = 'acgt'  # alphabet for generated strings; alternatives such as string.ascii_lowercase or '01' also work
+        # Token id to string
+        self.itos = list(self.letters + '>')
+        # Character to token id
+        self.stoi = {c: i for i, c in enumerate(self.itos)}
+
+    def random_string(self, n_chars: int):
+        return ''.join(random.choice(self.letters) for _ in range(n_chars))
+
+    def generate_problem(self):
+        pure = self.random_string(self.substr_len)
+        out = pure
+        mask = [False] * len(out)
+        while len(out) <= self.seq_len:
+            s = self.random_string(random.randrange(1, self.rnd_len))
+            out += s + '>'
+            mask += [False] * (len(s) + 1)
+            pure += s
+
+            offset = random.randrange(0, len(pure) - self.substr_len)
+            copy = pure[offset:offset + self.substr_len]
+
+            out += copy
+            mask += [False] + [True] * (self.substr_len - 1)
+            pure += copy
+
+        return out, mask
+
+    def encode(self, s: str):
+        """
+        Encode a given string
+        """
+        return [self.stoi[c] for c in s]
+
+    def decode(self, arr: List[int]):
+        """
+        Decode a list of token ids
+        """
+        return ''.join([self.itos[c] for c in arr])
+
+    def __getitem__(self, idx: int):
+        """
+        Get an input and target pair for auto-regressive modelling
+        """
+        s, mask = self.generate_problem()
+        s = torch.tensor(self.encode(s))
+        mask = torch.tensor(mask)
+        target = s * mask + -1 * (~mask)  # -1 everywhere except the copied span; the loss ignores -1
+        return s[:self.seq_len], target[1:self.seq_len + 1]
+
+    def __len__(self):
+        """
+        Number of sequences per epoch
+        """
+        return self.n_sequences
+
+
+class CopyRepeatAutoregression(NLPAutoRegressionConfigs):
+    """
+    ## Copy-Repeat Task Experiment Configurations
+    """
+    # Number of training sequences per epoch
+    train_sequences_per_epoch: int = 2 ** 12
+    # Training data loader
+    train_loader: DataLoader = 'copy_train_loader'
+    # Number of problems in evaluation
+    n_tests: int = 64
+    # No need for a validation dataset
+    validator = None
+    # Number of times to run evaluations per epoch
+    inner_iterations = 4
+    # Number of tokens in the vocabulary
+    n_tokens = len(CopyPermRepeatDataset(1, 1, 1, 1).itos)
+
+    substr_len: int = 16
+    rnd_len: int = 16
+
+    @torch.no_grad()
+    def sample(self):
+        pass
+
+    def step(self, batch: any, batch_idx: BatchIndex):
+        """
+        ### Training or validation step
+        """
+
+        # Set training/eval mode
+        self.model.train(self.mode.is_train)
+
+        # Move data to the device
+        data, target = batch[0].to(self.device), batch[1].to(self.device)
+
+        # Update global step (number of tokens processed) when in training mode
+        if self.mode.is_train:
+            tracker.add_global_step(data.shape[0] * data.shape[1])
+
+        # Whether to capture model outputs
+        with self.mode.update(is_log_activations=batch_idx.is_last and self.is_log_model_activations):
+            # Get model outputs.
+            # It's returning a tuple for states when using RNNs.
+            # This is not implemented yet. 😜
+            output, *_ = self.model(data)
+
+        # Calculate and log loss
+        loss = self.loss_func(output, target)
+        tracker.add("loss.", loss)
+
+        # Calculate and log accuracy
+        self.accuracy(output, target)
+        self.accuracy.track()
+
+        self.other_metrics(output, target)
+
+        # Train the model
+        if self.mode.is_train:
+            # Calculate gradients
+            loss.backward()
+            # Clip gradients
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+            # Take optimizer step
+            self.optimizer.step()
+            # Log the model parameters and gradients on last batch of every epoch
+            if batch_idx.is_last and self.is_log_model_params_grads:
+                tracker.add('model', self.model)
+            # Clear the gradients
+            self.optimizer.zero_grad()
+
+        # Save the tracked metrics
+        tracker.save()
+
+
+@option(CopyRepeatAutoregression.train_loader)
+def copy_train_loader(c: CopyRepeatAutoregression):
+    """
+    Training data loader
+    """
+    return DataLoader(CopyPermRepeatDataset(c.seq_len, c.substr_len, c.rnd_len, c.train_sequences_per_epoch),
+                      batch_size=c.batch_size,
+                      collate_fn=transpose_batch)
+    # num_workers=4)
+
+
+def _test():
+    """
+    Code to test generated problems
+    """
+    dataset = CopyPermRepeatDataset(32, 8, 8, 1)
+
+    print(dataset.generate_problem())
+
+
+#
+if __name__ == '__main__':
+    _test()
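
A minimal sketch for eyeballing the generated problems (assumes the patched `labml_nn` package is on `PYTHONPATH`; the exact output depends on the random seed):

    import random

    from labml_nn.experiments.copy_perm.continous import CopyPermRepeatDataset

    random.seed(0)
    ds = CopyPermRepeatDataset(seq_len=16, substr_len=4, rnd_len=4, n_sequences=1)
    s, mask = ds.generate_problem()
    # `s` is an initial random substring, then filler segments each terminated
    # by '>', interleaved with substrings copied from earlier positions;
    # `mask` is True only on the copied characters that get scored in training.
    print(s)
    print(''.join('^' if m else '.' for m in mask))
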
diff --git a/labml_nn/experiments/nlp_autoregression.py b/labml_nn/experiments/nlp_autoregression.py
index 1f4d1f40..cdcd0cc6 100644
--- a/labml_nn/experiments/nlp_autoregression.py
+++ b/labml_nn/experiments/nlp_autoregression.py
@@ -19,7 +19,7 @@ from labml.configs import option
 from labml.logger import Text
 from labml_helpers.datasets.text import TextDataset, SequentialDataLoader, SequentialUnBatchedDataset, TextFileDataset
 from labml_helpers.device import DeviceConfigs
-from labml_helpers.metrics.accuracy import Accuracy, AccuracyMovingAvg
+from labml_helpers.metrics.accuracy import AccuracyMovingAvg
 from labml_helpers.module import Module
 from labml_helpers.train_valid import TrainValidConfigs, hook_model_outputs, BatchIndex
 from labml_nn.optimizers.configs import OptimizerConfigs
@@ -30,9 +30,9 @@ class CrossEntropyLoss(Module):
     ### Cross entropy loss
     """
 
-    def __init__(self):
+    def __init__(self, ignore_index: int = -100):
         super().__init__()
-        self.loss = nn.CrossEntropyLoss()
+        self.loss = nn.CrossEntropyLoss(ignore_index=ignore_index)
 
     def forward(self, outputs, targets):
         return self.loss(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
@@ -75,7 +75,7 @@ class NLPAutoRegressionConfigs(TrainValidConfigs):
     is_save_models = True
 
     # Loss function
-    loss_func = CrossEntropyLoss()
+    loss_func = CrossEntropyLoss(ignore_index=-1)
     # Accuracy function
     accuracy = AccuracyMovingAvg()
     # Model embedding size
@@ -297,10 +297,7 @@ def transpose_batch(batch):
     transposed_data = list(zip(*batch))
 
     # Stack the batch along the second dimension `dim=1`
-    src = torch.stack(transposed_data[0], dim=1)
-    tgt = torch.stack(transposed_data[1], dim=1)
-
-    return src, tgt
+    return tuple(torch.stack(d, dim=1) for d in transposed_data)
 
 
 @option(NLPAutoRegressionConfigs.train_loader)
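
The `ignore_index` plumbing above is what lets the copy task score only the copied span: `CopyPermRepeatDataset.__getitem__` sets targets to -1 everywhere else, and `nn.CrossEntropyLoss(ignore_index=-1)` skips those positions. A minimal sketch in plain PyTorch:

    import torch
    import torch.nn as nn

    # Positions with target -1 contribute nothing; the loss averages the rest.
    loss_fn = nn.CrossEntropyLoss(ignore_index=-1)
    logits = torch.randn(6, 5)                     # [positions, vocabulary]
    targets = torch.tensor([-1, -1, 2, 0, 3, -1])  # only three positions scored
    print(loss_fn(logits, targets))
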
diff --git a/labml_nn/transformers/rope/value_pe/experiments/__init__.py b/labml_nn/transformers/rope/value_pe/experiments/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/labml_nn/transformers/rope/value_pe/arithmetic_experiment.py b/labml_nn/transformers/rope/value_pe/experiments/arithmetic_experiment.py
similarity index 100%
rename from labml_nn/transformers/rope/value_pe/arithmetic_experiment.py
rename to labml_nn/transformers/rope/value_pe/experiments/arithmetic_experiment.py
diff --git a/labml_nn/transformers/rope/value_pe/copy_experiment.py b/labml_nn/transformers/rope/value_pe/experiments/copy_experiment.py
similarity index 100%
rename from labml_nn/transformers/rope/value_pe/copy_experiment.py
rename to labml_nn/transformers/rope/value_pe/experiments/copy_experiment.py
diff --git a/labml_nn/transformers/rope/value_pe/experiments/copy_repeat.py b/labml_nn/transformers/rope/value_pe/experiments/copy_repeat.py
new file mode 100644
index 00000000..40c3ac85
--- /dev/null
+++ b/labml_nn/transformers/rope/value_pe/experiments/copy_repeat.py
@@ -0,0 +1,93 @@
+"""
+---
+title: Rotary Positional Embeddings with Relative distance (RoPER) Experiment
+summary: This experiment trains a transformer model with Rotary Positional Embeddings with
+  Relative Distance (RoPER) on the copy-repeat task.
+---
+
+# Rotary Positional Embeddings with Relative distance ([RoPER](../index.html)) Experiment
+"""
+
+from labml import experiment
+from labml.configs import calculate
+from labml_nn.experiments.copy_perm import CopyAutoregression
+from labml_nn.experiments.copy_perm.continous import CopyRepeatAutoregression
+from labml_nn.transformers import TransformerConfigs
+from labml_nn.transformers.rope.experiment import Configs as RoPEConfigs
+
+
+class Configs(RoPEConfigs, CopyRepeatAutoregression):
+    """
+    We inherit the [RoPE experiment](../../experiment.html) and use it for
+    the copy-repeat task defined in `labml_nn.experiments.copy_perm.continous`.
+
+    We add the option to change attention to use Rotary Positional Embeddings with Relative distance (RoPER)
+    below.
+    """
+    pass
+
+
+def _rotary_value_pe_mha(c: TransformerConfigs):
+    """
+    Use Rotary Positional Embeddings with Relative distance ([RoPER](../index.html)) in attention.
+    """
+    from labml_nn.transformers.rope.value_pe import RotaryValuePEMultiHeadAttention
+    return RotaryValuePEMultiHeadAttention(c.n_heads, c.d_model, 1., 1.)
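+    # Note: the two trailing `1.` arguments are assumed to be the rotary
+    # fractions (`rope_percentage` and `rope_value_percentage`), i.e. the
+    # rotary encoding is applied to all features of keys/queries and values.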
+
+
+# Configuration options
+calculate(TransformerConfigs.encoder_attn, 'rotary_value', _rotary_value_pe_mha)
+calculate(TransformerConfigs.decoder_attn, 'rotary_value', _rotary_value_pe_mha)
+calculate(TransformerConfigs.decoder_mem_attn, 'rotary_value', _rotary_value_pe_mha)
+
+
+def main():
+    # Create experiment
+    experiment.create(name="roper_copy", comment="rotary rl 01", writers={'screen', 'labml'})
+    # Create configs
+    conf = Configs()
+    # Override configurations
+    experiment.configs(conf, {
+        # No fixed positional embeddings
+        'transformer.src_embed': 'no_pos',
+        'transformer.tgt_embed': 'no_pos',
+
+        # Encoder with RoPER attention
+        # 'transformer.encoder_attn': 'rotary_value',
+        # Encoder with relative multi-head attention
+        'transformer.encoder_attn': 'relative',
+
+        #
+        'model': 'rotary_pe_transformer',
+
+        # Use a context size of $512$
+        'seq_len': 512,
+        # Train for $20$ epochs
+        'epochs': 20,
+        # Batch size $16$
+        'batch_size': 16,
+
+        # Model size
+        'd_model': 128,
+        'transformer.ffn.d_ff': 512,
+        'transformer.n_heads': 4,
+        'transformer.n_layers': 3,
+        'transformer.dropout': 0.0,
+
+        # Use the Adam optimizer
+        'optimizer.optimizer': 'Adam',
+        'optimizer.learning_rate': 2.5e-4,
+    })
+
+    # Set models for saving and loading
+    experiment.add_pytorch_models({'model': conf.model})
+
+    # Start the experiment
+    with experiment.start():
+        # Run training
+        conf.run()
+
+
+#
+if __name__ == '__main__':
+    main()
diff --git a/labml_nn/transformers/rope/value_pe/experiment.py b/labml_nn/transformers/rope/value_pe/experiments/experiment.py
similarity index 100%
rename from labml_nn/transformers/rope/value_pe/experiment.py
rename to labml_nn/transformers/rope/value_pe/experiments/experiment.py
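
A minimal check of the generalized `transpose_batch` (assumes the patched `labml_nn` is importable): it now stacks every tensor in the sample tuple along `dim=1`, so collated batches come out sequence-first for any number of per-sample tensors, not just `(src, tgt)` pairs.

    import torch

    from labml_nn.experiments.nlp_autoregression import transpose_batch

    # Four (input, target) samples of length 8 collate to two [8, 4] tensors.
    batch = [(torch.zeros(8, dtype=torch.long), torch.ones(8, dtype=torch.long))
             for _ in range(4)]
    src, tgt = transpose_batch(batch)
    print(src.shape, tgt.shape)  # torch.Size([8, 4]) torch.Size([8, 4])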