Mirror of https://github.com/labmlai/annotated_deep_learning_paper_implementations.git, synced 2025-08-14 01:13:00 +08:00
copy multiple
labml_nn/experiments/copy_perm/continous.py (new file, 179 lines)
@@ -0,0 +1,179 @@
import random
from typing import List

import torch
from torch.utils.data import DataLoader, Dataset

from labml import tracker
from labml.configs import option
from labml_helpers.train_valid import BatchIndex
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs, transpose_batch

class CopyPermRepeatDataset(Dataset):
    """
    Dataset that generates sequences of random characters in which substrings of the
    previously generated text are repeated after a `>` marker, together with a mask
    marking the repeated (predictable) positions.
    """

    def __init__(self, seq_len: int, substr_len: int, rnd_len: int, n_sequences: int):
        """
        :param seq_len: is the sequence length of the generated copy problems.
            We fill as many problems as possible up to this length.
        :param substr_len: is the length of the substrings that get copied.
        :param rnd_len: is the upper bound (exclusive) on the length of the random filler strings.
        :param n_sequences: is the number of sequences per epoch.
        """
        self.rnd_len = rnd_len
        self.substr_len = substr_len
        self.n_sequences = n_sequences
        self.seq_len = seq_len
        self.letters = 'acgt'  # string.ascii_lowercase # '01' # 'acgt' #
        # Token id to string
        self.itos = list(self.letters + '>')
        # Character to token id
        self.stoi = {c: i for i, c in enumerate(self.itos)}

    def random_string(self, n_chars: int):
        """
        Generate a random string of `n_chars` characters
        """
        return ''.join(random.choice(self.letters) for _ in range(n_chars))

    def generate_problem(self):
        """
        Generate a problem string and a mask of the positions to predict
        """
        # Start with a random string of `substr_len` characters
        pure = self.random_string(self.substr_len)
        out = pure
        mask = [False] * len(out)
        # Keep appending until the sequence is long enough
        while len(out) <= self.seq_len:
            # Append a random filler string followed by the `>` marker
            s = self.random_string(random.randrange(1, self.rnd_len))
            out += s + '>'
            mask += [False] * (len(s) + 1)
            pure += s

            # Pick a random substring of everything generated so far
            offset = random.randrange(0, len(pure) - self.substr_len)
            copy = pure[offset:offset + self.substr_len]

            # Append the copy; everything except its first character is predictable
            out += copy
            mask += [False] + [True] * (self.substr_len - 1)
            pure += copy

        return out, mask

    def encode(self, s: str):
        """
        Encode a given string
        """
        return [self.stoi[c] for c in s]

    def decode(self, arr: List[int]):
        """
        Decode a list of token ids
        """
        return ''.join([self.itos[c] for c in arr])

    def __getitem__(self, idx: int):
        """
        Get an input and target pair for auto-regressive modelling
        """
        s, mask = self.generate_problem()
        s = torch.tensor(self.encode(s))
        mask = torch.tensor(mask)
        # Targets are `-1` wherever the position is not part of a copy,
        # so the loss function (with `ignore_index=-1`) skips them
        target = s * mask + -1 * (~mask)
        return s[:self.seq_len], target[1:self.seq_len + 1]

    def __len__(self):
        """
        Number of sequences per epoch
        """
        return self.n_sequences

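A minimal sketch of what this dataset produces, with small parameter values assumed only for readability; the concrete strings are random:

# Minimal sketch (assumed parameter values) of what `CopyPermRepeatDataset` produces
dataset = CopyPermRepeatDataset(seq_len=32, substr_len=4, rnd_len=4, n_sequences=1)

# A raw problem: random text and copies separated by `>` markers,
# with a boolean mask over the predictable (copied) positions
out, mask = dataset.generate_problem()
print(out)
print(''.join('^' if m else '.' for m in mask))

# A training pair: token ids for the input and shifted targets,
# with `-1` at positions the loss should ignore
x, y = dataset[0]
print(x.shape, y.shape)  # both have `seq_len` elements
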
class CopyRepeatAutoregression(NLPAutoRegressionConfigs):
    """
    ## Copy Task Experiment Configurations
    """
    # Number of training sequences per epoch
    train_sequences_per_epoch: int = 2 ** 12
    # Training data loader
    train_loader: DataLoader = 'copy_train_loader'
    # Number of problems in evaluation
    n_tests: int = 64
    # No need of a validation dataset
    validator = None
    # Number of times to run evaluations per epoch
    inner_iterations = 4
    # Number of tokens in the vocabulary
    n_tokens = len(CopyPermRepeatDataset(1, 1, 1, 1).itos)

    # Length of the copied substring
    substr_len: int = 16
    # Upper bound (exclusive) on the length of the random filler strings
    rnd_len: int = 16

    @torch.no_grad()
    def sample(self):
        """
        Sampling is not needed for this task, so this does nothing
        """
        pass

    def step(self, batch: any, batch_idx: BatchIndex):
        """
        ### Training or validation step
        """

        # Set training/eval mode
        self.model.train(self.mode.is_train)

        # Move data to the device
        data, target = batch[0].to(self.device), batch[1].to(self.device)

        # Update global step (number of tokens processed) when in training mode
        if self.mode.is_train:
            tracker.add_global_step(data.shape[0] * data.shape[1])

        # Whether to capture model outputs
        with self.mode.update(is_log_activations=batch_idx.is_last and self.is_log_model_activations):
            # Get model outputs.
            # It's returning a tuple for states when using RNNs.
            # This is not implemented yet. 😜
            output, *_ = self.model(data)

        # Calculate and log loss
        loss = self.loss_func(output, target)
        tracker.add("loss.", loss)

        # Calculate and log accuracy
        self.accuracy(output, target)
        self.accuracy.track()

        self.other_metrics(output, target)

        # Train the model
        if self.mode.is_train:
            # Calculate gradients
            loss.backward()
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
            # Take optimizer step
            self.optimizer.step()
            # Log the model parameters and gradients on last batch of every epoch
            if batch_idx.is_last and self.is_log_model_params_grads:
                tracker.add('model', self.model)
            # Clear the gradients
            self.optimizer.zero_grad()

        # Save the tracked metrics
        tracker.save()

@option(CopyRepeatAutoregression.train_loader)
def copy_train_loader(c: CopyRepeatAutoregression):
    """
    Training data loader
    """
    return DataLoader(CopyPermRepeatDataset(c.seq_len, c.substr_len, c.rnd_len, c.train_sequences_per_epoch),
                      batch_size=c.batch_size,
                      collate_fn=transpose_batch)
                      # num_workers=4)

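A minimal check of the batch layout such a loader produces, using the imports above; the sizes are assumed for illustration. Inputs and targets come out sequence-first with shape `[seq_len, batch_size]`, and non-copy target positions are `-1`:

# Illustrative check of the batch layout produced by a `copy_train_loader`-style loader
loader = DataLoader(CopyPermRepeatDataset(seq_len=64, substr_len=16, rnd_len=16, n_sequences=128),
                    batch_size=8, collate_fn=transpose_batch)
data, target = next(iter(loader))
assert data.shape == (64, 8)    # [seq_len, batch_size]
assert target.shape == (64, 8)
assert (target == -1).any()     # non-copy positions are ignored by the loss
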
def _test():
    """
    Code to test generated problems
    """
    dataset = CopyPermRepeatDataset(32, 8, 8, 1)

    print(dataset.generate_problem())


#
if __name__ == '__main__':
    _test()

labml_nn/experiments/nlp_autoregression.py
@@ -19,7 +19,7 @@ from labml.configs import option
 from labml.logger import Text
 from labml_helpers.datasets.text import TextDataset, SequentialDataLoader, SequentialUnBatchedDataset, TextFileDataset
 from labml_helpers.device import DeviceConfigs
-from labml_helpers.metrics.accuracy import Accuracy, AccuracyMovingAvg
+from labml_helpers.metrics.accuracy import AccuracyMovingAvg
 from labml_helpers.module import Module
 from labml_helpers.train_valid import TrainValidConfigs, hook_model_outputs, BatchIndex
 from labml_nn.optimizers.configs import OptimizerConfigs
@@ -30,9 +30,9 @@ class CrossEntropyLoss(Module):
     ### Cross entropy loss
     """

-    def __init__(self):
+    def __init__(self, ignore_index: int = -100):
         super().__init__()
-        self.loss = nn.CrossEntropyLoss()
+        self.loss = nn.CrossEntropyLoss(ignore_index=ignore_index)

     def forward(self, outputs, targets):
         return self.loss(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
@@ -75,7 +75,7 @@ class NLPAutoRegressionConfigs(TrainValidConfigs):
     is_save_models = True

     # Loss function
-    loss_func = CrossEntropyLoss()
+    loss_func = CrossEntropyLoss(ignore_index=-1)
     # Accuracy function
     accuracy = AccuracyMovingAvg()
     # Model embedding size
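The `ignore_index=-1` default pairs with the `-1` targets produced by the copy dataset: positions whose target is `-1` contribute nothing to the loss, so training focuses on the copied characters. A small sketch with assumed toy values:

import torch
import torch.nn as nn

# Illustration only: two time steps over a 5-token vocabulary
logits = torch.randn(2, 5)
targets = torch.tensor([3, -1])   # the second position is masked out

loss = nn.CrossEntropyLoss(ignore_index=-1)(logits, targets)
# Equivalent to computing the loss on the first position alone
ref = nn.CrossEntropyLoss()(logits[:1], targets[:1])
assert torch.allclose(loss, ref)
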
@@ -297,10 +297,7 @@ def transpose_batch(batch):

     transposed_data = list(zip(*batch))
     # Stack the batch along the second dimension `dim=1`
-    src = torch.stack(transposed_data[0], dim=1)
-    tgt = torch.stack(transposed_data[1], dim=1)
-
-    return src, tgt
+    return tuple(torch.stack(d, dim=1) for d in transposed_data)


 @option(NLPAutoRegressionConfigs.train_loader)
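An illustrative sketch (toy shapes assumed) of what the rewritten `transpose_batch` does: it stacks a list of `(input, target)` pairs into sequence-first tensors, and now works for any number of fields per sample:

import torch

# Illustration only: a batch of 3 samples, each a pair of length-5 sequences
batch = [(torch.arange(5), torch.arange(5)) for _ in range(3)]

transposed_data = list(zip(*batch))
out = tuple(torch.stack(d, dim=1) for d in transposed_data)

src, tgt = out
assert src.shape == (5, 3)   # [seq_len, batch_size]
assert tgt.shape == (5, 3)
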
@@ -0,0 +1,93 @@
"""
---
title: Rotary Positional Embeddings with Relative distance (RoPER) Experiment
summary: This experiment trains a transformer model with Rotary Positional Embeddings with
    Relative Distance (RoPER) on the copy task.
---

# Rotary Positional Embeddings with Relative distance ([RoPER](index.html)) Experiment
"""

from labml import experiment
from labml.configs import calculate
from labml_nn.experiments.copy_perm import CopyAutoregression
from labml_nn.experiments.copy_perm.continous import CopyRepeatAutoregression
from labml_nn.transformers import TransformerConfigs
from labml_nn.transformers.rope.experiment import Configs as RoPEConfigs

class Configs(RoPEConfigs, CopyRepeatAutoregression):
    """
    We inherit the [RoPE experiment](../experiment.html) and use it for the copy task.

    We add the option to change attention to use Rotary Positional Embeddings with Relative distance (RoPER)
    below.
    """
    pass

def _rotary_value_pe_mha(c: TransformerConfigs):
    """
    Use Rotary Positional Embeddings with Relative distance ([RoPER](index.html)) in attention.
    """
    from labml_nn.transformers.rope.value_pe import RotaryValuePEMultiHeadAttention
    return RotaryValuePEMultiHeadAttention(c.n_heads, c.d_model, 1., 1.)


# Configuration options
calculate(TransformerConfigs.encoder_attn, 'rotary_value', _rotary_value_pe_mha)
calculate(TransformerConfigs.decoder_attn, 'rotary_value', _rotary_value_pe_mha)
calculate(TransformerConfigs.decoder_mem_attn, 'rotary_value', _rotary_value_pe_mha)

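As a sketch of how the registered option would be enabled (the `main()` below keeps it commented out and trains with `'relative'` attention instead), selecting RoPER attention for the encoder is just a configuration override:

# Illustration only: enable the RoPER attention option registered above
conf = Configs()
experiment.configs(conf, {
    'transformer.src_embed': 'no_pos',
    'transformer.tgt_embed': 'no_pos',
    'transformer.encoder_attn': 'rotary_value',
})
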
def main():
    # Create experiment
    experiment.create(name="roper_copy", comment="rotary rl 01", writers={'screen', 'labml'})
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # No fixed positional embeddings
        'transformer.src_embed': 'no_pos',
        'transformer.tgt_embed': 'no_pos',

        # Encoder with RoPER attention
        # 'transformer.encoder_attn': 'rotary_value',
        # Encoder with relative multi-head attention
        'transformer.encoder_attn': 'relative',

        #
        'model': 'rotary_pe_transformer',

        # Use a context size of $512$
        'seq_len': 512,
        # Train for $20$ epochs
        'epochs': 20,
        # Batch size $16$
        'batch_size': 16,

        # Model size
        'd_model': 128,
        'transformer.ffn.d_ff': 512,
        'transformer.n_heads': 4,
        'transformer.n_layers': 3,
        'transformer.dropout': 0.0,

        # Use the Adam optimizer
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 2.5e-4,
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()


#
if __name__ == '__main__':
    main()