This is an annotated PyTorch experiment that trains a transformer with ALiBi (Attention with Linear Biases).
It is based on our GPT model, with the positional embeddings removed and the attention modules replaced by ALiBi attention.
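ALiBi removes positional embeddings and instead adds a fixed, head-specific linear bias to the attention logits, so attending to distant tokens is penalized at a per-head rate; this lets the model be evaluated on sequences longer than it was trained on. Below is a minimal sketch of those biases, for illustration only (the experiment itself uses AlibiMultiHeadAttention from labml_nn.transformers.alibi):

import torch

def _alibi_slopes(n_heads: int) -> torch.Tensor:
    # Geometric sequence of per-head slopes; this form is exact when n_heads is a power of two
    return torch.tensor([2.0 ** (-8.0 * (i + 1) / n_heads) for i in range(n_heads)])

def _alibi_bias(n_heads: int, seq_len: int) -> torch.Tensor:
    # bias[h, i, j] = -slope[h] * (i - j) for keys j <= i (causal attention);
    # it is added to the attention logits before the softmax
    pos = torch.arange(seq_len)
    distance = (pos[:, None] - pos[None, :]).clamp(min=0)
    return -_alibi_slopes(n_heads)[:, None, None] * distance[None, :, :]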
import torch
from torch.utils.data import DataLoader

from labml import experiment, tracker
from labml.configs import option, calculate
from labml_helpers.datasets.text import SequentialUnBatchedDataset
from labml_nn.transformers.alibi import AlibiMultiHeadAttention
from labml_nn.experiments.nlp_autoregression import transpose_batch
from labml_nn.transformers import TransformerConfigs
from labml_nn.transformers.gpt import Configs as GPTConfigs
class Configs(GPTConfigs):
ALiBi-based transformer (defined below)
    transformer: TransformerConfigs = 'GPT_ALiBi'
Longer validation sequences, served by a shuffled validation data loader (defined below)
    valid_seq_len: int = 128
    valid_loader = 'shuffled_longer_valid_loader'
Log losses at the initial and final tokens
    def other_metrics(self, output: torch.Tensor, target: torch.Tensor):
If there are more tokens than the training sequence length (during validation with longer sequences)
        if self.seq_len < output.shape[0]:
Log the loss at the training sequence length
            tracker.add(f'loss.{self.seq_len - 1}.', self.loss_func(output[self.seq_len - 1], target[self.seq_len - 1]))
Log the loss at the first token
        tracker.add('loss.0.', self.loss_func(output[0], target[0]))
Log the loss at the final token
        tracker.add(f'loss.{int(output.shape[0]) - 1}.', self.loss_func(output[-1], target[-1]))
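Because the model output is a tensor of per-position logits, the loss at a single position is just cross entropy over that time step. A minimal sketch of what the tracked values correspond to, assuming (for illustration) that output is [seq_len, batch_size, n_tokens], target is [seq_len, batch_size], and loss_func is token-level cross entropy:

import torch
import torch.nn.functional as F

seq_len, batch_size, n_tokens = 80, 4, 65
output = torch.randn(seq_len, batch_size, n_tokens)
target = torch.randint(0, n_tokens, (seq_len, batch_size))

loss_at_first_token = F.cross_entropy(output[0], target[0])    # what 'loss.0.' tracks
loss_at_final_token = F.cross_entropy(output[-1], target[-1])  # what 'loss.79.' would track here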
Create an ALiBi attention module
def _alibi_mha(c: TransformerConfigs):
    return AlibiMultiHeadAttention(c.n_heads, c.d_model, dropout_prob=c.dropout)
Set all attention mechanisms to ALiBi
calculate(TransformerConfigs.encoder_attn, 'alibi_mha', _alibi_mha)
calculate(TransformerConfigs.decoder_attn, 'alibi_mha', _alibi_mha)
calculate(TransformerConfigs.decoder_mem_attn, 'alibi_mha', _alibi_mha)
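As a sanity check, the module this calculator builds can also be constructed directly; a minimal sketch, assuming the same constructor arguments as in _alibi_mha above (8 heads, a d_model of 128, and dropout of 0.1 match the configuration used in main below):

from labml_nn.transformers.alibi import AlibiMultiHeadAttention

mha = AlibiMultiHeadAttention(8, 128, dropout_prob=0.1)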
Shuffled validation data loader with valid_seq_len sequence length
@option(Configs.valid_loader)
def shuffled_longer_valid_loader(c: Configs):
    return DataLoader(SequentialUnBatchedDataset(text=c.text.valid,
                                                 dataset=c.text,
                                                 seq_len=c.valid_seq_len),
                      batch_size=c.batch_size,
                      collate_fn=transpose_batch,
                      shuffle=True)
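To see what this loader yields, here is a minimal sketch, assuming transpose_batch collates samples time-major so that data and target come out as [valid_seq_len, batch_size] (the layout other_metrics above indexes into):

def _check_valid_loader(c: Configs):
    # Pull one batch from the validation loader and check its layout
    data, target = next(iter(c.valid_loader))
    assert data.shape[0] == c.valid_seq_len   # one row per time step
    assert target.shape == data.shape         # next-token targets, same layout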
@option(Configs.transformer, 'GPT_ALiBi')
def _transformer_configs(c: Configs):
We use our configurable transformer implementation
    conf = TransformerConfigs()
Set the vocabulary sizes for embeddings and generating logits
    conf.n_src_vocab = c.n_tokens
    conf.n_tgt_vocab = c.n_tokens
GPT uses GELU activation for the position-wise feedforward layer
    conf.ffn.activation = 'GELU'
ALiBi doesn't use positional embeddings
    conf.src_embed = 'no_pos'
    conf.tgt_embed = 'no_pos'
Set all attention mechanisms to ALiBi
    conf.encoder_attn = 'alibi_mha'
    conf.decoder_attn = 'alibi_mha'
    conf.decoder_mem_attn = 'alibi_mha'
    return conf
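Since position is injected through the attention biases, the 'no_pos' embeddings are plain token embeddings with nothing positional added. A minimal sketch of what that means, for illustration only (not the module labml_nn actually builds for 'no_pos'):

import math
import torch
import torch.nn as nn

class TokenEmbeddingsWithoutPositions(nn.Module):
    def __init__(self, n_vocab: int, d_model: int):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, d_model)
        self.d_model = d_model

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale token embeddings by sqrt(d_model); no positional encoding is added
        return self.emb(x) * math.sqrt(self.d_model)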
def main():
Create experiment
    experiment.create(name="gpt_alibi")
Create configs
    conf = Configs()
Override configurations
    experiment.configs(conf, {
Use character level tokenizer
        'tokenizer': 'character',
Prompt separator is blank
        'prompt_separator': '',
Starting prompt for sampling
        'prompt': 'It is ',
Use Tiny Shakespeare dataset (the variant without a train/validation split)
        'text': 'tiny_shakespeare_no_split',
Use a training context size of 64
        'seq_len': 64,
Use a longer validation context size of 80, to see how the model extrapolates beyond the training context
        'valid_seq_len': 80,
Train for 128 epochs
        'epochs': 128,
Batch size of 128
        'batch_size': 128,
Switch between training and validation 10 times per epoch
        'inner_iterations': 10,
Transformer configurations
        'transformer.d_model': 128,
        'transformer.ffn.d_ff': 512,
        'transformer.n_heads': 8,
        'transformer.n_layers': 4,
        'transformer.dropout': 0.1,
    })
Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})
Start the experiment
    with experiment.start():
Run training
        conf.run()
if __name__ == '__main__':
    main()