GPT notes

Varuna Jayasiri
2021-01-14 09:37:12 +05:30
parent 4c769128cb
commit 2927aa217b
6 changed files with 215 additions and 73 deletions

View File

@@ -40,7 +40,9 @@ class CrossEntropyLoss(Module):
class NLPAutoRegressionConfigs(TrainValidConfigs):
"""
<a id="NLPAutoRegressionConfigs">
## Trainer configurations
</a>
This has the basic configurations for training auto-regressive NLP tasks.
All the properties are configurable.

View File

@@ -51,7 +51,7 @@ class AdaBelief(RAdam):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(), amsgrad=False,
degenerated_to_sgd=True,
degenerate_to_sgd=True,
rectify=True, defaults=None):
"""
### Initialize the optimizer
@@ -71,7 +71,7 @@ class AdaBelief(RAdam):
"""
defaults = {} if defaults is None else defaults
super().__init__(params, lr, betas, eps, weight_decay, amsgrad, degenerated_to_sgd, defaults)
super().__init__(params, lr, betas, eps, weight_decay, amsgrad, degenerate_to_sgd, defaults)
self.rectify = rectify
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
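For reference (this part is not shown in the hunk above), the defining difference between AdaBelief and Adam is the second-moment estimate: AdaBelief tracks the variance of the gradient around its own exponential moving average, $s_t = \beta_2 s_{t-1} + (1 - \beta_2)(g_t - m_t)^2 + \epsilon$, in place of Adam's $v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$, so the effective step is larger when the gradient agrees with its recent trend and smaller when it deviates.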

View File

@@ -17,7 +17,9 @@ from labml_nn.optimizers.amsgrad import AMSGrad
class AdamWarmupCosineDecay(AMSGrad):
"""
## Adam Optimizer with Warmup
<a id="AdamWarmupCosineDecay">
## Adam Optimizer with Warmup and Cosine Decay
</a>
This class extends the AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html).
"""

View File

@@ -3,6 +3,8 @@
title: Configurable optimizer module
summary: This implements a configurable module for optimizers.
---
# Configurable Optimizer
"""
from typing import Tuple
@@ -14,28 +16,56 @@ from labml_nn.optimizers import WeightDecay
class OptimizerConfigs(BaseConfigs):
"""
<a id="OptimizerConfigs">
## Optimizer Configurations
</a>
"""
# Optimizer
optimizer: torch.optim.Adam
# Weight decay
weight_decay_obj: WeightDecay
# Whether weight decay is decoupled;
# i.e. weight decay is not added to gradients
weight_decouple: bool = True
# Weight decay
weight_decay: float = 0.0
# Whether weight decay is absolute or should be multiplied by learning rate
weight_decay_absolute: bool = False
# Whether the adam update is optimized (different epsilon)
optimized_adam_update: bool = True
# Parameters to be optimized
parameters: any
# Learning rate $\alpha$
learning_rate: float = 0.01
# Beta values $(\beta_1, \beta_2)$ for Adam
betas: Tuple[float, float] = (0.9, 0.999)
# Epsilon $\epsilon$ for adam
eps: float = 1e-08
# Momentum for SGD
momentum: float = 0.5
# Whether to use AMSGrad
amsgrad: bool = False
# Number of warmup optimizer steps
warmup: int = 2_000
# Total number of optimizer steps (for cosine decay)
total_steps: int = int(1e10)
degenerated_to_sgd: bool = True
# Whether to degenerate to SGD in AdaBelief
degenerate_to_sgd: bool = True
# Whether to use Rectified Adam in AdaBelief
rectify: bool = True
# Model embedding size for Noam optimizer
d_model: int
step_factor: int = 1024
def __init__(self):
super().__init__(_primary='optimizer')
@@ -84,7 +114,7 @@ def _radam_optimizer(c: OptimizerConfigs):
return RAdam(c.parameters,
lr=c.learning_rate, betas=c.betas, eps=c.eps,
weight_decay=c.weight_decay_obj, amsgrad=c.amsgrad,
degenerated_to_sgd=c.degenerated_to_sgd)
degenerated_to_sgd=c.degenerate_to_sgd)
@option(OptimizerConfigs.optimizer, 'AdaBelief')
@@ -93,7 +123,7 @@ def _ada_belief_optimizer(c: OptimizerConfigs):
return AdaBelief(c.parameters,
lr=c.learning_rate, betas=c.betas, eps=c.eps,
weight_decay=c.weight_decay_obj, amsgrad=c.amsgrad,
degenerated_to_sgd=c.degenerated_to_sgd,
degenerate_to_sgd=c.degenerate_to_sgd,
rectify=c.rectify)
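As a rough usage sketch (hedged: `my_model` is a hypothetical module used only for illustration, and in practice the surrounding experiment code drives the configuration), the options above are set on an `OptimizerConfigs` instance and the optimizer is selected by the name of a registered option such as `'AdaBelief'`:

from torch import nn

# A minimal sketch, not taken from the library's documentation;
# `my_model` is hypothetical.
my_model = nn.Linear(512, 512)

opt_conf = OptimizerConfigs()
# Parameters (or parameter groups) to be optimized
opt_conf.parameters = my_model.parameters()
# Pick a registered optimizer option by name; the AdaBelief option is defined above
opt_conf.optimizer = 'AdaBelief'
opt_conf.learning_rate = 3e-4
opt_conf.weight_decay = 0.01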

View File

@@ -18,60 +18,98 @@ from .models import EmbeddingsWithPositionalEncoding, EmbeddingsWithLearnedPosit
class TransformerConfigs(BaseConfigs):
"""
<a id="TransformerConfigs">
## Transformer Configurations
</a>
This defines configurations for a transformer.
The configurations are calculated using option functions.
These are lazily loaded and therefore only the necessary modules
are calculated.
"""
# Number of attention heads
n_heads: int = 8
# Transformer embedding size
d_model: int = 512
# Number of layers
n_layers: int = 6
# Number of features in position-wise feedforward layer
d_ff: int = 2048
# Dropout probability
dropout: float = 0.1
# Number of tokens in the source vocabulary (for token embeddings)
n_src_vocab: int
# Number of tokens in the target vocabulary (to generate logits for prediction)
n_tgt_vocab: int
# The encoder self attention
encoder_attn: MultiHeadAttention = 'mha'
# The decoder self attention
decoder_attn: MultiHeadAttention = 'mha'
# The decoder memory attention
decoder_mem_attn: MultiHeadAttention = 'mha'
# Position-wise feedforward layer
feed_forward: FeedForward
# Activation in position-wise feedforward layer
feed_forward_activation: nn.Module = 'ReLU'
encoder_layer: TransformerLayer = 'normal'
decoder_layer: TransformerLayer = 'normal'
# Encoder layer
encoder_layer: TransformerLayer = 'default'
# Decoder layer
decoder_layer: TransformerLayer = 'default'
encoder: Encoder = 'normal'
decoder: Decoder = 'normal'
# Encoder consisting of multiple encoder layers
encoder: Encoder = 'default'
# Decoder consisting of multiple decoder layers
decoder: Decoder = 'default'
# Embedding layer for source
src_embed: Module = 'fixed_pos'
# Embedding layer for target (for decoder)
tgt_embed: Module = 'fixed_pos'
# Logit generator for prediction
generator: Generator = 'default'
# Encoder-decoder
encoder_decoder: EncoderDecoder
@option(TransformerConfigs.feed_forward_activation, 'ReLU')
def _feed_forward_activation_relu():
"""
ReLU activation
"""
return nn.ReLU()
@option(TransformerConfigs.feed_forward_activation, 'GELU')
def _feed_forward_activation_gelu():
"""
GELU activation
"""
return nn.GELU()
@option(TransformerConfigs.feed_forward, 'default')
def _feed_forward(c: TransformerConfigs):
"""
Create feedforward layer
"""
return FeedForward(c.d_model, c.d_ff, c.dropout, c.feed_forward_activation)
# ## MHA
# ### Multi-head Attention
def _mha(c: TransformerConfigs):
return MultiHeadAttention(c.n_heads, c.d_model)
calculate(TransformerConfigs.encoder_attn, 'mha', _mha)
calculate(TransformerConfigs.decoder_attn, 'mha', _mha)
calculate(TransformerConfigs.decoder_mem_attn, 'mha', _mha)
# ## Relative MHA
# ### Relative Multi-head Attention
def _relative_mha(c: TransformerConfigs):
from .relative_mha import RelativeMultiHeadAttention
return RelativeMultiHeadAttention(c.n_heads, c.d_model)
@@ -82,60 +120,90 @@ calculate(TransformerConfigs.decoder_attn, 'relative', _relative_mha)
calculate(TransformerConfigs.decoder_mem_attn, 'relative', _relative_mha)
@option(TransformerConfigs.encoder_layer, 'normal')
@option(TransformerConfigs.encoder_layer, 'default')
def _encoder_layer(c: TransformerConfigs):
"""
Encoder layer
"""
return TransformerLayer(d_model=c.d_model, self_attn=c.encoder_attn,
src_attn=None, feed_forward=copy.deepcopy(c.feed_forward),
dropout_prob=c.dropout)
@option(TransformerConfigs.decoder_layer, 'normal')
@option(TransformerConfigs.decoder_layer, 'default')
def _decoder_layer(c: TransformerConfigs):
"""
Decoder layer
"""
return TransformerLayer(d_model=c.d_model, self_attn=c.decoder_attn,
src_attn=c.decoder_mem_attn, feed_forward=copy.deepcopy(c.feed_forward),
dropout_prob=c.dropout)
@option(TransformerConfigs.encoder, 'normal')
@option(TransformerConfigs.encoder, 'default')
def _encoder(c: TransformerConfigs):
"""
Encoder
"""
return Encoder(c.encoder_layer, c.n_layers)
@option(TransformerConfigs.decoder, 'normal')
@option(TransformerConfigs.decoder, 'default')
def _decoder(c: TransformerConfigs):
"""
Decoder
"""
return Decoder(c.decoder_layer, c.n_layers)
@option(TransformerConfigs.generator, 'default')
def _generator(c: TransformerConfigs):
"""
Logit generator
"""
return Generator(c.n_tgt_vocab, c.d_model)
# ## Positional Embeddings
@option(TransformerConfigs.src_embed, 'fixed_pos')
def _src_embed_with_positional(c: TransformerConfigs):
"""
Source embedding with fixed positional encodings
"""
return EmbeddingsWithPositionalEncoding(c.d_model, c.n_src_vocab)
@option(TransformerConfigs.tgt_embed, 'fixed_pos')
def _tgt_embed_with_positional(c: TransformerConfigs):
"""
Target embedding with fixed positional encodings
"""
return EmbeddingsWithPositionalEncoding(c.d_model, c.n_tgt_vocab)
# ## Learned Positional Embeddings
@option(TransformerConfigs.src_embed, 'learned_pos')
def _src_embed_with_learned_positional(c: TransformerConfigs):
"""
Source embedding with learned positional encodings
"""
return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_src_vocab)
@option(TransformerConfigs.tgt_embed, 'learned_pos')
def _tgt_embed_with_learned_positional(c: TransformerConfigs):
"""
Target embedding with learned positional encodings
"""
return EmbeddingsWithLearnedPositionalEncoding(c.d_model, c.n_tgt_vocab)
# ## No Positional Embeddings
@option(TransformerConfigs.src_embed, 'no_pos')
def _src_embed_without_positional(c: TransformerConfigs):
"""
Source embedding without positional encodings
"""
return nn.Embedding(c.n_src_vocab, c.d_model)
@@ -144,6 +212,6 @@ def _tgt_embed_without_positional(c: TransformerConfigs):
return nn.Embedding(c.n_tgt_vocab, c.d_model)
@option(TransformerConfigs.encoder_decoder, 'normal')
@option(TransformerConfigs.encoder_decoder, 'default')
def _encoder_decoder(c: TransformerConfigs):
return EncoderDecoder(c.encoder, c.decoder, c.src_embed, c.tgt_embed, c.generator)
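As a hedged sketch of how these configurations are meant to be used (the vocabulary sizes shown are illustrative), switching to a different registered implementation is just a matter of assigning its option name; because options are calculated lazily, only the selected modules are ever constructed:

# A minimal sketch, not from the library's docs; the values are illustrative.
conf = TransformerConfigs()
conf.n_src_vocab = 10_000
conf.n_tgt_vocab = 10_000
# Use relative multi-head attention for encoder self-attention
conf.encoder_attn = 'relative'
# Use learned positional embeddings instead of fixed ones
conf.src_embed = 'learned_pos'
conf.tgt_embed = 'learned_pos'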

View File

@@ -27,11 +27,11 @@ For the transformer we reuse the
"""
import torch
from torch import nn
from labml import experiment
from labml.configs import option
from labml_helpers.module import Module
from torch import nn
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.optimizers.configs import OptimizerConfigs
from labml_nn.transformers import TransformerConfigs, Encoder
@@ -45,6 +45,7 @@ class GPT(Module):
This consists of a token embedding layer, transformer encoder, and
a final linear layer that gives token logits.
"""
def __init__(self, encoder: Encoder, src_embed: Module, generator: Module):
"""
* `encoder` is the transformer [Encoder](../models.html#Encoder)
@@ -82,42 +83,71 @@ class Configs(NLPAutoRegressionConfigs):
"""
## Configurations
This inherits
This inherits from
[`NLPAutoRegressionConfigs`](../../experiments/nlp_autoregression.html#NLPAutoRegressionConfigs)
"""
# GPT model
model: GPT
# Transformer
transformer: TransformerConfigs
# Weight decay
weight_decay: float = 0.1
# Number of tokens for warmup
warmup_steps: int = 128 * 128 * 20
# Custom optimizer
optimizer = 'transformer_optimizer'
@option(Configs.transformer, 'GPT')
def _transformer_configs(c: Configs):
"""
### Transformer configurations
"""
# We use our
# [configurable transformer implementation](../configs.html#TransformerConfigs)
conf = TransformerConfigs()
# Set the vocabulary sizes for embeddings and generating logits
conf.n_src_vocab = c.n_tokens
conf.n_tgt_vocab = c.n_tokens
# GPT uses GELU activation for the position-wise feedforward layer
conf.feed_forward_activation = 'GELU'
#
return conf
def _init_weights(module):
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.data.normal_(mean=0.0, std=0.02)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
"""
### Initialize weights
Weights of linear layers and embedding layers are initialized
to $\mathcal{N}(0, 0.02)$
instead of the default Xavier initialization.
"""
if not isinstance(module, (nn.Linear, nn.Embedding)):
return
module.weight.data.normal_(mean=0.0, std=0.02)
# Initialize biases to $0$
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@option(Configs.model)
def _model(c: Configs):
"""
Create GPT model and initialize weights
"""
m = GPT(c.transformer.encoder,
c.transformer.src_embed,
c.transformer.generator).to(c.device)
# Apply custom weight initialization
m.apply(_init_weights)
return m
@@ -125,39 +155,25 @@ def _model(c: Configs):
@option(NLPAutoRegressionConfigs.optimizer)
def transformer_optimizer(c: NLPAutoRegressionConfigs):
optimizer = OptimizerConfigs()
"""
### Create custom optimizer with weight decay
This code is taken from [minGPT](https://github.com/karpathy/minGPT).
This applies weight decay only to weights of linear layers.
"""
# Collect names of parameters to apply weight decay
decay = set()
no_decay = set()
whitelist_weight_modules = (nn.Linear,)
blacklist_weight_modules = (nn.LayerNorm, nn.Embedding)
for mn, m in c.model.named_modules():
for pn, p in m.named_parameters():
fpn = f'{mn}.{pn}' if mn else pn # full param name
if fpn.find('positional_encodings') != -1:
no_decay.add(fpn)
elif fpn.endswith('bias'):
# all biases will not be decayed
no_decay.add(fpn)
elif fpn.endswith('weight'):
if isinstance(m, whitelist_weight_modules):
# weights of whitelist modules will be weight decayed
decay.add(fpn)
elif isinstance(m, blacklist_weight_modules):
# weights of blacklist modules will NOT be weight decayed
no_decay.add(fpn)
if fpn.endswith('weight') and isinstance(m, nn.Linear):
decay.add(fpn)
# validate that we considered every parameter
# Get all the parameters
param_dict = {pn: p for pn, p in c.model.named_parameters()}
inter_params = decay & no_decay
if inter_params:
raise ValueError("Repeated parameters", inter_params)
missing_params = set(param_dict.keys()) - (decay | no_decay)
if missing_params:
raise ValueError('Missing parameters', missing_params)
# Parameters that are not decayed
no_decay = set(param_dict.keys()) - decay
# create the pytorch optimizer object
opt_groups = [
@@ -165,15 +181,33 @@ def transformer_optimizer(c: NLPAutoRegressionConfigs):
{"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
]
# Create a [configurable optimizer](../optimizers/configs.html#OptimizerConfigs),
# so that we can change these simply by passing
# a config dictionary.
optimizer = OptimizerConfigs()
# Set parameter groups for optimization
optimizer.parameters = opt_groups
# Use [cosine decay optimizer](../optimizers/adam_warmup_cosine_decay.html)
# This is what GPT uses
optimizer.optimizer = 'AdamWarmupCosineDecay'
# Set the model embedding size, which is required if we use the [Noam optimizer](../optimizers/noam.html),
# whose learning rate is scaled by $d_{model}^{-0.5}$
optimizer.d_model = c.d_model
# Set default weight decay.
# This is not required since we set the weight decay in the parameter groups
optimizer.weight_decay = c.weight_decay
# GPT uses a maximum learning rate of $6 \times 10^{-4}$
optimizer.learning_rate = 6e-4
# $\beta_1 = 0.9, \beta_2 = 0.95$
optimizer.betas = (0.9, 0.95)
# $\epsilon = 10^{-8}$
optimizer.eps = 1e-8
# Weight decay decoupled from gradients
optimizer.weight_decouple = True
optimizer.total_steps = c.epochs * len(c.text.train)
# Total number of optimization steps for learning rate cosine decay
optimizer.total_steps = c.epochs * len(c.text.train) // (c.batch_size * c.seq_len)
# Number of warmup optimization steps
optimizer.warmup = c.warmup_steps // (c.batch_size * c.seq_len)
return optimizer
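To make the token-to-step conversion concrete, here is a back-of-the-envelope check (hedged: the ~1.1M character size of the Tiny Shakespeare training text is an approximation, and a character-level tokenizer is assumed, as configured in main() below):

# Illustrative arithmetic only; `train_chars` approximates len(c.text.train).
batch_size, seq_len, epochs = 128, 128, 32
tokens_per_step = batch_size * seq_len             # 16,384 tokens per optimizer step
warmup_tokens = 128 * 128 * 20                     # `warmup_steps` above, in tokens
train_chars = 1_100_000
print(warmup_tokens // tokens_per_step)            # 20 warmup optimizer steps
print(epochs * train_chars // tokens_per_step)     # roughly 2,100 total optimizer steps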
@@ -185,33 +219,39 @@ def main():
# Create configs
conf = Configs()
# Load configurations
experiment.configs(conf,
# A dictionary of configurations to override
{'tokenizer': 'character',
'prompt_separator': '',
'prompt': 'It is ',
'text': 'tiny_shakespeare',
experiment.configs(conf, {
# Use character level tokenizer
'tokenizer': 'character',
# Prompt separator is blank
'prompt_separator': '',
# Starting prompt for sampling
'prompt': 'It is ',
# Use Tiny Shakespeare dataset
'text': 'tiny_shakespeare',
'seq_len': 128,
'epochs': 32,
'batch_size': 128,
'inner_iterations': 10,
# Use a context size of $128$
'seq_len': 128,
# Train for $32$ epochs
'epochs': 32,
# Batch size $128$
'batch_size': 128,
# Switch between training and validation $10$ times
# per epoch
'inner_iterations': 10,
# Transformer configurations
'transformer.d_model': 512,
'transformer.d_ff': 2048,
'transformer.n_heads': 8,
'transformer.n_layers': 6})
# This is needed to initialize models
conf.n_tokens = conf.text.n_tokens
# Transformer configurations
'transformer.d_model': 512,
'transformer.d_ff': 2048,
'transformer.n_heads': 8,
'transformer.n_layers': 6
})
# Set models for saving and loading
experiment.add_pytorch_models({'model': conf.model})
# Start the experiment
with experiment.start():
# `TrainValidConfigs.run`
# Run training
conf.run()