Mirror of https://github.com/labmlai/annotated_deep_learning_paper_implementations.git, synced 2025-08-26 08:41:23 +08:00
multiplication
@@ -96,7 +96,7 @@ class ArithmeticAdditionDataset(Dataset):
         x = self.make_int(n_digits=random.randrange(1, self.max_digits + 1))
         y = self.make_int(n_digits=random.randrange(1, self.max_digits + 1))
 
-        return f'x={x}+{y};', f'{x + y}'
+        return f'?x={x}+{y};', f'{x + y}'
 
     def get_packed_math_input(self):
         """
@@ -197,17 +197,17 @@ class ArithmeticAdditionAutoregression(NLPAutoRegressionConfigs):
             # Get the model prediction (greedy)
             output = output[-1].argmax(dim=-1)
 
+            # Override with the question
+            for j, p in enumerate(questions):
+                if len(p) > i + 1:
+                    output[j] = dataset.stoi[p[i + 1]]
+
             # Find which sequences have finished
             finished = finished | (output == new_line)
             # Skip if all have finished
             if finished.sum() == len(finished):
                 continue
 
-            # Override with the question
-            for j, p in enumerate(questions):
-                if len(p) > i + 1:
-                    output[j] = dataset.stoi[p[i + 1]]
-
             # Add the next token to the input
             data = torch.cat([data, output[None, :]], dim=0)
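Note on the hunk above: the prompt-override loop now runs before the end-of-answer check, so `finished` is computed on the token that is actually fed back into the model. A minimal standalone sketch of why that ordering matters (toy prompts and a hypothetical `stoi`, not the experiment's real configuration):

import torch

# Hypothetical toy vocabulary and prompts mirroring the variables used in the evaluation loop above
stoi = {c: i for i, c in enumerate('\n?x=0123456789*;')}
new_line = stoi['\n']
questions = ['?x=12*3;', '?x=7*8;']
i = 2                                          # current position in the prompt
output = torch.tensor([new_line, stoi['5']])   # greedy predictions at this step

# Force the next prompt character wherever the prompt is not yet exhausted ...
for j, p in enumerate(questions):
    if len(p) > i + 1:
        output[j] = stoi[p[i + 1]]

# ... and only then check for the end-of-answer marker, so a stray '\n' prediction
# inside the prompt region cannot mark that sequence as finished
finished = output == new_line
print(finished)  # tensor([False, False])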
labml_nn/experiments/algo_tasks/arithmetic_multiplication.py (new file, 273 lines)
@@ -0,0 +1,273 @@
"""
|
||||
---
|
||||
title: Arithmetic Dataset
|
||||
summary: >
|
||||
This creates arithmetic problems.
|
||||
---
|
||||
|
||||
*This is based on code by [Georges Harik (@gharik)](https://twitter.com/gharik).*
|
||||
"""
|
||||
|
||||
import random
|
||||
import string
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
from labml import monit, logger, tracker
|
||||
from labml.configs import option
|
||||
from labml.logger import Text
|
||||
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs, transpose_batch
|
||||
|
||||
|
||||
class ArithmeticMultiplicationDataset(Dataset):
|
||||
"""
|
||||
## Arithmetic Dataset
|
||||
|
||||
This creates arithmetic addition problems and solutions with workings.
|
||||
We've only implemented addition so far.
|
||||
|
||||
It's based on a character level tokenization.
|
||||
"""
|
||||
|
||||
def __init__(self, seq_len: int, max_digits: int, base: int, n_sequences: int):
|
||||
"""
|
||||
:param seq_len: is the sequence length of generated math problems.
|
||||
We fill as many problems as possible upto this length
|
||||
:max_digits: is the maximum number of digits in the operand integers
|
||||
:n_sequences: is the number of sequences per epoch
|
||||
"""
|
||||
self.base = base
|
||||
self.n_sequences = n_sequences
|
||||
self.max_digits = max_digits
|
||||
self.seq_len = seq_len
|
||||
# Token id to string
|
||||
self.itos = list(string.digits + 'x =\n?*;')
|
||||
# Character to token id
|
||||
self.stoi = {c: i for i, c in enumerate(self.itos)}
|
||||
|
||||
    def make_int(self, n_digits: int):
        """
        Generates an integer with `n_digits` number of digits
        """
        res = 0
        for i in range(n_digits):
            # The leading digit is non-zero; every digit is in `[0, base)`
            d = random.randrange(1, self.base) if i == 0 else random.randrange(0, self.base)
            res = res * self.base + d

        return res
    def get_add_explanation(self, x: int, y: int):
        """
        Generates the workings for `x * y`: the partial product of `y` with each digit of `x`,
        starting from the least significant digit.
        For example, for `23 * 45` it generates
        `135 90`.
        """

        explanation = []
        while x > 0:
            rx = x % self.base
            explanation.append(f"{self.to_string(y * rx)}")
            x = x // self.base

        return ' '.join(explanation)

    # Make a problem with its workings
    def make_add_problem(self):
        """
        Creates an arithmetic multiplication problem with workings and answer.
        """
        x = self.make_int(n_digits=random.randrange(1, self.max_digits + 1))
        y = self.make_int(n_digits=random.randrange(1, self.max_digits + 1))

        explanation = self.get_add_explanation(x, y)
        return f"x={self.to_string(x)}*{self.to_string(y)}; {explanation} x=={self.to_string(x * y)}\n"
    def to_string(self, x: int):
        """
        Converts an integer to a string of digits in base `self.base` (assumes `base <= 10`)
        """
        if x == 0:
            return '0'
        a = []
        while x > 0:
            a += [f'{x % self.base}']
            x = x // self.base

        return ''.join(reversed(a))

    def get_qa(self):
        """
        Get arithmetic problem and answer. This is used for evaluation.
        """
        x = self.make_int(n_digits=random.randrange(1, self.max_digits + 1))
        y = self.make_int(n_digits=random.randrange(1, self.max_digits + 1))

        return f'?x={self.to_string(x)}*{self.to_string(y)};', f'{self.to_string(x * y)}'
    def get_packed_math_input(self):
        """
        Generate multiple problems and pack them into a sequence.
        """
        s_enc = []
        mask = []
        while len(s_enc) <= self.seq_len:
            s_part = self.make_add_problem()
            s_part_enc = self.encode('?' + s_part)
            prob, sol = s_part.split(';')
            # Mask off the problem part (the leading `?`, the problem, and the `;`)
            mask += [False] * (len(prob) + 2)
            # Keep the mask on over the workings and the answer
            mask += [True] * len(sol)
            s_enc = s_enc + s_part_enc
        return s_enc, mask

    def encode(self, s: str):
        """
        Encode a given string
        """
        return [self.stoi[c] for c in s]

    def decode(self, arr: List[int]):
        """
        Decode a list of token ids
        """
        return ''.join([self.itos[c] for c in arr])

    def __getitem__(self, idx: int):
        """
        Get an input and target pair for auto-regressive modelling
        """
        s, mask = self.get_packed_math_input()
        s = torch.tensor(s)
        mask = torch.tensor(mask)
        # Targets outside the masked (solution) positions are set to `-1`
        target = s * mask + -1 * (~mask)
        return s[:self.seq_len], target[1:self.seq_len + 1]

    def __len__(self):
        """
        Number of sequences per epoch
        """
        return self.n_sequences
class ArithmeticMultiplicationAutoregression(NLPAutoRegressionConfigs):
    """
    ## Arithmetic Task Experiment Configurations
    """
    # Maximum number of digits per operand integer
    max_digits: int = 4
    # Number of training sequences per epoch
    train_sequences_per_epoch: int = 2 ** 12
    # Training data loader
    train_loader: DataLoader = 'arithmetic_train_loader'
    # Number of problems in evaluation
    n_tests: int = 64
    # No need for a validation dataset
    validator = None
    # Number of times to run evaluations per epoch
    inner_iterations = 4
    # Number base of the operands and workings
    base: int = 10
    # Number of tokens in the vocabulary
    n_tokens = len(ArithmeticMultiplicationDataset(1, 1, 1, 1).itos)
    @torch.no_grad()
    def sample(self):
        """
        ### Evaluation

        We use the sampling function to evaluate the model on a set of problems
        """

        # Skip in the first epoch
        if self.training_loop.idx < 1:
            return

        # Create a dataset to generate problems
        dataset = ArithmeticMultiplicationDataset(self.seq_len, self.max_digits, self.base, 1)
        # Get a set of problems and answers
        qa = [dataset.get_qa() for _ in range(self.n_tests)]
        # Collect the problems only
        questions = [p[0] for p in qa]

        # Create a tensor with only the initial token
        data = torch.tensor([[dataset.stoi[p[0]] for p in questions]])
        # Move to device
        data = data.to(self.device)

        # Number of sequences that have completed
        finished = torch.zeros((len(questions),)).bool().to(self.device)
        # Token id of the new line character - this marks the end of the answer
        new_line = dataset.stoi['\n']

        # Sampled results
        results = [p[0] for p in questions]

        # Sample up to sequence length
        for i in monit.iterate('Sample', self.seq_len - 1):
            # If all the sequences have completed we skip this
            if finished.sum() == len(finished):
                continue

            # Get the model output
            output, *_ = self.model(data)
            # Get the model prediction (greedy)
            output = output[-1].argmax(dim=-1)

            # Override with the question
            for j, p in enumerate(questions):
                if len(p) > i + 1:
                    output[j] = dataset.stoi[p[i + 1]]

            # Find which sequences have finished
            finished = finished | (output == new_line)
            # Skip if all have finished
            if finished.sum() == len(finished):
                continue

            # Add the next token to the input
            data = torch.cat([data, output[None, :]], dim=0)

            # Get the sampled results
            for j, c in enumerate(output):
                results[j] += dataset.itos[c]

        # Discard everything after the answer in the results
        results = [r.split('\n')[0] for r in results]

        # Log a sample
        res_sample = results[0].split(';')
        logger.log([(res_sample[0], Text.key), (';', Text.subtle), (';'.join(res_sample[1:]), Text.none)])

        # Get the answers
        results = [r.split('x==')[-1] for r in results]

        # Count the number of correct answers
        correct = 0
        for r, _qa in zip(results, qa):
            if r == _qa[1]:
                correct += 1

        # Log the score
        tracker.save('score', correct / len(results))
@option(ArithmeticMultiplicationAutoregression.train_loader)
def arithmetic_train_loader(c: ArithmeticMultiplicationAutoregression):
    """
    Training data loader
    """
    return DataLoader(ArithmeticMultiplicationDataset(c.seq_len, c.max_digits, c.base, c.train_sequences_per_epoch),
                      batch_size=c.batch_size,
                      collate_fn=transpose_batch,
                      num_workers=4)


def _test():
    """
    Code to test generated problems
    """
    dataset = ArithmeticMultiplicationDataset(256, 4, 4, 10)

    print(dataset.decode(dataset.get_packed_math_input()[0]))


#
if __name__ == '__main__':
    _test()
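For reference, a quick sketch of what the dataset above produces (assuming the package is installed so that the new module is importable; the generated numbers are random, the comments show representative outputs):

from labml_nn.experiments.algo_tasks.arithmetic_multiplication import ArithmeticMultiplicationDataset

# Small dataset for illustration; the argument values here are arbitrary
dataset = ArithmeticMultiplicationDataset(seq_len=64, max_digits=2, base=10, n_sequences=1)

# A full training sample: problem, partial products as workings, and the answer,
# e.g. "x=23*45; 135 90 x==1035\n"
print(dataset.make_add_problem())

# An evaluation pair: prompt and expected answer, e.g. ("?x=23*45;", "1035")
print(dataset.get_qa())

# Packed token ids with the loss mask; the mask is True only over the solution parts
s, mask = dataset.get_packed_math_input()
print(dataset.decode(s))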
@@ -0,0 +1,94 @@
"""
---
title: Rotary Positional Embeddings with Relative distance (RoPER) Experiment
summary: This experiment trains a transformer model with Rotary Positional Embeddings with
  Relative Distance (RoPER) on the arithmetic multiplication task.
---

# Rotary Positional Embeddings with Relative distance ([RoPER](index.html)) Experiment
"""

from labml import experiment
from labml.configs import calculate
from labml_nn.experiments.algo_tasks.arithmetic_multiplication import ArithmeticMultiplicationAutoregression
from labml_nn.transformers import TransformerConfigs
from labml_nn.transformers.rope.experiment import Configs as RoPEConfigs


class Configs(RoPEConfigs, ArithmeticMultiplicationAutoregression):
    """
    We inherit the [RoPE experiment](../experiment.html) and use it for the
    [arithmetic multiplication task](../../experiments/arithmetic_dataset.html).

    We add the option to change attention to use Rotary Positional Embeddings with Relative distance (RoPER)
    below.
    """
    pass
def _rotary_value_pe_mha(c: TransformerConfigs):
    """
    Use Rotary Positional Embeddings with Relative distance ([RoPER](index.html)) in attention.
    """
    from labml_nn.transformers.rope.value_pe import RotaryValuePEMultiHeadAttention
    return RotaryValuePEMultiHeadAttention(c.n_heads, c.d_model, 1., 1.)


# Configuration options
calculate(TransformerConfigs.encoder_attn, 'rotary_value', _rotary_value_pe_mha)
calculate(TransformerConfigs.decoder_attn, 'rotary_value', _rotary_value_pe_mha)
calculate(TransformerConfigs.decoder_mem_attn, 'rotary_value', _rotary_value_pe_mha)
def main():
    # Create experiment
    experiment.create(name="roper_mult", comment="4", writers={'screen', 'labml'})
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        'max_digits': 8,
        'base': 4,

        # No fixed positional embeddings
        'transformer.src_embed': 'no_pos',
        'transformer.tgt_embed': 'no_pos',

        # Encoder with RoPER attention
        # 'transformer.encoder_attn': 'rotary_value',
        # Encoder with RoPE attention
        'transformer.encoder_attn': 'rotary',

        #
        'model': 'rotary_pe_transformer',

        # Use a context size of $512$
        'seq_len': 512,
        # Train for $20$ epochs
        'epochs': 20,
        # Batch size of $16$
        'batch_size': 16,

        # Model size
        'd_model': 128,
        'transformer.ffn.d_ff': 512,
        'transformer.n_heads': 4,
        'transformer.dropout': 0.0,

        # Use [Adam optimizer](../../optimizers/noam.html)
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 2.5e-4,
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()


#
if __name__ == '__main__':
    main()
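To train the RoPER variant instead of plain RoPE, only the attention override changes; 'rotary_value' is the alias registered with calculate() above. A sketch with a hypothetical main_roper helper (same module and same overrides as main(), apart from the attention option):

def main_roper():
    # Create experiment
    experiment.create(name="roper_mult", comment="roper", writers={'screen', 'labml'})
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        'max_digits': 8,
        'base': 4,

        # No fixed positional embeddings
        'transformer.src_embed': 'no_pos',
        'transformer.tgt_embed': 'no_pos',

        # Encoder with RoPER attention (the alias registered above)
        'transformer.encoder_attn': 'rotary_value',

        #
        'model': 'rotary_pe_transformer',

        # Same training setup as `main()`
        'seq_len': 512,
        'epochs': 20,
        'batch_size': 16,
        'd_model': 128,
        'transformer.ffn.d_ff': 512,
        'transformer.n_heads': 4,
        'transformer.dropout': 0.0,
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 2.5e-4,
    })

    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment and run training
    with experiment.start():
        conf.run()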