diff --git a/docs/transformers/glu_variants/simple.html b/docs/transformers/glu_variants/simple.html
index 207f5c42..ba408fec 100644
--- a/docs/transformers/glu_variants/simple.html
+++ b/docs/transformers/glu_variants/simple.html
@@ -76,23 +76,25 @@ We try different variants for the position-wise feedforward network.

This is a simpler implementation that doesn’t use the labml.configs module. We decided to write a simpler implementation to make it easier for readers who are not familiar with it.

+

Open In Colab +View Run

-
17import dataclasses
-18
-19import torch
-20from torch import nn
-21from torch.utils.data import Dataset, DataLoader
-22
-23from labml import experiment, lab, tracker, monit, logger
-24from labml.logger import Text
-25from labml.utils.download import download_file
-26from labml_nn.experiments.nlp_autoregression import transpose_batch
-27from labml_nn.optimizers.noam import Noam
-28from labml_nn.transformers import Encoder, MultiHeadAttention
-29from labml_nn.transformers.feed_forward import FeedForward
-30from labml_nn.transformers.models import EmbeddingsWithPositionalEncoding, TransformerLayer
-31from labml_nn.transformers.utils import subsequent_mask
+
20import dataclasses
+21
+22import torch
+23from torch import nn
+24from torch.utils.data import Dataset, DataLoader
+25
+26from labml import experiment, lab, tracker, monit, logger
+27from labml.logger import Text
+28from labml.utils.download import download_file
+29from labml_nn.experiments.nlp_autoregression import transpose_batch
+30from labml_nn.optimizers.noam import Noam
+31from labml_nn.transformers import Encoder, MultiHeadAttention
+32from labml_nn.transformers.feed_forward import FeedForward
+33from labml_nn.transformers.models import EmbeddingsWithPositionalEncoding, TransformerLayer
+34from labml_nn.transformers.utils import subsequent_mask
@@ -103,7 +105,7 @@ We decided to write a simpler implementation to make it easier readers who are n

Autoregressive model

-
34class AutoregressiveModel(nn.Module):
+
37class AutoregressiveModel(nn.Module):
@@ -114,8 +116,8 @@ We decided to write a simpler implementation to make it easier readers who are n
-
39    def __init__(self, src_embed: nn.Module, encoder: Encoder, generator: nn.Module):
-40        super().__init__()
+
42    def __init__(self, src_embed: nn.Module, encoder: Encoder, generator: nn.Module):
+43        super().__init__()
@@ -126,7 +128,7 @@ We decided to write a simpler implementation to make it easier readers who are n

Token embedding module

-
42        self.src_embed = src_embed
+
45        self.src_embed = src_embed
@@ -137,7 +139,7 @@ We decided to write a simpler implementation to make it easier readers who are n

Transformer based encoder

-
44        self.encoder = encoder
+
47        self.encoder = encoder
@@ -149,7 +151,7 @@ We decided to write a simpler implementation to make it easier readers who are n this give logits of the the next token

-
47        self.generator = generator
+
50        self.generator = generator
@@ -160,7 +162,7 @@ this give logits of the the next token

This will be initialized on the first call

-
49        self.src_mask = None
+
52        self.src_mask = None
@@ -171,7 +173,7 @@ this give logits of the the next token

-
51    def __call__(self, src: torch.Tensor):
+
54    def __call__(self, src: torch.Tensor):
@@ -182,8 +184,8 @@ this give logits of the the next token

Create subsequent mask, so that the transformer can only pay attention to past tokens.

-
53        if self.src_mask is None or self.src_mask.size(0) != len(src):
-54            self.src_mask = subsequent_mask(len(src)).to(src.device)
+
56        if self.src_mask is None or self.src_mask.size(0) != len(src):
+57            self.src_mask = subsequent_mask(len(src)).to(src.device)
@@ -194,7 +196,7 @@ this give logits of the the next token

Embed the tokens (src) and run them through the transformer

-
56        res = self.encoder(self.src_embed(src), self.src_mask)
+
59        res = self.encoder(self.src_embed(src), self.src_mask)
@@ -205,7 +207,7 @@ this give logits of the the next token

Generate logits of the next token

-
58        return self.generator(res)
+
61        return self.generator(res)
@@ -216,8 +218,8 @@ this give logits of the the next token

Configurations

-
61@dataclasses.dataclass
-62class Configs:
+
64@dataclasses.dataclass
+65class Configs:
@@ -228,16 +230,16 @@ this give logits of the the next token

-
66    d_model: int = 512
-67    seq_len: int = 128
-68    batch_size: int = 32
-69    n_layers: int = 6
-70    n_heads: int = 8
-71    dropout: float = 0.1
-72    d_ff: int = 2048
-73    glu_variant: str = 'GLU'
-74    epochs: int = 5
-75    grad_norm_clip: float = 0.5
+
69    d_model: int = 512
+70    seq_len: int = 128
+71    batch_size: int = 32
+72    n_layers: int = 6
+73    n_heads: int = 8
+74    dropout: float = 0.1
+75    d_ff: int = 2048
+76    glu_variant: str = 'GLU'
+77    epochs: int = 5
+78    grad_norm_clip: float = 0.5
@@ -248,7 +250,7 @@ this give logits of the the next token

Tiny Shakespeare Dataset

-
78class TinyShakespeareDataset(Dataset):
+
81class TinyShakespeareDataset(Dataset):
@@ -259,7 +261,7 @@ this give logits of the the next token

-
83    def __init__(self, seq_len: int):
+
86    def __init__(self, seq_len: int):
@@ -270,7 +272,7 @@ this give logits of the the next token

Location of the text file

-
85        path = lab.get_data_path() / 'tiny_shakespeare.txt'
+
88        path = lab.get_data_path() / 'tiny_shakespeare.txt'
@@ -281,7 +283,7 @@ this give logits of the the next token

Download the file

-
87        download_file('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', path)
+
90        download_file('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', path)
@@ -292,8 +294,8 @@ this give logits of the the next token

Read the downloaded file

-
89        with open(str(path), 'r') as f:
-90            text = f.read()
+
92        with open(str(path), 'r') as f:
+93            text = f.read()
@@ -304,7 +306,7 @@ this give logits of the the next token

Extract the characters

-
93        chars = list(set(text))
+
96        chars = list(set(text))
@@ -315,7 +317,7 @@ this give logits of the the next token

Character to id (integer) map

-
95        self.stoi = {c: i for i, c in enumerate(chars)}
+
98        self.stoi = {c: i for i, c in enumerate(chars)}
@@ -326,7 +328,7 @@ this give logits of the the next token

Id to character map

-
97        self.itos = {i: c for i, c in enumerate(chars)}
+
100        self.itos = {i: c for i, c in enumerate(chars)}
@@ -337,7 +339,7 @@ this give logits of the the next token

Length of a training sample

-
99        self.seq_len = seq_len
+
102        self.seq_len = seq_len
@@ -348,7 +350,7 @@ this give logits of the the next token

Data in the form of a tensor of ids

-
101        self.data = self.text_to_i(text)
+
104        self.data = self.text_to_i(text)
@@ -359,7 +361,7 @@ this give logits of the the next token

Transform the text into a tensor of ids

-
103    def text_to_i(self, text: str):
+
106    def text_to_i(self, text: str):
@@ -370,7 +372,7 @@ this give logits of the the next token

-
107        return torch.tensor([self.stoi[c] for c in text], dtype=torch.long)
+
110        return torch.tensor([self.stoi[c] for c in text], dtype=torch.long)
@@ -382,7 +384,7 @@ this give logits of the the next token

This will read the dataset seq_len times in a single epoch.

-
109    def __len__(self):
+
112    def __len__(self):
@@ -393,7 +395,7 @@ this give logits of the the next token

-
115        return len(self.data) - self.seq_len - 1
+
118        return len(self.data) - self.seq_len - 1
@@ -404,7 +406,7 @@ this give logits of the the next token

Return a sample

-
117    def __getitem__(self, idx):
+
120    def __getitem__(self, idx):
@@ -415,7 +417,7 @@ this give logits of the the next token

-
121        return self.data[idx:idx + self.seq_len], self.data[idx + 1:idx + self.seq_len + 1]
+
124        return self.data[idx:idx + self.seq_len], self.data[idx + 1:idx + self.seq_len + 1]
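Each sample is therefore a window of the text paired with the same window shifted one character to the right. A tiny illustration with a made-up string (not the real dataset):

```python
text = "shakespeare"
seq_len, idx = 4, 2
inputs = text[idx:idx + seq_len]           # 'akes'
targets = text[idx + 1:idx + seq_len + 1]  # 'kesp', the character following each input position
```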
@@ -426,7 +428,7 @@ this give logits of the the next token

Trainer

-
124class Trainer:
+
127class Trainer:
@@ -437,7 +439,7 @@ this give logits of the the next token

-
129    def __init__(self, configs: Configs):
+
132    def __init__(self, configs: Configs):
@@ -448,9 +450,9 @@ this give logits of the the next token

Get the device

-
131        self.device = torch.device('cpu')
-132        if torch.cuda.is_available():
-133            self.device = torch.device('cuda:0')
+
134        self.device = torch.device('cpu')
+135        if torch.cuda.is_available():
+136            self.device = torch.device('cuda:0')
@@ -461,7 +463,7 @@ this give logits of the the next token

Initialize the dataset

-
135        self.dataset = TinyShakespeareDataset(configs.seq_len)
+
138        self.dataset = TinyShakespeareDataset(configs.seq_len)
@@ -472,10 +474,10 @@ this give logits of the the next token

Initialize the dataloader

-
137        self.dataloader = DataLoader(self.dataset,
-138                                     batch_size=configs.batch_size,
-139                                     collate_fn=transpose_batch,
-140                                     shuffle=True)
+
140        self.dataloader = DataLoader(self.dataset,
+141                                     batch_size=configs.batch_size,
+142                                     collate_fn=transpose_batch,
+143                                     shuffle=True)
@@ -488,8 +490,8 @@ this give logits of the the next token

-
144        if configs.glu_variant == 'GLU':
-145            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Sigmoid(), True, False, False, False)
+
147        if configs.glu_variant == 'GLU':
+148            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Sigmoid(), True, False, False, False)
@@ -502,8 +504,8 @@ this give logits of the the next token

-
148        elif configs.glu_variant == 'Bilinear':
-149            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Identity(), True, False, False, False)
+
151        elif configs.glu_variant == 'Bilinear':
+152            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.Identity(), True, False, False, False)
@@ -516,8 +518,8 @@ this give logits of the the next token

-
152        elif configs.glu_variant == 'ReGLU':
-153            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU(), True, False, False, False)
+
155        elif configs.glu_variant == 'ReGLU':
+156            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU(), True, False, False, False)
@@ -530,8 +532,8 @@ this give logits of the the next token

-
156        elif configs.glu_variant == 'GEGLU':
-157            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU(), True, False, False, False)
+
159        elif configs.glu_variant == 'GEGLU':
+160            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU(), True, False, False, False)
@@ -544,8 +546,8 @@ this give logits of the the next token

where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

-
161        elif configs.glu_variant == 'SwiGLU':
-162            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.SiLU(), True, False, False, False)
+
164        elif configs.glu_variant == 'SwiGLU':
+165            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.SiLU(), True, False, False, False)
@@ -558,8 +560,8 @@ where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

-
165        elif configs.glu_variant == 'ReLU':
-166            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU())
+
168        elif configs.glu_variant == 'ReLU':
+169            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.ReLU())
@@ -572,10 +574,10 @@ where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

-
169        elif configs.glu_variant == 'GELU':
-170            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU())
-171        else:
-172            raise ValueError(f'Unknown variant {configs.glu_variant}')
+
172        elif configs.glu_variant == 'GELU':
+173            ffn = FeedForward(configs.d_model, configs.d_ff, configs.dropout, nn.GELU())
+174        else:
+175            raise ValueError(f'Unknown variant {configs.glu_variant}')
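All of these variants share the same gated structure from the GLU Variants paper, $\text{FFN}(x) = (f(x W_1) \otimes x V) W_2$, and differ only in the activation $f$ applied to the gate. The snippet below is a minimal standalone sketch of that pattern, not the `labml_nn` `FeedForward` class itself (whose extra boolean flags configure gating and bias terms):

```python
import torch
from torch import nn

class GatedFFN(nn.Module):
    """Minimal gated feed-forward sketch: FFN(x) = (f(x W1) * x V) W2."""

    def __init__(self, d_model: int, d_ff: int, activation: nn.Module, dropout: float = 0.1):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)   # gate projection, passed through the activation
        self.v = nn.Linear(d_model, d_ff)    # value projection, multiplied by the gate
        self.w2 = nn.Linear(d_ff, d_model)   # projection back to the model dimension
        self.activation = activation
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w2(self.dropout(self.activation(self.w1(x)) * self.v(x)))

# GLU -> nn.Sigmoid(), ReGLU -> nn.ReLU(), GEGLU -> nn.GELU(),
# SwiGLU -> nn.SiLU(), Bilinear -> nn.Identity()
swiglu = GatedFFN(d_model=512, d_ff=2048, activation=nn.SiLU())
```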
@@ -586,7 +588,7 @@ where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

Number of different characters

-
175        n_chars = len(self.dataset.stoi)
+
178        n_chars = len(self.dataset.stoi)
@@ -597,7 +599,7 @@ where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

Initialize Multi-Head Attention module

-
178        mha = MultiHeadAttention(configs.n_heads, configs.d_model, configs.dropout)
+
181        mha = MultiHeadAttention(configs.n_heads, configs.d_model, configs.dropout)
@@ -608,8 +610,8 @@ where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

Initialize the Transformer Block

-
180        transformer_layer = TransformerLayer(d_model=configs.d_model, self_attn=mha, src_attn=None,
-181                                             feed_forward=ffn, dropout_prob=configs.dropout)
+
183        transformer_layer = TransformerLayer(d_model=configs.d_model, self_attn=mha, src_attn=None,
+184                                             feed_forward=ffn, dropout_prob=configs.dropout)
@@ -624,9 +626,9 @@ where $\text{Swish}_\beta(x) = x \sigma(\beta x)$

a linear layer to generate logits.

-
187        self.model = AutoregressiveModel(EmbeddingsWithPositionalEncoding(configs.d_model, n_chars),
-188                                         Encoder(transformer_layer, configs.n_layers),
-189                                         nn.Linear(configs.d_model, n_chars))
+
190        self.model = AutoregressiveModel(EmbeddingsWithPositionalEncoding(configs.d_model, n_chars),
+191                                         Encoder(transformer_layer, configs.n_layers),
+192                                         nn.Linear(configs.d_model, n_chars))
@@ -637,7 +639,7 @@ a linear layer to generate logits.

Move the model to the current device

-
192        self.model.to(self.device)
+
195        self.model.to(self.device)
@@ -648,7 +650,7 @@ a linear layer to generate logits.

Initialize Noam optimizer

-
195        self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model)
+
198        self.optimizer = Noam(self.model.parameters(), lr=1.0, warmup=2_000, d_model=configs.d_model)
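The Noam optimizer wraps Adam with the learning-rate schedule from *Attention Is All You Need*: a linear warm-up followed by inverse-square-root decay, scaled by $d_{model}^{-0.5}$. A small sketch of that schedule, assuming the `labml_nn` `Noam` optimizer follows the standard formula with its `lr` argument as the overall scale:

```python
def noam_lr(step: int, d_model: int = 512, warmup: int = 2_000, factor: float = 1.0) -> float:
    # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    step = max(step, 1)
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# The rate peaks around step == warmup and then decays as 1/sqrt(step).
print(noam_lr(2_000), noam_lr(20_000))
```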
@@ -659,7 +661,7 @@ a linear layer to generate logits.

Cross-entropy loss

-
198        self.loss_func = nn.CrossEntropyLoss()
+
201        self.loss_func = nn.CrossEntropyLoss()
@@ -671,7 +673,7 @@ a linear layer to generate logits.

*Note that our dataset definition repeats the data seq_len times in a single epoch.*

-
201        self.epochs = configs.epochs
+
204        self.epochs = configs.epochs
@@ -682,7 +684,7 @@ a linear layer to generate logits.

Gradient clipping norm

-
203        self.grad_norm_clip = configs.grad_norm_clip
+
206        self.grad_norm_clip = configs.grad_norm_clip
@@ -693,7 +695,7 @@ a linear layer to generate logits.

Set tracker configurations

-
206        tracker.set_scalar("loss.*", True)
+
209        tracker.set_scalar("loss.*", True)
@@ -704,7 +706,7 @@ a linear layer to generate logits.

Sampling function to generate samples periodically while training

-
208    def sample(self):
+
211    def sample(self):
@@ -715,7 +717,7 @@ a linear layer to generate logits.

Starting prompt

-
214        prompt = 'It is'
+
217        prompt = 'It is'
@@ -726,7 +728,7 @@ a linear layer to generate logits.

Collect output for printing

-
216        log = [(prompt, Text.subtle)]
+
219        log = [(prompt, Text.subtle)]
@@ -737,7 +739,7 @@ a linear layer to generate logits.

Sample 25 tokens

-
218        for i in monit.iterate('Sample', 25):
+
221        for i in monit.iterate('Sample', 25):
@@ -748,8 +750,8 @@ a linear layer to generate logits.

Tokenize the prompt

-
220            data = self.dataset.text_to_i(prompt).unsqueeze(-1)
-221            data = data.to(self.device)
+
223            data = self.dataset.text_to_i(prompt).unsqueeze(-1)
+224            data = data.to(self.device)
@@ -760,7 +762,7 @@ a linear layer to generate logits.

Get the model output

-
223            output = self.model(data)
+
226            output = self.model(data)
@@ -771,7 +773,7 @@ a linear layer to generate logits.

Get the model prediction (greedy)

-
225            output = output.argmax(dim=-1).squeeze()
+
228            output = output.argmax(dim=-1).squeeze()
@@ -782,7 +784,7 @@ a linear layer to generate logits.

Add the prediction to prompt

-
227            prompt += self.dataset.itos[output[-1].item()]
+
230            prompt += self.dataset.itos[output[-1].item()]
@@ -793,7 +795,7 @@ a linear layer to generate logits.

Add the prediction for logging

-
229            log += [(self.dataset.itos[output[-1].item()], Text.value)]
+
232            log += [(self.dataset.itos[output[-1].item()], Text.value)]
@@ -804,7 +806,7 @@ a linear layer to generate logits.

Print the sampled output

-
232        logger.log(log)
+
235        logger.log(log)
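Put together, the sampling loop above is greedy decoding: re-encode the whole prompt every step and append the single most likely next character. A condensed, self-contained restatement, assuming the model returns `[seq_len, 1, n_chars]` logits as in this example:

```python
import torch

def greedy_sample(model, stoi, itos, prompt: str = 'It is', steps: int = 25, device: str = 'cpu') -> str:
    model.eval()
    with torch.no_grad():
        for _ in range(steps):
            # Tokenize the prompt into a [seq_len, 1] batch of character ids
            data = torch.tensor([stoi[c] for c in prompt], dtype=torch.long).unsqueeze(-1).to(device)
            # Pick the most likely character predicted at the last position
            next_id = model(data).argmax(dim=-1)[-1, 0].item()
            prompt += itos[next_id]
    return prompt
```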
@@ -815,7 +817,7 @@ a linear layer to generate logits.

Train the model

-
234    def train(self):
+
237    def train(self):
@@ -826,7 +828,7 @@ a linear layer to generate logits.

Loop for the given number of epochs

-
240        for _ in monit.loop(self.epochs):
+
243        for _ in monit.loop(self.epochs):
@@ -837,7 +839,7 @@ a linear layer to generate logits.

Iterate over the minibatches

-
242            for i, batch in monit.enum('Train', self.dataloader):
+
245            for i, batch in monit.enum('Train', self.dataloader):
@@ -848,7 +850,7 @@ a linear layer to generate logits.

Move data to the device

-
244                data, target = batch[0].to(self.device), batch[1].to(self.device)
+
247                data, target = batch[0].to(self.device), batch[1].to(self.device)
@@ -859,7 +861,7 @@ a linear layer to generate logits.

Set the tracker step to the number of characters trained on

-
247                tracker.add_global_step(data.shape[0] * data.shape[1])
+
250                tracker.add_global_step(data.shape[0] * data.shape[1])
@@ -870,7 +872,7 @@ a linear layer to generate logits.

Set model state to training

-
250                self.model.train()
+
253                self.model.train()
@@ -881,7 +883,7 @@ a linear layer to generate logits.

Evaluate the model

-
252                output = self.model(data)
+
255                output = self.model(data)
@@ -892,7 +894,7 @@ a linear layer to generate logits.

Calculate loss

-
255                loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1))
+
258                loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1))
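`CrossEntropyLoss` expects `[N, classes]` logits against `[N]` integer targets, so both tensors are flattened across the sequence and batch dimensions. For illustration, with hypothetical shapes (the character count depends on the dataset):

```python
import torch
from torch import nn

loss_func = nn.CrossEntropyLoss()
seq_len, batch_size, n_chars = 128, 32, 65                  # n_chars is illustrative
output = torch.randn(seq_len, batch_size, n_chars)          # per-position logits
target = torch.randint(0, n_chars, (seq_len, batch_size))   # next-character ids
loss = loss_func(output.view(-1, output.shape[-1]), target.view(-1))  # [N, C] vs [N]
```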
@@ -903,7 +905,7 @@ a linear layer to generate logits.

Log the loss

-
257                tracker.add("loss.train", loss)
+
260                tracker.add("loss.train", loss)
@@ -914,7 +916,7 @@ a linear layer to generate logits.

Calculate gradients

-
260                loss.backward()
+
263                loss.backward()
@@ -925,7 +927,7 @@ a linear layer to generate logits.

Clip gradients

-
262                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
+
265                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
@@ -936,7 +938,7 @@ a linear layer to generate logits.

Take optimizer step

-
264                self.optimizer.step()
+
267                self.optimizer.step()
@@ -947,8 +949,8 @@ a linear layer to generate logits.

Log the model parameters and gradients

-
266                if (i + 1) % 100 == 0:
-267                    tracker.add('model', self.model)
+
269                if (i + 1) % 100 == 0:
+270                    tracker.add('model', self.model)
@@ -959,7 +961,7 @@ a linear layer to generate logits.

Clear the gradients

-
269                self.optimizer.zero_grad()
+
272                self.optimizer.zero_grad()
@@ -970,10 +972,10 @@ a linear layer to generate logits.

Generate a sample

-
272                if (i + 1) % 100 == 0:
-273                    self.model.eval()
-274                    with torch.no_grad():
-275                        self.sample()
+
275                if (i + 1) % 100 == 0:
+276                    self.model.eval()
+277                    with torch.no_grad():
+278                        self.sample()
@@ -984,8 +986,8 @@ a linear layer to generate logits.

Save the tracked metrics

-
278                if (i + 1) % 10 == 0:
-279                    tracker.save()
+
281                if (i + 1) % 10 == 0:
+282                    tracker.save()
@@ -996,7 +998,7 @@ a linear layer to generate logits.

Save the model

-
282            experiment.save_checkpoint()
+
285            experiment.save_checkpoint()
@@ -1007,7 +1009,7 @@ a linear layer to generate logits.

-
285def main():
+
288def main():
@@ -1018,7 +1020,7 @@ a linear layer to generate logits.

Create experiment

-
287    experiment.create(name="glu_variants")
+
290    experiment.create(name="glu_variants")
@@ -1029,7 +1031,7 @@ a linear layer to generate logits.

Create configs

-
289    configs = Configs()
+
292    configs = Configs()
@@ -1040,7 +1042,7 @@ a linear layer to generate logits.

Load configurations

-
291    experiment.configs(dataclasses.asdict(configs))
+
294    experiment.configs(dataclasses.asdict(configs))
@@ -1051,7 +1053,7 @@ a linear layer to generate logits.

Create trainer

-
294    trainer = Trainer(configs)
+
297    trainer = Trainer(configs)
@@ -1062,7 +1064,7 @@ a linear layer to generate logits.

Set models for saving and loading

-
296    experiment.add_pytorch_models({'model': trainer.model})
+
299    experiment.add_pytorch_models({'model': trainer.model})
@@ -1073,7 +1075,7 @@ a linear layer to generate logits.

Start the experiment

-
299    with experiment.start():
+
302    with experiment.start():
@@ -1084,11 +1086,11 @@ a linear layer to generate logits.

Train the model

-
301        trainer.train()
-302
-303
-304if __name__ == '__main__':
-305    main()
+
304        trainer.train()
+305
+306
+307if __name__ == '__main__':
+308    main()
diff --git a/labml_nn/transformers/glu_variants/simple.ipynb b/labml_nn/transformers/glu_variants/simple.ipynb new file mode 100644 index 00000000..02935b2a --- /dev/null +++ b/labml_nn/transformers/glu_variants/simple.ipynb @@ -0,0 +1,280 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Gated Linear Units and Variants", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "AYV_dMVDxyc2" + }, + "source": [ + "[![Github](https://img.shields.io/github/stars/lab-ml/nn?style=social)](https://github.com/lab-ml/nn)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/glu_variants/simple.ipynb) \n", + "\n", + "## Gated Linear Units and Variants\n", + "\n", + "This trains a simple [transformer](https://lab-ml.com/labml_nn/transformers/) model for auto-regression.\n", + "We try different variants for the [position-wise feedforward network](https://lab-ml.com/labml_nn/transformers/feed_forward.html).\n", + "\n", + "Annotated trainer code is at [`simple.py`](https://lab-ml.com/labml_nn/transformers/glu_variants/simple.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AahG_i2y5tY9" + }, + "source": [ + "Install the `labml-nn` package" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZCzmCrAIVg0L", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2de76edb-9911-496d-9f8c-281dad6f5680" + }, + "source": [ + "!pip install labml-nn" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: labml-nn in /usr/local/lib/python3.6/dist-packages (0.4.82)\n", + "Requirement already satisfied: labml>=0.4.97 in /usr/local/lib/python3.6/dist-packages (from labml-nn) (0.4.97)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from labml-nn) (1.7.0+cu101)\n", + "Requirement already satisfied: einops in /usr/local/lib/python3.6/dist-packages (from labml-nn) (0.3.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from labml-nn) (1.19.5)\n", + "Requirement already satisfied: labml-helpers>=0.4.72 in /usr/local/lib/python3.6/dist-packages (from labml-nn) (0.4.73)\n", + "Requirement already satisfied: gitpython in /usr/local/lib/python3.6/dist-packages (from labml>=0.4.97->labml-nn) (3.1.12)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from labml>=0.4.97->labml-nn) (3.13)\n", + "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->labml-nn) (0.16.0)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.6/dist-packages (from torch->labml-nn) (3.7.4.3)\n", + "Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from torch->labml-nn) (0.8)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.6/dist-packages (from gitpython->labml>=0.4.97->labml-nn) (4.0.5)\n", + "Requirement already satisfied: smmap<4,>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from gitdb<5,>=4.0.1->gitpython->labml>=0.4.97->labml-nn) (3.0.5)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SE2VUQ6L5zxI" + }, 
+ "source": [ + "Imports" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0hJXx_g0wS2C" + }, + "source": [ + "import dataclasses\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "from labml import experiment\n", + "from labml_nn.transformers.glu_variants.simple import Configs, Trainer" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lpggo0wM6qb-" + }, + "source": [ + "Create an experiment" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bFcr9k-l4cAg" + }, + "source": [ + "experiment.create(name=\"glu_variants\")" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-OnHLi626tJt" + }, + "source": [ + "Initialize configurations" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Piz0c5f44hRo" + }, + "source": [ + "conf = Configs()" + ], + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wwMzCqpD6vkL" + }, + "source": [ + "Set experiment configurations and assign a configurations dictionary to override configurations" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 17 + }, + "id": "e6hmQhTw4nks", + "outputId": "77eca625-7205-49ea-f275-23f2710c4d84" + }, + "source": [ + "experiment.configs(dataclasses.asdict(conf))" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
"
+            ],
+            "text/plain": [
+              ""
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "DHyNvXfnzeWQ"
+      },
+      "source": [
+        "Create [`Trainer`](https://lab-ml.com/labml_nn/transformers/glu_variants/simple.html)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "59ZeTv5SzcVe"
+      },
+      "source": [
+        "trainer = Trainer(conf)"
+      ],
+      "execution_count": 6,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "EvI7MtgJ61w5"
+      },
+      "source": [
+        "Set PyTorch models for loading and saving"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "GDlt7dp-5ALt"
+      },
+      "source": [
+        "experiment.add_pytorch_models({'model': trainer.model})"
+      ],
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KJZRf8527GxL"
+      },
+      "source": [
+        "Start the experiment and run the training loop."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 255
+        },
+        "id": "aIAWo7Fw5DR8",
+        "outputId": "18b8b334-f9e7-458b-f900-5828b4f9a5c8"
+      },
+      "source": [
+        "with experiment.start():\n",
+        "    trainer.train()"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/html": [
+              "
\n",
+              "glu_variants: 86b773f65fc911ebb2ac0242ac1c0002\n",
+              "\t[dirty]: \"\"\n",
+              "\n",
+              "--------------------------------------------------\n",
+              "LABML WARNING\n",
+              "LabML App Warning: empty_token: Please create a valid token at https://web.lab-ml.com.\n",
+              "Click on the experiment link to monitor the experiment and add it to your experiments list.\n",
+              "--------------------------------------------------\n",
+              "Monitor experiment at https://web.lab-ml.com/run?uuid=86b773f65fc911ebb2ac0242ac1c0002\n",
+              "It is the the the the the the \n",
+              "It is the the the the the the \n",
+              "It is the the the the the the \n",
+              "It is the the the the the the \n",
+              "1,925,120:  Train:   1%  8,427,381ms   loss.train:  2.42505  8,427,381ms  0:01m/ 11:40m  
" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + } + ] +} \ No newline at end of file diff --git a/labml_nn/transformers/glu_variants/simple.py b/labml_nn/transformers/glu_variants/simple.py index 0cf7f494..eaf3fac2 100644 --- a/labml_nn/transformers/glu_variants/simple.py +++ b/labml_nn/transformers/glu_variants/simple.py @@ -13,6 +13,9 @@ We try different variants for the [position-wise feedforward network](../feed_fo *This is a simpler implementation that doesn't use [`labml.configs`](experiment.html) module. We decided to write a simpler implementation to make it easier readers who are not familiar.* + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/glu_variants/simple.ipynb) +[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://web.lab-ml.com/run?uuid=86b773f65fc911ebb2ac0242ac1c0002) """ import dataclasses