diff --git a/docs/capsule_networks/index.html b/docs/capsule_networks/index.html index 20c6028e..03e752f0 100644 --- a/docs/capsule_networks/index.html +++ b/docs/capsule_networks/index.html @@ -84,7 +84,7 @@ it is difficult to understand some of the concepts with just the modules. confusions I had with the paper.
Here’s a notebook for training a Capsule Network on MNIST dataset.
+33import torch.nn as nn
diff --git a/docs/gan/cycle_gan.html b/docs/gan/cycle_gan.html
index 5067a351..538c6099 100644
--- a/docs/gan/cycle_gan.html
+++ b/docs/gan/cycle_gan.html
@@ -89,7 +89,7 @@ The discriminators test whether the generated images look real.
This file contains the model code as well as the training code.
We also have a Google Colab notebook.
+
36import itertools
diff --git a/docs/hypernetworks/hyper_lstm.html b/docs/hypernetworks/hyper_lstm.html
index ebc7d1c4..2196d261 100644
--- a/docs/hypernetworks/hyper_lstm.html
+++ b/docs/hypernetworks/hyper_lstm.html
@@ -80,7 +80,7 @@ by David Ha gives a good explanation of HyperNetworks.
We have an experiment that trains a HyperLSTM to predict text on Shakespeare dataset.
Here’s the link to code: experiment.py
+
HyperNetworks use a smaller network to generate weights of a larger network.
There are two variants: static hyper-networks and dynamic hyper-networks.
Static HyperNetworks have smaller networks that generate weights (kernels)
diff --git a/docs/index.html b/docs/index.html
index bf660b38..ed60b1fa 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -7,20 +7,20 @@
-
+
-
+
-
+
-
LabML Neural Networks
+ labml.ai Neural Networks
@@ -70,7 +70,7 @@
- LabML Neural Networks
+ labml.ai Neural Networks
This is a collection of simple PyTorch implementations of
neural networks and related algorithms.
These implementations are documented with explanations,
@@ -139,7 +139,7 @@ implementations.
author = {Varuna Jayasiri, Nipun Wijerathne},
title = {LabML: A library to organize machine learning experiments},
year = {2020},
- url = {https://lab-ml.com/},
+ url = {https://nn.labml.ai/},
}
Here’s the training code and a notebook for training a CNN classifier that uses batch normalization for MNIST dataset.
+98import torch
diff --git a/docs/normalization/batch_norm/readme.html b/docs/normalization/batch_norm/readme.html
index 9aa5f800..8d490274 100644
--- a/docs/normalization/batch_norm/readme.html
+++ b/docs/normalization/batch_norm/readme.html
@@ -138,7 +138,7 @@ mean and variance during the training phase and use that for inference.
Here’s the training code and a notebook for training
a CNN classifier that uses batch normalization for MNIST dataset.
+
Run the synthetic experiment with Adam. -Here are the results. +Here are the results. You can see that Adam converges at $x = +1$
Run the synthetic experiment with AMSGrad. -Here are the results. +Here are the results. You can see that AMSGrad converges to the true optimum $x = -1$
3import torch
+4import torch.nn as nn
class MLP(nn.Module):
    """Configurable fully connected network stored as an ``nn.ModuleList``.

    For every entry of ``hidden_layers`` the network gets a ``Linear`` layer,
    an optional ``BatchNorm1d``, the activation named by the matching entry of
    ``actv_func`` and an optional ``Dropout(p=0.10)``. A final ``Linear`` layer
    maps to ``out_features`` unless the last width already equals it.

    Args:
        in_features: Size of the flattened input.
        out_features: Size of the output.
        hidden_layers: Sequence of hidden layer widths (may be empty).
        actv_func: Sequence of activation names, one per hidden layer.
        pre_module_list: Optional existing ``nn.ModuleList`` to extend instead
            of starting from an empty one.
        use_dropout: Append dropout after every hidden activation.
        use_batch_norm: Insert batch normalization before every activation.
        use_softmax: Stored on the instance only.
            NOTE(review): no softmax layer is ever added by this class.
        device: Device the module is moved to after construction.
    """

    # Lower-cased activation name -> module class.
    _ACTIVATIONS = {
        "relu": nn.ReLU,
        "leakyrelu": nn.LeakyReLU,
        "sigmoid": nn.Sigmoid,
        "tanh": nn.Tanh,
        "identity": nn.Identity,
    }

    def __init__(self,
                 in_features,
                 out_features,
                 hidden_layers,
                 actv_func,
                 pre_module_list=None,
                 use_dropout=False,
                 use_batch_norm=False,
                 use_softmax=True,
                 device="cpu"):
        super().__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.num_hidden_layers = len(hidden_layers)
        self.hidden_layers = hidden_layers
        self.use_dropout = use_dropout
        self.use_batch_norm = use_batch_norm
        self.actv_func = actv_func
        self.use_softmax = use_softmax

        self.device = device

        # Add on to another model when a non-empty module list is supplied.
        if pre_module_list:
            self.module_list = pre_module_list
        else:
            self.module_list = nn.ModuleList()

        self.build_()

        # Send to the requested device (e.g. the GPU).
        self.to(self.device)

    def build_(self):
        """Append the hidden layers and the output layer to ``module_list``."""
        # Start with the input dimensions.
        dim = self.in_features
        for i, width in enumerate(self.hidden_layers):
            # Fully connected layer between the previous and current width.
            self.module_list.append(nn.Linear(dim, width))
            dim = width

            if self.use_batch_norm:
                self.module_list.append(nn.BatchNorm1d(dim, affine=True))

            # Activation for this hidden layer.
            self.module_list.append(self.GetActivation(name=self.actv_func[i]))

            if self.use_dropout:
                self.module_list.append(nn.Dropout(p=0.10))

        # Fully connect to the output dimensions when they differ.
        if dim != self.out_features:
            self.module_list.append(nn.Linear(dim, self.out_features))

    def forward(self, x):
        """Flatten ``x`` to ``(batch, -1)`` floats and apply every layer."""
        x = torch.flatten(x.float(), start_dim=1)
        for layer in self.module_list:
            x = layer(x)
        return x

    def GetActivation(self, name="relu"):
        """Return a new activation module for ``name``.

        The lookup ignores case, so "Sigmoid" and "sigmoid" are equivalent.
        Unknown names fall back to ``nn.ReLU``.
        """
        return self._ACTIVATIONS.get(str(name).lower(), nn.ReLU)()
2import torch
+3import torch.nn as nn
+4import torchvision
+5import torchvision.transforms as transforms
+6import torch.optim as optim
+7from torchsummary import summary
custom import
+10import numpy as np
+11import time
+12import os
ResBlock
class ResBlock(nn.Module):
    """Residual block: two 3x3 same-size convolutions plus a skip connection.

    Args:
        num_features: Number of input (and output) channels.
        use_batch_norm: Apply ``BatchNorm2d`` after each convolution.
    """

    def __init__(self, num_features, use_batch_norm=False):
        super().__init__()
        self.num_features = num_features
        self.conv_layer1 = nn.Conv2d(num_features, num_features, kernel_size=3, stride=1, padding=1)
        self.relu_layer = nn.ReLU()
        self.conv_layer2 = nn.Conv2d(num_features, num_features, kernel_size=3, stride=1, padding=1)

        self.use_batch_norm = use_batch_norm
        if self.use_batch_norm:
            self.batch_norm_layer1 = nn.BatchNorm2d(self.num_features)
            self.batch_norm_layer2 = nn.BatchNorm2d(self.num_features)

        # Kaiming-normal initialization for the convolution weights.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + x)`` with optional batch norm."""
        residual = x
        x = self.conv_layer1(x)
        if self.use_batch_norm:
            x = self.batch_norm_layer1(x)

        x = self.relu_layer(x)
        x = self.conv_layer2(x)
        if self.use_batch_norm:
            x = self.batch_norm_layer2(x)

        x += residual
        x = self.relu_layer(x)
        return x


class ResNet(nn.Module):
    """Small ResNet-style feature extractor assembled in an ``nn.ModuleList``.

    Layout: a 7x7 stride-2 convolution, then for each consecutive pair of
    widths in ``feature_channel_list`` a stack of residual blocks followed by
    a 3x3 stride-2 convolution, then a last stack of residual blocks and a
    2x2 max pool. ``forward`` flattens the result to ``(batch, -1)``.

    Args:
        in_features: Input shape as ``(height, width, channels)``; a
            two-element ``(height, width)`` shape is treated as one channel.
        num_class: Number of classes (stored only; the classifier is attached
            separately through ``AddMLP``).
        feature_channel_list: Channel widths of the successive stages.
        batch_norm: Add ``BatchNorm2d`` after the stem and stage convolutions.
        num_stacks: Number of residual blocks per stage.
        zero_init_residual: Stored only.
            NOTE(review): the zero-initialization is not applied anywhere.
    """

    def __init__(self, in_features, num_class, feature_channel_list, batch_norm=False, num_stacks=1, zero_init_residual=True):
        super().__init__()
        self.in_features = in_features
        # A (height, width) shape has no channel entry: treat it as 1 channel.
        self.num_in_channel = 1 if len(in_features) == 2 else in_features[2]
        self.num_class = num_class
        self.feature_channel_list = feature_channel_list
        self.num_residual_blocks = len(self.feature_channel_list)
        self.num_stacks = num_stacks
        self.batch_norm = batch_norm
        self.shape_list = []
        self.shape_list.append(in_features)
        self.module_list = nn.ModuleList()
        self.zero_init_residual = zero_init_residual
        self.build_()

    def build_(self):
        """Create all layers and record the feature-map shape after each stage."""
        first_width = self.feature_channel_list[0]

        # Track the shape with the same padding the stem convolution uses.
        cur_shape = self.CalcConvOutShape(self.GetCurShape(), kernel_size=7, padding=3,
                                          stride=2, out_filters=first_width)
        self.shape_list.append(cur_shape)

        # First conv layer: 7x7, stride=2, padding=3.
        self.module_list.append(nn.Conv2d(in_channels=self.num_in_channel,
                                          out_channels=first_width,
                                          kernel_size=7,
                                          stride=2,
                                          padding=3))
        if self.batch_norm:
            self.module_list.append(nn.BatchNorm2d(first_width))
        self.module_list.append(nn.ReLU())

        # With a single width there are no down-sampling stages; the last
        # residual stack then works on the stem width.
        out_size = first_width
        for i in range(self.num_residual_blocks - 1):
            in_size = self.feature_channel_list[i]
            out_size = self.feature_channel_list[i + 1]

            # A fresh block per stack position so stacked blocks own their
            # weights instead of sharing one module instance.
            # NOTE(review): residual blocks always use batch norm, regardless
            # of self.batch_norm -- kept as in the original.
            for _ in range(self.num_stacks):
                self.module_list.append(ResBlock(in_size, use_batch_norm=True))

            # Down-sampling convolution that also changes the channel count.
            self.module_list.append(nn.Conv2d(in_channels=in_size,
                                              out_channels=out_size,
                                              kernel_size=3,
                                              padding=1,
                                              stride=2))
            cur_shape = self.CalcConvOutShape(cur_shape, kernel_size=3, padding=1,
                                              stride=2, out_filters=out_size)
            self.shape_list.append(cur_shape)

            if self.batch_norm:
                self.module_list.append(nn.BatchNorm2d(out_size))
            self.module_list.append(nn.ReLU())

        # Last residual stack. TODO: include in the main loop.
        for _ in range(self.num_stacks):
            self.module_list.append(ResBlock(out_size, use_batch_norm=True))

        # Final 2x2 max pool.
        self.module_list.append(nn.MaxPool2d(kernel_size=2, stride=2, padding=0))
        cur_shape = self.CalcConvOutShape(cur_shape, kernel_size=2, padding=0,
                                          stride=2, out_filters=out_size)
        self.shape_list.append(cur_shape)

        # Kaiming-normal initialization for every convolution in the network.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)

    def GetCurShape(self):
        """Return the most recently recorded ``(height, width, channels)``."""
        return self.shape_list[-1]

    def CalcConvFormula(self, W, K, P, S):
        """Return ``floor((W - K + 2P) / S) + 1``, the conv/pool output size."""
        return int((W - K + 2 * P) // S + 1)

    def CalcConvOutShape(self, in_shape, kernel_size, padding, stride, out_filters):
        """Return ``(height, width, out_filters)`` after a convolution.

        ``kernel_size`` may be an int (square kernel) or a pair of ints.
        """
        if isinstance(kernel_size, int):
            kernels = (kernel_size, kernel_size)
        else:
            kernels = kernel_size
        out_shape = [self.CalcConvFormula(in_shape[i], kernels[i], padding, stride) for i in range(2)]
        return (out_shape[0], out_shape[1], out_filters)

    def AddMLP(self, MLP):
        """Append a classifier head to the end of the module list."""
        if MLP:
            self.module_list.append(MLP)

    def forward(self, x):
        """Apply every module in order and flatten to ``(batch, -1)``."""
        for layer in self.module_list:
            x = layer(x)
        return x.view(x.size(0), -1)
3from utils.train import Trainer # Default custom training class
+4from models.resnet import *
+5from torchvision import models
GPU Check
# Transfer-learning script: fine-tune an ImageNet-pretrained ResNet-18 on
# CIFAR-10. Names such as torch, nn, transforms and torchvision are presumably
# provided by the star import from models.resnet -- verify if that changes.

# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: " + str(device))

# Evaluation pipeline: tensor conversion and normalization only.
transform_test = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Training pipeline with augmentation. The flip uses p=0.5: with p=1.0 every
# training image would be mirrored, which adds no variety and makes the
# training data differ systematically from the (unflipped) test data.
save = './data/Cifar10'
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(20),
    transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR-10 datasets (downloaded on first use).
trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test)

# CIFAR-10 data loaders.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True, num_workers=4)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=False, num_workers=4)

# Load the pre-trained model and replace its classifier with a 10-class head.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(num_ftrs, 10)
)

model_ft = model_ft.to(device)

# Loss function.
cost = nn.CrossEntropyLoss()

# Optimizer.
lr = 0.0005
opt = torch.optim.Adam(model_ft.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=1e-4)

# Create a trainer.
trainer = Trainer(model_ft, opt, cost, name="Transfer-learning", lr=lr, use_lr_schedule=True, device=device)

# Run training, evaluating on the test set after every epoch.
epochs = 25
trainer.Train(trainloader, epochs, testloader=testloader)

print('done')
Custom classes
+4from models.mlp import MLP
+5from utils.train import Trainer
+6from models.resnet import *
GPU Check
# Data setup for training the custom ResNet on CIFAR-10. Names such as torch,
# transforms and torchvision are presumably provided by the star import from
# models.resnet -- verify if that changes.

# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device: " + str(device))

# Evaluation pipeline: tensor conversion and normalization only.
transform_test = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Training pipeline with augmentation. The flip uses p=0.5: with p=1.0 every
# training image would be mirrored, which adds no variety and makes the
# training data differ systematically from the (unflipped) test data.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(20),
    transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR-10 datasets (downloaded on first use).
save = './data/Cifar10'
trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test)

# CIFAR-10 data loaders.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True, num_workers=4)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=False, num_workers=4)

epochs = 50
Create the assignment Resnet (part a)
def MyResNet():
    """Build the assignment ResNet (part a) with a linear classifier head.

    The backbone takes 32x32x3 inputs and uses stage widths 128, 256 and 512
    with batch normalization. An MLP without hidden layers maps the flattened
    backbone output to 10 outputs and is appended to the backbone.

    Returns:
        The ``ResNet`` instance with the head attached.
    """
    backbone = ResNet(
        in_features=[32, 32, 3],
        num_class=10,
        feature_channel_list=[128, 256, 512],
        batch_norm=True,
        num_stacks=1,
    )

    # The head's input size is the flattened final feature-map shape.
    height, width, channels = backbone.GetCurShape()

    # No hidden layers and no activations: a single linear layer.
    head = MLP(
        height * width * channels,
        10,
        [],
        [],
        use_batch_norm=False,
        use_dropout=False,
        use_softmax=False,
        device=device,
    )

    backbone.AddMLP(head)
    return backbone
+66
# Build the network, move it to the selected device and print a layer summary.
model = MyResNet()
model.to(device=device)
summary(model, (3, 32, 32))

# Optimizer.
opt = torch.optim.Adam(
    model.parameters(),
    lr=0.0005,
    betas=(0.9, 0.95),
    weight_decay=1e-8,
)

# Loss function.
cost = nn.CrossEntropyLoss()

# Trainer with the learning-rate schedule enabled.
trainer = Trainer(
    model,
    opt,
    cost,
    name="MyResNet",
    device=device,
    use_lr_schedule=True,
)

# Run training, evaluating on the test set after every epoch.
trainer.Train(trainloader, epochs, testloader=testloader)

print('done')
1import torch.nn.functional as F
+2from torch import nn
class LabelSmoothingLoss(nn.Module):
    """Cross entropy with label smoothing.

    The loss is ``(1 - epsilon) * nll + epsilon * smooth / n`` where ``nll`` is
    the negative log-likelihood of the target class, ``smooth`` is the negated
    sum of the log-probabilities over all classes and ``n`` is the size of the
    last dimension of ``pred``.

    Args:
        epsilon: Smoothing weight.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``; applied to both terms.
    """

    def __init__(self, epsilon=0.5, reduction='mean'):
        super().__init__()
        self.epsilon = epsilon
        self.reduction = reduction

    def forward(self, pred, target):
        """Return the smoothed loss for logits ``pred`` and indices ``target``."""
        n = pred.size()[-1]
        log_pred = F.log_softmax(pred, dim=-1)

        # nll_loss also validates the reduction name.
        nll = F.nll_loss(log_pred, target, reduction=self.reduction)

        # Reduce the smoothing term the same way as the NLL term so that
        # 'sum' and 'none' are consistent instead of always averaging.
        smooth = -log_pred.sum(dim=-1)
        if self.reduction == 'mean':
            smooth = smooth.mean()
        elif self.reduction == 'sum':
            smooth = smooth.sum()

        return (1 - self.epsilon) * nll + self.epsilon * (smooth / n)
3import torch
+4from torch.utils.data import DataLoader, ConcatDataset
from sklearn.model_selection import KFold +from torch.utils.data.sampler import SubsetRandomSampler
+8import matplotlib.pyplot as plt
+9from pylab import *
+10import os
+11
+12from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
class Trainer():
    """Minimal training loop with optional evaluation and LR scheduling.

    Args:
        net: Model to train.
        opt: Optimizer that updates ``net``.
        cost: Loss callable invoked as ``cost(outputs, labels)``.
        name: Run name used for the save directories under ``./save``.
        lr: Learning rate (stored only; the optimizer holds the active value).
        use_lr_schedule: Create a ``ReduceLROnPlateau`` scheduler.
        device: Optional device that every batch is moved to.
    """

    def __init__(self, net, opt, cost, name="default", lr=0.0005, use_lr_schedule=False, device=None):
        self.net = net
        self.opt = opt
        self.cost = cost
        self.device = device
        self.epoch = 0
        self.start_epoch = 0
        self.name = name

        self.lr = lr
        self.use_lr_schedule = use_lr_schedule
        if self.use_lr_schedule:
            # 'max' mode: Train() always feeds a higher-is-better score.
            # The deprecated ``verbose`` flag is omitted; Train() prints the
            # learning rate every epoch.
            self.scheduler = ReduceLROnPlateau(self.opt, 'max', factor=0.1, patience=5, threshold=0.00001)

    def _to_device(self, data):
        """Return ``(images, labels)``, moved to ``self.device`` when set."""
        if self.device:
            return data[0].to(self.device), data[1].to(self.device)
        images, labels = data
        return images, labels

    def Train(self, trainloader, epochs, testloader=None):
        """Train for ``epochs`` epochs and return the per-epoch metric.

        When ``testloader`` is given, the recorded metric is the test accuracy
        after each epoch; otherwise it is the mean training loss of the epoch.
        The best model so far (highest accuracy or lowest loss) is saved
        through ``save_best_model``.

        Returns:
            A 1-D tensor of length ``epochs`` with the metric of every epoch.
        """
        loss = torch.zeros(epochs)
        self.epoch = 0
        has_test = bool(testloader)
        best_score = None

        # The history is indexed by position so a non-zero start_epoch does
        # not run past the end of the tensor.
        for idx, epoch in enumerate(range(self.start_epoch, self.start_epoch + epochs)):
            self.epoch = epoch + 1

            self.net.train()  # Enable dropout / batch-norm updates.
            running_loss = 0.0
            for data in trainloader:
                images, labels = self._to_device(data)

                self.opt.zero_grad()
                # Forward + backward + optimize.
                outputs = self.net(images)
                batch_loss = self.cost(outputs, labels)
                batch_loss.backward()
                self.opt.step()

                running_loss += batch_loss.item()

            if has_test:
                metric = self.Test(testloader)
            else:
                metric = running_loss / max(len(trainloader), 1)
            loss[idx] = metric

            print("Epoch %d Learning rate %.6f %s: %.3f" % (
                self.epoch, self.opt.param_groups[0]['lr'], "Accuracy" if has_test else "Loss", metric))

            # Higher is better for accuracy, lower is better for loss: negate
            # the loss so one "maximize" rule serves both cases.
            score = metric if has_test else -metric

            if self.use_lr_schedule:
                self.scheduler.step(score)

            # Save the best model seen so far.
            if best_score is None or score >= best_score:
                best_score = score
                self.save_best_model({
                    'epoch': self.epoch,
                    'state_dict': self.net.state_dict(),
                    'optimizer': self.opt.state_dict(),
                })

        return loss

    def Test(self, testloader, ret="accuracy"):
        """Return the classification accuracy of the model on ``testloader``.

        ``ret`` is accepted for compatibility and is not used. An empty loader
        yields ``0.0``.
        """
        self.net.eval()  # Disable dropout.

        correct = 0.0
        total = 0.0
        with torch.no_grad():
            for data in testloader:
                images, labels = self._to_device(data)

                outputs = self.net(images)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        if total == 0:
            return 0.0
        return correct / total

    def save_best_model(self, state):
        """Write ``state`` to ``./save/<name>-best-model/model.pt``."""
        directory = os.path.dirname("./save/%s-best-model/" % (self.name))
        # makedirs also creates the parent ./save directory when missing.
        os.makedirs(directory, exist_ok=True)
        torch.save(state, "%s/model.pt" % (directory))

    def save_checkpoint(self, state):
        """Write ``state`` to ``./save/<name>-checkpoints/model_epoch_<n>.pt``."""
        directory = os.path.dirname("./save/%s-checkpoints/" % (self.name))
        os.makedirs(directory, exist_ok=True)
        torch.save(state, "%s/model_epoch_%s.pt" % (directory, self.epoch))
+3import torch
+4import torchvision
+5import torchvision.transforms as transforms
+6
+7import torch.nn as nn
+8import torch.nn.functional as F
+9
+10import matplotlib.pyplot as plt
+11import numpy as np
+12
+13from sklearn.model_selection import KFold
+14from torch.utils.data.sampler import SubsetRandomSampler
Plot the loss of multiple runs together
def PlotLosses(losses, titles, save=None):
    """Plot several loss curves as vertically stacked subplots.

    Args:
        losses: Sequence of per-epoch value sequences, one per subplot.
        titles: Sequence of subplot titles, indexed like ``losses``.
        save: Optional path; when given the figure is written there instead
            of being shown.
    """
    fig = plt.figure()
    fig.set_size_inches(14, 22)

    num_plots = len(losses)
    for i, loss in enumerate(losses):
        title = titles[i]

        # (nrows, ncols, index) form: unlike the three-digit integer code it
        # also works for ten or more subplots.
        ax = plt.subplot(num_plots, 1, i + 1)
        ax.plot(range(len(loss)), loss)
        ax.set_xlabel("Epoch")
        ax.set_title(title)
        ax.set_ylabel("Loss")

    # Save the figure or show it.
    if save:
        plt.savefig(save)
        # Release the figure once it is on disk.
        plt.close(fig)
    else:
        plt.show()
def ClassSpecificTestCifar10(net, testdata, device=None):
    """Print the per-class accuracy of ``net`` on CIFAR-10 test batches.

    Args:
        net: Callable mapping a batch of images to per-class scores.
        testdata: Iterable of ``(images, labels)`` batches.
        device: Optional device every batch is moved to before the forward
            pass.
    """
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    class_correct = [0.0] * 10
    class_total = [0.0] * 10
    with torch.no_grad():
        for data in testdata:
            if device:
                images, labels = data[0].to(device), data[1].to(device)
            else:
                images, labels = data

            outputs = net(images)
            _, predicted = torch.max(outputs, 1)
            hits = (predicted == labels).view(-1)

            # Count every sample of the batch, whatever the batch size.
            for label, hit in zip(labels.view(-1).tolist(), hits.tolist()):
                class_correct[label] += hit
                class_total[label] += 1

    # Print out; classes without samples have no defined accuracy.
    for i in range(10):
        if class_total[i]:
            print('Accuracy of %5s : %2d %%' % (
                classes[i], 100 * class_correct[i] / class_total[i]))
        else:
            print('Accuracy of %5s : n/a (no samples)' % classes[i])
def GetActivation(name="relu"):
    """Return a new activation module for ``name``.

    Recognized names are relu, leakyrelu, sigmoid, tanh and identity. The
    lookup ignores case, so "Sigmoid" and "sigmoid" are equivalent. Unknown
    names fall back to ``nn.ReLU``.
    """
    activations = {
        "relu": nn.ReLU,
        "leakyrelu": nn.LeakyReLU,
        "sigmoid": nn.Sigmoid,
        "tanh": nn.Tanh,
        "identity": nn.Identity,
    }
    return activations.get(str(name).lower(), nn.ReLU)()
Here are the training code and a notebook for training a compressive transformer model on the Tiny Shakespeare dataset.
+54from typing import Optional, List
diff --git a/docs/transformers/compressive/readme.html b/docs/transformers/compressive/readme.html
index c8fb17af..66b73b03 100644
--- a/docs/transformers/compressive/readme.html
+++ b/docs/transformers/compressive/readme.html
@@ -105,7 +105,7 @@ This is supposed to be more stable in standard transformer setups.
Here are the training code and a notebook for training a compressive transformer
model on the Tiny Shakespeare dataset.
+
Here’s the training code and a notebook for training a feedback transformer on Tiny Shakespeare dataset.
+Here’s a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.
+19import torch
diff --git a/docs/transformers/feedback/index.html b/docs/transformers/feedback/index.html
index ba8e08c9..934af315 100644
--- a/docs/transformers/feedback/index.html
+++ b/docs/transformers/feedback/index.html
@@ -97,7 +97,7 @@ The second half of this file implements this.
We implemented a custom PyTorch function to improve performance.
Here’s the training code and a notebook for training a feedback transformer on Tiny Shakespeare dataset.
+
43import math
diff --git a/docs/transformers/glu_variants/simple.html b/docs/transformers/glu_variants/simple.html
index 422d979b..7ed19603 100644
--- a/docs/transformers/glu_variants/simple.html
+++ b/docs/transformers/glu_variants/simple.html
@@ -78,7 +78,7 @@ We try different variants for the position-wise feedfo
This is a simpler implementation that doesn’t use labml.configs
module.
We decided to write a simpler implementation to make it easier for readers who are not familiar.
+
20import dataclasses
diff --git a/docs/transformers/gpt/index.html b/docs/transformers/gpt/index.html
index e576b3e6..01a130ef 100644
--- a/docs/transformers/gpt/index.html
+++ b/docs/transformers/gpt/index.html
@@ -91,7 +91,7 @@ For the transformer we reuse the
existing labml/nn transformer implementation.
Here’s a notebook for training a GPT model on Tiny Shakespeare dataset.
+
35import torch
diff --git a/docs/transformers/switch/index.html b/docs/transformers/switch/index.html
index c7b5bbad..279ab732 100644
--- a/docs/transformers/switch/index.html
+++ b/docs/transformers/switch/index.html
@@ -95,7 +95,7 @@ In a distributed setup you would have each FFN (each very large) on a different
discusses dropping tokens when routing is not balanced.
Here’s the training code and a notebook for training a switch transformer on Tiny Shakespeare dataset.
+
40import torch
diff --git a/docs/transformers/switch/readme.html b/docs/transformers/switch/readme.html
index 3932b635..ea5d5a4d 100644
--- a/docs/transformers/switch/readme.html
+++ b/docs/transformers/switch/readme.html
@@ -95,7 +95,7 @@ In a distributed setup you would have each FFN (each very large) on a different
discusses dropping tokens when routing is not balanced.
Here’s the training code and a notebook for training a switch transformer on Tiny Shakespeare dataset.
+
Annotated implementation of relative multi-headed attention is in relative_mha.py
.
Here’s the training code and a notebook for training a transformer XL model on Tiny Shakespeare dataset.
+36from typing import List, Optional
diff --git a/docs/transformers/xl/readme.html b/docs/transformers/xl/readme.html
index 616b4255..5c476e39 100644
--- a/docs/transformers/xl/readme.html
+++ b/docs/transformers/xl/readme.html
@@ -90,7 +90,7 @@ are introduced at the attention calculation.
Annotated implementation of relative multi-headed attention is in relative_mha.py
.
Here’s the training code and a notebook for training a transformer XL model on Tiny Shakespeare dataset.
+