This is an implementation of Low-Rank Adaptation (LoRA) in PyTorch.
Low-Rank Adaptation (LoRA) freezes the pre-trained model weights and injects trainable rank-decomposition matrices into each layer of the transformer. This makes it possible to efficiently fine-tune large language models by reducing the number of trainable parameters by a large factor.
Here's the training code for a GPT2 model with LoRA on the Tiny Shakespeare dataset.
```python
import torch
import torch.nn as nn
```

The LoRA linear layer adds a low-rank decomposition to the pre-trained weight matrix $W_0 \in \mathbb{R}^{d \times k}$ of the linear layer:

$$W_0 + \Delta W = W_0 + B A,$$

where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and the rank $r \ll \min(d, k)$.

All parameters are frozen except $A$ and $B$. $\Delta W = BA$ is initialized to zero at the beginning of training, so the adapted model starts out identical to the pre-trained model.

The low-rank update $x \Delta W^T$ is multiplied by $\frac{\alpha}{r}$, where $\alpha$ is a hyper-parameter. Once $\alpha$ is tuned, it can be kept the same when varying $r$.
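As a rough illustration of why this reduces trainable parameters, here is a minimal sketch; the dimensions $d = k = 768$ and $r = 4$ are made-up example values, not taken from the implementation below.

```python
import torch

d, k, r = 768, 768, 4        # arbitrary example dimensions (not from the code below)
w0 = torch.randn(d, k)       # frozen pre-trained weight W0
b = torch.zeros(d, r)        # B, initialized to zero
a = torch.randn(r, k)        # A

delta_w = b @ a              # Delta W = B A has the same shape as W0
assert delta_w.shape == w0.shape

full = d * k                 # 589,824 parameters in W0
low_rank = d * r + r * k     # 6,144 parameters in B and A
print(full, low_rank, full // low_rank)  # 96x fewer trainable parameters
```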
The LoRA `Linear` layer takes the following parameters:

- `in_features` is the number of input features of the linear layer
- `out_features` is the number of output features of the linear layer
- `bias` is a flag indicating whether there is a bias parameter
- `r` is the rank of the decomposition $r$
- `alpha` is the scaling factor $\alpha$

```python
class Linear(nn.Module):
    def __init__(self, in_features: int, out_features: int, bias: bool,
                 r: int, alpha: int = None):
        super().__init__()

        # Set alpha = r if not provided, i.e. make the scaling factor alpha / r = 1
        if alpha is None:
            alpha = r

        # The pre-trained weight W0, frozen
        self.weight = nn.Parameter(torch.empty((out_features, in_features)))
        self.weight.requires_grad = False

        if bias:
            # Bias parameter (also frozen)
            self.bias = nn.Parameter(torch.empty(out_features))
            self.bias.requires_grad = False
        else:
            # No bias parameter
            self.bias = None

        # Scaling factor alpha / r
        self.scaling = alpha / r

        # Matrix A of shape (r, in_features)
        self.lora_a = nn.Parameter(torch.empty((r, in_features)))
        # Matrix B of shape (out_features, r); we keep A and B transposed
        self.lora_b = nn.Parameter(torch.empty((out_features, r)))

        with torch.no_grad():
            # Initialize A similar to a weight matrix in a normal linear layer
            nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5)
            # Initialize B to zero so that Delta W = BA is zero at initialization
            nn.init.zeros_(self.lora_b)

    def forward(self, x: torch.Tensor):
        # Compute x W0^T + b0
        result = nn.functional.linear(x, self.weight, bias=self.bias)

        # Add (alpha / r) * x A^T B^T
        result += (x @ self.lora_a.T @ self.lora_b.T) * self.scaling

        return result
```
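Here is a minimal, hypothetical usage sketch (not part of the implementation above), assuming the `Linear` class is in scope. The dimensions and the plain `nn.Linear` standing in for a real pre-trained layer are made up, and copying the frozen weights this way is just one reasonable option, not a prescribed API.

```python
import torch
import torch.nn as nn

# Stand-in for a real pre-trained layer (made-up dimensions)
pretrained = nn.Linear(in_features=16, out_features=32, bias=True)

lora_linear = Linear(in_features=16, out_features=32, bias=True, r=4, alpha=8)
with torch.no_grad():
    lora_linear.weight.copy_(pretrained.weight)  # load the frozen W0
    lora_linear.bias.copy_(pretrained.bias)      # load the frozen bias

x = torch.randn(2, 16)
# At initialization B = 0, so the LoRA layer matches the pre-trained layer
assert torch.allclose(lora_linear(x), pretrained(x))

# Only A and B receive gradients
print([n for n, p in lora_linear.named_parameters() if p.requires_grad])  # ['lora_a', 'lora_b']
```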
Similar to the LoRA linear layer, this adds a low-rank decomposition to the pre-trained embedding weight matrix $W_0 \in \mathbb{R}^{d \times k}$.

The LoRA `Embedding` layer takes the following parameters:

- `num_embeddings` is the number of embeddings
- `embedding_dim` is the number of embedding dimensions
- `r` is the rank of the decomposition $r$
- `alpha` is the scaling factor $\alpha$

```python
class Embedding(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int,
                 r: int, alpha: int = None):
        super().__init__()

        # Set alpha = r if not provided, i.e. make the scaling factor alpha / r = 1
        if alpha is None:
            alpha = r

        # The pre-trained embedding weights W0, frozen
        self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim)))
        self.weight.requires_grad = False

        # Scaling factor alpha / r
        self.scaling = alpha / r

        # Matrix A of shape (r, num_embeddings)
        self.lora_a = nn.Parameter(torch.empty((r, num_embeddings)))
        # Matrix B of shape (embedding_dim, r)
        self.lora_b = nn.Parameter(torch.empty((embedding_dim, r)))

        with torch.no_grad():
            # Initialize A with a normal distribution
            nn.init.normal_(self.lora_a)
            # Initialize B to zero so that Delta W = BA is zero at initialization
            nn.init.zeros_(self.lora_b)

    def forward(self, x: torch.Tensor):
        # Look up the embeddings in the frozen pre-trained weights
        result = nn.functional.embedding(x, self.weight)

        # Add (alpha / r) * x A^T B^T: look up rows of A^T and project them with B^T
        result += (nn.functional.embedding(x, self.lora_a.T) @ self.lora_b.T) * self.scaling

        return result
```
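A similar hypothetical usage sketch for the embedding layer, again with made-up sizes and a plain `nn.Embedding` standing in for real pre-trained weights:

```python
import torch
import torch.nn as nn

# Stand-in for a real pre-trained embedding (made-up sizes)
pretrained = nn.Embedding(num_embeddings=100, embedding_dim=32)

lora_emb = Embedding(num_embeddings=100, embedding_dim=32, r=4, alpha=8)
with torch.no_grad():
    lora_emb.weight.copy_(pretrained.weight)  # load the frozen W0

ids = torch.tensor([[1, 5, 7], [2, 0, 99]])
# At initialization B = 0, so the LoRA embedding matches the pre-trained one
assert torch.allclose(lora_emb(ids), pretrained(ids))
print(lora_emb(ids).shape)  # torch.Size([2, 3, 32])
```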