diff --git a/docs/RWKV/configs.html b/docs/RWKV/configs.html index 3780bb86..463c144a 100644 --- a/docs/RWKV/configs.html +++ b/docs/RWKV/configs.html @@ -12,7 +12,7 @@ - + @@ -23,7 +23,7 @@ configs.py - + @@ -47,7 +47,7 @@

home - RWKV + rwkv

@@ -60,7 +60,7 @@ style="max-width:100%;"/>

- + View code on Github

diff --git a/docs/RWKV/experiment.html b/docs/RWKV/experiment.html index 71698823..281bcac1 100644 --- a/docs/RWKV/experiment.html +++ b/docs/RWKV/experiment.html @@ -12,7 +12,7 @@ - + @@ -23,7 +23,7 @@ experiment.py - + @@ -47,7 +47,7 @@

home - RWKV + rwkv

@@ -60,7 +60,7 @@ style="max-width:100%;"/>

- + View code on Github

@@ -78,10 +78,10 @@ 3 4import torch 5import torch.nn as nn -6from labml_nn.RWKV.configs import RWKVConfigs +6from labml_nn.rwkv.configs import RWKVConfigs 7 -8from labml_nn.RWKV import RWKV -9from labml_nn.RWKV import TimeMixing +8from labml_nn.rwkv import RWKV +9from labml_nn.rwkv import TimeMixing 10from labml import experiment 11from labml.configs import option 12from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs diff --git a/docs/RWKV/index.html b/docs/RWKV/index.html index cb73300b..5462e088 100644 --- a/docs/RWKV/index.html +++ b/docs/RWKV/index.html @@ -12,7 +12,7 @@ - + @@ -23,7 +23,7 @@ Receptance Weighted Key Value (RWKV) - + @@ -47,7 +47,7 @@

home - RWKV + rwkv

@@ -60,7 +60,7 @@ style="max-width:100%;"/>

- + View code on Github

diff --git a/docs/gan/wasserstein/index.html b/docs/gan/wasserstein/index.html index 72bb41c4..b3a28135 100644 --- a/docs/gan/wasserstein/index.html +++ b/docs/gan/wasserstein/index.html @@ -74,17 +74,17 @@

Wasserstein GAN (WGAN)

This is an implementation of Wasserstein GAN.

The original GAN loss is based on Jensen-Shannon (JS) divergence between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$. The Wasserstein GAN is based on the Earth Mover distance between these distributions.

$$W(\mathbb{P}_r, \mathbb{P}_g) = \underset{\gamma \in \Pi(\mathbb{P}_r, \mathbb{P}_g)}{\inf} \mathbb{E}_{(x,y) \sim \gamma} \Vert x - y \Vert$$

$\Pi(\mathbb{P}_r, \mathbb{P}_g)$ is the set of all joint distributions, whose marginal probabilities are $\gamma(x, y)$.

$\mathbb{E}_{(x,y) \sim \gamma} \Vert x - y \Vert$ is the earth mover distance for a given joint distribution ($x$ and $y$ are probabilities).

-So $W(\mathbb{P}_r, \mathbb{P}g)$ is equal to the least earth mover distance for any joint distribution between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$.
+So $W(\mathbb{P}_r, \mathbb{P}_g)$ is equal to the least earth mover distance for any joint distribution between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$.

The paper shows that Jensen-Shannon (JS) divergence and other measures for the difference between two probability distributions are not smooth. Therefore, if we do gradient descent on one of the (parameterized) probability distributions, it will not converge.

Based on Kantorovich-Rubinstein duality,

$$W(\mathbb{P}_r, \mathbb{P}_g) = \underset{\Vert f \Vert_L \le 1}{\sup} \mathbb{E}_{x \sim \mathbb{P}_r} [f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f(x)]$$

where $\Vert f \Vert_L \le 1$ are all 1-Lipschitz functions.

That is, it is equal to the greatest difference $\mathbb{E}_{x \sim \mathbb{P}_r} [f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f(x)]$ among all 1-Lipschitz functions.

For $K$-Lipschitz functions,

$$K \cdot W(\mathbb{P}_r, \mathbb{P}_g) = \underset{\Vert f \Vert_L \le K}{\sup} \mathbb{E}_{x \sim \mathbb{P}_r} [f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f(x)]$$

If all $K$-Lipschitz functions can be represented as $f_w$ where $f_w$ is parameterized by $w \in \mathcal{W}$,

$$K \cdot W(\mathbb{P}_r, \mathbb{P}_g) = \underset{w \in \mathcal{W}}{\max} \mathbb{E}_{x \sim \mathbb{P}_r} [f_w(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f_w(x)]$$

If $\mathbb{P}_g$ is represented by a generator $g_\theta(z)$, with $z$ coming from a known distribution $z \sim p(z)$,

$$K \cdot W(\mathbb{P}_r, \mathbb{P}_\theta) = \underset{w \in \mathcal{W}}{\max} \mathbb{E}_{x \sim \mathbb{P}_r} [f_w(x)] - \mathbb{E}_{z \sim p(z)} [f_w(g_\theta(z))]$$

Now to converge $g_\theta$ with $\mathbb{P}_r$ we can gradient descent on $\theta$ to minimize the above formula.
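For reference, here is a minimal sketch of how these two expectations are usually turned into critic and generator losses in PyTorch; the function names are illustrative and this is not the exact code from this page.

```python
import torch
import torch.nn as nn


def critic_loss(f_w: nn.Module, real: torch.Tensor, generated: torch.Tensor) -> torch.Tensor:
    # The critic f_w approximates the supremum above: it tries to maximize
    # E[f_w(x_real)] - E[f_w(g_theta(z))], so we minimize the negative of that difference.
    return -(f_w(real).mean() - f_w(generated).mean())


def generator_loss(f_w: nn.Module, generated: torch.Tensor) -> torch.Tensor:
    # The generator minimizes the estimated Wasserstein distance; dropping terms
    # that do not depend on the generator leaves -E[f_w(g_theta(z))].
    return -f_w(generated).mean()


# The critic must stay (approximately) K-Lipschitz; the original WGAN paper
# enforces this by clipping the critic's weights to a small range after each update.
```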

diff --git a/docs/lora/gpt2.html b/docs/lora/gpt2.html new file mode 100644 index 00000000..bed238dc --- /dev/null +++ b/docs/lora/gpt2.html @@ -0,0 +1,378 @@ + + + + + + + + + + + + + + + + + + + + + + + gpt2.py + + + + + + + + + + +
+
+
+
+

+ home + lora +

+

+ + Github + + Twitter +

+

+ + View code on Github +

+
+
+
+
+ + +
+
+
1import torch
+2import torch.nn as nn
+3from transformers import AutoTokenizer
+4from labml_nn.lora import Linear, Embedding
+5
+6tokenizer = AutoTokenizer.from_pretrained("gpt2")
+7
+8config = {
+9    "layer_norm_epsilon": 1e-05,
+10    "n_embd": 768,
+11    "n_head": 12,
+12    "n_layer": 12,
+13    "n_positions": 1024,
+14    "vocab_size": 50257,
+15    "device": "cuda"
+16}
+
+
+
+
+ + +
+
+
19class FFN(nn.Module):
+
+
+
+
+ + +
+
+
20    def __init__(self, dim):
+21        super().__init__()
+22        self.c_fc = Linear(config['n_embd'], dim, r=32, bias=True)
+23        self.c_proj = Linear(dim, config['n_embd'], r=32, bias=True)
+24        self.act = nn.functional.gelu
+
+
+
+
+ + +
+
+
26    def forward(self, hidden_states):
+27        hidden_states = self.c_fc(hidden_states)
+28        hidden_states = self.act(hidden_states)
+29        hidden_states = self.c_proj(hidden_states)
+30        return hidden_states
+
+
+
+
+ + +
+
+
33class MultiHeadAttention(nn.Module):
+
+
+
+
+ + +
+
+
34    def __init__(self):
+35        super().__init__()
+36        self.embed_dim = config['n_embd']
+37        self.num_heads = config['n_head']
+38        self.head_dim = self.embed_dim // self.num_heads
+39        self.split_size = self.embed_dim
+40
+41        self.c_att = Linear(config['n_embd'], config['n_embd'] * 3, r=32, bias=True)
+42        self.c_proj = Linear(config['n_embd'], config['n_embd'], r=32, bias=True)
+
+
+
+
+ +

Splits hidden_size dim into attn_head_size and num_heads

+ +
+
+
44    def _split_heads(self, tensor, num_heads, attn_head_size):
+
+
+
+
+ + +
+
+
48        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
+49        tensor = tensor.view(new_shape)
+50        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+
+
+
+
+ + +
+
+
52    def forward(self, hidden_states):
+53        batch_size, seq_length, _ = hidden_states.size()
+54
+55        query, key, value = self.c_att(hidden_states).split(self.split_size, dim=2)
+56
+57        query = self._split_heads(query, self.num_heads, self.head_dim)
+58        key = self._split_heads(key, self.num_heads, self.head_dim)
+59        value = self._split_heads(value, self.num_heads, self.head_dim)
+60
+61        attn_output = torch.nn.functional.scaled_dot_product_attention(
+62            query,
+63            key,
+64            value,
+65            attn_mask=None,
+66            dropout_p=0.0,
+67            is_causal=True,  # for the triangular mask
+68        )
+69
+70        attn_output = attn_output.transpose(1, 2).contiguous()
+71        attn_output = attn_output.view(batch_size, seq_length, self.embed_dim)
+72
+73        attn_output = self.c_proj(attn_output)
+74
+75        return attn_output
+
+
+
+
+ + +
+
+
78class Block(nn.Module):
+
+
+
+
+ + +
+
+
79    def __init__(self):
+80        super().__init__()
+81        self.pre_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon'])
+82        self.attn = MultiHeadAttention()
+83        self.post_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon'])
+84        self.ffn = FFN(config['n_embd'] * 4)
+
+
+
+
+ + +
+
+
86    def forward(self, hidden_states):
+87        residual = hidden_states
+88        hidden_states = self.pre_norm(hidden_states)
+89
+90        attn_output = self.attn(hidden_states)
+91
+92        hidden_states = attn_output + residual
+93        residual = hidden_states
+94        hidden_states = self.post_norm(hidden_states)
+95        feed_forward_output = self.ffn(hidden_states)
+96        hidden_states = feed_forward_output + residual
+97
+98        return hidden_states
+
+
+
+
+ + +
+
+
101class GPTModel(nn.Module):
+
+
+
+
+ + +
+
+
102    def __init__(self):
+103        super().__init__()
+104
+105        self.token_embedding = Embedding(config['vocab_size'], config['n_embd'], r=32)
+106        self.position_embedding = Embedding(config['n_positions'], config['n_embd'], r=32)
+107
+108        self.blocks = nn.ModuleList([Block() for _ in range(config['n_layer'])])
+109
+110        self.final_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon'])
+111
+112        self.lm_head = Linear(config['n_embd'], config['vocab_size'], r=32, bias=False)
+
+
+
+
+ + +
+
+
114    def forward(self, input_ids):
+115        batch_size, input_shape = input_ids.size()
+116
+117        token_embeddings = self.token_embedding(input_ids)  # B T C
+118        position_ids = torch.arange(input_shape, device=config['device'])  # T C
+119        position_embeddings = self.position_embedding(position_ids)  # B T C
+120
+121        hidden_states = token_embeddings + position_embeddings
+122
+123        for block in self.blocks:
+124            hidden_states = block(hidden_states)
+125
+126        hidden_states = self.final_norm(hidden_states)
+127
+128        logits = self.lm_head(hidden_states)
+129
+130        return logits
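A minimal usage sketch for this model: it assumes the `transformed.pth` weights produced by `transform_hf_model.py` and a CUDA device (the model hard-codes `config['device']`); the prompt string is arbitrary.

```python
import torch
from labml_nn.lora.gpt2 import GPTModel, tokenizer, config

model = GPTModel()
# The converted GPT-2 weights contain no LoRA matrices, so strict=False leaves
# lora_a / lora_b at their initial values (lora_b is zero, so the update starts at 0).
model.load_state_dict(torch.load('transformed.pth', weights_only=True), strict=False)
model = model.to(config['device'])

input_ids = torch.tensor([tokenizer.encode("The meaning of life is")], device=config['device'])
with torch.no_grad():
    logits = model(input_ids)  # [batch_size, seq_len, vocab_size]
print(tokenizer.decode([int(logits[0, -1].argmax())]))
```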
+
+
+ +
+ + + + \ No newline at end of file diff --git a/docs/lora/index.html b/docs/lora/index.html new file mode 100644 index 00000000..46d25217 --- /dev/null +++ b/docs/lora/index.html @@ -0,0 +1,534 @@ + + + + + + + + + + + + + + + + + + + + + + + Low-Rank Adaptation (LoRA) + + + + + + + + + + +
+
+
+
+

+ home + lora +

+

+ + Github + + Twitter +

+

+ + View code on Github +

+
+
+
+
+ +

Low-Rank Adaptation (LoRA)

+

This is an implementation of Low-Rank Adaptation (LoRA) in PyTorch.

+

Low-Rank Adaptation (LoRA) freezes pre-trained model weights and injects trainable rank decomposition matrices into each layer of the transformer. This makes it possible to efficiently fine-tune large language models by reducing trainable parameters by a large factor.

+

Here's the training code for training a GPT2 model with LoRA on the Tiny Shakespeare dataset.
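Since the pre-trained weights in these layers are created with `requires_grad = False`, only the injected matrices $A$ and $B$ (and anything intentionally left unfrozen) receive gradients during fine-tuning. Below is a minimal, self-contained sketch using the `Linear` layer defined on this page with an arbitrary rank; the optimizer choice is illustrative and not necessarily what the training notebook uses.

```python
import torch
from labml_nn.lora import Linear

# A single LoRA linear layer as defined below (r = 4 chosen arbitrarily)
layer = Linear(in_features=16, out_features=16, bias=True, r=4)

total = sum(p.numel() for p in layer.parameters())
trainable = [p for p in layer.parameters() if p.requires_grad]
print(f"trainable: {sum(p.numel() for p in trainable)} / {total}")  # 128 of 400

# Fine-tuning optimizes only the parameters that still require gradients
optimizer = torch.optim.Adam(trainable, lr=5e-5)
```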

+ +
+
+
24import torch
+25import torch.nn as nn
+
+
+
+
+ +

LoRA Linear Layer

+

LoRA linear layer adds a low-rank decomposition to the pre-trained weight matrix ($W_0 \in \mathbb{R}^{d \times k}$) of the linear layer.

+

+

$W_0 + \Delta W = W_0 + BA$, where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and the rank $r \ll \min(d, k)$.

+

All parameters are frozen except $A$ and $B$.

+

$\Delta W$ is initialized to be zero at the beginning of the training.

+

They multiply $\Delta W x$ by $\frac{\alpha}{r}$, where $\alpha$ is a hyper-parameter. Once $\alpha$ is tuned it can be kept the same when varying $r$.
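As a concrete, illustrative example using the GPT2 sizes that appear later on this site: a single $768 \times 768$ projection has $768 \times 768 = 589{,}824$ frozen weights, while its rank-$32$ decomposition adds only $768 \times 32 + 32 \times 768 = 49{,}152$ trainable parameters, roughly a $12\times$ reduction for that matrix; smaller ranks shrink this further.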

+ +
+
+
28class Linear(nn.Module):
+
+
+
+
+ +
  • in_features + is the number of input features of the linear layer
  • +
  • out_features + is the number of output features of the linear layer
  • +
  • bias + is a flag indicating if there is a bias parameter
  • +
  • r + is the rank of the decomposition
  • +
  • alpha + is the scaling factor
+ +
+
+
49    def __init__(self, in_features: int, out_features: int, bias: bool,
+50                 r: int, alpha: int = None):
+
+
+
+
+ + +
+
+
58        super().__init__()
+
+
+
+
+ +

Set $\alpha = r$ if $\alpha$ is not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.

+ +
+
+
61        if alpha is None:
+62            alpha = r
+
+
+
+
+ +

The pre-trained weight

+ +
+
+
65        self.weight = nn.Parameter(torch.empty((out_features, in_features)))
+
+
+
+
+ +

Freeze it

+ +
+
+
67        self.weight.requires_grad = False
+68
+69        if bias:
+
+
+
+
+ +

Bias parameter (also frozen)

+ +
+
+
71            self.bias = nn.Parameter(torch.empty(out_features))
+72            self.bias.requires_grad = False
+73        else:
+
+
+
+
+ +

No bias parameter

+ +
+
+
75            self.bias = None
+
+
+
+
+ +

scaling factor

+ +
+
+
78        self.scaling = alpha / r
+
+
+
+
+ +

Matrix $A \in \mathbb{R}^{r \times k}$

+ +
+
+
80        self.lora_a = nn.Parameter(torch.empty((in_features, r)))
+
+
+
+
+ +

Matrix $B \in \mathbb{R}^{d \times r}$, we keep $A$ and $B$ transposed

+ +
+
+
82        self.lora_b = nn.Parameter(torch.empty((r, out_features)))
+83
+84        with torch.no_grad():
+
+
+
+
+ +

Initialize $A$ similar to a weight matrix in a normal linear layer

+ +
+
+
86            nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5)
+
+
+
+
+ +

Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization

+ +
+
+
88            nn.init.zeros_(self.lora_b)
+
+
+
+
+ + +
+
+
90    def forward(self, x: torch.Tensor):
+
+
+
+
+ +

Compute $W_0 x + b_0$

+ +
+
+
92        result = nn.functional.linear(x, self.weight, bias=self.bias)
+
+
+
+
+ +

Add $\frac{\alpha}{r} \Delta W x = \frac{\alpha}{r} BAx$

+ +
+
+
95        result += (x @ self.lora_a @ self.lora_b) * self.scaling
+
+
+
+
+ +

+ +
+
+
98        return result
+
+
+
+
+ +

LoRA Embedding Layer

+

Similar to LoRA linear layer this adds a low-rank decomposition to the pre-trained embedding weights matrix ($W_0 \in \mathbb{R}^{d \times k}$).
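The forward pass below adds the low-rank term by looking up rows of `lora_a` and multiplying by `lora_b`; this is equivalent to looking up rows of the full update `lora_a @ lora_b`, so the dense $\Delta W$ never has to be materialized. A small, self-contained check of that identity (the sizes here are arbitrary):

```python
import torch
import torch.nn.functional as F

num_embeddings, embedding_dim, r = 10, 8, 2
lora_a = torch.randn(num_embeddings, r)
lora_b = torch.randn(r, embedding_dim)
x = torch.tensor([[1, 4, 7]])

# Look up rows of the small factor, then project up ...
low_rank = F.embedding(x, lora_a) @ lora_b
# ... which equals looking up rows of the full (never materialized) update
full = F.embedding(x, lora_a @ lora_b)
assert torch.allclose(low_rank, full, atol=1e-6)
```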

+

+ +
+
+
101class Embedding(nn.Module):
+
+
+
+
+ +
  • num_embeddings + is the number of embeddings
  • +
  • embedding_dim + is the number of embedding dimensions
  • +
  • r + is the rank of the decomposition
  • +
  • alpha + is the scaling factor
+ +
+
+
111    def __init__(self, num_embeddings: int, embedding_dim: int,
+112                 r: int, alpha: int = None):
+
+
+
+
+ + +
+
+
120        super().__init__()
+
+
+
+
+ +

Set $\alpha = r$ if $\alpha$ is not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.

+ +
+
+
123        if alpha is None:
+124            alpha = r
+
+
+
+
+ +

The pre-trained embedding weights (frozen)

+ +
+
+
127        self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim)))
+128        self.weight.requires_grad = False
+
+
+
+
+ +

scaling factor

+ +
+
+
131        self.scaling = alpha / r
+
+
+
+
+ +

Matrix $A$

+ +
+
+
133        self.lora_a = nn.Parameter(torch.empty((num_embeddings, r)))
+
+
+
+
+ +

Matrix $B$

+ +
+
+
135        self.lora_b = nn.Parameter(torch.empty((r, embedding_dim)))
+136
+137        with torch.no_grad():
+
+
+
+
+ +

Initialize $A$ with a normal distribution

+ +
+
+
139            nn.init.normal_(self.lora_a)
+
+
+
+
+ +

Initialize $B$ to $0$ so that $\Delta W$ is $0$ at initialization

+ +
+
+
141            nn.init.zeros_(self.lora_b)
+
+
+
+
+ + +
+
+
143    def forward(self, x: torch.Tensor):
+
+
+
+
+ +

Compute the embeddings $W_0 \text{onehot}(x)$

+ +
+
+
145        result = nn.functional.embedding(x, self.weight)
+
+
+
+
+ +

Add $\frac{\alpha}{r} \Delta W \text{onehot}(x) = \frac{\alpha}{r} BA \text{onehot}(x)$

+ +
+
+
148        result += (nn.functional.embedding(x, self.lora_a) @ self.lora_b) * self.scaling
+
+
+
+
+ +

+ +
+
+
151        return result
+
+
+ +
+ + + + \ No newline at end of file diff --git a/docs/lora/transform_hf_model.html b/docs/lora/transform_hf_model.html new file mode 100644 index 00000000..a9d34c3a --- /dev/null +++ b/docs/lora/transform_hf_model.html @@ -0,0 +1,186 @@ + + + + + + + + + + + + + + + + + + + + + + + transform_hf_model.py + + + + + + + + + + +
+
+
+
+

+ home + lora +

+

+ + Github + + Twitter +

+

+ + View code on Github +

+
+
+
+
+ + +
+
+
1import torch
+2from transformers import AutoModelForCausalLM
+
+
+
+
+ + +
+
+
5def transform_hf_model():
+6    model = AutoModelForCausalLM.from_pretrained("gpt2")
+7
+8    state_dict = model.state_dict()
+9
+10    mapping = {
+11        'transformer.wte.weight': 'token_embedding.weight',
+12        'transformer.wpe.weight': 'position_embedding.weight',
+13        'transformer.ln_f.weight': 'final_norm.weight',
+14        'transformer.ln_f.bias': 'final_norm.bias',
+15        'lm_head.weight': 'lm_head.weight'
+16    }
+17
+18    for i in range(12):
+19        mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
+20        mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
+21        mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.c_att.weight'
+22        mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.c_att.bias'
+23        mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.c_proj.weight'
+24        mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.c_proj.bias'
+25        mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
+26        mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
+27        mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.c_fc.weight'
+28        mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.c_fc.bias'
+29        mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.c_proj.weight'
+30        mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.c_proj.bias'
+31
+32    new_state_dict = {}
+33    for old_key, new_key in mapping.items():
+34        if old_key in state_dict:
+35            new_state_dict[new_key] = state_dict[old_key]
+
+
+
+
+ +

Transpose the weight matrices of the Conv1D layers so that linear layers can be used instead

+ +
+
+
38    convo_layers = ([f'blocks.{i}.ffn.c_fc.weight' for i in range(12)] +
+39                    [f'blocks.{i}.ffn.c_proj.weight' for i in range(12)] +
+40                    [f'blocks.{i}.attn.c_att.weight' for i in range(12)] +
+41                    [f'blocks.{i}.attn.c_proj.weight' for i in range(12)])
+42
+43    for layer in convo_layers:
+44        new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
+45
+46    torch.save(new_state_dict, 'transformed.pth')
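This transpose is needed because Hugging Face's GPT-2 stores these projections as `Conv1D` modules whose weight has shape `(in_features, out_features)`, while `nn.functional.linear`, used by the LoRA `Linear` layer, expects `(out_features, in_features)`. A small sanity-check sketch (the shapes shown are for the standard `gpt2` checkpoint):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
w = model.state_dict()['transformer.h.0.attn.c_attn.weight']
print(w.shape)                         # torch.Size([768, 2304]) - Conv1D layout (in, out)
print(torch.transpose(w, 0, 1).shape)  # torch.Size([2304, 768]) - layout expected by F.linear
```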
+
+
+ +
+ + + + \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml index e1d8e169..d7cc9aff 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -8,7 +8,7 @@ https://nn.labml.ai/gan/wasserstein/index.html - 2023-10-24T16:30:00+00:00 + 2024-07-15T16:30:00+00:00 1.00 @@ -504,22 +504,22 @@ - https://nn.labml.ai/RWKV/configs.html - 2024-03-17T16:30:00+00:00 + https://nn.labml.ai/rwkv/configs.html + 2024-08-02T16:30:00+00:00 1.00 - https://nn.labml.ai/RWKV/index.html - 2024-03-17T16:30:00+00:00 + https://nn.labml.ai/rwkv/index.html + 2024-08-02T16:30:00+00:00 1.00 - https://nn.labml.ai/RWKV/experiment.html - 2024-03-17T16:30:00+00:00 + https://nn.labml.ai/rwkv/experiment.html + 2024-08-02T16:30:00+00:00 1.00 @@ -1294,6 +1294,27 @@ + + https://nn.labml.ai/lora/gpt2.html + 2024-08-02T16:30:00+00:00 + 1.00 + + + + + https://nn.labml.ai/lora/index.html + 2024-08-02T16:30:00+00:00 + 1.00 + + + + + https://nn.labml.ai/lora/transform_hf_model.html + 2024-08-02T16:30:00+00:00 + 1.00 + + + https://nn.labml.ai/graphs/gat/index.html 2023-10-24T16:30:00+00:00 @@ -1450,7 +1471,7 @@ https://nn.labml.ai/rl/ppo/gae.html - 2023-10-24T16:30:00+00:00 + 2024-06-24T16:30:00+00:00 1.00 diff --git a/docs/zh/index.html b/docs/zh/index.html index 09208a7d..322f4bf7 100644 --- a/docs/zh/index.html +++ b/docs/zh/index.html @@ -72,7 +72,7 @@

labml.ai 带注释的 PyTorch 版论文实现

这是一个用 PyTorch 实现各种神经网络和相关算法的集合。每个算法的代码实现都有详细的解释说明,且在网站上与代码逐行对应。我们相信,这些内容将帮助您更好地理解这些算法。

Screenshot

-

我们正在积极维护这个仓库并添加新的代码实现Twitter以获取更新。

+

我们正在积极维护这个仓库并添加新的代码实现。请关注 Twitter 以获取更新。

翻译

英语(原版)

中文(翻译)

@@ -102,7 +102,7 @@
  • Primer
  • 沙漏网络
  • Eleuther GPT-neox

    -
  • 在一块 48GB GPU 上生成
  • @@ -149,7 +148,7 @@ -

    第一层按权重和偏差进行参数化

    +

    第一层由权重和偏差进行参数化

    @@ -161,7 +160,7 @@ -

    第一层按权重和偏差进行参数化

    +

    第一层由权重和偏差进行参数化

    @@ -173,7 +172,7 @@ -

    隐藏图层丢失

    +

    隐藏层 Dropout

    @@ -185,7 +184,7 @@ -

    激活功能

    +

    激活函数

    @@ -197,7 +196,7 @@ -

    是否有门

    +

    是否存在门控

    @@ -210,7 +209,7 @@ -

    如果有门,则转换输入的线性层将乘以门,并通过权重和偏置进行参数化

    +

    如果存在门控,则通过线性层将输入值与门相乘,并由权重 和偏置进行参数化

    @@ -245,7 +244,7 @@ -

    如果是封闭的,

    +

    如果进行门控,

    @@ -271,7 +270,7 @@ -

    申请退学

    +

    使用 Dropout

    @@ -283,7 +282,7 @@ -

    或者取决于它是否有门控

    +

    根据是否进行门控,返回或者

    diff --git a/docs/zh/transformers/index.html b/docs/zh/transformers/index.html index 43f34b9e..fb9536ab 100644 --- a/docs/zh/transformers/index.html +++ b/docs/zh/transformers/index.html @@ -3,24 +3,24 @@ - + - - + + - + - + - - + + - 变压器 + Transformers @@ -70,50 +70,50 @@ -

    变压器

    -

    本模块包含 PyTorch 实现和论文 Attention Is All You Need 中对原创变压器的解释,以及它的衍生品和增强功能。

    -
    diff --git a/docs/zh/transformers/label_smoothing_loss.html b/docs/zh/transformers/label_smoothing_loss.html index cbafc90c..3f2b49b0 100644 --- a/docs/zh/transformers/label_smoothing_loss.html +++ b/docs/zh/transformers/label_smoothing_loss.html @@ -3,12 +3,12 @@ - + - + @@ -18,7 +18,7 @@ - + 标签平滑损失 @@ -154,7 +154,7 @@ -

    显示系统预期的目标分布。

    +

    展示系统期望的目标分布。

    @@ -183,7 +183,7 @@ -

    打印(预测)

    +

    输出(预测)

    diff --git a/docs/zh/transformers/mha.html b/docs/zh/transformers/mha.html index 4fce0daf..71798236 100644 --- a/docs/zh/transformers/mha.html +++ b/docs/zh/transformers/mha.html @@ -3,24 +3,24 @@ - + - - + + - + - + - - + + - 多头注意 (MHA) + 多头注意力 (MHA) @@ -72,9 +72,7 @@

    多头注意力 (MHA)

    Open In Colab

    -

    这是 P yTorch 中论文 “注意力就是你所需要的” 多头注意力的教程/实现。该实现的灵感来自带注释的变形金刚

    -

    以下是使用带有 MHA 的基本转换器进行 NLP 自动回归的训练代码

    -

    这是一个训练简单变压器的实验实现

    +

这是论文《 Attention is All You Need 》中多头注意力的 PyTorch 教程/实现。该实现的灵感来自《The Annotated Transformer》。


    这是使用基础 Transformer 和 MHA 进行 NLP 自回归的训练代码


    这是一个训练简单transformer的代码实现

    @@ -93,8 +91,8 @@ #

    -

    为多头注意做好准备

    -

    该模块进行线性变换,并将向量拆分为给定数量的头部,以获得多头注意。这用于转换查询向量。

    +

    准备多头注意力

    +

    该部分执行线性变换,并将向量分割成给定数量的头以获得多头注意力。这用于查询向量。

    @@ -118,7 +116,7 @@ -

    线性变换的线性层

    +

用于线性变换的线性层

    @@ -130,7 +128,7 @@ -

    头数

    +

    注意力头数

    @@ -142,7 +140,7 @@ -

    每个头部中以向量表示的维度数

    +

    每个头部中向量的维度数量

    @@ -165,9 +163,9 @@ -

    输入的形状[seq_len, batch_size, d_model] +

    输入的形状为[seq_len, batch_size, d_model][batch_size, d_model] -。我们将线性变换应用于最后一个维度,然后将其拆分为头部。

    +。我们对最后一维应用线性变换,并将其分为多个头。

    @@ -191,7 +189,7 @@ -

    将最后一个维度拆分成头部

    +

    将最后一个维度分成多个头部

    @@ -218,11 +216,11 @@ #

    -

    多头注意模块

    -

    这将计算给定key -和value -向量的缩放多头注意query -力。

    +

    多头注意力模块

    +

    这将计算给出的key +、value +和query +向量缩放后的多头注意力。

    -

    简单来说,它会找到与查询匹配的键,并获取这些键的值。

    -

    它使用查询和键的点积作为它们匹配程度的指标。在服用点产品之前,先按比例缩放

    -

    Softmax 是沿序列(或时间)的轴计算的。

+在进行点积之前,先按 $\frac{1}{\sqrt{d_k}}$ 进行缩放。这样做是为了避免当 $d_k$ 较大时,大的点积值导致 Softmax 操作输出非常小的梯度。

    +

    Softmax 是沿序列(或时间)轴计算的。

    @@ -261,12 +259,12 @@ M834 80h400000v40h-400000z">heads -是头的数量。 +是注意力头的数量。
  • d_model -是query +是向量querykeyvalue -向量中的要素数。
  • +中的特征数量。
    @@ -289,7 +287,7 @@ M834 80h400000v40h-400000z">query +

    这些将对多头注意力的向量querykeyvalue -向量。

    +进行转换。

    @@ -330,7 +328,7 @@ M834 80h400000v40h-400000z">key +

    在键( Key )的时间维度上进行注意力 Softmaxkey

    @@ -355,7 +353,7 @@ M834 80h400000v40h-400000z">mask -有形状[seq_len_q, seq_len_k, batch_size] -,其中第一个维度是查询维度。如果查询维度等于它将被广播。

    +的形状为[seq_len_q, seq_len_k, batch_size] +,其中第一维是查询维度。如果查询维度等于,则会进行广播。

    @@ -443,7 +441,7 @@ M834 80h400000v40h-400000z">[seq_len_q, seq_len_k, batch_size, heads] +

    生成的掩码形状为[seq_len_q, seq_len_k, batch_size, heads]

    @@ -471,15 +469,15 @@ M834 80h400000v40h-400000z">
    query keyvalue -是存储查询向量集合的张量。它们有形状[seq_len, batch_size, d_model] +是存储查询向量集合的张量。它们的形状为[seq_len, batch_size, d_model]

    mask -有形状[seq_len, seq_len, batch_size] -并mask[i, j, b] -指示是否为批量查询b -,位置处的查询i -有权访问位置处的键值j -。

    +的形状为[seq_len, seq_len, batch_size] +,mask[i, j, b] +表示批次b +,在位置i +处查询是否有权访问位置j +处的键值对。

    @@ -497,8 +495,8 @@ M834 80h400000v40h-400000z">querykey -并且value -有形状[seq_len, batch_size, d_model] +和value +的形状为[seq_len, batch_size, d_model]

    @@ -514,10 +512,10 @@ M834 80h400000v40h-400000z">
    query +

    为注意力计算准备向量querykeyvalue -进行注意力计算。然后这些就会有形状[seq_len, batch_size, heads, d_k] +它们的形状将变为[seq_len, batch_size, heads, d_k]

    @@ -532,8 +530,8 @@ M834 80h400000v40h-400000z">
    。这给出了形状的张量[seq_len, seq_len, batch_size, heads] -。

    +

    计算注意力分数这将得到一个形状为[seq_len, seq_len, batch_size, heads] +的张量。

    @@ -545,7 +543,7 @@ M834 80h400000v40h-400000z">
    @@ -92,7 +92,7 @@ #

    -

    嵌入令牌并添加固定位置编码

    +

    嵌入 token 并添加固定位置编码

    @@ -133,7 +133,7 @@ #

    -

    嵌入令牌并添加参数化的位置编码

    +

    嵌入 token 并添加参数化的位置编码

    @@ -175,7 +175,7 @@

    Transformer Layer

    -

    This can act as an encoder layer or a decoder layer. We use pre-norm.

    +

这可以作为编码器层或解码器层。我们使用预归一化(pre-norm)。

    @@ -188,15 +188,15 @@ #
    +是自注意力和 FFN 后的 Dropout 率
    @@ -272,7 +272,7 @@ -

    通过自我关注,即关键和价值来自自我

    +

    通过自注意力机制运行,即键和值来自于自身

    @@ -284,7 +284,7 @@ -

    添加自我关注的结果

    +

    添加自注意力结果

    @@ -296,7 +296,7 @@ -

    如果提供了来源,则从关注源获取结果。这是当你有一个关注编码器输出的解码器层

    时 +

    如果提供了源数据,则从注意力机制中获取结果。这是指当解码器层关注编码器输出时。

    @@ -320,7 +320,7 @@ -

    注意源。即键和值来自源

    +

    关注源数据,即键和值来自源数据

    @@ -332,7 +332,7 @@ -

    添加来源关注结果

    +

    添加源关注结果

    @@ -356,7 +356,7 @@ -

    如果已指定,则将输入保存到前馈图层

    +

    如果已指定,则将输入保存到前馈层

    @@ -369,7 +369,7 @@ -

    通过前馈网络

    +

    通过前馈网络传递

    @@ -396,7 +396,7 @@ #

    -

    变压器编码

    +

    Transformer 编码器

    @@ -420,7 +420,7 @@ -

    制作变压器层的副本

    +

    制作 Transformer 层的副本

    @@ -432,7 +432,7 @@ -

    最终归一化层

    +

    最终的归一化层

    @@ -455,7 +455,7 @@ -

    穿过每个变压器层

    +

    运行每个 Transformer 层

    @@ -481,7 +481,7 @@ #

    -

    变压器解码器

    +

    Transformer 解码器

    @@ -505,7 +505,7 @@ -

    制作变压器层的副本

    +

    制作 Transformer 层的副本

    @@ -517,7 +517,7 @@ -

    最终归一化层

    +

    最终的归一化层

    @@ -540,7 +540,7 @@ -

    穿过每个变压器层

    +

    运行每个 Transformer 层

    @@ -566,9 +566,9 @@ #

    -

    发电机

    -

    这可以预测令牌并给出其中的lof softmax。如果你正在使用,你不需要这个nn.CrossEntropyLoss -。

    +

    生成器

    +

    这会预测这些标记并给出它们的 softmax 的对数。如果你使用nn.CrossEntropyLoss +,则不需要这样做。

    @@ -606,7 +606,7 @@ #

    -

    组合式编码器-解码器

    +

    组合编码器-解码器

    @@ -635,7 +635,7 @@ -

    从他们的代码来看,这很重要。使用 Glorot/fan_avg 初始化参数。

    +

    这是代码中很重要的部分。使用 Glorot/fan_avg 初始化参数。

    @@ -660,7 +660,7 @@ -

    通过编码器运行源码

    +

    通过编码器运行源代码

    diff --git a/docs/zh/transformers/positional_encoding.html b/docs/zh/transformers/positional_encoding.html index 4d14d992..c36fdbb8 100644 --- a/docs/zh/transformers/positional_encoding.html +++ b/docs/zh/transformers/positional_encoding.html @@ -3,12 +3,12 @@ - + - + @@ -18,7 +18,7 @@ - + 固定位置编码 @@ -153,7 +153,7 @@ -

    头寸指数

    +

    位置索引

    @@ -213,7 +213,7 @@ -

    添加批量维度

    +

    增加批处理维度

    diff --git a/docs/zh/transformers/relative_mha.html b/docs/zh/transformers/relative_mha.html index 3c08c520..143d6371 100644 --- a/docs/zh/transformers/relative_mha.html +++ b/docs/zh/transformers/relative_mha.html @@ -3,13 +3,13 @@ - + - + @@ -19,7 +19,7 @@ - + 相对多头注意力 diff --git a/docs/zh/transformers/utils.html b/docs/zh/transformers/utils.html index f4cdcbf4..28082e4c 100644 --- a/docs/zh/transformers/utils.html +++ b/docs/zh/transformers/utils.html @@ -3,24 +3,24 @@ - + - - + + - + - + - - + + - 变压器公用事业 + Transformer 实用工具 @@ -70,7 +70,7 @@ -

    变压器公用事业

    +

    Transformer 实用工具

    @@ -82,7 +82,7 @@ -

    后续掩码,用于掩盖未来(后续)时间步中的数据

    +

    用于屏蔽未来(后续)时间步数据的后续掩码

    diff --git a/labml_nn/gan/wasserstein/__init__.py b/labml_nn/gan/wasserstein/__init__.py index b3c52472..3c5394e0 100644 --- a/labml_nn/gan/wasserstein/__init__.py +++ b/labml_nn/gan/wasserstein/__init__.py @@ -26,7 +26,7 @@ marginal probabilities are $\gamma(x, y)$. $\mathbb{E}_{(x,y) \sim \gamma} \Vert x - y \Vert$ is the earth mover distance for a given joint distribution ($x$ and $y$ are probabilities). -So $W(\mathbb{P}_r, \mathbb{P}g)$ is equal to the least earth mover distance for +So $W(\mathbb{P}_r, \mathbb{P}_g)$ is equal to the least earth mover distance for any joint distribution between the real distribution $\mathbb{P}_r$ and generated distribution $\mathbb{P}_g$. The paper shows that Jensen-Shannon (JS) divergence and other measures for the difference between two probability diff --git a/labml_nn/lora/__init__.py b/labml_nn/lora/__init__.py new file mode 100644 index 00000000..f5fc197d --- /dev/null +++ b/labml_nn/lora/__init__.py @@ -0,0 +1,151 @@ +""" +--- +title: Low-Rank Adaptation (LoRA) +summary: > + Annotated implementation of RoRA from paper + LoRA: Low-Rank Adaptation of Large Language Models +--- + +# Low-Rank Adaptation (LoRA) + +This is an implementation of +[Low-Rank Adaptation (LoRA)](https://arxiv.org/abs/2106.09685) +in [PyTorch](https://pytorch.org). + +Low-Rank Adaptation (LoRA) freezes pre-trained model weights and injects + trainable rank decomposition matrices into each layer of the transformer. + This makes it possible to efficiently fine-tune large langauge models by + reducing trainable parameters by a large factor. + +Here's [the training code](experiment.html) for training a GPT2 model with LoRA + on Tiny Shakespeare dataset. +""" + +import torch +import torch.nn as nn + + +class Linear(nn.Module): + """ + ## LoRA Linear Layer + + LoRA linear layer adds a low-rank decomposition to the pre-trained + weight matrix ($W_0 \in \mathbb{R}^{d \times k}$) + of the linear layer. + + $$W_0 + \Delta W = W_0 + BA$$ + + , where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, + and the rank $r \ll min(d, k)$. + + All parameters are frozen except $A$ and $B$. + + $\Delta W$ is initialized to be zero at the beginning of the training. + + They multiple $\Delta W x$ by $\frac{\alpha}{r}$ where $\alpha$ is a hyper-parameter. + Once $\alpha$ is tuned it can be kept the same when varying $r$. + """ + + def __init__(self, in_features: int, out_features: int, bias: bool, + r: int, alpha: int = None): + """ + :param in_features: is the number of input features of the linear layer + :param out_features: is the number of output features of the linear layer + :param bias: is a flag indicating if there is a bias parameter + :param r: is the rank of the decomposition $r$ + :param alpha: is the scaling factor $\alpha$ + """ + super().__init__() + + # Set $\alpha = r$ is not provided. i.e. make the scaling factor $\frac{\alpha}{r} = 1$. 
+ if alpha is None: + alpha = r + + # The pre-trained weight $W_0$ + self.weight = nn.Parameter(torch.empty((out_features, in_features))) + # Freeze it + self.weight.requires_grad = False + + if bias: + # Bias parameter $b_0$ (also frozen) + self.bias = nn.Parameter(torch.empty(out_features)) + self.bias.requires_grad = False + else: + # No bias parameter + self.bias = None + + # scaling factor $\frac{\alpha}{r}$ + self.scaling = alpha / r + # Matrix $A \in \mathbb{R}^{r \times k}$ + self.lora_a = nn.Parameter(torch.empty((in_features, r))) + # Matrix $B \in \mathbb{R}^{d \times r}$, we keep $A$ and $B$ transposed + self.lora_b = nn.Parameter(torch.empty((r, out_features))) + + with torch.no_grad(): + # Initialize $A$ similar to a weight matrix in a normal linear layer + nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5) + # Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization + nn.init.zeros_(self.lora_b) + + def forward(self, x: torch.Tensor): + # Compute $W_0 x + b_0$ + result = nn.functional.linear(x, self.weight, bias=self.bias) + + # Add $\frac{\alpha}{r} \Delta W x = \frac{\alpha}{r} BAx$ + result += (x @ self.lora_a @ self.lora_b) * self.scaling + + # + return result + + +class Embedding(nn.Module): + """ + ## LoRA Embedding Layer + + Similar to LoRA linear layer this adds a low-rank decomposition to the pre-trained + embedding weights matrix ($W_0 \in \mathbb{R}^{d \times k}$). + + $$W_0 + \Delta W = W_0 + BA$$ + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, + r: int, alpha: int = None): + """ + + :param num_embeddings: is the number of embeddings + :param embedding_dim: is the number embedding dimensions + :param r: is the rank of the decomposition $r$ + :param alpha: is the scaling factor $\alpha$ + """ + super().__init__() + + # Set $\alpha = r$ is not provided. i.e. make the scaling factor $\frac{\alpha}{r} = 1$. 
+ if alpha is None: + alpha = r + + # The pre-trained embedding weights $W_0$ (frozen) + self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim))) + self.weight.requires_grad = False + + # scaling factor $\frac{\alpha}{r}$ + self.scaling = alpha / r + # Matrix $A \in \mathbb{R}^{r \times k}$ + self.lora_a = nn.Parameter(torch.empty((num_embeddings, r))) + # Matrix $B \in \mathbb{R}^{d \times r}$ + self.lora_b = nn.Parameter(torch.empty((r, embedding_dim))) + + with torch.no_grad(): + # Initialize $A$ with a normal distribution + nn.init.normal_(self.lora_a) + # Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization + nn.init.zeros_(self.lora_b) + + def forward(self, x: torch.Tensor): + # Compute the embeddings $W_0 \text{onehot}(x)$ + result = nn.functional.embedding(x, self.weight) + + # Add $\frac{\alpha}{r} \Delta W \text{onehot}(x) = \frac{\alpha}{r} BA \text{onehot}(x_$ + result += (nn.functional.embedding(x, self.lora_a) @ self.lora_b) * self.scaling + + # + return result diff --git a/labml_nn/lora/gpt2.py b/labml_nn/lora/gpt2.py new file mode 100644 index 00000000..a83a0276 --- /dev/null +++ b/labml_nn/lora/gpt2.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn +from transformers import AutoTokenizer +from labml_nn.lora import Linear, Embedding + +tokenizer = AutoTokenizer.from_pretrained("gpt2") + +config = { + "layer_norm_epsilon": 1e-05, + "n_embd": 768, + "n_head": 12, + "n_layer": 12, + "n_positions": 1024, + "vocab_size": 50257, + "device": "cuda" +} + + +class FFN(nn.Module): + def __init__(self, dim): + super().__init__() + self.c_fc = Linear(config['n_embd'], dim, r=32, bias=True) + self.c_proj = Linear(dim, config['n_embd'], r=32, bias=True) + self.act = nn.functional.gelu + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + return hidden_states + + +class MultiHeadAttention(nn.Module): + def __init__(self): + super().__init__() + self.embed_dim = config['n_embd'] + self.num_heads = config['n_head'] + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + + self.c_att = Linear(config['n_embd'], config['n_embd'] * 3, r=32, bias=True) + self.c_proj = Linear(config['n_embd'], config['n_embd'], r=32, bias=True) + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def forward(self, hidden_states): + batch_size, seq_length, _ = hidden_states.size() + + query, key, value = self.c_att(hidden_states).split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=True, # for the triangular mask + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, seq_length, self.embed_dim) + + attn_output = self.c_proj(attn_output) + + return attn_output + + +class Block(nn.Module): + def __init__(self): + super().__init__() + self.pre_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) + 
self.attn = MultiHeadAttention() + self.post_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) + self.ffn = FFN(config['n_embd'] * 4) + + def forward(self, hidden_states): + residual = hidden_states + hidden_states = self.pre_norm(hidden_states) + + attn_output = self.attn(hidden_states) + + hidden_states = attn_output + residual + residual = hidden_states + hidden_states = self.post_norm(hidden_states) + feed_forward_output = self.ffn(hidden_states) + hidden_states = feed_forward_output + residual + + return hidden_states + + +class GPTModel(nn.Module): + def __init__(self): + super().__init__() + + self.token_embedding = Embedding(config['vocab_size'], config['n_embd'], r=32) + self.position_embedding = Embedding(config['n_positions'], config['n_embd'], r=32) + + self.blocks = nn.ModuleList([Block() for _ in range(config['n_layer'])]) + + self.final_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) + + self.lm_head = Linear(config['n_embd'], config['vocab_size'], r=32, bias=False) + + def forward(self, input_ids): + batch_size, input_shape = input_ids.size() + + token_embeddings = self.token_embedding(input_ids) # B T C + position_ids = torch.arange(input_shape, device=config['device']) # T C + position_embeddings = self.position_embedding(position_ids) # B T C + + hidden_states = token_embeddings + position_embeddings + + for block in self.blocks: + hidden_states = block(hidden_states) + + hidden_states = self.final_norm(hidden_states) + + logits = self.lm_head(hidden_states) + + return logits diff --git a/labml_nn/lora/train.ipynb b/labml_nn/lora/train.ipynb new file mode 100644 index 00000000..68bbb7eb --- /dev/null +++ b/labml_nn/lora/train.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "import torch\n", + "from torch.optim import Adam\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "from torch.utils.data import random_split\n", + "from transformers import AutoTokenizer\n", + "\n", + "from labml import tracker, experiment\n", + "from labml_nn.lora.gpt2 import GPTModel" + ], + "id": "f072832ec9d346e1" + }, + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "source": "# !wget https://raw.github/zusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "3b1e507015ba6b81", + "metadata": {}, + "source": [ + "with open('input.txt', 'r', encoding='utf-8') as f:\n", + " text = f.read()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "ac8e51ae5bbfcae7", + "metadata": {}, + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", + "\n", + "tokens = tokenizer.encode(text, add_special_tokens=False)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "aeefcdf813e427e", + "metadata": {}, + "source": [ + "context_length = 512\n", + "batch_size = 2" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "a384b42274f008a2", + "metadata": {}, + "source": [ + "num_batches = len(tokens) // (batch_size * context_length)\n", + "tokens = tokens[:num_batches * batch_size * context_length]" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "5c4cc78ac1a02c1d", + "metadata": {}, + "source": "input_ids = 
torch.tensor(tokens).view(-1, context_length)", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "7037fd75e2161382", + "metadata": {}, + "source": [ + "dataset = TensorDataset(input_ids)\n", + "\n", + "train_ratio = 0.8\n", + "test_ratio = 0.2\n", + "\n", + "train_size = int(train_ratio * len(dataset))\n", + "test_size = len(dataset) - train_size\n", + "\n", + "train_dataset, test_dataset = random_split(dataset, [train_size, test_size])\n", + "\n", + "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "a98b7baa064b8494", + "metadata": {}, + "source": [ + "model = GPTModel()\n", + "state_dict = torch.load('transformed.pth', weights_only=True)\n", + "\n", + "_ = model.load_state_dict(state_dict, strict=False)" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "device = \"cuda\"\n", + "model = model.to(device=\"cuda\")" + ], + "id": "2e0fa8b3082df716", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "e2f5076894770740", + "metadata": {}, + "source": [ + "optimizer = Adam(model.parameters(), lr=5e-5)\n", + "criterion = torch.nn.CrossEntropyLoss()\n", + "\n", + "model.train()\n", + "epochs = 3\n", + "step = 0\n", + "\n", + "with experiment.record(name='LoRA.GPT2', app_url='http://localhost:5005/api/v1/track'):\n", + " for epoch in range(epochs):\n", + " for batch in train_dataloader:\n", + " inputs = batch[0]\n", + " inputs = inputs.to(device)\n", + " labels = inputs.clone()\n", + "\n", + " outputs = model(inputs)\n", + "\n", + " shift_logits = outputs[..., :-1, :]\n", + " shift_labels = labels[..., 1:]\n", + "\n", + " loss = criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))\n", + "\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " tracker.save(step, {'loss': loss})\n", + " step += 1\n", + " print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')\n", + "\n", + " test_loss = 0\n", + " for batch in test_dataloader:\n", + " inputs = batch[0]\n", + " inputs = inputs.to(device)\n", + " labels = inputs.clone()\n", + "\n", + " outputs = model(inputs)\n", + "\n", + " shift_logits = outputs[..., :-1, :]\n", + " shift_labels = labels[..., 1:]\n", + "\n", + " loss = criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))\n", + "\n", + " test_loss += loss.item()\n", + " test_loss /= len(test_dataloader)\n", + " tracker.save(step, {'test_loss': test_loss})\n", + "\n", + "print(\"Training complete.\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "da2d4023002648dc", + "metadata": {}, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "base" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/labml_nn/lora/transform_hf_model.py b/labml_nn/lora/transform_hf_model.py new file mode 100644 index 00000000..df53bbf2 --- /dev/null +++ 
b/labml_nn/lora/transform_hf_model.py @@ -0,0 +1,46 @@ +import torch +from transformers import AutoModelForCausalLM + + +def transform_hf_model(): + model = AutoModelForCausalLM.from_pretrained("gpt2") + + state_dict = model.state_dict() + + mapping = { + 'transformer.wte.weight': 'token_embedding.weight', + 'transformer.wpe.weight': 'position_embedding.weight', + 'transformer.ln_f.weight': 'final_norm.weight', + 'transformer.ln_f.bias': 'final_norm.bias', + 'lm_head.weight': 'lm_head.weight' + } + + for i in range(12): + mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight' + mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias' + mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.c_att.weight' + mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.c_att.bias' + mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.c_proj.weight' + mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.c_proj.bias' + mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight' + mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias' + mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.c_fc.weight' + mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.c_fc.bias' + mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.c_proj.weight' + mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.c_proj.bias' + + new_state_dict = {} + for old_key, new_key in mapping.items(): + if old_key in state_dict: + new_state_dict[new_key] = state_dict[old_key] + + # transpose weight matrices of convo 1d layers to use linear layers instead + convo_layers = ([f'blocks.{i}.ffn.c_fc.weight' for i in range(12)] + + [f'blocks.{i}.ffn.c_proj.weight' for i in range(12)] + + [f'blocks.{i}.attn.c_att.weight' for i in range(12)] + + [f'blocks.{i}.attn.c_proj.weight' for i in range(12)]) + + for layer in convo_layers: + new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1) + + torch.save(new_state_dict, 'transformed.pth') diff --git a/labml_nn/RWKV/__init__.py b/labml_nn/rwkv/__init__.py similarity index 100% rename from labml_nn/RWKV/__init__.py rename to labml_nn/rwkv/__init__.py diff --git a/labml_nn/RWKV/configs.py b/labml_nn/rwkv/configs.py similarity index 100% rename from labml_nn/RWKV/configs.py rename to labml_nn/rwkv/configs.py diff --git a/labml_nn/RWKV/experiment.py b/labml_nn/rwkv/experiment.py similarity index 97% rename from labml_nn/RWKV/experiment.py rename to labml_nn/rwkv/experiment.py index 1f99d66d..983db2c0 100644 --- a/labml_nn/RWKV/experiment.py +++ b/labml_nn/rwkv/experiment.py @@ -3,10 +3,10 @@ import math import torch import torch.nn as nn -from labml_nn.RWKV.configs import RWKVConfigs +from labml_nn.rwkv.configs import RWKVConfigs -from labml_nn.RWKV import RWKV -from labml_nn.RWKV import TimeMixing +from labml_nn.rwkv import RWKV +from labml_nn.rwkv import TimeMixing from labml import experiment from labml.configs import option from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs diff --git a/translate_cache/transformers/feed_forward.zh.json b/translate_cache/transformers/feed_forward.zh.json index 66f87871..719c685d 100644 --- a/translate_cache/transformers/feed_forward.zh.json +++ b/translate_cache/transformers/feed_forward.zh.json @@ -1,5 +1,5 @@ { - "

    Position-wise Feed-Forward Network (FFN)

    \n

    This is a PyTorch implementation of position-wise feedforward network used in transformer.

    \n

    FFN consists of two fully connected layers. Number of dimensions in the hidden layer _^_0_^_, is generally set to around four times that of the token embedding _^_1_^_. So it is sometime also called the expand-and-contract network.

    \n

    There is an activation at the hidden layer, which is usually set to ReLU (Rectified Linear Unit) activation, _^_2_^_

    \n

    That is, the FFN function is, _^_3_^_ where _^_4_^_, _^_5_^_, _^_6_^_ and _^_7_^_ are learnable parameters.

    \n

    Sometimes the GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. _^_8_^_ where _^_9_^_

    \n

    Gated Linear Units

    \n

    This is a generic implementation that supports different variants including Gated Linear Units (GLU). We have also implemented experiments on these:

    \n\n": "

    \u4f4d\u7f6e\u524d\u9988\u7f51\u7edc (FFN)

    \n

    \u8fd9\u662f Transformer \u4e2d\u4f7f\u7528\u7684\u4f4d\u7f6e\u524d\u9988\u7f51\u7edc\u7684 PyTorch \u5b9e\u73b0\u3002

    \n

    FFN \u7531\u4e24\u4e2a\u5168\u8fde\u63a5\u5c42\u7ec4\u6210\u3002\u9690\u85cf\u5c42\u4e2d\u7684\u7ef4\u5ea6\u6570_^_0_^_\u901a\u5e38\u8bbe\u7f6e\u4e3a\u6807\u8bb0\u5d4c\u5165\u7ef4\u5ea6_^_1_^_\u7684\u56db\u500d\u5de6\u53f3\u3002\u56e0\u6b64\uff0c\u5b83\u6709\u65f6\u4e5f\u88ab\u79f0\u4e3a\u6269\u5f20-\u538b\u7f29\u7f51\u7edc\u3002

    \n

    \u9690\u85cf\u5c42\u6709\u4e00\u4e2a\u6fc0\u6d3b\u51fd\u6570\uff0c\u901a\u5e38\u8bbe\u7f6e\u4e3a ReLU (Rectified Linear Unit) \u6fc0\u6d3b\u51fd\u6570\uff0c_^_2_^_

    \n

    \u5728\u6b64\u57fa\u7840\u4e0a\uff0c FFN \u51fd\u6570\u53ef\u4ee5\u5199\u4f5c\uff1a_^_3_^_\u5176\u4e2d_^_4_^__^_5_^_\u3001_^_6_^_\u548c_^_7_^_\u662f\u53ef\u5b66\u4e60\u7684\u53c2\u6570\u3002

    \n

    \u6709\u65f6\u8fd8\u4f1a\u4f7f\u7528 GELU (Gaussian Error Linear Unit) \u6fc0\u6d3b\u51fd\u6570\u6765\u4ee3\u66ff ReLU \u3002_^_8_^_\u5176\u4e2d_^_9_^_

    \n

    \u95e8\u63a7\u7ebf\u6027\u5355\u5143

    \n

    \u8fd9\u662f\u4e00\u4e2a\u901a\u7528\u5b9e\u73b0\uff0c\u652f\u6301\u5305\u62ec\u95e8\u63a7\u7ebf\u6027\u5355\u5143(GLU) \u5728\u5185\u7684\u4e0d\u540c\u53d8\u4f53\u3002\u6211\u4eec\u8fd8\u5bf9\u8fd9\u4e9b\u8fdb\u884c\u4e86\u5b9e\u9a8c\uff1a

    \n\n", + "

    Position-wise Feed-Forward Network (FFN)

    \n

    This is a PyTorch implementation of position-wise feedforward network used in transformer.

    \n

    FFN consists of two fully connected layers. Number of dimensions in the hidden layer _^_0_^_, is generally set to around four times that of the token embedding _^_1_^_. So it is sometime also called the expand-and-contract network.

    \n

    There is an activation at the hidden layer, which is usually set to ReLU (Rectified Linear Unit) activation, _^_2_^_

    \n

    That is, the FFN function is, _^_3_^_ where _^_4_^_, _^_5_^_, _^_6_^_ and _^_7_^_ are learnable parameters.

    \n

    Sometimes the GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. _^_8_^_ where _^_9_^_

    \n

    Gated Linear Units

    \n

    This is a generic implementation that supports different variants including Gated Linear Units (GLU). We have also implemented experiments on these:

    \n\n": "

    \u4f4d\u7f6e\u524d\u9988\u7f51\u7edc \uff08FFN\uff09

    \n

    \u8fd9\u662f Transformer \u4e2d\u4f7f\u7528\u7684\u4f4d\u7f6e\u524d\u9988\u7f51\u7edc\u7684 PyTorch \u5b9e\u73b0\u3002

    \n

    FFN \u7531\u4e24\u4e2a\u5168\u8fde\u63a5\u5c42\u7ec4\u6210\u3002\u9690\u85cf\u5c42\u4e2d\u7684\u7ef4\u5ea6\u6570_%5e_0_%5e_\u901a\u5e38\u8bbe\u7f6e\u4e3a\u6807\u8bb0\u5d4c\u5165\u7ef4\u5ea6_%5e_1_%5e_\u7684\u56db\u500d\u5de6\u53f3\u3002\u56e0\u6b64\uff0c\u5b83\u6709\u65f6\u4e5f\u88ab\u79f0\u4e3a\u6269\u5f20-\u538b\u7f29\u7f51\u7edc\u3002

    \n

    \u9690\u85cf\u5c42\u6709\u4e00\u4e2a\u6fc0\u6d3b\u51fd\u6570\uff0c\u901a\u5e38\u8bbe\u7f6e\u4e3a ReLU (Rectified Linear Unit) \u6fc0\u6d3b\u51fd\u6570\uff0c_%5e_2_%5e_

    \n

    \u5728\u6b64\u57fa\u7840\u4e0a\uff0c FFN \u51fd\u6570\u53ef\u4ee5\u5199\u4f5c\uff1a_%5e_3_%5e_\u5176\u4e2d_%5e_4_%5e__%5e_5_%5e_\u3001_%5e_6_%5e_\u548c_%5e_7_%5e_\u662f\u53ef\u5b66\u4e60\u7684\u53c2\u6570\u3002

    \n

    \u6709\u65f6\u8fd8\u4f1a\u4f7f\u7528 GELU (Gaussian Error Linear Unit) \u6fc0\u6d3b\u51fd\u6570\u6765\u4ee3\u66ff ReLU \u3002_%5e_8_%5e_\u5176\u4e2d_%5e_9_%5e_

    \n

    \u95e8\u63a7\u7ebf\u6027\u5355\u5143

    \n

    \u8fd9\u662f\u4e00\u4e2a\u901a\u7528\u5b9e\u73b0\uff0c\u652f\u6301\u5305\u62ec\u95e8\u63a7\u7ebf\u6027\u5355\u5143(GLU) \u5728\u5185\u7684\u4e0d\u540c\u53d8\u4f53\u3002\u6211\u4eec\u8fd8\u5bf9\u8fd9\u4e9b\u8fdb\u884c\u4e86\u5b9e\u9a8c\uff1a

    \n\n", "

    FFN module

    \n": "

    FFN \u6a21\u5757

    \n", "

    _^_0_^_

    \n": "

    _^_0_^_

    \n", "

    _^_0_^_ or _^_1_^_ depending on whether it is gated

    \n": "

    \u6839\u636e\u662f\u5426\u8fdb\u884c\u95e8\u63a7\uff0c\u8fd4\u56de_^_0_^_\u6216\u8005_^_1_^_

    \n",