From f3465ac926bf3351a923b3fb1f2a0206439a21b7 Mon Sep 17 00:00:00 2001 From: Varuna Jayasiri Date: Fri, 16 Aug 2024 16:35:25 +0530 Subject: [PATCH] Chineese translation --- docs/zh/RWKV/configs.html | 8 +- docs/zh/RWKV/experiment.html | 14 +- docs/zh/RWKV/index.html | 8 +- docs/zh/gan/wasserstein/index.html | 10 +- docs/zh/lora/experiment.html | 480 ++++++++++++++++ docs/zh/lora/gpt2.html | 493 ++++++++++++++++ docs/zh/lora/index.html | 534 ++++++++++++++++++ docs/zh/sitemap.xml | 35 +- docs/zh/transformers/aft/index.html | 16 +- docs/zh/transformers/aft/readme.html | 18 +- docs/zh/transformers/configs.html | 2 +- .../zh/transformers/label_smoothing_loss.html | 2 +- docs/zh/transformers/mha.html | 12 +- docs/zh/transformers/positional_encoding.html | 12 +- translate_cache/lora/__init__.zh.json | 25 + translate_cache/lora/experiment.zh.json | 24 + translate_cache/lora/gpt2.zh.json | 16 + 17 files changed, 1651 insertions(+), 58 deletions(-) create mode 100644 docs/zh/lora/experiment.html create mode 100644 docs/zh/lora/gpt2.html create mode 100644 docs/zh/lora/index.html create mode 100644 translate_cache/lora/__init__.zh.json create mode 100644 translate_cache/lora/experiment.zh.json create mode 100644 translate_cache/lora/gpt2.zh.json diff --git a/docs/zh/RWKV/configs.html b/docs/zh/RWKV/configs.html index 04044322..fa515cee 100644 --- a/docs/zh/RWKV/configs.html +++ b/docs/zh/RWKV/configs.html @@ -12,7 +12,7 @@ - + @@ -23,7 +23,7 @@ configs.py - + @@ -47,7 +47,7 @@

home - RWKV + rwkv

@@ -60,7 +60,7 @@ style="max-width:100%;"/>

- + View code on Github

diff --git a/docs/zh/RWKV/experiment.html b/docs/zh/RWKV/experiment.html index 000a0f6a..3b537402 100644 --- a/docs/zh/RWKV/experiment.html +++ b/docs/zh/RWKV/experiment.html @@ -12,7 +12,7 @@ - + @@ -23,7 +23,7 @@ experiment.py - + @@ -47,7 +47,7 @@

home - RWKV + rwkv

@@ -60,7 +60,7 @@ style="max-width:100%;"/>

- + View code on Github

@@ -78,10 +78,10 @@ 3 4import torch 5import torch.nn as nn -6from labml_nn.RWKV.configs import RWKVConfigs +6from labml_nn.rwkv.configs import RWKVConfigs 7 -8from labml_nn.RWKV import RWKV -9from labml_nn.RWKV import TimeMixing +8from labml_nn.rwkv import RWKV +9from labml_nn.rwkv import TimeMixing 10from labml import experiment 11from labml.configs import option 12from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs diff --git a/docs/zh/RWKV/index.html b/docs/zh/RWKV/index.html index 7ad11230..833ab8a3 100644 --- a/docs/zh/RWKV/index.html +++ b/docs/zh/RWKV/index.html @@ -12,7 +12,7 @@ - + @@ -23,7 +23,7 @@ Receptance Weighted Key Value (RWKV) - + @@ -47,7 +47,7 @@

home - RWKV + rwkv

@@ -60,7 +60,7 @@ style="max-width:100%;"/>

- + View code on Github

diff --git a/docs/zh/gan/wasserstein/index.html b/docs/zh/gan/wasserstein/index.html index adddc2bd..c79292e3 100644 --- a/docs/zh/gan/wasserstein/index.html +++ b/docs/zh/gan/wasserstein/index.html @@ -74,17 +74,17 @@

Wasserstein GAN (WGAN)

这是 Wasserstein GAN 的实现。

最初的 GAN 损失基于实际分布和生成分布之间的 Jensen-Shannon(JS)散度。Wasserstein GAN 则基于这两个分布之间的 Earth Mover 距离。

-

+

是所有联合分布的集合,其边际分布分别为实际分布和生成分布。

是给定联合分布下的 Earth Mover 距离(也是概率)。

-

因此,等于实际分布和生成分布之间所有联合分布中最小的 Earth Mover 距离。

+

因此,等于实际分布和生成分布之间所有联合分布中最小的 Earth Mover 距离。

本文表明,Jensen-Shannon(JS)散度以及其他衡量两个概率分布之间差异的度量并不平滑。因此,如果我们对其中一个(参数化的)概率分布做梯度下降,训练将无法收敛。

-

基于 Kantorovich-Rubinstein 对偶性,

+

基于 Kantorovich-Rubinstein 对偶性,

其中上确界取遍所有 1-Lipschitz 函数。

也就是说,它等于在所有 1-Lipschitz 函数上所能取得的最大期望差。

-

对于 K-Lipschitz 函数,

+

对于 K-Lipschitz 函数,

如果所有 K-Lipschitz 函数都可以表示为由某组参数所参数化的函数族,

-

+

如果由生成器表示,并且来自一个已知分布,

现在为了收敛我们可以通过梯度下降来最小化上述公式。
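由于本页的公式图片未在此处重现,按照 WGAN 论文的常用记号,上述推导涉及的关键公式大致如下(记号取自论文,并非本页原文):

$$W(\mathbb{P}_r, \mathbb{P}_g) = \inf_{\gamma \in \Pi(\mathbb{P}_r, \mathbb{P}_g)} \mathbb{E}_{(x, y) \sim \gamma} \big[ \Vert x - y \Vert \big]$$

$$W(\mathbb{P}_r, \mathbb{P}_g) = \sup_{\Vert f \Vert_L \le 1} \mathbb{E}_{x \sim \mathbb{P}_r}[f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g}[f(x)]$$

$$\max_{w \in \mathcal{W}} \; \mathbb{E}_{x \sim \mathbb{P}_r}[f_w(x)] - \mathbb{E}_{z \sim p(z)}[f_w(g_\theta(z))]$$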

diff --git a/docs/zh/lora/experiment.html b/docs/zh/lora/experiment.html new file mode 100644 index 00000000..c2e26052 --- /dev/null +++ b/docs/zh/lora/experiment.html @@ -0,0 +1,480 @@ + + + + + + + + + + + + + + + + + + + + + + + Finetune GPT-2 with LoRA + + + + + + + + + + +
+
+
+
+

+ home + lora +

+

+ + Github + + Twitter +

+

+ + View code on Github +

+
+
+
+
+ +

Finetune GPT-2 with LoRA

+

Here's a Colab notebook for fine-tuning GPT-2 with LoRA on the Tiny Shakespeare dataset.

+

Open In Colab
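Before the code, here is a rough sketch of how this Trainer might be launched with labml. The experiment name and the overridden values are illustrative assumptions, since the actual entry point is not shown on this page; `Trainer` refers to the class defined below.

```python
# A minimal launch sketch (assumed, not part of this file).
from labml import experiment

def main():
    # Hypothetical experiment name; any name works.
    experiment.create(name='lora_gpt2')
    conf = Trainer()
    # Override some of the default configs defined below.
    experiment.configs(conf, {'epochs': 3, 'lora_r': 32})
    conf.initialize()
    with experiment.start():
        conf.run()

if __name__ == '__main__':
    main()
```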

+ +
+
+
14import torch
+15from labml import lab, monit, tracker
+16from labml.configs import BaseConfigs, option
+17from labml.utils.download import download_file
+18from labml_helpers.device import DeviceConfigs
+19from torch.optim import Adam
+20from torch.utils.data import DataLoader, TensorDataset
+21from transformers import AutoTokenizer, AutoModelForCausalLM
+22from labml_nn.lora.gpt2 import GPTModel
+
+
+
+
+ +

Trainer configurations and the training loop

+

The default configs can and will be overridden when we start the experiment.

+ +
+
+
25class Trainer(BaseConfigs):
+
+
+
+
+ + +
+
+
31    device: torch.device = DeviceConfigs()
+
+
+
+
+ +

GPT-2 configs

+ +
+
+
34    layer_norm_epsilon: float = 1e-05
+35    n_embed: int = 768
+36    n_layer: int = 12
+37    n_positions: int = 1024
+38    vocab_size: int = 50257
+
+
+
+
+ +

Training configs

+ +
+
+
41    epochs: int = 10
+42    batch_size: int = 32
+43    learning_rate: float = 1e-4
+44    context_len: int = 512
+
+
+
+
+ +

LoRA rank

+ +
+
+
47    lora_r: int = 32
+
+
+
+
+ +

Dataset

+ +
+
+
50    text: TensorDataset = "tiny_shakespeare"
+51    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+52    model: GPTModel
+53    optimizer: torch.optim.Adam
+54    criterion = torch.nn.CrossEntropyLoss()
+55    data_loader: DataLoader
+
+
+
+
+ +

Load pre-trained GPT-2 from huggingface

+ +
+
+
57    def _load_pretrained_weights(self):
+
+
+
+
+ +

Load the huggingface model and get the parameters

+ +
+
+
63        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
+64        state_dict = hf_model.state_dict()
+
+
+
+
+ +

Transformer embedding and prediction layer parameter mapping (hf: ours)

+ +
+
+
67        mapping = {
+68            'transformer.wte.weight': 'token_embedding.weight',
+69            'transformer.wpe.weight': 'position_embedding.weight',
+70            'transformer.ln_f.weight': 'final_norm.weight',
+71            'transformer.ln_f.bias': 'final_norm.bias',
+72            'lm_head.weight': 'lm_head.weight'
+73        }
+
+
+
+
+ +

Mapping (hf: ours) of decoder layers

+ +
+
+
76        for i in range(12):
+77            mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
+78            mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
+79            mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.c_att.weight'
+80            mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.c_att.bias'
+81            mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.c_proj.weight'
+82            mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.c_proj.bias'
+83            mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
+84            mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
+85            mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.c_fc.weight'
+86            mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.c_fc.bias'
+87            mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.c_proj.weight'
+88            mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.c_proj.bias'
+
+
+
+
+ +

Move the parameters based on mapping

+ +
+
+
91        new_state_dict = {}
+92        for old_key, new_key in mapping.items():
+93            if old_key in state_dict:
+94                new_state_dict[new_key] = state_dict[old_key]
+
+
+
+
+ +

The Hugging Face GPT-2 model uses 1D convolution layers. We need to transpose those weights since we use linear layers.

+ +
+
+
97        convo_layers = ([f'blocks.{i}.ffn.c_fc.weight' for i in range(12)] +
+98                        [f'blocks.{i}.ffn.c_proj.weight' for i in range(12)] +
+99                        [f'blocks.{i}.attn.c_att.weight' for i in range(12)] +
+100                        [f'blocks.{i}.attn.c_proj.weight' for i in range(12)])
+101
+102        for layer in convo_layers:
+103            new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
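To make the transpose concrete, here is a small illustration (an assumption added for clarity, not part of the trainer): Hugging Face's GPT-2 `Conv1D` stores its weight as `(in_features, out_features)` and computes `x @ weight`, while `torch.nn.Linear` stores `(out_features, in_features)`, hence the transpose above.

```python
import torch

# Conv1D-style weight: (in_features, out_features)
w_conv1d = torch.randn(768, 3 * 768)
# Linear-style weight: (out_features, in_features)
w_linear = w_conv1d.t()

x = torch.randn(4, 768)
# Both compute the same projection.
assert torch.allclose(x @ w_conv1d, torch.nn.functional.linear(x, w_linear), atol=1e-4)
```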
+
+
+
+
+ +

Load our model

+ +
+
+
106        self.model.load_state_dict(new_state_dict, strict=False)  # state dict does not have lora weights
+
+
+
+
+ +

Initialize the model, optimizer and dataloader

+ +
+
+
108    def initialize(self):
+
+
+
+
+ +

Initialize the model

+ +
+
+
113        self.model = GPTModel(
+114            layer_norm_epsilon=self.layer_norm_epsilon,
+115            n_embd=self.n_embed,
+116            n_layer=self.n_layer,
+117            n_positions=self.n_positions,
+118            vocab_size=self.vocab_size,
+119            r=self.lora_r,
+120        )
+121        self.model.to(self.device)
+
+
+
+
+ +

Load pre-trained model weights

+ +
+
+
123        self._load_pretrained_weights()
+
+
+
+
+ +

Initialize the optimizer

+ +
+
+
126        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
+
+
+
+
+ +

Initialize the data loader

+ +
+
+
129        self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
+
+
+
+
+ +

Training loop

+ +
+
+
131    def run(self):
+
+
+
+
+ + +
+
+
136        for _ in monit.loop(self.epochs):
+137            for i, batch in monit.enum('Train', self.data_loader):
+138                inputs = batch[0]
+139                inputs = inputs.to(self.device)
+140                labels = inputs.clone()
+141
+142                outputs = self.model(inputs)
+143
+144                shift_logits = outputs[..., :-1, :]
+145                shift_labels = labels[..., 1:]
+146
+147                loss = self.criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
+148
+149                self.optimizer.zero_grad()
+150                loss.backward()
+151                self.optimizer.step()
+152
+153                tracker.add({'loss': loss})
+154
+155                tracker.save()
+156                tracker.add_global_step()
+157            tracker.new_line()
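The slicing above implements next-token prediction: the logit at position t is matched against the token at position t + 1. A minimal illustration of the alignment (added here for clarity, not part of the original file):

```python
import torch

vocab_size = 8
logits = torch.randn(1, 4, vocab_size)      # model outputs for 4 positions
labels = torch.tensor([[3, 1, 5, 2]])       # the input tokens themselves

shift_logits = logits[..., :-1, :]          # predictions at positions 0..2
shift_labels = labels[..., 1:]              # targets are tokens 1..3

loss = torch.nn.functional.cross_entropy(
    shift_logits.reshape(-1, vocab_size), shift_labels.reshape(-1))
```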
+
+
+
+
+ +

Tiny Shakespeare dataset

+

It will be downloaded from the URL if it is not present

+ +
+
+
160@option(Trainer.text)
+161def tiny_shakespeare(c: Trainer):
+
+
+
+
+ + +
+
+
167    path = lab.get_data_path() / 'tiny_shakespeare.txt'
+168    if not path.exists():
+169        download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
+170    with open(path, 'r', encoding='utf-8') as f:
+171        text = f.read()
+172
+173    tokens = c.tokenizer.encode(text)
+174    num_batches = len(tokens) // (c.batch_size * c.context_len)
+175    tokens = tokens[:num_batches * c.batch_size * c.context_len]
+176    input_ids = torch.tensor(tokens).view(-1, c.context_len)
+177    return TensorDataset(input_ids)
+
+
+ +
+ + + + \ No newline at end of file diff --git a/docs/zh/lora/gpt2.html b/docs/zh/lora/gpt2.html new file mode 100644 index 00000000..a45838c3 --- /dev/null +++ b/docs/zh/lora/gpt2.html @@ -0,0 +1,493 @@ + + + + + + + + + + + + + + + + + + + + + + + gpt2.py + + + + + + + + + + +
+
+
+
+

+ home + lora +

+

+ + Github + + Twitter +

+

+ + View code on Github +

+
+
+
+
+ + +
+
+
1import torch
+2import torch.nn as nn
+3from labml_nn.lora import Linear, Embedding
+
+
+
+
+ + +
+
+
6class FFN(nn.Module):
+
+
+
+
+ + +
+
+
7    def __init__(self, dim: int, n_embed: int, r: int):
+8        super().__init__()
+
+
+
+
+ +

The first linear layer of the FFN (c_fc)

+ +
+
+
10        self.c_fc = Linear(n_embed, dim, r=r, bias=True)
+
+
+
+
+ +

The second linear layer of the FFN (c_proj)

+ +
+
+
12        self.c_proj = Linear(dim, n_embed, r=r, bias=True)
+13        self.act = nn.functional.gelu
+
+
+
+
+ + +
+
+
15    def forward(self, hidden_states):
+16        hidden_states = self.c_fc(hidden_states)
+17        hidden_states = self.act(hidden_states)
+18        hidden_states = self.c_proj(hidden_states)
+19        return hidden_states
+
+
+
+
+ + +
+
+
22class MultiHeadAttention(nn.Module):
+
+
+
+
+ + +
+
+
23    def __init__(self, n_embed: int, r: int):
+24        super().__init__()
+25        self.embed_dim = n_embed
+26        self.num_heads = 12  # GPT-2 (small) uses 12 attention heads, so head_dim = 768 // 12 = 64
+27        self.head_dim = self.embed_dim // self.num_heads
+28        self.split_size = self.embed_dim
+
+
+
+
+ +

Combined query, key and value projection (c_att)

+ +
+
+
31        self.c_att = Linear(n_embed, n_embed * 3, r=r, bias=True)
+
+
+
+
+ +

Output projection (c_proj)

+ +
+
+
33        self.c_proj = Linear(n_embed, n_embed, r=r, bias=True)
+
+
+
+
+ +

Splits hidden_size dim into attn_head_size and num_heads

+ +
+
+
35    def _split_heads(self, tensor, num_heads, attn_head_size):
+
+
+
+
+ + +
+
+
39        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
+40        tensor = tensor.view(new_shape)
+41        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+
+
+
+
+ + +
+
+
43    def forward(self, hidden_states):
+44        batch_size, seq_length, _ = hidden_states.size()
+45
+46        query, key, value = self.c_att(hidden_states).split(self.split_size, dim=2)
+47
+48        query = self._split_heads(query, self.num_heads, self.head_dim)
+49        key = self._split_heads(key, self.num_heads, self.head_dim)
+50        value = self._split_heads(value, self.num_heads, self.head_dim)
+51
+52        attn_output = torch.nn.functional.scaled_dot_product_attention(
+53            query,
+54            key,
+55            value,
+56            attn_mask=None,
+57            dropout_p=0.0,
+58            is_causal=True,  # for the triangular mask
+59        )
+60
+61        attn_output = attn_output.transpose(1, 2).contiguous()
+62        attn_output = attn_output.view(batch_size, seq_length, self.embed_dim)
+63
+64        attn_output = self.c_proj(attn_output)
+65
+66        return attn_output
+
+
+
+
+ + +
+
+
69class Block(nn.Module):
+
+
+
+
+ + +
+
+
70    def __init__(self, n_embed: int, layer_norm_epsilon: float, r: int):
+71        super().__init__()
+72        self.pre_norm = nn.LayerNorm(n_embed, eps=layer_norm_epsilon)
+73        self.attn = MultiHeadAttention(n_embed, r)
+74        self.post_norm = nn.LayerNorm(n_embed, eps=layer_norm_epsilon)
+75        self.ffn = FFN(n_embed * 4, n_embed, r)
+
+
+
+
+ + +
+
+
77    def forward(self, hidden_states):
+78        residual = hidden_states
+79        hidden_states = self.pre_norm(hidden_states)
+80
+81        attn_output = self.attn(hidden_states)
+82
+83        hidden_states = attn_output + residual
+84        residual = hidden_states
+85        hidden_states = self.post_norm(hidden_states)
+86        feed_forward_output = self.ffn(hidden_states)
+87        hidden_states = feed_forward_output + residual
+88
+89        return hidden_states
+
+
+
+
+ + +
+
+
92class GPTModel(nn.Module):
+
+
+
+
+ + +
+
+
93    def __init__(self, layer_norm_epsilon: float, n_embd: int, n_layer: int, n_positions: int,
+94                 vocab_size: int, r: int):
+95        super().__init__()
+96
+97        self.token_embedding = Embedding(vocab_size, n_embd, r=r)
+98        self.position_embedding = Embedding(n_positions, n_embd, r=r)
+99
+100        self.blocks = nn.ModuleList([Block(n_embd, layer_norm_epsilon, r=r)
+101                                     for _ in range(n_layer)])
+102
+103        self.final_norm = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)
+104
+105        self.lm_head = Linear(n_embd, vocab_size, r=r, bias=False)
+
+
+
+
+ +
  • input_ids + has shape [batch_size, seq_len] +
+ +
+
+
107    def forward(self, input_ids: torch.Tensor):
+
+
+
+
+ + +
+
+
111        batch_size, seq_len = input_ids.shape
+
+
+
+
+ +

Get token embeddings

+ +
+
+
114        token_embeddings = self.token_embedding(input_ids)
+
+
+
+
+ +

Get position ids

+ +
+
+
116        position_ids = torch.arange(seq_len, device=input_ids.device)[None, :]
+
+
+
+
+ +

Get position embeddings

+ +
+
+
118        position_embeddings = self.position_embedding(position_ids)
+
+
+
+
+ +

Add position embeddings

+ +
+
+
121        x = token_embeddings + position_embeddings
+
+
+
+
+ +

Run through transformer blocks

+ +
+
+
124        for block in self.blocks:
+125            x = block(x)
+
+
+
+
+ +

Final normalization

+ +
+
+
128        x = self.final_norm(x)
+
+
+
+
+ +

Get logits from projection layer

+ +
+
+
130        return self.lm_head(x)
+
+
+ +
+ + + + \ No newline at end of file diff --git a/docs/zh/lora/index.html b/docs/zh/lora/index.html new file mode 100644 index 00000000..3fe09490 --- /dev/null +++ b/docs/zh/lora/index.html @@ -0,0 +1,534 @@ + + + + + + + + + + + + + + + + + + + + + + + Low-Rank Adaptation (LoRA) + + + + + + + + + + +
+
+
+
+

+ home + lora +

+

+ + Github + + Twitter +

+

+ + View code on Github +

+
+
+
+
+ +

Low-Rank Adaptation (LoRA)

+

This is an implementation of Low-Rank Adaptation (LoRA) in PyTorch.

+

Low-Rank Adaptation (LoRA) freezes pre-trained model weights and injects trainable rank decomposition matrices into each layer of the transformer. This makes it possible to efficiently fine-tune large language models by reducing the number of trainable parameters by a large factor.

+

Here's the training code for fine-tuning a GPT-2 model with LoRA on the Tiny Shakespeare dataset.

+ +
+
+
24import torch
+25import torch.nn as nn
+
+
+
+
+ +

LoRA Linear Layer

+

LoRA linear layer adds a low-rank decomposition to the pre-trained weight matrix ($W_0 \in \mathbb{R}^{d \times k}$) of the linear layer.

+

+

$W_0 + \Delta W = W_0 + BA$, where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and the rank $r \ll \min(d, k)$.

+

All parameters are frozen except $A$ and $B$.

+

$\Delta W = BA$ is initialized to be zero at the beginning of the training.

+

They multiply the low-rank update $\Delta W x$ by $\frac{\alpha}{r}$, where $\alpha$ is a hyper-parameter. Once $\alpha$ is tuned it can be kept the same when varying $r$.

+ +
+
+
28class Linear(nn.Module):
+
+
+
+
+ +
  • in_features + is the number of input features of the linear layer
  • +
  • out_features + is the number of output features of the linear layer
  • +
  • bias + is a flag indicating if there is a bias parameter
  • +
  • r + is the rank of the decomposition
  • +
  • alpha + is the scaling factor
+ +
+
+
49    def __init__(self, in_features: int, out_features: int, bias: bool,
+50                 r: int, alpha: int = None):
+
+
+
+
+ + +
+
+
58        super().__init__()
+
+
+
+
+ +

Set $\alpha = r$ if it is not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.

+ +
+
+
61        if alpha is None:
+62            alpha = r
+
+
+
+
+ +

The pre-trained weight

+ +
+
+
65        self.weight = nn.Parameter(torch.empty((out_features, in_features)))
+
+
+
+
+ +

Freeze it

+ +
+
+
67        self.weight.requires_grad = False
+68
+69        if bias:
+
+
+
+
+ +

Bias parameter (also frozen)

+ +
+
+
71            self.bias = nn.Parameter(torch.empty(out_features))
+72            self.bias.requires_grad = False
+73        else:
+
+
+
+
+ +

No bias parameter

+ +
+
+
75            self.bias = None
+
+
+
+
+ +

scaling factor

+ +
+
+
78        self.scaling = alpha / r
+
+
+
+
+ +

Matrix $A$

+ +
+
+
80        self.lora_a = nn.Parameter(torch.empty((in_features, r)))
+
+
+
+
+ +

Matrix $B$, we keep $A$ and $B$ transposed

+ +
+
+
82        self.lora_b = nn.Parameter(torch.empty((r, out_features)))
+83
+84        with torch.no_grad():
+
+
+
+
+ +

Initialize $A$ similar to a weight matrix in a normal linear layer

+ +
+
+
86            nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5)
+
+
+
+
+ +

Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization

+ +
+
+
88            nn.init.zeros_(self.lora_b)
+
+
+
+
+ + +
+
+
90    def forward(self, x: torch.Tensor):
+
+
+
+
+ +

Compute the frozen linear transformation $x W_0^T + b_0$

+ +
+
+
92        result = nn.functional.linear(x, self.weight, bias=self.bias)
+
+
+
+
+ +

Add the low-rank update $\frac{\alpha}{r} \, x \Delta W^T = \frac{\alpha}{r} \, x A^T B^T$

+ +
+
+
95        result += (x @ self.lora_a @ self.lora_b) * self.scaling
+
+
+
+
+ +

+ +
+
+
98        return result
+
+
+
+
+ +

LoRA Embedding Layer

+

Similar to the LoRA linear layer, this adds a low-rank decomposition to the pre-trained embedding weight matrix ($W_0$).

+

+ +
+
+
101class Embedding(nn.Module):
+
+
+
+
+ +
  • num_embeddings + is the number of embeddings
  • +
  • embedding_dim + is the number embedding dimensions
  • +
  • r + is the rank of the decomposition
  • +
  • alpha + is the scaling factor
+ +
+
+
111    def __init__(self, num_embeddings: int, embedding_dim: int,
+112                 r: int, alpha: int = None):
+
+
+
+
+ + +
+
+
120        super().__init__()
+
+
+
+
+ +

Set $\alpha = r$ if it is not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.

+ +
+
+
123        if alpha is None:
+124            alpha = r
+
+
+
+
+ +

The pre-trained embedding weights (frozen)

+ +
+
+
127        self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim)))
+128        self.weight.requires_grad = False
+
+
+
+
+ +

scaling factor

+ +
+
+
131        self.scaling = alpha / r
+
+
+
+
+ +

Matrix $A$

+ +
+
+
133        self.lora_a = nn.Parameter(torch.empty((num_embeddings, r)))
+
+
+
+
+ +

Matrix $B$

+ +
+
+
135        self.lora_b = nn.Parameter(torch.empty((r, embedding_dim)))
+136
+137        with torch.no_grad():
+
+
+
+
+ +

Initialize $A$ with a normal distribution

+ +
+
+
139            nn.init.normal_(self.lora_a)
+
+
+
+
+ +

Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization

+ +
+
+
141            nn.init.zeros_(self.lora_b)
+
+
+
+
+ + +
+
+
143    def forward(self, x: torch.Tensor):
+
+
+
+
+ +

Compute the embeddings

+ +
+
+
145        result = nn.functional.embedding(x, self.weight)
+
+
+
+
+ +

Add the low-rank update scaled by $\frac{\alpha}{r}$

+ +
+
+
148        result += (nn.functional.embedding(x, self.lora_a) @ self.lora_b) * self.scaling
+
+
+
+
+ +

+ +
+
+
151        return result
+
+
+ +
+ + + + \ No newline at end of file diff --git a/docs/zh/sitemap.xml b/docs/zh/sitemap.xml index 7b46859e..83188c87 100644 --- a/docs/zh/sitemap.xml +++ b/docs/zh/sitemap.xml @@ -8,7 +8,7 @@ https://nn.labml.ai/gan/wasserstein/index.html - 2023-10-24T16:30:00+00:00 + 2024-07-15T16:30:00+00:00 1.00 @@ -504,22 +504,22 @@ - https://nn.labml.ai/RWKV/configs.html - 2024-03-17T16:30:00+00:00 + https://nn.labml.ai/rwkv/configs.html + 2024-08-02T16:30:00+00:00 1.00 - https://nn.labml.ai/RWKV/index.html - 2024-03-17T16:30:00+00:00 + https://nn.labml.ai/rwkv/index.html + 2024-08-02T16:30:00+00:00 1.00 - https://nn.labml.ai/RWKV/experiment.html - 2024-03-17T16:30:00+00:00 + https://nn.labml.ai/rwkv/experiment.html + 2024-08-02T16:30:00+00:00 1.00 @@ -1294,6 +1294,27 @@ + + https://nn.labml.ai/lora/gpt2.html + 2024-08-16T16:30:00+00:00 + 1.00 + + + + + https://nn.labml.ai/lora/index.html + 2024-08-03T16:30:00+00:00 + 1.00 + + + + + https://nn.labml.ai/lora/experiment.html + 2024-08-16T16:30:00+00:00 + 1.00 + + + https://nn.labml.ai/graphs/gat/index.html 2023-10-24T16:30:00+00:00 diff --git a/docs/zh/transformers/aft/index.html b/docs/zh/transformers/aft/index.html index a390d496..0d3c9d2e 100644 --- a/docs/zh/transformers/aft/index.html +++ b/docs/zh/transformers/aft/index.html @@ -3,24 +3,24 @@ - + - - + + - + - + - - + + - 免注意的变压器 + 一种无注意力的 Transformer diff --git a/docs/zh/transformers/aft/readme.html b/docs/zh/transformers/aft/readme.html index c843bdd5..31fa59e9 100644 --- a/docs/zh/transformers/aft/readme.html +++ b/docs/zh/transformers/aft/readme.html @@ -7,20 +7,20 @@ - + - + - + - + - 免注意的变压器 + 一种无注意力的 Transformer @@ -71,10 +71,10 @@ -

一款无注意的变形金刚

-

这是 PyTorch 对无注意力的变形金刚》一文的实现。

-

本文用一种新的高效运算取代了自我注意力层,该运算的存储复杂度为O(Td),其中 T 是序列长度,是嵌入的维度。

-

本文介绍了 AFT 以及 AFT-Local 和 AFT-conv。这里我们实现了 aft-Local,它关注自回归模型中的 cloby 代币。

+

一种无注意力的 Transformer

+

这是论文 《一种无注意力的 Transformer 》PyTorch 实现。

+

这篇论文用一种新的高效操作替代了自注意力层,该运算的存储复杂度为O(Td),其中 T 是序列长度,是嵌入的维度。

+

该论文介绍了 AFT 以及 AFT-local 和 AFT-conv 。这里我们实现了 AFT-local ,它会在自回归模型中关注邻近的 token 。

diff --git a/docs/zh/transformers/configs.html b/docs/zh/transformers/configs.html index d7440d3c..83df5dfd 100644 --- a/docs/zh/transformers/configs.html +++ b/docs/zh/transformers/configs.html @@ -308,7 +308,7 @@ #

GLU 变体

-

这些是在论文 《 GLU Variants Improve Transformer 》中包含的各种带门控隐藏层的 ffn 变体。我们已按照论文规定省略了偏置项。

+

这些是在论文 《 GLU Variants Improve Transformer 》中包含的各种带门控隐藏层的 FFN 变体。我们已按照论文规定省略了偏置项。

diff --git a/docs/zh/transformers/label_smoothing_loss.html b/docs/zh/transformers/label_smoothing_loss.html index 3f2b49b0..bd267ef8 100644 --- a/docs/zh/transformers/label_smoothing_loss.html +++ b/docs/zh/transformers/label_smoothing_loss.html @@ -183,7 +183,7 @@ -

输出(预测)

+

输出预测值

diff --git a/docs/zh/transformers/mha.html b/docs/zh/transformers/mha.html index 71798236..a7a32a4a 100644 --- a/docs/zh/transformers/mha.html +++ b/docs/zh/transformers/mha.html @@ -72,7 +72,7 @@

多头注意力 (MHA)

Open In Colab

-

这是论文《 Attention is All You Need 》中多头注意力的PyTorch教程/实现。该实现的灵感来自《带注释的变形金刚》

%n

这是使用基础 Transformer 和 MHA 进行 NLP 自回归的训练代码

%n

这是一个训练简单transformer的代码实现

+

这是论文《 Attention is All You Need 》中多头注意力的PyTorch教程/实现。该实现的灵感来自《带注释的 Transformer 》

这是使用基础 Transformer 和 MHA 进行 NLP 自回归的训练代码

这是一个训练简单 Transformer 的代码实现

@@ -116,7 +116,7 @@ -

线性层用于线性变换/p> +

线性层用于线性变换

@@ -234,7 +234,7 @@ s-225.272,467,-225.272,467s-235,486,-235,486c-2.7,4.7,-9,7,-19,7 c-6,0,-10,-1,-12,-3s-194,-422,-194,-422s-65,47,-65,47z M834 80h400000v40h-400000z">QK)V

简单来说,它会找到与查询 (Query) 匹配的键 (Key),并获取这些键所对应的值 (Value)。

-

它使用查询和键的点积作为衡量它们之间匹配程度的指标。在进行 softmax 之前,点积会先除以 $\sqrt{d_k}$ 进行缩放。

@@ -530,7 +530,7 @@ M834 80h400000v40h-400000z">这将得到一个形状为[seq_len, seq_len, batch_size, heads] +

计算注意力分数,这将得到一个形状为[seq_len, seq_len, batch_size, heads] 的张量。

@@ -579,7 +579,7 @@ M834 80h400000v40h-400000z">