diff --git a/docs/zh/index.html b/docs/zh/index.html index 322f4bf7..2d3bd7ee 100644 --- a/docs/zh/index.html +++ b/docs/zh/index.html @@ -101,6 +101,7 @@
Here's a Colab notebook for fine-tuning GPT-2 with LoRA on the Tiny Shakespeare dataset.

import torch
from labml import lab, monit, tracker
from labml.configs import BaseConfigs, option
from labml.utils.download import download_file
from labml_helpers.device import DeviceConfigs
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from labml_nn.lora.gpt2 import GPTModel


class Trainer(BaseConfigs):
    # The default configs can and will be over-ridden when we start the experiment
    device: torch.device = DeviceConfigs()

    # GPT-2 configs
    layer_norm_epsilon: float = 1e-05
    d_model: int = 768
    n_layers: int = 12
    n_heads: int = 12
    n_positions: int = 1024
    vocab_size: int = 50257

    # Training configs
    epochs: int = 10
    batch_size: int = 32
    learning_rate: float = 1e-4
    context_len: int = 512

    # LoRA rank
    lora_r: int = 32

    # Dataset
    text: TensorDataset = "tiny_shakespeare"
    # Huggingface tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Model
    model: GPTModel
    # Optimizer
    optimizer: torch.optim.Adam
    # Cross entropy loss
    loss_func = torch.nn.CrossEntropyLoss()
    # Dataloader
    data_loader: DataLoader

    def _load_pretrained_weights(self):
        # Load the huggingface model and get the parameters
        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
        state_dict = hf_model.state_dict()

        # Transformer embedding and prediction layer parameter mapping (hf: ours)
        mapping = {
            'transformer.wte.weight': 'token_embedding.weight',
            'transformer.wpe.weight': 'position_embedding.weight',
            'transformer.ln_f.weight': 'final_norm.weight',
            'transformer.ln_f.bias': 'final_norm.bias',
            'lm_head.weight': 'lm_head.weight'
        }

        # Mapping (hf: ours) of decoder layers
        for i in range(12):
            mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.attn_norm.weight'
            mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.attn_norm.bias'
            mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.qkv_projection.weight'
            mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.qkv_projection.bias'
            mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.output_projection.weight'
            mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.output_projection.bias'
            mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.ffn_norm.weight'
            mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.ffn_norm.bias'
            mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.linear_in.weight'
            mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.linear_in.bias'
            mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.linear_out.weight'
            mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.linear_out.bias'

        # Move the parameters based on mapping
        new_state_dict = {}
        for old_key, new_key in mapping.items():
            if old_key in state_dict:
                new_state_dict[new_key] = state_dict[old_key]

        # GPT-2 hugging face uses 1D Convolution layers. We need to transpose those weights
        # since we use linear layers
        convo_layers = ([f'blocks.{i}.ffn.linear_in.weight' for i in range(12)] +
                        [f'blocks.{i}.ffn.linear_out.weight' for i in range(12)] +
                        [f'blocks.{i}.attn.qkv_projection.weight' for i in range(12)] +
                        [f'blocks.{i}.attn.output_projection.weight' for i in range(12)])

        for layer in convo_layers:
            new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)

        # Load our model. We use strict = False because the state dict does not have LoRA weights
        missing_keys, unexpected_keys = self.model.load_state_dict(new_state_dict, strict=False)

        # Make sure that the only missing keys are LoRA weights
        assert all('lora' in key for key in missing_keys)
        # and that there are no unexpected keys
        assert not unexpected_keys
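As an aside, here is a standalone sketch (assuming the transformers package is available; the ToyQKV module is purely illustrative, not the library's code) of the two checkpoint details relied on above: Hugging Face GPT-2 stores its projections as Conv1D modules whose weight shape is [in_features, out_features], the transpose of nn.Linear, and load_state_dict(strict=False) reports parameters that are absent from the checkpoint, which here should only be the LoRA factors.

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM

hf_state = AutoModelForCausalLM.from_pretrained("gpt2").state_dict()
w = hf_state['transformer.h.0.attn.c_attn.weight']
print(w.shape)  # torch.Size([768, 2304]); an equivalent nn.Linear weight is [2304, 768]


class ToyQKV(nn.Module):
    """Illustrative stand-in for a fused QKV projection with LoRA factors."""

    def __init__(self):
        super().__init__()
        self.qkv_projection = nn.Linear(768, 3 * 768, bias=True)
        # Extra low-rank parameters that the GPT-2 checkpoint does not contain
        self.lora_a = nn.Parameter(torch.zeros(32, 768))
        self.lora_b = nn.Parameter(torch.zeros(3 * 768, 32))


toy = ToyQKV()
state = {'qkv_projection.weight': w.T,  # transpose the Conv1D weight for nn.Linear
         'qkv_projection.bias': hf_state['transformer.h.0.attn.c_attn.bias']}
missing, unexpected = toy.load_state_dict(state, strict=False)
print(missing)     # ['lora_a', 'lora_b'] -> only the LoRA weights are absent
print(unexpected)  # []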
    def initialize(self):
        # Initialize the GPT2 model
        self.model = GPTModel(
            layer_norm_epsilon=self.layer_norm_epsilon,
            d_model=self.d_model,
            n_layers=self.n_layers,
            n_heads=self.n_heads,
            n_positions=self.n_positions,
            vocab_size=self.vocab_size,
            r=self.lora_r,
        )
        self.model.to(self.device)
        # Load pre-trained model weights
        self._load_pretrained_weights()

        # Initialize the optimizer
        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
        # Initialize the data loader
        self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
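As a side note (a toy sketch, not part of the Trainer), the point of LoRA is that only the low-rank factors are trainable, so the trainable-parameter count stays small; a helper like the hypothetical count_parameters below makes that visible on any module.

import torch
import torch.nn as nn


def count_parameters(module: nn.Module):
    # Tally trainable vs. total parameter counts from the requires_grad flags
    trainable = sum(p.numel() for p in module.parameters() if p.requires_grad)
    total = sum(p.numel() for p in module.parameters())
    return trainable, total


# Toy stand-in: a frozen "pre-trained" projection plus small LoRA factors
toy = nn.Module()
toy.weight = nn.Parameter(torch.randn(768, 768), requires_grad=False)
toy.lora_a = nn.Parameter(torch.zeros(32, 768))
toy.lora_b = nn.Parameter(torch.zeros(768, 32))
print(count_parameters(toy))  # (49152, 638976) -> roughly 8% of the weights train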
    def run(self):
        for _ in monit.loop(self.epochs):
            # inputs has shape [batch_size, seq_len]
            for (inputs,) in monit.iterate('Train', self.data_loader):
                # Move inputs to device
                inputs = inputs.to(self.device)
                # Call the model with all but the last token
                logits = self.model(inputs[:, :-1])
                # Get cross entropy loss
                loss = self.loss_func(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))

                # Make gradients 0
                self.optimizer.zero_grad()
                # Compute gradients
                loss.backward()
                # Optimize
                self.optimizer.step()

                # Log the loss
                tracker.save({'loss': loss})
                tracker.add_global_step()

            tracker.new_line()
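The loss above is the standard shift-by-one language-modeling objective: the logits at position t are scored against the token at position t + 1. A toy illustration (the vocabulary size and shapes are made up):

import torch

vocab_size, batch_size, seq_len = 11, 2, 6
inputs = torch.randint(0, vocab_size, (batch_size, seq_len))
# Stand-in for model(inputs[:, :-1]): one prediction per input position
logits = torch.randn(batch_size, seq_len - 1, vocab_size)

loss_func = torch.nn.CrossEntropyLoss()
# Position t predicts token t + 1, so the targets are the inputs shifted left by one
loss = loss_func(logits.reshape(-1, vocab_size), inputs[:, 1:].reshape(-1))
print(loss.item())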
# Tiny Shakespeare dataset. It will download from the url if not present.
@option(Trainer.text)
def tiny_shakespeare(c: Trainer):
    path = lab.get_data_path() / 'tiny_shakespeare.txt'
    if not path.exists():
        download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = c.tokenizer.encode(text)
    num_batches = len(tokens) // (c.batch_size * c.context_len)
    tokens = tokens[:num_batches * c.batch_size * c.context_len]
    input_ids = torch.tensor(tokens).view(-1, c.context_len)
    return TensorDataset(input_ids)
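To make the reshaping above concrete, here is a toy walk-through (the token values are made up) that also shows why the training loop unpacks batches as (inputs,): each item of a TensorDataset is a 1-tuple.

import torch
from torch.utils.data import DataLoader, TensorDataset

tokens = list(range(100))                                  # stand-in for c.tokenizer.encode(text)
batch_size, context_len = 2, 8
num_batches = len(tokens) // (batch_size * context_len)    # 6
tokens = tokens[:num_batches * batch_size * context_len]   # keep 96 tokens
input_ids = torch.tensor(tokens).view(-1, context_len)     # shape [12, 8]

data_loader = DataLoader(TensorDataset(input_ids), batch_size=batch_size, shuffle=True)
for (inputs,) in data_loader:
    print(inputs.shape)  # torch.Size([2, 8])
    break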
Here's the training code for training a GPT-2 model with LoRA on the Tiny Shakespeare dataset.

import torch
import torch.nn as nn

from labml_nn.lora import Linear, Embedding


class FFN(nn.Module):
    # d_model is the number of dimensions
    # d_ff is the size of the hidden dimension
    # r is the lora rank
    def __init__(self, d_model: int, d_ff: int, r: int):
        super().__init__()
        # The linear layers and the activation
        self.linear_in = Linear(d_model, d_ff, r=r, bias=True)
        self.linear_out = Linear(d_ff, d_model, r=r, bias=True)
        self.act = nn.GELU()

    # x is the embeddings tensor with shape [batch_size, seq_len, d_model]
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear_in(x)
        x = self.act(x)
        x = self.linear_out(x)
        return x
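A quick shape check of the position-wise feed-forward network, with plain nn.Linear standing in for the LoRA Linear layers and toy sizes; the hidden size is 4 * d_model, as the Block below uses.

import torch
import torch.nn as nn

d_model, d_ff = 8, 4 * 8
linear_in, linear_out, act = nn.Linear(d_model, d_ff), nn.Linear(d_ff, d_model), nn.GELU()

x = torch.randn(2, 5, d_model)       # [batch_size, seq_len, d_model]
y = linear_out(act(linear_in(x)))
print(y.shape)                       # torch.Size([2, 5, 8])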
class MultiHeadAttention(nn.Module):
    # d_model is the number of dimensions in the embeddings
    # n_heads is the number of heads
    # r is the lora rank
    def __init__(self, d_model: int, n_heads: int, r: int):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads

        # Linear transformation for QKV
        self.qkv_projection = Linear(d_model, d_model * 3, r=r, bias=True)
        # Output projection
        self.output_projection = Linear(d_model, d_model, r=r, bias=True)

    def _split_heads(self, x: torch.Tensor):
        # Split last dimension to [n_heads, d_head]
        x = x.view(x.shape[:-1] + (self.n_heads, self.d_head))
        # Reorder to [batch_size, head, seq_length, d_head]
        return x.permute(0, 2, 1, 3)

    # x is the embeddings tensor with shape [batch_size, seq_len, d_model]
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, seq_length, _ = x.shape

        # Get query, key and value
        q, k, v = self.qkv_projection(x).split(self.d_model, dim=-1)

        # Transform them from shape [batch_size, seq_len, d_model] to [batch_size, head, seq_length, d_head]
        q = self._split_heads(q)
        k = self._split_heads(k)
        v = self._split_heads(v)

        # Apply causal attention
        attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)

        # Transform back from shape [batch_size, head, seq_length, d_head] to [batch_size, seq_len, d_model]
        attn_output = attn_output.permute(0, 2, 1, 3).reshape(batch_size, seq_length, self.d_model)

        # Output projection
        return self.output_projection(attn_output)
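The split/merge bookkeeping above is easiest to see with toy sizes. A standalone sketch with plain tensors (no LoRA layers):

import torch

batch_size, seq_len, d_model, n_heads = 2, 5, 8, 4
d_head = d_model // n_heads
qkv = torch.randn(batch_size, seq_len, 3 * d_model)   # stand-in for qkv_projection(x)

q, k, v = qkv.split(d_model, dim=-1)


def split_heads(t):
    # [batch_size, seq_len, d_model] -> [batch_size, n_heads, seq_len, d_head]
    return t.view(batch_size, seq_len, n_heads, d_head).permute(0, 2, 1, 3)


q, k, v = split_heads(q), split_heads(k), split_heads(v)

# is_causal=True applies the triangular mask, so position t only attends to positions <= t
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
out = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, d_model)
print(out.shape)  # torch.Size([2, 5, 8])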
class Block(nn.Module):
    # d_model is the number of dimensions in the embeddings
    # n_heads is the number of heads
    # layer_norm_epsilon is the layer norm epsilon
    # r is the lora rank
    def __init__(self, d_model: int, n_heads: int, layer_norm_epsilon: float, r: int):
        super().__init__()
        # Attention pre-normalization layer
        self.attn_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
        # Attention layer
        self.attn = MultiHeadAttention(d_model, n_heads, r)
        # FFN pre-normalization layer
        self.ffn_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
        # Feed-forward network
        self.ffn = FFN(d_model, d_model * 4, r)

    # x is the embeddings tensor with shape [batch_size, seq_len, d_model]
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Attention
        x = x + self.attn(self.attn_norm(x))
        # FFN
        x = x + self.ffn(self.ffn_norm(x))

        return x
class GPTModel(nn.Module):
    # d_model is the number of dimensions in the embeddings
    # n_heads is the number of attention heads
    # n_layers is the number of decoder layers
    # n_positions is the number of positional embeddings
    # layer_norm_epsilon is the layer norm epsilon
    # vocab_size is the vocabulary size
    # r is the lora rank
    def __init__(self, *, d_model: int,
                 n_heads: int, n_layers: int,
                 n_positions: int,
                 layer_norm_epsilon: float,
                 vocab_size: int, r: int):
        super().__init__()

        # Token and absolute positional embeddings
        self.token_embedding = Embedding(vocab_size, d_model, r=r)
        self.position_embedding = Embedding(n_positions, d_model, r=r)

        # Decoder blocks
        self.blocks = nn.ModuleList([Block(d_model, n_heads, layer_norm_epsilon, r=r)
                                     for _ in range(n_layers)])

        # Final layer norm
        self.final_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
        # Projection layer to logit space
        self.lm_head = Linear(d_model, vocab_size, r=r, bias=False)

    # input_ids has shape [batch_size, seq_len]
    def forward(self, input_ids: torch.Tensor):
        batch_size, seq_len = input_ids.shape

        # Get token embeddings
        token_embeddings = self.token_embedding(input_ids)
        # Get position ids
        position_ids = torch.arange(seq_len, device=input_ids.device)[None, :]
        # Get position embeddings
        position_embeddings = self.position_embedding(position_ids)

        # Add position embeddings
        x = token_embeddings + position_embeddings

        # Run through transformer blocks
        for block in self.blocks:
            x = block(x)

        # Final normalization
        x = self.final_norm(x)
        # Get logits from projection layer
        return self.lm_head(x)
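A usage sketch for the model above, assuming the labml_nn package is installed; the keyword arguments mirror the constructor and the GPT-2 sizes used by the trainer. The weights are untrained here, so this only checks shapes.

import torch
from labml_nn.lora.gpt2 import GPTModel

model = GPTModel(d_model=768, n_heads=12, n_layers=12, n_positions=1024,
                 layer_norm_epsilon=1e-5, vocab_size=50257, r=32)
input_ids = torch.randint(0, 50257, (1, 16))   # [batch_size, seq_len]
logits = model(input_ids)
print(logits.shape)  # torch.Size([1, 16, 50257])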
LoRA linear layer adds a low-rank decomposition to the pre-trained weight matrix ($W_0 \in \mathbb{R}^{d \times k}$) of the linear layer:

$W_0 + \Delta W = W_0 + BA$

where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and the rank $r \ll \min(d, k)$.

All parameters are frozen except $A$ and $B$. $\Delta W$ is initialized to be zero at the beginning of the training. They multiply $\Delta W x$ by $\frac{\alpha}{r}$, where $\alpha$ is a hyper-parameter. Once $\alpha$ is tuned it can be kept the same when varying $r$.

bias is a flag indicating if there is a bias parameter, r is the rank of the decomposition, and alpha is the scaling factor. Set $\alpha = r$ if $\alpha$ is not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.

The low-rank factors:

self.lora_a = nn.Parameter(torch.empty((r, in_features)))
self.lora_b = nn.Parameter(torch.empty((out_features, r)))

Inside a torch.no_grad() block, $A$ is initialized similar to a weight matrix in a normal linear layer, and $B$ is initialized to $0$ so that $\Delta W = BA$ is $0$ at initialization.

The forward computation adds the scaled low-rank update to the frozen linear output:

result += (x @ self.lora_a.T @ self.lora_b.T) * self.scaling
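A minimal sketch of the LoRA update itself, using plain tensors with the same layout as above (lora_a: [r, in_features], lora_b: [out_features, r]) rather than the library's Linear class:

import torch
import torch.nn as nn

in_features, out_features, r, alpha = 16, 8, 4, 4
scaling = alpha / r                               # equals 1 when alpha defaults to r

weight = torch.randn(out_features, in_features)   # frozen pre-trained W0
lora_a = torch.randn(r, in_features) * 0.01       # A, trained
lora_b = torch.zeros(out_features, r)             # B, zero at initialization

x = torch.randn(2, in_features)
result = nn.functional.linear(x, weight)                 # frozen path
result = result + (x @ lora_a.T @ lora_b.T) * scaling    # LoRA path; zero until B is updated
print(result.shape)  # torch.Size([2, 8])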
Similar to the LoRA linear layer, this adds a low-rank decomposition to the pre-trained embedding weights matrix ($W_0$):

$W_0 + \Delta W = W_0 + BA$

embedding_dim is the number of embedding dimensions, r is the rank of the decomposition, and alpha is the scaling factor. Set $\alpha = r$ if $\alpha$ is not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.

The pre-trained embedding weights $W_0$ are kept frozen, and the low-rank factors are:

self.lora_a = nn.Parameter(torch.empty((r, num_embeddings)))
self.lora_b = nn.Parameter(torch.empty((embedding_dim, r)))

Inside a torch.no_grad() block, $A$ is initialized with a normal distribution, and $B$ is initialized to $0$ so that $\Delta W$ is $0$ at initialization.

The forward computation adds the scaled low-rank update to the frozen embedding lookup:

result += (nn.functional.embedding(x, self.lora_a.T) @ self.lora_b.T) * self.scaling
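And the analogous sketch for the embedding case, again with plain tensors in the layout above (lora_a: [r, num_embeddings], lora_b: [embedding_dim, r]):

import torch
import torch.nn as nn

num_embeddings, embedding_dim, r, alpha = 10, 6, 2, 2
scaling = alpha / r

weight = torch.randn(num_embeddings, embedding_dim)   # frozen pre-trained embeddings W0
lora_a = torch.randn(r, num_embeddings)               # A, initialized from a normal distribution
lora_b = torch.zeros(embedding_dim, r)                # B, zero at initialization

x = torch.tensor([[1, 3, 5]])                         # [batch_size, seq_len] of token ids
result = nn.functional.embedding(x, weight)
result = result + (nn.functional.embedding(x, lora_a.T) @ lora_b.T) * scaling
print(result.shape)  # torch.Size([1, 3, 6])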
The remaining hunks update the Chinese translation caches for these pages. Each cache entry maps an English annotation string to its cached translation; the regenerated entries are still untranslated English copies of the source text.

translate_cache/lora/experiment.zh.json drops the entries for the removed annotations ("Initialize the model", "Load out model") and adds entries for the new annotations used above, such as "Huggingface tokenizer", "Initialize the GPT2 model", "Call the model, with the all but the last token", "Get cross entropy loss", "Make gradients 0", "Compute gradients", "Optimize", "Log the loss", "Move _^_0_^_ to device", "Load out model. We use _^_0_^_ because the state does not have LoRA weights", and "make sure that only lora weights are not loaded".

diff --git a/translate_cache/lora/gpt2.zh.json b/translate_cache/lora/gpt2.zh.json
index b82c1894..39f4d522 100644
--- a/translate_cache/lora/gpt2.zh.json
+++ b/translate_cache/lora/gpt2.zh.json
@@ -1,16 +1,42 @@

translate_cache/lora/gpt2.zh.json drops the entries for the old annotations ("Splits hidden_size dim into attn_head_size and num_heads", "lin1", "lin2", "out", "qkv") and adds entries for the new ones, such as "Attention pre-normalization layer", "FFN pre-normalization layer", "Token and absolute positional embeddings", "Decoder blocks", "Final layer norm", "Projection layer to logit space", "Linear transformation for QKV", "Output projection", "Get query, key and value", "Split last dimension to _^_0_^_", "Reorder to _^_0_^_", "Transform them from shape _^_0_^_ to _^_1_^_", and "Apply causal attention".