diff --git a/docs/lora/experiment.html b/docs/lora/experiment.html
index b61116e3..1afe7106 100644
--- a/docs/lora/experiment.html
+++ b/docs/lora/experiment.html
@@ -70,7 +70,7 @@
-Here's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.
@@ -165,24 +165,19 @@
52 text: TensorDataset = "tiny_shakespeare"
-53 tokenizer = AutoTokenizer.from_pretrained("gpt2")
-54 model: GPTModel
-55 optimizer: torch.optim.Adam
-56 loss_func = torch.nn.CrossEntropyLoss()
-57 data_loader: DataLoader
52 text: TensorDataset = "tiny_shakespeare"
59 def _load_pretrained_weights(self):
54 tokenizer = AutoTokenizer.from_pretrained("gpt2")
65 hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
-66 state_dict = hf_model.state_dict()
56 model: GPTModel
Transformer embedding and prediction layer parameter mapping (hf: ours)
Optimizer
69 mapping = {
-70 'transformer.wte.weight': 'token_embedding.weight',
-71 'transformer.wpe.weight': 'position_embedding.weight',
-72 'transformer.ln_f.weight': 'final_norm.weight',
-73 'transformer.ln_f.bias': 'final_norm.bias',
-74 'lm_head.weight': 'lm_head.weight'
-75 }
58 optimizer: torch.optim.Adam
78 for i in range(12):
-79 mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
-80 mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
-81 mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.qkv_projection.weight'
-82 mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.qkv_projection.bias'
-83 mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.output_projection.weight'
-84 mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.output_projection.bias'
-85 mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
-86 mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
-87 mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.linear_in.weight'
-88 mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.linear_in.bias'
-89 mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.linear_out.weight'
-90 mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.linear_out.bias'
60 loss_func = torch.nn.CrossEntropyLoss()
93 new_state_dict = {}
-94 for old_key, new_key in mapping.items():
-95 if old_key in state_dict:
-96 new_state_dict[new_key] = state_dict[old_key]
62 data_loader: DataLoader
Hugging Face's GPT-2 uses 1D convolution layers. We need to transpose those weights since we use linear layers.
-99 convo_layers = ([f'blocks.{i}.ffn.linear_in.weight' for i in range(12)] +
-100 [f'blocks.{i}.ffn.linear_out.weight' for i in range(12)] +
-101 [f'blocks.{i}.attn.qkv_projection.weight' for i in range(12)] +
-102 [f'blocks.{i}.attn.output_projection.weight' for i in range(12)])
-103
-104 for layer in convo_layers:
-105 new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
64 def _load_pretrained_weights(self):
Load our model. We use strict = False because the state does not have LoRA weights.
Load the Hugging Face model and get the parameters
108 self.model.load_state_dict(new_state_dict, strict=False)
70 hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
+71 state_dict = hf_model.state_dict()
Transformer embedding and prediction layer parameter mapping (hf: ours)
110 def initialize(self):
74 mapping = {
+75 'transformer.wte.weight': 'token_embedding.weight',
+76 'transformer.wpe.weight': 'position_embedding.weight',
+77 'transformer.ln_f.weight': 'final_norm.weight',
+78 'transformer.ln_f.bias': 'final_norm.bias',
+79 'lm_head.weight': 'lm_head.weight'
+80 }
115 self.model = GPTModel(
-116 layer_norm_epsilon=self.layer_norm_epsilon,
-117 d_model=self.d_model,
-118 n_layers=self.n_layers,
-119 n_heads=self.n_heads,
-120 n_positions=self.n_positions,
-121 vocab_size=self.vocab_size,
-122 r=self.lora_r,
-123 )
-124 self.model.to(self.device)
83 for i in range(12):
+84 mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
+85 mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
+86 mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.qkv_projection.weight'
+87 mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.qkv_projection.bias'
+88 mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.output_projection.weight'
+89 mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.output_projection.bias'
+90 mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
+91 mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
+92 mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.linear_in.weight'
+93 mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.linear_in.bias'
+94 mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.linear_out.weight'
+95 mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.linear_out.bias'
126 self._load_pretrained_weights()
98 new_state_dict = {}
+99 for old_key, new_key in mapping.items():
+100 if old_key in state_dict:
+101 new_state_dict[new_key] = state_dict[old_key]
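For reference, the mapping above can be derived by listing the checkpoint's parameter names and shapes. This small standalone sketch (not part of the diff) prints them; the example shapes shown are those of the standard gpt2 checkpoint:

from transformers import AutoModelForCausalLM

# List every parameter (and buffer) in the Hugging Face GPT-2 checkpoint,
# i.e. the keys on the left-hand side of the mapping above.
hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
for name, tensor in hf_model.state_dict().items():
    print(name, tuple(tensor.shape))

# e.g. transformer.wte.weight (50257, 768)
#      transformer.h.0.attn.c_attn.weight (768, 2304)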
Initialize the optimizer
+Hugging Face's GPT-2 uses 1D convolution layers. We need to transpose those weights since we use linear layers.
129 self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
104 convo_layers = ([f'blocks.{i}.ffn.linear_in.weight' for i in range(12)] +
+105 [f'blocks.{i}.ffn.linear_out.weight' for i in range(12)] +
+106 [f'blocks.{i}.attn.qkv_projection.weight' for i in range(12)] +
+107 [f'blocks.{i}.attn.output_projection.weight' for i in range(12)])
+108
+109 for layer in convo_layers:
+110 new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
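The transpose is needed because Hugging Face's Conv1D stores its weight as (in_features, out_features) and computes x @ W + b, while torch.nn.Linear stores (out_features, in_features) and computes x @ W.T + b. A minimal standalone sketch of the convention difference (the example sizes are those of the fused QKV projection, 768 -> 2304):

import torch

in_features, out_features = 768, 2304

# Conv1D-style weight: shape (in_features, out_features), applied as x @ W.
w_conv1d = torch.randn(in_features, out_features)

# Equivalent Linear layer: its weight must be the transpose of the Conv1D weight.
linear = torch.nn.Linear(in_features, out_features, bias=False)
with torch.no_grad():
    linear.weight.copy_(w_conv1d.t())

x = torch.randn(2, 10, in_features)
assert torch.allclose(x @ w_conv1d, linear(x), atol=1e-5)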
Initialize the data loader
+Load our model. We use strict = False because the state does not have LoRA weights.
132 self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
113 self.model.load_state_dict(new_state_dict, strict=False)
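With strict=False, PyTorch also stays silent about weights that are missing for other reasons, so it can be worth inspecting what load_state_dict returns. A toy standalone sketch of that return value (the Base/WithLoRA classes and the lora_ attribute names are made up for illustration):

import torch
from torch import nn

class Base(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

class WithLoRA(Base):
    def __init__(self):
        super().__init__()
        # Extra low-rank parameters that a plain checkpoint will not contain.
        self.lora_a = nn.Parameter(torch.zeros(4, 2))
        self.lora_b = nn.Parameter(torch.zeros(2, 4))

checkpoint = Base().state_dict()               # no LoRA weights, like the GPT-2 checkpoint
result = WithLoRA().load_state_dict(checkpoint, strict=False)
print(result.missing_keys)                     # ['lora_a', 'lora_b'] - only the LoRA weights
print(result.unexpected_keys)                  # [] - every checkpoint key was consumed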
134 def run(self):
115 def initialize(self):
Initialize the GPT-2 model
-139 for _ in monit.loop(self.epochs):
120 self.model = GPTModel(
+121 layer_norm_epsilon=self.layer_norm_epsilon,
+122 d_model=self.d_model,
+123 n_layers=self.n_layers,
+124 n_heads=self.n_heads,
+125 n_positions=self.n_positions,
+126 vocab_size=self.vocab_size,
+127 r=self.lora_r,
+128 )
+129 self.model.to(self.device)
inputs has shape [batch_size, seq_len]
Load pre-trained model weights
141 for (inputs,) in monit.iterate('Train', self.data_loader):
131 self._load_pretrained_weights()
143 inputs = inputs.to(self.device)
134 self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
Call the model with all but the last token
+Initialize the data loader
145 logits = self.model(inputs[:, :-1])
137 self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
147 loss = self.loss_func(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))
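Feeding inputs[:, :-1] and scoring against inputs[:, 1:] makes every position predict the following token; the reshape flattens batch and sequence dimensions so CrossEntropyLoss sees one prediction per row. A small standalone shape sketch with made-up sizes:

import torch

batch_size, seq_len, vocab_size = 2, 8, 50257
inputs = torch.randint(0, vocab_size, (batch_size, seq_len))

# Stand-in for model(inputs[:, :-1]): one distribution per shifted position.
logits = torch.randn(batch_size, seq_len - 1, vocab_size)

loss_func = torch.nn.CrossEntropyLoss()
loss = loss_func(logits.reshape(-1, vocab_size),   # (batch * (seq_len - 1), vocab)
                 inputs[:, 1:].reshape(-1))        # (batch * (seq_len - 1),)
print(loss.shape)                                  # torch.Size([]) - a scalar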
139 def run(self):
Make gradients 0
-150 self.optimizer.zero_grad()
144 for _ in monit.loop(self.epochs):
152 loss.backward()
146 for (inputs,) in monit.iterate('Train', self.data_loader):
154 self.optimizer.step()
148 inputs = inputs.to(self.device)
157 tracker.save({'loss': loss})
-158 tracker.add_global_step()
150 logits = self.model(inputs[:, :-1])
160 tracker.new_line()
152 loss = self.loss_func(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))
163 @option(Trainer.text)
-164 def tiny_shakespeare(c: Trainer):
155 self.optimizer.zero_grad()
Compute gradients
+157 loss.backward()
Optimize
+159 self.optimizer.step()
Log the loss
+162 tracker.save({'loss': loss})
+163 tracker.add_global_step()
+
165 tracker.new_line()
168 @option(Trainer.text)
+169 def tiny_shakespeare(c: Trainer):
170 path = lab.get_data_path() / 'tiny_shakespeare.txt'
-171 if not path.exists():
-172 download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
-173 with open(path, 'r', encoding='utf-8') as f:
-174 text = f.read()
-175
-176 tokens = c.tokenizer.encode(text)
-177 num_batches = len(tokens) // (c.batch_size * c.context_len)
-178 tokens = tokens[:num_batches * c.batch_size * c.context_len]
-179 input_ids = torch.tensor(tokens).view(-1, c.context_len)
-180 return TensorDataset(input_ids)
175 path = lab.get_data_path() / 'tiny_shakespeare.txt'
+176 if not path.exists():
+177 download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
+178 with open(path, 'r', encoding='utf-8') as f:
+179 text = f.read()
+180
+181 tokens = c.tokenizer.encode(text)
+182 num_batches = len(tokens) // (c.batch_size * c.context_len)
+183 tokens = tokens[:num_batches * c.batch_size * c.context_len]
+184 input_ids = torch.tensor(tokens).view(-1, c.context_len)
+185 return TensorDataset(input_ids)
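For intuition about the shapes tiny_shakespeare produces: the token stream is trimmed to a multiple of batch_size * context_len, reshaped into rows of context_len tokens, and wrapped in a TensorDataset that the DataLoader then batches. A toy standalone sketch with made-up numbers in place of the real tokenizer output:

import torch
from torch.utils.data import DataLoader, TensorDataset

batch_size, context_len = 4, 32
tokens = list(range(10_000))                       # stand-in for c.tokenizer.encode(text)

num_batches = len(tokens) // (batch_size * context_len)
tokens = tokens[:num_batches * batch_size * context_len]
input_ids = torch.tensor(tokens).view(-1, context_len)

loader = DataLoader(TensorDataset(input_ids), batch_size=batch_size, shuffle=True)
(inputs,) = next(iter(loader))
print(inputs.shape)                                # torch.Size([4, 32])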