diff --git a/docs/lora/experiment.html b/docs/lora/experiment.html
index b61116e3..1afe7106 100644
--- a/docs/lora/experiment.html
+++ b/docs/lora/experiment.html
@@ -70,7 +70,7 @@ -

Finetune GPT-2 with LoRA

+

Finetune GPT-2 with LoRA

Here's a Colab notebook for finetuning GPT-2 with LoRA on the Tiny Shakespeare dataset.

Open In Colab

@@ -165,24 +165,19 @@
-
52    text: TensorDataset = "tiny_shakespeare"
-53    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-54    model: GPTModel
-55    optimizer: torch.optim.Adam
-56    loss_func = torch.nn.CrossEntropyLoss()
-57    data_loader: DataLoader
+
52    text: TensorDataset = "tiny_shakespeare"
-
+
-

Load pre-trained GPT-2 from huggingface

+

Huggingface tokenizer

-
59    def _load_pretrained_weights(self):
+
54    tokenizer = AutoTokenizer.from_pretrained("gpt2")
@@ -190,12 +185,11 @@ -

Load the huggingface model and get the parameters

+

GPT2 model

-
65        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
-66        state_dict = hf_model.state_dict()
+
56    model: GPTModel
@@ -203,18 +197,11 @@ -

Transformer embedding and prediction layer parameter mapping (hf: ours)

+

Optimizer

-
69        mapping = {
-70            'transformer.wte.weight': 'token_embedding.weight',
-71            'transformer.wpe.weight': 'position_embedding.weight',
-72            'transformer.ln_f.weight': 'final_norm.weight',
-73            'transformer.ln_f.bias': 'final_norm.bias',
-74            'lm_head.weight': 'lm_head.weight'
-75        }
+
58    optimizer: torch.optim.Adam
@@ -222,24 +209,11 @@ -

Mapping (hf: ours) of decoder layers

+

Cross entropy loss

-
78        for i in range(12):
-79            mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
-80            mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
-81            mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.qkv_projection.weight'
-82            mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.qkv_projection.bias'
-83            mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.output_projection.weight'
-84            mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.output_projection.bias'
-85            mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
-86            mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
-87            mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.linear_in.weight'
-88            mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.linear_in.bias'
-89            mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.linear_out.weight'
-90            mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.linear_out.bias'
+
60    loss_func = torch.nn.CrossEntropyLoss()
@@ -247,32 +221,23 @@ -

Move the parameters based on the mapping

+

Dataloader

-
93        new_state_dict = {}
-94        for old_key, new_key in mapping.items():
-95            if old_key in state_dict:
-96                new_state_dict[new_key] = state_dict[old_key]
+
62    data_loader: DataLoader
-
+
-

The huggingface GPT-2 model uses 1D convolution layers. We need to transpose those weights since we use linear layers

+

Load pre-trained GPT-2 from huggingface

-
99        convo_layers = ([f'blocks.{i}.ffn.linear_in.weight' for i in range(12)] +
-100                        [f'blocks.{i}.ffn.linear_out.weight' for i in range(12)] +
-101                        [f'blocks.{i}.attn.qkv_projection.weight' for i in range(12)] +
-102                        [f'blocks.{i}.attn.output_projection.weight' for i in range(12)])
-103
-104        for layer in convo_layers:
-105            new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
+
64    def _load_pretrained_weights(self):
@@ -280,24 +245,31 @@ -

Load our model. We use strict = False because the state does not have LoRA weights

+

Load the huggingface model and get the parameters

-
108        self.model.load_state_dict(new_state_dict, strict=False)
+
70        hf_model = AutoModelForCausalLM.from_pretrained("gpt2")
+71        state_dict = hf_model.state_dict()
-
+
-

Initialize the model, optimizer and dataloader

+

Transformer embedding and prediction layer parameter mapping (hf: ours)

-
110    def initialize(self):
+
74        mapping = {
+75            'transformer.wte.weight': 'token_embedding.weight',
+76            'transformer.wpe.weight': 'position_embedding.weight',
+77            'transformer.ln_f.weight': 'final_norm.weight',
+78            'transformer.ln_f.bias': 'final_norm.bias',
+79            'lm_head.weight': 'lm_head.weight'
+80        }
@@ -305,20 +277,24 @@ -

Initialize the model

+

Mapping (hf: ours) of decoder layers

-
115        self.model = GPTModel(
-116            layer_norm_epsilon=self.layer_norm_epsilon,
-117            d_model=self.d_model,
-118            n_layers=self.n_layers,
-119            n_heads=self.n_heads,
-120            n_positions=self.n_positions,
-121            vocab_size=self.vocab_size,
-122            r=self.lora_r,
-123        )
-124        self.model.to(self.device)
+
83        for i in range(12):
+84            mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
+85            mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
+86            mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.qkv_projection.weight'
+87            mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.qkv_projection.bias'
+88            mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.output_projection.weight'
+89            mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.output_projection.bias'
+90            mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
+91            mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
+92            mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.linear_in.weight'
+93            mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.linear_in.bias'
+94            mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.linear_out.weight'
+95            mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.linear_out.bias'
@@ -326,11 +302,14 @@ -

Load pre-trained model weights

+

Move the parameters based on the mapping

-
126        self._load_pretrained_weights()
+
98        new_state_dict = {}
+99        for old_key, new_key in mapping.items():
+100            if old_key in state_dict:
+101                new_state_dict[new_key] = state_dict[old_key]
@@ -338,11 +317,17 @@ -

Initialize the optimizer

+

The huggingface GPT-2 model uses 1D convolution layers. We need to transpose those weights since we use linear layers

-
129        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
+
104        convo_layers = ([f'blocks.{i}.ffn.linear_in.weight' for i in range(12)] +
+105                        [f'blocks.{i}.ffn.linear_out.weight' for i in range(12)] +
+106                        [f'blocks.{i}.attn.qkv_projection.weight' for i in range(12)] +
+107                        [f'blocks.{i}.attn.output_projection.weight' for i in range(12)])
+108
+109        for layer in convo_layers:
+110            new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)
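As a side note on why the transpose is needed: the huggingface Conv1D module stores its weight as (in_features, out_features), while torch.nn.Linear stores (out_features, in_features). A minimal standalone sketch, using GPT-2 small's 768/3072 sizes purely for illustration:

import torch
import torch.nn.functional as F

# GPT-2 small sizes, for illustration only: d_model = 768, FFN width = 3072.
conv1d_weight = torch.randn(768, 3072)                # HF Conv1D stores (in_features, out_features)
linear_weight = torch.transpose(conv1d_weight, 0, 1)  # nn.Linear expects (out_features, in_features)

x = torch.randn(2, 768)                               # a dummy batch of activations
out_conv1d = x @ conv1d_weight                        # HF Conv1D computes x @ W (+ bias)
out_linear = F.linear(x, linear_weight)               # nn.Linear computes x @ W.T (+ bias)
assert torch.allclose(out_conv1d, out_linear, atol=1e-5)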
@@ -350,11 +335,12 @@ -

Initialize the data loader

+

Load our model. We use strict = False because the state does not have LoRA weights

-
132        self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
+
113        self.model.load_state_dict(new_state_dict, strict=False)
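To see why strict = False is enough here: load_state_dict reports the keys it could not match, and the only missing ones should be the LoRA parameters that do not exist in the pre-trained checkpoint. A hedged sketch, assuming our LoRA parameter names contain 'lora' (e.g. lora_a / lora_b; these names are illustrative, not taken from the source):

# The pre-trained checkpoint has no LoRA weights, so only those show up as missing,
# and nothing from the checkpoint should be left over as unexpected.
missing, unexpected = self.model.load_state_dict(new_state_dict, strict=False)
assert all('lora' in key for key in missing), missing
assert not unexpected, unexpected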
@@ -362,11 +348,11 @@ -

Training loop

+

Initialize the model, optimizer and dataloader

-
134    def run(self):
+
115    def initialize(self):
@@ -374,10 +360,20 @@ - +

Initialize the GPT2 model

+
-
139        for _ in monit.loop(self.epochs):
+
120        self.model = GPTModel(
+121            layer_norm_epsilon=self.layer_norm_epsilon,
+122            d_model=self.d_model,
+123            n_layers=self.n_layers,
+124            n_heads=self.n_heads,
+125            n_positions=self.n_positions,
+126            vocab_size=self.vocab_size,
+127            r=self.lora_r,
+128        )
+129        self.model.to(self.device)
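Here r is the LoRA rank, i.e. the width of the low-rank update added on top of the frozen pre-trained weights. The following is a generic sketch of a LoRA linear layer to show what that rank controls; it is not necessarily the exact implementation inside GPTModel:

import torch
import torch.nn as nn
import torch.nn.functional as F

class LoRALinear(nn.Module):
    """Generic LoRA layer: y = x W^T + (alpha / r) * x A^T B^T, with W frozen."""

    def __init__(self, in_features: int, out_features: int, r: int, alpha: int = 1):
        super().__init__()
        # Frozen pre-trained weight; in practice this comes from the checkpoint.
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02,
                                   requires_grad=False)
        # Low-rank update: A is random, B starts at zero, so training begins from
        # the unmodified pre-trained behaviour (as in the LoRA paper).
        self.lora_a = nn.Parameter(torch.randn(r, in_features) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(out_features, r))
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        frozen = F.linear(x, self.weight)
        low_rank = F.linear(F.linear(x, self.lora_a), self.lora_b)
        return frozen + self.scaling * low_rank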
@@ -385,13 +381,11 @@ -

inputs has shape [batch_size, seq_len]

+

Load pre-trained model weights

-
141            for (inputs,) in monit.iterate('Train', self.data_loader):
+
131        self._load_pretrained_weights()
@@ -399,12 +393,11 @@ -

Move inputs to device

+

Initialize the optimizer

-
143                inputs = inputs.to(self.device)
+
134        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
@@ -412,23 +405,23 @@ -

Call the model with all but the last token

+

Initialize the data loader

-
145                logits = self.model(inputs[:, :-1])
+
137        self.data_loader = DataLoader(self.text, batch_size=self.batch_size, shuffle=True)
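Because text is a TensorDataset wrapping a single tensor of token ids, every batch comes out of the DataLoader as a 1-tuple, which is why the training loop below unpacks for (inputs,) in .... A tiny sketch with made-up sizes:

import torch
from torch.utils.data import DataLoader, TensorDataset

ids = torch.arange(12).view(-1, 4)                    # 3 made-up "contexts" of length 4
loader = DataLoader(TensorDataset(ids), batch_size=2, shuffle=True)
for (inputs,) in loader:                              # each batch is a 1-tuple of tensors
    print(inputs.shape)                               # torch.Size([2, 4]), then torch.Size([1, 4])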
-
+
-

Get cross entropy loss

+

Training loop

-
147                loss = self.loss_func(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))
+
139    def run(self):
@@ -436,11 +429,10 @@ -

Zero the gradients

-
150                self.optimizer.zero_grad()
+
144        for _ in monit.loop(self.epochs):
@@ -448,11 +440,13 @@ -

Compute gradients

+

inputs has shape [batch_size, seq_len]

-
152                loss.backward()
+
146            for (inputs,) in monit.iterate('Train', self.data_loader):
@@ -460,11 +454,12 @@ -

Optimize

+

Move inputs to device

-
154                self.optimizer.step()
+
148                inputs = inputs.to(self.device)
@@ -472,12 +467,11 @@ -

Log the loss

+

Call the model with all but the last token

-
157                tracker.save({'loss': loss})
-158                tracker.add_global_step()
+
150                logits = self.model(inputs[:, :-1])
@@ -485,25 +479,23 @@ -

+

Get cross entropy loss

-
160            tracker.new_line()
+
152                loss = self.loss_func(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))
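The targets are simply the inputs shifted by one position: the logits at position t are compared with the token at position t + 1, and both are flattened because CrossEntropyLoss expects (N, C) logits and (N,) class indices. A shape sketch with hypothetical sizes:

import torch

batch_size, seq_len, vocab_size = 2, 5, 50257              # hypothetical sizes
inputs = torch.randint(0, vocab_size, (batch_size, seq_len))
logits = torch.randn(batch_size, seq_len - 1, vocab_size)  # stands in for model(inputs[:, :-1])

loss_func = torch.nn.CrossEntropyLoss()
loss = loss_func(logits.reshape(-1, vocab_size),           # (8, 50257): one row per predicted position
                 inputs[:, 1:].reshape(-1))                 # (8,): the next token at each position
print(loss)                                                 # a scalar, averaged over all positions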
-
+
-

Tiny Shakespeare dataset

-

It will be downloaded from the URL if it is not present

+

Zero the gradients

-
163@option(Trainer.text)
-164def tiny_shakespeare(c: Trainer):
+
155                self.optimizer.zero_grad()
@@ -511,20 +503,83 @@ +

Compute gradients

+
157                loss.backward()

Optimize

+
159                self.optimizer.step()

Log the loss

+
162                tracker.save({'loss': loss})
+163                tracker.add_global_step()

+
165            tracker.new_line()

Tiny Shakespeare dataset

+

It will be downloaded from the URL if it is not present

+
168@option(Trainer.text)
+169def tiny_shakespeare(c: Trainer):
-
170    path = lab.get_data_path() / 'tiny_shakespeare.txt'
-171    if not path.exists():
-172        download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
-173    with open(path, 'r', encoding='utf-8') as f:
-174        text = f.read()
-175
-176    tokens = c.tokenizer.encode(text)
-177    num_batches = len(tokens) // (c.batch_size * c.context_len)
-178    tokens = tokens[:num_batches * c.batch_size * c.context_len]
-179    input_ids = torch.tensor(tokens).view(-1, c.context_len)
-180    return TensorDataset(input_ids)
+
175    path = lab.get_data_path() / 'tiny_shakespeare.txt'
+176    if not path.exists():
+177        download_file("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", path)
+178    with open(path, 'r', encoding='utf-8') as f:
+179        text = f.read()
+180
+181    tokens = c.tokenizer.encode(text)
+182    num_batches = len(tokens) // (c.batch_size * c.context_len)
+183    tokens = tokens[:num_batches * c.batch_size * c.context_len]
+184    input_ids = torch.tensor(tokens).view(-1, c.context_len)
+185    return TensorDataset(input_ids)
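The trimming arithmetic just drops the tail of the token stream so that it splits exactly into batch_size * context_len sized chunks. A worked example with made-up numbers:

# Made-up numbers: say the tokenizer produces 338,025 tokens,
# with batch_size = 32 and context_len = 512.
tokens_len, batch_size, context_len = 338_025, 32, 512
num_batches = tokens_len // (batch_size * context_len)  # 338025 // 16384 = 20
kept = num_batches * batch_size * context_len           # 20 * 16384 = 327,680 tokens kept
# torch.tensor(tokens[:kept]).view(-1, context_len) then has shape (640, 512),
# i.e. 640 contexts of 512 tokens; the trailing 10,345 tokens are dropped.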