diff --git a/docs/zh/index.html b/docs/zh/index.html
index 322f4bf7..2d3bd7ee 100644
--- a/docs/zh/index.html
+++ b/docs/zh/index.html
@@ -101,6 +101,7 @@
     • 视觉 Transformer (ViT)
     • Primer
     • 沙漏网络
+    • Low-Rank Adaptation (LoRA)
     • Eleuther GPT-neox
@@ -137,7 +137,7 @@
-    Set $\alpha = r$ if not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.
+    Set $\alpha = r$ if not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.
@@ -150,7 +150,7 @@
-    The pre-trained weight $W_0$
+    The pre-trained weight $W_0$
@@ -176,7 +176,7 @@
-    Bias parameter $b_0$ (also frozen)
+    Bias parameter $b_0$ (also frozen)
@@ -202,7 +202,7 @@
-    scaling factor $\frac{\alpha}{r}$
+    scaling factor $\frac{\alpha}{r}$
@@ -214,11 +214,11 @@
-    Matrix $A \in \mathbb{R}^{r \times k}$
+    Matrix $A \in \mathbb{R}^{r \times k}$
-    80        self.lora_a = nn.Parameter(torch.empty((in_features, r)))
+    80        self.lora_a = nn.Parameter(torch.empty((r, in_features)))
@@ -226,11 +226,11 @@
-    Matrix $B \in \mathbb{R}^{d \times r}$, we keep $A$ and $B$ transposed
+    Matrix $B \in \mathbb{R}^{d \times r}$, we keep $A$ and $B$ transposed
-    82        self.lora_b = nn.Parameter(torch.empty((r, out_features)))
+    82        self.lora_b = nn.Parameter(torch.empty((out_features, r)))
     83
     84        with torch.no_grad():
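The two shape changes above follow the LoRA factorization used throughout these docs, with $W_0 \in \mathbb{R}^{d \times k}$, $A \in \mathbb{R}^{r \times k}$, $B \in \mathbb{R}^{d \times r}$ and rank $r \ll \min(d, k)$. A short restatement of the algebra that the new `(r, in_features)` and `(out_features, r)` layouts assume:

```latex
% LoRA: frozen W_0 and b_0, trainable low-rank factors A and B
\Delta W = B A \in \mathbb{R}^{d \times k},
\qquad
h = x W_0^{\top} + b_0 + \frac{\alpha}{r}\, x (B A)^{\top}
  = x W_0^{\top} + b_0 + \frac{\alpha}{r}\, x A^{\top} B^{\top}
```

Storing `lora_a` as $A$ and `lora_b` as $B$ directly means the forward pass (next hunks) multiplies by their transposes.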
@@ -240,7 +240,7 @@
-    Initialize $A$ similar to a weight matrix in a normal linear layer
+    Initialize $A$ similar to a weight matrix in a normal linear layer
@@ -252,7 +252,7 @@
-    Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization
+    Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization
@@ -275,7 +275,7 @@
-    Compute $x W_0^{\top} + b_0$
+    Compute $x W_0^{\top} + b_0$
@@ -287,11 +287,11 @@
-    Add
+    Add $\frac{\alpha}{r}\, x A^{\top} B^{\top}$
-    95        result += (x @ self.lora_a @ self.lora_b) * self.scaling
+    95        result += (x @ self.lora_a.T @ self.lora_b.T) * self.scaling
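Assembling the changed lines, a minimal sketch of the corrected linear layer (the class name `LoRALinear` and the omission of the frozen bias and of pre-trained weight loading are simplifications for illustration, not the file's exact code):

```python
import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Frozen linear layer with a low-rank LoRA update, following the shapes in the diff."""

    def __init__(self, in_features: int, out_features: int, r: int, alpha: int = None):
        super().__init__()
        # Set alpha = r if not provided, i.e. make the scaling factor alpha / r equal to 1
        alpha = r if alpha is None else alpha
        self.scaling = alpha / r

        # Pre-trained weight W0 in nn.Linear's (out_features, in_features) layout, kept frozen;
        # in the real module this is filled from the pre-trained checkpoint
        self.weight = nn.Parameter(torch.empty(out_features, in_features), requires_grad=False)

        # A is (r, in_features) and B is (out_features, r), as in the '+' lines above
        self.lora_a = nn.Parameter(torch.empty(r, in_features))
        self.lora_b = nn.Parameter(torch.empty(out_features, r))
        with torch.no_grad():
            # A like a normal linear weight, B zero so that delta W = B A is zero at initialization
            nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5)
            nn.init.zeros_(self.lora_b)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen part x W0^T, then the scaled low-rank update x A^T B^T
        result = nn.functional.linear(x, self.weight)
        return result + (x @ self.lora_a.T @ self.lora_b.T) * self.scaling
```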
@@ -312,8 +312,8 @@
     # LoRA Embedding Layer
-    Similar to the LoRA linear layer, this adds a low-rank decomposition to the pre-trained embedding weights matrix ($W_0$).
-
+    Similar to the LoRA linear layer, this adds a low-rank decomposition to the pre-trained embedding weights matrix ($W_0$).
+
@@ -330,7 +330,7 @@
     • embedding_dim is the number of embedding dimensions
-    • r is the rank of the decomposition
+    • r is the rank of the decomposition
     • alpha is the scaling factor
@@ -356,7 +356,7 @@
-    Set $\alpha = r$ if not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.
+    Set $\alpha = r$ if not provided, i.e. make the scaling factor $\frac{\alpha}{r} = 1$.
@@ -369,7 +369,7 @@
-    The pre-trained embedding weights (frozen)
+    The pre-trained embedding weights (frozen)
@@ -382,7 +382,7 @@
-    scaling factor $\frac{\alpha}{r}$
+    scaling factor $\frac{\alpha}{r}$
@@ -394,11 +394,11 @@
-    Matrix $A$
+    Matrix $A$
-    133        self.lora_a = nn.Parameter(torch.empty((num_embeddings, r)))
+    133        self.lora_a = nn.Parameter(torch.empty((r, num_embeddings)))
@@ -406,11 +406,11 @@
-    Matrix $B$
+    Matrix $B$
-    135        self.lora_b = nn.Parameter(torch.empty((r, embedding_dim)))
+    135        self.lora_b = nn.Parameter(torch.empty((embedding_dim, r)))
     136
     137        with torch.no_grad():
@@ -420,7 +420,7 @@
-    Initialize $A$ with a normal distribution
+    Initialize $A$ with a normal distribution
@@ -432,7 +432,7 @@
-    Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization
+    Initialize $B$ to $0$ so that $\Delta W = BA$ is $0$ at initialization
@@ -455,7 +455,7 @@
-    Compute the embeddings
+    Compute the embeddings
@@ -467,11 +467,11 @@
-    Add Error
+    Add
-    148        result += (nn.functional.embedding(x, self.lora_a) @ self.lora_b) * self.scaling
+    148        result += (nn.functional.embedding(x, self.lora_a.T) @ self.lora_b.T) * self.scaling
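The embedding hunks mirror the linear fix. A minimal sketch of the corrected embedding layer under the same assumptions (illustrative class name, pre-trained weight loading omitted):

```python
import torch
import torch.nn as nn


class LoRAEmbedding(nn.Module):
    """Frozen embedding with a low-rank LoRA update, following the shapes in the diff."""

    def __init__(self, num_embeddings: int, embedding_dim: int, r: int, alpha: int = None):
        super().__init__()
        alpha = r if alpha is None else alpha
        self.scaling = alpha / r

        # Pre-trained embedding weights W0 of shape (num_embeddings, embedding_dim), kept frozen
        self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim), requires_grad=False)

        # A is (r, num_embeddings) and B is (embedding_dim, r), as in the '+' lines above
        self.lora_a = nn.Parameter(torch.empty(r, num_embeddings))
        self.lora_b = nn.Parameter(torch.empty(embedding_dim, r))
        with torch.no_grad():
            # A from a normal distribution, B zero so the update is zero at initialization
            nn.init.normal_(self.lora_a)
            nn.init.zeros_(self.lora_b)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen lookup, shape (..., embedding_dim)
        result = nn.functional.embedding(x, self.weight)
        # Low-rank update: look up rows of A^T (num_embeddings, r), then project with B^T
        return result + (nn.functional.embedding(x, self.lora_a.T) @ self.lora_b.T) * self.scaling
```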
diff --git a/docs/zh/sitemap.xml b/docs/zh/sitemap.xml
index 83188c87..c7f62e1f 100644
--- a/docs/zh/sitemap.xml
+++ b/docs/zh/sitemap.xml
@@ -1296,21 +1296,21 @@
     https://nn.labml.ai/lora/gpt2.html
-    2024-08-16T16:30:00+00:00
+    2024-08-18T16:30:00+00:00
     1.00
     https://nn.labml.ai/lora/index.html
-    2024-08-03T16:30:00+00:00
+    2024-08-23T16:30:00+00:00
     1.00
     https://nn.labml.ai/lora/experiment.html
-    2024-08-16T16:30:00+00:00
+    2024-08-23T16:30:00+00:00
     1.00
diff --git a/translate_cache/__init__.zh.json b/translate_cache/__init__.zh.json
index 5f0abee4..e3124540 100644
--- a/translate_cache/__init__.zh.json
+++ b/translate_cache/__init__.zh.json
@@ -15,6 +15,7 @@
     "\u2728 Distillation\n": "\u2728 \u84b8\u998f\n",
     "\u2728 Generative Adversarial Networks\n": "\u2728 \u751f\u6210\u5bf9\u6297\u7f51\u7edc\n",
     "\u2728 HyperNetworks - HyperLSTM\n": "\u2728 \u8d85\u7f51\u7edc-HyperLSTM\n",
+    "\u2728 Low-Rank Adaptation (LoRA)\n": "\u2728 Low-Rank Adaptation (LoRA)\n",
     "\u2728 LSTM\n": "\u2728 LSTM\n",
     "\u2728 Eleuther GPT-NeoX\n": "\u2728 Eleuther GPT-neox\n",
     "\u2728 Normalization Layers\n": "\u2728 \u5f52\u4e00\u5316\u5c42\n",
diff --git a/translate_cache/lora/experiment.zh.json b/translate_cache/lora/experiment.zh.json
index 3b488c79..11ec9a90 100644
--- a/translate_cache/lora/experiment.zh.json
+++ b/translate_cache/lora/experiment.zh.json
@@ -1,24 +1,39 @@
 {
-    "Finetune GPT-2 with LoRA\nHere's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.\n_^_0_^_\n": "Finetune GPT-2 with LoRA\nHere's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.\n_^_0_^_\n",
+    "Finetune GPT-2 with LoRA\nHere's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.\n_^_0_^_\n": "Finetune GPT-2 with LoRA\nHere's a Colab notebook for training a feedback transformer on Tiny Shakespeare dataset.\n_^_0_^_\n",
     "Trainer configurations and the training loop\nThe default configs can and will be over-ridden when we start the experiment\n": "Trainer configurations and the training loop\nThe default configs can and will be over-ridden when we start the experiment\n",
     "Initialize the model, optimizer and dataloader\n": "Initialize the model, optimizer and dataloader\n",
     "Load pre-trained GPT-2 from huggingface\n": "Load pre-trained GPT-2 from huggingface\n",
     "Tiny Shakespeare dataset\nIt will download from the url if not present\n": "Tiny Shakespeare dataset\nIt will download from the url if not present\n",
     "Training loop\n": "Training loop\n",
+    "\n": "\n",
+    "GPT2 model\n": "GPT2 model\n",
+    "_^_0_^_ has shape _^_1_^_\n": "_^_0_^_ has shape _^_1_^_\n",
+    "Call the model, with the all but the last token\n": "Call the model, with the all but the last token\n",
+    "Compute gradients\n": "Compute gradients\n",
+    "Cross entropy loss\n": "Cross entropy loss\n",
+    "Dataloader\n": "Dataloader\n",
     "Dataset\n": "Dataset\n",
     "GPT-2 configs\n": "GPT-2 configs\n",
     "GPT-2 hugging face uses 1D Convolution layers. We need to transpose those weights since we use linear layers\n": "GPT-2 hugging face uses 1D Convolution layers. We need to transpose those weights since we use linear layers\n",
+    "Get cross entropy loss\n": "Get cross entropy loss\n",
+    "Huggingface tokenizer\n": "Huggingface tokenizer\n",
+    "Initialize the GPT2 model\n": "Initialize the GPT2 model\n",
     "Initialize the data loader\n": "Initialize the data loader\n",
-    "Initialize the model\n": "Initialize the model\n",
     "Initialize the optimizer\n": "Initialize the optimizer\n",
     "LoRA rank\n": "LoRA rank\n",
-    "Load out model\n": "Load out model\n",
+    "Load out model. We use _^_0_^_ because the state does not have LoRA weights\n": "Load out model. We use _^_0_^_ because the state does not have LoRA weights\n",
     "Load pre-trained model weights\n": "Load pre-trained model weights\n",
     "Load the huggingface model and get the parameters\n": "Load the huggingface model and get the parameters\n",
+    "Log the loss\n": "Log the loss\n",
+    "Make gradients 0\n": "Make gradients 0\n",
     "Mapping (_^_0_^_) of decoder layers\n": "Mapping (_^_0_^_) of decoder layers\n",
+    "Move _^_0_^_ to device\n": "Move _^_0_^_ to device\n",
     "Move the parameters based on mapping\n": "Move the parameters based on mapping\n",
+    "Optimize\n": "Optimize\n",
+    "Optimizer\n": "Optimizer\n",
     "Training configs\n": "Training configs\n",
     "Transformer embedding and prediction layer parameter mapping (_^_0_^_)\n": "Transformer embedding and prediction layer parameter mapping (_^_0_^_)\n",
+    "make sure that only lora weights are not loaded\n": "make sure that only lora weights are not loaded\n",
     "Finetune GPT-2 with LoRA": "Finetune GPT-2 with LoRA",
     "This is training code with notes for fine-tuning pre-trained GPT-2 model with LoRA.": "This is training code with notes for fine-tuning pre-trained GPT-2 model with LoRA."
 }
\ No newline at end of file

diff --git a/translate_cache/lora/gpt2.zh.json b/translate_cache/lora/gpt2.zh.json
index b82c1894..39f4d522 100644
--- a/translate_cache/lora/gpt2.zh.json
+++ b/translate_cache/lora/gpt2.zh.json
@@ -1,16 +1,42 @@
 {
-    "Splits hidden_size dim into attn_head_size and num_heads\n": "Splits hidden_size dim into attn_head_size and num_heads\n",
+    "GPT-2 with LoRA modules\nHere's the training code for training a GPT2 model with LoRA on Tiny Shakespeare dataset.\n": "GPT-2 with LoRA modules\nHere's the training code for training a GPT2 model with LoRA on Tiny Shakespeare dataset.\n",
+    "GPT2 Model\n": "GPT2 Model\n",
+    "Decoder block\n": "Decoder block\n",
+    "Feedforward Network\n": "Feedforward Network\n",
+    "Multi-Head Attention\n": "Multi-Head Attention\n",
     "Add position embeddings\n": "Add position embeddings\n",
+    "Apply causal attention\n": "Apply causal attention\n",
+    "Attention\n": "Attention\n",
+    "Attention layer\n": "Attention layer\n",
+    "Attention pre-normalization layer\n": "Attention pre-normalization layer\n",
+    "Decoder blocks\n": "Decoder blocks\n",
+    "FFN\n": "FFN\n",
+    "FFN pre-normalization layer\n": "FFN pre-normalization layer\n",
+    "Feed-forward network\n": "Feed-forward network\n",
+    "Final layer norm\n": "Final layer norm\n",
     "Final normalization\n": "Final normalization\n",
+    "Final project\n": "Final project\n",
     "Get logits from projection layer\n": "Get logits from projection layer\n",
     "Get position embeddings\n": "Get position embeddings\n",
     "Get position ids\n": "Get position ids\n",
+    "Get query, key and value\n": "Get query, key and value\n",
     "Get token embeddings\n": "Get token embeddings\n",
+    "Linear transformation for QKV\n": "Linear transformation for QKV\n",
+    "Output projection\n": "Output projection\n",
+    "Projection layer to logit space\n": "Projection layer to logit space\n",
+    "Reorder to _^_0_^_\n": "Reorder to _^_0_^_\n",
     "Run through transformer blocks\n": "Run through transformer blocks\n",
-    "lin1\n": "lin1\n",
-    "lin2\n": "lin2\n",
-    "out\n": "out\n",
-    "qkv\n": "qkv\n",
+    "Split last dimension to _^_0_^_\n": "Split last dimension to _^_0_^_\n",
+    "The linear layers and the activation\n": "The linear layers and the activation\n",
+    "Token and absolute positional embeddings\n": "Token and absolute positional embeddings\n",
+    "Transform them from shape _^_0_^_ to _^_1_^_\n": "Transform them from shape _^_0_^_ to _^_1_^_\n",
     "\n": "\n",
-    "gpt2.py": "gpt2.py"
+    "\n": "\n",
+    "\n": "\n",
+    "\n": "\n",
+    "\n": "\n",
+    "\n": "\n",
+    "\n": "\n",
+    "GPT-2 implementation with LoRA modules": "GPT-2 implementation with LoRA modules",
+    "GPT-2 with LoRA": "GPT-2 with LoRA"
 }
\ No newline at end of file