diff --git a/docs/RWKV/configs.html b/docs/RWKV/configs.html
index 3780bb86..463c144a 100644
--- a/docs/RWKV/configs.html
+++ b/docs/RWKV/configs.html
@@ -12,7 +12,7 @@
@@ -23,7 +23,7 @@
This is an implementation of Wasserstein GAN.
The original GAN loss is based on the Jensen-Shannon (JS) divergence between the real distribution $\mathbb{P}_r$ and the generated distribution $\mathbb{P}_g$. The Wasserstein GAN is based on the Earth Mover distance between these distributions.

$$W(\mathbb{P}_r, \mathbb{P}_g) = \inf_{\gamma \in \Pi(\mathbb{P}_r, \mathbb{P}_g)} \mathbb{E}_{(x, y) \sim \gamma} \Vert x - y \Vert$$

$\Pi(\mathbb{P}_r, \mathbb{P}_g)$ is the set of all joint distributions whose marginal probabilities are $\mathbb{P}_r$ and $\mathbb{P}_g$.

$\mathbb{E}_{(x, y) \sim \gamma} \Vert x - y \Vert$ is the earth mover distance for a given joint distribution ($x$ and $y$ are probabilities).

So $W(\mathbb{P}_r, \mathbb{P}_g)$ is equal to the least earth mover distance for any joint distribution between the real distribution $\mathbb{P}_r$ and the generated distribution $\mathbb{P}_g$.

The paper shows that Jensen-Shannon (JS) divergence and other measures for the difference between two probability distributions are not smooth. Therefore, if we are doing gradient descent on one of the (parameterized) probability distributions, it will not converge.

Based on Kantorovich-Rubinstein duality,

$$W(\mathbb{P}_r, \mathbb{P}_g) = \sup_{\Vert f \Vert_L \le 1} \mathbb{E}_{x \sim \mathbb{P}_r} [f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f(x)]$$

where the supremum is over all 1-Lipschitz functions $f$.

That is, it is equal to the greatest difference $\mathbb{E}_{x \sim \mathbb{P}_r} [f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f(x)]$ among all 1-Lipschitz functions.

For $K$-Lipschitz functions,

$$K \cdot W(\mathbb{P}_r, \mathbb{P}_g) = \sup_{\Vert f \Vert_L \le K} \mathbb{E}_{x \sim \mathbb{P}_r} [f(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f(x)]$$

If all $K$-Lipschitz functions can be represented as $f_w$, where $f_w$ is parameterized by $w \in \mathcal{W}$,

$$K \cdot W(\mathbb{P}_r, \mathbb{P}_g) = \max_{w \in \mathcal{W}} \mathbb{E}_{x \sim \mathbb{P}_r} [f_w(x)] - \mathbb{E}_{x \sim \mathbb{P}_g} [f_w(x)]$$

If $\mathbb{P}_g$ is represented by a generator $g_\theta(z)$ and $z$ is from a known distribution $z \sim p(z)$,

$$K \cdot W(\mathbb{P}_r, \mathbb{P}_\theta) = \max_{w \in \mathcal{W}} \mathbb{E}_{x \sim \mathbb{P}_r} [f_w(x)] - \mathbb{E}_{z \sim p(z)} [f_w(g_\theta(z))]$$

Now, to converge $g_\theta$ with $\mathbb{P}_r$, we can do gradient descent on $\theta$ to minimize the above formula.
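Below is a minimal PyTorch sketch of how these losses might look in training code. The critic `f_w`, the generated samples, and the weight-clipping constant `c` are illustrative assumptions; weight clipping is just the paper's simple way of keeping `f_w` approximately K-Lipschitz.

```python
import torch


def critic_loss(f_w, x_real, x_fake):
    # The critic maximizes E[f_w(x)] - E[f_w(g_theta(z))], so we minimize the negation
    return -(f_w(x_real).mean() - f_w(x_fake).mean())


def generator_loss(f_w, x_fake):
    # The generator minimizes -E[f_w(g_theta(z))]
    return -f_w(x_fake).mean()


def clip_critic_weights(f_w, c=0.01):
    # Clamp critic weights to keep it (roughly) K-Lipschitz
    with torch.no_grad():
        for p in f_w.parameters():
            p.clamp_(-c, c)
```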
diff --git a/docs/lora/gpt2.html b/docs/lora/gpt2.html
new file mode 100644
index 00000000..bed238dc
--- /dev/null
+++ b/docs/lora/gpt2.html
@@ -0,0 +1,378 @@

```python
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from labml_nn.lora import Linear, Embedding

tokenizer = AutoTokenizer.from_pretrained("gpt2")

config = {
    "layer_norm_epsilon": 1e-05,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "vocab_size": 50257,
    "device": "cuda"
}


class FFN(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.c_fc = Linear(config['n_embd'], dim, r=32, bias=True)
        self.c_proj = Linear(dim, config['n_embd'], r=32, bias=True)
        self.act = nn.functional.gelu

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        return hidden_states


class MultiHeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_dim = config['n_embd']
        self.num_heads = config['n_head']
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim

        self.c_att = Linear(config['n_embd'], config['n_embd'] * 3, r=32, bias=True)
        self.c_proj = Linear(config['n_embd'], config['n_embd'], r=32, bias=True)

    def _split_heads(self, tensor, num_heads, attn_head_size):
        # Splits hidden_size dim into attn_head_size and num_heads
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(self, hidden_states):
        batch_size, seq_length, _ = hidden_states.size()

        query, key, value = self.c_att(hidden_states).split(self.split_size, dim=2)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=None,
            dropout_p=0.0,
            is_causal=True,  # for the triangular mask
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_length, self.embed_dim)

        attn_output = self.c_proj(attn_output)

        return attn_output


class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.pre_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon'])
        self.attn = MultiHeadAttention()
        self.post_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon'])
        self.ffn = FFN(config['n_embd'] * 4)

    def forward(self, hidden_states):
        residual = hidden_states
        hidden_states = self.pre_norm(hidden_states)

        attn_output = self.attn(hidden_states)

        hidden_states = attn_output + residual
        residual = hidden_states
        hidden_states = self.post_norm(hidden_states)
        feed_forward_output = self.ffn(hidden_states)
        hidden_states = feed_forward_output + residual

        return hidden_states


class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.token_embedding = Embedding(config['vocab_size'], config['n_embd'], r=32)
        self.position_embedding = Embedding(config['n_positions'], config['n_embd'], r=32)

        self.blocks = nn.ModuleList([Block() for _ in range(config['n_layer'])])

        self.final_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon'])

        self.lm_head = Linear(config['n_embd'], config['vocab_size'], r=32, bias=False)

    def forward(self, input_ids):
        batch_size, input_shape = input_ids.size()

        token_embeddings = self.token_embedding(input_ids)  # B T C
        position_ids = torch.arange(input_shape, device=config['device'])  # T
        position_embeddings = self.position_embedding(position_ids)  # T C (broadcast over batch)

        hidden_states = token_embeddings + position_embeddings

        for block in self.blocks:
            hidden_states = block(hidden_states)

        hidden_states = self.final_norm(hidden_states)

        logits = self.lm_head(hidden_states)

        return logits
```
This is an implementation of Low-Rank Adaptation (LoRA) in PyTorch.

Low-Rank Adaptation (LoRA) freezes the pre-trained model weights and injects trainable rank-decomposition matrices into each layer of the transformer. This makes it possible to efficiently fine-tune large language models by reducing the number of trainable parameters by a large factor.

Here's the training code for training a GPT-2 model with LoRA on the Tiny Shakespeare dataset.
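As a rough sketch of how such a run could be set up (the freezing logic follows the LoRA layers defined below, but the optimizer choice and learning rate are illustrative assumptions, not the repository's exact training configuration):

```python
model = GPTModel().to(config['device'])
# Load the converted pre-trained GPT-2 weights (produced by transform_hf_model further below);
# strict=False because the LoRA matrices are not part of the checkpoint.
model.load_state_dict(torch.load('transformed.pth'), strict=False)

# Optimize only the LoRA matrices; the pre-trained weights stay frozen.
lora_params = [p for n, p in model.named_parameters() if 'lora_' in n]
optimizer = torch.optim.Adam(lora_params, lr=3e-4)  # assumed learning rate
```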
```python
import torch
import torch.nn as nn
```

The LoRA linear layer adds a low-rank decomposition to the pre-trained weight matrix ($W_0 \in \mathbb{R}^{d \times k}$) of the linear layer.

$$h = W_0 x + \Delta W x = W_0 x + B A x$$

where $B \in \mathbb{R}^{d \times r}$, $A \in \mathbb{R}^{r \times k}$, and the rank $r \ll \min(d, k)$.

All parameters are frozen except $A$ and $B$.

$\Delta W = B A$ is initialized to be zero at the beginning of the training.

They multiply $\Delta W x$ by $\frac{\alpha}{r}$, where $\alpha$ is a hyper-parameter. Once $\alpha$ is tuned it can be kept the same when varying $r$.
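To get a feel for the savings, here is a small worked example. The 768x768 shape mirrors an attention projection in the GPT-2 config above, and r = 32 matches the layers in this implementation:

```python
d, k, r = 768, 768, 32
frozen_params = d * k               # 589,824 parameters in W_0 (not trained)
lora_params = d * r + r * k         # 49,152 parameters in B and A (trained)
print(lora_params / frozen_params)  # ~0.083, i.e. roughly 12x fewer trainable parameters
```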
```python
class Linear(nn.Module):
    """
    * `in_features` is the number of input features of the linear layer
    * `out_features` is the number of output features of the linear layer
    * `bias` is a flag indicating if there is a bias parameter
    * `r` is the rank of the decomposition
    * `alpha` is the scaling factor
    """

    def __init__(self, in_features: int, out_features: int, bias: bool,
                 r: int, alpha: int = None):
        super().__init__()

        # Set alpha = r if alpha is not provided, i.e. make the scaling factor alpha / r = 1
        if alpha is None:
            alpha = r
        # The pre-trained weight W_0
        self.weight = nn.Parameter(torch.empty((out_features, in_features)))
        # Freeze it
        self.weight.requires_grad = False

        if bias:
            # Bias parameter (also frozen)
            self.bias = nn.Parameter(torch.empty(out_features))
            self.bias.requires_grad = False
        else:
            # No bias parameter
            self.bias = None

        # Scaling factor alpha / r
        self.scaling = alpha / r
        # Matrix A
        self.lora_a = nn.Parameter(torch.empty((in_features, r)))
        # Matrix B; we keep A and B transposed
        self.lora_b = nn.Parameter(torch.empty((r, out_features)))

        with torch.no_grad():
            # Initialize A similar to a weight matrix in a normal linear layer
            nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5)
            # Initialize B to 0 so that Delta W = BA is 0 at initialization
            nn.init.zeros_(self.lora_b)

    def forward(self, x: torch.Tensor):
        # Compute x W_0^T + b
        result = nn.functional.linear(x, self.weight, bias=self.bias)
        # Add (alpha / r) * x A B
        result += (x @ self.lora_a @ self.lora_b) * self.scaling
        return result
```
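A quick sanity check of this layer might look like the following sketch (the shapes are arbitrary, and the frozen weight is filled with random values only so the layer can run). Since $B$ is zero at initialization, the output equals $W_0 x$ exactly, and only the LoRA matrices require gradients:

```python
layer = Linear(in_features=768, out_features=768, bias=True, r=32)
# In practice the frozen weight and bias come from a pre-trained checkpoint
with torch.no_grad():
    nn.init.normal_(layer.weight)
    nn.init.zeros_(layer.bias)

x = torch.randn(4, 768)
out = layer(x)
# The LoRA term contributes nothing at initialization because lora_b is zero
assert torch.allclose(out, nn.functional.linear(x, layer.weight, layer.bias))
# Only the LoRA matrices are trainable
assert [n for n, p in layer.named_parameters() if p.requires_grad] == ['lora_a', 'lora_b']
```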
++ +
101class Embedding(nn.Module):num_embeddings
+ is the number of embeddings embedding_dim
+ is the number embedding dimensions r
+ is the rank of the decomposition alpha
+ is the scaling factor 111 def __init__(self, num_embeddings: int, embedding_dim: int,
+112 r: int, alpha: int = None):120 super().__init__()Set is not provided. i.e. make the scaling factor .
+ +123 if alpha is None:
+124 alpha = rThe pre-trained embedding weights (frozen)
+ +127 self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim)))
+128 self.weight.requires_grad = Falsescaling factor
+ +131 self.scaling = alpha / rMatrix
+ +133 self.lora_a = nn.Parameter(torch.empty((num_embeddings, r)))Matrix
+ +135 self.lora_b = nn.Parameter(torch.empty((r, embedding_dim)))
+136
+137 with torch.no_grad():Initialize with a normal distribution
+ +139 nn.init.normal_(self.lora_a)Initialize to so that is at initialization
+ +141 nn.init.zeros_(self.lora_b)143 def forward(self, x: torch.Tensor):Compute the embeddings
+ +145 result = nn.functional.embedding(x, self.weight)Add Error
+ +148 result += (nn.functional.embedding(x, self.lora_a) @ self.lora_b) * self.scaling+ +
151 return result1import torch
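Since $\Delta W = B A$ has the same shape as the frozen weight, the LoRA update of a `Linear` layer can be folded back into a single matrix after training, so inference costs nothing extra. This is a small sketch of that standard trick, not part of the module above:

```python
def merge_lora_weight(layer: Linear) -> torch.Tensor:
    # W = W_0 + (alpha / r) * (A B)^T; transposed because the layer stores A and B
    # transposed relative to the frozen (out_features, in_features) weight
    with torch.no_grad():
        return layer.weight + (layer.lora_a @ layer.lora_b).T * layer.scaling
```

The helper that follows maps the Hugging Face GPT-2 checkpoint onto the parameter names used by the GPT model above and transposes the Conv1D weight matrices so they can be used with linear layers.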
```python
import torch
from transformers import AutoModelForCausalLM


def transform_hf_model():
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    state_dict = model.state_dict()

    mapping = {
        'transformer.wte.weight': 'token_embedding.weight',
        'transformer.wpe.weight': 'position_embedding.weight',
        'transformer.ln_f.weight': 'final_norm.weight',
        'transformer.ln_f.bias': 'final_norm.bias',
        'lm_head.weight': 'lm_head.weight'
    }

    for i in range(12):
        mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight'
        mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias'
        mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.c_att.weight'
        mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.c_att.bias'
        mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.c_proj.weight'
        mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.c_proj.bias'
        mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight'
        mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias'
        mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.c_fc.weight'
        mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.c_fc.bias'
        mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.c_proj.weight'
        mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.c_proj.bias'

    new_state_dict = {}
    for old_key, new_key in mapping.items():
        if old_key in state_dict:
            new_state_dict[new_key] = state_dict[old_key]

    # Transpose the weight matrices of the Conv1D layers so they can be used with linear layers
    convo_layers = ([f'blocks.{i}.ffn.c_fc.weight' for i in range(12)] +
                    [f'blocks.{i}.ffn.c_proj.weight' for i in range(12)] +
                    [f'blocks.{i}.attn.c_att.weight' for i in range(12)] +
                    [f'blocks.{i}.attn.c_proj.weight' for i in range(12)])

    for layer in convo_layers:
        new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1)

    torch.save(new_state_dict, 'transformed.pth')
```

This is a collection of neural networks and related algorithms implemented in PyTorch. Each implementation comes with a detailed explanation and is presented side by side with the code on the website. We believe this will help you understand these algorithms better.

Creates the position-wise feed-forward network defined in `feed_forward.py`.
where $\Phi(x) = P(X \le x)$ for $X \sim \mathcal{N}(0, 1)$. This was introduced in the paper Gaussian Error Linear Units.
These are variants of the FFN with gated hidden layers, as described in the paper GLU Variants Improve Transformer. We have omitted the bias terms specified in the paper.
@@ -374,7 +374,7 @@
@@ -392,8 +392,8 @@
where
This defines the configurations for the Transformer. The configurations are calculated using option functions. These are lazily loaded, so only the necessary modules are computed.
This is a PyTorch implementation of the position-wise feed-forward network used in the Transformer.

The FFN consists of two fully connected layers. The number of dimensions in the hidden layer, $d_{ff}$, is generally set to around four times the token embedding dimension, $d_{model}$. So it is sometimes also called the expand-and-contract network.

The hidden layer has an activation, usually the ReLU (Rectified Linear Unit) activation, $\max(0, x)$.

That is, the FFN function is $\mathrm{FFN}(x, W_1, W_2, b_1, b_2) = \max(0, x W_1 + b_1) W_2 + b_2$, where $W_1$, $b_1$, $W_2$ and $b_2$ are learnable parameters.

Sometimes the GELU (Gaussian Error Linear Unit) activation is used instead of ReLU: $x \Phi(x)$, where $\Phi(x) = P(X \le x)$ for $X \sim \mathcal{N}(0, 1)$.
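As a minimal sketch of this definition (the plain two-layer FFN only; the gated variants discussed next add a separate projection for the gate):

```python
import torch
import torch.nn as nn


class PositionWiseFFN(nn.Module):
    def __init__(self, d_model: int = 512, d_ff: int = 2048):
        super().__init__()
        self.layer1 = nn.Linear(d_model, d_ff)   # expand
        self.layer2 = nn.Linear(d_ff, d_model)   # contract
        self.activation = nn.ReLU()              # or nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # FFN(x) = max(0, x W1 + b1) W2 + b2
        return self.layer2(self.activation(self.layer1(x)))
```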
This is a generic implementation that supports different variants, including Gated Linear Units (GLU). We have also run experiments on these:
* `d_model` is the number of features in a token embedding
* `d_ff` is the number of features in the hidden layer of the FFN
* `dropout` is the dropout probability for the hidden layer
* `is_gated` specifies whether the hidden layer is gated
* `bias1` specifies whether the first fully connected layer should have a learnable bias
* `bias2` specifies whether the second fully connected layer should have a learnable bias
* `bias_gate` specifies whether the fully connected layer for the gate should have a learnable bias
This module contains PyTorch implementations and explanations of the original Transformer from the paper Attention Is All You Need, as well as its derivatives and enhancements.

This implements the Transformer XL model using relative multi-head attention.

This implements Rotary Positional Embeddings (RoPE).

This implements Attention with Linear Biases (ALiBi).

This implements the Retrieval-Enhanced Transformer (RETRO).

This is an implementation of the Compressive Transformer, which extends Transformer XL by compressing the oldest memories to give a longer attention span.
This is an implementation of the GPT-2 architecture.
This is an implementation of the paper GLU Variants Improve Transformer.

This is an implementation of the paper Generalization through Memorization: Nearest Neighbor Language Models.

This is an implementation of the paper Accessing Higher-level Representations in Sequential Transformers with Feedback Memory.

This is a miniature implementation of the paper Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. Our implementation has only a few million parameters and does not model parallel distributed training; it does single-GPU training, but we implement the switching concept described in the paper.

This is an implementation of the paper Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch.

This is an implementation of the paper FNet: Mixing Tokens with Fourier Transforms.

This is an implementation of the paper An Attention Free Transformer.

This is an implementation of the Masked Language Model used for pre-training in the paper BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.

This is an implementation of the paper MLP-Mixer: An all-MLP Architecture for Vision.

This is an implementation of the paper Pay Attention to MLPs.

This is an implementation of the paper An Image Is Worth 16x16 Words: Transformers For Image Recognition At Scale.
This is an implementation of the paper Primer: Searching for Efficient Transformers for Language Modeling.

This is an implementation of the paper Hierarchical Transformers Are More Efficient Language Models.
Shows the target distribution expected by the system.
This is a tutorial/implementation of Multi-Head Attention from the paper Attention Is All You Need in PyTorch. The implementation is inspired by The Annotated Transformer.

Here is the training code that uses a basic Transformer with MHA for NLP auto-regression.

Here is an experiment that trains a simple transformer.
This module does a linear transformation and splits the vector into a given number of heads for multi-head attention. This is used to transform the key, query and value vectors.

The input has shape `[seq_len, batch_size, d_model]` or `[batch_size, d_model]`. We apply the linear transformation to the last dimension and split that into the heads.
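A minimal sketch of such a module (the class name and the single shared head size `d_k` are assumptions for illustration):

```python
import torch
import torch.nn as nn


class PrepareForMHA(nn.Module):
    def __init__(self, d_model: int, heads: int, d_k: int, bias: bool = True):
        super().__init__()
        self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
        self.heads = heads
        self.d_k = d_k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [seq_len, batch_size, d_model] or [batch_size, d_model]
        head_shape = x.shape[:-1]
        x = self.linear(x)
        # Split the last dimension into (heads, d_k)
        return x.view(*head_shape, self.heads, self.d_k)
```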
This computes scaled multi-headed attention for the given `query`, `key` and `value` vectors.
$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{Q K^\top}{\sqrt{d_k}}\right) V$$

In simple terms, it finds keys that match the query and takes the values of those keys.

It uses the dot-product of the query and key as an indicator of how well they match. Before taking the softmax, the dot-products are scaled by $\frac{1}{\sqrt{d_k}}$. This is done to avoid large dot-product values causing the softmax to give very small gradients when $d_k$ is large.

Softmax is calculated along the axis of the sequence (or time).
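A compact sketch of this computation (the batch-first tensor layout and the boolean mask handling here are simplifying assumptions, not the module's exact interface):

```python
import math

import torch


def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: [batch, heads, seq_len, d_k]
    d_k = q.shape[-1]
    scores = q @ k.transpose(-2, -1) / math.sqrt(d_k)  # [batch, heads, seq_q, seq_k]
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = torch.softmax(scores, dim=-1)  # softmax along the key (time) axis
    return attn @ v
```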
`d_model` is the number of features in the `query`, `key` and `value` vectors. These transform the `query`, `key` and `value` vectors for multi-headed attention.

Softmax for attention along the time dimension of `key`.
[seq_len_q, seq_len_k, batch_size]
-,其中第一个维度是查询维度。如果查询维度等于它将被广播。
+的形状为[seq_len_q, seq_len_k, batch_size]
+,其中第一维是查询维度。如果查询维度等于,则会进行广播。
生成的掩码形状为[seq_len_q, seq_len_k, batch_size, heads]
`query`, `key` and `value` are the tensors that store the collections of query, key and value vectors. They have shape `[seq_len, batch_size, d_model]`.
`mask` has shape `[seq_len, seq_len, batch_size]`, and `mask[i, j, b]` indicates whether, for batch `b`, the query at position `i` has access to the key-value at position `j`.
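For example, a causal (autoregressive) mask under this convention could be built roughly like this:

```python
import torch


def causal_mask(seq_len: int, batch_size: int) -> torch.Tensor:
    # mask[i, j, b] is True when the query at position i may attend to key position j <= i
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    return mask.unsqueeze(-1).expand(seq_len, seq_len, batch_size)
```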
`query`, `key` and `value` have shape `[seq_len, batch_size, d_model]`.
Prepare the `query`, `key` and `value` for attention computation. These will then have shape `[seq_len, batch_size, heads, d_k]`.
Compute the attention scores. This gives a tensor of shape `[seq_len, seq_len, batch_size, heads]`.
This can act as an encoder layer or a decoder layer. We use pre-norm.
* `d_model` is the token embedding size
* `self_attn` is the self-attention module
* `src_attn` is the source attention module (when this is used in a decoder)
* `feed_forward` is the feed-forward module
* `dropout_prob` is the probability of dropping out after self-attention and the FFN

If a source is provided, get results from attention to the source. This is when you have a decoder layer that pays attention to the encoder outputs.
This predicts the tokens and gives the log softmax of those. You don't need this if you are using `nn.CrossEntropyLoss`.
This was important from their code. Initialize parameters with Glorot / fan_avg.
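In PyTorch this kind of initialization is usually done along these lines (a sketch, not the repository's exact loop):

```python
import torch.nn as nn


def init_glorot(model: nn.Module):
    # Xavier/Glorot uniform (fan_avg) initialization for all weight matrices
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
```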
Position indexes
This is a PyTorch implementation of the position-wise feedforward network used in the transformer.
FFN consists of two fully connected layers. The number of dimensions in the hidden layer _^_0_^_ is generally set to around four times that of the token embedding _^_1_^_. So it is sometimes also called the expand-and-contract network.
There is an activation at the hidden layer, which is usually set to ReLU (Rectified Linear Unit) activation, _^_2_^_
That is, the FFN function is _^_3_^_ where _^_4_^_, _^_5_^_, _^_6_^_ and _^_7_^_ are learnable parameters.
Sometimes the GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU, _^_8_^_ where _^_9_^_
This is a generic implementation that supports different variants including Gated Linear Units (GLU). We have also implemented experiments on these:
\n\n": "\u8fd9\u662f Transformer \u4e2d\u4f7f\u7528\u7684\u4f4d\u7f6e\u524d\u9988\u7f51\u7edc\u7684 PyTorch \u5b9e\u73b0\u3002
\nFFN \u7531\u4e24\u4e2a\u5168\u8fde\u63a5\u5c42\u7ec4\u6210\u3002\u9690\u85cf\u5c42\u4e2d\u7684\u7ef4\u5ea6\u6570_^_0_^_\u901a\u5e38\u8bbe\u7f6e\u4e3a\u6807\u8bb0\u5d4c\u5165\u7ef4\u5ea6_^_1_^_\u7684\u56db\u500d\u5de6\u53f3\u3002\u56e0\u6b64\uff0c\u5b83\u6709\u65f6\u4e5f\u88ab\u79f0\u4e3a\u6269\u5f20-\u538b\u7f29\u7f51\u7edc\u3002
\n\u9690\u85cf\u5c42\u6709\u4e00\u4e2a\u6fc0\u6d3b\u51fd\u6570\uff0c\u901a\u5e38\u8bbe\u7f6e\u4e3a ReLU (Rectified Linear Unit) \u6fc0\u6d3b\u51fd\u6570\uff0c_^_2_^_
\n\u5728\u6b64\u57fa\u7840\u4e0a\uff0c FFN \u51fd\u6570\u53ef\u4ee5\u5199\u4f5c\uff1a_^_3_^_\u5176\u4e2d_^_4_^__^_5_^_\u3001_^_6_^_\u548c_^_7_^_\u662f\u53ef\u5b66\u4e60\u7684\u53c2\u6570\u3002
\n\u6709\u65f6\u8fd8\u4f1a\u4f7f\u7528 GELU (Gaussian Error Linear Unit) \u6fc0\u6d3b\u51fd\u6570\u6765\u4ee3\u66ff ReLU \u3002_^_8_^_\u5176\u4e2d_^_9_^_
\n\u8fd9\u662f\u4e00\u4e2a\u901a\u7528\u5b9e\u73b0\uff0c\u652f\u6301\u5305\u62ec\u95e8\u63a7\u7ebf\u6027\u5355\u5143(GLU) \u5728\u5185\u7684\u4e0d\u540c\u53d8\u4f53\u3002\u6211\u4eec\u8fd8\u5bf9\u8fd9\u4e9b\u8fdb\u884c\u4e86\u5b9e\u9a8c\uff1a
\nThis is a PyTorch implementation of position-wise feedforward network used in transformer.
\nFFN consists of two fully connected layers. Number of dimensions in the hidden layer _^_0_^_, is generally set to around four times that of the token embedding _^_1_^_. So it is sometime also called the expand-and-contract network.
\nThere is an activation at the hidden layer, which is usually set to ReLU (Rectified Linear Unit) activation, _^_2_^_
\nThat is, the FFN function is, _^_3_^_ where _^_4_^_, _^_5_^_, _^_6_^_ and _^_7_^_ are learnable parameters.
\nSometimes the GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. _^_8_^_ where _^_9_^_
\nThis is a generic implementation that supports different variants including Gated Linear Units (GLU). We have also implemented experiments on these:
\n\n": "\u8fd9\u662f Transformer \u4e2d\u4f7f\u7528\u7684\u4f4d\u7f6e\u524d\u9988\u7f51\u7edc\u7684 PyTorch \u5b9e\u73b0\u3002
\nFFN \u7531\u4e24\u4e2a\u5168\u8fde\u63a5\u5c42\u7ec4\u6210\u3002\u9690\u85cf\u5c42\u4e2d\u7684\u7ef4\u5ea6\u6570_%5e_0_%5e_\u901a\u5e38\u8bbe\u7f6e\u4e3a\u6807\u8bb0\u5d4c\u5165\u7ef4\u5ea6_%5e_1_%5e_\u7684\u56db\u500d\u5de6\u53f3\u3002\u56e0\u6b64\uff0c\u5b83\u6709\u65f6\u4e5f\u88ab\u79f0\u4e3a\u6269\u5f20-\u538b\u7f29\u7f51\u7edc\u3002
\n\u9690\u85cf\u5c42\u6709\u4e00\u4e2a\u6fc0\u6d3b\u51fd\u6570\uff0c\u901a\u5e38\u8bbe\u7f6e\u4e3a ReLU (Rectified Linear Unit) \u6fc0\u6d3b\u51fd\u6570\uff0c_%5e_2_%5e_
\n\u5728\u6b64\u57fa\u7840\u4e0a\uff0c FFN \u51fd\u6570\u53ef\u4ee5\u5199\u4f5c\uff1a_%5e_3_%5e_\u5176\u4e2d_%5e_4_%5e__%5e_5_%5e_\u3001_%5e_6_%5e_\u548c_%5e_7_%5e_\u662f\u53ef\u5b66\u4e60\u7684\u53c2\u6570\u3002
\n\u6709\u65f6\u8fd8\u4f1a\u4f7f\u7528 GELU (Gaussian Error Linear Unit) \u6fc0\u6d3b\u51fd\u6570\u6765\u4ee3\u66ff ReLU \u3002_%5e_8_%5e_\u5176\u4e2d_%5e_9_%5e_
\n\u8fd9\u662f\u4e00\u4e2a\u901a\u7528\u5b9e\u73b0\uff0c\u652f\u6301\u5305\u62ec\u95e8\u63a7\u7ebf\u6027\u5355\u5143(GLU) \u5728\u5185\u7684\u4e0d\u540c\u53d8\u4f53\u3002\u6211\u4eec\u8fd8\u5bf9\u8fd9\u4e9b\u8fdb\u884c\u4e86\u5b9e\u9a8c\uff1a
_^_0_^_
_^_0_^_ or _^_1_^_ depending on whether it is gated