Here we train a transformer that uses Fuzzy Tiling Activation (FTA) in the Feed-Forward Network. We use it for a language model and train it on the Tiny Shakespeare dataset for demonstration.
However, this is probably not the ideal task for FTA, and we believe FTA is more suitable for modeling data with continuous variables.
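Before the code, here is a minimal, self-contained sketch of what a fuzzy-tiling-style activation does, so the wider second feed-forward layer below makes sense: each scalar feature is expanded into one fuzzy membership value per tile, making the output several times wider than the input. This is only an illustration written for this page, not the labml_nn.activations.fta.FTA module; the helper name fta_sketch and its defaults are assumptions.

import torch

def fta_sketch(z: torch.Tensor, lower=-1., upper=1., delta=0.2, eta=0.05) -> torch.Tensor:
    # Tile centers spaced delta apart over [lower, upper)
    c = torch.arange(lower, upper, delta)
    # How far each input value falls outside each tile [c, c + delta]
    d = torch.clip(c - z[..., None], min=0.) + torch.clip(z[..., None] - delta - c, min=0.)
    # Fuzzy indicator: 0 inside a tile, a linear ramp within eta of it, 1 far away
    fuzzy = torch.where(d <= eta, d, torch.ones_like(d))
    # Membership is 1 inside the tile and decays to 0 outside; flatten tiles into features
    return (1. - fuzzy).flatten(-2)

z = torch.randn(4, 8)          # [batch, features]
print(fta_sketch(z).shape)     # torch.Size([4, 80]); each feature expanded into 10 tiles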
import copy

import torch
import torch.nn as nn

from labml import experiment
from labml.configs import option
from labml_helpers.module import Module
from labml_nn.activations.fta import FTA
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers import MultiHeadAttention, TransformerLayer
from labml_nn.transformers.utils import subsequent_mask
FFN module with FTA activation:

d_model is the number of features in a token embedding
d_ff is the number of features in the hidden layer of the FFN
activation is the FTA activation module
dropout is the dropout probability for the hidden layer

class FeedForwardFTA(nn.Module):
    def __init__(self, d_model: int, d_ff: int,
                 activation: FTA,
                 dropout: float = 0.1):
        super().__init__()
        # Layer one parameterized by weight and bias
        self.layer1 = nn.Linear(d_model, d_ff)
        # Layer two parameterized by weight and bias; FTA expands each of the
        # d_ff features by its expansion factor
        self.layer2 = nn.Linear(d_ff * activation.expansion_factor, d_model)
        # Hidden layer dropout
        self.dropout = nn.Dropout(dropout)
        # Activation function
        self.activation = activation

    def forward(self, x: torch.Tensor):
        # First linear layer followed by the FTA activation
        x = self.activation(self.layer1(x))
        # Apply dropout
        x = self.dropout(x)
        # Project back down to d_model features
        return self.layer2(x)
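A quick shape check of this module (a sketch that assumes the definitions above; the [seq_len, batch_size, d_model] input layout is an assumption based on how the model is used later):

fta = FTA(-1., 1., 0.2, 0.05)
ffn = FeedForwardFTA(d_model=256, d_ff=256, activation=fta, dropout=0.1)

x = torch.randn(32, 16, 256)   # [seq_len, batch_size, d_model]
# layer1: 256 -> 256, FTA: 256 -> 256 * expansion_factor, layer2: back to 256
print(ffn(x).shape)            # torch.Size([32, 16, 256])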
This is an autoregressive transformer model that uses Feed-Forward Networks with Fuzzy Tiling Activations.
n_tokens is the number of tokens in the vocabulary
d_model is the embedding size
n_layers is the number of transformer layers
layer is the layer; we use n_layers copies of it for the transformer

class AutoregressiveTransformer(Module):
    def __init__(self, n_tokens: int, d_model: int, n_layers: int, layer: TransformerLayer):
        super().__init__()
        # Transformer with n_layers layers
        self.transformer_layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])

        # Token embedding layer
        self.emb = nn.Embedding(n_tokens, d_model)
        # Readout layer
        self.readout = nn.Linear(d_model, n_tokens)

        # The mask will be initialized on the first call
        self.mask = None
    def forward(self, x: torch.Tensor):
        # x are the input tokens of shape [seq_len, batch_size]
        # Create auto-regressive mask
        if self.mask is None or self.mask.size(0) != len(x):
            # Subsequent mask, will mask out tokens from seeing future tokens
            self.mask = subsequent_mask(len(x)).to(x.device)

        # Get the token embeddings
        x = self.emb(x)
        # Transformer encoder
        for layer in self.transformer_layers:
            x = layer(x=x, mask=self.mask)
        # Get logits
        x = self.readout(x)

        # Return results
        return x, None
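For reference, the subsequent mask is what makes the model autoregressive: position i can only attend to positions up to i. A lower-triangular pattern like the one below illustrates the idea (the exact tensor shape returned by subsequent_mask is not shown here and may differ):

import torch

seq_len = 5
# Illustrative causal mask: row i marks the positions token i may attend to
mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
print(mask.int())
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]], dtype=torch.int32)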
Configurations; these inherit from NLPAutoRegressionConfigs.

class Configs(NLPAutoRegressionConfigs):
    # Model
    model: AutoregressiveTransformer

    # Number of layers
    n_layers: int = 4

    # Alpha and beta for DeepNorm
    deep_norm_alpha: float
    deep_norm_beta: float

    # Number of heads in the attention
    n_heads: int = 4
    # Embedding size
    d_model: int = 256
    # Size of each attention head
    d_k: int = 16
    # Feed forward layer size
    d_ff: int = 256

    # FTA
    fta_lower_limit: float = -1.
    fta_upper_limit: float = +1.
    fta_delta: float = 0.2
    fta_eta: float = 0.05
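With these defaults, and assuming the FTA tiles are spaced fta_delta apart over [fta_lower_limit, fta_upper_limit), the expansion factor works out to (1 - (-1)) / 0.2 = 10, so the second feed-forward layer receives d_ff * 10 = 2,560 input features. A quick check (a sketch; the exact expansion_factor value depends on the FTA implementation):

fta = FTA(-1., 1., 0.2, 0.05)
print(fta.expansion_factor)           # expected: 10
print(256 * fta.expansion_factor)     # input features of layer2: expected 2560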
Initialize the model:

@option(Configs.model)
def _model(c: Configs):
    # Create FTA activation module
    fta = FTA(c.fta_lower_limit, c.fta_upper_limit, c.fta_delta, c.fta_eta)

    # Create the transformer. We re-use the TransformerLayer and
    # MultiHeadAttention implementations.
    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.n_layers,
                                  TransformerLayer(d_model=c.d_model,
                                                   feed_forward=FeedForwardFTA(d_model=c.d_model,
                                                                               d_ff=c.d_ff,
                                                                               activation=fta,
                                                                               dropout=0.1),
                                                   self_attn=MultiHeadAttention(c.n_heads, c.d_model,
                                                                                dropout_prob=0.0),
                                                   dropout_prob=0.0))

    # Move to the device
    return m.to(c.device)


def main():
    # Create experiment
    experiment.create(name="fta", writers={'screen', 'comet', 'labml'})
    # Create configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # Use character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of 256
        'seq_len': 256,
        # Train for 32 epochs
        'epochs': 32,
        # Batch size of 16
        'batch_size': 16,
        # Switch between training and validation 10 times per epoch
        'inner_iterations': 10,

        # Adam optimizer with no warmup
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 3e-4,
    })

    # Set model(s) for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()


if __name__ == '__main__':
    main()