diff --git a/labml_nn/capsule_networks/mnist.py b/labml_nn/capsule_networks/mnist.py
index 96357547..f9eb475f 100644
--- a/labml_nn/capsule_networks/mnist.py
+++ b/labml_nn/capsule_networks/mnist.py
@@ -51,7 +51,7 @@ class MNISTCapsuleNetworkModel(Module):
         # This is the decoder mentioned in the paper.
         # It takes the outputs of the $10$ digit capsules, each with $16$ features to reproduce the
-        # image. It goes through linear layers of sizes $512% and $1024$ with $ReLU$ activations.
+        # image. It goes through linear layers of sizes $512$ and $1024$ with $ReLU$ activations.
         self.decoder = nn.Sequential(
             nn.Linear(16 * 10, 512),
             nn.ReLU(),
@@ -70,7 +70,7 @@ class MNISTCapsuleNetworkModel(Module):
         x = F.relu(self.conv1(data))
         # Pass through the second convolution layer.
         # Output of this has shape `[batch_size, 32 * 8, 6, 6]`.
-        # *Note that this layer has a stride length of $2$.
+        # *Note that this layer has a stride length of $2$*.
         x = self.conv2(x)

         # Resize and permutate to get the capsules
diff --git a/labml_nn/cfr/kuhn/__init__.py b/labml_nn/cfr/kuhn/__init__.py
index fe94190d..af4ba980 100644
--- a/labml_nn/cfr/kuhn/__init__.py
+++ b/labml_nn/cfr/kuhn/__init__.py
@@ -85,6 +85,7 @@ class History(_History):
     This defines when a game ends, calculates the utility and sample chance events (dealing cards).

     The history is stored in a string:
+
     * First two characters are the cards dealt to player 1 and player 2
     * The third character is the action by the first player
     * Fourth character is the action by the second player
diff --git a/labml_nn/conv_mixer/__init__.py b/labml_nn/conv_mixer/__init__.py
index fa7f41f4..608d5c18 100644
--- a/labml_nn/conv_mixer/__init__.py
+++ b/labml_nn/conv_mixer/__init__.py
@@ -28,7 +28,7 @@ Also, the MLP-mixer uses MLPs of two layers for each mixing and ConvMixer uses a
 The paper recommends removing the residual connection across the channel mixing (point-wise convolution)
 and having only a residual connection over the spatial mixing (depth-wise convolution).
 They also use [Batch normalization](../normalization/batch_norm/index.html) instead
-of [Layer normalization)(../normalization/layer_norm/index.html).
+of [Layer normalization](../normalization/layer_norm/index.html).

 Here's [an experiment](experiment.html) that trains ConvMixer on CIFAR-10.

diff --git a/labml_nn/conv_mixer/readme.md b/labml_nn/conv_mixer/readme.md
index d8f8a832..56de7c21 100644
--- a/labml_nn/conv_mixer/readme.md
+++ b/labml_nn/conv_mixer/readme.md
@@ -18,7 +18,7 @@ Also, the MLP-mixer uses MLPs of two layers for each mixing and ConvMixer uses a
 The paper recommends removing the residual connection across the channel mixing (point-wise convolution)
 and having only a residual connection over the spatial mixing (depth-wise convolution).
 They also use [Batch normalization](https://nn.labml.ai/normalization/batch_norm/index.html) instead
-of [Layer normalization)(../normalization/layer_norm/index.html).
+of [Layer normalization](../normalization/layer_norm/index.html).

 Here's [an experiment](https://nn.labml.ai/conv_mixer/experiment.html) that trains ConvMixer on CIFAR-10.

diff --git a/labml_nn/experiments/nlp_classification.py b/labml_nn/experiments/nlp_classification.py
index c83abbbe..cb5c2006 100644
--- a/labml_nn/experiments/nlp_classification.py
+++ b/labml_nn/experiments/nlp_classification.py
@@ -245,7 +245,7 @@ def ag_news(c: NLPClassificationConfigs):
     ### AG News dataset

     This loads the AG News dataset and the set the values for
-    `n_classes', `vocab`, `train_loader`, and `valid_loader`.
+    `n_classes`, `vocab`, `train_loader`, and `valid_loader`.
     """

     # Get training and validation datasets
@@ -279,5 +279,5 @@ def ag_news(c: NLPClassificationConfigs):
     valid_loader = DataLoader(valid, batch_size=c.batch_size, shuffle=True,
                               collate_fn=CollateFunc(tokenizer, vocab, c.seq_len, len(vocab), len(vocab) + 1))

-    # Return `n_classes', `vocab`, `train_loader`, and `valid_loader`
+    # Return `n_classes`, `vocab`, `train_loader`, and `valid_loader`
     return 4, vocab, train_loader, valid_loader
diff --git a/labml_nn/gan/cycle_gan/__init__.py b/labml_nn/gan/cycle_gan/__init__.py
index b7c1c1c1..82501e49 100644
--- a/labml_nn/gan/cycle_gan/__init__.py
+++ b/labml_nn/gan/cycle_gan/__init__.py
@@ -145,7 +145,7 @@ class Discriminator(Module):
         super().__init__()

         channels, height, width = input_shape

-        # Output of the discriminator is also a map of probabilities*
+        # Output of the discriminator is also a map of probabilities,
         # whether each region of the image is real or generated
         self.output_shape = (1, height // 2 ** 4, width // 2 ** 4)
@@ -528,8 +528,8 @@ class Configs(BaseConfigs):
         \Bigg]
         \end{align}

-        We use `generator_xy` for $G$ and `generator_yx$ for $F$.
-        We use `discriminator_x$ for $D_X$ and `discriminator_y` for $D_Y$.
+        We use `generator_xy` for $G$ and `generator_yx` for $F$.
+        We use `discriminator_x` for $D_X$ and `discriminator_y` for $D_Y$.
         """

         # Replay buffers to keep generated samples
diff --git a/labml_nn/gan/stylegan/__init__.py b/labml_nn/gan/stylegan/__init__.py
index 3c5f7b35..0b85cceb 100644
--- a/labml_nn/gan/stylegan/__init__.py
+++ b/labml_nn/gan/stylegan/__init__.py
@@ -83,7 +83,7 @@ where the factors of variations are more linear (disentangled).

 #### AdaIN

-Then $w$ is transformed into two vectors (***styles***) per layer,
+Then $w$ is transformed into two vectors (**styles**) per layer,
 $i$, $y_i = (y_{s,i}, y_{b,i}) = f_{A_i}(w)$ and used for scaling and shifting (biasing) in each layer with
 $\text{AdaIN}$ operator (normalize and scale):
 $$\text{AdaIN}(x_i, y_i) = y_{s, i} \frac{x_i - \mu(x_i)}{\sigma(x_i)} + y_{b,i}$$
@@ -202,7 +202,7 @@ class Generator(nn.Module):

     *$A$ denotes a linear layer. $B$ denotes a broadcast and scaling operation (noise is a single channel).
-    [*toRGB*](#to_rgb) also has a style modulation which is not shown in the diagram to keep it simple.*
+    [`toRGB`](#to_rgb) also has a style modulation which is not shown in the diagram to keep it simple.*

     The generator starts with a learned constant. Then it has a series of blocks.
     The feature map resolution is doubled at each block
@@ -243,7 +243,7 @@ class Generator(nn.Module):
     def forward(self, w: torch.Tensor, input_noise: List[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]]):
         """
         * `w` is $w$. In order to mix-styles (use different $w$ for different layers), we provide a separate
-        $w$ for each [generator block](#generator_block). It has shape `[n_blocks, batch_size, d_latent]1.
+        $w$ for each [generator block](#generator_block). It has shape `[n_blocks, batch_size, d_latent]`.
         * `input_noise` is the noise for each block. It's a list of pairs of noise sensors because
         each block (except the initial) has two noise inputs after each convolution layer (see the diagram).
@@ -282,7 +282,7 @@ class GeneratorBlock(nn.Module):

     *$A$ denotes a linear layer. $B$ denotes a broadcast and scaling operation (noise is a single channel).
-    [*toRGB*](#to_rgb) also has a style modulation which is not shown in the diagram to keep it simple.*
+    [`toRGB`](#to_rgb) also has a style modulation which is not shown in the diagram to keep it simple.*

     The generator block consists of two [style blocks](#style_block) ($3 \times 3$ convolutions with style modulation)
     and an RGB output.
@@ -731,7 +731,7 @@ class EqualizedLinear(nn.Module):
     ## Learning-rate Equalized Linear Layer

-    This uses [learning-rate equalized weights]($equalized_weights) for a linear layer.
+    This uses [learning-rate equalized weights](#equalized_weights) for a linear layer.
     """

     def __init__(self, in_features: int, out_features: int, bias: float = 0.):
@@ -742,7 +742,7 @@ class EqualizedLinear(nn.Module):
         """
         super().__init__()

-        # [Learning-rate equalized weights]($equalized_weights)
+        # [Learning-rate equalized weights](#equalized_weights)
         self.weight = EqualizedWeight([out_features, in_features])
         # Bias
         self.bias = nn.Parameter(torch.ones(out_features) * bias)
@@ -757,7 +757,7 @@ class EqualizedConv2d(nn.Module):
     ## Learning-rate Equalized 2D Convolution Layer

-    This uses [learning-rate equalized weights]($equalized_weights) for a convolution layer.
+    This uses [learning-rate equalized weights](#equalized_weights) for a convolution layer.
     """

     def __init__(self, in_features: int, out_features: int,
@@ -771,7 +771,7 @@ class EqualizedConv2d(nn.Module):
         super().__init__()
         # Padding size
         self.padding = padding
-        # [Learning-rate equalized weights]($equalized_weights)
+        # [Learning-rate equalized weights](#equalized_weights)
         self.weight = EqualizedWeight([out_features, in_features, kernel_size, kernel_size])
         # Bias
         self.bias = nn.Parameter(torch.ones(out_features))
diff --git a/labml_nn/optimizers/__init__.py b/labml_nn/optimizers/__init__.py
index 7cc57797..e6e9e87c 100644
--- a/labml_nn/optimizers/__init__.py
+++ b/labml_nn/optimizers/__init__.py
@@ -36,14 +36,17 @@ Each group can have it's own hyper-parameters like learning rates.
 In most common cases there will be only one group.
 This is when you initialize your optimizer with,
+
 ```python
 Optimizer(model.parameters())
 ```

 You can define multiple parameter groups when initializing the optimizer:
+
 ```python
 Optimizer([{'params': model1.parameters()},
            {'params': model2.parameters(), 'lr': 2}])
 ```
+
 Here we pass a list of groups. Each group is a dictionary with it's parameters under the key 'params'.
 You specify any hyper-parameters as well. If the hyper parameters are not defined they will default to
 the optimizer level defaults.
@@ -74,7 +77,7 @@ class GenericAdaptiveOptimizer(Optimizer):

         * `params` is the collection of parameters or set of parameter groups.
         * `defaults` a dictionary of default hyper-parameters
-        * 'lr` is the learning rate, $\alpha$
+        * `lr` is the learning rate, $\alpha$
         * `betas` is the tuple $(\beta_1, \beta_2)$
         * `eps` is $\epsilon$
         """
@@ -174,7 +177,8 @@ class WeightDecay:
           decay from the parameter. If added to the gradient it will go through the normal optimizer update.
         * `absolute` this flag indicates whether the weight decay coefficient is absolute. This is applicable
           when the decay is performed directly on the parameter. If this is false the actual decay is
-          `weight_decay` * `learning_rate`.
+          `weight_decay`
+          * `learning_rate`.
         """
         # Check hyper-parameters
         if not 0.0 <= weight_decay:
diff --git a/labml_nn/optimizers/ada_belief.py b/labml_nn/optimizers/ada_belief.py
index 11fd13e9..75f6bf34 100644
--- a/labml_nn/optimizers/ada_belief.py
+++ b/labml_nn/optimizers/ada_belief.py
@@ -61,11 +61,11 @@ class AdaBelief(RAdam):
         * `betas` is a tuple of ($\beta_1$, $\beta_2$)
         * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
         * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
-        * 'optimized_update' is a flag whether to optimize the bias correction of the second moment
+        * `optimized_update` is a flag whether to optimize the bias correction of the second moment
           by doing it after adding $\epsilon$
         * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam
-        * `degenerate_to_sgd` whether to use sgd when the rectification term $r_t is intractable
-        * 'rectify' is whether to use RAdam update
+        * `degenerate_to_sgd` whether to use sgd when the rectification term $r_t$ is intractable
+        * `rectify` is whether to use RAdam update
         * `defaults` is a dictionary of default for group values. This is useful when you want to
           extend the class `AdaBelief`.
         """
diff --git a/labml_nn/optimizers/radam.py b/labml_nn/optimizers/radam.py
index 97863746..3a075bf3 100644
--- a/labml_nn/optimizers/radam.py
+++ b/labml_nn/optimizers/radam.py
@@ -155,10 +155,10 @@ class RAdam(AMSGrad):
         * `betas` is a tuple of ($\beta_1$, $\beta_2$)
         * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
         * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
-        * 'optimized_update' is a flag whether to optimize the bias correction of the second moment
+        * `optimized_update` is a flag whether to optimize the bias correction of the second moment
           by doing it after adding $\epsilon$
         * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam
-        * `degenerate_to_sgd` whether to use sgd when the rectification term $r_t is intractable.
+        * `degenerate_to_sgd` whether to use sgd when the rectification term $r_t$ is intractable.
         * `defaults` is a dictionary of default for group values. This is useful when you want to
           extend the class `RAdam`.
         """
diff --git a/labml_nn/rl/ppo/gae.py b/labml_nn/rl/ppo/gae.py
index 0da1c273..dc85f36d 100644
--- a/labml_nn/rl/ppo/gae.py
+++ b/labml_nn/rl/ppo/gae.py
@@ -45,10 +45,10 @@ class GAE:
     $\hat{A_t}$
     \begin{align}
-    \delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)$
+    \delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)
     \\
     \hat{A_t} &= \delta_t + \gamma \lambda \delta_{t+1} + ...
     +
-    (\gamma \lambda)^{T - t + 1} \delta_{T - 1}$
+    (\gamma \lambda)^{T - t + 1} \delta_{T - 1}
     \\
     &= \delta_t + \gamma \lambda \hat{A_{t+1}}
     \end{align}
diff --git a/labml_nn/sketch_rnn/__init__.py b/labml_nn/sketch_rnn/__init__.py
index 275ea9da..d575a63b 100644
--- a/labml_nn/sketch_rnn/__init__.py
+++ b/labml_nn/sketch_rnn/__init__.py
@@ -114,7 +114,7 @@ class StrokesDataset(Dataset):
             # Mask is on until end of sequence
             self.mask[i, :len_seq + 1] = 1

-        # Start-of-sequence is $(0, 0, 1, 0, 0)
+        # Start-of-sequence is $(0, 0, 1, 0, 0)$
         self.data[:, 0, 2] = 1

     def __len__(self):
diff --git a/labml_nn/transformers/aft/__init__.py b/labml_nn/transformers/aft/__init__.py
index 205cd15f..c07b8aab 100644
--- a/labml_nn/transformers/aft/__init__.py
+++ b/labml_nn/transformers/aft/__init__.py
@@ -42,7 +42,7 @@ AFT Local only apply learned pair-wise position biases locally:
 \begin{align}
 w'_{t,t'} =
 \begin{cases}
-w_{t,t'}, & \text{for $\lvert t-t' \rvert \lt s$} \\
+w_{t,t'}, & {\text{for } \lvert t-t' \rvert \lt s} \\
 0, & \text{otherwise}
 \end{cases}
 \end{align}
@@ -79,7 +79,7 @@ class AFTLocal(Module):
     \begin{align}
     w'_{t,t'} =
     \begin{cases}
-    w_{t,t'}, & \text{for $\lvert t-t' \rvert \lt s$} \\
+    w_{t,t'}, & {\text{for } \lvert t-t' \rvert \lt s} \\
     0, & \text{otherwise}
     \end{cases}
     \end{align}
@@ -119,7 +119,7 @@ class AFTLocal(Module):
         \begin{align}
         m_{t,t'} =
         \begin{cases}
-        1, & \text{for $\lvert t-t' \rvert \lt s$} \\
+        1, & {\text{for } \lvert t-t' \rvert \lt s} \\
         0, & \text{otherwise}
         \end{cases}
         \end{align}
@@ -170,7 +170,7 @@ class AFTLocal(Module):
         # \begin{align}
         # w'_{t,t'} =
         # \begin{cases}
-        # w_{t,t'}, & \text{for $\lvert t-t' \rvert \lt s$} \\
+        # w_{t,t'}, & {\text{for }\lvert t-t' \rvert \lt s} \\
         # 0, & \text{otherwise}
         # \end{cases}
         # \end{align}
diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py
index 78ca5330..45ab56d8 100644
--- a/labml_nn/transformers/compressive/__init__.py
+++ b/labml_nn/transformers/compressive/__init__.py
@@ -40,7 +40,7 @@ We have implemented the latter here since it gives better results.

 This implementation uses pre-layer normalization while the paper uses post-layer normalization.

-Pre-layer norm does the layer norm before FFN[../feedforward.html) and
+Pre-layer norm does the layer norm before [FFN](../feedforward.html) and
 self-attention, and the pass-through in the residual connection is not normalized.
 This is supposed to be more stable in standard transformer setups.
@@ -246,7 +246,7 @@ class AttentionReconstructionLoss:
         This is a reimplementation of ['PrepareForMultiHeadAttention'](../mha.html#PrepareMHA)
         where the projections are done with the parameters detached from gradient computation.

-        * `pmha* is the ['PrepareForMultiHeadAttention'](../mha.html#PrepareMHA) module
+        * `pmha` is the ['PrepareForMultiHeadAttention'](../mha.html#PrepareMHA) module
         * `x` is tensor with the token embeddings
         """
diff --git a/labml_nn/transformers/compressive/readme.md b/labml_nn/transformers/compressive/readme.md
index 9be2989c..ad82cb0c 100644
--- a/labml_nn/transformers/compressive/readme.md
+++ b/labml_nn/transformers/compressive/readme.md
@@ -32,7 +32,7 @@ We have implemented the latter here since it gives better results.

 This implementation uses pre-layer normalization while the paper uses post-layer normalization.

-Pre-layer norm does the layer norm before FFN[../feedforward.html) and
+Pre-layer norm does the layer norm before [FFN](../feedforward.html) and
 self-attention, and the pass-through in the residual connection is not normalized.
 This is supposed to be more stable in standard transformer setups.
diff --git a/labml_nn/transformers/glu_variants/simple.py b/labml_nn/transformers/glu_variants/simple.py
index 302f9a3a..5c5e7f43 100644
--- a/labml_nn/transformers/glu_variants/simple.py
+++ b/labml_nn/transformers/glu_variants/simple.py
@@ -201,7 +201,7 @@ class Trainer:
         # Cross-entropy loss
         self.loss_func = nn.CrossEntropyLoss()
         # Number of training epochs;
-        # *note that our dataset definition repeats the data `seq_len` times in a single epoch
+        # *note that our dataset definition repeats the data `seq_len` times in a single epoch*
         self.epochs = configs.epochs
         # Gradient clipping norm
         self.grad_norm_clip = configs.grad_norm_clip
diff --git a/labml_nn/transformers/knn/eval_knn.py b/labml_nn/transformers/knn/eval_knn.py
index 038a07c8..80e3682c 100644
--- a/labml_nn/transformers/knn/eval_knn.py
+++ b/labml_nn/transformers/knn/eval_knn.py
@@ -46,7 +46,7 @@ def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray,
     # Normalize $f(c_i)$
     keys_found_n = keys_found / torch.sqrt((keys_found ** 2).sum(-1, keepdims=True) + 1e-10)

-    # Normalize $f($\color{yellowgreen}{c_t})$
+    # Normalize $f(\color{yellowgreen}{c_t})$
     queries_n = queries / torch.sqrt((queries ** 2).sum(-1, keepdims=True) + 1e-10)

     # Get the dot-product, or cosine similarity
diff --git a/labml_nn/transformers/mlm/__init__.py b/labml_nn/transformers/mlm/__init__.py
index a3423034..4be6e223 100644
--- a/labml_nn/transformers/mlm/__init__.py
+++ b/labml_nn/transformers/mlm/__init__.py
@@ -81,7 +81,7 @@ class MLM:
                  masking_prob: float = 0.15, randomize_prob: float = 0.1, no_change_prob: float = 0.1,
                  ):
         """
-        * `padding_token` is the padding token `[PAD].
+        * `padding_token` is the padding token `[PAD]`.
           We will use this to mark the labels that shouldn't be used for loss calculation.
         * `mask_token` is the masking token `[MASK]`.
         * `no_mask_tokens` is a list of tokens that should not be masked.
diff --git a/labml_nn/transformers/switch/__init__.py b/labml_nn/transformers/switch/__init__.py
index 6cd82fec..beadc047 100644
--- a/labml_nn/transformers/switch/__init__.py
+++ b/labml_nn/transformers/switch/__init__.py
@@ -155,11 +155,13 @@ class SwitchFeedForward(Module):
         final_output = final_output.view(seq_len, batch_size, d_model)

         # Return
+        #
         # * the final output
         # * number of tokens routed to each expert
         # * sum of probabilities for each expert
         # * number of tokens dropped.
         # * routing probabilities of the selected experts
+        #
         # These are used for the load balancing loss and logging
         return final_output, counts, route_prob.sum(0), len(dropped), route_prob_max