fix math align

2025-10-31 02:39:16 +08:00 · 2021-10-24 17:26:28 +05:30
parent 9b2dc5a94b
commit a8954c1cbb
34 changed files with 1188 additions and 1093 deletions
--- a/labml_nn/adaptive_computation/ponder_net/init.py
+++ b/labml_nn/adaptive_computation/ponder_net/init.py
@ -39,6 +39,7 @@ $p_n$.
 The step function is applied to a maximum number of steps denoted by $N$.

 The overall loss of PonderNet is
+
 \begin{align}
 L &= L_{Rec} + \beta L_{Reg} \\
 L_{Rec} &= \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n) \\
--- a/labml_nn/cfr/init.py
+++ b/labml_nn/cfr/init.py
@ -183,6 +183,7 @@ The average of utilities over a set of strategies is equal to the utility of the
 $$\frac{1}{T} \sum_{t=1}^T u_i(\sigma^t) = u_i(\bar{\sigma}^T)$$

 Therefore,
+
 \begin{align}
 2\epsilon &>
 \max_{\sigma^*_1 \in \Sigma_1} u_1(\sigma^*_1, \bar{\sigma}^T_{-1}) +
@ -194,6 +195,7 @@ $$\max_{\sigma^*_2 \in \Sigma_2} u_2(\sigma^*_2, \bar{\sigma}^T_{-2}) \ge u_2(\b
 = -u_1(\bar{\sigma}^T)$$

 Then,
+
 \begin{align}
 2\epsilon &>
 \max_{\sigma^*_1 \in \Sigma_1} u_1(\sigma^*_1, \bar{\sigma}^T_{-1}) +
--- a/labml_nn/diffusion/ddpm/evaluate.py
+++ b/labml_nn/diffusion/ddpm/evaluate.py
@ -52,6 +52,7 @@ class Sampler:
        alpha_bar_tm1 = torch.cat([self.alpha_bar.new_ones((1,)), self.alpha_bar[:-1]])

        # To calculate
+        #
        # \begin{align}
        # q(x_{t-1}|x_t, x_0) &= \mathcal{N} \Big(x_{t-1}; \tilde\mu_t(x_t, x_0), \tilde\beta_t \mathbf{I} \Big) \\
        # \tilde\mu_t(x_t, x_0) &= \frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1 - \bar\alpha_t}x_0
--- a/labml_nn/diffusion/ddpm/unet.py
+++ b/labml_nn/diffusion/ddpm/unet.py
@ -62,10 +62,12 @@ class TimeEmbedding(nn.Module):
    def forward(self, t: torch.Tensor):
        # Create sinusoidal position embeddings
        # [same as those from the transformer](../../transformers/positional_encoding.html)
+        #
        # \begin{align}
        # PE^{(1)}_{t,i} &= sin\Bigg(\frac{t}{10000^{\frac{i}{d - 1}}}\Bigg) \\
        # PE^{(2)}_{t,i} &= cos\Bigg(\frac{t}{10000^{\frac{i}{d - 1}}}\Bigg)
        # \end{align}
+        #
        # where $d$ is `half_dim`
        half_dim = self.n_channels // 8
        emb = math.log(10_000) / (half_dim - 1)
--- a/labml_nn/gan/cycle_gan/init.py
+++ b/labml_nn/gan/cycle_gan/init.py
@ -448,6 +448,7 @@ class Configs(BaseConfigs):
        $F$ translates images from $Y \rightarrow X$,
        $D_X$ tests if images are from $X$ space,
        $D_Y$ tests if images are from $Y$ space, and
+
        \begin{align}
        \mathcal{L}(G, F, D_X, D_Y)
            &= \mathcal{L}_{GAN}(G, D_Y, X, Y) \\
@ -490,6 +491,7 @@ class Configs(BaseConfigs):

        To solve $G^*, F^*$,
        discriminators $D_X$ and $D_Y$ should **ascend** on the gradient,
+
        \begin{align}
        \nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m
        &\Bigg[
@ -499,6 +501,7 @@ class Configs(BaseConfigs):
        & +\log\Big(1 - D_X\Big(F\Big(y^{(i)}\Big)\Big)\Big)
        \Bigg]
        \end{align}
+
        That is, descend on the *negative* log-likelihood loss.

        In order to stabilize the training the negative log-likelihood objective
@ -506,6 +509,7 @@ class Configs(BaseConfigs):
        the least-squared error of discriminator, labelling real images with 1,
        and generated images with 0.
        So we want to descend on the gradient,
+
        \begin{align}
        \nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m
        &\Bigg[
@ -518,6 +522,7 @@ class Configs(BaseConfigs):

        We use least-squares for generators also.
        The generators should *descend* on the gradient,
+
        \begin{align}
        \nabla_{\theta_{F, G}} \frac{1}{m} \sum_{i=1}^m
        &\Bigg[
@ -635,7 +640,9 @@ class Configs(BaseConfigs):
        """
        ### Optimize the discriminators with gan loss.
        """
+
        # GAN Loss
+        #
        # \begin{align}
        # \bigg(D_Y\Big(y ^ {(i)}\Big) - 1\bigg) ^ 2
        # + D_Y\Big(G\Big(x ^ {(i)}\Big)\Big) ^ 2 + \\
--- a/labml_nn/hypernetworks/hyper_lstm.py
+++ b/labml_nn/hypernetworks/hyper_lstm.py
@ -49,6 +49,7 @@ $W_{hz}$ will be $N_h \times N_h \times N_z$.

 To overcome this, we compute the weight parameters of the recurrent network by
 dynamically scaling each row of a matrix of the same size.
+
 \begin{align}
 d(z) = W_{hz} z_h \\
 \\
@ -60,6 +61,7 @@ d_1(z) W_{hd_1} \\
 d_{N_h}(z) W_{hd_{N_h}} \\
 \end{pmatrix}
 \end{align}
+
 where $W_{hd}$ is a $N_h \times N_h$ parameter matrix.

 We can further optimize this when we compute $\textcolor{cyan}{W_h} h$,
--- a/labml_nn/lstm/init.py
+++ b/labml_nn/lstm/init.py
@ -52,7 +52,6 @@ class LSTMCell(Module):
    g_t &= lin_x^g(x_t) + lin_h^g(h_{t-1}) \\
    o_t &= lin_x^o(x_t) + lin_h^o(h_{t-1})
    \end{align}
-
    """

    def __init__(self, input_size: int, hidden_size: int, layer_norm: bool = False):
--- a/labml_nn/normalization/batch_channel_norm/init.py
+++ b/labml_nn/normalization/batch_channel_norm/init.py
@ -146,6 +146,7 @@ class EstimatedBatchNorm(Module):
                var = mean_x2 - mean ** 2

                # Update exponential moving averages
+                #
                # \begin{align}
                # \hat{\mu}_C &\longleftarrow (1 - r)\hat{\mu}_C + r \frac{1}{B H W} \sum_{b,h,w} X_{b,c,h,w} \\
                # \hat{\sigma}^2_C &\longleftarrow (1 - r)\hat{\sigma}^2_C + r \frac{1}{B H W} \sum_{b,h,w} \big(X_{b,c,h,w} - \hat{\mu}_C \big)^2
--- a/labml_nn/optimizers/amsgrad.py
+++ b/labml_nn/optimizers/amsgrad.py
@ -134,6 +134,7 @@ def _synthetic_experiment(is_adam: bool):
    optimal parameters that minimize $\mathbb{E}[f(\theta)]$.

    Now let's define the synthetic problem,
+
    \begin{align}
    f_t(x) =
    \begin{cases}
@ -141,6 +142,7 @@ def _synthetic_experiment(is_adam: bool):
    -10  x, & \text{otherwise}
    \end{cases}
    \end{align}
+
    where $-1 \le x \le +1$.
    The optimal solution is $x = -1$.

--- a/labml_nn/optimizers/radam.py
+++ b/labml_nn/optimizers/radam.py
@ -33,6 +33,7 @@ without changing parameters or calculating momentum ($m_t$).
 Let $\sigma(g_1, ..., g_t)$ and $\psi(g_1, ..., g_t)$ be the functions to calculate
 momentum and adaptive learning rate.
 For Adam, they are
+
 \begin{align}
 \sigma(g_1, ..., g_t) &=  \frac{(1 - \beta_1)\sum_{i=1}^t \beta_1^{t-i} g_i}{1 - \beta_1^t} \\
 \psi(g_1, ..., g_t) &=  \sqrt \frac{1 - \beta_2^t}{(1 - \beta_2)\sum_{i=1}^t \beta_2^{t-i} g_i^2}
@ -41,16 +42,20 @@ For Adam, they are
 ### Exponential moving average as simple moving average

 The distribution of exponential moving average can be approximated as a simple moving average.
+
 \begin{align}
 p\Bigg(\frac{(1-\beta_2) \sum_{i=1}^t \beta_2^{t-i} g_i^2}{1 - \beta_2^t} \Bigg) \approx
 p\Bigg(\frac{\sum_{i=1}^{f(t,\beta_2)} g_{t+1-i}^2}{f(t,\beta_2)} \Bigg)
 \end{align}
+
 Here we are taking the simple moving average of the last $f(t,\beta_2)$ gradients.
 $f(t,\beta_2)$ satisfies the following,
+
 \begin{align}
 \frac{(1-\beta_2) \sum_{i=1}^t \beta_2^{t-i} \cdot i}{1 - \beta_2^t} =
 \frac{\sum_{i=1}^{f(t,\beta_2)} (t+1-i)}{f(t,\beta_2)}
 \end{align}
+
 which gives,
 $$f(t,\beta_2) = \frac{2}{1-\beta_2} - 1 - \frac{2 t \beta_2^t}{1 - \beta_2^t}$$

@ -83,6 +88,7 @@ $\rho_{\infty} = \frac{2}{1-\beta_2} - 1$. Let the minimum variance be $C_{\text

 In order to ensure that the adaptive learning
 rate $\psi(.)$ has consistent variance, we rectify the variance with $r$
+
 \begin{align}
 r = \sqrt{\frac{C_{\text{var}}}{Var\big[\psi(.)\big]}}
 \end{align}
@ -94,10 +100,12 @@ based on first order expansion of $\sqrt{\psi^2(.)}$
 🤪 I didn't get how it was derived.

 From $\text{Scale-inv} \mathcal{X}^2$ distribution we have,
+
 \begin{align}
 \mathbb{E}\big[\psi^2(.)\big] &= \frac{\rho / \sigma^2}{\rho-2} \\
 Var\big[\psi^2(.)\big] &= \frac{2 \rho^2 / \sigma^4}{(\rho-2)^2 (\rho - 4)}
 \end{align}
+
 which gives,
 $$
 Var[\psi(.)] \approx \frac{\rho}{2(\rho-2)(\rho-4)\sigma^2}
@ -106,6 +114,7 @@ $$
 ### Rectification term

 We have
+
 \begin{align}
 r &= \sqrt{\frac{C_{\text{var}}}{Var\big[\psi(.)\big]}} \\
 Var[\psi(.)] &\approx \frac{\rho}{2(\rho-2)(\rho-4)\sigma^2}
@ -121,6 +130,7 @@ Var[\psi(g_1,...,g_t)] &\approx \frac{\rho_t}{2(\rho_t-2)(\rho_t-4)\sigma^2}
 \end{align}

 This gives,
+
 \begin{align}
 r_t &= \sqrt{\frac{(\rho_t-2)(\rho_t-4)\rho_\infty}{(\rho_\infty-2)(\rho_\infty-4)\rho_t}}
 \end{align}
--- a/labml_nn/recurrent_highway_networks/init.py
+++ b/labml_nn/recurrent_highway_networks/init.py
@ -88,12 +88,14 @@ class RHNCell(Module):
                hg = self.hidden_lin[d](s)

            # Use the first half of `hg` to get $h_d^t$
+            #
            # \begin{align}
            # h_0^t &= \tanh(lin_{hx}(x) + lin_{hs}(s_D^{t-1})) \\
            # h_d^t &= \tanh(lin_{hs}^d(s_d^t))
            # \end{align}
            h = torch.tanh(hg[:, :self.hidden_size])
            # Use the second half of `hg` to get $g_d^t$
+            #
            # \begin{align}
            # g_0^t &= \sigma(lin_{gx}(x) + lin_{gs}^1(s_D^{t-1})) \\
            # g_d^t &= \sigma(lin_{gs}^d(s_d^t))
--- a/labml_nn/rl/dqn/init.py
+++ b/labml_nn/rl/dqn/init.py
@ -82,6 +82,7 @@ class QFuncLoss(Module):
    the value is taken from $\textcolor{orange}{\theta_i^{-}}$.

    And the loss function becomes,
+
    \begin{align}
        \mathcal{L}_i(\theta_i) = \mathop{\mathbb{E}}_{(s,a,r,s') \sim U(D)}
        \Bigg[
--- a/labml_nn/rl/ppo/init.py
+++ b/labml_nn/rl/ppo/init.py
@ -81,6 +81,7 @@ class ClippedPPOLoss(Module):
     $$d^\pi(s) = (1 - \gamma) \sum_{t=0}^\infty \gamma^t P(s_t = s | \pi)$$

    Then,
+
    \begin{align}
    J(\pi_\theta) - J(\pi_{\theta_{OLD}})
    &= \mathbb{E}_{\tau \sim \pi_\theta} \Biggl[
--- a/labml_nn/rl/ppo/gae.py
+++ b/labml_nn/rl/ppo/gae.py
@ -25,6 +25,7 @@ class GAE:
    def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
        """
        ### Calculate advantages
+
        \begin{align}
        \hat{A_t^{(1)}} &= r_t + \gamma V(s_{t+1}) - V(s_t)
        \\
--- a/labml_nn/transformers/aft/init.py
+++ b/labml_nn/transformers/aft/init.py
@ -116,6 +116,7 @@ class AFTLocal(Module):
        #### Create local mask

        This creates a mask for
+
        \begin{align}
        m_{t,t'} =
        \begin{cases}
@ -167,6 +168,7 @@ class AFTLocal(Module):
        value = self.value(value)

        # Get
+        #
        #     \begin{align}
        #     w'_{t,t'} =
        #     \begin{cases}
@ -174,6 +176,7 @@ class AFTLocal(Module):
        #     0, & \text{otherwise}
        #     \end{cases}
        #     \end{align}
+        #
        # using the mask
        pos_bias = self.pos_bias[:seq_len, :seq_len] * self.local_mask[:seq_len, :seq_len]
        pos_bias = pos_bias.unsqueeze(-1)
--- a/labml_nn/transformers/fast_weights/init.py
+++ b/labml_nn/transformers/fast_weights/init.py
@ -72,6 +72,7 @@ y^{(i)} &= \frac

 With $\textcolor{cyan}{W^{(i)}} = \sum^i_{j=1} v^{(j)} \otimes \phi(k^{(j)})$ and
 $z^{(i)} = \sum^i_{j=1} \textcolor{lightgreen}{\phi(k^{(j)})}$, we can calculate them efficiently:
+
 \begin{align}
 \textcolor{cyan}{W^{(i)}} &= \textcolor{cyan}{W^{(i-1)}} + v^{(i)} \otimes \textcolor{lightgreen}{\phi(k^{(i)})} \\
 z^{(i)} &= z^{(i-1)} + \textcolor{lightgreen}{\phi(k^{(i)})} \\
--- a/labml_nn/uncertainty/evidence/init.py
+++ b/labml_nn/uncertainty/evidence/init.py
@ -179,6 +179,7 @@ class SquaredErrorBayesRisk(Module):
     is the variance.

    This gives,
+
    \begin{align}
    \mathcal{L}(\Theta)
    &= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big) \\
@ -255,6 +256,7 @@ class KLDivergenceLoss(Module):
        strength_tilde = alpha_tilde.sum(dim=-1)

        # The first term
+        #
        # \begin{align}
        # &\log \Bigg( \frac{\Gamma \Big( \sum_{k=1}^K \tilde{\alpha}_k \Big)}
        #     {\Gamma(K) \prod_{k=1}^K \Gamma(\tilde{\alpha}_k)} \Bigg) \\