fix math align

Varuna Jayasiri
2021-10-24 17:26:28 +05:30
parent 9b2dc5a94b
commit a8954c1cbb
34 changed files with 1188 additions and 1093 deletions

View File

@ -39,6 +39,7 @@ $p_n$.
The step function is applied up to a maximum number of steps, denoted by $N$.
The overall loss of PonderNet is
\begin{align}
L &= L_{Rec} + \beta L_{Reg} \\
L_{Rec} &= \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n) \\
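As a rough illustration of the reconstruction term above, a minimal sketch, assuming `p` is a list of halting probabilities $p_n$, `y_hat` the corresponding per-step predictions, and `loss_func` a per-sample loss (illustrative names, not the repository's):

# L_Rec = sum_n p_n * L(y, y_hat_n), averaged over the batch
loss_rec = sum(p_n * loss_func(y_hat_n, y) for p_n, y_hat_n in zip(p, y_hat))
loss_rec = loss_rec.mean()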

View File

@ -183,6 +183,7 @@ The average of utilities over a set of strategies is equal to the utility of the
$$\frac{1}{T} \sum_{t=1}^T u_i(\sigma^t) = u_i(\bar{\sigma}^T)$$
Therefore,
\begin{align}
2\epsilon &>
\max_{\sigma^*_1 \in \Sigma_1} u_1(\sigma^*_1, \bar{\sigma}^T_{-1}) +
@ -194,6 +195,7 @@ $$\max_{\sigma^*_2 \in \Sigma_2} u_2(\sigma^*_2, \bar{\sigma}^T_{-2}) \ge u_2(\b
= -u_1(\bar{\sigma}^T)$$
Then,
\begin{align}
2\epsilon &>
\max_{\sigma^*_1 \in \Sigma_1} u_1(\sigma^*_1, \bar{\sigma}^T_{-1}) +

View File

@ -52,6 +52,7 @@ class Sampler:
alpha_bar_tm1 = torch.cat([self.alpha_bar.new_ones((1,)), self.alpha_bar[:-1]])
# To calculate
#
# \begin{align}
# q(x_{t-1}|x_t, x_0) &= \mathcal{N} \Big(x_{t-1}; \tilde\mu_t(x_t, x_0), \tilde\beta_t \mathbf{I} \Big) \\
# \tilde\mu_t(x_t, x_0) &= \frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1 - \bar\alpha_t}x_0
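A minimal sketch of the full posterior mean and variance this comment refers to, assuming per-timestep tensors `beta`, `alpha`, `alpha_bar`, `alpha_bar_tm1` and samples `x0`, `xt` (illustrative names; these are the standard DDPM formulas):

# mean \tilde{mu}_t of q(x_{t-1} | x_t, x_0)
mean = (alpha_bar_tm1 ** 0.5 * beta / (1 - alpha_bar)) * x0 + \
       (alpha ** 0.5 * (1 - alpha_bar_tm1) / (1 - alpha_bar)) * xt
# variance \tilde{beta}_t of q(x_{t-1} | x_t, x_0)
var = (1 - alpha_bar_tm1) / (1 - alpha_bar) * beta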

View File

@ -62,10 +62,12 @@ class TimeEmbedding(nn.Module):
def forward(self, t: torch.Tensor):
# Create sinusoidal position embeddings
# [same as those from the transformer](../../transformers/positional_encoding.html)
#
# \begin{align}
# PE^{(1)}_{t,i} &= \sin\Bigg(\frac{t}{10000^{\frac{i}{d - 1}}}\Bigg) \\
# PE^{(2)}_{t,i} &= \cos\Bigg(\frac{t}{10000^{\frac{i}{d - 1}}}\Bigg)
# \end{align}
#
# where $d$ is `half_dim`
half_dim = self.n_channels // 8
emb = math.log(10_000) / (half_dim - 1)
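One common way this computation is completed, shown as a hedged sketch rather than the file's exact code:

emb = torch.exp(torch.arange(half_dim, device=t.device) * -emb)  # 10000^{-i/(d - 1)}
emb = t[:, None] * emb[None, :]                                  # t / 10000^{i/(d - 1)}
emb = torch.cat((emb.sin(), emb.cos()), dim=1)                   # [PE^{(1)}, PE^{(2)}]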

View File

@ -448,6 +448,7 @@ class Configs(BaseConfigs):
$F$ translates images from $Y \rightarrow X$,
$D_X$ tests if images are from $X$ space,
$D_Y$ tests if images are from $Y$ space, and
\begin{align}
\mathcal{L}(G, F, D_X, D_Y)
&= \mathcal{L}_{GAN}(G, D_Y, X, Y) \\
@ -490,6 +491,7 @@ class Configs(BaseConfigs):
To solve for $G^*$ and $F^*$,
discriminators $D_X$ and $D_Y$ should **ascend** on the gradient,
\begin{align}
\nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m
&\Bigg[
@ -499,6 +501,7 @@ class Configs(BaseConfigs):
& +\log\Big(1 - D_X\Big(F\Big(y^{(i)}\Big)\Big)\Big)
\Bigg]
\end{align}
That is, descend on the *negative* log-likelihood loss.
In order to stabilize the training, the negative log-likelihood objective
@ -506,6 +509,7 @@ class Configs(BaseConfigs):
the least-squared error of the discriminator, labelling real images with 1
and generated images with 0.
So we want to descend on the gradient,
\begin{align}
\nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m
&\Bigg[
@ -518,6 +522,7 @@ class Configs(BaseConfigs):
We use least-squares for generators also.
The generators should *descend* on the gradient,
\begin{align}
\nabla_{\theta_{F, G}} \frac{1}{m} \sum_{i=1}^m
&\Bigg[
@ -635,7 +640,9 @@ class Configs(BaseConfigs):
"""
### Optimize the discriminators with GAN loss.
"""
# GAN Loss
#
# \begin{align}
# \bigg(D_Y\Big(y ^ {(i)}\Big) - 1\bigg) ^ 2
# + D_Y\Big(G\Big(x ^ {(i)}\Big)\Big) ^ 2 + \\
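A minimal sketch of this least-squares discriminator term, assuming `disc_y` is $D_Y$, `gen_xy` is $G$, and `x`, `y` are batches from the two domains (illustrative names):

# (D_Y(y) - 1)^2 on real images + D_Y(G(x))^2 on generated images
loss_d_y = ((disc_y(y) - 1) ** 2).mean() + (disc_y(gen_xy(x).detach()) ** 2).mean()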

View File

@ -49,6 +49,7 @@ $W_{hz}$ will be $N_h \times N_h \times N_z$.
To overcome this, we compute the weight parameters of the recurrent network by
dynamically scaling each row of a matrix of the same size.
\begin{align}
d(z) = W_{hz} z_h \\
\\
@ -60,6 +61,7 @@ d_1(z) W_{hd_1} \\
d_{N_h}(z) W_{hd_{N_h}} \\
\end{pmatrix}
\end{align}
where $W_{hd}$ is a $N_h \times N_h$ parameter matrix.
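A minimal sketch of the row scaling described above, assuming `w_hz` is $W_{hz}$ (shape $N_h \times N_z$), `z_h` is $z_h$, and `w_hd` is $W_{hd}$ (illustrative names):

d = w_hz @ z_h           # d(z): one scaling factor per row, shape [N_h]
w_h = d[:, None] * w_hd  # row i of W_{hd} scaled by d_i(z)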
We can further optimize this when we compute $\textcolor{cyan}{W_h} h$,

View File

@ -52,7 +52,6 @@ class LSTMCell(Module):
g_t &= lin_x^g(x_t) + lin_h^g(h_{t-1}) \\
o_t &= lin_x^o(x_t) + lin_h^o(h_{t-1})
\end{align}
"""
def __init__(self, input_size: int, hidden_size: int, layer_norm: bool = False):
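For reference, these pre-activations are typically combined into the next cell and hidden state with the standard LSTM equations (not shown in this fragment):
\begin{align}
c_t &= \sigma(f_t) \odot c_{t-1} + \sigma(i_t) \odot \tanh(g_t) \\
h_t &= \sigma(o_t) \odot \tanh(c_t)
\end{align}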

View File

@ -146,6 +146,7 @@ class EstimatedBatchNorm(Module):
var = mean_x2 - mean ** 2
# Update exponential moving averages
#
# \begin{align}
# \hat{\mu}_C &\longleftarrow (1 - r)\hat{\mu}_C + r \frac{1}{B H W} \sum_{b,h,w} X_{b,c,h,w} \\
# \hat{\sigma}^2_C &\longleftarrow (1 - r)\hat{\sigma}^2_C + r \frac{1}{B H W} \sum_{b,h,w} \big(X_{b,c,h,w} - \hat{\mu}_C \big)^2
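A minimal sketch of the update these two lines describe, assuming `momentum` holds $r$ and `exp_mean`, `exp_var` hold the running estimates, with `mean` and `var` computed as above (illustrative names):

exp_mean = (1 - momentum) * exp_mean + momentum * mean
exp_var = (1 - momentum) * exp_var + momentum * var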

View File

@ -134,6 +134,7 @@ def _synthetic_experiment(is_adam: bool):
optimal parameters that minimize $\mathbb{E}[f(\theta)]$.
Now let's define the synthetic problem,
\begin{align}
f_t(x) =
\begin{cases}
@ -141,6 +142,7 @@ def _synthetic_experiment(is_adam: bool):
-10 x, & \text{otherwise}
\end{cases}
\end{align}
where $-1 \le x \le +1$.
The optimal solution is $x = -1$.

View File

@ -33,6 +33,7 @@ without changing parameters or calculating momentum ($m_t$).
Let $\sigma(g_1, ..., g_t)$ and $\psi(g_1, ..., g_t)$ be the functions to calculate
momentum and adaptive learning rate.
For Adam, they are
\begin{align}
\sigma(g_1, ..., g_t) &= \frac{(1 - \beta_1)\sum_{i=1}^t \beta_1^{t-i} g_i}{1 - \beta_1^t} \\
\psi(g_1, ..., g_t) &= \sqrt \frac{1 - \beta_2^t}{(1 - \beta_2)\sum_{i=1}^t \beta_2^{t-i} g_i^2}
@ -41,16 +42,20 @@ For Adam, they are
### Exponential moving average as simple moving average
The distribution of the exponential moving average can be approximated by that of a simple moving average.
\begin{align}
p\Bigg(\frac{(1-\beta_2) \sum_{i=1}^t \beta_2^{t-i} g_i^2}{1 - \beta_2^t} \Bigg) \approx
p\Bigg(\frac{\sum_{i=1}^{f(t,\beta_2)} g_{t+1-i}^2}{f(t,\beta_2)} \Bigg)
\end{align}
Here we are taking the simple moving average of the last $f(t,\beta_2)$ gradients.
$f(t,\beta_2)$ satisfies the following,
\begin{align}
\frac{(1-\beta_2) \sum_{i=1}^t \beta_2^{t-i} \cdot i}{1 - \beta_2^t} =
\frac{\sum_{i=1}^{f(t,\beta_2)} (t+1-i)}{f(t,\beta_2)}
\end{align}
which gives,
$$f(t,\beta_2) = \frac{2}{1-\beta_2} - 1 - \frac{2 t \beta_2^t}{1 - \beta_2^t}$$
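As a sanity check, the same quantity as a small Python helper (a sketch, not the optimizer's code):

def f(t: int, beta2: float) -> float:
    # f(t, beta_2) = 2/(1 - beta_2) - 1 - 2 t beta_2^t / (1 - beta_2^t)
    return 2 / (1 - beta2) - 1 - 2 * t * beta2 ** t / (1 - beta2 ** t)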
@ -83,6 +88,7 @@ $\rho_{\infty} = \frac{2}{1-\beta_2} - 1$. Let the minimum variance be $C_{\text
In order to ensure that the adaptive learning
rate $\psi(.)$ has consistent variance, we rectify the variance with $r$
\begin{align}
r = \sqrt{\frac{C_{\text{var}}}{Var\big[\psi(.)\big]}}
\end{align}
@ -94,10 +100,12 @@ based on first order expansion of $\sqrt{\psi^2(.)}$
🤪 I didn't get how it was derived.
From the $\text{Scale-inv-}\chi^2$ distribution we have,
\begin{align}
\mathbb{E}\big[\psi^2(.)\big] &= \frac{\rho / \sigma^2}{\rho-2} \\
Var\big[\psi^2(.)\big] &= \frac{2 \rho^2 / \sigma^4}{(\rho-2)^2 (\rho - 4)}
\end{align}
which gives,
$$
Var[\psi(.)] \approx \frac{\rho}{2(\rho-2)(\rho-4)\sigma^2}
@ -106,6 +114,7 @@ $$
### Rectification term
We have
\begin{align}
r &= \sqrt{\frac{C_{\text{var}}}{Var\big[\psi(.)\big]}} \\
Var[\psi(.)] &\approx \frac{\rho}{2(\rho-2)(\rho-4)\sigma^2}
@ -121,6 +130,7 @@ Var[\psi(g_1,...,g_t)] &\approx \frac{\rho_t}{2(\rho_t-2)(\rho_t-4)\sigma^2}
\end{align}
This gives,
\begin{align}
r_t &= \sqrt{\frac{(\rho_t-2)(\rho_t-4)\rho_\infty}{(\rho_\infty-2)(\rho_\infty-4)\rho_t}}
\end{align}
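The same rectification term as a small hedged sketch, where `rho_t` is $\rho_t$ and `rho_inf` is $\rho_\infty$ as defined above (illustrative names):

import math

def rectification(rho_t: float, rho_inf: float) -> float:
    return math.sqrt((rho_t - 2) * (rho_t - 4) * rho_inf /
                     ((rho_inf - 2) * (rho_inf - 4) * rho_t))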

View File

@ -88,12 +88,14 @@ class RHNCell(Module):
hg = self.hidden_lin[d](s)
# Use the first half of `hg` to get $h_d^t$
#
# \begin{align}
# h_0^t &= \tanh(lin_{hx}(x) + lin_{hs}(s_D^{t-1})) \\
# h_d^t &= \tanh(lin_{hs}^d(s_d^t))
# \end{align}
h = torch.tanh(hg[:, :self.hidden_size])
# Use the second half of `hg` to get $g_d^t$
#
# \begin{align}
# g_0^t &= \sigma(lin_{gx}(x) + lin_{gs}^1(s_D^{t-1})) \\
# g_d^t &= \sigma(lin_{gs}^d(s_d^t))
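For context, a hedged sketch of how $h_d^t$ and $g_d^t$ are typically combined in the highway step with a coupled carry gate (not necessarily the file's exact code):

g = torch.sigmoid(hg[:, self.hidden_size:])  # second half of `hg`
s = h * g + s * (1 - g)                      # s_d^t = h_d^t * g_d^t + s_{d-1}^t * (1 - g_d^t)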

View File

@ -82,6 +82,7 @@ class QFuncLoss(Module):
the value is taken from $\textcolor{orange}{\theta_i^{-}}$.
And the loss function becomes,
\begin{align}
\mathcal{L}_i(\theta_i) = \mathop{\mathbb{E}}_{(s,a,r,s') \sim U(D)}
\Bigg[
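A minimal sketch of the double Q-learning target this loss uses, assuming `q` is the online network ($\theta_i$), `q_target` the target network ($\textcolor{orange}{\theta_i^{-}}$), and standard replay-batch tensors (illustrative names):

with torch.no_grad():
    best_action = q(next_state).argmax(-1, keepdim=True)               # argmax under theta_i
    q_next = q_target(next_state).gather(-1, best_action).squeeze(-1)  # evaluated under theta_i^-
    target = reward + gamma * q_next * (1.0 - done.float())
q_sa = q(state).gather(-1, action.unsqueeze(-1)).squeeze(-1)
loss = torch.nn.functional.smooth_l1_loss(q_sa, target)                # Huber-style loss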

View File

@ -81,6 +81,7 @@ class ClippedPPOLoss(Module):
$$d^\pi(s) = (1 - \gamma) \sum_{t=0}^\infty \gamma^t P(s_t = s | \pi)$$
Then,
\begin{align}
J(\pi_\theta) - J(\pi_{\theta_{OLD}})
&= \mathbb{E}_{\tau \sim \pi_\theta} \Biggl[
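For reference, the clipped surrogate objective itself, as a hedged sketch with illustrative names (`log_pi`, `log_pi_old`, `advantage`, `clip_eps`):

ratio = torch.exp(log_pi - log_pi_old)                    # r_t(theta)
clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps)
loss = -torch.min(ratio * advantage, clipped * advantage).mean()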

View File

@ -25,6 +25,7 @@ class GAE:
def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
"""
### Calculate advantages
\begin{align}
\hat{A}_t^{(1)} &= r_t + \gamma V(s_{t+1}) - V(s_t)
\\
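A minimal sketch of the backward recursion that combines these estimates, assuming `values` carries one extra bootstrapped entry for the state after the last step and `gamma`, `lambda_` are the discount and GAE parameters (illustrative names):

advantages = np.zeros_like(rewards)
last_advantage = 0.0
for t in reversed(range(rewards.shape[0])):
    mask = 1.0 - done[t]                                           # zero out terminal states
    delta = rewards[t] + gamma * values[t + 1] * mask - values[t]  # TD error
    last_advantage = delta + gamma * lambda_ * mask * last_advantage
    advantages[t] = last_advantage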

View File

@ -116,6 +116,7 @@ class AFTLocal(Module):
#### Create local mask
This creates a mask for
\begin{align}
m_{t,t'} =
\begin{cases}
@ -167,6 +168,7 @@ class AFTLocal(Module):
value = self.value(value)
# Get
#
# \begin{align}
# w'_{t,t'} =
# \begin{cases}
@ -174,6 +176,7 @@ class AFTLocal(Module):
# 0, & \text{otherwise}
# \end{cases}
# \end{align}
#
# using the mask
pos_bias = self.pos_bias[:seq_len, :seq_len] * self.local_mask[:seq_len, :seq_len]
pos_bias = pos_bias.unsqueeze(-1)

View File

@ -72,6 +72,7 @@ y^{(i)} &= \frac
With $\textcolor{cyan}{W^{(i)}} = \sum^i_{j=1} v^{(j)} \otimes \phi(k^{(j)})$ and
$z^{(i)} = \sum^i_{j=1} \textcolor{lightgreen}{\phi(k^{(j)})}$, we can calculate them efficiently:
\begin{align}
\textcolor{cyan}{W^{(i)}} &= \textcolor{cyan}{W^{(i-1)}} + v^{(i)} \otimes \textcolor{lightgreen}{\phi(k^{(i)})} \\
z^{(i)} &= z^{(i-1)} + \textcolor{lightgreen}{\phi(k^{(i)})} \\
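A minimal sketch of this recurrent update for one step, assuming batched tensors `v_i` $= v^{(i)}$, `phi_k_i` $= \textcolor{lightgreen}{\phi(k^{(i)})}$, and `phi_q_i` $= \phi(q^{(i)})$ (illustrative names):

w = w + torch.einsum('bv,bk->bvk', v_i, phi_k_i)  # W^{(i)} = W^{(i-1)} + outer(v, phi(k))
z = z + phi_k_i                                   # z^{(i)} = z^{(i-1)} + phi(k)
y_i = torch.einsum('bvk,bk->bv', w, phi_q_i) \
      / (z * phi_q_i).sum(dim=-1, keepdim=True)   # attention output y^{(i)}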

View File

@ -179,6 +179,7 @@ class SquaredErrorBayesRisk(Module):
is the variance.
This gives,
\begin{align}
\mathcal{L}(\Theta)
&= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big) \\
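A minimal sketch of that expectation under a Dirichlet with parameters `alpha`, using $\mathbb{E}[p_k] = \alpha_k / S$ and $\text{Var}(p_k) = \mathbb{E}[p_k](1 - \mathbb{E}[p_k]) / (S + 1)$ (illustrative names, not the file's exact code):

strength = alpha.sum(dim=-1, keepdim=True)      # S
p_mean = alpha / strength                       # E[p_k]
p_var = p_mean * (1 - p_mean) / (strength + 1)  # Var(p_k)
# (y_k - E[p_k])^2 + Var(p_k) equals y_k^2 - 2 y_k E[p_k] + E[p_k^2]
loss = ((y - p_mean) ** 2 + p_var).sum(dim=-1).mean()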
@ -255,6 +256,7 @@ class KLDivergenceLoss(Module):
strength_tilde = alpha_tilde.sum(dim=-1)
# The first term
#
# \begin{align}
# &\log \Bigg( \frac{\Gamma \Big( \sum_{k=1}^K \tilde{\alpha}_k \Big)}
# {\Gamma(K) \prod_{k=1}^K \Gamma(\tilde{\alpha}_k)} \Bigg) \\
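A hedged sketch of this first term, assuming `n_classes` is $K$ and using `math.lgamma` from the standard library (`strength_tilde` and `alpha_tilde` are as above; illustrative, not the file's exact code):

# log Gamma(sum_k alpha~_k) - log Gamma(K) - sum_k log Gamma(alpha~_k)
first = (torch.lgamma(strength_tilde)
         - math.lgamma(n_classes)
         - torch.lgamma(alpha_tilde).sum(dim=-1))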