mirror of
				https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
				synced 2025-10-31 02:39:16 +08:00 
			
		
		
		
	fix math align
This commit is contained in:
		| @ -39,6 +39,7 @@ $p_n$. | ||||
| The step function is applied to a maximum number of steps denoted by $N$. | ||||
|  | ||||
| The overall loss of PonderNet is | ||||
|  | ||||
| \begin{align} | ||||
| L &= L_{Rec} + \beta L_{Reg} \\ | ||||
| L_{Rec} &= \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n) \\ | ||||
|  | ||||
| @ -183,6 +183,7 @@ The average of utilities over a set of strategies is equal to the utility of the | ||||
| $$\frac{1}{T} \sum_{t=1}^T u_i(\sigma^t) = u_i(\bar{\sigma}^T)$$ | ||||
|  | ||||
| Therefore, | ||||
|  | ||||
| \begin{align} | ||||
| 2\epsilon &> | ||||
| \max_{\sigma^*_1 \in \Sigma_1} u_1(\sigma^*_1, \bar{\sigma}^T_{-1}) + | ||||
| @ -194,6 +195,7 @@ $$\max_{\sigma^*_2 \in \Sigma_2} u_2(\sigma^*_2, \bar{\sigma}^T_{-2}) \ge u_2(\b | ||||
|  = -u_1(\bar{\sigma}^T)$$ | ||||
|  | ||||
| Then, | ||||
|  | ||||
| \begin{align} | ||||
| 2\epsilon &> | ||||
| \max_{\sigma^*_1 \in \Sigma_1} u_1(\sigma^*_1, \bar{\sigma}^T_{-1}) + | ||||
|  | ||||
| @ -52,6 +52,7 @@ class Sampler: | ||||
|         alpha_bar_tm1 = torch.cat([self.alpha_bar.new_ones((1,)), self.alpha_bar[:-1]]) | ||||
|  | ||||
|         # To calculate | ||||
|         # | ||||
|         # \begin{align} | ||||
|         # q(x_{t-1}|x_t, x_0) &= \mathcal{N} \Big(x_{t-1}; \tilde\mu_t(x_t, x_0), \tilde\beta_t \mathbf{I} \Big) \\ | ||||
|         # \tilde\mu_t(x_t, x_0) &= \frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1 - \bar\alpha_t}x_0 | ||||
|  | ||||
| @ -62,10 +62,12 @@ class TimeEmbedding(nn.Module): | ||||
|     def forward(self, t: torch.Tensor): | ||||
|         # Create sinusoidal position embeddings | ||||
|         # [same as those from the transformer](../../transformers/positional_encoding.html) | ||||
|         # | ||||
|         # \begin{align} | ||||
|         # PE^{(1)}_{t,i} &= \sin\Bigg(\frac{t}{10000^{\frac{i}{d - 1}}}\Bigg) \\ | ||||
|         # PE^{(2)}_{t,i} &= \cos\Bigg(\frac{t}{10000^{\frac{i}{d - 1}}}\Bigg) | ||||
|         # \end{align} | ||||
|         # | ||||
|         # where $d$ is `half_dim` | ||||
|         half_dim = self.n_channels // 8 | ||||
|         emb = math.log(10_000) / (half_dim - 1) | ||||
|  | ||||
| @ -448,6 +448,7 @@ class Configs(BaseConfigs): | ||||
|         $F$ translates images from $Y \rightarrow X$, | ||||
|         $D_X$ tests if images are from $X$ space, | ||||
|         $D_Y$ tests if images are from $Y$ space, and | ||||
|  | ||||
|         \begin{align} | ||||
|         \mathcal{L}(G, F, D_X, D_Y) | ||||
|             &= \mathcal{L}_{GAN}(G, D_Y, X, Y) \\ | ||||
| @ -490,6 +491,7 @@ class Configs(BaseConfigs): | ||||
|  | ||||
|         To solve $G^*, F^*$, | ||||
|         discriminators $D_X$ and $D_Y$ should **ascend** on the gradient, | ||||
|  | ||||
|         \begin{align} | ||||
|         \nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m | ||||
|         &\Bigg[ | ||||
| @ -499,6 +501,7 @@ class Configs(BaseConfigs): | ||||
|         & +\log\Big(1 - D_X\Big(F\Big(y^{(i)}\Big)\Big)\Big) | ||||
|         \Bigg] | ||||
|         \end{align} | ||||
|  | ||||
|         That is, descend on the *negative* log-likelihood loss. | ||||
|  | ||||
|         In order to stabilize the training the negative log-likelihood objective | ||||
| @ -506,6 +509,7 @@ class Configs(BaseConfigs): | ||||
|         the least-squared error of discriminator, labelling real images with 1, | ||||
|         and generated images with 0. | ||||
|         So we want to descend on the gradient, | ||||
|  | ||||
|         \begin{align} | ||||
|         \nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m | ||||
|         &\Bigg[ | ||||
| @ -518,6 +522,7 @@ class Configs(BaseConfigs): | ||||
|  | ||||
|         We use least-squares for generators also. | ||||
|         The generators should *descend* on the gradient, | ||||
|  | ||||
|         \begin{align} | ||||
|         \nabla_{\theta_{F, G}} \frac{1}{m} \sum_{i=1}^m | ||||
|         &\Bigg[ | ||||
| @ -635,7 +640,9 @@ class Configs(BaseConfigs): | ||||
|         """ | ||||
|         ### Optimize the discriminators with gan loss. | ||||
|         """ | ||||
|  | ||||
|         # GAN Loss | ||||
|         # | ||||
|         # \begin{align} | ||||
|         # \bigg(D_Y\Big(y ^ {(i)}\Big) - 1\bigg) ^ 2 | ||||
|         # + D_Y\Big(G\Big(x ^ {(i)}\Big)\Big) ^ 2 + \\ | ||||
|  | ||||
| @ -49,6 +49,7 @@ $W_{hz}$ will be $N_h \times N_h \times N_z$. | ||||
|  | ||||
| To overcome this, we compute the weight parameters of the recurrent network by | ||||
| dynamically scaling each row of a matrix of the same size. | ||||
|  | ||||
| \begin{align} | ||||
| d(z) = W_{hz} z_h \\ | ||||
| \\ | ||||
| @ -60,6 +61,7 @@ d_1(z) W_{hd_1} \\ | ||||
| d_{N_h}(z) W_{hd_{N_h}} \\ | ||||
| \end{pmatrix} | ||||
| \end{align} | ||||
|  | ||||
| where $W_{hd}$ is a $N_h \times N_h$ parameter matrix. | ||||
|  | ||||
| We can further optimize this when we compute $\textcolor{cyan}{W_h} h$, | ||||
|  | ||||
| @ -52,7 +52,6 @@ class LSTMCell(Module): | ||||
|     g_t &= lin_x^g(x_t) + lin_h^g(h_{t-1}) \\ | ||||
|     o_t &= lin_x^o(x_t) + lin_h^o(h_{t-1}) | ||||
|     \end{align} | ||||
|  | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, input_size: int, hidden_size: int, layer_norm: bool = False): | ||||
|  | ||||
| @ -146,6 +146,7 @@ class EstimatedBatchNorm(Module): | ||||
|                 var = mean_x2 - mean ** 2 | ||||
|  | ||||
|                 # Update exponential moving averages | ||||
|                 # | ||||
|                 # \begin{align} | ||||
|                 # \hat{\mu}_C &\longleftarrow (1 - r)\hat{\mu}_C + r \frac{1}{B H W} \sum_{b,h,w} X_{b,c,h,w} \\ | ||||
|                 # \hat{\sigma}^2_C &\longleftarrow (1 - r)\hat{\sigma}^2_C + r \frac{1}{B H W} \sum_{b,h,w} \big(X_{b,c,h,w} - \hat{\mu}_C \big)^2 | ||||
|  | ||||
| @ -134,6 +134,7 @@ def _synthetic_experiment(is_adam: bool): | ||||
|     optimal parameters that minimize $\mathbb{E}[f(\theta)]$. | ||||
|  | ||||
|     Now let's define the synthetic problem, | ||||
|  | ||||
|     \begin{align} | ||||
|     f_t(x) = | ||||
|     \begin{cases} | ||||
| @ -141,6 +142,7 @@ def _synthetic_experiment(is_adam: bool): | ||||
|     -10  x, & \text{otherwise} | ||||
|     \end{cases} | ||||
|     \end{align} | ||||
|  | ||||
|     where $-1 \le x \le +1$. | ||||
|     The optimal solution is $x = -1$. | ||||
|  | ||||
|  | ||||
| @ -33,6 +33,7 @@ without changing parameters or calculating momentum ($m_t$). | ||||
| Let $\sigma(g_1, ..., g_t)$ and $\psi(g_1, ..., g_t)$ be the functions to calculate | ||||
| momentum and adaptive learning rate. | ||||
| For Adam, they are | ||||
|  | ||||
| \begin{align} | ||||
| \sigma(g_1, ..., g_t) &=  \frac{(1 - \beta_1)\sum_{i=1}^t \beta_1^{t-i} g_i}{1 - \beta_1^t} \\ | ||||
| \psi(g_1, ..., g_t) &=  \sqrt \frac{1 - \beta_2^t}{(1 - \beta_2)\sum_{i=1}^t \beta_2^{t-i} g_i^2} | ||||
| @ -41,16 +42,20 @@ For Adam, they are | ||||
| ### Exponential moving average as simple moving average | ||||
|  | ||||
| The distribution of exponential moving average can be approximated as a simple moving average. | ||||
|  | ||||
| \begin{align} | ||||
| p\Bigg(\frac{(1-\beta_2) \sum_{i=1}^t \beta_2^{t-i} g_i^2}{1 - \beta_2^t} \Bigg) \approx | ||||
| p\Bigg(\frac{\sum_{i=1}^{f(t,\beta_2)} g_{t+1-i}^2}{f(t,\beta_2)} \Bigg) | ||||
| \end{align} | ||||
|  | ||||
| Here we are taking the simple moving average of the last $f(t,\beta_2)$ gradients. | ||||
| $f(t,\beta_2)$ satisfies the following, | ||||
|  | ||||
| \begin{align} | ||||
| \frac{(1-\beta_2) \sum_{i=1}^t \beta_2^{t-i} \cdot i}{1 - \beta_2^t} = | ||||
| \frac{\sum_{i=1}^{f(t,\beta_2)} (t+1-i)}{f(t,\beta_2)} | ||||
| \end{align} | ||||
|  | ||||
| which gives, | ||||
| $$f(t,\beta_2) = \frac{2}{1-\beta_2} - 1 - \frac{2 t \beta_2^t}{1 - \beta_2^t}$$ | ||||
|  | ||||
| @ -83,6 +88,7 @@ $\rho_{\infty} = \frac{2}{1-\beta_2} - 1$. Let the minimum variance be $C_{\text | ||||
|  | ||||
| In order to ensure that the adaptive learning | ||||
| rate $\psi(.)$ has consistent variance, we rectify the variance with $r$ | ||||
|  | ||||
| \begin{align} | ||||
| r = \sqrt{\frac{C_{\text{var}}}{Var\big[\psi(.)\big]}} | ||||
| \end{align} | ||||
| @ -94,10 +100,12 @@ based on first order expansion of $\sqrt{\psi^2(.)}$ | ||||
| 🤪 I didn't get how it was derived. | ||||
|  | ||||
| From $\text{Scale-inv} \mathcal{X}^2$ distribution we have, | ||||
|  | ||||
| \begin{align} | ||||
| \mathbb{E}\big[\psi^2(.)\big] &= \frac{\rho / \sigma^2}{\rho-2} \\ | ||||
| Var\big[\psi^2(.)\big] &= \frac{2 \rho^2 / \sigma^4}{(\rho-2)^2 (\rho - 4)} | ||||
| \end{align} | ||||
|  | ||||
| which gives, | ||||
| $$ | ||||
| Var[\psi(.)] \approx \frac{\rho}{2(\rho-2)(\rho-4)\sigma^2} | ||||
| @ -106,6 +114,7 @@ $$ | ||||
| ### Rectification term | ||||
|  | ||||
| We have | ||||
|  | ||||
| \begin{align} | ||||
| r &= \sqrt{\frac{C_{\text{var}}}{Var\big[\psi(.)\big]}} \\ | ||||
| Var[\psi(.)] &\approx \frac{\rho}{2(\rho-2)(\rho-4)\sigma^2} | ||||
| @ -121,6 +130,7 @@ Var[\psi(g_1,...,g_t)] &\approx \frac{\rho_t}{2(\rho_t-2)(\rho_t-4)\sigma^2} | ||||
| \end{align} | ||||
|  | ||||
| This gives, | ||||
|  | ||||
| \begin{align} | ||||
| r_t &= \sqrt{\frac{(\rho_t-2)(\rho_t-4)\rho_\infty}{(\rho_\infty-2)(\rho_\infty-4)\rho_t}} | ||||
| \end{align} | ||||
|  | ||||
| @ -88,12 +88,14 @@ class RHNCell(Module): | ||||
|                 hg = self.hidden_lin[d](s) | ||||
|  | ||||
|             # Use the first half of `hg` to get $h_d^t$ | ||||
|             # | ||||
|             # \begin{align} | ||||
|             # h_0^t &= \tanh(lin_{hx}(x) + lin_{hs}(s_D^{t-1})) \\ | ||||
|             # h_d^t &= \tanh(lin_{hs}^d(s_d^t)) | ||||
|             # \end{align} | ||||
|             h = torch.tanh(hg[:, :self.hidden_size]) | ||||
|             # Use the second half of `hg` to get $g_d^t$ | ||||
|             # | ||||
|             # \begin{align} | ||||
|             # g_0^t &= \sigma(lin_{gx}(x) + lin_{gs}^1(s_D^{t-1})) \\ | ||||
|             # g_d^t &= \sigma(lin_{gs}^d(s_d^t)) | ||||
|  | ||||
| @ -82,6 +82,7 @@ class QFuncLoss(Module): | ||||
|     the value is taken from $\textcolor{orange}{\theta_i^{-}}$. | ||||
|  | ||||
|     And the loss function becomes, | ||||
|  | ||||
|     \begin{align} | ||||
|         \mathcal{L}_i(\theta_i) = \mathop{\mathbb{E}}_{(s,a,r,s') \sim U(D)} | ||||
|         \Bigg[ | ||||
|  | ||||
| @ -81,6 +81,7 @@ class ClippedPPOLoss(Module): | ||||
|      $$d^\pi(s) = (1 - \gamma) \sum_{t=0}^\infty \gamma^t P(s_t = s | \pi)$$ | ||||
|  | ||||
|     Then, | ||||
|  | ||||
|     \begin{align} | ||||
|     J(\pi_\theta) - J(\pi_{\theta_{OLD}}) | ||||
|     &= \mathbb{E}_{\tau \sim \pi_\theta} \Biggl[ | ||||
|  | ||||
| @ -25,6 +25,7 @@ class GAE: | ||||
|     def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray: | ||||
|         """ | ||||
|         ### Calculate advantages | ||||
|  | ||||
|         \begin{align} | ||||
|         \hat{A_t^{(1)}} &= r_t + \gamma V(s_{t+1}) - V(s_t) | ||||
|         \\ | ||||
|  | ||||
| @ -116,6 +116,7 @@ class AFTLocal(Module): | ||||
|         #### Create local mask | ||||
|  | ||||
|         This creates a mask for | ||||
|  | ||||
|         \begin{align} | ||||
|         m_{t,t'} = | ||||
|         \begin{cases} | ||||
| @ -167,6 +168,7 @@ class AFTLocal(Module): | ||||
|         value = self.value(value) | ||||
|  | ||||
|         # Get | ||||
|         # | ||||
|         #     \begin{align} | ||||
|         #     w'_{t,t'} = | ||||
|         #     \begin{cases} | ||||
| @ -174,6 +176,7 @@ class AFTLocal(Module): | ||||
|         #     0, & \text{otherwise} | ||||
|         #     \end{cases} | ||||
|         #     \end{align} | ||||
|         # | ||||
|         # using the mask | ||||
|         pos_bias = self.pos_bias[:seq_len, :seq_len] * self.local_mask[:seq_len, :seq_len] | ||||
|         pos_bias = pos_bias.unsqueeze(-1) | ||||
|  | ||||
| @ -72,6 +72,7 @@ y^{(i)} &= \frac | ||||
|  | ||||
| With $\textcolor{cyan}{W^{(i)}} = \sum^i_{j=1} v^{(j)} \otimes \phi(k^{(j)})$ and | ||||
| $z^{(i)} = \sum^i_{j=1} \textcolor{lightgreen}{\phi(k^{(j)})}$, we can calculate them efficiently: | ||||
|  | ||||
| \begin{align} | ||||
| \textcolor{cyan}{W^{(i)}} &= \textcolor{cyan}{W^{(i-1)}} + v^{(i)} \otimes \textcolor{lightgreen}{\phi(k^{(i)})} \\ | ||||
| z^{(i)} &= z^{(i-1)} + \textcolor{lightgreen}{\phi(k^{(i)})} \\ | ||||
|  | ||||
| @ -179,6 +179,7 @@ class SquaredErrorBayesRisk(Module): | ||||
|      is the variance. | ||||
|  | ||||
|     This gives, | ||||
|  | ||||
|     \begin{align} | ||||
|     \mathcal{L}(\Theta) | ||||
|     &= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big) \\ | ||||
| @ -255,6 +256,7 @@ class KLDivergenceLoss(Module): | ||||
|         strength_tilde = alpha_tilde.sum(dim=-1) | ||||
|  | ||||
|         # The first term | ||||
|         # | ||||
|         # \begin{align} | ||||
|         # &\log \Bigg( \frac{\Gamma \Big( \sum_{k=1}^K \tilde{\alpha}_k \Big)} | ||||
|         #     {\Gamma(K) \prod_{k=1}^K \Gamma(\tilde{\alpha}_k)} \Bigg) \\ | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	 Varuna Jayasiri
					Varuna Jayasiri