comments fixes

This commit is contained in:
Varuna Jayasiri
2021-09-06 13:27:47 +05:30
parent 103cf81a13
commit c72d3b4f83
5 changed files with 92 additions and 75 deletions

View File

@ -405,6 +405,20 @@
</url>
<url>
<loc>https://nn.labml.ai/transformers/alibi/index.html</loc>
<lastmod>2021-08-28T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/transformers/alibi/experiment.html</loc>
<lastmod>2021-08-28T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/transformers/gmlp/index.html</loc>
<lastmod>2021-06-07T16:30:00+00:00</lastmod>

View File

@ -87,7 +87,7 @@
<a href='#section-1'>#</a>
</div>
<h2>$k$-NN to get $p(w_t, c_t)$</h2>
<p>Here we refer to $f($\color{yellowgreen}{c_t})$ as queries,
<p>Here we refer to $f(\color{yellowgreen}{c_t})$ as queries,
$f(c_i)$ as keys and $w_i$ as values.</p>
</div>
<div class='code'>
@ -121,7 +121,7 @@ $f(c_i)$ as keys and $w_i$ as values.</p>
<div class='section-link'>
<a href='#section-4'>#</a>
</div>
<p>Find 10 nearest neighbors of $f($\color{yellowgreen}{c_t})$ among $f(c_i)$.
<p>Find 10 nearest neighbors of $f(\color{yellowgreen}{c_t})$ among $f(c_i)$.
<code>distance</code> is the distance given by FAISS, and <code>idx</code>, $i$, is the index of that neighbor in <code>keys_store</code>.</p>
</div>
<div class='code'>

View File

@ -477,10 +477,11 @@ $f_i$ is the count of tokens where the argmax of $p(x)$ is equal to $i$.</p>
</div>
<p>Load balancing loss
<script type="math/tex; mode=display">\mathscr{L} = N \sum_{i=1}^N f_i \cdot P_i</script>
</p>
$\mathscr{L}$ is the loss for a single layer and here we are
taking the sum of losses across all layers.</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">122</span> <span class="n">load_balancing_loss</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_experts</span> <span class="o">*</span> <span class="p">(</span><span class="n">route_frac</span> <span class="o">*</span> <span class="n">route_prob</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">124</span> <span class="n">load_balancing_loss</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_experts</span> <span class="o">*</span> <span class="p">(</span><span class="n">route_frac</span> <span class="o">*</span> <span class="n">route_prob</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-36'>
@ -491,12 +492,12 @@ $f_i$ is the count of tokens where the argmax of $p(x)$ is equal to $i$.</p>
<p>Track stats</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">125</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;dropped.&#39;</span><span class="p">,</span> <span class="n">total</span><span class="o">.</span><span class="n">new_tensor</span><span class="p">(</span><span class="n">n_dropped</span><span class="p">)</span> <span class="o">/</span> <span class="n">total</span><span class="p">)</span>
<span class="lineno">126</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;route.min.&#39;</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
<span class="lineno">127</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;route.max.&#39;</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
<span class="lineno">128</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;route.std.&#39;</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">std</span><span class="p">())</span>
<span class="lineno">129</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;loss.&quot;</span><span class="p">,</span> <span class="n">cross_entropy_loss</span><span class="p">)</span>
<span class="lineno">130</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;lb_loss.&quot;</span><span class="p">,</span> <span class="n">load_balancing_loss</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">127</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;dropped.&#39;</span><span class="p">,</span> <span class="n">total</span><span class="o">.</span><span class="n">new_tensor</span><span class="p">(</span><span class="n">n_dropped</span><span class="p">)</span> <span class="o">/</span> <span class="n">total</span><span class="p">)</span>
<span class="lineno">128</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;route.min.&#39;</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
<span class="lineno">129</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;route.max.&#39;</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
<span class="lineno">130</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;route.std.&#39;</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">std</span><span class="p">())</span>
<span class="lineno">131</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;loss.&quot;</span><span class="p">,</span> <span class="n">cross_entropy_loss</span><span class="p">)</span>
<span class="lineno">132</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;lb_loss.&quot;</span><span class="p">,</span> <span class="n">load_balancing_loss</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-37'>
@ -509,7 +510,7 @@ The load balancing loss is multiplied by a coefficient $\alpha$ which is
set to something small like $\alpha = 0.01$.</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">135</span> <span class="n">loss</span> <span class="o">=</span> <span class="n">cross_entropy_loss</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">load_balancing_loss_ceof</span> <span class="o">*</span> <span class="n">load_balancing_loss</span></pre></div>
<div class="highlight"><pre><span class="lineno">137</span> <span class="n">loss</span> <span class="o">=</span> <span class="n">cross_entropy_loss</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">load_balancing_loss_ceof</span> <span class="o">*</span> <span class="n">load_balancing_loss</span></pre></div>
</div>
</div>
<div class='section' id='section-38'>
@ -520,8 +521,8 @@ set to something small like $\alpha = 0.01$.</p>
<p>Calculate and log accuracy</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">138</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
<span class="lineno">139</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="o">.</span><span class="n">track</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">140</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
<span class="lineno">141</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="o">.</span><span class="n">track</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-39'>
@ -532,7 +533,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Train the model</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">142</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mode</span><span class="o">.</span><span class="n">is_train</span><span class="p">:</span></pre></div>
<div class="highlight"><pre><span class="lineno">144</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mode</span><span class="o">.</span><span class="n">is_train</span><span class="p">:</span></pre></div>
</div>
</div>
<div class='section' id='section-40'>
@ -543,7 +544,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Calculate gradients</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">144</span> <span class="n">loss</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">146</span> <span class="n">loss</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-41'>
@ -554,7 +555,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Clip gradients</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">146</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">clip_grad_norm_</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">parameters</span><span class="p">(),</span> <span class="n">max_norm</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">grad_norm_clip</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">148</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">clip_grad_norm_</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">parameters</span><span class="p">(),</span> <span class="n">max_norm</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">grad_norm_clip</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-42'>
@ -565,7 +566,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Take optimizer step</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">148</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">step</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">150</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">step</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-43'>
@ -576,8 +577,8 @@ set to something small like $\alpha = 0.01$.</p>
<p>Log the model parameters and gradients on the last batch of every epoch</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">150</span> <span class="k">if</span> <span class="n">batch_idx</span><span class="o">.</span><span class="n">is_last</span><span class="p">:</span>
<span class="lineno">151</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;model&#39;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">152</span> <span class="k">if</span> <span class="n">batch_idx</span><span class="o">.</span><span class="n">is_last</span><span class="p">:</span>
<span class="lineno">153</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">&#39;model&#39;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-44'>
@ -588,7 +589,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Clear the gradients</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">153</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">zero_grad</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">155</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">zero_grad</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-45'>
@ -599,7 +600,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Save the tracked metrics</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">156</span> <span class="n">tracker</span><span class="o">.</span><span class="n">save</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">158</span> <span class="n">tracker</span><span class="o">.</span><span class="n">save</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-46'>
@ -610,8 +611,8 @@ set to something small like $\alpha = 0.01$.</p>
<h3>Initialize the auto-regressive model</h3>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">159</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
<span class="lineno">160</span><span class="k">def</span> <span class="nf">autoregressive_model</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">161</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
<span class="lineno">162</span><span class="k">def</span> <span class="nf">autoregressive_model</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-47'>
@ -622,8 +623,8 @@ set to something small like $\alpha = 0.01$.</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">164</span> <span class="n">m</span> <span class="o">=</span> <span class="n">AutoregressiveModel</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
<span class="lineno">165</span> <span class="k">return</span> <span class="n">m</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">166</span> <span class="n">m</span> <span class="o">=</span> <span class="n">AutoregressiveModel</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
<span class="lineno">167</span> <span class="k">return</span> <span class="n">m</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-48'>
@ -634,8 +635,8 @@ set to something small like $\alpha = 0.01$.</p>
<h3>Initialize the switch transformer</h3>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">168</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
<span class="lineno">169</span><span class="k">def</span> <span class="nf">switch_transformer</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">170</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
<span class="lineno">171</span><span class="k">def</span> <span class="nf">switch_transformer</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-49'>
@ -646,21 +647,21 @@ set to something small like $\alpha = 0.01$.</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">173</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.switch</span> <span class="kn">import</span> <span class="n">SwitchTransformer</span><span class="p">,</span> <span class="n">SwitchTransformerLayer</span><span class="p">,</span> <span class="n">SwitchFeedForward</span>
<span class="lineno">174</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers</span> <span class="kn">import</span> <span class="n">MultiHeadAttention</span>
<span class="lineno">175</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.feed_forward</span> <span class="kn">import</span> <span class="n">FeedForward</span>
<span class="lineno">176</span>
<span class="lineno">177</span> <span class="k">return</span> <span class="n">SwitchTransformer</span><span class="p">(</span>
<span class="lineno">178</span> <span class="n">SwitchTransformerLayer</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">179</span> <span class="n">attn</span><span class="o">=</span><span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">heads</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
<span class="lineno">180</span> <span class="n">feed_forward</span><span class="o">=</span><span class="n">SwitchFeedForward</span><span class="p">(</span><span class="n">capacity_factor</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">capacity_factor</span><span class="p">,</span>
<span class="lineno">181</span> <span class="n">drop_tokens</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">drop_tokens</span><span class="p">,</span>
<span class="lineno">182</span> <span class="n">is_scale_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">is_scale_prob</span><span class="p">,</span>
<span class="lineno">183</span> <span class="n">n_experts</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">n_experts</span><span class="p">,</span>
<span class="lineno">184</span> <span class="n">expert</span><span class="o">=</span><span class="n">FeedForward</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_ff</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
<span class="lineno">185</span> <span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">),</span>
<span class="lineno">186</span> <span class="n">dropout_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
<span class="lineno">187</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">175</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.switch</span> <span class="kn">import</span> <span class="n">SwitchTransformer</span><span class="p">,</span> <span class="n">SwitchTransformerLayer</span><span class="p">,</span> <span class="n">SwitchFeedForward</span>
<span class="lineno">176</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers</span> <span class="kn">import</span> <span class="n">MultiHeadAttention</span>
<span class="lineno">177</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.feed_forward</span> <span class="kn">import</span> <span class="n">FeedForward</span>
<span class="lineno">178</span>
<span class="lineno">179</span> <span class="k">return</span> <span class="n">SwitchTransformer</span><span class="p">(</span>
<span class="lineno">180</span> <span class="n">SwitchTransformerLayer</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">181</span> <span class="n">attn</span><span class="o">=</span><span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">heads</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
<span class="lineno">182</span> <span class="n">feed_forward</span><span class="o">=</span><span class="n">SwitchFeedForward</span><span class="p">(</span><span class="n">capacity_factor</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">capacity_factor</span><span class="p">,</span>
<span class="lineno">183</span> <span class="n">drop_tokens</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">drop_tokens</span><span class="p">,</span>
<span class="lineno">184</span> <span class="n">is_scale_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">is_scale_prob</span><span class="p">,</span>
<span class="lineno">185</span> <span class="n">n_experts</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">n_experts</span><span class="p">,</span>
<span class="lineno">186</span> <span class="n">expert</span><span class="o">=</span><span class="n">FeedForward</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_ff</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
<span class="lineno">187</span> <span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">),</span>
<span class="lineno">188</span> <span class="n">dropout_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
<span class="lineno">189</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-50'>
@ -671,7 +672,7 @@ set to something small like $\alpha = 0.01$.</p>
<h3>Run the experiment</h3>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">190</span><span class="k">def</span> <span class="nf">main</span><span class="p">():</span></pre></div>
<div class="highlight"><pre><span class="lineno">192</span><span class="k">def</span> <span class="nf">main</span><span class="p">():</span></pre></div>
</div>
</div>
<div class='section' id='section-51'>
@ -682,7 +683,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Create experiment</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">195</span> <span class="n">experiment</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;switch_transformer&quot;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">&#39;&#39;</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">197</span> <span class="n">experiment</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;switch_transformer&quot;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">&#39;&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-52'>
@ -693,7 +694,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Create configs</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">197</span> <span class="n">conf</span> <span class="o">=</span> <span class="n">Configs</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">199</span> <span class="n">conf</span> <span class="o">=</span> <span class="n">Configs</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-53'>
@ -704,7 +705,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Load configurations</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">199</span> <span class="n">experiment</span><span class="o">.</span><span class="n">configs</span><span class="p">(</span><span class="n">conf</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">201</span> <span class="n">experiment</span><span class="o">.</span><span class="n">configs</span><span class="p">(</span><span class="n">conf</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-54'>
@ -715,28 +716,28 @@ set to something small like $\alpha = 0.01$.</p>
<p>A dictionary of configurations to override</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">201</span> <span class="p">{</span><span class="s1">&#39;tokenizer&#39;</span><span class="p">:</span> <span class="s1">&#39;character&#39;</span><span class="p">,</span>
<span class="lineno">202</span> <span class="s1">&#39;text&#39;</span><span class="p">:</span> <span class="s1">&#39;tiny_shakespeare&#39;</span><span class="p">,</span>
<span class="lineno">203</span> <span class="s1">&#39;optimizer.learning_rate&#39;</span><span class="p">:</span> <span class="mf">1.</span><span class="p">,</span>
<span class="lineno">204</span> <span class="s1">&#39;optimizer.optimizer&#39;</span><span class="p">:</span> <span class="s1">&#39;Noam&#39;</span><span class="p">,</span>
<span class="lineno">205</span> <span class="s1">&#39;prompt&#39;</span><span class="p">:</span> <span class="s1">&#39;It is&#39;</span><span class="p">,</span>
<span class="lineno">206</span> <span class="s1">&#39;prompt_separator&#39;</span><span class="p">:</span> <span class="s1">&#39;&#39;</span><span class="p">,</span>
<span class="lineno">207</span>
<span class="lineno">208</span> <span class="s1">&#39;transformer&#39;</span><span class="p">:</span> <span class="s1">&#39;switch_transformer&#39;</span><span class="p">,</span>
<span class="lineno">209</span> <span class="s1">&#39;is_scale_prob&#39;</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
<span class="lineno">210</span> <span class="s1">&#39;n_experts&#39;</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span>
<span class="lineno">211</span>
<span class="lineno">212</span> <span class="s1">&#39;drop_tokens&#39;</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
<span class="lineno">213</span> <span class="s1">&#39;capacity_factor&#39;</span><span class="p">:</span> <span class="mf">1.2</span><span class="p">,</span>
<span class="lineno">214</span>
<span class="lineno">215</span> <span class="s1">&#39;train_loader&#39;</span><span class="p">:</span> <span class="s1">&#39;shuffled_train_loader&#39;</span><span class="p">,</span>
<span class="lineno">216</span> <span class="s1">&#39;valid_loader&#39;</span><span class="p">:</span> <span class="s1">&#39;shuffled_valid_loader&#39;</span><span class="p">,</span>
<span class="lineno">217</span>
<span class="lineno">218</span> <span class="s1">&#39;seq_len&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span>
<span class="lineno">219</span> <span class="s1">&#39;epochs&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span>
<span class="lineno">220</span> <span class="s1">&#39;batch_size&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span>
<span class="lineno">221</span> <span class="s1">&#39;inner_iterations&#39;</span><span class="p">:</span> <span class="mi">25</span><span class="p">,</span>
<span class="lineno">222</span> <span class="p">})</span></pre></div>
<div class="highlight"><pre><span class="lineno">203</span> <span class="p">{</span><span class="s1">&#39;tokenizer&#39;</span><span class="p">:</span> <span class="s1">&#39;character&#39;</span><span class="p">,</span>
<span class="lineno">204</span> <span class="s1">&#39;text&#39;</span><span class="p">:</span> <span class="s1">&#39;tiny_shakespeare&#39;</span><span class="p">,</span>
<span class="lineno">205</span> <span class="s1">&#39;optimizer.learning_rate&#39;</span><span class="p">:</span> <span class="mf">1.</span><span class="p">,</span>
<span class="lineno">206</span> <span class="s1">&#39;optimizer.optimizer&#39;</span><span class="p">:</span> <span class="s1">&#39;Noam&#39;</span><span class="p">,</span>
<span class="lineno">207</span> <span class="s1">&#39;prompt&#39;</span><span class="p">:</span> <span class="s1">&#39;It is&#39;</span><span class="p">,</span>
<span class="lineno">208</span> <span class="s1">&#39;prompt_separator&#39;</span><span class="p">:</span> <span class="s1">&#39;&#39;</span><span class="p">,</span>
<span class="lineno">209</span>
<span class="lineno">210</span> <span class="s1">&#39;transformer&#39;</span><span class="p">:</span> <span class="s1">&#39;switch_transformer&#39;</span><span class="p">,</span>
<span class="lineno">211</span> <span class="s1">&#39;is_scale_prob&#39;</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
<span class="lineno">212</span> <span class="s1">&#39;n_experts&#39;</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span>
<span class="lineno">213</span>
<span class="lineno">214</span> <span class="s1">&#39;drop_tokens&#39;</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
<span class="lineno">215</span> <span class="s1">&#39;capacity_factor&#39;</span><span class="p">:</span> <span class="mf">1.2</span><span class="p">,</span>
<span class="lineno">216</span>
<span class="lineno">217</span> <span class="s1">&#39;train_loader&#39;</span><span class="p">:</span> <span class="s1">&#39;shuffled_train_loader&#39;</span><span class="p">,</span>
<span class="lineno">218</span> <span class="s1">&#39;valid_loader&#39;</span><span class="p">:</span> <span class="s1">&#39;shuffled_valid_loader&#39;</span><span class="p">,</span>
<span class="lineno">219</span>
<span class="lineno">220</span> <span class="s1">&#39;seq_len&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span>
<span class="lineno">221</span> <span class="s1">&#39;epochs&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span>
<span class="lineno">222</span> <span class="s1">&#39;batch_size&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span>
<span class="lineno">223</span> <span class="s1">&#39;inner_iterations&#39;</span><span class="p">:</span> <span class="mi">25</span><span class="p">,</span>
<span class="lineno">224</span> <span class="p">})</span></pre></div>
</div>
</div>
<div class='section' id='section-55'>
@ -747,7 +748,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Set models for saving and loading</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">225</span> <span class="n">experiment</span><span class="o">.</span><span class="n">add_pytorch_models</span><span class="p">({</span><span class="s1">&#39;model&#39;</span><span class="p">:</span> <span class="n">conf</span><span class="o">.</span><span class="n">model</span><span class="p">})</span></pre></div>
<div class="highlight"><pre><span class="lineno">227</span> <span class="n">experiment</span><span class="o">.</span><span class="n">add_pytorch_models</span><span class="p">({</span><span class="s1">&#39;model&#39;</span><span class="p">:</span> <span class="n">conf</span><span class="o">.</span><span class="n">model</span><span class="p">})</span></pre></div>
</div>
</div>
<div class='section' id='section-56'>
@ -758,7 +759,7 @@ set to something small like $\alpha = 0.01$.</p>
<p>Start the experiment</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">228</span> <span class="k">with</span> <span class="n">experiment</span><span class="o">.</span><span class="n">start</span><span class="p">():</span></pre></div>
<div class="highlight"><pre><span class="lineno">230</span> <span class="k">with</span> <span class="n">experiment</span><span class="o">.</span><span class="n">start</span><span class="p">():</span></pre></div>
</div>
</div>
<div class='section' id='section-57'>
@ -769,7 +770,7 @@ set to something small like $\alpha = 0.01$.</p>
<p><code>TrainValidConfigs.run</code></p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">230</span> <span class="n">conf</span><span class="o">.</span><span class="n">run</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">232</span> <span class="n">conf</span><span class="o">.</span><span class="n">run</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-58'>
@ -780,8 +781,8 @@ set to something small like $\alpha = 0.01$.</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">234</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="lineno">235</span> <span class="n">main</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">236</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="lineno">237</span> <span class="n">main</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='footer'>

View File

@ -23,7 +23,7 @@ def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray,
"""
## $k$-NN to get $p(w_t, c_t)$
Here we refer to $f($\color{yellowgreen}{c_t})$ as queries,
Here we refer to $f(\color{yellowgreen}{c_t})$ as queries,
$f(c_i)$ as keys and $w_i$ as values.
"""
@ -33,7 +33,7 @@ def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray,
# Flatten the `batch` and `sequence` dimensions of queries
queries = queries.view(-1, queries_shape[-1])
# Find 10 nearest neighbors of $f($\color{yellowgreen}{c_t})$ among $f(c_i)$.
# Find 10 nearest neighbors of $f(\color{yellowgreen}{c_t})$ among $f(c_i)$.
# `distance` is the distance given by FAISS and `idx`, $i$ is the index of it in `keys_store`.
distance, idx = index.search(queries.numpy(), 10)

View File

@ -119,6 +119,8 @@ class Configs(NLPAutoRegressionConfigs):
route_prob = route_prob / total
# Load balancing loss
# $$\mathscr{L} = N \sum_{i=1}^N f_i \cdot P_i$$
# $\mathscr{L}$ is the loss for a single layer and here we are
# taking the sum of losses across all layers.
load_balancing_loss = self.n_experts * (route_frac * route_prob).sum()
# Track stats