|
|
|
@ -477,10 +477,11 @@ $f_i$ is the count of tokens where the argmax of $p(x)$ is equal to $i$.</p>
|
|
|
|
|
</div>
|
|
|
|
|
<p>Load balancing loss
|
|
|
|
|
<script type="math/tex; mode=display">\mathscr{L} = N \sum_{i=1}^N f_i \cdot P_i</script>
|
|
|
|
|
</p>
|
|
|
|
|
<p>$\mathscr{L}$ is the loss for a single layer and here we are
|
|
|
|
|
taking the sum of losses across all layers.</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">122</span> <span class="n">load_balancing_loss</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_experts</span> <span class="o">*</span> <span class="p">(</span><span class="n">route_frac</span> <span class="o">*</span> <span class="n">route_prob</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">124</span> <span class="n">load_balancing_loss</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">n_experts</span> <span class="o">*</span> <span class="p">(</span><span class="n">route_frac</span> <span class="o">*</span> <span class="n">route_prob</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-36'>
|
|
|
|
@ -491,12 +492,12 @@ $f_i$ is the count of tokens where the argmax of $p(x)$ is equal to $i$.</p>
|
|
|
|
|
<p>Track stats</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">125</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'dropped.'</span><span class="p">,</span> <span class="n">total</span><span class="o">.</span><span class="n">new_tensor</span><span class="p">(</span><span class="n">n_dropped</span><span class="p">)</span> <span class="o">/</span> <span class="n">total</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">126</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'route.min.'</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
|
|
|
|
|
<span class="lineno">127</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'route.max.'</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
|
|
|
|
|
<span class="lineno">128</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'route.std.'</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">std</span><span class="p">())</span>
|
|
|
|
|
<span class="lineno">129</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">"loss."</span><span class="p">,</span> <span class="n">cross_entropy_loss</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">130</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">"lb_loss."</span><span class="p">,</span> <span class="n">load_balancing_loss</span><span class="p">)</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">127</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'dropped.'</span><span class="p">,</span> <span class="n">total</span><span class="o">.</span><span class="n">new_tensor</span><span class="p">(</span><span class="n">n_dropped</span><span class="p">)</span> <span class="o">/</span> <span class="n">total</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">128</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'route.min.'</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
|
|
|
|
|
<span class="lineno">129</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'route.max.'</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
|
|
|
|
|
<span class="lineno">130</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'route.std.'</span><span class="p">,</span> <span class="n">route_frac</span><span class="o">.</span><span class="n">std</span><span class="p">())</span>
|
|
|
|
|
<span class="lineno">131</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">"loss."</span><span class="p">,</span> <span class="n">cross_entropy_loss</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">132</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">"lb_loss."</span><span class="p">,</span> <span class="n">load_balancing_loss</span><span class="p">)</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-37'>
|
|
|
|
@ -509,7 +510,7 @@ The load balancing loss is multiplied by a coefficient $\alpha$ which is
|
|
|
|
|
set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">135</span> <span class="n">loss</span> <span class="o">=</span> <span class="n">cross_entropy_loss</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">load_balancing_loss_ceof</span> <span class="o">*</span> <span class="n">load_balancing_loss</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">137</span> <span class="n">loss</span> <span class="o">=</span> <span class="n">cross_entropy_loss</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">load_balancing_loss_ceof</span> <span class="o">*</span> <span class="n">load_balancing_loss</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-38'>
|
|
|
|
@ -520,8 +521,8 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Calculate and log accuracy</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">138</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">139</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="o">.</span><span class="n">track</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">140</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">141</span> <span class="bp">self</span><span class="o">.</span><span class="n">accuracy</span><span class="o">.</span><span class="n">track</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-39'>
|
|
|
|
@ -532,7 +533,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Train the model</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">142</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mode</span><span class="o">.</span><span class="n">is_train</span><span class="p">:</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">144</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">mode</span><span class="o">.</span><span class="n">is_train</span><span class="p">:</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-40'>
|
|
|
|
@ -543,7 +544,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Calculate gradients</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">144</span> <span class="n">loss</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">146</span> <span class="n">loss</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-41'>
|
|
|
|
@ -554,7 +555,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Clip gradients</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">146</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">clip_grad_norm_</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">parameters</span><span class="p">(),</span> <span class="n">max_norm</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">grad_norm_clip</span><span class="p">)</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">148</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">clip_grad_norm_</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">parameters</span><span class="p">(),</span> <span class="n">max_norm</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">grad_norm_clip</span><span class="p">)</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-42'>
|
|
|
|
@ -565,7 +566,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Take optimizer step</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">148</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">step</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">150</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">step</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-43'>
|
|
|
|
@ -576,8 +577,8 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Log the model parameters and gradients on the last batch of every epoch</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">150</span> <span class="k">if</span> <span class="n">batch_idx</span><span class="o">.</span><span class="n">is_last</span><span class="p">:</span>
|
|
|
|
|
<span class="lineno">151</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'model'</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">)</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">152</span> <span class="k">if</span> <span class="n">batch_idx</span><span class="o">.</span><span class="n">is_last</span><span class="p">:</span>
|
|
|
|
|
<span class="lineno">153</span> <span class="n">tracker</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s1">'model'</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">)</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-44'>
|
|
|
|
@ -588,7 +589,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Clear the gradients</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">153</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">zero_grad</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">155</span> <span class="bp">self</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">zero_grad</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-45'>
|
|
|
|
@ -599,7 +600,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Save the tracked metrics</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">156</span> <span class="n">tracker</span><span class="o">.</span><span class="n">save</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">158</span> <span class="n">tracker</span><span class="o">.</span><span class="n">save</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-46'>
|
|
|
|
@ -610,8 +611,8 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<h3>Initialize the auto-regressive model</h3>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">159</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">160</span><span class="k">def</span> <span class="nf">autoregressive_model</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">161</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">162</span><span class="k">def</span> <span class="nf">autoregressive_model</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-47'>
|
|
|
|
@ -622,8 +623,8 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">164</span> <span class="n">m</span> <span class="o">=</span> <span class="n">AutoregressiveModel</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">165</span> <span class="k">return</span> <span class="n">m</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">166</span> <span class="n">m</span> <span class="o">=</span> <span class="n">AutoregressiveModel</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">167</span> <span class="k">return</span> <span class="n">m</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-48'>
|
|
|
|
@ -634,8 +635,8 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<h3>Initialize the switch transformer</h3>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">168</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">169</span><span class="k">def</span> <span class="nf">switch_transformer</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">170</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">transformer</span><span class="p">)</span>
|
|
|
|
|
<span class="lineno">171</span><span class="k">def</span> <span class="nf">switch_transformer</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-49'>
|
|
|
|
@ -646,21 +647,21 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">173</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.switch</span> <span class="kn">import</span> <span class="n">SwitchTransformer</span><span class="p">,</span> <span class="n">SwitchTransformerLayer</span><span class="p">,</span> <span class="n">SwitchFeedForward</span>
|
|
|
|
|
<span class="lineno">174</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers</span> <span class="kn">import</span> <span class="n">MultiHeadAttention</span>
|
|
|
|
|
<span class="lineno">175</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.feed_forward</span> <span class="kn">import</span> <span class="n">FeedForward</span>
|
|
|
|
|
<span class="lineno">176</span>
|
|
|
|
|
<span class="lineno">177</span> <span class="k">return</span> <span class="n">SwitchTransformer</span><span class="p">(</span>
|
|
|
|
|
<span class="lineno">178</span> <span class="n">SwitchTransformerLayer</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">179</span> <span class="n">attn</span><span class="o">=</span><span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">heads</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">180</span> <span class="n">feed_forward</span><span class="o">=</span><span class="n">SwitchFeedForward</span><span class="p">(</span><span class="n">capacity_factor</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">capacity_factor</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">181</span> <span class="n">drop_tokens</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">drop_tokens</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">182</span> <span class="n">is_scale_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">is_scale_prob</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">183</span> <span class="n">n_experts</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">n_experts</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">184</span> <span class="n">expert</span><span class="o">=</span><span class="n">FeedForward</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_ff</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">185</span> <span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">186</span> <span class="n">dropout_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">187</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">175</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.switch</span> <span class="kn">import</span> <span class="n">SwitchTransformer</span><span class="p">,</span> <span class="n">SwitchTransformerLayer</span><span class="p">,</span> <span class="n">SwitchFeedForward</span>
|
|
|
|
|
<span class="lineno">176</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers</span> <span class="kn">import</span> <span class="n">MultiHeadAttention</span>
|
|
|
|
|
<span class="lineno">177</span> <span class="kn">from</span> <span class="nn">labml_nn.transformers.feed_forward</span> <span class="kn">import</span> <span class="n">FeedForward</span>
|
|
|
|
|
<span class="lineno">178</span>
|
|
|
|
|
<span class="lineno">179</span> <span class="k">return</span> <span class="n">SwitchTransformer</span><span class="p">(</span>
|
|
|
|
|
<span class="lineno">180</span> <span class="n">SwitchTransformerLayer</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">181</span> <span class="n">attn</span><span class="o">=</span><span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">heads</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">182</span> <span class="n">feed_forward</span><span class="o">=</span><span class="n">SwitchFeedForward</span><span class="p">(</span><span class="n">capacity_factor</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">capacity_factor</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">183</span> <span class="n">drop_tokens</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">drop_tokens</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">184</span> <span class="n">is_scale_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">is_scale_prob</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">185</span> <span class="n">n_experts</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">n_experts</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">186</span> <span class="n">expert</span><span class="o">=</span><span class="n">FeedForward</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_ff</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">187</span> <span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">188</span> <span class="n">dropout_prob</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">dropout</span><span class="p">),</span>
|
|
|
|
|
<span class="lineno">189</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-50'>
|
|
|
|
@ -671,7 +672,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<h3>Run the experiment</h3>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">190</span><span class="k">def</span> <span class="nf">main</span><span class="p">():</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">192</span><span class="k">def</span> <span class="nf">main</span><span class="p">():</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-51'>
|
|
|
|
@ -682,7 +683,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Create experiment</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">195</span> <span class="n">experiment</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">"switch_transformer"</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">''</span><span class="p">)</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">197</span> <span class="n">experiment</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">"switch_transformer"</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">''</span><span class="p">)</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-52'>
|
|
|
|
@ -693,7 +694,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Create configs</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">197</span> <span class="n">conf</span> <span class="o">=</span> <span class="n">Configs</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">199</span> <span class="n">conf</span> <span class="o">=</span> <span class="n">Configs</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-53'>
|
|
|
|
@ -704,7 +705,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Load configurations</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">199</span> <span class="n">experiment</span><span class="o">.</span><span class="n">configs</span><span class="p">(</span><span class="n">conf</span><span class="p">,</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">201</span> <span class="n">experiment</span><span class="o">.</span><span class="n">configs</span><span class="p">(</span><span class="n">conf</span><span class="p">,</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-54'>
|
|
|
|
@ -715,28 +716,28 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>A dictionary of configurations to override</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">201</span> <span class="p">{</span><span class="s1">'tokenizer'</span><span class="p">:</span> <span class="s1">'character'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">202</span> <span class="s1">'text'</span><span class="p">:</span> <span class="s1">'tiny_shakespeare'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">203</span> <span class="s1">'optimizer.learning_rate'</span><span class="p">:</span> <span class="mf">1.</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">204</span> <span class="s1">'optimizer.optimizer'</span><span class="p">:</span> <span class="s1">'Noam'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">205</span> <span class="s1">'prompt'</span><span class="p">:</span> <span class="s1">'It is'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">206</span> <span class="s1">'prompt_separator'</span><span class="p">:</span> <span class="s1">''</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">207</span>
|
|
|
|
|
<span class="lineno">208</span> <span class="s1">'transformer'</span><span class="p">:</span> <span class="s1">'switch_transformer'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">209</span> <span class="s1">'is_scale_prob'</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">210</span> <span class="s1">'n_experts'</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">211</span>
|
|
|
|
|
<span class="lineno">212</span> <span class="s1">'drop_tokens'</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">213</span> <span class="s1">'capacity_factor'</span><span class="p">:</span> <span class="mf">1.2</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">214</span>
|
|
|
|
|
<span class="lineno">215</span> <span class="s1">'train_loader'</span><span class="p">:</span> <span class="s1">'shuffled_train_loader'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">216</span> <span class="s1">'valid_loader'</span><span class="p">:</span> <span class="s1">'shuffled_valid_loader'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">217</span>
|
|
|
|
|
<span class="lineno">218</span> <span class="s1">'seq_len'</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">219</span> <span class="s1">'epochs'</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">220</span> <span class="s1">'batch_size'</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">221</span> <span class="s1">'inner_iterations'</span><span class="p">:</span> <span class="mi">25</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">222</span> <span class="p">})</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">203</span> <span class="p">{</span><span class="s1">'tokenizer'</span><span class="p">:</span> <span class="s1">'character'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">204</span> <span class="s1">'text'</span><span class="p">:</span> <span class="s1">'tiny_shakespeare'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">205</span> <span class="s1">'optimizer.learning_rate'</span><span class="p">:</span> <span class="mf">1.</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">206</span> <span class="s1">'optimizer.optimizer'</span><span class="p">:</span> <span class="s1">'Noam'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">207</span> <span class="s1">'prompt'</span><span class="p">:</span> <span class="s1">'It is'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">208</span> <span class="s1">'prompt_separator'</span><span class="p">:</span> <span class="s1">''</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">209</span>
|
|
|
|
|
<span class="lineno">210</span> <span class="s1">'transformer'</span><span class="p">:</span> <span class="s1">'switch_transformer'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">211</span> <span class="s1">'is_scale_prob'</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">212</span> <span class="s1">'n_experts'</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">213</span>
|
|
|
|
|
<span class="lineno">214</span> <span class="s1">'drop_tokens'</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">215</span> <span class="s1">'capacity_factor'</span><span class="p">:</span> <span class="mf">1.2</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">216</span>
|
|
|
|
|
<span class="lineno">217</span> <span class="s1">'train_loader'</span><span class="p">:</span> <span class="s1">'shuffled_train_loader'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">218</span> <span class="s1">'valid_loader'</span><span class="p">:</span> <span class="s1">'shuffled_valid_loader'</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">219</span>
|
|
|
|
|
<span class="lineno">220</span> <span class="s1">'seq_len'</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">221</span> <span class="s1">'epochs'</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">222</span> <span class="s1">'batch_size'</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">223</span> <span class="s1">'inner_iterations'</span><span class="p">:</span> <span class="mi">25</span><span class="p">,</span>
|
|
|
|
|
<span class="lineno">224</span> <span class="p">})</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-55'>
|
|
|
|
@ -747,7 +748,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Set models for saving and loading</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">225</span> <span class="n">experiment</span><span class="o">.</span><span class="n">add_pytorch_models</span><span class="p">({</span><span class="s1">'model'</span><span class="p">:</span> <span class="n">conf</span><span class="o">.</span><span class="n">model</span><span class="p">})</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">227</span> <span class="n">experiment</span><span class="o">.</span><span class="n">add_pytorch_models</span><span class="p">({</span><span class="s1">'model'</span><span class="p">:</span> <span class="n">conf</span><span class="o">.</span><span class="n">model</span><span class="p">})</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-56'>
|
|
|
|
@ -758,7 +759,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p>Start the experiment</p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">228</span> <span class="k">with</span> <span class="n">experiment</span><span class="o">.</span><span class="n">start</span><span class="p">():</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">230</span> <span class="k">with</span> <span class="n">experiment</span><span class="o">.</span><span class="n">start</span><span class="p">():</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-57'>
|
|
|
|
@ -769,7 +770,7 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
<p><code>TrainValidConfigs.run</code></p>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">230</span> <span class="n">conf</span><span class="o">.</span><span class="n">run</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">232</span> <span class="n">conf</span><span class="o">.</span><span class="n">run</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='section' id='section-58'>
|
|
|
|
@ -780,8 +781,8 @@ set to something small like $\alpha = 0.01$.</p>
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
|
<div class='code'>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">234</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">'__main__'</span><span class="p">:</span>
|
|
|
|
|
<span class="lineno">235</span> <span class="n">main</span><span class="p">()</span></pre></div>
|
|
|
|
|
<div class="highlight"><pre><span class="lineno">236</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">'__main__'</span><span class="p">:</span>
|
|
|
|
|
<span class="lineno">237</span> <span class="n">main</span><span class="p">()</span></pre></div>
|
|
|
|
|
</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class='footer'>
|
|
|
|
|