Transformer experiment logs (#130)

This commit is contained in:
Varuna Jayasiri
2022-06-27 14:11:44 +05:30
committed by GitHub
parent f7262109c6
commit e09ee89f36
12 changed files with 383 additions and 372 deletions

View File

@ -70,22 +70,22 @@
<a href='#section-0'>#</a>
</div>
<h1><a href="index.html">DeepNorm</a> Experiment</h1>
<p><a href="https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/deep_norm/experiment.ipynb"><img alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://app.labml.ai/run/ec8e4dacb7f311ec8d1cd37d50b05c3d"><img alt="View Run" src="https://img.shields.io/badge/labml-experiment-brightgreen"></a> <a href="https://www.comet.ml/labml/deep-norm/61d817f80ff143c8825fba4aacd431d4?experiment-tab=chart&showOutliers=true&smoothing=0&transformY=smoothing&xAxis=step"><img alt="Open In Comet" src="https://images.labml.ai/images/comet.svg?experiment=deep_norm&file=experiment"></a></p>
<p><a href="https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/normalization/deep_norm/experiment.ipynb"><img alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://www.comet.ml/labml/deep-norm/61d817f80ff143c8825fba4aacd431d4?experiment-tab=chart&showOutliers=true&smoothing=0&transformY=smoothing&xAxis=step"><img alt="Open In Comet" src="https://images.labml.ai/images/comet.svg?experiment=deep_norm&file=experiment"></a></p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">15</span><span></span><span class="kn">import</span> <span class="nn">copy</span>
<span class="lineno">16</span>
<span class="lineno">17</span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">18</span><span class="kn">import</span> <span class="nn">torch.nn</span> <span class="k">as</span> <span class="nn">nn</span>
<span class="lineno">19</span>
<span class="lineno">20</span><span class="kn">from</span> <span class="nn">labml</span> <span class="kn">import</span> <span class="n">experiment</span>
<span class="lineno">21</span><span class="kn">from</span> <span class="nn">labml.configs</span> <span class="kn">import</span> <span class="n">option</span>
<span class="lineno">22</span><span class="kn">from</span> <span class="nn">labml_helpers.module</span> <span class="kn">import</span> <span class="n">Module</span>
<span class="lineno">23</span><span class="kn">from</span> <span class="nn">labml_nn.experiments.nlp_autoregression</span> <span class="kn">import</span> <span class="n">NLPAutoRegressionConfigs</span>
<span class="lineno">24</span><span class="kn">from</span> <span class="nn">labml_nn.normalization.deep_norm</span> <span class="kn">import</span> <span class="n">DeepNormTransformerLayer</span>
<span class="lineno">25</span><span class="kn">from</span> <span class="nn">labml_nn.transformers</span> <span class="kn">import</span> <span class="n">MultiHeadAttention</span>
<span class="lineno">26</span><span class="kn">from</span> <span class="nn">labml_nn.transformers.feed_forward</span> <span class="kn">import</span> <span class="n">FeedForward</span></pre></div>
<div class="highlight"><pre><span class="lineno">14</span><span></span><span class="kn">import</span> <span class="nn">copy</span>
<span class="lineno">15</span>
<span class="lineno">16</span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">17</span><span class="kn">import</span> <span class="nn">torch.nn</span> <span class="k">as</span> <span class="nn">nn</span>
<span class="lineno">18</span>
<span class="lineno">19</span><span class="kn">from</span> <span class="nn">labml</span> <span class="kn">import</span> <span class="n">experiment</span>
<span class="lineno">20</span><span class="kn">from</span> <span class="nn">labml.configs</span> <span class="kn">import</span> <span class="n">option</span>
<span class="lineno">21</span><span class="kn">from</span> <span class="nn">labml_helpers.module</span> <span class="kn">import</span> <span class="n">Module</span>
<span class="lineno">22</span><span class="kn">from</span> <span class="nn">labml_nn.experiments.nlp_autoregression</span> <span class="kn">import</span> <span class="n">NLPAutoRegressionConfigs</span>
<span class="lineno">23</span><span class="kn">from</span> <span class="nn">labml_nn.normalization.deep_norm</span> <span class="kn">import</span> <span class="n">DeepNormTransformerLayer</span>
<span class="lineno">24</span><span class="kn">from</span> <span class="nn">labml_nn.transformers</span> <span class="kn">import</span> <span class="n">MultiHeadAttention</span>
<span class="lineno">25</span><span class="kn">from</span> <span class="nn">labml_nn.transformers.feed_forward</span> <span class="kn">import</span> <span class="n">FeedForward</span></pre></div>
</div>
</div>
<div class='section' id='section-1'>
@ -98,7 +98,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">29</span><span class="k">class</span> <span class="nc">AutoregressiveTransformer</span><span class="p">(</span><span class="n">Module</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">28</span><span class="k">class</span> <span class="nc">AutoregressiveTransformer</span><span class="p">(</span><span class="n">Module</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-2'>
@ -114,7 +114,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">36</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_tokens</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">d_model</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">n_layers</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">layer</span><span class="p">:</span> <span class="n">DeepNormTransformerLayer</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">35</span> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n_tokens</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">d_model</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">n_layers</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">layer</span><span class="p">:</span> <span class="n">DeepNormTransformerLayer</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-3'>
@ -125,7 +125,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">43</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">42</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-4'>
@ -138,7 +138,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">45</span> <span class="bp">self</span><span class="o">.</span><span class="n">transformer</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Sequential</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">copy</span><span class="o">.</span><span class="n">deepcopy</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_layers</span><span class="p">)])</span></pre></div>
<div class="highlight"><pre><span class="lineno">44</span> <span class="bp">self</span><span class="o">.</span><span class="n">transformer</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Sequential</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">copy</span><span class="o">.</span><span class="n">deepcopy</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">n_layers</span><span class="p">)])</span></pre></div>
</div>
</div>
<div class='section' id='section-5'>
@ -150,7 +150,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">48</span> <span class="bp">self</span><span class="o">.</span><span class="n">emb</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Embedding</span><span class="p">(</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">d_model</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">47</span> <span class="bp">self</span><span class="o">.</span><span class="n">emb</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Embedding</span><span class="p">(</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">d_model</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-6'>
@ -162,7 +162,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">50</span> <span class="bp">self</span><span class="o">.</span><span class="n">readout</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">n_tokens</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">49</span> <span class="bp">self</span><span class="o">.</span><span class="n">readout</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">n_tokens</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-7'>
@ -175,7 +175,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">52</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">51</span> <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-8'>
@ -187,7 +187,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">57</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">emb</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">56</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">emb</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-9'>
@ -199,7 +199,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">59</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">transformer</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">58</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">transformer</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-10'>
@ -211,7 +211,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">61</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">readout</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">60</span> <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">readout</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-11'>
@ -223,7 +223,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">64</span> <span class="k">return</span> <span class="n">x</span><span class="p">,</span> <span class="kc">None</span></pre></div>
<div class="highlight"><pre><span class="lineno">63</span> <span class="k">return</span> <span class="n">x</span><span class="p">,</span> <span class="kc">None</span></pre></div>
</div>
</div>
<div class='section' id='section-12'>
@ -237,7 +237,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">67</span><span class="k">class</span> <span class="nc">Configs</span><span class="p">(</span><span class="n">NLPAutoRegressionConfigs</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">66</span><span class="k">class</span> <span class="nc">Configs</span><span class="p">(</span><span class="n">NLPAutoRegressionConfigs</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-13'>
@ -249,7 +249,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">76</span> <span class="n">model</span><span class="p">:</span> <span class="n">AutoregressiveTransformer</span></pre></div>
<div class="highlight"><pre><span class="lineno">75</span> <span class="n">model</span><span class="p">:</span> <span class="n">AutoregressiveTransformer</span></pre></div>
</div>
</div>
<div class='section' id='section-14'>
@ -261,7 +261,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">79</span> <span class="n">n_layers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span></pre></div>
<div class="highlight"><pre><span class="lineno">78</span> <span class="n">n_layers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span></pre></div>
</div>
</div>
<div class='section' id='section-15'>
@ -273,8 +273,8 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">82</span> <span class="n">deep_norm_alpha</span><span class="p">:</span> <span class="nb">float</span>
<span class="lineno">83</span> <span class="n">deep_norm_beta</span><span class="p">:</span> <span class="nb">float</span></pre></div>
<div class="highlight"><pre><span class="lineno">81</span> <span class="n">deep_norm_alpha</span><span class="p">:</span> <span class="nb">float</span>
<span class="lineno">82</span> <span class="n">deep_norm_beta</span><span class="p">:</span> <span class="nb">float</span></pre></div>
</div>
</div>
<div class='section' id='section-16'>
@ -286,7 +286,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">86</span> <span class="n">n_heads</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span></pre></div>
<div class="highlight"><pre><span class="lineno">85</span> <span class="n">n_heads</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span></pre></div>
</div>
</div>
<div class='section' id='section-17'>
@ -298,7 +298,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">88</span> <span class="n">d_model</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span></pre></div>
<div class="highlight"><pre><span class="lineno">87</span> <span class="n">d_model</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">64</span></pre></div>
</div>
</div>
<div class='section' id='section-18'>
@ -310,7 +310,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">90</span> <span class="n">d_k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">16</span></pre></div>
<div class="highlight"><pre><span class="lineno">89</span> <span class="n">d_k</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">16</span></pre></div>
</div>
</div>
<div class='section' id='section-19'>
@ -323,8 +323,8 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">93</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">deep_norm_alpha</span><span class="p">)</span>
<span class="lineno">94</span><span class="k">def</span> <span class="nf">_deep_norm_alpha</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">92</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">deep_norm_alpha</span><span class="p">)</span>
<span class="lineno">93</span><span class="k">def</span> <span class="nf">_deep_norm_alpha</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-20'>
@ -335,7 +335,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">100</span> <span class="k">return</span> <span class="p">(</span><span class="mf">2.</span> <span class="o">*</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span> <span class="o">**</span> <span class="p">(</span><span class="mf">1.</span> <span class="o">/</span> <span class="mf">4.</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">99</span> <span class="k">return</span> <span class="p">(</span><span class="mf">2.</span> <span class="o">*</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span> <span class="o">**</span> <span class="p">(</span><span class="mf">1.</span> <span class="o">/</span> <span class="mf">4.</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-21'>
@ -348,8 +348,8 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">103</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">deep_norm_beta</span><span class="p">)</span>
<span class="lineno">104</span><span class="k">def</span> <span class="nf">_deep_norm_beta</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">102</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">deep_norm_beta</span><span class="p">)</span>
<span class="lineno">103</span><span class="k">def</span> <span class="nf">_deep_norm_beta</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-22'>
@ -360,7 +360,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">110</span> <span class="k">return</span> <span class="p">(</span><span class="mf">8.</span> <span class="o">*</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span> <span class="o">**</span> <span class="o">-</span><span class="p">(</span><span class="mf">1.</span> <span class="o">/</span> <span class="mf">4.</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">109</span> <span class="k">return</span> <span class="p">(</span><span class="mf">8.</span> <span class="o">*</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">)</span> <span class="o">**</span> <span class="o">-</span><span class="p">(</span><span class="mf">1.</span> <span class="o">/</span> <span class="mf">4.</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-23'>
@ -372,8 +372,8 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">113</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
<span class="lineno">114</span><span class="k">def</span> <span class="nf">_model</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
<div class="highlight"><pre><span class="lineno">112</span><span class="nd">@option</span><span class="p">(</span><span class="n">Configs</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
<span class="lineno">113</span><span class="k">def</span> <span class="nf">_model</span><span class="p">(</span><span class="n">c</span><span class="p">:</span> <span class="n">Configs</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-24'>
@ -384,16 +384,16 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">118</span> <span class="n">m</span> <span class="o">=</span> <span class="n">AutoregressiveTransformer</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">,</span>
<span class="lineno">119</span> <span class="n">DeepNormTransformerLayer</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">120</span> <span class="n">deep_norm_alpha</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">deep_norm_alpha</span><span class="p">,</span>
<span class="lineno">121</span> <span class="n">deep_norm_beta</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">deep_norm_beta</span><span class="p">,</span>
<span class="lineno">122</span> <span class="n">feed_forward</span><span class="o">=</span><span class="n">FeedForward</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">123</span> <span class="n">d_ff</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span> <span class="o">*</span> <span class="mi">4</span><span class="p">),</span>
<span class="lineno">124</span> <span class="n">self_attn</span><span class="o">=</span><span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_heads</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">125</span> <span class="n">dropout_prob</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)))</span>
<span class="lineno">126</span>
<span class="lineno">127</span> <span class="k">return</span> <span class="n">m</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
<div class="highlight"><pre><span class="lineno">117</span> <span class="n">m</span> <span class="o">=</span> <span class="n">AutoregressiveTransformer</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_tokens</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">n_layers</span><span class="p">,</span>
<span class="lineno">118</span> <span class="n">DeepNormTransformerLayer</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">119</span> <span class="n">deep_norm_alpha</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">deep_norm_alpha</span><span class="p">,</span>
<span class="lineno">120</span> <span class="n">deep_norm_beta</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">deep_norm_beta</span><span class="p">,</span>
<span class="lineno">121</span> <span class="n">feed_forward</span><span class="o">=</span><span class="n">FeedForward</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">122</span> <span class="n">d_ff</span><span class="o">=</span><span class="n">c</span><span class="o">.</span><span class="n">d_model</span> <span class="o">*</span> <span class="mi">4</span><span class="p">),</span>
<span class="lineno">123</span> <span class="n">self_attn</span><span class="o">=</span><span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">n_heads</span><span class="p">,</span> <span class="n">c</span><span class="o">.</span><span class="n">d_model</span><span class="p">,</span>
<span class="lineno">124</span> <span class="n">dropout_prob</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)))</span>
<span class="lineno">125</span>
<span class="lineno">126</span> <span class="k">return</span> <span class="n">m</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">device</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-25'>
@ -405,7 +405,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">130</span><span class="k">def</span> <span class="nf">main</span><span class="p">():</span></pre></div>
<div class="highlight"><pre><span class="lineno">129</span><span class="k">def</span> <span class="nf">main</span><span class="p">():</span></pre></div>
</div>
</div>
<div class='section' id='section-26'>
@ -417,7 +417,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">135</span> <span class="n">experiment</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;deep_norm&quot;</span><span class="p">,</span> <span class="n">writers</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;screen&#39;</span><span class="p">,</span> <span class="s1">&#39;web_api&#39;</span><span class="p">,</span> <span class="s1">&#39;comet&#39;</span><span class="p">})</span></pre></div>
<div class="highlight"><pre><span class="lineno">134</span> <span class="n">experiment</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s2">&quot;deep_norm&quot;</span><span class="p">,</span> <span class="n">writers</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;screen&#39;</span><span class="p">,</span> <span class="s1">&#39;web_api&#39;</span><span class="p">,</span> <span class="s1">&#39;comet&#39;</span><span class="p">})</span></pre></div>
</div>
</div>
<div class='section' id='section-27'>
@ -429,7 +429,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">137</span> <span class="n">conf</span> <span class="o">=</span> <span class="n">Configs</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">136</span> <span class="n">conf</span> <span class="o">=</span> <span class="n">Configs</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-28'>
@ -441,7 +441,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">139</span> <span class="n">experiment</span><span class="o">.</span><span class="n">configs</span><span class="p">(</span><span class="n">conf</span><span class="p">,</span> <span class="p">{</span></pre></div>
<div class="highlight"><pre><span class="lineno">138</span> <span class="n">experiment</span><span class="o">.</span><span class="n">configs</span><span class="p">(</span><span class="n">conf</span><span class="p">,</span> <span class="p">{</span></pre></div>
</div>
</div>
<div class='section' id='section-29'>
@ -453,7 +453,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">141</span> <span class="s1">&#39;tokenizer&#39;</span><span class="p">:</span> <span class="s1">&#39;character&#39;</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">140</span> <span class="s1">&#39;tokenizer&#39;</span><span class="p">:</span> <span class="s1">&#39;character&#39;</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-30'>
@ -465,7 +465,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">143</span> <span class="s1">&#39;prompt_separator&#39;</span><span class="p">:</span> <span class="s1">&#39;&#39;</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">142</span> <span class="s1">&#39;prompt_separator&#39;</span><span class="p">:</span> <span class="s1">&#39;&#39;</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-31'>
@ -477,7 +477,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">145</span> <span class="s1">&#39;prompt&#39;</span><span class="p">:</span> <span class="s1">&#39;It is &#39;</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">144</span> <span class="s1">&#39;prompt&#39;</span><span class="p">:</span> <span class="s1">&#39;It is &#39;</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-32'>
@ -489,7 +489,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">147</span> <span class="s1">&#39;text&#39;</span><span class="p">:</span> <span class="s1">&#39;tiny_shakespeare&#39;</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">146</span> <span class="s1">&#39;text&#39;</span><span class="p">:</span> <span class="s1">&#39;tiny_shakespeare&#39;</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-33'>
@ -501,7 +501,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">150</span> <span class="s1">&#39;seq_len&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">149</span> <span class="s1">&#39;seq_len&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-34'>
@ -513,7 +513,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">152</span> <span class="s1">&#39;epochs&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">151</span> <span class="s1">&#39;epochs&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-35'>
@ -525,7 +525,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">154</span> <span class="s1">&#39;batch_size&#39;</span><span class="p">:</span> <span class="mi">16</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">153</span> <span class="s1">&#39;batch_size&#39;</span><span class="p">:</span> <span class="mi">16</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-36'>
@ -537,7 +537,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">156</span> <span class="s1">&#39;inner_iterations&#39;</span><span class="p">:</span> <span class="mi">10</span><span class="p">,</span></pre></div>
<div class="highlight"><pre><span class="lineno">155</span> <span class="s1">&#39;inner_iterations&#39;</span><span class="p">:</span> <span class="mi">10</span><span class="p">,</span></pre></div>
</div>
</div>
<div class='section' id='section-37'>
@ -549,9 +549,9 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">159</span> <span class="s1">&#39;optimizer.optimizer&#39;</span><span class="p">:</span> <span class="s1">&#39;Adam&#39;</span><span class="p">,</span>
<span class="lineno">160</span> <span class="s1">&#39;optimizer.learning_rate&#39;</span><span class="p">:</span> <span class="mf">3e-4</span><span class="p">,</span>
<span class="lineno">161</span> <span class="p">})</span></pre></div>
<div class="highlight"><pre><span class="lineno">158</span> <span class="s1">&#39;optimizer.optimizer&#39;</span><span class="p">:</span> <span class="s1">&#39;Adam&#39;</span><span class="p">,</span>
<span class="lineno">159</span> <span class="s1">&#39;optimizer.learning_rate&#39;</span><span class="p">:</span> <span class="mf">3e-4</span><span class="p">,</span>
<span class="lineno">160</span> <span class="p">})</span></pre></div>
</div>
</div>
<div class='section' id='section-38'>
@ -563,7 +563,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">164</span> <span class="n">experiment</span><span class="o">.</span><span class="n">add_pytorch_models</span><span class="p">({</span><span class="s1">&#39;model&#39;</span><span class="p">:</span> <span class="n">conf</span><span class="o">.</span><span class="n">model</span><span class="p">})</span></pre></div>
<div class="highlight"><pre><span class="lineno">163</span> <span class="n">experiment</span><span class="o">.</span><span class="n">add_pytorch_models</span><span class="p">({</span><span class="s1">&#39;model&#39;</span><span class="p">:</span> <span class="n">conf</span><span class="o">.</span><span class="n">model</span><span class="p">})</span></pre></div>
</div>
</div>
<div class='section' id='section-39'>
@ -575,7 +575,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">167</span> <span class="k">with</span> <span class="n">experiment</span><span class="o">.</span><span class="n">start</span><span class="p">():</span></pre></div>
<div class="highlight"><pre><span class="lineno">166</span> <span class="k">with</span> <span class="n">experiment</span><span class="o">.</span><span class="n">start</span><span class="p">():</span></pre></div>
</div>
</div>
<div class='section' id='section-40'>
@ -587,7 +587,7 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">169</span> <span class="n">conf</span><span class="o">.</span><span class="n">run</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">168</span> <span class="n">conf</span><span class="o">.</span><span class="n">run</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='section' id='section-41'>
@ -599,8 +599,8 @@
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">173</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="lineno">174</span> <span class="n">main</span><span class="p">()</span></pre></div>
<div class="highlight"><pre><span class="lineno">172</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="lineno">173</span> <span class="n">main</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='footer'>

File diff suppressed because one or more lines are too long