mirror of
				https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
				synced 2025-11-04 14:29:43 +08:00 
			
		
		
		
	🚧 layer norm
This commit is contained in:
		@ -91,14 +91,12 @@ across the features.
 | 
				
			|||||||
*Note that batch normalization, fixes the zero mean and unit variance for each vector.
 | 
					*Note that batch normalization, fixes the zero mean and unit variance for each vector.
 | 
				
			||||||
Layer normalization does it for each batch across all elements.</p>
 | 
					Layer normalization does it for each batch across all elements.</p>
 | 
				
			||||||
<p>Layer normalization is generally used for NLP tasks.</p>
 | 
					<p>Layer normalization is generally used for NLP tasks.</p>
 | 
				
			||||||
<p>Here’s <a href="mnist.html">the training code</a> and a notebook for training
 | 
					<p>We have used layer normalization in most of the
 | 
				
			||||||
a CNN classifier that use batch normalization for MNIST dataset.</p>
 | 
					<a href="../../transformers/gpt/index.html">transformer implementations</a>.</p>
 | 
				
			||||||
<p><a href="https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb"><img alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
 | 
					 | 
				
			||||||
<a href="https://web.lab-ml.com/run?uuid=011254fe647011ebbb8e0242ac1c0002"><img alt="View Run" src="https://img.shields.io/badge/labml-experiment-brightgreen" /></a></p>
 | 
					 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">39</span><span></span><span class="kn">import</span> <span class="nn">torch</span>
 | 
					                <div class="highlight"><pre><span class="lineno">36</span><span></span><span class="kn">import</span> <span class="nn">torch</span>
 | 
				
			||||||
<span class="lineno">40</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span></pre></div>
 | 
					<span class="lineno">37</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-1'>
 | 
					    <div class='section' id='section-1'>
 | 
				
			||||||
@ -106,10 +104,10 @@ a CNN classifier that use batch normalization for MNIST dataset.</p>
 | 
				
			|||||||
                <div class='section-link'>
 | 
					                <div class='section-link'>
 | 
				
			||||||
                    <a href='#section-1'>#</a>
 | 
					                    <a href='#section-1'>#</a>
 | 
				
			||||||
                </div>
 | 
					                </div>
 | 
				
			||||||
                <h2>Batch Normalization Layer</h2>
 | 
					                <h2>Layer Normalization</h2>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">43</span><span class="k">class</span> <span class="nc">BatchNorm</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">40</span><span class="k">class</span> <span class="nc">LayerNorm</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-2'>
 | 
					    <div class='section' id='section-2'>
 | 
				
			||||||
@ -127,9 +125,9 @@ a CNN classifier that use batch normalization for MNIST dataset.</p>
 | 
				
			|||||||
<p>We’ve tried to use the same names for arguments as PyTorch <code>BatchNorm</code> implementation.</p>
 | 
					<p>We’ve tried to use the same names for arguments as PyTorch <code>BatchNorm</code> implementation.</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">48</span>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">channels</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span>
 | 
					                <div class="highlight"><pre><span class="lineno">45</span>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">channels</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span>
 | 
				
			||||||
<span class="lineno">49</span>                 <span class="n">eps</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-5</span><span class="p">,</span> <span class="n">momentum</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span>
 | 
					<span class="lineno">46</span>                 <span class="n">eps</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">1e-5</span><span class="p">,</span> <span class="n">momentum</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.1</span><span class="p">,</span>
 | 
				
			||||||
<span class="lineno">50</span>                 <span class="n">affine</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">track_running_stats</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span></pre></div>
 | 
					<span class="lineno">47</span>                 <span class="n">affine</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">track_running_stats</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-3'>
 | 
					    <div class='section' id='section-3'>
 | 
				
			||||||
@ -140,14 +138,14 @@ a CNN classifier that use batch normalization for MNIST dataset.</p>
 | 
				
			|||||||
                
 | 
					                
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">60</span>        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
 | 
					                <div class="highlight"><pre><span class="lineno">57</span>        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
 | 
				
			||||||
<span class="lineno">61</span>
 | 
					<span class="lineno">58</span>
 | 
				
			||||||
<span class="lineno">62</span>        <span class="bp">self</span><span class="o">.</span><span class="n">channels</span> <span class="o">=</span> <span class="n">channels</span>
 | 
					<span class="lineno">59</span>        <span class="bp">self</span><span class="o">.</span><span class="n">channels</span> <span class="o">=</span> <span class="n">channels</span>
 | 
				
			||||||
<span class="lineno">63</span>
 | 
					<span class="lineno">60</span>
 | 
				
			||||||
<span class="lineno">64</span>        <span class="bp">self</span><span class="o">.</span><span class="n">eps</span> <span class="o">=</span> <span class="n">eps</span>
 | 
					<span class="lineno">61</span>        <span class="bp">self</span><span class="o">.</span><span class="n">eps</span> <span class="o">=</span> <span class="n">eps</span>
 | 
				
			||||||
<span class="lineno">65</span>        <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span> <span class="o">=</span> <span class="n">momentum</span>
 | 
					<span class="lineno">62</span>        <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span> <span class="o">=</span> <span class="n">momentum</span>
 | 
				
			||||||
<span class="lineno">66</span>        <span class="bp">self</span><span class="o">.</span><span class="n">affine</span> <span class="o">=</span> <span class="n">affine</span>
 | 
					<span class="lineno">63</span>        <span class="bp">self</span><span class="o">.</span><span class="n">affine</span> <span class="o">=</span> <span class="n">affine</span>
 | 
				
			||||||
<span class="lineno">67</span>        <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span> <span class="o">=</span> <span class="n">track_running_stats</span></pre></div>
 | 
					<span class="lineno">64</span>        <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span> <span class="o">=</span> <span class="n">track_running_stats</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-4'>
 | 
					    <div class='section' id='section-4'>
 | 
				
			||||||
@ -158,9 +156,9 @@ a CNN classifier that use batch normalization for MNIST dataset.</p>
 | 
				
			|||||||
                <p>Create parameters for $\gamma$ and $\beta$ for scale and shift</p>
 | 
					                <p>Create parameters for $\gamma$ and $\beta$ for scale and shift</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">69</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">affine</span><span class="p">:</span>
 | 
					                <div class="highlight"><pre><span class="lineno">66</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">affine</span><span class="p">:</span>
 | 
				
			||||||
<span class="lineno">70</span>            <span class="bp">self</span><span class="o">.</span><span class="n">scale</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span>
 | 
					<span class="lineno">67</span>            <span class="bp">self</span><span class="o">.</span><span class="n">scale</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span>
 | 
				
			||||||
<span class="lineno">71</span>            <span class="bp">self</span><span class="o">.</span><span class="n">shift</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span></pre></div>
 | 
					<span class="lineno">68</span>            <span class="bp">self</span><span class="o">.</span><span class="n">shift</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-5'>
 | 
					    <div class='section' id='section-5'>
 | 
				
			||||||
@ -172,9 +170,9 @@ a CNN classifier that use batch normalization for MNIST dataset.</p>
 | 
				
			|||||||
mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
					mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">74</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span><span class="p">:</span>
 | 
					                <div class="highlight"><pre><span class="lineno">71</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span><span class="p">:</span>
 | 
				
			||||||
<span class="lineno">75</span>            <span class="bp">self</span><span class="o">.</span><span class="n">register_buffer</span><span class="p">(</span><span class="s1">'exp_mean'</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span>
 | 
					<span class="lineno">72</span>            <span class="bp">self</span><span class="o">.</span><span class="n">register_buffer</span><span class="p">(</span><span class="s1">'exp_mean'</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span>
 | 
				
			||||||
<span class="lineno">76</span>            <span class="bp">self</span><span class="o">.</span><span class="n">register_buffer</span><span class="p">(</span><span class="s1">'exp_var'</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span></pre></div>
 | 
					<span class="lineno">73</span>            <span class="bp">self</span><span class="o">.</span><span class="n">register_buffer</span><span class="p">(</span><span class="s1">'exp_var'</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">channels</span><span class="p">))</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-6'>
 | 
					    <div class='section' id='section-6'>
 | 
				
			||||||
@ -188,7 +186,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			|||||||
<code>[batch_size, channels, height, width]</code></p>
 | 
					<code>[batch_size, channels, height, width]</code></p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">78</span>    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">75</span>    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-7'>
 | 
					    <div class='section' id='section-7'>
 | 
				
			||||||
@ -199,7 +197,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			|||||||
                <p>Keep the original shape</p>
 | 
					                <p>Keep the original shape</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">86</span>        <span class="n">x_shape</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">83</span>        <span class="n">x_shape</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-8'>
 | 
					    <div class='section' id='section-8'>
 | 
				
			||||||
@ -210,7 +208,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			|||||||
                <p>Get the batch size</p>
 | 
					                <p>Get the batch size</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">88</span>        <span class="n">batch_size</span> <span class="o">=</span> <span class="n">x_shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">85</span>        <span class="n">batch_size</span> <span class="o">=</span> <span class="n">x_shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-9'>
 | 
					    <div class='section' id='section-9'>
 | 
				
			||||||
@ -221,7 +219,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			|||||||
                <p>Sanity check to make sure the number of features is same</p>
 | 
					                <p>Sanity check to make sure the number of features is same</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">90</span>        <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">channels</span> <span class="o">==</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">87</span>        <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">channels</span> <span class="o">==</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-10'>
 | 
					    <div class='section' id='section-10'>
 | 
				
			||||||
@ -232,7 +230,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			|||||||
                <p>Reshape into <code>[batch_size, channels, n]</code></p>
 | 
					                <p>Reshape into <code>[batch_size, channels, n]</code></p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">93</span>        <span class="n">x</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">batch_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">channels</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">90</span>        <span class="n">x</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">batch_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">channels</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-11'>
 | 
					    <div class='section' id='section-11'>
 | 
				
			||||||
@ -244,7 +242,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$</p>
 | 
				
			|||||||
if we are in training mode or if we have not tracked exponential moving averages</p>
 | 
					if we are in training mode or if we have not tracked exponential moving averages</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">97</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">training</span> <span class="ow">or</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span><span class="p">:</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">94</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">training</span> <span class="ow">or</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span><span class="p">:</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-12'>
 | 
					    <div class='section' id='section-12'>
 | 
				
			||||||
@ -256,7 +254,7 @@ if we are in training mode or if we have not tracked exponential moving averages
 | 
				
			|||||||
i.e. the means for each feature $\mathbb{E}[x^{(k)}]$</p>
 | 
					i.e. the means for each feature $\mathbb{E}[x^{(k)}]$</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">100</span>            <span class="n">mean</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">dim</span><span class="o">=</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">97</span>            <span class="n">mean</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">dim</span><span class="o">=</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-13'>
 | 
					    <div class='section' id='section-13'>
 | 
				
			||||||
@ -268,7 +266,7 @@ i.e. the means for each feature $\mathbb{E}[x^{(k)}]$</p>
 | 
				
			|||||||
i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
					i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">103</span>            <span class="n">mean_x2</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">**</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">dim</span><span class="o">=</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">100</span>            <span class="n">mean_x2</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">**</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">dim</span><span class="o">=</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-14'>
 | 
					    <div class='section' id='section-14'>
 | 
				
			||||||
@ -279,7 +277,7 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			|||||||
                <p>Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$</p>
 | 
					                <p>Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">105</span>            <span class="n">var</span> <span class="o">=</span> <span class="n">mean_x2</span> <span class="o">-</span> <span class="n">mean</span> <span class="o">**</span> <span class="mi">2</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">102</span>            <span class="n">var</span> <span class="o">=</span> <span class="n">mean_x2</span> <span class="o">-</span> <span class="n">mean</span> <span class="o">**</span> <span class="mi">2</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-15'>
 | 
					    <div class='section' id='section-15'>
 | 
				
			||||||
@ -290,9 +288,9 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			|||||||
                <p>Update exponential moving averages</p>
 | 
					                <p>Update exponential moving averages</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">108</span>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">training</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span><span class="p">:</span>
 | 
					                <div class="highlight"><pre><span class="lineno">105</span>            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">training</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">track_running_stats</span><span class="p">:</span>
 | 
				
			||||||
<span class="lineno">109</span>                <span class="bp">self</span><span class="o">.</span><span class="n">exp_mean</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span><span class="p">)</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_mean</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span> <span class="o">*</span> <span class="n">mean</span>
 | 
					<span class="lineno">106</span>                <span class="bp">self</span><span class="o">.</span><span class="n">exp_mean</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span><span class="p">)</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_mean</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span> <span class="o">*</span> <span class="n">mean</span>
 | 
				
			||||||
<span class="lineno">110</span>                <span class="bp">self</span><span class="o">.</span><span class="n">exp_var</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span><span class="p">)</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_var</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span> <span class="o">*</span> <span class="n">var</span></pre></div>
 | 
					<span class="lineno">107</span>                <span class="bp">self</span><span class="o">.</span><span class="n">exp_var</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span><span class="p">)</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_var</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">momentum</span> <span class="o">*</span> <span class="n">var</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-16'>
 | 
					    <div class='section' id='section-16'>
 | 
				
			||||||
@ -303,9 +301,9 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			|||||||
                <p>Use exponential moving averages as estimates</p>
 | 
					                <p>Use exponential moving averages as estimates</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">112</span>        <span class="k">else</span><span class="p">:</span>
 | 
					                <div class="highlight"><pre><span class="lineno">109</span>        <span class="k">else</span><span class="p">:</span>
 | 
				
			||||||
<span class="lineno">113</span>            <span class="n">mean</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_mean</span>
 | 
					<span class="lineno">110</span>            <span class="n">mean</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_mean</span>
 | 
				
			||||||
<span class="lineno">114</span>            <span class="n">var</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_var</span></pre></div>
 | 
					<span class="lineno">111</span>            <span class="n">var</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">exp_var</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-17'>
 | 
					    <div class='section' id='section-17'>
 | 
				
			||||||
@ -317,7 +315,7 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			|||||||
</p>
 | 
					</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">117</span>        <span class="n">x_norm</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">-</span> <span class="n">mean</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="o">/</span> <span class="n">torch</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">var</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">eps</span><span class="p">)</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">114</span>        <span class="n">x_norm</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">-</span> <span class="n">mean</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> <span class="o">/</span> <span class="n">torch</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">var</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">eps</span><span class="p">)</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-18'>
 | 
					    <div class='section' id='section-18'>
 | 
				
			||||||
@ -329,8 +327,8 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			|||||||
</p>
 | 
					</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">119</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">affine</span><span class="p">:</span>
 | 
					                <div class="highlight"><pre><span class="lineno">116</span>        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">affine</span><span class="p">:</span>
 | 
				
			||||||
<span class="lineno">120</span>            <span class="n">x_norm</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">scale</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">x_norm</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">shift</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span></pre></div>
 | 
					<span class="lineno">117</span>            <span class="n">x_norm</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">scale</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">*</span> <span class="n">x_norm</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">shift</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    <div class='section' id='section-19'>
 | 
					    <div class='section' id='section-19'>
 | 
				
			||||||
@ -341,7 +339,7 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$</p>
 | 
				
			|||||||
                <p>Reshape to original and return</p>
 | 
					                <p>Reshape to original and return</p>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
            <div class='code'>
 | 
					            <div class='code'>
 | 
				
			||||||
                <div class="highlight"><pre><span class="lineno">123</span>        <span class="k">return</span> <span class="n">x_norm</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">x_shape</span><span class="p">)</span></pre></div>
 | 
					                <div class="highlight"><pre><span class="lineno">120</span>        <span class="k">return</span> <span class="n">x_norm</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">x_shape</span><span class="p">)</span></pre></div>
 | 
				
			||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
    </div>
 | 
					    </div>
 | 
				
			||||||
 | 
				
			|||||||
@ -29,20 +29,17 @@ Layer normalization does it for each batch across all elements.
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
Layer normalization is generally used for NLP tasks.
 | 
					Layer normalization is generally used for NLP tasks.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Here's [the training code](mnist.html) and a notebook for training
 | 
					We have used layer normalization in most of the
 | 
				
			||||||
a CNN classifier that use batch normalization for MNIST dataset.
 | 
					[transformer implementations](../../transformers/gpt/index.html).
 | 
				
			||||||
 | 
					 | 
				
			||||||
[](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
 | 
					 | 
				
			||||||
[](https://web.lab-ml.com/run?uuid=011254fe647011ebbb8e0242ac1c0002)
 | 
					 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import torch
 | 
					import torch
 | 
				
			||||||
from torch import nn
 | 
					from torch import nn
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class BatchNorm(nn.Module):
 | 
					class LayerNorm(nn.Module):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    ## Batch Normalization Layer
 | 
					    ## Layer Normalization
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, channels: int, *,
 | 
					    def __init__(self, channels: int, *,
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user