This commit is contained in:
Varuna Jayasiri
2022-08-20 10:45:31 +05:30
parent 8838601038
commit e19d95f9c3
9 changed files with 1336 additions and 157 deletions

View File

@ -0,0 +1,177 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<meta name="description" content=""/>
<meta name="twitter:card" content="summary"/>
<meta name="twitter:image:src" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta name="twitter:title" content="llm_int8.py"/>
<meta name="twitter:description" content=""/>
<meta name="twitter:site" content="@labmlai"/>
<meta name="twitter:creator" content="@labmlai"/>
<meta property="og:url" content="https://nn.labml.ai/neox/evaluation/llm_int8.html"/>
<meta property="og:title" content="llm_int8.py"/>
<meta property="og:image" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta property="og:site_name" content="llm_int8.py"/>
<meta property="og:type" content="object"/>
<meta property="og:title" content="llm_int8.py"/>
<meta property="og:description" content=""/>
<title>llm_int8.py</title>
<link rel="shortcut icon" href="/icon.png"/>
<link rel="stylesheet" href="../../pylit.css?v=1">
<link rel="canonical" href="https://nn.labml.ai/neox/evaluation/llm_int8.html"/>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.18/dist/katex.min.css" integrity="sha384-zTROYFVGOfTw7JV7KUu8udsvW2fx4lWOsCEDqhBreBwlHI4ioVRtmIvEThzJHGET" crossorigin="anonymous">
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-4V3HC8HBLH"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap snippet.
window.dataLayer = window.dataLayer || [];
// Queue commands into dataLayer until the async gtag.js library loads and drains it.
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-4V3HC8HBLH');
</script>
</head>
<body>
<div id='container'>
<div id="background"></div>
<div class='section'>
<div class='docs'>
<p>
<a class="parent" href="/">home</a>
<a class="parent" href="../index.html">neox</a>
<a class="parent" href="index.html">evaluation</a>
</p>
<p>
<a href="https://github.com/sponsors/labmlai" target="_blank">
<img alt="Sponsor"
src="https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86"
style="max-width:100%;"/></a>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations" target="_blank">
<img alt="Github"
src="https://img.shields.io/github/stars/labmlai/annotated_deep_learning_paper_implementations?style=social"
style="max-width:100%;"/></a>
<a href="https://twitter.com/labmlai" rel="nofollow" target="_blank">
<img alt="Twitter"
src="https://img.shields.io/twitter/follow/labmlai?style=social"
style="max-width:100%;"/></a>
</p>
<p>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/neox/evaluation/llm_int8.py" target="_blank">
View code on Github</a>
</p>
</div>
</div>
<div class='section' id='section-0'>
<div class='docs'>
<div class='section-link'>
<a href='#section-0'>#</a>
</div>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">1</span><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">2</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span>
<span class="lineno">3</span>
<span class="lineno">4</span><span class="kn">from</span> <span class="nn">labml</span> <span class="kn">import</span> <span class="n">monit</span>
<span class="lineno">5</span><span class="kn">from</span> <span class="nn">labml_nn.neox.evaluation</span> <span class="kn">import</span> <span class="n">run_eval_harness</span>
<span class="lineno">6</span><span class="kn">from</span> <span class="nn">labml_nn.neox.model</span> <span class="kn">import</span> <span class="n">LayerGenerator</span>
<span class="lineno">7</span>
<span class="lineno">8</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="lineno">9</span> <span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">&#39;cuda:0&#39;</span><span class="p">)</span>
<span class="lineno">10</span> <span class="n">layer_generator</span> <span class="o">=</span> <span class="n">LayerGenerator</span><span class="p">(</span><span class="n">is_clone_layers</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="lineno">11</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float16</span><span class="p">,</span>
<span class="lineno">12</span> <span class="n">device</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">&#39;cpu&#39;</span><span class="p">),</span>
<span class="lineno">13</span> <span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-1'>
<div class='docs'>
<div class='section-link'>
<a href='#section-1'>#</a>
</div>
<p>Load layers </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">15</span> <span class="n">layers</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">layer_generator</span><span class="o">.</span><span class="n">load</span><span class="p">())</span></pre></div>
</div>
</div>
<div class='section' id='section-2'>
<div class='docs'>
<div class='section-link'>
<a href='#section-2'>#</a>
</div>
<p>This reduces CUDA memory fragmentation </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">18</span> <span class="k">for</span> <span class="n">layer</span> <span class="ow">in</span> <span class="n">monit</span><span class="o">.</span><span class="n">iterate</span><span class="p">(</span><span class="s1">&#39;Convert to int8&#39;</span><span class="p">,</span> <span class="n">layers</span><span class="p">,</span> <span class="n">is_children_silent</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="lineno">19</span> <span class="n">layer_generator</span><span class="o">.</span><span class="n">post_load_prepare</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span>
<span class="lineno">20</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">,</span>
<span class="lineno">21</span> <span class="n">is_llm_int8</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="lineno">22</span> <span class="n">llm_int8_threshold</span><span class="o">=</span><span class="mf">6.0</span><span class="p">,</span>
<span class="lineno">23</span> <span class="p">)</span>
<span class="lineno">24</span> <span class="n">layer</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">)</span>
<span class="lineno">25</span>
<span class="lineno">26</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Sequential&#39;</span><span class="p">):</span>
<span class="lineno">27</span> <span class="n">model</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Sequential</span><span class="p">(</span><span class="o">*</span><span class="n">layers</span><span class="p">)</span>
<span class="lineno">28</span>
<span class="lineno">29</span> <span class="nb">print</span><span class="p">(</span><span class="n">run_eval_harness</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="s1">&#39;half_precision&#39;</span><span class="p">,</span> <span class="p">[],</span> <span class="n">device</span><span class="p">))</span></pre></div>
</div>
</div>
<div class='footer'>
<a href="https://papers.labml.ai">Trending Research Papers</a>
<a href="https://labml.ai">labml.ai</a>
</div>
</div>
<script src="../../interactive.js?v=1"></script>
<script>
function handleImages() {
var images = document.querySelectorAll('p>img')
for (var i = 0; i < images.length; ++i) {
handleImage(images[i])
}
}
function handleImage(img) {
img.parentElement.style.textAlign = 'center'
var modal = document.createElement('div')
modal.id = 'modal'
var modalContent = document.createElement('div')
modal.appendChild(modalContent)
var modalImage = document.createElement('img')
modalContent.appendChild(modalImage)
var span = document.createElement('span')
span.classList.add('close')
span.textContent = 'x'
modal.appendChild(span)
img.onclick = function () {
console.log('clicked')
document.body.appendChild(modal)
modalImage.src = img.src
}
span.onclick = function () {
document.body.removeChild(modal)
}
}
handleImages()
</script>
</body>
</html>

File diff suppressed because one or more lines are too long

129
docs/neox/readme.html Normal file
View File

@ -0,0 +1,129 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<meta name="description" content=""/>
<meta name="twitter:card" content="summary"/>
<meta name="twitter:image:src" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta name="twitter:title" content="GPT-NeoX"/>
<meta name="twitter:description" content=""/>
<meta name="twitter:site" content="@labmlai"/>
<meta name="twitter:creator" content="@labmlai"/>
<meta property="og:url" content="https://nn.labml.ai/neox/readme.html"/>
<meta property="og:title" content="GPT-NeoX"/>
<meta property="og:image" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta property="og:site_name" content="GPT-NeoX"/>
<meta property="og:type" content="object"/>
<meta property="og:title" content="GPT-NeoX"/>
<meta property="og:description" content=""/>
<title>GPT-NeoX</title>
<link rel="shortcut icon" href="/icon.png"/>
<link rel="stylesheet" href="../pylit.css?v=1">
<link rel="canonical" href="https://nn.labml.ai/neox/readme.html"/>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.18/dist/katex.min.css" integrity="sha384-zTROYFVGOfTw7JV7KUu8udsvW2fx4lWOsCEDqhBreBwlHI4ioVRtmIvEThzJHGET" crossorigin="anonymous">
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-4V3HC8HBLH"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap snippet.
window.dataLayer = window.dataLayer || [];
// Queue commands into dataLayer until the async gtag.js library loads and drains it.
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-4V3HC8HBLH');
</script>
</head>
<body>
<div id='container'>
<div id="background"></div>
<div class='section'>
<div class='docs'>
<p>
<a class="parent" href="/">home</a>
<a class="parent" href="index.html">neox</a>
</p>
<p>
<a href="https://github.com/sponsors/labmlai" target="_blank">
<img alt="Sponsor"
src="https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86"
style="max-width:100%;"/></a>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations" target="_blank">
<img alt="Github"
src="https://img.shields.io/github/stars/labmlai/annotated_deep_learning_paper_implementations?style=social"
style="max-width:100%;"/></a>
<a href="https://twitter.com/labmlai" rel="nofollow" target="_blank">
<img alt="Twitter"
src="https://img.shields.io/twitter/follow/labmlai?style=social"
style="max-width:100%;"/></a>
</p>
<p>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/neox/readme.md" target="_blank">
View code on Github</a>
</p>
</div>
</div>
<div class='section' id='section-0'>
<div class='docs'>
<div class='section-link'>
<a href='#section-0'>#</a>
</div>
<b>MarkdownException</b> + Small: parse error
</div>
<div class='code'>
</div>
</div>
<div class='footer'>
<a href="https://papers.labml.ai">Trending Research Papers</a>
<a href="https://labml.ai">labml.ai</a>
</div>
</div>
<script src="../interactive.js?v=1"></script>
<script>
function handleImages() {
var images = document.querySelectorAll('p>img')
for (var i = 0; i < images.length; ++i) {
handleImage(images[i])
}
}
function handleImage(img) {
img.parentElement.style.textAlign = 'center'
var modal = document.createElement('div')
modal.id = 'modal'
var modalContent = document.createElement('div')
modal.appendChild(modalContent)
var modalImage = document.createElement('img')
modalContent.appendChild(modalImage)
var span = document.createElement('span')
span.classList.add('close')
span.textContent = 'x'
modal.appendChild(span)
img.onclick = function () {
console.log('clicked')
document.body.appendChild(modal)
modalImage.src = img.src
}
span.onclick = function () {
document.body.removeChild(modal)
}
}
handleImages()
</script>
</body>
</html>

View File

@ -0,0 +1,362 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<meta name="description" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<meta name="twitter:card" content="summary"/>
<meta name="twitter:image:src" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta name="twitter:title" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<meta name="twitter:description" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<meta name="twitter:site" content="@labmlai"/>
<meta name="twitter:creator" content="@labmlai"/>
<meta property="og:url" content="https://nn.labml.ai/neox/samples/llm_int8.html"/>
<meta property="og:title" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<meta property="og:image" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta property="og:site_name" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<meta property="og:type" content="object"/>
<meta property="og:title" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<meta property="og:description" content="Generate Text with GPT-NeoX using LLM.int8() quantization"/>
<title>Generate Text with GPT-NeoX using LLM.int8() quantization</title>
<link rel="shortcut icon" href="/icon.png"/>
<link rel="stylesheet" href="../../pylit.css?v=1">
<link rel="canonical" href="https://nn.labml.ai/neox/samples/llm_int8.html"/>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.18/dist/katex.min.css" integrity="sha384-zTROYFVGOfTw7JV7KUu8udsvW2fx4lWOsCEDqhBreBwlHI4ioVRtmIvEThzJHGET" crossorigin="anonymous">
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-4V3HC8HBLH"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap snippet.
window.dataLayer = window.dataLayer || [];
// Queue commands into dataLayer until the async gtag.js library loads and drains it.
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-4V3HC8HBLH');
</script>
</head>
<body>
<div id='container'>
<div id="background"></div>
<div class='section'>
<div class='docs'>
<p>
<a class="parent" href="/">home</a>
<a class="parent" href="../index.html">neox</a>
<a class="parent" href="index.html">samples</a>
</p>
<p>
<a href="https://github.com/sponsors/labmlai" target="_blank">
<img alt="Sponsor"
src="https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86"
style="max-width:100%;"/></a>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations" target="_blank">
<img alt="Github"
src="https://img.shields.io/github/stars/labmlai/annotated_deep_learning_paper_implementations?style=social"
style="max-width:100%;"/></a>
<a href="https://twitter.com/labmlai" rel="nofollow" target="_blank">
<img alt="Twitter"
src="https://img.shields.io/twitter/follow/labmlai?style=social"
style="max-width:100%;"/></a>
</p>
<p>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/neox/samples/llm_int8.py" target="_blank">
View code on Github</a>
</p>
</div>
</div>
<div class='section' id='section-0'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-0'>#</a>
</div>
<h1>Generate Text with GPT-NeoX using LLM.int8() quantization</h1>
<p>This shows how to generate text from GPT-NeoX using <a href="../utils/llm_int8.html">LLM.int8() quantization</a>.</p>
<p>This needs a GPU with more than 45GB memory.</p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">15</span><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">List</span>
<span class="lineno">16</span>
<span class="lineno">17</span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">18</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span>
<span class="lineno">19</span>
<span class="lineno">20</span><span class="kn">from</span> <span class="nn">labml</span> <span class="kn">import</span> <span class="n">monit</span>
<span class="lineno">21</span><span class="kn">from</span> <span class="nn">labml_nn.neox.model</span> <span class="kn">import</span> <span class="n">LayerGenerator</span>
<span class="lineno">22</span><span class="kn">from</span> <span class="nn">labml_nn.neox.samples.generate</span> <span class="kn">import</span> <span class="n">PROMPT</span><span class="p">,</span> <span class="n">infer</span>
<span class="lineno">23</span><span class="kn">from</span> <span class="nn">labml_nn.neox.utils</span> <span class="kn">import</span> <span class="n">get_tokens</span><span class="p">,</span> <span class="n">print_tokens</span>
<span class="lineno">24</span><span class="kn">from</span> <span class="nn">labml_nn.neox.utils.cache</span> <span class="kn">import</span> <span class="n">get_cache</span></pre></div>
</div>
</div>
<div class='section' id='section-1'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-1'>#</a>
</div>
<h2>Generate text</h2>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">27</span><span class="k">def</span> <span class="nf">generate</span><span class="p">():</span></pre></div>
</div>
</div>
<div class='section' id='section-2'>
<div class='docs'>
<div class='section-link'>
<a href='#section-2'>#</a>
</div>
<p>Setup <a href="../utils/cache.html">cache</a> to cache intermediate key/value pairs for faster generation </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">33</span> <span class="n">cache</span> <span class="o">=</span> <span class="n">get_cache</span><span class="p">()</span>
<span class="lineno">34</span> <span class="n">cache</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s1">&#39;use_cache&#39;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-3'>
<div class='docs'>
<div class='section-link'>
<a href='#section-3'>#</a>
</div>
<p>Device </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">37</span> <span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">&#39;cuda:0&#39;</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-4'>
<div class='docs'>
<div class='section-link'>
<a href='#section-4'>#</a>
</div>
<p>Load layers in float16 into CPU. We convert the layers to int8 later, because doing that on the fly after loading layers to GPU causes CUDA memory fragmentation (about 3GB memory can get lost due to fragmentation). </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">42</span> <span class="n">layer_generator</span> <span class="o">=</span> <span class="n">LayerGenerator</span><span class="p">(</span><span class="n">is_clone_layers</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="lineno">43</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float16</span><span class="p">,</span>
<span class="lineno">44</span> <span class="n">device</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">&#39;cpu&#39;</span><span class="p">),</span></pre></div>
</div>
</div>
<div class='section' id='section-5'>
<div class='docs'>
<div class='section-link'>
<a href='#section-5'>#</a>
</div>
<p>is_llm_int8=True, </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">46</span> <span class="p">)</span>
<span class="lineno">47</span> <span class="n">layers</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">layer_generator</span><span class="o">.</span><span class="n">load</span><span class="p">())</span></pre></div>
</div>
</div>
<div class='section' id='section-6'>
<div class='docs'>
<div class='section-link'>
<a href='#section-6'>#</a>
</div>
<p>This reduces CUDA memory fragmentation </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">50</span> <span class="k">for</span> <span class="n">layer</span> <span class="ow">in</span> <span class="n">monit</span><span class="o">.</span><span class="n">iterate</span><span class="p">(</span><span class="s1">&#39;Convert to int8&#39;</span><span class="p">,</span> <span class="n">layers</span><span class="p">,</span> <span class="n">is_children_silent</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="lineno">51</span> <span class="n">layer_generator</span><span class="o">.</span><span class="n">post_load_prepare</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span>
<span class="lineno">52</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">,</span>
<span class="lineno">53</span> <span class="n">is_llm_int8</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="lineno">54</span> <span class="n">llm_int8_threshold</span><span class="o">=</span><span class="mf">6.0</span><span class="p">,</span>
<span class="lineno">55</span> <span class="p">)</span>
<span class="lineno">56</span> <span class="n">layer</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-7'>
<div class='docs'>
<div class='section-link'>
<a href='#section-7'>#</a>
</div>
<p>Create <code class="highlight"><span></span><span class="n">nn</span><span class="o">.</span><span class="n">Sequential</span></code>
model </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">59</span> <span class="n">model</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Sequential</span><span class="p">(</span><span class="o">*</span><span class="n">layers</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-8'>
<div class='docs'>
<div class='section-link'>
<a href='#section-8'>#</a>
</div>
<p>Clear cache and print memory summary for debugging </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">62</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">empty_cache</span><span class="p">()</span>
<span class="lineno">63</span> <span class="nb">print</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">memory_summary</span><span class="p">())</span></pre></div>
</div>
</div>
<div class='section' id='section-9'>
<div class='docs'>
<div class='section-link'>
<a href='#section-9'>#</a>
</div>
<p>Get token ids </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">66</span> <span class="n">ids</span> <span class="o">=</span> <span class="n">get_tokens</span><span class="p">(</span><span class="n">PROMPT</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-10'>
<div class='docs'>
<div class='section-link'>
<a href='#section-10'>#</a>
</div>
<p>Run the model </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">69</span> <span class="n">cache</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s1">&#39;state_ids&#39;</span><span class="p">,</span> <span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span>
<span class="lineno">70</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Infer&#39;</span><span class="p">):</span>
<span class="lineno">71</span> <span class="n">next_token</span> <span class="o">=</span> <span class="n">infer</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">ids</span><span class="p">,</span> <span class="n">device</span><span class="p">)[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-11'>
<div class='docs'>
<div class='section-link'>
<a href='#section-11'>#</a>
</div>
<p>Append the predicted token </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">74</span> <span class="n">ids</span> <span class="o">+=</span> <span class="p">[</span><span class="n">next_token</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-12'>
<div class='docs'>
<div class='section-link'>
<a href='#section-12'>#</a>
</div>
<p>Predict 100 tokens </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">77</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">100</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-13'>
<div class='docs'>
<div class='section-link'>
<a href='#section-13'>#</a>
</div>
<p>Set the state to use cached activations </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">79</span> <span class="n">cache</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s1">&#39;state_ids&#39;</span><span class="p">,</span> <span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">i</span> <span class="o">+</span> <span class="mi">1</span><span class="p">))</span></pre></div>
</div>
</div>
<div class='section' id='section-14'>
<div class='docs'>
<div class='section-link'>
<a href='#section-14'>#</a>
</div>
<p>Get next token. Note that we only feed the last token to the model because we cache the key/value pairs of previous tokens. </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">82</span> <span class="k">with</span> <span class="n">monit</span><span class="o">.</span><span class="n">section</span><span class="p">(</span><span class="s1">&#39;Infer&#39;</span><span class="p">):</span>
<span class="lineno">83</span> <span class="n">next_token</span> <span class="o">=</span> <span class="n">infer</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="p">[</span><span class="n">next_token</span><span class="p">],</span> <span class="n">device</span><span class="p">)[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-15'>
<div class='docs'>
<div class='section-link'>
<a href='#section-15'>#</a>
</div>
<p>Append the predicted token </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">85</span> <span class="n">ids</span> <span class="o">+=</span> <span class="p">[</span><span class="n">next_token</span><span class="p">]</span></pre></div>
</div>
</div>
<div class='section' id='section-16'>
<div class='docs'>
<div class='section-link'>
<a href='#section-16'>#</a>
</div>
<p>Print </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">87</span> <span class="n">print_tokens</span><span class="p">(</span><span class="n">ids</span><span class="p">,</span> <span class="p">[</span><span class="n">ids</span><span class="p">])</span></pre></div>
</div>
</div>
<div class='section' id='section-17'>
<div class='docs'>
<div class='section-link'>
<a href='#section-17'>#</a>
</div>
<p> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">91</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="lineno">92</span> <span class="n">generate</span><span class="p">()</span></pre></div>
</div>
</div>
<div class='footer'>
<a href="https://papers.labml.ai">Trending Research Papers</a>
<a href="https://labml.ai">labml.ai</a>
</div>
</div>
<script src="../../interactive.js?v=1"></script>
<script>
function handleImages() {
var images = document.querySelectorAll('p>img')
for (var i = 0; i < images.length; ++i) {
handleImage(images[i])
}
}
function handleImage(img) {
img.parentElement.style.textAlign = 'center'
var modal = document.createElement('div')
modal.id = 'modal'
var modalContent = document.createElement('div')
modal.appendChild(modalContent)
var modalImage = document.createElement('img')
modalContent.appendChild(modalImage)
var span = document.createElement('span')
span.classList.add('close')
span.textContent = 'x'
modal.appendChild(span)
img.onclick = function () {
console.log('clicked')
document.body.appendChild(modal)
modalImage.src = img.src
}
span.onclick = function () {
document.body.removeChild(modal)
}
}
handleImages()
</script>
</body>
</html>

View File

@ -0,0 +1,247 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<meta name="description" content="Transform nn.Linear layers to 8-bit integer layers."/>
<meta name="twitter:card" content="summary"/>
<meta name="twitter:image:src" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta name="twitter:title" content="LLM.int8() on GPT-NeoX"/>
<meta name="twitter:description" content="Transform nn.Linear layers to 8-bit integer layers."/>
<meta name="twitter:site" content="@labmlai"/>
<meta name="twitter:creator" content="@labmlai"/>
<meta property="og:url" content="https://nn.labml.ai/neox/utils/llm_int8.html"/>
<meta property="og:title" content="LLM.int8() on GPT-NeoX"/>
<meta property="og:image" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
<meta property="og:site_name" content="LLM.int8() on GPT-NeoX"/>
<meta property="og:type" content="object"/>
<meta property="og:title" content="LLM.int8() on GPT-NeoX"/>
<meta property="og:description" content="Transform nn.Linear layers to 8-bit integer layers."/>
<title>LLM.int8() on GPT-NeoX</title>
<link rel="shortcut icon" href="/icon.png"/>
<link rel="stylesheet" href="../../pylit.css?v=1">
<link rel="canonical" href="https://nn.labml.ai/neox/utils/llm_int8.html"/>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.18/dist/katex.min.css" integrity="sha384-zTROYFVGOfTw7JV7KUu8udsvW2fx4lWOsCEDqhBreBwlHI4ioVRtmIvEThzJHGET" crossorigin="anonymous">
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-4V3HC8HBLH"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap snippet.
window.dataLayer = window.dataLayer || [];
// Queue commands into dataLayer until the async gtag.js library loads and drains it.
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-4V3HC8HBLH');
</script>
</head>
<body>
<div id='container'>
<div id="background"></div>
<div class='section'>
<div class='docs'>
<p>
<a class="parent" href="/">home</a>
<a class="parent" href="../index.html">neox</a>
<a class="parent" href="index.html">utils</a>
</p>
<p>
<a href="https://github.com/sponsors/labmlai" target="_blank">
<img alt="Sponsor"
src="https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86"
style="max-width:100%;"/></a>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations" target="_blank">
<img alt="Github"
src="https://img.shields.io/github/stars/labmlai/annotated_deep_learning_paper_implementations?style=social"
style="max-width:100%;"/></a>
<a href="https://twitter.com/labmlai" rel="nofollow" target="_blank">
<img alt="Twitter"
src="https://img.shields.io/twitter/follow/labmlai?style=social"
style="max-width:100%;"/></a>
</p>
<p>
<a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/neox/utils/llm_int8.py" target="_blank">
View code on Github</a>
</p>
</div>
</div>
<div class='section' id='section-0'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-0'>#</a>
</div>
            <h1>LLM.int8() on GPT-NeoX</h1>
<p>This implements a utility function to transform a <code class="highlight"><span></span><span class="n">nn</span><span class="o">.</span><span class="n">Linear</span></code>
layer to LLM.int8() linear layer.</p>
            <p><a href="https://papers.labml.ai/paper/eb2bcaee1d0011edaa66a71c10a887e7">LLM.int8() paper</a> shows you can use int8 quantization while handling outliers to reduce memory footprint without performance degradation in large language models. They convert weights and inputs to scaled 8-bit integers and do matrix multiplication producing int32 results, which are then converted back to float16 and rescaled. They show that in large language models, some features can give extreme values (outliers) that dominate the model&#x27;s output. These features get clamped in 8-bit integer space which causes the model performance to degrade. As a solution they pick these outliers (greater than a specified threshold) and compute their multiplications separately in float16 space. Since the percentage of outliers is around 0.01% this doesn&#x27;t increase memory usage, and prevents the model from degrading performance.</p>
            <p>The code to transform GPT-NeoX layers is defined in <a href="../model.html#post_load_prepare">model.py</a>.</p>
<p>Here are example uses of GPT-NeoX with int8 quantization.</p>
<ul><li><a href="../samples/llm_int8.html">Generate Text</a> </li>
<li><a href="../evaluation/llm_int8.html">Run Evaluation Tests</a></li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">33</span><span></span></pre></div>
</div>
</div>
<div class='section' id='section-1'>
<div class='docs'>
<div class='section-link'>
<a href='#section-1'>#</a>
</div>
<p>Import <a href="https://github.com/timdettmers/bitsandbytes"><code class="highlight"><span></span><span class="n">bitsandbytes</span></code>
</a> package </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">34</span><span class="k">try</span><span class="p">:</span>
<span class="lineno">35</span> <span class="kn">from</span> <span class="nn">bitsandbytes.nn</span> <span class="kn">import</span> <span class="n">Linear8bitLt</span><span class="p">,</span> <span class="n">Int8Params</span>
<span class="lineno">36</span><span class="k">except</span> <span class="ne">ImportError</span><span class="p">:</span>
<span class="lineno">37</span> <span class="k">raise</span> <span class="ne">ImportError</span><span class="p">(</span><span class="s1">&#39;&#39;&#39;Please install `bitsandbytes` with `pip install bitsandbytes -U`&#39;&#39;&#39;</span><span class="p">)</span>
<span class="lineno">38</span>
<span class="lineno">39</span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">40</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span></pre></div>
</div>
</div>
<div class='section' id='section-2'>
<div class='docs doc-strings'>
<div class='section-link'>
<a href='#section-2'>#</a>
</div>
<h2>Transform a <code class="highlight"><span></span><span class="n">nn</span><span class="o">.</span><span class="n">Linear</span></code>
layer to LLM.int8() linear layer</h2>
<ul><li><code class="highlight"><span></span><span class="n">linear_module</span></code>
is the <code class="highlight"><span></span><span class="n">nn</span><span class="o">.</span><span class="n">Linear</span></code>
layer to transform </li>
<li><code class="highlight"><span></span><span class="n">device</span></code>
is the device of the model </li>
<li><code class="highlight"><span></span><span class="n">threshold</span></code>
is the threshold <span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord mathnormal" style="margin-right:0.0037em;">α</span></span></span></span> to use for outlier detection</li></ul>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">43</span><span class="k">def</span> <span class="nf">make_llm_int8_linear</span><span class="p">(</span><span class="n">linear_module</span><span class="p">:</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">,</span> <span class="n">device</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">threshold</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">6.0</span><span class="p">):</span></pre></div>
</div>
</div>
<div class='section' id='section-3'>
<div class='docs'>
<div class='section-link'>
<a href='#section-3'>#</a>
</div>
<p> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">53</span> <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">linear_module</span><span class="p">,</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-4'>
<div class='docs'>
<div class='section-link'>
<a href='#section-4'>#</a>
</div>
<p>Create an empty Linear8bitLt module </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">56</span> <span class="n">int8_lin</span> <span class="o">=</span> <span class="n">Linear8bitLt</span><span class="p">(</span>
<span class="lineno">57</span> <span class="n">linear_module</span><span class="o">.</span><span class="n">in_features</span><span class="p">,</span>
<span class="lineno">58</span> <span class="n">linear_module</span><span class="o">.</span><span class="n">out_features</span><span class="p">,</span>
<span class="lineno">59</span> <span class="n">linear_module</span><span class="o">.</span><span class="n">bias</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span>
<span class="lineno">60</span> <span class="n">has_fp16_weights</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="lineno">61</span> <span class="n">threshold</span><span class="o">=</span><span class="n">threshold</span><span class="p">,</span>
<span class="lineno">62</span> <span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-5'>
<div class='docs'>
<div class='section-link'>
<a href='#section-5'>#</a>
</div>
<p>Quantize the weights </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">65</span> <span class="n">int8_lin</span><span class="o">.</span><span class="n">_parameters</span><span class="p">[</span><span class="s1">&#39;weight&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">Int8Params</span><span class="p">(</span><span class="n">linear_module</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">cpu</span><span class="p">(),</span>
<span class="lineno">66</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="lineno">67</span> <span class="n">has_fp16_weights</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-6'>
<div class='docs'>
<div class='section-link'>
<a href='#section-6'>#</a>
</div>
<p>Set the bias in float16 space </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">70</span> <span class="k">if</span> <span class="n">linear_module</span><span class="o">.</span><span class="n">bias</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">71</span> <span class="n">int8_lin</span><span class="o">.</span><span class="n">_parameters</span><span class="p">[</span><span class="s1">&#39;bias&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">linear_module</span><span class="o">.</span><span class="n">bias</span><span class="o">.</span><span class="n">data</span><span class="p">,</span>
<span class="lineno">72</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></pre></div>
</div>
</div>
<div class='section' id='section-7'>
<div class='docs'>
<div class='section-link'>
<a href='#section-7'>#</a>
</div>
<p> </p>
</div>
<div class='code'>
<div class="highlight"><pre><span class="lineno">75</span> <span class="k">return</span> <span class="n">int8_lin</span></pre></div>
</div>
</div>
<div class='footer'>
<a href="https://papers.labml.ai">Trending Research Papers</a>
<a href="https://labml.ai">labml.ai</a>
</div>
</div>
    <script src="../../interactive.js?v=1"></script>
<script>
function handleImages() {
var images = document.querySelectorAll('p>img')
for (var i = 0; i < images.length; ++i) {
handleImage(images[i])
}
}
function handleImage(img) {
img.parentElement.style.textAlign = 'center'
var modal = document.createElement('div')
modal.id = 'modal'
var modalContent = document.createElement('div')
modal.appendChild(modalContent)
var modalImage = document.createElement('img')
modalContent.appendChild(modalImage)
var span = document.createElement('span')
span.classList.add('close')
span.textContent = 'x'
modal.appendChild(span)
img.onclick = function () {
console.log('clicked')
document.body.appendChild(modal)
modalImage.src = img.src
}
span.onclick = function () {
document.body.removeChild(modal)
}
}
handleImages()
</script>
</body>
</html>

View File

@ -134,98 +134,119 @@
<url>
<loc>https://nn.labml.ai/neox/checkpoint.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/index.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/utils/llm_int8.html</loc>
<lastmod>2022-08-19T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/utils/cache.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/utils/index.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/utils/text_dataset.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/utils/trainer.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/utils/finetune.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/model.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-19T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/samples/llm_int8.html</loc>
<lastmod>2022-08-19T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/samples/generate.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/samples/index.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/samples/finetune.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/tokenizer.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/evaluation/llm_int8.html</loc>
<lastmod>2022-08-19T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/evaluation/half_precision.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/neox/evaluation/index.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
@ -463,7 +484,7 @@
<url>
<loc>https://nn.labml.ai/index.html</loc>
<lastmod>2022-08-08T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
@ -575,7 +596,7 @@
<url>
<loc>https://nn.labml.ai/optimizers/adam_fp16.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
@ -659,21 +680,21 @@
<url>
<loc>https://nn.labml.ai/scaling/index.html</loc>
<lastmod>2022-08-09T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/scaling/zero3/index.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/scaling/zero3/finetune_neox.html</loc>
<lastmod>2022-08-10T16:30:00+00:00</lastmod>
<lastmod>2022-08-11T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>

View File

@ -518,7 +518,22 @@ class LayerGenerator:
device: torch.device = None,
llm_int8_threshold: float = None,
):
# If we are using int8 quantization, we need to convert the layer to int8
"""
<a id="post_load_prepare"></a>
### Layer transformations after loading the checkpoint
This function implements layer transformations after loading the checkpoint.
Currently, it only applies the int8 quantization.
:param layer: is the layer to prepare
:param is_llm_int8: specifies whether to use int8 quantization
:param device: is the device of the model
:param llm_int8_threshold: is the threshold $\alpha$ used to separate outlier features
:return: the prepared layer
"""
# Get default values if not specified
if is_llm_int8 is None:
is_llm_int8 = self.is_llm_int8
if device is None:
@ -526,6 +541,7 @@ class LayerGenerator:
if llm_int8_threshold is None:
llm_int8_threshold = self.llm_int8_threshold
# Skip if not using int8 quantization
if not is_llm_int8:
return layer
@ -536,7 +552,7 @@ class LayerGenerator:
# Use `make_llm_int8_linear` defined in [utilities](./utils/llm_int8.html).
from labml_nn.neox.utils.llm_int8 import make_llm_int8_linear
#
# Convert the linear layers
with monit.section('Convert to int8'):
layer.attention.output = make_llm_int8_linear(layer.attention.output,
device=device,

View File

@ -1,3 +1,17 @@
"""
---
title: Generate Text with GPT-NeoX using LLM.int8() quantization
summary: >
Generate Text with GPT-NeoX using LLM.int8() quantization
---
# Generate Text with GPT-NeoX using LLM.int8() quantization
This shows how to generate text from GPT-NeoX using [LLM.int8() quantization](../utils/llm_int8.html).
This needs a GPU with more than 45GB memory.
"""
from typing import List
import torch
@ -5,31 +19,10 @@ from torch import nn
from labml import monit
from labml_nn.neox.model import LayerGenerator
from labml_nn.neox.samples.generate import PROMPT, infer
from labml_nn.neox.utils import get_tokens, print_tokens
from labml_nn.neox.utils.cache import get_cache
# Prompt to complete
PROMPT = 'Einstein was born in the German Empire, but moved to Switzerland in 1895, forsaking his German'
def infer(model: nn.Module, ids: List[int], device: torch.device):
"""
### Predict the next token
:param layers: is the list of layers
:param ids: are the input token ids
:param device: is the device of the model
"""
with torch.no_grad():
# Get the tokens
x = torch.tensor(ids)[None, :].to(device)
# Eval model
x = model(x)
# Return predicted token
return x[0].max(dim=-1)[1].tolist()
def generate():
"""
@ -43,12 +36,14 @@ def generate():
# Device
device = torch.device('cuda:0')
# Load layers in float16 into CPU. We convert the layers to int8 later, because doing that
# on the fly after loading layers to GPU causes CUDA memory fragmentation
# (about 3GB memory can get lost due to fragmentation).
layer_generator = LayerGenerator(is_clone_layers=True,
dtype=torch.float16,
device=torch.device('cpu'),
# is_llm_int8=True,
)
# Load layers
layers = list(layer_generator.load())
# This reduces CUDA memory fragmentation
@ -60,10 +55,11 @@ def generate():
)
layer.to(device)
# Create `nn.Sequential` model
model = nn.Sequential(*layers)
# Clear cache and print memory summary for debugging
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())
# Get token ids

View File

@ -1,8 +1,36 @@
"""
* [Generate](../samples/llm_int8.html)
* [Evaluation](../evaluation/llm_int8.html)
---
title: LLM.int8() on GPT-NeoX
summary: >
Transform nn.Linear layers to 8-bit integer layers.
---
# LLM.int8() on GPT-NeoX
This implements a utility function to transform a `nn.Linear` layer to LLM.int8() linear layer.
[LLM.int8() paper](https://papers.labml.ai/paper/eb2bcaee1d0011edaa66a71c10a887e7)
shows you can use int8 quantization while handling outliers to
reduce memory footprint without performance degradation in large language models.
They convert weights and inputs to scaled 8-bit integers and do matrix multiplication
producing int32 results, which are then converted back to float16 and rescaled.
They show that in large language models, some features can give extreme values (outliers)
that dominate the model's output.
These features get clamped in 8-bit integer space which causes the model performance to degrade.
As a solution they pick these outliers (greater than a specified threshold)
and compute their multiplications separately in float16 space.
Since the percentage of outliers is around 0.01% this doesn't increase memory usage,
and prevents the model from degrading performance.
The code to transform GPT-NeoX layers is defined in [model.py](../model.html#post_load_prepare).
Here are example uses of GPT-NeoX with int8 quantization.
* [Generate Text](../samples/llm_int8.html)
* [Run Evaluation Tests](../evaluation/llm_int8.html)
"""
# Import [`bitsandbytes`](https://github.com/timdettmers/bitsandbytes) package
try:
from bitsandbytes.nn import Linear8bitLt, Int8Params
except ImportError:
@ -13,7 +41,18 @@ from torch import nn
def make_llm_int8_linear(linear_module: nn.Linear, device: torch.device, threshold: float = 6.0):
# Create a Linear8bitLt module
"""
## Transform a `nn.Linear` layer to LLM.int8() linear layer
:param linear_module: is the `nn.Linear` layer to transform
:param device: is the device of the model
:param threshold: is the threshold $\alpha$ to use for outlier detection
"""
#
assert isinstance(linear_module, nn.Linear)
# Create an empty Linear8bitLt module
int8_lin = Linear8bitLt(
linear_module.in_features,
linear_module.out_features,
@ -22,15 +61,15 @@ def make_llm_int8_linear(linear_module: nn.Linear, device: torch.device, thresho
threshold=threshold,
)
# Set the weights
# Quantize the weights
int8_lin._parameters['weight'] = Int8Params(linear_module.weight.data.cpu(),
requires_grad=False,
has_fp16_weights=False).to(device)
# Set the bias.
# We don't have to convert this to Int8 since it doesn't use a lot of memory.
# Set the bias in float16 space
if linear_module.bias is not None:
int8_lin._parameters['bias'] = nn.Parameter(linear_module.bias.data,
requires_grad=False)
#
return int8_lin