Varuna Jayasiri
2022-08-20 11:13:36 +05:30
parent e19d95f9c3
commit 4860cc680b
8 changed files with 394 additions and 227 deletions

View File

@@ -1,19 +1,39 @@
+"""
+---
+title: Evaluate GPT-NeoX using half precision on test suite
+summary: >
+ Evaluate GPT-NeoX using half precision on test suite
+---
+
+# Evaluate GPT-NeoX using half precision on test suite
+
+This code evaluates [GPT-NeoX](../index.html) in half precision on a suite of tasks.
+"""
+
 import torch
 from torch import nn
 
 from labml import monit
 from labml_nn.neox.evaluation import run_eval_harness
 from labml_nn.neox.model import LayerGenerator
 
-if __name__ == '__main__':
+
+def main():
+    # Device
     device = torch.device('cuda:0')
+
+    # Load layers
     layers = list(LayerGenerator(is_clone_layers=True,
                                  filter_layers=None,
                                  dtype=torch.float16,
                                  device=device
                                  ).load())
 
-    with monit.section('Sequential'):
-        model = nn.Sequential(*layers)
+    # Create `nn.Sequential` model
+    model = nn.Sequential(*layers)
 
+    # Run [evaluation harness](index.html)
     print(run_eval_harness(model, 'half_precision', ['lambada'], device))
+
+#
+if __name__ == '__main__':
+    main()
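The hunk above turns the script body into a `main()` entry point: layers are loaded in float16, wrapped in `nn.Sequential`, and handed to the evaluation harness. Below is a minimal sketch of how this entry point can be reused with a different task list; the `evaluate` wrapper and its `tasks` parameter are illustrative and not part of the commit, while the helpers are the same `labml_nn` ones used above.

import torch
from torch import nn

from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator


def evaluate(tasks: list):
    # Evaluation runs on a single GPU
    device = torch.device('cuda:0')
    # Load all layers in float16 straight onto the GPU
    layers = list(LayerGenerator(is_clone_layers=True,
                                 filter_layers=None,
                                 dtype=torch.float16,
                                 device=device).load())
    # Wrap the layers so the harness sees a single module
    model = nn.Sequential(*layers)
    # Run the selected tasks and return the results
    return run_eval_harness(model, 'half_precision', tasks, device)


if __name__ == '__main__':
    print(evaluate(['lambada']))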

View File

@@ -1,3 +1,16 @@
+"""
+---
+title: Evaluate GPT-NeoX using LLM.int8() quantization on test suite
+summary: >
+ Evaluate GPT-NeoX using LLM.int8() quantization on test suite
+---
+
+# Evaluate GPT-NeoX using LLM.int8() quantization on test suite
+
+This code evaluates [GPT-NeoX](../index.html) using [LLM.int8() quantization](../utils/llm_int8.html)
+on a suite of tasks.
+"""
+
 import torch
 from torch import nn
@@ -5,8 +18,14 @@ from labml import monit
 from labml_nn.neox.evaluation import run_eval_harness
 from labml_nn.neox.model import LayerGenerator
 
-if __name__ == '__main__':
+
+def main():
+    # Device
     device = torch.device('cuda:0')
+
+    # Load layers in float16 into CPU. We convert the layers to int8 later, because doing that
+    # on the fly after loading layers to the GPU causes CUDA memory fragmentation
+    # (about 3GB of memory can be lost to fragmentation).
     layer_generator = LayerGenerator(is_clone_layers=True,
                                      dtype=torch.float16,
                                      device=torch.device('cpu'),
@@ -23,7 +42,13 @@ if __name__ == '__main__':
                                           )
         layer.to(device)
 
-    with monit.section('Sequential'):
-        model = nn.Sequential(*layers)
+    # Create `nn.Sequential` model
+    model = nn.Sequential(*layers)
 
+    # Run [evaluation harness](index.html)
     print(run_eval_harness(model, 'half_precision', [], device))
+
+#
+if __name__ == '__main__':
+    main()
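The new comment documents the key trick in this file: load the checkpoint in float16 on the CPU, convert each layer to int8, and only then move it to the GPU, so the CUDA heap never fragments. Below is a sketch of that two-phase pattern; it assumes `post_load_prepare` takes the layer plus the device and int8 arguments (the call itself is elided in the hunk above), and the threshold value is illustrative.

import torch
from torch import nn

from labml import monit
from labml_nn.neox.model import LayerGenerator


def load_int8_model(device: torch.device) -> nn.Sequential:
    # Phase 1: load weights in float16 on the CPU,
    # so nothing large is allocated on the GPU yet
    generator = LayerGenerator(is_clone_layers=True,
                               dtype=torch.float16,
                               device=torch.device('cpu'))
    layers = list(generator.load())

    # Phase 2: convert each layer to int8 and only then move it to the GPU;
    # converting after moving is what fragments CUDA memory
    for layer in monit.iterate('Convert to int8', layers):
        # Argument names follow the diff above; the threshold is illustrative
        generator.post_load_prepare(layer,
                                    device=device,
                                    is_llm_int8=True,
                                    llm_int8_threshold=6.0)
        layer.to(device)

    return nn.Sequential(*layers)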

View File

@@ -520,6 +520,7 @@ class LayerGenerator:
                            ):
         """
+        <a id="post_load_prepare"></a>
         ### Layer transformations after loading the checkpoint
 
         This function implements layer transformations after loading the checkpoint.
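When int8 is enabled, the transformation this function applies amounts to swapping each float16 `nn.Linear` for an int8 linear layer. Below is a hedged sketch of that swap using the standard bitsandbytes LLM.int8() recipe; the actual code in [`utils/llm_int8.py`](../utils/llm_int8.html) may differ in detail.

import torch
from torch import nn
from bitsandbytes.nn import Linear8bitLt, Int8Params


def make_int8_linear(linear: nn.Linear, device: torch.device,
                     threshold: float = 6.0) -> Linear8bitLt:
    # int8 linear layer that keeps outlier feature dimensions in float16
    int8_linear = Linear8bitLt(linear.in_features, linear.out_features,
                               bias=linear.bias is not None,
                               has_fp16_weights=False,
                               threshold=threshold)
    # Wrap the existing float16 weights; they are quantized on `.to(device)`
    int8_linear.weight = Int8Params(linear.weight.data,
                                    requires_grad=False,
                                    has_fp16_weights=False)
    if linear.bias is not None:
        int8_linear.bias = nn.Parameter(linear.bias.data, requires_grad=False)
    return int8_linear.to(device)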

View File

@@ -9,11 +9,9 @@ summary: >
 This shows how to generate text from GPT-NeoX using [LLM.int8() quantization](../utils/llm_int8.html).
 
-This needs a GPU with more than 45GB memory.
+This needs a GPU with 24GB of memory.
 """
 
-from typing import List
 import torch
 from torch import nn
@@ -42,7 +40,7 @@ def generate():
     layer_generator = LayerGenerator(is_clone_layers=True,
                                      dtype=torch.float16,
                                      device=torch.device('cpu'),
-                                     # is_llm_int8=True,
+                                     is_llm_int8=False,
                                      )
     layers = list(layer_generator.load())
@@ -65,7 +63,8 @@ def generate():
     # Get token ids
     ids = get_tokens(PROMPT)
 
-    # Run the model
+    # Run the model.
+    # We use the [`infer`](generate.html) function defined in [`generate.py`](generate.html)
     cache.set('state_ids', (None, 1))
     with monit.section('Infer'):
         next_token = infer(model, ids, device)[-1]
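The `infer` call above produces only the first generated token; generation continues by feeding each new token back in while the cache carries the transformer state between calls. Below is a minimal sketch of that loop, assuming the `infer`, `get_tokens`, `get_cache`, and `PROMPT` helpers used in this sample; the module paths and the 100-token limit are illustrative.

import torch

from labml import monit
from labml_nn.neox.utils import get_tokens
from labml_nn.neox.utils.cache import get_cache
from labml_nn.neox.samples.generate import infer, PROMPT  # assumed module path


def generate_tokens(model, device: torch.device, n_tokens: int = 100):
    cache = get_cache()
    # Token ids for the prompt
    ids = get_tokens(PROMPT)

    # First pass over the whole prompt; the state is written to cache slot 1
    cache.set('state_ids', (None, 1))
    with monit.section('Infer'):
        next_token = infer(model, ids, device)[-1]
    ids += [next_token]

    for i in range(1, n_tokens):
        # Read the state written by the previous step and write a new slot
        cache.set('state_ids', (i - 1, i))
        # Feed only the newest token; the cache supplies the earlier state
        next_token = infer(model, [next_token], device)[-1]
        ids += [next_token]

    return ids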