Varuna Jayasiri
2022-08-20 11:13:36 +05:30
parent e19d95f9c3
commit 4860cc680b
8 changed files with 394 additions and 227 deletions

View File

@@ -1,19 +1,39 @@
+"""
+---
+title: Evaluate GPT-NeoX using half precision on test suite
+summary: >
+ Evaluate GPT-NeoX using half precision on test suite
+---
+
+# Evaluate GPT-NeoX using half precision on test suite
+
+This code evaluates [GPT-NeoX](../index.html) in half precision on a suite of tasks.
+"""
+
 import torch
 from torch import nn
 
 from labml import monit
 from labml_nn.neox.evaluation import run_eval_harness
 from labml_nn.neox.model import LayerGenerator
 
-if __name__ == '__main__':
+
+def main():
+    # Device
     device = torch.device('cuda:0')
+
+    # Load layers
     layers = list(LayerGenerator(is_clone_layers=True,
                                  filter_layers=None,
                                  dtype=torch.float16,
                                  device=device
                                  ).load())
 
-    with monit.section('Sequential'):
-        model = nn.Sequential(*layers)
+    # Create `nn.Sequential` model
+    model = nn.Sequential(*layers)
 
+    # Run [evaluation harness](index.html)
     print(run_eval_harness(model, 'half_precision', ['lambada'], device))
+
+#
+if __name__ == '__main__':
+    main()
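The hunk above turns the script body into a `main()` entry point: layers are loaded in float16, wrapped in `nn.Sequential`, and handed to the evaluation harness. Below is a minimal sketch of how this entry point can be reused with a different task list; the `evaluate` wrapper and its `tasks` parameter are illustrative and not part of the commit, while the helpers are the same `labml_nn` ones used above.

import torch
from torch import nn

from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator


def evaluate(tasks: list):
    # Evaluation runs on a single GPU
    device = torch.device('cuda:0')
    # Load all layers in float16 straight onto the GPU
    layers = list(LayerGenerator(is_clone_layers=True,
                                 filter_layers=None,
                                 dtype=torch.float16,
                                 device=device).load())
    # Wrap the layers so the harness sees a single module
    model = nn.Sequential(*layers)
    # Run the selected tasks and return the results
    return run_eval_harness(model, 'half_precision', tasks, device)


if __name__ == '__main__':
    print(evaluate(['lambada']))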

View File

@@ -1,3 +1,16 @@
+"""
+---
+title: Evaluate GPT-NeoX using LLM.int8() quantization on test suite
+summary: >
+ Evaluate GPT-NeoX using LLM.int8() quantization on test suite
+---
+
+# Evaluate GPT-NeoX using LLM.int8() quantization on test suite
+
+This code evaluates [GPT-NeoX](../index.html) using [LLM.int8() quantization](../utils/llm_int8.html)
+on a suite of tasks.
+"""
+
 import torch
 from torch import nn
@@ -5,8 +18,14 @@ from labml import monit
 from labml_nn.neox.evaluation import run_eval_harness
 from labml_nn.neox.model import LayerGenerator
 
-if __name__ == '__main__':
+
+def main():
+    # Device
     device = torch.device('cuda:0')
+
+    # Load layers in float16 into CPU. We convert the layers to int8 later, because doing that
+    # on the fly after loading layers to the GPU causes CUDA memory fragmentation
+    # (about 3GB of memory can be lost to fragmentation).
     layer_generator = LayerGenerator(is_clone_layers=True,
                                      dtype=torch.float16,
                                      device=torch.device('cpu'),
@@ -23,7 +42,13 @@ if __name__ == '__main__':
                                           )
         layer.to(device)
 
-    with monit.section('Sequential'):
-        model = nn.Sequential(*layers)
+    # Create `nn.Sequential` model
+    model = nn.Sequential(*layers)
 
+    # Run [evaluation harness](index.html)
     print(run_eval_harness(model, 'half_precision', [], device))
+
+#
+if __name__ == '__main__':
+    main()
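The new comment documents the key trick in this file: load the checkpoint in float16 on the CPU, convert each layer to int8, and only then move it to the GPU, so the CUDA heap never fragments. Below is a sketch of that two-phase pattern; it assumes `post_load_prepare` takes the layer plus the device and int8 arguments (the call itself is elided in the hunk above), and the threshold value is illustrative.

import torch
from torch import nn

from labml import monit
from labml_nn.neox.model import LayerGenerator


def load_int8_model(device: torch.device) -> nn.Sequential:
    # Phase 1: load weights in float16 on the CPU,
    # so nothing large is allocated on the GPU yet
    generator = LayerGenerator(is_clone_layers=True,
                               dtype=torch.float16,
                               device=torch.device('cpu'))
    layers = list(generator.load())

    # Phase 2: convert each layer to int8 and only then move it to the GPU;
    # converting after moving is what fragments CUDA memory
    for layer in monit.iterate('Convert to int8', layers):
        # Argument names follow the diff above; the threshold is illustrative
        generator.post_load_prepare(layer,
                                    device=device,
                                    is_llm_int8=True,
                                    llm_int8_threshold=6.0)
        layer.to(device)

    return nn.Sequential(*layers)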

View File

@@ -520,6 +520,7 @@ class LayerGenerator:
                            ):
         """
+        <a id="post_load_prepare"></a>
         ### Layer transformations after loading the checkpoint
 
         This function implements layer transformations after loading the checkpoint.
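When int8 is enabled, the transformation this function applies amounts to swapping each float16 `nn.Linear` for an int8 linear layer. Below is a hedged sketch of that swap using the standard bitsandbytes LLM.int8() recipe; the actual code in [`utils/llm_int8.py`](../utils/llm_int8.html) may differ in detail.

import torch
from torch import nn
from bitsandbytes.nn import Linear8bitLt, Int8Params


def make_int8_linear(linear: nn.Linear, device: torch.device,
                     threshold: float = 6.0) -> Linear8bitLt:
    # int8 linear layer that keeps outlier feature dimensions in float16
    int8_linear = Linear8bitLt(linear.in_features, linear.out_features,
                               bias=linear.bias is not None,
                               has_fp16_weights=False,
                               threshold=threshold)
    # Wrap the existing float16 weights; they are quantized on `.to(device)`
    int8_linear.weight = Int8Params(linear.weight.data,
                                    requires_grad=False,
                                    has_fp16_weights=False)
    if linear.bias is not None:
        int8_linear.bias = nn.Parameter(linear.bias.data, requires_grad=False)
    return int8_linear.to(device)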

View File

@@ -9,11 +9,9 @@ summary: >
 This shows how to generate text from GPT-NeoX using [LLM.int8() quantization](../utils/llm_int8.html).
 
-This needs a GPU with more than 45GB memory.
+This needs a GPU with 24GB of memory.
 """
 
-from typing import List
 import torch
 from torch import nn
@@ -42,7 +40,7 @@ def generate():
     layer_generator = LayerGenerator(is_clone_layers=True,
                                      dtype=torch.float16,
                                      device=torch.device('cpu'),
-                                     # is_llm_int8=True,
+                                     is_llm_int8=False,
                                      )
     layers = list(layer_generator.load())
@@ -65,7 +63,8 @@ def generate():
     # Get token ids
     ids = get_tokens(PROMPT)
 
-    # Run the model
+    # Run the model.
+    # We use the [`infer`](generate.html) function defined in [`generate.py`](generate.html)
     cache.set('state_ids', (None, 1))
     with monit.section('Infer'):
         next_token = infer(model, ids, device)[-1]
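The `infer` call above produces only the first generated token; generation continues by feeding each new token back in while the cache carries the transformer state between calls. Below is a minimal sketch of that loop, assuming the `infer`, `get_tokens`, `get_cache`, and `PROMPT` helpers used in this sample; the module paths and the 100-token limit are illustrative.

import torch

from labml import monit
from labml_nn.neox.utils import get_tokens
from labml_nn.neox.utils.cache import get_cache
from labml_nn.neox.samples.generate import infer, PROMPT  # assumed module path


def generate_tokens(model, device: torch.device, n_tokens: int = 100):
    cache = get_cache()
    # Token ids for the prompt
    ids = get_tokens(PROMPT)

    # First pass over the whole prompt; the state is written to cache slot 1
    cache.set('state_ids', (None, 1))
    with monit.section('Infer'):
        next_token = infer(model, ids, device)[-1]
    ids += [next_token]

    for i in range(1, n_tokens):
        # Read the state written by the previous step and write a new slot
        cache.set('state_ids', (i - 1, i))
        # Feed only the newest token; the cache supplies the earlier state
        next_token = infer(model, [next_token], device)[-1]
        ids += [next_token]

    return ids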