mirror of
https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
synced 2025-10-29 17:57:14 +08:00
docs
@@ -1,19 +1,39 @@
"""
---
title: Evaluate GPT-NeoX using LLM.int8() quantization on test suite
summary: >
  Evaluate GPT-NeoX using LLM.int8() quantization on test suite
---

# Evaluate GPT-NeoX using LLM.int8() quantization on test suite

This code evaluates [GPT-NeoX](../index.html) on a suite of tasks.
"""

import torch
from torch import nn

from labml import monit
from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator

if __name__ == '__main__':

def main():
    # Device
    device = torch.device('cuda:0')
    # Load layers
    layers = list(LayerGenerator(is_clone_layers=True,
                                 filter_layers=None,
                                 dtype=torch.float16,
                                 device=device
                                 ).load())

    with monit.section('Sequential'):
        model = nn.Sequential(*layers)
    # Create `nn.Sequential` model
    model = nn.Sequential(*layers)

    # Run [evaluation harness](index.html)
    print(run_eval_harness(model, 'half_precision', ['lambada'], device))


#
if __name__ == '__main__':
    main()

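Read without diff markers, the hunk above can be confusing: it replaces the module-level `if __name__ == '__main__':` block with a `main()` function that the guard then calls. A minimal sketch of that refactor, with the body elided:

# Before: the evaluation ran directly under the import guard.
if __name__ == '__main__':
    ...  # load layers, build the model, run the eval harness

# After: the same steps live in main(), which the guard calls.
def main():
    ...  # load layers, build the model, run the eval harness

if __name__ == '__main__':
    main()
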
@@ -1,3 +1,16 @@
"""
---
title: Evaluate GPT-NeoX using LLM.int8() quantization on test suite
summary: >
  Evaluate GPT-NeoX using LLM.int8() quantization on test suite
---

# Evaluate GPT-NeoX using LLM.int8() quantization on test suite

This code evaluates [GPT-NeoX](../index.html) using [LLM.int8() quantization](../utils/llm_int8.html),
on a suite of tasks.
"""

import torch
from torch import nn

@@ -5,8 +18,14 @@ from labml import monit
from labml_nn.neox.evaluation import run_eval_harness
from labml_nn.neox.model import LayerGenerator

if __name__ == '__main__':

def main():
    # Device
    device = torch.device('cuda:0')

    # Load layers in float16 into CPU. We convert the layers to int8 later, because doing that
    # on the fly after loading layers to GPU causes CUDA memory fragmentation
    # (about 3GB memory can get lost due to fragmentation).
    layer_generator = LayerGenerator(is_clone_layers=True,
                                     dtype=torch.float16,
                                     device=torch.device('cpu'),
@@ -23,7 +42,13 @@ if __name__ == '__main__':
                                          )
        layer.to(device)

    with monit.section('Sequential'):
        model = nn.Sequential(*layers)
    # Create `nn.Sequential` model
    model = nn.Sequential(*layers)

    # Run [evaluation harness](index.html)
    print(run_eval_harness(model, 'half_precision', [], device))


#
if __name__ == '__main__':
    main()

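The comment above explains the loading strategy: load the checkpoint in float16 on the CPU, convert each layer to int8, and only then move it to the GPU, so that no CUDA memory is lost to fragmentation. A minimal sketch of that flow, assuming the `post_load_prepare` keyword arguments and the `monit.iterate` wrapper inferred from this diff:

import torch
from labml import monit
from labml_nn.neox.model import LayerGenerator

device = torch.device('cuda:0')

# Load the layers in float16 on the CPU to avoid CUDA memory fragmentation.
layer_generator = LayerGenerator(is_clone_layers=True,
                                 dtype=torch.float16,
                                 device=torch.device('cpu'),
                                 is_llm_int8=False,
                                 )
layers = list(layer_generator.load())

# Convert each layer to int8, then move it to the GPU (keyword arguments are assumptions).
for layer in monit.iterate('Convert to int8', layers):
    layer_generator.post_load_prepare(layer, device=device, is_llm_int8=True)
    layer.to(device)
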
@@ -520,6 +520,7 @@ class LayerGenerator:
                          ):
        """
        <a id="post_load_prepare"></a>

        ### Layer transformations after loading the checkpoint

        This function implements layer transformations after loading the checkpoint.
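For illustration, a hedged sketch of invoking this hook on one freshly loaded layer; the keyword arguments shown (`device`, `is_llm_int8`, `llm_int8_threshold`) are assumptions inferred from how the evaluation script above drives it, not a confirmed signature:

# Apply post-load transformations (e.g. int8 conversion) to a single layer,
# then move it to the GPU. Keyword arguments here are assumptions.
layer_generator.post_load_prepare(layer,
                                  device=torch.device('cuda:0'),
                                  is_llm_int8=True,
                                  llm_int8_threshold=6.0,
                                  )
layer.to(torch.device('cuda:0'))
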
@@ -9,11 +9,9 @@ summary: >

This shows how to generate text from GPT-NeoX using [LLM.int8() quantization](../utils/llm_int8.html).

This needs a GPU with more than 45GB memory.
This needs a GPU with 24GB memory.
"""

from typing import List

import torch
from torch import nn

@@ -42,7 +40,7 @@ def generate():
    layer_generator = LayerGenerator(is_clone_layers=True,
                                     dtype=torch.float16,
                                     device=torch.device('cpu'),
                                     # is_llm_int8=True,
                                     is_llm_int8=False,
                                     )
    layers = list(layer_generator.load())

@@ -65,7 +63,8 @@ def generate():
    # Get token ids
    ids = get_tokens(PROMPT)

    # Run the model
    # Run the model.
    # We use the [`infer`](generate.html) function defined in [`generate.py`](generate.html)
    cache.set('state_ids', (None, 1))
    with monit.section('Infer'):
        next_token = infer(model, ids, device)[-1]
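For context, a sketch of how a generation loop might continue from the `infer` call above. Feeding only the newest token back, with earlier keys and values coming from the cache selected via `cache.set('state_ids', ...)`, follows the pattern in this diff; the token count and the exact cache bookkeeping are assumptions.

# Hedged sketch: continue greedy generation from the first predicted token.
# Assumes infer(model, ids, device) returns a next-token id per input position.
ids += [next_token]
for i in range(1, 100):
    # Advance the cached state window by one position (assumed convention).
    cache.set('state_ids', (i, i + 1))
    with monit.section('Infer'):
        # Only the newest token is fed in; earlier keys/values come from the cache.
        next_token = infer(model, [next_token], device)[-1]
    ids += [next_token]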