Extend to support models of different sizes, and add "argparse" and README.

This commit is contained in:
wangxu
2022-10-15 00:45:37 +08:00
committed by wangxu
parent 15ab9ffc02
commit 4018afe61d
9 changed files with 201 additions and 50051 deletions

View File

File diff suppressed because it is too large Load Diff

View File

File diff suppressed because one or more lines are too long

30
Convert2HF/README.md Normal file
View File

@@ -0,0 +1,30 @@
# Convert to HuggingFace
This directory contains a script `convert_neox_pt_to_huggingface_neox.py` to convert PolyCoder checkpoints trained by [gpt-neox](https://github.com/EleutherAI/gpt-neox) into HuggingFace format, and a script `generate.py` to load the converted model and generate code from a given prompt.
## Environment
transformers 4.23.1
## Convert
The script `convert.sh` is an example of converting a 0.4B parameter model to HuggingFace format:
```
python convert_neox_pt_to_huggingface_neox.py \
--checkpoint_dir ../checkpoints/checkpoints-0-4B/global_step150000 \
--vocab_file ../Data/code-vocab.json \
--merge_file ../Data/code-merges.txt \
--hf_config_path ./polycoder/configs/config_0-4B.json \
--hf_save_dir ./polycoder/0-4B
```
HuggingFace configuration files for different size models are provided in `polycoder/configs/`, including `config_0-4B.json`, `config_2-7B.json` and `config_160M.json`.
After the script finishes, the complete HuggingFace model (weights, configuration and tokenizer files) is saved in the directory given by `--hf_save_dir`. If that directory does not exist, it is created automatically.
## Generate
The following is an example to load the converted 0.4B HuggingFace model and generate code from a given prompt:
```
python generate.py \
--model_name_or_path ./polycoder/0-4B \
--temperature 0.2 \
--top_p 0.95 \
--max_length 128
```
To generate with a model of a different size, point `--model_name_or_path` at the directory of the corresponding converted model.

6
Convert2HF/convert.sh Normal file
View File

@@ -0,0 +1,6 @@
# Example: convert the 0.4B gpt-neox checkpoint (global_step150000) into a
# HuggingFace model saved under ./polycoder/0-4B.
# All paths are relative, so run this from the directory that contains
# convert_neox_pt_to_huggingface_neox.py.
python convert_neox_pt_to_huggingface_neox.py \
--checkpoint_dir ../checkpoints/checkpoints-0-4B/global_step150000 \
--vocab_file ../Data/code-vocab.json \
--merge_file ../Data/code-merges.txt \
--hf_config_path ./polycoder/configs/config_0-4B.json \
--hf_save_dir ./polycoder/0-4B

View File

@@ -1,47 +1,84 @@
# -*- coding: utf-8 -*-
import sys
import os
import torch
import argparse
from collections import OrderedDict
from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, GPT2Tokenizer
def get_hf_state_dict_from_pt_files(checkpoint_dir):
    """Build a HuggingFace GPT-NeoX state dict from gpt-neox ``layer_*`` files.

    ``checkpoint_dir`` is walked recursively; every file whose name starts
    with ``layer_`` is loaded on the CPU, in sorted path order, and its
    parameters are renamed as follows:

    * keys containing ``word_embeddings``: that substring becomes
      ``gpt_neox.embed_in``
    * keys containing ``_layernorm``, ``attention`` or ``mlp``: prefixed with
      ``gpt_neox.layers.<n>.``, where ``<n>`` counts (from 0) the files that
      hold at least one such key
    * keys starting with ``norm.``: ``gpt_neox.final_layer_norm.<suffix>``
    * keys containing ``final_linear``: ``embed_out.<suffix>``

    Keys matching none of these patterns are skipped and reported, rather
    than being logged under the previous key's converted name.

    Args:
        checkpoint_dir: Directory that contains the ``layer_*`` .pt files.

    Returns:
        An ``OrderedDict`` mapping converted parameter names to the loaded
        values, in the order they were read.
    """
    # Plain lexicographic sort of the full paths: the transformer-layer
    # numbering below is only right if this order matches the real layer
    # order (i.e. the index embedded in the file names is zero-padded).
    layer_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(checkpoint_dir)
        for name in names
        if name.startswith("layer_")
    )
    if not layer_files:
        print(f"Warning: no \"layer_*\" files found under {checkpoint_dir}")

    layer_id = -1
    state_dict = OrderedDict()
    for file in layer_files:
        print(f"Loading: {file}")
        # The layer counter advances at most once per file, on the first
        # transformer-layer parameter encountered in it.
        new_layer = True
        # NOTE: torch.load unpickles the file; only convert checkpoints that
        # come from a trusted source.
        module = torch.load(file, map_location=torch.device('cpu'))
        for key, value in module.items():
            if "word_embeddings" in key:
                new_key = key.replace("word_embeddings", "gpt_neox.embed_in")
            elif "_layernorm" in key or "attention" in key or "mlp" in key:
                if new_layer:
                    layer_id += 1
                    new_layer = False
                new_key = "gpt_neox.layers." + str(layer_id) + "." + key
            elif key.startswith("norm."):
                new_key = "gpt_neox.final_layer_norm." + key.split(".")[-1]
            elif "final_linear" in key:
                new_key = "embed_out." + key.split(".")[-1]
            else:
                # No known HuggingFace counterpart: leave it out instead of
                # reusing a stale (or still undefined) new_key.
                print(f"Skip \"{key}\": no matching HuggingFace parameter name")
                continue
            state_dict[new_key] = value
            print(f"Convert \"{key}\" to \"{new_key}\"")
    return state_dict
if __name__ == "__main__":
    # Command-line entry point: convert a gpt-neox checkpoint directory into
    # a HuggingFace model directory (weights, config and tokenizer).
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_dir",
                        type=str,
                        required=True,
                        help="Directory that contains .pt files.")
    parser.add_argument("--vocab_file",
                        type=str,
                        required=True,
                        help="Path to the vocab file.")
    parser.add_argument('--merge_file',
                        type=str,
                        required=True,
                        help='Path to the BPE merge file.')
    parser.add_argument("--hf_config_path",
                        type=str,
                        required=True,
                        help="Path to HuggingFace configuration file.")
    parser.add_argument("--hf_save_dir",
                        type=str,
                        required=True,
                        help="Directory to save HuggingFace model.")
    args = parser.parse_args()

    # Instantiate the model from the JSON config, then fill it with the
    # converted checkpoint weights.
    config = GPTNeoXConfig.from_json_file(args.hf_config_path)
    model = GPTNeoXForCausalLM(config)
    state_dict = get_hf_state_dict_from_pt_files(args.checkpoint_dir)

    # strict=False: name mismatches are reported below instead of raising,
    # so check the two printed lists before trusting the saved model.
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print(f"missing keys: {missing_keys}")
    print(f"unexpected keys: {unexpected_keys}")

    tokenizer = GPT2Tokenizer(args.vocab_file, args.merge_file)

    # exist_ok=True creates the directory only when needed, without a
    # separate existence check.
    os.makedirs(args.hf_save_dir, exist_ok=True)
    print(f"Save HuggingFace model to {args.hf_save_dir} ...")
    model.save_pretrained(args.hf_save_dir)
    tokenizer.save_pretrained(args.hf_save_dir)
    print("Finished.")

41
Convert2HF/generate.py Normal file
View File

@@ -0,0 +1,41 @@
from transformers import GPTNeoXForCausalLM, GPT2Tokenizer
import torch
import argparse
def load_model_and_generate(model_name_or_path, prompt, gen_kwargs):
    """Load a converted model, complete ``prompt`` once and print the result.

    Args:
        model_name_or_path: Passed to ``from_pretrained`` for both the
            tokenizer and the model.
        prompt: Text to complete.
        gen_kwargs: Extra keyword arguments forwarded to ``model.generate``.

    The function prints the prompt followed by the generated continuation
    and returns nothing.
    """
    tok = GPT2Tokenizer.from_pretrained(model_name_or_path)
    lm = GPTNeoXForCausalLM.from_pretrained(model_name_or_path)

    batch = tok(prompt, return_tensors="pt")
    ids = batch['input_ids']
    mask = batch['attention_mask']

    # Move the model and its inputs to the GPU when one is available.
    if torch.cuda.is_available():
        lm = lm.cuda()
        ids, mask = ids.cuda(), mask.cuda()

    generated = lm.generate(input_ids=ids, attention_mask=mask, **gen_kwargs)
    decoded = tok.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # The decoded text starts with the prompt; keep only what follows it.
    continuation = decoded[len(prompt):]
    print(prompt + continuation)
if __name__ == '__main__':
    # Command-line entry point: read the sampling options, then generate a
    # completion for a fixed example prompt.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_name_or_path", type=str, default="./polycoder/0-4B")
    cli.add_argument('--temperature', type=float, default=0.2)
    cli.add_argument('--top_p', type=float, default=0.95)
    cli.add_argument('--max_length', type=int, default=128)
    opts = cli.parse_args()

    # Sampling settings forwarded to model.generate().
    sampling_options = dict(
        do_sample=True,
        temperature=opts.temperature,
        max_length=opts.max_length,
        top_p=opts.top_p,
    )

    prompt = "\ndef add(x: int, y: int):\n \"\"\"Add two numbers x and y\n >>> add(2, 3)\n 5\n >>> add(5, 7)\n 12\n \"\"\"\n"
    load_model_and_generate(opts.model_name_or_path, prompt, sampling_options)

View File

@@ -17,7 +17,8 @@
"rotary_emb_base": 10000,
"torch_dtype": "float16",
"use_cache": true,
"use_parallel_residual": false,
"vocab_size": 50304,
"transformers_version": "4.21.1",
"transformers_version": "4.23.1",
"tokenizer_class": "GPT2Tokenizer"
}

View File

@@ -0,0 +1,24 @@
{
"hidden_act": "gelu",
"architectures": [
"GPTNeoXForCausalLM"
],
"bos_token_id": 0,
"eos_token_id": 0,
"initializer_range": 0.02,
"layer_norm_eps": 1e-05,
"model_type": "gpt_neox",
"hidden_size": 768,
"intermediate_size": 3072,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"max_position_embeddings": 2048,
"rotary_pct": 1.0,
"rotary_emb_base": 10000,
"torch_dtype": "float16",
"use_cache": true,
"use_parallel_residual": false,
"vocab_size": 50304,
"transformers_version": "4.23.1",
"tokenizer_class": "GPT2Tokenizer"
}

View File

@@ -0,0 +1,24 @@
{
"hidden_act": "gelu",
"architectures": [
"GPTNeoXForCausalLM"
],
"bos_token_id": 0,
"eos_token_id": 0,
"initializer_range": 0.02,
"layer_norm_eps": 1e-05,
"model_type": "gpt_neox",
"hidden_size": 2560,
"intermediate_size": 10240,
"num_attention_heads": 32,
"num_hidden_layers": 32,
"max_position_embeddings": 2048,
"rotary_pct": 1.0,
"rotary_emb_base": 10000,
"torch_dtype": "float16",
"use_cache": true,
"use_parallel_residual": false,
"vocab_size": 50304,
"transformers_version": "4.23.1",
"tokenizer_class": "GPT2Tokenizer"
}