mirror of
https://github.com/VHellendoorn/Code-LMs.git
synced 2026-03-13 10:00:47 +08:00
Extend to support models of different sizes, and add "argparse" and README.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
30
Convert2HF/README.md
Normal file
30
Convert2HF/README.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Convert to HuggingFace
|
||||
This directory contains a script `convert_neox_pt_to_huggingface_neox.py` to convert PolyCoder checkpoints trained by [gpt-neox](https://github.com/EleutherAI/gpt-neox) into HuggingFace format, and a script `generate.py` to load the converted model and generate code from a given prompt.
|
||||
|
||||
## Environment
|
||||
transformers 4.23.1
|
||||
|
||||
## Convert
|
||||
The script `convert.sh` is an example of converting a 0.4B parameter model to HuggingFace format:
|
||||
```
|
||||
python convert_neox_pt_to_huggingface_neox.py \
|
||||
--checkpoint_dir ../checkpoints/checkpoints-0-4B/global_step150000 \
|
||||
--vocab_file ../Data/code-vocab.json \
|
||||
--merge_file ../Data/code-merges.txt \
|
||||
--hf_config_path ./polycoder/configs/config_0-4B.json \
|
||||
--hf_save_dir ./polycoder/0-4B
|
||||
```
|
||||
HuggingFace configuration files for models of different sizes are provided in `polycoder/configs/`: `config_160M.json`, `config_0-4B.json` and `config_2-7B.json`.
|
||||
|
||||
After the script finishes, the complete HuggingFace model (weights, configuration and tokenizer files) is saved in the directory given by `--hf_save_dir`. If that directory does not exist, it is created automatically.
|
||||
|
||||
## Generate
|
||||
The following is an example of loading the converted 0.4B HuggingFace model and generating code from a given prompt:
|
||||
```
|
||||
python generate.py \
|
||||
--model_name_or_path ./polycoder/0-4B \
|
||||
--temperature 0.2 \
|
||||
--top_p 0.95 \
|
||||
--max_length 128
|
||||
```
|
||||
To generate with a model of a different size, point `--model_name_or_path` at the directory of that converted model.
|
||||
6
Convert2HF/convert.sh
Normal file
6
Convert2HF/convert.sh
Normal file
@@ -0,0 +1,6 @@
|
||||
# Example: convert the 0.4B PolyCoder gpt-neox checkpoint to HuggingFace format.
# Paths are relative to the Convert2HF directory.
CHECKPOINT_DIR=../checkpoints/checkpoints-0-4B/global_step150000
VOCAB_FILE=../Data/code-vocab.json
MERGE_FILE=../Data/code-merges.txt
HF_CONFIG_PATH=./polycoder/configs/config_0-4B.json
HF_SAVE_DIR=./polycoder/0-4B

python convert_neox_pt_to_huggingface_neox.py \
    --checkpoint_dir "$CHECKPOINT_DIR" \
    --vocab_file "$VOCAB_FILE" \
    --merge_file "$MERGE_FILE" \
    --hf_config_path "$HF_CONFIG_PATH" \
    --hf_save_dir "$HF_SAVE_DIR"
|
||||
@@ -1,47 +1,84 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import argparse
|
||||
from collections import OrderedDict
|
||||
from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, GPT2Tokenizer
|
||||
|
||||
|
||||
input_dir = sys.argv[1]
|
||||
output_file = "./0-4B/pytorch_model.bin"
|
||||
def get_hf_state_dict_from_pt_files(checkpoint_dir):
|
||||
layer_files = []
|
||||
for root, dirs, files in os.walk(checkpoint_dir):
|
||||
for file in files:
|
||||
if file.startswith("layer_"):
|
||||
# print(file)
|
||||
layer_files.append(os.path.join(root, file))
|
||||
layer_files = sorted(layer_files)
|
||||
|
||||
layer_files = []
|
||||
layer_id = -1
|
||||
state_dict = OrderedDict()
|
||||
for root, dirs, files in os.walk(input_dir):
|
||||
for file in files:
|
||||
if file.startswith("layer_"):
|
||||
# print(file)
|
||||
layer_files.append(os.path.join(root, file))
|
||||
layer_id = -1
|
||||
state_dict = OrderedDict()
|
||||
for file in layer_files:
|
||||
print(f"Loading: {file}")
|
||||
new_layer = True
|
||||
|
||||
module = torch.load(file, map_location=torch.device('cpu'))
|
||||
for key, value in module.items():
|
||||
if "word_embeddings" in key:
|
||||
new_key = key.replace("word_embeddings", "gpt_neox.embed_in")
|
||||
state_dict[new_key] = value
|
||||
elif "_layernorm" in key or "attention" in key or "mlp" in key:
|
||||
if new_layer:
|
||||
layer_id += 1
|
||||
new_layer = False
|
||||
new_key = "gpt_neox.layers." + str(layer_id) + "." + key
|
||||
state_dict[new_key] = value
|
||||
elif key.startswith("norm."):
|
||||
new_key = "gpt_neox.final_layer_norm." + key.split(".")[-1]
|
||||
state_dict[new_key] = value
|
||||
elif "final_linear" in key:
|
||||
new_key = "embed_out." + key.split(".")[-1]
|
||||
state_dict[new_key] = value
|
||||
print(f"Convert \"{key}\" to \"{new_key}\"")
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
layer_files = sorted(layer_files)
|
||||
for file in layer_files:
|
||||
print(file)
|
||||
new_layer = True
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--checkpoint_dir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Directory that contains .pt files.")
|
||||
parser.add_argument("--vocab_file",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the vocab file.")
|
||||
parser.add_argument('--merge_file',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Path to the BPE merge file.')
|
||||
parser.add_argument("--hf_config_path",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to HuggingFace configuration file.")
|
||||
parser.add_argument("--hf_save_dir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Directory to save HuggingFace model.")
|
||||
args = parser.parse_args()
|
||||
|
||||
module = torch.load(file, map_location=torch.device('cpu'))
|
||||
for key, value in module.items():
|
||||
if "word_embeddings" in key:
|
||||
new_key = key.replace("word_embeddings", "gpt_neox.embed_in")
|
||||
state_dict[new_key] = value
|
||||
elif "_layernorm" in key or "attention" in key or "mlp" in key:
|
||||
if new_layer:
|
||||
layer_id += 1
|
||||
new_layer = False
|
||||
new_key = "gpt_neox.layers." + str(layer_id) + "." + key
|
||||
state_dict[new_key] = value
|
||||
elif key.startswith("norm."):
|
||||
new_key = "gpt_neox.final_layer_norm." + key.split(".")[-1]
|
||||
state_dict[new_key] = value
|
||||
elif "final_linear" in key:
|
||||
new_key = "embed_out." + key.split(".")[-1]
|
||||
state_dict[new_key] = value
|
||||
print("Convert \"{}\" to \"{}\"".format(key, new_key))
|
||||
config = GPTNeoXConfig.from_json_file(args.hf_config_path)
|
||||
|
||||
torch.save(state_dict, output_file)
|
||||
model = GPTNeoXForCausalLM(config)
|
||||
state_dict = get_hf_state_dict_from_pt_files(args.checkpoint_dir)
|
||||
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
|
||||
print(f"missing keys: {missing_keys}")
|
||||
print(f"unexpected keys: {unexpected_keys}")
|
||||
|
||||
tokenizer = GPT2Tokenizer(args.vocab_file, args.merge_file)
|
||||
|
||||
if not os.path.exists(args.hf_save_dir):
|
||||
os.makedirs(args.hf_save_dir)
|
||||
print(f"Save HuggingFace model to {args.hf_save_dir} ...")
|
||||
model.save_pretrained(args.hf_save_dir)
|
||||
tokenizer.save_pretrained(args.hf_save_dir)
|
||||
print(f"Finished.")
|
||||
|
||||
41
Convert2HF/generate.py
Normal file
41
Convert2HF/generate.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from transformers import GPTNeoXForCausalLM, GPT2Tokenizer
|
||||
import torch
|
||||
import argparse
|
||||
|
||||
|
||||
def load_model_and_generate(model_name_or_path, prompt, gen_kwargs):
    """Load a converted model, generate a continuation of `prompt`, and print it.

    Args:
        model_name_or_path: Passed to `from_pretrained` for both the
            tokenizer and the model.
        prompt: Text the generation is conditioned on.
        gen_kwargs: Extra keyword arguments forwarded to `model.generate`.

    Returns:
        None. The prompt followed by the generated text is printed.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
    model = GPTNeoXForCausalLM.from_pretrained(model_name_or_path)

    batch = tokenizer(prompt, return_tensors="pt")
    ids = batch['input_ids']
    mask = batch['attention_mask']

    # Move the model and its inputs to the GPU together when one is available.
    if torch.cuda.is_available():
        model, ids, mask = model.cuda(), ids.cuda(), mask.cuda()

    # Only the first returned sequence is used.
    sequences = model.generate(input_ids=ids, attention_mask=mask, **gen_kwargs)
    first_sequence = sequences[0]

    decoded = tokenizer.decode(
        first_sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # Drop the first len(prompt) characters of the decoded text and print the
    # original prompt in their place.
    completion = decoded[len(prompt):]
    print(prompt + completion)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line options: the converted model to load and the sampling settings.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", type=str, default="./polycoder/0-4B")
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=0.95)
    parser.add_argument("--max_length", type=int, default=128)
    args = parser.parse_args()

    # Sampling is always enabled; the remaining values come from the command line.
    gen_kwargs = dict(
        do_sample=True,
        temperature=args.temperature,
        max_length=args.max_length,
        top_p=args.top_p,
    )

    # Fixed demo prompt: a function signature plus a docstring with doctests.
    prompt = (
        "\ndef add(x: int, y: int):\n"
        " \"\"\"Add two numbers x and y\n"
        " >>> add(2, 3)\n"
        " 5\n"
        " >>> add(5, 7)\n"
        " 12\n"
        " \"\"\"\n"
    )

    load_model_and_generate(args.model_name_or_path, prompt, gen_kwargs)
|
||||
@@ -17,7 +17,8 @@
|
||||
"rotary_emb_base": 10000,
|
||||
"torch_dtype": "float16",
|
||||
"use_cache": true,
|
||||
"use_parallel_residual": false,
|
||||
"vocab_size": 50304,
|
||||
"transformers_version": "4.21.1",
|
||||
"transformers_version": "4.23.1",
|
||||
"tokenizer_class": "GPT2Tokenizer"
|
||||
}
|
||||
24
Convert2HF/polycoder/configs/config_160M.json
Normal file
24
Convert2HF/polycoder/configs/config_160M.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"hidden_act": "gelu",
|
||||
"architectures": [
|
||||
"GPTNeoXForCausalLM"
|
||||
],
|
||||
"bos_token_id": 0,
|
||||
"eos_token_id": 0,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"model_type": "gpt_neox",
|
||||
"hidden_size": 768,
|
||||
"intermediate_size": 3072,
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"max_position_embeddings": 2048,
|
||||
"rotary_pct": 1.0,
|
||||
"rotary_emb_base": 10000,
|
||||
"torch_dtype": "float16",
|
||||
"use_cache": true,
|
||||
"use_parallel_residual": false,
|
||||
"vocab_size": 50304,
|
||||
"transformers_version": "4.23.1",
|
||||
"tokenizer_class": "GPT2Tokenizer"
|
||||
}
|
||||
24
Convert2HF/polycoder/configs/config_2-7B.json
Normal file
24
Convert2HF/polycoder/configs/config_2-7B.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"hidden_act": "gelu",
|
||||
"architectures": [
|
||||
"GPTNeoXForCausalLM"
|
||||
],
|
||||
"bos_token_id": 0,
|
||||
"eos_token_id": 0,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_eps": 1e-05,
|
||||
"model_type": "gpt_neox",
|
||||
"hidden_size": 2560,
|
||||
"intermediate_size": 10240,
|
||||
"num_attention_heads": 32,
|
||||
"num_hidden_layers": 32,
|
||||
"max_position_embeddings": 2048,
|
||||
"rotary_pct": 1.0,
|
||||
"rotary_emb_base": 10000,
|
||||
"torch_dtype": "float16",
|
||||
"use_cache": true,
|
||||
"use_parallel_residual": false,
|
||||
"vocab_size": 50304,
|
||||
"transformers_version": "4.23.1",
|
||||
"tokenizer_class": "GPT2Tokenizer"
|
||||
}
|
||||
Reference in New Issue
Block a user