Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

convert.py xgen support #2053

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import enum
import faulthandler
import functools
import importlib
import io
import itertools
import json
Expand Down Expand Up @@ -201,6 +202,30 @@ def load(model_plus: 'ModelPlus') -> 'Params':
return params


class XgenVocab:
    """Vocabulary backed by a Salesforce XGen checkpoint directory.

    The directory must contain ``tokenization_xgen.py`` (shipped with XGen
    models on Hugging Face); that module is imported dynamically and its
    ``XgenTokenizer`` supplies the tiktoken-style byte-level encoder.
    """

    def __init__(self, path: Path) -> None:
        # Local import: `import importlib` at file level does not guarantee
        # the `importlib.util` submodule is bound — that only works by
        # CPython implementation accident. Import it explicitly here.
        import importlib.util

        self.fname_tokenizer = path
        # XGen ships no added_tokens.json; keep the attribute for interface
        # parity with SentencePieceVocab (check_vocab_size reads it).
        self.fname_added_tokens = None
        module_path = str((path / "tokenization_xgen.py").absolute())
        spec = importlib.util.spec_from_file_location(module_path, module_path)
        if spec is None or spec.loader is None:
            # Fail with a clear message instead of an AttributeError below.
            raise FileNotFoundError(f"Could not load tokenizer module: {module_path}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        self.xt = module.XgenTokenizer()
        self.vocab_size_base: int = self.xt.vocab_size
        # May be raised to params.n_vocab later by check_vocab_size.
        self.vocab_size: int = self.xt.vocab_size
        self.added_tokens_list: list = []

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        """Yield ``(token_bytes, score)`` pairs for every vocab slot.

        The score is simply the token index (the file format requires a
        float per token). Indices beyond the base vocabulary — padding up
        to ``vocab_size`` — have no surface form and are emitted as empty
        byte strings.
        """
        for index in range(self.vocab_size_base):
            token = self.xt.encoder.decode_single_token_bytes(index)
            yield (token, float(index))
        for index in range(self.vocab_size_base, self.vocab_size):
            yield (b'', float(index))
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
yield (b'', float(index))
yield (b'<|unk|>', float(index))

🤷

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've tried the latest commit and this is what I got.

$ python3 convert.py models/xgen-4k-7b-orig --outtype f16
Loading model file models/xgen-4k-7b-orig/pytorch_model-00001-of-00003.bin
Loading model file models/xgen-4k-7b-orig/pytorch_model-00001-of-00003.bin
Loading model file models/xgen-4k-7b-orig/pytorch_model-00002-of-00003.bin
Loading model file models/xgen-4k-7b-orig/pytorch_model-00003-of-00003.bin
params: n_vocab:51200 n_embd:4096 n_mult:256 n_head:32 n_layer:32
Writing vocab...
Traceback (most recent call last):
  File "/home/user/llama.cpp/convert.py", line 1255, in <module>
    main()
  File "/home/user/llama.cpp/convert.py", line 1250, in main
    OutputFile.write_all(outfile, params, output_type, model, vocab)
  File "/home/user/llama.cpp/convert.py", line 1041, in write_all
    of.write_vocab(vocab)
  File "/home/user/llama.cpp/convert.py", line 1022, in write_vocab
    self.fout.write(text)
TypeError: a bytes-like object is required, not 'str'

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@clyang Try changing line 220 of convert.py to the following:

token = self.xt.encoder.decode_single_token_bytes(index)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can confirm convert.py and quantize both work, thanks @smdesai !!

Btw, you still need to change EOS, BOS and NL token ID in llama.cpp to make it inference correctly.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you share what you changed in llama.cpp?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tmm1 Here is what I modified in llama.cpp:

llama_token llama_token_bos() {
    return 50256;
}

llama_token llama_token_eos() {
    return 50256;
}

llama_token llama_token_nl() {
    return 198;
}


def __repr__(self) -> str:
    # Debug-friendly one-line summary of the vocabulary sizes.
    n_base = self.vocab_size_base
    n_extra = self.vocab_size - n_base
    return f"<XgenVocab with {n_base} base tokens and {n_extra} extra tokens>"


class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
Expand Down Expand Up @@ -265,7 +290,7 @@ def __repr__(self) -> str:
return f"<GGMLVocab with {self.vocab_size} tokens>"


Vocab = Union[SentencePieceVocab, GGMLVocab]
Vocab = Union[XgenVocab, SentencePieceVocab, GGMLVocab]


def permute(weights: NDArray, n_head: int) -> NDArray:
Expand Down Expand Up @@ -948,12 +973,15 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
def check_vocab_size(params: Params, vocab: Vocab) -> None:
if params.n_vocab != vocab.vocab_size:
# GGMLVocab comes from the same file as the model so shouldn't mismatch:
assert isinstance(vocab, SentencePieceVocab)
assert isinstance(vocab, SentencePieceVocab) or isinstance(vocab, XgenVocab)
if params.n_vocab == vocab.vocab_size_base:
print("Ignoring added_tokens.json since model matches vocab size without it.")
vocab.added_tokens_list = []
vocab.vocab_size = vocab.vocab_size_base
return
if isinstance(vocab, XgenVocab):
vocab.vocab_size = params.n_vocab
return
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
if vocab.fname_added_tokens is not None:
msg += f" combined with {vocab.fname_added_tokens}"
Expand Down Expand Up @@ -1133,11 +1161,13 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
return {name: model[name] for name in TENSORS_LIST if name in model}


def load_vocab(path: Path) -> SentencePieceVocab:
def load_vocab(path: Path) -> Vocab:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
if path.is_dir():
if (path / "tokenization_xgen.py").exists():
return XgenVocab(path)
path2 = path / "tokenizer.model"
# Use `.parent` instead of /.. to handle the symlink case better.
path3 = path.parent / "tokenizer.model"
Expand Down