Merge pull request ggerganov#633 from abetlen/gguf
GGUF (Breaking Change to Model Files)
abetlen authored Aug 25, 2023
2 parents d644199 + ac37ea5 commit 915bbea
Showing 6 changed files with 507 additions and 359 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -17,6 +17,9 @@ This package provides:

Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).

> [!WARNING]
> Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf`. Old model files can be converted using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp)

## Installation from PyPI (recommended)

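For context (not part of this commit): a minimal usage sketch of the new format, assuming a model that has already been converted to GGUF. The path and filename below are placeholders.

```python
# Minimal sketch, assuming an already-converted .gguf file (placeholder path).
# The Python loading API is unchanged; only the on-disk model format moves to GGUF.
from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf")
out = llm("Q: What does the GGUF change affect? A:", max_tokens=32)
print(out["choices"][0]["text"])
```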
71 changes: 30 additions & 41 deletions llama_cpp/llama.py
@@ -228,7 +228,7 @@ def __init__(
rope_freq_scale: float = 1.0,
n_gqa: Optional[int] = None, # (TEMPORARY) must be 8 for llama2 70b
rms_norm_eps: Optional[float] = None, # (TEMPORARY)
mul_mat_q: Optional[bool] = None, # (TEMPORARY)
mul_mat_q: Optional[bool] = None,
verbose: bool = True,
):
"""Load a llama.cpp model from `model_path`.
@@ -290,11 +290,6 @@ def __init__(
self.params.rope_freq_base = rope_freq_base
self.params.rope_freq_scale = rope_freq_scale

if n_gqa is not None:
self.params.n_gqa = n_gqa

if rms_norm_eps is not None:
self.params.rms_norm_eps = rms_norm_eps

if mul_mat_q is not None:
self.params.mul_mat_q = mul_mat_q
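The deleted assignments above reflect that GGUF files carry these hyperparameters as metadata, so they no longer need to be supplied by the caller. A hedged before/after sketch (placeholder paths; the old keyword argument is shown only for contrast):

```python
# Sketch with placeholder model paths. Under ggmlv3, loading Llama-2 70B
# required passing n_gqa=8 (and sometimes rms_norm_eps); with GGUF those
# values are read from the model file's metadata.
from llama_cpp import Llama

# ggmlv3 era (no longer applicable):
# llm = Llama(model_path="llama-2-70b.ggmlv3.q4_0.bin", n_gqa=8)

# GGUF era:
llm = Llama(model_path="./models/llama-2-70b.Q4_K_M.gguf")
```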
@@ -371,8 +366,8 @@ def __init__(
sorted=sorted,
)
self._candidates = candidates
self._token_nl = Llama.token_nl()
self._token_eos = Llama.token_eos()
self._token_nl = self.token_nl()
self._token_eos = self.token_eos()
self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore
self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single)

@@ -413,11 +408,11 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
Returns:
A list of tokens.
"""
assert self.ctx is not None
assert self.model is not None
n_ctx = self._n_ctx
tokens = (llama_cpp.llama_token * n_ctx)()
n_tokens = llama_cpp.llama_tokenize(
self.ctx,
n_tokens = llama_cpp.llama_tokenize_with_model(
self.model,
text,
tokens,
llama_cpp.c_int(n_ctx),
@@ -426,8 +421,8 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
if n_tokens < 0:
n_tokens = abs(n_tokens)
tokens = (llama_cpp.llama_token * n_tokens)()
n_tokens = llama_cpp.llama_tokenize(
self.ctx,
n_tokens = llama_cpp.llama_tokenize_with_model(
self.model,
text,
tokens,
llama_cpp.c_int(n_tokens),
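A negative return value from the tokenizer call signals that the supplied buffer was too small, with its absolute value giving the required length; the hunk above then retries with a buffer of exactly that size. A generic restatement of the pattern (the helper name and callback are illustrative, not part of the API):

```python
import ctypes

def call_with_resize(fill, initial_size: int) -> list:
    """Illustrative helper: `fill(buf, capacity)` returns the number of items
    written, or a negative count whose absolute value is the required size."""
    buf = (ctypes.c_int * initial_size)()
    n = fill(buf, initial_size)
    if n < 0:
        required = abs(n)
        buf = (ctypes.c_int * required)()
        n = fill(buf, required)
        assert n >= 0, "second call should succeed with the reported size"
    return list(buf[:n])
```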
@@ -448,13 +443,19 @@ def detokenize(self, tokens: List[int]) -> bytes:
Returns:
The detokenized string.
"""
assert self.ctx is not None
assert self.model is not None
output = b""
size = 8
buffer = (ctypes.c_char * size)()
for token in tokens:
output += llama_cpp.llama_token_to_str(
self.ctx, llama_cpp.llama_token(token)
n = llama_cpp.llama_token_to_str_with_model(
self.model, llama_cpp.llama_token(token), buffer, size
)
return output
assert n <= size
output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output

def set_cache(self, cache: Optional[BaseLlamaCache]):
"""Set the cache.
@@ -885,7 +886,7 @@ def _create_completion(
created: int = int(time.time())
completion_tokens: List[int] = []
# Add blank space to start of prompt to match OG llama tokenizer
prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8"))
prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()]
text: bytes = b""
returned_tokens: int = 0
stop = (
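A short restatement of the changed prompt handling (not part of the diff; placeholder path): an empty prompt is represented by a single BOS token, and non-empty prompts are no longer prefixed with a blank space before tokenization.

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf")  # placeholder path

def prompt_to_tokens(prompt: str) -> list:
    # Mirrors the changed line above: no injected leading space, and an empty
    # prompt maps to just the BOS token.
    return llm.tokenize(prompt.encode("utf-8")) if prompt != "" else [llm.token_bos()]
```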
@@ -1581,13 +1582,7 @@ def __getstate__(self):
lora_base=self.lora_base,
lora_path=self.lora_path,
tensor_split=self.tensor_split,
### TEMPORARY ###
n_gqa=self.params.n_gqa,
rms_norm_eps=self.params.rms_norm_eps,
### TEMPORARY ###
### DEPRECATED ###
n_parts=self.n_parts,
### DEPRECATED ###
mul_mat_q=self.params.mul_mat_q,
)

def __setstate__(self, state):
@@ -1609,14 +1604,8 @@ def __setstate__(self, state):
lora_base=state["lora_base"],
lora_path=state["lora_path"],
tensor_split=state["tensor_split"],
mul_mat_q=state["mul_mat_q"],
verbose=state["verbose"],
### TEMPORARY ###
n_gqa=state["n_gqa"],
rms_norm_eps=state["rms_norm_eps"],
### TEMPORARY ###
### DEPRECATED ###
n_parts=state["n_parts"],
### DEPRECATED ###
)

def save_state(self) -> LlamaState:
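The pickling hooks above now round-trip `mul_mat_q` and drop the temporary (`n_gqa`, `rms_norm_eps`) and deprecated (`n_parts`) fields. A hedged sketch of what they enable (placeholder path; unpickling re-runs `__init__`, so the model is reloaded from `model_path` rather than having its weights serialized):

```python
import pickle
from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf")  # placeholder path
blob = pickle.dumps(llm)    # __getstate__ captures the constructor arguments only
llm2 = pickle.loads(blob)   # __setstate__ calls __init__ with them, reloading the model
```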
@@ -1681,20 +1670,20 @@ def tokenizer(self) -> "LlamaTokenizer":
assert self.ctx is not None
return LlamaTokenizer(self)

@staticmethod
def token_eos() -> int:
def token_eos(self) -> int:
"""Return the end-of-sequence token."""
return llama_cpp.llama_token_eos()
assert self.ctx is not None
return llama_cpp.llama_token_eos(self.ctx)

@staticmethod
def token_bos() -> int:
def token_bos(self) -> int:
"""Return the beginning-of-sequence token."""
return llama_cpp.llama_token_bos()
assert self.ctx is not None
return llama_cpp.llama_token_bos(self.ctx)

@staticmethod
def token_nl() -> int:
def token_nl(self) -> int:
"""Return the newline token."""
return llama_cpp.llama_token_nl()
assert self.ctx is not None
return llama_cpp.llama_token_nl(self.ctx)

@staticmethod
def logits_to_logprobs(logits: List[float]) -> List[float]:
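With GGUF the special-token ids are properties of the loaded model, so `token_eos`, `token_bos`, and `token_nl` change from static methods into instance methods that query the context. A small sketch (placeholder path):

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/llama-2-7b.Q4_K_M.gguf")  # placeholder path
# Previously class-level constants; now per-instance lookups on the llama context.
print(llm.token_bos(), llm.token_eos(), llm.token_nl())
```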