llama_chat_adapter.py
#!/usr/bin/env python
# coding: utf-8

# Import our general libraries
import sys
import time

# Import CodeProject.AI SDK
from codeproject_ai_sdk import RequestData, ModuleRunner, ModuleOptions, \
                               LogMethod, LogVerbosity, JSON

from llama_chat import LlamaChat

class LlamaChat_adapter(ModuleRunner):

    def initialise(self) -> None:

        self.models_dir = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_DIR", "./models")

        """ Mistral
        # For loading a model downloaded at install time
        self.model_filename  = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_FILENAME", "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
        # Fallback loading (at runtime, needs internet) via llama-cpp.from_pretrained
        self.model_repo      = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_REPO",     "TheBloke/Llama-2-7B-Chat-GGUF")
        self.models_fileglob = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_FILEGLOB", "*.Q4_K_M.gguf")
        """

        # Microsoft Phi-3
        # For loading a model downloaded at install time
        self.model_filename  = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_FILENAME", "Phi-3-mini-4k-instruct-q4.gguf")
        # Fallback loading (at runtime, needs internet) via llama-cpp.from_pretrained
        self.model_repo      = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_REPO",     "microsoft/Phi-3-mini-4k-instruct-gguf")
        self.models_fileglob = ModuleOptions.getEnvVariable("CPAI_MODULE_LLAMA_MODEL_FILEGLOB", "Phi-3-mini-4k-instruct-q4.gguf")

        # The llama-cpp-python package we use will run on the GPU when it can,
        # but llama.cpp doesn't report which device it chose, so we make our
        # best guess:
        #  - on Windows and Linux it will use CUDA 11.6+ if available, else CPU
        #  - on macOS, Metal is used on Apple Silicon, so that's always GPU;
        #    on Intel Macs we fall back to CPU
        # ROCm support can be added once we provide the appropriate
        # requirements files.
        self.inference_device = "CPU"

        num_gpu_layers = -1
        if self.system_info.os == "macOS":
            if self.system_info.cpu_arch == 'arm64':
                self.inference_device  = "GPU"
                self.inference_library = "Metal"
            else:
                num_gpu_layers = 0  # There's a bug at the moment on Intel Macs
        else:
            (cuda_major, cuda_minor) = self.system_info.getCudaVersion
            if cuda_major and (cuda_major > 11 or (cuda_major == 11 and cuda_minor >= 6)):
                self.inference_device  = "GPU"
                self.inference_library = "CUDA"

        verbose = self.log_verbosity != LogVerbosity.Quiet
        self.llama_chat = LlamaChat(repo_id=self.model_repo,
                                    file_glob=self.models_fileglob,
                                    filename=self.model_filename,
                                    model_dir=self.models_dir,
                                    n_ctx=0,
                                    n_gpu_layers=num_gpu_layers,
                                    verbose=verbose)

        if self.llama_chat.model_path:
            self.log(LogMethod.Info | LogMethod.Server, {
                "message": f"Using model from '{self.llama_chat.model_path}'",
                "loglevel": "information"
            })
        else:
            self.log(LogMethod.Error | LogMethod.Server, {
                "message": "Unable to load the Llama model",
                "loglevel": "error"
            })

        self.reply_text = ""
        self.cancelled  = False

    def process(self, data: RequestData) -> JSON:
        # Returning a method (rather than a result) tells the ModuleRunner to
        # run this command as a long-running background task.
        return self.long_process

    def long_process(self, data: RequestData) -> JSON:

        self.reply_text = ""
        stop_reason     = None

        prompt: str        = data.get_value("prompt")
        system_prompt: str = data.get_value("system_prompt")
        max_tokens: int    = data.get_int("max_tokens", 0)  # 0 means use the model default
        temperature: float = data.get_float("temperature", 0.4)

        try:
            start_time = time.perf_counter()

            completion = self.llama_chat.do_chat(prompt=prompt, system_prompt=system_prompt,
                                                 max_tokens=max_tokens, temperature=temperature,
                                                 stream=True)
            if completion:
                try:
                    for output in completion:
                        if self.cancelled:
                            self.cancelled = False
                            stop_reason = "cancelled"
                            break

                        # Use the raw result from the llama_chat module. When
                        # building modules we don't rewrite the code we wrap;
                        # wrapping it as-is means we can pick up updates to the
                        # original code without having to re-apply fixes.
                        delta = output["choices"][0]["delta"]
                        if "content" in delta:
                            self.reply_text += delta["content"]
                except StopIteration:
                    pass

            inferenceMs: int = int((time.perf_counter() - start_time) * 1000)

            if stop_reason is None:
                stop_reason = "completed"

            response = {
                "success":     True,
                "reply":       self.reply_text,
                "stop_reason": stop_reason,
                "processMs":   inferenceMs,
                "inferenceMs": inferenceMs
            }

        except Exception as ex:
            self.report_error(ex, __file__)
            response = { "success": False, "error": "Unable to generate text" }

        return response

    def command_status(self) -> JSON:
        # Report progress by returning the text generated so far.
        return {
            "success": True,
            "reply":   self.reply_text
        }

    def cancel_command_task(self):
        self.cancelled      = True   # We'll stop the long process ourselves in long_process
        self.force_shutdown = False  # Tell the ModuleRunner not to force a hard shutdown

    def selftest(self) -> JSON:
        request_data = RequestData()
        request_data.queue   = self.queue_name
        request_data.command = "prompt"
        request_data.add_value("prompt", "How many planets are there in the solar system?")
        request_data.add_value("max_tokens", 256)
        request_data.add_value("temperature", 0.4)

        result = self.long_process(request_data)

        print(f"Info: Self-test for {self.module_id}. Success: {result['success']}")
        # print(f"Info: Self-test output for {self.module_id}: {result}")

        return { "success": result['success'], "message": "LlamaChat test successful" }

if __name__ == "__main__":
    LlamaChat_adapter().start_loop()
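
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the adapter above): the streaming loop in
# long_process follows the same pattern as calling llama-cpp-python directly.
# This is a minimal, hedged example; "./models/model.gguf" is a placeholder
# path, and it assumes the llama-cpp-python package is installed.
#
#   from llama_cpp import Llama
#
#   llm   = Llama(model_path="./models/model.gguf", n_ctx=0, n_gpu_layers=-1)
#   reply = ""
#   for chunk in llm.create_chat_completion(
#           messages=[{"role": "system", "content": "You are a helpful assistant."},
#                     {"role": "user",   "content": "How many planets are there?"}],
#           max_tokens=256, temperature=0.4, stream=True):
#       delta = chunk["choices"][0]["delta"]
#       if "content" in delta:
#           reply += delta["content"]
#   print(reply)
# ---------------------------------------------------------------------------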