llm_ollama.py
325 lines (279 loc) · 10.6 KB
import os
import warnings
import contextlib
from collections import defaultdict
from typing import List, Optional, Tuple

import click
import llm
import ollama
from pydantic import Field, TypeAdapter, ValidationError


@llm.hookimpl
def register_commands(cli):
    @cli.group(name="ollama")
    def ollama_group():
        "Commands for working with models hosted on Ollama"

    @ollama_group.command(name="list-models")
    def list_models():
        """List models that are available locally on Ollama server."""
        for model in _get_ollama_models():
            click.echo(model["name"])
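
# Usage example: with this plugin installed, the command group above is exposed
# through the llm CLI, so listing locally available models looks like:
#
#     llm ollama list-models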


@llm.hookimpl
def register_models(register):
    models = defaultdict(list)
    for model in _get_ollama_models():
        models[model["digest"]].append(model["name"])
        if model["name"].endswith(":latest"):
            models[model["digest"]].append(model["name"][: -len(":latest")])
    for names in models.values():
        name, aliases = _pick_primary_name(names)
        if not _ollama_model_capability_completion(name):
            continue
        register(Ollama(name), aliases=aliases)
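
# Illustrative shape of the grouping built above (digest and names are hypothetical):
#
#     {"sha256:abc123...": ["llama2:7b", "llama2:latest", "llama2"]}
#
# Each digest group is registered once, with the primary name chosen by
# _pick_primary_name() and the remaining names used as aliases.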


@llm.hookimpl
def register_embedding_models(register):
    models = defaultdict(list)
    for model in _get_ollama_models():
        models[model["digest"]].append(model["name"])
        if model["name"].endswith(":latest"):
            models[model["digest"]].append(model["name"][: -len(":latest")])
    for names in models.values():
        name, aliases = _pick_primary_name(names)
        register(OllamaEmbed(name), aliases=aliases)


class Ollama(llm.Model):
    can_stream: bool = True

    attachment_types = {
        "image/png",
        "image/jpeg",
        "image/webp",
        "image/gif",
    }

    class Options(llm.Options):
        """Parameters that can be set when the model is run by Ollama.

        See: https://github.com/ollama/ollama/blob/main/docs/modelfile.md#parameter
        """

        mirostat: Optional[int] = Field(
            default=None,
            description=("Enable Mirostat sampling for controlling perplexity."),
        )
        mirostat_eta: Optional[float] = Field(
            default=None,
            description=(
                "Influences how quickly the algorithm responds to feedback from the generated text."
            ),
        )
        mirostat_tau: Optional[float] = Field(
            default=None,
            description=(
                "Controls the balance between coherence and diversity of the output."
            ),
        )
        num_ctx: Optional[int] = Field(
            default=None,
            description="The size of the context window used to generate the next token.",
        )
        temperature: Optional[float] = Field(
            default=None,
            description=(
                "The temperature of the model. Increasing the temperature will make the model answer more creatively."
            ),
        )
        seed: Optional[int] = Field(
            default=None,
            description=(
                "Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt."
            ),
        )
        stop: Optional[List[str]] = Field(
            default=None,
            description=(
                "Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return."
            ),
        )
        tfs_z: Optional[float] = Field(
            default=None,
            description=(
                "Tail free sampling is used to reduce the impact of less probable tokens from the output."
            ),
        )
        num_predict: Optional[int] = Field(
            default=None,
            description=("Maximum number of tokens to predict when generating text."),
        )
        top_k: Optional[int] = Field(
            default=None,
            description=("Reduces the probability of generating nonsense."),
        )
        top_p: Optional[float] = Field(
            default=None,
            description=(
                "Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
            ),
        )
        json_object: Optional[bool] = Field(
            default=None,
            description="Output a valid JSON object {...}. Prompt must mention JSON.",
        )
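
    # Example of setting these options from the llm CLI (model name and values
    # are illustrative):
    #
    #     llm -m llama2:7b -o temperature 0.2 -o num_ctx 4096 "Why is the sky blue?"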

    def __init__(
        self,
        model_id: str,
    ) -> None:
        self.model_id = model_id

    def __str__(self) -> str:
        return f"Ollama: {self.model_id}"

    def execute(
        self,
        prompt: llm.Prompt,
        stream: bool,
        response: llm.Response,
        conversation=None,
    ):
        messages = self.build_messages(prompt, conversation)
        response._prompt_json = {"messages": messages}
        options = prompt.options.model_dump(exclude_none=True)
        json_object = options.pop("json_object", None)
        kwargs = {}
        if json_object:
            kwargs["format"] = "json"
        if stream:
            response_stream = ollama.chat(
                model=self.model_id,
                messages=messages,
                stream=True,
                options=options,
                **kwargs,
            )
            for chunk in response_stream:
                with contextlib.suppress(KeyError):
                    yield chunk["message"]["content"]
        else:
            response.response_json = ollama.chat(
                model=self.model_id,
                messages=messages,
                options=options,
                **kwargs,
            )
            yield response.response_json["message"]["content"]

    def build_messages(self, prompt, conversation):
        messages = []
        if not conversation:
            if prompt.system:
                messages.append({"role": "system", "content": prompt.system})
            messages.append({"role": "user", "content": prompt.prompt})
            if prompt.attachments:
                messages[-1]["images"] = [
                    attachment.base64_content() for attachment in prompt.attachments
                ]
            return messages
        current_system = None
        for prev_response in conversation.responses:
            if (
                prev_response.prompt.system
                and prev_response.prompt.system != current_system
            ):
                messages.append(
                    {"role": "system", "content": prev_response.prompt.system},
                )
                current_system = prev_response.prompt.system
            messages.append({"role": "user", "content": prev_response.prompt.prompt})
            if prev_response.attachments:
                messages[-1]["images"] = [
                    attachment.base64_content()
                    for attachment in prev_response.attachments
                ]
            messages.append({"role": "assistant", "content": prev_response.text()})
        if prompt.system and prompt.system != current_system:
            messages.append({"role": "system", "content": prompt.system})
        messages.append({"role": "user", "content": prompt.prompt})
        return messages
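
    # Illustrative shape of the list returned by build_messages() for a short
    # conversation with one image attachment (all values are made up):
    #
    #     [
    #         {"role": "system", "content": "Reply concisely."},
    #         {"role": "user", "content": "What is in this picture?", "images": ["<base64 data>"]},
    #         {"role": "assistant", "content": "A cat on a sofa."},
    #         {"role": "user", "content": "What color is it?"},
    #     ]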


class OllamaEmbed(llm.EmbeddingModel):
    supports_text = True
    supports_binary = False
    batch_size = 8

    def __init__(self, model_id):
        self.model_id = model_id
        # NOTE: truncate the input to fit in the model's context length;
        # if set to False, the call will error if the input is too long
        try:
            self.truncate = TypeAdapter(bool).validate_python(
                os.getenv("OLLAMA_EMBED_TRUNCATE", "True"),
            )
        except ValidationError:
            warnings.warn(
                "OLLAMA_EMBED_TRUNCATE is not a valid boolean value, defaulting to True",
            )
            self.truncate = True  # default value

    # NOTE: this is not used, but adding it anyway
    def __str__(self) -> str:
        return f"Ollama: {self.model_id}"

    def embed_batch(self, items):
        result = ollama.embed(
            model=self.model_id,
            input=items,
            truncate=self.truncate,
        )
        yield from result["embeddings"]
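
    # Example (illustrative; assumes an embedding-capable model such as
    # "nomic-embed-text" has been pulled locally):
    #
    #     llm embed -m nomic-embed-text -c "hello world"
    #
    # Set OLLAMA_EMBED_TRUNCATE=false in the environment to error out on
    # over-long input instead of silently truncating it.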


def _pick_primary_name(names: List[str]) -> Tuple[str, Tuple[str, ...]]:
    """Pick the primary model name from a list of names.

    The picking algorithm prefers names with the most specific tag, e.g. "llama2:7b"
    over "llama2:latest" over "llama2".

    Parameters
    ----------
    names : list[str]
        A non-empty list of model names.

    Returns
    -------
    tuple[str, tuple[str, ...]]
        The primary model name and a tuple with the secondary names.
    """
    if len(names) == 1:
        return names[0], ()
    sorted_names = sorted(
        names,
        key=lambda name: (
            ":" not in name,
            name.endswith(":latest"),
            name,
        ),
    )
    return sorted_names[0], tuple(sorted_names[1:])


def _get_ollama_models() -> List[dict]:
    """Get a list of models available on Ollama.

    Returns
    -------
    list[dict]
        A list of models available on Ollama. If the Ollama server is down, an empty
        list is returned.
    """
    try:
        return ollama.list()["models"]
    except Exception:
        # Treat any failure to reach the server or parse its response as "no models".
        return []


def _ollama_model_capability_completion(model: str) -> bool:
    """Check if a model is capable of completion.

    This indicates whether a model can be used for chat, or whether it is an
    embedding-only model.

    The source of this check is the Ollama server:
    https://github.com/ollama/ollama/blob/8a9bb0d000ae8201445ef1a590d7136df0a16f8b/server/images.go#L100

    It works by checking whether the model has a pooling_type key in its model_info;
    if the key is present, the model is an embedding-only model, incapable of
    completion. pooling_type is found in 'model_info' as
    '{model_architecture}.pooling_type', where model_architecture is stored in
    'model_info' under 'general.architecture'.

    Note: from what I found, if the key is present it is set to 1, but the
    reference code does not check the value.

    Parameters
    ----------
    model : str
        The model name.

    Returns
    -------
    bool
        True if the model is capable of completion, False otherwise.
        If the model name is not present on the Ollama server, False is returned.
    """
    is_embedding_model = False
    try:
        model_data = ollama.show(model)
        model_info = model_data["model_info"]
        model_arch = model_info["general.architecture"]
        is_embedding_model = f"{model_arch}.pooling_type" in model_info
    except ollama.ResponseError:
        # If ollama.show fails, the model name is not present on the Ollama server.
        return False
    return not is_embedding_model
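
# Illustrative, abridged ollama.show() payload for an embedding-only model
# (hypothetical values); the check above looks for the
# "{architecture}.pooling_type" key:
#
#     {
#         "model_info": {
#             "general.architecture": "bert",
#             "bert.pooling_type": 1,
#         },
#     }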