server : parallel decoding and multimodal #3589

Closed · wants to merge 41 commits

Changes from all commits (41 commits)
- `63f99b1` implementing parallel decoding in server example (FSSRepo, Oct 11, 2023)
- `4712302` crash fixed (FSSRepo, Oct 11, 2023)
- `7850421` save dev progress (FSSRepo, Oct 12, 2023)
- `b716eeb` Merge branch 'master' of https://github.com/ggerganov/llama.cpp (FSSRepo, Oct 12, 2023)
- `29c8cdd` refactored sampling function (FSSRepo, Oct 12, 2023)
- `8148480` completion endpoint working (FSSRepo, Oct 12, 2023)
- `5b8e29d` multiple client support (FSSRepo, Oct 12, 2023)
- `83c2b35` grammar + no stream completion (FSSRepo, Oct 12, 2023)
- `500ac71` cached prompt support (FSSRepo, Oct 13, 2023)
- `4ba5a50` chat.mjs support cached prompt + some fixes (FSSRepo, Oct 13, 2023)
- `6358ae5` server ui now support multiple clients (FSSRepo, Oct 13, 2023)
- `a410a9e` unused change reverted (FSSRepo, Oct 13, 2023)
- `b6d9e21` fixed timings per slot (FSSRepo, Oct 13, 2023)
- `a2c2d98` add context swap (FSSRepo, Oct 13, 2023)
- `eb08201` add changes to README.md (FSSRepo, Oct 13, 2023)
- `9d98cdd` llava multimodal integration (FSSRepo, Oct 13, 2023)
- `de35b47` fixed tokens probs (FSSRepo, Oct 13, 2023)
- `9f72b44` add multimodal input - alfa (FSSRepo, Oct 14, 2023)
- `7e64bfe` refactor code + remove unused comments + improved README.md (FSSRepo, Oct 14, 2023)
- `299f6b5` fix compilation errors with llvm (damian0815, Oct 14, 2023)
- `4e5c5c4` notify the user from server ui that multimodality is unavialable (FSSRepo, Oct 14, 2023)
- `f47fd17` Merge branch 'ggerganov:master' into master (FSSRepo, Oct 15, 2023)
- `9035978` Merge pull request #6 from damian0815/fssrepo_mac_fixes (FSSRepo, Oct 15, 2023)
- `ce961a3` some ci fixes (FSSRepo, Oct 15, 2023)
- `b727e02` fix ci make build undefined ref errors (FSSRepo, Oct 15, 2023)
- `fd64f04` fix long prompt than ctx proposed in #3639 (FSSRepo, Oct 15, 2023)
- `2d9f11d` fixed premature end due stop word (FSSRepo, Oct 16, 2023)
- `d7eca25` context shift fixed (FSSRepo, Oct 16, 2023)
- `4d18043` fix llava implementation (FSSRepo, Oct 16, 2023)
- `aa2268f` sync README.md changes (FSSRepo, Oct 17, 2023)
- `fa0f22f` Merge remote-tracking branch 'upstream/master' (FSSRepo, Oct 17, 2023)
- `58f8ae9` readme change (FSSRepo, Oct 17, 2023)
- `6c277ea` update api like OpenAI (FSSRepo, Oct 17, 2023)
- `ed0c11c` multimodal support enabled by default (FSSRepo, Oct 17, 2023)
- `d2b1fac` fix make bui;d errors (FSSRepo, Oct 17, 2023)
- `c02c52e` fix multiple clients (FSSRepo, Oct 17, 2023)
- `35fd374` fix zig build (FSSRepo, Oct 17, 2023)
- `84b8f2b` Merge branch 'ggerganov:master' into master (FSSRepo, Oct 18, 2023)
- `7196c4e` new sampling API (FSSRepo, Oct 18, 2023)
- `8540568` Merge branch 'master' of https://github.com/ggerganov/llama.cpp (FSSRepo, Oct 18, 2023)
- `ab2fc00` latest changes of sampling API (FSSRepo, Oct 18, 2023)
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@
*.gcno
*.gcda
*.dot
*.bat
*.metallib
.DS_Store
.build/
2 changes: 1 addition & 1 deletion Makefile
@@ -605,7 +605,7 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
3 changes: 2 additions & 1 deletion build.zig
@@ -131,6 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");

_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
@@ -139,7 +140,7 @@ pub fn build(b: *std.build.Builder) !void {
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });

const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
2 changes: 1 addition & 1 deletion examples/server/CMakeLists.txt
@@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
34 changes: 34 additions & 0 deletions examples/server/README.md
@@ -24,6 +24,10 @@ Command line options:
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a. dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (the initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

## Build

@@ -188,6 +192,12 @@ node index.js

`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)

`slot_id`: Assign the completion task to a specific slot. If -1, the task will be assigned to an idle slot (default: -1)

`cache_prompt`: Save the prompt and generation to avoid re-processing the entire prompt when only part of it has changed (default: false)

`system_prompt`: Change the system prompt (the initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
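
As a rough illustration (not part of this PR's diff), the sketch below shows how a client could use `cache_prompt` and `slot_id` together so that follow-up requests land on the same slot and reuse its cached prompt. It assumes a server listening on `http://127.0.0.1:8080` with default settings.

```js
// Sketch only: pin follow-up requests to the slot that served the first one,
// so the prompt already evaluated in that slot's KV cache can be reused.
let slot_id = -1; // -1 = let the server pick any idle slot

async function complete(prompt) {
    const res = await fetch('http://127.0.0.1:8080/completion', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            prompt,
            n_predict: 64,
            cache_prompt: true, // keep the evaluated prompt for the next request
            slot_id             // reuse the same slot once its id is known
        })
    });
    const data = await res.json();
    slot_id = data.slot_id;     // remember which slot served this request
    return data.content;
}

console.log(await complete('Building a website can be done in 10 simple steps:'));
```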

- **POST** `/tokenize`: Tokenize a given text.

*Options:*
@@ -218,8 +228,32 @@ node index.js

It also accepts all the options of `/completion` except `stream` and `prompt`.

- **GET** `/props`: Return the assistant name and anti-prompt required to generate the prompt, in case you have specified a system prompt for all slots.

## More examples

### Change system prompt on runtime

To serve multiple chat-type clients while keeping the same system prompt, use the `system_prompt` option. It only needs to be set once.

`prompt`: Specify a context that you want all connecting clients to respect.

`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.

`assistant_name`: The bot's name, which each client needs in order to build the prompt. This must be sent to each client through the `/props` endpoint.

```json
{
"system_prompt": {
"prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
"anti_prompt": "User:",
"assistant_name": "Assistant:"
}
}
```

**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
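
As a rough illustration (not part of this PR's diff), a chat client could read these values back from `/props` and use them to frame each turn. The exact field names returned by `/props` are not shown in this diff, so the names used below (`assistant_name`, `anti_prompt`) are assumptions that mirror the `system_prompt` fields above.

```js
// Sketch only: the /props field names are assumed to mirror the
// system_prompt settings shown above (assistant_name, anti_prompt).
const base = 'http://127.0.0.1:8080';
const props = await (await fetch(`${base}/props`)).json();

async function ask(question) {
    const res = await fetch(`${base}/completion`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            prompt: `User: ${question}\n${props.assistant_name}`,
            n_predict: 128,
            stop: [props.anti_prompt] // stop before the model starts the next "User:" turn
        })
    });
    return (await res.json()).content;
}

console.log(await ask('Who is Richard Feynman?'));
```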

### Interactive mode

Check the sample in [chat.mjs](chat.mjs).
5 changes: 4 additions & 1 deletion examples/server/api_like_OAI.py
@@ -8,6 +8,7 @@


app = Flask(__name__)
slot_id = -1

parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
if(is_present(body, "stop")): postData["stop"] += body["stop"]
postData["n_keep"] = -1
postData["stream"] = stream

postData["cache_prompt"] = True
postData["slot_id"] = slot_id
return postData

def make_resData(data, chat=False, promptToken=[]):
@@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
}
]
}
slot_id = data["slot_id"]
if (chat):
if (start):
resData["choices"][0]["delta"] = {
11 changes: 11 additions & 0 deletions examples/server/chat.mjs
@@ -7,6 +7,11 @@ const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema"
);

const no_cached_prompt = args.find(
(_, index) => args[index - 1] === "--no-cache-prompt"
) ?? "false";

const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");

// Example usage: function,arguments
@@ -30,6 +35,9 @@ if (grammarFile) {
grammar = readFileSync(grammarFile, 'utf-8')
}

// for cached prompt
let slot_id = -1;

const API_URL = 'http://127.0.0.1:8080'

const chat = [
@@ -76,6 +84,8 @@ async function chat_completion(question) {
top_p: 0.9,
n_keep: n_keep,
n_predict: 256,
cache_prompt: no_cached_prompt === "false",
slot_id: slot_id,
stop: ["\n### Human:"], // stop completion after generating this
grammar,
stream: true,
@@ -92,6 +102,7 @@ async function chat_completion(question) {
const t = Buffer.from(chunk).toString('utf8')
if (t.startsWith('data: ')) {
const message = JSON.parse(t.substring(6))
slot_id = message.slot_id
answer += message.content
process.stdout.write(message.content)
if (message.stop) {