From 5408cb845967260ead2e71834fa99faf69ab58a9 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 31 Jan 2025 09:28:00 +0000 Subject: [PATCH 1/9] An empty tool_call_id is better than none! --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e7daceef1db..a4465c30f19 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -713,7 +713,7 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - {"id", tc.id.empty() ? json() : json(tc.id)}, + {"id", tc.id}, }); } } From caed1ef76fbc6739fb387fbfd63117af79afc9b4 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 31 Jan 2025 09:28:23 +0000 Subject: [PATCH 2/9] sync: minja (tool call name optional https://github.com/google/minja/pull/36) --- common/chat-template.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/chat-template.hpp b/common/chat-template.hpp index 75ba5d938f8..58e119a3bcd 100644 --- a/common/chat-template.hpp +++ b/common/chat-template.hpp @@ -283,10 +283,12 @@ class chat_template { message["role"] = "user"; auto obj = json { {"tool_response", { - {"tool", message.at("name")}, {"content", message.at("content")}, }}, }; + if (message.contains("name")) { + obj["tool_response"]["name"] = message.at("name"); + } if (message.contains("tool_call_id")) { obj["tool_response"]["tool_call_id"] = message.at("tool_call_id"); } From 479918599b13df94a13f5a0f79650090d48a0d11 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 31 Jan 2025 10:27:07 +0000 Subject: [PATCH 3/9] Force-disable parallel_tool_calls if template doesn't support it --- examples/server/utils.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 70bd6a42cb6..929e538e4a4 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -640,6 +640,10 @@ static json oaicompat_completion_params_parse( inputs.tools = tools; inputs.tool_choice = tool_choice; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) { + LOG_DBG("Disabling parallel_tool_calls because the template does not support it"); + inputs.parallel_tool_calls = false; + } inputs.stream = stream; // TODO: support mixing schema w/ tools beyond generic format. 
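        // [Editor's sketch — not part of the patch.] The json_value() helper used just
        // above and below is assumed to behave roughly like the following (the real
        // implementation in this file also guards against type errors):
        //
        //     template <typename T>
        //     static T json_value(const json & body, const std::string & key, const T & default_value) {
        //         return body.contains(key) && !body.at(key).is_null()
        //             ? body.at(key).template get<T>()
        //             : default_value;
        //     }
        //
        // So a request that omits "parallel_tool_calls" defaults to false, and the
        // capability check added in this patch only ever downgrades an explicit true.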
        inputs.json_schema = json_value(llama_params, "json_schema", json());

From b31259bfcfcaf996384339451a0cd29c5437fbaa Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Fri, 31 Jan 2025 10:27:16 +0000
Subject: [PATCH 4/9] More debug logs

---
 examples/server/server.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a4465c30f19..3451e96a2b1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -345,7 +345,7 @@ struct server_task {
         auto it = data.find("chat_format");
         if (it != data.end()) {
             params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-            LOG_DBG("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+            LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
         } else {
             params.oaicompat_chat_format = defaults.oaicompat_chat_format;
         }
@@ -697,6 +697,7 @@ struct server_task_result_cmpl_final : server_task_result {
         std::string finish_reason = "length";
         common_chat_msg message;
         if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
+            LOG_DBG("Parsing chat message: %s\n", content.c_str());
             message = common_chat_parse(content, oaicompat_chat_format);
             finish_reason = message.tool_calls.empty() ? "stop" : "tool_calls";
         } else {

From 422df5da08d8caafa25eae72e5b46b34c22be94e Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Fri, 31 Jan 2025 10:28:24 +0000
Subject: [PATCH 5/9] Llama 3.x tools: accept / trigger on more varied spaced outputs

---
 common/chat.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index d9a654892ca..7ada85b8767 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -384,14 +384,19 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
         tool_rules.push_back(
             builder.add_rule(
                 name + "-call",
-                "\"{\" ( \"\\\"type\\\": \\\"function\\\", \" | space ) "
+                "\"{\" space "
+                "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
                 "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
                 builder.add_schema(name + "-args", parameters) + " \"}\""));
         data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
     });
     data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
+    data.grammar_triggers.push_back({"{\n  \"name\":", /* .at_start = */ true});
+    data.grammar_triggers.push_back({"{\n    \"name\":", /* .at_start = */ true});
     data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
+    data.grammar_triggers.push_back({"{\n  \"type\": \"function\"", /* .at_start = */ true});
+    data.grammar_triggers.push_back({"{\n    \"type\": \"function\"", /* .at_start = */ true});
     if (!builtin_tools.empty()) {
         data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
     }

From 8a44b2fb921847eb2aeb579e2386b3a22b60ba89 Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Fri, 31 Jan 2025 11:36:44 +0000
Subject: [PATCH 6/9] Fix empty content for functionary v3.2 tool call

---
 common/chat.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 7ada85b8767..a5e350bf03d 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -591,9 +591,17 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
         }
     }
     // TODO: tighten & simplify.
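    // [Editor's illustration — not part of the patch.] The PATCH 5/9 triggers above are
    // literal prefixes matched at the very start of the model output (at_start == true),
    // so besides the compact form, newline-and-indent variants such as these now enter
    // grammar-constrained decoding (the tool name is a made-up example):
    //
    //     {"name": "get_weather", "parameters": {"location": "Paris"}}
    //     {
    //       "name": "get_weather", ...
    //     {
    //         "type": "function", ...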
- auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex); - res.content = content; - return res; + try { + auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex); + res.content = content + res.content; + return res; + } catch (const std::exception & e) { + LOG_ERR("Failed to parse functionary v3.2 input: %s", e.what()); + common_chat_msg res; + res.role = "assistant"; + res.content = input; + return res; + } } static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { From fa20249305c1b117a641488400810c7a7f8db52b Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 31 Jan 2025 11:53:06 +0000 Subject: [PATCH 7/9] Add proper tool call docs to server README --- examples/server/README.md | 108 +++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index ce1ae88585a..d61d7c79555 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1117,17 +1117,111 @@ curl http://localhost:8080/v1/chat/completions \ }' ``` -... and even tool usage (needs `--jinja` flag): +*Tool call support* - ```shell - llama-server --jinja -hfr lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF -hff Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf -fa +[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639): + +- Needs `--jinja` flag +- Native tool call formats supported: + - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2 + - Functionary v3.1 / v3.2 + - Hermes 2/3, Qwen 2.5 + - Mistral Nemo + - Firefunction v2 + - DeepSeek R1 (WIP / seems reluctant to call any tools?) + +
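[Editor's sketch — illustrative, not part of the patch series.] Whichever native format is detected, the server normalizes tool calls to the OpenAI-style shape below. PATCH 1/9 above makes `id` always a plain string (empty when the model emits none), and PATCH 4/9 sets `finish_reason` to `"tool_calls"` whenever a call was parsed. The `get_weather` tool and its arguments are made-up placeholders:

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

// ordered_json keeps fields in insertion order, as common/chat.* does.
using json = nlohmann::ordered_json;

int main() {
    // One entry of the "tool_calls" array, mirroring the serialization in PATCH 1/9:
    // "id" is an empty string rather than JSON null when no call id was produced.
    json tool_call = {
        {"type", "function"},
        {"function", {
            {"name", "get_weather"},                     // hypothetical tool
            {"arguments", "{\"location\": \"Paris\"}"},  // arguments stay a JSON-encoded string
        }},
        {"id", ""},
    };
    json choice = {
        {"finish_reason", "tool_calls"},  // per PATCH 4/9: non-empty tool_calls => "tool_calls"
        {"message", {
            {"role", "assistant"},
            {"content", nullptr},
            {"tool_calls", json::array({tool_call})},
        }},
    };
    std::cout << choice.dump(2) << std::endl;
    return 0;
}
```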
+ Show some common templates and which format handler they use + + | Template | Format | + |----------|--------| + | CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls | + | CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls | + | CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls | + | MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls | + | NexaAIDev-Octopus-v2.jinja | generic tool calls | + | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls | + | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls | + | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls | + | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls | + | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls | + | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls | + | OrionStarAI-Orion-14B-Chat.jinja | generic tool calls | + | Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls | + | Qwen-Qwen2-7B-Instruct.jinja | generic tool calls | + | Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls | + | Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls | + | Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls | + | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls | + | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls | + | bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls | + | databricks-dbrx-instruct.jinja | generic tool calls | + | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls | + | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls | + | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls | + | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls | + | deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls | + | deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls | + | google-gemma-2-2b-it.jinja | generic tool calls | + | google-gemma-7b-it.jinja | generic tool calls | + | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls | + | mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls | + | meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls | + | meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) | + | meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls | + | meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) | + | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) | + | microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls | + | microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls | + | microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls | + | microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls | + | microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls | + | mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls | + | mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls | + | mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls | + | mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls | + | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls | + | mlabonne-AlphaMonarch-7B.jinja | generic tool calls | + | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) | + | openchat-openchat-3.5-0106.jinja | generic 
tool calls |
+  | teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
+
+  This table can be generated with:
-  # https://huggingface.co/meetkai/functionary-medium-v3.2
-  llama-server --jinja -hfr bartowski/functionary-medium-v3.2-GGUF -hff functionary-medium-v3.2-IQ4_XS.gguf -fa
+  ```bash
+  ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+  ```
+
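[Editor's sketch — a hedged, standalone probe, not part of the patch series.] The same template-to-format mapping can be queried programmatically via the `common_chat_*` API these patches touch; the include names and the exact `common_chat_inputs` field set are assumptions based on the code visible in the diffs above:

```cpp
#include <fstream>
#include <iostream>
#include <sstream>

#include <nlohmann/json.hpp>

#include "chat-template.hpp"  // minja's chat_template (common_chat_template)
#include "chat.hpp"           // assumed header exposing common_chat_params_init & common_chat_format_name

using json = nlohmann::ordered_json;

// Print which chat format handler a given .jinja template would be routed to.
int main(int argc, char ** argv) {
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " TEMPLATE.jinja" << std::endl;
        return 1;
    }
    std::ifstream file(argv[1]);
    std::stringstream source;
    source << file.rdbuf();

    common_chat_template tmpl(source.str(), /* bos_token = */ "", /* eos_token = */ "");

    common_chat_inputs inputs;
    inputs.messages = json::array({ { {"role", "user"}, {"content", "hello"} } });
    inputs.tools    = json::array();  // detection also works without tools

    const auto params = common_chat_params_init(tmpl, inputs);
    std::cout << common_chat_format_name(params.format) << std::endl;
    return 0;
}
```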
- # https://huggingface.co/meetkai/functionary-medium-v3.1 - llama-server --jinja -hfr meetkai/functionary-medium-v3.1-GGUF -hff functionary-medium-llama-3.1.Q4_0.gguf -fa +- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs). + - Use `--chat-template-file` to override the template when appropriate (see examples below) + - Generic support may consume more tokens and be less efficient than a model's native format. +- Run with: + + ```shell + # Native support: + llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M + llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M + llama-server --jinja -fa -hf bartowski/Llama-3.2-3B-Instruct-GGUF:Q6_K + llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M + llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ + --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B ) + + # Native support requires the right template for these GGUFs: + llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \ + --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use ) + llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \ + --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/firellama-3-firefunction-v2 ) + + # Generic format support + llama-server --jinja -fa -hf bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M + llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q4_K_M + ``` + +- Test in CLI: + + ```bash curl http://localhost:8080/v1/chat/completions -d '{ "model": "gpt-3.5-turbo", "tools": [ From ed82223b5c3747455123d80e46fedc778b4fdc32 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 31 Jan 2025 12:09:33 +0000 Subject: [PATCH 8/9] readme: function calling *is* supported now --- examples/server/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index d61d7c79555..276b4301396 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
+| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
 
 **Example-specific params**
 
@@ -1069,7 +1069,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
 
 *Options:*
 
-See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.
 
 The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string"}, "title": "Participants", "type": "array" } } } }`), similar to other OpenAI-inspired API providers.
 
@@ -1121,7 +1121,7 @@ curl http://localhost:8080/v1/chat/completions \
 
 [Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
 
-- Needs `--jinja` flag
+- Requires `--jinja` flag
 - Native tool call formats supported:
   - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
   - Functionary v3.1 / v3.2

From 45a1c2027f606cb3fd0ec5dc4aaed384acbcfedf Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Fri, 31 Jan 2025 14:14:28 +0000
Subject: [PATCH 9/9] Apply suggestions from code review

Co-authored-by: Georgi Gerganov

---
 common/chat.cpp           | 2 +-
 examples/server/utils.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index a5e350bf03d..58db12af9a0 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -596,7 +596,7 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
             res.content = content + res.content;
             return res;
         } catch (const std::exception & e) {
-            LOG_ERR("Failed to parse functionary v3.2 input: %s", e.what());
+            LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
             common_chat_msg res;
             res.role = "assistant";
             res.content = input;
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 929e538e4a4..550a09c3e40 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -641,7 +641,7 @@ static json oaicompat_completion_params_parse(
         inputs.tool_choice = tool_choice;
         inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
         if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
-            LOG_DBG("Disabling parallel_tool_calls because the template does not support it");
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
             inputs.parallel_tool_calls = false;
         }
         inputs.stream = stream;
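[Editor's closing sketch — not from the patches.] PATCH 6/9 and PATCH 9/9 converge on one defensive pattern: attempt the strict tool-call parse, and on any exception degrade to a plain assistant message so the raw model output is never lost. A standalone distillation, with `chat_msg` and `parse_tool_calls_strict` as illustrative stand-ins for the `common_chat_*` internals:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative stand-in for common_chat_msg.
struct chat_msg {
    std::string role;
    std::string content;
    bool        has_tool_calls = false;
};

// Stand-in for a strict parser that throws on malformed tool call syntax
// (">>>" mimics the functionary v3.2 function-call marker).
static chat_msg parse_tool_calls_strict(const std::string & input) {
    if (input.find(">>>") == std::string::npos) {
        throw std::runtime_error("no function call marker found");
    }
    return {"assistant", "", true};
}

// The pattern from PATCH 6/9: never let a parse failure drop the model's output.
static chat_msg parse_or_fall_back(const std::string & input) {
    try {
        return parse_tool_calls_strict(input);
    } catch (const std::exception & e) {
        std::cerr << "Failed to parse tool calls: " << e.what() << "\n"; // LOG_ERR in the patch
        chat_msg res;
        res.role    = "assistant";
        res.content = input; // degrade gracefully: return the raw text as content
        return res;
    }
}

int main() {
    const auto msg = parse_or_fall_back("plain text answer, no tool call here");
    std::cout << msg.role << ": " << msg.content << std::endl;
    return 0;
}
```

Failing open like this keeps a malformed tool call from turning into an empty response; the client still sees the text and can retry or fall back.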