diff --git a/prototype/frameworks/llamastack/kubernetes/granite-cm.yaml b/prototype/frameworks/llamastack/kubernetes/granite-cm.yaml
new file mode 100644
index 0000000..3e798d1
--- /dev/null
+++ b/prototype/frameworks/llamastack/kubernetes/granite-cm.yaml
@@ -0,0 +1,43 @@
+apiVersion: v1
+data:
+  tool_chat_template_granite.jinja: |
+    {%- if tools %}
+    {{- '<|start_of_role|>available_tools<|end_of_role|>
+    ' }}
+    {%- for tool in tools %}
+    {{- tool | tojson(indent=4) }}
+    {%- if not loop.last %}
+    {{- '
+
+    ' }}
+    {%- endif %}
+    {%- endfor %}
+    {{- '<|end_of_text|>
+    ' }}
+    {%- endif %}
+
+    {%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>
+    ' }}
+    {%- elif message['role'] == 'user' %}
+    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>
+    ' }}
+    {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %}
+    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message.tool_calls|map(attribute='function')|list|tojson(indent=4) + '<|end_of_text|>
+    ' }}
+    {%- elif message['role'] == 'assistant' %}
+    {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>
+    ' }}
+    {%- elif message['role'] == 'tool_response' or message['role'] == 'tool' %}
+    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>
+    ' }}
+    {%- endif %}
+    {%- if loop.last and add_generation_prompt %}
+    {{- '<|start_of_role|>assistant<|end_of_role|>' }}
+    {%- endif %}
+    {%- endfor %}
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: granite-chat
diff --git a/prototype/frameworks/llamastack/kubernetes/llama-serve/vllm.yaml b/prototype/frameworks/llamastack/kubernetes/llama-serve/vllm.yaml
index 7c1c769..f2866b4 100644
--- a/prototype/frameworks/llamastack/kubernetes/llama-serve/vllm.yaml
+++ b/prototype/frameworks/llamastack/kubernetes/llama-serve/vllm.yaml
@@ -23,13 +23,13 @@ spec:
     spec:
       containers:
       - args:
+        - --tensor-parallel-size=2
         - --model
-        - meta-llama/Llama-3.2-3B-Instruct
+        - ibm-granite/granite-3.2-8b-instruct
         - --enable-auto-tool-choice
         - --chat-template
-        - /app/tool_chat_template_llama3.2_json.jinja
-        - --tool-call-parser
-        - llama3_json
+        - /app/tool_chat_template_granite.jinja
+        - --tool-call-parser=granite
         - --port
         - "8000"
         env:
@@ -48,7 +48,7 @@ spec:
           protocol: TCP
         resources:
           limits:
-            nvidia.com/gpu: "1"
+            nvidia.com/gpu: "2"
         terminationMessagePath: /dev/termination-log
         terminationMessagePolicy: File
         volumeMounts:
@@ -72,7 +72,7 @@ spec:
         name: triton
      - configMap:
          defaultMode: 420
-          name: template
+          name: granite-chat
        name: chat-template
      - emptyDir: {}
        name: config
diff --git a/prototype/frameworks/llamastack/kubernetes/llama-stack/deployment.yaml b/prototype/frameworks/llamastack/kubernetes/llama-stack/deployment.yaml
index 6caaac3..7d500dc 100644
--- a/prototype/frameworks/llamastack/kubernetes/llama-stack/deployment.yaml
+++ b/prototype/frameworks/llamastack/kubernetes/llama-stack/deployment.yaml
@@ -20,7 +20,7 @@ spec:
         - name: VLLM_MAX_TOKENS
           value: "128000"
         - name: INFERENCE_MODEL
-          value: meta-llama/Llama-3.2-3B-Instruct
+          value: ibm-granite/granite-3.2-8b-instruct
         - name: VLLM_URL
           value: http://vllm:8000/v1
         - name: VLLM_API_TOKEN
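The patch above swaps the served model from meta-llama/Llama-3.2-3B-Instruct to ibm-granite/granite-3.2-8b-instruct, mounts the Granite chat template from the new granite-chat ConfigMap, raises the GPU request to 2 (with --tensor-parallel-size=2), and switches vLLM to the granite tool-call parser; the Llama Stack deployment's INFERENCE_MODEL is updated to match. What follows is only a minimal sketch of a client-side check that tool calling still works after the switch, not part of the patch itself: it assumes the vLLM Service is reachable at http://vllm:8000/v1 (the same URL Llama Stack is pointed at) and that VLLM_API_TOKEN holds whatever token the server expects; the get_weather tool is purely illustrative.

import os
from openai import OpenAI  # vLLM serves an OpenAI-compatible API

# Assumed endpoint and token, taken from the Llama Stack deployment env above.
client = OpenAI(
    base_url="http://vllm:8000/v1",
    api_key=os.environ.get("VLLM_API_TOKEN", "dummy"),
)

# Hypothetical tool definition, only to exercise --enable-auto-tool-choice.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="ibm-granite/granite-3.2-8b-instruct",
    messages=[{"role": "user", "content": "What is the weather in Boston?"}],
    tools=tools,
    tool_choice="auto",
)

# With the Granite chat template and --tool-call-parser=granite in place, the
# response should carry structured tool_calls rather than raw <|tool_call|> text.
print(resp.choices[0].message.tool_calls)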