Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions prototype/frameworks/llamastack/kubernetes/granite-cm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
---
# ConfigMap carrying the Jinja chat template that the vLLM Deployment mounts
# as a file and passes via `--chat-template` for IBM Granite tool calling.
# NOTE(review): the template body below is reproduced verbatim — the literal
# newlines inside the single-quoted Jinja strings are intentional (they become
# '\n' in the rendered prompt) and must not be "fixed" or re-wrapped.
apiVersion: v1
kind: ConfigMap
metadata:
  # Referenced by the vLLM Deployment's configMap volume (name: granite-chat).
  name: granite-chat
data:
  tool_chat_template_granite.jinja: |
    {%- if tools %}
    {{- '<|start_of_role|>available_tools<|end_of_role|>
    ' }}
    {%- for tool in tools %}
    {{- tool | tojson(indent=4) }}
    {%- if not loop.last %}
    {{- '

    ' }}
    {%- endif %}
    {%- endfor %}
    {{- '<|end_of_text|>
    ' }}
    {%- endif %}

    {%- for message in messages %}
    {%- if message['role'] == 'system' %}
    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'user' %}
    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %}
    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message.tool_calls|map(attribute='function')|list|tojson(indent=4) + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'assistant' %}
    {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'tool_response' or message['role'] == 'tool' %}
    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- endif %}
    {%- if loop.last and add_generation_prompt %}
    {{- '<|start_of_role|>assistant<|end_of_role|>' }}
    {%- endif %}
    {%- endfor %}
12 changes: 6 additions & 6 deletions prototype/frameworks/llamastack/kubernetes/llama-serve/vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ spec:
spec:
containers:
- args:
- --tensor-parallel-size=2
- --model
- meta-llama/Llama-3.2-3B-Instruct
- ibm-granite/granite-3.2-8b-instruct
- --enable-auto-tool-choice
- --chat-template
- /app/tool_chat_template_llama3.2_json.jinja
- --tool-call-parser
- llama3_json
- /app/tool_chat_template_granite.jinja
- --tool-call-parser=granite
- --port
- "8000"
env:
Expand All @@ -48,7 +48,7 @@ spec:
protocol: TCP
resources:
limits:
nvidia.com/gpu: "1"
nvidia.com/gpu: "2"
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
Expand All @@ -72,7 +72,7 @@ spec:
name: triton
- configMap:
defaultMode: 420
name: template
name: granite-chat
name: chat-template
- emptyDir: {}
name: config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ spec:
- name: VLLM_MAX_TOKENS
value: "128000"
- name: INFERENCE_MODEL
value: meta-llama/Llama-3.2-3B-Instruct
value: ibm-granite/granite-3.2-8b-instruct
- name: VLLM_URL
value: http://vllm:8000/v1
- name: VLLM_API_TOKEN
Expand Down