Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions prototype/frameworks/llamastack/kubernetes/granite-cm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
---
# ConfigMap carrying the Jinja chat template that the vLLM Deployment mounts
# as a file and passes via `--chat-template` for IBM Granite tool calling.
# NOTE(review): the template body below is reproduced verbatim — the literal
# newlines inside the single-quoted Jinja strings are intentional (they become
# '\n' in the rendered prompt) and must not be "fixed" or re-wrapped.
apiVersion: v1
kind: ConfigMap
metadata:
  # Referenced by the vLLM Deployment's configMap volume (name: granite-chat).
  name: granite-chat
data:
  tool_chat_template_granite.jinja: |
    {%- if tools %}
    {{- '<|start_of_role|>available_tools<|end_of_role|>
    ' }}
    {%- for tool in tools %}
    {{- tool | tojson(indent=4) }}
    {%- if not loop.last %}
    {{- '

    ' }}
    {%- endif %}
    {%- endfor %}
    {{- '<|end_of_text|>
    ' }}
    {%- endif %}

    {%- for message in messages %}
    {%- if message['role'] == 'system' %}
    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'user' %}
    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %}
    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message.tool_calls|map(attribute='function')|list|tojson(indent=4) + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'assistant' %}
    {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- elif message['role'] == 'tool_response' or message['role'] == 'tool' %}
    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>
    ' }}
    {%- endif %}
    {%- if loop.last and add_generation_prompt %}
    {{- '<|start_of_role|>assistant<|end_of_role|>' }}
    {%- endif %}
    {%- endfor %}
12 changes: 6 additions & 6 deletions prototype/frameworks/llamastack/kubernetes/llama-serve/vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ spec:
spec:
containers:
- args:
- --tensor-parallel-size=2
- --model
- meta-llama/Llama-3.2-3B-Instruct
- ibm-granite/granite-3.2-8b-instruct
- --enable-auto-tool-choice
- --chat-template
- /app/tool_chat_template_llama3.2_json.jinja
- --tool-call-parser
- llama3_json
- /app/tool_chat_template_granite.jinja
- --tool-call-parser=granite
- --port
- "8000"
env:
Expand All @@ -48,7 +48,7 @@ spec:
protocol: TCP
resources:
limits:
nvidia.com/gpu: "1"
nvidia.com/gpu: "2"
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
Expand All @@ -72,7 +72,7 @@ spec:
name: triton
- configMap:
defaultMode: 420
name: template
name: granite-chat
name: chat-template
- emptyDir: {}
name: config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ spec:
- name: VLLM_MAX_TOKENS
value: "128000"
- name: INFERENCE_MODEL
value: meta-llama/Llama-3.2-3B-Instruct
value: ibm-granite/granite-3.2-8b-instruct
- name: VLLM_URL
value: http://vllm:8000/v1
- name: VLLM_API_TOKEN
Expand Down