From 7d446fe9fca5a724d01ef0d461025b55c4d9ee21 Mon Sep 17 00:00:00 2001
From: vishwajeetpal
Date: Mon, 15 Dec 2025 20:06:45 +0530
Subject: [PATCH 1/3] init: added ui changes and backend for exposing Liveness.
---
api/cluster/resource/templater.go | 74 ++++++---
api/cluster/resource/templater_gpu_test.go | 4 +-
api/cluster/resource/templater_test.go | 22 +--
api/models/resource_request.go | 10 ++
swagger.yaml | 15 ++
.../components/LivnessConfigFormGroup.js | 146 ++++++++++++++++++
.../components/forms/steps/ModelStep.js | 7 +
.../components/forms/steps/TransformerStep.js | 7 +
8 files changed, 253 insertions(+), 32 deletions(-)
create mode 100644 ui/src/pages/version/components/forms/components/LivnessConfigFormGroup.js
diff --git a/api/cluster/resource/templater.go b/api/cluster/resource/templater.go
index ecc5fc734..8b28ad179 100644
--- a/api/cluster/resource/templater.go
+++ b/api/cluster/resource/templater.go
@@ -237,7 +237,7 @@ func (t *InferenceServiceTemplater) createPredictorSpec(modelService *models.Ser
}
}
- livenessProbeConfig := getLivenessProbeConfig(modelService.PredictorProtocol(), envVars, fmt.Sprintf("/v1/models/%s", modelService.Name))
+ livenessProbeConfig := getLivenessProbeConfig(modelService.PredictorProtocol(), envVars, fmt.Sprintf("/v1/models/%s", modelService.Name), modelService.ResourceRequest)
containerPorts := createContainerPorts(modelService.PredictorProtocol(), modelService.DeploymentMode)
storageUri := utils.CreateModelLocation(modelService.ArtifactURI)
@@ -411,7 +411,7 @@ func (t *InferenceServiceTemplater) createTransformerSpec(
}
}
- livenessProbeConfig := getLivenessProbeConfig(modelService.Protocol, envVars, "/")
+ livenessProbeConfig := getLivenessProbeConfig(modelService.Protocol, envVars, "/", transformer.ResourceRequest)
containerPorts := createContainerPorts(modelService.Protocol, modelService.DeploymentMode)
transformerSpec := &kservev1beta1.TransformerSpec{
@@ -515,7 +515,7 @@ func (t *InferenceServiceTemplater) enrichStandardTransformerEnvVars(modelServic
return envVars
}
-func createHTTPGetLivenessProbe(httpPath string, port int) *corev1.Probe {
+func createHTTPGetLivenessProbe(httpPath string, port int, resourceRequest *models.ResourceRequest) *corev1.Probe {
return &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
@@ -526,45 +526,81 @@ func createHTTPGetLivenessProbe(httpPath string, port int) *corev1.Probe {
},
},
},
- InitialDelaySeconds: liveProbeInitialDelaySec,
- TimeoutSeconds: liveProbeTimeoutSec,
- PeriodSeconds: liveProbePeriodSec,
- SuccessThreshold: liveProbeSuccessThreshold,
- FailureThreshold: liveProbeFailureThreshold,
+ InitialDelaySeconds: getLivenessProbeInitialDelaySeconds(resourceRequest),
+ TimeoutSeconds: getLivenessProbeTimeoutSeconds(resourceRequest),
+ PeriodSeconds: getLivenessProbePeriodSeconds(resourceRequest),
+ SuccessThreshold: getLivenessProbeSuccessThreshold(resourceRequest),
+ FailureThreshold: getLivenessProbeFailureThreshold(resourceRequest),
}
}
-func createGRPCLivenessProbe(port int) *corev1.Probe {
+func createGRPCLivenessProbe(port int, resourceRequest *models.ResourceRequest) *corev1.Probe {
return &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: []string{grpcHealthProbeCommand, fmt.Sprintf("-addr=:%d", port)},
},
},
- InitialDelaySeconds: liveProbeInitialDelaySec,
- TimeoutSeconds: liveProbeTimeoutSec,
- PeriodSeconds: liveProbePeriodSec,
- SuccessThreshold: liveProbeSuccessThreshold,
- FailureThreshold: liveProbeFailureThreshold,
+ InitialDelaySeconds: getLivenessProbeInitialDelaySeconds(resourceRequest),
+ TimeoutSeconds: getLivenessProbeTimeoutSeconds(resourceRequest),
+ PeriodSeconds: getLivenessProbePeriodSeconds(resourceRequest),
+ SuccessThreshold: getLivenessProbeSuccessThreshold(resourceRequest),
+ FailureThreshold: getLivenessProbeFailureThreshold(resourceRequest),
}
}
-func getLivenessProbeConfig(protocol prt.Protocol, envVars []corev1.EnvVar, httpPath string) *corev1.Probe {
+func getLivenessProbeConfig(protocol prt.Protocol, envVars []corev1.EnvVar, httpPath string, resourceRequest *models.ResourceRequest) *corev1.Probe {
// liveness probe config. if env var to disable != true or not set, it will default to enabled
var livenessProbeConfig *corev1.Probe = nil
envVarsMap := getEnvVarMap(envVars)
if !strings.EqualFold(envVarsMap[envOldDisableLivenessProbe].Value, "true") &&
!strings.EqualFold(envVarsMap[envDisableLivenessProbe].Value, "true") {
- livenessProbeConfig = createLivenessProbeSpec(protocol, httpPath)
+ livenessProbeConfig = createLivenessProbeSpec(protocol, httpPath, resourceRequest)
}
return livenessProbeConfig
}
-func createLivenessProbeSpec(protocol prt.Protocol, httpPath string) *corev1.Probe {
+func createLivenessProbeSpec(protocol prt.Protocol, httpPath string, resourceRequest *models.ResourceRequest) *corev1.Probe {
if protocol == prt.UpiV1 {
- return createGRPCLivenessProbe(defaultGRPCPort)
+ return createGRPCLivenessProbe(defaultGRPCPort, resourceRequest)
}
- return createHTTPGetLivenessProbe(httpPath, defaultHTTPPort)
+ return createHTTPGetLivenessProbe(httpPath, defaultHTTPPort, resourceRequest)
+}
+
+// Helper functions to get liveness probe values with fallback to defaults
+func getLivenessProbeInitialDelaySeconds(resourceRequest *models.ResourceRequest) int32 {
+ if resourceRequest != nil && resourceRequest.LivenessProbeInitialDelaySeconds != nil {
+ return *resourceRequest.LivenessProbeInitialDelaySeconds
+ }
+ return liveProbeInitialDelaySec
+}
+
+func getLivenessProbeTimeoutSeconds(resourceRequest *models.ResourceRequest) int32 {
+ if resourceRequest != nil && resourceRequest.LivenessProbeTimeoutSeconds != nil {
+ return *resourceRequest.LivenessProbeTimeoutSeconds
+ }
+ return liveProbeTimeoutSec
+}
+
+func getLivenessProbePeriodSeconds(resourceRequest *models.ResourceRequest) int32 {
+ if resourceRequest != nil && resourceRequest.LivenessProbePeriodSeconds != nil {
+ return *resourceRequest.LivenessProbePeriodSeconds
+ }
+ return liveProbePeriodSec
+}
+
+func getLivenessProbeSuccessThreshold(resourceRequest *models.ResourceRequest) int32 {
+ if resourceRequest != nil && resourceRequest.LivenessProbeSuccessThreshold != nil {
+ return *resourceRequest.LivenessProbeSuccessThreshold
+ }
+ return liveProbeSuccessThreshold
+}
+
+func getLivenessProbeFailureThreshold(resourceRequest *models.ResourceRequest) int32 {
+ if resourceRequest != nil && resourceRequest.LivenessProbeFailureThreshold != nil {
+ return *resourceRequest.LivenessProbeFailureThreshold
+ }
+ return liveProbeFailureThreshold
}
func createPredictorHost(modelService *models.Service) string {
diff --git a/api/cluster/resource/templater_gpu_test.go b/api/cluster/resource/templater_gpu_test.go
index ab7b6e534..d3d4dfb60 100644
--- a/api/cluster/resource/templater_gpu_test.go
+++ b/api/cluster/resource/templater_gpu_test.go
@@ -119,8 +119,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)
// Liveness probe config for the model containers
- probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
- probeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
+ probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
+ probeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
tests := []struct {
name string
diff --git a/api/cluster/resource/templater_test.go b/api/cluster/resource/templater_test.go
index 411139a73..ff0e9e620 100644
--- a/api/cluster/resource/templater_test.go
+++ b/api/cluster/resource/templater_test.go
@@ -287,8 +287,8 @@ func TestCreateInferenceServiceSpec(t *testing.T) {
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)
// Liveness probe config for the model containers
- probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
- probeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
+ probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
+ probeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
tests := []struct {
name string
@@ -2075,12 +2075,12 @@ func TestCreateInferenceServiceSpecWithTransformer(t *testing.T) {
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)
// Liveness probe config for the model containers
- probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
- probeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
+ probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
+ probeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
// Liveness probe config for the transformers
- transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/")
- transformerProbeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, "/")
+ transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/", nil)
+ transformerProbeConfigUPI := createLivenessProbeSpec(protocol.UpiV1, "/", nil)
tests := []struct {
name string
modelSvc *models.Service
@@ -3016,10 +3016,10 @@ func TestCreateInferenceServiceSpecWithLogger(t *testing.T) {
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)
// Liveness probe config for the model containers
- probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
+ probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
// Liveness probe config for the transformers
- transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/")
+ transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/", nil)
tests := []struct {
name string
@@ -3503,10 +3503,10 @@ func TestCreateInferenceServiceSpecWithTopologySpreadConstraints(t *testing.T) {
storageUri := fmt.Sprintf("%s/model", modelSvc.ArtifactURI)
// Liveness probe config for the model containers
- probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name))
+ probeConfig := createLivenessProbeSpec(protocol.HttpJson, fmt.Sprintf("/v1/models/%s", modelSvc.Name), nil)
// Liveness probe config for the transformers
- transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/")
+ transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/", nil)
tests := []struct {
name string
@@ -4419,7 +4419,7 @@ func TestCreateTransformerSpec(t *testing.T) {
customCPULimit := resource.MustParse("8")
// Liveness probe config for the transformers
- transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/")
+ transformerProbeConfig := createLivenessProbeSpec(protocol.HttpJson, "/", nil)
modelSvc := &models.Service{
Name: "model-1",
diff --git a/api/models/resource_request.go b/api/models/resource_request.go
index f92fae9f8..5fc5c537a 100644
--- a/api/models/resource_request.go
+++ b/api/models/resource_request.go
@@ -37,6 +37,16 @@ type ResourceRequest struct {
GPUName string `json:"gpu_name,omitempty"`
// GPU Quantity requests
GPURequest resource.Quantity `json:"gpu_request,omitempty"`
+ // Liveness probe initial delay seconds
+ LivenessProbeInitialDelaySeconds *int32 `json:"liveness_probe_initial_delay_seconds,omitempty"`
+ // Liveness probe period seconds
+ LivenessProbePeriodSeconds *int32 `json:"liveness_probe_period_seconds,omitempty"`
+ // Liveness probe timeout seconds
+ LivenessProbeTimeoutSeconds *int32 `json:"liveness_probe_timeout_seconds,omitempty"`
+ // Liveness probe success threshold
+ LivenessProbeSuccessThreshold *int32 `json:"liveness_probe_success_threshold,omitempty"`
+ // Liveness probe failure threshold
+ LivenessProbeFailureThreshold *int32 `json:"liveness_probe_failure_threshold,omitempty"`
}
func (r ResourceRequest) Value() (driver.Value, error) {
diff --git a/swagger.yaml b/swagger.yaml
index b76351eb2..8fd1ebb06 100644
--- a/swagger.yaml
+++ b/swagger.yaml
@@ -2117,6 +2117,21 @@ components:
type: string
gpu_request:
type: string
+ liveness_probe_initial_delay_seconds:
+ type: integer
+ description: Initial delay in seconds before liveness probe is started
+ liveness_probe_period_seconds:
+ type: integer
+ description: How often (in seconds) to perform the liveness probe
+ liveness_probe_timeout_seconds:
+ type: integer
+ description: Number of seconds after which the liveness probe times out
+ liveness_probe_success_threshold:
+ type: integer
+ description: Minimum consecutive successes for the liveness probe to be considered successful
+ liveness_probe_failure_threshold:
+ type: integer
+ description: Minimum consecutive failures for the liveness probe to be considered failed
AutoscalingPolicy:
type: object
required:
diff --git a/ui/src/pages/version/components/forms/components/LivnessConfigFormGroup.js b/ui/src/pages/version/components/forms/components/LivnessConfigFormGroup.js
new file mode 100644
index 000000000..985d52fff
--- /dev/null
+++ b/ui/src/pages/version/components/forms/components/LivnessConfigFormGroup.js
@@ -0,0 +1,146 @@
+import React, { Fragment } from "react";
+import { FormLabelWithToolTip, useOnChangeHandler } from "@caraml-dev/ui-lib";
+import { EuiDescribedFormGroup, EuiFieldNumber, EuiFormRow, EuiFlexGroup, EuiFlexItem } from "@elastic/eui";
+
+export const LivenessProbeFormGroup = ({
+ resourcesConfig,
+ onChangeHandler,
+ errors = {},
+}) => {
+ const { onChange } = useOnChangeHandler(onChangeHandler);
+
+ return (
+ Liveness Probe Configuration
}
+ description={
+
+ Configure the liveness probe settings for your deployment.
+ These settings determine how Kubernetes checks if your container is still running.
+
+ }
+ fullWidth
+ >
+
+
+
+ }
+ isInvalid={!!errors.liveness_probe_initial_delay_seconds}
+ error={errors.liveness_probe_initial_delay_seconds}
+ fullWidth
+ >
+ onChange("liveness_probe_initial_delay_seconds")(e.target.value ? parseInt(e.target.value) : undefined)}
+ isInvalid={!!errors.liveness_probe_initial_delay_seconds}
+ name="liveness_probe_initial_delay_seconds"
+ min={0}
+ fullWidth
+ />
+
+
+
+
+
+ }
+ isInvalid={!!errors.liveness_probe_period_seconds}
+ error={errors.liveness_probe_period_seconds}
+ fullWidth
+ >
+ onChange("liveness_probe_period_seconds")(e.target.value ? parseInt(e.target.value) : undefined)}
+ isInvalid={!!errors.liveness_probe_period_seconds}
+ name="liveness_probe_period_seconds"
+ min={1}
+ fullWidth
+ />
+
+
+
+
+
+ }
+ isInvalid={!!errors.liveness_probe_timeout_seconds}
+ error={errors.liveness_probe_timeout_seconds}
+ fullWidth
+ >
+ onChange("liveness_probe_timeout_seconds")(e.target.value ? parseInt(e.target.value) : undefined)}
+ isInvalid={!!errors.liveness_probe_timeout_seconds}
+ name="liveness_probe_timeout_seconds"
+ min={1}
+ fullWidth
+ />
+
+
+
+
+
+ }
+ isInvalid={!!errors.liveness_probe_success_threshold}
+ error={errors.liveness_probe_success_threshold}
+ fullWidth
+ >
+ onChange("liveness_probe_success_threshold")(e.target.value ? parseInt(e.target.value) : undefined)}
+ isInvalid={!!errors.liveness_probe_success_threshold}
+ name="liveness_probe_success_threshold"
+ min={1}
+ fullWidth
+ />
+
+
+
+
+
+ }
+ isInvalid={!!errors.liveness_probe_failure_threshold}
+ error={errors.liveness_probe_failure_threshold}
+ fullWidth
+ >
+ onChange("liveness_probe_failure_threshold")(e.target.value ? parseInt(e.target.value) : undefined)}
+ isInvalid={!!errors.liveness_probe_failure_threshold}
+ name="liveness_probe_failure_threshold"
+ min={1}
+ fullWidth
+ />
+
+
+
+
+ );
+};
diff --git a/ui/src/pages/version/components/forms/steps/ModelStep.js b/ui/src/pages/version/components/forms/steps/ModelStep.js
index aaf726204..e33a99d07 100644
--- a/ui/src/pages/version/components/forms/steps/ModelStep.js
+++ b/ui/src/pages/version/components/forms/steps/ModelStep.js
@@ -14,6 +14,7 @@ import { LoggerPanel } from "../components/LoggerPanel";
import { ResourcesPanel } from "../components/ResourcesPanel";
import { ImageBuilderSection } from "../components/ImageBuilderSection";
import { CPULimitsFormGroup } from "../components/CPULimitsFormGroup";
+import { LivenessProbeFormGroup } from "../components/LivnessConfigFormGroup";
export const ModelStep = ({ version, isEnvironmentDisabled = false, maxAllowedReplica, setMaxAllowedReplica }) => {
const { data, onChangeHandler } = useContext(FormContext);
@@ -52,6 +53,12 @@ export const ModelStep = ({ version, isEnvironmentDisabled = false, maxAllowedRe
onChangeHandler={onChange("resource_request")}
errors={get(errors, "resource_request")}
/>
+
+
{
const {
@@ -52,6 +53,12 @@ export const TransformerStep = ({ maxAllowedReplica }) => {
onChangeHandler={onChange("transformer.resource_request")}
errors={get(errors, "transformer.resource_request")}
/>
+
+
}
/>
From 0172031dbd8b24da6b3b038bfd98fad3f6f37141 Mon Sep 17 00:00:00 2001
From: vishwajeetpal
Date: Mon, 15 Dec 2025 22:56:07 +0530
Subject: [PATCH 2/3] fix: pyfunc test fix.
---
.../pyfunc-server/pyfuncserver/protocol/upi/server.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/python/pyfunc-server/pyfuncserver/protocol/upi/server.py b/python/pyfunc-server/pyfuncserver/protocol/upi/server.py
index b0cb47a78..5363c2b05 100644
--- a/python/pyfunc-server/pyfuncserver/protocol/upi/server.py
+++ b/python/pyfunc-server/pyfuncserver/protocol/upi/server.py
@@ -57,7 +57,7 @@ def start(self):
# multiprocessing based on https://github.com/grpc/grpc/tree/master/examples/python/multiprocessing
workers = []
for _ in range(self._config.workers - 1):
- worker = multiprocessing.Process(target=self._run_server)
+ worker = multiprocessing.Process(target=self._run_server_sync)
worker.start()
workers.append(worker)
@@ -67,7 +67,13 @@ def start(self):
publisher = Publisher(kafka_producer, sampler)
self._predict_service.set_publisher(publisher)
- asyncio.get_event_loop().run_until_complete(self._run_server())
+ self._run_server_sync()
+
+ def _run_server_sync(self):
+ """Synchronous wrapper to run the async server in a new event loop."""
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ loop.run_until_complete(self._run_server())
async def _run_server(self):
"""
From 330795f221b70c246927cbac4ca926d62da249da Mon Sep 17 00:00:00 2001
From: vishwajeetpal
Date: Wed, 17 Dec 2025 02:15:52 +0530
Subject: [PATCH 3/3] add: added liveness detail in Model Resources pane.
---
ui/src/components/ResourcesConfigTable.js | 41 +++++++++++++++++++
ui/src/services/transformer/Transformer.js | 7 +++-
.../version_endpoint/VersionEndpoint.js | 5 +++
3 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/ui/src/components/ResourcesConfigTable.js b/ui/src/components/ResourcesConfigTable.js
index ab0c8eb8b..da5921d54 100644
--- a/ui/src/components/ResourcesConfigTable.js
+++ b/ui/src/components/ResourcesConfigTable.js
@@ -27,6 +27,11 @@ export const ResourcesConfigTable = ({
max_replica,
gpu_name,
gpu_request,
+ liveness_probe_initial_delay_seconds,
+ liveness_probe_period_seconds,
+ liveness_probe_timeout_seconds,
+ liveness_probe_success_threshold,
+ liveness_probe_failure_threshold,
},
}) => {
const items = [
@@ -68,6 +73,42 @@ export const ResourcesConfigTable = ({
});
}
+ // Add liveness probe configuration if any value is set
+ if (liveness_probe_initial_delay_seconds !== undefined && liveness_probe_initial_delay_seconds !== null) {
+ items.push({
+ title: "Liveness Initial Delay",
+ description: `${liveness_probe_initial_delay_seconds}s`,
+ });
+ }
+
+ if (liveness_probe_period_seconds !== undefined && liveness_probe_period_seconds !== null) {
+ items.push({
+ title: "Liveness Period",
+ description: `${liveness_probe_period_seconds}s`,
+ });
+ }
+
+ if (liveness_probe_timeout_seconds !== undefined && liveness_probe_timeout_seconds !== null) {
+ items.push({
+ title: "Liveness Timeout",
+ description: `${liveness_probe_timeout_seconds}s`,
+ });
+ }
+
+ if (liveness_probe_success_threshold !== undefined && liveness_probe_success_threshold !== null) {
+ items.push({
+ title: "Liveness Success Threshold",
+ description: liveness_probe_success_threshold,
+ });
+ }
+
+ if (liveness_probe_failure_threshold !== undefined && liveness_probe_failure_threshold !== null) {
+ items.push({
+ title: "Liveness Failure Threshold",
+ description: liveness_probe_failure_threshold,
+ });
+ }
+
return (