Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions kubernetes/apis/sandbox/v1alpha1/batchsandbox_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ type BatchSandboxStatus struct {
TaskPending int32 `json:"taskPending"`
// TaskUnknown is the number of Unknown task
TaskUnknown int32 `json:"taskUnknown"`
// TaskLastErrorMessage holds the most recent error message from a failed task.
// This includes lifecycle hook failures (e.g., ossfs mount timeout or error output).
// +optional
TaskLastErrorMessage string `json:"taskLastErrorMessage,omitempty"`
}

// +genclient
Expand Down Expand Up @@ -145,6 +149,17 @@ type TaskTemplateSpec struct {
Spec TaskSpec `json:"spec,omitempty"`
}

// ExecMode defines where a process should be executed.
// +kubebuilder:validation:Enum=Local;Remote
type ExecMode string

const (
// ExecModeLocal executes in the task-executor container.
ExecModeLocal ExecMode = "Local"
// ExecModeRemote executes in the main container via nsenter.
ExecModeRemote ExecMode = "Remote"
)

type TaskSpec struct {
// +optional
Process *ProcessTask `json:"process,omitempty"`
Expand All @@ -169,6 +184,49 @@ type ProcessTask struct {
// WorkingDir task working directory.
// +optional
WorkingDir string `json:"workingDir,omitempty"`
// ExecMode controls where the process runs.
// Local: inside task-executor container.
// Remote: inside main container (via nsenter).
// +optional
// +kubebuilder:default=Local
ExecMode ExecMode `json:"execMode,omitempty"`
// Lifecycle defines actions to be executed before and after the main process.
// +optional
Lifecycle *ProcessLifecycle `json:"lifecycle,omitempty"`
}

// ProcessLifecycle defines lifecycle hooks for a process.
type ProcessLifecycle struct {
// PreStart is executed before the main process starts.
// +optional
PreStart *LifecycleHandler `json:"preStart,omitempty"`
// PostStop is executed after the main process stops.
// +optional
PostStop *LifecycleHandler `json:"postStop,omitempty"`
}

// LifecycleHandler defines a lifecycle action.
type LifecycleHandler struct {
// Exec specifies the action to take.
// +optional
Exec *ExecAction `json:"exec,omitempty"`
// ExecMode controls where the action runs.
// +optional
// +kubebuilder:default=Local
ExecMode ExecMode `json:"execMode,omitempty"`
// TimeoutSeconds is the maximum number of seconds the hook may run.
// If the hook does not complete within this time, it is killed and the
// enclosing operation (Start or Stop) is treated as failed.
// If not set or zero, there is no timeout.
// +optional
TimeoutSeconds *int64 `json:"timeoutSeconds,omitempty"`
}

// ExecAction describes a "run in container" action.
type ExecAction struct {
// Command is the command line to execute inside the container.
// +kubebuilder:validation:Required
Command []string `json:"command"`
}

// TaskStatus task status
Expand Down
75 changes: 75 additions & 0 deletions kubernetes/apis/sandbox/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ spec:
description: TaskFailed is the number of Failed task
format: int32
type: integer
taskLastErrorMessage:
description: |-
TaskLastErrorMessage holds the most recent error message from a failed task.
This includes lifecycle hook failures (e.g., ossfs mount timeout or error output).
type: string
taskPending:
description: TaskPending is the number of Pending task which is unassigned
format: int32
Expand Down
4 changes: 2 additions & 2 deletions kubernetes/config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
images:
- name: controller
newName: controller
newTag: dev
newName: example.com/sandbox-k8s
newTag: v0.0.1
- name: manager
newName: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/controller
newTag: v0.0.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
apiVersion: sandbox.opensandbox.io/v1alpha1
kind: BatchSandbox
metadata:
labels:
app.kubernetes.io/name: opensandbox
app.kubernetes.io/managed-by: kustomize
name: batchsandbox-lifecycle-sample
namespace: opensandbox
spec:
replicas: 1
template:
metadata:
labels:
app: lifecycle-example
spec:
containers:
- name: main
image: registry.k8s.io/e2e-test-images/busybox:1.29-4
command:
- tail
- -f
- /dev/null
expireTime: "2025-12-31T23:59:59Z"
taskTemplate:
spec:
process:
command:
- sleep
args:
- "30"
env:
- name: TASK_NAME
value: lifecycle-task
lifecycle:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P3 Badge Place lifecycle sample under process spec

In this sample manifest, lifecycle is defined at taskTemplate.spec.lifecycle instead of taskTemplate.spec.process.lifecycle, so the hooks are ignored by the controller conversion path that only reads process.lifecycle. Users applying this example won’t actually run preStart/postStop, which makes the feature appear broken and gives incorrect guidance.

Useful? React with 👍 / 👎.

# preStart hook runs before the main task process starts.
# Useful for preparing the environment, checking dependencies, warming up data, etc.
preStart:
exec:
command:
- /bin/sh
- -c
- |
echo "[preStart] Starting environment preparation..."
mkdir -p /tmp/workdir
echo "$(date): Task starting" > /tmp/workdir/prestart-marker
echo "[preStart] Environment ready!"
timeoutSeconds: 30
# postStop hook runs after the main task process exits.
# Useful for cleaning up resources, uploading logs, sending notifications, etc.
postStop:
exec:
command:
- /bin/sh
- -c
- |
echo "[postStop] Starting cleanup..."
echo "$(date): Task finished" >> /tmp/workdir/poststop-marker
echo "Task logs:" && cat /tmp/workdir/*
echo "[postStop] Cleanup complete!"
timeoutSeconds: 30
13 changes: 12 additions & 1 deletion kubernetes/internal/controller/batchsandbox_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ func (r *BatchSandboxReconciler) scheduleTasks(ctx context.Context, tSch tasksch
var (
running, failed, succeed, unknown int32
pending int32
lastErrorMessage string
)
for i := range len(tasks) {
task := tasks[i]
Expand All @@ -385,6 +386,10 @@ func (r *BatchSandboxReconciler) scheduleTasks(ctx context.Context, tSch tasksch
succeed++
case taskscheduler.FailedTaskState:
failed++
// Capture the most recent error message to surface in status.
if msg := task.GetTerminatedMessage(); msg != "" {
lastErrorMessage = msg
}
case taskscheduler.UnknownTaskState:
unknown++
}
Expand All @@ -405,8 +410,14 @@ func (r *BatchSandboxReconciler) scheduleTasks(ctx context.Context, tSch tasksch
newStatus.TaskSucceed = succeed
newStatus.TaskUnknown = unknown
newStatus.TaskPending = pending
// Persist error message from the most recently failed task (e.g. lifecycle hook stderr).
// Only update when a new non-empty message is available, to avoid clearing a previously
// recorded error on subsequent reconcile cycles where the task may have been released.
if lastErrorMessage != "" {
newStatus.TaskLastErrorMessage = lastErrorMessage
}
if !reflect.DeepEqual(newStatus, oldStatus) {
log.Info("To update BatchSandbox status", "replicas", newStatus.Replicas, "task_running", newStatus.TaskRunning, "task_succeed", newStatus.TaskSucceed, "task_failed", newStatus.TaskFailed, "task_unknown", newStatus.TaskUnknown, "task_pending", newStatus.TaskPending)
log.Info("To update BatchSandbox status", "replicas", newStatus.Replicas, "task_running", newStatus.TaskRunning, "task_succeed", newStatus.TaskSucceed, "task_failed", newStatus.TaskFailed, "task_unknown", newStatus.TaskUnknown, "task_pending", newStatus.TaskPending, "last_error", newStatus.TaskLastErrorMessage)
if err := r.updateStatus(batchSbx, newStatus); err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,21 +72,58 @@ func (s *DefaultTaskSchedulingStrategy) getTaskSpec(idx int) (*api.Task, error)
if err = json.Unmarshal(modified, newTaskTemplate); err != nil {
return nil, fmt.Errorf("batchsandbox: failed to unmarshal %s to TaskTemplateSpec, idx %d, err %w", modified, idx, err)
}
task.Process = &api.Process{
Command: newTaskTemplate.Spec.Process.Command,
Args: newTaskTemplate.Spec.Process.Args,
Env: newTaskTemplate.Spec.Process.Env,
WorkingDir: newTaskTemplate.Spec.Process.WorkingDir,
TimeoutSeconds: s.Spec.TaskTemplate.Spec.TimeoutSeconds,
}
task.Process = convertProcessSpec(newTaskTemplate.Spec.Process, s.Spec.TaskTemplate.Spec.TimeoutSeconds)
} else if s.Spec.TaskTemplate != nil && s.Spec.TaskTemplate.Spec.Process != nil {
task.Process = &api.Process{
Command: s.Spec.TaskTemplate.Spec.Process.Command,
Args: s.Spec.TaskTemplate.Spec.Process.Args,
Env: s.Spec.TaskTemplate.Spec.Process.Env,
WorkingDir: s.Spec.TaskTemplate.Spec.Process.WorkingDir,
TimeoutSeconds: s.Spec.TaskTemplate.Spec.TimeoutSeconds,
}
task.Process = convertProcessSpec(s.Spec.TaskTemplate.Spec.Process, s.Spec.TaskTemplate.Spec.TimeoutSeconds)
}
return task, nil
}

// convertProcessSpec converts sandboxv1alpha1.ProcessTask to api.Process.
func convertProcessSpec(src *sandboxv1alpha1.ProcessTask, timeoutSeconds *int64) *api.Process {
if src == nil {
return nil
}
return &api.Process{
Command: src.Command,
Args: src.Args,
Env: src.Env,
WorkingDir: src.WorkingDir,
TimeoutSeconds: timeoutSeconds,
ExecMode: api.ExecMode(src.ExecMode),
Lifecycle: convertLifecycle(src.Lifecycle),
}
}

// convertLifecycle converts sandboxv1alpha1.ProcessLifecycle to api.ProcessLifecycle.
func convertLifecycle(src *sandboxv1alpha1.ProcessLifecycle) *api.ProcessLifecycle {
if src == nil {
return nil
}
return &api.ProcessLifecycle{
PreStart: convertLifecycleHandler(src.PreStart),
PostStop: convertLifecycleHandler(src.PostStop),
}
}

// convertLifecycleHandler converts sandboxv1alpha1.LifecycleHandler to api.LifecycleHandler.
func convertLifecycleHandler(src *sandboxv1alpha1.LifecycleHandler) *api.LifecycleHandler {
if src == nil {
return nil
}
return &api.LifecycleHandler{
Exec: convertExecAction(src.Exec),
ExecMode: api.ExecMode(src.ExecMode),
TimeoutSeconds: src.TimeoutSeconds,
}
}

// convertExecAction converts sandboxv1alpha1.ExecAction to api.ExecAction.
func convertExecAction(src *sandboxv1alpha1.ExecAction) *api.ExecAction {
if src == nil {
return nil
}
return &api.ExecAction{
Command: src.Command,
}
}
13 changes: 13 additions & 0 deletions kubernetes/internal/scheduler/default_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ func (t *taskNode) IsResourceReleased() bool {
return t.sState == stateReleased
}

// GetTerminatedMessage returns the failure message from the task status, including
// lifecycle hook stderr output. It is used by the controller to populate
// BatchSandboxStatus.TaskLastErrorMessage.
func (t *taskNode) GetTerminatedMessage() string {
if t.Status == nil {
return ""
}
if t.Status.ProcessStatus != nil && t.Status.ProcessStatus.Terminated != nil {
return t.Status.ProcessStatus.Terminated.Message
}
return ""
}

func (t *taskNode) isTaskCompleted() bool {
return t.tState == SucceedTaskState || t.tState == FailedTaskState
}
Expand Down
Loading
Loading