Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/scheduler/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ scheduling:
maximumPerQueueSchedulingBurst: 1000
maxJobSchedulingContextsPerExecutor: 10000
maxRetries: 3
retryPolicy:
enabled: false
globalMaxRetries: 5
dominantResourceFairnessResourcesToConsider:
- "cpu"
- "memory"
Expand Down
13 changes: 13 additions & 0 deletions internal/common/errormatch/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Package errormatch provides types and functions for matching job failure
// signals: exit codes, termination messages, and Kubernetes conditions.
//
// [ExitCodeMatcher] supports In/NotIn set membership against container exit
// codes. Exit code 0 never matches. [RegexMatcher] holds a pattern string;
// callers compile it once at construction time and pass the resulting
// compiled regex to [MatchPattern].
//
// Pod-level condition constants ([ConditionOOMKilled], [ConditionEvicted],
// [ConditionDeadlineExceeded], [ConditionAppError]) and the [KnownConditions]
// map are provided for config validation. Non-pod conditions
// ([ConditionPreempted], [ConditionLeaseReturned]) are also defined here for
// use by the retry engine but are not included in [KnownConditions].
package errormatch
33 changes: 33 additions & 0 deletions internal/common/errormatch/match.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package errormatch

import "regexp"

// MatchExitCode reports whether exitCode satisfies matcher.
//
// A nil matcher never matches, and neither does exit code 0 (a container that
// exited successfully is not a failure). Otherwise the result depends on the
// operator: In matches when exitCode is listed in matcher.Values, NotIn
// matches when it is not listed, and any other operator never matches.
func MatchExitCode(matcher *ExitCodeMatcher, exitCode int32) bool {
	if matcher == nil || exitCode == 0 {
		return false
	}

	// Determine set membership once; the operator only decides how to read it.
	listed := false
	for _, candidate := range matcher.Values {
		if candidate == exitCode {
			listed = true
			break
		}
	}

	switch matcher.Operator {
	case ExitCodeOperatorIn:
		return listed
	case ExitCodeOperatorNotIn:
		return !listed
	default:
		return false
	}
}

// MatchPattern reports whether value matches the compiled regex re.
//
// Empty values never match, even against patterns such as ".*" that would
// accept the empty string. A nil re never matches either: it is treated as
// "no pattern configured" instead of panicking inside the regexp package,
// mirroring how MatchExitCode treats a nil matcher.
func MatchPattern(re *regexp.Regexp, value string) bool {
	if re == nil || value == "" {
		return false
	}
	return re.MatchString(value)
}
89 changes: 89 additions & 0 deletions internal/common/errormatch/match_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package errormatch

import (
"regexp"
"testing"

"github.com/stretchr/testify/assert"
)

// TestMatchExitCode runs MatchExitCode against a table of matcher/exit-code
// pairs covering both operators, the exit-code-0 rule, and a nil matcher.
func TestMatchExitCode(t *testing.T) {
	type scenario struct {
		matcher  *ExitCodeMatcher
		exitCode int32
		expected bool
	}

	// Small constructors keep the table rows short.
	in := func(codes ...int32) *ExitCodeMatcher {
		return &ExitCodeMatcher{Operator: ExitCodeOperatorIn, Values: codes}
	}
	notIn := func(codes ...int32) *ExitCodeMatcher {
		return &ExitCodeMatcher{Operator: ExitCodeOperatorNotIn, Values: codes}
	}

	scenarios := map[string]scenario{
		"In matches":                             {matcher: in(74, 75), exitCode: 74, expected: true},
		"In does not match":                      {matcher: in(74, 75), exitCode: 1, expected: false},
		"NotIn matches when code absent":         {matcher: notIn(1, 2), exitCode: 42, expected: true},
		"NotIn does not match when code present": {matcher: notIn(1, 2), exitCode: 1, expected: false},
		"exit code 0 never matches In":           {matcher: in(0), exitCode: 0, expected: false},
		"exit code 0 never matches NotIn":        {matcher: notIn(1), exitCode: 0, expected: false},
		"nil matcher returns false":              {matcher: nil, exitCode: 1, expected: false},
	}

	for label, sc := range scenarios {
		t.Run(label, func(t *testing.T) {
			got := MatchExitCode(sc.matcher, sc.exitCode)
			assert.Equal(t, sc.expected, got)
		})
	}
}

func TestMatchPattern(t *testing.T) {
tests := map[string]struct {
pattern string
value string
expected bool
}{
"matches": {
pattern: "(?i)cuda.*error",
value: "CUDA memory error on device 0",
expected: true,
},
"does not match": {
pattern: "(?i)cuda.*error",
value: "segfault",
expected: false,
},
"empty value never matches": {
pattern: ".*",
value: "",
expected: false,
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
re := regexp.MustCompile(tc.pattern)
assert.Equal(t, tc.expected, MatchPattern(re, tc.value))
})
}
}
44 changes: 44 additions & 0 deletions internal/common/errormatch/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package errormatch

// ExitCodeOperator is a set membership operator: In or NotIn.
// It is a string type, and ExitCodeMatcher carries it under the YAML key
// "operator". MatchExitCode treats any value other than the two constants
// below as never matching.
type ExitCodeOperator string

// Operators recognized by MatchExitCode.
const (
// ExitCodeOperatorIn matches when the exit code is one of the listed values.
ExitCodeOperatorIn ExitCodeOperator = "In"
// ExitCodeOperatorNotIn matches when the exit code is not one of the listed
// values (exit code 0 still never matches).
ExitCodeOperatorNotIn ExitCodeOperator = "NotIn"
)

// ExitCodeMatcher specifies an operator and a set of exit code values.
// It is evaluated by MatchExitCode; both fields carry YAML tags so the
// matcher can be populated from YAML.
type ExitCodeMatcher struct {
// Operator selects how Values is interpreted (In or NotIn).
Operator ExitCodeOperator `yaml:"operator"`
// Values is the list of exit codes the operator is applied to.
Values []int32 `yaml:"values"`
}

// RegexMatcher specifies a regex pattern as a string.
// The pattern is stored uncompiled; per the package documentation, callers
// compile it at construction time and hand the compiled regex to MatchPattern.
type RegexMatcher struct {
// Pattern is the regular expression source text, read from the YAML key
// "pattern".
Pattern string `yaml:"pattern"`
}

// Condition constants derived from KubernetesReason (pod-level conditions).
// These are untyped string constants; each one is also a key of
// KnownConditions.
const (
ConditionOOMKilled = "OOMKilled"
ConditionEvicted = "Evicted"
ConditionDeadlineExceeded = "DeadlineExceeded"
ConditionAppError = "AppError"
)

// Condition constants for non-pod error types (used by the retry engine).
// These are untyped string constants and are deliberately absent from
// KnownConditions.
const (
ConditionPreempted = "Preempted"
ConditionLeaseReturned = "LeaseReturned"
)

// KnownConditions is the set of pod-level condition strings accepted by the
// executor error categorizer. Non-pod conditions (Preempted, LeaseReturned)
// are excluded because they are not observable from Kubernetes pod status.
//
// Every listed key maps to true, so indexing the map with an unlisted string
// yields false and KnownConditions[s] can be used directly as a membership
// test.
var KnownConditions = map[string]bool{
ConditionOOMKilled: true,
ConditionEvicted: true,
ConditionDeadlineExceeded: true,
ConditionAppError: true,
}
Loading
Loading