Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions _local/executor/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,12 @@ metric:
application:
clusterId: "local-cluster"
pool: "default"
errorCategories:
- name: oom
rules:
- onConditions: ["OOMKilled"]
- name: user_error
rules:
- onExitCodes:
operator: In
values: [1, 2, 126, 127]
2 changes: 2 additions & 0 deletions client/rust/src/gen/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,8 @@ pub struct JobFailedEvent {
pub container_statuses: ::prost::alloc::vec::Vec<ContainerStatus>,
#[prost(enumeration = "Cause", tag = "12")]
pub cause: i32,
#[prost(string, repeated, tag = "15")]
pub categories: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct JobPreemptingEvent {
Expand Down
30 changes: 30 additions & 0 deletions internal/common/ingest/testfixtures/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,36 @@ var JobRunFailed = &armadaevents.EventSequence_Event{
},
}

// JobRunFailedWithFailureInfo is a JobRunErrors event fixture holding a single
// terminal error. The error carries both a PodError reason and a FailureInfo
// payload (exit code, termination message and categories).
var JobRunFailedWithFailureInfo = &armadaevents.EventSequence_Event{
Created: testfixtures.BasetimeProto,
Event: &armadaevents.EventSequence_Event_JobRunErrors{
JobRunErrors: &armadaevents.JobRunErrors{
JobId: JobId,
RunId: RunId,
Errors: []*armadaevents.Error{
{
Terminal: true,
Reason: &armadaevents.Error_PodError{
PodError: &armadaevents.PodError{
Message: ErrMsg,
DebugMessage: DebugMsg,
NodeName: NodeName,
ContainerErrors: []*armadaevents.ContainerError{
{ExitCode: ExitCode},
},
},
},
// FailureInfo reuses the same ExitCode as the container error above and
// adds a termination message plus two category labels.
FailureInfo: &armadaevents.FailureInfo{
ExitCode: ExitCode,
TerminationMessage: "OOM killed by kernel",
Categories: []string{"RESOURCE_LIMIT", "MEMORY"},
},
},
},
},
},
}

var JobPreempted = &armadaevents.EventSequence_Event{
Created: testfixtures.BasetimeProto,
Event: &armadaevents.EventSequence_Event_JobErrors{
Expand Down
37 changes: 37 additions & 0 deletions internal/lookout/conversions/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func ToSwaggerRun(run *model.Run) *models.Run {
RunID: run.RunId,
Started: PostgreSQLTimeToSwaggerTime(run.Started),
IngressAddresses: ingressAddressesToSwagger(run.IngressAddresses),
FailureInfo: failureInfoToSwagger(run.FailureInfo),
}
}

Expand Down Expand Up @@ -122,6 +123,42 @@ func PostgreSQLTimeToSwaggerTime(t *model.PostgreSQLTime) *strfmt.DateTime {
return &s
}

func failureInfoToSwagger(failureInfo map[string]any) *models.RunFailureInfo {
if len(failureInfo) == 0 {
return nil
}

result := &models.RunFailureInfo{}
populated := false
// After JSON round-trip through PostgreSQL's json_agg, Go's json.Unmarshal
// produces float64 for numbers and []interface{} for arrays in map[string]any.
if v, ok := failureInfo["exitCode"].(float64); ok {
result.ExitCode = int32(v)
populated = true
}
if msg, ok := failureInfo["terminationMessage"].(string); ok {
result.TerminationMessage = msg
populated = true
}
if cats, ok := failureInfo["categories"].([]interface{}); ok {
result.Categories = make([]string, 0, len(cats))
populated = true
for _, c := range cats {
if s, ok := c.(string); ok {
result.Categories = append(result.Categories, s)
}
}
}
if name, ok := failureInfo["containerName"].(string); ok {
result.ContainerName = name
populated = true
}
if !populated {
return nil
}
return result
Comment thread
dejanzele marked this conversation as resolved.
}

func ingressAddressesToSwagger(addresses map[int32]string) map[string]string {
if len(addresses) == 0 {
return nil
Expand Down
113 changes: 112 additions & 1 deletion internal/lookout/gen/models/run.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

69 changes: 69 additions & 0 deletions internal/lookout/gen/restapi/embedded_spec.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions internal/lookout/model/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ type Run struct {
RunId string
Started *PostgreSQLTime
IngressAddresses map[int32]string
FailureInfo map[string]any
}

type JobGroup struct {
Expand Down
3 changes: 2 additions & 1 deletion internal/lookout/repository/querybuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ CROSS JOIN LATERAL (
'exitCode', exit_code,
'pool', pool,
-- 'pool', NULLIF(pool, ''),
'ingressAddresses', ingress_addresses
'ingressAddresses', ingress_addresses,
'failureInfo', failure_info
)
)
ORDER BY COALESCE(leased, pending)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Add a nullable JSONB column failure_info to job_run (no default, no NOT NULL constraint).
ALTER TABLE job_run ADD COLUMN failure_info JSONB;
16 changes: 16 additions & 0 deletions internal/lookout/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,22 @@ definitions:
type: object
additionalProperties:
type: string
failureInfo:
type: object
x-nullable: true
properties:
exitCode:
type: integer
format: int32
categories:
type: array
x-omitempty: true
items:
type: string
terminationMessage:
type: string
containerName:
type: string
group:
type: object
required:
Expand Down
Loading
Loading