Review note: line breaks restored; only the import-alias hunk is kept.
The AddPod hunk (== flipped to != on the GroupNameLabel check) and the
ReconcilePods hunk (IsRetryableExitCode negated under RestartPolicyExitCode)
inverted existing conditions and were dropped, so the pre-image logic stays:
orphaned pods of this controller's own group are the ones logged, and failed
pods with a retryable exit code are the ones deleted for restart.
The post-image blob hash on the index line is carried over from the original
patch and no longer matches the reduced change.

diff --git a/pkg/controller.v1/common/pod.go b/pkg/controller.v1/common/pod.go
index 640f99be..173245a0 100644
--- a/pkg/controller.v1/common/pod.go
+++ b/pkg/controller.v1/common/pod.go
@@ -24,7 +24,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 	log "github.com/sirupsen/logrus"
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"