diff --git a/containers/devops-agent-operator/README.md b/containers/devops-agent-operator/README.md index 828aaa0..9eee2e2 100644 --- a/containers/devops-agent-operator/README.md +++ b/containers/devops-agent-operator/README.md @@ -108,6 +108,38 @@ ContainerCreating, Unschedulable 같은 일시적 상태는 설정된 대기 시 | `PROCESSED_TTL` | 중복 처리 방지 기간 | `1h` | | `FAILURE_GRACE_PERIOD` | 타임아웃 대기 기간 | `3m` | | `FAILURE_RECHECK_INTERVAL` | 타임아웃 재확인 간격 | `1m` | +| `WEBHOOK_SKIP_CATEGORIES` | 웹훅/S3/CloudWatch 출력을 건너뛸 감지 레이어 (쉼표 구분) | - | +| `WEBHOOK_MIN_SEVERITY` | 출력을 트리거할 최소 심각도 | - | + +#### 출력 필터링 + +`WEBHOOK_SKIP_CATEGORIES`와 `WEBHOOK_MIN_SEVERITY`는 AND 조건으로 동작합니다. 두 조건을 모두 통과해야 CloudWatch Logs, S3, Webhook 출력이 실행됩니다. 미설정 시 모든 장애에 대해 출력이 실행됩니다. + +**WEBHOOK_SKIP_CATEGORIES** — 특정 감지 레이어의 장애를 출력에서 제외합니다. + +유효한 값: `ContainerWaiting`, `ContainerTerminated`, `PodPhase`, `PodStatus`, `PodCondition` ([감지 레이어 상세](docs/ARCHITECTURE.md#3단계-장애-감지---detectpodfailure)) + +``` +# Layer 4, 5 장애는 출력하지 않음 +WEBHOOK_SKIP_CATEGORIES=PodPhase,PodCondition +``` + +**WEBHOOK_MIN_SEVERITY** — 설정한 심각도 이상의 장애만 출력합니다. + +유효한 값 (심각도가 높은 순서대로 나열): `CRITICAL`, `HIGH`, `MEDIUM`, `LOW` ([심각도별 장애 유형](docs/ARCHITECTURE.md#6단계-심각도-결정---determineseverity)) + +``` +# HIGH 이상(CRITICAL, HIGH)만 출력 +WEBHOOK_MIN_SEVERITY=HIGH +``` + +두 옵션을 조합하면 더 세밀하게 제어할 수 있습니다. + +``` +# PodPhase, PodCondition은 무조건 제외 + 나머지는 HIGH 이상만 출력 +WEBHOOK_SKIP_CATEGORIES=PodPhase,PodCondition +WEBHOOK_MIN_SEVERITY=HIGH +``` ## IAM 권한 diff --git a/containers/devops-agent-operator/examples/04-configmap.yaml b/containers/devops-agent-operator/examples/04-configmap.yaml index c7af7e7..6fe679d 100644 --- a/containers/devops-agent-operator/examples/04-configmap.yaml +++ b/containers/devops-agent-operator/examples/04-configmap.yaml @@ -19,3 +19,13 @@ data: WEBHOOK_TIMEOUT: "30s" # Duration to prevent reprocessing the same failure PROCESSED_TTL: "1h" + # Comma-separated list of failure categories that should NOT trigger any output (CloudWatch Logs, S3, webhook). 
+ # Valid values: ContainerWaiting, ContainerTerminated, PodPhase, PodStatus, PodCondition + # Empty means all categories trigger outputs (default behavior). + # Example: "PodPhase,PodCondition" + WEBHOOK_SKIP_CATEGORIES: "" + # Minimum severity level required to trigger any output (CloudWatch Logs, S3, webhook). + # Valid values: LOW, MEDIUM, HIGH, CRITICAL + # Empty means all severities trigger outputs (default behavior). + # Example: "HIGH" → only HIGH and CRITICAL failures trigger outputs + WEBHOOK_MIN_SEVERITY: "" diff --git a/containers/devops-agent-operator/internal/config/config.go b/containers/devops-agent-operator/internal/config/config.go index af65454..4b19692 100644 --- a/containers/devops-agent-operator/internal/config/config.go +++ b/containers/devops-agent-operator/internal/config/config.go @@ -80,6 +80,16 @@ type Config struct { // During the grace period, the operator requeues the pod at this interval to check // whether the transient state has resolved or the grace period has elapsed. FailureRecheckInterval time.Duration + + // WebhookSkipCategories is a list of failure categories that should not trigger any output + // (CloudWatch Logs, S3, webhook). + // Valid values: ContainerWaiting, ContainerTerminated, PodPhase, PodStatus, PodCondition + // Empty means all categories trigger outputs. + WebhookSkipCategories []string + + // WebhookMinSeverity is the minimum severity level required to trigger any output + // (CloudWatch Logs, S3, webhook). + // Valid values: LOW, MEDIUM, HIGH, CRITICAL + // Empty means all severities trigger outputs. 
+ WebhookMinSeverity string } // DefaultConfig returns a Config with default values @@ -183,6 +193,14 @@ func LoadFromEnv() *Config { } } + if v := os.Getenv("WEBHOOK_SKIP_CATEGORIES"); v != "" { + cfg.WebhookSkipCategories = splitAndTrim(v, ",") + } + + if v := os.Getenv("WEBHOOK_MIN_SEVERITY"); v != "" { + cfg.WebhookMinSeverity = strings.ToUpper(strings.TrimSpace(v)) + } + return cfg } @@ -221,6 +239,42 @@ func (c *Config) IsNamespaceWatched(namespace string) bool { return false } +// severityLevel defines the numeric level of each severity for threshold comparison. +// Lower number = higher severity, consistent with P-level conventions (P0/P1/...). +var severityLevel = map[string]int{ + "CRITICAL": 0, + "HIGH": 1, + "MEDIUM": 2, + "LOW": 3, +} + +// ShouldSendWebhook returns true if the failure should trigger a webhook call. +// Both conditions must pass (AND logic): +// - The failure category must not be in WebhookSkipCategories +// - The failure severity must meet or exceed WebhookMinSeverity +// +// If neither filter is configured, always returns true (default behavior preserved). +func (c *Config) ShouldSendWebhook(category, severity string) bool { + // Category filter: skip if category is in the skip list + for _, cat := range c.WebhookSkipCategories { + if cat == category { + return false + } + } + + // Severity filter: skip if severity is below the minimum threshold + // e.g. 
WebhookMinSeverity=HIGH → only CRITICAL and HIGH pass + if c.WebhookMinSeverity != "" { + minLevel, minKnown := severityLevel[c.WebhookMinSeverity] + curLevel, curKnown := severityLevel[severity] + if minKnown && curKnown && curLevel > minLevel { + return false + } + } + + return true +} + // splitAndTrim splits a string by separator and trims whitespace func splitAndTrim(s, sep string) []string { parts := strings.Split(s, sep) diff --git a/containers/devops-agent-operator/internal/controller/pod_controller.go b/containers/devops-agent-operator/internal/controller/pod_controller.go index 7ecd436..e096a86 100644 --- a/containers/devops-agent-operator/internal/controller/pod_controller.go +++ b/containers/devops-agent-operator/internal/controller/pod_controller.go @@ -144,8 +144,20 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R // Build collected data data := r.buildCollectedData(ctx, &pod, failure) + // Evaluate filter once for all outputs + severity := collector.DetermineSeverity(failure) + shouldSend := r.Config.ShouldSendWebhook(failure.Category, severity) + + if !shouldSend { + logger.Info("Outputs skipped by filter", + "pod", req.NamespacedName, + "category", failure.Category, + "severity", severity, + ) + } + // Upload to CloudWatch Logs if configured (optional) - if r.CloudWatchClient != nil { + if r.CloudWatchClient != nil && shouldSend { cwResult, err := r.CloudWatchClient.Upload(ctx, data) if err != nil { logger.Error(err, "Failed to upload data to CloudWatch Logs", @@ -162,7 +174,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R // Upload to S3 if configured (optional) var s3URL string - if r.S3Client != nil { + if r.S3Client != nil && shouldSend { uploadResult, err := r.S3Client.Upload(ctx, data) if err != nil { logger.Error(err, "Failed to upload data to S3", @@ -178,12 +190,14 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R ) } - // Send to 
webhook (required) - if err := r.Webhook.Send(ctx, data, s3URL); err != nil { - logger.Error(err, "Failed to send webhook", - "pod", req.NamespacedName, - ) - // Continue to mark as processed even if webhook fails + // Send to webhook if not filtered by category or severity + if shouldSend { + if err := r.Webhook.Send(ctx, data, s3URL); err != nil { + logger.Error(err, "Failed to send webhook", + "pod", req.NamespacedName, + ) + // Continue to mark as processed even if webhook fails + } } // Mark as processed