Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions containers/devops-agent-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,38 @@ ContainerCreating, Unschedulable 같은 일시적 상태는 설정된 대기 시
| `PROCESSED_TTL` | 중복 처리 방지 기간 | `1h` |
| `FAILURE_GRACE_PERIOD` | 타임아웃 대기 기간 | `3m` |
| `FAILURE_RECHECK_INTERVAL` | 타임아웃 재확인 간격 | `1m` |
| `WEBHOOK_SKIP_CATEGORIES` | 웹훅/S3/CloudWatch 출력을 건너뛸 감지 레이어 (쉼표 구분) | - |
| `WEBHOOK_MIN_SEVERITY` | 출력을 트리거할 최소 심각도 | - |

#### 출력 필터링

`WEBHOOK_SKIP_CATEGORIES`와 `WEBHOOK_MIN_SEVERITY`는 AND 조건으로 동작합니다. 두 조건을 모두 통과해야 CloudWatch Logs, S3, Webhook 출력이 실행됩니다. 미설정 시 모든 장애에 대해 출력이 실행됩니다.

**WEBHOOK_SKIP_CATEGORIES** — 특정 감지 레이어의 장애를 출력에서 제외합니다.

유효한 값: `ContainerWaiting`, `ContainerTerminated`, `PodPhase`, `PodStatus`, `PodCondition` ([감지 레이어 상세](docs/ARCHITECTURE.md#3단계-장애-감지---detectpodfailure))

```
# Layer 4, 5 장애는 출력하지 않음
WEBHOOK_SKIP_CATEGORIES=PodPhase,PodCondition
```

**WEBHOOK_MIN_SEVERITY** — 설정한 심각도 이상의 장애만 출력합니다.

유효한 값 (심각도가 높은 순): `CRITICAL`, `HIGH`, `MEDIUM`, `LOW` ([심각도별 장애 유형](docs/ARCHITECTURE.md#6단계-심각도-결정---determineseverity))

```
# HIGH 이상(CRITICAL, HIGH)만 출력
WEBHOOK_MIN_SEVERITY=HIGH
```

두 옵션을 조합하면 더 세밀하게 제어할 수 있습니다.

```
# PodPhase, PodCondition은 무조건 제외 + 나머지는 HIGH 이상만 출력
WEBHOOK_SKIP_CATEGORIES=PodPhase,PodCondition
WEBHOOK_MIN_SEVERITY=HIGH
```

## IAM 권한

Expand Down
10 changes: 10 additions & 0 deletions containers/devops-agent-operator/examples/04-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,13 @@ data:
WEBHOOK_TIMEOUT: "30s"
# Duration to prevent reprocessing the same failure
PROCESSED_TTL: "1h"
# Comma-separated list of failure categories that should NOT trigger any output
# (CloudWatch Logs upload, S3 upload, webhook call).
# Valid values: ContainerWaiting, ContainerTerminated, PodPhase, PodStatus, PodCondition
# Empty means all categories trigger outputs (default behavior).
# Example: "PodPhase,PodCondition"
WEBHOOK_SKIP_CATEGORIES: ""
# Minimum severity level required to trigger any output
# (CloudWatch Logs upload, S3 upload, webhook call).
# Valid values: LOW, MEDIUM, HIGH, CRITICAL
# Empty means all severities trigger outputs (default behavior).
# Example: "HIGH" → only HIGH and CRITICAL failures trigger outputs
WEBHOOK_MIN_SEVERITY: ""
54 changes: 54 additions & 0 deletions containers/devops-agent-operator/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ type Config struct {
// During the grace period, the operator requeues the pod at this interval to check
// whether the transient state has resolved or the grace period has elapsed.
FailureRecheckInterval time.Duration

// WebhookSkipCategories is a list of failure categories that should not trigger webhook calls.
// Valid values: ContainerWaiting, ContainerTerminated, PodPhase, PodStatus, PodCondition
// Empty means all categories trigger webhooks.
WebhookSkipCategories []string

// WebhookMinSeverity is the minimum severity level required to trigger a webhook call.
// Valid values: LOW, MEDIUM, HIGH, CRITICAL
// Empty means all severities trigger webhooks.
WebhookMinSeverity string
}

// DefaultConfig returns a Config with default values
Expand Down Expand Up @@ -183,6 +193,14 @@ func LoadFromEnv() *Config {
}
}

if v := os.Getenv("WEBHOOK_SKIP_CATEGORIES"); v != "" {
cfg.WebhookSkipCategories = splitAndTrim(v, ",")
}

if v := os.Getenv("WEBHOOK_MIN_SEVERITY"); v != "" {
cfg.WebhookMinSeverity = strings.ToUpper(strings.TrimSpace(v))
}

return cfg
}

Expand Down Expand Up @@ -221,6 +239,42 @@ func (c *Config) IsNamespaceWatched(namespace string) bool {
return false
}

// severityLevel ranks every known severity name for threshold comparison.
// The most severe name carries the smallest rank (CRITICAL is 0, LOW is 3),
// mirroring P-level conventions such as P0/P1.
var severityLevel = map[string]int{
	"LOW":      3,
	"MEDIUM":   2,
	"HIGH":     1,
	"CRITICAL": 0,
}

// ShouldSendWebhook reports whether a failure with the given category and
// severity passes the configured output filters.
//
// Both conditions must hold (AND logic):
//   - category is not listed in WebhookSkipCategories (exact match)
//   - severity is at least as severe as WebhookMinSeverity
//
// Severity names are compared case-insensitively and with surrounding
// whitespace ignored, so a Config populated by hand rather than by
// LoadFromEnv, or a caller passing "high", is filtered exactly like "HIGH".
//
// The severity filter fails open: when either WebhookMinSeverity or severity
// is not a key of severityLevel, the failure is not filtered out. When neither
// filter is configured the method always returns true.
func (c *Config) ShouldSendWebhook(category, severity string) bool {
	// Category filter: any listed category is dropped regardless of severity.
	for _, skipped := range c.WebhookSkipCategories {
		if skipped == category {
			return false
		}
	}

	// No severity threshold configured: nothing more to check.
	if c.WebhookMinSeverity == "" {
		return true
	}

	// severityLevel is keyed by upper-case names, so normalize both sides
	// before the lookup; otherwise a case mismatch would silently disable
	// the threshold.
	minLevel, minKnown := severityLevel[strings.ToUpper(strings.TrimSpace(c.WebhookMinSeverity))]
	curLevel, curKnown := severityLevel[strings.ToUpper(strings.TrimSpace(severity))]
	if !minKnown || !curKnown {
		// Unknown names cannot be ranked; let the failure through rather
		// than risk dropping it.
		return true
	}

	// Smaller level means more severe, e.g. WebhookMinSeverity=HIGH lets
	// only CRITICAL and HIGH pass.
	return curLevel <= minLevel
}

// splitAndTrim splits a string by separator and trims whitespace
func splitAndTrim(s, sep string) []string {
parts := strings.Split(s, sep)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,20 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
// Build collected data
data := r.buildCollectedData(ctx, &pod, failure)

// Evaluate filter once for all outputs
severity := collector.DetermineSeverity(failure)
shouldSend := r.Config.ShouldSendWebhook(failure.Category, severity)

if !shouldSend {
logger.Info("Outputs skipped by filter",
"pod", req.NamespacedName,
"category", failure.Category,
"severity", severity,
)
}

// Upload to CloudWatch Logs if configured (optional)
if r.CloudWatchClient != nil {
if r.CloudWatchClient != nil && shouldSend {
cwResult, err := r.CloudWatchClient.Upload(ctx, data)
if err != nil {
logger.Error(err, "Failed to upload data to CloudWatch Logs",
Expand All @@ -162,7 +174,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R

// Upload to S3 if configured (optional)
var s3URL string
if r.S3Client != nil {
if r.S3Client != nil && shouldSend {
uploadResult, err := r.S3Client.Upload(ctx, data)
if err != nil {
logger.Error(err, "Failed to upload data to S3",
Expand All @@ -178,12 +190,14 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
)
}

// Send to webhook (required)
if err := r.Webhook.Send(ctx, data, s3URL); err != nil {
logger.Error(err, "Failed to send webhook",
"pod", req.NamespacedName,
)
// Continue to mark as processed even if webhook fails
// Send to webhook if not filtered by category or severity
if shouldSend {
if err := r.Webhook.Send(ctx, data, s3URL); err != nil {
logger.Error(err, "Failed to send webhook",
"pod", req.NamespacedName,
)
// Continue to mark as processed even if webhook fails
}
}

// Mark as processed
Expand Down