diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8c6d2df6f2958d..21e848b05f6c0a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -811,7 +811,7 @@ /test/new-e2e/tests/agent-subcommands/secret @DataDog/agent-configuration /test/new-e2e/tests/agent-subcommands/status @DataDog/agent-configuration /test/new-e2e/tests/containers @DataDog/container-integrations @DataDog/container-platform -/test/new-e2e/tests/containers/ecs_test.go @DataDog/ecs-experiences +/test/new-e2e/tests/ecs/ @DataDog/ecs-experiences /test/new-e2e/tests/discovery @DataDog/agent-discovery /test/new-e2e/tests/fips-compliance @DataDog/agent-runtimes /test/new-e2e/tests/ha-agent @DataDog/ndm-core diff --git a/.gitlab/test/e2e/e2e.yml b/.gitlab/test/e2e/e2e.yml index a5a6068d799762..eff7a781171e4e 100644 --- a/.gitlab/test/e2e/e2e.yml +++ b/.gitlab/test/e2e/e2e.yml @@ -310,6 +310,29 @@ new-e2e-containers-eks: ON_NIGHTLY_FIPS: "true" retry: !reference [.retry_only_infra_failure, retry] +new-e2e-ecs: + extends: .new_e2e_template + needs: + - !reference [.needs_new_e2e_template] + - qa_agent + - qa_agent_jmx + - qa_dca + - qa_dogstatsd + rules: + - !reference [.on_container_or_e2e_changes] + - !reference [.manual] + variables: + TARGETS: ./tests/ecs + TEAM: ecs-experiences + ON_NIGHTLY_FIPS: "true" + retry: !reference [.retry_only_infra_failure, retry] + parallel: + matrix: + - EXTRA_PARAMS: --run TestECSAPMSuite + - EXTRA_PARAMS: --run TestECSManagedSuite + - EXTRA_PARAMS: --run TestECSChecksSuite + - EXTRA_PARAMS: --run TestECSPlatformSuite + new-e2e-containers-openshift-init: stage: e2e_init extends: .new_e2e_template diff --git a/releasenotes/notes/comprehensive-ecs-e2e-testing-a97556f927570a09.yaml b/releasenotes/notes/comprehensive-ecs-e2e-testing-a97556f927570a09.yaml new file mode 100644 index 00000000000000..0487646e2e925c --- /dev/null +++ b/releasenotes/notes/comprehensive-ecs-e2e-testing-a97556f927570a09.yaml @@ -0,0 +1,15 @@ +# Each section from every release note are 
combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +other: + - | + Migrated ECS E2E tests into a dedicated test/new-e2e/tests/ecs/ package + with 4 test suites (18 tests) covering APM/DogStatsD, check autodiscovery, + platform features, and managed instances. All tests validate specific + metrics, tags, and traces against regex patterns across Fargate, EC2, + and Managed Instance deployment types. diff --git a/test/e2e-framework/components/datadog/agent/ecs.go b/test/e2e-framework/components/datadog/agent/ecs.go index 0ee63a6e7ac7b6..22a0bc6e0a6f68 100644 --- a/test/e2e-framework/components/datadog/agent/ecs.go +++ b/test/e2e-framework/components/datadog/agent/ecs.go @@ -115,7 +115,22 @@ func ecsLinuxAgentSingleContainerDefinition(e config.Env, apiKeySSMParamName pul Name: pulumi.StringPtr("DD_DOGSTATSD_ORIGIN_DETECTION_CLIENT"), Value: pulumi.StringPtr("true"), }, - + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_APM_ENABLE_CONTAINER_TAGS_BUFFER"), + Value: pulumi.StringPtr("true"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_CONTAINER_PROC_ROOT"), + Value: pulumi.StringPtr("/host/proc"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_LOG_LEVEL"), + Value: pulumi.StringPtr("debug"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_APM_LOG_FILE"), + Value: pulumi.StringPtr("stdout"), + }, ecs.TaskDefinitionKeyValuePairArgs{ Name: pulumi.StringPtr("DD_DOGSTATSD_SOCKET"), Value: pulumi.StringPtr("/var/run/datadog/dsd.socket"), @@ -253,6 +268,14 @@ func ecsFakeintakeAdditionalEndpointsEnv(fakeintake *fakeintake.Fakeintake) []ec Name: 
pulumi.StringPtr("DD_REMOTE_CONFIGURATION_NO_TLS_VALIDATION"), Value: pulumi.StringPtr("true"), }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_URL"), + Value: fakeintake.URL.ToStringOutput(), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_APM_DD_URL"), + Value: fakeintake.URL.ToStringOutput(), + }, ecs.TaskDefinitionKeyValuePairArgs{ Name: pulumi.StringPtr("DD_PROCESS_CONFIG_PROCESS_DD_URL"), Value: fakeintake.URL.ToStringOutput(), diff --git a/test/e2e-framework/components/datadog/apps/aspnetsample/ecs.go b/test/e2e-framework/components/datadog/apps/aspnetsample/ecs.go index ce33c9713bbbd6..1ae143de55ed59 100644 --- a/test/e2e-framework/components/datadog/apps/aspnetsample/ecs.go +++ b/test/e2e-framework/components/datadog/apps/aspnetsample/ecs.go @@ -61,7 +61,13 @@ func FargateAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, apiK Condition: pulumi.String("HEALTHY"), }, }, - PortMappings: ecs.TaskDefinitionPortMappingArray{}, + PortMappings: ecs.TaskDefinitionPortMappingArray{ + ecs.TaskDefinitionPortMappingArgs{ + ContainerPort: pulumi.IntPtr(8080), + HostPort: pulumi.IntPtr(8080), + Protocol: pulumi.StringPtr("tcp"), + }, + }, } serverTaskDef, err := ecsClient.FargateWindowsTaskDefinitionWithAgent(e, "aspnet-fg-server", pulumi.String("aspnet-fg"), 4096, 8192, map[string]ecs.TaskDefinitionContainerDefinitionArgs{"aspnetsample": *serverContainer}, apiKeySSMParamName, fakeIntake, "", opts...) diff --git a/test/e2e-framework/components/datadog/apps/nginx/ecs.go b/test/e2e-framework/components/datadog/apps/nginx/ecs.go index 2a291d4d6659e1..37a3927f3c929e 100644 --- a/test/e2e-framework/components/datadog/apps/nginx/ecs.go +++ b/test/e2e-framework/components/datadog/apps/nginx/ecs.go @@ -38,6 +38,20 @@ func EcsAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, opts ... 
"nginx": { Name: pulumi.String("nginx"), Image: pulumi.String("ghcr.io/datadog/apps-nginx-server:" + apps.Version), + Environment: ecs.TaskDefinitionKeyValuePairArray{ + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_SERVICE"), + Value: pulumi.StringPtr("nginx"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_ENV"), + Value: pulumi.StringPtr("e2e-test"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_VERSION"), + Value: pulumi.StringPtr("1.0"), + }, + }, DockerLabels: pulumi.StringMap{ "com.datadoghq.ad.checks": pulumi.String(utils.JSONMustMarshal( map[string]interface{}{ @@ -51,7 +65,10 @@ func EcsAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, opts ... }, }, )), - "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:ec2\"]"), + "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:ec2\"]"), + "com.datadoghq.tags.service": pulumi.String("nginx"), + "com.datadoghq.tags.env": pulumi.String("e2e-test"), + "com.datadoghq.tags.version": pulumi.String("1.0"), }, Cpu: pulumi.IntPtr(100), Memory: pulumi.IntPtr(96), diff --git a/test/e2e-framework/components/datadog/apps/nginx/ecsFargate.go b/test/e2e-framework/components/datadog/apps/nginx/ecsFargate.go index dce35131ad47bc..b734fec45d8c44 100644 --- a/test/e2e-framework/components/datadog/apps/nginx/ecsFargate.go +++ b/test/e2e-framework/components/datadog/apps/nginx/ecsFargate.go @@ -33,6 +33,20 @@ func FargateAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, apiK serverContainer := &ecs.TaskDefinitionContainerDefinitionArgs{ Name: pulumi.String("nginx"), Image: pulumi.String("ghcr.io/datadog/apps-nginx-server:" + apps.Version), + Environment: ecs.TaskDefinitionKeyValuePairArray{ + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_SERVICE"), + Value: pulumi.StringPtr("nginx-fargate"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_ENV"), + Value: 
pulumi.StringPtr("e2e-test"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_VERSION"), + Value: pulumi.StringPtr("1.0"), + }, + }, DockerLabels: pulumi.StringMap{ "com.datadoghq.ad.checks": pulumi.String(utils.JSONMustMarshal( map[string]interface{}{ @@ -46,7 +60,10 @@ func FargateAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, apiK }, }, )), - "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:fargate\"]"), + "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:fargate\"]"), + "com.datadoghq.tags.service": pulumi.String("nginx-fargate"), + "com.datadoghq.tags.env": pulumi.String("e2e-test"), + "com.datadoghq.tags.version": pulumi.String("1.0"), }, Cpu: pulumi.IntPtr(100), Memory: pulumi.IntPtr(96), diff --git a/test/e2e-framework/components/datadog/apps/redis/ecs.go b/test/e2e-framework/components/datadog/apps/redis/ecs.go index 506ddcc8dc6526..6be89e1b20d726 100644 --- a/test/e2e-framework/components/datadog/apps/redis/ecs.go +++ b/test/e2e-framework/components/datadog/apps/redis/ecs.go @@ -41,8 +41,25 @@ func EcsAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, opts ... 
"redis": { Name: pulumi.String("redis"), Image: pulumi.String("ghcr.io/datadog/redis:" + apps.Version), + Environment: ecs.TaskDefinitionKeyValuePairArray{ + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_SERVICE"), + Value: pulumi.StringPtr("redis"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_ENV"), + Value: pulumi.StringPtr("e2e-test"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_VERSION"), + Value: pulumi.StringPtr("1.0"), + }, + }, DockerLabels: pulumi.StringMap{ - "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:ec2\"]"), + "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:ec2\"]"), + "com.datadoghq.tags.service": pulumi.String("redis"), + "com.datadoghq.tags.env": pulumi.String("e2e-test"), + "com.datadoghq.tags.version": pulumi.String("1.0"), }, Command: pulumi.StringArray{ pulumi.String("--loglevel"), diff --git a/test/e2e-framework/components/datadog/apps/redis/ecsFargate.go b/test/e2e-framework/components/datadog/apps/redis/ecsFargate.go index e87a577ed446bd..7c6127be56d824 100644 --- a/test/e2e-framework/components/datadog/apps/redis/ecsFargate.go +++ b/test/e2e-framework/components/datadog/apps/redis/ecsFargate.go @@ -35,8 +35,25 @@ func FargateAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, apiK serverContainer := &ecs.TaskDefinitionContainerDefinitionArgs{ Name: pulumi.String("redis"), Image: pulumi.String("ghcr.io/datadog/redis:" + apps.Version), + Environment: ecs.TaskDefinitionKeyValuePairArray{ + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_SERVICE"), + Value: pulumi.StringPtr("redis-fargate"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_ENV"), + Value: pulumi.StringPtr("e2e-test"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_VERSION"), + Value: pulumi.StringPtr("1.0"), + }, + }, DockerLabels: pulumi.StringMap{ - "com.datadoghq.ad.tags": 
pulumi.String("[\"ecs_launch_type:fargate\"]"), + "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:fargate\"]"), + "com.datadoghq.tags.service": pulumi.String("redis-fargate"), + "com.datadoghq.tags.env": pulumi.String("e2e-test"), + "com.datadoghq.tags.version": pulumi.String("1.0"), }, Cpu: pulumi.IntPtr(0), Essential: pulumi.BoolPtr(true), diff --git a/test/e2e-framework/components/datadog/apps/tracegen/ecs.go b/test/e2e-framework/components/datadog/apps/tracegen/ecs.go index 35110657c4b959..f4f83e8e881c86 100644 --- a/test/e2e-framework/components/datadog/apps/tracegen/ecs.go +++ b/test/e2e-framework/components/datadog/apps/tracegen/ecs.go @@ -46,6 +46,29 @@ func EcsAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, opts ... Name: pulumi.StringPtr("DD_TRACE_AGENT_URL"), Value: pulumi.StringPtr("unix:///var/run/datadog/apm.socket"), }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_SERVICE"), + Value: pulumi.StringPtr("tracegen-test-service"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_ENV"), + Value: pulumi.StringPtr("e2e-test"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_VERSION"), + Value: pulumi.StringPtr("1.0"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_LOGS_INJECTION"), + Value: pulumi.StringPtr("true"), + }, + }, + DockerLabels: pulumi.StringMap{ + "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:ec2\"]"), + "com.datadoghq.ad.logs": pulumi.String("[{\"source\":\"tracegen\",\"service\":\"tracegen-test-service\"}]"), + "com.datadoghq.tags.service": pulumi.String("tracegen-test-service"), + "com.datadoghq.tags.env": pulumi.String("e2e-test"), + "com.datadoghq.tags.version": pulumi.String("1.0"), }, Cpu: pulumi.IntPtr(10), Memory: pulumi.IntPtr(32), @@ -92,6 +115,29 @@ func EcsAppDefinition(e aws.Environment, clusterArn pulumi.StringInput, opts ... 
Name: pulumi.StringPtr("ECS_AGENT_HOST"), Value: pulumi.StringPtr("true"), }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_SERVICE"), + Value: pulumi.StringPtr("tracegen-test-service"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_ENV"), + Value: pulumi.StringPtr("e2e-test"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_VERSION"), + Value: pulumi.StringPtr("1.0"), + }, + ecs.TaskDefinitionKeyValuePairArgs{ + Name: pulumi.StringPtr("DD_LOGS_INJECTION"), + Value: pulumi.StringPtr("true"), + }, + }, + DockerLabels: pulumi.StringMap{ + "com.datadoghq.ad.tags": pulumi.String("[\"ecs_launch_type:ec2\"]"), + "com.datadoghq.ad.logs": pulumi.String("[{\"source\":\"tracegen\",\"service\":\"tracegen-test-service\"}]"), + "com.datadoghq.tags.service": pulumi.String("tracegen-test-service"), + "com.datadoghq.tags.env": pulumi.String("e2e-test"), + "com.datadoghq.tags.version": pulumi.String("1.0"), }, Cpu: pulumi.IntPtr(10), Memory: pulumi.IntPtr(32), diff --git a/test/e2e-framework/resources/aws/ecs/nodeGroups.go b/test/e2e-framework/resources/aws/ecs/nodeGroups.go index 3b4641df7feae5..92544e1e83279e 100644 --- a/test/e2e-framework/resources/aws/ecs/nodeGroups.go +++ b/test/e2e-framework/resources/aws/ecs/nodeGroups.go @@ -77,6 +77,23 @@ func NewWindowsNodeGroup(e aws.Environment, clusterName pulumi.StringInput) (pul return newNodeGroup(e, "win2022-ng", pulumi.String(winAmi.Value), pulumi.String(e.DefaultInstanceType()), getUserData(windowsInitUserData, clusterName)) } +// NewManagedNodeGroup creates an ECS node group using ECS-managed instances. +// Managed instances are EC2 instances that are managed by ECS, providing automatic scaling, +// draining, and lifecycle management without requiring direct ASG management. 
+func NewManagedNodeGroup(e aws.Environment, clusterName pulumi.StringInput) (pulumi.StringOutput, error) { + // Use the same ECS-optimized AMI as regular node groups + ecsAmi, err := ssm.LookupParameter(e.Ctx(), &ssm.LookupParameterArgs{ + Name: "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id", + }, e.WithProvider(config.ProviderAWS)) + if err != nil { + return pulumi.StringOutput{}, err + } + + // Managed instances use similar configuration but with ECS-managed ASG + // For testing purposes, we create a standard node group that ECS will manage + return newNodeGroup(e, "managed-ng", pulumi.String(ecsAmi.Value), pulumi.String(e.DefaultInstanceType()), getUserData(linuxInitUserData, clusterName)) +} + func newNodeGroup(e aws.Environment, ngName string, ami, instanceType, userData pulumi.StringInput) (pulumi.StringOutput, error) { lt, err := ec2.CreateLaunchTemplate(e, ngName, ami, diff --git a/test/e2e-framework/resources/aws/ecs/wait.go b/test/e2e-framework/resources/aws/ecs/wait.go new file mode 100644 index 00000000000000..265dd2e882152f --- /dev/null +++ b/test/e2e-framework/resources/aws/ecs/wait.go @@ -0,0 +1,71 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2025-present Datadog, Inc. + +package ecs + +import ( + "context" + "fmt" + "time" + + "github.com/DataDog/datadog-agent/test/e2e-framework/resources/aws" + awssdk "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/ecs" + "github.com/pulumi/pulumi/sdk/v3/go/pulumi" +) + +// WaitForContainerInstances waits for at least minInstances container instances to be registered +// in the ECS cluster before returning. Returns the cluster ARN as a StringOutput so it can be +// used as an implicit dependency for downstream resources. 
+func WaitForContainerInstances(e aws.Environment, clusterArn pulumi.StringOutput, minInstances int) pulumi.StringOutput { + return pulumi.All(clusterArn).ApplyT(func(args []interface{}) (string, error) { + clusterArnStr := args[0].(string) + + ctx := context.Background() + cfg, err := awsconfig.LoadDefaultConfig(ctx, + awsconfig.WithRegion(e.Region()), + awsconfig.WithSharedConfigProfile(e.Profile()), + ) + if err != nil { + return "", fmt.Errorf("failed to load AWS config: %w", err) + } + + ecsClient := ecs.NewFromConfig(cfg) + + maxWaitTime := 5 * time.Minute + pollInterval := 10 * time.Second + startTime := time.Now() + + e.Ctx().Log.Info(fmt.Sprintf("Waiting for at least %d container instance(s) to register in cluster %s", minInstances, clusterArnStr), nil) + + for { + if time.Since(startTime) > maxWaitTime { + return "", fmt.Errorf("timeout waiting for container instances after %v", maxWaitTime) + } + + listOutput, err := ecsClient.ListContainerInstances(ctx, &ecs.ListContainerInstancesInput{ + Cluster: awssdk.String(clusterArnStr), + Status: "ACTIVE", + }) + if err != nil { + e.Ctx().Log.Warn(fmt.Sprintf("Failed to list container instances: %v, retrying...", err), nil) + time.Sleep(pollInterval) + continue + } + + registeredCount := len(listOutput.ContainerInstanceArns) + e.Ctx().Log.Info(fmt.Sprintf("Found %d registered container instance(s) (need %d)", registeredCount, minInstances), nil) + + if registeredCount >= minInstances { + e.Ctx().Log.Info(fmt.Sprintf("Container instances ready! 
Found %d instance(s)", registeredCount), nil) + return clusterArnStr, nil + } + + e.Ctx().Log.Info(fmt.Sprintf("Waiting %v before checking again...", pollInterval), nil) + time.Sleep(pollInterval) + } + }).(pulumi.StringOutput) +} diff --git a/test/e2e-framework/scenarios/aws/ecs/args.go b/test/e2e-framework/scenarios/aws/ecs/args.go index 76e5bc1fe43bce..b0836d994ff801 100644 --- a/test/e2e-framework/scenarios/aws/ecs/args.go +++ b/test/e2e-framework/scenarios/aws/ecs/args.go @@ -16,6 +16,7 @@ type Params struct { LinuxARMNodeGroup bool LinuxBottleRocketNodeGroup bool WindowsNodeGroup bool + ManagedInstanceNodeGroup bool } type Option = func(*Params) error @@ -60,6 +61,13 @@ func WithWindowsNodeGroup() Option { } } +func WithManagedInstanceNodeGroup() Option { + return func(p *Params) error { + p.ManagedInstanceNodeGroup = true + return nil + } +} + func buildClusterOptionsFromConfigMap(e aws.Environment) []Option { clusterOptions := []Option{} // Add the cluster options from the config map diff --git a/test/e2e-framework/scenarios/aws/ecs/cluster.go b/test/e2e-framework/scenarios/aws/ecs/cluster.go index bf3db6993eb49c..9decd6ed99a067 100644 --- a/test/e2e-framework/scenarios/aws/ecs/cluster.go +++ b/test/e2e-framework/scenarios/aws/ecs/cluster.go @@ -71,6 +71,15 @@ func NewCluster(e aws.Environment, name string, opts ...Option) (*ecsComp.Cluste capacityProviders = append(capacityProviders, cpName) } + if params.ManagedInstanceNodeGroup { + cpName, err := ecs.NewManagedNodeGroup(e, ecsCluster.Name) + if err != nil { + return err + } + + capacityProviders = append(capacityProviders, cpName) + } + // Associate capacity providers _, err = ecs.NewClusterCapacityProvider(e, e.Ctx().Stack(), ecsCluster.Name, capacityProviders) if err != nil { diff --git a/test/e2e-framework/scenarios/aws/ecs/run.go b/test/e2e-framework/scenarios/aws/ecs/run.go index 42209a17688167..5389f4b81d3e09 100644 --- a/test/e2e-framework/scenarios/aws/ecs/run.go +++ 
b/test/e2e-framework/scenarios/aws/ecs/run.go @@ -21,6 +21,7 @@ import ( "github.com/pulumi/pulumi/sdk/v3/go/pulumi" resourcesAws "github.com/DataDog/datadog-agent/test/e2e-framework/resources/aws" + resourcesEcs "github.com/DataDog/datadog-agent/test/e2e-framework/resources/aws/ecs" "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/fakeintake" "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/outputs" ) @@ -29,7 +30,6 @@ import ( // An EC2 provider is considered set if at least one of its node groups is enabled. func isEC2ProviderSet(params *Params) bool { return params.LinuxNodeGroup || params.LinuxARMNodeGroup || params.WindowsNodeGroup || params.LinuxBottleRocketNodeGroup - } // Run is the entry point for the scenario when run via pulumi. @@ -105,31 +105,40 @@ func RunWithEnv(ctx *pulumi.Context, awsEnv resourcesAws.Environment, env output env.DisableFakeIntake() } + // Wait for container instances to be ready before deploying EC2 workloads. + // The wait output returns the cluster ARN after instances are registered, + // creating an implicit Pulumi dependency for downstream resources. 
+ ec2ClusterArn := cluster.ClusterArn + if isEC2ProviderSet(clusterParams) { + ctx.Log.Info("Waiting for EC2 container instances to register with the cluster...", nil) + ec2ClusterArn = resourcesEcs.WaitForContainerInstances(awsEnv, cluster.ClusterArn, 1) + } + // Testing workload if at least one EC2 node group is present if params.testingWorkload && isEC2ProviderSet(clusterParams) { - if _, err := nginx.EcsAppDefinition(awsEnv, cluster.ClusterArn); err != nil { + if _, err := nginx.EcsAppDefinition(awsEnv, ec2ClusterArn); err != nil { return err } - if _, err := redis.EcsAppDefinition(awsEnv, cluster.ClusterArn); err != nil { + if _, err := redis.EcsAppDefinition(awsEnv, ec2ClusterArn); err != nil { return err } - if _, err := cpustress.EcsAppDefinition(awsEnv, cluster.ClusterArn); err != nil { + if _, err := cpustress.EcsAppDefinition(awsEnv, ec2ClusterArn); err != nil { return err } - if _, err := dogstatsd.EcsAppDefinition(awsEnv, cluster.ClusterArn); err != nil { + if _, err := dogstatsd.EcsAppDefinition(awsEnv, ec2ClusterArn); err != nil { return err } - if _, err := prometheus.EcsAppDefinition(awsEnv, cluster.ClusterArn); err != nil { + if _, err := prometheus.EcsAppDefinition(awsEnv, ec2ClusterArn); err != nil { return err } - if _, err := tracegen.EcsAppDefinition(awsEnv, cluster.ClusterArn); err != nil { + if _, err := tracegen.EcsAppDefinition(awsEnv, ec2ClusterArn); err != nil { return err } } // User-defined EC2 apps for _, appFunc := range params.workloadAppFuncs { - if _, err := appFunc(awsEnv, cluster.ClusterArn); err != nil { + if _, err := appFunc(awsEnv, ec2ClusterArn); err != nil { return err } } diff --git a/test/new-e2e/go.mod b/test/new-e2e/go.mod index 515e44c0552509..afcf49dde6f71e 100644 --- a/test/new-e2e/go.mod +++ b/test/new-e2e/go.mod @@ -228,6 +228,7 @@ require ( github.com/go-viper/mapstructure/v2 v2.5.0 github.com/google/go-containerregistry v0.20.7 github.com/hairyhenderson/go-codeowners v0.7.0 + gopkg.in/yaml.v3 v3.0.1 ) require ( 
@@ -353,7 +354,6 @@ require ( golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect ) diff --git a/test/new-e2e/tests/containers/base_test.go b/test/new-e2e/tests/containers/base_test.go index f094b14993f185..c171ac52311330 100644 --- a/test/new-e2e/tests/containers/base_test.go +++ b/test/new-e2e/tests/containers/base_test.go @@ -313,120 +313,6 @@ func (suite *baseSuite[Env]) testLog(args *testLogArgs) { }) } -type testCheckRunArgs struct { - Filter testCheckRunFilterArgs - Expect testCheckRunExpectArgs - Optional testCheckRunExpectArgs -} - -type testCheckRunFilterArgs struct { - Name string - // Tags are used to filter the checkRun - // Regexes are supported - Tags []string -} - -type testCheckRunExpectArgs struct { - // Tags are the tags expected to be present - // Regexes are supported - Tags *[]string - AcceptUnexpectedTags bool -} - -func (suite *baseSuite[Env]) testCheckRun(args *testCheckRunArgs) { - prettyCheckRunQuery := fmt.Sprintf("%s{%s}", args.Filter.Name, strings.Join(args.Filter.Tags, ",")) - - suite.Run("checkRun "+prettyCheckRunQuery, func() { - var expectedTags []*regexp.Regexp - if args.Expect.Tags != nil { - expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) - } - - var optionalTags []*regexp.Regexp - if args.Optional.Tags != nil { - optionalTags = lo.Map(*args.Optional.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) - } - - sendEvent := func(alertType, text string) { - formattedArgs, err := yaml.Marshal(args) - suite.Require().NoError(err) - - tags := lo.Map(args.Filter.Tags, func(tag string, _ int) string { - return "filter_tag_" + tag - }) - - if _, err := 
suite.DatadogClient().PostEvent(&datadog.Event{ - Title: pointer.Ptr("testCheckRun " + prettyCheckRunQuery), - Text: pointer.Ptr(fmt.Sprintf(`%%%%%% -### Result - -`+"```"+` -%s -`+"```"+` - -### Query - -`+"```"+` -%s -`+"```"+` - %%%%%%`, text, formattedArgs)), - AlertType: &alertType, - Tags: append([]string{ - "app:agent-new-e2e-tests-containers", - "cluster_name:" + suite.clusterName, - "check_run:" + args.Filter.Name, - "test:" + suite.T().Name(), - }, tags...), - }); err != nil { - suite.T().Logf("Failed to post event: %s", err) - } - } - - defer func() { - if suite.T().Failed() { - sendEvent("error", fmt.Sprintf("Failed finding %s with proper tags and value", prettyCheckRunQuery)) - } else { - sendEvent("success", "All good!") - } - }() - - suite.EventuallyWithTf(func(collect *assert.CollectT) { - c := &myCollectT{ - CollectT: collect, - errors: []error{}, - } - // To enforce the use of myCollectT instead - collect = nil //nolint:ineffassign - - defer func() { - if len(c.errors) == 0 { - sendEvent("success", "All good!") - } else { - sendEvent("warning", errors.Join(c.errors...).Error()) - } - }() - - regexTags := lo.Map(args.Filter.Tags, func(tag string, _ int) *regexp.Regexp { - return regexp.MustCompile(tag) - }) - - checkRuns, err := suite.Fakeintake.FilterCheckRuns( - args.Filter.Name, - fakeintake.WithMatchingTags[*aggregator.CheckRun](regexTags), - ) - require.NoErrorf(c, err, "Failed to query fake intake") - require.NotEmptyf(c, checkRuns, "No `%s` checkRun yet", prettyCheckRunQuery) - - // Check tags - if expectedTags != nil { - err := assertTags(checkRuns[len(checkRuns)-1].GetTags(), expectedTags, optionalTags, args.Expect.AcceptUnexpectedTags) - assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyCheckRunQuery) - } - - }, 2*time.Minute, 10*time.Second, "Failed finding `%s` with proper tags and value", prettyCheckRunQuery) - }) -} - type testEventArgs struct { Filter testEventFilterArgs Expect testEventExpectArgs diff --git 
a/test/new-e2e/tests/containers/ecs_test.go b/test/new-e2e/tests/containers/ecs_test.go deleted file mode 100644 index 051ae7894afa81..00000000000000 --- a/test/new-e2e/tests/containers/ecs_test.go +++ /dev/null @@ -1,677 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2016-present Datadog, Inc. - -package containers - -import ( - "context" - "regexp" - "strings" - "testing" - "time" - - "github.com/DataDog/datadog-agent/pkg/util/pointer" - "github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e" - "github.com/DataDog/datadog-agent/test/e2e-framework/testing/environments" - - awsconfig "github.com/aws/aws-sdk-go-v2/config" - awsecs "github.com/aws/aws-sdk-go-v2/service/ecs" - awsecstypes "github.com/aws/aws-sdk-go-v2/service/ecs/types" - "github.com/fatih/color" - "github.com/samber/lo" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/DataDog/datadog-agent/test/e2e-framework/components/datadog/apps" - scenecs "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/ecs" - "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/fakeintake" - - provecs "github.com/DataDog/datadog-agent/test/e2e-framework/testing/provisioners/aws/ecs" - "github.com/DataDog/datadog-agent/test/e2e-framework/testing/runner" - "github.com/DataDog/datadog-agent/test/e2e-framework/testing/runner/parameters" -) - -const ( - taskNameDogstatsdUDS = "dogstatsd-uds" - taskNameDogstatsdUDP = "dogstatsd-udp" - - taskNameTracegenUDS = "tracegen-uds" - taskNameTracegenTCP = "tracegen-tcp" -) - -type ecsSuite struct { - baseSuite[environments.ECS] - ecsClusterName string - windowsEnabled bool -} - -func TestECSSuite(t *testing.T) { - suite := &ecsSuite{} - - ecsOptions := []scenecs.Option{ - scenecs.WithFargateCapacityProvider(), - 
scenecs.WithLinuxNodeGroup(), - scenecs.WithLinuxBottleRocketNodeGroup(), - } - - runOptions := []scenecs.RunOption{ - scenecs.WithFakeIntakeOptions( - fakeintake.WithRetentionPeriod("31m"), - ), - scenecs.WithTestingWorkload(), - } - - skipWindows, err := runner.GetProfile().ParamStore().GetBoolWithDefault(parameters.SkipWindows, false) - require.NoError(t, err, "failed to get %s parameter", parameters.SkipWindows) - if !skipWindows { - // WithWindowsNodeGroup is the dedicated ECS option to opt-in to Windows - // infrastructure and workloads (Windows EC2 nodes + Windows Fargate apps). - ecsOptions = append(ecsOptions, scenecs.WithWindowsNodeGroup()) - - suite.windowsEnabled = true - } - - runOptions = append(runOptions, scenecs.WithECSOptions(ecsOptions...)) - - e2e.Run(t, suite, e2e.WithProvisioner(provecs.Provisioner( - provecs.WithRunOptions(runOptions...), - ))) -} - -func (suite *ecsSuite) SetupSuite() { - suite.baseSuite.SetupSuite() - suite.Fakeintake = suite.Env().FakeIntake.Client() - suite.ecsClusterName = suite.Env().ECSCluster.ClusterName - suite.clusterName = suite.Env().ECSCluster.ClusterName -} - -func (suite *ecsSuite) TearDownSuite() { - suite.baseSuite.TearDownSuite() - - color.NoColor = false - c := color.New(color.Bold).SprintfFunc() - suite.T().Log(c("The data produced and asserted by these tests can be viewed on this dashboard:")) - c = color.New(color.Bold, color.FgBlue).SprintfFunc() - suite.T().Log(c("https://dddev.datadoghq.com/dashboard/mnw-tdr-jd8/e2e-tests-containers-ecs?refresh_mode=paused&tpl_var_ecs_cluster_name%%5B0%%5D=%s&tpl_var_fake_intake_task_family%%5B0%%5D=%s-fakeintake-ecs&from_ts=%d&to_ts=%d&live=false", - suite.ecsClusterName, - strings.TrimSuffix(suite.ecsClusterName, "-ecs"), - suite.StartTime().UnixMilli(), - suite.EndTime().UnixMilli(), - )) -} - -// Once pulumi has finished to create a stack, it can still take some time for the images to be pulled, -// for the containers to be started, for the agent collectors to 
collect workload information -// and to feed workload meta and the tagger. -// -// We could increase the timeout of all tests to cope with the agent tagger warmup time. -// But in case of a single bug making a single tag missing from every metric, -// all the tests would time out and that would be a waste of time. -// -// It’s better to have the first test having a long timeout to wait for the agent to warmup, -// and to have the following tests with a smaller timeout. -// -// Inside a testify test suite, tests are executed in alphabetical order. -// The 00 in Test00UpAndRunning is here to guarantee that this test, waiting for all tasks to be ready -// is run first. -func (suite *ecsSuite) Test00UpAndRunning() { - ctx := context.Background() - - cfg, err := awsconfig.LoadDefaultConfig(ctx) - suite.Require().NoErrorf(err, "Failed to load AWS config") - - client := awsecs.NewFromConfig(cfg) - - suite.Run("ECS tasks are ready", func() { - suite.EventuallyWithTf(func(c *assert.CollectT) { - var initToken string - for nextToken := &initToken; nextToken != nil; { - if nextToken == &initToken { - nextToken = nil - } - - servicesList, err := client.ListServices(ctx, &awsecs.ListServicesInput{ - Cluster: &suite.ecsClusterName, - MaxResults: pointer.Ptr(int32(10)), // Because `DescribeServices` takes at most 10 services in input - NextToken: nextToken, - }) - require.NoErrorf(c, err, "Failed to list ECS services") - - nextToken = servicesList.NextToken - - servicesDescription, err := client.DescribeServices(ctx, &awsecs.DescribeServicesInput{ - Cluster: &suite.ecsClusterName, - Services: servicesList.ServiceArns, - }) - if !assert.NoErrorf(c, err, "Failed to describe ECS services %v", servicesList.ServiceArns) { - continue - } - - for _, serviceDescription := range servicesDescription.Services { - assert.NotZerof(c, serviceDescription.DesiredCount, "ECS service %s has no task", *serviceDescription.ServiceName) - - for nextToken := &initToken; nextToken != nil; { - if 
nextToken == &initToken { - nextToken = nil - } - - tasksList, err := client.ListTasks(ctx, &awsecs.ListTasksInput{ - Cluster: &suite.ecsClusterName, - ServiceName: serviceDescription.ServiceName, - DesiredStatus: awsecstypes.DesiredStatusRunning, - MaxResults: pointer.Ptr(int32(100)), // Because `DescribeTasks` takes at most 100 tasks in input - NextToken: nextToken, - }) - require.NoErrorf(c, err, "Failed to list ECS tasks for service %s", *serviceDescription.ServiceName) - - nextToken = tasksList.NextToken - - tasksDescription, err := client.DescribeTasks(ctx, &awsecs.DescribeTasksInput{ - Cluster: &suite.ecsClusterName, - Tasks: tasksList.TaskArns, - }) - if !assert.NoErrorf(c, err, "Failed to describe ECS tasks %v", tasksList.TaskArns) { - continue - } - - for _, taskDescription := range tasksDescription.Tasks { - assert.Equalf(c, string(awsecstypes.DesiredStatusRunning), *taskDescription.LastStatus, - "Task %s of service %s is not running", *taskDescription.TaskArn, *serviceDescription.ServiceName) - assert.NotEqualf(c, awsecstypes.HealthStatusUnhealthy, taskDescription.HealthStatus, - "Task %s of service %s is unhealthy", *taskDescription.TaskArn, *serviceDescription.ServiceName) - } - } - } - } - }, 15*time.Minute, 10*time.Second, "Not all tasks became ready in time.") - }) -} - -func (suite *ecsSuite) TestNginxECS() { - // `nginx` check is configured via docker labels - // Test it is properly scheduled - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "nginx.net.request_per_s", - Tags: []string{"^ecs_launch_type:ec2$"}, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:ecs-.*-nginx-ec2-`, - `^docker_image:ghcr\.io/datadog/apps-nginx-server:` + regexp.QuoteMeta(apps.Version) + 
`$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:nginx$`, - `^ecs_launch_type:ec2$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-nginx-ec2$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, // org.opencontainers.image.revision docker image label - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/apps-nginx-server$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^nginx_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:apps-nginx-server$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-nginx-ec2$`, - `^task_name:.*-nginx-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - AcceptUnexpectedTags: true, - }, - }) - - suite.testLog(&testLogArgs{ - Filter: testLogFilterArgs{ - Service: "apps-nginx-server", - Tags: []string{"^ecs_launch_type:ec2$"}, - }, - Expect: testLogExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:ecs-.*-nginx-ec2-`, - `^docker_image:ghcr\.io/datadog/apps-nginx-server:` + regexp.QuoteMeta(apps.Version) + `$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:nginx$`, - `^ecs_launch_type:ec2$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-nginx-ec2$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, // org.opencontainers.image.revision docker image label - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label - `^image_id:sha256:`, - 
`^image_name:ghcr\.io/datadog/apps-nginx-server$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:apps-nginx-server$`, - `^task_arn:arn:`, - `^task_definition_arn:`, - `^task_family:.*-nginx-ec2$`, - `^task_name:.*-nginx-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - Message: `GET / HTTP/1\.1`, - }, - }) -} - -func (suite *ecsSuite) TestRedisECS() { - // `redis` check is auto-configured due to image name - // Test it is properly scheduled - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "redis.net.instantaneous_ops_per_sec", - Tags: []string{"^ecs_launch_type:ec2$"}, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:ecs-.*-redis-ec2-`, - `^docker_image:ghcr\.io/datadog/redis:` + regexp.QuoteMeta(apps.Version) + `$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:redis$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-redis-ec2$`, - `^ecs_launch_type:ec2$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, // org.opencontainers.image.revision docker image label - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/redis$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:redis$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-redis-ec2$`, - `^task_name:.*-redis-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - AcceptUnexpectedTags: true, - }, - }) - - suite.testLog(&testLogArgs{ - Filter: testLogFilterArgs{ - Service: "redis", - 
Tags: []string{"^ecs_launch_type:ec2$"}, - }, - Expect: testLogExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:ecs-.*-redis-ec2-`, - `^docker_image:ghcr\.io/datadog/redis:` + regexp.QuoteMeta(apps.Version) + `$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:redis$`, - `^ecs_launch_type:ec2$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-redis-ec2$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, // org.opencontainers.image.revision docker image label - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/redis$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:redis$`, - `^task_arn:arn:`, - `^task_definition_arn:`, - `^task_family:.*-redis-ec2$`, - `^task_name:.*-redis-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - Message: `Accepted`, - }, - }) -} - -func (suite *ecsSuite) TestNginxFargate() { - // `nginx` check is configured via docker labels - // Test it is properly scheduled - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "nginx.net.request_per_s", - Tags: []string{"^ecs_launch_type:fargate$"}, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^availability_zone:`, - `^availability-zone:`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:nginx$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + 
`$`, - `^ecs_container_name:nginx$`, - `^ecs_launch_type:fargate$`, - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/apps-nginx-server$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^nginx_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:apps-nginx-server$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-nginx-fg$`, - `^task_name:.*-nginx-fg$`, - `^task_version:[[:digit:]]+$`, - }, - AcceptUnexpectedTags: true, - }, - }) -} - -func (suite *ecsSuite) TestRedisFargate() { - // `redis` check is auto-configured due to image name - // Test it is properly scheduled - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "redis.net.instantaneous_ops_per_sec", - Tags: []string{"^ecs_launch_type:fargate$"}, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^availability_zone:`, - `^availability-zone:`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:redis$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:redis$`, - `^ecs_launch_type:fargate`, - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/redis$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:redis$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-redis-fg$`, - `^task_name:.*-redis-fg*`, - `^task_version:[[:digit:]]+$`, - }, - AcceptUnexpectedTags: true, - }, - }) -} - -func (suite *ecsSuite) TestWindowsFargate() { - if !suite.windowsEnabled { - suite.T().Skip("Skipping Windows test: WithWindowsNodeGroup() not set") - } - - suite.testCheckRun(&testCheckRunArgs{ - Filter: testCheckRunFilterArgs{ - Name: "http.can_connect", - Tags: []string{ - 
"^ecs_launch_type:fargate$", - "^container_name:aspnetsample$", - }, - }, - Expect: testCheckRunExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^availability_zone:`, - `^availability-zone:`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:aspnetsample$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:aspnetsample$`, - `^ecs_launch_type:fargate$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-aspnetsample-fg$`, - `^image_id:sha256:`, - `^image_name:mcr.microsoft.com/dotnet/samples$`, - `^image_tag:aspnetapp-nanoserver-ltsc2022$`, - `^region:us-east-1$`, - `^service_arn:`, - `^short_image:samples$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-aspnet-fg$`, - `^task_name:.*-aspnet-fg*`, - `^task_version:[[:digit:]]+$`, - `^url:`, - }, - AcceptUnexpectedTags: true, - }, - }) - - // Test container check - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "container.cpu.usage", - Tags: []string{ - "^ecs_container_name:aspnetsample$", - }, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^availability_zone:`, - `^availability-zone:`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:aspnetsample$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:aspnetsample$`, - `^ecs_launch_type:fargate$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-aspnetsample-fg$`, - `^image_id:sha256:`, - `^image_name:mcr.microsoft.com/dotnet/samples$`, - 
`^image_tag:aspnetapp-nanoserver-ltsc2022$`, - `^region:us-east-1$`, - `^runtime:ecsfargate$`, - `^service_arn:`, - `^short_image:samples$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-aspnet-fg$`, - `^task_name:.*-aspnet-fg*`, - `^task_version:[[:digit:]]+$`, - }, - }, - }) -} - -func (suite *ecsSuite) TestCPU() { - // Test CPU metrics - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "container.cpu.usage", - Tags: []string{ - "^ecs_container_name:stress-ng$", - }, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:ecs-.*-stress-ng-ec2-`, - `^docker_image:ghcr\.io/datadog/apps-stress-ng:` + regexp.QuoteMeta(apps.Version) + `$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:stress-ng$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-stress-ng$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/apps-stress-ng$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^runtime:docker$`, - `^service_arn:`, - `^short_image:apps-stress-ng$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-stress-ng-ec2$`, - `^task_name:.*-stress-ng-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - Value: &testMetricExpectValueArgs{ - Max: 155000000, - Min: 145000000, - }, - }, - }) -} - -func (suite *ecsSuite) TestDogtstatsdUDS() { - suite.testDogstatsd(taskNameDogstatsdUDS) -} - -func (suite *ecsSuite) TestDogtstatsdUDP() { - suite.testDogstatsd(taskNameDogstatsdUDP) -} - -func (suite *ecsSuite) testDogstatsd(taskName string) { - 
suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "custom.metric", - Tags: []string{ - `^task_name:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`, - }, - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - `^container_name:ecs-.*-` + regexp.QuoteMeta(taskName) + `-ec2-`, - `^docker_image:ghcr\.io/datadog/apps-dogstatsd:` + regexp.QuoteMeta(apps.Version) + `$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:dogstatsd$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-dogstatsd-ud[ps]$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, // org.opencontainers.image.revision docker image label - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/apps-dogstatsd$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^series:`, - `^service_arn:`, - `^short_image:apps-dogstatsd$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`, - `^task_name:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - }, - }) -} - -func (suite *ecsSuite) TestPrometheus() { - // Test Prometheus check - suite.testMetric(&testMetricArgs{ - Filter: testMetricFilterArgs{ - Name: "prometheus.prom_gauge", - }, - Expect: testMetricExpectArgs{ - Tags: &[]string{ - `^aws_account:[[:digit:]]{12}$`, - `^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^container_id:`, - 
`^container_name:ecs-.*-prometheus-ec2-`, - `^docker_image:ghcr\.io/datadog/apps-prometheus:` + regexp.QuoteMeta(apps.Version) + `$`, - `^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`, - `^ecs_container_name:prometheus$`, - `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-prometheus$`, - `^endpoint:http://.*:8080/metrics$`, - `^git\.commit\.sha:[[:xdigit:]]{40}$`, // org.opencontainers.image.revision docker image label - `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label - `^image_id:sha256:`, - `^image_name:ghcr\.io/datadog/apps-prometheus$`, - `^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`, - `^region:us-east-1$`, - `^series:`, - `^service_arn:`, - `^short_image:apps-prometheus$`, - `^task_arn:`, - `^task_definition_arn:`, - `^task_family:.*-prometheus-ec2$`, - `^task_name:.*-prometheus-ec2$`, - `^task_version:[[:digit:]]+$`, - }, - }, - }) -} - -func (suite *ecsSuite) TestTraceUDS() { - suite.testTrace(taskNameTracegenUDS) -} - -func (suite *ecsSuite) TestTraceTCP() { - suite.testTrace(taskNameTracegenTCP) -} - -// testTrace verifies that traces are tagged with container and pod tags. 
-func (suite *ecsSuite) testTrace(taskName string) { - suite.EventuallyWithTf(func(c *assert.CollectT) { - traces, cerr := suite.Fakeintake.GetTraces() - require.NoErrorf(c, cerr, "Failed to query fake intake") - - var err error - // Iterate starting from the most recent traces - for _, trace := range traces { - tags := lo.MapToSlice(trace.Tags, func(k string, v string) string { - return k + ":" + v - }) - // Assert origin detection is working properly - err = assertTags(tags, []*regexp.Regexp{ - regexp.MustCompile(`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`), - regexp.MustCompile(`^container_id:`), - regexp.MustCompile(`^container_name:ecs-.*-` + regexp.QuoteMeta(taskName) + `-ec2-`), - regexp.MustCompile(`^docker_image:ghcr\.io/datadog/apps-tracegen:` + regexp.QuoteMeta(apps.Version) + `$`), - regexp.MustCompile(`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`), - regexp.MustCompile(`^ecs_container_name:tracegen`), - regexp.MustCompile(`^git\.commit\.sha:[[:xdigit:]]{40}$`), // org.opencontainers.image.revision docker image label - regexp.MustCompile(`^git.repository_url:https://github.com/DataDog/test-infra-definitions$`), // org.opencontainers.image.source docker image label - regexp.MustCompile(`^image_id:sha256:`), - regexp.MustCompile(`^image_name:ghcr\.io/datadog/apps-tracegen`), - regexp.MustCompile(`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`), - regexp.MustCompile(`^short_image:apps-tracegen`), - regexp.MustCompile(`^task_arn:`), - regexp.MustCompile(`^task_family:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`), - regexp.MustCompile(`^task_name:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`), - regexp.MustCompile(`^task_version:[[:digit:]]+$`), - }, []*regexp.Regexp{}, false) - if err == nil { - break - } - } - require.NoErrorf(c, err, "Failed finding trace with proper tags") - }, 2*time.Minute, 10*time.Second, "Failed finding trace with proper tags") -} - -func (suite *ecsSuite) TestHostTags() { - // tag keys 
that are expected to be found on this docker env - args := &testHostTags{ - ExpectedTags: []string{}, - } - - suite.testHostTags(args) -} diff --git a/test/new-e2e/tests/ecs/README.md b/test/new-e2e/tests/ecs/README.md new file mode 100644 index 00000000000000..f1eeff8f41834b --- /dev/null +++ b/test/new-e2e/tests/ecs/README.md @@ -0,0 +1,146 @@ +# ECS E2E Tests + +## Overview + +This directory contains end-to-end tests for the Datadog Agent on Amazon Elastic Container Service (ECS). These tests validate agent functionality across all three ECS deployment scenarios: **Fargate**, **EC2**, and **Managed Instances**. + +### Ownership + +**Team**: ecs-experiences +**Purpose**: Validate Datadog Agent behavior in ECS environments +**Coverage**: All telemetry types (metrics, logs, traces) and all ECS deployment types + +--- + +## Test Suites + +This directory contains **4 test suites** with **18 total tests**: + +### 1. `apm_test.go` - APM/Tracing (6 tests) +Tests APM trace collection and DogStatsD across ECS environments. + +**Tests**: +- `Test00UpAndRunning` - Infrastructure readiness check +- `Test01AgentAPMReady` - APM agent readiness check +- `TestDogstatsdUDS` - DogStatsD via Unix Domain Socket (full 23-tag regex validation) +- `TestDogstatsdUDP` - DogStatsD via UDP (full 23-tag regex validation) +- `TestTraceUDS` - Trace collection via UDS (13-pattern bundled tag validation) +- `TestTraceTCP` - Trace collection via TCP (13-pattern bundled tag validation) + +**Key Features Tested**: +- ECS metadata tags (`ecs_cluster_name`, `task_arn`, `task_family`, `task_version`, etc.) +- Image metadata tags (`docker_image`, `image_name`, `image_tag`, `short_image`) +- Git metadata tags (`git.commit.sha`, `git.repository_url`) +- DogStatsD over UDS and UDP transports +- Trace collection over UDS and TCP transports + +--- + +### 2. `checks_test.go` - Check Autodiscovery & Execution (5 tests) +Tests integration check autodiscovery and execution across deployment types. 
+ +**Tests**: +- `TestNginxECS` - Nginx check via docker labels (EC2) with full metric + log tag validation +- `TestRedisECS` - Redis check via image name autodiscovery (EC2) with full metric + log tag validation +- `TestNginxFargate` - Nginx check on Fargate with full metric tag validation +- `TestRedisFargate` - Redis check on Fargate with full metric tag validation +- `TestPrometheus` - Prometheus/OpenMetrics check with full metric tag validation + +**Key Features Tested**: +- Docker label-based check configuration (`com.datadoghq.ad.check_names`) +- Image name-based autodiscovery (redis) +- Check execution on both EC2 and Fargate +- Log collection with tag validation on EC2 (nginx, redis) +- Prometheus metrics scraping + +--- + +### 3. `platform_test.go` - Platform-Specific Features (4 tests) +Tests platform-specific functionality and performance monitoring. + +**Tests**: +- `Test00UpAndRunning` - Infrastructure readiness check +- `TestWindowsFargate` - Windows container support on Fargate (check run + container metric tag validation) +- `TestCPU` - CPU metrics with value range validation (stress-ng workload) +- `TestContainerLifecycle` - Container lifecycle tracking (multi-container metric validation) + +**Key Features Tested**: +- Windows container monitoring on Fargate +- BottleRocket node support +- CPU metric value range validation +- Multi-container lifecycle tracking + +--- + +### 4. `managed_test.go` - Managed Instances (3 tests) +Tests managed instance-specific features.
+ +**Tests**: +- `Test00UpAndRunning` - Infrastructure readiness check +- `TestManagedInstanceAgentHealth` - Agent health check via AssertAgentHealth helper +- `TestManagedInstanceTraceCollection` - Trace collection with bundled tag validation + +**Key Features Tested**: +- Managed instance provisioning and lifecycle +- Daemon mode agent deployment +- Trace collection with ECS metadata validation + +--- + +## Running Tests + +### Prerequisites + +- **AWS credentials**: Configure AWS CLI with appropriate permissions +- **Pulumi**: Infrastructure provisioning (installed by `dda inv install-tools`) +- **Go**: Version specified in `go.mod` +- **Datadog API key**: Set in environment (handled by test framework) + +### Running Individual Suites + +```bash +# Run APM tests only +go test -v -timeout 30m ./test/new-e2e/tests/ecs/ -run TestECSAPMSuite + +# Run checks tests only +go test -v -timeout 30m ./test/new-e2e/tests/ecs/ -run TestECSChecksSuite + +# Run platform tests only +go test -v -timeout 30m ./test/new-e2e/tests/ecs/ -run TestECSPlatformSuite + +# Run managed instance tests only +go test -v -timeout 30m ./test/new-e2e/tests/ecs/ -run TestECSManagedSuite +``` + +### Running All ECS Tests + +```bash +go test -v -timeout 60m ./test/new-e2e/tests/ecs/... 
+``` + +--- + +## Coverage Matrix + +### Feature Coverage by Deployment Type + +| Feature | Fargate | EC2 | Managed | Tests | +|---------|---------|-----|---------|-------| +| **Metrics Collection** | Yes | Yes | Yes | checks_test, platform_test | +| **Log Collection** | - | Yes | - | checks_test | +| **APM Traces** | - | Yes | Yes | apm_test, managed_test | +| **Check Autodiscovery** | Yes | Yes | - | checks_test | +| **DogStatsD** | - | Yes | - | apm_test | +| **Container Lifecycle** | Yes | Yes | - | platform_test | +| **Windows Support** | Yes | - | - | platform_test | +| **Prometheus** | - | Yes | - | checks_test | +| **BottleRocket** | - | Yes | - | platform_test | + +--- + +## Related Documentation + +- [E2E Framework Guide](../../../e2e-framework/README.md) +- [FakeIntake Documentation](../../../fakeintake/README.md) +- [ECS Fargate Integration](https://docs.datadoghq.com/integrations/ecs_fargate/) +- [ECS EC2 Integration](https://docs.datadoghq.com/agent/amazon_ecs/) diff --git a/test/new-e2e/tests/ecs/apm_test.go b/test/new-e2e/tests/ecs/apm_test.go new file mode 100644 index 00000000000000..921fcaf2e90522 --- /dev/null +++ b/test/new-e2e/tests/ecs/apm_test.go @@ -0,0 +1,239 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2025-present Datadog, Inc. + +// Package ecs provides end-to-end tests for the Datadog Agent running on Amazon ECS. +// It tests APM/tracing, metrics, logs, and agent health across different ECS launch types +// (Fargate, EC2, and Managed Instances).
+package ecs + +import ( + "regexp" + "strings" + "testing" + "time" + + "github.com/DataDog/datadog-agent/test/e2e-framework/components/datadog/apps" + "github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e" + "github.com/DataDog/datadog-agent/test/e2e-framework/testing/environments" + "github.com/stretchr/testify/assert" + + scenecs "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/ecs" + scenfi "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/fakeintake" + provecs "github.com/DataDog/datadog-agent/test/e2e-framework/testing/provisioners/aws/ecs" +) + +const ( + taskNameDogstatsdUDS = "dogstatsd-uds" + taskNameDogstatsdUDP = "dogstatsd-udp" + + taskNameTracegenUDS = "tracegen-uds" + taskNameTracegenTCP = "tracegen-tcp" +) + +type ecsAPMSuite struct { + BaseSuite[environments.ECS] + ecsClusterName string +} + +func TestECSAPMSuite(t *testing.T) { + t.Parallel() + e2e.Run(t, &ecsAPMSuite{}, e2e.WithProvisioner(provecs.Provisioner( + provecs.WithRunOptions( + scenecs.WithECSOptions( + scenecs.WithFargateCapacityProvider(), + scenecs.WithLinuxNodeGroup(), + ), + scenecs.WithFakeIntakeOptions( + scenfi.WithRetentionPeriod("31m"), + ), + scenecs.WithTestingWorkload(), + ), + ))) +} + +func (suite *ecsAPMSuite) SetupSuite() { + suite.BaseSuite.SetupSuite() + suite.Fakeintake = suite.Env().FakeIntake.Client() + suite.ecsClusterName = suite.Env().ECSCluster.ClusterName + suite.ClusterName = suite.Env().ECSCluster.ClusterName +} + +// getCommonECSTagPatterns returns ECS tag patterns for metrics and traces. +// Parameters: +// - clusterName: ECS cluster name +// - taskName: Task name pattern (e.g., "dogstatsd-uds", "tracegen-tcp") +// - appName: Application name (e.g., "dogstatsd", "tracegen") +// - includeFullSet: If true, includes all tags (for metrics). If false, returns minimal set (for traces). 
+func (suite *ecsAPMSuite) getCommonECSTagPatterns(clusterName, taskName, appName string, includeFullSet bool) []string { + // Minimal tags for traces - ECS metadata is bundled in _dd.tags.container when DD_APM_ENABLE_CONTAINER_TAGS_BUFFER=true + if !includeFullSet { + // When DD_APM_ENABLE_CONTAINER_TAGS_BUFFER=true, container tags are bundled into a single _dd.tags.container tag + // The actual payload format is _dd.tags.container=task_name:X,cluster_name:Y,... + // BUT when converted to string via k+":"+v in base_helpers.go, it becomes: + // _dd.tags.container:task_name:X,cluster_name:Y,... + // Note the ':' separator, not '=' (that's how Go concatenates map entries) + // We validate that this bundled tag contains the required ECS metadata + // Patterns match: key:value (followed by comma or end of string) + // Use non-greedy .*? to avoid matching cluster name in service_arn first + return []string{ + `^_dd\.tags\.container:.*?cluster_name:` + regexp.QuoteMeta(clusterName) + `(,|$)`, + `^_dd\.tags\.container:.*?ecs_cluster_name:` + regexp.QuoteMeta(clusterName) + `(,|$)`, + `^_dd\.tags\.container:.*?container_name:[^,]+(,|$)`, + `^_dd\.tags\.container:.*?task_arn:[^,]+(,|$)`, + } + } + + // Full tag set for metrics - includes ECS metadata, image metadata, and AWS metadata + return []string{ + // Core ECS metadata + `^cluster_name:` + regexp.QuoteMeta(clusterName) + `$`, + `^ecs_cluster_name:` + regexp.QuoteMeta(clusterName) + `$`, + `^ecs_container_name:` + appName + `$`, + `^container_id:`, + `^container_name:ecs-.*-` + regexp.QuoteMeta(taskName) + `-ec2-`, + `^task_arn:`, + `^task_family:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`, + `^task_name:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`, + `^task_version:[[:digit:]]+$`, + `^task_definition_arn:`, + + // Image metadata + `^docker_image:ghcr\.io/datadog/apps-` + appName + `:` + regexp.QuoteMeta(apps.Version) + `$`, + `^image_id:sha256:`, + `^image_name:ghcr\.io/datadog/apps-` + appName + `$`, + `^image_tag:` + 
regexp.QuoteMeta(apps.Version) + `$`, + `^short_image:apps-` + appName + `$`, + `^git\.commit\.sha:[[:xdigit:]]{40}$`, + `^git.repository_url:https://github.com/DataDog/test-infra-definitions$`, + + // AWS metadata + `^aws_account:[[:digit:]]{12}$`, + `^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(clusterName) + `$`, + `^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(clusterName, "-ecs")) + `-` + appName + `-ud[ps]$`, + `^region:us-east-1$`, + `^service_arn:`, + `^series:`, + } +} + +// Inside a testify test suite, tests are executed in alphabetical order. +// The 00 in Test00UpAndRunning is here to guarantee that this test, waiting for all tasks to be ready +// is run first. This gives the agent time to warm up before other tests run with shorter timeouts. +func (suite *ecsAPMSuite) Test00UpAndRunning() { + suite.AssertECSTasksReady(suite.ecsClusterName) +} + +func (suite *ecsAPMSuite) Test01AgentAPMReady() { + // Test that the APM agent is ready and receiving traces + suite.Run("APM agent readiness check", func() { + suite.AssertAgentHealth(&TestAgentHealthArgs{ + CheckComponents: []string{"trace_agent"}, + }) + + // Verify we're receiving traces + suite.EventuallyWithTf(func(c *assert.CollectT) { + traces, err := suite.Fakeintake.GetTraces() + assert.NoErrorf(c, err, "Failed to query traces from fake intake") + assert.NotEmptyf(c, traces, "No traces received - APM agent may not be ready") + + }, 5*time.Minute, 10*time.Second, "APM agent readiness check failed") + }) +} + +func (suite *ecsAPMSuite) TestDogstatsdUDS() { + suite.testDogstatsd(taskNameDogstatsdUDS) +} + +func (suite *ecsAPMSuite) TestDogstatsdUDP() { + suite.testDogstatsd(taskNameDogstatsdUDP) +} + +func (suite *ecsAPMSuite) testDogstatsd(taskName string) { + expectedTags := suite.getCommonECSTagPatterns(suite.ecsClusterName, taskName, "dogstatsd", true) + + suite.AssertMetric(&TestMetricArgs{ + Filter: TestMetricFilterArgs{ + Name: "custom.metric", + Tags: 
[]string{ + `^task_name:.*-` + regexp.QuoteMeta(taskName) + `-ec2$`, + }, + }, + Expect: TestMetricExpectArgs{ + Tags: &expectedTags, + }, + }) +} + +func (suite *ecsAPMSuite) TestTraceUDS() { + suite.testTrace(taskNameTracegenUDS) +} + +func (suite *ecsAPMSuite) TestTraceTCP() { + suite.testTrace(taskNameTracegenTCP) +} + +// testTrace verifies that traces are tagged with container and ECS task tags. +// The bundled _dd.tags.container value is a comma-separated string of key:value pairs +// containing ECS metadata, image metadata, and git metadata. +func (suite *ecsAPMSuite) testTrace(taskName string) { + // Build validation patterns for the bundled _dd.tags.container value + patterns := []*regexp.Regexp{ + // Core ECS metadata + regexp.MustCompile(`ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName)), + regexp.MustCompile(`task_arn:`), + regexp.MustCompile(`container_name:`), + regexp.MustCompile(`ecs_container_name:tracegen`), + regexp.MustCompile(`task_family:.*-` + regexp.QuoteMeta(taskName) + `-ec2`), + regexp.MustCompile(`task_name:.*-` + regexp.QuoteMeta(taskName) + `-ec2`), + regexp.MustCompile(`task_version:[[:digit:]]+`), + + // Image metadata + regexp.MustCompile(`docker_image:ghcr\.io/datadog/apps-tracegen:` + regexp.QuoteMeta(apps.Version)), + regexp.MustCompile(`image_name:ghcr\.io/datadog/apps-tracegen`), + regexp.MustCompile(`image_tag:` + regexp.QuoteMeta(apps.Version)), + regexp.MustCompile(`short_image:apps-tracegen`), + + // Git metadata + regexp.MustCompile(`git\.commit\.sha:[[:xdigit:]]{40}`), + regexp.MustCompile(`git.repository_url:https://github.com/DataDog/test-infra-definitions`), + } + + suite.EventuallyWithTf(func(c *assert.CollectT) { + traces, cerr := suite.Fakeintake.GetTraces() + // Can be replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, cerr, "Failed to query fake intake") { + return + } + + found := false + // Iterate starting from the most recent traces 
+ for _, trace := range traces { + // Container tags are in TracerPayload.Tags, not AgentPayload.Tags + for _, tracerPayload := range trace.TracerPayloads { + containerTags, exists := tracerPayload.Tags["_dd.tags.container"] + if !exists { + continue + } + + // Validate all patterns match the bundled tag value + allMatch := true + for _, pattern := range patterns { + if !pattern.MatchString(containerTags) { + allMatch = false + break + } + } + if allMatch { + found = true + break + } + } + if found { + break + } + } + assert.Truef(c, found, "Failed finding trace with proper bundled _dd.tags.container tags for task %s", taskName) + }, 2*time.Minute, 10*time.Second, "Failed finding trace with proper bundled tags") +} diff --git a/test/new-e2e/tests/ecs/base.go b/test/new-e2e/tests/ecs/base.go new file mode 100644 index 00000000000000..afe92553aa37bf --- /dev/null +++ b/test/new-e2e/tests/ecs/base.go @@ -0,0 +1,20 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2023-present Datadog, Inc. + +package ecs + +import ( + "github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e" + fakeintake "github.com/DataDog/datadog-agent/test/fakeintake/client" +) + +// BaseSuite is the base test suite for container tests, providing common functionality +// for ECS, Docker, and other container platform tests. +type BaseSuite[Env any] struct { + e2e.BaseSuite[Env] + + Fakeintake *fakeintake.Client + ClusterName string +} diff --git a/test/new-e2e/tests/ecs/base_helpers.go b/test/new-e2e/tests/ecs/base_helpers.go new file mode 100644 index 00000000000000..a0e907606c83a8 --- /dev/null +++ b/test/new-e2e/tests/ecs/base_helpers.go @@ -0,0 +1,1026 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
+// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2023-present Datadog, Inc. + +package ecs + +import ( + "errors" + "fmt" + "regexp" + "strings" + "time" + + "github.com/samber/lo" + "github.com/stretchr/testify/assert" + "gopkg.in/yaml.v3" + "gopkg.in/zorkian/go-datadog-api.v2" + + awsconfig "github.com/aws/aws-sdk-go-v2/config" + awsecs "github.com/aws/aws-sdk-go-v2/service/ecs" + awsecstypes "github.com/aws/aws-sdk-go-v2/service/ecs/types" + + "github.com/DataDog/agent-payload/v5/gogen" + + "github.com/DataDog/datadog-agent/pkg/metrics/event" + pb "github.com/DataDog/datadog-agent/pkg/proto/pbgo/trace" + "github.com/DataDog/datadog-agent/pkg/util/pointer" + "github.com/DataDog/datadog-agent/test/fakeintake/aggregator" + fakeintake "github.com/DataDog/datadog-agent/test/fakeintake/client" +) + +// assertTags checks that actual tags match expected tag patterns +func assertTags(actualTags []string, expectedTags []*regexp.Regexp, optionalTags []*regexp.Regexp, acceptUnexpectedTags bool) error { + missingTags := make([]*regexp.Regexp, len(expectedTags)) + copy(missingTags, expectedTags) + unexpectedTags := []string{} + + for _, actualTag := range actualTags { + found := false + for i, expectedTag := range missingTags { + if expectedTag.MatchString(actualTag) { + found = true + missingTags[i] = missingTags[len(missingTags)-1] + missingTags = missingTags[:len(missingTags)-1] + break + } + } + + if !found { + for _, optionalTag := range optionalTags { + if optionalTag.MatchString(actualTag) { + found = true + break + } + } + } + + if !found { + unexpectedTags = append(unexpectedTags, actualTag) + } + } + + if (len(unexpectedTags) > 0 && !acceptUnexpectedTags) || len(missingTags) > 0 { + errs := make([]error, 0, 2) + if len(unexpectedTags) > 0 { + errs = append(errs, fmt.Errorf("unexpected tags: %s", strings.Join(unexpectedTags, ", "))) + } + if len(missingTags) > 0 { + errs = append(errs, fmt.Errorf("missing tags: %s", 
strings.Join(lo.Map(missingTags, func(re *regexp.Regexp, _ int) string { return re.String() }), ", "))) + } + return errors.Join(errs...) + } + + return nil +} + +type TestMetricArgs struct { + Filter TestMetricFilterArgs + Expect TestMetricExpectArgs + Optional TestMetricExpectArgs +} + +type TestMetricFilterArgs struct { + Name string + // Tags are used to filter the metrics + // Regexes are supported + Tags []string +} + +type TestMetricExpectArgs struct { + // Tags are the tags expected to be present + // Regexes are supported + Tags *[]string + Value *TestMetricExpectValueArgs + AcceptUnexpectedTags bool +} + +type TestMetricExpectValueArgs struct { + Min float64 + Max float64 +} + +// myCollectT does nothing more than "github.com/stretchr/testify/assert".CollectT +// It's used only to get access to `errors` field which is otherwise private. +type myCollectT struct { + *assert.CollectT + + errors []error +} + +func (mc *myCollectT) Errorf(format string, args ...interface{}) { + mc.errors = append(mc.errors, fmt.Errorf(format, args...)) + mc.CollectT.Errorf(format, args...) 
+} + +func (suite *BaseSuite[Env]) AssertMetric(args *TestMetricArgs) { + prettyMetricQuery := fmt.Sprintf("%s{%s}", args.Filter.Name, strings.Join(args.Filter.Tags, ",")) + + suite.Run("metric "+prettyMetricQuery, func() { + var expectedTags []*regexp.Regexp + if args.Expect.Tags != nil { + expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + optionalTags := []*regexp.Regexp{regexp.MustCompile("stackid:.*")} // The stackid tag is added by the framework itself to allow filtering on the stack id + if args.Optional.Tags != nil { + optionalTags = lo.Map(*args.Optional.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + sendEvent := func(alertType, text string) { + formattedArgs, err := yaml.Marshal(args) + suite.Require().NoError(err) + + tags := lo.Map(args.Filter.Tags, func(tag string, _ int) string { + return "filter_tag_" + tag + }) + + if _, err := suite.DatadogClient().PostEvent(&datadog.Event{ + Title: pointer.Ptr("testMetric " + prettyMetricQuery), + Text: pointer.Ptr(fmt.Sprintf(`%%%%%% +### Result + +`+"```"+` +%s +`+"```"+` + +### Query + +`+"```"+` +%s +`+"```"+` + %%%%%%`, text, formattedArgs)), + AlertType: &alertType, + Tags: append([]string{ + "app:agent-new-e2e-tests-containers", + "cluster_name:" + suite.ClusterName, + "metric:" + args.Filter.Name, + "test:" + suite.T().Name(), + }, tags...), + }); err != nil { + suite.T().Logf("Failed to post event: %s", err) + } + } + + defer func() { + if suite.T().Failed() { + sendEvent("error", fmt.Sprintf("Failed finding %s with proper tags and value", prettyMetricQuery)) + } else { + sendEvent("success", "All good!") + } + }() + + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + defer func() { + if len(c.errors) == 0 { + sendEvent("success", "All 
good!") + } else { + sendEvent("warning", errors.Join(c.errors...).Error()) + } + }() + + regexTags := lo.Map(args.Filter.Tags, func(tag string, _ int) *regexp.Regexp { + return regexp.MustCompile(tag) + }) + + metrics, err := suite.Fakeintake.FilterMetrics( + args.Filter.Name, + fakeintake.WithMatchingTags[*aggregator.MetricSeries](regexTags), + ) + // Can be replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, err, "Failed to query fake intake") { + return + } + // Can be replaced by require.NoEmptyf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NotEmptyf(c, metrics, "No `%s` metrics yet", prettyMetricQuery) { + return + } + + // Check tags + if expectedTags != nil { + err := assertTags(metrics[len(metrics)-1].GetTags(), expectedTags, optionalTags, args.Expect.AcceptUnexpectedTags) + assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyMetricQuery) + } + + // Check value + if args.Expect.Value != nil { + assert.NotEmptyf(c, lo.Filter(metrics[len(metrics)-1].GetPoints(), func(v *gogen.MetricPayload_MetricPoint, _ int) bool { + return v.GetValue() >= args.Expect.Value.Min && + v.GetValue() <= args.Expect.Value.Max + }), "No value of `%s` is in the range [%f;%f]: %v", + prettyMetricQuery, + args.Expect.Value.Min, + args.Expect.Value.Max, + lo.Map(metrics[len(metrics)-1].GetPoints(), func(v *gogen.MetricPayload_MetricPoint, _ int) float64 { + return v.GetValue() + }), + ) + } + }, 2*time.Minute, 10*time.Second, "Failed finding `%s` with proper tags and value", prettyMetricQuery) + }) +} + +type TestLogArgs struct { + Filter TestLogFilterArgs + Expect TestLogExpectArgs +} + +type TestLogFilterArgs struct { + Service string + Tags []string +} + +type TestLogExpectArgs struct { + Tags *[]string + Message string +} + +func (suite *BaseSuite[Env]) AssertLog(args *TestLogArgs) { + prettyLogQuery := fmt.Sprintf("%s{%s}", args.Filter.Service, strings.Join(args.Filter.Tags, ",")) 
+ + suite.Run("log "+prettyLogQuery, func() { + var expectedTags []*regexp.Regexp + if args.Expect.Tags != nil { + expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + var expectedMessage *regexp.Regexp + if args.Expect.Message != "" { + expectedMessage = regexp.MustCompile(args.Expect.Message) + } + + sendEvent := func(alertType, text string) { + formattedArgs, err := yaml.Marshal(args) + suite.Require().NoError(err) + + tags := lo.Map(args.Filter.Tags, func(tag string, _ int) string { + return "filter_tag_" + tag + }) + + if _, err := suite.DatadogClient().PostEvent(&datadog.Event{ + Title: pointer.Ptr("testLog " + prettyLogQuery), + Text: pointer.Ptr(fmt.Sprintf(`%%%%%% +### Result + +`+"```"+` +%s +`+"```"+` + +### Query + +`+"```"+` +%s +`+"```"+` + %%%%%%`, text, formattedArgs)), + AlertType: &alertType, + Tags: append([]string{ + "app:agent-new-e2e-tests-containers", + "cluster_name:" + suite.ClusterName, + "log_service:" + args.Filter.Service, + "test:" + suite.T().Name(), + }, tags...), + }); err != nil { + suite.T().Logf("Failed to post event: %s", err) + } + } + + defer func() { + if suite.T().Failed() { + sendEvent("error", fmt.Sprintf("Failed finding %s with proper tags and message", prettyLogQuery)) + } else { + sendEvent("success", "All good!") + } + }() + + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + defer func() { + if len(c.errors) == 0 { + sendEvent("success", "All good!") + } else { + sendEvent("warning", errors.Join(c.errors...).Error()) + } + }() + + regexTags := lo.Map(args.Filter.Tags, func(tag string, _ int) *regexp.Regexp { + return regexp.MustCompile(tag) + }) + + logs, err := suite.Fakeintake.FilterLogs( + args.Filter.Service, + fakeintake.WithMatchingTags[*aggregator.Log](regexTags), + ) + // Can be 
replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, err, "Failed to query fake intake") { + return + } + // Can be replaced by require.NoEmptyf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NotEmptyf(c, logs, "No `%s` logs yet", prettyLogQuery) { + return + } + + // Check tags + if expectedTags != nil { + optionalTags := []*regexp.Regexp{ + regexp.MustCompile("logsource:.*"), + } + err := assertTags(logs[len(logs)-1].GetTags(), expectedTags, optionalTags, false) + assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyLogQuery) + } + + // Check message + if args.Expect.Message != "" { + assert.NotEmptyf(c, lo.Filter(logs, func(m *aggregator.Log, _ int) bool { + return expectedMessage.MatchString(m.Message) + }), "No log of `%s` is matching %q", + prettyLogQuery, + args.Expect.Message, + ) + } + }, 2*time.Minute, 10*time.Second, "Failed finding `%s` with proper tags and message", prettyLogQuery) + }) +} + +type TestCheckRunArgs struct { + Filter TestCheckRunFilterArgs + Expect TestCheckRunExpectArgs + Optional TestCheckRunExpectArgs +} + +type TestCheckRunFilterArgs struct { + Name string + // Tags are used to filter the checkRun + // Regexes are supported + Tags []string +} + +type TestCheckRunExpectArgs struct { + // Tags are the tags expected to be present + // Regexes are supported + Tags *[]string + AcceptUnexpectedTags bool +} + +func (suite *BaseSuite[Env]) AssertCheckRun(args *TestCheckRunArgs) { + prettyCheckRunQuery := fmt.Sprintf("%s{%s}", args.Filter.Name, strings.Join(args.Filter.Tags, ",")) + + suite.Run("checkRun "+prettyCheckRunQuery, func() { + var expectedTags []*regexp.Regexp + if args.Expect.Tags != nil { + expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + var optionalTags []*regexp.Regexp + if args.Optional.Tags != nil { + optionalTags = lo.Map(*args.Optional.Tags, func(tag 
string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + sendEvent := func(alertType, text string) { + formattedArgs, err := yaml.Marshal(args) + suite.Require().NoError(err) + + tags := lo.Map(args.Filter.Tags, func(tag string, _ int) string { + return "filter_tag_" + tag + }) + + if _, err := suite.DatadogClient().PostEvent(&datadog.Event{ + Title: pointer.Ptr("testCheckRun " + prettyCheckRunQuery), + Text: pointer.Ptr(fmt.Sprintf(`%%%%%% +### Result + +`+"```"+` +%s +`+"```"+` + +### Query + +`+"```"+` +%s +`+"```"+` + %%%%%%`, text, formattedArgs)), + AlertType: &alertType, + Tags: append([]string{ + "app:agent-new-e2e-tests-containers", + "cluster_name:" + suite.ClusterName, + "check_run:" + args.Filter.Name, + "test:" + suite.T().Name(), + }, tags...), + }); err != nil { + suite.T().Logf("Failed to post event: %s", err) + } + } + + defer func() { + if suite.T().Failed() { + sendEvent("error", fmt.Sprintf("Failed finding %s with proper tags and value", prettyCheckRunQuery)) + } else { + sendEvent("success", "All good!") + } + }() + + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + defer func() { + if len(c.errors) == 0 { + sendEvent("success", "All good!") + } else { + sendEvent("warning", errors.Join(c.errors...).Error()) + } + }() + + regexTags := lo.Map(args.Filter.Tags, func(tag string, _ int) *regexp.Regexp { + return regexp.MustCompile(tag) + }) + + checkRuns, err := suite.Fakeintake.FilterCheckRuns( + args.Filter.Name, + fakeintake.WithMatchingTags[*aggregator.CheckRun](regexTags), + ) + // Can be replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, err, "Failed to query fake intake") { + return + } + // Can be replaced by require.NoEmptyf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if 
!assert.NotEmptyf(c, checkRuns, "No `%s` checkRun yet", prettyCheckRunQuery) { + return + } + + // Check tags + if expectedTags != nil { + err := assertTags(checkRuns[len(checkRuns)-1].GetTags(), expectedTags, optionalTags, args.Expect.AcceptUnexpectedTags) + assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyCheckRunQuery) + } + + }, 2*time.Minute, 10*time.Second, "Failed finding `%s` with proper tags and value", prettyCheckRunQuery) + }) +} + +type TestEventArgs struct { + Filter TestEventFilterArgs + Expect TestEventExpectArgs +} + +type TestEventFilterArgs struct { + Source string + Tags []string +} + +type TestEventExpectArgs struct { + Tags *[]string + Title string + Text string + Priority event.Priority + AlertType event.AlertType +} + +func (suite *BaseSuite[Env]) AssertEvent(args *TestEventArgs) { + prettyEventQuery := fmt.Sprintf("%s{%s}", args.Filter.Source, strings.Join(args.Filter.Tags, ",")) + + suite.Run("event "+prettyEventQuery, func() { + var expectedTags []*regexp.Regexp + if args.Expect.Tags != nil { + expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + sendEvent := func(alertType, text string) { + formattedArgs, err := yaml.Marshal(args) + suite.Require().NoError(err) + + tags := lo.Map(args.Filter.Tags, func(tag string, _ int) string { + return "filter_tag_" + tag + }) + + if _, err := suite.DatadogClient().PostEvent(&datadog.Event{ + Title: pointer.Ptr("testEvent " + prettyEventQuery), + Text: pointer.Ptr(fmt.Sprintf(`%%%%%% +### Result + +`+"```"+` +%s +`+"```"+` + +### Query + +`+"```"+` +%s +`+"```"+` + %%%%%%`, text, formattedArgs)), + AlertType: &alertType, + Tags: append([]string{ + "app:agent-new-e2e-tests-containers", + "cluster_name:" + suite.ClusterName, + "event_source:" + args.Filter.Source, + "test:" + suite.T().Name(), + }, tags...), + }); err != nil { + suite.T().Logf("Failed to post event: %s", err) + } + } + + defer func() { + if suite.T().Failed() { 
+ sendEvent("error", fmt.Sprintf("Failed finding %s with proper tags and message", prettyEventQuery)) + } else { + sendEvent("success", "All good!") + } + }() + + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + defer func() { + if len(c.errors) == 0 { + sendEvent("success", "All good!") + } else { + sendEvent("warning", errors.Join(c.errors...).Error()) + } + }() + + regexTags := lo.Map(args.Filter.Tags, func(tag string, _ int) *regexp.Regexp { + return regexp.MustCompile(tag) + }) + + events, err := suite.Fakeintake.FilterEvents( + args.Filter.Source, + fakeintake.WithMatchingTags[*aggregator.Event](regexTags), + ) + // Can be replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, err, "Failed to query fake intake") { + return + } + // Can be replaced by require.NoEmptyf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NotEmptyf(c, events, "No `%s` events yet", prettyEventQuery) { + return + } + + // Check tags + if expectedTags != nil { + err := assertTags(events[len(events)-1].GetTags(), expectedTags, []*regexp.Regexp{}, false) + assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyEventQuery) + } + + // Check title + if args.Expect.Title != "" { + assert.Regexpf(c, args.Expect.Title, events[len(events)-1].Title, + "Event title mismatch on `%s`", prettyEventQuery) + } + + // Check text + if args.Expect.Text != "" { + assert.Regexpf(c, args.Expect.Text, events[len(events)-1].Text, + "Event text mismatch on `%s`", prettyEventQuery) + } + + // Check priority + if len(args.Expect.Priority) != 0 { + assert.Equalf(c, args.Expect.Priority, events[len(events)-1].Priority, + "Event priority mismatch on `%s`", prettyEventQuery) + } + + // Check alert type + if len(args.Expect.AlertType) != 0 { + assert.Equalf(c, 
args.Expect.AlertType, events[len(events)-1].AlertType, + "Event alert type mismatch on `%s`", prettyEventQuery) + } + }, 2*time.Minute, 10*time.Second, "Failed finding `%s` with proper tags and message", prettyEventQuery) + }) +} + +type TestAPMTraceArgs struct { + Filter TestAPMTraceFilterArgs + Expect TestAPMTraceExpectArgs +} + +type TestAPMTraceFilterArgs struct { + ServiceName string + OperationName string + ResourceName string + Tags []string +} + +type TestAPMTraceExpectArgs struct { + Tags *[]string + SpanCount *int + // SamplingPriority validates sampling decision + SamplingPriority *int + // TraceIDPresent validates trace_id is set + TraceIDPresent bool + // ParentIDPresent validates parent_id is set for child spans + ParentIDPresent bool +} + +func (suite *BaseSuite[Env]) AssertAPMTrace(args *TestAPMTraceArgs) { + prettyTraceQuery := fmt.Sprintf("%s{%s}", args.Filter.ServiceName, strings.Join(args.Filter.Tags, ",")) + + suite.Run("trace "+prettyTraceQuery, func() { + var expectedTags []*regexp.Regexp + if args.Expect.Tags != nil { + expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + // Get traces from fakeintake + traces, err := suite.Fakeintake.GetTraces() + // Can be replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, err, "Failed to query fake intake for traces") { + return + } + + // Filter traces by service name + matchingTraces := make([]*aggregator.TracePayload, 0) + for _, trace := range traces { + if len(trace.TracerPayloads) == 0 { + continue + } + for _, payload := range trace.TracerPayloads { + for _, chunk := range payload.Chunks { + for _, span := range chunk.Spans { + if span.Service == 
args.Filter.ServiceName { + // Check operation name if specified + if args.Filter.OperationName != "" && span.Name != args.Filter.OperationName { + continue + } + // Check resource name if specified + if args.Filter.ResourceName != "" && span.Resource != args.Filter.ResourceName { + continue + } + matchingTraces = append(matchingTraces, trace) + goto nextTrace + } + } + } + } + nextTrace: + } + + // Can be replaced by require.NoEmptyf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NotEmptyf(c, matchingTraces, "No `%s` traces yet", prettyTraceQuery) { + return + } + + latestTrace := matchingTraces[len(matchingTraces)-1] + + // Find spans matching the service + matchingSpans := []*pb.Span{} + for _, payload := range latestTrace.TracerPayloads { + for _, chunk := range payload.Chunks { + for _, span := range chunk.Spans { + if span.Service == args.Filter.ServiceName { + matchingSpans = append(matchingSpans, span) + } + } + } + } + + if len(matchingSpans) == 0 { + return + } + + // Check span count if specified + if args.Expect.SpanCount != nil { + assert.Equalf(c, *args.Expect.SpanCount, len(matchingSpans), + "Expected %d spans for service %s, got %d", *args.Expect.SpanCount, args.Filter.ServiceName, len(matchingSpans)) + } + + // Check tags on TracerPayload (where container tags are enriched) + if expectedTags != nil { + traceTags := make([]string, 0) + for _, payload := range latestTrace.TracerPayloads { + for k, v := range payload.Tags { + traceTags = append(traceTags, k+":"+v) + } + } + // Set acceptUnexpectedTags=true for bundled tag format (DD_APM_ENABLE_CONTAINER_TAGS_BUFFER=true) + // The bundled _dd.tags.container tag contains many comma-separated key:value pairs + err := assertTags(traceTags, expectedTags, []*regexp.Regexp{}, true) + assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyTraceQuery) + } + + // Check trace ID is present + if args.Expect.TraceIDPresent { + assert.NotZerof(c, matchingSpans[0].TraceID, "TraceID 
should be present for `%s`", prettyTraceQuery) + } + + // Check sampling priority if specified + if args.Expect.SamplingPriority != nil { + assert.Equalf(c, float64(*args.Expect.SamplingPriority), matchingSpans[0].Metrics["_sampling_priority_v1"], + "Sampling priority mismatch for `%s`", prettyTraceQuery) + } + + }, 2*time.Minute, 10*time.Second, "Failed finding `%s` traces with proper tags and spans", prettyTraceQuery) + }) +} + +type TestLogPipelineArgs struct { + Filter TestLogPipelineFilterArgs + Expect TestLogPipelineExpectArgs +} + +type TestLogPipelineFilterArgs struct { + Service string + Source string + Tags []string +} + +type TestLogPipelineExpectArgs struct { + // MinCount validates minimum number of logs + MinCount int + // Status validates log status (info, warning, error) + Status string + // Message regex pattern + Message string + // Tags expected on logs + Tags *[]string + // ParsedFields validates structured log parsing + ParsedFields map[string]string + // TraceIDPresent validates trace correlation + TraceIDPresent bool +} + +func (suite *BaseSuite[Env]) AssertLogPipeline(args *TestLogPipelineArgs) { + prettyLogQuery := fmt.Sprintf("%s{%s}", args.Filter.Service, strings.Join(args.Filter.Tags, ",")) + + suite.Run("logPipeline "+prettyLogQuery, func() { + var expectedTags []*regexp.Regexp + if args.Expect.Tags != nil { + expectedTags = lo.Map(*args.Expect.Tags, func(tag string, _ int) *regexp.Regexp { return regexp.MustCompile(tag) }) + } + + var expectedMessage *regexp.Regexp + if args.Expect.Message != "" { + expectedMessage = regexp.MustCompile(args.Expect.Message) + } + + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + regexTags := lo.Map(args.Filter.Tags, func(tag string, _ int) *regexp.Regexp { + return regexp.MustCompile(tag) + }) + + logs, err := suite.Fakeintake.FilterLogs( + 
args.Filter.Service, + fakeintake.WithMatchingTags[*aggregator.Log](regexTags), + ) + // Can be replaced by require.NoErrorf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NoErrorf(c, err, "Failed to query fake intake") { + return + } + // Can be replaced by require.NoEmptyf(…) once https://github.com/stretchr/testify/pull/1481 is merged + if !assert.NotEmptyf(c, logs, "No `%s` logs yet", prettyLogQuery) { + return + } + + // Check minimum count + if args.Expect.MinCount > 0 { + assert.GreaterOrEqualf(c, len(logs), args.Expect.MinCount, + "Expected at least %d logs for `%s`, got %d", args.Expect.MinCount, prettyLogQuery, len(logs)) + } + + latestLog := logs[len(logs)-1] + + // Check tags + if expectedTags != nil { + err := assertTags(latestLog.GetTags(), expectedTags, []*regexp.Regexp{}, false) + assert.NoErrorf(c, err, "Tags mismatch on `%s`", prettyLogQuery) + } + + // Check status + if args.Expect.Status != "" { + assert.Equalf(c, args.Expect.Status, latestLog.Status, + "Log status mismatch on `%s`: expected %s, got %s", prettyLogQuery, args.Expect.Status, latestLog.Status) + } + + // Check message + if expectedMessage != nil { + assert.Truef(c, expectedMessage.MatchString(latestLog.Message), + "Log message `%s` doesn't match pattern `%s`", latestLog.Message, args.Expect.Message) + } + + // Check parsed fields (for structured logs) + // Note: ParsedFields validation would require accessing the parsed log structure + // which may be implementation-specific. Skipping for now. 
+ _ = args.Expect.ParsedFields // Avoid unused variable error + + // Check trace correlation + if args.Expect.TraceIDPresent { + ddTags := strings.Join(latestLog.GetTags(), ",") + assert.Regexpf(c, `dd\.trace_id:[[:xdigit:]]+`, ddTags, + "trace_id not found in log tags for `%s`", prettyLogQuery) + } + + }, 2*time.Minute, 10*time.Second, "Failed finding `%s` logs with expected pipeline processing", prettyLogQuery) + }) +} + +type TestAgentHealthArgs struct { + // CheckEndpoints validates agent status endpoints are accessible + CheckEndpoints bool + // CheckComponents validates specific agent components are ready + CheckComponents []string + // ExpectedVersion validates agent version + ExpectedVersion string +} + +func (suite *BaseSuite[Env]) AssertAgentHealth(args *TestAgentHealthArgs) { + suite.Run("agentHealth", func() { + suite.EventuallyWithTf(func(collect *assert.CollectT) { + c := &myCollectT{ + CollectT: collect, + errors: []error{}, + } + // To enforce the use of myCollectT instead + collect = nil //nolint:ineffassign + + // Check that we're receiving any data from the agent (indicates it's running) + metrics, err := suite.Fakeintake.GetMetricNames() + if !assert.NoErrorf(c, err, "Failed to query metrics from fake intake") { + return + } + + assert.NotEmptyf(c, metrics, "No metrics received from agent - agent may not be healthy") + + // Check for datadog.agent.started metric (indicates successful agent startup) + startedMetrics, err := suite.Fakeintake.FilterMetrics("datadog.agent.started") + if err == nil && len(startedMetrics) > 0 { + suite.T().Logf("Agent started metric found - agent is healthy") + } + + // If specific components requested, check for their metrics + for _, component := range args.CheckComponents { + componentMetricPrefix := fmt.Sprintf("datadog.%s.", component) + componentMetrics := lo.Filter(metrics, func(metric string, _ int) bool { + return strings.HasPrefix(metric, componentMetricPrefix) + }) + assert.NotEmptyf(c, componentMetrics, + 
"No metrics found for component `%s` - component may not be running", component)
+			}
+
+		}, 5*time.Minute, 10*time.Second, "Agent health check failed")
+	})
+}
+
+type TestResilienceScenarioArgs struct {
+	// ScenarioName identifies the scenario; it is used in the subtest name and in log/assertion messages
+	ScenarioName string
+	// TriggerFunc triggers the failure scenario; it is skipped when nil
+	TriggerFunc func() error
+	// RecoveryFunc triggers recovery (optional); it is skipped when nil
+	RecoveryFunc func() error
+	// ValidateFunc validates that the system recovered; it is polled every 10s until RecoveryTimeout elapses
+	ValidateFunc func(*assert.CollectT)
+	// RecoveryTimeout is the time to wait for recovery; zero selects the 2-minute default
+	RecoveryTimeout time.Duration
+}
+
+func (suite *BaseSuite[Env]) AssertResilienceScenario(args *TestResilienceScenarioArgs) {
+	suite.Run("resilience_"+args.ScenarioName, func() {
+		// Trigger the failure scenario; a trigger error aborts the subtest
+		if args.TriggerFunc != nil {
+			err := args.TriggerFunc()
+			suite.Require().NoErrorf(err, "Failed to trigger resilience scenario: %s", args.ScenarioName)
+			suite.T().Logf("Triggered resilience scenario: %s", args.ScenarioName)
+		}
+
+		// Fixed 5s pause (even when TriggerFunc is nil) so the failure can take effect
+		time.Sleep(5 * time.Second)
+
+		// Trigger recovery if specified; a recovery error aborts the subtest
+		if args.RecoveryFunc != nil {
+			err := args.RecoveryFunc()
+			suite.Require().NoErrorf(err, "Failed to trigger recovery for scenario: %s", args.ScenarioName)
+			suite.T().Logf("Triggered recovery for scenario: %s", args.ScenarioName)
+		}
+
+		// Validate recovery, falling back to a 2-minute timeout when none was given
+		recoveryTimeout := args.RecoveryTimeout
+		if recoveryTimeout == 0 {
+			recoveryTimeout = 2 * time.Minute
+		}
+
+		suite.EventuallyWithTf(func(collect *assert.CollectT) {
+			if args.ValidateFunc != nil { // a nil ValidateFunc means nothing is asserted here
+				args.ValidateFunc(collect)
+			}
+		}, recoveryTimeout, 10*time.Second, "Recovery validation failed for scenario: %s", args.ScenarioName)
+
+		suite.T().Logf("Successfully recovered from resilience scenario: %s", args.ScenarioName)
+	})
+}
+
+// AssertECSTasksReady waits for all ECS services and tasks in the given cluster
+// to be in RUNNING state. This should be called as the first test (Test00UpAndRunning)
+// in each suite to ensure infrastructure is ready before other tests run.
+func (suite *BaseSuite[Env]) AssertECSTasksReady(ecsClusterName string) {
+	ctx := suite.T().Context()
+
+	cfg, err := awsconfig.LoadDefaultConfig(ctx)
+	suite.Require().NoErrorf(err, "Failed to load AWS config")
+
+	client := awsecs.NewFromConfig(cfg)
+
+	suite.Run("ECS tasks are ready", func() {
+		suite.EventuallyWithTf(func(c *assert.CollectT) {
+			var initToken string // sentinel: its address marks "first page not requested yet"
+			for nextToken := &initToken; nextToken != nil; { // loop until the API returns no NextToken
+				if nextToken == &initToken {
+					nextToken = nil // the first request is sent without a token
+				}
+
+				servicesList, err := client.ListServices(ctx, &awsecs.ListServicesInput{
+					Cluster:    &ecsClusterName,
+					MaxResults: pointer.Ptr(int32(10)), // Because `DescribeServices` takes at most 10 services in input
+					NextToken:  nextToken,
+				})
+				if !assert.NoErrorf(c, err, "Failed to list ECS services") {
+					return // no page token to continue with; the whole check is retried on the next tick
+				}
+
+				nextToken = servicesList.NextToken
+
+				servicesDescription, err := client.DescribeServices(ctx, &awsecs.DescribeServicesInput{
+					Cluster:  &ecsClusterName,
+					Services: servicesList.ServiceArns,
+				})
+				if !assert.NoErrorf(c, err, "Failed to describe ECS services %v", servicesList.ServiceArns) {
+					continue // nextToken is already advanced, so this moves on to the next page
+				}
+
+				for _, serviceDescription := range servicesDescription.Services {
+					assert.NotZerof(c, serviceDescription.DesiredCount, "ECS service %s has no task", *serviceDescription.ServiceName)
+
+					for nextToken := &initToken; nextToken != nil; { // shadows the outer nextToken; pages through this service's tasks
+						if nextToken == &initToken {
+							nextToken = nil
+						}
+
+						tasksList, err := client.ListTasks(ctx, &awsecs.ListTasksInput{
+							Cluster:       &ecsClusterName,
+							ServiceName:   serviceDescription.ServiceName,
+							DesiredStatus: awsecstypes.DesiredStatusRunning,
+							MaxResults:    pointer.Ptr(int32(100)), // Because `DescribeTasks` takes at most 100 tasks in input
+							NextToken:     nextToken,
+						})
+						if !assert.NoErrorf(c, err, "Failed to list ECS tasks for service %s", *serviceDescription.ServiceName) {
+							break // give up on this service only; remaining services are still checked
+						}
+
+						nextToken = tasksList.NextToken
+
+						tasksDescription, err := client.DescribeTasks(ctx, &awsecs.DescribeTasksInput{
+							Cluster: &ecsClusterName,
+							Tasks:   tasksList.TaskArns,
+						})
+						if !assert.NoErrorf(c, err, "Failed to describe ECS tasks %v", tasksList.TaskArns) {
+							continue
+						}
+
+						for _, taskDescription := range tasksDescription.Tasks {
+							assert.Equalf(c, string(awsecstypes.DesiredStatusRunning), *taskDescription.LastStatus,
+								"Task %s of service %s is not running", *taskDescription.TaskArn, *serviceDescription.ServiceName)
+							assert.NotEqualf(c, awsecstypes.HealthStatusUnhealthy, taskDescription.HealthStatus,
+								"Task %s of service %s is unhealthy", *taskDescription.TaskArn, *serviceDescription.ServiceName)
+						}
+					}
+				}
+			}
+		}, 15*time.Minute, 10*time.Second, "Not all tasks became ready in time.")
+	})
+}
diff --git a/test/new-e2e/tests/ecs/checks_test.go b/test/new-e2e/tests/ecs/checks_test.go
new file mode 100644
index 00000000000000..163b0220f6d7f9
--- /dev/null
+++ b/test/new-e2e/tests/ecs/checks_test.go
@@ -0,0 +1,315 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2025-present Datadog, Inc.
+
+package ecs
+
+import (
+	"regexp"
+	"strings"
+	"testing"
+
+	"github.com/DataDog/datadog-agent/test/e2e-framework/components/datadog/apps"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/testing/environments"
+
+	scenecs "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/ecs"
+	scenfi "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/fakeintake"
+	provecs "github.com/DataDog/datadog-agent/test/e2e-framework/testing/provisioners/aws/ecs"
+)
+
+type ecsChecksSuite struct {
+	BaseSuite[environments.ECS]
+	ecsClusterName string
+}
+
+func TestECSChecksSuite(t *testing.T) {
+	t.Parallel()
+	e2e.Run(t, &ecsChecksSuite{}, e2e.WithProvisioner(provecs.Provisioner(
+		provecs.WithRunOptions(
+			scenecs.WithECSOptions(
+				scenecs.WithFargateCapacityProvider(),
+				scenecs.WithLinuxNodeGroup(),
+			),
+			scenecs.WithFakeIntakeOptions(
+				scenfi.WithRetentionPeriod("31m"),
+			),
+			scenecs.WithTestingWorkload(),
+		),
+	)))
+}
+
+func (suite *ecsChecksSuite) SetupSuite() {
+	suite.BaseSuite.SetupSuite()
+	suite.Fakeintake = suite.Env().FakeIntake.Client()
+	suite.ecsClusterName = suite.Env().ECSCluster.ClusterName
+	suite.ClusterName = suite.Env().ECSCluster.ClusterName
+}
+
+func (suite *ecsChecksSuite) TestNginxECS() {
+	// `nginx` check is configured via docker labels
+	// Test it is properly scheduled on the EC2 launch type (metric first, then log)
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "nginx.net.request_per_s",
+			Tags: []string{"^ecs_launch_type:ec2$"},
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:ecs-.*-nginx-ec2-`,
+				`^docker_image:ghcr\.io/datadog/apps-nginx-server:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:nginx$`,
+				`^ecs_launch_type:ec2$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-nginx-ec2$`,
+				`^git\.commit\.sha:[[:xdigit:]]{40}$`,                                      // org.opencontainers.image.revision docker image label
+				`^git\.repository_url:https://github\.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/apps-nginx-server$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^nginx_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^region:us-east-1$`,
+				`^service_arn:`,
+				`^short_image:apps-nginx-server$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-nginx-ec2$`,
+				`^task_name:.*-nginx-ec2$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+			AcceptUnexpectedTags: true,
+		},
+	})
+
+	suite.AssertLog(&TestLogArgs{
+		Filter: TestLogFilterArgs{
+			Service: "nginx",
+			Tags:    []string{"^ecs_launch_type:ec2$"},
+		},
+		Expect: TestLogExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:ecs-.*-nginx-ec2-`,
+				`^docker_image:ghcr\.io/datadog/apps-nginx-server:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:nginx$`,
+				`^ecs_launch_type:ec2$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-nginx-ec2$`,
+				`^env:e2e-test$`,
+				`^git\.commit\.sha:[[:xdigit:]]{40}$`,                                      // org.opencontainers.image.revision docker image label
+				`^git\.repository_url:https://github\.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/apps-nginx-server$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^region:us-east-1$`,
+				`^service:nginx$`,
+				`^service_arn:`,
+				`^short_image:apps-nginx-server$`,
+				`^task_arn:arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-nginx-ec2$`,
+				`^task_name:.*-nginx-ec2$`,
+				`^task_version:[[:digit:]]+$`,
+				`^version:1\.0$`,
+			},
+			Message: `GET / HTTP/1\.1`,
+		},
+	})
+}
+
+func (suite *ecsChecksSuite) TestRedisECS() {
+	// `redis` check is auto-configured due to image name
+	// Test it is properly scheduled on the EC2 launch type (metric first, then log)
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "redis.net.instantaneous_ops_per_sec",
+			Tags: []string{"^ecs_launch_type:ec2$"},
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:ecs-.*-redis-ec2-`,
+				`^docker_image:ghcr\.io/datadog/redis:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:redis$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-redis-ec2$`,
+				`^ecs_launch_type:ec2$`,
+				`^git\.commit\.sha:[[:xdigit:]]{40}$`,                                      // org.opencontainers.image.revision docker image label
+				`^git\.repository_url:https://github\.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/redis$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^region:us-east-1$`,
+				`^service_arn:`,
+				`^short_image:redis$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-redis-ec2$`,
+				`^task_name:.*-redis-ec2$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+			AcceptUnexpectedTags: true,
+		},
+	})
+
+	suite.AssertLog(&TestLogArgs{
+		Filter: TestLogFilterArgs{
+			Service: "redis",
+			Tags:    []string{"^ecs_launch_type:ec2$"},
+		},
+		Expect: TestLogExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:ecs-.*-redis-ec2-`,
+				`^docker_image:ghcr\.io/datadog/redis:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:redis$`,
+				`^ecs_launch_type:ec2$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-redis-ec2$`,
+				`^env:e2e-test$`,
+				`^git\.commit\.sha:[[:xdigit:]]{40}$`,                                      // org.opencontainers.image.revision docker image label
+				`^git\.repository_url:https://github\.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/redis$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^region:us-east-1$`,
+				`^service:redis$`,
+				`^service_arn:`,
+				`^short_image:redis$`,
+				`^task_arn:arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-redis-ec2$`,
+				`^task_name:.*-redis-ec2$`,
+				`^task_version:[[:digit:]]+$`,
+				`^version:1\.0$`,
+			},
+			Message: `Accepted`,
+		},
+	})
+}
+
+func (suite *ecsChecksSuite) TestNginxFargate() {
+	// `nginx` check is configured via docker labels
+	// Test it is properly scheduled on the Fargate launch type
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "nginx.net.request_per_s",
+			Tags: []string{"^ecs_launch_type:fargate$"},
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^availability_zone:`,
+				`^availability-zone:`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:nginx$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:nginx$`,
+				`^ecs_launch_type:fargate$`,
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/apps-nginx-server$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^nginx_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^region:us-east-1$`,
+				`^service_arn:`,
+				`^short_image:apps-nginx-server$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-nginx-fg$`,
+				`^task_name:.*-nginx-fg$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+			AcceptUnexpectedTags: true,
+		},
+	})
+}
+
+func (suite *ecsChecksSuite) TestRedisFargate() {
+	// `redis` check is auto-configured due to image name
+	// Test it is properly scheduled on the Fargate launch type
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "redis.net.instantaneous_ops_per_sec",
+			Tags: []string{"^ecs_launch_type:fargate$"},
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^availability_zone:`,
+				`^availability-zone:`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:redis$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:redis$`,
+				`^ecs_launch_type:fargate$`,
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/redis$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^region:us-east-1$`,
+				`^service_arn:`,
+				`^short_image:redis$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-redis-fg$`,
+				`^task_name:.*-redis-fg$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+			AcceptUnexpectedTags: true,
+		},
+	})
+}
+
+func (suite *ecsChecksSuite) TestPrometheus() {
+	// Test Prometheus check (no launch-type filter; AcceptUnexpectedTags is left unset)
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "prometheus.prom_gauge",
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:ecs-.*-prometheus-ec2-`,
+				`^docker_image:ghcr\.io/datadog/apps-prometheus:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:prometheus$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-prometheus$`,
+				`^endpoint:http://.*:8080/metrics$`,
+				`^git\.commit\.sha:[[:xdigit:]]{40}$`,                                      // org.opencontainers.image.revision docker image label
+				`^git\.repository_url:https://github\.com/DataDog/test-infra-definitions$`, // org.opencontainers.image.source docker image label
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/apps-prometheus$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^region:us-east-1$`,
+				`^series:`,
+				`^service_arn:`,
+				`^short_image:apps-prometheus$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-prometheus-ec2$`,
+				`^task_name:.*-prometheus-ec2$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+		},
+	})
+}
diff --git a/test/new-e2e/tests/ecs/managed_test.go b/test/new-e2e/tests/ecs/managed_test.go
new file mode 100644
index 00000000000000..68e276546d6c1d
--- /dev/null
+++ b/test/new-e2e/tests/ecs/managed_test.go
@@ -0,0 +1,110 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2025-present Datadog, Inc.
+
+package ecs
+
+import (
+	"regexp"
+	"testing"
+	"time"
+
+	"github.com/DataDog/datadog-agent/test/e2e-framework/components/datadog/apps/tracegen"
+	ecsComp "github.com/DataDog/datadog-agent/test/e2e-framework/components/ecs"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/resources/aws"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/testing/environments"
+	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
+	"github.com/stretchr/testify/assert"
+
+	scenecs "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/ecs"
+	scenfi "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/fakeintake"
+	provecs "github.com/DataDog/datadog-agent/test/e2e-framework/testing/provisioners/aws/ecs"
+)
+
+type ecsManagedSuite struct {
+	BaseSuite[environments.ECS]
+	ecsClusterName string // set in SetupSuite from the provisioned ECS cluster
+}
+
+func TestECSManagedSuite(t *testing.T) {
+	t.Parallel()
+	e2e.Run(t, &ecsManagedSuite{}, e2e.WithProvisioner(provecs.Provisioner(
+		provecs.WithRunOptions(
+			scenecs.WithECSOptions(
+				scenecs.WithManagedInstanceNodeGroup(),
+			),
+			scenecs.WithFakeIntakeOptions(
+				scenfi.WithRetentionPeriod("31m"),
+			),
+			scenecs.WithTestingWorkload(),
+			scenecs.WithWorkloadApp(func(e aws.Environment, clusterArn pulumi.StringInput) (*ecsComp.Workload, error) {
+				return tracegen.EcsAppDefinition(e, clusterArn) // tracegen workload; presumably the trace source for the trace test below
+			}),
+		),
+	)))
+}
+
+func (suite *ecsManagedSuite) SetupSuite() {
+	suite.BaseSuite.SetupSuite()
+	suite.Fakeintake = suite.Env().FakeIntake.Client()
+	suite.ecsClusterName = suite.Env().ECSCluster.ClusterName
+	suite.ClusterName = suite.Env().ECSCluster.ClusterName
+}
+
+func (suite *ecsManagedSuite) Test00UpAndRunning() {
+	suite.AssertECSTasksReady(suite.ecsClusterName)
+}
+
+func (suite *ecsManagedSuite) TestManagedInstanceAgentHealth() {
+	// Test agent health on managed instances
+	suite.Run("Managed instance agent health", func() {
+		// Check basic agent health (agent is running and sending metrics)
+		// Component-specific telemetry metrics (datadog.core.*, datadog.metadata.*)
+		// are not reliably sent to FakeIntake, so we don't check for them
+		suite.AssertAgentHealth(&TestAgentHealthArgs{})
+	})
+}
+
+func (suite *ecsManagedSuite) TestManagedInstanceTraceCollection() {
+	// Test trace collection from managed instances
+	suite.Run("Managed instance trace collection", func() {
+		// ECS metadata on traces is bundled in _dd.tags.container within TracerPayload.Tags
+		clusterNamePattern := regexp.MustCompile(`ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName)) // unanchored: matched anywhere in the bundled tag string
+		taskArnPattern := regexp.MustCompile(`task_arn:`)
+		containerNamePattern := regexp.MustCompile(`container_name:`)
+
+		suite.EventuallyWithTf(func(c *assert.CollectT) {
+			traces, err := suite.Fakeintake.GetTraces()
+			if !assert.NoErrorf(c, err, "Failed to query traces") {
+				return
+			}
+			if !assert.NotEmptyf(c, traces, "No traces received yet") {
+				return
+			}
+
+			// Look for one tracer payload whose _dd.tags.container value matches all three patterns
+			found := false
+			for _, trace := range traces {
+				for _, tracerPayload := range trace.TracerPayloads {
+					containerTags, exists := tracerPayload.Tags["_dd.tags.container"]
+					if !exists {
+						continue // payload carries no bundled container tags
+					}
+					if clusterNamePattern.MatchString(containerTags) &&
+						taskArnPattern.MatchString(containerTags) &&
+						containerNamePattern.MatchString(containerTags) {
+						found = true
+						break
+					}
+				}
+				if found {
+					break // one matching payload is enough
+				}
+			}
+
+			assert.Truef(c, found, "No traces with ECS metadata (cluster_name, task_arn, container_name) found in _dd.tags.container")
+		}, 3*time.Minute, 10*time.Second, "Managed instance trace collection validation failed")
+	})
+}
diff --git a/test/new-e2e/tests/ecs/platform_test.go b/test/new-e2e/tests/ecs/platform_test.go
new file mode 100644
index 00000000000000..b997c642f7804f
--- /dev/null
+++ b/test/new-e2e/tests/ecs/platform_test.go
@@ -0,0 +1,208 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the
Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2025-present Datadog, Inc.
+
+package ecs
+
+import (
+	"regexp"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/DataDog/datadog-agent/test/e2e-framework/components/datadog/apps"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e"
+	"github.com/DataDog/datadog-agent/test/e2e-framework/testing/environments"
+	"github.com/DataDog/datadog-agent/test/fakeintake/aggregator"
+	fakeintake "github.com/DataDog/datadog-agent/test/fakeintake/client"
+	"github.com/stretchr/testify/assert"
+
+	scenecs "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/ecs"
+	scenfi "github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/fakeintake"
+	provecs "github.com/DataDog/datadog-agent/test/e2e-framework/testing/provisioners/aws/ecs"
+)
+
+type ecsPlatformSuite struct {
+	BaseSuite[environments.ECS]
+	ecsClusterName string
+}
+
+func TestECSPlatformSuite(t *testing.T) {
+	t.Parallel()
+	e2e.Run(t, &ecsPlatformSuite{}, e2e.WithProvisioner(provecs.Provisioner(
+		provecs.WithRunOptions(
+			scenecs.WithECSOptions(
+				scenecs.WithFargateCapacityProvider(),
+				scenecs.WithLinuxNodeGroup(),
+				scenecs.WithLinuxBottleRocketNodeGroup(),
+				scenecs.WithWindowsNodeGroup(),
+			),
+			scenecs.WithFakeIntakeOptions(
+				scenfi.WithRetentionPeriod("31m"),
+			),
+			scenecs.WithTestingWorkload(),
+		),
+	)))
+}
+
+func (suite *ecsPlatformSuite) SetupSuite() {
+	suite.BaseSuite.SetupSuite()
+	suite.Fakeintake = suite.Env().FakeIntake.Client()
+	suite.ecsClusterName = suite.Env().ECSCluster.ClusterName
+	suite.ClusterName = suite.Env().ECSCluster.ClusterName
+}
+
+func (suite *ecsPlatformSuite) Test00UpAndRunning() {
+	suite.AssertECSTasksReady(suite.ecsClusterName)
+}
+
+func (suite *ecsPlatformSuite) TestWindowsFargate() {
+	suite.AssertCheckRun(&TestCheckRunArgs{
+		Filter: TestCheckRunFilterArgs{
+			Name: "http.can_connect",
+			Tags: []string{
+				"^ecs_launch_type:fargate$",
+				"^container_name:aspnetsample$",
+			},
+		},
+		Expect: TestCheckRunExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^availability_zone:`,
+				`^availability-zone:`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:aspnetsample$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:aspnetsample$`,
+				`^ecs_launch_type:fargate$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-aspnetsample-fg$`,
+				`^image_id:sha256:`,
+				`^image_name:mcr\.microsoft\.com/dotnet/samples$`,
+				`^image_tag:aspnetapp-nanoserver-ltsc2022$`,
+				`^region:us-east-1$`,
+				`^service_arn:`,
+				`^short_image:samples$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-aspnet-fg$`,
+				`^task_name:.*-aspnet-fg$`,
+				`^task_version:[[:digit:]]+$`,
+				`^url:`,
+			},
+			AcceptUnexpectedTags: true,
+		},
+	})
+
+	// Test container check on the same Windows Fargate container (exact tag set expected)
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "container.cpu.usage",
+			Tags: []string{
+				"^ecs_container_name:aspnetsample$",
+			},
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^availability_zone:`,
+				`^availability-zone:`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:aspnetsample$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:aspnetsample$`,
+				`^ecs_launch_type:fargate$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-aspnetsample-fg$`,
+				`^image_id:sha256:`,
+				`^image_name:mcr\.microsoft\.com/dotnet/samples$`,
+				`^image_tag:aspnetapp-nanoserver-ltsc2022$`,
+				`^region:us-east-1$`,
+				`^runtime:ecsfargate$`,
+				`^service_arn:`,
+				`^short_image:samples$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-aspnet-fg$`,
+				`^task_name:.*-aspnet-fg$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+		},
+	})
+}
+
+func (suite *ecsPlatformSuite) TestCPU() {
+	// Test CPU metrics of the stress-ng container: expected tags and a bounded value
+	suite.AssertMetric(&TestMetricArgs{
+		Filter: TestMetricFilterArgs{
+			Name: "container.cpu.usage",
+			Tags: []string{
+				"^ecs_container_name:stress-ng$",
+			},
+		},
+		Expect: TestMetricExpectArgs{
+			Tags: &[]string{
+				`^aws_account:[[:digit:]]{12}$`,
+				`^cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^cluster_arn:arn:aws:ecs:us-east-1:[[:digit:]]{12}:cluster/` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^container_id:`,
+				`^container_name:ecs-.*-stress-ng-ec2-`,
+				`^docker_image:ghcr\.io/datadog/apps-stress-ng:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`,
+				`^ecs_container_name:stress-ng$`,
+				`^ecs_service:` + regexp.QuoteMeta(strings.TrimSuffix(suite.ecsClusterName, "-ecs")) + `-stress-ng$`,
+				`^git\.commit\.sha:[[:xdigit:]]{40}$`,
+				`^git\.repository_url:https://github\.com/DataDog/test-infra-definitions$`,
+				`^image_id:sha256:`,
+				`^image_name:ghcr\.io/datadog/apps-stress-ng$`,
+				`^image_tag:` + regexp.QuoteMeta(apps.Version) + `$`,
+				`^region:us-east-1$`,
+				`^runtime:docker$`,
+				`^service_arn:`,
+				`^short_image:apps-stress-ng$`,
+				`^task_arn:`,
+				`^task_definition_arn:`,
+				`^task_family:.*-stress-ng-ec2$`,
+				`^task_name:.*-stress-ng-ec2$`,
+				`^task_version:[[:digit:]]+$`,
+			},
+			Value: &TestMetricExpectValueArgs{
+				Max: 160000000,
+				Min: 120000000,
+			},
+		},
+	})
+}
+
+func (suite *ecsPlatformSuite) TestContainerLifecycle() {
+	// Test that container lifecycle events are properly tracked
+	suite.Run("Container lifecycle tracking", func() {
+		// Verify that running containers are reporting metrics
+		suite.EventuallyWithTf(func(c *assert.CollectT) {
+			metrics, err := suite.Fakeintake.FilterMetrics(
+				"container.cpu.usage",
+				fakeintake.WithMatchingTags[*aggregator.MetricSeries]([]*regexp.Regexp{
+					regexp.MustCompile(`^ecs_cluster_name:` + regexp.QuoteMeta(suite.ecsClusterName) + `$`),
+				}),
+			)
+			assert.NoErrorf(c, err, "Failed to query metrics")
+			assert.NotEmptyf(c, metrics, "No container metrics found - containers may not be running")
+
+			// Count distinct container_id tags; at least 3 different containers must be reporting
+			containerIDs := make(map[string]bool)
+			for _, metric := range metrics {
+				for _, tag := range metric.GetTags() {
+					if strings.HasPrefix(tag, "container_id:") {
+						containerIDs[tag] = true
+					}
+				}
+			}
+			assert.GreaterOrEqualf(c, len(containerIDs), 3,
+				"Expected metrics from at least 3 containers, got %d", len(containerIDs))
+
+		}, 3*time.Minute, 10*time.Second, "Container lifecycle tracking validation failed")
+	})
+}