diff --git a/README.md b/README.md
index 0337acc..d40bac3 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,10 @@
 # Terraform modules by FastRobot
 
 This is a collection of modules in use by FastRobot. We typically call them
-via `terragrunt`. Each top-level directory is its own self-contained module.
+via `terragrunt`. Eventually we'll have some common utility modules, but for now each top-level directory is standalone and defaults to the cheapest possible way to accomplish a task.
 
 ## Modules
 
-* `elk` - stands up an AWS ES endpoint and an instance running logstash
\ No newline at end of file
+* `atlantis` - opinionated wrapper for the official atlantis terraform module, setting up github webhooks and auth for selected repos, plus some ALB authentication schemes through OIDC providers.
+* `elk` - stands up an AWS ES endpoint and an instance running logstash.
+* `monitoring` - prometheus with various exporters as ECS tasks remote-writing to an Amazon Managed Prometheus central collector, fronted by your choice of three grafanas: Amazon Managed Grafana, Grafana Cloud, or open-source Grafana as an ECS task.
diff --git a/monitoring/README.md b/monitoring/README.md
index 20b81d6..5b7296f 100644
--- a/monitoring/README.md
+++ b/monitoring/README.md
@@ -1,25 +1,46 @@
 # FR approved Monitoring stack
 
-Sets up:
+Conditionally sets up most mixes of:
+
 * AWS Managed Prometheus backend (done)
-* ECS cluster
-  * Prometheus task
-    * prometheus server
-    * prometheus-sdconfig-reloader
-    * node_exporter (daemonset)
-    * blackbox
+* ECS fargate cluster (done)
+  * Prometheus scraper task (done)
+    * prometheus server (done)
+    * prometheus-sdconfig-reloader (done)
+    * node_exporter (daemonset) TODO
+    * blackbox_exporter TODO
     * cadvisor (daemonset)
-    * grafana (optional!)
 * Grafana
-  * in ecs, cheapest?
-  * via aws managed (default)
-  * via grafana cloud (fanciest)
+  * As an ecs service, maybe the cheapest?
+  * via aws managed (default) (done)
+  * via grafana cloud (fanciest, TODO)
+
+We're using an AMP workspace as the central collection point for as many prometheus scrapers and their assorted exporters as you need. Use your choice of grafana to view the collected metrics, plus cloudwatch, opensearch, and any other datasource you want to configure.
+
+## AMP Metric Retention
+
+From https://docs.aws.amazon.com/prometheus/latest/userguide/what-is-Amazon-Managed-Service-Prometheus.html
+
+> Metrics ingested into a workspace are stored for 150 days, and are then automatically deleted.
 
 ## AMP Ruler and AlertManager
+
+This module also allows you to import a collection of yaml files as recording and alerting rules. We store these in terragrunt's live repo structure, each namespace in its own file, but you can pass any map that matches the structure as linked below.
+
 * use https://docs.aws.amazon.com/prometheus/latest/userguide/AMP-Ruler.html to pass a map of names of yaml strings in by namespace
 * Expects prometheus standard rule and alert configuration
 * express the inputs however you like. Examples include templates through terragrunt or direct yaml conversion of hcl types in the calling module
 
 - Roughly based off of the structure from https://github.com/aws-samples/prometheus-for-ecs
-# Developing for
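+
+For example, from a terragrunt unit (a hypothetical sketch; `rule_groups` is this module's input, and the file paths are made up):
+
+```hcl
+inputs = {
+  rule_groups = {
+    # namespace -> prometheus rule-file yaml, one file per namespace
+    infra = file("rules/infra.yaml")
+    apps  = yamlencode(local.app_rules) # or convert hcl types straight to yaml
+  }
+}
+```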
+
+## All the ways to run grafana
+
+Depending on your complexity/scale/money tradeoffs you may have a clear preference for one of these grafana interfaces:
+
+### Amazon Managed Grafana
+
+https://aws.amazon.com/grafana/
+
+This one is presumably the easiest to run going forward, as it's completely hosted by Amazon, but it charges per user/month ($9 per admin, $5 per lesser user), so it's probably best suited to very small teams. Because it's charged per user, you have to set up some form of enterprise auth. I defaulted to AWS SSO, which was a complicated thing I didn't want to take on AND has org-wide implications I had to set up with manual intervention.
+
+### Open Source Grafana in ECS Fargate
+
+Not working yet, but next, as I suspect it's the cheapest AND most flexible way to run the grafana I'm used to. I'd prefer to use grafana's auth-against-github to manage access, and AFAICT you can't do that against AMG.
+
+### Grafana Cloud
+
+https://grafana.com/products/cloud/
+
+Free forever for 3 users, and probably easy to point at the AMP/ES/opensearch datasources. It will probably be cutting-edge grafana, so it's worth exploring the premium tier, which is a dollar cheaper than AMG. Not working yet.
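+
+However it's hosted, the flavor is chosen with this module's enable flags. A minimal terragrunt sketch (hypothetical values; the variable names are from variables.tf):
+
+```hcl
+inputs = {
+  enable                 = true
+  enable_amp             = true
+  enable_prometheus      = true
+  enable_grafana_managed = false # skip AMG's per-user pricing
+  enable_grafana_ecs     = true  # run grafana-oss as an ECS service instead
+}
+```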
diff --git a/monitoring/data.tf b/monitoring/data.tf
new file mode 100644
index 0000000..e16e84b
--- /dev/null
+++ b/monitoring/data.tf
@@ -0,0 +1,12 @@
+data "aws_region" "current" {}
+
+data "aws_subnets" "private" {
+  filter {
+    name   = "vpc-id"
+    values = [var.vpc_id]
+  }
+}
+
+#output "subnet_cidr_blocks" {
+#  value = [for s in data.aws_subnet.example : s.cidr_block]
+#}
diff --git a/monitoring/ecs.tf b/monitoring/ecs.tf
new file mode 100644
index 0000000..a8cce05
--- /dev/null
+++ b/monitoring/ecs.tf
@@ -0,0 +1,35 @@
+locals {
+  create_ecs = alltrue([var.enable, anytrue([var.enable_grafana_ecs, var.enable_prometheus])])
+  # also local.create_prometheus from main.tf
+}
+
+# ECS cluster for monitoring tasks
+module "ecs" {
+  count   = local.create_ecs ? 1 : 0
+  source  = "terraform-aws-modules/ecs/aws"
+  version = "3.5.0"
+  name    = "${local.full_name}-ecs"
+
+  container_insights = true
+
+  capacity_providers = ["FARGATE", "FARGATE_SPOT"]
+
+  default_capacity_provider_strategy = [
+    {
+      capacity_provider = "FARGATE_SPOT"
+    }
+  ]
+
+  tags = local.tags
+}
+
+# alternatively, set up the cloudwatch agent to scrape and remote-write to AMP
+# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-Prometheus.html
+
+# define container definitions for a prometheus and sd sidecar and other exporters
+# https://github.com/cloudposse/terraform-aws-ecs-container-definition
+
+
+
+# combine multiple above defs into
+# https://github.com/cloudposse/terraform-aws-ecs-alb-service-task
diff --git a/monitoring/grafana.tf b/monitoring/grafana.tf
new file mode 100644
index 0000000..e07c15e
--- /dev/null
+++ b/monitoring/grafana.tf
@@ -0,0 +1,297 @@
+# all the grafana related tasks, doing any combination of
+# 1) optionally configuring the Amazon Managed Grafana
+#    * satisfies a desire to "keep everything in AWS"
+#    * Free for 90 days, then $9/mo per admin (min 1) and $5/mo per read-only user (min 0)
+# 2) optionally configuring an ECS task running `grafana/grafana-oss`
+#    * update or rewrite of https://github.com/56kcloud/terraform-grafana
+# 3) optionally configuring grafana cloud
+
+locals {
+  create_grafana_managed = alltrue([var.enable, var.enable_grafana_managed])
+  create_grafana_ecs     = alltrue([var.enable, var.enable_grafana_ecs])
+
+  # local.full_name is defined in main.tf
+  #   full_name = "${var.namespace}-${var.name}-${var.environment}"
+  # local.tags comes from main.tf
+}
+
+########################
+# Amazon Managed Grafana
+########################
+
+resource "aws_grafana_workspace" "grafana_managed" {
+  count = local.create_grafana_managed ? 1 : 0
+  # required
+  account_access_type      = var.grafana_managed_account_access_type
+  authentication_providers = var.grafana_managed_authentication_providers
+  permission_type          = var.grafana_managed_permission_type
+  # optional
+  data_sources              = var.grafana_managed_data_sources
+  description               = "tf managed for ${local.full_name}"
+  name                      = "${local.full_name}-grafana"
+  notification_destinations = ["SNS"] # seems like the only possible value
+  role_arn                  = aws_iam_role.grafana[0].arn
+  stack_set_name            = local.full_name
+}
+
+resource "aws_iam_role" "grafana" {
+  count              = local.create_grafana_managed ? 1 : 0
+  name               = "${local.full_name}-grafana"
+  assume_role_policy = data.aws_iam_policy_document.grafana-assume-policy-doc[0].json
+}
+
+data "aws_iam_policy_document" "grafana-assume-policy-doc" {
+  count = local.create_grafana_managed ? 1 : 0
+  statement {
+    actions = ["sts:AssumeRole"]
+    sid     = "AllowGrafanaService${var.environment}role"
+    principals {
+      identifiers = ["grafana.amazonaws.com"]
+      type        = "Service"
+    }
+  }
+}
+
+resource "aws_iam_policy" "grafana" {
+  count       = local.create_grafana_managed ? 1 : 0 # might expand that to also include creating grafana via ecs task
+  name        = "${local.full_name}-grafana"
+  description = "Policy for Managed Grafana"
+  policy      = data.aws_iam_policy_document.grafana-service-access[0].json
+}
+
+data "aws_iam_policy_document" "grafana-service-access" {
+  count = local.create_grafana_managed ? 1 : 0
+  # AMP
+  statement {
+    sid    = "ListAllPrometheusWorkspaces"
+    effect = "Allow"
+    actions = [
+      "aps:ListWorkspaces",
+    ]
+    resources = ["*"]
+  }
+  statement {
+    sid    = "PerPrometheusWorkspacesPermissions"
+    effect = "Allow"
+    actions = [
+      "aps:DescribeWorkspace",
+      "aps:QueryMetrics",
+      "aps:GetLabels",
+      "aps:GetSeries",
+      "aps:GetMetricMetadata"
+    ]
+    resources = ["*"] # this can be tighter
+  }
+  # OpenSearch (formerly elasticsearch)
+  statement {
+    sid    = "OpenSearchList"
+    effect = "Allow"
+    actions = [
+      "es:ESHttpGet",
+      "es:DescribeElasticsearchDomains",
+      "es:ListDomainNames"
+    ]
+    resources = ["*"]
+  }
+  statement {
+    sid = "OpenSearchPost"
+    actions = [
+      "es:ESHttpPost"
+    ]
+    resources = [
+      "arn:aws:es:*:*:domain/*/_msearch*",
+      "arn:aws:es:*:*:domain/*/_opendistro/_ppl"
+    ]
+  }
+  # SNS Alerting
+  statement {
+    sid       = "SNSPublish"
+    actions   = ["sns:Publish"]
+    resources = [var.sns_topic_arn]
+  }
+  # CloudWatch
+  statement {
+    sid = "AllowReadingMetricsFromCloudWatch"
+    actions = [
+      "cloudwatch:DescribeAlarmsForMetric",
+      "cloudwatch:DescribeAlarmHistory",
+      "cloudwatch:DescribeAlarms",
+      "cloudwatch:ListMetrics",
+      "cloudwatch:GetMetricStatistics",
+      "cloudwatch:GetMetricData"
+    ]
+    resources = ["*"]
+  }
+  statement {
+    sid = "AllowReadingLogsFromCloudWatch"
+    actions = [
+      "logs:DescribeLogGroups",
+      "logs:GetLogGroupFields",
+      "logs:StartQuery",
+      "logs:StopQuery",
+      "logs:GetQueryResults",
+      "logs:GetLogEvents"
+    ]
+    resources = ["*"]
+  }
+  statement {
+    sid = "AllowReadingTagsInstancesRegionsFromEC2"
+    actions = [
+      "ec2:DescribeTags",
+      "ec2:DescribeInstances",
+      "ec2:DescribeRegions"
+    ]
+    resources = ["*"]
+  }
+  statement {
+    sid       = "AllowReadingResourcesForTags"
+    actions   = ["tag:GetResources"]
+    resources = ["*"]
+  }
+}
+
+resource "aws_iam_role_policy_attachment" "grafana-service-attach" {
+  count      = local.create_grafana_managed ? 1 : 0
+  policy_arn = aws_iam_policy.grafana[0].arn
+  role       = aws_iam_role.grafana[0].name
+}
+
+# Done by hand: the AWS SSO signup; created a user and a grafana-admin group
+# (should also make a grafana-reader group).
+# Had to assign the new group to the account and give it a bunch of iam admin
+# stuff while making an aws-sso permission set.
+
+resource "aws_grafana_role_association" "aws_sso_admin_role" {
+  count        = local.create_grafana_managed ? 1 : 0
+  role         = "ADMIN"
+  group_ids    = [data.aws_identitystore_group.admins[0].group_id]
+  workspace_id = aws_grafana_workspace.grafana_managed[0].id
+}
+
+data "aws_ssoadmin_instances" "internal" {
+  count = local.create_grafana_managed ? 1 : 0
+}
+
+data "aws_identitystore_group" "admins" {
+  count             = local.create_grafana_managed ? 1 : 0
+  identity_store_id = tolist(data.aws_ssoadmin_instances.internal[0].identity_store_ids)[0]
+
+  filter {
+    attribute_path  = "DisplayName"
+    attribute_value = "grafana-admin"
+  }
+}
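+
+# A sketch of the matching read-only association (hypothetical until a
+# grafana-reader group and its data source exist; mirrors the ADMIN one above):
+#
+# resource "aws_grafana_role_association" "aws_sso_viewer_role" {
+#   count        = local.create_grafana_managed ? 1 : 0
+#   role         = "VIEWER"
+#   group_ids    = [data.aws_identitystore_group.readers[0].group_id]
+#   workspace_id = aws_grafana_workspace.grafana_managed[0].id
+# }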
+
+# regardless of which grafana method we use, do the following:
+# load the datastores (did so manually from the UI, sucked)
+# load some dashboards
+# * cAdvisor with filtering for ECS cluster/task https://grafana.com/grafana/dashboards/15200
+
+###########
+# Grafana on ECS (open source, self hosted)
+###########
+
+module "grafana_ecs_container_definition" {
+  count            = local.create_grafana_ecs ? 1 : 0
+  source           = "registry.terraform.io/cloudposse/ecs-container-definition/aws"
+  version          = "0.58.1"
+  container_name   = "${local.full_name}-grafana"
+  container_image  = var.grafana_ecs_container_image
+  container_memory = var.grafana_ecs_container_memory
+  secrets = [
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_ALLOW_SIGN_UP"
+      name      = "GF_AUTH_GITHUB_ALLOW_SIGN_UP"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_ALLOWED_ORGANIZATIONS"
+      name      = "GF_AUTH_GITHUB_ALLOWED_ORGANIZATIONS"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_API_URL"
+      name      = "GF_AUTH_GITHUB_API_URL"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_AUTH_URL"
+      name      = "GF_AUTH_GITHUB_AUTH_URL"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_CLIENT_ID"
+      name      = "GF_AUTH_GITHUB_CLIENT_ID"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_CLIENT_SECRET"
+      name      = "GF_AUTH_GITHUB_CLIENT_SECRET"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_ENABLED"
+      name      = "GF_AUTH_GITHUB_ENABLED"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_SCOPES"
+      name      = "GF_AUTH_GITHUB_SCOPES"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_AUTH_GITHUB_TOKEN_URL"
+      name      = "GF_AUTH_GITHUB_TOKEN_URL"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_SERVER_ROOT_URL"
+      name      = "GF_SERVER_ROOT_URL"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_SERVER_ENABLE_GZIP"
+      name      = "GF_SERVER_ENABLE_GZIP"
+    },
+    {
+      valueFrom = "/grafana/${local.full_name}/GF_DEFAULT_INSTANCE_NAME"
+      name      = "GF_DEFAULT_INSTANCE_NAME"
+    },
+  ]
+  port_mappings = [
+    {
+      containerPort = 3000
+      hostPort      = 3000 # ?maybe?
+      protocol      = "tcp"
+    }
+  ]
+  log_configuration = {
+    logDriver = "awslogs"
+    options = {
+      awslogs-group         = "/ecs/Prometheus"
+      awslogs-create-group  = true
+      awslogs-region        = data.aws_region.current.name
+      awslogs-stream-prefix = "${local.full_name}-grafana"
+    }
+  }
+}
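+
+# The GF_* secrets above are read from SSM Parameter Store at task launch; they
+# are not created by this module. A minimal sketch of seeding one (hypothetical
+# values; assumes a SecureString plus kms:Decrypt on the task execution role):
+#
+# resource "aws_ssm_parameter" "gf_github_client_id" {
+#   name  = "/grafana/${local.full_name}/GF_AUTH_GITHUB_CLIENT_ID"
+#   type  = "SecureString"
+#   value = var.github_oauth_client_id # hypothetical input
+# }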
+
+module "grafana_ecs_alb_service_task" {
+  count                              = local.create_grafana_ecs ? 1 : 0
+  source                             = "registry.terraform.io/cloudposse/ecs-alb-service-task/aws"
+  version                            = "0.64.0"
+  namespace                          = var.namespace
+  stage                              = var.environment
+  name                               = var.name
+  delimiter                          = "-"
+  alb_security_group                 = aws_security_group.grafana_alb_sg[0].id
+  container_definition_json          = module.grafana_ecs_container_definition[0].json_map_encoded_list
+  ecs_cluster_arn                    = module.ecs[0].ecs_cluster_arn
+  launch_type                        = var.grafana_ecs_launch_types
+  vpc_id                             = var.vpc_id
+  security_group_ids                 = [aws_security_group.grafana_ecs_sg[0].id]
+  subnet_ids                         = var.private_subnet_ids
+  tags                               = local.tags
+  ignore_changes_task_definition     = false
+  assign_public_ip                   = var.grafana_ecs_assign_public_ip
+  propagate_tags                     = "SERVICE"
+  deployment_minimum_healthy_percent = 0   # undeploy old task before new
+  deployment_maximum_percent         = 100 # never run more
+  deployment_controller_type         = "ECS"
+  desired_count                      = 1
+  task_memory                        = var.grafana_ecs_task_memory
+  task_cpu                           = var.grafana_ecs_task_cpu
+}
diff --git a/monitoring/iam.tf b/monitoring/iam.tf
new file mode 100644
index 0000000..7fb47eb
--- /dev/null
+++ b/monitoring/iam.tf
@@ -0,0 +1,125 @@
+## prometheus ecs IAM
+# First, the roles needed for the prom scraper ecs tasks
+resource "aws_iam_role" "prom-scraper-app-role" {
+  name               = "${local.full_name}-prom-srv-ecs-app-role"
+  assume_role_policy = data.aws_iam_policy_document.prom_ecs_task_assume_policy.json
+}
+
+resource "aws_iam_role" "prom-scraper-task-execution-role" {
+  name               = "${local.full_name}-prom-srv-task-execution-role"
+  assume_role_policy = data.aws_iam_policy_document.prom_ecs_task_assume_policy.json
+}
+
+# Attach some policies, managed and not, to the app role
+resource "aws_iam_role_policy_attachment" "prom-generic" {
+  for_each = toset([
+    "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess",
+  ])
+  policy_arn = each.value
+  role       = aws_iam_role.prom-scraper-app-role.name
+}
+
+resource "aws_iam_role_policy_attachment" "prom-generic-writer" {
+  policy_arn = aws_iam_policy.prom_writer.arn
+  role       = aws_iam_role.prom-scraper-app-role.name
+}
+
+resource "aws_iam_role_policy_attachment" "prom-generic-discovery" {
+  policy_arn = aws_iam_policy.prom_stack_discovery.arn
+  role       = aws_iam_role.prom-scraper-app-role.name
+}
+
+# this attachment is for the ECS agent
+resource "aws_iam_role_policy_attachment" "attach_custom_to_task_execution" {
+  for_each = toset([
+    "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess",
+    "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
+  ])
+  policy_arn = each.value
+  role       = aws_iam_role.prom-scraper-task-execution-role.name
+}
+
+data "aws_iam_policy_document" "prom_ecs_task_assume_policy" {
+  statement {
+    sid     = "AssumeRole"
+    effect  = "Allow"
+    actions = ["sts:AssumeRole"]
+
+    principals {
+      type        = "Service"
+      identifiers = ["ecs-tasks.amazonaws.com"]
+    }
+  }
+}
+
+resource "aws_iam_policy" "prom_stack_discovery" {
+  name   = "${local.full_name}-prom-ecs-task-discovery-policy"
+  path   = "/"
+  policy = data.aws_iam_policy_document.prom_custom_task_policy.json
+}
+
+data "aws_iam_policy_document" "prom_custom_task_policy" {
+  statement {
+    sid       = "AllowReadingTagsInstancesRegionsFromEC2"
+    effect    = "Allow"
+    resources = ["*"]
+
+    actions = [
+      "ec2:DescribeTags",
+      "ec2:DescribeInstances",
+      "ec2:DescribeRegions",
+    ]
+  }
+
+  statement {
+    sid       = "AllowReadingResourcesForTags"
+    effect    = "Allow"
+    resources = ["*"]
+    actions   = ["tag:GetResources"]
+  }
+}
+
+resource "aws_iam_policy" "prom_writer" {
+  name   = "${local.full_name}-prom-ecs-task-writer-policy"
+  path   = "/"
+  policy = data.aws_iam_policy_document.prom_task_aps_write.json
+}
+
+data "aws_iam_policy_document" "prom_task_aps_write" {
+  statement {
+    sid       = "WritePrometheusMetrics"
+    effect    = "Allow"
+    resources = ["*"]
+
+    actions = [
+      "aps:RemoteWrite",
+      "aps:GetSeries",
+      "aps:GetLabels",
+      "aps:GetMetricMetadata",
+    ]
+  }
+
+  statement {
+    sid       = "SSMGet"
+    effect    = "Allow"
+    resources = ["*"]
+    actions   = ["ssm:GetParameter"]
+  }
+
+  statement {
+    sid       = "AllowServiceDiscovery"
+    effect    = "Allow"
+    resources = ["*"]
+    actions   = ["servicediscovery:*"]
+  }
+  statement {
+    sid       = "AllowPromEFS"
+    effect    = "Allow"
+    resources = [aws_efs_file_system.prom_service_storage.arn]
+    actions = [
+      "elasticfilesystem:ClientWrite",
+      "elasticfilesystem:ClientMount",
+      "elasticfilesystem:ClientRoot"
+    ]
+  }
+}
diff --git a/monitoring/main.tf b/monitoring/main.tf
index a90aae3..0de66ff 100644
--- a/monitoring/main.tf
+++ b/monitoring/main.tf
@@ -1,7 +1,6 @@
 locals {
   create_amp        = alltrue([var.enable, var.enable_amp])
   create_prometheus = alltrue([var.enable, var.enable_prometheus])
-  create_grafana    = alltrue([var.enable, var.enable_grafana_managed])
   full_name         = "${var.namespace}-${var.name}-${var.environment}"
   tags = {
     Environment = var.environment
@@ -30,3 +29,184 @@ resource "aws_prometheus_alert_manager_definition" "prom-alerts" {
   definition   = each.value
   workspace_id = aws_prometheus_workspace.prom[0].id
 }
+
+
+resource "aws_efs_file_system" "prom_service_storage" {
+  encrypted = true
+  tags = {
+    Name = "${local.full_name}-prometheus-config"
+  }
+}
+
+resource "aws_efs_mount_target" "prom_service_storage" {
+  count = length(var.private_subnet_ids)
+
+  file_system_id  = aws_efs_file_system.prom_service_storage.id
+  subnet_id       = var.private_subnet_ids[count.index]
+  security_groups = [aws_security_group.prom_efs_sg.id]
+}
+
+# define the prometheus config-reloader (from https://github.com/aws-samples/prometheus-for-ecs)
+# This is the least hacky way I can find to get a templated prometheus.yml into the stock prometheus
+# container task. The environment variables below control which ParameterStore keys this sidecar
+# reads, and how often it re-checks them for a new prometheus.yml.
+# this container templates configs onto an EFS mount shared with the prometheus server
+module "reloader_container_def" {
+  count = local.create_prometheus ? 1 : 0
+
+  source  = "cloudposse/ecs-container-definition/aws"
+  version = "0.58.1"
+
+  container_name   = "${local.full_name}-prometheus-config-reloader"
+  container_image  = "public.ecr.aws/awsvijisarathy/prometheus-sdconfig-reloader:4.0"
+  container_memory = 128 # tiny process, tiny memory
+  container_cpu    = 10
+  user             = "root"
+  map_environment = {
+    CONFIG_FILE_DIR                     = "/etc/config"
+    CONFIG_RELOAD_FREQUENCY             = 60
+    PROMETHEUS_CONFIG_PARAMETER_NAME    = "/${var.environment}/ECS-Prometheus-Configuration"
+    DISCOVERY_NAMESPACES_PARAMETER_NAME = "/${var.environment}/ECS-ServiceDiscovery-Namespaces"
+    AWS_REGION                          = data.aws_region.current.name
+  }
+  log_configuration = {
+    logDriver = "awslogs"
+    options = {
+      awslogs-group         = "/ecs/Prometheus"
+      awslogs-create-group  = true
+      awslogs-region        = data.aws_region.current.name
+      awslogs-stream-prefix = "${local.full_name}-reloader"
+    }
+  }
+  mount_points = [
+    {
+      containerPath = "/etc/config"
+      sourceVolume  = "${local.full_name}-prometheus-config"
+      readOnly      = false
+    }
+  ]
+}
+
+resource "aws_ssm_parameter" "prom_config" {
+  name  = "/${var.environment}/ECS-Prometheus-Configuration"
+  type  = "String"
+  value = replace(var.prometheus_config, "REMOTE_WRITE_URL", "${aws_prometheus_workspace.prom[0].prometheus_endpoint}api/v1/remote_write")
+}
+
+resource "aws_ssm_parameter" "prom_sd_ns" {
+  name  = "/${var.environment}/ECS-ServiceDiscovery-Namespaces"
+  type  = "String"
+  value = "ecs-services"
+}
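+
+# A minimal sketch of what var.prometheus_config expects (hypothetical scrape
+# config; the literal REMOTE_WRITE_URL token is substituted above with the AMP
+# workspace endpoint, and sigv4 signing rides on the task role):
+#
+# prometheus_config = <<-EOF
+#   global:
+#     scrape_interval: 30s
+#   scrape_configs:
+#     - job_name: prometheus
+#       static_configs:
+#         - targets: ['localhost:9090']
+#   remote_write:
+#     - url: REMOTE_WRITE_URL
+#       sigv4:
+#         region: us-east-1
+# EOF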
+
+# define the prometheus scraper container
+# this container will scrape various prometheus exporters, then remote_write all data to the AMP prometheus endpoint
+module "prometheus_container_def" {
+  count = local.create_prometheus ? 1 : 0
+
+  source  = "cloudposse/ecs-container-definition/aws"
+  version = "0.58.1"
+
+  container_name   = "${local.full_name}-prometheus"
+  container_image  = var.prometheus_image
+  container_memory = var.prometheus_container_memory
+  container_cpu    = var.prometheus_container_cpu
+  user             = "root"
+  port_mappings = [
+    {
+      containerPort = 9090
+      hostPort      = 9090 # ?maybe?
+      protocol      = "tcp"
+    }
+  ]
+  command = [
+    "--storage.tsdb.retention.time=15d",
+    "--config.file=/etc/config/prometheus.yaml",
+    "--storage.tsdb.path=/data",
+    "--web.console.libraries=/etc/prometheus/console_libraries",
+    "--web.console.templates=/etc/prometheus/consoles",
+    "--web.enable-lifecycle"
+  ]
+  log_configuration = {
+    logDriver = "awslogs"
+    options = {
+      awslogs-group         = "/ecs/Prometheus"
+      awslogs-create-group  = true
+      awslogs-region        = data.aws_region.current.name
+      awslogs-stream-prefix = "${local.full_name}-prometheus"
+    }
+  }
+  mount_points = [
+    {
+      containerPath = "/etc/config"
+      sourceVolume  = "${local.full_name}-prometheus-config"
+      readOnly      = true
+    },
+    {
+      containerPath = "/data"
+      sourceVolume  = "${local.full_name}-prometheus-data"
+      readOnly      = false
+    }
+  ]
+  healthcheck = {
+    command = [
+      "CMD-SHELL",
+      "wget http://localhost:9090/-/healthy -O /dev/null || exit 1"
+    ]
+    retries     = 2
+    timeout     = 2
+    interval    = 10
+    startPeriod = 10
+  }
+  container_depends_on = [
+    {
+      containerName = "${local.full_name}-prometheus-config-reloader"
+      condition     = "START"
+    }
+  ]
+}
+
+# join all above containers into a single task
+resource "aws_ecs_task_definition" "prom_stack" {
+  count = local.create_prometheus ? 1 : 0
+  container_definitions = jsonencode([
+    module.reloader_container_def[0].json_map_object,
+    module.prometheus_container_def[0].json_map_object
+  ])
+  cpu                      = 256
+  memory                   = 512
+  family                   = "${local.full_name}-prometheus-stack"
+  requires_compatibilities = ["FARGATE"]
+  network_mode             = "awsvpc"
+  task_role_arn            = aws_iam_role.prom-scraper-app-role.arn            # perms for the actual prom scraper
+  execution_role_arn       = aws_iam_role.prom-scraper-task-execution-role.arn # perms for the ecs agent to launch the containers
+  volume {
+    name = "${local.full_name}-prometheus-config"
+    efs_volume_configuration {
+      file_system_id          = aws_efs_file_system.prom_service_storage.id
+      root_directory          = "/"
+      transit_encryption      = "ENABLED"
+      transit_encryption_port = 2049
+    }
+  }
+  volume {
+    # without any other identifiers, this becomes an ephemeral volume, which is fine cause we're remote-writing data to AMP
+    name = "${local.full_name}-prometheus-data"
+  }
+}
+
+resource "aws_ecs_service" "prom" {
+  name                               = "${local.full_name}-prometheus-scraper"
+  count                              = local.create_prometheus ? 1 : 0
+  cluster                            = module.ecs[0].ecs_cluster_id
+  task_definition                    = aws_ecs_task_definition.prom_stack[0].arn
+  desired_count                      = 1
+  deployment_maximum_percent         = 100
+  deployment_minimum_healthy_percent = 0
+  launch_type                        = "FARGATE"
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.prom_ecs_sg[0].id]
+    assign_public_ip = false
+  }
+}
diff --git a/monitoring/outputs.tf b/monitoring/outputs.tf
new file mode 100644
index 0000000..6fc4a70
--- /dev/null
+++ b/monitoring/outputs.tf
@@ -0,0 +1,24 @@
+output "prometheus_endpoint" {
+  value = aws_prometheus_workspace.prom[0].prometheus_endpoint
+}
+
+output "prometheus_endpoint_remote_write" {
+  value = "${aws_prometheus_workspace.prom[0].prometheus_endpoint}api/v1/remote_write"
+}
+
+output "prometheus_endpoint_query" {
+  value = "${aws_prometheus_workspace.prom[0].prometheus_endpoint}api/v1/query"
+}
+
+output "prometheus_arn" {
+  value = aws_prometheus_workspace.prom[*].arn
+}
+
+output "grafana_managed_endpoint" {
+  value = [for ws in aws_grafana_workspace.grafana_managed[*] : "https://${ws.endpoint}"]
+}
+
+# not yet supported?
+#output "grafana_managed_version" {
+#  value = aws_grafana_workspace.grafana_managed[*].version
+#}
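+
+# Consuming these outputs from another terragrunt unit, roughly (hypothetical
+# paths and input name; the output names match those defined above):
+#
+# dependency "monitoring" {
+#   config_path = "../monitoring"
+# }
+#
+# inputs = {
+#   remote_write_url = dependency.monitoring.outputs.prometheus_endpoint_remote_write
+# }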
diff --git a/monitoring/security_groups.tf b/monitoring/security_groups.tf
new file mode 100644
index 0000000..37bf051
--- /dev/null
+++ b/monitoring/security_groups.tf
@@ -0,0 +1,117 @@
+# misc security groups needed across the monitoring stack, they're so bulky I like to move them out of the way
+
+# Security group for the ALB
+resource "aws_security_group" "grafana_alb_sg" {
+  count       = local.create_grafana_ecs ? 1 : 0
+  name        = "${local.full_name}-grafana-alb-sg"
+  description = "Allow traffic to the ALB created for the ${local.full_name} grafana service"
+  vpc_id      = var.vpc_id
+
+}
+
+resource "aws_security_group_rule" "allow_inbound_grafana_ecs_alb_http_redirect_all" {
+  count             = local.create_grafana_ecs ? 1 : 0
+  security_group_id = aws_security_group.grafana_alb_sg[0].id
+  type              = "ingress"
+  from_port         = 80
+  to_port           = 80
+  protocol          = "tcp"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+resource "aws_security_group_rule" "allow_inbound_grafana_ecs_alb_service_all" {
+  count             = local.create_grafana_ecs ? 1 : 0
+  security_group_id = aws_security_group.grafana_alb_sg[0].id
+  type              = "ingress"
+  from_port         = 443
+  to_port           = 443
+  protocol          = "tcp"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+resource "aws_security_group_rule" "allow_outbound_grafana_ecs_alb_all" {
+  count             = local.create_grafana_ecs ? 1 : 0
+  security_group_id = aws_security_group.grafana_alb_sg[0].id
+  type              = "egress"
+  from_port         = 0
+  to_port           = 0
+  protocol          = "-1"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+# Security group for the grafana ecs task
+resource "aws_security_group" "grafana_ecs_sg" {
+  count       = local.create_grafana_ecs ? 1 : 0
+  name        = "${local.full_name}-grafana-ecs-sg"
+  description = "Allow traffic to the ecs task created for the ${local.full_name} grafana service"
+  vpc_id      = var.vpc_id
+}
+
+resource "aws_security_group_rule" "allow_grafana_ecs_http_all" {
+  count             = local.create_grafana_ecs ? 1 : 0
+  security_group_id = aws_security_group.grafana_ecs_sg[0].id
+  type              = "ingress"
+  from_port         = 3000
+  to_port           = 3000
+  protocol          = "tcp"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+resource "aws_security_group_rule" "allow_outbound_grafana_ecs_all" {
+  count             = local.create_grafana_ecs ? 1 : 0
+  security_group_id = aws_security_group.grafana_ecs_sg[0].id
+  type              = "egress"
+  from_port         = 0
+  to_port           = 0
+  protocol          = "-1"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+# Security group for the prometheus server ecs task
+resource "aws_security_group" "prom_ecs_sg" {
+  count       = local.create_prometheus ? 1 : 0
+  name        = "${local.full_name}-prom-ecs-sg"
+  description = "Allow traffic to the ecs task created for the ${local.full_name} prometheus service"
+  vpc_id      = var.vpc_id
+}
+
+resource "aws_security_group_rule" "allow_prom_ecs_http_all" {
+  count             = local.create_prometheus ? 1 : 0
+  security_group_id = aws_security_group.prom_ecs_sg[0].id
+  type              = "ingress"
+  from_port         = 9090
+  to_port           = 9090
+  protocol          = "tcp"
+  # TODO lock this down to something much more restrictive, nothing needs api access but admins
+  cidr_blocks       = ["0.0.0.0/0"]
+}
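+
+# A sketch of that lock-down (hypothetical; assumes an admin_cidr_blocks list
+# variable, or reuse the VPC cidr so only in-VPC peers reach the prometheus api):
+#   cidr_blocks = var.admin_cidr_blocks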
+
+resource "aws_security_group_rule" "allow_outbound_prom_ecs_all" {
+  count             = local.create_prometheus ? 1 : 0
+  security_group_id = aws_security_group.prom_ecs_sg[0].id
+  type              = "egress"
+  from_port         = 0
+  to_port           = 0
+  protocol          = "-1"
+  cidr_blocks       = ["0.0.0.0/0"]
+}
+
+resource "aws_security_group" "prom_efs_sg" {
+  name        = "${local.full_name}-efs-sg"
+  description = "Allow traffic to the prometheus EFS storage volume"
+  vpc_id      = var.vpc_id
+
+  ingress {
+    from_port       = 2049
+    to_port         = 2049
+    protocol        = "tcp"
+    security_groups = [aws_security_group.prom_ecs_sg[0].id]
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+}
diff --git a/monitoring/variables.tf b/monitoring/variables.tf
index d8846de..b164fb4 100644
--- a/monitoring/variables.tf
+++ b/monitoring/variables.tf
@@ -26,7 +26,7 @@ variable "enable_amp" {
 }
 
 variable "enable_prometheus" {
-  description = "should we make the ECS cluster and prometheus collector services"
+  description = "Make the ECS fargate cluster and prometheus collector services"
   default     = true
   type        = bool
 }
@@ -35,11 +35,16 @@ variable "enable_prometheus" {
 
 # enable_grafana_task - run grafana as an ECS task behind an ALB
 # enable_grafana_cloud
 variable "enable_grafana_managed" {
-  description = "should we make the managed grafana resources"
+  description = "Make the Amazon Managed Grafana resources"
   default     = true
   type        = bool
 }
 
+variable "enable_grafana_ecs" {
+  description = "Make an ECS fargate service to run OSS grafana resources"
+  default     = false
+  type        = bool
+}
 variable "rule_groups" {
   default = { basic = <