diff --git a/.github/actions/test_provider/action.yaml b/.github/actions/test_provider/action.yaml deleted file mode 100644 index 2eb1f0c90..000000000 --- a/.github/actions/test_provider/action.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: 'Test provider' -description: 'Try to initialize a Magic Castle provider folder' -inputs: - provider: - description: 'name of the provider' - required: true - path: - description: 'path to terraform' -runs: - using: "composite" - steps: - - run: ${{ inputs.path }}/terraform -chdir=${{ inputs.provider }} init - shell: bash - id: init - - run: ${{ inputs.path }}/terraform -chdir=${{ inputs.provider }} validate - shell: bash - id: validate - - run: find examples -name ${{ inputs.provider }} -type d -not -path '*/\.*' - shell: bash - id: find-examples - - run: sed -E -i 's;(source)\s*=.*${{ inputs.provider }}.*;\1 = "../../${{ inputs.provider }}";g' examples/${{ inputs.provider }}/main.tf; - shell: bash - id: sed-example - - run: ${{ inputs.path }}/terraform -chdir=examples/${{ inputs.provider }} init - shell: bash - id: init-example - - run: ${{ inputs.path }}/terraform -chdir=examples/${{ inputs.provider }} validate - shell: bash - id: validate-example diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml deleted file mode 100644 index 843696c1d..000000000 --- a/.github/workflows/docs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: Check Markdown links - -on: - push: - branches: - - main - pull_request: - branches: - - main - schedule: - - cron: "0 9 * * *" - -jobs: - markdown-link-check: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@master - - uses: gaurav-nelson/github-action-markdown-link-check@v1 - with: - config-file: './.github/workflows/mlc_config.json' - use-quiet-mode: 'yes' - use-verbose-mode: 'yes' - folder-path: '.' - file-path: './README.md, ./CHANGELOG.md, ./LICENSE' \ No newline at end of file diff --git a/.github/workflows/mkdocs_test.yml b/.github/workflows/mkdocs_test.yml index ce7a9e945..c7405949a 100644 --- a/.github/workflows/mkdocs_test.yml +++ b/.github/workflows/mkdocs_test.yml @@ -1,6 +1,13 @@ # documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions name: build documentation -on: [push, pull_request] +on: + push: + paths: + - docs/* + pull_request: + paths: + - docs/* + # Declare default permissions as read only. permissions: read-all jobs: diff --git a/.github/workflows/mlc_config.json b/.github/workflows/mlc_config.json deleted file mode 100644 index 95cf07e9e..000000000 --- a/.github/workflows/mlc_config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "ignorePatterns": [ - { - "pattern": "^https://dash.cloudflare.com" - } - ], - "replacementPatterns": [ - { - "pattern": "^/", - "replacement": "https://github.com/ComputeCanada/magic_castle/tree/main/" - } - ], - "aliveStatusCodes": [200, 206, 429] -} \ No newline at end of file diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c502800bb..7ea387ffb 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -10,16 +10,16 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@main + uses: actions/checkout@v4 - name: Retrieve tag name id: tag_name run: | echo ::set-output name=SOURCE_TAG::${GITHUB_REF#refs/tags/} - - uses: hashicorp/setup-terraform@v1 + - uses: hashicorp/setup-terraform@v3 with: - terraform_version: 1.4.0 + terraform_version: 1.5.7 - name: Create tarballs and zips if: startsWith(github.ref, 'refs/tags/') diff --git a/.github/workflows/spelling.yaml b/.github/workflows/spelling.yaml index 13f4e220d..f1d8ea789 100644 --- a/.github/workflows/spelling.yaml +++ b/.github/workflows/spelling.yaml @@ -12,9 +12,9 @@ jobs: codespell: runs-on: ubuntu-latest steps: - - uses: actions/checkout@master - - uses: codespell-project/actions-codespell@master + - uses: actions/checkout@v4 + - uses: codespell-project/actions-codespell@v2.1 with: check_filenames: true - ignore_words_list: keypair + ignore_words_list: keypair, te only_warn: 1 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 61d7af8cc..7a5ee7166 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -3,78 +3,157 @@ name: Validate Terraform code on: push: branches: - - '*' + - main + paths: + - aws/* + - azure/* + - common/* + - dns/* + - examples/* + - openstack/* + - ovh/* + - .github/workflows/test.yaml pull_request: branches: - main + paths: + - aws/* + - azure/* + - common/* + - dns/* + - examples/* + - openstack/* + - ovh/* + - .github/workflows/test.yaml jobs: - test: - env: - TF_VERSION: 1.4.0 - + validate_cloud_providers: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + provider: ['aws', 'azure', 'gcp', 'openstack', 'ovh'] + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "1.5.7" + - run: terraform -chdir=${{ matrix.provider }} init + - run: terraform -chdir=${{ matrix.provider }} validate + validate_dns_providers: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + provider: ['cloudflare', 'gcloud', 'txt'] steps: - name: Checkout code - uses: actions/checkout@main - - - name: Cache Terraform - id: cache-terraform - uses: actions/cache@v4 + uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 with: - path: ~/bin - key: terraform-${{ env.TF_VERSION }} + terraform_version: "1.5.7" + - run: terraform -chdir=dns/${{ matrix.provider }} init + - run: terraform -chdir=dns/${{ matrix.provider }} validate - - name: Download terraform - if: steps.cache-terraform.outputs.cache-hit != 'true' - run: | - mkdir -p "${HOME}/bin" - curl -sSL -o terraform.zip "https://releases.hashicorp.com/terraform/${TF_VERSION}/terraform_${TF_VERSION}_linux_amd64.zip" - unzip terraform.zip - mv -v terraform "${HOME}/bin/terraform" - ~/bin/terraform version - - - name: Create SSH keys - run: | - ssh-keygen -b 2048 -t rsa -q -N "" -f ~/.ssh/id_rsa - - - name: Test AWS - uses: ./.github/actions/test_provider + validate_examples: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + example: + - aws + - azure + - gcp + - openstack + - ovh + # - advanced/spot_instances/aws + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 with: - path: ~/bin - provider: 'aws' + terraform_version: "1.5.7" + - name: Generate an SSH key + run: ssh-keygen -b 2048 -t rsa -q -N "" -f ~/.ssh/id_rsa + - run: sed -i "s;git::${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git//;../../;g" examples/${{ matrix.example }}/main.tf; + - run: terraform -chdir=examples/${{ matrix.example }} init + - run: terraform -chdir=examples/${{ matrix.example }} validate - - name: Test Azure - uses: ./.github/actions/test_provider + validate_advanced_examples: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + example: + - spot_instances/aws + - spot_instances/azure + - spot_instances/gcp + - basic_puppet/openstack + - elk/openstack + - k8s/openstack + - lustre/openstack + - spark/openstack + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 with: - path: ~/bin - provider: 'azure' + terraform_version: "1.5.7" + - name: Generate an SSH key + run: ssh-keygen -b 2048 -t rsa -q -N "" -f ~/.ssh/id_rsa + - run: sed -i "s;git::${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git//;../../../../;g" examples/advanced/${{ matrix.example }}/main.tf; + - run: terraform -chdir=examples/advanced/${{ matrix.example }} init + - run: terraform -chdir=examples/advanced/${{ matrix.example }} validate - - name: Test GCP - uses: ./.github/actions/test_provider - with: - path: ~/bin - provider: 'gcp' + trivy-vuln-scan: + name: Running Trivy Scan + runs-on: ubuntu-latest + needs: [validate_cloud_providers, validate_examples] + steps: + - uses: actions/checkout@v4 - - name: Test OpenStack - uses: ./.github/actions/test_provider - with: - path: ~/bin - provider: 'openstack' + - name: Resolve symbolic links and fix source + run: | + rm {aws,azure,gcp,openstack}/{outputs.tf,variables.tf} + for cloud in aws azure gcp openstack; do + cp common/outputs.tf common/variables.tf $cloud/; + done + sed -i 's;git::https://github.com/ComputeCanada/magic_castle.git//;../../;g' examples/*/*.tf - - name: Test OVH - uses: ./.github/actions/test_provider + - name: Manual Trivy Setup + uses: aquasecurity/setup-trivy@v0.2.2 with: - path: ~/bin - provider: 'ovh' + version: v0.61.1 + cache: true + + - name: Run Trivy on providers + run: trivy config --misconfig-scanners terraform --tf-exclude-downloaded-modules --skip-dirs examples/advanced --format json -o trivy-results.json . - - name: Test CloudFlare DNS + - name: Convert Trivy JSON output to SARIF and filter duplicated results run: | - ~/bin/terraform -chdir=dns/cloudflare init - ~/bin/terraform -chdir=dns/cloudflare validate + trivy convert --format sarif trivy-results.json --output trivy-results.sarif + # When converting from JSON to SARIF, some information, like origin of the misconfiguration, is lost. + # The lost information results in duplicated issues. We filter these issues with jq and create a new + # sarif file that will be uploaded to the security tab. + jq 'reduce .runs[0].results[] as $a ([]; if IN(.[]; $a) then . else . += [$a] end)' trivy-results.sarif > trivy-results-filtered.sarif + jq ".runs[0].results |= $(cat trivy-results-filtered.sarif)" trivy-results.sarif > trivy-results-final.sarif + mv trivy-results-final.sarif trivy-results.sarif + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: "trivy-results.sarif" - - name: Test Google Cloud DNS + - name: Publish Trivy Output to Summary run: | - ~/bin/terraform -chdir=dns/gcloud init - ~/bin/terraform -chdir=dns/gcloud validate + if [[ -s trivy-results.json ]]; then + { + echo "### Trivy Misconfiguration Scan Output" + echo "
Click to expand" + echo "" + echo '```console' + echo '$ trivy config --misconfig-scanners terraform --tf-exclude-downloaded-modules --skip-dirs examples/advanced .' + trivy convert --format table trivy-results.json + echo '```' + echo "
" + } >> $GITHUB_STEP_SUMMARY + fi diff --git a/.trivyignore b/.trivyignore new file mode 100644 index 000000000..39e04c9c1 --- /dev/null +++ b/.trivyignore @@ -0,0 +1,6 @@ +# Some instance should have public ip addresses +AVD-GCP-0031 + +# Magic Castle does not handle VPC flow logs +AVD-GCP-0029 +AVD-AWS-0178 diff --git a/CHANGELOG.md b/CHANGELOG.md index b72d0279c..ab3f2b2eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,29 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [14.3.0] 2025-05-22 + +### Added +- [github] Added Trivy misconfiguration scan of Terraform code (PR #355) +- [github] Added advanced examples to validation in CI/CD (PR #358) + +### Changed + +- [dns] The default list of vhost subdomains has been replaced by a `["*"]`. +This simplifies configuration of new virtual hosts in the reverse proxy. (PR #347) +- [common] Made sure ssh keys do not have whitespace prefix or suffix (PR #350) +- [aws] Reduced choices of availablity zones in AWS (PR #351) +- [common] Bumped terraform minimum version to 1.5.7 +- [common] Improved instance root disk size computation and warnings (PR #353) +- [github] Modernized github workflows (PR #356) +- [common] Made `count` optional in validation (PR #357) +- [cloud-init] Enabled puppet prometheus reporting (PR #349) +- [cloud-init] Moved puppet server inclusion in /etc/hosts to earlier steps + +### Removed + +- [aws] Removed key pair resource (PR #359) + ## [14.2.1] 2025-02-21 No changes to infrastructure code. @@ -19,7 +42,7 @@ Refer to [puppet-magic_castle changelog](https://github.com/ComputeCanada/puppet - Generalized definition of instance's specs (PR #341) - Made tf user a system user (PR #343) -- Splited sshd config so that Match directives are in their own files (PR #345) +- Split sshd config so that Match directives are in their own files (PR #345) ## [14.1.3] 2025-01-29 diff --git a/README.md b/README.md index 32e0e6d3e..2e5c39642 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ From these new possibilities emerged an open-source software project named Magic ## Setup -- Install [Terraform](https://releases.hashicorp.com/terraform/) (>= 1.4.0) +- Install [Terraform](https://releases.hashicorp.com/terraform/) (>= 1.5.7) - Download the [latest release of Magic Castle](https://github.com/ComputeCanada/magic_castle/releases) for the cloud provider you wish to use. - Uncompress the release - Follow the instructions diff --git a/aws/infrastructure.tf b/aws/infrastructure.tf index 0a567fb72..f196e3221 100644 --- a/aws/infrastructure.tf +++ b/aws/infrastructure.tf @@ -8,6 +8,7 @@ module "design" { cluster_name = var.cluster_name domain = var.domain instances = var.instances + min_disk_size = 20 pool = var.pool volumes = var.volumes firewall_rules = var.firewall_rules @@ -48,22 +49,58 @@ module "provision" { data "aws_availability_zones" "available" { state = "available" + lifecycle { + postcondition { + condition = var.availability_zone == "" || contains(self.names, var.availability_zone) + error_message = "var.availability_zone must be one of ${jsonencode(self.names)}" + } + } +} + +# Retrieve the availability zones in which each unique instance type is available +data "aws_ec2_instance_type_offerings" "inst_az" { + filter { + name = "instance-type" + values = distinct([for instance in module.design.instances: instance.type]) + } + location_type = "availability-zone" +} + +# Build a set of availability zones that offer all selected instance types +locals { + instance_types = distinct([for instance in module.design.instances: instance.type]) + az_choices = setintersection(data.aws_availability_zones.available.names, values({ + for type in local.instance_types: type => + [ for idx, zone in data.aws_ec2_instance_type_offerings.inst_az.locations: zone + if data.aws_ec2_instance_type_offerings.inst_az.instance_types[idx] == type + ] + })...) +} + +resource "terraform_data" "az_check" { + lifecycle { + precondition { + condition = length(local.az_choices) > 0 + error_message = "There is not a single availability zone in ${var.region} that provides all instance types you have selected." + } + precondition { + condition = var.availability_zone == "" || contains(local.az_choices, var.availability_zone) + error_message = < values.local_ip if contains(values.tags, "puppet")} all_tags = toset(flatten([for key, values in var.inventory : values.tags])) @@ -70,7 +71,7 @@ locals { tag_ip = local.tag_ip data = { sudoer_username = var.sudoer_username - public_keys = var.public_keys + public_keys = local.public_keys cluster_name = lower(var.cluster_name) domain_name = var.domain_name guest_passwd = local.guest_passwd @@ -98,7 +99,7 @@ locals { puppetservers = local.puppetservers, puppetserver_password = local.puppet_passwd, sudoer_username = var.sudoer_username, - ssh_authorized_keys = var.public_keys + ssh_authorized_keys = local.public_keys tf_ssh_public_key = tls_private_key.ssh.public_key_openssh # If there is no bastion, the terraform data has to be packed with the user_data of the puppetserver. # We do not packed it systematically because it increases the user-data size to a value that can be diff --git a/common/configuration/puppet.yaml b/common/configuration/puppet.yaml index e3f5a48fd..ec28b649f 100644 --- a/common/configuration/puppet.yaml +++ b/common/configuration/puppet.yaml @@ -32,6 +32,10 @@ runcmd: - chmod 644 /etc/ssh/ssh_host_*_key.pub - chgrp ssh_keys /etc/ssh/ssh_host_*_key.pub - systemctl restart sshd +# Make sure puppet server can be reached by name early in the process if we need to debug. +%{ for host, ip in puppetservers ~} + - echo "${ip} ${host}" >> /etc/hosts +%{ endfor ~} # Enable fastest mirror for distribution using dnf package manager - dnf config-manager --setopt=fastestmirror=True --save # Install package and configure kernel only if building from a "vanilla" linux image @@ -72,6 +76,8 @@ runcmd: - chown puppet:puppet /var/log/autosign.log - /opt/puppetlabs/bin/puppet config set autosign /opt/puppetlabs/puppet/bin/autosign-validator --section server - /opt/puppetlabs/bin/puppet config set allow_duplicate_certs true --section server + - /opt/puppetlabs/bin/puppet config set reports prometheus --section server + - install -d -m 0755 -o puppet -g puppet /var/lib/node_exporter # allow puppet to write report as prometheus metrics on first run # Generate bootstrap hieradata asymmetric encryption key - mkdir -p /etc/puppetlabs/puppet/eyaml - "(cd /etc/puppetlabs/puppet/eyaml; openssl req -x509 -nodes -newkey rsa:2048 -keyout boot_private_key.pkcs7.pem -out boot_public_key.pkcs7.pem -batch)" @@ -101,16 +107,12 @@ runcmd: %{ endif } - chgrp puppet /etc/puppetlabs/puppet/csr_attributes.yaml %{ endif } -# Setup puppet servers -%{ for host, ip in puppetservers ~} - - echo "${ip} ${host}" >> /etc/hosts -%{ endfor ~} %{ if length(puppetservers) > 0 ~} - /opt/puppetlabs/bin/puppet config set server ${keys(puppetservers)[0]} %{ endif ~} - /opt/puppetlabs/bin/puppet config set certname ${node_name} - /opt/puppetlabs/bin/puppet config set waitforcert 15s - - /opt/puppetlabs/bin/puppet config set report false + - /opt/puppetlabs/bin/puppet config set report true - /opt/puppetlabs/bin/puppet config set postrun_command /opt/puppetlabs/bin/postrun - systemctl enable puppet # Remove all ifcfg configuration files that have no corresponding network interface in ip link show. diff --git a/common/configuration/versions.tf b/common/configuration/versions.tf index adb9abffc..9d01be053 100644 --- a/common/configuration/versions.tf +++ b/common/configuration/versions.tf @@ -1,6 +1,6 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" required_providers { random = { source = "hashicorp/random" diff --git a/common/design/main.tf b/common/design/main.tf index acb71aded..81941fda9 100644 --- a/common/design/main.tf +++ b/common/design/main.tf @@ -5,11 +5,16 @@ data "http" "agent_ip" { locals { domain_name = "${lower(var.cluster_name)}.${lower(var.domain)}" + min_disk_size_per_tags = { + "mgmt": 20 + } + instances = merge( flatten([ for prefix, attrs in var.instances : [ for i in range(lookup(attrs, "count", 1)) : { (format("%s%d", prefix, i + 1)) = merge( + { disk_size = max(var.min_disk_size, [for tag in attrs.tags: lookup(local.min_disk_size_per_tags, tag, 0)]...)}, { for attr, value in attrs : attr => value if ! contains(["count"], attr) }, { prefix = prefix, @@ -73,3 +78,14 @@ locals { 0), "") } + +check "disk_space_per_tag" { + assert { + condition = alltrue(flatten([for inst in local.instances: [for tag in inst.tags: lookup(local.min_disk_size_per_tags, tag, var.min_disk_size) <= inst.disk_size ]])) + error_message = "At least one instance's disk_size is smaller than what is recommended given its set of tags.\nMininum disk size per tags: ${jsonencode(local.min_disk_size_per_tags)}" + } + assert { + condition = alltrue([for inst in local.instances: var.min_disk_size <= inst.disk_size ]) + error_message = "At least one instance's disk_size is smaller than what is recommended by the cloud provider.\nMinimum disk size for provider: ${var.min_disk_size}" + } +} diff --git a/common/design/variables.tf b/common/design/variables.tf index 922c999c2..1aa8bbc26 100644 --- a/common/design/variables.tf +++ b/common/design/variables.tf @@ -3,4 +3,5 @@ variable "domain" { } variable "instances" { } variable "volumes" { } variable "pool" { } -variable "firewall_rules" { } \ No newline at end of file +variable "firewall_rules" { } +variable "min_disk_size" { } \ No newline at end of file diff --git a/common/variables.tf b/common/variables.tf index 8edaf9424..1fde95565 100644 --- a/common/variables.tf +++ b/common/variables.tf @@ -16,7 +16,7 @@ variable "nb_users" { variable "instances" { description = "Map that defines the parameters for each type of instance of the cluster" validation { - condition = alltrue([for key, values in var.instances: can(regex("^[a-z][0-9a-z-]{1,63}$", "${key}${values.count}"))]) + condition = alltrue([for key, values in var.instances: can(regex("^[a-z][0-9a-z-]{1,63}$", "${key}${lookup(values, "count", 1)}"))]) error_message = "Instances' prefix plus index must be at most 63 lowercase alphanumeric characters and start with a letter. It can include dashes." } validation { @@ -24,7 +24,7 @@ variable "instances" { error_message = "Each entry in var.instances needs to have at least a type and a list of tags." } validation { - condition = sum([for key, values in var.instances: contains(values["tags"], "proxy") ? values["count"] : 0]) < 2 + condition = sum([for key, values in var.instances: contains(values["tags"], "proxy") ? lookup(values, "count", 1) : 0]) < 2 error_message = "At most one instance in var.instances can have the _proxy_ tag" } validation { diff --git a/dns/cloudflare/variables.tf b/dns/cloudflare/variables.tf index f021da3bd..46385c820 100644 --- a/dns/cloudflare/variables.tf +++ b/dns/cloudflare/variables.tf @@ -7,7 +7,7 @@ variable "domain" { variable "vhosts" { description = "List of vhost dns records to create as vhost.name.domain_name." type = list(string) - default = ["ipa", "jupyter", "mokey", "explore"] + default = ["*"] } variable "domain_tag" { diff --git a/dns/cloudflare/versions.tf b/dns/cloudflare/versions.tf index 8cf00d458..01dc1a90d 100644 --- a/dns/cloudflare/versions.tf +++ b/dns/cloudflare/versions.tf @@ -1,6 +1,6 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" required_providers { cloudflare = { source = "cloudflare/cloudflare" diff --git a/dns/gcloud/versions.tf b/dns/gcloud/versions.tf index 582096c43..ec19d1607 100644 --- a/dns/gcloud/versions.tf +++ b/dns/gcloud/versions.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" required_providers { google = { source = "hashicorp/google" diff --git a/dns/record_generator/versions.tf b/dns/record_generator/versions.tf index 45c61689f..4d9570d0a 100644 --- a/dns/record_generator/versions.tf +++ b/dns/record_generator/versions.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" required_providers { external = { source = "hashicorp/external" diff --git a/docs/README.md b/docs/README.md index 44e5c189c..4fe8ebdb8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,7 @@ To use Magic Castle you will need: -1. Terraform (>= 1.4.0) +1. Terraform (>= 1.5.7) 2. Authenticated access to a cloud 3. Ability to communicate with the cloud provider API from your computer 4. A project with operational limits meeting the requirements described in _Quotas_ subsection. @@ -498,7 +498,7 @@ instance, while in Puppet code tags are used to identify roles of the instances. Terraform tags: - `login`: identify instances accessible with SSH from Internet and pointed by the domain name A records -- `pool`: identify instances created only when their hostname appears in the [`var.pool`](#417-pool-optional) list. +- `pool`: identify instances created only when their hostname appears in the [`var.pool`](#419-pool-optional) list. - `proxy`: identify instances accessible with HTTP/HTTPS and pointed by the vhost A records - `public`: identify instances that need to have a public ip address reachable from Internet - `puppet`: identify instances configured as Puppet servers @@ -532,14 +532,20 @@ be leveraged to accelerate compute node configuration. | Provider | `disk_type` | `disk_size` (GiB) | | -------- | :---------- | ----------------: | | Azure |`Premium_LRS`| 30 | - | AWS | `gp2` | 10 | + | AWS | `gp2` | 20 | | GCP | `pd-ssd` | 20 | | OpenStack| `null` | 10 | | OVH | `null` | 10 | 4. `disk_size`: size in gibibytes (GiB) of the instance's root disk containing -the operating system and service software -(default: see the previous table). +the operating system and service software. The default value is computed has the +maximum between the cloud provider default size (see previous table) and the +recommended minimum size per tag as specified in the following table. + + | Tag | min `disk_size` (GiB) | + | -------- | --------------------: | + | `mgmt` | 20 | + 5. `mig`: map of [NVIDIA Multi-Instance GPU (MIG)](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html) short profile names and count used to partition the instances' GPU, example for an A100: ``` mig = { "1g.5gb" = 2, "2g.10gb" = 1, "3g.20gb" = 1 } @@ -1111,7 +1117,7 @@ Refer to the subsection [6.3](#63-unsupported-providers) for more details. #### 6.1.2 Cloudflare API Token -If you prefer using an API token instead of the global API key, you will need to configure a token with the following four permissions with the [Cloudflare API Token interface](https://dash.cloudflare.com/profile/api-tokens). +If you prefer using an API token instead of the global API key, you will need to configure a token with the following permissions using the [Cloudflare API Token interface](https://dash.cloudflare.com/profile/api-tokens). | Section | Subsection | Permission| | :------ |:---------- | :-------- | @@ -1182,7 +1188,7 @@ described by the `main.tf` configuration file. Terraform should now be able to communicate with your cloud provider. To test your configuration file, enter the following command ``` -terraform plan +terraform plan -out tfplan ``` This command will validate the syntax of your configuration file and @@ -1191,6 +1197,49 @@ is only a dry-run. If Terraform does not report any error, you can move to the next step. Otherwise, read the errors and fix your configuration file accordingly. +### 7.1 Scanning plan for misconfiguration (optional) + +[Trivy](https://trivy.dev/latest/) is an open source security scanner +that scans Terraform files (code and plan) and reports about potential issues. +Magic Castle development team has integrated Trivy in its +[CI/CD pipeline](https://github.com/ComputeCanada/magic_castle/blob/main/.github/workflows/trivy_scan.yaml) +to prevent misconfiguration and security issues that could be introduced +by commits or a pull-requests. You too can use Trivy to verify your Terraform plan +before applying it. + +After [installing Trivy](https://trivy.dev/latest/getting-started/), you can +scan the Terraform plan produced in section 7, like this: +``` +trivy conf tfplan +``` + +Trivy then produces a report about configuration issues like this: +```console +AVD-OPNSTK-0003 (MEDIUM): Security group rule allows ingress to multiple public addresses. +═════════════════════════════════════════════════════════════════════════ +Opening up ports to the public internet is generally to be avoided. You should +restrict access to IP addresses or ranges that explicitly require it where possible. + +See https://avd.aquasec.com/misconfig/avd-opnstk-0003 +───────────────────────────────────────────────────────────────────────── + ./openstack/openstack/network-2.tf:53 + via ./openstack/openstack/network-2.tf:45-56 (openstack_networking_secgroup_rule_v2.rule["ssh"]) + via main.tf:10-44 (module.openstack) +───────────────────────────────────────────────────────────────────────── + 45 resource openstack_networking_secgroup_rule_v2 "rule" { + .. + 53 [ remote_ip_prefix = each.value.cidr + .. + 56 } +───────────────────────────────────────────────────────────────────────── +``` + +The most common configuration issues identified by Trivy in Magic Castle plans +(illustrated in the previous output example), are firewall rules allowing access to port from +public internet. If you know which IP addresses should have access to the cluster, +you can harden the firewall rules. Refer to section [4.16 firewall_rules](#416-firewall_rules-optional) +for more information. + ## 8. Deployment To create the resources defined by your main, enter the following command @@ -1274,16 +1323,15 @@ It is possible to destroy only the instances and keep the rest of the infrastruc like the floating ip, the volumes, the generated SSH host key, etc. To do so, set the count value of the instance type you wish to destroy to 0. -### 9.2 Reset +### 9.2 Instance Replacement On some occasions, it is desirable to rebuild some of the instances from scratch. -Using `terraform taint`, you can designate resources that will be rebuilt at -next application of the plan. +Using the `-replace` option of `terraform apply`, you can designate resources +that will be rebuilt at next application of the plan. -To rebuild the first login node : +For example, to rebuild the first login node : ``` -terraform taint 'module.openstack.openstack_compute_instance_v2.instances["login1"]' -terraform apply +terraform apply -replace='module.openstack.openstack_compute_instance_v2.instances["login1"]' ``` ## 10. Customize Cluster Software Configuration @@ -1408,7 +1456,7 @@ By default, instances tagged `login` have their port 22 opened to entire world. If you know the range of ip addresses that will connect to your cluster, we strongly recommend that you limit the access to port 22 to this range. -To limit the access to port 22, refer to [section 4.14 firewall_rules](#414-firewall_rules-optional), +To limit the access to port 22, refer to [section 4.16 firewall_rules](#416-firewall_rules-optional), and replace the `cidr` of the `ssh` rule to match the range of ip addresses that have be the allowed to connect to the cluster. If there are more than one range, create multiple rules with distinct names. @@ -1688,6 +1736,28 @@ a volume, add `enable_resize = true` to its specs map. You can then increase the The corresponding volume will be expanded by the cloud provider and the filesystem will be extended by Puppet. +### 10.15 Access Prometheus' expression browser + +Prometheus is an open-source systems monitoring and alerting toolkit. It is installed by default +in Magic Castle. Every instance exposes their usage metrics and some services do to. To explore +and visualize this data, it possible to access the [expression browser](https://prometheus.io/docs/visualization/browser/). + +From inside the cluster, it is typically available at `http://mgmt1:9090`. Given DNS is configured +for your cluster, you can add the following snippet to your [hieradata](#413-hieradata-optional). to access the expression browser +from Internet. + +```yaml +lookup_options: + profile::reverse_proxy::subdomains: + merge: 'hash' +profile::reverse_proxy::subdomains: + metrics: "%{lookup('terraform.tag_ip.mgmt.0')}:9090" +profile::reverse_proxy::remote_ips: + metrics: [''] +``` + +Prometheus will then be available at `http://metrics.your-cluster.yourdomain.tld/`. + ## 11. Customize Magic Castle Terraform Files You can modify the Terraform module files in the folder named after your cloud diff --git a/docs/developers.md b/docs/developers.md index a0274d478..6942598b3 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -12,11 +12,11 @@ ## 1. Setup To develop for Magic Castle you will need: -* Terraform (>= 1.4.0) +* Terraform (>= 1.5.7) * git * Access to a Cloud (e.g.: Compute Canada Arbutus) * Ability to communicate with the cloud provider API from your computer -* A cloud project with enough room for the resource described in section [Magic Caslte Doc 1.1](README.md#11-quotas). +* A cloud project with enough room for the resources described in [section 1.4](README.md#14-quotas). * [optional] [Puppet Development Kit (PDK)](https://www.puppet.com/docs/pdk/latest/pdk.html) ## 2. Where to start diff --git a/docs/terraform_cloud.md b/docs/terraform_cloud.md index 412380233..9f7a90dfe 100644 --- a/docs/terraform_cloud.md +++ b/docs/terraform_cloud.md @@ -188,7 +188,7 @@ plan will then be automatically applied. Terraform cloud only allows to apply or destroy the plan as stated in the main.tf, but sometimes it can be useful to run some other terraform commands that are only -available through the command-line interface, for example `terraform taint`. +available through the command-line interface. It is possible to import the terraform state of a cluster on your local computer and then use the CLI on it. diff --git a/examples/advanced/basic_puppet/openstack/main.tf b/examples/advanced/basic_puppet/openstack/main.tf index ee7bb1359..9700f4785 100644 --- a/examples/advanced/basic_puppet/openstack/main.tf +++ b/examples/advanced/basic_puppet/openstack/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "openstack" { diff --git a/examples/advanced/elk/openstack/main.tf b/examples/advanced/elk/openstack/main.tf index b1a4d66ca..3f3bfc9a5 100644 --- a/examples/advanced/elk/openstack/main.tf +++ b/examples/advanced/elk/openstack/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "openstack" { diff --git a/examples/advanced/k8s/openstack/main.tf b/examples/advanced/k8s/openstack/main.tf index 46b7ca223..4359aaf7d 100644 --- a/examples/advanced/k8s/openstack/main.tf +++ b/examples/advanced/k8s/openstack/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "openstack" { diff --git a/examples/advanced/lustre/openstack/main.tf b/examples/advanced/lustre/openstack/main.tf index eca16c04b..b22be985b 100644 --- a/examples/advanced/lustre/openstack/main.tf +++ b/examples/advanced/lustre/openstack/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "openstack" { diff --git a/examples/advanced/spark/openstack/main.tf b/examples/advanced/spark/openstack/main.tf index 532a29a92..edc71e57e 100644 --- a/examples/advanced/spark/openstack/main.tf +++ b/examples/advanced/spark/openstack/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "openstack" { diff --git a/examples/advanced/spot_instances/aws/main.tf b/examples/advanced/spot_instances/aws/main.tf index bd91cc078..6432f499e 100644 --- a/examples/advanced/spot_instances/aws/main.tf +++ b/examples/advanced/spot_instances/aws/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "aws" { diff --git a/examples/advanced/spot_instances/azure/main.tf b/examples/advanced/spot_instances/azure/main.tf index 3bc5bbdcf..a3f7f2051 100644 --- a/examples/advanced/spot_instances/azure/main.tf +++ b/examples/advanced/spot_instances/azure/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "azure" { diff --git a/examples/advanced/spot_instances/gcp/main.tf b/examples/advanced/spot_instances/gcp/main.tf index 7d870c7e3..2a3c98866 100644 --- a/examples/advanced/spot_instances/gcp/main.tf +++ b/examples/advanced/spot_instances/gcp/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } module "gcp" { diff --git a/examples/aws/main.tf b/examples/aws/main.tf index 12970d0a7..eb9edea30 100644 --- a/examples/aws/main.tf +++ b/examples/aws/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } variable "pool" { diff --git a/examples/azure/main.tf b/examples/azure/main.tf index 1270e23fd..f5bec05c1 100644 --- a/examples/azure/main.tf +++ b/examples/azure/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } variable "pool" { diff --git a/examples/gcp/main.tf b/examples/gcp/main.tf index 2b2ca4ac8..a2a70c3c1 100644 --- a/examples/gcp/main.tf +++ b/examples/gcp/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } variable "pool" { diff --git a/examples/openstack/main.tf b/examples/openstack/main.tf index 3312548d4..cddbf2e60 100644 --- a/examples/openstack/main.tf +++ b/examples/openstack/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } variable "pool" { diff --git a/examples/ovh/main.tf b/examples/ovh/main.tf index 7c39f881f..3242e754b 100644 --- a/examples/ovh/main.tf +++ b/examples/ovh/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } variable "pool" { diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf index 575871be2..79a458c06 100644 --- a/gcp/infrastructure.tf +++ b/gcp/infrastructure.tf @@ -8,6 +8,7 @@ module "design" { cluster_name = var.cluster_name domain = var.domain instances = var.instances + min_disk_size = 20 pool = var.pool volumes = var.volumes firewall_rules = var.firewall_rules @@ -95,7 +96,7 @@ resource "google_compute_instance" "instances" { initialize_params { image = lookup(each.value, "image", var.image) type = lookup(each.value, "disk_type", "pd-ssd") - size = lookup(each.value, "disk_size", 20) + size = each.value.disk_size } } diff --git a/gcp/versions.tf b/gcp/versions.tf index cb2d9bae3..ce57ed791 100644 --- a/gcp/versions.tf +++ b/gcp/versions.tf @@ -1,4 +1,4 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" } diff --git a/mkdocs.yml b/mkdocs.yml index fbc14f626..31549ab85 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,6 @@ site_name: Magic Castle +validation: + anchors: warn theme: name: material logo: img/logo.png diff --git a/openstack/infrastructure.tf b/openstack/infrastructure.tf index 7644ff982..1f0d3d1ad 100644 --- a/openstack/infrastructure.tf +++ b/openstack/infrastructure.tf @@ -3,6 +3,7 @@ module "design" { cluster_name = var.cluster_name domain = var.domain instances = var.instances + min_disk_size = 10 pool = var.pool volumes = var.volumes firewall_rules = var.firewall_rules @@ -58,7 +59,7 @@ data "openstack_compute_flavor_v2" "flavors" { resource "openstack_compute_instance_v2" "instances" { for_each = module.design.instances_to_build name = format("%s-%s", var.cluster_name, each.key) - image_id = lookup(each.value, "disk_size", 10) > data.openstack_compute_flavor_v2.flavors[each.value.prefix].disk ? null : data.openstack_images_image_v2.image[each.value.prefix].id + image_id = each.value.disk_size > data.openstack_compute_flavor_v2.flavors[each.value.prefix].disk ? null : data.openstack_images_image_v2.image[each.value.prefix].id flavor_name = each.value.type user_data = base64gzip(module.configuration.user_data[each.key]) @@ -76,7 +77,7 @@ resource "openstack_compute_instance_v2" "instances" { } dynamic "block_device" { - for_each = lookup(each.value, "disk_size", 10) > data.openstack_compute_flavor_v2.flavors[each.value.prefix].disk ? [{ volume_size = lookup(each.value, "disk_size", 10) }] : [] + for_each = each.value.disk_size > data.openstack_compute_flavor_v2.flavors[each.value.prefix].disk ? [{ volume_size = each.value.disk_size }] : [] content { uuid = data.openstack_images_image_v2.image[each.value.prefix].id source_type = "image" diff --git a/openstack/versions.tf b/openstack/versions.tf index 64dd2480f..1270ee941 100644 --- a/openstack/versions.tf +++ b/openstack/versions.tf @@ -1,6 +1,6 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" required_providers { openstack = { source = "terraform-provider-openstack/openstack" diff --git a/ovh/versions.tf b/ovh/versions.tf index 10c008a81..8d769ed8e 100644 --- a/ovh/versions.tf +++ b/ovh/versions.tf @@ -1,6 +1,6 @@ terraform { - required_version = ">= 1.4.0" + required_version = ">= 1.5.7" required_providers { openstack = { source = "terraform-provider-openstack/openstack"