diff --git a/chameleon_devops/terraform/clouds.yaml b/chameleon_devops/terraform/clouds.yaml deleted file mode 100644 index b456af6..0000000 --- a/chameleon_devops/terraform/clouds.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# This is a clouds.yaml file, which can be used by OpenStack tools as a source -# of configuration on how to connect to a cloud. If this is your only cloud, -# just put this file in ~/.config/openstack/clouds.yaml and tools like -# python-openstackclient will just work with no further config. (You will need -# to add your password to the auth section) -# If you have more than one cloud account, add the cloud entry to the clouds -# section of your existing file and you can refer to them by name with -# OS_CLOUD=openstack or --os-cloud=openstack -# clouds: -# uc: -# region_name: "CHI@UC" -# interface: "public" -# identity_api_version: 3 -# auth_type: "v3applicationcredential" -# auth: -# auth_url: https://chi.uc.chameleoncloud.org:5000 -# application_credential_id: "cb25b38cbf2e46c292b3e6db41a6031f" -# application_credential_secret: "ntDzeKvsZXGDLReTVENnX-40NoeXGjMY6wvTIvBzW8B8Uhp7LIvIDqQWnT7BbMYsLedm3UW__KKX_M81gQHWQg" - -# chi: -# region_name: "CHI@TACC" -# interface: "public" -# identity_api_version: 3 -# auth_type: "v3applicationcredential" -# auth: -# auth_url: https://chi.tacc.chameleoncloud.org:5000 -# application_credential_id: "f5989572a68047feac8c3abeff67b483" -# application_credential_secret: "0nimRVXbNiSpdNOXlB2D0NzUk0MwrVHoaxB67Pk2OH3rIlL62QIoDkfS2fi59De19vZJ7NnP-iI2NbQUgfVlpg" - -# kvm: -# region_name: "KVM@TACC" -# interface: "public" -# identity_api_version: 3 -# auth_type: "v3applicationcredential" -# auth: -# auth_url: https://chi.tacc.chameleoncloud.org:5000 -# application_credential_id: "dedfb7d5e3f94bfbb6331cca69bd49ab" -# application_credential_secret: "zdGt_4g08mBpjni9XdN1PLTfsId6YBZGhATnI-RuNRpOoafSISPmV0-bfm-8PtQe3gHnMCMFmkQwUMTtDd7iRA" - -clouds: - kvm: - auth: - auth_url: https://kvm.tacc.chameleoncloud.org:5000 - application_credential_id: "d70e3d0a045a475cbe144ee7fc4165b2" - application_credential_secret: "i6HIpgQxC7yv4E1B4FfwJ2lfJiD0rgK6P2vPVY_TXNSm2IaMltdM2vrvUZN_82jyKA6DKVL7uXJ_kHPwVDg7DQ" - region_name: "KVM@TACC" - interface: "public" - identity_api_version: 3 - auth_type: "v3applicationcredential" - - chi: - auth: - auth_url: https://chi.tacc.chameleoncloud.org:5000 - application_credential_id: "5197696b4ee14e4ea618cea39fa858d2" - application_credential_secret: "J5JlyFe78OiaiRiVXQRsO1cv4uivczckfetFgwRdjVl9q1tkt4Cc1e8WcJvMf5tMkJxQQzZfbXDWwa8AJsj4KQ" - region_name: "CHI@TACC" - interface: "public" - identity_api_version: 3 - auth_type: "v3applicationcredential" - - uc: - auth: - auth_url: https://chi.uc.chameleoncloud.org:5000 - application_credential_id: "8f342610b0664b379c31b77d25c675d0" - application_credential_secret: "zU32xkLrRN1MWq2oDqFHycXQ80NKHlBFIyEwid9W1igxsJWeHRGqbiPuloDYFRTbEC9xrrWzOJ94_loD7v-r0A" - region_name: "CHI@UC" - interface: "public" - identity_api_version: 3 - auth_type: "v3applicationcredential" \ No newline at end of file diff --git a/chameleon_devops/terraform/main.tf b/chameleon_devops/terraform/main.tf index a26f5a0..62547ca 100644 --- a/chameleon_devops/terraform/main.tf +++ b/chameleon_devops/terraform/main.tf @@ -1,60 +1,36 @@ ############################################# -# main.tf # +# main.tf ############################################# -locals { - use_chi_tacc = var.gpu_region == var.chi_region - use_chi_uc = var.gpu_region == var.uc_region - - // must be strings for for_each - app_ports_str = [ - "5000", "3000", "9090", - "9000", "9001", "8000", - "8265", "10001", - ] - - # path to your private SSH key (drops the ".pub") - private_key_path = "~/.ssh/${var.keypair_name}" -} - # ────────────────────────────────────────── # 1) External (public) network data sources # ────────────────────────────────────────── data "openstack_networking_network_v2" "external_kvm" { provider = openstack.kvm - name = var.ext_net_name_kvm -} - -data "openstack_networking_network_v2" "external_uc" { - provider = openstack.uc - name = var.ext_net_name_uc + name = var.ext_net_name_kvm # e.g. "sharednet4" } data "openstack_networking_network_v2" "external_chi" { provider = openstack.chi - name = var.ext_net_name_tacc + name = var.ext_net_name_chi # e.g. "public" } # ────────────────────────────────────────── -# 2) SSH Keypair import +# 2) SSH Keypair import (KVM@TACC) # ────────────────────────────────────────── -resource "openstack_compute_keypair_v2" "keypair_kvm" { +resource "openstack_compute_keypair_v2" "keypair" { provider = openstack.kvm name = var.keypair_name public_key = file(var.public_key_path) } +# SSH keypair import to CHI@UC resource "openstack_compute_keypair_v2" "keypair_uc" { provider = openstack.uc name = var.keypair_name public_key = file(var.public_key_path) } -resource "openstack_compute_keypair_v2" "keypair_chi" { - provider = openstack.chi - name = var.keypair_name - public_key = file(var.public_key_path) -} # ────────────────────────────────────────── # 3) Security Group + rules (KVM@TACC) @@ -64,16 +40,11 @@ resource "openstack_networking_secgroup_v2" "mlops_secgrp" { name = var.security_group_name description = "Security group for MLOps VMs" } -resource "openstack_networking_secgroup_v2" "mlops_secgrp_proj4" { - provider = openstack.kvm - name = "mlops-secgrp-proj4" # must be globally unique in your project - description = "Security group for MLOps VMs (proj4)" -} -# Ingress rules +# SSH resource "openstack_networking_secgroup_rule_v2" "ssh_ingress" { provider = openstack.kvm - security_group_id = openstack_networking_secgroup_v2.mlops_secgrp_proj4.id + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id direction = "ingress" protocol = "tcp" port_range_min = 22 @@ -82,36 +53,108 @@ resource "openstack_networking_secgroup_rule_v2" "ssh_ingress" { ethertype = "IPv4" } +# ICMP (ping) resource "openstack_networking_secgroup_rule_v2" "icmp_ingress" { provider = openstack.kvm - security_group_id = openstack_networking_secgroup_v2.mlops_secgrp_proj4.id + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id direction = "ingress" protocol = "icmp" remote_ip_prefix = "0.0.0.0/0" ethertype = "IPv4" } -# Intra-group traffic +# Intra‑group traffic (all TCP) resource "openstack_networking_secgroup_rule_v2" "internal" { provider = openstack.kvm - security_group_id = openstack_networking_secgroup_v2.mlops_secgrp_proj4.id + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id direction = "ingress" protocol = "tcp" port_range_min = 1 port_range_max = 65535 - remote_group_id = openstack_networking_secgroup_v2.mlops_secgrp_proj4.id + remote_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id ethertype = "IPv4" } -// App‑specific ports (loop) -resource "openstack_networking_secgroup_rule_v2" "app_ports" { - for_each = toset(local.app_ports_str) +# MLflow UI +resource "openstack_networking_secgroup_rule_v2" "mlflow_ui_ingress" { provider = openstack.kvm - security_group_id = openstack_networking_secgroup_v2.mlops_secgrp_proj4.id + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id direction = "ingress" protocol = "tcp" - port_range_min = tonumber(each.value) - port_range_max = tonumber(each.value) + port_range_min = 5000 + port_range_max = 5000 + remote_ip_prefix = "0.0.0.0/0" + ethertype = "IPv4" +} + +# Grafana +resource "openstack_networking_secgroup_rule_v2" "grafana_ingress" { + provider = openstack.kvm + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id + direction = "ingress" + protocol = "tcp" + port_range_min = 3000 + port_range_max = 3000 + remote_ip_prefix = "0.0.0.0/0" + ethertype = "IPv4" +} + +# Prometheus +resource "openstack_networking_secgroup_rule_v2" "prometheus_ingress" { + provider = openstack.kvm + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id + direction = "ingress" + protocol = "tcp" + port_range_min = 9090 + port_range_max = 9090 + remote_ip_prefix = "0.0.0.0/0" + ethertype = "IPv4" +} + +# MinIO (9000 & 9001) +resource "openstack_networking_secgroup_rule_v2" "minio_ingress" { + provider = openstack.kvm + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id + direction = "ingress" + protocol = "tcp" + port_range_min = 9000 + port_range_max = 9001 + remote_ip_prefix = "0.0.0.0/0" + ethertype = "IPv4" +} + +# FastAPI +resource "openstack_networking_secgroup_rule_v2" "fastapi_ingress" { + provider = openstack.kvm + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id + direction = "ingress" + protocol = "tcp" + port_range_min = 8000 + port_range_max = 8000 + remote_ip_prefix = "0.0.0.0/0" + ethertype = "IPv4" +} + +# Ray Dashboard +resource "openstack_networking_secgroup_rule_v2" "ray_dashboard_ingress" { + provider = openstack.kvm + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id + direction = "ingress" + protocol = "tcp" + port_range_min = 8265 + port_range_max = 8265 + remote_ip_prefix = "0.0.0.0/0" + ethertype = "IPv4" +} + +# Ray Client Server +resource "openstack_networking_secgroup_rule_v2" "ray_client_ingress" { + provider = openstack.kvm + security_group_id = openstack_networking_secgroup_v2.mlops_secgrp.id + direction = "ingress" + protocol = "tcp" + port_range_min = 10001 + port_range_max = 10001 remote_ip_prefix = "0.0.0.0/0" ethertype = "IPv4" } @@ -157,10 +200,10 @@ resource "openstack_networking_router_interface_v2" "router_intf_kvm" { # ────────────────────────────────────────── resource "openstack_compute_instance_v2" "services_node" { provider = openstack.kvm - name = "services-node-${var.network_name}" + name = "services-node-project4" image_name = var.services_image flavor_name = var.services_flavor - key_pair = openstack_compute_keypair_v2.keypair_kvm.name + key_pair = openstack_compute_keypair_v2.keypair.name security_groups = [openstack_networking_secgroup_v2.mlops_secgrp.name] network { @@ -169,23 +212,22 @@ resource "openstack_compute_instance_v2" "services_node" { } # ────────────────────────────────────────── -# 6) Private net on CHI@TACC for GPU & staging -# (use existing network to avoid stale-ID errors) +# 6) (Optional) Private net on CHI@TACC for GPU & staging # ────────────────────────────────────────── resource "openstack_networking_network_v2" "private_net_chi" { - provider = openstack.chi - name = "${var.network_name}-chi" - admin_state_up = true - port_security_enabled = false + provider = openstack.chi + name = "${var.network_name}-chi" + admin_state_up = true + port_security_enabled = false } resource "openstack_networking_subnet_v2" "private_subnet_chi" { provider = openstack.chi name = "${var.network_name}-chi-subnet" network_id = openstack_networking_network_v2.private_net_chi.id - cidr = var.network_cidr + cidr = var.network_cidr # or introduce var.network_cidr_chi ip_version = 4 - gateway_ip = var.network_gateway + gateway_ip = var.network_gateway # adjust if needed allocation_pool { start = var.network_pool_start end = var.network_pool_end @@ -193,23 +235,35 @@ resource "openstack_networking_subnet_v2" "private_subnet_chi" { dns_nameservers = var.dns_nameservers } -resource "openstack_networking_router_v2" "router_chi" { - provider = openstack.chi - name = "${var.network_name}-router-chi" - admin_state_up = true - external_network_id = data.openstack_networking_network_v2.external_chi.id -} - -resource "openstack_networking_router_interface_v2" "router_intf_chi" { - provider = openstack.chi - router_id = openstack_networking_router_v2.router_chi.id - # subnet_id = data.openstack_networking_subnet_v2.private_subnet_chi.id - subnet_id = openstack_networking_subnet_v2.private_subnet_chi.id -} - # ────────────────────────────────────────── -# 7) Private net on CHI@UC (GPU fallback) +# 7) Security Group + rules (CHI@TACC) # ────────────────────────────────────────── +# CHI@TACC security group (new) +# This leads to an error: Quota exceeded for resources: ['security_group'] +# resource "openstack_networking_secgroup_v2" "mlops_secgrp_chi" { +# provider = openstack.chi +# name = var.security_group_name +# description = "Security group for MLOps GPU VM" +# } + +# data "openstack_networking_secgroup_v2" "mlops_secgrp_chi" { +# provider = openstack.chi +# name = var.security_group_name +# } + +# SSH +# resource "openstack_networking_secgroup_rule_v2" "ssh_ingress_chi" { +# provider = openstack.chi +# # security_group_id = openstack_networking_secgroup_v2.mlops_secgrp_chi.id +# # security_group_id = data.openstack_networking_secgroup_v2.mlops_secgrp_chi.id +# direction = "ingress" +# ethertype = "IPv4" +# protocol = "tcp" +# port_range_min = 22 +# port_range_max = 22 +# remote_ip_prefix = "0.0.0.0/0" +# } + resource "openstack_networking_network_v2" "private_net_uc" { provider = openstack.uc name = "${var.network_name}-uc" @@ -231,6 +285,11 @@ resource "openstack_networking_subnet_v2" "private_subnet_uc" { dns_nameservers = var.dns_nameservers } +data "openstack_networking_network_v2" "external_uc" { + provider = openstack.uc + name = var.ext_net_name_chi +} + resource "openstack_networking_router_v2" "router_uc" { provider = openstack.uc name = "${var.network_name}-router-uc" @@ -244,48 +303,55 @@ resource "openstack_networking_router_interface_v2" "router_intf_uc" { subnet_id = openstack_networking_subnet_v2.private_subnet_uc.id } + # ────────────────────────────────────────── -# 8) GPU VM(s) on CHI depending on region +# 7) Private subnet (CHI@TACC)) # ────────────────────────────────────────── -resource "openstack_compute_instance_v2" "gpu_node_chi_tacc" { - count = local.use_chi_tacc ? 1 : 0 - provider = openstack.chi - name = "gpu-node-${var.network_name}" - image_id = var.gpu_image_id_tacc - flavor_name = var.gpu_flavor - key_pair = openstack_compute_keypair_v2.keypair_chi.name - config_drive = true +# 1) Router in CHI@TACC +resource "openstack_networking_router_v2" "router_chi" { + provider = openstack.chi + name = "${var.network_name}-router-chi" + admin_state_up = true + external_network_id = data.openstack_networking_network_v2.external_chi.id +} - block_device { - uuid = var.gpu_image_id_tacc - source_type = "image" - destination_type = "local" - boot_index = 0 - delete_on_termination = true - } +# 2) Hook CHI subnet into that router +resource "openstack_networking_router_interface_v2" "router_intf_chi" { + provider = openstack.chi + router_id = openstack_networking_router_v2.router_chi.id + subnet_id = openstack_networking_subnet_v2.private_subnet_chi.id +} - network { - # uuid = data.openstack_networking_network_v2.private_net_chi.id - uuid = openstack_networking_network_v2.private_net_chi.id - } +# ────────────────────────────────────────── +# 8) GPU VM on CHI@TACC +# ────────────────────────────────────────── +# resource "openstack_compute_instance_v2" "gpu_node" { +# provider = openstack.chi +# name = "gpu-node-project4" +# image_name = var.gpu_image +# flavor_name = var.gpu_flavor +# key_pair = openstack_compute_keypair_v2.keypair.name +# # security_groups = [data.openstack_networking_secgroup_v2.mlops_secgrp_chi.name] +# # no security_groups declared → port_security_disabled on this network - scheduler_hints { - additional_properties = { - reservation = var.gpu_reservation_id_tacc - } - } +# network { +# uuid = openstack_networking_network_v2.private_net_chi.id +# } - depends_on = [openstack_networking_subnet_v2.private_subnet_chi] - # depends_on = [data.openstack_networking_subnet_v2.private_subnet_chi] -} +# scheduler_hints { +# additional_properties = { +# reservation = var.gpu_reservation_id +# } +# } +# } + +resource "openstack_compute_instance_v2" "gpu_node" { + provider = openstack.uc + name = "gpu-node-project4" + image_name = var.gpu_image + flavor_name = var.gpu_flavor + key_pair = openstack_compute_keypair_v2.keypair_uc.name -resource "openstack_compute_instance_v2" "gpu_node_chi_uc" { - count = local.use_chi_uc ? 1 : 0 - provider = openstack.uc - name = "gpu-node-${var.network_name}" - image_id = var.gpu_image_id_uc - flavor_name = var.gpu_flavor - key_pair = openstack_compute_keypair_v2.keypair_uc.name config_drive = true block_device { @@ -307,81 +373,81 @@ resource "openstack_compute_instance_v2" "gpu_node_chi_uc" { } } + # ────────────────────────────────────────── # 9) Optional staging VM on CHI@TACC # ────────────────────────────────────────── -# resource "openstack_compute_instance_v2" "staging_node" { -# count = var.enable_staging ? 1 : 0 -# provider = openstack.chi -# name = "staging-node-${var.network_name}" -# image_name = var.staging_image -# flavor_name = var.staging_flavor -# key_pair = openstack_compute_keypair_v2.keypair_chi.name +resource "openstack_compute_instance_v2" "staging_node" { + count = var.enable_staging ? 1 : 0 + provider = openstack.chi + name = "staging-node" + image_name = var.staging_image + flavor_name = var.staging_flavor + key_pair = openstack_compute_keypair_v2.keypair.name + security_groups = [openstack_networking_secgroup_v2.mlops_secgrp.name] -# network { -# uuid = data.openstack_networking_network_v2.private_net_chi.id -# } -# } + network { + uuid = openstack_networking_network_v2.private_net_chi.id + } +} # ────────────────────────────────────────── # 10) Floating IPs & associations # ────────────────────────────────────────── +# Services node resource "openstack_networking_floatingip_v2" "fip_services" { provider = openstack.kvm pool = data.openstack_networking_network_v2.external_kvm.name } - resource "openstack_compute_floatingip_associate_v2" "assoc_services" { provider = openstack.kvm floating_ip = openstack_networking_floatingip_v2.fip_services.address instance_id = openstack_compute_instance_v2.services_node.id } -resource "openstack_networking_floatingip_v2" "fip_gpu_chi_tacc" { - count = local.use_chi_tacc ? 1 : 0 - provider = openstack.chi - pool = data.openstack_networking_network_v2.external_chi.name -} - -resource "openstack_compute_floatingip_associate_v2" "assoc_gpu_chi_tacc" { - count = local.use_chi_tacc ? 1 : 0 - provider = openstack.chi - floating_ip = openstack_networking_floatingip_v2.fip_gpu_chi_tacc[0].address - instance_id = openstack_compute_instance_v2.gpu_node_chi_tacc[0].id -} +# GPU node +# resource "openstack_networking_floatingip_v2" "fip_gpu" { +# provider = openstack.chi +# pool = data.openstack_networking_network_v2.external_chi.name +# } +# resource "openstack_compute_floatingip_associate_v2" "assoc_gpu" { +# provider = openstack.chi +# floating_ip = openstack_networking_floatingip_v2.fip_gpu.address +# instance_id = openstack_compute_instance_v2.gpu_node.id +# } -resource "openstack_networking_floatingip_v2" "fip_gpu_chi_uc" { - count = local.use_chi_uc ? 1 : 0 +resource "openstack_networking_floatingip_v2" "fip_gpu_uc" { provider = openstack.uc pool = data.openstack_networking_network_v2.external_uc.name } -resource "openstack_compute_floatingip_associate_v2" "assoc_gpu_chi_uc" { - count = local.use_chi_uc ? 1 : 0 +resource "openstack_compute_floatingip_associate_v2" "assoc_gpu_uc" { provider = openstack.uc - floating_ip = openstack_networking_floatingip_v2.fip_gpu_chi_uc[0].address - instance_id = openstack_compute_instance_v2.gpu_node_chi_uc[0].id + floating_ip = openstack_networking_floatingip_v2.fip_gpu_uc.address + instance_id = openstack_compute_instance_v2.gpu_node.id } -# resource "openstack_networking_floatingip_v2" "fip_staging" { -# count = var.enable_staging ? 1 : 0 -# provider = openstack.chi -# pool = data.openstack_networking_network_v2.external_chi.name -# } -# resource "openstack_compute_floatingip_associate_v2" "assoc_staging" { -# count = var.enable_staging ? 1 : 0 -# provider = openstack.chi -# floating_ip = openstack_networking_floatingip_v2.fip_staging[count.index].address -# instance_id = openstack_compute_instance_v2.staging_node[count.index].id -# } +# Staging node (if enabled) +resource "openstack_networking_floatingip_v2" "fip_staging" { + count = var.enable_staging ? 1 : 0 + provider = openstack.chi + pool = data.openstack_networking_network_v2.external_chi.name +} +resource "openstack_compute_floatingip_associate_v2" "assoc_staging" { + count = var.enable_staging ? 1 : 0 + provider = openstack.chi + floating_ip = openstack_networking_floatingip_v2.fip_staging[count.index].address + instance_id = openstack_compute_instance_v2.staging_node[count.index].id +} -# ────────────────────────────────────────── -# 11) Persistent storage -# ────────────────────────────────────────── + +# STORAGE + +# Persistent storage for Services Node (KVM) resource "openstack_blockstorage_volume_v3" "persistent_volume_services" { provider = openstack.kvm - size = 100 + size = 100 # Adjust size (GB) as required name = "persistent-storage-services" } @@ -389,33 +455,34 @@ resource "openstack_compute_volume_attach_v2" "attach_persistent_services" { provider = openstack.kvm instance_id = openstack_compute_instance_v2.services_node.id volume_id = openstack_blockstorage_volume_v3.persistent_volume_services.id - device = "/dev/vdb" + device = "/dev/vdb" # typical first device } -resource "openstack_blockstorage_volume_v3" "persistent_volume_gpu_uc" { - count = local.use_chi_uc && var.enable_gpu_block_storage ? 1 : 0 +resource "openstack_blockstorage_volume_v3" "persistent_volume_gpu" { + count = var.enable_gpu_block_storage ? 1 : 0 provider = openstack.uc - size = 100 name = "gpu-persistent-volume" + size = 100 } -resource "openstack_compute_volume_attach_v2" "attach_gpu_volume_uc" { - count = local.use_chi_uc && var.enable_gpu_block_storage ? 1 : 0 +resource "openstack_compute_volume_attach_v2" "attach_gpu_volume" { + count = var.enable_gpu_block_storage ? 1 : 0 provider = openstack.uc - instance_id = openstack_compute_instance_v2.gpu_node_chi_uc[0].id - volume_id = openstack_blockstorage_volume_v3.persistent_volume_gpu_uc[0].id + instance_id = openstack_compute_instance_v2.gpu_node.id + volume_id = openstack_blockstorage_volume_v3.persistent_volume_gpu[0].id } -resource "openstack_blockstorage_volume_v3" "persistent_volume_gpu_tacc" { - count = local.use_chi_tacc && var.enable_gpu_block_storage ? 1 : 0 - provider = openstack.chi - size = 100 - name = "gpu-persistent-volume" -} +# Persistent storage for GPU Node (UC) +# resource "openstack_blockstorage_volume_v3" "persistent_volume_gpu" { +# provider = openstack.uc +# size = 100 # Adjust size (GB) as required +# name = "persistent-storage-gpu" +# } + +# resource "openstack_compute_volume_attach_v2" "attach_persistent_gpu" { +# provider = openstack.uc +# instance_id = openstack_compute_instance_v2.gpu_node.id +# volume_id = openstack_blockstorage_volume_v3.persistent_volume_gpu.id +# device = "/dev/vdb" +# } -resource "openstack_compute_volume_attach_v2" "attach_gpu_volume_tacc" { - count = local.use_chi_tacc && var.enable_gpu_block_storage ? 1 : 0 - provider = openstack.chi - instance_id = openstack_compute_instance_v2.gpu_node_chi_tacc[0].id - volume_id = openstack_blockstorage_volume_v3.persistent_volume_gpu_tacc[0].id -} diff --git a/chameleon_devops/terraform/outputs.tf b/chameleon_devops/terraform/outputs.tf index 17d8c14..f3eaac7 100644 --- a/chameleon_devops/terraform/outputs.tf +++ b/chameleon_devops/terraform/outputs.tf @@ -1,41 +1,43 @@ +# This playbook installs Docker and the NVIDIA container runtime on Ubuntu 24.04. output "services_node_ip" { value = openstack_networking_floatingip_v2.fip_services.address } - output "ssh_command_services_node" { - value = "ssh -i ~/.ssh/${openstack_compute_keypair_v2.keypair_kvm.name} cc@${openstack_networking_floatingip_v2.fip_services.address}" + description = "SSH command to connect to the Services VM using the floating IP" + value = "ssh -i ~/.ssh/${openstack_compute_keypair_v2.keypair.name} cc@${openstack_networking_floatingip_v2.fip_services.address}" } - output "gpu_node_ip" { - value = local.use_chi_tacc ? openstack_networking_floatingip_v2.fip_gpu_chi_tacc[0].address : openstack_networking_floatingip_v2.fip_gpu_chi_uc[0].address + value = openstack_networking_floatingip_v2.fip_gpu_uc.address } - output "ssh_command_gpu_node" { - value = local.use_chi_tacc ? "ssh -i ${local.private_key_path} cc@${openstack_networking_floatingip_v2.fip_gpu_chi_tacc[0].address}" : "ssh -i ${local.private_key_path} cc@${openstack_networking_floatingip_v2.fip_gpu_chi_uc[0].address}" + description = "SSH command to connect to the GPU VM using the floating IP" + value = "ssh -i ~/.ssh/${openstack_compute_keypair_v2.keypair_uc.name} cc@${openstack_networking_floatingip_v2.fip_gpu_uc.address}" } - output "services_private_ip" { value = openstack_compute_instance_v2.services_node.network[0].fixed_ip_v4 } - output "gpu_private_ip" { - value = local.use_chi_tacc ? openstack_compute_instance_v2.gpu_node_chi_tacc[0].network[0].fixed_ip_v4 : openstack_compute_instance_v2.gpu_node_chi_uc[0].network[0].fixed_ip_v4 + value = openstack_compute_instance_v2.gpu_node.network[0].fixed_ip_v4 } +# Optionally, output an Ansible inventory snippet output "ansible_inventory" { - value = <<-EOT + description = "Suggested inventory snippet for Ansible" + # use a “left‑trim” heredoc so any indentation is removed + value = <<-EOF [services_nodes] services-node ansible_host=${openstack_networking_floatingip_v2.fip_services.address} ansible_user=cc [gpu_nodes] - gpu-node ansible_host=${local.use_chi_tacc ? openstack_networking_floatingip_v2.fip_gpu_chi_tacc[0].address : openstack_networking_floatingip_v2.fip_gpu_chi_uc[0].address} ansible_user=cc + gpu-node ansible_host=${openstack_networking_floatingip_v2.fip_gpu_uc.address} ansible_user=cc [k8s_control_plane] services-node ansible_host=${openstack_networking_floatingip_v2.fip_services.address} ansible_user=cc [all:vars] - ansible_ssh_private_key_file=${local.private_key_path} + ansible_ssh_private_key_file=~/.ssh/${openstack_compute_keypair_v2.keypair.name} ansible_python_interpreter=/usr/bin/python3 - EOT + EOF } + diff --git a/chameleon_devops/terraform/terraform.tfvars b/chameleon_devops/terraform/terraform.tfvars index 3674da6..110936f 100644 --- a/chameleon_devops/terraform/terraform.tfvars +++ b/chameleon_devops/terraform/terraform.tfvars @@ -1,7 +1,4 @@ # cloud = "chameleon" - -# gpu_site = "chi_tacc" # or "chi_uc" -gpu_site = "chi_uc" # or "chi_tacc" cloud = "kvm" cloud_kvm = "kvm" @@ -33,14 +30,8 @@ enable_staging = false # gpu_reservation_id = "47d0e0cd-883c-4639-a646-a47fde6e7c4a" # gpu_reservation_id = "8c7169a8-0f36-4d4a-a1f3-eaddf7646112" # gpu_reservation_id = "15b16b92-95c9-4d4e-87b9-5e2eac70ee29" -# gpu_reservation_id = "d97ab437-5cab-47a6-99dc-001ba9822c18" - -gpu_image_id_uc = "45661d6e-d442-48b2-892f-e39a246011cc" -gpu_image_id_tacc = "fab0dfeb-52d7-46fd-9398-613daf8e63c0" +gpu_reservation_id = "d97ab437-5cab-47a6-99dc-001ba9822c18" +gpu_image_id_uc = "45661d6e-d442-48b2-892f-e39a246011cc" # gpu_reservation_id_uc = "d97ab437-5cab-47a6-99dc-001ba9822c18" -# gpu_reservation_id_uc = "1def9c07-1eda-44fc-9564-dae33761ba88" -# gpu_reservation_id_uc = "dfac712e-3045-4a19-ad2c-7ff0a0ba8d45" -gpu_reservation_id_uc = "323712db-e65b-47dd-bf48-3df29532959a" -gpu_reservation_id_tacc = "1e1edcc2-7276-4201-a5d4-408ddb6f850a" -private_net_chi_id = "c95788a7-75fd-422d-91af-8bd31704f03f" +gpu_reservation_id_uc = "1def9c07-1eda-44fc-9564-dae33761ba88" \ No newline at end of file diff --git a/chameleon_devops/terraform/variables.tf b/chameleon_devops/terraform/variables.tf index 06d121c..f550309 100644 --- a/chameleon_devops/terraform/variables.tf +++ b/chameleon_devops/terraform/variables.tf @@ -58,12 +58,7 @@ variable "ext_net_name_kvm" { type = string default = "public" } -variable "ext_net_name_uc" { - description = "Public network in KVM@TACC" - type = string - default = "public" -} -variable "ext_net_name_tacc" { +variable "ext_net_name_chi" { description = "Public network in CHI@TACC" type = string default = "public" # <— adjust if your CHI site uses a different name @@ -118,12 +113,6 @@ variable "network_cidr" { default = "10.0.0.0/24" } -variable "private_net_chi_id" { - description = "The single CHI@TACC private-network ID to attach GPU instances to" - type = string -} - - # Security group variable "security_group_name" { description = "Name for the security group" @@ -188,28 +177,8 @@ variable "gpu_reservation_id_uc" { type = string } -variable "gpu_image_id_tacc" { - description = "GPU image UUID for CHI@UC" - type = string -} - -variable "gpu_reservation_id_tacc" { - description = "Reservation UUID for GPU node in CHI@UC" - type = string -} - variable "enable_gpu_block_storage" { description = "Whether to attach persistent block storage to the GPU node" type = bool default = false } - -variable "gpu_site" { - description = "Which CHI site to use for the GPU host: chi_tacc or chi_uc" - type = string - default = "chi_tacc" - validation { - condition = contains(["chi_tacc", "chi_uc"], var.gpu_site) - error_message = "gpu_site must be either \"chi_tacc\" or \"chi_uc\"" - } -} diff --git a/devops/k8s/fastapi.yaml b/devops/k8s/fastapi.yaml new file mode 100644 index 0000000..e69de29 diff --git a/devops/scripts/deploy_api.sh b/devops/scripts/deploy_api.sh new file mode 100644 index 0000000..e69de29