From f780349c6c9ad8060c86129f0c5a7f552e320e6b Mon Sep 17 00:00:00 2001 From: Sanjay Srikakulam Date: Wed, 15 Oct 2025 10:58:52 +0200 Subject: [PATCH 1/2] Add Ansible playbook and roles to configure the host and deploy Koina server --- .gitignore | 6 + README.md | 4 +- docs/server/.gpu-driver.yaml.swp | Bin 12288 -> 0 bytes docs/server/deployment/ansible/README.md | 54 ++++++++ docs/server/deployment/ansible/hosts | 6 + .../deployment/ansible/koina_server.yml | 131 ++++++++++++++++++ .../deployment/ansible/requirements.yml | 13 ++ .../ansible/roles/koina-server/README.md | 26 ++++ .../roles/koina-server/defaults/main.yml | 7 + .../roles/koina-server/handlers/main.yml | 1 + .../ansible/roles/koina-server/meta/main.yml | 45 ++++++ .../ansible/roles/koina-server/tasks/main.yml | 34 +++++ .../templates/docker-compose.yml.j2 | 38 +++++ .../ansible/roles/koina-server/vars/main.yml | 2 + .../roles/nvidia-container-toolkit/README.md | 29 ++++ .../defaults/main.yml | 1 + .../nvidia-container-toolkit/meta/main.yml | 45 ++++++ .../nvidia-container-toolkit/tasks/main.yml | 35 +++++ .../ansible/templates/nginx/koina.conf.j2 | 63 +++++++++ .../ansible/templates/nginx/koinarpc.conf.j2 | 60 ++++++++ 20 files changed, 598 insertions(+), 2 deletions(-) delete mode 100644 docs/server/.gpu-driver.yaml.swp create mode 100644 docs/server/deployment/ansible/README.md create mode 100644 docs/server/deployment/ansible/hosts create mode 100644 docs/server/deployment/ansible/koina_server.yml create mode 100644 docs/server/deployment/ansible/requirements.yml create mode 100644 docs/server/deployment/ansible/roles/koina-server/README.md create mode 100644 docs/server/deployment/ansible/roles/koina-server/defaults/main.yml create mode 100644 docs/server/deployment/ansible/roles/koina-server/handlers/main.yml create mode 100644 docs/server/deployment/ansible/roles/koina-server/meta/main.yml create mode 100644 docs/server/deployment/ansible/roles/koina-server/tasks/main.yml create mode 100644 
docs/server/deployment/ansible/roles/koina-server/templates/docker-compose.yml.j2 create mode 100644 docs/server/deployment/ansible/roles/koina-server/vars/main.yml create mode 100644 docs/server/deployment/ansible/roles/nvidia-container-toolkit/README.md create mode 100644 docs/server/deployment/ansible/roles/nvidia-container-toolkit/defaults/main.yml create mode 100644 docs/server/deployment/ansible/roles/nvidia-container-toolkit/meta/main.yml create mode 100644 docs/server/deployment/ansible/roles/nvidia-container-toolkit/tasks/main.yml create mode 100644 docs/server/deployment/ansible/templates/nginx/koina.conf.j2 create mode 100644 docs/server/deployment/ansible/templates/nginx/koinarpc.conf.j2 diff --git a/.gitignore b/.gitignore index dd8bd20e..f3ab079b 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,9 @@ models/ms2pip/**/*.json # Hide symlink in models/repo models/repo/* node_modules + +# Ansible +*ansible.log +*.vault_password +*collections/ +*geerlingguy* diff --git a/README.md b/README.md index a7b207c9..032d1ceb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Accessing a public server ### cURL -Here is an example HTTP request using only cURL sending a POST request to with a JSON body. You can find examples for all available models at https://koina.wilhelmlab.org/. +Here is an example HTTP request using only cURL sending a POST request to with a JSON body. You can find examples for all available models at https://koina.wilhelmlab.org/. ```bash curl "https://koina.wilhelmlab.org/v2/models/Prosit_2019_intensity/infer" \ @@ -101,7 +101,7 @@ For examples of how to access models using Python, you can check out [our OpenAP Koina depends on [docker](https://docs.docker.com/engine/install/) and [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html). It has only been tested on Linux (Debian/Ubuntu) with Nvidia GPUs. 
-You can find an ansible script that installs all dependencies [here](docs/server/). +You can find an Ansible playbook that installs all dependencies and sets up the Koina server [here](docs/server/deployment/ansible/). ### How to run it After installing the dependencies, you can pull the docker image and run it. If you have multiple GPUs installed on your server, you can choose which one is used by modifying `--gpus '"device=0"'`. The time it takes to pull the image depends on your connection speed. The first time, it might take up to 5 min. Due to the layered design of Docker images, updating to the latest version will likely (depending on the amount of changes) only take seconds. When the server is first started, Model files are downloaded from Zenodo. The duration of this also depends on connection speed but might take ~10 min as well. Once models are downloaded, the server startup takes ~2 minutes. diff --git a/docs/server/.gpu-driver.yaml.swp b/docs/server/.gpu-driver.yaml.swp deleted file mode 100644 index a78b3b81e2856a48123b616f5dbece6b8aa72f03..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2O^Xyq7{_aKRbxKD6fc3fZM z)qD#Lc<|~c5cJ|L1O#t_$3Q+nRQ&hbtcZ$xTzG2uPrp1>PyL@?)n%9phQF`;N{j9> zf%csc|LprKI`n*e<=oXdk-CXAabDJ;FSRgFi?nja#l%S#hpxI7=-i!NpSpIEx{;a< zz1=4qOtp!XNsSB!Nv<>Luj9+K_k$=8t3jp84OI^6oKdN9>u$=_%u?|U6JP?{AuzG) z&f=c6g=0sD^{^l257YjAYum}deVG6gU;<2l2`~XBzyz286WHMdY%wS9poVX{I)3b~ zb01y1N4_utCcp%k025#WOn?b60Vco%m;e)C0y~g^jD+}mmk{@M!#w`~?|=Wl+bhHy z=zr)n^a6SUJ%%1Z_n^B_44r`vKySVk;y>sv^apeq`UN@(EkNHx|LzguCG-q>0NscF zglNfC(@GCcp%k02BCt0QJqP%KA7` zoXRZnXd34-)uIm;IfA7M^4$87GIG1pa3;u|{B(t%wo5h@*<=O$^=w|yMK;pqg2rFGs7 zn`#^T+N!jfyY3|mRA2jBx^bfp+`2`RB0^Puh#A#V>x-zjqqEAw9u+c7Y zgInh`b56bU&Sq_lXgXGwoNR-f^A5$<7L_*`7?}o<#8L6e1-Lx z@lTz-I2!qv&ij{s9{I~_r&m|~b1TcE)!#--M+to#(M&^Oik)?{d64ZYy1_&nt21O$ z#?;Hym)}>=&PUCBbi~R;RV1-JwNXEHY1>6NI-6W`VkIM0;!wQ8I_a?M@gZoBY&lV_ q2`tGcro=*K%21aI9X`W-EN!exe_e+4R_d~TVq/defaults/main.yml`). 
+- Override values per-host or per-group using `host_vars/` or `group_vars/` or pass via `-e` on the command line. +- Ensure to review and modify accordingly: + - `koina_server.yml` for the overall orchestration and variable flow. + - `roles/*/defaults/main.yml` to find complete variable names and defaults. + +## Nginx templates + +- Nginx virtual host templates live under `templates/nginx` and are consumed by the `geerlingguy.nginx` role. Modify or copy these templates to customize upstreams, SSL, or proxy rules before running the playbook. + +Example templates path: +``` +templates/nginx/koina.conf.j2 +templates/nginx/koinarpc.conf.j2 +``` + +### TLS/SSL Certificates +- The playbook uses the `geerlingguy.certbot` role to automatically obtain and renew TLS/SSL certificates from Let's Encrypt using Certbot. +- Ensure that the domain names specified in the variables are correctly pointed to your server's IP address before running the playbook. +- Port 80 must be open and accessible for the HTTP-01 challenge used by Let's Encrypt. +- The email address provided in the variables is used for important account notifications from Let's Encrypt. 
\ No newline at end of file diff --git a/docs/server/deployment/ansible/hosts b/docs/server/deployment/ansible/hosts new file mode 100644 index 00000000..33bc53b6 --- /dev/null +++ b/docs/server/deployment/ansible/hosts @@ -0,0 +1,6 @@ +[koina_servers] +koina-bi-01 ansible_host= + +[all:vars] +ansible_ssh_user=ubuntu # Change this if your server uses a different user (it is assumed that this user has sudo privileges) +ansible_ssh_private_key_file=~/.ssh/id_rsa # Path to your SSH private key diff --git a/docs/server/deployment/ansible/koina_server.yml b/docs/server/deployment/ansible/koina_server.yml new file mode 100644 index 00000000..08bcb100 --- /dev/null +++ b/docs/server/deployment/ansible/koina_server.yml @@ -0,0 +1,131 @@ +--- +# Playbook to deploy Minio on a VM +- hosts: koina_servers + become: yes + vars_files: + - secret_group_vars/all.yml + vars: + koinarpc_docker_port: 8500 + koinahttp_docker_port: 8501 + koinametrics_docker_port: 8502 + KOINA_SERVER_NAME: "yourdomain.com" # Change this to your domain + KOINA_RPC_SERVER_NAME: "rpc.yourdomain.com" # Change this to your domain + ADMIN_EMAIL_ADDRESS: "admin@yourdomain.com" # Change this to your email address for Certbot/Let's Encrypt + KOINA_CONTAINER_DIR: "/opt/koina" # Directory to store Koina Docker container data and the compose file + pre_tasks: + - name: Update and upgrade apt packages + ansible.builtin.apt: + update_cache: yes + upgrade: dist + + - name: Install dependencies + ansible.builtin.apt: + name: + - python3 + - python3-venv + - python3-pip + - ufw + - ubuntu-drivers-common + state: present + update_cache: yes + + - name: Enable UFW + community.general.ufw: + state: enabled + + - name: Allow SSH (port 22) + community.general.ufw: + rule: allow + port: '22' + proto: tcp + + - name: Allow HTTP (port 80) + community.general.ufw: + rule: allow + port: '80' + proto: tcp + + - name: Allow HTTPS (port 443) + community.general.ufw: + rule: allow + port: '443' + proto: tcp + + - name: Allow all 
outgoing traffic + community.general.ufw: + default: allow + direction: outgoing + + - name: Deny all other incoming traffic + community.general.ufw: + default: deny + direction: incoming + + - name: Install Nvidia driver for GPU # This is not idempotent (need to re-work this). + ansible.builtin.command: ubuntu-drivers install --gpgpu + register: nvidia_install + changed_when: "'installed' in nvidia_install.stdout" + + - name: Detect installed Nvidia server driver versions + ansible.builtin.command: bash -c "dpkg -l | awk '/nvidia-compute-utils-[0-9]+-server/{print $2}' | sort -V | tail -n 1" + register: nvidia_driver_pkg + changed_when: false + + - name: Extract Nvidia driver version number + ansible.builtin.set_fact: + nvidia_driver_version: "{{ nvidia_driver_pkg.stdout | regex_search('[0-9]+') }}" + + - name: Install matching Nvidia server-utils package + ansible.builtin.apt: + name: "nvidia-utils-{{ nvidia_driver_version }}-server" + state: present + update_cache: yes + when: nvidia_driver_version is defined and nvidia_driver_version | length > 0 + + - name: Reboot if Nvidia driver was installed # This is not idempotent due to the driver install step above (so reboot always runs :( ). 
+ ansible.builtin.reboot: + msg: "Rebooting after Nvidia driver installation" + pre_reboot_delay: 10 + when: "'installed' in nvidia_install.stdout" + + - name: Check if nvidia-smi works + ansible.builtin.command: nvidia-smi + register: nvidia_smi + failed_when: nvidia_smi.rc != 0 + changed_when: false + roles: + - role: geerlingguy.docker + vars: + docker_users: + - 'ubuntu' + + - role: nvidia-container-toolkit + + - role: geerlingguy.nginx + vars: + nginx_remove_default_vhost: true + nginx_vhosts: + - server_name: "{{ KOINA_SERVER_NAME }}" + template: "{{ playbook_dir }}/templates/nginx/koina.conf.j2" + - server_name: "{{ KOINA_RPC_SERVER_NAME }}" + template: "{{ playbook_dir }}/templates/nginx/koinarpc.conf.j2" + ssl_certificate_path: '/etc/letsencrypt/live/{{ KOINA_SERVER_NAME }}/fullchain.pem' + ssl_certificate_key_path: '/etc/letsencrypt/live/{{ KOINA_SERVER_NAME }}/privkey.pem' + + - role: geerlingguy.certbot + vars: + certbot_create_if_missing: true + certbot_create_extra_args: '' + certbot_create_method: standalone + certbot_admin_email: "{{ ADMIN_EMAIL_ADDRESS }}" + certbot_create_standalone_stop_services: + - nginx + certbot_certs: + - domains: + - "{{ KOINA_SERVER_NAME }}" + - "{{ KOINA_RPC_SERVER_NAME }}" + webroot: '/var/www/certbot' + + - role: koina-server + vars: + koina_container_dir: "{{ KOINA_CONTAINER_DIR }}" diff --git a/docs/server/deployment/ansible/requirements.yml b/docs/server/deployment/ansible/requirements.yml new file mode 100644 index 00000000..2ecd23ac --- /dev/null +++ b/docs/server/deployment/ansible/requirements.yml @@ -0,0 +1,13 @@ +--- +collections: + - name: community.general + source: https://galaxy.ansible.com + - name: community.docker + source: https://galaxy.ansible.com +roles: + - name: geerlingguy.docker + version: 7.6.0 + - name: geerlingguy.certbot + version: 5.4.1 + - name: geerlingguy.nginx + version: 3.2.0 diff --git a/docs/server/deployment/ansible/roles/koina-server/README.md 
b/docs/server/deployment/ansible/roles/koina-server/README.md new file mode 100644 index 00000000..90caa8a8 --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/README.md @@ -0,0 +1,26 @@ +# Ansible role: koina-server + +Role to provision and configure a Koina inference server. + +This role deploys the Koina server container with Docker Compose: it creates the container directory, renders the Compose file from the role's template, and starts the service with GPU access. Docker, the NVIDIA drivers, and the NVIDIA Container Toolkit must already be installed (see Requirements). It is intended for use from the repository's Ansible playbook [koina_server.yml](https://github.com/wilhelm-lab/koina/tree/main/docs/server/deployment/ansible/koina_server.yml). + +## Features +- Deploys Koina container with GPU support. + +## Requirements +- A target host with sudo privileges. +- Internet access to download packages and model artifacts. +- Docker, NVIDIA Container Toolkit, NVIDIA drivers. +- Other roles in the ansible roles directory as well as the [koina_server.yml](https://github.com/wilhelm-lab/koina/tree/main/docs/server/deployment/ansible/koina_server.yml) playbook. + +## Role variables +Define role variables in your playbook or inventory group_vars/host_vars. Typical variables include (examples only — adjust for your environment): + +- koina_container_name: "koina-server" +- koina_container_dir: "/opt/koina" +- koinarpc_docker_port: 8500 +- koinahttp_docker_port: 8501 +- koinametrics_docker_port: 8502 +- koina_shm_size: '8gb' + +(Note: Adapt the variables to your specific needs and also the Docker Compose template in the role's templates/ directory.)
diff --git a/docs/server/deployment/ansible/roles/koina-server/defaults/main.yml b/docs/server/deployment/ansible/roles/koina-server/defaults/main.yml new file mode 100644 index 00000000..f40e7042 --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/defaults/main.yml @@ -0,0 +1,7 @@ +--- +koina_container_name: "koina-server" +koina_container_dir: "/opt/koina" +koinarpc_docker_port: 8500 +koinahttp_docker_port: 8501 +koinametrics_docker_port: 8502 +koina_shm_size: "2gb" diff --git a/docs/server/deployment/ansible/roles/koina-server/handlers/main.yml b/docs/server/deployment/ansible/roles/koina-server/handlers/main.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/handlers/main.yml @@ -0,0 +1 @@ +--- diff --git a/docs/server/deployment/ansible/roles/koina-server/meta/main.yml b/docs/server/deployment/ansible/roles/koina-server/meta/main.yml new file mode 100644 index 00000000..29f62807 --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/meta/main.yml @@ -0,0 +1,45 @@ +galaxy_info: + author: Sanjay Srikakulam + description: Ansible role to deploy and manage KOINA inference server + company: Forschungszentrum Jülich + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: MIT + + min_ansible_version: "2.1" + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + platforms: + - name: Ubuntu + versions: + - all + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/docs/server/deployment/ansible/roles/koina-server/tasks/main.yml b/docs/server/deployment/ansible/roles/koina-server/tasks/main.yml new file mode 100644 index 00000000..44d64bca --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/tasks/main.yml @@ -0,0 +1,34 @@ +--- +- name: Ensure koina container directory exists + ansible.builtin.file: + path: "{{ koina_container_dir }}" + state: directory + owner: root + group: root + mode: '0755' + +- name: Template docker-compose.yml + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ koina_container_dir }}/docker-compose.yml" + owner: root + group: root + mode: '0644' + +- name: Start koina server container + community.docker.docker_compose_v2: + project_src: "{{ koina_container_dir }}" + files: + - docker-compose.yml + state: present + +# - name: Ensure Koina server is running by curl health endpoint +# ansible.builtin.uri: +# url: "http://localhost:{{ koinahttp_docker_port }}/v2/health/ready" +# method: GET +# return_content: true +# status_code: 200 +# register: koina_health_check +# retries: 3 +# delay: 10 +# until: koina_health_check.status == 200 diff --git a/docs/server/deployment/ansible/roles/koina-server/templates/docker-compose.yml.j2 
b/docs/server/deployment/ansible/roles/koina-server/templates/docker-compose.yml.j2 new file mode 100644 index 00000000..891072ad --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/templates/docker-compose.yml.j2 @@ -0,0 +1,38 @@ +services: + koina-server: + container_name: "{{ koina_container_name }}" + image: ghcr.io/wilhelm-lab/koina:latest + restart: unless-stopped + shm_size: "{{ koina_shm_size }}" + ulimits: + memlock: + soft: -1 + hard: -1 + stack: + soft: 67108864 + hard: 67108864 + ports: # Published on loopback by default: Docker-published ports bypass UFW, and nginx proxies to 127.0.0.1 (override with koina_bind_address) + - "{{ koina_bind_address | default('127.0.0.1') }}:{{ koinarpc_docker_port }}:{{ koinarpc_docker_port }}" + - "{{ koina_bind_address | default('127.0.0.1') }}:{{ koinahttp_docker_port }}:{{ koinahttp_docker_port }}" + - "{{ koina_bind_address | default('127.0.0.1') }}:{{ koinametrics_docker_port }}:{{ koinametrics_docker_port }}" + networks: + - koinanetwork + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + environment: + - NVIDIA_VISIBLE_DEVICES=all + healthcheck: + test: ["CMD-SHELL", "curl -v http://localhost:{{ koinahttp_docker_port }}/v2/health/ready || exit 1"] + interval: 2m + timeout: 10s + retries: 3 + start_period: 10m + +networks: + koinanetwork: + driver: bridge diff --git a/docs/server/deployment/ansible/roles/koina-server/vars/main.yml b/docs/server/deployment/ansible/roles/koina-server/vars/main.yml new file mode 100644 index 00000000..cd21505a --- /dev/null +++ b/docs/server/deployment/ansible/roles/koina-server/vars/main.yml @@ -0,0 +1,2 @@ +--- + diff --git a/docs/server/deployment/ansible/roles/nvidia-container-toolkit/README.md b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/README.md new file mode 100644 index 00000000..dfadb0cf --- /dev/null +++ b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/README.md @@ -0,0 +1,29 @@ +# Ansible role: nvidia-container-toolkit + +Role to install and configure the NVIDIA Container Toolkit (nvidia-docker / nvidia-container-runtime) on target hosts (Ubuntu only) so Docker/containers can access NVIDIA GPUs.
+ +This role prepares the host by adding NVIDIA package repositories, installing the runtime/toolkit packages, and configuring Docker to use the NVIDIA runtime when requested. + +## Features +- Adds NVIDIA apt repositories +- Installs nvidia-container-toolkit +- Configures Nvidia runtime for Docker and restarts Docker daemon + +## Requirements +- Sudo/root privileges on target hosts +- Docker engine already installed (this role does not install Docker itself) +- Internet access to fetch NVIDIA packages and GPG keys +- Compatible NVIDIA driver installed on host (driver installation is out of scope) + +## Example playbook +```yaml +- hosts: gpu-servers + become: true + roles: + - nvidia-container-toolkit +``` + +Run with your inventory: +```bash +ansible-playbook -i inventory.ini playbooks/setup-gpu.yml --ask-become-pass +``` diff --git a/docs/server/deployment/ansible/roles/nvidia-container-toolkit/defaults/main.yml b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/defaults/main.yml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/defaults/main.yml @@ -0,0 +1 @@ +--- diff --git a/docs/server/deployment/ansible/roles/nvidia-container-toolkit/meta/main.yml b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/meta/main.yml new file mode 100644 index 00000000..39fa6725 --- /dev/null +++ b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/meta/main.yml @@ -0,0 +1,45 @@ +galaxy_info: + author: Sanjay Srikakulam + description: Ansible role to install and configure NVIDIA Container Toolkit + company: Forschungszentrum Jülich + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 
+ # - CC-BY-4.0 + license: MIT + + min_ansible_version: "2.1" + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + platforms: + - name: Ubuntu + versions: + - all + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
diff --git a/docs/server/deployment/ansible/roles/nvidia-container-toolkit/tasks/main.yml b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/tasks/main.yml new file mode 100644 index 00000000..85a96368 --- /dev/null +++ b/docs/server/deployment/ansible/roles/nvidia-container-toolkit/tasks/main.yml @@ -0,0 +1,35 @@ +--- +- name: Add apt key for NVIDIA toolkit + ansible.builtin.apt_key: + url: "https://nvidia.github.io/libnvidia-container/gpgkey" + state: present + +- name: NVIDIA toolkit repository is registered + ansible.builtin.apt_repository: + repo: 'deb https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /' + state: present + filename: nvidia_container_toolkit + update_cache: "{{ 'yes' if not ansible_check_mode | bool else 'no' }}" + register: nvidia_container_toolkit_apt_repo + +- name: Install NVIDIA container toolkit + ansible.builtin.apt: + name: nvidia-container-toolkit + state: present + +- name: Configure NVIDIA container runtime for Docker + ansible.builtin.command: nvidia-ctk runtime configure --runtime=docker + register: configure_runtime + changed_when: "'already configured' not in configure_runtime.stdout" + +- name: Restart Docker to apply NVIDIA runtime + ansible.builtin.systemd: + name: docker + state: restarted + enabled: true + +- name: Verify NVIDIA driver + ansible.builtin.command: nvidia-smi + register: nvidia_smi + changed_when: false + failed_when: nvidia_smi.rc != 0 diff --git a/docs/server/deployment/ansible/templates/nginx/koina.conf.j2 b/docs/server/deployment/ansible/templates/nginx/koina.conf.j2 new file mode 100644 index 00000000..c8df9696 --- /dev/null +++ b/docs/server/deployment/ansible/templates/nginx/koina.conf.j2 @@ -0,0 +1,63 @@ +upstream koina { + server 127.0.0.1:{{ koinahttp_docker_port }} fail_timeout=30s; + keepalive 32; +} + +server { + listen 80; + listen [::]:80; + server_name "{{ KOINA_SERVER_NAME }}"; + + # ACME Challenge Support + location /.well-known/acme-challenge/ { + root 
/var/www/certbot; + allow all; + limit_except GET { deny all; } + } + + # Prevent nginx HTTP Server Detection + server_tokens off; + + # Enforce HTTPS for everything else (a server-level return would run before location matching and hide the ACME location) + location / { return 301 https://$server_name$request_uri; } +} + +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name "{{ KOINA_SERVER_NAME }}"; + + ssl_certificate "{{ ssl_certificate_path }}"; + ssl_certificate_key "{{ ssl_certificate_key_path }}"; + + ssl_protocols TLSv1.2 TLSv1.3; + + add_header X-Content-Type-Options "nosniff" always; + add_header X-Frame-Options "DENY" always; + add_header X-XSS-Protection "1; mode=block" always; + + server_tokens off; + ignore_invalid_headers on; + client_max_body_size 64m; + proxy_buffering off; + proxy_request_buffering off; + + access_log /var/log/nginx/koina.access.log; + error_log /var/log/nginx/koina.error.log; + + location / { + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_http_version 1.1; + proxy_set_header Connection close; + + proxy_connect_timeout 30s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + + proxy_pass http://koina; + } +} diff --git a/docs/server/deployment/ansible/templates/nginx/koinarpc.conf.j2 b/docs/server/deployment/ansible/templates/nginx/koinarpc.conf.j2 new file mode 100644 index 00000000..0b97d05e --- /dev/null +++ b/docs/server/deployment/ansible/templates/nginx/koinarpc.conf.j2 @@ -0,0 +1,60 @@ +upstream koinarpc { + server 127.0.0.1:{{ koinarpc_docker_port }} fail_timeout=30s; + keepalive 32; +} + +server { + listen 80; + listen [::]:80; + server_name "{{ KOINA_RPC_SERVER_NAME }}"; + + # ACME Challenge Support + location /.well-known/acme-challenge/ { + root /var/www/certbot; + allow all; + limit_except GET { deny all; } + } + + # Prevent nginx HTTP Server Detection + server_tokens off; + + # Enforce HTTPS for everything else (a server-level return would run before location matching and hide the ACME location) + location / { return 301 https://$server_name$request_uri; } +} + +server { + listen 443 
ssl http2; + listen [::]:443 ssl http2; + server_name "{{ KOINA_RPC_SERVER_NAME }}"; + + ssl_certificate "{{ ssl_certificate_path }}"; + ssl_certificate_key "{{ ssl_certificate_key_path }}"; + + ssl_protocols TLSv1.2 TLSv1.3; + + add_header X-Content-Type-Options "nosniff" always; + add_header X-Frame-Options "DENY" always; + add_header X-XSS-Protection "1; mode=block" always; + + server_tokens off; + ignore_invalid_headers on; + client_max_body_size 64m; + proxy_buffering off; + proxy_request_buffering off; + + access_log /var/log/nginx/koinarpc.access.log; + error_log /var/log/nginx/koinarpc.error.log; + + location / { + grpc_connect_timeout 30s; + grpc_read_timeout 300s; + grpc_send_timeout 300s; + + grpc_set_header Host $host; + grpc_set_header X-Real-IP $remote_addr; + grpc_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + grpc_set_header X-Forwarded-Proto $scheme; + + grpc_pass grpc://koinarpc; + } +} From ebe495df9085618ae5ec80d236636e7e8a111fea Mon Sep 17 00:00:00 2001 From: Sanjay Srikakulam Date: Tue, 9 Dec 2025 09:08:37 +0100 Subject: [PATCH 2/2] move certain tasks to dedicated roles and update documentation --- docs/server/deployment/ansible/README.md | 1 + .../deployment/ansible/koina_server.yml | 78 +++---------------- .../ansible/roles/firewall/README.md | 20 +++++ .../ansible/roles/firewall/defaults/main.yml | 5 ++ .../ansible/roles/firewall/meta/main.yml | 45 +++++++++++ .../ansible/roles/firewall/tasks/main.yml | 28 +++++++ .../ansible/roles/nvidia-driver/README.md | 19 +++++ .../roles/nvidia-driver/defaults/main.yml | 1 + .../ansible/roles/nvidia-driver/meta/main.yml | 45 +++++++++++ .../roles/nvidia-driver/tasks/main.yml | 33 ++++++++ 10 files changed, 206 insertions(+), 69 deletions(-) create mode 100644 docs/server/deployment/ansible/roles/firewall/README.md create mode 100644 docs/server/deployment/ansible/roles/firewall/defaults/main.yml create mode 100644 docs/server/deployment/ansible/roles/firewall/meta/main.yml create mode 100644 
docs/server/deployment/ansible/roles/firewall/tasks/main.yml create mode 100644 docs/server/deployment/ansible/roles/nvidia-driver/README.md create mode 100644 docs/server/deployment/ansible/roles/nvidia-driver/defaults/main.yml create mode 100644 docs/server/deployment/ansible/roles/nvidia-driver/meta/main.yml create mode 100644 docs/server/deployment/ansible/roles/nvidia-driver/tasks/main.yml diff --git a/docs/server/deployment/ansible/README.md b/docs/server/deployment/ansible/README.md index 5d1b373a..f5038fb7 100644 --- a/docs/server/deployment/ansible/README.md +++ b/docs/server/deployment/ansible/README.md @@ -36,6 +36,7 @@ Adjust the inventory path, extra-vars, and become options for your environment. - Ensure to review and modify accordingly: - `koina_server.yml` for the overall orchestration and variable flow. - `roles/*/defaults/main.yml` to find complete variable names and defaults. +- Ensure that the `docker_user` variable in `koina_server.yml` is set to the user that should have permissions to run Docker commands (usually the default user on the server, e.g., `ubuntu`). 
## Nginx templates diff --git a/docs/server/deployment/ansible/koina_server.yml b/docs/server/deployment/ansible/koina_server.yml index 08bcb100..36b788a0 100644 --- a/docs/server/deployment/ansible/koina_server.yml +++ b/docs/server/deployment/ansible/koina_server.yml @@ -1,7 +1,7 @@ --- -# Playbook to deploy Minio on a VM +# Playbook to deploy the Koina server on a VM - hosts: koina_servers - become: yes + become: true vars_files: - secret_group_vars/all.yml vars: @@ -12,10 +12,11 @@ KOINA_RPC_SERVER_NAME: "rpc.yourdomain.com" # Change this to your domain ADMIN_EMAIL_ADDRESS: "admin@yourdomain.com" # Change this to your email address for Certbot/Let's Encrypt KOINA_CONTAINER_DIR: "/opt/koina" # Directory to store Koina Docker container data and the compose file + docker_user: "ubuntu" # Change this to the user that should have Docker permissions pre_tasks: - name: Update and upgrade apt packages ansible.builtin.apt: - update_cache: yes + update_cache: true upgrade: dist - name: Install dependencies @@ -27,77 +28,16 @@ - ufw - ubuntu-drivers-common state: present - update_cache: yes - - - name: Enable UFW - community.general.ufw: - state: enabled - - - name: Allow SSH (port 22) - community.general.ufw: - rule: allow - port: '22' - proto: tcp - - - name: Allow HTTP (port 80) - community.general.ufw: - rule: allow - port: '80' - proto: tcp - - - name: Allow HTTPS (port 443) - community.general.ufw: - rule: allow - port: '443' - proto: tcp - - - name: Allow all outgoing traffic - community.general.ufw: - default: allow - direction: outgoing - - - name: Deny all other incoming traffic - community.general.ufw: - default: deny - direction: incoming - - - name: Install Nvidia driver for GPU # This is not idempotent (need to re-work this). 
- ansible.builtin.command: ubuntu-drivers install --gpgpu - register: nvidia_install - changed_when: "'installed' in nvidia_install.stdout" - - - name: Detect installed Nvidia server driver versions - ansible.builtin.command: bash -c "dpkg -l | awk '/nvidia-compute-utils-[0-9]+-server/{print $2}' | sort -V | tail -n 1" - register: nvidia_driver_pkg - changed_when: false - - - name: Extract Nvidia driver version number - ansible.builtin.set_fact: - nvidia_driver_version: "{{ nvidia_driver_pkg.stdout | regex_search('[0-9]+') }}" - - - name: Install matching Nvidia server-utils package - ansible.builtin.apt: - name: "nvidia-utils-{{ nvidia_driver_version }}-server" - state: present - update_cache: yes - when: nvidia_driver_version is defined and nvidia_driver_version | length > 0 - - - name: Reboot if Nvidia driver was installed # This is not idempotent due to the driver install step above (so reboot always runs :( ). - ansible.builtin.reboot: - msg: "Rebooting after Nvidia driver installation" - pre_reboot_delay: 10 - when: "'installed' in nvidia_install.stdout" - - - name: Check if nvidia-smi works - ansible.builtin.command: nvidia-smi - register: nvidia_smi - failed_when: nvidia_smi.rc != 0 - changed_when: false + update_cache: true roles: - role: geerlingguy.docker vars: docker_users: - - 'ubuntu' + - "{{ docker_user }}" + + - role: firewall + + - role: nvidia-driver - role: nvidia-container-toolkit diff --git a/docs/server/deployment/ansible/roles/firewall/README.md b/docs/server/deployment/ansible/roles/firewall/README.md new file mode 100644 index 00000000..ed1ccc9a --- /dev/null +++ b/docs/server/deployment/ansible/roles/firewall/README.md @@ -0,0 +1,20 @@ +# Ansible role: firewall + +Role to install and configure the required firewall on target hosts (Ubuntu only). 
+
+## Features
+- Configures UFW with predefined rules
+- Denies all incoming connections by default
+- Allows all outgoing connections by default
+- Opens the TCP ports listed in `allowed_ports` (default: 22/SSH, 80/HTTP, 443/HTTPS)
+
+## Requirements
+- Sudo/root privileges on target hosts
+
+## Example playbook
+```yaml
+- hosts: all
+  become: true
+  roles:
+    - firewall
+```
diff --git a/docs/server/deployment/ansible/roles/firewall/defaults/main.yml b/docs/server/deployment/ansible/roles/firewall/defaults/main.yml
new file mode 100644
index 00000000..33633859
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/firewall/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+allowed_ports:
+  - 22
+  - 80
+  - 443
diff --git a/docs/server/deployment/ansible/roles/firewall/meta/main.yml b/docs/server/deployment/ansible/roles/firewall/meta/main.yml
new file mode 100644
index 00000000..e8b76d1e
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/firewall/meta/main.yml
@@ -0,0 +1,45 @@
+galaxy_info:
+  author: Sanjay Srikakulam
+  description: Ansible role to configure the required firewall on target hosts.
+  company: Forschungszentrum Jülich
+
+  # If the issue tracker for your role is not on github, uncomment the
+  # next line and provide a value
+  # issue_tracker_url: http://example.com/issue/tracker
+
+  # Choose a valid license ID from https://spdx.org - some suggested licenses:
+  # - BSD-3-Clause (default)
+  # - MIT
+  # - GPL-2.0-or-later
+  # - GPL-3.0-only
+  # - Apache-2.0
+  # - CC-BY-4.0
+  license: MIT
+
+  min_ansible_version: "2.10"
+
+  # If this a Container Enabled role, provide the minimum Ansible Container version.
+  # min_ansible_container_version:
+  #
+  # Provide a list of supported platforms, and for each platform a list of versions.
+  # If you don't wish to enumerate all versions for a particular platform, use 'all'.
+  # To view available platforms and versions (or releases), visit:
+  # https://galaxy.ansible.com/api/v1/platforms/
+  #
+  platforms:
+    - name: Ubuntu
+      versions:
+        - all
+
+  galaxy_tags: []
+    # List tags for your role here, one per line. A tag is a keyword that describes
+    # and categorizes the role. Users find roles by searching for tags. Be sure to
+    # remove the '[]' above, if you add tags to this list.
+    #
+    # NOTE: A tag is limited to a single word comprised of alphanumeric characters.
+    # Maximum 20 tags per role.
+
+dependencies: []
+  # List your role dependencies here, one per line. Be sure to remove the '[]' above,
+  # if you add dependencies to this list.
diff --git a/docs/server/deployment/ansible/roles/firewall/tasks/main.yml b/docs/server/deployment/ansible/roles/firewall/tasks/main.yml
new file mode 100644
index 00000000..cde7e51a
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/firewall/tasks/main.yml
@@ -0,0 +1,32 @@
+---
+# Configure UFW: allow the ports in `allowed_ports`, set the default
+# policies, and only then enable the firewall.
+- name: Install required packages for firewall
+  ansible.builtin.apt:
+    name:
+      - ufw
+    state: present
+    update_cache: true
+
+# Rules are added before UFW is enabled so that SSH (port 22 in the default
+# `allowed_ports`) is already permitted when the firewall becomes active.
+- name: Allow required incoming ports
+  community.general.ufw:
+    rule: allow
+    port: '{{ item }}'
+    proto: tcp
+  loop: "{{ allowed_ports }}"
+
+- name: Allow all outgoing traffic
+  community.general.ufw:
+    default: allow
+    direction: outgoing
+
+- name: Deny all other incoming traffic
+  community.general.ufw:
+    default: deny
+    direction: incoming
+
+- name: Enable UFW
+  community.general.ufw:
+    state: enabled
diff --git a/docs/server/deployment/ansible/roles/nvidia-driver/README.md b/docs/server/deployment/ansible/roles/nvidia-driver/README.md
new file mode 100644
index 00000000..7b4a6bb6
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/nvidia-driver/README.md
@@ -0,0 +1,19 @@
+# Ansible role: nvidia-driver
+
+Role to install the Nvidia driver on target hosts (Ubuntu only).
+
+## Features
+- Installs the latest Nvidia GPU driver and utilities
+- Reboots the system if a new driver is installed
+- Verifies the installation with `nvidia-smi`
+
+## Requirements
+- Sudo/root privileges on target hosts
+
+## Example playbook
+```yaml
+- hosts: all
+  become: true
+  roles:
+    - nvidia-driver
+```
diff --git a/docs/server/deployment/ansible/roles/nvidia-driver/defaults/main.yml b/docs/server/deployment/ansible/roles/nvidia-driver/defaults/main.yml
new file mode 100644
index 00000000..ed97d539
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/nvidia-driver/defaults/main.yml
@@ -0,0 +1 @@
+---
diff --git a/docs/server/deployment/ansible/roles/nvidia-driver/meta/main.yml b/docs/server/deployment/ansible/roles/nvidia-driver/meta/main.yml
new file mode 100644
index 00000000..9bbe9ab8
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/nvidia-driver/meta/main.yml
@@ -0,0 +1,45 @@
+galaxy_info:
+  author: Sanjay Srikakulam
+  description: Ansible role to install Nvidia driver on target hosts.
+  company: Forschungszentrum Jülich
+
+  # If the issue tracker for your role is not on github, uncomment the
+  # next line and provide a value
+  # issue_tracker_url: http://example.com/issue/tracker
+
+  # Choose a valid license ID from https://spdx.org - some suggested licenses:
+  # - BSD-3-Clause (default)
+  # - MIT
+  # - GPL-2.0-or-later
+  # - GPL-3.0-only
+  # - Apache-2.0
+  # - CC-BY-4.0
+  license: MIT
+
+  min_ansible_version: "2.10"
+
+  # If this a Container Enabled role, provide the minimum Ansible Container version.
+  # min_ansible_container_version:
+  #
+  # Provide a list of supported platforms, and for each platform a list of versions.
+  # If you don't wish to enumerate all versions for a particular platform, use 'all'.
+  # To view available platforms and versions (or releases), visit:
+  # https://galaxy.ansible.com/api/v1/platforms/
+  #
+  platforms:
+    - name: Ubuntu
+      versions:
+        - all
+
+  galaxy_tags: []
+    # List tags for your role here, one per line. A tag is a keyword that describes
+    # and categorizes the role. Users find roles by searching for tags. Be sure to
+    # remove the '[]' above, if you add tags to this list.
+    #
+    # NOTE: A tag is limited to a single word comprised of alphanumeric characters.
+    # Maximum 20 tags per role.
+
+dependencies: []
+  # List your role dependencies here, one per line. Be sure to remove the '[]' above,
+  # if you add dependencies to this list.
diff --git a/docs/server/deployment/ansible/roles/nvidia-driver/tasks/main.yml b/docs/server/deployment/ansible/roles/nvidia-driver/tasks/main.yml
new file mode 100644
index 00000000..843f0170
--- /dev/null
+++ b/docs/server/deployment/ansible/roles/nvidia-driver/tasks/main.yml
@@ -0,0 +1,48 @@
+---
+# Install the recommended Nvidia GPGPU driver plus the matching nvidia-utils
+# package, reboot when a driver was newly installed, then run nvidia-smi.
+- name: Install Nvidia driver for GPU
+  ansible.builtin.command: ubuntu-drivers install --gpgpu
+  register: nvidia_install
+  # NOTE(review): assumes ubuntu-drivers prints "... already installed" when
+  # there is nothing to do. That text also contains "installed", so matching
+  # on "installed" alone reported a change (and rebooted) on every run.
+  # Confirm the wording for the ubuntu-drivers-common version in use.
+  changed_when: >-
+    'installed' in nvidia_install.stdout
+    and 'already installed' not in nvidia_install.stdout
+
+# shell (not command) because the pipeline needs a shell; pipefail makes a
+# failing dpkg fail the task instead of yielding an empty version.
+- name: Detect installed Nvidia server driver versions
+  ansible.builtin.shell:
+    cmd: >-
+      set -o pipefail;
+      dpkg -l | awk '/nvidia-compute-utils-[0-9]+-server/{print $2}' | sort -V | tail -n 1
+    executable: /bin/bash
+  register: nvidia_driver_pkg
+  changed_when: false
+
+- name: Extract Nvidia driver version number
+  ansible.builtin.set_fact:
+    # Falls back to an empty string when no server driver package was found.
+    nvidia_driver_version: "{{ nvidia_driver_pkg.stdout | regex_search('[0-9]+') | default('', true) }}"
+
+- name: Install matching Nvidia server-utils package
+  ansible.builtin.apt:
+    name: "nvidia-utils-{{ nvidia_driver_version }}-server"
+    state: present
+    update_cache: true
+  when: nvidia_driver_version is defined and nvidia_driver_version | length > 0
+
+- name: Reboot if Nvidia driver was installed
+  ansible.builtin.reboot:
+    msg: "Rebooting after Nvidia driver installation"
+    pre_reboot_delay: 10
+  when: nvidia_install is changed
+
+- name: Check if nvidia-smi works
+  ansible.builtin.command: nvidia-smi
+  register: nvidia_smi
+  failed_when: nvidia_smi.rc != 0
+  changed_when: false