precimed · espenhgn · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/.github/workflows/test-downloads.yml b/.github/workflows/test-downloads.yml
@@ -0,0 +1,57 @@
+# CI workflow that exercises the Snakemake binary-download pipeline.
+name: Test Binary Downloads
+
+# Triggers: pushes and pull requests targeting main, plus manual dispatch.
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+# Limit the workflow token to read-only access to repository contents.
+permissions:
+  contents: read
+
+jobs:
+  # Single job: install Snakemake, validate the workflow definition, download
+  # the binaries, verify them and publish bin/ as a build artifact.
+  test-downloads:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Snakemake
+        run: pip install snakemake
+
+      # Listing the rules forces Snakemake to parse the Snakefile.
+      - name: Verify Snakefile syntax
+        run: snakemake --list-rules
+
+      - name: Dry run workflow
+        run: snakemake --cores 1 --dry-run
+
+      - name: Download and unpack binaries
+        run: make download
+
+      - name: Verify downloaded binaries
+        run: ./verify_binaries.sh
+
+      # Version output is informational only: `|| true` keeps a non-zero
+      # exit status from failing the job.
+      - name: Check binary versions
+        run: |
+          ./bin/plink1 --version || true
+          ./bin/plink2 --version || true
+          ./bin/regenie --version || true
+
+      - name: Upload binaries as artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: bioinformatics-binaries
+          path: bin/
+          retention-days: 7
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,13 @@ __pycache__/
 *.py[codz]
 *$py.class
 
+# Downloaded binaries and archives
+bin/
+downloads/
+
+# Snakemake
+.snakemake/
+
 # C extensions
 *.so
 

diff --git a/ADDING_BINARIES.md b/ADDING_BINARIES.md
@@ -0,0 +1,64 @@
+# Adding New Binaries
+
+This document explains how to add new bioinformatics binaries to the workflow.
+
+## Steps to Add a New Binary
+
+1. **Find the download URL**: Locate the direct download URL for the Linux x86_64 version of the binary (usually a .zip file)
+
+2. **Edit config.yaml**: Add a new entry under `binaries:` following the existing pattern
+
+3. **Run the workflow**: Execute `make` or `snakemake --cores 1` to download the new binary
+
+## Example: Adding bcftools
+
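+To add bcftools to the workflow, edit `config.yaml`. The URL below is illustrative: it points to a `.tar.bz2` archive, which the workflow cannot unpack until the `unpack_binary` rule is extended (see "Supported Archive Formats" below):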
+
+```yaml
+binaries:
+  # ... existing entries ...
+
+  bcftools:
+    version: "1.19"
+    url: "https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19-linux-x86_64.tar.bz2"
+    executable: "bcftools"
+    description: "BCFtools - utilities for variant calling and manipulating VCFs and BCFs"
+```
+
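+Then run (the Makefile only defines convenience targets for `plink1`, `plink2` and `regenie`, so call Snakemake directly or add a `bcftools` target to the Makefile first):
+```bash
+snakemake bin/bcftools --cores 1
+```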
+
+Or to download all binaries including the new one:
+```bash
+make
+```
+
+## Configuration Fields
+
+- `version`: Version number of the binary (for documentation purposes)
+- `url`: Direct download URL for the binary archive
+- `executable`: Name of the executable file inside the archive (before renaming)
+- `description`: Brief description of the tool (optional, for documentation)
+
+## Supported Archive Formats
+
+Currently, the workflow supports:
+- `.zip` files (using unzip)
+
+To add support for other formats (e.g., `.tar.gz`, `.tar.bz2`), modify the `unpack_binary` rule in the Snakefile.
+
+## Testing
+
+After adding a new binary, test it:
+
+1. **Dry run**: `snakemake bin/BINARY_NAME --cores 1 --dry-run`
+2. **Download**: `snakemake bin/BINARY_NAME --cores 1`
+3. **Verify**: `./verify_binaries.sh` (after updating the script to check the new binary)
+
+## Troubleshooting
+
+- **Wrong executable name**: If the unpacked executable has a different name than expected, adjust the `executable` field in `config.yaml`
+- **Archive format not supported**: Modify the `unpack_binary` rule in `Snakefile` to handle the new format
+- **Download fails**: Verify the URL is correct and accessible
diff --git a/Makefile b/Makefile
@@ -0,0 +1,48 @@
+# Makefile for managing bioinformatics binaries
+
+# Every target in this Makefile is a command rather than a file to build, so
+# all of them are declared phony; a stray file named e.g. `plink1` or `list`
+# in the repository root must not make the target appear up to date.
+.PHONY: all download plink1 plink2 regenie clean dry-run list help
+
+# Default goal: fetch and unpack every configured binary.
+all: download
+
+# Run the complete Snakemake workflow on a single core.
+download:
+	@echo "Downloading and unpacking binaries..."
+	snakemake --cores 1
+
+# Per-binary targets: `make <name>` asks Snakemake for bin/<name> only.
+# $@ expands to the name of the target being built.
+plink1 plink2 regenie:
+	snakemake bin/$@ --cores 1
+
+# Clean downloaded files and binaries.
+# Removes the unpacked binaries (bin/), the downloaded archives (downloads/)
+# and the .snakemake/ directory.
+clean:
+	@echo "Cleaning downloaded files..."
+	rm -rf bin/ downloads/
+	rm -rf .snakemake/
+
+# Show what would be downloaded (Snakemake dry run on one core)
+dry-run:
+	snakemake --cores 1 --dry-run
+
+# List all rules known to Snakemake for this workflow
+list:
+	snakemake --list-rules
+
+# Show help: print a one-line summary of each available target.
+# Keep this list in sync when targets are added to or removed from this file.
+help:
+	@echo "Available targets:"
+	@echo "  all       - Download and unpack all binaries (default)"
+	@echo "  download  - Download and unpack all binaries"
+	@echo "  plink1    - Download and unpack PLINK1 only"
+	@echo "  plink2    - Download and unpack PLINK2 only"
+	@echo "  regenie   - Download and unpack regenie only"
+	@echo "  clean     - Remove downloaded files and binaries"
+	@echo "  dry-run   - Show what would be downloaded without actually downloading"
+	@echo "  list      - List all available rules"
+	@echo "  help      - Show this help message"
diff --git a/README.md b/README.md
@@ -1,2 +1,161 @@
 # tsd_software
 Scripts to fetch specific binaries to be synced to TSD's s3-api dir
+
+## Overview
+
+This repository contains Snakemake workflows to download and unpack bioinformatics binaries including:
+- **PLINK1** - Whole genome association analysis toolset (version 20250819)
+- **PLINK2** - Next generation of PLINK (version 20251019)
+- **regenie** - Fast whole genome regression modelling (version 4.1)
+
+## Requirements
+
+- Python 3.11+ recommended (the version used in CI; current Snakemake releases do not support Python 3.6)
+- Snakemake
+- curl (for downloading)
+- unzip (for unpacking archives)
+
+## Installation
+
+Install Snakemake using pip in the current Python environment:
+```bash
+pip install snakemake
+```
+
+Or using conda/mamba:
+```bash
+mamba install -c conda-forge -c bioconda snakemake
+```
+
+Or using the provided conda environment file:
+```bash
+conda env create -f environment.yaml
+conda activate tsd_software
+```
+
+## Usage
+
+### Using Make (recommended)
+
+The easiest way to use this workflow is via the provided Makefile:
+
+```bash
+# Download and unpack all binaries
+make
+
+# Download specific binaries
+make plink1
+make plink2
+make regenie
+
+# Show what would be downloaded without downloading
+make dry-run
+
+# Clean all downloaded files
+make clean
+
+# Show available commands
+make help
+```
+
+### Using Snakemake directly
+
+Run the workflow to download and unpack all binaries:
+```bash
+snakemake --cores 1
+```
+
+Download specific binaries:
+```bash
+snakemake bin/plink1 --cores 1
+snakemake bin/plink2 --cores 1
+snakemake bin/regenie --cores 1
+```
+
+Perform a dry run to see what would be downloaded:
+```bash
+snakemake --cores 1 --dry-run
+```
+
+Clean downloaded files:
+```bash
+rm -rf bin/ downloads/
+```
+
+### Verifying Downloads
+
+After downloading, verify the binaries are correctly installed:
+```bash
+./verify_binaries.sh
+```
+
+This will check that all binaries exist, are executable, and display version information.
+
+## Configuration
+
+Binary versions and download URLs are configured in `config.yaml`. To update a binary version:
+
+1. Edit `config.yaml`
+2. Update the `version` and `url` fields for the desired binary
+3. Run `make clean` to remove old binaries
+4. Run `make` to download the new version
+
+Example configuration entry (illustrative; the version and URL shown here may differ from the current `config.yaml`):
+```yaml
+binaries:
+  plink1:
+    version: "20231211"
+    url: "https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231211.zip"
+    executable: "plink"
+    description: "PLINK 1.9 - Whole genome association analysis toolset"
+```
+
+## Output
+
+Binaries are downloaded to the `bin/` directory in the repository root:
+- `bin/plink1` - PLINK 1.9 executable
+- `bin/plink2` - PLINK 2.0 executable
+- `bin/regenie` - regenie executable
+
+Downloaded archives are stored in the `downloads/` directory (can be safely deleted after unpacking).
+
+## Workflow Structure
+
+The Snakemake workflow consists of:
+
+1. **download_binary** - Downloads binary archives from configured URLs
+2. **unpack_binary** - Unpacks archives and renames executables as needed
+3. **all** - Default rule that downloads and unpacks all binaries
+
+The workflow automatically:
+- Creates necessary directories (`bin/`, `downloads/`)
+- Downloads archives only if not already present
+- Unpacks binaries only if not already present
+- Makes binaries executable
+- Handles renaming of executables to standard names
+
+## Troubleshooting
+
+### Download failures
+If downloads fail, check:
+- Internet connectivity
+- The URLs in `config.yaml` are still valid
+- You have write permissions in the repository directory
+
+### Binary not executable
+If a binary exists but is not executable, run:
+```bash
+chmod +x bin/plink1 bin/plink2 bin/regenie
+```
+
+### Re-downloading binaries
+To force re-download of binaries:
+```bash
+make clean
+make
+```
+
+## License
+
+See LICENSE file for details.
+
diff --git a/Snakefile b/Snakefile
@@ -0,0 +1,44 @@
+"""
+Snakemake workflow to download and unpack bioinformatics binaries.
+Downloads PLINK1, PLINK2, and regenie to the bin/ directory.
+"""
+
+# Load the workflow settings from config.yaml into the `config` mapping.
+configfile: "config.yaml"
+
+# Names of all configured binaries, i.e. the keys of the `binaries` mapping
+BINARIES = list(config["binaries"])
+
+# Default target rule: requests bin/<name> for every configured binary, which
+# in turn requires the unpack and download rules for each of them.
+rule all:
+    input:
+        [f"bin/{name}" for name in BINARIES]
+
+# Rule to download a binary archive.
+# Fetches the URL configured for <binary> in config.yaml and stores it as
+# downloads/<binary>.zip.
+rule download_binary:
+    output:
+        "downloads/{binary}.zip"
+    params:
+        # Download URL looked up from the `binaries` section of config.yaml
+        url = lambda wildcards: config["binaries"][wildcards.binary]["url"]
+    shell:
+        """
+        mkdir -p downloads
+        # --fail: exit non-zero on HTTP errors (e.g. 404) instead of saving the
+        # server's error page as the archive; --retry: ride out transient
+        # network failures. Quoting protects URLs containing shell
+        # metacharacters such as '&' or '?'.
+        curl --fail --location --retry 3 --output "{output}" "{params.url}"
+        """
+
+# Rule to unpack a binary archive.
+# Extracts downloads/<binary>.zip into a scratch directory and installs only
+# the configured executable as bin/<binary>. Other archive members (licences,
+# sample data, helper tools) therefore never land in bin/, and archives that
+# share member names cannot overwrite each other's files.
+rule unpack_binary:
+    input:
+        "downloads/{binary}.zip"
+    output:
+        "bin/{binary}"
+    params:
+        # Name of the executable inside the archive, taken from config.yaml
+        executable = lambda wildcards: config["binaries"][wildcards.binary]["executable"]
+    shell:
+        """
+        mkdir -p bin
+        # Scratch directory; the trap removes it when the shell exits,
+        # whether or not unpacking succeeded.
+        tmpdir=$(mktemp -d)
+        trap 'rm -rf "$tmpdir"' EXIT
+        unzip -o "{input}" -d "$tmpdir"
+        # Install under the standard name regardless of the name in the archive
+        mv "$tmpdir/{params.executable}" "{output}"
+        chmod +x "{output}"
+        """