diff --git a/.github/workflows/test-downloads.yml b/.github/workflows/test-downloads.yml new file mode 100644 index 0000000..1d3622e --- /dev/null +++ b/.github/workflows/test-downloads.yml @@ -0,0 +1,57 @@ +name: Test Binary Downloads + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +permissions: + contents: read + +jobs: + test-downloads: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Snakemake + run: | + pip install snakemake + + - name: Verify Snakefile syntax + run: | + snakemake --list-rules + + - name: Dry run workflow + run: | + snakemake --cores 1 --dry-run + + - name: Download and unpack binaries + run: | + make download + + - name: Verify downloaded binaries + run: | + ./verify_binaries.sh + + - name: Check binary versions + run: | + ./bin/plink1 --version || true + ./bin/plink2 --version || true + ./bin/regenie --version || true + + - name: Upload binaries as artifacts + uses: actions/upload-artifact@v4 + with: + name: bioinformatics-binaries + path: bin/ + retention-days: 7 diff --git a/.gitignore b/.gitignore index b7faf40..628d30f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,13 @@ __pycache__/ *.py[codz] *$py.class +# Downloaded binaries and archives +bin/ +downloads/ + +# Snakemake +.snakemake/ + # C extensions *.so diff --git a/ADDING_BINARIES.md b/ADDING_BINARIES.md new file mode 100644 index 0000000..dfa66e5 --- /dev/null +++ b/ADDING_BINARIES.md @@ -0,0 +1,64 @@ +# Adding New Binaries + +This document explains how to add new bioinformatics binaries to the workflow. + +## Steps to Add a New Binary + +1. **Find the download URL**: Locate the direct download URL for the Linux x86_64 version of the binary (usually a .zip file) + +2. **Edit config.yaml**: Add a new entry under `binaries:` following the existing pattern + +3. 
**Run the workflow**: Execute `make` or `snakemake --cores 1` to download the new binary + +## Example: Adding bcftools + +To add bcftools to the workflow, edit `config.yaml` (the URL below is illustrative; because it points to a `.tar.bz2` archive, the Snakefile must first be extended as described under "Supported Archive Formats"): + +```yaml +binaries: + # ... existing entries ... + + bcftools: + version: "1.19" + url: "https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19-linux-x86_64.tar.bz2" + executable: "bcftools" + description: "BCFtools - utilities for variant calling and manipulating VCFs and BCFs" +``` + +Then run Snakemake for the new target (the Makefile only defines per-tool targets for plink1, plink2 and regenie, so `make bcftools` works only after you add a matching target): +```bash +snakemake bin/bcftools --cores 1 +``` + +Or to download all binaries including the new one: +```bash +make +``` + +## Configuration Fields + +- `version`: Version number of the binary (for documentation purposes) +- `url`: Direct download URL for the binary archive +- `executable`: Name of the executable file inside the archive (before renaming) +- `description`: Brief description of the tool (optional, for documentation) + +## Supported Archive Formats + +Currently, the workflow supports: +- `.zip` files (using unzip) + +To add support for other formats (e.g., `.tar.gz`, `.tar.bz2`), modify the `download_binary` and `unpack_binary` rules in the Snakefile; both currently hard-code the `.zip` extension. + +## Testing + +After adding a new binary, test it: + +1. **Dry run**: `snakemake bin/BINARY_NAME --cores 1 --dry-run` +2. **Download**: `snakemake bin/BINARY_NAME --cores 1` +3. 
**Verify**: `./verify_binaries.sh` (after updating the script to check the new binary)
+
+## Troubleshooting
+
+- **Wrong executable name**: If the unpacked executable has a different name than expected, adjust the `executable` field in `config.yaml`
+- **Archive format not supported**: Modify the `unpack_binary` rule in `Snakefile` to handle the new format
+- **Download fails**: Verify the URL is correct and accessible
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..379a704
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,49 @@
+# Makefile for managing bioinformatics binaries
+
+# Every target below runs commands and never creates a file of its own name, so all of them are phony.
+.PHONY: all download plink1 plink2 regenie clean dry-run list help
+
+# Default target
+all: download
+
+# Download and unpack all binaries
+download:
+	@echo "Downloading and unpacking binaries..."
+	snakemake --cores 1
+
+# Download specific binaries
+plink1:
+	snakemake bin/plink1 --cores 1
+
+plink2:
+	snakemake bin/plink2 --cores 1
+
+regenie:
+	snakemake bin/regenie --cores 1
+
+# Clean downloaded files and binaries
+clean:
+	@echo "Cleaning downloaded files..."
+	rm -rf bin/ downloads/
+	rm -rf .snakemake/
+
+# Show what would be downloaded
+dry-run:
+	snakemake --cores 1 --dry-run
+
+# List all rules
+list:
+	snakemake --list-rules
+
+# Show help
+help:
+	@echo "Available targets:"
+	@echo " all - Download and unpack all binaries (default)"
+	@echo " download - Download and unpack all binaries"
+	@echo " plink1 - Download and unpack PLINK1 only"
+	@echo " plink2 - Download and unpack PLINK2 only"
+	@echo " regenie - Download and unpack regenie only"
+	@echo " clean - Remove downloaded files and binaries"
+	@echo " dry-run - Show what would be downloaded without actually downloading"
+	@echo " list - List all available rules"
+	@echo " help - Show this help message"
diff --git a/README.md b/README.md
index 9c1f894..f47debb 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,161 @@
 # tsd_software
 Scripts to fetch specific binaries to be synced to TSD's s3-api dir
+
+## Overview
+
+This repository contains Snakemake workflows to download and unpack bioinformatics binaries including:
+- **PLINK1** - Whole genome association analysis toolset (version 20250819)
+- **PLINK2** - Next generation of PLINK (version 20251019)
+- **regenie** - Fast whole genome regression modelling (version 4.1)
+
+## Requirements
+
+- Python 3.8+ (the minimum pinned in `environment.yaml`)
+- Snakemake
+- curl (for downloading)
+- unzip (for unpacking archives)
+
+## Installation
+
+Install Snakemake using pip in the current Python environment:
+```bash
+pip install snakemake
+```
+
+Or using conda/mamba:
+```bash
+mamba install -c conda-forge -c bioconda snakemake
+```
+
+Or using the provided conda environment file:
+```bash
+conda env create -f environment.yaml
+conda activate tsd_software
+```
+
+## Usage
+
+### Using Make (recommended)
+
+The easiest way to use this workflow is via the provided Makefile:
+
+```bash
+# Download and unpack all binaries
+make
+
+# Download specific binaries
+make plink1
+make plink2
+make regenie
+
+# Show what would be downloaded without downloading
+make 
dry-run + +# Clean all downloaded files +make clean + +# Show available commands +make help +``` + +### Using Snakemake directly + +Run the workflow to download and unpack all binaries: +```bash +snakemake --cores 1 +``` + +Download specific binaries: +```bash +snakemake bin/plink1 --cores 1 +snakemake bin/plink2 --cores 1 +snakemake bin/regenie --cores 1 +``` + +Perform a dry run to see what would be downloaded: +```bash +snakemake --cores 1 --dry-run +``` + +Clean downloaded files: +```bash +rm -rf bin/ downloads/ +``` + +### Verifying Downloads + +After downloading, verify the binaries are correctly installed: +```bash +./verify_binaries.sh +``` + +This will check that all binaries exist, are executable, and display version information. + +## Configuration + +Binary versions and download URLs are configured in `config.yaml`. To update a binary version: + +1. Edit `config.yaml` +2. Update the `version` and `url` fields for the desired binary +3. Run `make clean` to remove old binaries +4. Run `make` to download the new version + +Example configuration entry: +```yaml +binaries: + plink1: + version: "20231211" + url: "https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231211.zip" + executable: "plink" + description: "PLINK 1.9 - Whole genome association analysis toolset" +``` + +## Output + +Binaries are downloaded to the `bin/` directory in the repository root: +- `bin/plink1` - PLINK 1.9 executable +- `bin/plink2` - PLINK 2.0 executable +- `bin/regenie` - regenie executable + +Downloaded archives are stored in the `downloads/` directory (can be safely deleted after unpacking). + +## Workflow Structure + +The Snakemake workflow consists of: + +1. **download_binary** - Downloads binary archives from configured URLs +2. **unpack_binary** - Unpacks archives and renames executables as needed +3. 
**all** - Default rule that downloads and unpacks all binaries + +The workflow automatically: +- Creates necessary directories (`bin/`, `downloads/`) +- Downloads archives only if not already present +- Unpacks binaries only if not already present +- Makes binaries executable +- Handles renaming of executables to standard names + +## Troubleshooting + +### Download failures +If downloads fail, check: +- Internet connectivity +- The URLs in `config.yaml` are still valid +- You have write permissions in the repository directory + +### Binary not executable +If a binary exists but is not executable, run: +```bash +chmod +x bin/plink1 bin/plink2 bin/regenie +``` + +### Re-downloading binaries +To force re-download of binaries: +```bash +make clean +make +``` + +## License + +See LICENSE file for details. + diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..f6975bd --- /dev/null +++ b/Snakefile @@ -0,0 +1,44 @@ +""" +Snakemake workflow to download and unpack bioinformatics binaries. +Downloads PLINK1, PLINK2, and regenie to the bin/ directory. 
+"""
+
+configfile: "config.yaml"
+
+# Names of all configured binaries (the keys under `binaries:` in config.yaml)
+BINARIES = list(config["binaries"].keys())
+
+# Default rule - request bin/<name> for every configured binary
+rule all:
+    input:
+        expand("bin/{binary}", binary=BINARIES)
+
+# Download the archive for one binary; --fail makes curl exit non-zero on an HTTP error instead of saving the error page as the .zip
+rule download_binary:
+    output:
+        "downloads/{binary}.zip"
+    params:
+        url = lambda wildcards: config["binaries"][wildcards.binary]["url"]
+    shell:
+        """
+        mkdir -p downloads
+        curl --fail --location --output {output:q} {params.url:q}
+        """
+
+# Unpack one archive into bin/ and rename the configured executable to bin/<name> when the two names differ
+rule unpack_binary:
+    input:
+        "downloads/{binary}.zip"
+    output:
+        "bin/{binary}"
+    params:
+        executable = lambda wildcards: config["binaries"][wildcards.binary]["executable"]
+    shell:
+        """
+        mkdir -p bin
+        unzip -o {input:q} -d bin/
+        if [ "{params.executable}" != "{wildcards.binary}" ]; then
+            mv bin/{params.executable:q} {output:q}
+        fi
+        chmod +x {output:q}
+        """
diff --git a/WORKFLOW_DIAGRAM.md b/WORKFLOW_DIAGRAM.md
new file mode 100644
index 0000000..97dcbd1
--- /dev/null
+++ b/WORKFLOW_DIAGRAM.md
@@ -0,0 +1,173 @@
+# Workflow Diagram
+
+## Repository Structure
+
+```
+tsd_software/
+├── .github/
+│   └── workflows/
+│       └── test-downloads.yml    # CI/CD workflow for automated testing
+├── bin/                          # Output: Downloaded binaries (gitignored)
+│   ├── plink1                    # PLINK 1.9 executable
+│   ├── plink2                    # PLINK 2.0 executable
+│   └── regenie                   # regenie executable
+├── downloads/                    # Temporary: Downloaded archives (gitignored)
+│   ├── plink1.zip
+│   ├── plink2.zip
+│   └── regenie.zip
+├── Snakefile                     # Main workflow definition
+├── config.yaml                   # Binary configuration (URLs, versions)
+├── environment.yaml              # Conda environment file
+├── Makefile                      # Convenience interface
+├── verify_binaries.sh            # Binary verification script
+├── example_usage.sh              # Usage demonstration
+├── test_workflow.sh              # Comprehensive test suite
+├── README.md                     # Main documentation
+├── ADDING_BINARIES.md            # Guide for adding new binaries
+├── .gitignore                    # Excludes bin/, downloads/, .snakemake/
+└── LICENSE                       # License file
+``` + +## Workflow Flow + +``` +User Command + ↓ +make / snakemake + ↓ + ├─→ Read config.yaml (binary definitions) + │ + ├─→ download_binary (for each binary) + │ └─→ curl → downloads/{binary}.zip + │ + ├─→ unpack_binary (for each binary) + │ └─→ unzip → bin/{binary} + │ └─→ chmod +x + │ + └─→ all (final rule) + └─→ bin/plink1, bin/plink2, bin/regenie ✓ +``` + +## Data Flow + +``` +config.yaml + ↓ (defines) +Binary URLs & Versions + ↓ (downloads) +downloads/*.zip + ↓ (unpacks) +bin/plink1, bin/plink2, bin/regenie + ↓ (verified by) +verify_binaries.sh +``` + +## Snakemake Rules + +``` +┌────────────────────────────────────────┐ +│ rule: all (default) │ +│ input: bin/plink1, bin/plink2, │ +│ bin/regenie │ +└───────────────┬────────────────────────┘ + │ + ┌───────┴───────┬────────────┐ + │ │ │ + ▼ ▼ ▼ +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ unpack_binary│ │ unpack_binary│ │ unpack_binary│ +│ plink1 │ │ plink2 │ │ regenie │ +└──────┬───────┘ └──────┬───────┘ └──────┬───────┘ + │ │ │ + ▼ ▼ ▼ +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│download_ │ │download_ │ │download_ │ +│binary plink1 │ │binary plink2 │ │binary regenie│ +└──────────────┘ └──────────────┘ └──────────────┘ +``` + +## User Interaction Paths + +### Path 1: Download All Binaries +```bash +make # User command + → snakemake --cores 1 # Executes workflow + → Downloads all binaries + → Unpacks to bin/ +``` + +### Path 2: Download Specific Binary +```bash +make plink1 # User command + → snakemake bin/plink1 --cores 1 # Executes workflow + → Downloads plink1.zip + → Unpacks to bin/plink1 +``` + +### Path 3: Dry Run (Validation) +```bash +make dry-run # User command + → snakemake --dry-run # Shows what would be done + → No actual downloads + → Reports planned actions +``` + +### Path 4: Verification +```bash +./verify_binaries.sh # User command + → Checks bin/ directory + → Verifies executability + → Shows version info +``` + +## Configuration Extension + +To add a new binary (e.g., 
samtools): + +1. Edit `config.yaml` (the URL below is illustrative; a `.tar.bz2` archive also requires extending the `.zip`-only `download_binary` and `unpack_binary` rules in the Snakefile): +```yaml +binaries: + # ... existing entries ... + samtools: + version: "1.19" + url: "https://github.com/samtools/samtools/releases/download/1.19/samtools-1.19-linux-x86_64.tar.bz2" + executable: "samtools" + description: "SAM/BAM file manipulation" +``` + +2. Run Snakemake for the new target (the Makefile has no `samtools` target unless you add one): +```bash +snakemake bin/samtools --cores 1 +``` + +3. Binary appears at `bin/samtools` + +## CI/CD Pipeline + +``` +GitHub Event (push/PR) + ↓ +.github/workflows/test-downloads.yml + ↓ + ├─→ Setup Python & Snakemake + ├─→ Verify Snakefile syntax + ├─→ Run dry-run + ├─→ Download all binaries + ├─→ Verify binaries + └─→ Upload as artifacts ✓ +``` + +## Testing Hierarchy + +``` +test_workflow.sh + ├─→ Test 1: File existence + ├─→ Test 2: Snakefile syntax + ├─→ Test 3: config.yaml validity + ├─→ Test 4: Binary configuration + ├─→ Test 5: Workflow dry-run + ├─→ Test 6: Makefile targets + └─→ Test 7: Script permissions +``` + +All tests must pass for the workflow to be considered functional. diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..6b88abd --- /dev/null +++ b/config.yaml @@ -0,0 +1,21 @@ +# Configuration file for bioinformatics binaries +# Defines download URLs and versions for each tool + +binaries: + plink1: + version: "20250819" + url: "https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20250819.zip" + executable: "plink" + description: "PLINK 1.9 - Whole genome association analysis toolset" + + plink2: + version: "20251019" + url: "https://s3.amazonaws.com/plink2-assets/plink2_linux_amd_avx2_20251019.zip" + executable: "plink2" + description: "PLINK 2.0 - Next generation of PLINK" + + regenie: + version: "v4.1" + url: "https://github.com/rgcgithub/regenie/releases/download/v4.1/regenie_v4.1.gz_x86_64_Linux_mkl.zip" + executable: "regenie_v4.1.gz_x86_64_Linux_mkl" + description: "regenie - Fast whole genome regression modelling" diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..b2a8057 --- /dev/null +++ 
b/environment.yaml @@ -0,0 +1,13 @@ +name: tsd_software +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python>=3.8 + - snakemake>=7.0 + - curl + - unzip + - pip + - pip: + - pyyaml diff --git a/example_usage.sh b/example_usage.sh new file mode 100755 index 0000000..8a8a6cb --- /dev/null +++ b/example_usage.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Example usage script for the bioinformatics binary download workflow +# This script demonstrates various ways to use the workflow + +set -e + +echo "==============================================" +echo "Bioinformatics Binary Download Workflow Demo" +echo "==============================================" +echo "" + +# Show help +echo "1. Showing available make targets:" +echo " $ make help" +echo "" +make help +echo "" + +# Dry run +echo "2. Performing a dry-run to see what would be downloaded:" +echo " $ make dry-run" +echo "" +make dry-run +echo "" + +# List rules +echo "3. Listing all Snakemake rules:" +echo " $ make list" +echo "" +make list +echo "" + +# Show configuration +echo "4. Current binary configuration:" +echo " $ cat config.yaml" +echo "" +cat config.yaml +echo "" + +# Download instructions (not executed due to network limitations) +echo "5. To download all binaries (when internet is available):" +echo " $ make" +echo " or" +echo " $ make download" +echo "" + +echo "6. To download a specific binary:" +echo " $ make plink1" +echo " $ make plink2" +echo " $ make regenie" +echo "" + +echo "7. To verify downloaded binaries (after download):" +echo " $ ./verify_binaries.sh" +echo "" + +echo "8. To clean all downloaded files:" +echo " $ make clean" +echo "" + +echo "9. 
Using Snakemake directly:"
+echo " $ snakemake --cores 1 # Download all"
+echo " $ snakemake bin/plink1 --cores 1 # Download PLINK1 only"
+echo " $ snakemake --cores 1 --dry-run # Dry run"
+echo ""
+
+echo "=============================================="
+echo "For more information, see:"
+echo " - README.md for general usage"
+echo " - ADDING_BINARIES.md for adding new tools"
+echo " - config.yaml for binary configuration"
+echo "=============================================="
diff --git a/test_workflow.sh b/test_workflow.sh
new file mode 100755
index 0000000..3bfceb2
--- /dev/null
+++ b/test_workflow.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# Comprehensive test of the workflow implementation
+#
+# Runs seven checks in order and exits with status 1 at the first failure.
+# Nothing is downloaded: Snakemake is only invoked with --list-rules/--dry-run.
+
+echo "====================================="
+echo "Testing Snakemake Workflow"
+echo "====================================="
+echo ""
+
+# Test 1: Check files exist
+echo "Test 1: Verifying all required files exist..."
+files=(
+    "Snakefile"
+    "config.yaml"
+    "Makefile"
+    "README.md"
+    "ADDING_BINARIES.md"
+    "verify_binaries.sh"
+    "example_usage.sh"
+    ".github/workflows/test-downloads.yml"
+)
+
+all_exist=true
+for file in "${files[@]}"; do
+    if [ -f "$file" ]; then
+        echo " ✓ $file"
+    else
+        echo " ✗ $file (MISSING)"
+        all_exist=false
+    fi
+done
+
+if [ "$all_exist" = true ]; then
+    echo " Result: PASSED"
+else
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+# Test 2: Check Snakefile syntax
+echo "Test 2: Checking Snakefile syntax..."
+if snakemake --list-rules > /dev/null 2>&1; then
+    echo " ✓ Snakefile syntax valid"
+    echo " Result: PASSED"
+else
+    echo " ✗ Snakefile has syntax errors"
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+# Test 3: Check config.yaml is valid
+echo "Test 3: Checking config.yaml validity..."
+if python3 -c "import yaml; yaml.safe_load(open('config.yaml'))" 2>/dev/null; then
+    echo " ✓ config.yaml is valid YAML"
+    echo " Result: PASSED"
+else
+    echo " ✗ config.yaml has syntax errors"
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+# Test 4: Verify binaries in config
+echo "Test 4: Checking configured binaries..."
+binaries=$(python3 -c "import yaml; c=yaml.safe_load(open('config.yaml')); print(','.join(c['binaries'].keys()))")
+echo " Configured binaries: $binaries"
+expected="plink1,plink2,regenie"
+# Require every expected binary to be configured, but ignore key order and
+# tolerate extra entries so that adding a new tool does not break this test.
+missing=""
+IFS=',' read -r -a expected_list <<< "$expected"
+for name in "${expected_list[@]}"; do
+    case ",$binaries," in
+        *",$name,"*) ;;
+        *) missing="$missing $name" ;;
+    esac
+done
+if [ -z "$missing" ]; then
+    echo " ✓ All expected binaries configured"
+    echo " Result: PASSED"
+else
+    echo " ✗ Missing binaries:$missing (expected: $expected)"
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+# Test 5: Dry run
+echo "Test 5: Running workflow dry-run..."
+if snakemake --cores 1 --dry-run > /dev/null 2>&1; then
+    echo " ✓ Dry run successful"
+    echo " Result: PASSED"
+else
+    echo " ✗ Dry run failed"
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+# Test 6: Check Makefile targets
+echo "Test 6: Checking Makefile targets..."
+targets=("help" "list" "dry-run")
+all_work=true
+for target in "${targets[@]}"; do
+    if make "$target" > /dev/null 2>&1; then
+        echo " ✓ make $target"
+    else
+        echo " ✗ make $target (FAILED)"
+        all_work=false
+    fi
+done
+
+if [ "$all_work" = true ]; then
+    echo " Result: PASSED"
+else
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+# Test 7: Check scripts are executable
+echo "Test 7: Checking script permissions..."
+scripts=("verify_binaries.sh" "example_usage.sh")
+all_exec=true
+for script in "${scripts[@]}"; do
+    if [ -x "$script" ]; then
+        echo " ✓ $script is executable"
+    else
+        echo " ✗ $script is not executable"
+        all_exec=false
+    fi
+done
+
+if [ "$all_exec" = true ]; then
+    echo " Result: PASSED"
+else
+    echo " Result: FAILED"
+    exit 1
+fi
+echo ""
+
+echo "====================================="
+echo "All tests PASSED!"
+echo "====================================="
+echo ""
+echo "The Snakemake workflow is ready to use."
+echo "Run 'make' or 'snakemake --cores 1' to download binaries."
diff --git a/verify_binaries.sh b/verify_binaries.sh
new file mode 100755
index 0000000..32847a3
--- /dev/null
+++ b/verify_binaries.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Verify that the downloaded binaries exist in bin/ and are executable.
+#
+# Usage: ./verify_binaries.sh [binary ...]
+# With no arguments the default set (plink1 plink2 regenie) is checked.
+# Exits 1 if bin/ is missing or any checked binary fails, 0 otherwise.
+
+set -e
+
+BIN_DIR="bin"
+
+echo "Verifying downloaded binaries..."
+echo "================================"
+
+# Check if bin directory exists
+if [ ! -d "$BIN_DIR" ]; then
+    echo "ERROR: bin/ directory not found. Run 'make download' first."
+    exit 1
+fi
+
+# Check a single binary.
+# Arguments: $1 - file name of the binary inside $BIN_DIR
+# Outputs:   a status line, then the first line printed by `--version`
+# Returns:   0 if the file exists and is executable, 1 otherwise
+check_binary() {
+    local name=$1
+    local path="$BIN_DIR/$name"
+    local version
+
+    echo -n "Checking $name... "
+    if [ ! -f "$path" ]; then
+        echo "✗ Not found"
+        return 1
+    fi
+    if [ ! -x "$path" ]; then
+        echo "✗ Found but not executable"
+        return 1
+    fi
+    echo "✓ Found and executable"
+
+    # Capture the output instead of piping straight to head: a pipeline
+    # reports head's exit status, so a `|| echo` fallback placed after it
+    # can never run. Fall back only when the tool printed nothing at all.
+    version=$("$path" --version 2>&1 | head -n 1) || true
+    if [ -n "$version" ]; then
+        printf '%s\n' "$version"
+    else
+        echo " (version info unavailable)"
+    fi
+}
+
+# Binaries to check: command-line arguments, or the default set
+if [ "$#" -gt 0 ]; then
+    BINARIES=("$@")
+else
+    BINARIES=(plink1 plink2 regenie)
+fi
+
+PASSED=0
+FAILED=0
+
+# check_binary runs as an `if` condition, so `set -e` does not abort on a
+# failed check. Plain arithmetic assignment is used because ((VAR++))
+# returns a non-zero status when VAR is 0, which would trip `set -e`.
+for binary in "${BINARIES[@]}"; do
+    if check_binary "$binary"; then
+        PASSED=$((PASSED + 1))
+    else
+        FAILED=$((FAILED + 1))
+    fi
+done
+
+echo ""
+echo "Results: $PASSED passed, $FAILED failed"
+
+if [ "$FAILED" -gt 0 ]; then
+    exit 1
+fi