diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b290e090..97c8c97f 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,20 +1,20 @@ { "name": "nfcore", - "image": "nfcore/gitpod:latest", - "remoteUser": "gitpod", - "runArgs": ["--privileged"], + "image": "nfcore/devcontainer:latest", - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/opt/conda/bin/python" - }, + "remoteUser": "root", + "privileged": true, - // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } + "remoteEnv": { + // Workspace path on the host for mounting with docker-outside-of-docker + "LOCAL_WORKSPACE_FOLDER": "${localWorkspaceFolder}" + }, + + "onCreateCommand": "./.devcontainer/setup.sh", + + "hostRequirements": { + "cpus": 4, + "memory": "16gb", + "storage": "32gb" } } diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh new file mode 100755 index 00000000..f9b8e3f2 --- /dev/null +++ b/.devcontainer/setup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# Customise the terminal command prompt +echo "export PROMPT_DIRTRIM=2" >> $HOME/.bashrc +echo "export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] '" >> $HOME/.bashrc +export PROMPT_DIRTRIM=2 +export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] ' + +# Update Nextflow +nextflow self-update + +# Update welcome message +echo "Welcome to the nf-core/stableexpression devcontainer!" > /usr/local/etc/vscode-dev-containers/first-run-notice.txt diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 6d9b74cc..00000000 --- a/.editorconfig +++ /dev/null @@ -1,37 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,html,css,scss,js}] -indent_size = 2 - -# These files are edited and tested upstream in nf-core/modules -[/modules/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -[/subworkflows/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset - -[/assets/email*] -indent_size = unset - -# ignore python and markdown -[*.{py,md}] -indent_style = unset - -# ignore ro-crate metadata files -[**/ro-crate-metadata.json] -insert_final_newline = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index ac31ba3a..bdd34869 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -78,7 +78,7 @@ If you wish to contribute a new step, please use the following coding standards: 5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. 9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. 10. 
Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 00000000..34085279 --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" + echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." + else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? $number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) + total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." 
+ fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 00000000..3b9724c7 --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,111 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! 
-f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index e34b4e6d..a5f81c34 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,7 +14,7 @@ jobs: run-platform: name: Run AWS full tests # run only if the PR is approved by at least 2 reviewers and against the master/main branch or manually triggered - if: github.repository == 'nf-core/stableexpression' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main') || github.event_name == 'workflow_dispatch' + if: github.repository == 'nf-core/stableexpression' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release' runs-on: ubuntu-latest steps: - name: Set revision variable @@ -28,21 +28,21 @@ jobs: # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} revision: ${{ steps.revision.outputs.revision }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/stableexpression/work-${{ steps.revision.outputs.revision }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/stableexpression/work-${{ steps.revision.outputs.revision }} parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/stableexpression/results-${{ steps.revision.outputs.revision }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/stableexpression/results-${{ steps.revision.outputs.revision }}" } profiles: test_full - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 76e01b25..429570cd 100644 --- a/.github/workflows/awstest.yml +++ 
b/.github/workflows/awstest.yml @@ -14,20 +14,20 @@ jobs: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/stableexpression/work-${{ github.sha }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/stableexpression/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/stableexpression/results-test-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/stableexpression/results-test-${{ github.sha }}" } profiles: test - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index e0c0b489..00000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,158 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - workflow_dispatch: - -env: - NXF_ANSI_LOG: false - NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity - NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - NFT_VER: "0.9.2" - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})" - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/stableexpression') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "25.04.00" - - "latest-everything" - profile: - - "conda" - - "docker" - - "singularity" - test_name: - - "test" - isMaster: - - ${{ github.base_ref == 'master' }} - # Exclude conda and singularity on dev - exclude: - - isMaster: false - profile: "conda" - - isMaster: false - profile: "singularity" - steps: - - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - with: - fetch-depth: 0 - - - name: Set up Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - - name: Set up Miniconda - if: matrix.profile == 'conda' - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3 - with: - miniconda-version: "latest" - auto-update-conda: true - conda-solver: libmamba - channels: conda-forge,bioconda - - - name: Set up Conda - if: matrix.profile == 'conda' - run: | - echo $(realpath $CONDA)/condabin >> $GITHUB_PATH - echo $(realpath python) >> $GITHUB_PATH - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - 
- name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" - continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} - run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},${{ matrix.profile }} --outdir ./results - - nf_test: - name: "Run nf-test (${{ matrix.NXF_VER }} | ${{ matrix.profile }})" - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/stableexpression') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "25.04.00" - - "latest-everything" - profile: - - "conda" - - "docker" - - "singularity" - shard: [1, 2, 3, 4] - isMaster: - - ${{ github.base_ref == 'master' }} - # Exclude conda and singularity on dev - exclude: - - isMaster: false - profile: "conda" - - isMaster: false - profile: "singularity" - steps: - - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - - name: Set up Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - - name: Set up Miniconda - if: matrix.profile == 'conda' - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3 - with: - miniconda-version: "latest" - auto-update-conda: true - conda-solver: libmamba - channels: conda-forge,bioconda - - - name: Set up Conda - if: matrix.profile == 'conda' - run: | - echo $(realpath $CONDA)/condabin >> $GITHUB_PATH - echo $(realpath python) >> $GITHUB_PATH - - - name: Install nf-test - uses: nf-core/setup-nf-test@v1 - with: - version: ${{ env.NFT_VER }} - - - name: Disk space cleanup - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: Run Tests - run: | - nf-test test --ci --profile ${{ matrix.profile }} --shard ${{ matrix.shard }}/${{ strategy.job-total }} diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 0b6b1f27..6adb0fff 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 642b9887..45884ff9 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -2,7 +2,7 @@ name: Test successful pipeline download with 'nf-core pipelines download' # Run the workflow when: # - dispatched manually -# - when a PR is opened or reopened to master branch +# - when a PR is opened or reopened to main/master branch # - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. 
on: workflow_dispatch: @@ -12,14 +12,6 @@ on: required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - master - pull_request_target: branches: - main - master @@ -28,8 +20,23 @@ env: NXF_ANSI_LOG: false jobs: + configure: + runs-on: ubuntu-latest + outputs: + REPO_LOWERCASE: ${{ steps.get_repo_properties.outputs.REPO_LOWERCASE }} + REPOTITLE_LOWERCASE: ${{ steps.get_repo_properties.outputs.REPOTITLE_LOWERCASE }} + REPO_BRANCH: ${{ steps.get_repo_properties.outputs.REPO_BRANCH }} + steps: + - name: Get the repository name and current branch + id: get_repo_properties + run: | + echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> "$GITHUB_OUTPUT" + echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> "$GITHUB_OUTPUT" + echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> "$GITHUB_OUTPUT" + download: runs-on: ubuntu-latest + needs: configure steps: - name: Install Nextflow uses: nf-core/setup-nextflow@v2 @@ -37,9 +44,9 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: Setup Apptainer @@ -50,13 +57,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install git+https://github.com/nf-core/tools.git@dev - - - name: Get the repository name and current branch set as environment variable - run: | - echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} - echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} - echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> ${GITHUB_ENV} + pip install git+https://github.com/nf-core/tools.git - name: Make a cache directory for the container images run: | @@ -66,24 +67,27 @@ jobs: env: NXF_SINGULARITY_CACHEDIR: ./singularity_container_images run: | - nf-core pipelines download ${{ env.REPO_LOWERCASE }} \ - --revision ${{ env.REPO_BRANCH }} \ - --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ + nf-core pipelines download ${{ needs.configure.outputs.REPO_LOWERCASE }} \ + --revision ${{ needs.configure.outputs.REPO_BRANCH }} \ + --outdir ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }} \ --compress "none" \ --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io" \ + --container-library "quay.io" -l "docker.io" -l "community.wave.seqera.io/library/" \ --container-cache-utilisation 'amend' \ --download-configuration 'yes' - name: Inspect download - run: tree ./${{ env.REPOTITLE_LOWERCASE }} + run: tree ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }} + + - name: Inspect container images + run: tree ./singularity_container_images | tee ./container_initial - name: Count the downloaded number of container images id: count_initial run: | image_count=$(ls -1 ./singularity_container_images | wc -l | xargs) echo "Initial container image count: $image_count" - echo "IMAGE_COUNT_INITIAL=$image_count" >> ${GITHUB_ENV} + echo "IMAGE_COUNT_INITIAL=$image_count" >> "$GITHUB_OUTPUT" - name: Run the downloaded pipeline (stub) id: stub_run_pipeline @@ -91,31 +95,40 @@ jobs: env: NXF_SINGULARITY_CACHEDIR: ./singularity_container_images NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH 
}}) -stub -profile test,singularity --outdir ./results
+ run: nextflow run ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ needs.configure.outputs.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results

- name: Run the downloaded pipeline (stub run not supported)
  id: run_pipeline
- if: ${{ job.steps.stub_run_pipeline.status == failure() }}
+ if: ${{ steps.stub_run_pipeline.outcome == 'failure' }}
  env:
    NXF_SINGULARITY_CACHEDIR: ./singularity_container_images
    NXF_SINGULARITY_HOME_MOUNT: true
- run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -profile test,singularity --outdir ./results
+ run: nextflow run ./${{ needs.configure.outputs.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ needs.configure.outputs.REPO_BRANCH }}) -profile test,singularity --outdir ./results

- name: Count the downloaded number of container images
  id: count_afterwards
  run: |
    image_count=$(ls -1 ./singularity_container_images | wc -l | xargs)
    echo "Post-pipeline run container image count: $image_count"
-   echo "IMAGE_COUNT_AFTER=$image_count" >> ${GITHUB_ENV}
+   echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT"

- name: Compare container image counts
+ id: count_comparison
  run: |
-   if [ "${{ env.IMAGE_COUNT_INITIAL }}" -ne "${{ env.IMAGE_COUNT_AFTER }}" ]; then
-     initial_count=${{ env.IMAGE_COUNT_INITIAL }}
-     final_count=${{ env.IMAGE_COUNT_AFTER }}
+   if [ "${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then
+     initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}
+     final_count=${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}
      difference=$((final_count - initial_count))
      echo "$difference additional container images were \n downloaded at runtime . The pipeline has no support for offline runs!"
-     tree ./singularity_container_images
+     tree ./singularity_container_images > ./container_afterwards
+     diff ./container_initial ./container_afterwards
      exit 1
    else
      echo "The pipeline can be downloaded successfully!"
fi + + - name: Upload Nextflow logfile for debugging purposes + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 + with: + name: nextflow_logfile.txt + path: .nextflow.log* + include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 80% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index 207a1b24..6df255c4 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -13,13 +13,13 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: token: ${{ secrets.nf_core_bot_auth_token }} # indication that the linting is being fixed - name: React on comment - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: eyes @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -47,7 +47,7 @@ jobs: # indication that the linting has finished - name: react if linting finished succesfully if: steps.pre-commit.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: "+1" @@ -67,21 +67,21 @@ jobs: - name: react if linting errors were fixed id: react-if-fixed if: steps.commit-and-push.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: hooray - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: confused - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: issue-number: ${{ github.event.issue.number }} body: | diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index dbd52d5a..7a527a34 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,9 +3,6 @@ name: nf-core linting # It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. 
on: - push: - branches: - - dev pull_request: release: types: [published] @@ -14,12 +11,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - - name: Set up Python 3.12 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - name: Set up Python 3.14 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -31,18 +28,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: read .nf-core.yml - uses: pietrobolcato/action-read-yaml@1.1.0 + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml @@ -74,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 95b6b6af..e6e9bc26 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 00000000..c98d76ec --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,144 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test-changes + - runner=4cpu-linux-x64 + outputs: + shard: ${{ 
steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 7 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] + if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test + - runner=4cpu-linux-x64 + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [conda, docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # Exclude conda and singularity on dev + exclude: + - isMain: false + profile: "conda" + - isMain: false + profile: "singularity" + NXF_VER: + - "25.04.0" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ matrix.NXF_VER }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. Please check if the error is expected or if it needs fixing." 
+         fi
+         if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then
+           exit 1
+         fi
+       fi

+ confirm-pass:
+   needs: [nf-test]
+   if: always()
+   runs-on: # use self-hosted runners
+     - runs-on=${{ github.run_id }}-confirm-pass
+     - runner=2cpu-linux-x64
+   steps:
+     - name: One or more tests failed (excluding latest-everything)
+       if: ${{ contains(needs.*.result, 'failure') }}
+       run: exit 1
+
+     - name: One or more tests cancelled
+       if: ${{ contains(needs.*.result, 'cancelled') }}
+       run: exit 1
+
+     - name: All tests ok
+       if: ${{ contains(needs.*.result, 'success') }}
+       run: exit 0
+
+     - name: debug-print
+       if: always()
+       run: |
+         echo "::group::DEBUG: \`needs\` Contents"
+         echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}"
+         echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}"
+         echo "::endgroup::"
diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml
index 76a9e67e..431d3d44 100644
--- a/.github/workflows/release-announcements.yml
+++ b/.github/workflows/release-announcements.yml
@@ -14,6 +14,10 @@ jobs:
      run: |
        echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" | sed 's/-//g' >> $GITHUB_OUTPUT
+   - name: get description
+     id: get_description
+     run: |
+       echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description')" >> $GITHUB_OUTPUT
    - uses: rzr/fediverse-action@master
      with:
        access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }}
        # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release
        message: |
          Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}!
-
+         ${{ steps.get_description.outputs.description }}
          Please see the changelog: ${{ github.event.release.html_url }}

          ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics
@@ -30,7 +34,7 @@
  bsky-post:
    runs-on: ubuntu-latest
    steps:
-     - uses: zentered/bluesky-post-action@80dbe0a7697de18c15ad22f4619919ceb5ccf597 # v0.1.0
+     - uses: zentered/bluesky-post-action@6461056ea355ea43b977e149f7bf76aaa572e5e8 # v0.3.0
      with:
        post: |
          Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}!
diff --git a/.github/workflows/template_version_comment.yml b/.github/workflows/template-version-comment.yml similarity index 91% rename from .github/workflows/template_version_comment.yml rename to .github/workflows/template-version-comment.yml index e8aafe44..e8560fc7 100644 --- a/.github/workflows/template_version_comment.yml +++ b/.github/workflows/template-version-comment.yml @@ -9,12 +9,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: ref: ${{ github.event.pull_request.head.sha }} - name: Read template version from .nf-core.yml - uses: nichmor/minimal-read-yaml@v0.0.2 + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml diff --git a/.gitignore b/.gitignore index c790fe08..2f9c5b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,8 +9,10 @@ testing* null/ .nf-test* .idea/ +.vscode/ taggers/ tokenizers/ corpora/ .github/act.custom_runner.Dockerfile .ruff_cache +galaxy/test_output/ diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index 46118637..00000000 --- a/.gitpod.yml +++ /dev/null @@ -1,17 +0,0 @@ -image: nfcore/gitpod:latest -tasks: - - name: Update Nextflow and setup pre-commit - command: | - pre-commit install --install-hooks - nextflow self-update - -vscode: - extensions: # based on nf-core.nf-core-extensionpack - #- esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code - - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files - - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar - - mechatroner.rainbow-csv # Highlight columns in csv files in different colors - - nextflow.nextflow # Nextflow syntax highlighting - - oderwat.indent-rainbow # Highlight indentation level - - streetsidesoftware.code-spell-checker # Spelling checker for source code - - charliermarsh.ruff # Code linter Ruff diff --git a/.nf-core.yml b/.nf-core.yml index 19f4e027..24459e2a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,4 +1,3 @@ -bump_version: null lint: files_exist: - conf/igenomes.config @@ -7,12 +6,18 @@ lint: - conf/igenomes_ignored.config files_unchanged: - assets/nf-core-stableexpression_logo_light.png + - docs/images/nf-core-stableexpression_logo_light.png + - docs/images/nf-core-stableexpression_logo_dark.png - .github/PULL_REQUEST_TEMPLATE.md nextflow_config: - params.input + template_strings: + - tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet + - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet + - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet schema_lint: false -nf_core_version: 3.2.1 -org_path: null + +nf_core_version: 3.5.1 repository_type: pipeline template: author: Olivier Coen diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 46cf9e1d..c7942f15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,25 +1,41 @@ repos: - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.1.0" + rev: "v4.0.0-alpha.8" hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 + - prettier@3.6.2 + exclude: galaxy/ - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "3.1.2" + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: 
editorconfig-checker - exclude: '\.drawio$' - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.4 + rev: v0.14.1 hooks: # Run the linter. - id: ruff files: \.py$ args: [--fix] + exclude: bin/old/ # Run the formatter. - id: ruff-format files: \.py$ diff --git a/.prettierignore b/.prettierignore index 4a73c2cc..dd749d43 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,5 +10,7 @@ testing/ testing* *.pyc bin/ -ro-crate-metadata.json .nf-test/ +ro-crate-metadata.json +modules/nf-core/ +subworkflows/nf-core/ diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a76..07dbd8bb 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/CITATIONS.md b/CITATIONS.md index d14723b5..e7423ab9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,29 +10,29 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [EBI Expression Atlas](https://www.ebi.ac.uk/gxa/home) -> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. - -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) - -> Ewels P, Magnusson M, Lundin S, KĂ€ller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +> Papatheodorou I, Fonseca NA, Keays M, Tang YA, Barrera E, Bazant W, Burke M, FĂŒllgrabe A, Muñoz-Pomer Fuentes A, George N, Huerta L, Koskinen S, Mohammed S, Geniza M, Preece J, Jaiswal P, Jarnuczak AF, Huber W, Stegle O, Vizcaino JA, Brazma A, Petryszak R. Expression Atlas: gene and protein expression across multiple studies and organisms. Nucleic Acids Res. 2017 Nov 20;46(Database issue):D246–D251. doi: 10.1093/nar/gkx1158. PubMed PMID: 29165655. -- [Expression Atlas](https://www.ebi.ac.uk/gxa/home) +- [NCBI GEO](https://www.ncbi.nlm.nih.gov/geo/) -> Papatheodorou I, Fonseca NA, Keays M, Tang YA, Barrera E, Bazant W, Burke M, FĂŒllgrabe A, Muñoz-Pomer Fuentes A, George N, Huerta L, Koskinen S, Mohammed S, Geniza M, Preece J, Jaiswal P, Jarnuczak AF, Huber W, Stegle O, Vizcaino JA, Brazma A, Petryszak R. Expression Atlas: gene and protein expression across multiple studies and organisms. Nucleic Acids Res. 2017 Nov 20;46(Database issue):D246–D251. doi: 10.1093/nar/gkx1158. PubMed PMID: 29165655. +> Ron Edgar, Michael Domrachev & Alex E Lash. Gene Expression Omnibus: NCBI gene expression and hybridization array data repository. Nucleic Acids Res. 2002 Jan 1;30(1):207-10. doi: 10.1093/nar/30.1.207. PubMed PMID: 11752295. - [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) > Reimand J, Kull M, Peterson H, Hansen J, Vilo J. g:Profiler—a web-based toolset for functional profiling of gene lists from large-scale experiments. Nucleic Acids Res. 2007 May 3;35(Web Server issue):W193–W200. doi:10.1093/nar/gkm226. PubMed PMID: 17478515. 
-- [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)
+- [Normfinder](https://rdrr.io/github/dhammarstrom/generefer/man/normfinder.html)
+
+> Claus Lindbjerg Andersen, Jens Ledet Jensen, Torben Falck Ørntoft. Normalization of Real-Time Quantitative Reverse Transcription-PCR Data: A Model-Based Variance Estimation Approach to Identify Genes Suited for Normalization, Applied to Bladder and Colon Cancer Data Sets. Cancer Res (2004) 64 (15): 5245–5250. doi:10.1158/0008-5472.CAN-04-0496. PubMed PMID: 15289330.

-> Love MI, Huber W & Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology. 2014;15(12):550. doi: 10.1186/s13059-014-0550-8. PubMed PMID: 25516281.

+- [GeNorm](https://pypi.org/project/rna-genorm/)

-## [EdgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html)
+> Jo Vandesompele, Katleen De Preter, Filip Pattyn, Bruce Poppe, Nadine Van Roy, Anne De Paepe, Frank Speleman. Accurate normalization of real-time quantitative RT-PCR data by geometric averaging of multiple internal control genes. Genome Biol. 2002 Jun 18;3(7):RESEARCH0034. doi: 10.1186/gb-2002-3-7-research0034. PubMed PMID: 12184808.

-> Robinson MD, McCarthy DJ, Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics. 2010 Jan 1;26(1):139-40. doi: 10.1093/bioinformatics/btp616. Pubmed PMID: 19910308.

+- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
+
+> Ewels P, Magnusson M, Lundin S, KĂ€ller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

## Software packaging/containerisation tools diff --git a/README.md b/README.md index 44e4b8ce..7eb5509b 100644 --- a/README.md +++ b/README.md @@ -5,87 +5,66 @@ -[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/ci.yml) +[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/stableexpression) +[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml) [![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A525.04.00-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1) +[![run with apptainer](https://custom-icon-badges.demolab.com/badge/run%20with-apptainer-4545?logo=apptainer&color=teal&labelColor=000000)](https://apptainer.org/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on 
Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)

## Introduction

-**nf-core/stableexpression** is a bioinformatics pipeline that aims at finding the most stable genes among a single or multiple public / local count datasets. It takes as input a species name (mandatory), keywords for expression atlas search (optional) and / or a CSV input file listing local raw / normalised count datasets (optional). **A typical usage is to find the most suitable qPCR housekeeping genes for a specific species (and optionally specific conditions)**.
+**nf-core/stableexpression** is a bioinformatics pipeline that aggregates multiple count datasets (public or provided by the user) for a given species and identifies the most stably expressed genes.

-

-## Pipeline summary

-1. Get Expression Atlas accessions corresponding to the provided species (and optionally keywords) ([Expression Atlas](https://www.ebi.ac.uk/gxa/home); optional)
-2. Download Expression Atlas data ([Expression Atlas](https://www.ebi.ac.uk/gxa/home); optional)
-3. Normalize raw data (using [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) or [EdgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html))
-4. Map gene IDS to Ensembl IDS for standardisation among datasets ([g:Profiler](https://biit.cs.ut.ee/gprofiler/gost))
-5. Compute pairwise gene variation
-6. Compute gene variation statistics and get the most stable genes
-7. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+It takes as main inputs:

-## Usage
+- a species name (mandatory)
+- keywords for Expression Atlas / GEO search (optional)
+- a CSV input file listing your own raw / normalised count datasets (optional).

-> [!NOTE]
-> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
-
-First, prepare a samplesheet listing the different count datasets:
+**Use cases**:

-`datasets.csv`:
+- **find the most suitable RT-qPCR reference genes for a specific species (and optionally specific conditions)**
+- download all Expression Atlas and / or NCBI GEO datasets for a species (and optionally keywords)

-```csv
-counts,design,normalised
-path/to/normalised.counts.csv,path/to/normalised.design.csv,true
-path/to/raw.counts.csv,path/to/raw.design.csv,false
-```
-
-Make sure to format your datasets properly:
+## Basic usage

-`counts.csv`:
+> [!NOTE]
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.

-```csv
-,sample_A,sample_B,sample_C
-gene_1,1,2,3
-gene_2,1,2,3
-...
+To search for the most stable genes in a species, considering all public datasets, simply run:

-`design.csv`:
+```bash
+nextflow run nf-core/stableexpression \
+    -profile <docker/singularity/.../institute> \
+    --species <species name> \
+    --outdir <OUTDIR>
+```

-```csv
-sample,condition
-sample_A,condition_1
-sample_B,condition_2
-...
-```
+## More advanced usage

-Now you can run the pipeline as follows:
+For more specific scenarios, like:

+- **fetching only specific conditions**
+- **using your own expression dataset(s)**

-> ```bash
-> nextflow run nf-core/stableexpression \
->     -profile docker \
->     --species <species name> \
->     --eatlas_accessions <accessions> \
->     --eatlas_keywords <keywords> \
->     --datasets ./datasets.csv \
->     --outdir ./results
-> ```
+please refer to the [usage documentation](https://nf-co.re/stableexpression/usage).

-> [!WARNING]
-> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).
+## Profiles

-For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/stableexpression/usage) and the [parameter documentation](https://nf-co.re/stableexpression/parameters).
+See [here](https://nf-co.re/stableexpression/usage#profiles) for more information about profiles. ## Pipeline output @@ -93,6 +72,12 @@ To see the results of an example test run with a full size dataset refer to the For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/stableexpression/output). +## Support us + +If you like nf-core/stableexpression, please make sure you give it a star on GitHub. + +[![stars - stableexpression](https://img.shields.io/github/stars/nf-core/stableexpression?style=social)](https://github.com/nf-core/stableexpression) + ## Credits nf-core/stableexpression was originally written by Olivier Coen. @@ -112,8 +97,6 @@ For further information or help, don't hesitate to get in touch on the [Slack `# - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/assets/email_template.html b/assets/email_template.html index 5cce5f7d..711c3b4b 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,7 +4,7 @@ - + nf-core/stableexpression Pipeline Report diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index e116bbee..d4670c25 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,8 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/stableexpression Methods Description" section_href: "https://github.com/nf-core/stableexpression" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline -## You inject any metadata in the Nextflow '${workflow}' object data: |

<h4>Methods</h4>
<p>Data was processed using nf-core/stableexpression v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (GrĂŒning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 5c965ffa..9aed3942 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -21,6 +21,10 @@ disable_version_detection: true max_table_rows: 5000 table_cond_formatting_colours: + - first: "#ffd700" + - second: "#C0C0C0" + - third: "#CD7F32" + - between_fourth_and_tenth: "#468F8F" - very_low: "#337ab7" - low: "#5bc0de" - medium: "#5cb85c" @@ -28,56 +32,122 @@ table_cond_formatting_colours: - very_high: "#d9534f" custom_data: - ranked_top_stable_genes_summary: - section_name: "Top stable genes - ranked by stability" + ranked_most_stable_genes_summary: + section_name: "Stable genes ranking" file_format: "csv" no_violin: true - sort_rows: false description: | Expression descriptive statistics of all genes, ranked by stability. - Expression were normalised and cpm (counts per million) were used for all calculations. - Genes are sorted by M-measure - from the most stable to the least stable. + Genes are sorted by stability score - from the most stable to the least stable. plot_type: "table" pconfig: - col1_header: "Rank" + col1_header: "Gene ID" + sort_rows: false headers: + gene_id: + title: "Gene ID" rank: - rid: "Rank" - hidden: true - ensembl_gene_id: - title: "Ensembl Gene ID" + title: "Rank" description: | - Gene IDs as shown in Ensembl - m_measure: - title: "Stability" + Rank of the gene based on stability score + scale: "RdYlGn-rev" + cond_formatting_rules: + between_fourth_and_tenth: + - eq: 4 + - eq: 5 + - eq: 6 + - eq: 7 + - eq: 8 + - eq: 9 + - eq: 10 + third: + - eq: 3 + second: + - eq: 2 + first: + - eq: 1 + name: + title: "Gene name" + description: + title: "Gene description" + original_gene_ids: + title: "Original gene IDs" description: | - Gene stability measure M as defined in Vandesompele et al., Genome Biology (2002). - Lower values indicate higher stability. - M-measures were calculated directly from cpm (normalised count per million). - format: "{:,.4f}" - #minrange: 0 - standard_deviation: - title: "Std" + Original gene IDs as stated in the input (provided or downloaded) datasets + stability_score: + title: "Stability score" + description: | + Final stability score : the lower, the better + format: "{:,.6f}" + scale: "RdYlGn-rev" + coefficient_of_variation_normalised: + title: "Normalised CV" description: | - Standard deviation of the expression across samples. - For each sample, expression was first computed as log2(cpm + 1), then fitted to [0, 1] using scikit-learn's QuantileTransformer. + Quantile normalised (among candidate genes) coefficient of variation ( std(expression) / mean(expression) ) across all samples. + format: "{:,.6f}" + scale: "PRGn-rev" + robust_coefficient_of_variation_median_normalised: + title: "Normalised RCVm" + description: | + Quantile normalised (among candidate genes) robust coefficient of variation on median of the expression across all samples. 
format: "{:,.4f}" - variation_coefficient: - title: "Var coeff" + scale: "PRGn-rev" + normfinder_stability_value_normalised: + title: "Normalised Normfinder score" + description: | + Quantile normalised (among candidate genes) stability value as computed by Normfinder + format: "{:,.6f}" + scale: "PRGn-rev" + genorm_m_measure_normalised: + title: "Normalised Genorm score" + description: | + Quantile normalised (among candidate genes) M-measure as computed by Genorm + format: "{:,.6f}" + scale: "PRGn-rev" + coefficient_of_variation: + title: "CV" description: | - Variation coefficient: std(expression) / mean(expression). - For each sample, expression was first computed as log2(cpm + 1), then fitted to [0, 1] using scikit-learn's QuantileTransformer. + Coefficient of variation ( std(expression) / mean(expression) ) across all samples. + format: "{:,.6f}" + robust_coefficient_of_variation_median: + title: "RCVm" + description: | + Robust coefficient of variation on median of the expression across all samples. format: "{:,.4f}" + normfinder_stability_value: + title: "Normfinder stability value " + description: | + Stability value as computed by Normfinder + format: "{:,.6f}" + genorm_m_measure: + title: "Genorm M-measure" + description: | + M-measure as computed by Genorm + format: "{:,.6f}" mean: title: "Average" description: | - Average expression across samples. - For each sample, expression was first computed as log2(cpm + 1), then fitted to [0, 1] using scikit-learn's QuantileTransformer. + Average expression across all samples. + format: "{:,.4f}" + standard_deviation: + title: "Standard deviation" + description: | + Standard deviation of the expression across all samples. + format: "{:,.6f}" + median: + title: "Median" + description: | + Median expression across all samples. + format: "{:,.4f}" + median_absolute_deviation: + title: "MAD" + description: | + Median absolute deviation of the expression across all samples. format: "{:,.4f}" expression_level_status: title: "Expression level" description: | - Indication about the average gene expression level compared to the whole pool of genes. + Indication about the average gene expression level across all samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. @@ -94,159 +164,554 @@ custom_data: - s_eq: "Low expression" very_low: - s_eq: "Very low expression" - name: - title: "Ensembl name" + rnaseq_coefficient_of_variation: + title: "Var coeff [RNA-seq only]" description: | - Gene name as shown in Ensembl (g:Profiler) - description: - title: "Ensembl description" + Coefficient of variation ( std(expression) / mean(expression) ) across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_robust_coefficient_of_variation_median: + title: "RCVm [RNA-seq only]" description: | - Gene description as shown in Ensembl (g:Profiler) - original_gene_ids: - title: "Original gene IDs" + Robust coefficient of variation on median of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_mean: + title: "Average [RNA-seq only]" description: | - Original gene IDs as stated in the input (provided or downloaded) datasets - total_nb_nulls: - title: "Nb nulls" + Average expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_standard_deviation: + title: "Std [RNA-seq only]" + description: | + Standard deviation of the expression across RNA-Seq samples. 
+ format: "{:,.4f}" + rnaseq_median: + title: "Median [RNA-seq only]" + description: | + Median expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_median_absolute_deviation: + title: "MAD [RNA-seq only]" + description: | + Median absolute deviation of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_expression_level_status: + title: "Expression level [RNA-seq only]" + description: | + Indication about the average gene expression level across RNA-Seq samples (compared to the whole pool of genes). + Expression in [0, 0.05]: Very low expression. + Expression in [0.05, 0.1]: Low expression. + Expression in [0.1, 0.9]: Medium range. + Expression in [0.9, 0.95]: High expression. + Expression in [0.95, 1]: Very high expression. + cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + microarray_coefficient_of_variation: + title: "Var coeff [Microarray only]" + description: | + Coefficient of variation ( std(expression) / mean(expression) ) across Microarray samples. + format: "{:,.4f}" + microarray_robust_coefficient_of_variation_median: + title: "RCVm [Microarray only]" + description: | + Robust coefficient of variation on median of the expression across Microarray samples. + format: "{:,.4f}" + microarray_mean: + title: "Average [Microarray only]" + description: | + Average expression across Microarray samples. + format: "{:,.4f}" + microarray_standard_deviation: + title: "Std [Microarray only]" + description: | + Standard deviation of the expression across Microarray samples. + format: "{:,.4f}" + microarray_median: + title: "Median [Microarray only]" + description: | + Median expression across Microarray samples. + format: "{:,.4f}" + microarray_median_absolute_deviation: + title: "MAD [Microarray only]" + description: | + Median absolute deviation of the expression across Microarray samples. + format: "{:,.4f}" + microarray_expression_level_status: + title: "Expression level [Microarray only]" + description: | + Indication about the average gene expression level across Microarray samples (compared to the whole pool of genes). + Expression in [0, 0.05]: Very low expression. + Expression in [0.05, 0.1]: Low expression. + Expression in [0.1, 0.9]: Medium range. + Expression in [0.9, 0.95]: High expression. + Expression in [0.95, 1]: Very high expression. + cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + ratio_nulls_in_all_samples: + title: "Ratio null values (all samples)" + description: | + Ratio of samples in which the gene is not represented. + ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples)" + description: | + Ratio of samples in which the gene is not represented, excluding samples with particularly low overall gene count. + ratio_zeros: + title: "Ratio zero values" + description: | + Ratio of samples in which the gene has a zero value. + rnaseq_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [RNA-seq only]" + description: | + Ratio of RNA-Seq samples in which the gene is not represented. 
+ rnaseq_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [RNA-seq only]" + description: | + Ratio of RNA-Seq samples in which the gene is not represented, excluding samples with particularly low overall gene count. + rnaseq_ratio_zeros: + title: "Ratio zero values [RNA-seq only]" description: | - Number of samples in which the gene is not represented. - nb_nulls_valid_samples: - title: "Nb nulls (valid samples)" + Ratio of RNA-Seq samples in which the gene has a zero value. + microarray_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [Microarray only]" description: | - Number of samples in which the gene is not represented, excluding samples with particularly low overall gene count. + Ratio of Microarray samples in which the gene is not represented. + microarray_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [Microarray only]" + description: | + Ratio of Microarray samples in which the gene is not represented, excluding samples with particularly low overall gene count. + microarray_ratio_zeros: + title: "Ratio zero values [Microarray only]" + description: | + Ratio of Microarray samples in which the gene has a zero value. - expression_distributions_top_stable_genes: - section_name: "Expression distribution of the top stable genes (ranked by stability)" + expression_distributions_most_stable_genes: + section_name: "Count distributions" file_format: "csv" pconfig: sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Expression + ylab: Gene description: | - Distribution of gene expression across samples for the most stable genes. - For each sample, expression was first computed as log2(cpm + 1), then fitted to [0, 1] using scikit-learn's QuantileTransformer. - Genes are ranked from the most stable to the least stable. + Distribution of normalised gene expression (between 0 and 1) across samples for the most stable genes. + Only the 100 most stable genes are shown and genes are ranked from the most stable to the least stable. plot_type: "boxplot" - #xlab: Expression - #ylab: Gene gene_statistics: section_name: "Descriptive statistics - All genes" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" file_format: "csv" description: | Distribution of descriptive statistics for all genes. - Expression was first computed as log2(cpm + 1), then fitted to [0, 1] using scikit-learn's QuantileTransformer. 
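For reference, the CV and RCVm columns described above are plain row-wise summary statistics. A minimal polars sketch of how they can be derived per gene; gene and sample names are made up, and the 1.4826 multiplier matches the RCV definition used in `bin/compute_base_statistics.py` further down:

```python
import polars as pl

# toy count table: one row per gene, one column per sample (illustrative names)
df = pl.DataFrame({
    "gene_id": ["g1", "g2"],
    "s1": [10.0, 100.0],
    "s2": [12.0, 50.0],
    "s3": [11.0, 150.0],
})
samples = ["s1", "s2", "s3"]

stats = df.select(
    pl.col("gene_id"),
    mean=pl.concat_list(samples).list.mean(),
    std=pl.concat_list(samples).list.std(),
    median=pl.concat_list(samples).list.median(),
    # MAD: median of absolute deviations from the row median
    mad=pl.concat_list(samples)
    .list.eval((pl.element() - pl.element().median()).abs().median())
    .list.first(),
).with_columns(
    (pl.col("std") / pl.col("mean")).alias("coefficient_of_variation"),
    # 1.4826 makes the MAD-based estimate consistent with the std under normality
    (pl.col("mad") / pl.col("median") * 1.4826).alias(
        "robust_coefficient_of_variation_median"
    ),
)
print(stats)  # g1 has a much lower CV / RCVm than g2, i.e. it is more stable
```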
plot_type: "violin" - pconfig: - col1_header: "Ensembl Gene ID" headers: - # colors from https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 - m_measure: - title: "Gene stability M-measure" - description: | - Gene stability measure M as defined in Vandesompele et al., Genome Biology (2002) - color: "#bf812d" - standard_deviation: - title: "Standard deviation" - description: | - Standard deviation of the expression across samples - color: "#bf812d" - variation_coefficient: - title: "Variation coefficient" - description: | - Ratio of standard deviation to mean - color: "#f6e8c3" + stability_score: + title: "Stability score" + color: "rgb(186,43,32)" + coefficient_of_variation_normalised: + title: "Normalised CV" + color: "rgb(64, 122, 22)" + robust_coefficient_of_variation_median_normalised: + title: "Normalised RCVm" + color: "rgb(64, 122, 22)" + normfinder_stability_value_normalised: + title: "Normalised Normfinder score" + color: "rgb(64, 122, 22)" + genorm_m_measure_normalised: + title: "Normalised Genorm score" + color: "rgb(64, 122, 22)" + coefficient_of_variation: + title: "CV" + color: "rgb(26, 167, 178)" + robust_coefficient_of_variation_median: + title: "RCVm" + color: "rgb(26, 167, 178)" + normfinder_stability_value: + title: "Normfinder stability value " + color: "rgb(26, 167, 178)" + genorm_m_measure: + title: "Genorm M-measure" + color: "rgb(26, 167, 178)" mean: title: "Average" - description: | - Average expression across samples. - Expression was first computed as log2(cpm + 1), then fitted to [0, 1] using scikit-learn's QuantileTransformer. - color: "#01665e" + color: "rgb(26, 167, 178)" + standard_deviation: + title: "Standard deviation" + color: "rgb(26, 167, 178)" + median: + title: "Median" + color: "rgb(26, 167, 178)" + median_absolute_deviation: + title: "MAD" + color: "rgb(26, 167, 178)" + rnaseq_coefficient_of_variation: + title: "Var coeff [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_robust_coefficient_of_variation_median: + title: "RCVm [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_mean: + title: "Average [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_standard_deviation: + title: "Std [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_median: + title: "Median [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_median_absolute_deviation: + title: "MAD [RNA-seq only]" + color: "rgb(140, 50, 76)" + microarray_coefficient_of_variation: + title: "Var coeff [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_robust_coefficient_of_variation_median: + title: "RCVm [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_mean: + title: "Average [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_standard_deviation: + title: "Std [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_median: + title: "Median [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_median_absolute_deviation: + title: "MAD [Microarray only]" + color: "rgb(27, 83, 73)" + ratio_nulls_in_all_samples: + title: "Ratio null values (all samples)" + color: "rgb(106, 78, 193)" + ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples)" + color: "rgb(106, 78, 193)" + ratio_zeros: + title: "Ratio zero values" + color: "rgb(106, 78, 193)" + rnaseq_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [RNA-seq only]" + color: "rgb(106, 78, 193)" + rnaseq_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [RNA-seq only]" + color: "rgb(106, 78, 193)" + rnaseq_ratio_zeros: + title: "Ratio zero values 
[RNA-seq only]" + color: "rgb(106, 78, 193)" + microarray_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [Microarray only]" + color: "rgb(106, 78, 193)" + microarray_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [Microarray only]" + color: "rgb(106, 78, 193)" + microarray_ratio_zeros: + title: "Ratio zero values [Microarray only]" + color: "rgb(106, 78, 193)" - gene_counts: - section_name: "Gene counts" + skewness: + section_name: "Count skewness" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Skewness + ylab: Dataset description: | - Distribution of gene counts across samples. - plot_type: "linegraph" + Distribution of count skewness across samples, displayed dataset per dataset. + plot_type: "boxplot" + + ratio_zeros: + section_name: "Proportion of zeros" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" pconfig: - categories: true - headers: - count: - title: "Gene count" - description: | - Number of genes included in the dataset after normalisation - color: "#bf812d" + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Proportion of zeros + ylab: Dataset + description: | + Distribution of zeros across samples, displayed dataset per dataset. + plot_type: "boxplot" - skewness: - section_name: "Skewness" + id_mapping_stats: + section_name: "Gene ID mapping statistics" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: true # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" description: | - Distribution of expression skewness across samples. - plot_type: "linegraph" + Statistics of gene ID mapping, dataset per dataset + categories: + final: + name: "Nb final gene IDs" + color: "#2ABF96" + merged: + name: "Nb gene IDs merged with other IDs" + color: "#38B4F2" + not_valid: + name: "Nb rare gene IDs removed" + color: "#F2C038" + unmapped: + name: "Nb unmapped gene IDs" + color: "#E3224A" + plot_type: "barplot" + + total_gene_id_occurrence_quantiles: + section_name: "Distribution of gene ID occurrence quantiles" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "csv" pconfig: categories: true - headers: - skewness: - title: "Skewness" - description: | - Skewness of the distribution of the normalised expression - color: "#bf812d" + #ymax: 1.1 + #ymin: -0.1 + #y_lines: + # - value: 1 + # color: "#ff0000" + # width: 2 + # dash: "dash" + # label: "Threshold" + description: | + Quantiles of the total number of occurrences of gene IDs across all datasets. Quantile values were sorted from greatest to least. + plot_type: "linegraph" + helptext: | + Gene IDs can be present or absent in the datasets. For each gene ID, the total number of occurrences across all datasets was calculated and quantile values were computed from these totals. 
+ + eatlas_selected_experiments_metadata: + section_name: "Selected" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: | + Metadata of selected Expression Atlas datasets corresponding to the provided species (and optionally the provided keywords) + plot_type: "table" - uniform_distribution_probabilities: - section_name: "Probabilities of uniform count distribution" + eatlas_all_experiments_metadata: + section_name: "All datasets" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: | + Metadata of all Expression Atlas datasets corresponding to the provided species + plot_type: "table" + + eatlas_failure_reasons: + section_name: "Failure reasons" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" file_format: "csv" + no_violin: true description: | - Pvalue of Kolmogorov-Smirnov test to uniform distribution. - The higher the pvalue, the more likely the distribution is uniform. - If the pvalue < 0.05, the null hypothesis is rejected and the distribution is not uniform. - Samples showing a pvalue lower than the threshold (set in parameters) when not considered for stability scoring. - plot_type: "linegraph" - pconfig: - categories: true - logswitch: true - logswitch_active: true - logswitch_label: "Log10" - headers: - kolmogorov_smirnov_to_uniform_dist_pvalue: - title: "KS test to uniform distribution - pvalue" - description: | - Pvalue of Kolmogorov-Smirnov test to uniform distribution. - color: "#bf812d" + Reasons of failure during download of Expression Atlas datasets + plot_type: "table" - distribution_correlations: - section_name: "Correlation to mean count distribution" + eatlas_warning_reasons: + section_name: "Warnings" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" file_format: "csv" + no_violin: true description: | - For each sample, the correlation between the count distribution and the overall mean count distribution (including all samples) was computed. - This graph can help identify samples that are deviating from the mean count distribution. 
- plot_type: "linegraph" - pconfig: - categories: true - headers: - correlation: - title: "Pearson correlation" - description: | - Pearson correlation - color: "#bf812d" + Warnings during download of Expression Atlas datasets + plot_type: "table" + + geo_selected_experiments_metadata: + section_name: "Selected" + parent_id: geo + parent_name: "GEO dataset metadata" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: | + Metadata of selected GEO datasets corresponding to the provided species (and optionally the provided keywords) + plot_type: "table" + + geo_all_experiments_metadata: + section_name: "All datasets" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: | + Metadata of all GEO datasets corresponding to the provided species + plot_type: "table" + + geo_rejected_experiments_metadata: + section_name: "Rejected GEO datasets" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: | + Metadata of all GEO datasets which were rejected + plot_type: "table" + + geo_failure_reasons: + section_name: "Failure reasons" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: | + Reasons of failure during download of GEO datasets + plot_type: "table" + + geo_warning_reasons: + section_name: "Warnings" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: | + Warnings during download of GEO datasets + plot_type: "table" + + id_cleaning_failure_reasons: + section_name: "Gene ID cleaning failure reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: | + Reasons of failure during gene ID cleaning + plot_type: "table" + + renaming_warning_reasons: + section_name: "Gene renaming warning reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: | + Reasons of warning during gene ID renaming. 
You can further investigate ID mapping issues on the g:Profiler website at https://biit.cs.ut.ee/gprofiler/convert + plot_type: "table" + + renaming_failure_reasons: + section_name: "Gene renaming failure reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: | + Reasons of failure during gene ID renaming + plot_type: "table" + + normalisation_failure_reasons: + section_name: "Failure reasons" + parent_id: normalisation + parent_name: "Normalisation" + parent_description: "Information about the normalisation" + file_format: "tsv" + no_violin: true + description: | + Reasons of failure during normalisation (DESeq2 or edgeR) + plot_type: "table" + + normalisation_warning_reasons: + section_name: "Warning reasons" + parent_id: normalisation + parent_name: "Normalisation" + parent_description: "Information about the normalisation" + file_format: "tsv" + no_violin: true + description: | + Reasons of warning during normalisation (DESeq2 or edgeR) + plot_type: "table" #violin_downsample_after: 10000 log_filesize_limit: 10000000000 # 10GB sp: - ranked_top_stable_genes_summary: - fn: "*top_stable_genes_summary.csv" + ranked_most_stable_genes_summary: + fn: "*most_stable_genes_summary.csv" max_filesize: 5000000 # 5MB - expression_distributions_top_stable_genes: - fn: "*top_stable_genes_transposed_counts.csv" + expression_distributions_most_stable_genes: + fn: "*most_stable_genes_transposed_counts*.csv" max_filesize: 50000000 # 50MB gene_statistics: - fn: "*stats_all_genes.csv" + fn: "*all_genes_summary.csv" max_filesize: 50000000 # 50MB - gene_counts: - fn: "*gene_count_statistics.csv" + id_mapping_stats: + fn: "*id_mapping_stats.csv" + total_gene_id_occurrence_quantiles: + fn: "*total_gene_id_occurrence_quantiles.csv" skewness: - fn: "*skewness_statistics.csv" - uniform_distribution_probabilities: - fn: "*ks_test_statistics.csv" - distribution_correlations: - fn: "*distribution_correlations.csv" + fn: "*skewness.transposed.csv" + ratio_zeros: + fn: "*ratio_zeros.transposed.csv" + eatlas_selected_experiments_metadata: + fn: "*selected_experiments.metadata.tsv" + eatlas_all_experiments_metadata: + fn: "*species_experiments.metadata.tsv" + eatlas_failure_reasons: + fn: "*eatlas_failure_reasons.csv" + eatlas_warning_reasons: + fn: "*eatlas_warning_reasons.csv" + geo_selected_experiments_metadata: + fn: "*geo_selected_datasets.metadata.tsv" + geo_all_experiments_metadata: + fn: "*geo_all_datasets.metadata.tsv" + geo_rejected_experiments_metadata: + fn: "*geo_rejected_datasets.metadata.tsv" + geo_failure_reasons: + fn: "*geo_failure_reasons.csv" + geo_warning_reasons: + fn: "*geo_warning_reasons.csv" + id_cleaning_failure_reasons: + fn: "*id_cleaning_failure_reasons.tsv" + renaming_warning_reasons: + fn: "*renaming_warning_reasons.tsv" + renaming_failure_reasons: + fn: "*renaming_failure_reasons.tsv" + normalisation_failure_reasons: + fn: "*normalisation_failure_reasons.csv" + normalisation_warning_reasons: + fn: "*normalisation_warning_reasons.csv" diff --git a/assets/nf-core-stableexpression_logo_light.png b/assets/nf-core-stableexpression_logo_light.png index 4c4f1483..d1af4a9c 100644 Binary files a/assets/nf-core-stableexpression_logo_light.png and b/assets/nf-core-stableexpression_logo_light.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index c34b510a..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@
-sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_datasets.json b/assets/schema_datasets.json index 86aac3bf..fa320e10 100644 --- a/assets/schema_datasets.json +++ b/assets/schema_datasets.json @@ -11,14 +11,15 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.csv$", + "pattern": "^\\S+\\.(csv|tsv|dat)$", "errorMessage": "You must provide a count dataset file" }, "design": { "type": "string", "format": "file-path", + "schema": "assets/schema_design.json", "exists": true, - "pattern": "^\\S+\\.csv$", + "pattern": "^\\S+\\.(csv|tsv|dat)$", "errorMessage": "You must provide a design file", "meta": ["design"] }, diff --git a/assets/schema_design.json b/assets/schema_design.json index 9d3335bf..dc1e4b87 100644 --- a/assets/schema_design.json +++ b/assets/schema_design.json @@ -2,7 +2,7 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_design.json", "title": "nf-core/stableexpression pipeline - design schema", - "description": "Schema for the design file provided in the design column of the params.datasets CSV file", + "description": "Schema for the design file provided in the design column of the params.datasets CSV / TSV file", "type": "array", "items": { "type": "object", diff --git a/assets/schema_gene_id_mapping.json b/assets/schema_gene_id_mapping.json index f484ef45..fc537199 100644 --- a/assets/schema_gene_id_mapping.json +++ b/assets/schema_gene_id_mapping.json @@ -2,7 +2,7 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_id_mapping.json", "title": "nf-core/stableexpression pipeline - custom mappings schema", - "description": "Schema for the file provided with in the design column of the params.gene_id_mapping CSV file", + "description": "Schema for the file provided with the params.gene_id_mapping CSV / TSV file", "type": "array", "items": { "type": "object", @@ -12,12 +12,12 @@ "pattern": "^\\S+$", "errorMessage": "You must provide a column for original gene IDs." }, - "ensembl_gene_id": { + "gene_id": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "You must provide a column for mapped IDs (ensembl gene IDs)." + "errorMessage": "You must provide a column for mapped IDs." } }, - "required": ["original_gene_id", "ensembl_gene_id"] + "required": ["original_gene_id", "gene_id"] } } diff --git a/assets/schema_gene_length.json b/assets/schema_gene_length.json new file mode 100644 index 00000000..b395cea0 --- /dev/null +++ b/assets/schema_gene_length.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_length.json", + "title": "nf-core/stableexpression pipeline - gene length schema", + "description": "Schema for the file provided with the params.gene_length CSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for gene IDs." + }, + "length": { + "type": "integer", + "minimum": 0, + "errorMessage": "You must provide a column for gene lengths."
+ } + }, + "required": ["gene_id", "length"] + } +} diff --git a/assets/schema_gene_metadata.json b/assets/schema_gene_metadata.json index e03bcfce..d3faad8c 100644 --- a/assets/schema_gene_metadata.json +++ b/assets/schema_gene_metadata.json @@ -2,15 +2,15 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_metadata.json", "title": "nf-core/stableexpression pipeline - custom mappings schema", - "description": "Schema for the file provided with in the design column of the params.gene_metadata CSV file", + "description": "Schema for the file provided with the params.gene_metadata CSV / TSV file", "type": "array", "items": { "type": "object", "properties": { - "ensembl_gene_id": { + "gene_id": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "You must provide a column for mapped IDs (ensembl gene IDs)." + "errorMessage": "You must provide a column for mapped IDs." }, "name": { "type": "string", @@ -23,6 +23,6 @@ "errorMessage": "You must provide a column for gene descriptions." } }, - "required": ["ensembl_gene_id", "name", "description"] + "required": ["gene_id", "name", "description"] } } diff --git a/bin/aggregate_results.py b/bin/aggregate_results.py new file mode 100755 index 00000000..9fccdc9a --- /dev/null +++ b/bin/aggregate_results.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +ALL_GENE_SUMMARY_OUTFILENAME = "all_genes_summary.csv" +MOST_STABLE_GENE_SUMMARY_OUTFILENAME = "most_stable_genes_summary.csv" +ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME = "all_counts_filtered.parquet" +MOST_STABLE_GENES_COUNTS_OUTFILENAME = ( + "most_stable_genes_transposed_counts_filtered.csv" +) + +# nb of top stable genes to select and to display at the end +NB_MOST_STABLE_GENES = 1000 +# quantile intervals +NB_QUANTILES = 100 +NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS = 100 + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get statistics from count data for each gene" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--stats", + type=Path, + dest="stat_file", + required=True, + help="File containing statistics for all genes and stability scores by candidate genes", + ) + parser.add_argument( + "--platform-stats", + type=Path, + dest="platform_stat_files", + nargs="+", + help="File containing base statistics for all genes and for all datasets for a specific platform", + ) + parser.add_argument( + "--metadata", + type=str, + dest="metadata_files", + help="Metadata file", + ) + parser.add_argument( + "--mappings", type=str, dest="mapping_files", help="Mapping file" + ) + + return parser.parse_args() + + +def parse_stat_file(file: Path) -> pl.DataFrame: + return pl.read_csv(file).with_columns( + pl.col(config.GENE_ID_COLNAME).cast(pl.String()) + ) + + +def get_non_empty_dataframes(files: list[Path]) -> list[pl.DataFrame]: + dfs = [pl.read_csv(file) for file in files] + return [df for df in dfs if not df.is_empty()] 
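The helper `concat_cast_to_string_and_drop_duplicates()` defined just below exists because `pl.concat()` refuses to stack tables whose shared columns were inferred with different dtypes. A minimal illustration with made-up metadata:

```python
import polars as pl

# two metadata tables whose shared column got different inferred dtypes
a = pl.DataFrame({"dataset": ["E-MTAB-0001"], "n_samples": [12]})        # Int64
b = pl.DataFrame({"dataset": ["E-MTAB-0002"], "n_samples": ["unknown"]})  # String

# pl.concat([a, b]) would raise a SchemaError here; casting every column
# to String first makes the schemas compatible, then unique() de-duplicates
a, b = (df.select(pl.all().cast(pl.String)) for df in (a, b))
merged = pl.concat([a, b]).unique()
print(merged)
```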
+ + +def cast_cols_to_string(df: pl.DataFrame) -> pl.DataFrame: + return df.select( + [pl.col(column).cast(pl.String) for column in df.collect_schema().names()] + ) + + +def concat_cast_to_string_and_drop_duplicates(files: list[Path]) -> pl.DataFrame: + """Concatenate DataFrames, cast all columns to String, and drop duplicates. + + Each DataFrame is first cast to String so that all columns share the same + data type. The DataFrames are then concatenated and duplicate rows are dropped. + """ + dfs = get_non_empty_dataframes(files) + dfs = [cast_cols_to_string(df) for df in dfs] + concat_df = pl.concat(dfs) + # dropping duplicate rows + return concat_df.unique() + + +def cast_count_columns_to_float(df: pl.DataFrame) -> pl.DataFrame: + return df.select( + pl.col(config.GENE_ID_COLNAME), + pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64), + ) + + +def join_data_on_gene_id(stat_df: pl.DataFrame, *dfs: pl.DataFrame) -> pl.DataFrame: + """Merge the statistics dataframe with the given dataframes (metadata, mappings, platform statistics) on gene ID.""" + # the gene ID column of stat_df must already contain strings for these joins + for df in dfs: + stat_df = stat_df.join(df, on=config.GENE_ID_COLNAME, how="left") + return stat_df + + +def get_counts(file: Path) -> pl.DataFrame: + # sorting dataframe (necessary to get consistent output) + return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False) + + +def get_metadata(metadata_files: list[Path]) -> pl.DataFrame | None: + """Retrieve and concatenate metadata from a list of metadata files.""" + if not metadata_files: + return None + return concat_cast_to_string_and_drop_duplicates(metadata_files) + + +def get_mappings(mapping_files: list[Path]) -> pl.DataFrame | None: + if not mapping_files: + return None + concat_df = concat_cast_to_string_and_drop_duplicates(mapping_files) + # group by new gene ID and collect the list of original gene IDs + # convert the list column to a string representation + # separate the original gene IDs with a semicolon + return concat_df.group_by(config.GENE_ID_COLNAME).agg( + pl.col(config.ORIGINAL_GENE_ID_COLNAME) + .unique() + .sort() + .str.join(";") + .alias(config.ORIGINAL_GENE_IDS_COLNAME) + ) + + +def get_status(quantile_interval: int) -> str: + """Return the expression level status of the gene given its quantile interval.""" + if NB_QUANTILES - 5 <= quantile_interval: + return "Very high expression" + elif NB_QUANTILES - 10 <= quantile_interval < NB_QUANTILES - 5: + return "High expression" + elif 4 < quantile_interval <= 9: + return "Low expression" + elif quantile_interval <= 4: + return "Very low expression" + else: + return "Medium range" + + +def add_expression_level_status(df: pl.DataFrame) -> pl.DataFrame: + logger.info("Adding expression level status") + mapping_dict = { + quantile_interval: get_status(quantile_interval) + for quantile_interval in range(NB_QUANTILES) + } + return df.with_columns( + pl.col(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME) + .replace_strict(mapping_dict) + .alias(config.EXPRESSION_LEVEL_STATUS_COLNAME) + ) + + +def get_all_genes_summary( + stat_summary_df: pl.DataFrame, *dfs: pl.DataFrame +) -> pl.DataFrame: + """ + Build the summary dataframe for all genes.
+ """ + # add gene name, description and original gene IDs to statistics summary + stat_summary_df = join_data_on_gene_id(stat_summary_df, *dfs) + stat_summary_df = add_expression_level_status(stat_summary_df) + return stat_summary_df + + +def get_most_stable_genes_counts( + log_count_df: pl.DataFrame, stat_summary_df: pl.DataFrame +) -> pl.DataFrame: + # getting list of top stable genes with their order + top_genes_with_order = ( + stat_summary_df.head(NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS) + .select(config.GENE_ID_COLNAME) + .with_row_index("sort_order") + ) + + # join to get only existing genes and maintain order + sorted_transposed_counts_df = log_count_df.join( + top_genes_with_order, on=config.GENE_ID_COLNAME, how="inner" + ).sort("sort_order", descending=False) + + # get the actual gene names that were found (in order) + actual_gene_names = ( + sorted_transposed_counts_df.select(config.GENE_ID_COLNAME).to_series().to_list() + ) + return sorted_transposed_counts_df.drop( + ["sort_order", config.GENE_ID_COLNAME] + ).transpose(column_names=actual_gene_names) + + +def export_data( + all_genes_summary_df: pl.DataFrame, + most_stable_genes_summary_df: pl.DataFrame, + all_counts_df: pl.DataFrame, + most_stable_genes_counts_df: pl.DataFrame, +): + """Export gene expression data to CSV files.""" + logger.info(f"Exporting statistics of all genes to: {ALL_GENE_SUMMARY_OUTFILENAME}") + all_genes_summary_df.write_csv( + ALL_GENE_SUMMARY_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION + ) + + logger.info( + f"Exporting statistics of the top stable genes to: {MOST_STABLE_GENE_SUMMARY_OUTFILENAME}" + ) + most_stable_genes_summary_df.write_csv( + MOST_STABLE_GENE_SUMMARY_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION + ) + + logger.info(f"Exporting all counts to: {ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME}") + all_counts_df.write_parquet(ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME) + + logger.info( + f"Exporting counts of the top stable genes to: {MOST_STABLE_GENES_COUNTS_OUTFILENAME}" + ) + most_stable_genes_counts_df.write_csv( + MOST_STABLE_GENES_COUNTS_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION + ) + + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + metadata_files = ( + [Path(file) for file in args.metadata_files.split(" ")] + if args.metadata_files is not None + else [] + ) + mapping_files = ( + [Path(file) for file in args.mapping_files.split(" ")] + if args.mapping_files is not None + else [] + ) + + count_df = get_counts(args.count_file) + + # getting data, including metadata and mappings + all_genes_stat_summary_df = parse_stat_file(args.stat_file) + + platform_datasets_stat_dfs = [ + parse_stat_file(file) for file in args.platform_stat_files if file is not None + ] + + metadata_df = get_metadata(metadata_files) + mapping_df = get_mappings(mapping_files) + optional_dfs = [df for df in [metadata_df, mapping_df] if df is not None] + + additional_data_dfs = optional_dfs + platform_datasets_stat_dfs + all_genes_summary_df = get_all_genes_summary( + all_genes_stat_summary_df, *additional_data_dfs + ) + + top_stable_stat_summary_df = all_genes_summary_df.head(NB_MOST_STABLE_GENES) + + # reducing dataframe size (it is only used for plotting by MultiQC) + count_df = cast_count_columns_to_float(count_df) + most_stable_genes_counts_df = 
get_most_stable_genes_counts( + count_df, top_stable_stat_summary_df + ) + + # exporting computed data + export_data( + all_genes_summary_df, + top_stable_stat_summary_df, + count_df, + most_stable_genes_counts_df, + ) + + +if __name__ == "__main__": + main() diff --git a/bin/clean_gene_ids.py b/bin/clean_gene_ids.py new file mode 100755 index 00000000..7c5c802e --- /dev/null +++ b/bin/clean_gene_ids.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +CLEANED_COUNTS_SUFFIX = ".cleaned.parquet" +CLEANED_GENE_IDS_SUFFIX = ".cleaned_gene_ids.txt" + +FAILURE_REASON_FILE = "failure_reason.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Rename gene IDs using mapped IDs") + parser.add_argument( + "--count-file", type=Path, required=True, help="Input file containing counts" + ) + return parser.parse_args() + + +def clean_ensembl_gene_id_versioning(df: pl.DataFrame): + """ + Clean Ensembl gene IDs by removing version numbers. + Remove the dot and the numbers after it in IDs like ENSG00000000003.17 + """ + return df.with_columns( + pl.when(pl.col(config.GENE_ID_COLNAME).str.starts_with("ENSG")) + .then(pl.col(config.GENE_ID_COLNAME).str.extract(r"^(ENSG[a-zA-Z0-9]+)", 1)) + .otherwise(pl.col(config.GENE_ID_COLNAME)) + .alias(config.GENE_ID_COLNAME) + ) + + +def clean_mirna_ids(df: pl.DataFrame): + """ + Clean miRNA IDs by removing the 5p / 3p identifier. 
+ """ + return df.with_columns( + pl.when(pl.col(config.GENE_ID_COLNAME).str.contains(r"-[53]p$")) + .then(pl.col(config.GENE_ID_COLNAME).str.extract(r"^(.*?)-[53]p$")) + .otherwise(pl.col(config.GENE_ID_COLNAME)) + .alias(config.GENE_ID_COLNAME) + ) + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + logger.info(f"Converting IDs for count file {args.count_file.name}...") + + ############################################################# + # PARSING FILES + ############################################################# + + df = parse_count_table(args.count_file) + + if df.is_empty(): + msg = "COUNT FILE IS EMPTY" + logger.warning(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + try: + df = clean_ensembl_gene_id_versioning(df) + df = clean_mirna_ids(df) + except Exception as e: + msg = f"ERROR CLEANING IDS in count file {args.count_file.name}: {e}" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + ############################################################# + # WRITING RESULTS + ############################################################# + + logger.info("Writing cleaned IDs") + gene_ids_outfile = args.count_file.with_name( + args.count_file.stem + CLEANED_GENE_IDS_SUFFIX + ) + gene_ids = ( + df.select(config.GENE_ID_COLNAME) + .sort(config.GENE_ID_COLNAME) + .to_series() + .to_list() + ) + + with open(gene_ids_outfile, "w") as fout: + fout.write("\n".join(gene_ids)) + + logger.info("Writing count file with cleaned IDs") + count_outfile = args.count_file.with_name( + args.count_file.stem + CLEANED_COUNTS_SUFFIX + ) + df.write_parquet(count_outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/collect_gene_ids.py b/bin/collect_gene_ids.py new file mode 100755 index 00000000..d4531444 --- /dev/null +++ b/bin/collect_gene_ids.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from collections import Counter +from pathlib import Path + +import config +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +UNIQUE_GENE_IDS_OUTFILE = "unique_gene_ids.txt" +GENE_ID_OCCURRENCES_OUTFILE = "gene_id_occurrences.csv" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Collect gene IDs from count files") + parser.add_argument( + "--ids", type=str, dest="gene_id_files", required=True, help="Gene ID files" + ) + return parser.parse_args() + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + gene_id_files = [Path(file) for file in args.gene_id_files.split(" ")] + logger.info(f"Getting gene IDs from {len(gene_id_files)} files") + + unique_gene_ids = set() + counter = Counter() + for gene_id_file in tqdm(gene_id_files): + with open(gene_id_file, "r") as fin: + gene_ids = [line.strip() for line in fin] + unique_gene_ids.update(gene_ids) + counter.update(gene_ids) + + with open(UNIQUE_GENE_IDS_OUTFILE, "w") as fout: + fout.write("\n".join([str(gene_id) for gene_id in sorted(unique_gene_ids)])) + + with open(GENE_ID_OCCURRENCES_OUTFILE, "w") as fout: + fout.write( + f"{config.ORIGINAL_GENE_ID_COLNAME},{config.GENE_ID_COUNT_COLNAME}\n" + ) + for gene_id, count in sorted(counter.items()): + fout.write(f"{gene_id},{count}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/collect_statistics.py b/bin/collect_statistics.py new file mode 100755 index 00000000..8a185864 --- /dev/null +++ b/bin/collect_statistics.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import logging +import sys +from pathlib import Path + +import pandas as pd + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def main(): + file = Path(sys.argv[1]) + + logger.info("Collecting statistics...") + # parsing file manually because it's not a standard CSV format + with open(file, "r") as f: + lines = f.readlines() + data = [line.strip().split(",") for line in lines] + + # getting max number of columns + max_nb_cols = max(len(row) for row in data) + # fill missing values with None + for row in data: + row += [None] * (max_nb_cols - len(row)) + + df = pd.DataFrame(data) + # the first item is the dataset name + df.set_index(df.columns[0], inplace=True) + + outfile = file.name.replace(".csv", ".transposed.csv") + logger.info(f"Saving statistics to {outfile}") + df.T.to_csv(outfile, index=False, header=True) + + +if __name__ == "__main__": + main() diff --git a/bin/common.py b/bin/common.py new file mode 100644 index 00000000..8c875fd5 --- /dev/null +++ b/bin/common.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
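`bin/collect_statistics.py` above pads ragged rows before transposing; with illustrative data, the transformation looks like this:

```python
import pandas as pd

# one row per dataset, variable number of per-sample values (illustrative)
rows = [
    ["E-MTAB-0001", "0.12", "0.30", "0.25"],
    ["E-MTAB-0002", "0.40", "0.55"],
]
max_nb_cols = max(len(row) for row in rows)
rows = [row + [None] * (max_nb_cols - len(row)) for row in rows]

df = pd.DataFrame(rows).set_index(0)
# transposed: one column per dataset, as expected by the MultiQC boxplots
print(df.T)
```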
+ +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_header(file: Path, sep: str): + with open(file, "r") as fin: + header = fin.readline().strip().split(sep) + first_row = fin.readline().strip().split(sep) + if len(header) == len(first_row): + return header + elif len(header) == len(first_row) - 1: + return [config.GENE_ID_COLNAME] + header + else: + raise ValueError( + f"Header has length: {len(header)} while first row has length: {len(first_row)}" + ) + + +def parse_table(file: Path): + # parsing header first + if file.suffix in [".csv", ".tsv"]: + # parsing header manually + sep = "," if file.suffix == ".csv" else "\t" + header = parse_header(file, sep) + return pl.read_csv( + file, separator=sep, has_header=False, skip_rows=1, new_columns=header + ) + elif file.suffix == ".parquet": + return pl.read_parquet(file) + else: + raise ValueError(f"Unsupported file format: {file.suffix}") + + +def parse_count_table(file: Path): + df = parse_table(file) + first_col = df.columns[0] + # whatever the name of the first col, rename it to "gene_id" + return df.rename({first_col: config.GENE_ID_COLNAME}).select( + pl.col(config.GENE_ID_COLNAME).cast(pl.String()), + pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()), + ) + + +def compute_log2(df: pl.DataFrame) -> pl.DataFrame: + """ + Compute log2 values. + """ + return df.select( + pl.col(config.GENE_ID_COLNAME), + (pl.exclude(config.GENE_ID_COLNAME) + 1).log(base=2), + ) + + +def export_parquet(df: pl.DataFrame, count_file: Path, suffix: str): + outfilename = count_file.with_suffix(suffix).name + logger.info(f"Exporting processed counts to: {outfilename}") + df.write_parquet(outfilename) diff --git a/bin/compute_base_statistics.py b/bin/compute_base_statistics.py new file mode 100755 index 00000000..cf83d196 --- /dev/null +++ b/bin/compute_base_statistics.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
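`parse_header()` in `bin/common.py` above accepts the R `write.csv` convention, where the header row has one field fewer than the data rows. A sketch of the two layouts it handles (file contents are illustrative):

```python
from pathlib import Path

# layout 1: header is one field shorter than the rows (unnamed gene ID column),
# so parse_header() prepends the gene ID column name
Path("r_style.csv").write_text("sample1,sample2\ngeneA,1,2\ngeneB,3,4\n")

# layout 2: header and rows have the same length, used as-is
Path("plain.csv").write_text("gene_id,sample1,sample2\ngeneA,1,2\ngeneB,3,4\n")

# in both cases parse_count_table() then yields the columns
#   gene_id (String), sample1 (Float64), sample2 (Float64)
```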
+ +import argparse +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import ClassVar + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +ALL_GENES_RESULT_OUTFILE_SUFFIX = "stats_all_genes.csv" + +RCV_MULTIPLIER = 1.4826 # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/ + + +############################################################################ +# POLARS EXTENSIONS +############################################################################ + + +@pl.api.register_expr_namespace("row") +class StatsExtension: + def __init__(self, expr: pl.Expr): + self._expr = expr + + def not_null_values(self): + return self._expr.list.drop_nulls().list + + def mean(self) -> pl.Expr: + """Mean over non-null values in the row""" + return self.not_null_values().mean() + + def std(self) -> pl.Expr: + """Std over non-null values in the row""" + return self.not_null_values().std() + + def median(self) -> pl.Expr: + """Median over non-null values in the row""" + return self.not_null_values().median() + + def mad(self) -> pl.Expr: + """Median Absolute Deviation over non-null values in the row""" + return ( + self.not_null_values() + .eval( + (pl.element() - pl.element().median()).abs().median() + ) # returns a list with one element + .list.first() + ) + + +@dataclass +class GeneStatistician: + # we want to flag samples that show a particularly low number of genes + MIN_RATIO_GENE_COUNT_TO_MEAN: ClassVar[float] = 0.75 # experimentally chosen + # quantile intervals + NB_QUANTILES: ClassVar[int] = 100 + + count_df: pl.DataFrame + platform: str | None = field(default=None) + + gene_count_per_sample_df: pl.DataFrame = field(init=False) + stat_df: pl.DataFrame = field(init=False) + samples: list[str] = field(init=False) + samples_with_low_gene_count: list[str] = field(init=False) + + def __post_init__(self): + self.gene_count_per_sample_df = self.get_gene_counts_per_sample() + self.samples = [ + col for col in self.count_df.columns if col != config.GENE_ID_COLNAME + ] + self.samples_with_low_gene_count = self.get_samples_with_low_gene_count() + + def get_colname(self, colname: str) -> str: + return f"{self.platform}_{colname}" if self.platform else colname + + def get_valid_counts(self) -> pl.DataFrame: + return self.count_df.select(pl.exclude(config.GENE_ID_COLNAME)) + + def get_gene_counts_per_sample(self) -> pl.DataFrame: + """ + Get the number of non-null values per sample. + :return: + A polars dataframe containing 2 columns: + - sample: name of the sample + - count: number of non-null values + """ + return ( + self.count_df.select(pl.exclude(config.GENE_ID_COLNAME)) + .count() + .transpose( + include_header=True, header_name="sample", column_names=["count"] + ) + ) + + def get_samples_with_low_gene_count(self) -> list[str]: + mean_gene_count = self.gene_count_per_sample_df[ + config.GENE_COUNT_COLNAME + ].mean() + return ( + self.gene_count_per_sample_df.filter( + (pl.col(config.GENE_COUNT_COLNAME) / mean_gene_count) + < self.MIN_RATIO_GENE_COUNT_TO_MEAN + ) + .select(config.SAMPLE_COLNAME) + .to_series() + .to_list() + ) + + def get_main_statistics(self) -> pl.DataFrame: + """ + Compute count descriptive statistics for each gene in the count dataframe.
+ """ + logger.info("Getting descriptive statistics") + # computing main stats + augmented_count_df = self.count_df.with_columns( + mean=pl.concat_list(self.samples).row.mean(), + std=pl.concat_list(self.samples).row.std(), + median=pl.concat_list(self.samples).row.median(), + mad=pl.concat_list(self.samples).row.mad(), + ) + + return augmented_count_df.select( + pl.col(config.GENE_ID_COLNAME), + pl.col("mean").alias(self.get_colname(config.MEAN_COLNAME)), + pl.col("std").alias(self.get_colname(config.STANDARD_DEVIATION_COLNAME)), + pl.col("median").alias(self.get_colname(config.MEDIAN_COLNAME)), + pl.col("mad").alias(self.get_colname(config.MAD_COLNAME)), + (pl.col("std") / pl.col("mean")).alias( + self.get_colname(config.VARIATION_COEFFICIENT_COLNAME) + ), + (pl.col("mad") / pl.col("median") * RCV_MULTIFILER).alias( + self.get_colname(config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME) + ), + ) + + def compute_ratios_null_values(self): + # the samples showing a low gene count will not be taken into account for the zero count penalty + valid_samples = [ + sample + for sample in self.samples + if sample not in self.samples_with_low_gene_count + ] + + nb_nulls = self.count_df.select( + pl.exclude(config.GENE_ID_COLNAME).is_null() + ).sum_horizontal() + nb_nulls_valid_samples = self.count_df.select( + pl.col(valid_samples).is_null() + ).sum_horizontal() + + self.stat_df = self.stat_df.with_columns( + (nb_nulls / len(self.samples)).alias( + self.get_colname(config.RATIO_NULLS_COLNAME) + ), + (nb_nulls_valid_samples / len(valid_samples)).alias( + self.get_colname(config.RATIO_NULLS_VALID_SAMPLES_COLNAME) + ), + ) + + def compute_ratio_zeros(self): + nb_zeros = self.count_df.select( + pl.exclude(config.GENE_ID_COLNAME) == 0 + ).sum_horizontal() + + self.stat_df = self.stat_df.with_columns( + (nb_zeros / len(self.samples)).alias( + self.get_colname(config.RATIO_ZEROS_COLNAME) + ), + ) + + def get_quantile_intervals(self): + """ + Compute the quantile intervals for the mean expression levels of each gene in the dataframe. + + The function assigns to each gene a quantile interval of its mean cpm compared to all genes. 
+ """ + logger.info("Getting cpm quantiles") + mean_colname = self.get_colname(config.MEAN_COLNAME) + self.stat_df = self.stat_df.with_columns( + ( + pl.col(mean_colname).rank() + / pl.col(mean_colname).count() + * self.NB_QUANTILES + ) + .floor() + .cast(pl.Int8) + # we want the only value = NB_QUANTILES to be NB_QUANTILES - 1 + # because the last quantile interval is [NB_QUANTILES - 1, NB_QUANTILES] + .replace({self.NB_QUANTILES: self.NB_QUANTILES - 1}) + .alias(self.get_colname(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME)) + ) + + def compute_statistics(self) -> pl.DataFrame: + logger.info("Computing statistics and stability score") + # getting expression statistics + self.stat_df = self.get_main_statistics() + # adding column for nb of null values for each gene + self.compute_ratios_null_values() + # adding a column for the frequency of zero values + self.compute_ratio_zeros() + # getting quantile intervals + self.get_quantile_intervals() + return self.stat_df + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get base statistics from count data for each gene" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument("--platform", type=str, help="Platform name") + return parser.parse_args() + + +def get_counts(file: Path) -> pl.DataFrame: + # sorting dataframe (necessary to get consistent output) + return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False) + + +def export_data(stat_df: pl.DataFrame, platform: str | None): + """Export gene expression data to CSV files.""" + outfile = ( + f"{platform}.{ALL_GENES_RESULT_OUTFILE_SUFFIX}" + if platform + else ALL_GENES_RESULT_OUTFILE_SUFFIX + ) + logger.info(f"Exporting statistics for all genes to: {outfile}") + stat_df.write_csv(outfile, float_precision=config.CSV_FLOAT_PRECISION) + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # putting all counts into a single dataframe + logger.info("Loading count data...") + count_df = get_counts(args.count_file) + logger.info( + f"Loaded count data with {count_df.shape[0]} rows and {count_df.shape[1]} columns" + ) + + # computing statistics (mean, standard deviation, coefficient of variation, quantiles) + gene_stat = GeneStatistician(count_df, args.platform) + stat_df = gene_stat.compute_statistics() + + # exporting computed data + export_data(stat_df, args.platform) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_cpm.py b/bin/compute_cpm.py new file mode 100755 index 00000000..b2548896 --- /dev/null +++ b/bin/compute_cpm.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import compute_log2, export_parquet, parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +OUTFILE_SUFFIX = ".cpm.parquet" + +WARNING_REASON_FILE = "warning_reason.txt" +FAILURE_REASON_FILE = "failure_reason.txt" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Normalise data to CPM") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + return parser.parse_args() + + +def calculate_cpm(df: pl.DataFrame) -> pl.DataFrame: + """ + Calculate CPM (Counts Per Million) from raw count data. + + Parameters: + ----------- + counts_df : polars.DataFrame + DataFrame with genes as rows and samples as columns + + Returns: + -------- + cpm_df : polars.DataFrame + DataFrame with CPM values + """ + # Calculate total counts per sample (column sums) + sums = df.select(pl.exclude(config.GENE_ID_COLNAME).sum()) + + # Calculate CPM: (count / total_counts) * 1,000,000 + count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + return df.select( + [pl.col(config.GENE_ID_COLNAME)] + + [(pl.col(col) / sums[col][0] * 1e6).alias(col) for col in count_columns] + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + logger.info("Parsing data") + + try: + count_df = parse_count_table(args.count_file) + + logger.info(f"Normalising {args.count_file.name}") + count_df = calculate_cpm(count_df) + + logger.info("Computing log2 values") + count_df = compute_log2(count_df) + + export_parquet(count_df, args.count_file, OUTFILE_SUFFIX) + + except Exception as e: + logger.error(f"Error occurred while normalising data: {e}") + msg = "UNEXPECTED ERROR" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_dataset_statistics.py b/bin/compute_dataset_statistics.py new file mode 100755 index 00000000..985099f7 --- /dev/null +++ b/bin/compute_dataset_statistics.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
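A worked example of the CPM + log2 transform implemented in `bin/compute_cpm.py` above (counts are illustrative):

```python
import polars as pl

df = pl.DataFrame({"gene_id": ["g1", "g2"], "s1": [90.0, 10.0]})

# CPM: count / per-sample total * 1e6  ->  g1: 900000, g2: 100000
sums = df.select(pl.exclude("gene_id").sum())
cpm = df.select(
    pl.col("gene_id"),
    *[(pl.col(col) / sums[col][0] * 1e6).alias(col) for col in ["s1"]],
)

# log2(cpm + 1), as compute_log2() in bin/common.py does
log_cpm = cpm.select(pl.col("gene_id"), (pl.exclude("gene_id") + 1).log(base=2))
print(log_cpm)  # roughly 19.78 and 16.61
```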
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +KEY_TO_OUTFILE = {"skewness": "skewness.txt", "ratio_zeros": "ratio_zeros.txt"} + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Compute general statistics from count data for each sample" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + return parser.parse_args() + + +def compute_dataset_statistics(df: pl.DataFrame) -> dict: + # sample count skewness + skewness = df.select(pl.exclude(config.GENE_ID_COLNAME).skew()).row(0) + # sample count ratio of zeros + ratio_zeros = df.select( + pl.exclude(config.GENE_ID_COLNAME).eq(pl.lit(0)).sum() / len(df) + ).row(0) + return dict(skewness=list(skewness), ratio_zeros=list(ratio_zeros)) + + +def export_count_data(stats: dict): + """ + Export dataset statistics to CSV files. + Write each statistic to a separate file, on a single row + """ + for key, outfile_name in KEY_TO_OUTFILE.items(): + logger.info(f"Exporting dataset statistics {key} to: {outfile_name}") + with open(outfile_name, "w") as outfile: + outfile.write(",".join([str(val) for val in stats[key]])) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + count_file = args.count_file + + logger.info(f"Computing dataset statistics for {count_file.name}") + count_df = parse_count_table(count_file) + + stat_dict = compute_dataset_statistics(count_df) + + export_count_data(stat_dict) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_gene_transcript_lengths.py b/bin/compute_gene_transcript_lengths.py new file mode 100755 index 00000000..2b0e3b1d --- /dev/null +++ b/bin/compute_gene_transcript_lengths.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
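+# Length model used below: a transcript's length is the sum of its exon
+# lengths (end - start + 1) and a gene's length is that of its longest
+# transcript. Illustrative example: exons spanning 100-199 and 300-449 give a
+# transcript length of 100 + 150 = 250.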
+
+import argparse
+import logging
+from pathlib import Path
+
+import config
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+OUTFILE = "gene_transcript_lengths.csv"
+
+GFF_COLUMNS = [
+    "chromosome",
+    "source",
+    "feature",
+    "start",
+    "end",
+    "score",
+    "strand",
+    "phase",
+    "attributes",
+]
+
+DTYPES = {
+    "chromosome": str,
+    "source": str,
+    "feature": str,
+    "start": int,
+    "end": int,
+    "score": str,
+    "strand": str,
+    "phase": str,
+    "attributes": str,
+}
+
+
+##################################################################
+##################################################################
+# FUNCTIONS
+##################################################################
+##################################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Get cDNA lengths from GFF3 annotation file"
+    )
+    parser.add_argument(
+        "--annotation",
+        type=Path,
+        dest="annotation_file",
+        required=True,
+        help="Annotation file in GFF3 format",
+    )
+    return parser.parse_args()
+
+
+def parse_gff3_file(annotation_file: Path):
+    return pd.read_csv(
+        annotation_file,
+        sep="\t",
+        names=GFF_COLUMNS,
+        dtype=DTYPES,
+        comment="#",
+        on_bad_lines="warn",
+    )
+
+
+def compute_transcript_lengths(df: pd.DataFrame):
+    exon_df = df.loc[df["feature"] == "exon"].copy()
+    # extract transcript ID from attributes column for each exon
+    exon_df["transcript_id"] = exon_df["attributes"].str.extract(
+        r"Parent=transcript:([^;]+)"
+    )
+    # compute transcript length
+    exon_df[config.CDNA_LENGTH_COLNAME] = exon_df["end"] - exon_df["start"] + 1
+    exon_df = exon_df[["transcript_id", config.CDNA_LENGTH_COLNAME]]
+    return exon_df.groupby("transcript_id", as_index=False).agg(
+        {config.CDNA_LENGTH_COLNAME: "sum"}
+    )
+
+
+def compute_max_transcript_lengths_per_gene(
+    df: pd.DataFrame, transcript_lengths_df: pd.DataFrame
+):
+    rna_cols = [
+        feature
+        for feature in df["feature"].unique()
+        if "RNA" in feature and "gene" not in feature
+    ]
+    rna_df = df.loc[df["feature"].isin(rna_cols)].copy()
+
+    # extract gene ID from attributes column for each transcript
+    rna_df[config.GENE_ID_COLNAME] = rna_df["attributes"].str.extract(
+        r"Parent=gene:([^;]+)"
+    )
+    # extract transcript ID from attributes column
+    rna_df["transcript_id"] = rna_df["attributes"].str.extract(r"ID=transcript:([^;]+)")
+
+    # merge with transcript lengths dataframe to get length
+    merged_df = rna_df.merge(transcript_lengths_df, how="left", on="transcript_id")
+    # a left join keeps every transcript, so count those that actually got a length
+    nb_with_length = merged_df[config.CDNA_LENGTH_COLNAME].notna().sum()
+    logger.info(
+        f"Got length for {nb_with_length / len(rna_df) * 100:.2f}% of transcripts"
+    )
+    # compute max transcript length per gene
+    merged_df = merged_df[[config.GENE_ID_COLNAME, config.CDNA_LENGTH_COLNAME]]
+    return merged_df.groupby(config.GENE_ID_COLNAME, as_index=False).agg(
+        {config.CDNA_LENGTH_COLNAME: "max"}
+    )
+
+
+##################################################################
+##################################################################
+# MAIN
+##################################################################
+##################################################################
+
+
+def main():
+    args = parse_args()
+
+    logger.info("Parsing annotation file")
+    df = parse_gff3_file(args.annotation_file)
+
+    logger.info("Computing transcript lengths")
+    transcript_lengths_df = compute_transcript_lengths(df)
+
+    logger.info("Getting max transcript length per gene")
+    gene_length_df = compute_max_transcript_lengths_per_gene(df, transcript_lengths_df)
+
+    logger.info(f"Writing to {OUTFILE}")
+    gene_length_df.to_csv(OUTFILE, index=False, header=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/compute_m_measures.py b/bin/compute_m_measures.py
new file mode 100755
index 00000000..bc1d783e
--- /dev/null
+++ b/bin/compute_m_measures.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+
+import argparse
+import logging
+from pathlib import Path
+
+import config
+import polars as pl
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+M_MEASURE_OUTFILE_NAME = "m_measures.csv"
+
+DEFAULT_CHUNKSIZE = 300
+NB_GENE_ID_CHUNK_FOLDERS = 100
+
+
+#####################################################
+#####################################################
+# FUNCTIONS
+#####################################################
+#####################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Compute M-measure for each gene")
+    parser.add_argument(
+        "--counts",
+        type=Path,
+        dest="count_file",
+        required=True,
+        help="Count file",
+    )
+    parser.add_argument(
+        "--std-files",
+        type=str,
+        dest="std_files",
+        required=True,
+        help="Space-separated list of files containing std of log expression ratios",
+    )
+    parser.add_argument(
+        "--task-attempts",
+        dest="task_attempts",
+        type=int,
+        default=1,
+        help="Number of task attempts",
+    )
+    return parser.parse_args()
+
+
+def get_nb_rows(lf: pl.LazyFrame):
+    return lf.select(pl.len()).collect().item()
+
+
+def concat_all_std_data(files: list[Path], low_memory: bool) -> pl.LazyFrame:
+    lfs = [pl.scan_parquet(file, low_memory=low_memory) for file in files]
+    lf = pl.concat(lfs)
+    return (
+        lf.explode(config.RATIOS_STD_COLNAME)
+        .group_by(config.GENE_ID_COLNAME)
+        .agg(pl.col(config.RATIOS_STD_COLNAME))
+    )
+
+
+def compute_m_measures(lf: pl.LazyFrame) -> pl.LazyFrame:
+    return lf.select(
+        pl.col(config.GENE_ID_COLNAME),
+        (
+            pl.col(config.RATIOS_STD_COLNAME).list.sum()
+            / (pl.col(config.RATIOS_STD_COLNAME).list.len() - 1)
+        ).alias(config.GENORM_M_MEASURE_COLNAME),
+    )
+
+
+def get_chunks(lst: list, chunksize: int):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), chunksize):
+        yield lst[i : i + chunksize]
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+def main():
+    args = parse_args()
+
+    low_memory = args.task_attempts > 1
+    files = [Path(file) for file in args.std_files.split(" ")]
+
+    logger.info("Getting list of gene IDs")
+    count_lf = pl.scan_parquet(args.count_file, low_memory=low_memory)
+
+    #############################################################################
+    # MAKING A FOLDER FOR EACH CHUNK OF GENE IDS
+    #############################################################################
+    gene_ids = count_lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list()
+    gene_ids = sorted(gene_ids)
+
+    chunksize = max(
+        1, int(len(gene_ids) / NB_GENE_ID_CHUNK_FOLDERS)
+    )  # 1 if len(gene_ids) < NB_GENE_ID_CHUNK_FOLDERS
+    gene_id_list_chunks = list(get_chunks(gene_ids, chunksize=chunksize))
+
+    gene_id_chunk_folders = []
+    for i in range(len(gene_id_list_chunks)):
+        gene_id_chunk_folder = Path(f"gene_ids_{i}")
+        gene_id_chunk_folder.mkdir(exist_ok=True)
+
gene_id_chunk_folders.append(gene_id_chunk_folder) + + ############################################################################# + # EXPORTING GENE DATA TO THEIR RESPECTIVE CHUNK FOLDER + ############################################################################# + # progressively decreasing the chunksize if OOM + chunksize = int(DEFAULT_CHUNKSIZE / args.task_attempts) + chunk_files_list = [ + files[i : i + chunksize] for i in range(0, len(files), chunksize) + ] + + logger.info("Parsing std data by chunks") + for i, chunk_files in enumerate(chunk_files_list): + # parsing files and making a first list concatenation + concat_lf = concat_all_std_data(chunk_files, low_memory) + + # looping through each group of gene IDs + for j, (gene_id_list_chunk, gene_id_chunk_folder) in enumerate( + zip(gene_id_list_chunks, gene_id_chunk_folders) + ): + # writing all data corresponding to this group of gene IDs in a specific folder + outfile = gene_id_chunk_folder / f"chunk.{i}.parquet" + concat_df = concat_lf.filter( + pl.col(config.GENE_ID_COLNAME).is_in(gene_id_list_chunk) + ).collect() + concat_df.write_parquet(outfile) + + ############################################################################# + # GATHERING ALL DATA CHUNK BY CHUNK AND COMPUTING M MEASURE FOR EACH GENE + ############################################################################# + computed_genes = 0 + nb_ratios_per_gene = set() + logger.info( + "Concatenating all std data by chunk of gene IDs and computing M measures" + ) + with open(M_MEASURE_OUTFILE_NAME, "a") as fout: + for i, gene_id_chunk_folder in enumerate(gene_id_chunk_folders): + chunk_files = list(gene_id_chunk_folder.iterdir()) + + concat_lf = concat_all_std_data(chunk_files, low_memory).sort( + config.GENE_ID_COLNAME + ) + + # computing M measures for these gene IDs + m_measure_lf = compute_m_measures(concat_lf) + m_measure_df = m_measure_lf.collect() + + ################################################# + # checks + ################################################# + if m_measure_df[config.GENE_ID_COLNAME].is_duplicated().any(): + raise ValueError("Duplicate values found for gene IDs!") + + process_gene_ids = sorted( + m_measure_df.select(config.GENE_ID_COLNAME).to_series().to_list() + ) + if process_gene_ids != gene_id_list_chunks[i]: + raise ValueError("Incorrect gene IDs found!") + + computed_genes += len(m_measure_df) + + unique_nb_ratios = ( + concat_lf.with_columns( + pl.col(config.RATIOS_STD_COLNAME).list.len().alias("length") + ) + .select("length") + .unique() + .collect() + .to_series() + .to_list() + ) + nb_ratios_per_gene.update(unique_nb_ratios) + + ################################################# + ################################################# + + # appending to output file + if i == 0: + m_measure_df.write_csv( + fout, + include_header=True, + float_precision=config.CSV_FLOAT_PRECISION, + ) + else: + m_measure_df.write_csv( + fout, + include_header=False, + float_precision=config.CSV_FLOAT_PRECISION, + ) + + logger.info(f"Number of gene IDs: {len(gene_ids)}") + logger.info(f"Number of computed genes: {computed_genes}") + if computed_genes != len(gene_ids): + raise ValueError( + f"Number of computed genes: {computed_genes} != number of gene IDs: {len(gene_ids)}" + ) + + if len(nb_ratios_per_gene) > 1: + logger.warning( + f"Got multiple number of std ratios to compute: {list(nb_ratios_per_gene)}" + ) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_stability_scores.py b/bin/compute_stability_scores.py new file mode 100755 
index 00000000..0dd75cde
--- /dev/null
+++ b/bin/compute_stability_scores.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+
+import argparse
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import ClassVar
+
+import config
+import polars as pl
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# outfile names
+STATISTICS_WITH_SCORES_OUTFILENAME = "stats_with_scores.csv"
+
+
+@dataclass
+class StabilityScorer:
+    N_QUANTILES: ClassVar[int] = 1000
+
+    WEIGHT_FIELDS: ClassVar[list[str]] = [
+        config.VARIATION_COEFFICIENT_COLNAME,
+        config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME,
+        config.NORMFINDER_STABILITY_VALUE_COLNAME,
+        config.GENORM_M_MEASURE_COLNAME,
+    ]
+
+    WEIGHT_RATIO_NB_NULLS_TO_SCORING: ClassVar[float] = 1
+
+    df: pl.DataFrame
+    stability_score_weights_str: str
+    weights: dict[str, float] = field(default_factory=dict)
+
+    def __post_init__(self):
+        self.parse_stability_score_weights()
+        self.compute_stability_score()
+
+    def parse_stability_score_weights(self):
+        for weight_field, weight in zip(
+            self.WEIGHT_FIELDS, self.stability_score_weights_str.split(",")
+        ):
+            self.weights[weight_field] = float(weight)
+
+    def linear_normalise(self, data: pl.Series, new_name: str) -> pl.Series:
+        """
+        Min-max normalise a series to the [0, 1] range.
+        """
+        min_val = data.min()
+        max_val = data.max()
+        return pl.Series(new_name, (data - min_val) / (max_val - min_val))
+
+    @staticmethod
+    def get_normalised_col(col: str) -> str:
+        return f"{col}_normalised"
+
+    def compute_stability_score(self):
+        logger.info("Computing stability score for candidate genes")
+
+        candidate_df = self.df.filter(
+            pl.col(config.IS_CANDIDATE_COLNAME) == 1
+        )  # keep only candidate genes
+        non_candidate_df = self.df.filter(pl.col(config.IS_CANDIDATE_COLNAME).is_null())
+
+        normalised_data = {}
+        null_data = {}
+        weight_sum = 0
+        # iterate over columns that can participate in stability score calculation
+        for col, weight in self.weights.items():
+            # if a column is absent, skip it
+            if col not in self.df.columns:
+                continue
+            data = candidate_df.select(col).to_series()
+            # for each column present, we min-max normalise the data to have values between 0 and 1
+            # and put these normalised data in another column suffixed with "_normalised"
+            normalised_col = self.get_normalised_col(col)
+            normalised_data[col] = self.linear_normalise(data, new_name=normalised_col)
+            # creating a null column with same name
+            null_data[col] = pl.Series(normalised_col, [None] * len(non_candidate_df))
+            # counting the sum of weights corresponding to the columns present
+            # so that we can normalise the weights afterwards
+            weight_sum += weight
+
+        # replacing original data with min-max normalised ones
+        candidate_df = candidate_df.with_columns(
+            data for data in normalised_data.values()
+        )
+        # adding null columns to the non-candidate df to allow concatenation
+        non_candidate_df = non_candidate_df.with_columns(
+            data for data in null_data.values()
+        )
+
+        # concatenating with non candidate genes to have all genes
+        self.df = pl.concat([candidate_df, non_candidate_df])
+
+        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        # GENERAL FORMULA FOR STABILITY
+        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+        # adding penalty for genes with null values
+        # genes with at least one zero value are already excluded at that stage
+        stability_scoring_expr = (
pl.col(config.RATIO_NULLS_VALID_SAMPLES_COLNAME) + * self.WEIGHT_RATIO_NB_NULLS_TO_SCORING + ) + + for col, weight in self.weights.items(): + if col not in self.df.columns: + logger.warning(f"Column {col} not found in dataframe") + continue + normalised_col = self.get_normalised_col(col) + # we do not want to include null / nan values in the stability score calculation + # because this would result in a total null / nan value for the stability score + stability_scoring_expr += ( + pl.when( + pl.col(normalised_col).is_not_null() + & pl.col(normalised_col).is_not_nan() + ) + .then(pl.col(normalised_col)) + .otherwise(pl.lit(0)) + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + expr = ( + pl.when(pl.col(config.IS_CANDIDATE_COLNAME).is_not_null()) + .then(stability_scoring_expr) + .otherwise(None) + ) + # add stability score column + self.df = self.df.with_columns(expr.alias(config.STABILITY_SCORE_COLNAME)) + + def get_statistics_with_stability_scores(self) -> pl.DataFrame: + return ( + self.df.sort( + config.STABILITY_SCORE_COLNAME, descending=False, nulls_last=True + ) + .with_row_index(name="index") + .with_columns((pl.col("index") + 1).alias(config.RANK_COLNAME)) + .drop("index") + ) + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Computes stability score for each gene" + ) + parser.add_argument( + "--stats", + type=str, + dest="platform_stat_files", + required=True, + help="Platform stat file", + ) + parser.add_argument( + "--normfinder-stability", + type=str, + required=True, + dest="normfinder_stability_file", + help="Output files of Normfinder", + ) + parser.add_argument( + "--genorm-stability", + type=str, + dest="genorm_stability_file", + help="Output files of Genorm", + ) + parser.add_argument( + "--weights", + dest="stability_score_weights", + type=str, + required=True, + help="Weights for Coefficient of Variation / Robust Coefficient of Variation on Median / Normfinder / Genorm respectively. Must be a comma-separated string. 
Example: 0.7,0.1,0.1,0.1", + ) + return parser.parse_args() + + +def get_stabilities(stability_files: list[Path]) -> pl.LazyFrame: + """Retrieve and concatenate stability values from a list of stability files.""" + lf = pl.scan_csv(stability_files[0]) + if len(stability_files) > 1: + for file in stability_files[1:]: + new_df = pl.scan_csv(file) + lf = lf.join(new_df, on=config.GENE_ID_COLNAME, how="left") + return lf.with_columns(pl.lit(1).alias(config.IS_CANDIDATE_COLNAME)) + + +def get_statistics(stat_files: list[Path]) -> pl.LazyFrame: + """Retrieve and concatenate data from a list of statistics files.""" + lf = pl.scan_csv(stat_files[0]) + if len(stat_files) > 1: + for file in stat_files[1:]: + new_df = pl.scan_csv(file) + lf = lf.join(new_df, on=config.GENE_ID_COLNAME, how="left") + return lf + + +def export_data(scored_df: pl.DataFrame): + """Export gene expression data to CSV files.""" + logger.info(f"Exporting stability scores to: {STATISTICS_WITH_SCORES_OUTFILENAME}") + scored_df.write_csv( + STATISTICS_WITH_SCORES_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION + ) + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + stat_files = [Path(file) for file in args.platform_stat_files.split(" ")] + stat_lf = get_statistics(stat_files) + + stability_files = [ + Path(file) + for file in [args.normfinder_stability_file, args.genorm_stability_file] + if file is not None + ] + + # getting metadata and mappings + stability_lf = get_stabilities(stability_files) + # merges base statistics with computed stability measurements + lf = stat_lf.join(stability_lf, on=config.GENE_ID_COLNAME, how="left") + + # sort genes according to the metrics present in the dataframe + stability_scorer = StabilityScorer(lf.collect(), args.stability_score_weights) + scored_df = stability_scorer.get_statistics_with_stability_scores() + + # exporting computed data + export_data(scored_df) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_tpm.py b/bin/compute_tpm.py new file mode 100755 index 00000000..77e936b3 --- /dev/null +++ b/bin/compute_tpm.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
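+# Conversion sketch: raw counts are divided by transcript length (RPKM-like
+# values), then each sample is rescaled to sum to 1e6:
+# TPM = rpkm / sum(rpkm in sample) * 1e6. Columns already summing to ~1e6
+# (tolerance of 100) are assumed to be TPM and returned unchanged.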
+ +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import compute_log2, export_parquet, parse_count_table, parse_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +OUTFILE_SUFFIX = ".tpm.parquet" + +WARNING_REASON_FILE = "warning_reason.txt" +FAILURE_REASON_FILE = "failure_reason.txt" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Normalise data to TPM") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--gene-lengths", + type=Path, + dest="gene_lengths_file", + required=True, + help="Gene lengths file (CSV format)", + ) + return parser.parse_args() + + +def try_cast_to_int(df: pl.DataFrame) -> pl.DataFrame: + """Try casting columns to integers.""" + count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + # try casting to handle integer values that are float-formated like 1.0 + for col in count_columns: + is_all_integers = df.select(pl.col(col).round().eq(pl.col(col)).all()).item() + if is_all_integers: + df = df.with_columns(pl.col(col).cast(pl.Int64())) + return df + + +def is_raw_counts(df: pl.DataFrame) -> bool: + """Check if the data are raw counts (integers).""" + count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + return all( + dtype + in ( + pl.Int8(), + pl.Int16(), + pl.Int32(), + pl.Int64(), + pl.UInt8(), + pl.UInt16(), + pl.UInt32(), + pl.UInt64(), + ) + for dtype in df.select(count_columns).schema.values() + ) + + +def is_tpm(df: pl.DataFrame) -> bool: + """Check if the data are TPM (sum to 1e6 per sample).""" + sample_sums_df = df.select(pl.exclude(config.GENE_ID_COLNAME).sum()) + # a small error is possible, and we assume that if the sum is close to 1e6, it is TPM + # setting the tolerance to 100 + is_tpm_col_df = sample_sums_df.select((pl.all() - 1e6).abs() < 1e2) + return is_tpm_col_df.select( + pl.any_horizontal(pl.all()) + ).item() # Allow for floating-point precision + + +def compute_rpkm(df: pl.DataFrame, cdna_length_df: pl.DataFrame) -> pl.DataFrame: + """ + Process raw counts to RPKM. + """ + logger.info("Computing RPKM.") + df = df.join(cdna_length_df, on=config.GENE_ID_COLNAME) + return df.select( + pl.col(config.GENE_ID_COLNAME), + pl.exclude([config.GENE_ID_COLNAME, config.CDNA_LENGTH_COLNAME]).truediv( + pl.col(config.CDNA_LENGTH_COLNAME) + ), + ) + + +def compute_tpm_from_rpkm(rpkm_df: pl.DataFrame) -> pl.DataFrame: + """ + Process RPKM to TPM. + """ + logger.info("Computing TPM from RPKM.") + sums = rpkm_df.select(pl.exclude(config.GENE_ID_COLNAME).sum()) + # Divide each column by its sum and multiply by 1e6 + count_columns = rpkm_df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + return rpkm_df.select( + [pl.col(config.GENE_ID_COLNAME)] + + [(pl.col(col) / sums[col][0] * 1e6).alias(col) for col in count_columns], + ) + + +def compute_tpm(df: pl.DataFrame, cdna_length_df: pl.DataFrame) -> pl.DataFrame: + """ + Process raw counts, FPKM, or RPKM to TPM. 
+ """ + if is_raw_counts(df): + logger.info("Raw counts detected → computing TPM directly.") + rpkm_df = compute_rpkm(df, cdna_length_df) + return compute_tpm_from_rpkm(rpkm_df) + elif is_tpm(df): + logger.info("Data are already TPM. No conversion needed.") + return df + else: + # Convert FPKM/RPKM to TPM + logger.info("Assuming FPKM/RPKM normalisation.") + return compute_tpm_from_rpkm(df) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + try: + logger.info("Parsing data") + count_df = parse_count_table(args.count_file) + cdna_length_df = parse_table(args.gene_lengths_file) + + logger.info("Converting data types") + count_df = try_cast_to_int(count_df) + + logger.info(f"Normalising {args.count_file.name}") + count_df = compute_tpm(count_df, cdna_length_df) + + logger.info("Computing log2 values") + count_df = compute_log2(count_df) + + export_parquet(count_df, args.count_file, OUTFILE_SUFFIX) + + except Exception as e: + logger.error(f"Error occurred while normalising data: {e}") + msg = "UNEXPECTED ERROR" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/bin/config.py b/bin/config.py new file mode 100644 index 00000000..dbf02757 --- /dev/null +++ b/bin/config.py @@ -0,0 +1,47 @@ +# general column names +GENE_ID_COLNAME = "gene_id" +GENE_ID_COUNT_COLNAME = "count" +CDNA_LENGTH_COLNAME = "length" +RANK_COLNAME = "rank" + +# base statistics +VARIATION_COEFFICIENT_COLNAME = "coefficient_of_variation" +ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME = ( + "robust_coefficient_of_variation_median" +) +STANDARD_DEVIATION_COLNAME = "standard_deviation" +STABILITY_SCORE_COLNAME = "stability_score" +MEAN_COLNAME = "mean" +MEDIAN_COLNAME = "median" +MAD_COLNAME = "median_absolute_deviation" +EXPRESSION_LEVEL_STATUS_COLNAME = "expression_level_status" +EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME = "expression_level_quantile_interval" +RATIO_NULLS_COLNAME = "ratio_nulls_in_all_samples" +RATIO_NULLS_VALID_SAMPLES_COLNAME = "ratio_nulls_in_valid_samples" +RATIO_ZEROS_COLNAME = "ratio_zeros" +IS_CANDIDATE_COLNAME = "is_candidate" + +# dataset statistics +KS_TEST_COLNAME = "kolmogorov_smirnov_pvalue" + +# count dataframe +GENE_COUNT_COLNAME = "count" +SAMPLE_COLNAME = "sample" + +# gene metadata +ORIGINAL_GENE_ID_COLNAME = "original_gene_id" +ORIGINAL_GENE_IDS_COLNAME = "original_gene_ids" +GENE_NAME_COLNAME = "name" +GENE_DESCRIPTION_COLNAME = "description" + +# computed stability values +NORMFINDER_STABILITY_VALUE_COLNAME = "normfinder_stability_value" +GENORM_M_MEASURE_COLNAME = "genorm_m_measure" +RATIOS_STD_COLNAME = "ratios_stds" + +SCORING_BASE_TO_STABILITY_SCORE_COLUMN = { + "cv": VARIATION_COEFFICIENT_COLNAME, + "rcvm": ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME, +} + +CSV_FLOAT_PRECISION = 6 diff --git a/bin/detect_rare_genes.py b/bin/detect_rare_genes.py new file mode 100755 index 00000000..5235b84c --- /dev/null +++ b/bin/detect_rare_genes.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
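+# Filtering rule implemented in main() below: a gene is kept when
+# total_occurrences / nb_datasets >= --min-occurrence-frequency AND its
+# rank-based occurrence quantile >= --min-occurrence-quantile; surviving gene
+# IDs are written to valid_gene_ids.txt.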
+
+import argparse
+import logging
+from pathlib import Path
+
+import config
+import polars as pl
+from common import parse_table
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+VALID_GENE_IDS_OUTFILE = "valid_gene_ids.txt"
+TOTAL_OCCURRENCES_OUTFILE = "total_gene_id_occurrence_quantiles.csv"
+
+##################################################################
+# FUNCTIONS
+##################################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Get genes with good occurrence"
+    )
+    parser.add_argument(
+        "--occurrences",
+        type=Path,
+        required=True,
+        dest="gene_id_occurrence_file",
+        help="Input file containing gene ID occurrences",
+    )
+    parser.add_argument(
+        "--mappings",
+        type=Path,
+        required=True,
+        dest="mapping_file",
+        help="Mapping file containing gene IDs",
+    )
+    parser.add_argument(
+        "--nb-datasets",
+        type=int,
+        required=True,
+        dest="nb_datasets",
+        help="Number of datasets",
+    )
+    parser.add_argument(
+        "--min-occurrence-frequency",
+        type=float,
+        required=True,
+        dest="min_occurrence_frequency",
+        help="Minimum frequency of occurrences for a gene among all datasets",
+    )
+    parser.add_argument(
+        "--min-occurrence-quantile",
+        type=float,
+        required=True,
+        dest="min_occurrence_quantile",
+        help="Minimum quantile of occurrences for a gene among all datasets",
+    )
+    return parser.parse_args()
+
+
+##################################################################
+# MAIN
+##################################################################
+
+
+def main():
+    args = parse_args()
+
+    original_gene_id_occurrence_df = parse_table(args.gene_id_occurrence_file)
+    mapping_df = parse_table(args.mapping_file)
+    nb_mapped_genes = len(mapping_df)
+
+    df = original_gene_id_occurrence_df.join(
+        mapping_df,
+        on=config.ORIGINAL_GENE_ID_COLNAME,
+    )
+
+    total_gene_id_occurrence_df = df.group_by(config.GENE_ID_COLNAME).agg(
+        pl.col(config.GENE_ID_COUNT_COLNAME).sum().alias("total_occurrences")
+    )
+
+    df = (
+        df.join(
+            total_gene_id_occurrence_df,
+            on=config.GENE_ID_COLNAME,
+        )
+        .with_columns(
+            total_occurrences_quantile=(
+                pl.col("total_occurrences").rank(method="max")
+                / pl.col("total_occurrences").count()
+            ),
+            total_occurrences_frequency=(
+                pl.col("total_occurrences") / args.nb_datasets
+            ),
+        )
+        .select(
+            [
+                config.GENE_ID_COLNAME,
+                "total_occurrences_frequency",
+                "total_occurrences_quantile",
+            ]
+        )
+        .unique()
+    )
+
+    # writing total occurrences in a csv before filtering
+    df.select([config.GENE_ID_COLNAME, "total_occurrences_quantile"]).sort(
+        "total_occurrences_quantile", descending=True
+    ).write_csv(TOTAL_OCCURRENCES_OUTFILE)
+
+    # filtering genes
+    valid_gene_ids = (
+        df.filter(pl.col("total_occurrences_quantile") >= args.min_occurrence_quantile)
+        .filter(pl.col("total_occurrences_frequency") >= args.min_occurrence_frequency)
+        .select(config.GENE_ID_COLNAME)
+        .unique()
+        .to_series()
+        .to_list()
+    )
+
+    with open(VALID_GENE_IDS_OUTFILE, "w") as f:
+        f.write("\n".join(valid_gene_ids))
+
+    nb_valid_genes = len(valid_gene_ids)
+
+    logger.info(
+        f"Found {nb_valid_genes} valid gene IDs ({nb_valid_genes / nb_mapped_genes:.2%})"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/get_eatlas_data.R b/bin/download_eatlas_data.R
similarity index 76%
rename from bin/get_eatlas_data.R
rename to bin/download_eatlas_data.R
index b2a46c35..e41ba82a 100755
--- a/bin/get_eatlas_data.R
+++ b/bin/download_eatlas_data.R
@@ -2,9 +2,15 @@
 # Written by Olivier Coen.
Released under the MIT license. +options(error = traceback) +suppressPackageStartupMessages(library("ExpressionAtlas")) library(ExpressionAtlas) library(optparse) +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + + ##################################################### ##################################################### # FUNCTIONS @@ -39,7 +45,8 @@ download_expression_atlas_data_with_retries <- function(accession, max_retries = # if the accession os not valid, we stop immediately (useless to keep going) if (grepl("does not look like an ArrayExpress/BioStudies experiment accession.", w$message)) { warning(w$message) - quit(save = "no", status = 100) # quit & ignore process + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } # else, retrying @@ -53,13 +60,16 @@ download_expression_atlas_data_with_retries <- function(accession, max_retries = if (grepl("550 Requested action not taken; file unavailable", w$message)) { warning(w$message) - quit(save = "no", status = 100) # quit & ignore process + write("EXPERIMENT SUMMARY NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } else if (grepl("Failure when receiving data from the peer", w$message)) { warning(w$message) - quit(save = "no", status = 100) # quit & ignore process + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } else { warning("Unhandled warning: ", w$message) - quit(save = "no", status = 102) # quit & stop workflow + write("UNKNOWN ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } } @@ -75,10 +85,12 @@ download_expression_atlas_data_with_retries <- function(accession, max_retries = if (grepl("Download appeared successful but no experiment summary object was found", e$message)) { warning(e$message) - quit(save = "no", status = 101) # quit & ignore process + write("EXPERIMENT SUMMARY NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } else { warning("Unhandled error: ", e$message) - quit(save = "no", status = 102) # quit & stop workflow + write("UNKNOWN ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } } @@ -127,7 +139,7 @@ export_count_data <- function(result, batch_id) { # exporting to CSV file # index represents gene names - print(paste('Exporting count data to file', outfilename)) + cat(paste('Exporting count data to file', outfilename)) write.table(result$count_data, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE) } @@ -143,7 +155,7 @@ export_metadata <- function(result, batch_id) { ) outfilename <- paste0(batch_id, '.design.csv') - print(paste('Exporting design data to file', outfilename)) + cat(paste('Exporting design data to file', outfilename)) write.table(df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) } @@ -161,18 +173,20 @@ process_data <- function(atlas_data, accession) { # getting count dataframe tryCatch({ - if (data_type == 'rnaseq') { + if ( data_type == 'rnaseq' ) { result <- get_rnaseq_data(data) - } else if (startsWith(data_type, 'A-AFFY-')) { + } else if ( startsWith(data_type, 'A-') ) { # typically: A-AFFY- or A-GEOD- result <- get_one_colour_microarray_data(data) } else { - stop(paste('ERROR: Unknown data type:', data_type)) + warning(paste("Unknown data type:", data_type)) + write(paste("UNKNOWN DATA TYPE:", data_type), file = WARNING_REASON_FILE, append=TRUE) + skip_iteration <<- TRUE } }, error = function(e) { - print(paste("Caught an error: ", 
e$message)) - print(paste('ERROR: Could not get assay data for experiment ID', accession, 'and data type', data_type)) - skip_iteration <- TRUE + warning(paste("Caught an error: ", e$message)) + write(paste('ERROR: COULD NOT GET ASSAY DATA FOR EXPERIMENT ID', accession, 'AND DATA TYPE', data_type), file = WARNING_REASON_FILE, append=TRUE) + skip_iteration <<- TRUE }) # If an error occurred, skip to the next iteration @@ -199,10 +213,13 @@ process_data <- function(atlas_data, accession) { args <- get_args() +cat(paste("Getting data for accession", args$accession, "\n")) + accession <- trimws(args$accession) if (startsWith(accession, "E-PROT")) { warning("Ignoring the ", accession, " experiment.") - quit(save = "no", status = 100) # quit & ignore process + write("PROTEOME ACCESSIONS NOT HANDLED", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } # searching and downloading expression atlas data @@ -210,4 +227,3 @@ atlas_data <- download_expression_atlas_data_with_retries(args$accession) # writing count data in atlas_data to specific CSV files process_data(atlas_data, args$accession) - diff --git a/bin/download_geo_data.R b/bin/download_geo_data.R new file mode 100755 index 00000000..4b8dcc1e --- /dev/null +++ b/bin/download_geo_data.R @@ -0,0 +1,802 @@ +#!/usr/bin/env Rscript + +# Written by Olivier Coen. Released under the MIT license. + +suppressPackageStartupMessages(library("GEOquery")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("tibble")) +suppressPackageStartupMessages(library("stringr")) +library(GEOquery) +library(optparse) +library(dplyr) +library(tibble) +library(stringr) + +options(error = traceback) + +COUNT_FILE_EXTENSION <- ".counts.csv" +DESIGN_FILE_EXTENSION <- ".design.csv" +MAPPING_FILE_EXTENSION <- ".sample_name_mapping.csv" +METADATA_FILE_EXTENSION <- ".platform_metadata.csv" +BASE_REJECTED_DIR <- "rejected" + +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + +##################################################### +##################################################### +# ARG PARSER +##################################################### +##################################################### + +get_args <- function() { + option_list <- list( + make_option("--accession", type = "character", help = "Accession number of GEO dataset. 
Example: GSE56413"),
+    make_option("--species", type = "character", help = "Species name")
+  )
+
+  args <- parse_args(OptionParser(
+    option_list = option_list,
+    description = "Get GEO data"
+  ))
+  return(args)
+}
+
+
+#####################################################
+#####################################################
+# UTILS
+#####################################################
+#####################################################
+
+format_species_name <- function(x) {
+  x <- tools::toTitleCase(x)
+  x <- gsub("[_-]", " ", x)
+  return(x)
+}
+
+write_warning <- function(msg) {
+  message(msg)
+  file_conn <- file( WARNING_REASON_FILE, open = "a")
+  cat(paste0(msg, "; "), file = file_conn, sep = "", fill = FALSE)
+  close(file_conn)
+}
+
+
+get_extensions <- function(file){
+  extensions <- strsplit(basename(file), split="\\.")[[1]]
+  return(extensions)
+}
+
+
+get_rejected_dir <- function(platform, series) {
+  rejected_dir <- file.path(BASE_REJECTED_DIR, paste0(series$accession, '_', platform$id))
+  dir.exists(rejected_dir) || dir.create(rejected_dir, recursive = TRUE)
+  return(rejected_dir)
+}
+
+
+clean_column_names <- function(df){
+  # suffix column names with their position if any name is duplicated
+  if (length(unique(colnames(df))) < length(colnames(df))){
+    colnames(df) <- paste0(colnames(df), '_', seq_along(df))
+  }
+  # always return the dataframe (returning NULL here would wipe the caller's data)
+  return(df)
+}
+
+
+#####################################################
+#####################################################
+# DOWNLOAD
+#####################################################
+#####################################################
+
+download_geo_data_with_retries <- function(accession, max_retries = 3, wait_time = 5) {
+
+  success <- FALSE
+  attempts <- 0
+
+  while (!success && attempts < max_retries) {
+    attempts <- attempts + 1
+
+    tryCatch({
+      geo_data <- GEOquery::getGEO( accession )
+      success <- TRUE
+
+    }, error = function(e) {
+
+      message("Attempt ", attempts, " failed: ", e$message)
+
+      if (attempts < max_retries) {
+        warning("Retrying in ", wait_time, " seconds...")
+        Sys.sleep(wait_time)
+
+      } else {
+        warning("Unhandled error: ", e$message)
+        write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+      }
+    })
+
+  }
+  return(geo_data)
+}
+
+#####################################################
+#####################################################
+# PARSE SERIES / PLATFORM METADATA
+#####################################################
+#####################################################
+
+get_experiment_data <- function(geo_data) {
+  data <- geo_data[[1]]
+  experiment_data <- experimentData(data)
+  return(experiment_data)
+}
+
+
+get_experiment_type <- function(geo_data) {
+  experiment_data <- get_experiment_data(geo_data)
+  experiment_type <- tolower(attr(experiment_data, "other")$type)
+  if (experiment_type == "expression profiling by high throughput sequencing") {
+    return("rnaseq")
+  } else if (experiment_type == "expression profiling by array") {
+    return("microarray")
+  } else {
+    return(gsub("\n", " ; ", experiment_type))
+  }
+}
+
+get_series_species <- function(geo_data) {
+  message("Getting species included in series")
+  species_list <- list()
+  for (i in 1:length(geo_data)) {
+    data <- geo_data[[ i ]]
+    metadata <- pData(data)
+    li <- unique(metadata$organism_ch1)
+    # check if organism_ch2 exists
+    if ("organism_ch2" %in% colnames(metadata)) {
+      li <- append(li, unique(metadata$organism_ch2))
+    }
+    species_list[[i]] <- li
+  }
+  species_list <- unique(unlist(species_list))
+  return(species_list)
+}
+
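+# Note on multi-species series: when get_series_species() returns several
+# organisms, or none at all, get_series_supplementary_data() below returns an
+# empty list, so series-level supplementary files are simply not downloaded.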
+get_series_supplementary_data <- function(geo_data, series) { + series_species <- get_series_species(geo_data) + if (length(series_species) > 1) { + message(paste("Multiple species found in series:", paste(series_species, collapse = ", "), ". Will not download supplementary data")) + return(list()) + } else if (length(series_species) == 0) { + message("No species found in series...") + return(list()) + } else { + if (series_species != series$species) { + message(paste("Species provided by the user:", series_species, "does not match species in GEO data:", series$species)) + return(list()) + } + experiment_data <- get_experiment_data(geo_data) + suppl_data_str <- attr(experiment_data, "other")$supplementary_file + return(stringr::str_split(suppl_data_str, "\n")[[1]]) + } +} + + +get_platform_id <- function(metadata) { + platform_id <- as.character(unique(metadata$platform_id))[1] + return(platform_id) +} + + +##################################################### +##################################################### +# RNASEQ SAMPLES +##################################################### +##################################################### + +get_rnaseq_samples <- function(geo_data, design_df) { + + rnaseq_sample_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + if (!("library_strategy" %in% colnames(metadata))) { + message("library_strategy column not found in metadata") + next + } + rnaseq_sample_df_list[[i]] <- metadata %>% + filter(library_strategy == "RNA-Seq" & geo_accession %in% design_df$sample) %>% + select(geo_accession) + } + # concatenate rows + rnaseq_sample_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + rnaseq_sample_df_list + ) + return(rnaseq_sample_df$geo_accession) +} + + +##################################################### +##################################################### +# SAMPLE NAME MAPPING +##################################################### +##################################################### + + +make_sample_name_mapping <- function(geo_data) { + message("Making sample name mapping") + mapping_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + mapping_df_list[[i]] <- metadata %>% + mutate( + sample_id = geo_accession, + sample_name = title + ) %>% + select(sample_id, sample_name) + } + # concatenate rows + mapping_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + mapping_df_list + ) + return(mapping_df) +} + +rename_columns <- function(df, mapping_df) { + id_map <- setNames(mapping_df$sample_id, mapping_df$sample_name) + names(df) <- ifelse( + names(df) %in% names(id_map), + id_map[names(df)], + names(df) + ) + return(df) +} + +##################################################### +##################################################### +# DESIGN +##################################################### +##################################################### + +get_samples_for_species <- function(metadata, species) { + # check if organism_ch2 exists + if ("organism_ch2" %in% colnames(metadata)) { + keep <- metadata$organism_ch1 == species & metadata$organism_ch2 == species + } else { + keep <- metadata$organism_ch1 == species + } + + # return a data.frame with matching samples + return(metadata$geo_accession[keep]) +} + + +get_columns_for_grouping <- function(df) { + + base_columns <- c("characteristics", "treatment_protocol", "label_protocol", "extract_protocol", "growth_protocol") + + columns_to_group <- c() + for 
(base_col in base_columns) { + ch1_col <- paste0(base_col, "_ch1") + ch2_col <- paste0(base_col, "_ch2") + + if (ch1_col %in% colnames(df)) { + columns_to_group <- c(columns_to_group, ch1_col) + } + if (ch2_col %in% colnames(df)) { + columns_to_group <- c(columns_to_group, ch2_col) + } + } + + return(columns_to_group) +} + + +build_design_dataframe <- function(df, accession) { + columns_to_group <- get_columns_for_grouping(df) + + design_df <- df %>% + mutate(sample = geo_accession) %>% # change column name geo_accession to sample + group_by(!!!syms(columns_to_group)) %>% # group by all columns for grouping found + mutate(group_num = cur_group_id()) %>% # create column made from group id + ungroup() %>% + mutate( + condition = paste0("G", group_num), # create condition column from group number + batch = accession + ) %>% + select(sample, condition, batch) %>% + arrange(condition) + + return(design_df) +} + + +get_design_for_platform <- function(design_df, metadata) { + platform_samples <- metadata$geo_accession + platform_design_df <- design_df %>% + filter(sample %in% platform_samples) + return(platform_design_df) +} + +get_design_for_rnaseq <- function(design_df, rnaseq_samples) { + rnaseq_design_df <- design_df %>% + filter(sample %in% rnaseq_samples) + return(rnaseq_design_df) +} + + +make_design <- function(metadata, series) { + design_df <- build_design_dataframe(metadata, series$accession) + # get samples corresponding to species + species_samples <- get_samples_for_species(metadata, series$species) + # filter design dataframe + design_df <- design_df %>% + filter(sample %in% species_samples) + return(design_df) +} + + +make_overall_design <- function(geo_data, series) { + message("Making overall design") + design_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + #print(metadata) + # make design dataframe + # keep only samples corresponding to the species of interest + design_df <- make_design(metadata, series) + design_df_list[[i]] <- design_df + } + # full outer join + design_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + design_df_list + ) + return(design_df) +} + + +##################################################### +##################################################### +# PARSE COUNTS FROM DATA +##################################################### +##################################################### + + +get_microarray_counts <- function(platform) { + # get count data corresponding to samples in the design + counts <- data.frame(exprs(platform$data)) %>% + select(all_of(platform$design$sample)) + # for now, only one element in the list + return(counts) +} + + +get_raw_counts_from_url <- function(data_url) { + + if ( tolower(data_url) == "none" || is.na(data_url) || data_url == "") { + write_warning(paste("MISFORMED URL:", data_url)) + return(NULL) + } + + filename <- tolower(basename(data_url)) + extensions <- get_extensions(filename) + ext <- extensions[length(extensions)] + if (ext == "gz") { + ext <- extensions[length(extensions) - 1] + } + if (!(ext %in% c("txt", "tsv", "csv", "tab"))) { + write_warning(paste("UNSUPPORTED EXTENSION:", ext, "for URL:", data_url)) + return(NULL) + } + + message(paste("Downloading", filename)) + tryCatch({ + download.file(data_url, filename, method = "wget", quiet = TRUE) + }, error = function(e) { + write_warning(paste("ERROR WHILE DOWNLOADING:", filename)) + return(NULL) + }) + + separator <- NULL + for (sep in c("\t", ",", " ")) { + # parsing the first line to 
determine the separator and see if there is a header
+    counts <- read.table(filename, header = FALSE, sep = sep, row.names = 1, nrows = 1)
+    if (ncol(counts) > 0) {
+      separator <- sep
+      if (is.numeric(counts[1, 1])) {
+        has_header <- FALSE
+      } else {
+        has_header <- TRUE
+      }
+      break
+    }
+  }
+
+  if (is.null(separator)) {
+    write_warning(paste("NO VALID SEPARATOR:", filename))
+    return(NULL)
+  }
+
+  message(paste("Parsing", filename))
+  counts <- tryCatch({
+    read.table(filename, header = has_header, sep = separator, row.names = 1)
+  }, error = function(e) {
+    write_warning(paste("ERROR WHILE PARSING", filename, ":", e))
+    # returning NULL from the handler makes tryCatch() itself evaluate to NULL
+    NULL
+  })
+  if (is.null(counts)) {
+    return(NULL)
+  }
+
+  # removes rows that are all NA
+  counts <- counts[rowSums(!is.na(counts)) > 0, , drop = FALSE]
+  return(counts)
+}
+
+
+get_all_rnaseq_counts <- function(platform) {
+  pdata <- platform$metadata
+  # getting list of samples
+  samples <- pdata$geo_accession
+  # getting list of columns corresponding to supp data
+  # IMPORTANT: we assume here that data are of the same type (raw, TPM, FPKM, etc.) in each supplementary file column
+  supplementary_cols <- grep("^supplementary_file(_\\d)?$", names(pdata), value = TRUE)
+
+  if (length(supplementary_cols) == 0) {
+    message("No supplementary files found")
+    return(data.frame())
+  } else if (length(supplementary_cols) > 1) {
+    message("Multiple supplementary files found")
+  }
+
+  suppl_df_cpt <- 1
+  suppl_count_dfs <- list()
+  # building one count dataframe by type of suppl data
+  for (i in 1:length(supplementary_cols)) {
+
+    count_df_list <- list()
+    cpt <- 1
+    for (j in 1:length(samples)) {
+      sample <- samples[[j]]
+      data_url <- pdata[pdata$geo_accession == sample, supplementary_cols[i]]
+
+      counts <- get_raw_counts_from_url(data_url)
+      if (is.null(counts)) {
+        next
+      }
+
+      if (ncol(counts) == 1) {
+        colnames(counts) <- c(sample)
+      } else {
+        # if multiple columns, we don't know how to deal with it
+        # but it will be filtered out later at column match checking
+        message(paste("Multiple columns found for sample", sample))
+      }
+
+      # in case there is already a gene_id column, remove it
+      if ("gene_id" %in% names(counts)) {
+        counts <- counts[, -which(names(counts) == "gene_id")]
+      }
+      # setting the row names (gene ids) as a column
+      counts <- tibble::rownames_to_column(counts, var = "gene_id")
+      # adding to list
+      count_df_list[[cpt]] <- counts
+      cpt <- cpt + 1
+    }
+
+    # checking if all files were skipped
+    if (length(count_df_list) == 0) {
+      message("No valid files found")
+      next
+    }
+
+    # full outer join
+    joined_df <- Reduce(
+      function(df1, df2) merge(df1, df2, by = "gene_id", all = TRUE),
+      count_df_list
+    )
+    # setting the column gene_id as row names
+    joined_df <- tibble::column_to_rownames(joined_df, var = "gene_id")
+    # cleaning column names in case of duplicates
+    # it should happen only when there were multiple columns for the same sample
+    joined_df <- clean_column_names(joined_df)
+
+    suppl_count_dfs[[suppl_df_cpt]] <- joined_df
+    suppl_df_cpt <- suppl_df_cpt + 1
+  }
+  return(suppl_count_dfs)
+}
+
+
+#####################################################
+#####################################################
+# DATA QUALITY CONTROL
+#####################################################
+#####################################################
+
+is_valid_microarray <- function(counts, platform) {
+
+  if (!all(colnames(counts) %in% platform$design$sample)) {
+    message("Column names do not match samples in design")
+    return(FALSE)
+  }
+
+  vals <- unlist(counts, use.names = FALSE)
+  vals <- vals[!is.na(vals)]
+
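+  # Heuristic used below (illustrative thresholds, inferred from the checks
+  # that follow): log2-scale microarray intensities rarely exceed ~20, raw
+  # probe intensities are integers, and normalised but non-log-transformed
+  # intensities typically reach the thousands; anything else is flagged as
+  # unclear.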
+  all_integers <- all(abs(vals - round(vals)) < 1e-8)
+  value_range <- range(vals, na.rm = TRUE)
+
+  if (value_range[2] <= 20) {
+    message(paste(platform$id, ": normalized, log2 scale (e.g. RMA, quantile)"))
+    return(TRUE)
+  } else if (all_integers) {
+    write_warning(paste(platform$id, ": RAW PROBE INTENSITIES FOUND"))
+    return(FALSE)
+  } else if (value_range[2] > 1000) {
+    write_warning(paste(platform$id, ": PARSED INTENSITIES: NORMALIZED BUT NOT LOG-TRANSFORMED"))
+    return(FALSE)
+  } else {
+    write_warning(paste(platform$id, ": UNCLEAR DATA ORIGIN: CHECK GEO METADATA"))
+    return(FALSE)
+  }
+}
+
+is_valid_rnaseq <- function(counts, platform) {
+
+  if (!all(colnames(counts) %in% platform$design$sample)) {
+    message(paste(platform$id, ": column names do not match samples in design"))
+    return(FALSE)
+  }
+
+  return(TRUE)
+}
+
+
+check_rnaseq_normalisation_state <- function(counts, platform) {
+
+  # checking which columns contain only integer values
+  tryCatch({
+    is_all_integer <- function(x) all(floor(x) == x)
+    int_counts <- counts %>%
+      select_if(is_all_integer)
+
+    # select_if() keeps whole columns, so compare column counts:
+    # (almost) no all-integer column means the values are decimals, i.e. normalised
+    if (ncol(int_counts) < ncol(counts) * 0.01 ) {
+      return("normalised")
+    } else if (ncol(int_counts) == ncol(counts)) {
+      return("raw")
+    } else {
+      return("unknown")
+    }
+
+  }, error = function(e) {
+    write_warning(paste(platform$id, ": COULD NOT COMPUTE FLOOR"))
+    return("unknown")
+  })
+
+}
+
+
+#####################################################
+#####################################################
+# EXPORT
+#####################################################
+#####################################################
+
+export_count_data <- function(data, platform, series) {
+  # renaming columns, to make them specific to accession and data type
+  colnames(data$counts) <- paste0(series$accession, '_', colnames(data$counts))
+  outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, COUNT_FILE_EXTENSION)
+  if (!data$is_valid) {
+    outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
+  }
+
+  # exporting to CSV file
+  # index represents gene names
+  message(paste(platform$id, ': exporting count data to file', outfilename))
+  write.table(data$counts, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE)
+}
+
+
+export_design <- function(data, platform, series) {
+  new_sample_names <- paste0(series$accession, '_', series$design$sample)
+  design_df <- series$design %>%
+    mutate(sample = new_sample_names ) %>%
+    select(sample, condition, batch)
+
+  outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, DESIGN_FILE_EXTENSION)
+  if (!data$is_valid) {
+    outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
+  }
+
+  message(paste(platform$id, ': exporting design data to file', outfilename))
+  write.table(design_df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
+}
+
+
+export_name_mapping <- function(data, platform, series) {
+  outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, MAPPING_FILE_EXTENSION)
+  if (!data$is_valid) {
+    outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
+  }
+  message(paste(platform$id, ': exporting sample name mapping to file', outfilename))
+  write.table(series$mapping, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
+}
+
+export_metadata <- function(data, platform, series) {
+  outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, METADATA_FILE_EXTENSION)
+  if (!data$is_valid) {
+    outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
+  }
+  message(paste(platform$id, ': exporting metadata to file', outfilename))
+  write.table(platform$metadata, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
+}
+
+
+#####################################################
+#####################################################
+# PROCESS DATA
+#####################################################
+#####################################################
+
+post_process_and_export <- function(data, platform, series) {
+  # keeping only non empty data
+  if (nrow(data$counts) == 0 || ncol(data$counts) == 0) {
+    message(paste(platform$id, ': no data found'))
+    write_warning(paste(platform$id, ": NO DATA"))
+    return(NULL)
+  }
+  # rename columns when needed
+  data$counts <- rename_columns(data$counts, series$mapping)
+
+  export_count_data(data, platform, series)
+  export_design(data, platform, series)
+  export_name_mapping(data, platform, series)
+  export_metadata(data, platform, series)
+}
+
+
+process_platform_data <- function(platform, series) {
+
+  platform$metadata <- pData(platform$data)
+  platform$design <- get_design_for_platform(series$design, platform$metadata)
+  valid_samples <- as.character(platform$design$sample)
+  platform$id <- get_platform_id(platform$metadata)
+
+  if (length(valid_samples) == 0) {
+    message(paste(platform$id, ": no sample corresponding to species", series$species))
+    return(NULL)
+  }
+
+  if (platform$type == "microarray") {
+
+    counts <- get_microarray_counts(platform)
+    data <- list( counts = counts )
+    data$is_valid <- is_valid_microarray(counts, platform)
+    data$norm_state <- "normalised"
+    post_process_and_export(data, platform, series)
+
+  } else {
+
+    parsed_counts <- get_all_rnaseq_counts(platform)
+    for (counts in parsed_counts) {
+      data <- list(
+        counts = counts,
+        is_valid = is_valid_rnaseq(counts, platform),
+        norm_state = check_rnaseq_normalisation_state(counts, platform)
+      )
+      post_process_and_export(data, platform, series)
+    }
+
+  }
+
+}
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+main <- function() {
+
+  args <- get_args()
+
+  series <- list()
+
+  series$accession <- args$accession
+  series$species <- format_species_name(args$species)
+
+  message(paste("Getting data for accession", series$accession))
+  # searching and downloading GEO data
+  geo_data <- download_geo_data_with_retries(series$accession)
+
+  # make a single design dataframe for all samples in the series
+  series$design <- make_overall_design(geo_data, series)
+  if ( nrow(series$design) == 0 ) {
+    message(paste("No sample corresponding to species", series$species))
+    write(paste("NO SAMPLES FOR SPECIES", series$species), file = FAILURE_REASON_FILE)
+    quit(save = "no", status = 0)
+  }
+
+  # make a map associating sample names to sample IDs
+  series$mapping <- make_sample_name_mapping(geo_data)
+
+  series$experiment_type <- get_experiment_type(geo_data)
+
+  suppl_data_urls <- get_series_supplementary_data(geo_data, series)
+  # for now, considering suppl data as raw rnaseq data
+  # TODO: check if these are always raw rnaseq data
+  if (length(suppl_data_urls) > 0) {
+
+    message("Processing supplementary data")
+    for
(supp_data_url in suppl_data_urls) { + counts <- get_raw_counts_from_url(supp_data_url) + if (is.null(counts)) { + next + } + platform <- list( + type = "rnaseq", + id = "suppl", + design = series$design + ) + data <- list( + counts = counts, + is_valid = is_valid_rnaseq(counts, platform), + norm_state = check_rnaseq_normalisation_state(counts, platform) + ) + post_process_and_export(data, platform, series) + } + + } + + # NOTE: we consider that a series is either a microarray series OR contains RNA-seq data + # mixed types should be found only in SuperSeries, and it is not handled for now + if ( series$experiment_type == "microarray" ) { + + message("Processing microarray data") + for (i in 1:length(geo_data)) { + platform <- list( + type = "microarray", + data = geo_data[[ i ]] + ) + process_platform_data(platform, series) + } + + } else { + + rnaseq_samples <- get_rnaseq_samples(geo_data, series$design) + if ( series$experiment_type == "rnaseq" || length(rnaseq_samples) > 0 ) { + + message("Processing RNA-seq data") + # taking a subset of the design corresponding to bona-fide RNA-seq samples + rnaseq_design_df <- get_design_for_rnaseq(series$design, rnaseq_samples) + for (i in 1:length(geo_data)) { + platform <- list( + type = "rnaseq", + count_type = "raw", + data = geo_data[[ i ]] + ) + process_platform_data(platform, series) + } + + } else { + write_warning(paste("UNSUPPORTED PLATFORM:", series$experiment_type)) + } + } +} + + +##################################################### +# ENTRYPOINT +##################################################### +main() diff --git a/bin/download_latest_ensembl_annotation.py b/bin/download_latest_ensembl_annotation.py new file mode 100755 index 00000000..205a2014 --- /dev/null +++ b/bin/download_latest_ensembl_annotation.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
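+
+# Overview of the strategy implemented below:
+#   1. resolve the species name to a taxon ID via the Ensembl REST API,
+#      falling back to the NCBI taxonomy API when that fails,
+#   2. map the taxon ID to an Ensembl division to select the right FTP tree,
+#   3. scrape the division's gff3/ listing (recursing into *_collection/
+#      folders) to locate the species folder,
+#   4. pick the largest *.gff3.gz file in that folder and download it.
+#
+# Example invocation (assuming the script is on PATH and executable):
+#   ./download_latest_ensembl_annotation.py --species arabidopsis_thaliana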
+
+import argparse
+import logging
+from datetime import datetime
+from urllib.request import urlretrieve
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from tenacity import (
+    before_sleep_log,
+    retry,
+    stop_after_delay,
+    wait_exponential,
+)
+from tqdm import tqdm
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+GENE_IDS_CHUNKSIZE = 50  # max allowed by Ensembl REST API
+
+ENSEMBL_REST_SERVER = "https://rest.ensembl.org/"
+SPECIES_INFO_BASE_ENDPOINT = "info/genomes/taxonomy/{species}"
+TAXONOMY_NAME_ENDPOINT = "taxonomy/name/{species}"
+ENSEMBL_API_HEADERS = {
+    "Content-Type": "application/json",
+    "Accept": "application/json",
+}
+STOP_RETRY_AFTER_DELAY = 120
+
+NCBI_TAXONOMY_API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy"
+NCBI_API_HEADERS = {"accept": "application/json", "content-type": "application/json"}
+
+ENSEMBL_DIVISION_TO_FOLDER = {
+    "EnsemblPlants": "plants",
+    "EnsemblVertebrates": "vertebrates",
+    "EnsemblMetazoa": "metazoa",
+    "EnsemblFungi": "fungi",
+    "EnsemblBacteria": "bacteria",
+    "EnsemblProtists": "protists",
+}
+
+ENSEMBL_GENOMES_BASE_URL = "https://ftp.ebi.ac.uk/ensemblgenomes/pub/current/{}/gff3/"
+ENSEMBL_VERTEBRATES_BASE_URL = "https://ftp.ensembl.org/pub/current/gff3/"
+
+
+##################################################################
+##################################################################
+# FUNCTIONS
+##################################################################
+##################################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Download the latest Ensembl annotation for a species")
+    parser.add_argument(
+        "--species",
+        type=str,
+        dest="species",
+        required=True,
+        help="Species name",
+    )
+    return parser.parse_args()
+
+
+##################################################################
+##################################################################
+# REQUESTS
+##################################################################
+##################################################################
+
+
+@retry(
+    stop=stop_after_delay(600),
+    wait=wait_exponential(multiplier=1, min=1, max=30),
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+)
+def parse_page_data(url: str) -> BeautifulSoup:
+    page = requests.get(url)
+    page.raise_for_status()
+    return BeautifulSoup(page.content, "html.parser")
+
+
+@retry(
+    stop=stop_after_delay(STOP_RETRY_AFTER_DELAY),
+    wait=wait_exponential(multiplier=1, min=1, max=30),
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+)
+def send_request_to_ncbi_taxonomy(taxid: str | int):
+    logger.info(f"Sending POST request to {NCBI_TAXONOMY_API_URL}")
+    taxons = [str(taxid)]
+    data = {"taxons": taxons}
+    response = requests.post(NCBI_TAXONOMY_API_URL, headers=NCBI_API_HEADERS, json=data)
+    response.raise_for_status()
+    return response.json()
+
+
+@retry(
+    stop=stop_after_delay(STOP_RETRY_AFTER_DELAY),
+    wait=wait_exponential(multiplier=1, min=1, max=30),
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+)
+def send_get_request_to_ensembl(url: str) -> list[dict]:
+    logger.info(f"Sending GET request to {url}")
+    response = requests.get(url, headers=ENSEMBL_API_HEADERS)
+    # raise_for_status() already raises on every non-2xx status code
+    response.raise_for_status()
+    return response.json()
+
+
+@retry(
+    stop=stop_after_delay(STOP_RETRY_AFTER_DELAY),
+    wait=wait_exponential(multiplier=1, min=1,
max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def download_file(url: str, output_path: str): + try: + urlretrieve(url, output_path) + except Exception as e: + logger.error(f"Failed to download file from {url}: {e}") + raise + + +################################################################## +################################################################## +# PARSING +################################################################## +################################################################## + + +def get_species_taxid(species: str) -> int: + try: + return get_species_taxid_from_ensembl(species) + except Exception as e: + logger.error( + f"Could not get species taxid for species {species} using the Ensembl REST API: {e}.\nTrying NCBI taxonomy." + ) + ncbi_formated_species_name = format_species_name_for_ncbi_taxonomy(species) + return get_species_taxid_from_ncbi(ncbi_formated_species_name) + + +def get_species_taxid_from_ensembl(species: str) -> int: + url = ENSEMBL_REST_SERVER + TAXONOMY_NAME_ENDPOINT.format(species=species) + data = send_get_request_to_ensembl(url) + if len(data) == 0: + raise ValueError(f"No species found for species {species}") + elif len(data) > 1: + logger.warning( + f"Multiple species found for species {species}. Keeping the first one." + ) + species_data = data[0] + if "id" not in species_data: + raise ValueError( + f"Could not find taxid for species {species}. Data collected: {species_data}" + ) + return species_data["id"] + + +def get_species_taxid_from_ncbi(species: str) -> int: + result = send_request_to_ncbi_taxonomy(species) + if len(result["taxonomy_nodes"]) > 1: + raise ValueError(f"Multiple taxids for species {species}") + metadata = result["taxonomy_nodes"][0] + if "taxonomy" not in metadata: + raise ValueError(f"Could not find taxonomy results for species {species}") + return int(metadata["taxonomy"]["tax_id"]) + + +def get_species_division(species_taxid: int) -> str: + url = ENSEMBL_REST_SERVER + SPECIES_INFO_BASE_ENDPOINT.format( + species=str(species_taxid) + ) + data = send_get_request_to_ensembl(url) + if len(data) == 0: + raise ValueError(f"No division found for species Taxon ID {species_taxid}") + elif len(data) > 1: + logger.warning( + f"Multiple divisions found for species Taxon ID {species_taxid}. Keeping the first one." 
+    )
+    return data[0]["division"]
+
+
+def get_species_category(species: str) -> str:
+    species_taxid = get_species_taxid(species)
+    logger.info(f"Got species taxid: {species_taxid}")
+    division = get_species_division(species_taxid)
+    logger.info(f"Got division: {division}")
+    return ENSEMBL_DIVISION_TO_FOLDER[division]
+
+
+def get_division_url(species: str) -> str:
+    category = get_species_category(species)
+    if category == "vertebrates":
+        return ENSEMBL_VERTEBRATES_BASE_URL
+    else:
+        return ENSEMBL_GENOMES_BASE_URL.format(category)
+
+
+def format_species_name_for_ensembl(species: str) -> str:
+    return species.replace(" ", "_").lower()
+
+
+def format_species_name_for_ncbi_taxonomy(species: str) -> str:
+    return species.replace("_", " ").lower()
+
+
+def parse_last_modified_date(dt_string: str) -> datetime | None:
+    try:
+        return datetime.strptime(dt_string, "%Y-%m-%d %H:%M")
+    except ValueError:
+        return None
+
+
+def get_candidate_species_folders(
+    species: str, url: str, first_level: bool = True
+) -> list[dict]:
+    soup = parse_page_data(url)
+    species_url_records = []
+
+    # adding progress bar only at the first level
+    iterator = tqdm(soup.find_all("tr")) if first_level else soup.find_all("tr")
+    for item in iterator:
+        # all line sections
+        line_sections = list(item.find_all("td"))
+        # all folders of interest have an associated date,
+        # so both the name cell and the date cell must be present
+        if len(line_sections) < 3:
+            continue
+
+        folder_name_section = line_sections[1]
+        date_section = line_sections[2]
+        last_modified_date = parse_last_modified_date(date_section.text.strip())
+
+        for folder in folder_name_section.find_all("a"):
+            folder_url = f"{url}{folder.text}"
+            if folder.text.startswith(species):
+                d = {
+                    "date": last_modified_date,
+                    "url": folder_url,
+                    "name": folder.text.rstrip("/"),
+                }
+                species_url_records.append(d)
+            elif folder.text.endswith("_collection/"):
+                species_url_records += get_candidate_species_folders(
+                    species, folder_url, first_level=False
+                )
+
+    return species_url_records
+
+
+def get_main_folder_url(records: list[dict], species: str) -> str | None:
+    main_folder_url = None
+    for record in records:
+        if record["name"] == species:
+            main_folder_url = record["url"]
+            break
+    return main_folder_url
+
+
+def get_last_modified_folder_url(records: list[dict]) -> str:
+    df = pd.DataFrame.from_dict(records)
+    df.sort_values(by="date", ascending=False, inplace=True)
+    return df.iloc[0]["url"]
+
+
+def get_current_annotation_folder(records: list[dict], species: str) -> str:
+    main_folder_url = get_main_folder_url(records, species)
+    if main_folder_url is not None:
+        return main_folder_url
+
+    logger.info(
+        "Could not find a folder having the species as name. Checking for gca folders."
+    )
+    gca_records = [
+        record for record in records if record["name"].startswith(f"{species}_gca")
+    ]
+    if gca_records:
+        return get_last_modified_folder_url(gca_records)
+
+    logger.info(
+        "Could not find a gca folder either. Getting the last modified one."
+    )
+    return get_last_modified_folder_url(records)
+
+
+def parse_size(size_str):
+    """
+    Convert size strings like '902K', '4.1M', '5G' to bytes.
+
+    Parameters:
+    -----------
+    size_str : str
+        Size string with suffix (K, M, G, T, etc.)
+
+    Returns:
+    --------
+    int : size in bytes
+    """
+    size_str = size_str.strip().upper()
+
+    # Define multipliers
+    multipliers = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4, "P": 1024**5}
+
+    # Check if last character is a unit
+    if size_str[-1] in multipliers:
+        number = float(size_str[:-1])
+        multiplier = multipliers[size_str[-1]]
+        return int(number * multiplier)
+    else:
+        # No suffix, assume it's already in bytes
+        return int(float(size_str))
+
+
+def get_annotation_file(url: str) -> str:
+    soup = parse_page_data(url)
+    file_records = []
+
+    for item in soup.find_all("tr"):
+        # all line sections
+        line_sections = list(item.find_all("td"))
+        if len(line_sections) < 4:
+            continue
+
+        file = line_sections[1].text.strip()
+        if not file.endswith(".gff3.gz"):
+            continue
+
+        d = {
+            "file": file,
+            "date": parse_last_modified_date(line_sections[2].text.strip()),
+            "size": parse_size(line_sections[3].text.strip()),
+        }
+        file_records.append(d)
+
+    if not file_records:
+        raise ValueError("No annotation files found")
+
+    df = pd.DataFrame.from_dict(file_records)
+
+    # keeping the biggest annotation file(s)
+    max_size_df = df[df["size"] == df["size"].max()]
+    if len(max_size_df) == 1:
+        return max_size_df["file"].iloc[0]
+
+    # if multiple files have the same size, keeping the most recent one(s)
+    most_recent_df = max_size_df[max_size_df["date"] == max_size_df["date"].max()]
+    if len(most_recent_df) == 1:
+        return most_recent_df["file"].iloc[0]
+
+    # if there are still multiple candidates, removing the ones ending with
+    # 'chr.gff3.gz' (if any) and returning the first file left
+    if most_recent_df["file"].str.endswith("chr.gff3.gz").any():
+        most_recent_df = most_recent_df[~most_recent_df["file"].str.endswith("chr.gff3.gz")]
+    return most_recent_df["file"].iloc[0]
+
+
+##################################################################
+##################################################################
+# MAIN
+##################################################################
+##################################################################
+
+
+def main():
+    args = parse_args()
+
+    species = format_species_name_for_ensembl(args.species)
+    division_url = get_division_url(species)
+    logger.info(f"Searching for the right folder in {division_url}")
+
+    species_url_records = get_candidate_species_folders(species, division_url)
+    if not species_url_records:
+        raise ValueError(f"No species folder found for {species}")
+
+    annotation_folder_url = get_current_annotation_folder(species_url_records, species)
+    logger.info(f"Found current annotation folder: {annotation_folder_url}")
+
+    annotation_file = get_annotation_file(annotation_folder_url)
+
+    annotation_full_url = annotation_folder_url + annotation_file
+    logger.info(f"Found annotation URL: {annotation_full_url}.\nDownloading...")
+
+    download_file(annotation_full_url, annotation_file)
+    logger.info("Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/download_latest_ncbi_annotation.py b/bin/download_latest_ncbi_annotation.py
new file mode 100755
index 00000000..609a4e54
--- /dev/null
+++ b/bin/download_latest_ncbi_annotation.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+ +import argparse +import logging +import shutil +import sys +import zipfile +from pathlib import Path + +import requests +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) +logger = logging.getLogger(__name__) + +# Modern NCBI API +NCBI_DATASET_API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/" + +NCBI_TAXONOMY_ENDPOINT = "taxonomy" +NCBI_GENOME_DATASET_REPORT_BASE_ENDPOINT = "genome/taxon/{taxid}/dataset_report" +NCBI_DOWNLOAD_ENDPOINT = "genome/download" + + +NCBI_GENOME_DATASET_REPORT_API_PARAMS = { + "filters.has_annotation": True, + "page_size": 1000, +} +NCBI_API_HEADERS = {"accept": "application/json", "content-type": "application/json"} + +DOWNLOADED_FILENAME = "ncbi_dataset.zip" +ACCESSION_FILE = "accession.txt" + + +##################################################### +##################################################### +# PARSER +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get best assembly for a specific taxon ID" + ) + parser.add_argument("--species", type=str, required=True, help="Species name") + return parser.parse_args() + + +##################################################### +##################################################### +# REQUESTS +##################################################### +##################################################### + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_post_request_to_ncbi_dataset(endpoint: str, data: dict, params: dict = {}): + url = NCBI_DATASET_API_URL + endpoint + response = requests.post(url, headers=NCBI_API_HEADERS, json=data, params=params) + response.raise_for_status() + return response.json() + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_get_request_to_ncbi_dataset(endpoint: str, params: dict = {}): + url = NCBI_DATASET_API_URL + endpoint + response = requests.get(url, headers=NCBI_API_HEADERS, params=params) + response.raise_for_status() + return response.json() + + +##################################################### +##################################################### +# DATA HANDLING +##################################################### +##################################################### + + +def get_species_taxid(species: str) -> int: + data = {"taxons": [species]} + result = send_post_request_to_ncbi_dataset(NCBI_TAXONOMY_ENDPOINT, data) + + if len(result["taxonomy_nodes"]) > 1: + raise ValueError(f"Multiple taxids for species {species}") + metadata = result["taxonomy_nodes"][0] + + if "taxonomy" not in metadata: + logger.info(f"Could not find taxonomy results for species {species}") + if "errors" in metadata: + for error in metadata["errors"]: + logger.error(f"Error: {error['reason']}\n") + sys.exit(100) + return int(metadata["taxonomy"]["tax_id"]) + + +def get_assembly_reports(taxid: int): + result = send_get_request_to_ncbi_dataset( + endpoint=NCBI_GENOME_DATASET_REPORT_BASE_ENDPOINT.format(taxid=taxid), + params=NCBI_GENOME_DATASET_REPORT_API_PARAMS, + ) + return result.get("reports", []) + + +def get_assembly_with_best_stats(reports: list[dict]): + sorted_reports 
= sorted(
+        reports,
+        key=lambda x: (
+            int(x.get("assembly_stats", {}).get("total_sequence_length", 0)),
+            -int(x.get("assembly_stats", {}).get("total_number_of_chromosomes", 1e9)),
+        ),
+        reverse=True,
+    )
+    return sorted_reports[0]
+
+
+def get_current_assemblies(reports: list[dict]) -> dict | None:
+    current_assembly_reports = [
+        report
+        for report in reports
+        if report.get("assembly_info", {}).get("refseq_category") == "reference genome"
+    ]
+    if not current_assembly_reports:
+        return None
+
+    refseq_reports = [
+        report
+        for report in current_assembly_reports
+        if report.get("source_database") == "SOURCE_DATABASE_REFSEQ"
+    ]
+
+    if refseq_reports:
+        return refseq_reports[0]
+    else:
+        return None
+
+
+def get_reference_assembly(reports: list[dict]) -> dict:
+    best_assembly_report = get_current_assemblies(reports)
+    if best_assembly_report is not None:
+        return best_assembly_report
+    else:
+        return get_assembly_with_best_stats(reports)
+
+
+def format_species_name(species: str):
+    return species.replace("_", " ").lower()
+
+
+@retry(
+    stop=stop_after_delay(600),
+    wait=wait_exponential(multiplier=1, min=1, max=30),
+    before_sleep=before_sleep_log(logger, logging.WARNING),
+)
+def download_genome_annotation(genome_accession: str) -> None:
+    data = {"accessions": [genome_accession], "include_annotation_type": ["GENOME_GFF"]}
+    params = {"filename": DOWNLOADED_FILENAME}
+    # the download endpoint streams a zip archive (not JSON), so the raw
+    # response content is written to disk instead of going through the
+    # JSON request helpers above
+    url = NCBI_DATASET_API_URL + NCBI_DOWNLOAD_ENDPOINT
+    response = requests.post(url, json=data, params=params)
+    response.raise_for_status()
+    with open(DOWNLOADED_FILENAME, "wb") as f:
+        f.write(response.content)
+
+
+def extract_annotation_file_from_archive():
+    with zipfile.ZipFile(DOWNLOADED_FILENAME, "r") as zip_ref:
+        zip_ref.extractall()
+
+    valid_files = list(Path().cwd().glob(f"ncbi_dataset/data/{accession}/*.gff"))
+
+    if not valid_files:
+        raise ValueError(f"No annotation file found for accession {accession}")
+
+    if len(valid_files) > 1:
+        logger.warning(
+            f"Multiple annotation files found for accession {accession}. Taking the first one"
+        )
+
+    annotation_file = valid_files[0]
+    shutil.move(annotation_file, f"{accession}.gff")
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+if __name__ == "__main__":
+    args = parse_args()
+    species = format_species_name(args.species)
+
+    species_taxid = get_species_taxid(species)
+    logger.info(f"Species taxid: {species_taxid}")
+
+    logger.info(f"Getting best NCBI assembly for taxid: {species_taxid}")
+    reports = get_assembly_reports(species_taxid)
+
+    if not reports:
+        logger.error(f"No assembly reports found for taxid {species_taxid}")
+        sys.exit(100)
+
+    # looping while we can get an annotation file and there are reports left
+    annotation_found = False
+    while not annotation_found and reports:
+        best_assembly_report = get_reference_assembly(reports)
+        logger.info(
+            f"Best assembly: {best_assembly_report['accession']}. 
Trying to download annotation" + ) + accession = best_assembly_report["accession"] + try: + download_genome_annotation(accession) + extract_annotation_file_from_archive() + annotation_found = True + except Exception as e: + logger.error(f"Error downloading annotation for accession {accession}: {e}") + + if not annotation_found: + # Remove the best assembly report from the list of reports + reports = [report for report in reports if report["accession"] != accession] + + if not annotation_found: + logger.error(f"No annotation found for taxid {species_taxid}") + sys.exit(100) + + logger.info("Done") diff --git a/bin/filter_and_rename_genes.py b/bin/filter_and_rename_genes.py new file mode 100755 index 00000000..106f8013 --- /dev/null +++ b/bin/filter_and_rename_genes.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table, parse_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +RENAMED_FILE_SUFFIX = ".renamed.parquet" + +WARNING_REASON_FILE = "warning_reason.txt" +FAILURE_REASON_FILE = "failure_reason.txt" + +UNMAPPED_FILE_SUFFIX = "unmapped.txt" +NOT_VALID_FILE_SUFFIX = "not_valid.txt" +MERGED_FILE_SUFFIX = "merged.txt" +FINAL_FILE_SUFFIX = "final.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Rename gene IDs using mapped IDs") + parser.add_argument( + "--count-file", type=Path, required=True, help="Input file containing counts" + ) + parser.add_argument( + "--mappings", + type=Path, + dest="mapping_file", + help="Mapping file containing gene IDs", + ) + parser.add_argument( + "--valid-gene-ids", + type=Path, + dest="valid_gene_ids_file", + help="File containing valid gene IDs", + ) + return parser.parse_args() + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + logger.info(f"Converting IDs for count file {args.count_file.name}...") + + ############################################################# + # PARSING FILES + ############################################################# + + df = parse_count_table(args.count_file) + + if df.is_empty(): + msg = "COUNT FILE IS EMPTY" + logger.warning(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + ############################################################# + # GETTING MAPPINGS + ############################################################# + + mapping_df = parse_table(args.mapping_file) + mapping_dict = dict( + zip( + mapping_df[config.ORIGINAL_GENE_ID_COLNAME], + mapping_df[config.GENE_ID_COLNAME], + ) + ) + + ############################################################# + # MAPPING GENE IDS IN DATAFRAME + ############################################################# + + # IMPORTANT: KEEPING ONLY GENES THAT HAVE BEEN CONVERTED + # filtering the DataFrame to keep only the rows where the index can be mapped + original_nb_genes = len(df) + + rejected_df = df.filter(~pl.col(config.GENE_ID_COLNAME).is_in(mapping_dict.keys())) + 
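+    # rows whose gene ID has no entry in the mapping table are set aside here
+    # so that their number (and a few example IDs) can be reported below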
nb_unmapped_genes = len(rejected_df)
+
+    df = df.filter(pl.col(config.GENE_ID_COLNAME).is_in(mapping_dict.keys()))
+    nb_mapped_genes = len(df)
+
+    with open(UNMAPPED_FILE_SUFFIX, "w") as f:
+        f.write(str(nb_unmapped_genes))
+
+    if df.is_empty():
+        sample_size = min(5, nb_unmapped_genes)
+        example_rejected_genes = (
+            rejected_df[config.GENE_ID_COLNAME].head(sample_size).to_list()
+        )
+        msg = f"NO GENES WERE MAPPED. EXAMPLE OF GENE IDS: {example_rejected_genes}"
+        logger.error(msg)
+        with open(FAILURE_REASON_FILE, "w") as f:
+            f.write(msg)
+
+        with open(NOT_VALID_FILE_SUFFIX, "w") as f:
+            f.write("0")
+        with open(MERGED_FILE_SUFFIX, "w") as f:
+            f.write("0")
+        with open(FINAL_FILE_SUFFIX, "w") as f:
+            f.write("0")
+
+        sys.exit(0)
+
+    if len(df) < original_nb_genes:
+        sample_size = min(5, nb_unmapped_genes)
+        example_rejected_genes = (
+            rejected_df[config.GENE_ID_COLNAME].head(sample_size).to_list()
+        )
+        msg = (
+            f"{nb_mapped_genes / original_nb_genes:.2%} of genes were mapped ({nb_mapped_genes} out of {original_nb_genes}). "
+            + f"Example of unmapped genes: {example_rejected_genes}"
+        )
+        logger.warning(msg)
+        with open(WARNING_REASON_FILE, "a") as f:
+            f.write(msg)
+    else:
+        logger.info(
+            f"All genes were mapped ({nb_mapped_genes} out of {original_nb_genes})"
+        )
+
+    logger.info("Renaming gene names")
+    # renaming gene names to mapped ids using mapping dict
+    df = df.with_columns(
+        pl.col(config.GENE_ID_COLNAME)
+        .replace(mapping_dict)
+        .alias(config.GENE_ID_COLNAME)
+    )
+
+    #############################################################
+    # GETTING VALID GENE IDS
+    #############################################################
+
+    logger.info("Keeping only genes with sufficient occurrence over datasets")
+    nb_genes_before_validation = len(df)
+
+    with open(args.valid_gene_ids_file, "r") as fin:
+        valid_gene_ids = [line.strip() for line in fin.readlines()]
+
+    df = df.filter(pl.col(config.GENE_ID_COLNAME).is_in(valid_gene_ids))
+
+    nb_not_valid_genes = nb_genes_before_validation - len(df)
+    logger.info(
+        f"{nb_not_valid_genes} ({nb_not_valid_genes / nb_genes_before_validation:.2%}) genes were not valid"
+    )
+
+    with open(NOT_VALID_FILE_SUFFIX, "w") as f:
+        f.write(str(nb_not_valid_genes))
+
+    if df.is_empty():
+        msg = "NO GENES LEFT AFTER REMOVING RARE GENE IDS"
+        logger.error(msg)
+        with open(FAILURE_REASON_FILE, "w") as f:
+            f.write(msg)
+
+        with open(MERGED_FILE_SUFFIX, "w") as f:
+            f.write("0")
+        with open(FINAL_FILE_SUFFIX, "w") as f:
+            f.write("0")
+
+        sys.exit(0)
+
+    #############################################################
+    # GENE COUNT HANDLING
+    #############################################################
+
+    # handling cases where multiple genes have the same Gene ID
+    # since subsequent steps in the pipeline require integer values,
+    # we need to ensure that the resulting DataFrame has integer values
+
+    # TODO: check if there is another way to avoid duplicate gene names
+    # sometimes different gene names have the same Gene ID
+    # for now, we just take the max of the values, but this is not ideal
+    # we do not take the mean because if counts are integers, we want to keep them as integers
+
+    logger.info("Computing max counts for genes with duplicate IDs")
+    df = df.group_by(config.GENE_ID_COLNAME, maintain_order=True).agg(
+        pl.exclude(config.GENE_ID_COLNAME).max()
+    )
+
+    #############################################################
+    # WRITING OUTFILES
+    #############################################################
+
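+    # genes collapsed by the duplicate-ID aggregation above count as "merged":
+    # the difference between mapped genes and remaining rows gives their number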
nb_merged = nb_mapped_genes - len(df) + with open(MERGED_FILE_SUFFIX, "w") as f: + f.write(str(nb_merged)) + with open(FINAL_FILE_SUFFIX, "w") as f: + f.write(str(len(df))) + + logger.info("Writing output file") + outfilename = args.count_file.with_suffix(RENAMED_FILE_SUFFIX).name + df.write_parquet(outfilename) + + +if __name__ == "__main__": + main() diff --git a/bin/get_candidate_genes.py b/bin/get_candidate_genes.py new file mode 100755 index 00000000..c41177cf --- /dev/null +++ b/bin/get_candidate_genes.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +CANDIDATE_COUNTS_OUTFILENAME = "candidate_counts.parquet" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get statistics from count data for each gene" + ) + parser.add_argument( + "--counts", + type=Path, + dest="count_file", + required=True, + help="File containing counts for all genes", + ) + parser.add_argument( + "--stats", + type=Path, + dest="stat_file", + required=True, + help="File containing statistics of expression over all datasets", + ) + parser.add_argument( + "--candidate_selection_descriptor", + type=str, + dest="candidate_selection_descriptor", + required=True, + help="Statistical descriptor for gene candidate selection.", + ) + parser.add_argument( + "--nb-top-stable-genes", + type=int, + dest="nb_most_stable_genes", + required=True, + help="Number of top stable genes to show", + ) + parser.add_argument( + "--min-pct-quantile-expr-level", + type=float, + dest="min_pct_quantile_expr_level", + required=True, + help="Minimum percentage of quantile expression level", + ) + return parser.parse_args() + + +def parse_stats(file: Path) -> pl.DataFrame: + return pl.read_csv(file).select( + pl.col(config.GENE_ID_COLNAME).cast(pl.String()), + pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()), + ) + + +def get_best_candidates( + stat_df: pl.DataFrame, + candidate_selection_descriptor: str, + nb_most_stable_genes: int, +) -> list[str]: + logger.info("Getting best candidates") + column_for_sorting = config.SCORING_BASE_TO_STABILITY_SCORE_COLUMN[ + candidate_selection_descriptor + ] + return ( + stat_df.sort(column_for_sorting, descending=False, nulls_last=True) + .head(nb_most_stable_genes) + .select(config.GENE_ID_COLNAME) + .to_series() + .to_list() + ) + + +""" +def filter_out_genes_with_zero_counts(stat_lf: pl.LazyFrame) -> pl.LazyFrame: + # keep only genes that show no zero count (ie. 
count > 0 for all samples) + return stat_lf.filter(pl.col(config.RATIO_ZEROS_COLNAME) == 0) +""" + + +def filter_out_low_expression_genes( + stat_df: pl.DataFrame, min_pct_quantile_expr_level: float +) -> pl.DataFrame: + logger.info("Filtering out low expression genes") + max_quantile = ( + stat_df.select(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME).max().item() + ) + return stat_df.filter( + pl.col(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME) + >= max_quantile * min_pct_quantile_expr_level + ) + + +def get_counts_for_candidates(file: Path, best_candidates: list[str]) -> pl.DataFrame: + logger.info("Getting counts for candidate genes") + return pl.read_parquet(file).filter( + pl.col(config.GENE_ID_COLNAME).is_in(best_candidates) + ) + + +def export_data(filtered_count_df: pl.DataFrame): + """Export gene expression data to CSV files.""" + logger.info( + f"Exporting counts for candidate genes to: {CANDIDATE_COUNTS_OUTFILENAME}" + ) + filtered_count_df.write_parquet(CANDIDATE_COUNTS_OUTFILENAME) + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + stat_df = parse_stats(args.stat_file) + + # first basic filters + stat_df = filter_out_low_expression_genes(stat_df, args.min_pct_quantile_expr_level) + # stat_lf = filter_out_genes_with_zero_counts(stat_lf) + + # get base candidate genes based on the chosen statistical descriptor (cv, rcvm) + best_candidates = get_best_candidates( + stat_df, args.candidate_selection_descriptor, args.nb_most_stable_genes + ) + + # get counts for candidate genes + candidate_gene_count_lf = get_counts_for_candidates( + args.count_file, best_candidates + ) + + export_data(candidate_gene_count_lf) + + +if __name__ == "__main__": + main() diff --git a/bin/get_dataset_statistics.py b/bin/get_dataset_statistics.py deleted file mode 100755 index 9c772442..00000000 --- a/bin/get_dataset_statistics.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -# Written by Olivier Coen. Released under the MIT license. 
- -import argparse -from pathlib import Path -from scipy import stats -import pandas as pd -import logging - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -QUANT_NORM_SUFFIX = ".quant_norm.parquet" -DATASET_STATISTICS_SUFFIX = ".dataset_stats.csv" - -ENSEMBL_GENE_ID_COLNAME = "ensembl_gene_id" -SAMPLE_COLNAME = "sample" -KS_TEST_COLNAME = "kolmogorov_smirnov_to_uniform_dist_pvalue" - - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Compute general statistics from count data for each sample" - ) - parser.add_argument( - "--counts", type=Path, dest="count_file", required=True, help="Count file" - ) - return parser.parse_args() - - -def compute_kolmogorov_smirnov_test_to_uniform_distribution(count_df: pd.DataFrame): - """Compute Kolmogorov-Smirnov test to uniform distribution.""" - ks_tests = pd.Series(index=count_df.columns) - for col in count_df.columns: - ks = stats.ks_1samp(count_df[col], stats.uniform.cdf, nan_policy="omit") - ks_tests[col] = ks.pvalue - return ks_tests - - -def compute_dataset_statistics(count_df: pd.DataFrame): - dataset_stats_df = count_df.describe() - dataset_stats_df.loc["skewness"] = count_df.skew() - # for each sample, test distance to uniform distribution - ks_tests = compute_kolmogorov_smirnov_test_to_uniform_distribution(count_df) - dataset_stats_df.loc[KS_TEST_COLNAME] = ks_tests - return dataset_stats_df.T - - -def export_count_data(dataset_stats_df: pd.DataFrame, count_file: Path): - """Export dataset statistics to CSV files.""" - outfilename = count_file.name.replace(QUANT_NORM_SUFFIX, DATASET_STATISTICS_SUFFIX) - logger.info(f"Exporting dataset statistics counts to: {outfilename}") - dataset_stats_df.index.name = SAMPLE_COLNAME - dataset_stats_df.to_csv(outfilename, index=True, header=True) - - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - - -def main(): - args = parse_args() - count_file = args.count_file - - logger.info(f"Computing dataset statistics for {count_file.name}") - count_df = pd.read_parquet(count_file) - count_df.set_index(ENSEMBL_GENE_ID_COLNAME, inplace=True) - - dataset_stats_df = compute_dataset_statistics(count_df) - - export_count_data(dataset_stats_df, count_file) - - -if __name__ == "__main__": - main() diff --git a/bin/get_eatlas_accessions.py b/bin/get_eatlas_accessions.py index 06347785..b2aa2f53 100755 --- a/bin/get_eatlas_accessions.py +++ b/bin/get_eatlas_accessions.py @@ -3,50 +3,38 @@ # Written by Olivier Coen. Released under the MIT license. 
import argparse +import logging +import random +from functools import partial +from multiprocessing import Pool + +import pandas as pd import requests +import yaml +from natural_language_utils import keywords_in_fields from tenacity import ( + before_sleep_log, retry, - retry_if_exception_type, stop_after_delay, wait_exponential, - before_sleep_log, ) -import json -from functools import partial -from multiprocessing import Pool -import nltk -from nltk.corpus import wordnet -import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +ALLOWED_PLATFORMS = ["rnaseq", "microarray"] +# accessions that should not be fetched automatically: +# - E-GTEX-8 contains 17350 samples (way too big) +EXCLUDED_ACCESSION_PATTERNS = ["E-GTEX-"] + ALL_EXP_URL = "https://www.ebi.ac.uk/gxa/json/experiments/" ACCESSION_OUTFILE_NAME = "accessions.txt" -FILTERED_EXPERIMENTS_OUTFILE_NAME = "filtered_experiments.json" - -################################################################## -################################################################## -# NLTK MODELS AND OBJECTS -################################################################## -################################################################## - -nltk.download("punkt_tab") -nltk.download("averaged_perceptron_tagger_eng") -nltk.download("wordnet") - -lemmatizer = nltk.WordNetLemmatizer() -stemmer = nltk.PorterStemmer() - -################################################################## -################################################################## -# EXCEPTIONS -################################################################## -################################################################## +# ALL_EXPERIMENTS_METADATA_OUTFILE_NAME = "all_experiments.metadata.tsv" +SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME = "species_experiments.metadata.tsv" +SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME = "selected_experiments.metadata.tsv" +FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME = "filtered_experiments.keywords.yaml" - -class ExpressionAtlasNothingFoundError(Exception): - pass +SAMPLING_QUOTA_OUTFILE = "sampling_quota.txt" ################################################################## @@ -58,144 +46,49 @@ class ExpressionAtlasNothingFoundError(Exception): def parse_args(): parser = argparse.ArgumentParser("Get expression atlas accessions") - parser.add_argument("--species", type=str, help="Species to convert IDs for") + parser.add_argument( + "--species", + type=str, + required=True, + help="Search Expression Atlas for this specific species", + ) + parser.add_argument( + "--cpus", + dest="nb_cpus", + type=int, + required=True, + help="Number of CPUs to use", + ) parser.add_argument( "--keywords", type=str, nargs="*", help="Keywords to search for in experiment description", ) + parser.add_argument( + "--platform", type=str, help="Platform type", choices=ALLOWED_PLATFORMS + ) + parser.add_argument( + "--random-sampling-size", + dest="random_sampling_size", + type=int, + help="Random sampling size", + ) + parser.add_argument( + "--random-sampling-seed", + dest="random_sampling_seed", + type=int, + help="Random sampling seed", + ) return parser.parse_args() -def get_wordnet_pos(token: str): - tag = nltk.pos_tag([token])[0][1][0].upper() - tag_dict = { - "J": wordnet.ADJ, - "N": wordnet.NOUN, - "V": wordnet.VERB, - "R": wordnet.ADV, - } - return tag_dict.get(tag, wordnet.NOUN) # Default to NOUN if not found - - -def get_stemmed_tokens(sentence: str): - """ - Tokenize a sentence into its constituent words, and then 
stem each word - - Parameters - ---------- - sentence : str - The sentence to be tokenized and stemmed - - Returns - ------- - tokens : List[str] - The list of stemmed tokens - """ - - tokens = nltk.word_tokenize(sentence) - return [stemmer.stem(token) for token in tokens] - - -def get_lemmed_tokens(sentence: str): - """ - Tokenize a sentence into its constituent words, and then lemmatize each word - - Parameters - ---------- - sentence : str - The sentence to be tokenized and lemmatized - - Returns - ------- - tokens : List[str] - The list of lemmatized tokens - """ - tokens = nltk.word_tokenize(sentence) - return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens] - - -def get_synonyms(word): - """ - Get all synonyms of a word from the wordnet database. - - Parameters - ---------- - word : str - The word for which to get synonyms - - Returns - ------- - synonyms : set - A set of all synonyms of the word - """ - synonyms = [] - for syn in wordnet.synsets(word): - for lemma in syn.lemmas(): - synonyms.append(lemma.name()) # Get the name of each lemma (synonym) - return set(synonyms) # Return as a set to avoid duplicates - - -def get_all_candidate_target_words(sentence: str): - """ - Get all candidate target words from a sentence by stemming and lemmatizing the - tokens and getting synonyms from the wordnet database. - - Parameters - ---------- - sentence : str - The sentence from which to get candidate target words - - Returns - ------- - candidates : list - A list of all candidate target words - """ - candidates = [] - lemmatized_tokens = get_stemmed_tokens(sentence) - stemmed_tokens = get_stemmed_tokens(sentence) - tokens = list(set(lemmatized_tokens + stemmed_tokens)) - for token in tokens: - candidates += get_synonyms(token) - return candidates - - -def word_in_sentence(word: str, sentence: str): - """ - Checks if a word (or a stemmed version of it) is in a sentence, or if it is a - subword of a stemmed version of any word in the sentence. 
- - Parameters - ---------- - word : str - The word to be searched for - sentence : str - The sentence in which to search for the word - - Returns - ------- - bool - True if the word is found in the sentence, False otherwise - """ - for stemmed_word in [word] + get_stemmed_tokens(word): - # testing if stemmed word is in sentence as it is - if stemmed_word in sentence: - return True - # or testing if stemmed word is a subword of a stemmed word from the sentence - for target_word in get_all_candidate_target_words(sentence): - if stemmed_word in target_word: - return True - return False - - @retry( - retry=retry_if_exception_type(ExpressionAtlasNothingFoundError), stop=stop_after_delay(600), wait=wait_exponential(multiplier=1, min=1, max=30), before_sleep=before_sleep_log(logger, logging.WARNING), ) -def get_data(url: str): +def get_data(url: str) -> dict: """ Queries a URL and returns the data as a JSON object @@ -215,12 +108,8 @@ def get_data(url: str): If the query fails """ response = requests.get(url) - if response.status_code == 200: - return response.json() - elif response.status_code == 500: - raise ExpressionAtlasNothingFoundError - else: - raise RuntimeError(f"Failed to retrieve data: {response.status_code}") + response.raise_for_status() + return response.json() def get_experiment_description(exp_dict: dict): @@ -253,7 +142,7 @@ def get_experiment_description(exp_dict: dict): raise KeyError(f"Could not find description field in {exp_dict}") -def get_experiment_accesssion(exp_dict: dict): +def get_experiment_accession(exp_dict: dict): """ Gets the accession from an experiment dictionary @@ -314,14 +203,72 @@ def get_properties_values(exp_dict: dict): return list(set(values)) -def get_species_experiments( - species: str, -): +def get_eatlas_experiments(): + """ + Gets all experiments from Expression Atlas + + Parameters + ---------- + + Returns + ------- + experiments : list + A list of experiment dictionaries + """ + data = get_data(ALL_EXP_URL) + return data["experiments"] + + +def filter_by_platform(experiments: list[dict], platform: str | None): """ - Gets all experiments for a given species + Gets all experiments for a given platform from Expression Atlas + Possible platforms in Expression Atlas are 'rnaseq', 'microarray', 'proteomics' Parameters ---------- + experiments: list[str] + platform : str + Name of platform. Example: "rnaseq" + + Returns + ------- + experiments : list + A list of experiment dictionaries + """ + platform_experiments = [] + for exp_dict in experiments: + if technology_type := exp_dict.get("technologyType"): + parsed_technology_type = ( + technology_type[0] + if isinstance(technology_type, list) + else technology_type + ) + # parsed_platform is in ["rnaseq", "microarray", "proteomics", ...] + parsed_platform = ( + parsed_technology_type.lower().split(" ")[0].replace("-", "") + ) + + if platform is not None: + if parsed_platform == platform: + platform_experiments.append(exp_dict) + else: + if parsed_platform in ALLOWED_PLATFORMS: + platform_experiments.append(exp_dict) + + else: + logger.warning( + f"Technology type not found for experiment {exp_dict['accession']}" + ) + return platform_experiments + + +def get_species_experiments(experiments: list[dict], species: str): + """ + Gets all experiments for a given species from Expression Atlas + + Parameters + ---------- + experiments: list[str] species : str Name of species. 
Example: "Arabidopsis thaliana" @@ -330,12 +277,11 @@ def get_species_experiments( experiments : list A list of experiment dictionaries """ - data = get_data(ALL_EXP_URL) - experiments = [] - for exp_dict in data["experiments"]: + species_experiments = [] + for exp_dict in experiments: if exp_dict["species"] == species: - experiments.append(exp_dict) - return experiments + species_experiments.append(exp_dict) + return species_experiments def get_experiment_data(exp_dict: dict): @@ -356,9 +302,23 @@ def get_experiment_data(exp_dict: dict): return get_data(exp_url) +def filter_out_excluded_accessions(experiments: list[dict]) -> list[dict]: + valid_experiments = [] + for exp_dict in experiments: + for accession_pattern in EXCLUDED_ACCESSION_PATTERNS: + if exp_dict["experimentAccession"].startswith(accession_pattern): + logger.warning( + f"Skipping experiment {exp_dict['experimentAccession']} due to exclusion pattern" + ) + break + else: + valid_experiments.append(exp_dict) + return valid_experiments + + def parse_experiment(exp_dict: dict): # getting accession and description - accession = get_experiment_accesssion(exp_dict) + accession = get_experiment_accession(exp_dict) description = get_experiment_description(exp_dict) # getting properties of this experiment exp_data = get_experiment_data(exp_dict) @@ -371,18 +331,9 @@ def parse_experiment(exp_dict: dict): } -def keywords_in_experiment(fields: list[str], keywords: list[str]): - return [ - keyword - for keyword in keywords - for field in fields - if word_in_sentence(keyword, field) - ] - - -def filter_experiment(exp_dict: dict, keywords: list[str]): +def filter_experiment_with_keywords(exp_dict: dict, keywords: list[str]) -> dict | None: all_searchable_fields = [exp_dict["description"]] + exp_dict["properties"] - found_keywords = keywords_in_experiment(all_searchable_fields, keywords) + found_keywords = keywords_in_fields(all_searchable_fields, keywords) # only returning experiments if found keywords if found_keywords: exp_dict["found_keywords"] = list(set(found_keywords)) @@ -391,7 +342,60 @@ def filter_experiment(exp_dict: dict, keywords: list[str]): return None -def format_species_name(species: str): +def get_metadata_for_selected_experiments( + experiments: list[dict], results: list[dict] +) -> list[dict]: + filtered_accessions = [result_dict["accession"] for result_dict in results] + return [ + exp_dict + for exp_dict in experiments + if get_experiment_accession(exp_dict) in filtered_accessions + ] + + +def sample_experiments_randomly( + experiments: list[dict], sampling_size: int, seed: int +) -> tuple[list[str], bool]: + random.seed(seed) + sampled_experiments = [] + + total_nb_samples = 0 + sampling_quota_reached = False + experiments_left = list(experiments) + while experiments_left: + # if the min number of samples is greater than the remaining space left, we get out of the loop + experiments_left_nb_samples = [exp["nb_samples"] for exp in experiments_left] + min_nb_samples = min(experiments_left_nb_samples) + if min_nb_samples > sampling_size - total_nb_samples: + sampling_quota_reached = True + logger.warning("Sampling quota reached") + break + + experiment = None + test_total_nb_samples = int(total_nb_samples) + experiments_not_tested = list(experiments_left) + while experiments_not_tested: + experiment = random.choice(experiments_not_tested) + experiments_not_tested.remove(experiment) + # if we do not exceed the sampling size with this experiment + # we keep it + test_total_nb_samples = total_nb_samples + 
experiment["nb_samples"] + if test_total_nb_samples <= sampling_size: + break + + # this should not happen but we keep it for safety + if experiment is None: + logger.error("No experiment found") + continue + + total_nb_samples = test_total_nb_samples + experiments_left.remove(experiment) + sampled_experiments.append(experiment) + + return [exp["accession"] for exp in sampled_experiments], sampling_quota_reached + + +def format_species_name(species: str) -> str: return species.replace("_", " ").capitalize().strip() @@ -405,39 +409,121 @@ def format_species_name(species: str): def main(): args = parse_args() + results = None + selected_accessions = [] + selected_experiments = [] + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING EXPRESSION ATLAS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Getting arguments species_name = format_species_name(args.species) keywords = args.keywords logger.info(f"Getting experiments corresponding to species {species_name}") - experiments = get_species_experiments(species_name) - logger.info(f"Found {len(experiments)} experiments") + experiments = get_eatlas_experiments() + + logger.info("Filtering on species name") + experiments = get_species_experiments(experiments, species_name) + logger.info(f"Found {len(experiments)} experiments for species {species_name}") + + logger.info("Filtering experiments based on platform") + experiments = filter_by_platform(experiments, args.platform) + + logger.info("Filtering out excluded accessions") + experiments = filter_out_excluded_accessions(experiments) logger.info("Parsing experiments") - with Pool() as pool: + with Pool(processes=args.nb_cpus) as pool: results = pool.map(parse_experiment, experiments) if keywords: logger.info(f"Filtering experiments with keywords {keywords}") - func = partial(filter_experiment, keywords=keywords) - with Pool() as pool: + func = partial(filter_experiment_with_keywords, keywords=keywords) + with Pool(processes=args.nb_cpus) as pool: results = [res for res in pool.map(func, results) if res is not None] + logger.info( + f"Found {len(results)} experiments corresponding to keywords {keywords}" + ) - if results: - logger.info(f"Kept {len(results)} experiments") - else: - raise RuntimeError( - f"Could not find experiments for species {args.species} and keywords {args.keywords}" - ) - + # getting accessions of selected experiments selected_accessions = [exp_dict["accession"] for exp_dict in results] + + if args.random_sampling_size and args.random_sampling_seed: + selected_accession_to_nb_samples = [ + { + "accession": exp_dict["experimentAccession"], + "nb_samples": exp_dict["numberOfAssays"], + } + for exp_dict in experiments + if exp_dict["experimentAccession"] in selected_accessions + ] + + nb_samples_df = pd.DataFrame.from_dict(selected_accession_to_nb_samples) + nb_samples_df.to_csv("selected_accession_to_nb_samples.csv", index=False) + + logger.info("Sampling experiments randomly") + selected_accessions, sampling_quota_reached = sample_experiments_randomly( + selected_accession_to_nb_samples, + args.random_sampling_size, + args.random_sampling_seed, + ) + logger.info( + f"Kept {len(selected_accessions)} experiments after random sampling" + ) + + # writing status to file + # so that the wrapper module can get the status + with open(SAMPLING_QUOTA_OUTFILE, "w") as fout: + sampling_status = "full" if sampling_quota_reached else "ok" + fout.write(sampling_status) + + # keeping metadata only for selected experiments + selected_experiments = 
get_metadata_for_selected_experiments(experiments, results) + + if not selected_accessions: + logger.warning( + f"Could not find experiments for species {species_name} and keywords {keywords}" + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING DATA + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # exporting list of accessions logger.info(f"Writing accessions to {ACCESSION_OUTFILE_NAME}") with open(ACCESSION_OUTFILE_NAME, "w") as fout: fout.writelines([f"{acc}\n" for acc in selected_accessions]) - logger.info(f"Writing filtered experiments to {FILTERED_EXPERIMENTS_OUTFILE_NAME}") - with open(FILTERED_EXPERIMENTS_OUTFILE_NAME, "w") as fout: - json.dump(results, fout) + # exporting metadata + logger.info( + f"Writing metadata of all experiments for species {species_name} to {SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME}" + ) + df = pd.DataFrame.from_dict(experiments) + df.to_csv( + SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME, sep="\t", index=False, header=True + ) + + if selected_experiments: + logger.info( + f"Writing metadata of filtered experiments to {SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME}" + ) + df = pd.DataFrame.from_dict(selected_experiments) + df.to_csv( + SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME, + sep="\t", + index=False, + header=True, + ) + + if results: + # exporting list of selected experiments with their keywords + logger.info( + f"Writing filtered experiments with keywords to {FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME}" + ) + with open(FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME, "w") as fout: + yaml.dump(results, fout) if __name__ == "__main__": diff --git a/bin/get_gene_statistics.py b/bin/get_gene_statistics.py deleted file mode 100755 index eb24cf20..00000000 --- a/bin/get_gene_statistics.py +++ /dev/null @@ -1,554 +0,0 @@ -#!/usr/bin/env python3 - -# Written by Olivier Coen. Released under the MIT license. 
- -import argparse -import polars as pl -from pathlib import Path -from dataclasses import dataclass, field -import logging - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# nb of top stable genes to select and to display at the end -DEFAULT_NB_TOP_STABLE_GENES = 1000 -# we want to select samples that show a particularly low nb of genes -MIN_RATIO_GENE_COUNT_TO_MEAN = 0.75 # experimentally chosen -WEIGHT_RATIO_NB_NULLS = 1 - - -# outfile names -TOP_STABLE_GENE_SUMMARY_OUTFILENAME = "top_stable_genes_summary.csv" -ALL_GENES_RESULT_OUTFILENAME = "stats_all_genes.csv" -ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME = "all_counts_filtered.parquet" -TOP_STABLE_GENES_COUNTS_OUTFILENAME = "top_stable_genes_transposed_counts_filtered.csv" - -# column names -RANK_COLNAME = "Rank" -ORIGINAL_GENE_ID_COLNAME = "original_gene_id" -ORIGINAL_GENE_IDS_COLNAME = "original_gene_ids" -ENSEMBL_GENE_ID_COLNAME = "ensembl_gene_id" -GENE_NAME_COLNAME = "name" -GENE_DESCRIPTION_COLNAME = "description" -VARIATION_COEFFICIENT_COLNAME = "variation_coefficient" -STANDARD_DEVIATION_COLNAME = "standard_deviation" -MEAN_COLNAME = "mean" -EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME = "expression_level_quantile_interval" -EXPRESSION_LEVEL_STATUS_COLNAME = "expression_level_status" -GENE_COUNT_COLNAME = "count" -SAMPLE_COLNAME = "sample" -NB_NULLS_COLNAME = "total_nb_nulls" -NB_NULLS_VALID_SAMPLES_COLNAME = "nb_nulls_valid_samples" -NB_ZEROS_COLNAME = "nb_zeros" -STABILITY_SCORE_COLNAME = "stability_score" -KS_TEST_COLNAME = "kolmogorov_smirnov_to_uniform_dist_pvalue" - -STATISTICS_COLS = [ - RANK_COLNAME, - ENSEMBL_GENE_ID_COLNAME, - STABILITY_SCORE_COLNAME, - STANDARD_DEVIATION_COLNAME, - VARIATION_COEFFICIENT_COLNAME, - MEAN_COLNAME, - EXPRESSION_LEVEL_STATUS_COLNAME, - NB_NULLS_COLNAME, - NB_NULLS_VALID_SAMPLES_COLNAME, - GENE_NAME_COLNAME, - GENE_DESCRIPTION_COLNAME, - ORIGINAL_GENE_IDS_COLNAME, -] - -ALL_GENES_STATS_COLS = [ - ENSEMBL_GENE_ID_COLNAME, - STABILITY_SCORE_COLNAME, - MEAN_COLNAME, - STANDARD_DEVIATION_COLNAME, - VARIATION_COEFFICIENT_COLNAME, -] - -# quantile intervals -NB_QUANTILES = 100 - -NB_TOP_GENES_TO_SHOW_IN_LOG_COUNTS = 100 - - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Get statistics from count data for each gene" - ) - parser.add_argument( - "--counts", type=Path, dest="count_file", required=True, help="Count file" - ) - parser.add_argument( - "--metadata", - type=str, - dest="metadata_files", - required=True, - help="Metadata file", - ) - parser.add_argument( - "--mappings", type=str, dest="mapping_files", required=True, help="Mapping file" - ) - parser.add_argument( - "--nb-top-stable-genes", - type=int, - dest="nb_top_stable_genes", - required=True, - help="Number of top stable genes to show", - ) - parser.add_argument( - "--ks-stats", - type=Path, - dest="ks_stats_file", - required=True, - help="KS stats file", - ) - parser.add_argument( - "--ks-pvalue-threshold", - type=str, - dest="ks_pvalue_threshold", - required=True, - help="KS p-value threshold", - ) - return parser.parse_args() - - -def is_valid_lf(lf: pl.LazyFrame, file: Path) -> bool: - """Check if a LazyFrame is valid. - - A LazyFrame is considered valid if it contains at least one row. 
- """ - try: - return not lf.limit(1).collect().is_empty() - except FileNotFoundError: - # strangely enough we get this error for some files existing but empty - logger.error(f"Could not find file {str(file)}") - return False - except pl.exceptions.NoDataError as err: - logger.error(f"File {str(file)} is empty: {err}") - return False - - -def get_valid_lazy_lfs(files: list[Path]) -> list[pl.LazyFrame]: - """Get a list of valid LazyFrames from a list of files. - - A LazyFrame is considered valid if it contains at least one row. - """ - lf_dict = {file: pl.scan_csv(file) for file in files} - return [lf for file, lf in lf_dict.items() if is_valid_lf(lf, file)] - - -def cast_cols_to_string(lf: pl.LazyFrame) -> pl.LazyFrame: - return lf.select( - [pl.col(column).cast(pl.String) for column in lf.collect_schema().names()] - ) - - -def concat_cast_to_string_and_drop_duplicates(files: list[Path]) -> pl.LazyFrame: - """Concatenate LazyFrames, cast all columns to String, and drop duplicates. - - The first step is to concatenate the LazyFrames. Then, the dataframe is cast - to String to ensure that all columns have the same data type. Finally, duplicate - rows are dropped. - """ - lfs = get_valid_lazy_lfs(files) - lfs = [cast_cols_to_string(lf) for lf in lfs] - concat_lf = pl.concat(lfs) - # dropping duplicates - # casting all columns to String - return concat_lf.unique() - - -def get_count_columns(lf: pl.LazyFrame) -> list[str]: - """Get all column names except the ENSEMBL_GENE_ID_COLNAME column. - - The ENSEMBL_GENE_ID_COLNAME column contains only gene IDs. - """ - return lf.select(pl.exclude(ENSEMBL_GENE_ID_COLNAME)).collect_schema().names() - - -def cast_count_columns_to_float32(lf: pl.LazyFrame) -> pl.LazyFrame: - return lf.select( - [pl.col(ENSEMBL_GENE_ID_COLNAME)] - + [pl.col(column).cast(pl.Float32) for column in get_count_columns(lf)] - ) - - -def get_counts( - file: Path, ks_stats_file: Path, ks_pvalue_threshold: str -) -> pl.LazyFrame: - # sorting dataframe (necessary to get consistent output) - count_lf = pl.scan_parquet(file).sort(ENSEMBL_GENE_ID_COLNAME, descending=False) - ks_stats_df = pl.read_csv( - ks_stats_file, has_header=False, new_columns=[SAMPLE_COLNAME, KS_TEST_COLNAME] - ) - - # parsing threshold - try: - ks_pvalue_threshold = float(ks_pvalue_threshold) - except ValueError: - raise ValueError( - f"KS p-value threshold {ks_pvalue_threshold} could not be cast to float" - ) - - # logging number of samples excluded from analysis - not_valid_samples = ks_stats_df.filter( - ks_stats_df[KS_TEST_COLNAME] <= ks_pvalue_threshold - )[SAMPLE_COLNAME].to_list() - logger.warning( - f"Excluded {len(not_valid_samples)} samples showing a KS p-value below {ks_pvalue_threshold}" - ) - - # getting samples for which the Kolmogorov-Smirnov test pvalue is above the threshold - valid_samples = ks_stats_df.filter( - ks_stats_df[KS_TEST_COLNAME] > ks_pvalue_threshold - )[SAMPLE_COLNAME].to_list() - # filtering the count dataframe to keep only the valid samples - return count_lf.select([ENSEMBL_GENE_ID_COLNAME] + valid_samples) - - -def get_metadata(metadata_files: list[Path]) -> pl.LazyFrame: - """Retrieve and concatenate metadata from a list of metadata files.""" - return concat_cast_to_string_and_drop_duplicates(metadata_files) - - -def get_mappings(mapping_files: list[Path]) -> pl.LazyFrame: - concat_lf = concat_cast_to_string_and_drop_duplicates(mapping_files) - # group by new gene IDs and gets the lis - """Group by new gene IDs, get the list of distinct original gene IDs and convert to a string 
representation.""" - # t of distinct original gene IDs for each group - # convert the list column to a string representation - # separate the original gene IDs with a semicolon - return concat_lf.group_by(ENSEMBL_GENE_ID_COLNAME).agg( - pl.col(ORIGINAL_GENE_ID_COLNAME) - .unique() - .sort() - .str.join(";") - .alias(ORIGINAL_GENE_IDS_COLNAME) - ) - - -def merge_data( - stat_lf: pl.LazyFrame, metadata_lf: pl.LazyFrame, mapping_lf: pl.LazyFrame -) -> pl.LazyFrame: - """Merge the statistics dataframe with the metadata dataframe and the mapping dataframe.""" - # we need to ensure that the index of stat_lf are strings - return stat_lf.join(metadata_lf, on=ENSEMBL_GENE_ID_COLNAME, how="left").join( - mapping_lf, on=ENSEMBL_GENE_ID_COLNAME, how="left" - ) - - -def sort_dataframe(lf: pl.LazyFrame) -> pl.LazyFrame: - return ( - lf.sort(STABILITY_SCORE_COLNAME, descending=False, nulls_last=True) - .with_row_index(name="index") - .with_columns((pl.col("index") + 1).alias("Rank")) - .drop("index") - ) - - -def get_status(quantile_interval: int) -> str: - """Return the expression level status of the gene given its quantile interval.""" - if NB_QUANTILES - 5 <= quantile_interval: - return "Very high expression" - elif NB_QUANTILES - 10 <= quantile_interval < NB_QUANTILES - 5: - return "High expression" - elif 4 < quantile_interval <= 9: - return "Low expression" - elif quantile_interval <= 4: - return "Very low expression" - else: - return "Medium range" - - -def get_top_stable_gene_summary( - stat_lf: pl.LazyFrame, nb_top_stable_genes: int -) -> pl.LazyFrame: - """ - Extract the most stable genes from the statistics dataframe. - """ - logger.info("Getting most stable genes per quantile interval") - mapping_dict = { - quantile_interval: get_status(quantile_interval) - for quantile_interval in range(NB_QUANTILES) - } - lf = stat_lf.head(nb_top_stable_genes).with_columns( - pl.col(EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME) - .replace_strict(mapping_dict) - .alias(EXPRESSION_LEVEL_STATUS_COLNAME) - ) - return lf.select( - [column for column in STATISTICS_COLS if column in lf.collect_schema().names()] - ) - - -def format_all_genes_statistics(stat_lf: pl.LazyFrame) -> pl.LazyFrame: - """ - Format the dataframe containing statistics for all genes by selecting the right columns - and sorting the dataframe by gene ID. 
- """ - return stat_lf.select( - [ - column - for column in ALL_GENES_STATS_COLS - if column in stat_lf.collect_schema().names() - ] - ).sort(STABILITY_SCORE_COLNAME, descending=False) - - -def get_top_stable_genes_counts( - log_count_lf: pl.LazyFrame, top_stable_genes_summary_lf: pl.LazyFrame -) -> pl.DataFrame: - # getting list of top stable genes in the order - sorted_stable_genes = ( - top_stable_genes_summary_lf.head(NB_TOP_GENES_TO_SHOW_IN_LOG_COUNTS) - .select(ENSEMBL_GENE_ID_COLNAME) - .collect() - .to_series() - .to_list() - ) - mapping_dict = {item: index for index, item in enumerate(sorted_stable_genes)} - - # extracting log counts of top stable genes - sorted_transposed_counts_df = ( - log_count_lf.filter(pl.col(ENSEMBL_GENE_ID_COLNAME).is_in(sorted_stable_genes)) - .with_columns( - pl.col(ENSEMBL_GENE_ID_COLNAME) - .replace_strict(mapping_dict) - .alias("sort_order") - ) - .sort("sort_order", descending=False) - .drop(["sort_order", ENSEMBL_GENE_ID_COLNAME]) - ).collect() - - return sorted_transposed_counts_df.transpose(column_names=sorted_stable_genes) - - -def export_data( - top_stable_genes_summary_lf: pl.LazyFrame, - formated_stat_lf: pl.LazyFrame, - all_counts_lf: pl.LazyFrame, - top_stable_genes_counts_df: pl.DataFrame, -): - """Export gene expression data to CSV files.""" - logger.info( - f"Exporting statistics of the top stable genes to: {TOP_STABLE_GENE_SUMMARY_OUTFILENAME}" - ) - top_stable_genes_summary_lf.collect().write_csv(TOP_STABLE_GENE_SUMMARY_OUTFILENAME) - - logger.info( - f"Exporting statistics for all genes to: {ALL_GENES_RESULT_OUTFILENAME}" - ) - formated_stat_lf.collect().write_csv(ALL_GENES_RESULT_OUTFILENAME) - - logger.info(f"Exporting all counts to: {ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME}") - all_counts_lf.collect().write_parquet(ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME) - - logger.info( - f"Exporting counts of the top stable genes to: {TOP_STABLE_GENES_COUNTS_OUTFILENAME}" - ) - top_stable_genes_counts_df.write_csv(TOP_STABLE_GENES_COUNTS_OUTFILENAME) - - logger.info("Done") - - -##################################################### -##################################################### -# CLASSES -##################################################### -##################################################### - - -@dataclass -class StabilityScorer: - count_lf: pl.LazyFrame - - gene_count_per_sample_df: pl.DataFrame = field(init=False) - stat_lf: pl.LazyFrame = field(init=False) - count_columns: list[str] = field(init=False) - samples_with_low_gene_count: list[str] = field(init=False) - - def __post_init__(self): - self.count_columns = get_count_columns(self.count_lf) - self.gene_count_per_sample_df = self.get_gene_counts_per_sample() - self.samples_with_low_gene_count = self.get_samples_with_low_gene_count() - - def get_valid_counts(self) -> pl.LazyFrame: - return self.count_lf.select(pl.exclude(ENSEMBL_GENE_ID_COLNAME)) - - def get_gene_counts_per_sample(self) -> pl.DataFrame: - """ - Get the number of non-null values per sample. 
- :return: - A polars dataframe containing 2 columns: - - sample: name of the sample - - nb_not_nulls: number of non-null values - """ - return ( - self.count_lf.select(pl.exclude(ENSEMBL_GENE_ID_COLNAME)) - .count() - .collect() - .transpose( - include_header=True, header_name="sample", column_names=["count"] - ) - ) - - def get_samples_with_low_gene_count(self) -> list[str]: - mean_gene_count = self.gene_count_per_sample_df[GENE_COUNT_COLNAME].mean() - return ( - self.gene_count_per_sample_df.filter( - (pl.col(GENE_COUNT_COLNAME) / mean_gene_count) - < MIN_RATIO_GENE_COUNT_TO_MEAN - ) - .select(SAMPLE_COLNAME) - .to_series() - .to_list() - ) - - def get_main_statistics(self) -> pl.LazyFrame: - """ - Compute count descriptive statistics for each gene in the count dataframe. - """ - logger.info("Getting descriptive statistics") - # computing main stats - augmented_count_lf = self.count_lf.with_columns( - mean=pl.concat_list(self.count_columns).list.drop_nulls().list.mean(), - std=pl.concat_list(self.count_columns).list.drop_nulls().list.std(), - ) - return augmented_count_lf.select( - pl.col(ENSEMBL_GENE_ID_COLNAME), - pl.col("mean").alias(MEAN_COLNAME), - pl.col("std").alias(STANDARD_DEVIATION_COLNAME), - (pl.col("std") / pl.col("mean")).alias(VARIATION_COEFFICIENT_COLNAME), - ) - - def compute_nb_null_values(self): - # the samples showing a low gene count will not be taken into account for the zero count penalty - cols_to_exclude = [ENSEMBL_GENE_ID_COLNAME] + self.samples_with_low_gene_count - total_nb_nulls = ( - self.count_lf.select(pl.exclude(ENSEMBL_GENE_ID_COLNAME).is_null()) - .collect() - .sum_horizontal() - ) - nb_nulls_valid_samples = ( - self.count_lf.select(pl.exclude(cols_to_exclude).is_null()) - .collect() - .sum_horizontal() - ) - self.stat_lf = self.stat_lf.with_columns( - total_nb_nulls.alias(NB_NULLS_COLNAME), - nb_nulls_valid_samples.alias(NB_NULLS_VALID_SAMPLES_COLNAME), - ) - - def get_quantile_intervals(self): - """ - Compute the quantile intervals for the mean expression levels of each gene in the dataframe. - - The function assigns to each gene a quantile interval of its mean cpm compared to all genes. 
- """ - logger.info("Getting cpm quantiles") - self.stat_lf = self.stat_lf.with_columns( - (pl.col(MEAN_COLNAME).rank() / pl.col(MEAN_COLNAME).count() * NB_QUANTILES) - .floor() - .cast(pl.Int8) - # we want the only value = NB_QUANTILES to be NB_QUANTILES - 1 - # because the last quantile interval is [NB_QUANTILES - 1, NB_QUANTILES] - .replace({NB_QUANTILES: NB_QUANTILES - 1}) - .alias(EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME) - ) - - def compute_stability_score(self): - logger.info("Computing stability score") - nb_valid_samples = self.gene_count_per_sample_df.select(pl.len()).item() - len( - self.samples_with_low_gene_count - ) - ratio_nb_nulls = ( - self.stat_lf.select( - pl.col(NB_NULLS_VALID_SAMPLES_COLNAME) / nb_valid_samples - ) - .collect() - .to_series() - ) - expr = ( - pl.col(STANDARD_DEVIATION_COLNAME) + ratio_nb_nulls * WEIGHT_RATIO_NB_NULLS - ) - self.stat_lf = self.stat_lf.with_columns(expr.alias(STABILITY_SCORE_COLNAME)) - - def compute_statistics_and_score(self) -> pl.LazyFrame: - logger.info("Computing statistics and stability score") - # getting expression statistics - self.stat_lf = self.get_main_statistics() - # adding column for nb of null values for each gene - self.compute_nb_null_values() - # computing stability score - self.compute_stability_score() - # getting quantile intervals - self.get_quantile_intervals() - - return self.stat_lf - - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - - -def main(): - args = parse_args() - metadata_files = [Path(file) for file in args.metadata_files.split(" ")] - mapping_files = [Path(file) for file in args.mapping_files.split(" ")] - - # putting all counts into a single dataframe - count_lf = get_counts(args.count_file, args.ks_stats_file, args.ks_pvalue_threshold) - - # getting metadata and mappings - metadata_lf = get_metadata(metadata_files) - mapping_lf = get_mappings(mapping_files) - - # computing statistics (mean, standard deviation, coefficient of variation, quantiles) - stability_scorer = StabilityScorer(count_lf) - stat_lf = stability_scorer.compute_statistics_and_score() - - # add gene name, description and original gene IDs - stat_lf = merge_data(stat_lf, metadata_lf, mapping_lf) - - # sort genes according to the metrics present in the dataframe - stat_lf = sort_dataframe(stat_lf) - - # getting the most stable genes - # we don't want to exceed 1000 (for multiqc) - nb_top_stable_genes = min(args.nb_top_stable_genes, DEFAULT_NB_TOP_STABLE_GENES) - top_stable_genes_summary_lf = get_top_stable_gene_summary( - stat_lf, nb_top_stable_genes - ) - - formated_stat_lf = format_all_genes_statistics(stat_lf) - - # reducing dataframe size (it is only used for plotting by MultiQC) - count_lf = cast_count_columns_to_float32(count_lf) - - top_stable_genes_counts_df = get_top_stable_genes_counts( - count_lf, top_stable_genes_summary_lf - ) - - # exporting computed data - export_data( - top_stable_genes_summary_lf, - formated_stat_lf, - count_lf, - top_stable_genes_counts_df, - ) - - -if __name__ == "__main__": - main() diff --git a/bin/get_geo_dataset_accessions.py b/bin/get_geo_dataset_accessions.py new file mode 100755 index 00000000..21383235 --- /dev/null +++ b/bin/get_geo_dataset_accessions.py @@ -0,0 +1,964 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
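+
+# Rough flow of this script, as implemented below: query NCBI Entrez
+# (esearch / esummary) for GEO series (GSE) matching a species; download and
+# parse each series' MINiML metadata archive; validate species, experiment
+# type, library source, molecule types and optional keywords; cross-check the
+# declared platforms (GPL) against the species; optionally subsample the
+# remaining accessions; and export the accessions plus metadata tables.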
+ +import argparse +import logging +import random +import tarfile +from functools import partial +from multiprocessing import Pool +from pathlib import Path +from urllib.request import urlretrieve + +import pandas as pd +import requests +import xmltodict +from Bio import Entrez +from natural_language_utils import keywords_in_fields +from requests.exceptions import ConnectionError, HTTPError +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# set a custom writable directory before any Entrez operations +# mandatory for running the script in an apptainer container +# Entrez.Parser.Parser.directory("/tmp/biopython") + +ALLOWED_PLATFORMS = ["rnaseq", "microarray"] + +ACCESSION_OUTFILE_NAME = "accessions.txt" +SPECIES_DATASETS_OUTFILE_NAME = "geo_all_datasets.metadata.tsv" +REJECTED_DATASETS_OUTFILE_NAME = "geo_rejected_datasets.metadata.tsv" +# WRONG_SPECS_DATASETS_METADATA_OUTFILE_NAME = "geo_wrong_platform_moltype_datasets.metadata.tsv" +# WRONG_KEYWORDS_DATASETS_METADATA_OUTFILE_NAME = "geo_wrong_keywords_datasets.metadata.tsv" +# PLATFORM_NOT_AVAILABLE_DATASETS_METADATA_OUTFILE_NAME = "platform_not_available_datasets.metadata.tsv" +# GENE_ID_MAPPING_ISSUES_DATASETS_METADATA_OUTFILE_NAME = "gene_id_mapping_issues_datasets.metadata.tsv" +SELECTED_DATASETS_OUTFILE_NAME = "geo_selected_datasets.metadata.tsv" + +ENTREZ_QUERY_MAX_RESULTS = 9999 +ENTREZ_EMAIL = "stableexpression@nfcore.com" +PLATFORM_METADATA_CHUNKSIZE = 2000 + +NCBI_API_BASE_URL = ( + "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc={accession}" +) +STOP_RETRY_AFTER_DELAY = 600 + +NB_PROBE_IDS_TO_PARSE = 1000 +NB_PROBE_IDS_TO_SAMPLE = 10 + +SUPERSERIES_SUMMARY = "This SuperSeries is composed of the SubSeries listed below." 
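+
+# Note on pagination: ENTREZ_QUERY_MAX_RESULTS caps how many IDs a single
+# esearch call returns, so larger result sets must be fetched with the
+# `retstart` offset. A minimal sketch of that pattern (hypothetical helper,
+# not used below; `retstart` is a standard esearch parameter):
+#
+#     def paginated_esearch_ids(query: str) -> list[str]:
+#         ids, retstart = [], 0
+#         while True:
+#             with Entrez.esearch(
+#                 db="gds", term=query, retstart=retstart,
+#                 retmax=ENTREZ_QUERY_MAX_RESULTS,
+#             ) as handle:
+#                 record = Entrez.read(handle)
+#             ids += record.get("IdList", [])
+#             retstart += ENTREZ_QUERY_MAX_RESULTS
+#             if retstart >= int(record["Count"]):
+#                 return ids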
+ +ALLOWED_LIBRARY_SOURCES = ["transcriptomic", "RNA"] +ALLOWED_MOLECULE_TYPES = ["RNA", "SRA"] + +GEO_EXPERIMENT_TYPE_TO_PLATFORM = { + "Expression profiling by array": "microarray", + "Expression profiling by high throughput sequencing": "rnaseq", +} + +MINIML_TMPDIR = "geo_miniml" +PLATFORM_SOFT_TMPDIR = "geo_platform_soft" +Path(MINIML_TMPDIR).mkdir(exist_ok=True) +Path(PLATFORM_SOFT_TMPDIR).mkdir(exist_ok=True) + + +################################################################## +################################################################## +# EXCEPTIONS +################################################################## +################################################################## + + +class GeoDatasetNothingFoundError(Exception): + pass + + +class GeoPlatformDataTableNotFound(Exception): + pass + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get GEO Datasets accessions") + parser.add_argument( + "--species", + type=str, + required=True, + help="Search GEO Datasets for this specific species", + ) + parser.add_argument( + "--keywords", + type=str, + nargs="*", + help="Keywords to search for in datasets description", + ) + parser.add_argument( + "--platform", type=str, help="Platform type", choices=ALLOWED_PLATFORMS + ) + parser.add_argument( + "--exclude-accessions-in", + dest="excluded_accessions_file", + type=Path, + help="Exclude accessions contained in this file", + ) + parser.add_argument( + "--random-sampling-size", + dest="random_sampling_size", + type=int, + help="Random sampling size", + ) + parser.add_argument( + "--random-sampling-seed", + dest="random_sampling_seed", + type=int, + help="Random sampling seed", + ) + parser.add_argument( + "--cpus", + dest="nb_cpus", + type=int, + required=True, + help="Number of CPUs to use", + ) + parser.add_argument( + "--accessions", + type=str, + help="[For dev purposes / testing: provide directly accessions (separated by commas) and try to get their metadata]", + ) + return parser.parse_args() + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# QUERIES TO ENTREZ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: {}), +) +def send_request_to_entrez_esearch(query: str) -> dict: + Entrez.email = ENTREZ_EMAIL + with Entrez.esearch( + db="gds", term=query, retmax=ENTREZ_QUERY_MAX_RESULTS + ) as handle: + return Entrez.read(handle) + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: []), +) +def send_request_to_entrez_esummary(ids: list[str]) -> list[dict]: + Entrez.email = ENTREZ_EMAIL + ids_str = ",".join(ids) + with Entrez.esummary( + db="gds", id=ids_str, retmax=ENTREZ_QUERY_MAX_RESULTS + ) as handle: + return Entrez.read(handle) + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + 
retry_error_callback=(lambda _: None), +) +def send_request_to_ncbi_api(accession: str) -> requests.Response | None: + url = NCBI_API_BASE_URL.format(accession=accession) + server_error = False + response = None + + try: + response = requests.get(url, stream=True) + except requests.exceptions.ConnectionError: + server_error = True + else: + try: + response.raise_for_status() + except (HTTPError, ConnectionError) as err: + if str(response.status_code).startswith("5"): # error 500 -> 509 + server_error = True + raise err + else: + logger.error( + f"Error {response.status_code} while sending request to NCBI: {err}" + ) + raise err + + # if we get connection issues or 500 -> 509 server errors + # we stop immediately for this accession (return None) + if server_error: + logger.critical( + f"Server error while sending request to NCBI for accession {accession}" + ) + + return response + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: None), +) +def download_file_at_url(url: str, output_file: Path): + urlretrieve(url, output_file) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GEO DATASETS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def chunk_list(lst: list, chunksize: int) -> list: + """Splits a list into chunks of a given size. + + Args: + lst (list): The list to split. + chunksize (int): The size of each chunk. + + Returns: + list: A list of chunks, where each chunk is a list of len(chunksize). + """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +def fetch_geo_datasets_for_species(species: str) -> list[dict]: + """ + Fetch GEO datasets (GSE series) for a given species + + Args: + species (str): Scientific name of the species (e.g. "Homo sapiens"). 
+    """
+    dataset_types = [
+        f'"{experiment_type}"[DataSet Type]'
+        for experiment_type in GEO_EXPERIMENT_TYPE_TO_PLATFORM
+    ]
+    formatted_dataset_type = "(" + " OR ".join(dataset_types) + ")"
+
+    query = f'"{species}"[Organism] AND "gse"[Entry Type] AND {formatted_dataset_type}'
+    logger.info(f"Fetching GEO datasets with query: {query}")
+
+    # getting the list of all dataset IDs for this species
+    # we may need to perform multiple queries because the number of results returned per query is capped
+    nb_entries = None
+    retstart = 0
+    while not nb_entries or retstart < nb_entries:
+        record = send_request_to_entrez_esearch(query)
+
+        if not record:
+            logger.warning(f"Failed to query Entrez esearch with query: {query}")
+            return []
+
+        # getting the total number of entries
+        if not nb_entries:
+            nb_entries = int(record["Count"])
+
+        # if there is no entry for this species
+        if nb_entries == 0:
+            logger.info(f"No entries found for query: {query}")
+            return []
+
+        # setting the next cursor to the next group
+        retstart += ENTREZ_QUERY_MAX_RESULTS
+
+    ids = record.get("IdList", [])
+    if not ids:
+        logger.warning("No GEO datasets found for your query.")
+        return []
+
+    # fetching summary info
+    results = send_request_to_entrez_esummary(ids)
+
+    # keeping only series datasets (just a double check here)
+    # and removing superseries (they are just containers of series that are also listed here)
+    return [
+        r
+        for r in results
+        if "GSE" in r["Accession"] and r["summary"] != SUPERSERIES_SUMMARY
+    ]
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# FORMATTING
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+def format_species(species: str) -> str:
+    return "_".join(species.lower().split(" "))
+
+
+def format_platform_name(platform_name: str) -> str:
+    return platform_name.replace("_", "").replace("-", "").lower()
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# GET METADATA
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+def download_dataset_metadata(ftp_link: str, accession: str) -> Path | None:
+    filename = f"miniml/{accession}_family.xml.tgz"
+    ftp_url = ftp_link + filename
+    output_file = Path(MINIML_TMPDIR) / f"{accession}.tar.gz"
+    download_file_at_url(ftp_url, output_file)
+    if output_file.exists():
+        return output_file
+    else:
+        logger.error(f"Failed to download dataset metadata for accession: {accession}")
+        return None
+
+
+def parse_dataset_metadata(file: Path, accession: str) -> dict | None:
+    with tarfile.open(file, "r:gz") as tar:
+        file_to_read = f"{accession}_family.xml"
+
+        try:
+            f = tar.extractfile(file_to_read)
+        except KeyError:
+            file_to_read = f"{accession}_family.xml/{accession}_family.xml"
+            try:
+                f = tar.extractfile(file_to_read)
+            except KeyError:
+                return None
+
+        if f is None:
+            logger.warning(f"Failed to get file: {file_to_read}")
+            return None
+
+        try:
+            xml_content = f.read().decode("utf-8")
+        except UnicodeDecodeError:
+            logger.warning(f"Failed to decode file: {file_to_read}")
+            return None
+
+    return xmltodict.parse(xml_content)["MINiML"]
+
+
+def parse_characteristics(
+    characteristics: str | dict | list, stored_characteristics: list
+):
+    if isinstance(characteristics, str):
+        stored_characteristics.append(characteristics)
+    elif isinstance(characteristics, dict):
+        if "#text" in characteristics:
+            stored_characteristics.append(characteristics["#text"])
+    elif isinstance(characteristics, list):
+        for c in characteristics:
+            parse_characteristics(c,
stored_characteristics) + + +def parse_interesting_metadata( + dataset_metadata: dict, additional_metadata: dict +) -> dict: + """ + Parses interesting metadata from a dataset metadata dictionary and additional metadata dictionary. + + Args: + dataset_metadata (dict): The dataset metadata dictionary. + additional_metadata (dict): The additional metadata dictionary. + + Returns: + dict: The parsed interesting metadata dictionary. + """ + sample_characteristics = [] + sample_library_strategies = [] + sample_library_sources = [] + sample_descriptions = [] + sample_titles = [] + sample_molecule_types = [] + + platform_accessions = [ + "GPL" + gpl_id for gpl_id in dataset_metadata["GPL"].split(";") + ] + + experiment_types = dataset_metadata["gdsType"] + experiment_types = ( + experiment_types if isinstance(experiment_types, list) else [experiment_types] + ) + + # if additional metadata have sample information + if "Sample" in additional_metadata: + # change to list if it's a single dictionary + if isinstance(additional_metadata["Sample"], dict): + additional_metadata["Sample"] = [additional_metadata["Sample"]] + + for sample in additional_metadata["Sample"]: + # storing description if exists + if sample_description := sample.get("Description"): + sample_descriptions.append(sample_description) + + # storing title if exists + if sample_title := sample.get("Title"): + sample_titles.append(sample_title) + + # storing molecule type if exists + if sample_molecule_type := sample.get("Type"): + sample_molecule_types.append(sample_molecule_type) + + # storing library strategy if exists + if sample_library_strategy := sample.get("Library-Strategy"): + sample_library_strategies.append(sample_library_strategy) + + # storing library source if exists + if sample_library_source := sample.get("Library-Source"): + sample_library_sources.append(sample_library_source) + + # parsing sample metadata + if channels := sample.get("Channel"): + if isinstance(channels, dict): + channels = [channels] + for channel in channels: + parse_characteristics( + channel["Characteristics"], sample_characteristics + ) + + return { + "accession": dataset_metadata["Accession"], + "taxon": dataset_metadata["taxon"], + "platform_accessions": platform_accessions, + "summary": dataset_metadata["summary"], + "title": dataset_metadata["title"], + "overall_design": additional_metadata["Series"]["Overall-Design"], + "experiment_types": experiment_types, + "sample_characteristics": list(set(sample_characteristics)), + "sample_library_strategies": list(set(sample_library_strategies)), + "sample_library_sources": list(set(sample_library_sources)), + "sample_descriptions": list(set(sample_descriptions)), + "sample_titles": list(set(sample_titles)), + "sample_molecule_types": list(set(sample_molecule_types)), + } + + +def fetch_dataset_metadata(dataset_metadata: dict) -> dict | None: + """ + Parses metadata from a dataset metadata dictionary. + + Args: + dataset_metadata (dict): The dataset metadata dictionary. + + Returns: + dict | None: The parsed metadata dictionary or None if the metadata is missing. 
+    """
+    accession = dataset_metadata["Accession"]
+    ftp_link = dataset_metadata["FTPLink"].replace("ftp://", "https://")
+    downloaded_file = download_dataset_metadata(ftp_link, accession)
+    if downloaded_file is None:
+        logger.warning(f"Skipping {accession} as metadata download failed")
+        return None
+
+    additional_metadata = parse_dataset_metadata(downloaded_file, accession)
+
+    # if we could not get additional metadata, we lack too much information to conclude
+    if additional_metadata is None:
+        logger.warning(f"Skipping {accession} as additional metadata is missing")
+        return None
+
+    # parsing interesting information in all available metadata
+    return parse_interesting_metadata(dataset_metadata, additional_metadata)
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# METADATA TESTS
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+def exclude_unwanted_accessions(
+    datasets: list[dict], excluded_accessions: list[str]
+) -> tuple[list[dict], list[dict]]:
+    datasets_to_keep = []
+    excluded_datasets = []
+    for dataset in datasets:
+        if dataset["accession"] in excluded_accessions:
+            excluded_datasets.append(dataset)
+        else:
+            datasets_to_keep.append(dataset)
+    return datasets_to_keep, excluded_datasets
+
+
+def check_species_issues(parsed_species_list: list, species: str) -> str | None:
+    # trying to find our species in the list of parsed species
+    for parsed_species in parsed_species_list:
+        if format_species(parsed_species) == format_species(species):
+            return None
+    return f"PARSED SPECIES: {parsed_species_list}"
+
+
+def check_molecule_type_issues(molecules_types: list) -> str | None:
+    # we want only GEO series that contain RNA molecules
+    # other series should be superseries containing other series that are parsed here too,
+    # so keeping them would only lead to duplicates
+    if any(
+        [
+            molecule_type.upper() in ALLOWED_MOLECULE_TYPES
+            for molecule_type in molecules_types
+        ]
+    ):
+        return None
+    return f"MOLECULE TYPES: {molecules_types}"
+
+
+def check_experiment_type_issues(experiment_types: list, platform: str) -> str | None:
+    for experiment_type in experiment_types:
+        # if at least one experiment type is ok, we keep this dataset
+        if GEO_EXPERIMENT_TYPE_TO_PLATFORM.get(experiment_type) == platform:
+            return None
+    return f"EXPERIMENT TYPES: {experiment_types}"
+
+
+def check_source_issues(library_sources: list) -> str | None:
+    # if we have no data about library sources, we cannot conclude either way
+    if not library_sources:
+        return None
+    if any(
+        library_source in ALLOWED_LIBRARY_SOURCES for library_source in library_sources
+    ):
+        return None
+    return f"LIBRARY SOURCES: {library_sources}"
+
+
+def search_keywords(dataset: dict, keywords: list[str]) -> tuple[list, str | None]:
+    accession = dataset["accession"]
+    all_searchable_fields = (
+        [dataset["summary"], dataset["title"]]
+        + dataset["sample_characteristics"]
+        + dataset["sample_descriptions"]
+        + dataset["sample_titles"]
+    )
+    found_keywords = keywords_in_fields(all_searchable_fields, keywords)
+    # only keeping the dataset if keywords were found
+    if found_keywords:
+        dataset["found_keywords"] = list(set(found_keywords))
+        logger.info(f"Found keywords: {found_keywords} in accession {accession}")
+        return found_keywords, None
+    else:
+        return [], "NO KEYWORDS FOUND"
+
+
+def check_dataset(
+    dataset: dict, species: str, platform: str | None, keywords: list[str] | None
+) -> tuple[list, dict]:
+    accession = dataset["accession"]
+    parsed_species_list = dataset["taxon"].split("; ")
dataset["taxon"].split("; ") + experiment_types = dataset["experiment_types"] + library_sources = dataset["sample_library_sources"] + molecules_types = dataset["sample_molecule_types"] + + issues = [] + + # checking species + if issue := check_species_issues(parsed_species_list, species): + issues.append(issue) + + # checking platform + if platform is not None: + if issue := check_experiment_type_issues(experiment_types, platform): + issues.append(issue) + + # checking that library sources fit + if issue := check_source_issues(library_sources): + issues.append(issue) + + # checking that all molecule types are RNA + if issue := check_molecule_type_issues(molecules_types): + issues.append(issue) + + found_keywords = [] + if keywords: + found_keywords, keyword_issue = search_keywords(dataset, keywords) + if keyword_issue: + issues.append(keyword_issue) + + if issues: + rejection_dict = {accession: issues} + else: + rejection_dict = {} + + return found_keywords, rejection_dict + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GEO PLATFORMS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def fetch_geo_platform_metadata(datasets: list[dict]) -> dict: + """ + Fetch data for a GEO platform + + Args: + platform_accession (str): accession of the platform + """ + # unique list of platform accessions + platform_accessions = list( + set( + [ + platform_accession + for dataset in datasets + for platform_accession in dataset["platform_accessions"] + ] + ) + ) + # formating query + formatted_platform_accessions = [ + f'"{platform_accession}"[GEO Accession]' + for platform_accession in platform_accessions + ] + platform_accessions_str = " OR ".join(formatted_platform_accessions) + query = f'({platform_accessions_str}) AND "gpl"[Entry Type] ' + + record = send_request_to_entrez_esearch(query=query) + + ids = record.get("IdList", []) + if not ids: + logger.warning(f"No GEO platform found for accessions {platform_accessions}.") + return {} + + # fetching summary info + # one single request to NCBI for all platform accessions + platform_metadatas = send_request_to_entrez_esummary(ids) + # return dict associating dataset accessions with platform metadata + return { + platform_metadata["Accession"]: platform_metadata + for platform_metadata in platform_metadatas + } + + +def check_dataset_platforms( + dataset: dict, accession_to_platform_metadata: dict, species: str +) -> dict: + accession = dataset["accession"] + platform_accessions = dataset["platform_accessions"] + + if not platform_accessions: + return {accession: "NO PLATFORM ACCESSIONS"} + + platforms_metadata = [ + accession_to_platform_metadata[platform_accession] + for platform_accession in dataset["platform_accessions"] + ] + + # getting list of platform taxon + platforms_taxons = [] + for metadata in platforms_metadata: + if metadata.get("taxon") is not None: + platforms_taxons += metadata.get("taxon").split("; ") + platforms_taxons = list(set(platforms_taxons)) + + if not platforms_taxons: + return {accession: "NO PLATFORM TAXON"} + + # checking if at least one of the platform accession is the good one + # sample will be further filtered during download (download_geo_data.R) + if not any( + format_species(species) == format_species(taxon) for taxon in platforms_taxons + ): + return {accession: f"TAXON MISMATCH: {platforms_taxons}"} + + return {} + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# RANDOM SAMPLING +# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def sample_experiments_randomly( + experiments: list[dict], sampling_size: int, seed: int +) -> list[str]: + random.seed(seed) + sampled_experiments = [] + + total_nb_samples = 0 + experiments_left = list(experiments) + while experiments_left and total_nb_samples <= sampling_size: + # if the min number of samples is greater than the remaining space left, we get out of the loop + experiments_left_nb_samples = [exp["nb_samples"] for exp in experiments_left] + min_nb_samples = min(experiments_left_nb_samples) + if min_nb_samples > sampling_size - total_nb_samples: + break + + found_experiment = False + test_total_nb_samples = int(total_nb_samples) + not_chosen_yet = list(experiments_left) + while not_chosen_yet and not found_experiment: + experiment = random.choice(not_chosen_yet) + not_chosen_yet.remove(experiment) + test_total_nb_samples = total_nb_samples + experiment["nb_samples"] + if test_total_nb_samples <= sampling_size: + found_experiment = True + + # if the last one was not good, it means we reached the limit of samples we can take + if not found_experiment: + break + else: + total_nb_samples = test_total_nb_samples + experiments_left.remove(experiment) + sampled_experiments.append(experiment) + + return [exp["accession"] for exp in sampled_experiments] + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# EXPORT +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def sort_if_list(x): + if isinstance(x, list): + return sorted(x) + else: + return x + + +def export_dataset_metadatas( + datasets: list[dict], output_file: str, clean_columns: bool = True +): + if datasets: + df = pd.DataFrame.from_dict(datasets) + # all dataframe contain the column "accession" + # sorting by accessions to ensure that outputs are reproducible + df.sort_values(by="accession", inplace=True) + for col in df.columns: + df[col] = df[col].apply(sort_if_list) + # cleaning columns so that MultiQC can parse them + if clean_columns: + for col in df.columns: + df[col] = df[col].astype(str).str.replace("\n", "") + df[col] = df[col].astype(str).str.replace("\t", "") + df.to_csv( + output_file, + sep="\t", + index=False, + header=True, + ) + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + random_sampling_size = args.random_sampling_size + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING GEO DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Getting datasets corresponding to species {args.species}") + datasets = fetch_geo_datasets_for_species(args.species) + logger.info(f"Found {len(datasets)} datasets for species {args.species}") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FOR DEV PURPOSES / TESTING: RESTRICT TO SPECIFIC ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if args.accessions: + logger.info(f"Keeping only accessions {args.accessions}") + dev_accessions = args.accessions.split(",") + datasets = [d for d in datasets if d["Accession"] in dev_accessions] + logger.info(f"Kept {len(datasets)} datasets for dev / testing purposes") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING DATASET METADATA + # 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    logger.info(f"Parsing metadata for {len(datasets)} datasets")
+    augmented_datasets = []
+    with (
+        Pool(processes=args.nb_cpus) as p,
+        tqdm(total=len(datasets)) as pbar,
+    ):
+        for result in p.imap_unordered(fetch_dataset_metadata, datasets):
+            pbar.update()
+            pbar.refresh()
+            if result is None:
+                continue
+            augmented_datasets.append(result)
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # VALIDATING DATASETS
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    logger.info(f"Validating {len(augmented_datasets)} datasets")
+    checked_datasets = []
+    rejection_dict = {}
+    for dataset in tqdm(augmented_datasets):
+        found_keywords, issue_dict = check_dataset(
+            dataset, args.species, args.platform, args.keywords
+        )
+        if issue_dict:
+            rejection_dict |= issue_dict
+        else:
+            if found_keywords:
+                dataset["found_keywords"] = found_keywords
+            checked_datasets.append(dataset)
+
+    logger.info(f"Validated {len(checked_datasets)} datasets")
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # EXCLUDING UNWANTED ACCESSIONS
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    # we exclude unwanted accessions only now
+    # because we want to get the metadata of the excluded datasets
+    # in order to adjust the random sampling size
+    if args.excluded_accessions_file:
+        # parsing list of accessions which were already fetched from Expression Atlas
+        with open(args.excluded_accessions_file) as fin:
+            excluded_accessions = fin.read().splitlines()
+        logger.info("Excluding unwanted datasets")
+        checked_datasets, excluded_datasets = exclude_unwanted_accessions(
+            checked_datasets, excluded_accessions
+        )
+        logger.info(
+            f"{len(checked_datasets)} datasets remaining after excluding unwanted accessions"
+        )
+
+        # adjusting random sampling size by subtracting the number of excluded samples
+        if random_sampling_size:
+            total_nb_excluded_samples = sum(
+                [len(dataset["sample_titles"]) for dataset in excluded_datasets]
+            )
+            logger.info(
+                f"Subtracting {total_nb_excluded_samples} samples from random sampling size"
+            )
+            random_sampling_size -= total_nb_excluded_samples
+            # keeping it non-negative (just in case)
+            if random_sampling_size < 0:
+                logger.warning(
+                    f"Random sampling size is negative ({random_sampling_size}), setting it to 0"
+                )
+                random_sampling_size = 0
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # GETTING METADATA OF SEQUENCING PLATFORMS
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    logger.info("Getting platform metadata")
+    # making chunks to group requests to NCBI GEO
+    checked_datasets_chunks = chunk_list(checked_datasets, PLATFORM_METADATA_CHUNKSIZE)
+    # accumulating platform metadata across chunks
+    accession_to_platform_metadata = {}
+    for selected_datasets_chunk in tqdm(checked_datasets_chunks):
+        accession_to_platform_metadata |= fetch_geo_platform_metadata(
+            selected_datasets_chunk
+        )
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # VALIDATING EACH PLATFORM SEPARATELY, DATASET BY DATASET
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    logger.info(f"Checking each platform for {len(checked_datasets)} datasets")
+    func = partial(
+        check_dataset_platforms,
+        accession_to_platform_metadata=accession_to_platform_metadata,
+        species=args.species,
+    )
+    selected_datasets = []
+    # resetting the list of selected datasets
+    for dataset in tqdm(checked_datasets):
+        accession = dataset["accession"]
+        issue_dict = func(dataset)
+        if issue_dict:
+            if accession in rejection_dict:  # should not happen
but in case + rejection_dict[accession] += issue_dict[accession] + else: + rejection_dict |= issue_dict + else: + selected_datasets.append(dataset) + + if rejection_dict: + logger.warning(f"{len(rejection_dict)} datasets rejected") + logger.warning(f"Reasons for rejection: {rejection_dict}") + + selected_accessions = sorted( + [dataset["accession"] for dataset in selected_datasets] + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # RANDOM SAMPLING + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if random_sampling_size is not None and args.random_sampling_seed is not None: + selected_accession_to_nb_samples = [ + { + "accession": dataset["accession"], + "nb_samples": len(dataset["sample_titles"]), + } + for dataset in selected_datasets + ] + + nb_samples_df = pd.DataFrame.from_dict(selected_accession_to_nb_samples) + nb_samples_df.to_csv("selected_accession_to_nb_samples.csv", index=False) + + logger.info("Sampling experiments randomly") + selected_accessions = sample_experiments_randomly( + selected_accession_to_nb_samples, + random_sampling_size, + args.random_sampling_seed, + ) + logger.info( + f"Kept {len(selected_accessions)} experiments after random sampling" + ) + selected_datasets = [ + dataset + for dataset in selected_datasets + if dataset["accession"] in selected_accessions + ] + else: + logger.info( + f"No random sampling requested. Kept {len(selected_datasets)} datasets" + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # sorting accessions to ensure that outputs are reproducible + selected_accessions = sorted(selected_accessions) + with open(ACCESSION_OUTFILE_NAME, "w") as fout: + fout.write("\n".join(selected_accessions)) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + export_dataset_metadatas(augmented_datasets, SPECIES_DATASETS_OUTFILE_NAME) + export_dataset_metadatas(selected_datasets, SELECTED_DATASETS_OUTFILE_NAME) + + rejected_datasets = [ + {"accession": accession, "reason": reason} + for accession, reason in rejection_dict.items() + ] + export_dataset_metadatas( + rejected_datasets, REJECTED_DATASETS_OUTFILE_NAME, clean_columns=False + ) + + +if __name__ == "__main__": + main() diff --git a/bin/get_ratio_standard_variation.py b/bin/get_ratio_standard_variation.py new file mode 100755 index 00000000..043fd9a4 --- /dev/null +++ b/bin/get_ratio_standard_variation.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
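+
+# The statistic computed here is, in spirit, the building block of a
+# geNorm-style stability measure: for genes j and k with per-sample counts
+# a_j(s) and a_k(s),
+#
+#     A_jk(s) = log2(a_j(s) / a_k(s))    # the "_log_ratio" columns
+#     V_jk    = std(A_jk)                # computed below, per gene pair
+#
+# This script only gathers the V_jk values per gene; averaging them into a
+# per-gene M-measure is presumably done downstream.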
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# experimentally chosen +RATIO_CHUNK_SIZE = 100 + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file", + type=Path, + dest="ratio_file", + required=True, + help="File log of pairwise expression ratios", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_nb_rows(lf: pl.LazyFrame): + return lf.select(pl.len()).collect().item() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. + """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def compute_standard_deviations(file: Path, low_memory: bool) -> pl.LazyFrame: + ratios_lf = pl.scan_parquet(file, low_memory=low_memory) + ratio_columns = [ + col for col in ratios_lf.collect_schema().names() if col.endswith("_log_ratio") + ] + concat_ratios_lf = ratios_lf.select( + [ + pl.concat_list( + [pl.col(col) for col in ratio_columns[i : i + RATIO_CHUNK_SIZE]] + ).alias(f"concat_list_chunk_{i // RATIO_CHUNK_SIZE}") + for i in range(0, len(ratio_columns), RATIO_CHUNK_SIZE) + ] + ).select(pl.concat_list(pl.all()).alias("ratios")) + return pl.concat( + [ + concat_ratios_lf.select("ratios"), + ratios_lf.select(pl.exclude("^.*_log_ratio$")), # gene_id & gene_id_other + ], + how="horizontal", + ).select( + pl.col("ratios").list.std(ddof=0).alias(config.RATIOS_STD_COLNAME), + pl.col(config.GENE_ID_COLNAME), + pl.col(f"{config.GENE_ID_COLNAME}_other"), + ) + + +def get_column_standard_deviations(std_lf: pl.LazyFrame, column: str) -> pl.LazyFrame: + # column is either config.GENE_ID_COLNAME or f"{config.GENE_ID_COLNAME}_other" + return ( + std_lf.group_by(column) + .agg(config.RATIOS_STD_COLNAME) # getting list of ratio std for this gene + .select( + pl.col(column).alias(config.GENE_ID_COLNAME), + pl.col(config.RATIOS_STD_COLNAME), + ) + ) + + +def group_standard_deviations(std_lf: pl.LazyFrame) -> pl.LazyFrame: + # getting the standard devs for genes in the gene_id column + std_a = get_column_standard_deviations(std_lf, column=config.GENE_ID_COLNAME) + # getting the standard devs for genes in the gene_id_other column + std_b = get_column_standard_deviations( + std_lf, column=f"{config.GENE_ID_COLNAME}_other" + ) + # concatenating both dataframes vertically + # if both lists of gene ids are the identical, + # we need to collect values only for one column to avoid duplicates + return ( + pl.concat([std_a, std_b], how="vertical") + .unique(subset=config.GENE_ID_COLNAME) + .sort( + config.GENE_ID_COLNAME + ) # only needed to have consistent output (for snapshots) + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + file = 
args.ratio_file + + low_memory = True if args.task_attempts > 1 else False + std_lf = compute_standard_deviations(file, low_memory) + std_lf = group_standard_deviations(std_lf) + + # when the ratio file corresponds to the same gene ids cross joined with themselves (i == i) + # then we want only only one row per gene id + + std_df = std_lf.collect() + if len(std_df) == 0: + raise ValueError(f"No output following treatment of file {str(file)}") + + outfile = args.ratio_file.name.replace("ratios", "std") + std_df.write_parquet(outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/gprofiler_map_ids.py b/bin/gprofiler_map_ids.py new file mode 100755 index 00000000..4a559bde --- /dev/null +++ b/bin/gprofiler_map_ids.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +import sys +from pathlib import Path + +import config +import pandas as pd +from gprofiler_utils import convert_ids + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +MAPPED_GENE_IDS_OUTFILE = "mapped_gene_ids.csv" +METADATA_OUTFILE = "gene_metadata.csv" + +TARGET_DATABASE_CHOICES = ["ENTREZGENE", "ENSG"] + +FAILURE_REASON_FILE = "failure_reason.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Map IDs using g:Profiler") + parser.add_argument( + "--gene-ids", + type=Path, + dest="gene_id_file", + required=True, + help="Input file containing gene IDs", + ) + parser.add_argument( + "--species", type=str, required=True, help="Species to convert IDs for" + ) + parser.add_argument( + "--target-db", + type=str, + dest="gprofiler_target_db", + required=True, + choices=TARGET_DATABASE_CHOICES, + help="Target database to convert IDs to", + ) + return parser.parse_args() + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + with open(args.gene_id_file, "r") as fin: + gene_ids = list(set([line.strip() for line in fin])) + + logger.info(f"Converting {len(gene_ids)} IDs for species {args.species} ") + + ############################################################# + # QUERYING g:PROFILER SERVER + ############################################################# + + gene_metadata_dfs = [] + + mapping_dict, gene_metadata_dfs = convert_ids( + gene_ids, args.species, args.gprofiler_target_db + ) + + if not mapping_dict: + msg = ( + f"No mapping found for gene IDs such as {' '.join(gene_ids[:5])} on species {args.species} " + + f"and g:Profiler target database {args.gprofiler_target_db}" + ) + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as fout: + fout.write(msg) + sys.exit(100) + + ############################################################# + # WRITING MAPPING + ############################################################# + + # making dataframe for mapping (only two columns: original and new) + mapping_df = ( + pd.DataFrame(mapping_dict, index=[0]) + .T.reset_index() # transpose: setting keys as indexes instead of columns + .rename( + columns={ + "index": config.ORIGINAL_GENE_ID_COLNAME, + 0: config.GENE_ID_COLNAME, + } + ) + 
.sort_values(by=config.ORIGINAL_GENE_ID_COLNAME) + ) + mapping_df.to_csv(MAPPED_GENE_IDS_OUTFILE, index=False, header=True) + + ############################################################# + # WRITING METADATA + ############################################################# + + gene_metadata_df = pd.concat(gene_metadata_dfs, ignore_index=True) + # dropping duplicates and keeping the first occurence + gene_metadata_df.drop_duplicates( + subset=[config.GENE_ID_COLNAME], keep="first" + ).sort_values(by=config.GENE_ID_COLNAME).to_csv( + METADATA_OUTFILE, index=False, header=True + ) + + +if __name__ == "__main__": + main() diff --git a/bin/gprofiler_utils.py b/bin/gprofiler_utils.py new file mode 100755 index 00000000..18b9d41a --- /dev/null +++ b/bin/gprofiler_utils.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import logging + +import config +import pandas as pd +import requests +from requests.exceptions import ConnectionError, HTTPError +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +GPROFILER_CONVERT_API_ENDPOINT = "https://biit.cs.ut.ee/gprofiler/api/convert/convert/" +GPROFILER_CONVERT_BETA_API_ENDPOINT = ( + "https://biit.cs.ut.ee/gprofiler_beta/api/convert/convert/" +) + +CHUNKSIZE = 2000 # number of IDs to convert at a time - may create trouble if > 2000 + +COLS_TO_KEEP = ["incoming", "converted", "name", "description"] +DESCRIPTION_PART_TO_REMOVE_REGEX = r"\s*\[Source:.*?\]" + +GPROFILER_ERROR_MESSAGE = ( + "g:Profiler servers (main and beta) seem to be down... Please retry later... " + "If you have gene ID mappings and / or gene metadata for these datasets, you can provide them " + "directly using the `--gene_id_mapping` and `--gene_metadata` parameters respectively, " + "and by skipping the g:Profiler ID mapping step with `--skip_id_mapping`." +) + + +################################################################## +# FUNCTIONS +################################################################## + + +class GProfilerConnectionError(Exception): + pass + + +def format_species_name(species: str): + """ + Format a species name into a format accepted by g:Profiler. + Example: Arabidopsis thaliana -> athaliana + + Parameters + ---------- + species : str + The species name. + + Returns + ------- + str + The formatted species name. + """ + splitted_species = species.lower().replace("_", " ").split(" ") + return splitted_species[0][0] + splitted_species[1] + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def request_conversion( + gene_ids: list, + species: str, + target_database: str, + url: str = GPROFILER_CONVERT_API_ENDPOINT, + attempts: int = 0, +) -> list[str]: + """ + Send a request to the g:Profiler API to convert a list of gene IDs. + + Parameters + ---------- + gene_ids : list + The list of gene IDs to convert. + species : str + The species to convert the IDs for. 
+    target_database : str
+        The target namespace to convert the IDs to.
+    url : str, optional
+        The URL to send the request to, by default GPROFILER_CONVERT_API_ENDPOINT
+    attempts : int, optional
+        The number of attempts already performed, by default 0
+
+    Returns
+    -------
+    list
+        The list of dicts corresponding to the converted IDs.
+    """
+
+    # formatting species for g:Profiler
+    organism = format_species_name(species)
+
+    if attempts > 0:
+        logger.warning(
+            "g:Profiler main server appears down, trying with the beta server..."
+        )
+
+    server_appears_down = False
+
+    try:
+        response = requests.post(
+            url=url,
+            json={"organism": organism, "query": gene_ids, "target": target_database},
+        )
+    except requests.exceptions.ConnectionError:
+        server_appears_down = True
+    else:
+        try:
+            response.raise_for_status()
+        except (HTTPError, ConnectionError) as err:
+            if str(response.status_code).startswith("5"):  # 5xx server errors
+                server_appears_down = True
+            else:
+                logger.error(
+                    f"Error {response.status_code} while converting IDs: {err}"
+                )
+                raise err
+
+    if server_appears_down:
+        if attempts == 0:
+            logger.warning(
+                "g:Profiler main server appears down, trying with the beta server..."
+            )
+            return request_conversion(
+                gene_ids,
+                species,
+                target_database=target_database,
+                url=GPROFILER_CONVERT_BETA_API_ENDPOINT,  # backup endpoint
+                attempts=1,
+            )
+        else:
+            # both servers appear down, we stop here...
+            logger.error(GPROFILER_ERROR_MESSAGE)
+            raise GProfilerConnectionError(GPROFILER_ERROR_MESSAGE)
+
+    else:
+        return response.json()["result"]
+
+
+def convert_chunk_of_ids(
+    gene_ids: list, species: str, gprofiler_target_db: str
+) -> tuple[dict, pd.DataFrame]:
+    """
+    Wrapper function that converts a list of gene IDs to another namespace.
+
+    Parameters
+    ----------
+    gene_ids : list
+        The IDs to convert.
+    species : str
+        The species to convert the IDs for.
+    gprofiler_target_db : str
+        The target database to convert to.
+
+    Returns
+    -------
+    tuple[dict, pd.DataFrame]
+        A dictionary mapping original IDs to converted IDs, and a DataFrame
+        associating converted IDs with gene names and descriptions.
+    """
+
+    results = request_conversion(gene_ids, species, gprofiler_target_db)
+    df = pd.DataFrame.from_records(results)
+
+    if df.empty:
+        return {}, pd.DataFrame()
+
+    # keeping only rows where 'converted' is not null and only the columns of interest
+    df = df.loc[df["converted"] != "None", COLS_TO_KEEP]
+
+    # dict associating incoming IDs to converted IDs
+    mapping_dict = df.set_index("incoming").to_dict()["converted"]
+
+    # DataFrame associating converted IDs to name and description
+    meta_df = df.drop(columns=["incoming"]).rename(
+        columns={"converted": config.GENE_ID_COLNAME}
+    )
+
+    meta_df["name"] = meta_df["name"].str.replace(",", ";")
+
+    # Extract the part before '[Source:...]', or the whole string if not found
+    meta_df["description"] = (
+        meta_df["description"]
+        .str.replace(DESCRIPTION_PART_TO_REMOVE_REGEX, "", regex=True)
+        .str.replace(",", ";")
+    )
+
+    return mapping_dict, meta_df
+
+
+def chunk_list(lst: list, chunksize: int) -> list:
+    """Splits a list into chunks of a given size.
+
+    Args:
+        lst (list): The list to split.
+        chunksize (int): The size of each chunk.
+
+    Returns:
+        list: A list of chunks, where each chunk is a list of len(chunksize).
+ """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +def convert_ids( + ids: list[str], species: str, gprofiler_target_db: str +) -> tuple[dict, pd.DataFrame]: + mapping_dict = {} + gene_metadata_dfs = [] + + chunks = chunk_list(ids, chunksize=CHUNKSIZE) + for chunk_gene_ids in chunks: + # converting to Gene IDs for all IDs comprised in this chunk + gene_mapping, meta_df = convert_chunk_of_ids( + chunk_gene_ids, species, gprofiler_target_db + ) + mapping_dict.update(gene_mapping) + gene_metadata_dfs.append(meta_df) + + return mapping_dict, gene_metadata_dfs diff --git a/bin/make_cross_join.py b/bin/make_cross_join.py new file mode 100755 index 00000000..f28a597c --- /dev/null +++ b/bin/make_cross_join.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file1", + type=Path, + dest="count_file_1", + required=True, + help="Chunk count file 1", + ) + parser.add_argument( + "--file2", + type=Path, + dest="count_file_2", + required=True, + help="Chunk count file 2", + ) + parser.add_argument( + "--index1", + type=Path, + dest="count_file_1_index", + required=True, + help="Index of chunk count file 1", + ) + parser.add_argument( + "--index2", + type=Path, + dest="count_file_2_index", + required=True, + help="Index of chunk count file 2", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + lf = pl.scan_parquet(args.count_file_1, low_memory=low_memory) + lf_other = pl.scan_parquet(args.count_file_2, low_memory=low_memory) + + logger.info("Computing cross join data") + lf = lf.join( + lf_other, how="cross", suffix="_other" + ) # Perform a cross join with itself + + df = lf.collect() + if len(df) == 0: + raise ValueError( + f"No output following treatment of files {str(args.count_file_1)} and {str(args.count_file_2)}" + ) + + outfile = f"cross_join.{args.count_file_1_index}.{args.count_file_2_index}.parquet" + df.write_parquet(outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/make_pairwise_gene_expression_ratio.py b/bin/make_pairwise_gene_expression_ratio.py new file mode 100755 index 00000000..7ecaf194 --- /dev/null +++ b/bin/make_pairwise_gene_expression_ratio.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+
+import argparse
+import logging
+from pathlib import Path
+
+import config
+import polars as pl
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+#####################################################
+#####################################################
+# FUNCTIONS
+#####################################################
+#####################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Compute pairwise log2 expression ratios between genes"
+    )
+    parser.add_argument(
+        "--file",
+        type=Path,
+        dest="cross_joined_file",
+        required=True,
+        help="File where each row contains counts for two genes",
+    )
+    parser.add_argument(
+        "--task-attempts",
+        dest="task_attempts",
+        type=int,
+        default=1,
+        help="Number of task attempts",
+    )
+    return parser.parse_args()
+
+
+def get_count_columns(lf: pl.LazyFrame) -> list[str]:
+    """Get all column names except the config.GENE_ID_COLNAME column.
+
+    The config.GENE_ID_COLNAME column contains only gene IDs.
+    """
+    return [
+        col
+        for col in lf.collect_schema().names()
+        if not col.startswith(config.GENE_ID_COLNAME)
+    ]
+
+
+def compute_ratios(file: Path, low_memory: bool) -> pl.LazyFrame:
+    # getting ratios for each sample
+    cross_join_lf = pl.scan_parquet(file, low_memory=low_memory)
+    column_pairs = {
+        col: f"{col}_other"
+        for col in get_count_columns(cross_join_lf)
+        if not col.endswith("_other")
+    }
+    return cross_join_lf.select(
+        [pl.col(config.GENE_ID_COLNAME), pl.col(f"{config.GENE_ID_COLNAME}_other")]
+        + [
+            (pl.col(col) / pl.col(other_col)).log(base=2).alias(f"{col}_log_ratio")
+            for col, other_col in column_pairs.items()
+        ]
+    )
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+def main():
+    args = parse_args()
+    file = args.cross_joined_file
+
+    low_memory = args.task_attempts > 1
+    ratios_lf = compute_ratios(file, low_memory)
+
+    ratios_df = ratios_lf.collect()
+
+    if len(ratios_df) == 0:
+        raise ValueError(f"No output following treatment of file {str(file)}")
+
+    outfilename = args.cross_joined_file.name.replace("cross_join", "ratios")
+    ratios_df.write_parquet(outfilename)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/make_parquet_chunks.py b/bin/make_parquet_chunks.py
new file mode 100755
index 00000000..f1a56578
--- /dev/null
+++ b/bin/make_parquet_chunks.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
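[Reviewer note, not part of the patch: the log2 ratio built by `compute_ratios()` in `make_pairwise_gene_expression_ratio.py` above, reduced to a single hand-checkable row; `gene_id` and `sampleA` are illustrative names.]

```python
import polars as pl

# One cross-joined row: gene g1 paired with gene g2, counts for one sample
lf = pl.LazyFrame({
    "gene_id": ["g1"],
    "gene_id_other": ["g2"],
    "sampleA": [8.0],
    "sampleA_other": [2.0],
})

# log2(8 / 2) = 2.0, mirroring the expression used in compute_ratios()
out = lf.select(
    (pl.col("sampleA") / pl.col("sampleA_other")).log(base=2).alias("sampleA_log_ratio")
).collect()
print(out["sampleA_log_ratio"].to_list())  # [2.0]
```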
+
+import argparse
+import logging
+from math import ceil
+from pathlib import Path
+
+import config
+import polars as pl
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# experimentally chosen
+GENE_CHUNK_SIZE = 300
+ZERO_REPLACE_VALUE = 1e-8
+
+#####################################################
+#####################################################
+# FUNCTIONS
+#####################################################
+#####################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Split a normalised count file into gene-wise parquet chunks"
+    )
+    parser.add_argument(
+        "--counts",
+        type=Path,
+        dest="count_file",
+        required=True,
+        help="File containing normalised counts for all genes and all samples",
+    )
+    parser.add_argument(
+        "--task-attempts",
+        dest="task_attempts",
+        type=int,
+        default=1,
+        help="Number of task attempts",
+    )
+    return parser.parse_args()
+
+
+def get_nb_rows(lf: pl.LazyFrame) -> int:
+    return lf.select(pl.len()).collect().item()
+
+
+def parse_count_dataset(file: Path, low_memory: bool) -> pl.LazyFrame:
+    lf = pl.scan_parquet(file, low_memory=low_memory).fill_null(0).fill_nan(0)
+    count_columns = get_count_columns(lf)
+    cols = [pl.col(config.GENE_ID_COLNAME)] + [
+        pl.col(column).replace({0: ZERO_REPLACE_VALUE}).cast(pl.Float64)
+        for column in count_columns
+    ]
+    return lf.select(cols)
+
+
+def get_count_columns(lf: pl.LazyFrame) -> list[str]:
+    """Get all column names except the config.GENE_ID_COLNAME column.
+
+    The config.GENE_ID_COLNAME column contains only gene IDs.
+    """
+    return [
+        col
+        for col in lf.collect_schema().names()
+        if not col.startswith(config.GENE_ID_COLNAME)
+    ]
+
+
+def split_count_summary_in_chunks(lf: pl.LazyFrame):
+    lf = lf.with_row_index(name="index")
+
+    nb_rows = get_nb_rows(lf)
+    logger.info(f"Number of rows (genes) in count file: {nb_rows}")
+    nb_chunks = ceil(nb_rows / GENE_CHUNK_SIZE)
+    logger.info(f"Number of chunks: {nb_chunks}")
+
+    for i, start in enumerate(range(0, nb_rows, GENE_CHUNK_SIZE)):
+        partition = (
+            lf.filter(
+                (pl.col("index") >= start) & (pl.col("index") < start + GENE_CHUNK_SIZE)
+            )
+            .drop("index")
+            .collect()
+        )
+        outfile = f"count_chunk.{i}.parquet"
+        partition.write_parquet(outfile)
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+def main():
+    args = parse_args()
+
+    low_memory = args.task_attempts > 1
+    logger.info("Parsing count file")
+    lf = parse_count_dataset(args.count_file, low_memory)
+
+    logger.info("Splitting count file into chunks")
+    split_count_summary_in_chunks(lf)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/map_ids_to_ensembl.py b/bin/map_ids_to_ensembl.py
deleted file mode 100755
index 0b1d6b15..00000000
--- a/bin/map_ids_to_ensembl.py
+++ /dev/null
@@ -1,311 +0,0 @@
-#!/usr/bin/env python3
-
-# Written by Olivier Coen. Released under the MIT license.
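[Reviewer note, not part of the patch: the row-index chunking performed by `split_count_summary_in_chunks()` above, with a tiny chunk size so the arithmetic is visible; the script itself uses GENE_CHUNK_SIZE = 300.]

```python
import polars as pl

CHUNK = 2  # illustrative chunk size

lf = pl.LazyFrame({"gene_id": [f"g{i}" for i in range(5)]}).with_row_index(name="index")
nb_rows = lf.select(pl.len()).collect().item()

for i, start in enumerate(range(0, nb_rows, CHUNK)):
    part = (
        lf.filter((pl.col("index") >= start) & (pl.col("index") < start + CHUNK))
        .drop("index")
        .collect()
    )
    print(i, part["gene_id"].to_list())
# 0 ['g0', 'g1']
# 1 ['g2', 'g3']
# 2 ['g4']   <- the last chunk may be smaller than CHUNK
```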
- -import requests -import pandas as pd -from pathlib import Path -import argparse -import logging -import sys - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -################################################################## -# CONSTANTS -################################################################## - -RENAMED_FILE_SUFFIX = ".renamed.csv" -METADATA_FILE_SUFFIX = ".metadata.csv" -MAPPING_FILE_SUFFIX = ".mapping.csv" - -CHUNKSIZE = 2000 # number of IDs to convert at a time - may create trouble if > 2000 - -GPROFILER_CONVERT_API_ENDPOINT = "https://biit.cs.ut.ee/gprofiler/api/convert/convert/" -GPROFILER_CONVERT_BETA_API_ENDPOINT = ( - "https://biit.cs.ut.ee/gprofiler_beta/api/convert/convert/" -) - -TARGET_DATABASE = "ENSG" # Ensembl database -COLS_TO_KEEP = ["incoming", "converted", "name", "description"] -DESCRIPTION_PART_TO_REMOVE_REGEX = r"\s*\[Source:.*?\]" -ORIGINAL_GENE_ID_COLNAME = "original_gene_id" -ENSEMBL_GENE_ID_COLNAME = "ensembl_gene_id" - - -################################################################## -# FUNCTIONS -################################################################## - - -def parse_args(): - parser = argparse.ArgumentParser("Map IDs to Ensembl") - parser.add_argument( - "--count-file", type=Path, required=True, help="Input file containing counts" - ) - parser.add_argument( - "--species", type=str, required=True, help="Species to convert IDs for" - ) - parser.add_argument( - "--custom-mappings", type=str, help="Optional file containing custom mappings" - ) - return parser.parse_args() - - -def format_species_name(species: str): - """ - Format a species name into a format accepted by g:Profiler. - Example: Arabidopsis thaliana -> athaliana - - Parameters - ---------- - species : str - The species name. - - Returns - ------- - str - The formatted species name. - """ - splitted_species = species.lower().replace("_", " ").split(" ") - return splitted_species[0][0] + splitted_species[1] - - -def chunk_list(lst: list, chunksize: int): - """Splits a list into chunks of a given size. - - Args: - lst (list): The list to split. - chunksize (int): The size of each chunk. - - Returns: - list: A list of chunks, where each chunk is a list of len(chunksize). - """ - return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] - - -def request_conversion( - gene_ids: list, - species: str, - target_database: str, - url: str = GPROFILER_CONVERT_API_ENDPOINT, - attempts: int = 0, -) -> list[str]: - """ - Send a request to the g:Profiler API to convert a list of gene IDs. - - Parameters - ---------- - gene_ids : list - The list of gene IDs to convert. - species : str - The species to convert the IDs for. - url : str, optional - The URL to send the request to, by default GPROFILER_CONVERT_API_ENDPOINT - attempts : int, optional - The number of attempts already performed, by default 0 - - Returns - ------- - list - The list of dicts corresponding to the converted IDs. - """ - - if attempts > 0: - logger.warning( - "g:Profiler main server appears down, trying with the beta server..." 
- ) - - response = requests.post( - url=url, - json={"organism": species, "query": gene_ids, "target": target_database}, - ) - - try: - response.raise_for_status() - except requests.exceptions.HTTPError as err: - if err.response.status_code == 502: - # server appears down - if attempts == 0: - # we only tried with the main server, we try with the beta server - return request_conversion( - gene_ids, - species, - target_database=target_database, - url=GPROFILER_CONVERT_BETA_API_ENDPOINT, - attempts=1, - ) - else: - # both servers appear down, we stop here... - logger.error( - "g:Profiler servers (main and beta) seem to be down... Please retry later... " - "If you have gene ID mappings and / or gene metadata for these datasets, you can provide them " - "directly using the `--gene_id_mapping` and `--gene_metadata` parameters respectively, " - "and by skipping the g:Profiler ID mapping step with `--skip_gprofiler`." - ) - sys.exit(102) - - logger.error(f"Error {err.response.status_code} while converting IDs: {err}") - sys.exit(101) - - return response.json()["result"] - - -def convert_ids(gene_ids: list, species: str): - """ - Wrapper function that converts a list of gene IDs to another namespace. - - Parameters - ---------- - species : str - The species to convert the IDs for. - gene_ids : list - The IDs to convert. - target_database : str - The target database to convert to. - - Returns - ------- - dict - A dictionary where the keys are the original IDs and the values are the converted IDs. - """ - - results = request_conversion(gene_ids, species, TARGET_DATABASE) - df = pd.DataFrame.from_records(results) - - if df.empty: - return {} - - # keeping only rows where 'converted' is not null and only the columns of interest - df = df.loc[df["converted"] != "None", COLS_TO_KEEP] - - # dict associating incoming IDs to converted IDs - mapping_dict = df.set_index("incoming").to_dict()["converted"] - - # DataFrame associating converted IDs to name and description - meta_df = df.drop(columns=["incoming"]).rename( - columns={"converted": ENSEMBL_GENE_ID_COLNAME} - ) - - meta_df["name"] = meta_df["name"].str.replace(",", ";") - - # Extract the part before '[Source:...]', or the whole string if not found - meta_df["description"] = ( - meta_df["description"] - .str.replace(DESCRIPTION_PART_TO_REMOVE_REGEX, "", regex=True) - .str.replace(",", ";") - ) - - return mapping_dict, meta_df - - -################################################################## -# MAIN -################################################################## - - -def main(): - args = parse_args() - - count_file = args.count_file - species_name = format_species_name(args.species) - logger.info( - f"Converting IDs for species {args.species} and count file {count_file.name}..." - ) - - #############################################################" - # PARSING FILES - ############################################################# - df = pd.read_csv(count_file, header=0, index_col=0) - if df.empty: - logger.error("Count file is empty! 
Aborting ID mapping...") - sys.exit(100) - - df.index = df.index.astype(str) - gene_ids = df.index.tolist() - - custom_mappings_dict = {} - custom_mapping_file = args.custom_mappings - if custom_mapping_file: - if Path(custom_mapping_file).is_file(): - custom_mapping_df = pd.read_csv(custom_mapping_file) - custom_mappings_dict = custom_mapping_df.set_index( - ORIGINAL_GENE_ID_COLNAME - )[ENSEMBL_GENE_ID_COLNAME].to_dict() - - gene_ids_left_to_map = [ - gene_id for gene_id in gene_ids if gene_id not in custom_mappings_dict.keys() - ] - logger.info(f"Number of genes left to map: {len(gene_ids_left_to_map)}") - - ############################################################# - # QUERYING g:PROFILER SERVER - ############################################################# - mapping_dict = {} - gene_metadata_dfs = [] - - if gene_ids_left_to_map: - chunks = chunk_list(gene_ids_left_to_map, chunksize=CHUNKSIZE) - for chunk_gene_ids in chunks: - # converting to Ensembl IDs for all IDs comprised in this chunk - gene_mapping, meta_df = convert_ids(chunk_gene_ids, species_name) - mapping_dict.update(gene_mapping) - gene_metadata_dfs.append(meta_df) - - # adding custom mappings - mapping_dict.update(custom_mappings_dict) - # if mapping dict is empty - if not mapping_dict: - logger.error( - f"No mapping found for gene names in count file {count_file.name} " - f"and for species {args.species}! " - f"Example of gene names found in the provided dataframe: {df.index[:5].tolist()}" - f"Count file is empty! Aborting ID mapping..." - ) - sys.exit(101) - - #############################################################" - # MAPPING GENE IDS IN DATAFRAME - ############################################################# - # filtering the DataFrame to keep only the rows where the index can be mapped - df = df.loc[df.index.isin(mapping_dict)] - - # renaming gene names to mapped ids using mapping dict - df.index = df.index.map(mapping_dict) - df.reset_index(inplace=True) - df.rename(columns={"index": ENSEMBL_GENE_ID_COLNAME}, inplace=True) - - # TODO: check is there is another way to avoid duplicate gene names - # sometimes different gene names have the same ensembl ID - # for now, we just get the mean of values, but this is not ideal - df = df.groupby(ENSEMBL_GENE_ID_COLNAME, as_index=False).mean() - - #############################################################" - # WRITING OUTFILES - ############################################################# - # writing to output file - outfile = count_file.with_name(count_file.stem + RENAMED_FILE_SUFFIX) - df.to_csv(outfile, index=False, header=True) - - # concatenating all metadata and ensuring there are no duplicates - if gene_metadata_dfs: - gene_metadata_df = pd.concat(gene_metadata_dfs, ignore_index=True) - gene_metadata_df.drop_duplicates(inplace=True) - # writing gene metadata to file - metadata_file = count_file.with_name(count_file.stem + METADATA_FILE_SUFFIX) - gene_metadata_df.to_csv(metadata_file, index=False, header=True) - - # making dataframe for mapping (only two columns: original and new) - mapping_df = ( - pd.DataFrame(mapping_dict, index=[0]) - .T.reset_index() # transpose: setting keys as indexes instead of columns - .rename(columns={"index": ORIGINAL_GENE_ID_COLNAME, 0: ENSEMBL_GENE_ID_COLNAME}) - ) - mapping_file = count_file.with_name(count_file.stem + MAPPING_FILE_SUFFIX) - mapping_df.to_csv(mapping_file, index=False, header=True) - - -if __name__ == "__main__": - main() diff --git a/bin/merge_counts.py b/bin/merge_counts.py new file mode 100755 index 
00000000..7ec9352b
--- /dev/null
+++ b/bin/merge_counts.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+
+import argparse
+import hashlib
+import json
+import logging
+from functools import reduce
+from operator import attrgetter
+from pathlib import Path
+
+import config
+import polars as pl
+from tqdm import tqdm
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+ALL_COUNTS_PARQUET_OUTFILENAME = "all_counts.parquet"
+
+
+#####################################################
+#####################################################
+# FUNCTIONS
+#####################################################
+#####################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Merge count datasets")
+    parser.add_argument(
+        "--counts", type=str, dest="count_files", required=True, help="Count files"
+    )
+    return parser.parse_args()
+
+
+#####################################################
+# COUNTS
+#####################################################
+
+
+def parse_count_file(count_file: Path) -> pl.DataFrame:
+    df = pl.read_parquet(count_file)
+    # in some cases, the first column may have an empty name or be different than config.GENE_ID_COLNAME
+    # in any case, this column must have the config.GENE_ID_COLNAME name
+    first_column_name = df.columns[0]
+    if first_column_name != config.GENE_ID_COLNAME:
+        df = df.rename({first_column_name: config.GENE_ID_COLNAME})
+    return df
+
+
+def is_valid_df(df: pl.DataFrame, file: Path) -> bool:
+    """Check if a DataFrame is valid.
+    A DataFrame is considered valid if it contains at least one row.
+    """
+    try:
+        return not df.limit(1).is_empty()
+    except FileNotFoundError:
+        # strangely enough we get this error for some files existing but empty
+        logger.error(f"Could not find file {str(file)}")
+        return False
+    except pl.exceptions.NoDataError as err:
+        logger.error(f"File {str(file)} is empty: {err}")
+        return False
+
+
+def get_valid_dfs(files: list[Path]) -> list[pl.DataFrame]:
+    """Get a list of valid DataFrames from a list of files.
+    A DataFrame is considered valid if it contains at least one row.
+    """
+    df_dict = {file: parse_count_file(file) for file in tqdm(files)}
+    return [df for file, df in df_dict.items() if is_valid_df(df, file)]
+
+
+def join_count_dfs(df1: pl.DataFrame, df2: pl.DataFrame) -> pl.DataFrame:
+    """Join two DataFrames on the config.GENE_ID_COLNAME column.
+
+    The how parameter is set to "full" to include all rows from both dfs.
+    The coalesce parameter is set to True to merge the two join-key columns
+    into a single gene ID column.
+    """
+    return df1.join(df2, on=config.GENE_ID_COLNAME, how="full", coalesce=True)
+
+
+def get_count_columns(df: pl.DataFrame) -> list[str]:
+    """Get all column names except the config.GENE_ID_COLNAME column.
+
+    The config.GENE_ID_COLNAME column contains only gene IDs.
+    """
+    return df.select(pl.exclude(config.GENE_ID_COLNAME)).columns
+
+
+def reproducible_hash(tpl: tuple[str]) -> str:
+    """
+    Return a deterministic MD5 hash for the given tuple.
+
+    Steps:
+    1. Convert the tuple (and any nested structures) to a canonical JSON string.
+       - `sort_keys=True` guarantees that dictionaries are ordered consistently.
+       - `separators=(',', ':')` removes unnecessary whitespace.
+    2. Encode the string as UTF-8 bytes.
+    3. Feed the bytes to hashlib.md5 and return the hex digest.
+
+    The result is a 32-character hexadecimal string that will be identical
+    across Python runs, machines, and even different Python versions
+    (provided the data types are JSON-compatible).
+    """
+    # Canonical JSON representation
+    canonical_str = json.dumps(tpl, sort_keys=True, separators=(",", ":"))
+    # Encode to bytes
+    data_bytes = canonical_str.encode("utf-8")
+    # Compute MD5
+    hash_obj = hashlib.md5(data_bytes)
+    return hash_obj.hexdigest()
+
+
+def get_counts(files: list[Path]) -> pl.DataFrame:
+    """Get all count data from a list of files.
+
+    The files are merged into a single dataframe. The config.GENE_ID_COLNAME column is cast
+    to String, and all other columns are cast to Float64.
+    """
+    logger.info("Parsing counts")
+    dfs = get_valid_dfs(files)
+
+    # sorting dataframes by a hash on column names
+    # this is crucial for consistent output of the script
+    # in case multiple files have the same name
+    dfs.sort(key=lambda df: reproducible_hash(tuple(df.columns)))
+
+    # joining all count files
+    logger.info(
+        f"Joining count files recursively on the {config.GENE_ID_COLNAME} column"
+    )
+    merged_df = reduce(join_count_dfs, tqdm(dfs))
+
+    count_columns = get_count_columns(merged_df)
+    # casting count columns to Float64
+    # casting gene id column to String
+    # casting nans to nulls
+    logger.info("Cleaning merged dataframe")
+    return merged_df.select(
+        [pl.col(config.GENE_ID_COLNAME).cast(pl.String)]
+        + [pl.col(column).cast(pl.Float64) for column in count_columns]
+    ).fill_nan(None)
+
+
+#####################################################
+# EXPORT
+#####################################################
+
+
+def export_data(count_df: pl.DataFrame):
+    """Export gene expression data."""
+    logger.info(f"Exporting normalised counts to: {ALL_COUNTS_PARQUET_OUTFILENAME}")
+    count_df.write_parquet(ALL_COUNTS_PARQUET_OUTFILENAME)
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+def main():
+    args = parse_args()
+
+    # parsing count files
+    count_files = [Path(file) for file in args.count_files.split(" ")]
+    # sorting them by file name to ensure consistent order between runs
+    count_files.sort(key=attrgetter("name"))
+    logger.info(f"Merging {len(count_files)} count files")
+
+    # putting all counts into a single dataframe
+    count_df = get_counts(count_files)
+    export_data(count_df)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/merge_data.py b/bin/merge_data.py
deleted file mode 100755
index 1cc63f43..00000000
--- a/bin/merge_data.py
+++ /dev/null
@@ -1,304 +0,0 @@
-#!/usr/bin/env python3
-
-# Written by Olivier Coen. Released under the MIT license.
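[Reviewer note, not part of the patch: behaviour of the full outer join used by `join_count_dfs()` in `bin/merge_counts.py` above, assuming a recent polars where `how="full"` and `coalesce=True` are available; `gene_id` stands in for `config.GENE_ID_COLNAME`.]

```python
import polars as pl

df1 = pl.DataFrame({"gene_id": ["g1", "g2"], "s1": [1.0, 2.0]})
df2 = pl.DataFrame({"gene_id": ["g2", "g3"], "s2": [5.0, 6.0]})

# how="full" keeps genes found in either file; coalesce=True folds the two
# join-key columns back into a single gene_id column
merged = df1.join(df2, on="gene_id", how="full", coalesce=True).sort("gene_id")
print(merged)
# g1 -> s2 is null; g2 -> both counts present; g3 -> s1 is null
```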
- -import argparse -import polars as pl -from pathlib import Path -import logging -from functools import reduce - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -ALL_COUNTS_PARQUET_OUTFILENAME = "all_counts.parquet" -ALL_DESIGNS_OUTFILENAME = "all_designs.csv" -GENE_COUNT_STATS_OUTFILENAME = "gene_count_statistics.csv" -SKEWNESS_STATS_OUTFILENAME = "skewness_statistics.csv" -KS_TEST_STATS_OUTFILENAME = "ks_test_statistics.csv" -CANDIDATE_GENE_COUNTS_PARQUET_OUTFILENAME = "candidate_gene_counts.parquet" -DISTRIBUTION_CORRELATIONS_OUTFILENAME = "distribution_correlations.csv" - -ENSEMBL_GENE_ID_COLNAME = "ensembl_gene_id" -STATISTIC_TYPE_COLNAME = "stat_type" -GENE_COUNT_COLNAME = "count" -SKEWNESS_COLNAME = "skewness" -KS_TEST_COLNAME = "kolmogorov_smirnov_to_uniform_dist_pvalue" -SAMPLE_COLNAME = "sample" - -STAT_COLNAME_TO_PARAMS = { - GENE_COUNT_COLNAME: { - "outfilename": GENE_COUNT_STATS_OUTFILENAME, - "descending": False, - }, - SKEWNESS_COLNAME: {"outfilename": SKEWNESS_STATS_OUTFILENAME, "descending": False}, - KS_TEST_COLNAME: {"outfilename": KS_TEST_STATS_OUTFILENAME, "descending": True}, -} - - -##################################################### -##################################################### -# FUNCTIONS -##################################################### -##################################################### - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Get variation from count data for each gene" - ) - parser.add_argument( - "--counts", type=str, dest="count_files", required=True, help="Count files" - ) - parser.add_argument( - "--designs", type=str, dest="design_files", required=True, help="Design files" - ) - parser.add_argument( - "--stats", - type=str, - dest="dataset_stat_files", - required=True, - help="Dataset stats files", - ) - parser.add_argument( - "--nb-candidate-genes", - type=int, - dest="nb_candidate_genes", - required=True, - help="Number of candidate genes to keep", - ) - return parser.parse_args() - - -##################################################### -# COUNTS -##################################################### - - -def parse_count_file(count_file: Path) -> pl.LazyFrame: - lf = pl.scan_parquet(count_file) - # in some cases, the first column may have an empty name or be different than ENSEMBL_GENE_ID_COLNAME - # in any case, this column must have the ENSEMBL_GENE_ID_COLNAME name - first_column_name = lf.collect_schema().names()[0] - if first_column_name != ENSEMBL_GENE_ID_COLNAME: - lf = lf.rename({first_column_name: ENSEMBL_GENE_ID_COLNAME}) - return lf - - -def is_valid_df(lf: pl.LazyFrame, file: Path) -> bool: - """Check if a LazyFrame is valid. - - A LazyFrame is considered valid if it contains at least one row. - """ - try: - return not lf.limit(1).collect().is_empty() - except FileNotFoundError: - # strangely enough we get this error for some files existing but empty - logger.error(f"Could not find file {str(file)}") - return False - except pl.exceptions.NoDataError as err: - logger.error(f"File {str(file)} is empty: {err}") - return False - - -def get_valid_lazy_dfs(files: list[Path]) -> list[pl.LazyFrame]: - """Get a list of valid LazyFrames from a list of files. - - A LazyFrame is considered valid if it contains at least one row. 
- """ - lf_dict = {file: parse_count_file(file) for file in files} - return [lf for file, lf in lf_dict.items() if is_valid_df(lf, file)] - - -def join_count_dfs(lf1: pl.LazyFrame, lf2: pl.LazyFrame) -> pl.LazyFrame: - """Join two LazyFrames on the ENSEMBL_GENE_ID_COLNAME column. - - The how parameter is set to "full" to include all rows from both dfs. - The coalesce parameter is set to True to fill NaN values in the - resulting dataframe with values from the other dataframe. - """ - return lf1.join(lf2, on=ENSEMBL_GENE_ID_COLNAME, how="full", coalesce=True) - - -def get_count_columns(lf: pl.LazyFrame) -> list[str]: - """Get all column names except the ENSEMBL_GENE_ID_COLNAME column. - - The ENSEMBL_GENE_ID_COLNAME column contains only gene IDs. - """ - return lf.select(pl.exclude(ENSEMBL_GENE_ID_COLNAME)).collect_schema().names() - - -def get_counts(files: list[Path]) -> pl.DataFrame: - """Get all count data from a list of files. - - The files are merged into a single dataframe. The ENSEMBL_GENE_ID_COLNAME column is cast - to String, and all other columns are cast to Float64. - """ - # lazy loading - lfs = get_valid_lazy_dfs(files) - # joining all count files - merged_lf = reduce(join_count_dfs, lfs) - - count_columns = get_count_columns(merged_lf) - # casting count columns to Float64 - # casting gene id column to String - # casting nans to nulls - return ( - merged_lf.select( - [pl.col(ENSEMBL_GENE_ID_COLNAME).cast(pl.String)] - + [pl.col(column).cast(pl.Float64) for column in count_columns] - ) - .fill_nan(None) - .collect() - ) - - -def get_nb_rows(lf: pl.LazyFrame) -> int: - return lf.select(pl.len()).collect().item() - - -##################################################### -# DESIGNS -##################################################### - - -def parse_design_file(design_file: Path) -> pl.DataFrame: - design_df = pl.read_csv(design_file, has_header=True) - # adding batch name from file stem if not present - if "batch" not in design_df.columns: - design_df = design_df.with_columns(pl.lit(design_file.stem).alias("batch")) - return design_df.select("batch", "condition", "sample") - - -def merge_designs(design_files: list[Path]) -> pl.DataFrame: - design_dfs = [parse_design_file(design_file) for design_file in design_files] - return pl.concat(design_dfs, how="vertical") - - -##################################################### -# STATISTICS -##################################################### - - -def parse_stat_file(stat_file: Path) -> pl.DataFrame: - return pl.read_csv(stat_file, has_header=True) - - -def merge_stats(stat_files: list[Path]) -> pl.DataFrame: - stat_dfs = [parse_stat_file(stat_file) for stat_file in stat_files] - return pl.concat(stat_dfs, how="vertical") - - -def compute_distances_to_mean(count_df: pl.DataFrame) -> pl.DataFrame: - corr_dict = {"sample": [], "correlation": []} - - count_df = count_df.select(pl.exclude(ENSEMBL_GENE_ID_COLNAME)) - mean_series = count_df.mean_horizontal() - - for sample in count_df.columns: - correlation = count_df.select(pl.corr(count_df[sample], mean_series)) - corr_dict["sample"].append(sample) - corr_dict["correlation"].append(correlation.item()) - - return ( - pl.DataFrame(corr_dict) - .fill_nan(None) - .sort(by="correlation", descending=True, nulls_last=True) - ) - - -##################################################### -# CANDIDATE GENES -##################################################### - - -def get_candidate_gene_counts( - count_df: pl.DataFrame, nb_candidate_genes: int -) -> pl.DataFrame: - candidate_gene_lf = ( - 
count_df.with_columns( - std=pl.concat_list(pl.exclude(ENSEMBL_GENE_ID_COLNAME)) - .list.drop_nulls() - .list.std() - ) - .sort("std", descending=False) - .head(nb_candidate_genes) - ) - candidate_gene_ids = ( - candidate_gene_lf.select(ENSEMBL_GENE_ID_COLNAME).to_series().to_list() - ) - return count_df.filter(pl.col(ENSEMBL_GENE_ID_COLNAME).is_in(candidate_gene_ids)) - - -##################################################### -# EXPORT -##################################################### - - -def export_data( - count_df: pl.DataFrame, - design_df: pl.DataFrame, - candidate_gene_counts_df: pl.DataFrame, - corr_df: pl.DataFrame, -): - """Export gene expression data.""" - logger.info(f"Exporting normalised counts to: {ALL_COUNTS_PARQUET_OUTFILENAME}") - count_df.write_parquet(ALL_COUNTS_PARQUET_OUTFILENAME) - - logger.info(f"Exporting designs to: {ALL_DESIGNS_OUTFILENAME}") - design_df.write_csv(ALL_DESIGNS_OUTFILENAME) - - logger.info( - f"Exporting candidate gene counts to: {CANDIDATE_GENE_COUNTS_PARQUET_OUTFILENAME}" - ) - candidate_gene_counts_df.write_parquet(CANDIDATE_GENE_COUNTS_PARQUET_OUTFILENAME) - - logger.info( - f"Exporting distribution correlations to: {DISTRIBUTION_CORRELATIONS_OUTFILENAME}" - ) - corr_df.write_csv(DISTRIBUTION_CORRELATIONS_OUTFILENAME, include_header=False) - - -def export_individual_statistics(dataset_stats_df: pl.DataFrame): - for data_col, params in STAT_COLNAME_TO_PARAMS.items(): - outfilename = params["outfilename"] - logger.info(f"Exporting {data_col} statistics to: {outfilename}") - sorted_data = dataset_stats_df[[SAMPLE_COLNAME, data_col]].sort( - data_col, descending=params["descending"] - ) - sorted_data.write_csv(outfilename, include_header=False) - - -##################################################### -##################################################### -# MAIN -##################################################### -##################################################### - - -def main(): - args = parse_args() - count_files = [Path(file) for file in args.count_files.split(" ")] - design_files = [Path(file) for file in args.design_files.split(" ")] - dataset_stat_files = [Path(file) for file in args.dataset_stat_files.split(" ")] - - # putting all counts into a single dataframe - count_df = get_counts(count_files) - # putting all design data into a single dataframe - design_df = merge_designs(design_files) - # putting all stats data into a single dataframe - dataset_stats_df = merge_stats(dataset_stat_files) - - candidate_gene_counts_df = get_candidate_gene_counts( - count_df, args.nb_candidate_genes - ) - - # adding stat about divergence to mean distribution - corr_df = compute_distances_to_mean(count_df) - - export_data(count_df, design_df, candidate_gene_counts_df, corr_df) - export_individual_statistics(dataset_stats_df) - - -if __name__ == "__main__": - main() diff --git a/bin/natural_language_utils.py b/bin/natural_language_utils.py new file mode 100755 index 00000000..79f8463c --- /dev/null +++ b/bin/natural_language_utils.py @@ -0,0 +1,139 @@ +import nltk +from nltk.corpus import wordnet + +nltk.download("punkt_tab") +nltk.download("averaged_perceptron_tagger_eng") +nltk.download("wordnet") + +lemmatizer = nltk.WordNetLemmatizer() +stemmer = nltk.PorterStemmer() + + +def get_wordnet_pos(token: str) -> str: + tag = nltk.pos_tag([token])[0][1][0].upper() + tag_dict = { + "J": wordnet.ADJ, + "N": wordnet.NOUN, + "V": wordnet.VERB, + "R": wordnet.ADV, + } + return tag_dict.get(tag, wordnet.NOUN) # Default to NOUN if not found + + +def 
get_stemmed_tokens(sentence: str) -> list[str]:
+    """
+    Tokenize a sentence into its constituent words, and then stem each word
+
+    Parameters
+    ----------
+    sentence : str
+        The sentence to be tokenized and stemmed
+
+    Returns
+    -------
+    tokens : List[str]
+        The list of stemmed tokens
+    """
+
+    tokens = nltk.word_tokenize(sentence)
+    return [stemmer.stem(token) for token in tokens]
+
+
+def get_lemmed_tokens(sentence: str) -> list[str]:
+    """
+    Tokenize a sentence into its constituent words, and then lemmatize each word
+
+    Parameters
+    ----------
+    sentence : str
+        The sentence to be tokenized and lemmatized
+
+    Returns
+    -------
+    tokens : List[str]
+        The list of lemmatized tokens
+    """
+    tokens = nltk.word_tokenize(sentence)
+    return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
+
+
+def get_synonyms(word: str) -> set[str]:
+    """
+    Get all synonyms of a word from the wordnet database.
+
+    Parameters
+    ----------
+    word : str
+        The word for which to get synonyms
+
+    Returns
+    -------
+    synonyms : set
+        A set of all synonyms of the word
+    """
+    synonyms = []
+    for syn in wordnet.synsets(word):
+        for lemma in syn.lemmas():
+            synonyms.append(lemma.name())  # Get the name of each lemma (synonym)
+    return set(synonyms)  # Return as a set to avoid duplicates
+
+
+def get_all_candidate_target_words(sentence: str) -> list[str]:
+    """
+    Get all candidate target words from a sentence by stemming and lemmatizing the
+    tokens and getting synonyms from the wordnet database.
+
+    Parameters
+    ----------
+    sentence : str
+        The sentence from which to get candidate target words
+
+    Returns
+    -------
+    candidates : list
+        A list of all candidate target words
+    """
+    candidates = []
+    lemmatized_tokens = get_lemmed_tokens(sentence)
+    stemmed_tokens = get_stemmed_tokens(sentence)
+    tokens = list(set(lemmatized_tokens + stemmed_tokens))
+    for token in tokens:
+        candidates += get_synonyms(token)
+    return candidates
+
+
+def word_is_in_sentence(word: str, sentence: str) -> bool:
+    """
+    Check if a word (or a stemmed version of it) is in a sentence, or if it is a
+    subword of a stemmed version of any word in the sentence.
+
+    Parameters
+    ----------
+    word : str
+        The word to be searched for
+    sentence : str
+        The sentence in which to search for the word
+
+    Returns
+    -------
+    bool
+        True if the word is found in the sentence, False otherwise
+    """
+    for stemmed_word in [word] + get_stemmed_tokens(word):
+        # testing if stemmed word is in sentence as it is
+        if stemmed_word in sentence:
+            return True
+        # or testing if stemmed word is a subword of a stemmed word from the sentence
+        for target_word in get_all_candidate_target_words(sentence):
+            if stemmed_word in target_word:
+                return True
+    return False
+
+
+def keywords_in_fields(fields: list[str], keywords: list[str]) -> list[str]:
+    return [
+        keyword
+        for keyword in keywords
+        for field in fields
+        if word_is_in_sentence(keyword, field)
+    ]
diff --git a/bin/normalise_microarray.R b/bin/normalise_microarray.R
new file mode 100755
index 00000000..f9343ebf
--- /dev/null
+++ b/bin/normalise_microarray.R
@@ -0,0 +1,129 @@
+#!/usr/bin/env Rscript
+
+# Written by Olivier Coen. Released under the MIT license.
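[Reviewer note, not part of the patch: a hypothetical call to the helpers in `bin/natural_language_utils.py` above; actual output depends on the NLTK corpora downloaded at import time.]

```python
# Hypothetical inputs; the sentence and keywords are made up for illustration
fields = ["Total RNA was extracted from drought-stressed leaves"]
keywords = ["stress", "flower"]

# "stress" is expected to survive because its stem is a substring of a
# stemmed/synonym-expanded token of the sentence; "flower" should be filtered out
print(keywords_in_fields(fields, keywords))  # expected: ['stress']
```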
+
+# we need to install the affy package manually while disabling threading
+# when installed through conda, we get: ERROR; return code from pthread_create() is 22
+if (!requireNamespace("affy", quietly = TRUE)) {
+  BiocManager::install("affy", configure.args="--disable-threading", force = TRUE, quiet = TRUE)
+}
+
+# Load libraries
+suppressPackageStartupMessages(library("affy"))
+suppressPackageStartupMessages(library("optparse"))
+suppressPackageStartupMessages(library("AnnotationDbi"))
+suppressPackageStartupMessages(library("dplyr"))
+suppressPackageStartupMessages(library("tibble"))
+
+options(error = traceback)
+
+
+#####################################################
+#####################################################
+# ARG PARSER
+#####################################################
+#####################################################
+
+get_args <- function() {
+  option_list <- list(
+    make_option("--input", help = "Folder containing CEL files"),
+    make_option("--target-gene-id-db", dest = "target_gene_id_db", help = "Target database for gene IDs (ENSEMBL or ENTREZID)")
+  )
+
+  args <- parse_args(OptionParser(
+    option_list = option_list,
+    description = "Normalize microarray data using RMA"
+  ))
+  return(args)
+}
+
+get_probe_id_mapping <- function(data, annot_db, target_gene_id_db, stringent) {
+
+  probe_ids <- rownames(data)
+  annotations <- AnnotationDbi::select(
+    annot_db,
+    keys = probe_ids,
+    columns = c(target_gene_id_db),
+    keytype = "PROBEID"
+  )
+
+  if (stringent) {
+    annotations <- annotations %>%
+      group_by(PROBEID) %>%
+      filter(n_distinct(.data[[target_gene_id_db]], na.rm = TRUE) == 1) %>%
+      ungroup()
+  }
+
+  return(annotations)
+}
+
+replace_probe_ids_by_target_ids <- function(data, annotations, target_gene_id_db) {
+  data <- as.data.frame(data)
+  data$PROBEID <- rownames(data)
+
+  data <- merge(annotations, data, by = "PROBEID", all.x = TRUE)
+
+  # computing mean of probe values for each gene
+  data <- data %>%
+    group_by(.data[[target_gene_id_db]]) %>%
+    summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE))) %>%
+    ungroup()
+
+  data <- tibble::column_to_rownames(data, var = target_gene_id_db)
+  return(data)
+}
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+main <- function() {
+
+  args <- get_args()
+
+  # Read CEL files from a directory
+  message("Reading CEL files from ", args$input)
+  data <- ReadAffy(celfile.path = args$input)
+
+  message("Installing annotation database")
+  db_name <- paste0(annotation(data), ".db")
+  if (!requireNamespace(db_name, quietly = TRUE)) {
+    BiocManager::install(db_name, quiet = TRUE)
+  }
+  library(db_name, character.only = TRUE)
+
+  # Normalize using RMA (most common method)
+  eset <- rma(data)
+  # Extract normalized expression values
+  message("Extracting normalized expression values")
+  normalised_data <- exprs(eset)
+
+  annotations <- get_probe_id_mapping(
+    normalised_data,
+    annot_db = get(db_name),  # Get the database object using get()
+    target_gene_id_db = args$target_gene_id_db,
+    stringent = TRUE
+  )
+
+  normalised_data_df <- replace_probe_ids_by_target_ids(normalised_data, annotations, args$target_gene_id_db)
+
+  # cleaning colnames
+  colnames(normalised_data_df) <- sub("\\..*", "", colnames(normalised_data_df))
+  colnames(normalised_data_df) <- sub("-", "_", colnames(normalised_data_df))
+
+  # Save results
+  message("Saving results to normalised_expression.csv")
+  write.csv(normalised_data_df, "normalised_expression.csv")
+
+}
+
+main()
diff --git a/bin/normfinder.py b/bin/normfinder.py
new file mode 100755
index 00000000..774d463d
--- /dev/null
+++ b/bin/normfinder.py
@@ -0,0 +1,519 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+
+import argparse
+import logging
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from statistics import mean
+
+import config
+import numpy as np
+import polars as pl
+from numba import njit, prange
+from tqdm import tqdm
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+STABILITY_OUTFILENAME = "stability_values.normfinder.csv"
+
+
+############################################################################
+# POLARS EXTENSIONS
+############################################################################
+
+
+@pl.api.register_expr_namespace("row")
+class StatsExtension:
+    def __init__(self, expr: pl.Expr):
+        self._expr = expr
+
+    def not_null_values(self):
+        return self._expr.list.eval(pl.element().drop_nulls().drop_nans()).list
+
+    def mean(self) -> pl.Expr:
+        """Mean over non-null values in row"""
+        return self.not_null_values().mean()
+
+    def sum(self) -> pl.Expr:
+        """Sum over non-null values in row"""
+        return self.not_null_values().sum()
+
+    def min(self) -> pl.Expr:
+        """Min over non-null values in row"""
+        return self.not_null_values().min()
+
+
+############################################################################
+# NUMBA-ACCELERATED FUNCTIONS
+############################################################################
+
+
+@njit(parallel=True)
+def compute_minvars(z: np.ndarray, target_idx: np.ndarray) -> np.ndarray:
+    """
+    z: (ngenes, nsamples) array
+    target_idx: 1D array of indices (int64) for which to compute minvar
+    returns: 1D array of length len(target_idx)
+    """
+    ngenes, nsamples = z.shape
+
+    # should not happen as it is checked upstream, but just in case
+    if nsamples < 2:
+        raise ValueError("Number of samples must be at least 2")
+
+    minvars = np.empty(len(target_idx), dtype=np.float64)
+    for k in prange(len(target_idx)):
+        i = target_idx[k]
+        # checking if counts for this gene are all nans
+        nb_valid_counts = (~np.isnan(z[i, :])).sum()
+        if nb_valid_counts < 1:
+            minvars[k] = np.nan
+            continue  # skip this gene
+        # computing variances of pairwise differences
+        minv = 1e18
+        for j in prange(ngenes):
+            if i == j:
+                continue
+            diffs = z[i, :] - z[j, :]
+            mean = np.sum(diffs) / nsamples  # scalar
+            var = np.sum((diffs - mean) ** 2) / (nsamples - 1)  # scalar
+            if np.isnan(var):
+                continue  # skip
+            if var < minv:
+                minv = var
+        minvars[k] = minv / 4.0 if minv < 1e18 else np.inf
+    return minvars
+
+
+#####################################################
+# NORMFINDER CLASS
+#####################################################
+
+
+@dataclass
+class NormFinder:
+    count_lf: pl.LazyFrame
+    design_df: pl.DataFrame
+
+    genes: list[str] = field(init=False)
+
+    group_to_samples_dict: dict[str, list[str]] = field(init=False)
+
+    n_groups: int = field(init=False)
+    n_genes: int = field(init=False)
+
+    def __post_init__(self):
+        # format_design
+        self.design_df = self.design_df.with_columns(
+            pl.concat_str([pl.col("batch"), pl.col("condition")], separator="_").alias(
+                "group"
+            )
+        ).select("sample", "group")
+
+        # make dict associating a group to the list of its samples
group_to_sample_df = self.design_df.group_by("group", maintain_order=True).agg( + "sample" + ) # maintain order is better for repeatability and testing + + self.group_to_samples_dict = { + d["group"]: d["sample"] for d in group_to_sample_df.to_dicts() + } + + groups = list(self.group_to_samples_dict.keys()) + self.n_groups = len(groups) + + self.genes = ( + self.count_lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list() + ) + self.n_genes = len(self.genes) + + if self.n_genes <= 2: + logger.error("Too few genes") + sys.exit(100) + + @staticmethod + def get_overall_mean_for_group(df_with_means_over_samples: pl.DataFrame) -> float: + return df_with_means_over_samples.mean().item() + + @staticmethod + def get_means_over_samples(df: pl.DataFrame) -> pl.DataFrame: + return df.with_columns( + mean_over_samples_for_gene=pl.concat_list(pl.all()).row.mean() + ).select("mean_over_samples_for_gene") + + def correct_negative_values( + self, intra_var_df: pl.DataFrame, group_count_df: pl.DataFrame + ) -> pl.DataFrame: + genes_with_negative_values = intra_var_df.select( + col for col in self.genes if (intra_var_df[col] < 0).all() + ).columns # intra_var_df has only one row but it is a dataframe + + # getting indexes of genes for which we must compute minvar + indexes_of_genes_with_negative_values = np.array( + [ + i + for i, gene in enumerate(self.genes) + if gene in genes_with_negative_values + ], + dtype=np.int64, + ) + + minvars = compute_minvars( + group_count_df.to_numpy(), indexes_of_genes_with_negative_values + ) + + # associating back minvars to their respective gene + minvar_dict = { + gene: minvars[i] for i, gene in enumerate(genes_with_negative_values) + } + return intra_var_df.with_columns( + [pl.lit(val).alias(col) for col, val in minvar_dict.items()] + ) + + def get_unbiased_intragroup_variance_for_group( + self, + group_count_df: pl.DataFrame, + means_over_samples_df: pl.DataFrame, + group_overall_mean: float, + samples: list[str], + ): + # TODO: see if it's correct + # if only one sample in the group, there's no variance + if len(samples) == 1: + data = {gene: [0] for gene in self.genes} + return pl.DataFrame(data) + + # lf is a lazyframe with a column being the gene ids (gene_id) + # and other columns being the samples + # the current chunk corresponds to only one group + # means_over_samples_df is a single column dataframe containing the means across each row (ie for each gene across samples) + ng = len(samples) + + means_over_samples = means_over_samples_df.to_series().rename( + "mean_over_samples_for_gene" + ) + + mean_over_genes = ( + group_count_df.mean() + .transpose() + .to_series() + .rename("mean_over_genes_for_sample") + ) + + sample_variance_df = ( + group_count_df.hstack( + [means_over_samples] + ) # adding column containing means over all samples in this group (for each gene) + .select( + [ + (pl.col(c) - pl.col("mean_over_samples_for_gene")).alias( + c + ) # y_igj - mean(y_ig*) + for c in samples + ] + ) + .transpose( + include_header=True, column_names=self.genes + ) # columns are now genes + .hstack( + [mean_over_genes] + ) # adding column containing means over all genes (for each sample) + .select( + [ + ( + ( + pl.col(c) + - pl.col("mean_over_genes_for_sample") + + group_overall_mean + ) + ** 2 + ).alias( + c + ) # r_igj ^2 = (y_igj - mean(y_ig*) -mean(y_*gj) + mean(y_*g*) ) ^ 2 + for c in self.genes + ] + ) + .transpose(include_header=True, column_names=samples) + .with_columns( + sample_variance=pl.concat_list(samples).row.sum() + / ( + (ng - 1) * (1 - 2 
/ self.n_genes)
+                )  # sum over j (samples) of r_igj ^2 terms
+            )
+            .select("sample_variance")
+            .transpose()
+            .rename({f"column_{i}": gene for i, gene in enumerate(self.genes)})
+        )
+
+        # sum of all sample variances for all genes
+        sample_variance_sum_over_genes = sample_variance_df.select(
+            pl.sum_horizontal(pl.all())
+        ).item()  # sum of all s_ij² over all genes
+
+        intra_var_df = sample_variance_df.select(
+            [
+                (
+                    pl.col(c)
+                    - sample_variance_sum_over_genes
+                    / (self.n_genes * (self.n_genes - 1))
+                ).alias(c)
+                for c in self.genes
+            ]
+        )
+        # if some values are negative, we need a special process
+        corrected_intra_var_df = self.correct_negative_values(
+            intra_var_df, group_count_df
+        )
+
+        return corrected_intra_var_df
+
+    def get_unbiased_intragroup_variances(self):
+        unbiased_intragroup_variance_dfs = []
+        means_over_samples_dfs = []
+        group_overall_means = []
+
+        for group, samples in tqdm(self.group_to_samples_dict.items()):
+            # sub dataframe corresponding to this group
+            chunk_df = self.count_lf.select(samples).collect()
+            # computing means over samples for each gene
+            means_over_samples_df = self.get_means_over_samples(chunk_df)
+            # getting overall expression average in the group for all genes
+            group_overall_mean = self.get_overall_mean_for_group(means_over_samples_df)
+
+            group_unbiased_intragroup_variance_df = (
+                self.get_unbiased_intragroup_variance_for_group(
+                    chunk_df, means_over_samples_df, group_overall_mean, samples
+                )
+            )
+
+            # storing intragroup values for each gene in this group
+            unbiased_intragroup_variance_dfs.append(
+                group_unbiased_intragroup_variance_df
+            )
+            # storing means over samples in this group for each gene
+            means_over_samples_df = means_over_samples_df.rename(
+                {"mean_over_samples_for_gene": group}
+            )
+            means_over_samples_dfs.append(means_over_samples_df)
+            # storing overall mean of expression in this group, for all genes and samples
+            group_overall_means.append(group_overall_mean)
+
+        # cast all values to float (to avoid issues when concat)
+        unbiased_intragroup_variance_dfs = [
+            df.select([pl.col(col).cast(pl.Float64) for col in df.columns])
+            for df in unbiased_intragroup_variance_dfs
+        ]
+
+        # removing None values in group_overall_means
+        # which would originate from group chunk dataframes that are full of null values
+        group_overall_means = [mean for mean in group_overall_means if mean is not None]
+
+        # before returning:
+        # concatenate together all intragroup variance data to have a single df for all groups
+        # stack all means over samples horizontally (becomes a gene * group df)
+        # get the mean of group_overall_means to get the overall mean expression value in the count dataframe
+        return (
+            pl.concat(unbiased_intragroup_variance_dfs),
+            pl.concat(means_over_samples_dfs, how="horizontal"),
+            mean(group_overall_means),
+        )
+
+    def adjust_for_nb_of_samples_in_groups(
+        self, unbiased_intragroup_variance_df: pl.DataFrame
+    ):
+        n_samples_list = [
+            len(samples) for samples in self.group_to_samples_dict.values()
+        ]
+        return unbiased_intragroup_variance_df.with_columns(
+            n_samples=pl.Series(n_samples_list)
+        ).select([(pl.col(c) / pl.col("n_samples")).alias(c) for c in self.genes])
+
+    def get_unbiased_intergroup_variance(
+        self, gene_means_in_groups_df: pl.DataFrame, dataset_overall_mean: float
+    ):
+        mean_over_genes = (
+            gene_means_in_groups_df.mean()
+            .transpose()
+            .to_series()
+            .rename("mean_over_genes_for_group")
+        )
+
+        return (
+            gene_means_in_groups_df.with_columns(
+
mean_over_groups_for_gene=pl.concat_list(pl.all()).row.mean() + ) + .select( + [ + (pl.col(c) - pl.col("mean_over_groups_for_gene")).alias(c) + for c in gene_means_in_groups_df.columns + ] + ) + .transpose(column_names=self.genes) + .hstack([mean_over_genes]) + .select( + [ + ( + pl.col(c) + - pl.col("mean_over_genes_for_group") + + dataset_overall_mean + ).alias(c) + for c in self.genes + ] + ) + .select( + [(pl.col(c) ** 2).alias(c) for c in self.genes] + ) # square to get variance + ) + + def compute_gamma_factor(self, diff_df: pl.DataFrame, vardiff_df: pl.DataFrame): + logger.info("Computing gamma factor") + first_term = ( + diff_df.with_columns( + sum_of_squares=pl.concat_list(pl.all()).row.sum() # sum over columns + ) + .select("sum_of_squares") + .sum() # sum over rows + .select( + ( + pl.col("sum_of_squares") + / ((self.n_groups - 1) * (self.n_genes - 1)) + ).alias("normalised_sum_of_squares") + ) + .item() + ) + + second_term = ( + vardiff_df.with_columns( + sum=pl.concat_list(pl.all()).row.sum() # sum over columns + ) + .select("sum") + .sum() # sum over rows + .select( + (pl.col("sum") / (self.n_groups * self.n_genes)).alias("normalised_sum") + ) + .item() + ) + + return max(first_term - second_term, 0) # set to 0 if negative + + @staticmethod + def apply_gamma_factor( + gamma: float, diff_df: pl.DataFrame, vardiff_df: pl.DataFrame + ): + difnew = diff_df * gamma / (gamma + vardiff_df) + varnew = vardiff_df + gamma * vardiff_df / (gamma + vardiff_df) + return difnew, varnew + + def apply_shrinkage( + self, intergroup_variance_df: pl.DataFrame, group_mean_variance_df: pl.DataFrame + ): + gamma = self.compute_gamma_factor( + intergroup_variance_df, group_mean_variance_df + ) + return self.apply_gamma_factor( + gamma, intergroup_variance_df, group_mean_variance_df + ) + + def get_stability_values( + self, shrunk_intervar_df: pl.DataFrame, shrunk_gr_mean_var_df: pl.DataFrame + ): + return ( + ( + shrunk_intervar_df.select([pl.col(c).abs() for c in self.genes]) + + shrunk_gr_mean_var_df.select([pl.col(c).sqrt() for c in self.genes]) + ) + .mean() + .transpose( + include_header=True, + header_name=config.GENE_ID_COLNAME, + column_names=[config.NORMFINDER_STABILITY_VALUE_COLNAME], + ) + ) + + def compute_stability_scoring(self): + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # UNBIASED INTRAGROUP VARIANCE + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Computing intragroup variances") + intragroup_variance_df, gene_means_in_groups_df, dataset_overall_mean = ( + self.get_unbiased_intragroup_variances() + ) + + logger.info("Adjusting variances by group size") + group_mean_variance_df = self.adjust_for_nb_of_samples_in_groups( + intragroup_variance_df + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # INTERGROUP VARIANCE + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Computing intergroup variances") + intergroup_variance_df = self.get_unbiased_intergroup_variance( + gene_means_in_groups_df, dataset_overall_mean + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # STABILITY VALUES + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Shrinking intragroup and intergroup variances using gamma factor") + shrunk_intervar_df, shrunk_gr_mean_var_df = self.apply_shrinkage( + intergroup_variance_df, group_mean_variance_df + ) + + logger.info("Computing stability values") + return self.get_stability_values(shrunk_intervar_df, shrunk_gr_mean_var_df) + + 
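[Reviewer note, not part of the patch: a toy run of the NormFinder class above. It assumes `config.GENE_ID_COLNAME == "gene_id"` and that the `config` module is importable; real inputs come from the pipeline's parquet counts and design CSV.]

```python
import polars as pl

# 3 genes x 4 samples, one batch with two conditions -> two groups of two samples
count_lf = pl.LazyFrame({
    "gene_id": ["g1", "g2", "g3"],
    "s1": [1.0, 2.0, 3.0],
    "s2": [1.1, 2.1, 2.9],
    "s3": [0.9, 4.0, 3.1],
    "s4": [1.0, 3.8, 3.0],
})
design_df = pl.DataFrame({
    "sample": ["s1", "s2", "s3", "s4"],
    "batch": ["b1", "b1", "b1", "b1"],
    "condition": ["ctrl", "ctrl", "treat", "treat"],
})

nf = NormFinder(count_lf, design_df)
stability_df = nf.compute_stability_scoring()
print(stability_df)  # one row per gene; a lower stability value means a more stable gene
```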
+#####################################################
+# FUNCTIONS
+#####################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Compute NormFinder stability values for each gene"
+    )
+    parser.add_argument(
+        "--counts", type=Path, dest="count_file", required=True, help="Count file"
+    )
+    parser.add_argument(
+        "--design", type=Path, dest="design_file", required=True, help="Design file"
+    )
+
+    return parser.parse_args()
+
+
+def export_stability(stabilities: pl.DataFrame):
+    """Export stability values to CSV file."""
+    logger.info(f"Exporting stability values to: {STABILITY_OUTFILENAME}")
+    stabilities.write_csv(
+        STABILITY_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION
+    )
+
+
+def main():
+    args = parse_args()
+
+    logger.info(f"Getting counts from {args.count_file}")
+    count_lf = pl.scan_parquet(args.count_file)
+
+    logger.info(f"Getting design from {args.design_file}")
+    design_df = pl.read_csv(args.design_file)
+    # filter design df to keep only samples that are present in the count dataframe
+    design_df = design_df.filter(
+        pl.col("sample").is_in(count_lf.collect_schema().names())
+    )
+
+    nfd = NormFinder(count_lf, design_df)
+    stabilities = nfd.compute_stability_scoring()
+
+    logger.info(f"Stability values:\n{stabilities}")
+    export_stability(stabilities)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/old/clean_count_data.py b/bin/old/clean_count_data.py
new file mode 100755
index 00000000..333c1040
--- /dev/null
+++ b/bin/old/clean_count_data.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+import config
+import polars as pl
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# outfile names
+ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME = "cleaned_counts_filtered.parquet"
+
+FAILURE_REASON_FILE = "failure_reason.txt"
+
+
+#####################################################
+#####################################################
+# FUNCTIONS
+#####################################################
+#####################################################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Clean data by removing aberrant samples and performing some other cleaning operations."
+    )
+    parser.add_argument(
+        "--counts", type=Path, dest="count_file", required=True, help="Count file"
+    )
+    parser.add_argument(
+        "--ks-stats",
+        type=Path,
+        dest="ks_stats_file",
+        required=True,
+        help="KS stats file",
+    )
+    parser.add_argument(
+        "--ks-pvalue-threshold",
+        type=float,
+        dest="ks_pvalue_threshold",
+        required=True,
+        help="KS p-value threshold",
+    )
+    return parser.parse_args()
+
+
+def get_count_columns(lf: pl.LazyFrame) -> list[str]:
+    """Get all column names except the config.GENE_ID_COLNAME column.
+
+    The config.GENE_ID_COLNAME column contains only gene IDs.
+    """
+    return lf.select(pl.exclude(config.GENE_ID_COLNAME)).collect_schema().names()
+
+
+def get_counts(
+    file: Path,
+) -> pl.DataFrame:
+    # sorting dataframe (necessary to get consistent output)
+    return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False)
+
+
+def remove_samples_with_low_ks_pvalue(
+    count_df: pl.DataFrame, ks_stats_file: Path, ks_pvalue_threshold: float
+) -> pl.DataFrame:
+    ks_stats_df = pl.read_csv(ks_stats_file, has_header=True).select(
+        [config.SAMPLE_COLNAME, config.KS_TEST_COLNAME]
+    )
+
+    # logging number of samples excluded from analysis
+    not_valid_samples = ks_stats_df.filter(
+        ks_stats_df[config.KS_TEST_COLNAME] <= ks_pvalue_threshold
+    )[config.SAMPLE_COLNAME].to_list()
+
+    if not_valid_samples:
+        logger.warning(
+            f"Excluded {len(not_valid_samples)} samples showing a KS p-value below {ks_pvalue_threshold}"
+        )
+    else:
+        logger.info("No sample was excluded")
+
+    # getting samples for which the Kolmogorov-Smirnov test p-value is above the threshold
+    valid_samples = ks_stats_df.filter(
+        ks_stats_df[config.KS_TEST_COLNAME] > ks_pvalue_threshold
+    )[config.SAMPLE_COLNAME].to_list()
+
+    if not valid_samples:
+        logger.warning("No valid samples left to process...")
+        msg = "COUNT FILE IS EMPTY"
+        logger.warning(msg)
+        with open(FAILURE_REASON_FILE, "w") as f:
+            f.write(msg)
+        sys.exit(0)
+
+    # filtering the count dataframe to keep only the valid samples
+    return count_df.select([config.GENE_ID_COLNAME] + valid_samples)
+
+
+def export_data(count_df: pl.DataFrame):
+    count_df.write_parquet(ALL_COUNTS_FILTERED_PARQUET_OUTFILENAME)
+    logger.info("Done")
+
+
+#####################################################
+#####################################################
+# MAIN
+#####################################################
+#####################################################
+
+
+def main():
+    args = parse_args()
+
+    # loading all counts into a single dataframe
+    count_df = get_counts(args.count_file)
+
+    # removing aberrant samples (KS p-value under the threshold)
+    count_df = remove_samples_with_low_ks_pvalue(
+        count_df, args.ks_stats_file, args.ks_pvalue_threshold
+    )
+
+    # exporting computed data
+    export_data(count_df)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/old/get_array_express_accessions.py b/bin/old/get_array_express_accessions.py
new file mode 100755
index 00000000..7c6371d9
--- /dev/null
+++ b/bin/old/get_array_express_accessions.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+
+# Written by Olivier Coen. Released under the MIT license.
+
+import argparse
+import logging
+import math
+import urllib.parse
+from functools import partial
+from multiprocessing import Pool
+
+import requests
+from natural_language_utils import keywords_in_fields
+from tenacity import (
+    before_sleep_log,
+    retry,
+    stop_after_delay,
+    wait_exponential,
+)
+from tqdm import tqdm
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+SEARCH_URL = "https://www.ebi.ac.uk/biostudies/api/v1/arrayexpress/search?"
+SEARCH_MAX_PAGE_SIZE = 100 +SEARCH_BASE_PARAMS = {"pageSize": SEARCH_MAX_PAGE_SIZE} + +STUDY_SEARCH_URL = ( + "https://www.ebi.ac.uk/biostudies/api/v1/arrayexpress/study/{accession}" +) +ACCESSION_OUTFILE_NAME = "accessions.txt" +# ALL_EXPERIMENTS_METADATA_OUTFILE_NAME = "all_experiments.metadata.tsv" +SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME = "species_experiments.metadata.tsv" +SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME = "selected_experiments.metadata.tsv" +FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME = "filtered_experiments.keywords.yaml" + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get expression atlas accessions") + parser.add_argument( + "--species", + type=str, + required=True, + help="Search Expression Atlas for this specific species", + ) + parser.add_argument( + "--keywords", + type=str, + nargs="*", + help="Keywords to search for in experiment description", + ) + parser.add_argument("--platform", type=str, help="Platform type") + return parser.parse_args() + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def get_data(url: str) -> dict: + """ + Queries a URL and returns the data as a JSON object + + Parameters + ---------- + url : str + The URL to query + + Returns + ------- + data : dict + The JSON object returned by the query + + Raises + ------ + RuntimeError + If the query fails + """ + response = requests.get(url) + if response.status_code == 200: + return response.json() + else: + raise RuntimeError( + f"Failed to retrieve data: encountered error {response.status_code}" + ) + + +def get_array_express_studies(species: str): + """ + Gets all experiments from Array Express + + Parameters + ---------- + species : str + Name of species. 
Example: "human" + + Returns + ------- + experiments : list + A list of experiment dictionaries + """ + nb_hits = None + page_number = 0 + results = [] + while not nb_hits or page_number * SEARCH_MAX_PAGE_SIZE < nb_hits: + params = {"organism": species, "page": page_number} + all_formatted_params = [ + f"{key}={value}" for key, value in (params | SEARCH_BASE_PARAMS).items() + ] + all_params_str = " AND ".join(all_formatted_params) + print(all_params_str) + query_url = SEARCH_URL + urllib.parse.quote(all_params_str) + logger.info(f"Sending request {query_url}") + result = get_data(query_url) + + if not result: + logger.warning(f"Failed to query Entrey Esearch with query: {query_url}") + continue + print() + # getting total nb of entries + if not nb_hits: + nb_hits = int(result["totalHits"]) + nb_iters = math.ceil(nb_hits / SEARCH_MAX_PAGE_SIZE) + pbar = tqdm(total=nb_iters) + + # if there is no entry for this species + if nb_hits == 0: + logger.info(f"No entries found for query: {query_url}") + return [] + + results += result + # setting next cursor to the next group + page_number += 1 + pbar.update(page_number) + + pbar.close() + return results + + +def get_experiment_description(exp_dict: dict): + """ + Gets the description from an experiment dictionary + + Parameters + ---------- + exp_dict : dict + The experiment dictionary + + Returns + ------- + description : str + The experiment description + + Raises + ------ + KeyError + If the description field is not found in the experiment dictionary + """ + if "experiment" in exp_dict: + if "description" in exp_dict["experiment"]: + return exp_dict["experiment"]["description"] + else: + raise KeyError(f"Could not find description field in {exp_dict}") + elif "experimentDescription" in exp_dict: + return exp_dict["experimentDescription"] + else: + raise KeyError(f"Could not find description field in {exp_dict}") + + +def get_experiment_accession(exp_dict: dict): + """ + Gets the accession from an experiment dictionary + + Parameters + ---------- + exp_dict : dict + The experiment dictionary + + Returns + ------- + accession : str + The experiment accession + + Raises + ------ + KeyError + If the accession field is not found in the experiment dictionary + """ + if "experiment" in exp_dict: + if "accession" in exp_dict["experiment"]: + return exp_dict["experiment"]["accession"] + else: + raise KeyError(f"Could not find accession field in {exp_dict}") + elif "experimentAccession" in exp_dict: + return exp_dict["experimentAccession"] + else: + raise KeyError(f"Could not find accession field in {exp_dict}") + + +def get_properties_values(exp_dict: dict): + """ + Gets all values from properties from an experiment dictionary + + Parameters + ---------- + exp_dict : dict + The experiment dictionary + + Returns + ------- + values : list + A list of all values from properties + """ + values = [] + for column_header_dict in exp_dict["columnHeaders"]: + key_found = False + for key in ["assayGroupSummary", "contrastSummary"]: + if key in column_header_dict: + for property_dict in column_header_dict[key]["properties"]: + values.append(property_dict["testValue"]) + key_found = True + break + if not key_found: + raise KeyError(f"Could not find property value in {column_header_dict}") + # removing empty strings + values = [value for value in values if value != ""] + # removing duplicates + return list(set(values)) + + +def get_platform_specific_studies(experiments: list[dict], platform: str): + """ + Gets all experiments for a given platform from Expression 
Atlas + Possible platforms in Expression Atlas are 'rnaseq', 'microarray', 'proteomics' + + Parameters + ---------- + experiments: list[str] + platform : str + Name of platform. Example: "rnaseq" + + Returns + ------- + experiments : list + A list of experiment dictionaries + """ + platform_experiments = [] + for exp_dict in experiments: + if technology_type := exp_dict.get("technologyType"): + parsed_technology_type = ( + technology_type[0] + if isinstance(technology_type, list) + else technology_type + ) + parsed_platform = ( + parsed_technology_type.lower().split(" ")[0].replace("-", "") + ) + if platform == parsed_platform: + platform_experiments.append(exp_dict) + return platform_experiments + + +def get_study_details(study: dict): + """ + Get details of a study + + Parameters + ---------- + study : dict + A dictionary containing study details + + Returns + ------- + study_details : dict + A dictionary containing study details + """ + url = STUDY_SEARCH_URL.format(accession=study["accession"]) + logger.info(f"Sending request {url}") + data = get_data(url) + return data["hits"] + + +def get_experiment_data(exp_dict: dict): + """ + Gets the full data for an experiment given its dictionary + + Parameters + ---------- + exp_dict : dict + The experiment dictionary + + Returns + ------- + exp_data : dict + The full experiment data + """ + exp_url = ALL_EXP_URL + exp_dict["experimentAccession"] + return get_data(exp_url) + + +def parse_experiment(exp_dict: dict): + # getting accession and description + accession = get_experiment_accession(exp_dict) + description = get_experiment_description(exp_dict) + # getting properties of this experiment + exp_data = get_experiment_data(exp_dict) + properties_values = get_properties_values(exp_data) + + return { + "accession": accession, + "description": description, + "properties": properties_values, + } + + +def filter_experiment_with_keywords(exp_dict: dict, keywords: list[str]) -> dict | None: + all_searchable_fields = [exp_dict["description"]] + exp_dict["properties"] + found_keywords = keywords_in_fields(all_searchable_fields, keywords) + # only returning experiments if found keywords + if found_keywords: + exp_dict["found_keywords"] = list(set(found_keywords)) + return exp_dict + else: + return None + + +def get_metadata_for_selected_experiments( + experiments: list[dict], results: list[dict] +) -> list[dict]: + filtered_accessions = [result_dict["accession"] for result_dict in results] + return [ + exp_dict + for exp_dict in experiments + if get_experiment_accession(exp_dict) in filtered_accessions + ] + + +def format_species_name(species: str) -> str: + return species.replace("_", " ").strip() + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + results = None + selected_accessions = [] + selected_experiments = [] + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING EXPRESSION ATLAS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # Getting arguments + species_name = format_species_name(args.species) + keywords = args.keywords + + logger.info(f"Getting experiments corresponding to species {species_name}") + all_studies = get_array_express_studies(species_name) + + if args.platform: + logger.info(f"Getting experiments corresponding to platform 
{args.platform}") + all_studies = get_platform_specific_studies(all_studies, args.platform) + + detailed_studies = [get_study_details(study) for study in all_studies] + print(detailed_studies[0]) + logger.info( + f"Found {len(species_experiments)} experiments for species {species_name}" + ) + + logger.info("Parsing experiments") + with Pool() as pool: + results = pool.map(parse_experiment, species_experiments) + + if keywords: + logger.info(f"Filtering experiments with keywords {keywords}") + func = partial(filter_experiment_with_keywords, keywords=keywords) + with Pool() as pool: + results = [res for res in pool.map(func, results) if res is not None] + + if results: + logger.info(f"Kept {len(results)} experiments") + # getting accessions of selected experiments + selected_accessions = [exp_dict["accession"] for exp_dict in results] + # keeping metadata only for selected experiments + selected_experiments = get_metadata_for_selected_experiments( + species_experiments, results + ) + + else: + logger.warning( + f"Could not find experiments for species {species_name} and keywords {keywords}" + ) + + +if __name__ == "__main__": + main() diff --git a/bin/old/get_gene_lengths_from_ensembl_api.py b/bin/old/get_gene_lengths_from_ensembl_api.py new file mode 100755 index 00000000..9cfe9a68 --- /dev/null +++ b/bin/old/get_gene_lengths_from_ensembl_api.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import json +import logging +from pathlib import Path + +import config +import pandas as pd +import requests +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) +from tqdm.contrib.concurrent import process_map + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +GENE_IDS_CHUNKSIZE = 50 # max allowed by Ensembl REST API + +ENSEMBL_REST_SERVER = "https://rest.ensembl.org" +SEQUENCE_INFO_EXT = "/sequence/id" +HEADERS = { + "Content-Type": "application/json", + "Accept": "application/json", +} +STOP_RETRY_AFTER_DELAY = 600 + +OUTFILE = "gene_ids_lengths.csv" + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get GEO Datasets accessions") + parser.add_argument( + "--genes", + type=Path, + dest="gene_file", + required=True, + help="File containing gene IDs", + ) + return parser.parse_args() + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# QUERIES TO ENSEMBL +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_post_request_to_ensembl(gene_ids: list[str]) -> list[dict]: + data = {"ids": gene_ids, "type": "cdna"} + url = ENSEMBL_REST_SERVER + SEQUENCE_INFO_EXT + response = requests.post(url, headers=HEADERS, data=json.dumps(data)) + if response.status_code == 200: + response.raise_for_status() + else: + raise RuntimeError( + f"Failed to retrieve data: encountered error {response.status_code}" + ) + return response.json() + + +def get_gene_lengths(gene_ids: list[str]) -> list[dict]: + records = send_post_request_to_ensembl(gene_ids) + return [ + 
{ + config.GENE_ID_COLNAME: record["query"], + config.CDNA_LENGTH_COLNAME: len(record["seq"]), + } + for record in records + if record.get("query") is not None and record.get("seq") is not None + ] + + +def chunk_list(lst: list, chunksize: int) -> list: + """Splits a list into chunks of a given size. + + Args: + lst (list): The list to split. + chunksize (int): The size of each chunk. + + Returns: + list: A list of chunks, where each chunk is a list of len(chunksize). + """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + with open(args.gene_file, "r") as fin: + gene_ids = [line.strip() for line in fin] + + gene_id_chunks = chunk_list(gene_ids, GENE_IDS_CHUNKSIZE) + # getting gene lengths chunk by chunk + records_list = process_map(get_gene_lengths, gene_id_chunks, max_workers=12) + # flattening list of lists into a single list + records = [record for sublist in records_list for record in sublist] + + df = pd.DataFrame.from_dict(records) + # taking the length of the longest transcript for each gene + df = df.groupby(config.GENE_ID_COLNAME, as_index=False).agg( + {config.CDNA_LENGTH_COLNAME: "max"} + ) + + df.to_csv(OUTFILE, index=False, header=True) + + +if __name__ == "__main__": + main() diff --git a/bin/deseq2_normalise.R b/bin/old/normalise_with_deseq2.R similarity index 56% rename from bin/deseq2_normalise.R rename to bin/old/normalise_with_deseq2.R index f4f3652a..40b12a30 100755 --- a/bin/deseq2_normalise.R +++ b/bin/old/normalise_with_deseq2.R @@ -1,10 +1,14 @@ #!/usr/bin/env Rscript # Written by Olivier Coen. Released under the MIT license. - +options(error = traceback) +suppressPackageStartupMessages(library("DESeq2")) library(DESeq2) library(optparse) +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + ##################################################### ##################################################### # FUNCTIONS @@ -27,27 +31,52 @@ get_args <- function() { return(args) } +parse_dataframe <- function(file_path, ...) { + if (grepl("\\.csv$", file_path)) { + data <- read.csv(file_path, ...) + } else if (grepl("\\.tsv$", file_path)) { + data <- read.table(file_path, sep = "\t", header = TRUE, ...) 
+    } else {
+        write("UNSUPPORTED FILE FORMAT", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+    return(data)
+}
+
 check_samples <- function(count_matrix, design_data) {
     # check if the column names of count_matrix match the sample names
-    if (!all(colnames(count_matrix) == design_data$sample)) {
-        stop("Sample names in the count matrix do not match the design data.")
+    if (!all( colnames(count_matrix) == design_data$sample )) {
+        write("SAMPLE NAMES IN COUNT MATRIX DO NOT MATCH DESIGN DATA", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+    # check for extra samples
+    extra_samples <- setdiff( colnames(count_matrix), design_data$sample )
+    if (length(extra_samples) > 0) {
+        # build a single string before writing (write() takes one text argument)
+        write(
+            paste0("THE FOLLOWING SAMPLES ARE IN THE COUNT MATRIX BUT NOT IN DESIGN: ", paste(extra_samples, collapse = ", ")),
+            file = WARNING_REASON_FILE
+        )
+    }
 }

 prefilter_counts <- function(count_matrix, design_data) {
-    # see https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
-    # getting size of smallest group
-    group_sizes <- table(design_data$condition)
-    smallest_group_size <- min(group_sizes)
-    # keep genes with at least 10 counts over a certain number of samples
-    keep <- rowSums(count_matrix >= 10) >= smallest_group_size
-    filtered_count_matrix <- count_matrix[keep,]
+    if (ncol(count_matrix) == 1) {
+        keep <- count_matrix[, 1] >= 1
+    } else {
+        # see https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
+        # getting size of smallest group
+        group_sizes <- table(design_data$condition)
+        smallest_group_size <- min(group_sizes)
+        # keep genes with at least 1 count in at least as many samples as the smallest group
+        keep <- rowSums(count_matrix >= 1) >= smallest_group_size
+    }
+    filtered_count_matrix <- count_matrix[keep, , drop = FALSE] # drop = FALSE: keep dataframe structure even if only one column remains
     return(filtered_count_matrix)
 }

 remove_all_zero_columns <- function(df) {
     # remove columns which contains only zeros
-    df <- df[, colSums(df) != 0]
+    df <- df[, colSums(df) != 0, drop = FALSE]
     return(df)
 }

@@ -78,46 +107,70 @@ get_cpm_counts <- function(normalised_counts, filtered_count_matrix) {

 get_normalised_cpm_counts <- function(count_file, design_file) {

-    print(paste('Normalizing counts in:', count_file))
-
-    count_data <- read.csv(count_file, row.names = 1)
+    message("Parsing count file")
+    count_data <- parse_dataframe(count_file, row.names = 1)

     # data should all be integers but sometimes they are integers converted to floats (1234 -> 1234.0)
     # DESeq2 does not accept that so we must convert them into integers
     count_data[] <- lapply(count_data, as.integer)

-    design_data <- read.csv(design_file)
-
     count_matrix <- as.matrix(count_data)
+
     # in some rare datasets, columns can contain only zeros
     # we do not consider these columns
+    message("Removing columns with all zeros")
     count_matrix <- remove_all_zero_columns(count_matrix)

+    if (ncol(count_matrix) == 0) {
+        message("All columns were full of zeros.")
+        write("ALL COLUMNS WERE FULL OF ZEROS", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+
     # getting design data
-    design_data <- design_data[design_data$sample %in% colnames(count_matrix), ]
+    message("Parsing design file")
+    design_data <- parse_dataframe(design_file)
+
+    # removing extra samples in design table
+    message("Removing extra samples in design table")
+    design_data <- design_data[design_data$sample %in% colnames(count_matrix), , drop = FALSE]
+
+    if (nrow(design_data) == 0) {
+        message("Design and sample names
do not match.") + write("DESIGN AND SAMPLE NAMES DO NOT MATCH", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } # check if the column names of count_matrix match the sample names + message("Checking sample names") check_samples(count_matrix, design_data) - col_data <- data.frame( - row.names = design_data$sample, - condition = factor(design_data$condition) - ) + # reorder count matrix columns to match design row order + # this is absolutely mandatory + # see https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html at part "Count matrix input" + count_matrix <- count_matrix[, as.character(design_data$sample), drop = FALSE] # pre-filter genes with low counts + message("Pre-filtering genes") filtered_count_matrix <- prefilter_counts(count_matrix, design_data) # if the dataframe is now empty, stop the process if (nrow(filtered_count_matrix) == 0) { message("No genes left after pre-filtering.") - quit(save = "no", status = 100) + write("NO GENES LEFT AFTER PRE-FILTERING", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } # add a small pseudocount to avoid zero counts + message("Replacing zero counts with pseudocounts") filtered_count_matrix <- replace_zero_counts_with_pseudocounts(filtered_count_matrix) - # create DESeq2 object # if the number of distinct conditions is only 1, DESeq2 returns an error + message("Creating DESeqDataSet") + col_data <- data.frame( + row.names = design_data$sample, + condition = factor(design_data$condition) + ) num_unique_conditions <- length(unique(design_data$condition)) if (num_unique_conditions == 1) { dds <- DESeqDataSetFromMatrix(countData = filtered_count_matrix, colData = col_data, design = ~ 1) @@ -125,16 +178,18 @@ get_normalised_cpm_counts <- function(count_file, design_file) { dds <- DESeqDataSetFromMatrix(countData = filtered_count_matrix, colData = col_data, design = ~ condition) } + message("Normalising counts") normalised_counts <- get_normalised_counts(dds) + message("Calculating CPM counts") cpm_counts <- get_cpm_counts(normalised_counts, filtered_count_matrix) return(cpm_counts) } export_data <- function(cpm_counts, filename) { - filename <- sub("\\.csv$", ".cpm.csv", filename) - print(paste('Exporting normalised counts per million to:', filename)) + filename <- sub("\\.(csv|tsv)$", ".cpm.csv", filename) + message(paste('Exporting normalised counts per million to:', filename)) write.table(cpm_counts, filename, sep = ',', row.names = TRUE, col.names = NA, quote = FALSE) } @@ -146,6 +201,12 @@ export_data <- function(cpm_counts, filename) { args <- get_args() +if ( is.null(args$design_file) ) { + message("A design dataframe must be provided.") + quit(save = "no", status = 1) +} + +message(paste("Normalising counts in", args$count_file)) cpm_counts <- get_normalised_cpm_counts(args$count_file, args$design_file) export_data(cpm_counts, basename(args$count_file)) diff --git a/bin/edger_normalise.R b/bin/old/normalise_with_edger.R similarity index 55% rename from bin/edger_normalise.R rename to bin/old/normalise_with_edger.R index 65f2c17e..f12d7e74 100755 --- a/bin/edger_normalise.R +++ b/bin/old/normalise_with_edger.R @@ -5,6 +5,9 @@ library(edgeR) library(optparse) +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + ##################################################### ##################################################### # FUNCTIONS @@ -26,24 +29,45 @@ get_args <- function() { return(args) } +parse_dataframe <- function(file_path, ...) 
{
+    if (grepl("\\.csv$", file_path)) {
+        data <- read.csv(file_path, ...)
+    } else if (grepl("\\.tsv$", file_path)) {
+        data <- read.table(file_path, sep = "\t", header = TRUE, ...)
+    } else {
+        write("UNSUPPORTED FILE FORMAT", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+    return(data)
+}
+
 remove_all_zero_columns <- function(df) {
     # remove columns which contain only zeros
-    df <- df[, colSums(df) != 0]
+    df <- df[, colSums(df) != 0, drop = FALSE]
     return(df)
 }

 check_samples <- function(count_matrix, design_data) {
     # check if the column names of count_matrix match the sample names
-    if (!all(colnames(count_matrix) == design_data$sample)) {
-        stop("Sample names in the count matrix do not match the design data.")
+    if (!all( colnames(count_matrix) == design_data$sample )) {
+        write("SAMPLE NAMES IN COUNT MATRIX DO NOT MATCH DESIGN DATA", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+    # check for extra samples
+    extra_samples <- setdiff( colnames(count_matrix), design_data$sample )
+    if (length(extra_samples) > 0) {
+        # build a single string before writing (write() takes one text argument)
+        write(
+            paste0("THE FOLLOWING SAMPLES ARE IN THE COUNT MATRIX BUT NOT IN DESIGN: ", paste(extra_samples, collapse = ", ")),
+            file = WARNING_REASON_FILE
+        )
+    }
 }

 prefilter_counts <- function(count_matrix) {
     # remove genes having zeros for all counts
     # it is advised to remove them before the analysis
-    non_zero_rows <- rownames(count_matrix[apply(count_matrix!=0, 1, any),])
-    filtered_count_matrix <- count_matrix[rownames(count_matrix) %in% non_zero_rows, ]
+    non_zero_rows <- rownames(count_matrix[apply(count_matrix!=0, 1, any), , drop = FALSE])
+    filtered_count_matrix <- count_matrix[rownames(count_matrix) %in% non_zero_rows, , drop = FALSE]
     return(filtered_count_matrix)
 }

@@ -70,52 +94,82 @@ get_cpm_counts <- function(dge) {

 get_normalised_cpm_counts <- function(count_file, design_file) {

-    print(paste('Normalizing counts in:', count_file))
-
-    count_data <- read.csv(args$count_file, row.names = 1)
-    design_data <- read.csv(design_file)
+    message(paste('Normalizing counts in:', count_file))
+    message("Parsing count file")
+    count_data <- parse_dataframe(count_file, row.names = 1)
     count_matrix <- as.matrix(count_data)

     # in some rare datasets, columns can contain only zeros
     # we do not consider these columns
+    message("Removing columns with all zeros")
     count_matrix <- remove_all_zero_columns(count_matrix)

+    if (ncol(count_matrix) == 0) {
+        message("All columns were full of zeros.")
+        write("ALL COLUMNS WERE FULL OF ZEROS", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+
+    # getting design data
+    message("Parsing design file")
+    design_data <- parse_dataframe(design_file)
+
+    # removing extra samples in design table
+    message("Removing extra samples in design table")
     design_data <- design_data[design_data$sample %in% colnames(count_matrix), ]

+    if (nrow(design_data) == 0) {
+        message("Design and sample names do not match.")
+        write("DESIGN AND SAMPLE NAMES DO NOT MATCH", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }
+
+    # check if the column names of count_matrix match the sample names
+    message("Checking sample names")
     check_samples(count_matrix, design_data)

     # pre-filter genes with low counts
+    message("Pre-filtering genes")
     count_matrix <- prefilter_counts(count_matrix)

+    # if the dataframe is now empty, stop the process
+    if (nrow(count_matrix) == 0) {
+        message("No genes left after pre-filtering.")
+        write("NO GENES LEFT AFTER PRE-FILTERING", file = FAILURE_REASON_FILE)
+        quit(save = "no", status = 0)
+    }

     # Add a small
pseudocount to avoid zero counts + message("Replacing zero counts with pseudocounts") count_matrix_pseudocount <- replace_zero_counts_with_pseudocounts(count_matrix) + message("Normalising data") group <- factor(design_data$condition) dge <- DGEList(counts = count_matrix_pseudocount, group = group) rownames(dge) <- rownames(count_matrix) colnames(dge) <- colnames(count_matrix) + message("Filtering out lowly expressed genes") dge <- filter_out_lowly_expressed_genes(dge) # if the dataframe is now empty, stop the process if (nrow(dge) == 0) { - message("No genes left after pre-filtering.") - quit(save = "no", status = 100) + message("No genes left after filtering lowly expressed genes.") + write("NO GENES LEFT AFTER FILTERING LOWLY EXPRESSED GENES", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) } # normalisation + message("Calculating normalisation factors") dge <- calcNormFactors(dge, method="TMM") + message("Calculating CPM counts") cpm_counts <- get_cpm_counts(dge) return(cpm_counts) } export_data <- function(cpm_counts, filename) { - filename <- sub("\\.csv$", ".cpm.csv", filename) - print(paste('Exporting normalised counts per million to:', filename)) + filename <- sub("\\.(csv|tsv)$", ".cpm.csv", filename) + message(paste('Exporting normalised counts per million to:', filename)) write.table(cpm_counts, filename, sep = ',', row.names = TRUE, col.names = NA, quote = FALSE) } @@ -127,6 +181,11 @@ export_data <- function(cpm_counts, filename) { args <- get_args() +if ( is.null(args$design_file) ) { + message("A design dataframe must be provided.") + quit(save = "no", status = 1) +} + cpm_counts <- get_normalised_cpm_counts(args$count_file, args$design_file) export_data(cpm_counts, basename(args$count_file)) diff --git a/bin/quantile_normalise.py b/bin/quantile_normalise.py index f4d903b8..241cd536 100755 --- a/bin/quantile_normalise.py +++ b/bin/quantile_normalise.py @@ -3,19 +3,22 @@ # Written by Olivier Coen. Released under the MIT license. import argparse -from pathlib import Path -import pandas as pd -from sklearn.preprocessing import QuantileTransformer import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table +from sklearn.preprocessing import quantile_transform logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -QUANT_NORM_SUFFIX = ".quant_norm.parquet" +OUTFILE_SUFFIX = ".quant_norm.parquet" -ENSEMBL_GENE_ID_COLNAME = "ensembl_gene_id" N_QUANTILES = 1000 -OUTPUT_DISTRIBUTION = "uniform" + +ALLOWED_TARGET_DISTRIBUTIONS = ["normal", "uniform"] ##################################################### @@ -32,29 +35,30 @@ def parse_args(): parser.add_argument( "--counts", type=Path, dest="count_file", required=True, help="Count file" ) + parser.add_argument( + "--target-distrib", + type=str, + dest="target_distribution", + required=True, + choices=ALLOWED_TARGET_DISTRIBUTIONS, + help="Target distribution to map counts to", + ) return parser.parse_args() -def quantile_normalize(data: pd.DataFrame): +def quantile_normalise(df: pl.DataFrame, target_distribution: str): """ - Quantile normalize a data matrix based on a target distribution. + Quantile normalize a dataframe; column by column, based on a target distribution. 
""" - transformer = QuantileTransformer( - n_quantiles=N_QUANTILES, output_distribution=OUTPUT_DISTRIBUTION + kwargs = dict( + n_quantiles=N_QUANTILES, output_distribution=target_distribution, subsample=None + ) + return df.with_columns( + pl.exclude(config.GENE_ID_COLNAME).map_batches( + lambda x: quantile_transform(x.to_frame(), **kwargs).flatten(), + return_dtype=pl.Float64, + ) ) - - normalised_data = pd.DataFrame(index=data.index, columns=data.columns) - for col in data.columns: - normalised_data[col] = transformer.fit_transform(data[col].to_frame()) - - return normalised_data - - -def export_count_data(quantile_normalized_counts: pd.DataFrame, count_file: Path): - """Export gene expression data to CSV files.""" - outfilename = count_file.name.replace(".csv", QUANT_NORM_SUFFIX) - logger.info(f"Exporting quantile normalised counts to: {outfilename}") - quantile_normalized_counts.reset_index().to_parquet(outfilename) ##################################################### @@ -68,13 +72,13 @@ def main(): args = parse_args() count_file = args.count_file - logger.info(f"Quantile normalising {count_file.name}") - count_df = pd.read_csv(count_file, index_col=0) - count_df.index.name = ENSEMBL_GENE_ID_COLNAME + logger.info(f"Parsing {count_file.name}") + count_df = parse_count_table(count_file) - quantile_normalized_counts = quantile_normalize(count_df) + logger.info(f"Quantile normalising {count_file.name}") + quantile_normalized_counts = quantile_normalise(count_df, args.target_distribution) - export_count_data(quantile_normalized_counts, count_file) + export_parquet(quantile_normalized_counts, count_file, OUTFILE_SUFFIX) if __name__ == "__main__": diff --git a/bin/remove_samples_not_valid.py b/bin/remove_samples_not_valid.py new file mode 100755 index 00000000..25adc84d --- /dev/null +++ b/bin/remove_samples_not_valid.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".filtered.parquet" + +MAX_RATIO_ZEROS = 0.75 + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Filter out samples not valid") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + return parser.parse_args() + + +def filter_out_columns_with_high_zero_ratio(df: pl.DataFrame, max_ratio_zeros: float): + zero_ratio_df = df.select(pl.exclude(config.GENE_ID_COLNAME).eq(pl.lit(0)).mean()) + valid_zero_ratio_samples = [ + col for col in zero_ratio_df.columns if zero_ratio_df[col][0] <= max_ratio_zeros + ] + return df.select(pl.col(config.GENE_ID_COLNAME), pl.col(valid_zero_ratio_samples)) + + +def export_data(df: pl.DataFrame, outfile: Path): + logger.info(f"Exporting filtered counts to: {outfile}") + df.write_parquet(outfile) + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # putting all counts into a single dataframe + logger.info("Loading count data...") + count_df = parse_count_table(args.count_file) + logger.info( + f"Loaded count data with {len(count_df)} rows and {count_df.shape[1]} columns" + ) + + valid_count_df = filter_out_columns_with_high_zero_ratio(count_df, MAX_RATIO_ZEROS) + + if valid_count_df.shape[1] == 0: + logger.error("No valid columns remaining") + sys.exit(0) + else: + logger.info( + f"Filtered out {count_df.shape[1] - valid_count_df.shape[1]} columns" + ) + outfile = args.count_file.with_suffix(OUTFILE_SUFFIX) + export_data(valid_count_df, outfile) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index bf07fd33..614bbb35 100644 --- a/conf/base.config +++ b/conf/base.config @@ -8,15 +8,37 @@ ---------------------------------------------------------------------------------------- */ +executor { + cpus = 8 + memory = 24.GB +} + process { - // TODO nf-core: Check the defaults for all processes + resourceLimits = [ + cpus: 16, + memory: '25.GB', + time: '4.h' + ] + cpus = { 1 * task.attempt } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
'retry' : 'finish' } - maxRetries = 1 + errorStrategy = { + if (task.exitStatus in (100..102)) { // managed errors; they should not be retried but ignored at once + 'ignore' + } else if (task.exitStatus in ((130..145) + 104 + 175) && task.attempt <= 10) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // after 3 retries, ignore the error + 'ignore' + } + } + maxRetries = 10 maxErrors = '-1' // Process-specific resource requirements @@ -28,35 +50,23 @@ process { // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } - memory = { 6.GB * task.attempt } - time = { 4.h * task.attempt } + memory = { 2.GB * task.attempt } + time = { 1.h * task.attempt } } withLabel:process_low { - cpus = { 2 * task.attempt } - memory = { 12.GB * task.attempt } - time = { 4.h * task.attempt } + cpus = { 2 } + memory = { 4.GB + 2.GB * task.attempt } + time = { 2.h * task.attempt } } withLabel:process_medium { - cpus = { 6 * task.attempt } - memory = { 36.GB * task.attempt } - time = { 8.h * task.attempt } + cpus = { 4 } + memory = { 6.GB + 2.GB * task.attempt } + time = { 4.h * task.attempt } } withLabel:process_high { - cpus = { 12 * task.attempt } - memory = { 72.GB * task.attempt } - time = { 16.h * task.attempt } - } - withLabel:process_long { - time = { 20.h * task.attempt } - } - withLabel:process_high_memory { - memory = { 200.GB * task.attempt } - } - withLabel:error_ignore { - errorStrategy = 'ignore' - } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 + cpus = { 4 } + memory = { 8.GB + 4.GB * task.attempt } + time = { 8.h * task.attempt } } + } diff --git a/conf/modules.config b/conf/modules.config index f0b0d55a..5c75b088 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,21 +10,16 @@ ---------------------------------------------------------------------------------------- */ -process { - - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } +/* +publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } +] +*/ -} +includeConfig 'modules/public_data.config' +includeConfig 'modules/id_mapping.config' +includeConfig 'modules/normalisation.config' +includeConfig 'modules/qc.config' +includeConfig 'modules/aggregation.config' diff --git a/conf/modules/aggregation.config b/conf/modules/aggregation.config new file mode 100644 index 00000000..46a07491 --- /dev/null +++ b/conf/modules/aggregation.config @@ -0,0 +1,18 @@ +process { + + withName: AGGREGATE_RESULTS { + publishDir = [ + path: { "${params.outdir}/aggregated" }, + mode: params.publish_dir_mode + ] + } + + withName: MERGE_PLATFORM_COUNTS { + maxForks = 1 + } + + withName: COMPUTE_PLATFORM_STATISTICS { + maxForks = 1 + } + +} diff --git a/conf/modules/id_mapping.config b/conf/modules/id_mapping.config new file mode 100644 index 00000000..022316fe --- /dev/null +++ b/conf/modules/id_mapping.config @@ -0,0 +1,24 @@ +process { + + withName: COLLECT_GENE_IDS { + publishDir = [ + path: { "${params.outdir}/idmapping/collected_gene_ids" }, + mode: params.publish_dir_mode + ] + } + + withName: GPROFILER_IDMAPPING { + publishDir = [ + path: { "${params.outdir}/idmapping/gprofiler" }, + mode: params.publish_dir_mode + ] + } + + withName: RENAME_GENE_IDS { + publishDir = [ + path: { "${params.outdir}/idmapping/renamed" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/normalisation.config b/conf/modules/normalisation.config new file mode 100644 index 00000000..a6a496b0 --- /dev/null +++ b/conf/modules/normalisation.config @@ -0,0 +1,24 @@ +process { + + withName: COMPUTE_CPM { + publishDir = [ + path: { "${params.outdir}/normalised/${meta.dataset}/cpm/" }, + mode: params.publish_dir_mode + ] + } + + withName: COMPUTE_TPM { + publishDir = [ + path: { "${params.outdir}/normalised/${meta.dataset}/tpm/" }, + mode: params.publish_dir_mode + ] + } + + withName: QUANTILE_NORMALISATION { + publishDir = [ + path: { "${params.outdir}/normalised/${meta.dataset}/quantile_normalised/" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/public_data.config b/conf/modules/public_data.config new file mode 100644 index 00000000..df10451b --- /dev/null +++ b/conf/modules/public_data.config @@ -0,0 +1,33 @@ +process { + + withName: EXPRESSIONATLAS_GETACCESSIONS { + publishDir = [ + path: { "${params.outdir}/public_data/expression_atlas/accessions/" }, + mode: params.publish_dir_mode + ] + } + + withName: EXPRESSIONATLAS_GETDATA { + + publishDir = [ + path: { "${params.outdir}/public_data/expression_atlas/datasets/" }, + mode: params.publish_dir_mode + ] + + } + + withName: GEO_GETACCESSIONS { + publishDir = [ + path: { "${params.outdir}/public_data/geo/accessions/" }, + mode: params.publish_dir_mode + ] + } + + withName: GEO_GETDATA { + publishDir = [ + path: { "${params.outdir}/public_data/geo/datasets/" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/qc.config b/conf/modules/qc.config new file mode 100644 index 00000000..a81b42e1 --- /dev/null +++ b/conf/modules/qc.config @@ -0,0 +1,21 @@ +process { + + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy' + ] + } + + withName: 'DASH_APP' { + publishDir = [ + path: { "${params.outdir}/dash_app/" }, + mode: 'copy', + saveAs: { + filename -> ['versions.yml', 'file_system_backend'].contains(filename) ? 
null : filename + } + ] + } + +} diff --git a/conf/test.config b/conf/test.config index e1092e88..fdf68236 100644 --- a/conf/test.config +++ b/conf/test.config @@ -6,26 +6,16 @@ It tests the different ways to use the pipeline, with small data Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir + nextflow run nf-core/stableexpression -profile test_dataset, --outdir ---------------------------------------------------------------------------------------- */ -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - params { - config_profile_name = 'Test profile' + config_profile_name = 'Test dataset profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - species = 'solanum tuberosum' - eatlas_keywords = "potato,stress" - eatlas_accessions = "E-MTAB-552" - outdir = "results" + species = 'beta vulgaris' + outdir = "results/test" } diff --git a/conf/test_dataset.config b/conf/test_dataset.config deleted file mode 100644 index 9049c848..00000000 --- a/conf/test_dataset.config +++ /dev/null @@ -1,22 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - It tests the different ways to use the pipeline, with small data - - Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -params { - config_profile_name = 'Test dataset profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - - // Input data - species = 'solanum tuberosum' - datasets = "tests/test_data/custom_datasets/input.csv" - outdir = "results" -} diff --git a/conf/test_dataset_custom_mapping.config b/conf/test_dataset_custom_mapping.config deleted file mode 100644 index 52994331..00000000 --- a/conf/test_dataset_custom_mapping.config +++ /dev/null @@ -1,25 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. 
- It tests the different ways to use the pipeline, with small data - - Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -params { - config_profile_name = 'Test dataset custom gene data profile' - config_profile_description = 'Minimal test dataset with custom gene metadata to check pipeline function' - - // Input data - species = 'solanum tuberosum' - datasets = "tests/test_data/custom_datasets/input.csv" - skip_gprofiler = true - gene_id_mapping = "tests/test_data/custom_datasets/mapping.csv" - gene_metadata = "tests/test_data/custom_datasets/metadata.csv" - outdir = "results" -} diff --git a/conf/test_dataset_eatlas.config b/conf/test_dataset_eatlas.config new file mode 100644 index 00000000..c46350ad --- /dev/null +++ b/conf/test_dataset_eatlas.config @@ -0,0 +1,25 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. + This tests the capacity of the pipeline to process a full size dataset. + + Use as follows: + nextflow run nf-core/stableexpression -profile test_full, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data + species = 'mus_musculus' + accessions = "E-MTAB-2262" + skip_fetch_eatlas_accessions = true + fetch_geo_accessions = false + datasets = 'https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_big.yaml' + outdir = "results/test_dataset_eatlas" +} diff --git a/conf/test_full.config b/conf/test_full.config index 718e4eb1..d09774c0 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -16,7 +16,6 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data - species = 'arabidopsis thaliana' - fetch_eatlas_accessions = true - outdir = "results" + species = 'homo_sapiens' + outdir = "results/test_full" } diff --git a/conf/test_local_and_downloaded.config b/conf/test_local_and_downloaded.config deleted file mode 100644 index ea6f22af..00000000 --- a/conf/test_local_and_downloaded.config +++ /dev/null @@ -1,32 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. 
- It tests the different ways to use the pipeline, with small data - - Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - - // Input data - species = 'solanum tuberosum' - eatlas_keywords = "potato,stress" - eatlas_accessions = "E-MTAB-552" - datasets = "tests/test_data/custom_datasets/input.csv" - outdir = "results" -} diff --git a/conf/test_one_accession.config b/conf/test_one_accession.config deleted file mode 100644 index b54b01b8..00000000 --- a/conf/test_one_accession.config +++ /dev/null @@ -1,30 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - It tests the different ways to use the pipeline, with small data - - Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function with only one accession to fetch from Expression Atlas.' - - // Input data - species = 'solanum tuberosum' - eatlas_accessions = "E-MTAB-552" - outdir = "results" -} diff --git a/conf/test_one_accession_low_gene_count.config b/conf/test_one_accession_low_gene_count.config deleted file mode 100644 index 03bd4250..00000000 --- a/conf/test_one_accession_low_gene_count.config +++ /dev/null @@ -1,30 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - It tests the different ways to use the pipeline, with small data - - Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function with only one accession to fetch from Expression Atlas. This accession shows a very low gene count.' 
-
-    // Input data
-    species = 'arabidopsis thaliana'
-    eatlas_accessions = "E-GEOD-51720"
-    outdir = "results"
-}
diff --git a/docs/images/nf-core-stableexpression_logo_dark.png b/docs/images/nf-core-stableexpression_logo_dark.png
index 168432f9..24d8da8b 100644
Binary files a/docs/images/nf-core-stableexpression_logo_dark.png and b/docs/images/nf-core-stableexpression_logo_dark.png differ
diff --git a/docs/images/nf-core-stableexpression_logo_light.png b/docs/images/nf-core-stableexpression_logo_light.png
index 6d092d2b..c4a8482e 100644
Binary files a/docs/images/nf-core-stableexpression_logo_light.png and b/docs/images/nf-core-stableexpression_logo_light.png differ
diff --git a/docs/images/nf-core-stableexpression_metro_map.drawio b/docs/images/nf-core-stableexpression_metro_map.drawio
deleted file mode 100644
index 67fd7efe..00000000
--- a/docs/images/nf-core-stableexpression_metro_map.drawio
+++ /dev/null
@@ -1,238 +0,0 @@
diff --git a/docs/images/nf_core_stableexpression.metromap.png b/docs/images/nf_core_stableexpression.metromap.png
new file mode 100644
index 00000000..eb1463dc
Binary files /dev/null and b/docs/images/nf_core_stableexpression.metromap.png differ
diff --git a/docs/output.md b/docs/output.md
index d96305b5..a88d9667 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -1,42 +1,59 @@
 # nf-core/stableexpression: Output

+## Pipeline reports (TLDR)
+
+The main output of the pipeline is the MultiQC report, which summarises results at the end of the pipeline. This report is located at `<outdir>/multiqc/multiqc_report.html` and can be opened in your favorite browser.
+
+For advanced users who want to explore the distributions of normalised counts gene by gene or sample by sample, a Dash Plotly app is prepared at the end of each pipeline run. See [here](#dash-plotly-app) for an explanation of how to run the app.
+
 ## Introduction

-This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.
+This document describes the output produced by the pipeline.

 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.

-
-
 ## Pipeline overview

 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
-- [Expression Atlas](#expression-atlas): get Expression Atlas accessions and download data
-- [Normalisation](#normalisation): normalise raw data (with DESeq2 or EdgeR)
-- [gProfiler](#gprofiler-idmapping): map gene IDS to Ensembl IDS
-- [Gene Statistics](#gene-statistics): merge all counts, compute gene variation statistics and get the most stable genes
+1. 
Get accessions

-## Output files

+- Get [Expression Atlas](https://www.ebi.ac.uk/gxa/home) dataset accessions corresponding to the provided species (and optionally keywords) (run by default; optional)
+- Get NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) **microarray** dataset accessions corresponding to the provided species (and optionally keywords) (run by default; optional)

-### FastQC

+2. Download data

-
-Output files
+
+- Download [Expression Atlas](https://www.ebi.ac.uk/gxa/home) data (run by default; optional)
+- Download NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) data (run by default; optional)
-
-- `fastqc/`
-  - `*_fastqc.html`: FastQC report containing quality metrics.
-  - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.

+3. ID Mapping

-
+- Map gene IDs to NCBI Entrez Gene IDs (or Ensembl IDs) for standardisation among datasets using [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) (run by default; optional)
+
+4. Data normalisation
+
+- Normalise RNAseq raw data using TPM (necessitates downloading the corresponding genome and computing transcript lengths) or CPM.
+- Perform quantile normalisation on each dataset separately using [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html)
+
+5. Merge all data
+6. Compute base statistics for each gene, across all platforms and per platform (RNAseq and microarray)
+7. Compute stability scoring

-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).

+- Get list of candidate genes based on base statistics
+- Run optimised, scalable version of [Normfinder](https://www.moma.dk/software/normfinder)
+- Run optimised, scalable version of [Genorm](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2002-3-7-research0034) (NOT run by default; optional)
+- Compute stability scores for each candidate gene
+
+8. Aggregate results
+9. Prepare [Dash Plotly](https://dash.plotly.com/) app for further investigation of gene / sample counts
+10. Make [`MultiQC`](http://multiqc.info/) report
+
+## Output files

 ### MultiQC

+This report is located at `multiqc/multiqc_report.html` and can be opened in a browser.
+
Output files

@@ -51,26 +68,58 @@ MultiQC](http://multiqc.info) is a visualization tool that generates a single HT

Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.

-### Gene Variation
+### Dash Plotly app
+
+`dash_app/`: folder containing the Dash Plotly app
+
+To launch the app, you must first create and activate the appropriate conda environment:
+
+```bash
+conda env create -n nf-core-stableexpression-dash -f <outdir>/dash_app/spec-file.txt
+conda activate nf-core-stableexpression-dash
+```
+
+then:
+
+```bash
+cd <outdir>/dash_app
+python app.py
+```
+
+and open your browser at `http://localhost:8080`.
+
+> [!NOTE]
+> The app will try to use the port `8080` by default. If it is already in use, it will try `8081`, `8082` and so on. Check the logs to see which port it is using.

### Expression Atlas
Output files

-- `gene_variation/`
-  - A list of the most stable genes in `stats_most_stable_genes.csv`.
-  - Descriptive statistics for all genes in `stats_all_genes.csv`
-  - All normalised counts (for each gene and each sample) in `count_summary.csv`.
+- `public_data/expression_atlas/accessions/`: accessions found when querying Expression Atlas
+- `public_data/expression_atlas/datasets/`: count datasets (normalised: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from Expression Atlas.
-### Expression Atlas +### GEO + +
+Output files
+
+- `public_data/geo/accessions/`: accessions found when querying GEO
+- `public_data/geo/datasets/`: count datasets (normalised: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from GEO.
+
+
+ +### IDMapping (g:Profiler)
Output files -- `expressionatlas/` - - List of accessions found when querying Expression Atlas: `accessions.txt`. - - List of count datasets (normalized: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from Expression Atlas. +- `idmapping/` + - Count datasets whose gene IDs have been mapped: `*.renamed.csv`. + - Table associating original gene IDs and mapped gene IDs: `*.mapping.csv`. + - Gene metadata (name and description): `*.metadata.csv`.
@@ -79,22 +128,51 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
Output files

-List of newly normalised datasets in `normalisation/`
+- `normalised/`: Newly normalised datasets
+  - `normalised/deseq2/` for DESeq2
+  - `normalised/edger/` for EdgeR
+- `quantile_normalised/`: Quantile normalised datasets
+
+### Gene base statistics

-- `normalisation/deseq2/` for DESeq2
-- `normalisation/edger/` for EdgeR
+
+Output files
+
+- `merged_datasets/`: Merged count datasets (sample-wide)
+  - `merged_datasets/all/`: all datasets together
+  - `merged_datasets/rnaseq/`: only RNA-seq datasets
+  - `merged_datasets/microarray/`: only microarray datasets
-### GProfiler IDMapping +### Merged counts + +The file containing all normalised counts is bundled as a Parquet file with the Dash Plotly app.
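+
+For quick inspection outside the app, the merged counts can also be loaded directly, e.g. with the [polars](https://pola.rs/) library that the pipeline itself uses (a minimal sketch, assuming the default output layout listed below):
+
+```python
+import polars as pl
+
+# load the merged, normalised counts bundled with the Dash app
+counts = pl.read_parquet("dash_app/data/all_counts.parquet")
+print(counts.head())
+```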
Output files -- `idmapping/` - - Count datasets whose gene IDs have been mapped to Ensembl IDs: `*_renamed.csv`. - - Table associating original gene IDs and Ensembl IDs: `*_mapping.csv`. - - Ensembl gene metadata (name and description): `*_metadata.csv`. +- `dash_app/data/all_counts.parquet`: Merged count datasets (sample-wide) + +
+ +### Summary of gene statistics and scores + +The gene stat summary is also bundled with the Dash Plotly app. + +
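+
+Since the file is already ranked by stability score, the most stable genes are simply its first rows (a minimal sketch; only the file path below is taken from this documentation):
+
+```python
+import polars as pl
+
+# the summary is ranked by stability score (most stable genes first)
+summary = pl.read_csv("dash_app/data/all_genes_summary.csv")
+print(summary.head(10))  # ten most stable genes
+```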
+Output files + +- `dash_app/data/all_genes_summary.csv`: file containing all gene statistics, scores and ranked by stability score + +
+ +### Overall experimental design + +
+Output files + +- `dash_app/data/whole_design.csv`: file containing all experimental design information
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
new file mode 100644
index 00000000..49034524
--- /dev/null
+++ b/docs/troubleshooting.md
@@ -0,0 +1,68 @@
+# nf-core/stableexpression: Troubleshooting
+
+## Error 139 on macOS
+
+If you are running the pipeline on macOS with containers (`docker`, `apptainer`, `singularity`, ...), you may encounter issues like:
+
+```
+NOTE: Process `NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:ID_MAPPING:CLEAN_GENE_IDS ()` terminated with an error exit status (139) -- Execution is retried (1)
+```
+
+eventually leading to pipeline failure.
+
+This is likely due to the Python polars library not being compatible with macOS when run inside a container.
+
+In this case, you should run the pipeline with `-profile micromamba` or `-profile conda` instead.
+
+## No dataset found
+
+For species that are not on Expression Atlas, the pipeline will not be able to find suitable datasets and will log the following message:
+
+```
+ERROR: Could not find any readily usable public dataset
+...
+```
+
+> [!TIP]
+> You can first try to have the pipeline fetch suitable datasets from NCBI GEO by providing the `--fetch_geo_accessions` flag.
+
+In case no datasets are found at all, you will have to obtain count datasets yourself and prepare them for the pipeline.
+A good start is to check whether the folder `<outdir>/public_data/geo/datasets/` contains `rejected` subfolders. Such subfolders contain datasets that were downloaded (together with their experimental design) but failed to pass checks. Quite often, some of them can be manually reprocessed to make them suitable for the pipeline.
+
+Finally, you may want to search by yourself on [NCBI GEO](https://www.ncbi.nlm.nih.gov/gds).
+
+Alternatively, some public websites host expression datasets that may be suitable for the pipeline, such as:
+
+- [Bgee](https://www.bgee.org/)
+
+## Not enough memory
+
+The pipeline caps the number of downloaded datasets in order to limit RAM usage, especially for `homo sapiens`.
+
+However, on small machines the cap may still be too permissive and exhaust the available RAM. You can reduce the number of downloaded datasets by setting `--random_sampling_size` to a lower value.
+
+## Why do I get only a fraction of the public datasets available on Expression Atlas or NCBI GEO? Give them back!
+
+To limit RAM usage, the pipeline randomly selects a certain number of datasets, based on the number of samples they contain. To increase the number of collected datasets, you can increase the `--random_sampling_size` parameter.
+
+> [!TIP]
+> A seed is also set in order to make the runs reproducible. You can change the subset of chosen datasets by changing the `--random_sampling_seed`.
+
+## The pipeline failed to find a genome annotation for the specified species
+
+If you know the length of the longest cDNA for each gene, you can provide gene lengths yourself with the `--gene_length` flag (see [Custom gene ID mapping / metadata / length](usage.md#5-custom-gene-id-mapping--metadata--length)). If you do not have access to gene lengths, TPM normalisation cannot be performed. A fallback is to use CPM normalisation by setting `--normalisation_method cpm`. It will introduce a small bias towards long genes, but this should not result in big changes.
+
+## Java heap space
+
+In some cases, in particular when running the pipeline on a very large number of datasets (such as for `Homo sapiens`), the Nextflow Java virtual machine can start to request a large amount of memory.
+You may then see the following error:
+
+```
+java.lang.OutOfMemoryError: Java heap space
+```
+
+We recommend increasing the memory available to Java:
+
+```bash
+export NXF_OPTS='-Xms1g -Xmx4g'
+```
diff --git a/docs/usage.md b/docs/usage.md
index 6966c25d..9a970f1f 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -2,128 +2,250 @@

## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/stableexpression/usage](https://nf-co.re/stableexpression/usage)

+> [!WARNING]
+> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).
+
+> [!TIP]
+> In case of issues with the pipeline, please check the [troubleshooting page](troubleshooting.md) or [report a new issue](https://github.com/nf-core/stableexpression/issues).
+
> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._

-## Introduction
+## 1. Basic run
+
+This pipeline fetches Expression Atlas and GEO accessions for the provided species and downloads the corresponding data.
+
+```bash
+nextflow run nf-core/stableexpression \
+    -profile <profile> \
+    --species <species> \
+    --outdir <outdir>
+```

-You can run this pipeline in multiple ways.
+> [!NOTE]
+> See [here](#profiles) for more information about profiles.

-1. Expression Atlas **automatic mode**: without keywords
+## 2. Specific public datasets

-This pipeline fetches Expression Atlas accessions for the provided species and downloads the corresponding data.
+You can provide keywords to restrict downloaded datasets to specific conditions.

```bash
nextflow run nf-core/stableexpression \
-    -profile <profile> \
-    --species <species> \
-    --fetch_eatlas_accessions \
+    -profile <profile> \
+    --species <species> \
+    --keywords <keyword1,keyword2> \
    --outdir <outdir>
```

-1. Expression Atlas **automatic mode**: with keywords
+> [!NOTE]
+>
+> - Multiple keywords must be separated by commas.
+> - Keywords are additive: you will get datasets that match **any of the keywords**.
+> - A dataset will be downloaded if a keyword is found in its summary or in the name of one of its samples.
+> - The natural language processing [`nltk`](https://www.nltk.org/) Python package is used to match keywords as well as derived words. For example, the `leaf` keyword should match 'leaf', 'leaves', 'leafy', etc.
+
+## 3. Provide your own accessions

-The pipeline fetches Expression Atlas accessions for the provided species / keywords and downloads the corresponding data. You do not need to specify the `--fetch_eatlas_accessions` parameter when you specify keywords.
+You may already know which specific Expression Atlas / GEO accessions you want to use in the analysis.
+In this case, you can provide them directly to the pipeline.

```bash
nextflow run nf-core/stableexpression \
-    -profile <profile> \
-    --species <species> \
-    --eatlas_keywords <keyword1,keyword2> \
+    -profile <profile> \
+    --species <species> \
+    --skip_fetch_eatlas_accessions \
+    [--eatlas_accessions <accessions>] \
+    [--eatlas_accessions_file <file>] \
+    [--geo_accessions <accessions>] \
+    [--geo_accessions_file <file>] \
    --outdir <outdir>
```

-3. Expression Atlas **manual mode**
+> [!WARNING]
+> If you want to download only the datasets corresponding to the accessions supplied, you must set the `--skip_fetch_eatlas_accessions` parameter.
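+
+For illustration, an accessions file is a plain list, e.g. a hypothetical `eatlas_accessions.txt`:
+
+```
+E-MTAB-552
+E-GEOD-61690
+```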
+
+> [!NOTE]
+> If you provide accessions through `--eatlas_accessions_file` or `--geo_accessions_file`, there must be one accession per line. The extension of the file does not matter.

-The pipeline downloads the count datasets and experimental designs for the provided accessions.
+In case you do not know which accessions you want but would like to control precisely which datasets are included in your analysis, you can first run:

```bash
nextflow run nf-core/stableexpression \
-    -profile <profile> \
-    --species <species> \
-    --eatlas_accessions <accessions> \
+    -profile <profile> \
+    --species <species> \
+    --accessions_only \
    --outdir <outdir>
```

-4. Using local count datasets
+Fetched accessions with their respective metadata will be available in `<outdir>/public_data/expression_atlas/accessions/` and `<outdir>/public_data/geo/accessions/`.

-Conversely, you can provide your own counts datasets / experiment designs.
+## 4. Use your own expression datasets

-First, prepare a samplesheet listing the different count datasets you want to use. Each row represents a specific dataset and must contain:
+You can of course provide your own count datasets / experimental designs.

-- counts: the path to the count dataset (a CSV file)
-- design: the path to the experimental design associated to this dataset (a CSV file)
-- normalised: a boolean (true / false) representing whether the counts are already normalised or not
+> [!NOTE]
+>
+> - To ensure all RNAseq datasets are processed the same way, you should provide **raw counts**.
+> - In case normalised counts are provided, you should use the same normalisation method for all of them (TPM, FPKM, etc.).

-It should look as follows:
+> [!WARNING]
+> Microarray data must be already normalised. When mixing your own datasets with public ones in a single run, you should use the `RMA` method to be compliant with Expression Atlas and GEO datasets.
+
+First, prepare a CSV samplesheet listing the different count datasets you want to use. Each row represents a specific dataset and must contain:

-`datasets.csv`:
+| Column       | Description                                                                              |
+| ------------ | ---------------------------------------------------------------------------------------- |
+| `counts`     | Path to the count dataset (a CSV / TSV file)                                             |
+| `design`     | Path to the experimental design associated with this dataset (a CSV / TSV file)          |
+| `platform`   | Platform used to generate the counts (`rnaseq` or `microarray`)                          |
+| `normalised` | Boolean (`true` / `false`) indicating whether the counts are already normalised or not.  |

-```csv
+It should look as follows:
+
+```csv title=datasets.csv
counts,design,platform,normalised
path/to/normalised.counts.csv,path/to/normalised.design.csv,rnaseq,true
-path/to/raw.counts.csv,path/to/raw.design.csv,microarray,false
-...
+path/to/raw.counts.csv,path/to/raw.design.csv,rnaseq,false
+path/to/microarray.counts.csv,path/to/microarray.design.csv,microarray,true
```

-(the `platform` field can be either `rnaseq` or `microarray`).
-
-While the counts and design CSV files should have the following structure:
+It can also be a YAML file:
+
+```yaml title=datasets.yaml
+- counts: path/to/normalised.counts.csv
+  design: path/to/normalised.design.csv
+  platform: rnaseq
+  normalised: true
+- counts: path/to/raw.counts.csv
+  design: path/to/raw.design.csv
+  platform: rnaseq
+  normalised: false
+- counts: path/to/microarray.counts.csv
+  design: path/to/microarray.design.csv
+  platform: microarray
+  normalised: true
+```

-`counts.csv`:
+The counts should have the following structure:

-```csv
-,sample_A,sample_B,sample_C
+```csv title=counts.csv
+gene_id,sample_A,sample_B,sample_C
gene_1,1,2,3
gene_2,1,2,3
-...
```

-> [!NOTE]
->
-> - To ensure all RNAseq datasets are processed the same way, it is better to provide them raw.
->   In case you want to provide normalise counts, please provide CPMs (counts per million) in order to stay aligned with the way raw datasets are processed in the pipeline.
-> - Microarray data must be already normalised. To be compliant with Expression Atlas, you can use the `RMA` or `LOESS` methods.
-
-> [!WARNING]
-> Remember to write a comma before the first sample name. This serves to indicate that the actual first column (gene IDs) is the index

-`design.csv`:
-
-```csv
+While the design should look like:
+
+```csv title=design.csv
sample,condition
sample_A,condition_1
sample_B,condition_2
-...
-...
+sample_C,condition_1
```

+> [!WARNING]
+>
+> - In the count file, the first header column (corresponding to gene IDs) should not be empty. However, its name can be anything.
+> - The count file should contain only the gene ID column and the sample columns; any extra column will be ignored.
+
+> [!TIP]
+> Both counts and design files can also be supplied as TSV files.
+
Now run the pipeline with:

```bash
nextflow run nf-core/stableexpression \
-    -profile <profile> \
-    --species <species> \
-    --datasets <datasets.csv> \
+    -profile <profile> \
+    --species <species> \
+    --datasets <datasets.csv> \
+    --skip_fetch_eatlas_accessions \
    --outdir <outdir>
```

-## Running the pipeline
+> [!TIP]
+> The `--skip_fetch_eatlas_accessions` parameter is supplied here to show how to analyse **only your own dataset**. You may remove this parameter if you want to mix your dataset(s) with public ones.

-You can run the pipeline using a mix of the different pathways.
+> [!IMPORTANT]
+> By default, the pipeline tries to map gene IDs to NCBI Entrez Gene IDs. **All genes that cannot be mapped are discarded from the analysis**. This ensures that all genes are named the same between datasets and allows comparing multiple datasets with each other. If you are confident that your genes have the same names across your datasets, or if you think on the contrary that your gene IDs just won't be mapped properly, you can disable this mapping by adding the `--skip_id_mapping` parameter. In that case, you may supply your own gene ID mapping file and gene metadata file with the `--gene_id_mapping` and `--gene_metadata` parameters respectively. See [next section](#5-custom-gene-id-mapping--metadata--length) for further details.

-Example usage:
+> [!TIP]
+> You can check if your gene IDs can be mapped using the [g:Profiler server](https://biit.cs.ut.ee/gprofiler/convert).

-> ```bash
-> nextflow run nf-core/stableexpression \
->     -profile docker \
->     --species "Arabidopsis thaliana" \
->     --eatlas_accessions "E-MTAB-552,E-GEOD-61690" \
->     --eatlas_keywords "stress,flowering" \
->     --datasets ./datasets.csv \
->     --outdir ./results
-> ```
+## 5. Custom gene ID mapping / metadata / length
+
+You can supply your own:
+
+- gene ID mapping file
+- gene metadata file
+- gene length file
+
+The gene ID mapping file is used to map gene IDs in count table(s) (local or downloaded) to more generic IDs that will be used as the basis for subsequent steps.
+
+The gene metadata file provides additional information about the genes, such as their common name and description.
+
+The gene length file provides the length of each gene, which is used to compute the TPM values during gene expression normalisation.

-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+```bash
+nextflow run nf-core/stableexpression \
+    -profile <profile> \
+    --species <species> \
+    --datasets <datasets.csv> \
+    --gene_id_mapping <gene_id_mapping.csv> \
+    --gene_metadata <gene_metadata.csv> \
+    --gene_length <gene_length.csv> \
+    --skip_fetch_eatlas_accessions \
+    --outdir <outdir>
+```
+
+Structure of the gene ID mapping file:
+
+| Column             | Description                                   |
+| ------------------ | --------------------------------------------- |
+| `original_gene_id` | Gene ID used in the provided count dataset(s) |
+| `gene_id`          | Mapped gene ID                                |
+
+Example:
+
+```csv title=gene_id_mapping.csv
+original_gene_id,gene_id
+gene_A,ENSG1234567890
+geneB,OTHERmappedgeneID
+```
+
+Structure of the gene metadata file:
+
+| Column        | Description      |
+| ------------- | ---------------- |
+| `gene_id`     | Mapped gene ID   |
+| `name`        | Gene common name |
+| `description` | Gene description |
+
+Example:
+
+```csv title=gene_metadata.csv
+gene_id,name,description
+ENSG1234567890,Gene A,Description of gene A
+OTHERmappedgeneID,My OTHER Gene,Another description
+```
+
+Structure of the gene length file:
+
+| Column    | Description                      |
+| --------- | -------------------------------- |
+| `gene_id` | Mapped gene ID                   |
+| `length`  | Gene length (longest transcript) |
+
+Example:
+
+```csv title=gene_length.csv
+gene_id,length
+ENSG1234567890,1000
+OTHERmappedgeneID,2000
+```
+
+## 6. More advanced scenarios
+
+For advanced scenarios, you can see the list of available parameters in the [parameter documentation](https://nf-co.re/stableexpression/parameters).
+
+## Pipeline output

Note that the pipeline will create the following files in your working directory:

@@ -132,20 +254,23 @@

```bash
work            # Directory containing the nextflow working files
<OUTDIR>        # Finished results in specified location (defined with --outdir)
.nextflow_log   # Log file from Nextflow
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```
+
+For a detailed description of the output files, please consult the [nf-core stableexpression output directory structure](https://nf-co.re/stableexpression/output).
+
+## Parameters

If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.

-:::warning
-Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
-:::
+> [!WARNING]
+> Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
The above pipeline run specified with a params file in yaml format:

```bash
-nextflow run nf-core/stableexpression -profile docker -params-file params.yaml
+nextflow run -r dev nf-core/stableexpression -profile docker -params-file params.yaml
```

with:

@@ -169,7 +294,7 @@ nextflow pull nf-core/stableexpression

### Reproducibility

-It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.
+It is a good idea to specify the pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.

First, go to the [nf-core/stableexpression releases page](https://github.com/nf-core/stableexpression/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag.

@@ -177,25 +302,43 @@ This version number will be logged in reports when you run the pipeline, so that

To further assist in reproducibility, you can share and reuse [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.

-:::tip
-If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
-:::
+> [!TIP]
+> If you wish to share such a profile (e.g. uploading it as supplementary material for an academic publication), make sure NOT to include cluster-specific paths to files, nor institution-specific profiles.

## Core Nextflow arguments

-:::note
-These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).
-:::
+> [!NOTE]
+> These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).

-### `-profile`
+### [`-profile`](#profiles)

Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.

Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.

-:::info
-We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
-:::
+> [!IMPORTANT]
+> We highly recommend the use of Apptainer (Singularity) or Docker containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
+
+> [!TIP]
+> When running the pipeline on a multi-user server or on a cluster, the best practice is to use Apptainer (formerly Singularity).
+> You can install Apptainer by following these [instructions](https://apptainer.org/docs/admin/main/installation.html).
+> In case you encounter the following error when running Apptainer:
+>
+> ```
+> ERROR : Could not write info to setgroups: Permission denied
+> ERROR : Error while waiting event for user namespace mappings: no event received
+> ```
+>
+> you may need to install the `apptainer-suid` package instead of `apptainer`:
+>
+> ```bash
+> # Debian / Ubuntu
+> sudo apt install apptainer-suid
+> # RHEL / CentOS
+> sudo yum install apptainer-suid
+> # Fedora
+> sudo dnf install apptainer-suid
+> ```

The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to check if your system is supported, please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).

@@ -207,6 +350,8 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof

- `test`
  - A profile with a complete configuration for automated testing
  - Includes links to test data so needs no other parameters
+- `apptainer`
+  - A generic configuration profile to be used with [Apptainer](https://apptainer.org/)
- `docker`
  - A generic configuration profile to be used with [Docker](https://docker.com/)
- `singularity`
@@ -216,13 +361,13 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
- `shifter`
  - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/)
- `charliecloud`
-  - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/)
-- `apptainer`
-  - A generic configuration profile to be used with [Apptainer](https://apptainer.org/)
+  - A generic configuration profile to be used with [Charliecloud](https://charliecloud.io/)
- `wave`
  - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow `24.03.0-edge` or later).
- `conda`
  - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.
+- `micromamba`
+  - A faster, more lightweight alternative to Conda. As with Conda, use Micromamba only as a last resort.
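
Note that several profiles can be combined as a comma-separated list. For instance, a minimal example combining `docker` with `wave` (the species and output directory values are illustrative):

```bash
nextflow run nf-core/stableexpression \
    -profile docker,wave \
    --species "arabidopsis thaliana" \
    --outdir results
```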
### `-resume`
diff --git a/galaxy/README.md b/galaxy/README.md
new file mode 100644
index 00000000..112eb49d
--- /dev/null
+++ b/galaxy/README.md
@@ -0,0 +1,89 @@
+# Galaxy
+
+## Setup build / testing environment
+
+NB: You need conda installed (micromamba does not work, since the Galaxy installer looks for a venv / conda environment).
+
+Create a new environment with Python and planemo installed:
+
+```
+conda env create -f environment.yml -y
+conda activate planemo
+```
+
+## Build tool XML file
+
+The XML definition file is partially generated dynamically by:
+
+- parsing nextflow_schema.json
+- fetching the latest versions of Nextflow, Singularity and OpenJDK from Conda channels
+
+However, you need to build a boilerplate file with things that cannot be directly interpreted from nextflow_schema.json, such as:
+
+- path to selected output files
+- tests
+- specific conditions for the inputs
+
+### Build boilerplate XML file (only once)
+
+```
+python build/build_boilerplate.py
+```
+
+The boilerplate XML file is generated at `galaxy/build/static/boilerplate.xml`.
+
+### Customise boilerplate XML file
+
+You must edit the boilerplate XML file to add your customisations:
+
+- Mandatory (at least if your pipeline uses a samplesheet): modify the file paths in the samplesheet.
+  Galaxy has its own path system, and you must dynamically retrieve the paths of the provided files in order to rewrite them in the samplesheet ("Running the pipeline").
+  In this case, add "&&" before "nextflow drop ..."
+- modify outputs
+- add tests
+
+```
+python build/build_custom.py
+```
+
+### Build XML file (at each release)
+
+```
+python build/build_tool.py
+```
+
+This script will fetch:
+
+- all the parameters in your nextflow_schema.json
+- the latest versions of Nextflow, Singularity and OpenJDK from Conda channels.
+
+Your tool is ready to be used!
+
+## Test tool
+
+### Launch local Galaxy server
+
+You may want to have a first look at what your tool looks like in the Galaxy interface.
+To launch a local instance of Galaxy with your tool already installed:
+
+```
+./serve
+```
+
+You can test the behaviour of your tool by providing different inputs and checking the corresponding outputs.
+
+### Linting and testing
+
+To lint your tool:
+
+```
+./lint
+```
+
+To test your tool:
+
+```
+./test
+```
diff --git a/galaxy/build/build_boilerplate.py b/galaxy/build/build_boilerplate.py
new file mode 100755
index 00000000..12c7a21a
--- /dev/null
+++ b/galaxy/build/build_boilerplate.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+import logging
+from pathlib import Path
+
+from formatters import ConfigFormatter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+STATIC_TOOL_FILENAME = Path(__file__).parent / "static/boilerplate.xml"
+TEMPLATE_FILENAME = Path(__file__).parent / "static/boilerplate.template.xml"
+
+
+def main():
+    logger.info("Parsing config")
+    pipeline_metadata = ConfigFormatter.get_pipeline_metadata()
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # REPLACING ACTUAL PARAMS IN STATIC TOOL
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    with open(TEMPLATE_FILENAME, "r") as fin:
+        template_string = fin.read()
+
+    pipeline_name = pipeline_metadata["name"].replace("nf-core/", "")
+
+    logger.info("Building boilerplate XML file")
+    template_string = template_string.replace("PIPELINE_NAME", pipeline_name)
+
+    with open(STATIC_TOOL_FILENAME, "w") as fout:
+        fout.write(template_string)
+
+    logger.info("Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/galaxy/build/build_tool.py b/galaxy/build/build_tool.py
new file mode 100755
index 00000000..9ba1454f
--- /dev/null
+++ b/galaxy/build/build_tool.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+import logging
+from pathlib import Path
+
+from formatters import SchemaFormatter, ConfigFormatter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+tool_boilerplate_file = Path(__file__).parent / "static/boilerplate.xml"
+tool_file = Path(__file__).parents[1] / "tool_shed/tool/nf_core_{}.xml"
+
+
+def main():
+    logger.info("Formatting config")
+    # package_versions = ConfigFormatter.get_package_versions()
+    pipeline_metadata = ConfigFormatter.get_pipeline_metadata()
+
+    logger.info("Formatting schema")
+    schema_formatter = SchemaFormatter()
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # REPLACING ACTUAL PARAMS IN STATIC TOOL
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    with open(tool_boilerplate_file, "r") as fin:
+        static_string = fin.read()
+
+    # checking if package versions were filled by the user
+    for package_version in ["OPENJDK_VERSION"]:
+        if package_version in static_string:
+            raise ValueError(
+                f"You must fill the package version in place of {package_version} before building"
+            )
+
+    logger.info("Building tool XML file")
+    tool_string = (
+        static_string
+        # .replace("NEXTFLOW_VERSION", package_versions["nextflow"])
+        # .replace("APPTAINER_VERSION", package_versions["apptainer"])
+        # .replace("OPENJDK_VERSION", package_versions["openjdk"])
+        .replace("PIPELINE_VERSION", pipeline_metadata["version"])
+        .replace("DESCRIPTION", schema_formatter.pipeline_description)
+        .replace("PARAMETERS", schema_formatter.params_cli)
+        .replace("INPUTS", schema_formatter.inputs)
+    )
+
+    pipeline_name = pipeline_metadata["name"].replace("nf-core/", "")
+    outfile = Path(str(tool_file).format(pipeline_name))
+    with open(outfile, "w") as fout:
+        fout.write(tool_string)
+
+    logger.info("Done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/galaxy/build/formatters/__init__.py b/galaxy/build/formatters/__init__.py
new file mode 100644
index 00000000..4f46ac15
--- /dev/null
+++ b/galaxy/build/formatters/__init__.py
@@ -0,0 +1,4 @@
+from .schema.base import SchemaFormatter
+from .config.base import ConfigFormatter
+
+__all__ = ["SchemaFormatter", "ConfigFormatter"]
diff --git a/galaxy/build/formatters/config/base.py b/galaxy/build/formatters/config/base.py
new file mode 100644
index 00000000..9b31d537
--- /dev/null
+++ b/galaxy/build/formatters/config/base.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+import requests
+import re
+from dataclasses import dataclass
+from typing import ClassVar
+from packaging.version import parse as vparse
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BaseConfigFormatter:
+    CONFIG_FILE: ClassVar[Path] = Path(__file__).parents[4] / "nextflow.config"
+    MAIN_FILE: ClassVar[Path] = Path(__file__).parents[4] / "main.nf"
+    PACKAGES_REPOS: ClassVar[dict] = {
+        "nextflow": "bioconda",
+        "apptainer": "conda-forge",
+        # "openjdk": "conda-forge",
+    }
+
+    @classmethod
+    def get_package_versions(cls) -> dict:
+        # CONDA PACKAGE VERSIONS
+        package_version = {}
+        for package, repo in cls.PACKAGES_REPOS.items():
+            package_version[package] = cls.get_package_version(package, repo)
+        return package_version
+
+    @staticmethod
+    def get_package_version(package: str, repo: str) -> str:
+        """
+        Get the latest version of a package from its conda channel
+        """
+        logger.info(f"Getting latest version of package {package}")
+        url = f"https://api.anaconda.org/package/{repo}/{package}"
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+            data = response.json()
+            versions = sorted(
+                data["versions"], reverse=True, key=vparse
+            )  # from latest to oldest
+            return versions[0]  # most recent
+        except requests.RequestException as e:
+            raise RuntimeError(f"Error fetching version info: {e}")
+
+    @classmethod
+    def get_pipeline_metadata(cls) -> dict:
+        # PARSING CONFIG
+        with open(cls.CONFIG_FILE, "r") as f:
+            pipeline_config = f.read()
+
+        # regular expression to find the manifest block and extract the version
+        manifest_pattern = re.compile(r"manifest\s*{\s*(.*?)\s*}", re.DOTALL)
+        manifest_match = manifest_pattern.search(pipeline_config)
+        version = None
+        name = None
+
+        if manifest_match:
+            manifest_content = manifest_match.group(1)
+
+            # regular expression to find the name field
+            name_pattern = re.compile(r'name\s*=\s*[\'"](.*?)[\'"]')
+            name_match = name_pattern.search(manifest_content)
+            if name_match:
+                name = name_match.group(1)
+            else:
+                raise ValueError("No name found in pipeline config")
+
+            # regular expression to find the version field
+            version_pattern = re.compile(r'version\s*=\s*[\'"](.*?)[\'"]')
+            version_match = version_pattern.search(manifest_content)
+            if version_match:
+                version = version_match.group(1)
+            else:
+                raise ValueError("No version found in pipeline config")
+
+        return dict(name=name, version=version)
+
+
+@dataclass
+class ConfigFormatter(BaseConfigFormatter):
+    pass
diff --git a/galaxy/build/formatters/schema/base.py b/galaxy/build/formatters/schema/base.py
new file mode 100644
index 00000000..50a702f0
--- /dev/null
+++ b/galaxy/build/formatters/schema/base.py
@@ -0,0 +1,98 @@
+from pathlib import Path
+import json
+from dataclasses import dataclass, field
+from typing import ClassVar
+from . import parameter
+
+
+@dataclass
+class SchemaFormatter:
+    SCHEMA_FILE: ClassVar[Path] = Path(__file__).parents[4] / "nextflow_schema.json"
+    PARAMS_TO_IGNORE: ClassVar[list] = ["outdir", "email", "multiqc_title"]
+    SECTIONS_TO_IGNORE: ClassVar[list] = [
+        "institutional_config_options",
+        "generic_options",
+    ]
+    SECTIONS_TO_EXPAND: ClassVar[list] = ["input_output_options"]
+
+    pipeline_description: str = field(init=False)
+    inputs: str = field(init=False)
+    params_cli: str = field(init=False)
+    _pipeline_params: dict = field(init=False)
+
+    _inputs: list = field(init=False, default_factory=list)
+    _params_cli: list = field(init=False, default_factory=list)
+
+    def __post_init__(self):
+        self.parse_schema_file()
+
+    def parse_schema_file(self):
+        with open(self.SCHEMA_FILE, "r") as f:
+            pipeline_schema = json.load(f)
+
+        self.pipeline_description = pipeline_schema["description"].strip("\n")
+        self._pipeline_params = pipeline_schema["$defs"]
+
+        # PARSING PARAMETERS AND BUILDING STRINGS
+        for section, section_dict in self._pipeline_params.items():
+            if section in self.SECTIONS_TO_IGNORE:
+                continue
+
+            section_inputs, section_params_cli, section_usage_options = (
+                self.format_input_section(section, section_dict)
+            )
+
+            self._inputs += section_inputs
+            self._params_cli += section_params_cli
+
+        self.inputs = "\n".join(self._inputs)
+        self.params_cli = "\n".join(self._params_cli)
+
+    def format_input_section(
+        self, section: str, section_dict: dict
+    ) -> tuple[list, list, list]:
+        section_inputs = []
+        section_params_cli = []
+        section_usage_options = []
+
+        section_title = ""
+        section_help = ""
+
+        if title := section_dict.get("title"):
+            section_title = f' title="{title}"'
+        if description := section_dict.get("description"):
+            section_help = f' help="{description}"'
+
+        section_expanded = (
+            ' expanded="true"'
+            if section in self.SECTIONS_TO_EXPAND
+            else ' expanded="false"'
+        )
+
+        # opening <section> tag of the Galaxy inputs block
+        section_inputs.append(
+            f'\t\t<section name="{section}"{section_title}{section_help}{section_expanded}>'
+        )
+        section_usage_options.append("\n\t" + section.capitalize().replace("_", " "))
+
+        required_params = section_dict.get("required", [])
+
+        for param, param_dict in section_dict["properties"].items():
+            if param not in self.PARAMS_TO_IGNORE:
+                optional = param not in required_params
+
+                # checking if param must be parsed in a generic or in a custom way
+                if param in parameter.PARAMETER_TO_CUSTOM_CLASS:
+                    class_ = parameter.PARAMETER_TO_CUSTOM_CLASS[param]
+                else:
+                    class_ = parameter.BaseParameterFormatter
+
+                param_formatter = class_(param, section, param_dict, optional)
+
+                # input arguments
+                section_inputs.append(param_formatter.get_input())
+                # cli
+                section_params_cli.append(param_formatter.get_cli())
+
+        section_inputs.append("\t\t</section>")
+
+        return section_inputs, section_params_cli, section_usage_options
diff --git a/galaxy/build/formatters/schema/parameter/__init__.py b/galaxy/build/formatters/schema/parameter/__init__.py
new file mode 100644
index 00000000..2708d3be
--- /dev/null
+++ b/galaxy/build/formatters/schema/parameter/__init__.py
@@ -0,0 +1,14 @@
+from .base import BaseParameterFormatter
+from .datasets import DatasetsParameterFormatter
+from .required import RequiredParameterFormatter
+
+# from .default_value import DefaultValueParameterFormatter
+
+PARAMETER_TO_CUSTOM_CLASS = {
+    "datasets": DatasetsParameterFormatter,
+    "normalisation_method": RequiredParameterFormatter,
+    "nb_top_gene_candidates": RequiredParameterFormatter,
+    # "species": DefaultValueParameterFormatter,
+}
+
+__all__ = ["BaseParameterFormatter"]
diff --git a/galaxy/build/formatters/schema/parameter/base.py b/galaxy/build/formatters/schema/parameter/base.py
new file mode 100644
index 00000000..fb09787d
--- /dev/null
+++ b/galaxy/build/formatters/schema/parameter/base.py
@@ -0,0 +1,164 @@
+from dataclasses import dataclass
+from typing import ClassVar
+
+
+@dataclass
+class Validator:
+    PATTERN: ClassVar[str] = (
+        '\t\t\t<validator type="{type}" message="{message}">{expression}</validator>\n'
+    )
+
+    type: str
+    message: str
+    expression: str
+
+    def __str__(self):
+        return self.PATTERN.format(
+            type=self.type, message=self.message, expression=self.expression
+        )
+
+
+@dataclass
+class Option:
+    PATTERN: ClassVar[str] = (
+        '\t\t\t\t<option value="{option}"{selected_arg}>{label}</option>\n'
+    )
+
+    value: str
+    default_value: str
+    optional: bool
+
+    def __str__(self):
+        selected_arg = ' selected="true"' if self.value == self.default_value else ""
+        return self.PATTERN.format(
+            option=self.value, label=self.value.capitalize(), selected_arg=selected_arg
+        )
+
+
+@dataclass
+class BaseParameterFormatter:
+    NF_TYPES_TO_GALAXY: ClassVar[dict] = {
+        "string": "text",
+        "boolean": "boolean",
+        "integer": "integer",
+        "number": "float",
+    }
+
+    param: str
+    section: str
+    param_dict: dict
+    optional: bool
+
+    @staticmethod
+    def enrich_input_param(input_param_str: str, args: list[str]) -> str:
+        # opening param for enrichment
+        input_param_str = input_param_str.replace(" />", ">\n")
+        # adding each arg in a separate line
+        for arg in args:
+            input_param_str += "\t" + arg
+        # closing
+        input_param_str += "\t\t\t</param>"
+        return input_param_str
+
+    def get_input(self) -> str:
+        """
+        building input param
+        """
+
+        # template for the Galaxy <param> tag; the placeholders are filled at the end of this method
+        input_param_str = '\t\t\t<param name="{param}" type="{type}"{format}{value}{min}{max}{true_false}{optional} {label}{help} />'
+
+        param_format = ""
+        param_label = ""
+        param_help = ""
+        param_true_false = ""
+        param_value = ""
+        param_min = ""
+        param_max = ""
+        param_optional = ' optional="true"' if self.optional else ' optional="false"'
+
+        param_type = self.param_dict["type"]
+        default_value = self.param_dict.get("default")
+
+        if param_type == "string" and self.param_dict.get("format") == "file-path":
+            input_type = "data"
+            # removing extension check as files are renamed in .dat files by Galaxy
+            if pattern := self.param_dict.get(
+                "pattern"
+            ):  # going from something like "^\\S+\\.(csv|yaml)$" to "csv,yaml"
+                # getting the extensions part
+                extension_str = pattern.split(".")[-1]
+                # removes recursively all leading and trailing "(", ")" and "$"
+                extension_str = extension_str.strip("$()")
+                # getting list of extensions; removing dat because this extension is specifically made to handle Galaxy filenames
+                extensions = [ext for ext in extension_str.split("|") if ext != "dat"]
+                formatted_extensions_str = ",".join(extensions)
+                param_format = f' format="{formatted_extensions_str}"'
+            else:
+                # there is no specific pattern provided in the schema, this means that the format does not matter much
+                # however, the planemo linter needs a format, so we specify format="data"
+                param_format = ' format="data"'
+
+        else:
+            input_type = self.NF_TYPES_TO_GALAXY[param_type]
+
+            if param_type == "boolean":
+                param_true_false = f' truevalue="--{self.param}" falsevalue=""'
+
+            elif param_type in ["integer", "number"]:
+                if minimum := self.param_dict.get("minimum"):
+                    param_min = f' min="{minimum}"'
+                if maximum := self.param_dict.get("maximum"):
+                    param_max = f' max="{maximum}"'
+
+            elif param_type == "string":
+                # TODO: handle (rare) case where both enum and pattern are given
+                if pattern := self.param_dict.get("pattern"):  # regex
+                    msg = f"must match regular expression {pattern}"
+                    validator = Validator(type="regex", message=msg, expression=pattern)
+                    input_param_str = self.enrich_input_param(
+                        input_param_str, args=[str(validator)]
+                    )
+
+                # handle parameter with enum (options)
+                if option_values := self.param_dict.get("enum"):
+                    input_type = "select"
+                    options = [
+                        Option(value, default_value, self.optional) for value in option_values
+                    ]
+                    input_param_str = self.enrich_input_param(
+                        input_param_str, args=[str(option) for option in options]
+                    )
+
+                else:
+                    if default_value is not None:
+                        param_value = f' value="{default_value}"'
+
+        if description := self.param_dict.get("description"):
+            param_label = f'label="{description}"'
+        if help_text := self.param_dict.get("help_text"):
+            param_help = f' help="{help_text}"'
+
+        return input_param_str.format(
+            param=self.param,
+            type=input_type,
+            label=param_label,
+            format=param_format,
+            value=param_value,
+            min=param_min,
+            max=param_max,
+            true_false=param_true_false,
+            help=param_help,
+            optional=param_optional,
+        )
+
+    def get_cli(self) -> str:
+        # extra quotes if string parameter
+        value = (
+            f'"${self.section}.{self.param}"'
+            if self.param_dict["type"] == "string"
+            else f"${self.section}.{self.param}"
+        )
+        if self.optional:
+            return f"\t\t\t#if ${self.section}.{self.param}\n\t\t\t --{self.param} {value}\n\t\t\t#end if"
+        else:
+            return f"\t\t\t--{self.param} {value}"
diff --git a/galaxy/build/formatters/schema/parameter/datasets.py b/galaxy/build/formatters/schema/parameter/datasets.py
new file mode 100644
index 00000000..bcdb42cd
--- /dev/null
+++ b/galaxy/build/formatters/schema/parameter/datasets.py
@@ -0,0 +1,43 @@
+import re
+from dataclasses import dataclass
+from .base import BaseParameterFormatter
+
+
+@dataclass
+class DatasetsParameterFormatter(BaseParameterFormatter):
+    # if param is an optional file with multiple possible values, it requires special handling
+    # see https://docs.galaxyproject.org/en/latest/dev/schema.html#id51
+
+    def get_input(self) -> str:
+        input_param_str = super().get_input()
+        # setting to required
+        # changing param name
+        input_param_str = input_param_str.replace(
+            'optional="true"', 'optional="false"'
+        ).replace(self.param, "samplesheet")
+        # changing label
+        input_param_str = re.sub(
+            r'label="[\s\w]*"', 'label="Samplesheet"', input_param_str
+        )
+
+        # adding conditional statement (wrapping the file param in a true/false select)
+        return f"""\t\t\t<conditional name="{self.param}">
+            <param name="provide_datasets" type="select" label="Provide your own count datasets?">
+                <option value="false" selected="true">false</option>
+                <option value="true">true</option>
+            </param>
+            <when value="false" />
+            <when value="true">
+{input_param_str}
+            </when>
+        </conditional>"""
+
+    def get_cli(self) -> str:
+        # see https://planemo.readthedocs.io/en/latest/writing_advanced.html#consuming-collections
+        return f"""
+        \t#if ${self.section}.datasets.provide_datasets == "true":
+        \t\t--datasets renamed_samplesheet.csv
+        \t#end if"""
diff --git a/galaxy/build/formatters/schema/parameter/default_value.py
new file mode 100644
index
00000000..e141f461 --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/default_value.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from .base import BaseParameterFormatter + + +@dataclass +class DefaultValueParameterFormatter(BaseParameterFormatter): + def __post_init__(self): + self.param_dict["default"] = "Solanum tuberosum" diff --git a/galaxy/build/formatters/schema/parameter/required.py b/galaxy/build/formatters/schema/parameter/required.py new file mode 100644 index 00000000..52cdbb82 --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/required.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from .base import BaseParameterFormatter + + +@dataclass +class RequiredParameterFormatter(BaseParameterFormatter): + def __post_init__(self): + self.optional = False diff --git a/galaxy/build/static/boilerplate.template.xml b/galaxy/build/static/boilerplate.template.xml new file mode 100644 index 00000000..f2429c20 --- /dev/null +++ b/galaxy/build/static/boilerplate.template.xml @@ -0,0 +1,53 @@ + + DESCRIPTION + + nextflow + apptainer + openjdk + + + + + +INPUTS + + + + + + + + + @misc{nf-core/PIPELINE_NAME, + author = {}, + year = {}, + title = {nf-core/PIPELINE_NAME}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/nf-core/PIPELINE_NAME}, + } + + + diff --git a/galaxy/build/static/boilerplate.xml b/galaxy/build/static/boilerplate.xml new file mode 100644 index 00000000..0d79424c --- /dev/null +++ b/galaxy/build/static/boilerplate.xml @@ -0,0 +1,148 @@ + + DESCRIPTION + + nextflow + micromamba + openjdk + + + + + +INPUTS + + + + + + + + +
+ + + + + + + +
+
+ +
+ + + + + + + + + + +
+ +
+ + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + + +
+ + + + + + + + + + +
+
+ + + + @misc{nf-core/stableexpression, + author = {Coen, Olivier}, + year = {2025}, + title = {nf-core/stableexpression}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/OlivierCoen/stableexpression}, + } + + +
diff --git a/galaxy/dev/nextflow_apptainer.xml b/galaxy/dev/nextflow_apptainer.xml new file mode 100644 index 00000000..27f1f851 --- /dev/null +++ b/galaxy/dev/nextflow_apptainer.xml @@ -0,0 +1,34 @@ + + This pipeline is dedicated to finding the most stable genes across count datasets + + nextflow + apptainer + fuse-overlayfs + openjdk + + + results/species.txt + + && zip -r results.zip results + + ]]> + + + + + + + + diff --git a/galaxy/environment.yml b/galaxy/environment.yml new file mode 100644 index 00000000..feae659f --- /dev/null +++ b/galaxy/environment.yml @@ -0,0 +1,10 @@ +name: planemo +channels: + - defaults + - conda-forge + - bioconda + - nodefaults +dependencies: + - python=3.12 + - pip: + - planemo==0.75.33 diff --git a/galaxy/lint b/galaxy/lint new file mode 100755 index 00000000..dd141d2f --- /dev/null +++ b/galaxy/lint @@ -0,0 +1,8 @@ +#!/bin/bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_file="${galaxy_dir}/tool_shed/tool/nf_core_stableexpression.xml" + +planemo lint $tool_file + +planemo shed_lint tool_shed/tool --tools diff --git a/galaxy/serve b/galaxy/serve new file mode 100755 index 00000000..019e9ee0 --- /dev/null +++ b/galaxy/serve @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_dir="${galaxy_dir}/tool_shed/tool" + +planemo serve \ + $tool_dir + +# add --no_cleanup to keep the pipelines workdirs after a run +# very useful for debugging diff --git a/galaxy/test b/galaxy/test new file mode 100755 index 00000000..2a511baf --- /dev/null +++ b/galaxy/test @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_file="${galaxy_dir}/tool_shed/tool/nf_core_stableexpression.xml" + +TEST_OUTDIR="tests/output" + + +ARGS="$@" + +# add --update_test_data to create output file +planemo test \ + $tool_file \ + --job_output_files $TEST_OUTDIR \ + $ARGS diff --git a/galaxy/tool_shed/.shed.yml b/galaxy/tool_shed/.shed.yml new file mode 100644 index 00000000..1cc55908 --- /dev/null +++ b/galaxy/tool_shed/.shed.yml @@ -0,0 +1,13 @@ +categories: + - Transcriptomics + - RNA + - Micro-array Analysis +description: Pipeline dedicated to finding the most stable genes across count datasets +homepage_url: https://nf-co.re/stableexpression/ +long_description: | + nf-core/stableexpression is a bioinformatics pipeline that aims at finding the most stable genes among a single or multiple public / local count datasets. + It takes as input a species name (mandatory), keywords for expression atlas search (optional) and / or a CSV input file listing local raw / normalised count datasets (optional). + A typical usage is to find the most suitable qPCR housekeeping genes for a specific species (and optionally specific conditions). +name: nf_core_stableexpression +owner: ocoen +remote_repository_url: https://github.com/OlivierCoen/stableexpression/ diff --git a/galaxy/tool_shed/tool/nf_core_stableexpression.xml b/galaxy/tool_shed/tool/nf_core_stableexpression.xml new file mode 100644 index 00000000..15b10895 --- /dev/null +++ b/galaxy/tool_shed/tool/nf_core_stableexpression.xml @@ -0,0 +1,299 @@ + + This pipeline is dedicated to finding the most stable genes across count datasets + + nextflow + micromamba + openjdk + + + + + +
+ + ([a-zA-Z]+)[_ ]([a-zA-Z]+) + + + + + + + + + + + + + + + + ([a-zA-Z,]+) + + + + + + + +
+
+ + + ([A-Z0-9-]+,?)+ + + + + ([A-Z0-9-]+,?)+ + + +
+
+ + + ([A-Z0-9-]+,?)+ + + + + ([A-Z0-9-]+,?)+ + + +
+
+ + + +
+
+ + + + + + + + + + +
+
+ + + + + + + + ^\d+(\.\d+)?,\d+(\.\d+)?,\d+(\.\d+)?,\d+(\.\d+)?$ + + +
+
+ + + + + + + +
+ + + + + + + +
+
+ +
+ + + + + + + + + + +
+ +
+ + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + + +
+ + + + + + + + + + +
+
+ + + + @misc{nf-core/stableexpression, + author = {Coen, Olivier}, + year = {2025}, + title = {nf-core/stableexpression}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/OlivierCoen/stableexpression}, + } + + +
diff --git a/galaxy/tool_shed/tool/rebuild_samplesheet.py b/galaxy/tool_shed/tool/rebuild_samplesheet.py
new file mode 100644
index 00000000..43666e66
--- /dev/null
+++ b/galaxy/tool_shed/tool/rebuild_samplesheet.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""
+Script dedicated to renaming files in the samplesheet provided.
+In Galaxy, data files provided by users are given a new file name.
+However, original file names can be retrieved from the name attribute of the file object (inside the tool XML file).
+In this script, we replace the original name with the actual Galaxy path.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+import csv
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--in", dest="samplesheet", type=Path, required=True)
+    parser.add_argument("--count-files", dest="count_files", type=str, required=True)
+    parser.add_argument(
+        "--count-filenames", dest="count_filenames", type=str, nargs="+", required=True
+    )
+    parser.add_argument("--design-files", dest="design_files", type=str, required=True)
+    parser.add_argument(
+        "--design-filenames",
+        dest="design_filenames",
+        type=str,
+        nargs="+",
+        required=True,
+    )
+    parser.add_argument("--out", dest="outfile", type=Path, required=True)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # files and names arrive in the same order
+    count_files = args.count_files.split(",")
+    design_files = args.design_files.split(",")
+
+    count_names_to_files = {
+        name: file for file, name in zip(count_files, args.count_filenames)
+    }
+    design_names_to_files = {
+        name: file for file, name in zip(design_files, args.design_filenames)
+    }
+
+    renamed_rows = []
+    with open(args.samplesheet, "r", newline="") as fin:
+        reader = csv.DictReader(fin)
+        header = reader.fieldnames
+        for row in reader:
+            # getting original names (file names as written in the samplesheet)
+            original_count_filename = Path(row["counts"]).name
+            original_design_filename = Path(row["design"]).name
+            # turning original names into new names (Galaxy file names)
+            row["counts"] = count_names_to_files[original_count_filename]
+            row["design"] = design_names_to_files[original_design_filename]
+            renamed_rows.append(row)
+
+    with open(args.outfile, "w", newline="") as fout:
+        writer = csv.DictWriter(fout, fieldnames=header)
+
+        writer.writeheader()
+        for row in renamed_rows:
+            writer.writerow(row)
diff --git a/galaxy/tool_shed/tool/test_data/input.csv b/galaxy/tool_shed/tool/test_data/input.csv
new file mode 100644
index 00000000..6ea4aa16
--- /dev/null
+++ b/galaxy/tool_shed/tool/test_data/input.csv
@@ -0,0 +1,3 @@
+counts,design,platform,normalised
+tests/test_data/input_datasets/microarray.normalised.csv,tests/test_data/input_datasets/microarray.normalised.design.csv,microarray,true
+tests/test_data/input_datasets/rnaseq.raw.csv,tests/test_data/input_datasets/rnaseq.raw.design.csv,rnaseq,false
diff --git a/tests/test_data/custom_datasets/microarray.normalised.csv b/galaxy/tool_shed/tool/test_data/microarray.normalised.csv
similarity index 93%
rename from tests/test_data/custom_datasets/microarray.normalised.csv
rename to galaxy/tool_shed/tool/test_data/microarray.normalised.csv
index 60869917..81f3f904 100644
--- a/tests/test_data/custom_datasets/microarray.normalised.csv
+++ b/galaxy/tool_shed/tool/test_data/microarray.normalised.csv
@@ -1,4 +1,4 @@
-,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 +gene_id,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 ENSRNA049453121,20925.1255070264,136184.261516502,144325.370645564,89427.0987612997,164143.182734208,34178.6378088171,28842.7323281157,76973.395782103,41906.9367255656,44756.5602263121,252562.049703724,6953.65643340122 ENSRNA049453138,196173.051628372,16607.8367703051,344972.83715281,22602.4535330758,13678.598561184,104546.421532852,15451.4637472048,71664.8857281649,160643.257448002,91459.0578537683,88396.7173963033,281623.08555275 ENSRNA049454388,91547.4240932405,11625.4857392136,84483.143792525,80582.6604222701,218857.576978944,58304.7350856292,42234.0009090266,88475.1675656357,87306.1181782617,17513.436610296,90922.3378933406,76490.2207674135 diff --git a/tests/test_data/custom_datasets/microarray.normalised.design.csv b/galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv similarity index 100% rename from tests/test_data/custom_datasets/microarray.normalised.design.csv rename to galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv diff --git a/tests/test_data/custom_datasets/rnaseq.raw.csv b/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv similarity index 100% rename from tests/test_data/custom_datasets/rnaseq.raw.csv rename to galaxy/tool_shed/tool/test_data/rnaseq.raw.csv diff --git a/tests/test_data/custom_datasets/rnaseq.raw.design.csv b/galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv similarity index 100% rename from tests/test_data/custom_datasets/rnaseq.raw.design.csv rename to galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv diff --git a/main.nf b/main.nf index dc57aeaa..be4d42ea 100644 --- a/main.nf +++ b/main.nf @@ -59,7 +59,12 @@ workflow { params.version, params.validate_params, params.monochrome_logs, - args + args, + params.outdir, + params.datasets, + params.help, + params.help_full, + params.show_hidden ) // diff --git a/modules.json b/modules.json index 187b1ad8..249895b3 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "multiqc": { "branch": "master", - "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", + "git_sha": "af27af1be706e6a2bb8fe454175b0cdf77f47b49", "installed_by": ["modules"] } } @@ -16,17 +16,17 @@ "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", + "git_sha": "df4d1c8cdee98a1bbbed8fc51e82296568e0f9c1", "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", + "git_sha": "e753770db613ce014b3c4bc94f6cba443427b726", "installed_by": ["subworkflows"] } } diff --git a/modules/local/aggregate_results/environment.yml b/modules/local/aggregate_results/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/aggregate_results/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/aggregate_results/main.nf 
b/modules/local/aggregate_results/main.nf new file mode 100644 index 00000000..b0d54c42 --- /dev/null +++ b/modules/local/aggregate_results/main.nf @@ -0,0 +1,43 @@ +process AGGREGATE_RESULTS { + + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + path count_file + path stat_file + path platform_stat_files, stageAs: "?/*" + path metadata_files + path mapping_files + + output: + path 'all_genes_summary.csv', emit: all_genes_summary + path 'most_stable_genes_summary.csv', emit: most_stable_genes_summary + path 'all_counts_filtered.parquet', emit: all_counts_filtered + path 'most_stable_genes_transposed_counts_filtered.csv', emit: most_stable_genes_transposed_counts_filtered + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def mapping_files_arg = mapping_files ? "--mappings " + "$mapping_files" : "" + def metadata_files_arg = metadata_files ? "--metadata " + "$metadata_files" : "" + def is_using_containers = workflow.containerEngine ? true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + aggregate_results.py \\ + --counts $count_file \\ + --stats $stat_file \\ + --platform-stats $platform_stat_files \\ + $mapping_files_arg \\ + $metadata_files_arg + """ + +} diff --git a/modules/local/clean_gene_ids/environment.yml b/modules/local/clean_gene_ids/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/clean_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/clean_gene_ids/main.nf b/modules/local/clean_gene_ids/main.nf new file mode 100644 index 00000000..67eb867d --- /dev/null +++ b/modules/local/clean_gene_ids/main.nf @@ -0,0 +1,40 @@ +process CLEAN_GENE_IDS { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('*.cleaned.parquet'), optional: true, emit: counts + path('*.cleaned_gene_ids.txt'), optional: true, emit: gene_ids + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: id_cleaning_failure_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def is_using_containers = workflow.containerEngine ? true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + clean_gene_ids.py \\ + --count-file "$count_file" + """ + + + stub: + """ + touch fake.cleaned.parquet + touch fake.cleaned_gene_ids.txt + """ + +} diff --git a/modules/local/collect_gene_ids/environment.yml b/modules/local/collect_gene_ids/environment.yml new file mode 100644 index 00000000..75afc696 --- /dev/null +++ b/modules/local/collect_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.2 + - conda-forge::tqdm==4.67.1 diff --git a/modules/local/collect_gene_ids/main.nf b/modules/local/collect_gene_ids/main.nf new file mode 100644 index 00000000..f7b0bdfe --- /dev/null +++ b/modules/local/collect_gene_ids/main.nf @@ -0,0 +1,25 @@ +process COLLECT_GENE_IDS { + + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/70/70c17cde84896904c0620d614cba74ff029f1255db64e66416e63c91b7c959a2/data': + 'community.wave.seqera.io/library/python_tqdm:4e039400f75bdad0' }" + + input: + path count_files, stageAs: "?/*" + + output: + path 'unique_gene_ids.txt', emit: unique_gene_ids + path 'gene_id_occurrences.csv', emit: gene_id_occurrences + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + """ + collect_gene_ids.py \\ + --ids "$count_files" + """ + +} diff --git a/modules/local/collect_statistics/environment.yml b/modules/local/collect_statistics/environment.yml new file mode 100644 index 00000000..5d27c0af --- /dev/null +++ b/modules/local/collect_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.7 + - conda-forge::pandas==2.3.3 diff --git a/modules/local/collect_statistics/main.nf b/modules/local/collect_statistics/main.nf new file mode 100644 index 00000000..0fd5bdd2 --- /dev/null +++ b/modules/local/collect_statistics/main.nf @@ -0,0 +1,24 @@ +process COLLECT_STATISTICS { + + tag "${file.baseName}" + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3d/3d7126100b0eb7cb53dfb50291707ea8dda3b9738b76551ab73605d0acbe114b/data': + 'community.wave.seqera.io/library/pandas:2.3.3--5a902bf824a79745' }" + + input: + path file + + output: + path '*.transposed.csv', emit: csv + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + + script: + """ + collect_statistics.py $file + """ + +} diff --git a/modules/local/compute_base_statistics/environment.yml b/modules/local/compute_base_statistics/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/compute_base_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/compute_base_statistics/main.nf b/modules/local/compute_base_statistics/main.nf new file mode 100644 index 00000000..89c66855 --- /dev/null +++ b/modules/local/compute_base_statistics/main.nf @@ -0,0 +1,36 @@ +process COMPUTE_BASE_STATISTICS { + + tag "${meta.platform}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + tuple val(meta), path(count_file) + + output: + path '*stats_all_genes.csv', emit: stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def args = task.ext.args ?: '' + if ( meta.platform != "all" ) { + args += " --platform $meta.platform" + } + def is_using_containers = workflow.containerEngine ? true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + compute_base_statistics.py \\ + --counts $count_file \\ + $args + """ + +} diff --git a/modules/local/compute_dataset_statistics/environment.yml b/modules/local/compute_dataset_statistics/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/compute_dataset_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/compute_dataset_statistics/main.nf b/modules/local/compute_dataset_statistics/main.nf new file mode 100644 index 00000000..6280d8dc --- /dev/null +++ b/modules/local/compute_dataset_statistics/main.nf @@ -0,0 +1,28 @@ +process COMPUTE_DATASET_STATISTICS { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
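The `POLARS_MAX_THREADS` export used in these modules under conda/micromamba only takes effect if it is set before Polars spins up its thread pool; a sketch of the same ordering constraint in plain Python, where the cap of 4 stands in for `${task.cpus}` (assuming a Polars version that exposes `thread_pool_size()`):

```python
import os

# Polars sizes its global thread pool at import time, so the cap must be
# in the environment before the import below runs.
os.environ.setdefault("POLARS_MAX_THREADS", "4")

import polars as pl  # noqa: E402  -- deliberately imported after the export

print(pl.thread_pool_size())  # reports at most 4 here
```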
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta.dataset), path("skewness.txt"), topic: skewness + tuple val(meta.dataset), path("ratio_zeros.txt"), topic: ratio_zeros + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def prefix = task.ext.prefix ?: "${meta.dataset}" + """ + compute_dataset_statistics.py \\ + --counts $count_file + """ + +} diff --git a/modules/local/compute_gene_transcript_lengths/environment.yml b/modules/local/compute_gene_transcript_lengths/environment.yml new file mode 100644 index 00000000..5d27c0af --- /dev/null +++ b/modules/local/compute_gene_transcript_lengths/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.7 + - conda-forge::pandas==2.3.3 diff --git a/modules/local/compute_gene_transcript_lengths/main.nf b/modules/local/compute_gene_transcript_lengths/main.nf new file mode 100644 index 00000000..ada61de4 --- /dev/null +++ b/modules/local/compute_gene_transcript_lengths/main.nf @@ -0,0 +1,38 @@ +process COMPUTE_GENE_TRANSCRIPT_LENGTHS { + + label 'process_single' + + tag "${gff3.baseName}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3d/3d7126100b0eb7cb53dfb50291707ea8dda3b9738b76551ab73605d0acbe114b/data': + 'community.wave.seqera.io/library/pandas:2.3.3--5a902bf824a79745' }" + + input: + path gff3 + + output: + path('gene_transcript_lengths.csv'), emit: csv + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + + script: + def is_compressed = gff3.getExtension() == "gz" ? true : false + def gff3_name = is_compressed ? 
gff3.getBaseName() : gff3 + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${gff3} > ${gff3_name} + fi + + compute_gene_transcript_lengths.py \\ + --annotation ${gff3_name} + """ + + + stub: + """ + touch gene_transcript_lengths.csv + """ + +} diff --git a/modules/local/compute_stability_scores/environment.yml b/modules/local/compute_stability_scores/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/compute_stability_scores/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/compute_stability_scores/main.nf b/modules/local/compute_stability_scores/main.nf new file mode 100644 index 00000000..ed0615ca --- /dev/null +++ b/modules/local/compute_stability_scores/main.nf @@ -0,0 +1,37 @@ +process COMPUTE_STABILITY_SCORES { + + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + path stat_file + val stability_score_weights + path normfinder_stability_file + val genorm_stability_file + + output: + path 'stats_with_scores.csv', emit: stats_with_stability_scores + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def genorm_stability_file_arg = genorm_stability_file ? "--genorm-stability $genorm_stability_file" : "" + def is_using_containers = workflow.containerEngine ? 
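The gunzip-only-when-needed logic in `COMPUTE_GENE_TRANSCRIPT_LENGTHS` above has a compact Python equivalent; a hedged sketch, with `annotation.gff3.gz` as a hypothetical input and `ensure_uncompressed` as an illustrative helper name:

```python
import gzip
import shutil
from pathlib import Path


def ensure_uncompressed(path: Path) -> Path:
    """Decompress *.gz next to the input and return the new path; pass through otherwise."""
    if path.suffix == ".gz":
        target = path.with_suffix("")  # annotation.gff3.gz -> annotation.gff3
        with gzip.open(path, "rb") as fin, open(target, "wb") as fout:
            shutil.copyfileobj(fin, fout)
        return target
    return path


# e.g. ensure_uncompressed(Path("annotation.gff3.gz")) -> Path("annotation.gff3")
```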
true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + compute_stability_scores.py \\ + --stats $stat_file \\ + --weights "$stability_score_weights" \\ + --normfinder-stability $normfinder_stability_file \\ + $genorm_stability_file_arg + """ + +} diff --git a/modules/local/dash_app/app/app.py b/modules/local/dash_app/app/app.py new file mode 100755 index 00000000..27fc7bf6 --- /dev/null +++ b/modules/local/dash_app/app/app.py @@ -0,0 +1,91 @@ +import socket +import dash_mantine_components as dmc + +from dash_extensions.enrich import ( + DashProxy, + html, + ServersideOutputTransform, + TriggerTransform, +) +from dash_extensions.logging import NotificationsLogHandler + +from src.utils import config, style +from src.components import stores, tooltips +from src.components import top, right_sidebar +from src.callbacks import common, genes, samples + +DEBUG = True +# DEBUG = False + +# -------------------- SETUP LOGGING -------------------- + +log_handler = NotificationsLogHandler() +logger = log_handler.setup_logger(__name__) + +# -------------------- APP -------------------- +# init the application +logger.info("Creating app") + +app = DashProxy( + __name__, + title=config.APP_TITLE, + prevent_initial_callbacks="initial_duplicate", + suppress_callback_exceptions=(not DEBUG), + update_title=config.UPDATE_TITLE, + external_stylesheets=[dmc.styles.ALL], + transforms=[TriggerTransform(), ServersideOutputTransform()], +) + +# -------------------- LAYOUT -------------------- + + +def serve_layout(): + return dmc.MantineProvider( + children=[ + html.Div( + [ + top.header, + right_sidebar.drawer, + *stores.stores_to_load, + *tooltips.tooltips_to_load, + ] + + log_handler.embed(), + id="layout", + style=style.LAYOUT, + ) + ] + ) + + +app.layout = serve_layout + +# -------------------- IMPORTING CALLBACKS -------------------- + +common.register_callbacks() +genes.register_callbacks() +samples.register_callbacks() + +# -------------------- LAUNCH SERVER -------------------- + + +def find_port(port: int) -> int: + """Find a port not in use starting at given port""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(("localhost", port)) == 0: + return find_port(port=port + 1) + else: + return port + + +if __name__ == "__main__": + logger.info("Running server") + # setting prune_errors to False avoids error message pruning + # in order to get original tracebacks + # (very useful for debugging) + prune_errors = False if DEBUG else True + app.run( + debug=DEBUG, + host=config.HOST, + port=find_port(port=config.PLOTLY_APP_PORT), + dev_tools_prune_errors=prune_errors, + ) diff --git a/modules/local/dash_app/app/assets/style.css b/modules/local/dash_app/app/assets/style.css new file mode 100755 index 00000000..f32fc1a6 --- /dev/null +++ b/modules/local/dash_app/app/assets/style.css @@ -0,0 +1,9 @@ +.modebar { + background: transparent; + left: 50%; + transform: translateX(-50%); +} + +.mantine-Drawer-root { + width: 0.1em !important; +} diff --git a/modules/local/dash_app/app/environment.yml b/modules/local/dash_app/app/environment.yml new file mode 100644 index 00000000..36fb5a36 --- /dev/null +++ b/modules/local/dash_app/app/environment.yml @@ -0,0 +1,16 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - 
conda-forge::python=3.13.8 + - conda-forge::pandas==2.3.3 + - conda-forge::polars==1.35.2 + - conda-forge::pyarrow==22.0.0 + - conda-forge::scipy==1.16.3 + - conda-forge::dash==3.3.0 + - conda-forge::dash-mantine-components==2.4.0 + - conda-forge::dash-extensions==2.0.4 + - conda-forge::dash-iconify==0.1.2 + - conda-forge::dash-ag-grid==32.3.2 diff --git a/modules/local/dash_app/app/src/callbacks/common.py b/modules/local/dash_app/app/src/callbacks/common.py new file mode 100644 index 00000000..bd349a21 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/common.py @@ -0,0 +1,36 @@ +from dash_extensions.enrich import Input, Trigger, Output, State, callback + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def register_callbacks(): + @callback( + Output("drawer", "opened"), + Trigger("settings-button", "n_clicks"), + prevent_initial_call=True, + ) + def open_drawer(): + return True + + @callback( + Output("sidebar-genes-items", "style"), + Output("sidebar-samples-items", "style"), + Input("tabs", "value"), + State("sidebar-genes-items", "style"), + State("sidebar-samples-items", "style"), + ) + def manage_drawer_content( + tabs_value: str, gene_stack_style: dict, sample_stack_style: dict + ): + if tabs_value == "genes": + gene_stack_style["display"] = "block" + sample_stack_style["display"] = "none" + else: # tabs_value == 'samples': + gene_stack_style["display"] = "none" + sample_stack_style["display"] = "block" + return gene_stack_style, sample_stack_style diff --git a/modules/local/dash_app/app/src/callbacks/genes.py b/modules/local/dash_app/app/src/callbacks/genes.py new file mode 100644 index 00000000..10cd2f74 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/genes.py @@ -0,0 +1,109 @@ +import plotly.graph_objects as go +from dash_extensions.enrich import Input, Output, Serverside, State, callback, ctx +from src.utils.data_management import DataManager + +data_manager = DataManager() + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def get_selected_rows(selected_genes: list[str]) -> list[dict]: + return data_manager.all_genes_stat_df.filter( + data_manager.all_genes_stat_df["gene_id"].is_in(selected_genes) + ).to_dicts() + + +def register_callbacks(): + @callback( + Output("gene-counts", "data"), + Output("gene-dropdown", "value"), + Output("gene-stats-table", "selectedRows"), + Input("gene-dropdown", "value"), + Input("gene-stats-table", "selectedRows"), + State("gene-counts", "data"), + # prevent_initial_call=True, + ) + def update_gene_stored_data( + selected_genes: list[str], table_selected_rows: list[dict], stored_data: dict + ) -> dict: + if ctx.triggered_id == "gene-stats-table": + # updating selected genes + if table_selected_rows is not None: + selected_genes = [row["gene_id"] for row in table_selected_rows] + else: + selected_genes = [] + else: + # ctx.triggered_id is None (callback triggered at app launch / refresh) + # or ctx.triggered_id == "gene-dropdown": + # taking the dropdown values as reference (since there is persistence on it) + table_selected_rows = get_selected_rows(selected_genes) + + # deleting stored data for genes not anymore in the selected list + for stored_gene in list( + stored_data.keys() + ): # we need to copy 
the list of keys before changing the dict + if stored_gene not in selected_genes: + del stored_data[stored_gene] + + # storing data for new genes in the selected list + for gene in selected_genes: + if gene not in stored_data: + gene_data = data_manager.get_gene_counts(gene) + stored_data[gene] = { + "counts": gene_data.to_list(), + "samples": gene_data.index.to_list(), + } + + return Serverside(stored_data), selected_genes, table_selected_rows + + @callback( + Output("gene-graph", "figure"), + Output("gene-graph", "style"), + Input("gene-counts", "data"), + Input("gene-graph-jitter", "value"), + Input("gene-graph-pointpos", "value"), + Input("gene-graph-boxmean", "value"), + Input("gene-graph-display-points", "value"), + State("gene-graph", "style"), + # prevent_initial_call=True, + ) + def update_gene_graph( + gene_stored_data: dict, + jitter: float, + pointpos: float, + boxmean: str | bool, + point_display_mode: str, + graph_style: dict, + ): + if not gene_stored_data: + graph_style["display"] = "none" + return {}, graph_style + + graph_style["display"] = "block" + + fig = go.Figure() + + # we need to use the reversed order, otherwise the last trace added is at the top of the graph + for gene, gene_data in reversed(gene_stored_data.items()): + fig.add_trace( + go.Box( + name=gene, + x=gene_data["counts"], + boxmean=boxmean, + jitter=jitter, + pointpos=pointpos, + boxpoints=point_display_mode, + customdata=gene_data["samples"], + hovertemplate="Sample: %{customdata}<br>Normalised count: %{x}<br>
", + showlegend=False, + ) + ) + + fig.update_layout(xaxis=dict(range=[0, 1]), yaxis=dict(ticklabelstandoff=10)) + + return fig, graph_style diff --git a/modules/local/dash_app/app/src/callbacks/samples.py b/modules/local/dash_app/app/src/callbacks/samples.py new file mode 100644 index 00000000..1ac296c8 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/samples.py @@ -0,0 +1,126 @@ +import plotly.graph_objects as go +import numpy as np +from scipy.stats import gaussian_kde +from dash_extensions.enrich import Input, Output, State, callback, Serverside + +from src.utils.data_management import DataManager + +data_manager = DataManager() + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def register_callbacks(): + @callback( + Output("sample-counts", "data"), + Input("sample-dropdown", "value"), + State("sample-counts", "data"), + # prevent_initial_call=True, + ) + def update_stored_data( + sample_dropdown_values: list[str], stored_sample_counts: dict + ): + updated_stored_sample_counts = dict(stored_sample_counts) # deep copy + + # deleting stored data for samples not anymore in the selected list + for stored_sample in ( + stored_sample_counts + ): # we need to copy the list of keys before changing the dict + if stored_sample not in sample_dropdown_values: + del updated_stored_sample_counts[stored_sample] + + # storing data for new samples in the selected list + for sample in sample_dropdown_values: + if sample not in updated_stored_sample_counts: + sample_data = data_manager.get_sample_counts(sample) + updated_stored_sample_counts[sample] = { + "counts": sample_data.to_list(), + "genes": sample_data.index.to_list(), + } + + return Serverside(updated_stored_sample_counts) + + @callback( + Output("sample-graph", "figure"), + Output("sample-graph", "style"), + Output("sample_stats_display_accordion_control", "disabled"), + Output("sample_points_display_accordion_control", "disabled"), + Output("sample_plot_customisation_accordion_control", "disabled"), + Input("sample-counts", "data"), + Input("curve-type", "value"), + Input("sample-graph-jitter", "value"), + Input("sample-graph-pointpos", "value"), + Input("sample-graph-boxmean", "value"), + Input("sample-graph-display-points", "value"), + State("sample-graph", "style"), + # prevent_initial_call=True, + ) + def update_sample_histogram( + sample_counts: dict, + curve_type: str, + jitter: float, + pointpos: float, + boxmean: str | bool, + point_display_mode: str, + graph_style: dict, + ): + if not sample_counts: + graph_style["display"] = "none" + return {}, graph_style, True, True, True + + graph_style["display"] = "block" + + fig = go.Figure() + + sample_stats_display_ac_disabled = True + sample_points_display_ac_disabled = True + sample_plot_customisation_ac_disabled = True + + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + for sample, sample_data in reversed(sample_counts.items()): + counts = sample_data["counts"] + + if curve_type == "histogram": + fig.add_trace(go.Histogram(name=sample, x=counts)) + + elif curve_type == "kde": + kde_function = gaussian_kde(counts) + xvals = np.linspace(min(counts), max(counts), 1000) + yvals = kde_function(xvals) + fig.add_trace(go.Scatter(name=sample, x=xvals, y=yvals)) + + else: # boxplot + # we need to use the reversed order, otherwise the last traced added is at the top of the 
graph + fig.add_trace( + go.Box( + name=sample, + x=counts, + jitter=jitter, + pointpos=pointpos, + boxpoints=point_display_mode, + boxmean=boxmean, + customdata=sample_data["genes"], + hovertemplate="Gene: %{customdata}<br>Count: %{x}<br>
", + ) + ) + # update the layout to remove y-axis labels + fig.update_layout(yaxis=dict(showticklabels=False)) + + sample_stats_display_ac_disabled = False + sample_points_display_ac_disabled = False + sample_plot_customisation_ac_disabled = False + + fig.update_layout(xaxis=dict(range=[0, 1]), yaxis=dict(ticklabelstandoff=10)) + + return ( + fig, + graph_style, + sample_stats_display_ac_disabled, + sample_points_display_ac_disabled, + sample_plot_customisation_ac_disabled, + ) diff --git a/modules/local/dash_app/app/src/components/graphs.py b/modules/local/dash_app/app/src/components/graphs.py new file mode 100644 index 00000000..3d373e4f --- /dev/null +++ b/modules/local/dash_app/app/src/components/graphs.py @@ -0,0 +1,12 @@ +from dash_extensions.enrich import dcc + +from src.utils import style + + +def get_graph(graph_id: str): + return dcc.Graph(id=graph_id, figure={}, style=style.GRAPH) + + +gene_graph = get_graph("gene-graph") + +sample_graph = get_graph("sample-graph") diff --git a/modules/local/dash_app/app/src/components/icons.py b/modules/local/dash_app/app/src/components/icons.py new file mode 100755 index 00000000..2ac16c1c --- /dev/null +++ b/modules/local/dash_app/app/src/components/icons.py @@ -0,0 +1,11 @@ +from dash_iconify import DashIconify + +# all dash-iconify icons can be found at +# https://icon-sets.iconify.design/ + +# --------------- SIDEBAR --------------------- + + +magnifying_glass_icon = DashIconify(icon="radix-icons:magnifying-glass") + +data_loaded_icon = DashIconify(icon="akar-icons:circle-check", color="white", width=30) diff --git a/modules/local/dash_app/app/src/components/right_sidebar.py b/modules/local/dash_app/app/src/components/right_sidebar.py new file mode 100644 index 00000000..706a4142 --- /dev/null +++ b/modules/local/dash_app/app/src/components/right_sidebar.py @@ -0,0 +1,21 @@ +import dash_mantine_components as dmc + +from src.utils import style + +from src.components.settings import genes, samples + +drawer = dmc.Drawer( + children=[ + genes.sidebar_stack, + samples.sidebar_stack, + ], + id="drawer", + opened=False, + position="right", + withCloseButton=True, + closeOnEscape=True, + overlayProps=dict(backgroundOpacity=0), + trapFocus=False, + zIndex=10000, + style=style.SIDEBAR, +) diff --git a/modules/local/dash_app/app/src/components/settings/genes.py b/modules/local/dash_app/app/src/components/settings/genes.py new file mode 100644 index 00000000..0cbe0e41 --- /dev/null +++ b/modules/local/dash_app/app/src/components/settings/genes.py @@ -0,0 +1,137 @@ +import dash_mantine_components as dmc + +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +gene_selection_stack = dmc.Stack( + [ + dmc.MultiSelect( + id="gene-dropdown", + label=dmc.Text("Genes to display", fw=600, style={"paddingBottom": "5px"}), + placeholder="Select genes of interest", + nothingFoundMessage="No gene found", + data=data_manager.get_sorted_genes(), + value=[], + w=400, + clearable=True, + searchable=True, + limit=100, + maxValues=20, + size="sm", + checkIconPosition="right", + hidePickedOptions=True, + disabled=False, + persistence=True, + persisted_props=["value"], + persistence_type="session", + # style=style.DROPDOWN, + comboboxProps={ + "shadow": "md", + "transitionProps": {"transition": "pop", "duration": 200}, + }, + ) + ], + align="stretch", + gap="xl", +) + +gene_graph_stats_display_stack = dmc.Stack( + [ + dmc.Text( + "Display mean / standard deviation", style=style.STACK_SUBSECTION_TITLE + ), + 
dmc.SegmentedControl( + id="gene-graph-boxmean", + value="sd", + color="teal", + data=[ + {"value": False, "label": "None"}, + {"value": True, "label": "Mean only"}, + {"value": "sd", "label": "Mean + Std"}, + ], + mb=10, + ), + ], + align="center", + gap="xl", +) + +gene_graph_points_display_stack = dmc.Stack( + [ + dmc.Text("Display points", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="gene-graph-display-points", + value="outliers", + color="teal", + data=[ + {"value": "outliers", "label": "Outliers"}, + {"value": "suspectedoutliers", "label": "Suspected Outliers"}, + {"value": "all", "label": "All points"}, + ], + mb=10, + ), + dmc.Text( + "Position of points relatively to boxes", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="gene-graph-pointpos", + value=-1.8, + color="teal", + min=-2, + max=2, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + dmc.Text( + "Spreading of displayed points (jitter)", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="gene-graph-jitter", + value=0.3, + color="teal", + min=0, + max=1, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + ], + align="center", + gap="xl", +) + +sidebar_stack = dmc.Accordion( + value="gene_selection", + children=[ + dmc.AccordionItem( + [ + dmc.AccordionControl("Gene selection"), + dmc.AccordionPanel(gene_selection_stack), + ], + value="gene_selection", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl("Statistics display"), + dmc.AccordionPanel(gene_graph_stats_display_stack), + ], + value="gene_stats_display", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl("Points display"), + dmc.AccordionPanel(gene_graph_points_display_stack), + ], + value="gene_points_display", + ), + ], + id="sidebar-genes-items", + style={"marginTop": "20px", "display": "none"}, +) diff --git a/modules/local/dash_app/app/src/components/settings/samples.py b/modules/local/dash_app/app/src/components/settings/samples.py new file mode 100644 index 00000000..c7e6a97c --- /dev/null +++ b/modules/local/dash_app/app/src/components/settings/samples.py @@ -0,0 +1,173 @@ +import dash_mantine_components as dmc + +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +sample_selection_stack = dmc.Stack( + [ + dmc.MultiSelect( + id="sample-dropdown", + label="Select list of samples to visualise", + placeholder="Select samples", + nothingFoundMessage="No samples found", + data=data_manager.get_sorted_samples(), + value=[], + w=400, + clearable=True, + searchable=True, + limit=100, + maxValues=20, + size="sm", + checkIconPosition="right", + hidePickedOptions=True, + disabled=False, + persistence=True, + persisted_props=["value"], + persistence_type="session", + # style=style.DROPDOWN, + comboboxProps={ + "shadow": "md", + "transitionProps": {"transition": "pop", "duration": 200}, + }, + ) + ], + align="left", + gap="xl", +) + + +sample_graph_plot_type_stack = dmc.Stack( + [ + dmc.Text("Type of plot", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="curve-type", + value="ng", + color="teal", + data=[ + {"value": "histogram", "label": "Histogram"}, + {"value": "kde", "label": "Kde"}, + {"value": "boxplot", "label": "Box-plot"}, + ], + mb=10, + ), + ], + align="left", + gap="xl", +) + + +sample_graph_stats_display_stack = dmc.Stack( + [ + dmc.Text( + "Display mean / standard deviation", style=style.STACK_SUBSECTION_TITLE + ), + 
dmc.SegmentedControl( + id="sample-graph-boxmean", + value="sd", + color="teal", + data=[ + {"value": False, "label": "None"}, + {"value": True, "label": "Mean only"}, + {"value": "sd", "label": "Mean + Std"}, + ], + mb=10, + ), + ], + align="left", + gap="xl", +) + +sample_graph_points_display_stack = dmc.Stack( + [ + dmc.Text("Display points", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="sample-graph-display-points", + value="outliers", + color="teal", + data=[ + {"value": "outliers", "label": "Outliers"}, + {"value": "suspectedoutliers", "label": "Suspected Outliers"}, + {"value": "all", "label": "All points"}, + ], + mb=10, + ), + dmc.Text( + "Position of points relatively to boxes", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="sample-graph-pointpos", + value=-1.8, + color="teal", + min=-2, + max=2, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + dmc.Text( + "Spreading of displayed points (jitter)", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="sample-graph-jitter", + value=0.3, + color="teal", + min=0, + max=1, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + ], + align="left", + gap="xl", +) + + +sidebar_stack = dmc.Accordion( + value="sample_selection", + children=[ + dmc.AccordionItem( + [ + dmc.AccordionControl("Sample selection"), + dmc.AccordionPanel(sample_selection_stack), + ], + value="sample_selection", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Plot customisation", + id="sample_plot_customisation_accordion_control", + ), + dmc.AccordionPanel(sample_graph_plot_type_stack), + ], + value="sample_plot_customisation", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Statistics display", id="sample_stats_display_accordion_control" + ), + dmc.AccordionPanel(sample_graph_stats_display_stack), + ], + value="sample_stats_display", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Points display", id="sample_points_display_accordion_control" + ), + dmc.AccordionPanel(sample_graph_points_display_stack), + ], + value="sample_points_display", + ), + ], + id="sidebar-samples-items", + style={"marginTop": "20px", "display": "none"}, +) diff --git a/modules/local/dash_app/app/src/components/stores.py b/modules/local/dash_app/app/src/components/stores.py new file mode 100755 index 00000000..1e6a1479 --- /dev/null +++ b/modules/local/dash_app/app/src/components/stores.py @@ -0,0 +1,14 @@ +from dash_extensions.enrich import dcc + +selected_samples = dcc.Store("selected-sample", storage_type="session") +gene_counts = dcc.Store(id="gene-counts", storage_type="session", data={}) +sample_counts = dcc.Store(id="sample-counts", storage_type="session", data={}) +# filtered_sample_counts = dcc.Store(id='filtered-sample-counts', storage_type='memory', data={}) + + +stores_to_load = [ + # selected_samples, + gene_counts, + sample_counts, + # filtered_sample_counts +] diff --git a/modules/local/dash_app/app/src/components/tables.py b/modules/local/dash_app/app/src/components/tables.py new file mode 100644 index 00000000..f9ba8605 --- /dev/null +++ b/modules/local/dash_app/app/src/components/tables.py @@ -0,0 +1,47 @@ +import dash_ag_grid as dag +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +NB_GENES_SELECTED_DEFAULT = 10 + +row_data = data_manager.all_genes_stat_df.to_dicts() +default_selected_rows = data_manager.all_genes_stat_df.head( + 
NB_GENES_SELECTED_DEFAULT +).to_dicts() +column_defs = [ + {"field": col, "headerName": col.replace("_", " ").capitalize()} + for col in data_manager.all_genes_stat_df.columns +] + + +all_genes_stats_table = dag.AgGrid( + rowData=row_data, + columnDefs=column_defs, + className="ag-theme-alpine", + # columnSizeOptions=dict(skipHeader=False), + # columnSize="autoSizetoFit", + defaultColDef=dict( + filter=True, + resizable=True, + editable=False, + sortable=True, + ), + dashGridOptions=dict( + pagination=True, + paginationAutoPageSize=True, + enableCellTextSelection=True, + ensureDomOrder=True, + animateRows=False, + rowSelection=dict(mode="multiRow"), + headerCheckboxSelection=False, + getRowId="params.data.gene_id", + ), + selectedRows=default_selected_rows, + style=style.AG_GRID, + persistence=True, + persistence_type="session", + persisted_props=["selectedRows"], + id="gene-stats-table", +) diff --git a/modules/local/dash_app/app/src/components/tooltips.py b/modules/local/dash_app/app/src/components/tooltips.py new file mode 100644 index 00000000..cf0e9421 --- /dev/null +++ b/modules/local/dash_app/app/src/components/tooltips.py @@ -0,0 +1,43 @@ +import dash_mantine_components as dmc + + +def get_tooltip( + classname: str, label: str, position: str = "bottom", multiline: bool = True +): + return dmc.Tooltip( + target=f".{classname}", + label=label, + multiline=multiline, + position=position, + color="grey", + withArrow=True, + arrowSize=8, + zIndex=20000, + radius=4, + transitionProps={ + "transition": "fade", + "duration": 200, + "timingFunction": "ease", + }, + ) + + +genes_tabitem_tooltip = get_tooltip( + classname="genes-tabitem", label="Distribution of normalised counts gene per gene" +) + +samples_tabitem_tooltip = get_tooltip( + classname="samples-tabitem", + label="Distribution of normalised counts sample per sample", +) + +settings_button_tooltip = get_tooltip( + classname="settings-button", + label="Open settings to select genes / samples and to customise display", +) + +tooltips_to_load = [ + genes_tabitem_tooltip, + samples_tabitem_tooltip, + settings_button_tooltip, +] diff --git a/modules/local/dash_app/app/src/components/top.py b/modules/local/dash_app/app/src/components/top.py new file mode 100755 index 00000000..90612ec9 --- /dev/null +++ b/modules/local/dash_app/app/src/components/top.py @@ -0,0 +1,91 @@ +import dash_mantine_components as dmc +from dash_iconify import DashIconify + +from src.utils import style +from src.components import graphs, tables + +gene_icon = DashIconify(icon="material-symbols:genetics", width=20) + +sample_icon = DashIconify(icon="ic:baseline-dashboard-customize", width=20) + + +tabs = dmc.Tabs( + children=[ + dmc.TabsList( + children=[ + dmc.TabsTab( + dmc.Text("Counts / gene", fw=500), + className="genes-tabitem", + color="teal", + leftSection=gene_icon, + value="genes", + style=style.HEADER_TABLIST_ITEM, + ), + dmc.TabsTab( + dmc.Text("Counts / sample", fw=500), + className="samples-tabitem", + leftSection=sample_icon, + value="samples", + color="red", + style=style.HEADER_TABLIST_ITEM, + ), + dmc.TabsTab( + dmc.Text("Statistics - all genes", fw=500), + leftSection=sample_icon, + value="gene_stats", + color="orange", + style=style.HEADER_TABLIST_ITEM, + ), + ], + style=style.HEADER_TABLIST, + ), + dmc.TabsPanel( + children=[ + graphs.gene_graph, + ], + style=style.TABS_PANEL, + value="genes", + ), + dmc.TabsPanel( + children=[ + graphs.sample_graph, + ], + style=style.TABS_PANEL, + value="samples", + ), + dmc.TabsPanel( + 
children=[tables.all_genes_stats_table], + style=style.TABS_PANEL, + value="gene_stats", + ), + ], + id="tabs", + variant="default", + radius="md", + orientation="horizontal", + placement="right", + value="genes", + persistence=True, + persisted_props=["value"], + persistence_type="session", + style=style.TAB, +) + +settings_button = dmc.Button( + "Open settings", + id="settings-button", + className="settings-button", + color="teal", + style=style.SETTINGS_BUTTON, +) + +header = dmc.Grid( + children=[ + dmc.GridCol(tabs, span=10), + dmc.GridCol( + settings_button, span=2, style={"textAlign": "right", "marginTop": "20px"} + ), + ], + style={"marginRight": "20px"}, + # gutter="xl", +) diff --git a/modules/local/dash_app/app/src/utils/config.py b/modules/local/dash_app/app/src/utils/config.py new file mode 100644 index 00000000..0aac91b4 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/config.py @@ -0,0 +1,29 @@ +PLOTLY_APP_PORT = 8080 +HOST = "0.0.0.0" + +LOGO_FILENAME = "assets/nf-core-stableexpression_logo_light_small.png" + +LOGGING_FORMAT = "[%(asctime)s] [%(name)s] %(levelname)s - %(message)s" +DATE_FORMAT = "%Y-%m-%d_%H-%M-%S" + +APP_TITLE = "Counts" +UPDATE_TITLE = "Updating ..." + +DATA_FOLDER = "data" + +ALL_COUNT_FILENAME = "all_counts.parquet" +ALL_GENES_STAT_FILENAME = "all_genes_summary.csv" +ALL_DESIGNS_FILENAME = "whole_design.csv" + +GENE_ID_COLNAME = "gene_id" +STD_COLNAME = "standard_deviation" +STABILITY_SCORE_COLNAME = "stability_score" + +AG_GRID_DEFAULT_COLUMN_DEF = { + "filter": True, + "resizable": True, + "editable": False, + "sortable": True, +} + +AG_GRID_DEFAULT_OPTIONS = {"pagination": True, "paginationAutoPageSize": True} diff --git a/modules/local/dash_app/app/src/utils/data_management.py b/modules/local/dash_app/app/src/utils/data_management.py new file mode 100644 index 00000000..4548ea4f --- /dev/null +++ b/modules/local/dash_app/app/src/utils/data_management.py @@ -0,0 +1,83 @@ +from functools import lru_cache + +import pandas as pd +import polars as pl +from src.utils import config + + +@lru_cache(maxsize=None) +class DataManager: + def __init__(self): + self.all_counts_lf: pl.LazyFrame = self.get_all_count_data() + self.all_genes_stat_df: pl.DataFrame = self.get_all_genes_stat_data() + + @staticmethod + def get_all_count_data() -> pl.LazyFrame: + file = f"{config.DATA_FOLDER}/{config.ALL_COUNT_FILENAME}" + return pl.scan_parquet(file) + + def get_sorted_samples(self) -> list[str]: + return sorted( + self.all_counts_lf.select(pl.exclude(config.GENE_ID_COLNAME)) + .collect_schema() + .names() + ) + + def get_all_genes_stat_data(self) -> pl.DataFrame: + file = f"{config.DATA_FOLDER}/{config.ALL_GENES_STAT_FILENAME}" + stat_df = pl.read_csv(file) + cols_to_select = ["rank"] + [ + col for col in stat_df.columns if col not in ["rank", "is_candidate"] + ] + return stat_df.select(cols_to_select) + + """ + def get_samples_grouped_by_dataset(self) -> list[dict]: + + samples_grouped_by_dataset = [] + + design_file = f"{config.DATA_FOLDER}/{config.ALL_DESIGNS_FILENAME}" + design_df = pd.read_csv(design_file) + + for group, samples in design_df.groupby(["batch", "condition"])["sample"]: + batch, condition = group # unpacking + batch_condition_samples_dict = { + "group": f"Dataset: {batch} || Condition: {condition}", + "items": [ + {"value": sample, "label": sample} + for sample in samples.to_list() + if sample in samples_in_count_data + ], + } + samples_grouped_by_dataset.append(batch_condition_samples_dict) + + return samples_grouped_by_dataset + """ + + def 
get_sorted_genes(self) -> list[str]: + return ( + self.all_genes_stat_df.sort( + by=config.STABILITY_SCORE_COLNAME, descending=False + ) + .select(config.GENE_ID_COLNAME) + .to_series() + .to_list() + ) + + def get_gene_counts(self, gene: str) -> pd.Series: + return ( + self.all_counts_lf.filter(pl.col(config.GENE_ID_COLNAME) == gene) + .select(pl.exclude(config.GENE_ID_COLNAME)) + .collect() + .to_pandas() + .iloc[0] + ) + + def get_sample_counts(self, sample: str) -> pd.Series: + return ( + self.all_counts_lf.select(sample) + .drop_nulls() + .collect() + .to_pandas() + .iloc[:, 0] + ) diff --git a/modules/local/dash_app/app/src/utils/style.py b/modules/local/dash_app/app/src/utils/style.py new file mode 100644 index 00000000..2b4a39f5 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/style.py @@ -0,0 +1,80 @@ +LAYOUT = { + "left": "0px", + "top": "0px", + "position": "absolute", + "width": "100%", + "height": "100%", +} + +HEADER_HEIGHT = "5em" + +TAB = { + "position": "fixed", + "top": 0, + "left": 10, + "right": 0, + "width": "100%", + "height": "100%", + #'zIndex': '1001', +} + +HEADER_TABLIST = { + "position": "fixed", + "top": 0, + "left": 10, + "right": 0, + "width": "60%", + "height": HEADER_HEIGHT, + #'zIndex': '1001' +} + +HEADER_TABLIST_ITEM = { + #'width': '15vh', + # "text-align": "center", + "paddingRight": "20px", + #'paddingTop': '26px', + #'paddingBottom': '26px', + #'width': LEFT_SIDEBAR_WIDTH +} + +TABS_PANEL = {"margin-top": HEADER_HEIGHT, "height": f"calc(100% - {HEADER_HEIGHT})"} + + +SETTINGS_BUTTON = { + "right": "20px", +} + +SIDEBAR_WIDTH = "15em" + +SIDEBAR = { + "position": "fixed", + "top": HEADER_HEIGHT, + "bottom": 0, + "width": SIDEBAR_WIDTH, + "height": "100vh", + "alignItems": "center", +} + + +DROPDOWN = {"marginTop": "10px", "paddingLeft": "4.2em", "paddingRight": "4.5em"} + +STACK_SUBSECTION_TITLE = {"marginBottom": "-20px"} + +AG_GRID = { + "height": "calc(100% - 10px)", + "top": HEADER_HEIGHT, + "paddingTop": "10px", + "marginRight": "15px", + "paddingRight": "25px", + "marginLeft": "5px", +} + +GRAPH = { + #'width': '100vh', + "top": HEADER_HEIGHT, + "marginLeft": "0px", + "marginRight": "3em", + "marginTop": "2px", + "marginBottom": "3px", + "display": "none", +} diff --git a/modules/local/dash_app/main.nf b/modules/local/dash_app/main.nf new file mode 100644 index 00000000..16753ff9 --- /dev/null +++ b/modules/local/dash_app/main.nf @@ -0,0 +1,61 @@ +process DASH_APP { + + label 'process_high' + + conda "${moduleDir}/app/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/4e/4eec747f2063edcc2d1b64e3b84a6b154fde1b9cd9d698446321b4a535432272/data': + 'community.wave.seqera.io/library/dash-ag-grid_dash-extensions_dash-iconify_dash-mantine-components_pruned:7cf6396dd8cd850e' }" + + errorStrategy { + if (task.exitStatus == 100) { + log.warn("Could not start the Dash application.") + return 'ignore' // only report errors but ignores it + } else { + log.warn("Could not start the Dash application due to unhandled error.") + return 'ignore' // ignore anyway + } + } + + input: + path all_counts + path whole_design + path all_genes_summary + + output: + path("*"), emit: app + path "versions.yml", emit: versions + + script: + def is_using_containers = workflow.containerEngine ? 
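`DataManager` above is decorated with `functools.lru_cache`, which memoises the class call itself: every `DataManager()` in the callbacks and components returns the same cached instance. A minimal sketch of that singleton effect:

```python
from functools import lru_cache


@lru_cache(maxsize=None)
class Cached:
    def __init__(self) -> None:
        # Expensive setup (e.g. scanning a parquet file) runs only once.
        self.ready = True


assert Cached() is Cached()  # the same instance is returned on every call
```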
true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + mkdir -p data + mv ${all_counts} ${whole_design} ${all_genes_summary} data/ + cp -r ${moduleDir}/app/* . + + # as of Nextflow version 25.04.8, having these versions sent to the versions topic channel + # results in ERROR ~ No such file or directory: /.command.env + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$( python3 --version | sed "s/Python //" ) + dash: \$( python3 -c "import dash; print(dash.__version__)" ) + dash-extensions: \$( python3 -c "import dash_extensions; print(dash_extensions.__version__)" ) + dash-mantine-components: \$( python3 -c "import dash_mantine_components; print(dash_mantine_components.__version__)" ) + dash-ag-grid: \$( python3 -c "import dash_ag_grid; print(dash_ag_grid.__version__)" ) + polars: \$( python3 -c "import polars; print(polars.__version__)" ) + pandas: \$( python3 -c "import pandas; print(pandas.__version__)" ) + pyarrow: \$( python3 -c "import pyarrow; print(pyarrow.__version__)" ) + scipy: \$( python3 -c "import scipy; print(scipy.__version__)" ) + END_VERSIONS + + # trying to launch the app + # if the resulting exit code is not 124 (exit code of timeout) then there is an error + timeout 10 python -B app.py || exit_code=\$?; [ "\$exit_code" -eq 124 ] && exit 0 || exit 100 + """ + +} diff --git a/modules/local/dataset_statistics/environment.yml b/modules/local/dataset_statistics/environment.yml deleted file mode 100644 index 2e3af0bd..00000000 --- a/modules/local/dataset_statistics/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: quant_norm -channels: - - conda-forge -dependencies: - - conda-forge::python==3.12.8 - - conda-forge::pandas==2.2.3 - - conda-forge::scipy==1.15.0 - - conda-forge::pyarrow==19.0.0 diff --git a/modules/local/dataset_statistics/main.nf b/modules/local/dataset_statistics/main.nf deleted file mode 100644 index 4595a4d8..00000000 --- a/modules/local/dataset_statistics/main.nf +++ /dev/null @@ -1,36 +0,0 @@ -process DATASET_STATISTICS { - - label 'process_low' - - publishDir "${params.outdir}/dataset_statistics" - - tag "${meta.dataset}" - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
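The `timeout 10 python -B app.py` line above is a smoke test that treats 'still running when the timeout fires' (exit code 124) as success; roughly the same idea in Python, with the command as a hypothetical argument:

```python
import subprocess


def smoke_test(cmd: list[str], seconds: int = 10) -> bool:
    """True when the process outlives the timeout, i.e. the server came up."""
    try:
        subprocess.run(cmd, timeout=seconds)
    except subprocess.TimeoutExpired:
        return True  # killed by the timeout -> it was up and serving
    return False  # exited on its own -> startup failure


# e.g. smoke_test(["python", "-B", "app.py"])
```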
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5f/5fe497e7a739fa611fedd6f72ab9a3cf925873a5ded3188161fc85fd376b2c1c/data': - 'community.wave.seqera.io/library/pandas_pyarrow_python_scipy:7cad0d297a717147' }" - - input: - tuple val(meta), path(count_file) - - output: - tuple val(meta), path('*.dataset_stats.csv'), emit: stats - tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions - tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions - tuple val("${task.process}"), val('scipy'), eval('python3 -c "import scipy; print(scipy.__version__)"'), topic: versions - tuple val("${task.process}"), val('pyarrow'), eval('python3 -c "import pyarrow; print(pyarrow.__version__)"'), topic: versions - - - script: - """ - get_dataset_statistics.py --counts $count_file - """ - - - stub: - """ - touch count.cpm.dataset_stats.csv - """ - -} diff --git a/modules/local/deseq2/normalise/environment.yml b/modules/local/deseq2/normalise/environment.yml deleted file mode 100644 index 2ddc8904..00000000 --- a/modules/local/deseq2/normalise/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: deseq_normalise -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::r-base==4.3.3 - - bioconda::bioconductor-deseq2==1.42.0 - - conda-forge::r-optparse==1.7.5 diff --git a/modules/local/deseq2/normalise/main.nf b/modules/local/deseq2/normalise/main.nf deleted file mode 100644 index f8e9921e..00000000 --- a/modules/local/deseq2/normalise/main.nf +++ /dev/null @@ -1,34 +0,0 @@ -process DESEQ2_NORMALISE { - - label 'process_low' - - publishDir "${params.outdir}/normalisation/deseq2" - - tag "${meta.dataset}" - - // ignoring cases when the count dataframe gets empty after filtering (the script throws a 100 in this case) - // the subsequent steps will not be run for this dataset - errorStrategy { task.exitStatus == 100 ? 'ignore' : 'terminate' } - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/cef7164b168e74e5db11dcd9acf6172d47ed6753e4814c68f39835d0c6c22f6d/data': - 'community.wave.seqera.io/library/bioconductor-deseq2_r-base_r-optparse:c84cd7ffdb298fa7' }" - - input: - tuple val(meta), path(count_file) - - output: - tuple val(meta), path('*.cpm.csv'), emit: cpm - tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions - tuple val("${task.process}"), val('DESeq2'), eval('Rscript -e "cat(as.character(packageVersion(\'DESeq2\')))"'), topic: versions - - - script: - def design_file = meta.design - """ - deseq2_normalise.R --counts "$count_file" --design "$design_file" - """ - - -} diff --git a/modules/local/detect_rare_genes/environment.yml b/modules/local/detect_rare_genes/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/detect_rare_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/detect_rare_genes/main.nf b/modules/local/detect_rare_genes/main.nf new file mode 100644 index 00000000..2f45c375 --- /dev/null +++ b/modules/local/detect_rare_genes/main.nf @@ -0,0 +1,46 @@ +process DETECT_RARE_GENES { + + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + path(gene_id_mapping_file) + path(gene_id_occurrences_file) + val(nb_datasets) + val(min_occurrence_frequency) + val(min_occurrence_quantile) + + output: + path('valid_gene_ids.txt'), emit: valid_gene_ids + path('total_gene_id_occurrence_quantiles.csv'), topic: total_gene_id_occurrence_quantiles + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def is_using_containers = workflow.containerEngine ? 
true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + detect_rare_genes.py \\ + --occurrences $gene_id_occurrences_file \\ + --mappings $gene_id_mapping_file \\ + --nb-datasets $nb_datasets \\ + --min-occurrence-frequency $min_occurrence_frequency \\ + --min-occurrence-quantile $min_occurrence_quantile + + """ + + + stub: + """ + touch fake.validated_genes.txt + """ + +} diff --git a/modules/local/download_ensembl_annotation/environment.yml b/modules/local/download_ensembl_annotation/environment.yml new file mode 100644 index 00000000..95caa5f9 --- /dev/null +++ b/modules/local/download_ensembl_annotation/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.0 + - conda-forge::pandas==2.3.3 + - conda-forge::requests==2.32.5 + - conda-forge::tqdm==4.67.1 + - conda-forge::bs4==4.14.2 + - conda-forge::tenacity==9.1.2 diff --git a/modules/local/download_ensembl_annotation/main.nf b/modules/local/download_ensembl_annotation/main.nf new file mode 100644 index 00000000..59cbf722 --- /dev/null +++ b/modules/local/download_ensembl_annotation/main.nf @@ -0,0 +1,34 @@ +process DOWNLOAD_ENSEMBL_ANNOTATION { + + label 'process_single' + + tag "${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5f/5fa11d593e2f2d68c60acc6a00c812793112bff4691754c992fff6b038458604/data': + 'community.wave.seqera.io/library/bs4_pandas_requests_tenacity_tqdm:32f7387852168716' }" + + input: + val species + + output: + path "*.gff3.gz", emit: gff3 + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('bs4'), eval('python3 -c "import bs4; print(bs4.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + """ + download_latest_ensembl_annotation.py \\ + --species ${species} + """ + + stub: + """ + touch fake.gff3.gz.txt + """ + +} diff --git a/modules/local/download_ncbi_annotation/environment.yml b/modules/local/download_ncbi_annotation/environment.yml new file mode 100644 index 00000000..f644045d --- /dev/null +++ b/modules/local/download_ncbi_annotation/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.5 + - conda-forge::requests==2.32.5 + - conda-forge::tenacity==9.1.2 diff --git a/modules/local/download_ncbi_annotation/main.nf b/modules/local/download_ncbi_annotation/main.nf new file mode 100644 index 00000000..c59fce96 --- /dev/null +++ b/modules/local/download_ncbi_annotation/main.nf @@ -0,0 +1,33 @@ +process DOWNLOAD_NCBI_ANNOTATION { + + label 'process_single' + + tag 
"${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5c/5c28c8e613c062828aaee4b950029bc90a1a1aa94d5f61016a588c8ec7be8b65/data': + 'community.wave.seqera.io/library/pandas_requests_tenacity:5ba56df089a9d718' }" + + input: + val species + + output: + path "*.gff.gz", emit: gff + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions + + script: + """ + download_latest_ncbi_annotation.py \\ + --species ${species} + + gzip -n *.gff + """ + + stub: + """ + touch fake.gff3.gz.txt + """ + +} diff --git a/modules/local/edger/normalise/environment.yml b/modules/local/edger/normalise/environment.yml deleted file mode 100644 index d460b7cd..00000000 --- a/modules/local/edger/normalise/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: edger_normalise -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::r-base==4.3.3 - - bioconda::bioconductor-edger==4.0.16 - - conda-forge::r-optparse==1.7.5 diff --git a/modules/local/edger/normalise/main.nf b/modules/local/edger/normalise/main.nf deleted file mode 100644 index d25347ab..00000000 --- a/modules/local/edger/normalise/main.nf +++ /dev/null @@ -1,33 +0,0 @@ -process EDGER_NORMALISE { - - label 'process_low' - - publishDir "${params.outdir}/normalisation/edger" - - tag "${meta.dataset}" - - // ignoring cases when the count dataframe gets empty after filtering (the script throws a 100 in this case) - // the subsequent steps will not be run for this dataset - errorStrategy { task.exitStatus == 100 ? 'ignore' : 'terminate' } - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+    gzip -n *.gff
+    """
+
+    stub:
+    """
+    touch fake.gff.gz
+    """
+
+}
diff --git a/modules/local/edger/normalise/environment.yml b/modules/local/edger/normalise/environment.yml
deleted file mode 100644
index d460b7cd..00000000
--- a/modules/local/edger/normalise/environment.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-name: edger_normalise
-channels:
-  - conda-forge
-  - bioconda
-dependencies:
-  - conda-forge::r-base==4.3.3
-  - bioconda::bioconductor-edger==4.0.16
-  - conda-forge::r-optparse==1.7.5
diff --git a/modules/local/edger/normalise/main.nf b/modules/local/edger/normalise/main.nf
deleted file mode 100644
index d25347ab..00000000
--- a/modules/local/edger/normalise/main.nf
+++ /dev/null
@@ -1,33 +0,0 @@
-process EDGER_NORMALISE {
-
-    label 'process_low'
-
-    publishDir "${params.outdir}/normalisation/edger"
-
-    tag "${meta.dataset}"
-
-    // ignoring cases when the count dataframe gets empty after filtering (the script throws a 100 in this case)
-    // the subsequent steps will not be run for this dataset
-    errorStrategy { task.exitStatus == 100 ? 'ignore' : 'terminate' }
-
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/89/89bbc9544e18b624ed6d0a30e701cf8cec63e063cc9b5243e1efde362fe92228/data':
-        'community.wave.seqera.io/library/bioconductor-edger_r-base_r-optparse:400aaabddeea1574' }"
-
-    input:
-    tuple val(meta), path(count_file)
-
-    output:
-    tuple val(meta), path('*.cpm.csv'), emit: cpm
-    tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions
-    tuple val("${task.process}"), val('edgeR'), eval('Rscript -e "cat(as.character(packageVersion(\'edgeR\')))"'), topic: versions
-
-    script:
-    def design_file = meta.design
-    """
-    edger_normalise.R --counts "$count_file" --design "$design_file"
-    """
-
-}
diff --git a/modules/local/expressionatlas/getaccessions/environment.yml b/modules/local/expressionatlas/getaccessions/environment.yml
index a6e12662..207b56fc 100644
--- a/modules/local/expressionatlas/getaccessions/environment.yml
+++ b/modules/local/expressionatlas/getaccessions/environment.yml
@@ -1,9 +1,12 @@
-name: eatlas_get_accessions
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
 channels:
   - conda-forge
+  - bioconda
 dependencies:
-  - conda-forge::python==3.12.8
-  - conda-forge::requests==2.32.3
-  - conda-forge::nltk==3.9.1
-  - conda-forge::tenacity==9.0.0
-  - conda-forge::pandas==2.2.3
+  - conda-forge::python=3.13.5
+  - conda-forge::pandas==2.3.3
+  - conda-forge::requests==2.32.5
+  - conda-forge::tenacity==9.1.2
+  - conda-forge::pyyaml==6.0.3
+  - conda-forge::nltk==3.9.2
diff --git a/modules/local/expressionatlas/getaccessions/main.nf b/modules/local/expressionatlas/getaccessions/main.nf
index 100602fa..0ce4133c 100644
--- a/modules/local/expressionatlas/getaccessions/main.nf
+++ b/modules/local/expressionatlas/getaccessions/main.nf
@@ -1,41 +1,68 @@
 process EXPRESSIONATLAS_GETACCESSIONS {

-    label 'process_low'
+    label 'process_high'
+
+    tag "${species}"

     conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/e4/e459ae44332297f0429e7dd501bc3a6f9b5504b13e2db0002a5d3021cc9ac443/data':
-        'community.wave.seqera.io/library/nltk_pandas_python_requests_tenacity:a29bfda256e4f39f' }"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f2/f2219a174683388670dc0817da45717014aca444323027480f84aaaf12bfb460/data':
+        'community.wave.seqera.io/library/nltk_data_pandas_pyyaml_requests_tenacity:5f5f82f858433879' }"

     input:
     val species
     val keywords
+    val platform
+    val random_sampling_size
+    val random_sampling_seed

     output:
-    path 'accessions.txt', emit: txt
+    path "accessions.txt", optional: true, emit: accessions
+    env("SAMPLING_QUOTA"), emit: sampling_quota
+    path "selected_experiments.metadata.tsv", optional: true, topic: eatlas_selected_datasets
+    path "species_experiments.metadata.tsv", optional: true, topic: eatlas_all_datasets
+    //path "filtered_experiments.metadata.tsv", optional: true, topic: filtered_eatlas_experiment_metadata
+    //path "filtered_experiments.keywords.yaml", optional: true, topic: filtered_eatlas_experiment_keywords
     tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
     tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions
     tuple val("${task.process}"), val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions
-
+    tuple val("${task.process}"), val('pyyaml'), eval('python3 -c "import yaml; print(yaml.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions

     script:
     def keywords_string = keywords.split(',').collect { it.trim() }.join(' ')
-
-    // the folder where nltk will download data needs to be writable (necessary for singularity)
-    if (keywords_string == "") {
-        """
-        NLTK_DATA=$PWD get_eatlas_accessions.py --species $species
-        """
-    } else {
-        """
-        NLTK_DATA=$PWD get_eatlas_accessions.py --species $species --keywords $keywords_string
-        """
+    def args = " --species $species"
+    if ( keywords_string != "" ) {
+        args += " --keywords $keywords_string"
+    }
+    if ( platform ) {
+        args += " --platform $platform"
+    }
+    if ( random_sampling_size ) {
+        args += " --random-sampling-size $random_sampling_size"
     }
+    if ( random_sampling_seed ) {
+        args += " --random-sampling-seed $random_sampling_seed"
+    }
+    """
+    # the folder where nltk will download data needs to be writable (necessary for singularity)
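+    # (NLTK consults NLTK_DATA before its default search paths, so pointing it at the
+    # task work directory keeps downloads out of the read-only container image)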
+    export NLTK_DATA=\${PWD}
+
+    get_eatlas_accessions.py \\
+        $args \\
+        --cpus ${task.cpus}
+
+    SAMPLING_QUOTA=\$(cat sampling_quota.txt)
+    """

     stub:
     """
-    touch accessions.csv
+    touch accessions.txt \\
+        selected_experiments.metadata.tsv \\
+        species_experiments.metadata.tsv
+
+    SAMPLING_QUOTA="ok"
     """

 }
diff --git a/modules/local/expressionatlas/getdata/environment.yml b/modules/local/expressionatlas/getdata/environment.yml
index 156f457b..cdb6c8ed 100644
--- a/modules/local/expressionatlas/getdata/environment.yml
+++ b/modules/local/expressionatlas/getdata/environment.yml
@@ -1,8 +1,9 @@
-name: eatlas_get_data
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
 channels:
   - conda-forge
   - bioconda
 dependencies:
-  - conda-forge::r-base==4.3.3
-  - bioconda::bioconductor-expressionatlas==1.30.0
+  - conda-forge::r-base==4.4.3
   - conda-forge::r-optparse==1.7.5
+  - bioconda::bioconductor-expressionatlas==1.34.0
diff --git a/modules/local/expressionatlas/getdata/main.nf b/modules/local/expressionatlas/getdata/main.nf
index 4da386a8..267b3233 100644
--- a/modules/local/expressionatlas/getdata/main.nf
+++ b/modules/local/expressionatlas/getdata/main.nf
@@ -1,58 +1,31 @@
 process EXPRESSIONATLAS_GETDATA {

-    label 'process_low'
-
-    // limiting to 8 threads at a time to avoid 429 errors with the Expression Atlas API server
-    maxForks 8
+    label 'process_single'

     tag "$accession"

-    errorStrategy = {
-        if (task.exitStatus == 100) {
-            // ignoring accessions that cannot be retrieved from Expression Atlas (the script throws a 100 in this case)
-            // sometimes, some datasets are transiently unavailable from Expression Atlas:
-            // we ignore them as there is no point in trying again and again
-            // they will be available again soon but we can't know when
-            // for some other files, they are simply unavailable for good...
-            log.warn("Could not retrieve data for accession ${accession}. This could be a transient network issue or a permission error.")
-            return 'ignore'
-        } else if (task.exitStatus == 101) {
-            // some datasets are not associated with experiment summary
-            // we ignore them as there they would be useless for us
-            log.warn("Failure to download whole dataset for accession ${accession}. No experiment summary found.")
-            return 'ignore'
-        } else if (task.exitStatus == 102) {
-            // unhandled error: we print an extra message to warn the user
-            log.warn("Unhandled error occurred with accession: ${accession}")
-            return 'ignore'
-        } else if (task.exitStatus == 137) { // override default behaviour to sleep some time before retry
-            // in case of OOM errors, we wait a bit and try again
-            sleep(Math.pow(2, task.attempt) * 2000 as long)
-            return 'retry'
-        } else {
-            return 'terminate'
-        }
-    }
-    maxRetries = 5
+    maxForks 8 // limit to 8 concurrent tasks to avoid 429 errors from the Expression Atlas API server

     conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
         'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7f/7fd21450c3a3f7df37fa0480170780019e9686be319da1c9e10712f7f17cca26/data':
         'community.wave.seqera.io/library/bioconductor-expressionatlas_r-base_r-optparse:ca0f8cd9d3f44af9' }"

     input:
-    val(accession)
+    val accession

     output:
-    path "*.design.csv", emit: design
-    path "*.counts.csv", emit: counts
+    path("*.counts.csv"), optional: true, emit: counts
+    path("*.design.csv"), optional: true, emit: design
+    tuple val(accession), path("failure_reason.txt"), optional: true, topic: eatlas_failure_reason
+    tuple val(accession), path("warning_reason.txt"), optional: true, topic: eatlas_warning_reason
     tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions
     tuple val("${task.process}"), val('ExpressionAtlas'), eval('Rscript -e "cat(as.character(packageVersion(\'ExpressionAtlas\')))"'), topic: versions
-
     script:
     """
-    get_eatlas_data.R --accession $accession
+    download_eatlas_data.R --accession $accession
     """

     stub:
diff --git a/modules/local/filter_and_rename_genes/environment.yml b/modules/local/filter_and_rename_genes/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/filter_and_rename_genes/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/filter_and_rename_genes/main.nf b/modules/local/filter_and_rename_genes/main.nf
new file mode 100644
index 00000000..684c2f04
--- /dev/null
+++ b/modules/local/filter_and_rename_genes/main.nf
@@ -0,0 +1,52 @@
+process FILTER_AND_RENAME_GENES {
+
+    label 'process_low'
+
+    tag "${meta.dataset}"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    tuple val(meta), path(count_file)
+    path gene_id_mapping_file
+    path valid_gene_ids_file
+
+    output:
+    tuple val(meta), path('*.renamed.parquet'), optional: true, emit: counts
+    tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: renaming_failure_reason
+    tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: renaming_warning_reason
+    tuple val(meta.dataset), env("NB_FINAL"), env("NB_MERGED"), env("NB_NOT_VALID"), env("NB_UNMAPPED"), topic: id_mapping_stats
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def mapping_arg = gene_id_mapping_file ? "--mappings $gene_id_mapping_file" : ""
+    def valid_ids_arg = valid_gene_ids_file ? "--valid-gene-ids $valid_gene_ids_file" : ""
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    filter_and_rename_genes.py \\
+        --count-file "$count_file" \\
+        $mapping_arg \\
+        $valid_ids_arg
+
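+    # each count below is read back into a shell variable so the env(...) outputs above
+    # can capture it (assumes filter_and_rename_genes.py writes these four one-value files)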
"${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + path count_file + path files + + output: + path 'm_measures.csv', emit: m_measures + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + def is_using_containers = workflow.containerEngine ? true : false + """ + # limiting number of threads when using conda / micromamba + if [ "${is_using_containers}" == "false" ]; then + export POLARS_MAX_THREADS=${task.cpus} + fi + + compute_m_measures.py \\ + --counts $count_file \\ + --std-files "$files" $args + """ + +} diff --git a/modules/local/genorm/cross_join/environment.yml b/modules/local/genorm/cross_join/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/genorm/cross_join/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/genorm/cross_join/main.nf b/modules/local/genorm/cross_join/main.nf new file mode 100644 index 00000000..9c6f4f5b --- /dev/null +++ b/modules/local/genorm/cross_join/main.nf @@ -0,0 +1,35 @@ +process CROSS_JOIN { + + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + tuple val(meta), path("count_chunk_file_1"), path("count_chunk_file_2") + + output: + path 'cross_join.*.parquet', emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + def is_using_containers = workflow.containerEngine ? 
+    compute_m_measures.py \\
+        --counts $count_file \\
+        --std-files "$files" $args
+    """
+
+}
diff --git a/modules/local/genorm/cross_join/environment.yml b/modules/local/genorm/cross_join/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/genorm/cross_join/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/genorm/cross_join/main.nf b/modules/local/genorm/cross_join/main.nf
new file mode 100644
index 00000000..9c6f4f5b
--- /dev/null
+++ b/modules/local/genorm/cross_join/main.nf
@@ -0,0 +1,35 @@
+process CROSS_JOIN {
+
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    tuple val(meta), path("count_chunk_file_1"), path("count_chunk_file_2")
+
+    output:
+    path 'cross_join.*.parquet', emit: data
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def args = "--task-attempts ${task.attempt}"
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    make_cross_join.py \\
+        --file1 count_chunk_file_1 \\
+        --file2 count_chunk_file_2 \\
+        --index1 ${meta.index_1} \\
+        --index2 ${meta.index_2} $args
+    """
+
+}
diff --git a/modules/local/genorm/expression_ratio/environment.yml b/modules/local/genorm/expression_ratio/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/genorm/expression_ratio/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/genorm/expression_ratio/main.nf b/modules/local/genorm/expression_ratio/main.nf
new file mode 100644
index 00000000..5299a591
--- /dev/null
+++ b/modules/local/genorm/expression_ratio/main.nf
@@ -0,0 +1,31 @@
+process EXPRESSION_RATIO {
+
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    path file
+
+    output:
+    path 'ratios.*.parquet', emit: data
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def args = "--task-attempts ${task.attempt}"
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    make_pairwise_gene_expression_ratio.py --file $file $args
+    """
+
+}
diff --git a/modules/local/genorm/make_chunks/environment.yml b/modules/local/genorm/make_chunks/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/genorm/make_chunks/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/genorm/make_chunks/main.nf b/modules/local/genorm/make_chunks/main.nf
new file mode 100644
index 00000000..ea17520f
--- /dev/null
+++ b/modules/local/genorm/make_chunks/main.nf
@@ -0,0 +1,31 @@
+process MAKE_CHUNKS {
+
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    path count_file
+
+    output:
+    path 'count_chunk.*.parquet', emit: chunks
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def args = "--task-attempts ${task.attempt}"
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    make_parquet_chunks.py --counts $count_file $args
+    """
+
+}
diff --git a/modules/local/genorm/ratio_standard_variation/environment.yml b/modules/local/genorm/ratio_standard_variation/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/genorm/ratio_standard_variation/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/genorm/ratio_standard_variation/main.nf b/modules/local/genorm/ratio_standard_variation/main.nf
new file mode 100644
index 00000000..624c26a8
--- /dev/null
+++ b/modules/local/genorm/ratio_standard_variation/main.nf
@@ -0,0 +1,31 @@
+process RATIO_STANDARD_VARIATION {
+
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    path file
+
+    output:
+    path 'std.*.parquet', emit: data
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def args = "--task-attempts ${task.attempt}"
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    get_ratio_standard_variation.py --file $file $args
+    """
+
+}
diff --git a/modules/local/geo/getaccessions/environment.yml b/modules/local/geo/getaccessions/environment.yml
new file mode 100644
index 00000000..91b4f94e
--- /dev/null
+++ b/modules/local/geo/getaccessions/environment.yml
@@ -0,0 +1,14 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.13.7
+  - conda-forge::pandas==2.3.3
+  - conda-forge::requests==2.32.5
+  - conda-forge::tenacity==9.1.2
+  - conda-forge::nltk==3.9.2
+  - conda-forge::tqdm==4.67.1
+  - conda-forge::xmltodict==1.0.2
+  - conda-forge::biopython==1.86
diff --git a/modules/local/geo/getaccessions/main.nf b/modules/local/geo/getaccessions/main.nf
new file mode 100644
index 00000000..bf8ac3e9
--- /dev/null
+++ b/modules/local/geo/getaccessions/main.nf
@@ -0,0 +1,73 @@
+process GEO_GETACCESSIONS {
+
+    label 'process_high'
+
+    tag "${species}"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ca/caae35ec5dc72367102a616a47b6f1a7b3de9ff272422f2c08895b8bb5f0566c/data':
+        'community.wave.seqera.io/library/biopython_nltk_pandas_parallelbar_pruned:5fc501b07f8e0428' }"
+
+    input:
+    val species
+    val keywords
+    val platform
+    path excluded_accessions_file
+    val random_sampling_size
+    val random_sampling_seed
+
+    output:
+    path "accessions.txt", optional: true, emit: accessions
+    path "geo_selected_datasets.metadata.tsv", optional: true, topic: geo_selected_datasets
+    path "geo_all_datasets.metadata.tsv", optional: true, topic: geo_all_datasets
+    path "geo_rejected_datasets.metadata.tsv", optional: true, topic: geo_rejected_datasets
+
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('biopython'), eval('python3 -c "import Bio; print(Bio.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions
+
+    script:
+    def keywords_string = keywords.split(',').collect { it.trim() }.join(' ')
+    def args = " --species $species"
+    if ( keywords_string != "" ) {
+        args += " --keywords $keywords_string"
+    }
+    if ( platform ) {
+        args += " --platform $platform"
+    }
+    if ( excluded_accessions_file ) {
+        args += " --exclude-accessions-in $excluded_accessions_file"
+    }
+    if ( random_sampling_size ) {
+        args += " --random-sampling-size $random_sampling_size"
+    }
+    if ( random_sampling_seed ) {
+        args += " --random-sampling-seed $random_sampling_seed"
+    }
+    // the folder where nltk will download data needs to be writable (necessary for singularity)
+    """
+    # the Entrez module from biopython automatically stores temp results in /.config
+    # if this directory is not writable, the script fails
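+    # (Bio.Entrez caches NCBI DTD/XSD files under \$HOME/.config/biopython, so HOME must
+    # point at a writable location inside the container)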
+    export HOME=/tmp/biopython
+    mkdir -p /tmp/biopython
+
+    export NLTK_DATA=\${PWD}
+
+    get_geo_dataset_accessions.py \\
+        $args \\
+        --cpus ${task.cpus}
+    """
+
+    stub:
+    """
+    touch accessions.txt \\
+        geo_selected_datasets.metadata.tsv \\
+        geo_all_datasets.metadata.tsv \\
+        geo_rejected_datasets.metadata.tsv
+    """
+
+}
diff --git a/modules/local/geo/getdata/environment.yml b/modules/local/geo/getdata/environment.yml
new file mode 100644
index 00000000..1a45cb95
--- /dev/null
+++ b/modules/local/geo/getdata/environment.yml
@@ -0,0 +1,10 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::r-base==4.4.3
+  - conda-forge::r-optparse==1.7.5
+  - conda-forge::r-dplyr==1.1.4
+  - bioconda::bioconductor-geoquery==2.74.0
diff --git a/modules/local/geo/getdata/main.nf b/modules/local/geo/getdata/main.nf
new file mode 100644
index 00000000..99040abd
--- /dev/null
+++ b/modules/local/geo/getdata/main.nf
@@ -0,0 +1,41 @@
+process GEO_GETDATA {
+
+    label 'process_single'
+
+    tag "$accession"
+
+    maxForks 8 // limit to 8 concurrent tasks to avoid 429 errors from the NCBI server
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/4c/4cb08d96e62942e7b6288abf2cfd30e813521a022459700e610325a3a7c0b1c8/data':
+        'community.wave.seqera.io/library/bioconductor-geoquery_r-base_r-dplyr_r-optparse:fcd002470b7d6809' }"
+
+    input:
+    val accession
+    val species
+
+    output:
+    path("*.counts.csv"), optional: true, emit: counts
+    path("*.design.csv"), optional: true, emit: design
+    path("rejected/**"), optional: true, emit: rejected
+    tuple val(accession), path("failure_reason.txt"), optional: true, topic: geo_failure_reason
+    tuple val(accession), path("warning_reason.txt"), optional: true, topic: geo_warning_reason
+    tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions
+    tuple val("${task.process}"), val('GEOquery'), eval('Rscript -e "cat(as.character(packageVersion(\'GEOquery\')))"'), topic: versions
+    tuple val("${task.process}"), val('dplyr'), eval('Rscript -e "cat(as.character(packageVersion(\'dplyr\')))"'), topic: versions
+
+    script:
+    """
+    download_geo_data.R \\
+        --accession $accession \\
+        --species $species
+    """
+
+    stub:
+    """
+    touch acc.microarray.normalised.counts.csv
+    touch acc.design.csv
+    """
+
+}
diff --git a/modules/local/get_candidate_genes/environment.yml b/modules/local/get_candidate_genes/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/get_candidate_genes/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/get_candidate_genes/main.nf b/modules/local/get_candidate_genes/main.nf
new file mode 100644
index 00000000..194585ae
--- /dev/null
+++ b/modules/local/get_candidate_genes/main.nf
@@ -0,0 +1,38 @@
+process GET_CANDIDATE_GENES {
+
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    path count_file
+    path stat_file
+    val candidate_selection_descriptor
+    val nb_most_stable_genes
+    val min_pct_quantile_expr_level
+
+    output:
+    path 'candidate_counts.parquet', emit: counts
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    get_candidate_genes.py \\
+        --counts $count_file \\
+        --stats $stat_file \\
+        --candidate_selection_descriptor $candidate_selection_descriptor \\
+        --nb-top-stable-genes $nb_most_stable_genes \\
+        --min-pct-quantile-expr-level $min_pct_quantile_expr_level
+    """
+
+}
diff --git a/modules/local/gprofiler/idmapping/environment.yml b/modules/local/gprofiler/idmapping/environment.yml
index 8329e64b..02344ca8 100644
--- a/modules/local/gprofiler/idmapping/environment.yml
+++ b/modules/local/gprofiler/idmapping/environment.yml
@@ -1,7 +1,10 @@
-name: gprofiler_idmapping
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
 channels:
   - conda-forge
+  - bioconda
 dependencies:
-  - conda-forge::python==3.12.8
-  - conda-forge::pandas==2.2.3
-  - conda-forge::requests==2.32.3
+  - conda-forge::python=3.13.5
+  - conda-forge::pandas==2.3.3
+  - conda-forge::requests==2.32.5
+  - conda-forge::tenacity==9.1.2
diff --git a/modules/local/gprofiler/idmapping/main.nf b/modules/local/gprofiler/idmapping/main.nf
index f859bb68..64106af7 100644
--- a/modules/local/gprofiler/idmapping/main.nf
+++ b/modules/local/gprofiler/idmapping/main.nf
@@ -1,65 +1,53 @@
 process GPROFILER_IDMAPPING {
-
-    label 'process_low'
-
-    publishDir "${params.outdir}/idmapping"
-
-    tag "${meta.dataset}"
-
-    // limiting to 8 threads at a time to avoid 429 errors with the G Profiler API server
-    maxForks 8
-
-    errorStrategy = {
-        if (task.exitStatus == 100) {
-            // ignoring cases when the count dataframe is empty
-            log.warn("Count file is empty for dataset ${meta.dataset}.")
-            return 'ignore'
-        } else if (task.exitStatus == 101) {
-            // likewise, when no mapping could be found, we do not want to continue with the subsequent steps for this specific dataset
-            log.warn("Could not map gene IDs to Ensembl for dataset ${meta.dataset}.")
-            return 'ignore'
-        } else if (task.exitStatus == 102) {
-            // if the server appears to be down, we stop immediately
-            log.error("gProfiler server appears to be down, stopping pipeline")
-            return 'terminate'
-        } else {
-            return 'terminate'
+    label 'process_medium'
+
+    tag "${species} IDs to ${gprofiler_target_db}"
+
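+    // dynamic retry policy: exit status 100 is treated as fatal; OOM-like exit codes are
+    // retried with exponential backoff (Math.pow(2, task.attempt) * 200 ms, i.e. 400 ms,
+    // 800 ms, 1600 ms, ...) for up to 10 attempts, and any other failure gets the same
+    // backoff for up to 3 attempts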
+    errorStrategy {
+        if (task.exitStatus == 100 ) {
+            log.error("Could not map gene IDs to ${gprofiler_target_db} database.")
+            'terminate'
+        } else if (task.exitStatus in ((130..145) + 104 + 175) && task.attempt <= 10) { // OOM & related errors; should be retried as long as memory does not fit
+            sleep(Math.pow(2, task.attempt) * 200 as long)
+            'retry'
+        } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3
+            sleep(Math.pow(2, task.attempt) * 200 as long)
+            'retry'
+        } else { // after 3 attempts, stop retrying and let the rest of the run finish
+            'finish'
+        }
     }

     conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/aa/aad4e61f15d97b7c0a24a4e3ee87a11552464fb7110f530e43bdc9acc374cf13/data':
-        'community.wave.seqera.io/library/pandas_python_requests:8c6da05a2935a952' }"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5c/5c28c8e613c062828aaee4b950029bc90a1a1aa94d5f61016a588c8ec7be8b65/data':
+        'community.wave.seqera.io/library/pandas_requests_tenacity:5ba56df089a9d718' }"

     input:
-    tuple val(meta), path(count_file), val(species)
-    val gene_id_mapping_file
+    path gene_id_file
+    val species
+    val gprofiler_target_db

     output:
-    tuple val(meta), path('*.renamed.csv'), emit: renamed
-    path('*.metadata.csv'), optional: true, emit: metadata
-    path('*.mapping.csv'), optional: true, emit: mapping
+    path('mapped_gene_ids.csv'), emit: mapping
+    path('gene_metadata.csv'), emit: metadata
     tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
     tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions
     tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions

     script:
-    def custom_mapping_arg = gene_id_mapping_file ? "--custom-mappings $gene_id_mapping_file" : ""
     """
-    map_ids_to_ensembl.py \
-        --count-file "$count_file" \
-        --species "$species" \
-        $custom_mapping_arg
+    gprofiler_map_ids.py \\
+        --gene-ids $gene_id_file \\
+        --species "$species" \\
+        --target-db "$gprofiler_target_db"
     """

     stub:
     """
-    touch fake_renamed.csv
-    touch fake_metadata.csv
-    touch fake_mapping.json
+    touch mapped_gene_ids.csv
+    touch gene_metadata.csv
     """

 }
diff --git a/modules/local/merge_counts/environment.yml b/modules/local/merge_counts/environment.yml
new file mode 100644
index 00000000..fc0aa746
--- /dev/null
+++ b/modules/local/merge_counts/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.14.0
+  - conda-forge::polars==1.35.2
+  - conda-forge::tqdm==4.67.1
diff --git a/modules/local/merge_counts/main.nf b/modules/local/merge_counts/main.nf
new file mode 100644
index 00000000..94858c45
--- /dev/null
+++ b/modules/local/merge_counts/main.nf
@@ -0,0 +1,32 @@
+process MERGE_COUNTS {
+
+    tag "${meta.platform}"
+    label "process_high"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/90/90617e987f709570820b8e7752baf9004ba85917111425d4b44b429b27b201ca/data':
+        'community.wave.seqera.io/library/polars_tqdm:54b124dde91d1bf3' }"
+
+    input:
+    tuple val(meta), path(count_files, stageAs: "?/*")
+
+    output:
+    tuple val(meta), path('all_counts.parquet'), emit: counts
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions
+
+    script:
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    merge_counts.py \\
+        --counts "$count_files"
+    """
+
+}
diff --git a/modules/local/merge_data/environment.yml b/modules/local/merge_data/environment.yml
deleted file mode 100644
index 8ad3830b..00000000
--- a/modules/local/merge_data/environment.yml
+++ /dev/null
@@ -1,6 +0,0 @@
-name: merge_data
-channels:
-  - conda-forge
-dependencies:
-  - conda-forge::python==3.12.8
-  - conda-forge::polars==1.17.1
diff --git a/modules/local/merge_data/main.nf b/modules/local/merge_data/main.nf
deleted file mode 100644
index 6507e5c2..00000000
--- a/modules/local/merge_data/main.nf
+++ /dev/null
@@ -1,39 +0,0 @@
-process MERGE_DATA {
-
-    label 'process_low'
-
-    publishDir "${params.outdir}/merged_data"
-
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
-        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
-
-    input:
-    path count_files, stageAs: "?/*"
-    path design_files, stageAs: "?/*"
-    path dataset_stat_files, stageAs: "?/*"
-    val nb_candidate_genes
-
-    output:
-    path 'all_counts.parquet', emit: all_counts
-    path 'all_designs.csv', emit: all_designs
-    path 'gene_count_statistics.csv', emit: gene_count_statistics
-    path 'skewness_statistics.csv', emit: skewness_statistics
-    path 'ks_test_statistics.csv', emit: ks_test_statistics
-    path 'candidate_gene_counts.parquet', emit: candidate_gene_counts
-    path 'distribution_correlations.csv', emit: distribution_correlations
-    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
-    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
-
-    script:
-    """
-    merge_data.py \
-        --counts "$count_files" \
-        --designs "$design_files" \
-        --stats "$dataset_stat_files" \
-        --nb-candidate-genes $nb_candidate_genes
-    """
-
-}
diff --git a/modules/local/normalisation/compute_cpm/environment.yml b/modules/local/normalisation/compute_cpm/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/normalisation/compute_cpm/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/normalisation/compute_cpm/main.nf b/modules/local/normalisation/compute_cpm/main.nf
new file mode 100644
index 00000000..5045c4f4
--- /dev/null
+++ b/modules/local/normalisation/compute_cpm/main.nf
@@ -0,0 +1,29 @@
+process NORMALISATION_COMPUTE_CPM {
+
+    label 'process_single'
+
+    tag "${meta.dataset}"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    tuple val(meta), path(count_file)
+
+    output:
+    tuple val(meta), path('*.cpm.parquet'), optional: true, emit: counts
+    tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason
+    tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    """
+    compute_cpm.py \\
+        --counts $count_file
+    """
+
+}
diff --git a/modules/local/normalisation/compute_tpm/environment.yml b/modules/local/normalisation/compute_tpm/environment.yml
new file mode 100644
index 00000000..f0eaf3dd
--- /dev/null
+++ b/modules/local/normalisation/compute_tpm/environment.yml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.12.8
+  - conda-forge::polars==1.35.2
diff --git a/modules/local/normalisation/compute_tpm/main.nf b/modules/local/normalisation/compute_tpm/main.nf
new file mode 100644
index 00000000..c5b57203
--- /dev/null
+++ b/modules/local/normalisation/compute_tpm/main.nf
@@ -0,0 +1,31 @@
+process NORMALISATION_COMPUTE_TPM {
+
+    label 'process_single'
+
+    tag "${meta.dataset}"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    tuple val(meta), path(count_file)
+    path gene_lengths_file
+
+    output:
+    tuple val(meta), path('*.tpm.parquet'), optional: true, emit: counts
+    tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason
+    tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    """
+    compute_tpm.py \\
+        --counts $count_file \\
+        --gene-lengths $gene_lengths_file
+    """
+
+}
diff --git a/modules/local/normfinder/environment.yml b/modules/local/normfinder/environment.yml
new file mode 100644
index 00000000..8e9b3ad3
--- /dev/null
+++ b/modules/local/normfinder/environment.yml
@@ -0,0 +1,11 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::python=3.13.7
+  - conda-forge::polars==1.35.2
+  - conda-forge::tqdm==4.67.1
+  - conda-forge::numpy==2.3.5
+  - conda-forge::numba==0.62.1
diff --git a/modules/local/normfinder/main.nf b/modules/local/normfinder/main.nf
new file mode 100644
index 00000000..57ce0dc8
--- /dev/null
+++ b/modules/local/normfinder/main.nf
@@ -0,0 +1,38 @@
+process NORMFINDER {
+
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0e/0e0445114887dd260f1632afe116b1e81e02e1acc74a86adca55099469b490d9/data':
+        'community.wave.seqera.io/library/numba_numpy_polars_tqdm:6923cfab6fc04dec' }"
+
+    input:
+    path count_file
+    path design_file
+
+    output:
+    path('stability_values.normfinder.csv'), emit: stability_values
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    normfinder.py \\
+        --counts $count_file \\
+        --design $design_file
+    """
+
+    stub:
+    """
+    touch stability_values.normfinder.csv
+    """
+
+}
diff --git a/modules/local/old/clean_count_data/main.nf b/modules/local/old/clean_count_data/main.nf
new file mode 100644
index 00000000..0500f78a
--- /dev/null
+++ b/modules/local/old/clean_count_data/main.nf
@@ -0,0 +1,36 @@
+process CLEAN_COUNT_DATA {
+
+    label 'process_single'
+
+    tag "${meta.dataset}"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data':
+        'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }"
+
+    input:
+    tuple val(meta), path(count_file), path(ks_stats_file)
+    val ks_pvalue_threshold
+
+    output:
+    tuple val(meta), path('cleaned_counts_filtered.parquet'), optional: true, emit: counts
+    tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: clean_count_failure_reason
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions
+
+    script:
+    def is_using_containers = workflow.containerEngine ? true : false
+    """
+    # limiting number of threads when using conda / micromamba
+    if [ "${is_using_containers}" == "false" ]; then
+        export POLARS_MAX_THREADS=${task.cpus}
+    fi
+
+    clean_count_data.py \\
+        --counts $count_file \\
+        --ks-stats $ks_stats_file \\
+        --ks-pvalue-threshold $ks_pvalue_threshold
+    """
+
+}
diff --git a/modules/local/old/clean_count_data/spec-file.txt b/modules/local/old/clean_count_data/spec-file.txt
new file mode 100644
index 00000000..1bf5d691
--- /dev/null
+++ b/modules/local/old/clean_count_data/spec-file.txt
@@ -0,0 +1,40 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
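+# example usage (the environment name is arbitrary): conda create --name clean_count_data --file spec-file.txt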
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda#3cd1a7238a0dd3d0860fdefc496cc854
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda#9e60c55e725c20d23125a5f0dd69af5d
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda#e66f2b8ad787e7beb0f846e4bd7e8493
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553
+https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.6.15-hbd8a1cb_0.conda#72525f07d72806e3b639ad4504c30ce5
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_0.conda#e31316a586cac398b1fcdb10ace786b9
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda#530566b68c3b8ce7eec4cd047eae19fe
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda#bfbca721fd33188ef923dfe9ba172f29
+https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_0.conda#323dc8f259224d13078aaf7ce96c3efe
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-32_h59b9bed_openblas.conda#2af9f3d5c2e39f417ce040f5a35c40c6
+https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_he106b2a_openblas.conda#3d3f9355e52f269cd8bc2c440d8a5263
+https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0
+https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_h7ac8fdf_openblas.conda#6c3f04ccb6c578138e9f9899da0bd714
+https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda#1a580f7796c7bf6393fddb8bbbde58dc
+https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hb9d3cd8_1.conda#d864d34357c3b65a4b731f78c0801dc4
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.2-h6cd9bfd_0.conda#b04c7eda6d7dab1e6503135e7fad4d25
+https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_3.conda#6d11a5edae89fe413c0569f16d308f5a
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b
+https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7
+https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.1-h7b32b05_0.conda#c87df2ab1448ba69169652ab9547082d
+https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446
+https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8
+https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a
+https://conda.anaconda.org/conda-forge/linux-64/python-3.12.8-h9e4cc4f_1_cpython.conda#7fd2fd79436d9b473812f14e86746844
+https://conda.anaconda.org/conda-forge/noarch/python_abi-3.12-7_cp312.conda#0dfcdc155cf23812a0c9deada86fb723
+https://conda.anaconda.org/conda-forge/linux-64/numpy-2.3.1-py312h6cf2f7f_0.conda#7e086a30150af2536a1059885368dcf0
+https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9
+https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e
+https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986
+https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c
+https://conda.anaconda.org/conda-forge/linux-64/polars-1.17.1-py312hda0fa55_1.conda#d9d77bfc286b6044dc045d1696c6acdc
diff --git a/modules/local/old/deseq2/main.nf b/modules/local/old/deseq2/main.nf
new file mode 100644
index 00000000..18038106
--- /dev/null
+++ b/modules/local/old/deseq2/main.nf
@@ -0,0 +1,30 @@
+process NORMALISATION_DESEQ2 {
+
+    label 'process_single'
+
+    tag "${meta.dataset}"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/cef7164b168e74e5db11dcd9acf6172d47ed6753e4814c68f39835d0c6c22f6d/data':
+        'community.wave.seqera.io/library/bioconductor-deseq2_r-base_r-optparse:c84cd7ffdb298fa7' }"
+
+    input:
+    tuple val(meta), path(count_file), path(design_file)
+
+    output:
+    tuple val(meta), path('*.cpm.csv'), optional: true, emit: cpm
+    tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason
+    tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason
+    tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions
+    tuple val("${task.process}"), val('DESeq2'), eval('Rscript -e "cat(as.character(packageVersion(\'DESeq2\')))"'), topic: versions
+
+    script:
+    """
+    normalise_with_deseq2.R \\
+        --counts $count_file \\
+        --design $design_file
+    """
+
+}
diff --git a/modules/local/old/deseq2/spec-file.txt b/modules/local/old/deseq2/spec-file.txt
new file mode 100644
index 00000000..ab36fbb1
--- /dev/null
+++ b/modules/local/old/deseq2/spec-file.txt
@@ -0,0 +1,190 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda#3cd1a7238a0dd3d0860fdefc496cc854
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
+https://conda.anaconda.org/conda-forge/noarch/_r-mutex-1.0.1-anacondar_1.tar.bz2#19f9db5f4f1b7f5ef5f6d67207f25f38
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda#9e60c55e725c20d23125a5f0dd69af5d
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda#e66f2b8ad787e7beb0f846e4bd7e8493
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_0.conda#e31316a586cac398b1fcdb10ace786b9
+https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0
+https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85
+https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda#1a580f7796c7bf6393fddb8bbbde58dc
+https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.2-h6cd9bfd_0.conda#b04c7eda6d7dab1e6503135e7fad4d25
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7
+https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.6.15-hbd8a1cb_0.conda#72525f07d72806e3b639ad4504c30ce5
+https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.1-h7b32b05_0.conda#c87df2ab1448ba69169652ab9547082d
+https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-7_cp313.conda#e84b44e6300f1703cb25d29120c5b1d8
+https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda#89e07d92cf50743886f41638d58c4328 +https://conda.anaconda.org/conda-forge/noarch/argcomplete-3.6.2-pyhd8ed1ab_0.conda#eb9d4263271ca287d2e0cf5a86da2d3a +https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda#ad8527bf134a90e1c9ed35fa0b64318c +https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda#460eba7851277ec1fd80a1a24080787a +https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.44-h4bf12b8_0.conda#7a1b5c3fbc0419961eaed361eedc90d4 +https://conda.anaconda.org/conda-forge/linux-64/bwidget-1.10.1-ha770c72_1.conda#983b92277d78c0d0ec498e460caa0e6d +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.50-h943b412_0.conda#51de14db340a848869e69c632b43cca7 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_3.conda#6d11a5edae89fe413c0569f16d308f5a +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_3.conda#57541755b5a51691955012b8e197c06c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 
+https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.2-h29eaf8c_0.conda#39b4228a867772d610c02e06f939a5b8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda#19e57602824042dfd0446292ef90488b +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda#45f6713cb00f124af300342512219182 +https://conda.anaconda.org/conda-forge/linux-64/curl-8.14.1-h332b0f4_0.conda#60279087a10b4ab59a70daa838894e4b +https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-15.1.0-h4c094af_103.conda#ea67e87d658d31dc33818f9574563269 +https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-15.1.0-h97b714f_3.conda#bbcff9bf972a0437bea8e431e4b327bb +https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-15.1.0-h4393ad2_3.conda#f39f96280dd8b1ec8cbd395a3d3fdd1e +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda#530566b68c3b8ce7eec4cd047eae19fe +https://conda.anaconda.org/conda-forge/linux-64/gfortran_impl_linux-64-15.1.0-h3b9cdf2_3.conda#649c5fe0593a880702e434bc375f3e8a +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda#bfbca721fd33188ef923dfe9ba172f29 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_0.conda#323dc8f259224d13078aaf7ce96c3efe +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-32_h59b9bed_openblas.conda#2af9f3d5c2e39f417ce040f5a35c40c6 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_he106b2a_openblas.conda#3d3f9355e52f269cd8bc2c440d8a5263 +https://conda.anaconda.org/conda-forge/linux-64/gsl-2.7-he838d99_0.tar.bz2#fec079ba39c9cca093bf4c00001825de +https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-15.1.0-h4c094af_103.conda#83bbc814f0aeccccb5ea10267bea0d2e +https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-15.1.0-h6a1bac1_3.conda#d71cc504fcfdbee8dd7925ebb9c2bf85 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_3.conda#6e5d0574e57a38c36e674e9a18eee2b4 
+https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_h7ac8fdf_openblas.conda#6c3f04ccb6c578138e9f9899da0bd714 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/make-4.4.1-hb9d3cd8_2.conda#33405d2a66b1411db9f7242c8b97c9e7 +https://conda.anaconda.org/conda-forge/linux-64/fribidi-1.0.10-h36c2ea0_0.tar.bz2#ac7bc6a654f8f41b352b38f4051135f8 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.14-h5888daf_0.conda#951ff8d9e5536896408e89d63230b8d5 +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 +https://conda.anaconda.org/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda#79f71230c069a287efe3a8614069ddf1 +https://conda.anaconda.org/conda-forge/linux-64/sed-4.9-h6688a6e_0.conda#171afc5f7ca0408bbccbcb69ade85f92 +https://conda.anaconda.org/conda-forge/linux-64/tktable-2.10-h8d826fa_7.conda#3ac51142c19ba95ae0fadefa333c9afb +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxt-1.3.1-hb9d3cd8_0.conda#279b0de5f6ba95457190a1c459a64e31 +https://conda.anaconda.org/conda-forge/linux-64/r-base-4.3.3-h65010dc_18.conda#721ea26859f44b206b0146eae8444657 +https://conda.anaconda.org/bioconda/noarch/bioconductor-biocgenerics-0.48.1-r43hdfd78af_2.tar.bz2#a313dd8a932cfd178fad2f3e7e6a6184 +https://conda.anaconda.org/conda-forge/linux-64/oniguruma-6.9.10-hb9d3cd8_0.conda#6ce853cb231f18576d2db5c2d4cb473e +https://conda.anaconda.org/conda-forge/linux-64/jq-1.8.1-h73b1eb8_0.conda#2714e43bfc035f7ef26796632aa1b523 +https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2#4cb3ad778ec2d5a7acbdf254eb1c42ae +https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py313h8060acc_2.conda#50992ba61a8a1f8c2d346168ae1c86df +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomlkit-0.13.3-pyha770c72_0.conda#146402bf0f11cbeb8f781fa4309a95d3 +https://conda.anaconda.org/conda-forge/noarch/xmltodict-0.14.2-pyhd8ed1ab_1.conda#96ef17b8734b174d35346da0762f0137 +https://conda.anaconda.org/conda-forge/noarch/yq-3.4.3-pyhe01879c_2.conda#18cefe7c50c1228da474ea0e95a8e646 +https://conda.anaconda.org/bioconda/noarch/bioconductor-data-packages-20250625-hdfd78af_0.tar.bz2#34d7066b99d7e6769305dcebf0a9de87 +https://conda.anaconda.org/bioconda/noarch/bioconductor-genomeinfodbdata-1.2.11-r43hdfd78af_1.tar.bz2#14721a7fde8cfe4703796dfd5a119d76 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-s4vectors-0.40.2-r43ha9d7317_2.tar.bz2#6aa465e83dabb7ed5b853519d8a334e4 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-iranges-2.36.0-r43ha9d7317_2.tar.bz2#cca51afd40439bea147c1adf9857bec0 +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda#14dbe05b929e329dbaa6f2d0aa19466d 
+https://conda.anaconda.org/conda-forge/linux-64/r-bitops-1.0_9-r43h2b5f3a1_0.conda#8643d84c1d28ea73e48db9deb9a2eff3 +https://conda.anaconda.org/conda-forge/linux-64/r-rcurl-1.98_1.16-r43he8228da_1.conda#e03c3ff98b32efffb620d7dec4df34b1 +https://conda.anaconda.org/bioconda/noarch/bioconductor-genomeinfodb-1.38.1-r43hdfd78af_1.tar.bz2#03e20a01b672b693c9470dec80d83993 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-zlibbioc-1.48.0-r43ha9d7317_2.tar.bz2#b460a5493c1d67ff386a0e63eb078a64 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-xvector-0.42.0-r43ha9d7317_2.tar.bz2#16f45b1c97517cc3d063a442a43689a4 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-genomicranges-1.54.1-r43ha9d7317_2.tar.bz2#01031256b035b2d4a15c14b690be39aa +https://conda.anaconda.org/bioconda/linux-64/bioconductor-biobase-2.62.0-r43ha9d7317_3.tar.bz2#536352cf94bc990f2d723564fe0d6ff9 +https://conda.anaconda.org/conda-forge/linux-64/r-matrixstats-1.5.0-r43h2b5f3a1_0.conda#bbf709a87ed6a14852cb0a4171539a06 +https://conda.anaconda.org/bioconda/noarch/bioconductor-matrixgenerics-1.14.0-r43hdfd78af_3.tar.bz2#c79f36cc0cd464874aefd50a700d0079 +https://conda.anaconda.org/conda-forge/noarch/r-abind-1.4_5-r43hc72bb7e_1006.conda#75d26096ffa98e1cde7b27b9530899a1 +https://conda.anaconda.org/conda-forge/noarch/r-crayon-1.5.3-r43hc72bb7e_1.conda#bafc77be1942ea00228cf18d2cb30e35 +https://conda.anaconda.org/conda-forge/linux-64/r-lattice-0.22_7-r43h2b5f3a1_0.conda#1902233545ef5232dacdd973153d77c4 +https://conda.anaconda.org/conda-forge/linux-64/r-matrix-1.6_5-r43he966344_1.conda#df8a1175a62460e02dbf340966cbfeab +https://conda.anaconda.org/bioconda/linux-64/bioconductor-s4arrays-1.2.0-r43ha9d7317_2.tar.bz2#28fd3fe7fd8d087c1cfa7805bbd16661 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-sparsearray-1.2.2-r43ha9d7317_2.tar.bz2#41f1e8c1cfb7ff594e923c05e02d9ecb +https://conda.anaconda.org/bioconda/linux-64/bioconductor-delayedarray-0.28.0-r43ha9d7317_2.tar.bz2#cec6a218547ee2af2b823957a373a655 +https://conda.anaconda.org/bioconda/noarch/bioconductor-summarizedexperiment-1.32.0-r43hdfd78af_0.tar.bz2#6bed161da6d64cef9f9ebd5fbc2452e7 +https://conda.anaconda.org/conda-forge/linux-64/r-bdsmatrix-1.3_7-r43h2b5f3a1_2.conda#ac4eb8896121a376698bccf410c51cf6 +https://conda.anaconda.org/conda-forge/linux-64/r-mass-7.3_60.0.1-r43hb1dbf0f_1.conda#c3c9184486ccabe19b86aba11351652e +https://conda.anaconda.org/conda-forge/linux-64/r-mvtnorm-1.3_3-r43h9ad1c49_0.conda#53e04c32e1d4cba4181832befe4601f8 +https://conda.anaconda.org/conda-forge/noarch/r-numderiv-2016.8_1.1-r43hc72bb7e_6.conda#f9bd335fa3579f2e0ed2cdd315fc05ed +https://conda.anaconda.org/conda-forge/noarch/r-bbmle-1.0.25.1-r43hc72bb7e_1.conda#f4dba61e861b8c2459ebf5caa575c495 +https://conda.anaconda.org/conda-forge/noarch/r-coda-0.19_4.1-r43hc72bb7e_1.conda#675d29e567d6eced1089f695d19cfff3 +https://conda.anaconda.org/conda-forge/linux-64/r-rcpp-1.1.0-r43h93ab643_0.conda#b10c60cf4d65df16b0fe2a17e2324375 +https://conda.anaconda.org/conda-forge/linux-64/r-plyr-1.8.9-r43ha18555a_1.conda#d93aedee4cc78f78413969b1e891842c +https://conda.anaconda.org/conda-forge/noarch/r-emdbook-1.3.13-r43hc72bb7e_1.conda#c6d8d2535e70b1bbf3d7ceecf6f60bcc +https://conda.anaconda.org/conda-forge/linux-64/r-rcppeigen-0.3.4.0.2-r43hb79369c_0.conda#02aedbcf8e80e09bd14a6512344993bf +https://conda.anaconda.org/conda-forge/linux-64/r-rcppnumerical-0.6_0-r43h0d4f4ea_1.conda#be66552558a5d23eff73aeb0784205d6 
+https://conda.anaconda.org/bioconda/linux-64/bioconductor-apeglm-1.24.0-r43hf17093f_1.tar.bz2#e3f97df1d5f32a3eb3472ec85344c9f3 +https://conda.anaconda.org/conda-forge/noarch/r-bh-1.87.0_1-r43hc72bb7e_0.conda#9e6364aa396f48b73fb81d56b44ceedc +https://conda.anaconda.org/conda-forge/noarch/r-codetools-0.2_20-r43hc72bb7e_1.conda#f54a935134de901af63096b29a56697e +https://conda.anaconda.org/conda-forge/noarch/r-cpp11-0.5.2-r43h785f33e_1.conda#7bc23dbad7c6015d9b2b9c59bb3e5d85 +https://conda.anaconda.org/conda-forge/noarch/r-futile.options-1.0.1-r43hc72bb7e_1005.conda#57962626cdffa616861bb383076195a2 +https://conda.anaconda.org/conda-forge/noarch/r-formatr-1.14-r43hc72bb7e_2.conda#20d39b48868b55b5335a0c578fdda15b +https://conda.anaconda.org/conda-forge/noarch/r-lambda.r-1.2.4-r43hc72bb7e_4.conda#bf0eed6164eb10fefeda18059a78193c +https://conda.anaconda.org/conda-forge/noarch/r-futile.logger-1.4.3-r43hc72bb7e_1006.conda#dbfd04b54b6ac781070393f2184b3c6d +https://conda.anaconda.org/conda-forge/noarch/r-snow-0.4_4-r43hc72bb7e_3.conda#60eeeef67921f38a80c1778eae3bbbb9 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-biocparallel-1.36.0-r43hf17093f_2.tar.bz2#6e03fcbba328db0b9f2a722ce663e916 +https://conda.anaconda.org/conda-forge/noarch/r-etrunct-0.1-r43hc72bb7e_1006.conda#0db2b2af6060135475a77e6a4a366c0c +https://conda.anaconda.org/conda-forge/noarch/r-invgamma-1.2-r43hc72bb7e_0.conda#5c23063ced21dcb74c6c6d65a996848a +https://conda.anaconda.org/conda-forge/linux-64/r-irlba-2.3.5.1-r43h0d28552_3.conda#9d0d3d499b5670b3dc626ba1d51ebe0c +https://conda.anaconda.org/conda-forge/linux-64/r-rcpparmadillo-14.4.2_1-r43hc2d650c_0.conda#a9ded0b699d83238bcb9ba6949924fce +https://conda.anaconda.org/conda-forge/linux-64/r-mixsqp-0.3_54-r43hb79369c_3.conda#e61f97f4dac929092d11a0a98eacd1b6 +https://conda.anaconda.org/conda-forge/noarch/r-squarem-2021.1-r43hc72bb7e_3.conda#7540130cd26e12a02742dac9ddc184d6 +https://conda.anaconda.org/conda-forge/linux-64/r-truncnorm-1.0_9-r43h2b5f3a1_4.conda#c7a9a8c285e9c8e49efa16081b7b54d6 +https://conda.anaconda.org/conda-forge/linux-64/r-ashr-2.2_63-r43h93ab643_2.conda#ed9e685349325869eb0d716bf4db2f68 +https://conda.anaconda.org/conda-forge/linux-64/r-cli-3.6.5-r43h93ab643_0.conda#5d49a07fdd4ca869c4a79082692c4d2a +https://conda.anaconda.org/conda-forge/linux-64/r-glue-1.8.0-r43h2b5f3a1_0.conda#381d612db7519f2a54f1b187e738ac7b +https://conda.anaconda.org/conda-forge/linux-64/r-rlang-1.1.6-r43h93ab643_0.conda#057b78b5adfffc99092504a1da563abe +https://conda.anaconda.org/conda-forge/noarch/r-lifecycle-1.0.4-r43hc72bb7e_1.conda#7a0a8ba1fe2cf12b39062d8291e2fca8 +https://conda.anaconda.org/conda-forge/noarch/r-gtable-0.3.6-r43hc72bb7e_0.conda#08f643f31ac131aa067e42ad5f832313 +https://conda.anaconda.org/conda-forge/linux-64/r-isoband-0.2.7-r43ha18555a_3.conda#39459e8609d9461d90ee0683a7fd2f3a +https://conda.anaconda.org/conda-forge/linux-64/r-nlme-3.1_168-r43hb67ce94_0.conda#4add921d1d71c646c037a91202d0f75f +https://conda.anaconda.org/conda-forge/linux-64/r-mgcv-1.9_3-r43h2ae2be5_0.conda#66398dfe29e3bc9c415393ddb4ea864c +https://conda.anaconda.org/conda-forge/linux-64/r-farver-2.1.2-r43ha18555a_1.conda#85a82a5b78397daf57f002120aed9e3e +https://conda.anaconda.org/conda-forge/noarch/r-labeling-0.4.3-r43hc72bb7e_1.conda#0464c37b6ff6701cbb8606e8f4bfebe4 +https://conda.anaconda.org/conda-forge/linux-64/r-colorspace-2.1_1-r43hdb488b9_0.conda#0c6d4c26ca41246a4053d79e1b4d78ff +https://conda.anaconda.org/conda-forge/noarch/r-munsell-0.5.1-r43hc72bb7e_1.conda#8b2f9bb8064ae0896ffedd984661a2d5 
+https://conda.anaconda.org/conda-forge/noarch/r-r6-2.6.1-r43hc72bb7e_0.conda#be02712c703445dc5cabbe0f22d0d063 +https://conda.anaconda.org/conda-forge/noarch/r-rcolorbrewer-1.1_3-r43h785f33e_3.conda#ceb1c167b7d9e5eefed0ecbe759540de +https://conda.anaconda.org/conda-forge/noarch/r-viridislite-0.4.2-r43hc72bb7e_2.conda#2a5b8c2803b5714f3319a238c66cc9e7 +https://conda.anaconda.org/conda-forge/noarch/r-scales-1.4.0-r43hc72bb7e_0.conda#ba5fa427e6421aded56f95bd925e3572 +https://conda.anaconda.org/conda-forge/linux-64/r-fansi-1.0.6-r43hb1dbf0f_1.conda#4c17a0f74a974316fdfafa5a9fe91b52 +https://conda.anaconda.org/conda-forge/linux-64/r-magrittr-2.0.3-r43hb1dbf0f_3.conda#fc61bcf37e59037b486c8841a704e9da +https://conda.anaconda.org/conda-forge/linux-64/r-ellipsis-0.3.2-r43hb1dbf0f_3.conda#b8349582a31b17184a7674f4c847a5ad +https://conda.anaconda.org/conda-forge/linux-64/r-utf8-1.2.6-r43h2b5f3a1_0.conda#a2b3283964103f1ff47d6acea6f69e24 +https://conda.anaconda.org/conda-forge/linux-64/r-vctrs-0.6.5-r43h0d4f4ea_1.conda#7f4c30bb576acec2a682c40790c2d406 +https://conda.anaconda.org/conda-forge/noarch/r-pillar-1.11.0-r43hc72bb7e_0.conda#657672af86f156a821b7a8d5ac88f916 +https://conda.anaconda.org/conda-forge/noarch/r-pkgconfig-2.0.3-r43hc72bb7e_4.conda#509adf7f5bc34d77064f28f487d7fa6e +https://conda.anaconda.org/conda-forge/linux-64/r-tibble-3.3.0-r43h2b5f3a1_0.conda#e12f4dc87c2ab21d2e4384cbf8f42111 +https://conda.anaconda.org/conda-forge/noarch/r-withr-3.0.2-r43hc72bb7e_0.conda#e503cae9a96ad7771fa6ccd3af90477b +https://conda.anaconda.org/conda-forge/noarch/r-ggplot2-3.5.2-r43hc72bb7e_0.conda#0245640d7215b4e4f1f07ce7cb08378f +https://conda.anaconda.org/conda-forge/linux-64/r-locfit-1.5_9.12-r43h2b5f3a1_0.conda#94bb7f425967b333ba97ce778b5a2efc +https://conda.anaconda.org/bioconda/linux-64/bioconductor-deseq2-1.42.0-r43hf17093f_2.tar.bz2#f600800873b9b0d08c42215182fc88b1 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh145f28c_0.conda#01384ff1639c6330a0924791413b8714 +https://conda.anaconda.org/conda-forge/noarch/r-getopt-1.20.4-r43ha770c72_1.conda#cf6793c369dbc7ef63d9c1bc9b186615 +https://conda.anaconda.org/conda-forge/noarch/r-optparse-1.7.5-r43hc72bb7e_1.conda#ae32080aac0f74e73e7cd6e774db1c73 diff --git a/modules/local/old/download_genome_annotation/main.nf b/modules/local/old/download_genome_annotation/main.nf new file mode 100644 index 00000000..c4914fe7 --- /dev/null +++ b/modules/local/old/download_genome_annotation/main.nf @@ -0,0 +1,27 @@ +process DOWNLOAD_GENOME_ANNOTATION { + + label 'process_single' + + tag "$accession" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a6/a6b13690259900baef6865722cb3a319103acc83b5bcab67504c88bde1e3a9f6/data': + 'community.wave.seqera.io/library/ncbi-datasets-cli_unzip:785aabe86637bae4' }" + + input: + val(accession) + + output: + path('genomic.gff'), emit: annotation + tuple val("${task.process}"), val('ncbi-datasets-cli'), eval("datasets --version | sed 's/datasets version: //g'"), topic: versions + + script: + """ + datasets download genome accession $accession --include gff3 + + unzip -o ncbi_dataset.zip + mv ncbi_dataset/data/${accession}/* . 
+ """ + +} diff --git a/modules/local/old/download_genome_annotation/spec-file.txt b/modules/local/old/download_genome_annotation/spec-file.txt new file mode 100644 index 00000000..f979bd9f --- /dev/null +++ b/modules/local/old/download_genome_annotation/spec-file.txt @@ -0,0 +1,12 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda#3cd1a7238a0dd3d0860fdefc496cc854 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.7.14-hbd8a1cb_0.conda#d16c90324aef024877d8713c0b7fea5b +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda#9e60c55e725c20d23125a5f0dd69af5d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda#e66f2b8ad787e7beb0f846e4bd7e8493 +https://conda.anaconda.org/conda-forge/linux-64/ncbi-datasets-cli-18.5.0-ha770c72_0.conda#28b6b83d9152d8af1bebacdcc070c13a +https://conda.anaconda.org/conda-forge/linux-64/unzip-6.0-h7f98852_3.tar.bz2#7cb7109505433a5abbf68bb34b31edac diff --git a/modules/local/old/edger/main.nf b/modules/local/old/edger/main.nf new file mode 100644 index 00000000..ceb6e2d8 --- /dev/null +++ b/modules/local/old/edger/main.nf @@ -0,0 +1,29 @@ +process NORMALISATION_EDGER { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/89/89bbc9544e18b624ed6d0a30e701cf8cec63e063cc9b5243e1efde362fe92228/data':
+        'community.wave.seqera.io/library/bioconductor-edger_r-base_r-optparse:400aaabddeea1574' }"
+
+    input:
+    tuple val(meta), path(count_file), path(design_file)
+
+    output:
+    tuple val(meta), path('*.cpm.csv'), optional: true, emit: cpm
+    tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason
+    tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason
+    tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions
+    tuple val("${task.process}"), val('edgeR'), eval('Rscript -e "cat(as.character(packageVersion(\'edgeR\')))"'), topic: versions
+
+    script:
+    """
+    normalise_with_edger.R \\
+        --counts $count_file \\
+        --design $design_file
+    """
+
+}
diff --git a/modules/local/old/edger/spec-file.txt b/modules/local/old/edger/spec-file.txt
new file mode 100644
index 00000000..54a7d740
--- /dev/null
+++ b/modules/local/old/edger/spec-file.txt
@@ -0,0 +1,101 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda#3cd1a7238a0dd3d0860fdefc496cc854
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
+https://conda.anaconda.org/conda-forge/noarch/_r-mutex-1.0.1-anacondar_1.tar.bz2#19f9db5f4f1b7f5ef5f6d67207f25f38
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_0.conda#e31316a586cac398b1fcdb10ace786b9
+https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda#ad8527bf134a90e1c9ed35fa0b64318c
+https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a
+https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda#460eba7851277ec1fd80a1a24080787a
+https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.44-h4bf12b8_0.conda#7a1b5c3fbc0419961eaed361eedc90d4
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda#9e60c55e725c20d23125a5f0dd69af5d
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda#530566b68c3b8ce7eec4cd047eae19fe
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda#bfbca721fd33188ef923dfe9ba172f29
+https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_0.conda#323dc8f259224d13078aaf7ce96c3efe
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-32_h59b9bed_openblas.conda#2af9f3d5c2e39f417ce040f5a35c40c6
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda#e66f2b8ad787e7beb0f846e4bd7e8493
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_h7ac8fdf_openblas.conda#6c3f04ccb6c578138e9f9899da0bd714
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8
+https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8
+https://conda.anaconda.org/conda-forge/linux-64/bwidget-1.10.1-ha770c72_1.conda#983b92277d78c0d0ec498e460caa0e6d
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.50-h943b412_0.conda#51de14db340a848869e69c632b43cca7 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_3.conda#6d11a5edae89fe413c0569f16d308f5a +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_3.conda#57541755b5a51691955012b8e197c06c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.2-h29eaf8c_0.conda#39b4228a867772d610c02e06f939a5b8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d 
+https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.6.15-hbd8a1cb_0.conda#72525f07d72806e3b639ad4504c30ce5 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.1-h7b32b05_0.conda#c87df2ab1448ba69169652ab9547082d +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda#19e57602824042dfd0446292ef90488b +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda#45f6713cb00f124af300342512219182 +https://conda.anaconda.org/conda-forge/linux-64/curl-8.14.1-h332b0f4_0.conda#60279087a10b4ab59a70daa838894e4b +https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-15.1.0-h4c094af_103.conda#ea67e87d658d31dc33818f9574563269 +https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-15.1.0-h97b714f_3.conda#bbcff9bf972a0437bea8e431e4b327bb +https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-15.1.0-h4393ad2_3.conda#f39f96280dd8b1ec8cbd395a3d3fdd1e +https://conda.anaconda.org/conda-forge/linux-64/gfortran_impl_linux-64-15.1.0-h3b9cdf2_3.conda#649c5fe0593a880702e434bc375f3e8a +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_he106b2a_openblas.conda#3d3f9355e52f269cd8bc2c440d8a5263 +https://conda.anaconda.org/conda-forge/linux-64/gsl-2.7-he838d99_0.tar.bz2#fec079ba39c9cca093bf4c00001825de +https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-15.1.0-h4c094af_103.conda#83bbc814f0aeccccb5ea10267bea0d2e +https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-15.1.0-h6a1bac1_3.conda#d71cc504fcfdbee8dd7925ebb9c2bf85 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_3.conda#6e5d0574e57a38c36e674e9a18eee2b4 +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda#1a580f7796c7bf6393fddb8bbbde58dc +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 
+https://conda.anaconda.org/conda-forge/linux-64/make-4.4.1-hb9d3cd8_2.conda#33405d2a66b1411db9f7242c8b97c9e7 +https://conda.anaconda.org/conda-forge/linux-64/fribidi-1.0.10-h36c2ea0_0.tar.bz2#ac7bc6a654f8f41b352b38f4051135f8 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.14-h5888daf_0.conda#951ff8d9e5536896408e89d63230b8d5 +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 +https://conda.anaconda.org/conda-forge/linux-64/pango-1.56.4-hadf4263_0.conda#79f71230c069a287efe3a8614069ddf1 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/sed-4.9-h6688a6e_0.conda#171afc5f7ca0408bbccbcb69ade85f92 +https://conda.anaconda.org/conda-forge/linux-64/tktable-2.10-h8d826fa_7.conda#3ac51142c19ba95ae0fadefa333c9afb +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxt-1.3.1-hb9d3cd8_0.conda#279b0de5f6ba95457190a1c459a64e31 +https://conda.anaconda.org/conda-forge/linux-64/r-base-4.3.3-h65010dc_18.conda#721ea26859f44b206b0146eae8444657 +https://conda.anaconda.org/conda-forge/linux-64/r-statmod-1.5.0-r43ha36c22a_2.conda#d1b3431cbf858fec53e7eb00f8b8cde0 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-limma-3.58.1-r43ha9d7317_1.tar.bz2#c8af3f878cedd1c3c4b6a61a722cddc0 +https://conda.anaconda.org/conda-forge/linux-64/r-lattice-0.22_7-r43h2b5f3a1_0.conda#1902233545ef5232dacdd973153d77c4 +https://conda.anaconda.org/conda-forge/linux-64/r-locfit-1.5_9.12-r43h2b5f3a1_0.conda#94bb7f425967b333ba97ce778b5a2efc +https://conda.anaconda.org/conda-forge/linux-64/r-rcpp-1.1.0-r43h93ab643_0.conda#b10c60cf4d65df16b0fe2a17e2324375 +https://conda.anaconda.org/bioconda/linux-64/bioconductor-edger-4.0.16-r43hf17093f_1.tar.bz2#7b499c193120f59dc5f034a069ab277b +https://conda.anaconda.org/conda-forge/noarch/r-getopt-1.20.4-r43ha770c72_1.conda#cf6793c369dbc7ef63d9c1bc9b186615 +https://conda.anaconda.org/conda-forge/noarch/r-optparse-1.7.5-r43hc72bb7e_1.conda#ae32080aac0f74e73e7cd6e774db1c73 diff --git a/modules/local/old/get_annotation_accession/main.nf b/modules/local/old/get_annotation_accession/main.nf new file mode 100644 index 00000000..0236126c --- /dev/null +++ b/modules/local/old/get_annotation_accession/main.nf @@ -0,0 +1,32 @@ +process GET_ANNOTATION_ACCESSION { + + label 'process_single' + + tag "$species" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/b4/b4d686ef63e22bc4d461178fc241cefddd2aa3436e189d3787c8e019448f056e/data':
+        'community.wave.seqera.io/library/requests_tenacity_tqdm:126dbed8ef3ff96f' }"
+
+    input:
+    val(species)
+
+    output:
+    env("ACCESSION"), emit: accession
+    tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions
+    tuple val("${task.process}"), val('requests'), eval('python3 -c "import requests; print(requests.__version__)"'), topic: versions
+    tuple val("${task.process}"), val('tenacity'), eval('python3 -c "from importlib.metadata import version; print(version(\'tenacity\'))"'), topic: versions
+
+    script:
+    """
+    get_annotation_accession.py --species $species
+    ACCESSION=\$(cat accession.txt)
+    """
+
+    stub:
+    """
+    touch accession.txt && ACCESSION="" # define the declared ACCESSION env output so stub runs succeed
+    """
+
+}
diff --git a/modules/local/old/get_annotation_accession/spec-file.txt b/modules/local/old/get_annotation_accession/spec-file.txt
new file mode 100644
index 00000000..9c4d257a
--- /dev/null
+++ b/modules/local/old/get_annotation_accession/spec-file.txt
@@ -0,0 +1,53 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9
+https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda#71d281e9c2192cb3fa425655a8defb85
+https://repo.anaconda.com/pkgs/main/linux-64/libgcc-15.2.0-h69a1729_7.conda#01fb1b8725fc7f66312b9d409758917a
+https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-15.2.0-h39759b7_7.conda#7dc7ec61ceea5de17f3e2c4c5f442fc6
+https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-15.2.0-h166f726_7.conda#2783efb2502b9caa7f08e25fd54df899
+https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h5eee18b_6.conda#f21a3ff51c1b271977f53ce956a69297
+https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-15.2.0-hc03a8fd_7.conda#cf200522c0b13d64bf81035358d05f5b
+https://repo.anaconda.com/pkgs/main/linux-64/expat-2.7.3-h3385a95_0.conda#105822d24b4de9055e705a7d76549416
+https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.44-h153f514_2.conda#dffdc9a0e09d04051d4bd758e104f4b3
+https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.4.4-h6a678d5_1.conda#70646cc713f0c43926cfdcfe9b695fe0
+https://repo.anaconda.com/pkgs/main/linux-64/libmpdec-4.0.0-h5eee18b_0.conda#feb10f42b1a7b523acbf85461be41a3e
+https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.41.5-h5eee18b_0.conda#4a6a2354414c9080327274aa514e5299
+https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.5-h7934f7d_0.conda#0abfc090299da4bb031b84c64309757b
+https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2025.11.4-h06a4308_0.conda#f04cd5aa67216b77e8f664bb4c7098a4
+https://repo.anaconda.com/pkgs/main/linux-64/openssl-3.0.18-hd6dcaed_0.conda#3762b8999909b69745881cf4b8dd2816
+https://repo.anaconda.com/pkgs/main/linux-64/python_abi-3.13-1_cp313.conda#bea705c35663f9394ec82e87dc692c85
+https://repo.anaconda.com/pkgs/main/linux-64/readline-8.3-hc2a1206_0.conda#8578e006d4ef5cb98a6cda232b3490f6
+https://repo.anaconda.com/pkgs/main/linux-64/libzlib-1.3.1-hb25bd0a_0.conda#338ee51e19ee211b7fc994d4ba88c631
+https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.3.1-hb25bd0a_0.conda#9f3a877e5e0fa0fb39253a59ff824861
+https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.51.0-h2a70700_0.conda#99a4278be9c6901ee6989b24fd213240
+https://repo.anaconda.com/pkgs/main/linux-64/pthread-stubs-0.3-h0ce48e5_1.conda#973a642312d2a28927aaf5b477c67250 +https://repo.anaconda.com/pkgs/main/linux-64/xorg-libxau-1.0.12-h9b100fa_0.conda#a8005a9f6eb903e113cd5363e8a11459 +https://repo.anaconda.com/pkgs/main/linux-64/xorg-libxdmcp-1.1.5-h9b100fa_0.conda#c284a09ddfba81d9c4e740110f09ea06 +https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.17.0-h9b100fa_0.conda#fdf0d380fa3809a301e2dbc0d5183883 +https://repo.anaconda.com/pkgs/main/linux-64/xorg-xorgproto-2024.1-h5eee18b_1.conda#412a0d97a7a51d23326e57226189da92 +https://repo.anaconda.com/pkgs/main/linux-64/xorg-libx11-1.8.12-h9b100fa_1.conda#6298b27afae6f49f03765b2a03df2fcb +https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.15-h54e0aa7_0.conda#1fa91e0c4fc9c9435eda3f1a25a676fd +https://repo.anaconda.com/pkgs/main/noarch/tzdata-2025b-h04d1e81_0.conda#1d027393db3427ab22a02aa44a56f143 +https://repo.anaconda.com/pkgs/main/linux-64/xz-5.6.4-h5eee18b_1.conda#3581505fa450962d631bd82b8616350e +https://repo.anaconda.com/pkgs/main/linux-64/python-3.13.9-h7e8bc2b_100_cp313.conda#9ea34b30a1bdb8f7c9d62c072697e681 +https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py313h09d1b84_0.conda#dfd94363b679c74937b3926731ee861a +https://conda.anaconda.org/conda-forge/noarch/certifi-2025.11.12-pyhd8ed1ab_0.conda#96a02a5c1a65470a7e4eedb644c872fd +https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda#12c566707c80111f9799308d9e265aef +https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py313hfab6e84_0.conda#ce6386a5892ef686d6d680c345c40ad1 +https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.4-pyhd8ed1ab_0.conda#a22d1fd9bf98827e280a02875d9a007a +https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda#0a802cb9888dd14eeefc611f05c40b6e +https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda#8e6923fc12f1fe8f8c4e5c9f343256ac +https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda#164fc43f0b53b6e3a7bc7dce5e4f1dc9 +https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda#53abe63df7e10a6ba605dc5f9f961d36 +https://repo.anaconda.com/pkgs/main/linux-64/libgomp-15.2.0-h4751f2c_7.conda#82025ed6da944bd419d42d9b1ff116aa +https://repo.anaconda.com/pkgs/main/linux-64/setuptools-80.9.0-py313h06a4308_0.conda#42ffd8d5a0c04d5e55431e3d4f6e8408 +https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.45.1-py313h06a4308_0.conda#29057e876eedce0e37c2388c138a19f9 +https://repo.anaconda.com/pkgs/main/noarch/pip-25.3-pyhc872135_0.conda#f713912a259ec613b3832c3bc842e9d4 +https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda#461219d1a5bd61342293efa2c0c90eac +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.25.0-py313h54dd161_1.conda#710d4663806d0f72b2fb414e936223b5 +https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda#436c165519e140cb08d246a4472a9d6a +https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda#db0c6b99149880c8ba515cf4abe93ee4 +https://conda.anaconda.org/conda-forge/noarch/tenacity-9.1.2-pyhd8ed1ab_0.conda#5d99943f2ae3cc69e1ada12ce9d4d701 diff --git a/modules/local/old/normalise_microarray/environment.yml b/modules/local/old/normalise_microarray/environment.yml new file mode 100644 index 00000000..651528a4 --- /dev/null +++ b/modules/local/old/normalise_microarray/environment.yml @@ -0,0 
+1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-biocmanager + - conda-forge::r-optparse + - conda-forge::r-dplyr + - bioconda::bioconductor-genomeinfodb + - bioconda::bioconductor-annotationdbi diff --git a/modules/local/quantile_normalisation/environment.yml b/modules/local/quantile_normalisation/environment.yml index 72ffff31..69fedff0 100644 --- a/modules/local/quantile_normalisation/environment.yml +++ b/modules/local/quantile_normalisation/environment.yml @@ -1,8 +1,9 @@ -name: quant_norm +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge + - bioconda dependencies: - - conda-forge::python==3.12.8 - - conda-forge::pandas==2.2.3 - - conda-forge::scikit-learn==1.6.1 - - conda-forge::pyarrow==19.0.0 + - conda-forge::python=3.14.2 + - conda-forge::polars==1.36.1 + - conda-forge::scikit-learn==1.8.0 diff --git a/modules/local/quantile_normalisation/main.nf b/modules/local/quantile_normalisation/main.nf index b2891d38..c1d7e94c 100644 --- a/modules/local/quantile_normalisation/main.nf +++ b/modules/local/quantile_normalisation/main.nf @@ -1,33 +1,31 @@ -process QUANTILE_NORMALISE { +process QUANTILE_NORMALISATION { - label 'process_low' - - publishDir "${params.outdir}/quantile_normalisation" + label 'process_single' tag "${meta.dataset}" conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/2d/2df931a4ea181fe1ea9527abe0fd4aff9453d6ea56d56aee7c4ac5dceed611e3/data': - 'community.wave.seqera.io/library/pandas_pyarrow_python_scikit-learn:6f85e3c4d1706e81' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/eb/eb8feda3812519f6f6f085e1d058f534b0aedba570c1443c4479d79975e81906/data': + 'community.wave.seqera.io/library/polars_scikit-learn:a30d22b117dad962' }" input: tuple val(meta), path(count_file) + val target_distribution output: tuple val(meta), path('*.quant_norm.parquet'), emit: counts tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions - tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions tuple val("${task.process}"), val('scikit-learn'), eval('python3 -c "import sklearn; print(sklearn.__version__)"'), topic: versions - tuple val("${task.process}"), val('pyarrow'), eval('python3 -c "import pyarrow; print(pyarrow.__version__)"'), topic: versions - script: """ - quantile_normalise.py --counts $count_file + quantile_normalise.py \\ + --counts $count_file \\ + --target-distrib $target_distribution """ - stub: """ touch count.cpm.quant_norm.parquet diff --git a/modules/local/remove_samples_not_valid/environment.yml b/modules/local/remove_samples_not_valid/environment.yml new file mode 100644 index 00000000..f0eaf3dd --- /dev/null +++ b/modules/local/remove_samples_not_valid/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.35.2 diff --git a/modules/local/remove_samples_not_valid/main.nf b/modules/local/remove_samples_not_valid/main.nf new file mode 100644 index 00000000..e61528df --- /dev/null +++ b/modules/local/remove_samples_not_valid/main.nf @@ -0,0 +1,26 @@ +process REMOVE_SAMPLES_NOT_VALID { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f8a5d02e7b31980c887253a9f118da0ef91ead1c7b158caf855199e5c5d5473/data': + 'community.wave.seqera.io/library/polars_python:cab787b788e5eba7' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path("*.filtered.parquet"), optional: true, emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + remove_samples_not_valid.py \\ + --counts $count_file + """ + +} diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index a27122ce..d02016a0 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -1,5 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda dependencies: - - bioconda::multiqc=1.27 + - bioconda::multiqc=1.32 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 58d9313c..c1158fb0 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,8 +3,8 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.27--pyhdfd78af_0' : - 'biocontainers/multiqc:1.27--pyhdfd78af_0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8c/8c6c120d559d7ee04c7442b61ad7cf5a9e8970be5feefb37d68eeaa60c1034eb/data' : + 'community.wave.seqera.io/library/multiqc:1.32--d58f60e4deb769bf' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index e69de29b..ce30eb73 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -0,0 +1,92 @@ +name: multiqc +description: Aggregate results from bioinformatics analyses across many samples into + a single report +keywords: + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report +tools: + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. + homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: ["GPL-3.0-or-later"] + identifier: biotools:multiqc +input: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + ontologies: [] + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections + in multiqc_config. + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + ontologies: [] + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. 
Passed via
+        MultiQC's `--replace-names` option.
+      pattern: "*.{tsv}"
+      ontologies:
+        - edam: http://edamontology.org/format_3475 # TSV
+  - sample_names:
+      type: file
+      description: |
+        Optional TSV file with headers, passed to the MultiQC --sample_names
+        argument.
+      pattern: "*.{tsv}"
+      ontologies:
+        - edam: http://edamontology.org/format_3475 # TSV
+output:
+  report:
+    - "*multiqc_report.html":
+        type: file
+        description: MultiQC report file
+        pattern: "multiqc_report.html"
+        ontologies: []
+  data:
+    - "*_data":
+        type: directory
+        description: MultiQC data dir
+        pattern: "multiqc_data"
+  plots:
+    - "*_plots":
+        type: file
+        description: Plots created by MultiQC
+        pattern: "*_plots"
+        ontologies: []
+  versions:
+    - versions.yml:
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+        ontologies:
+          - edam: http://edamontology.org/format_3750 # YAML
+authors:
+  - "@abhi18av"
+  - "@bunop"
+  - "@drpatelh"
+  - "@jfy133"
+maintainers:
+  - "@abhi18av"
+  - "@bunop"
+  - "@drpatelh"
+  - "@jfy133"
diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test
index e69de29b..33316a7d 100644
--- a/modules/nf-core/multiqc/tests/main.nf.test
+++ b/modules/nf-core/multiqc/tests/main.nf.test
@@ -0,0 +1,92 @@
+nextflow_process {
+
+    name "Test Process MULTIQC"
+    script "../main.nf"
+    process "MULTIQC"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "multiqc"
+
+    config "./nextflow.config"
+
+    test("sarscov2 single-end [fastqc]") {
+
+        when {
+            process {
+                """
+                input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true))
+                input[1] = []
+                input[2] = []
+                input[3] = []
+                input[4] = []
+                input[5] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.report[0] ==~ ".*/multiqc_report.html" },
+                { assert process.out.data[0] ==~ ".*/multiqc_data" },
+                { assert snapshot(process.out.versions).match("multiqc_versions_single") }
+            )
+        }
+
+    }
+
+    test("sarscov2 single-end [fastqc] [config]") {
+
+        when {
+            process {
+                """
+                input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true))
+                input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true))
+                input[2] = []
+                input[3] = []
+                input[4] = []
+                input[5] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.report[0] ==~ ".*/multiqc_report.html" },
+                { assert process.out.data[0] ==~ ".*/multiqc_data" },
+                { assert snapshot(process.out.versions).match("multiqc_versions_config") }
+            )
+        }
+    }
+
+    test("sarscov2 single-end [fastqc] - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true))
+                input[1] = []
+                input[2] = []
+                input[3] = []
+                input[4] = []
+                input[5] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.report.collect { file(it).getName() } +
+                    process.out.data.collect { file(it).getName() } +
+                    process.out.plots.collect { file(it).getName() } +
+                    process.out.versions ).match("multiqc_stub") }
+            )
+        }
+
+    }
+}
diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap
index e69de29b..a88bafd6
---
a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -0,0 +1,41 @@ +{ + "multiqc_versions_single": { + "content": [ + [ + "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "24.10.4" + }, + "timestamp": "2025-10-27T13:33:24.356715" + }, + "multiqc_stub": { + "content": [ + [ + "multiqc_report.html", + "multiqc_data", + "multiqc_plots", + "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "24.10.4" + }, + "timestamp": "2025-10-27T13:34:11.103619" + }, + "multiqc_versions_config": { + "content": [ + [ + "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "24.10.4" + }, + "timestamp": "2025-10-27T13:34:04.615233" + } +} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d3..00000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/nextflow.config b/nextflow.config index 79aeee03..8fe61c85 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,28 +12,53 @@ params { // Mandatory inputs species = null + // general options + keywords = "" + platform = null + accessions_only = false + download_only = false + // Local datasets datasets = null - // statistics - normalisation_method = 'deseq2' - nb_top_gene_candidates = 1000 - ks_pvalue_threshold = 0 + // Expression atlas + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + accessions = "" + excluded_accessions = "" + accessions_file = null + excluded_accessions_file = null // ID mapping + gprofiler_target_db = "ENSG" gene_metadata = null gene_id_mapping = null - skip_gprofiler = false + skip_id_mapping = false + min_occurrence_freq = 0.1 + min_occurrence_quantile = 0.2 - // Expression atlas - fetch_eatlas_accessions = false - eatlas_keywords = "" - eatlas_accessions = "" + // statistics + normalisation_method = 'tpm' + gene_length = null + quantile_norm_target_distrib = 'uniform' + nb_top_gene_candidates = 5000 + min_expr_threshold = 0.2 - // Expression atlas - fetch_eatlas_accessions = false - eatlas_keywords = "" - eatlas_accessions = "" + // stability scoring + run_genorm = false + candidate_selection_descriptor = "cv" + stability_score_weights = "0.8,0.1,0.1,0" + + // random sampling + random_sampling_seed = 42 + random_sampling_size = 5000 + + // MultiQC options + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options outdir = null @@ -42,13 +67,15 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false - hook_url = null + hook_url = System.getenv('HOOK_URL') help = false help_full = false show_hidden = false version = false pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' - trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')// Config options + trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + + // Config options config_profile_name = null config_profile_description = null @@ -57,23 +84,10 @@ params { config_profile_contact = null config_profile_url = null - // MultiQC - multiqc_config = null - multiqc_logo = null - multiqc_methods_description = null - max_multiqc_email_size = "25.MB" - multiqc_title 
= null - // Schema validation default options validate_params = true } -validation { - // logs - monochromeLogs = false - help.enabled = true -} - // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -104,6 +118,17 @@ profiles { charliecloud.enabled = false apptainer.enabled = false } + micromamba { + conda.enabled = true + conda.useMicromamba = true + conda.channels = ['conda-forge', 'bioconda'] + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } docker { docker.enabled = true conda.enabled = false @@ -114,7 +139,18 @@ profiles { apptainer.enabled = false docker.runOptions = '-u $(id -u):$(id -g)' } - arm { + arm64 { + process.arch = 'arm64' + // TODO https://github.com/nf-core/modules/issues/6694 + // For now, if you are running on arm64 you have to use Wave, for the sake of the maintainers + // wave profile + apptainer.ociAutoPull = true + singularity.ociAutoPull = true + wave.enabled = true + wave.freeze = true + wave.strategy = 'conda,container' + } + emulate_amd64 { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { @@ -171,32 +207,26 @@ profiles { wave.freeze = true wave.strategy = 'conda,container' } - gitpod { - executor.name = 'local' - executor.cpus = 4 - executor.memory = 8.GB - process { - resourceLimits = [ - memory: 8.GB, - cpus : 4, - time : 1.h - ] - } + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' } + test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } - test_dataset { includeConfig 'conf/test_dataset.config' } - test_dataset_custom_mapping { includeConfig 'conf/test_dataset_custom_mapping.config' } - test_one_accession { includeConfig 'conf/test_one_accession.config' } - test_one_accession_low_gene_count { includeConfig 'conf/test_one_accession_low_gene_count.config' } - test_local_and_downloaded { includeConfig 'conf/test_local_and_downloaded.config' } + test_dataset_eatlas { includeConfig 'conf/test_dataset_eatlas.config' } } -// Load nf-core custom profiles from different Institutions -includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" +// Load nf-core custom profiles from different institutions + +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" // Load nf-core/stableexpression custom profiles from different institutions. -// includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/stableexpression.config" : "/dev/null" +// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs +// includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ?
"${params.custom_config_base}/pipeline/stableexpression.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Singularity are enabled @@ -207,6 +237,8 @@ podman.registry = 'quay.io' singularity.registry = 'quay.io' charliecloud.registry = 'quay.io' + + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -262,51 +294,22 @@ manifest { ], ] homePage = 'https://github.com/nf-core/stableexpression' - description = """ -This pipeline is dedicated to finding the most stable genes across count datasets -""" + description = """This pipeline is dedicated to finding the most stable genes across count datasets""" mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '!>=25.04.00' + nextflowVersion = '!>=25.04.0' version = '1.0dev' doi = '' } // Nextflow plugins plugins { - id 'nf-schema@2.2.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-schema@2.5.1' // Validation of pipeline parameters and creation of an input channel from a sample sheet } validation { defaultIgnoreParams = ["genomes"] monochromeLogs = params.monochrome_logs - help { - enabled = true - command = "nextflow run nf-core/stableexpression -profile --input samplesheet.csv --outdir " - fullParameter = "help_full" - showHiddenParameter = "show_hidden" - beforeText = """ --\033[2m----------------------------------------------------\033[0m- - \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m -\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m -\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m -\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m - \033[0;32m`._,._,\'\033[0m -\033[0;35m nf-core/stableexpression ${manifest.version}\033[0m --\033[2m----------------------------------------------------\033[0m- -""" - afterText = """${manifest.doi ? "\n* The pipeline\n" : ""}${manifest.doi.tokenize(",").collect { " https://doi.org/${it.trim().replace('https://doi.org/','')}"}.join("\n")}${manifest.doi ? 
"\n" : ""} -* The nf-core framework - https://doi.org/10.1038/s41587-020-0439-x - -* Software dependencies - https://github.com/nf-core/stableexpression/blob/main/CITATIONS.md -""" - } - summary { - beforeText = validation.help.beforeText - afterText = validation.help.afterText - } } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index edf79775..23901fa9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/nextflow_schema.json", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/main/nextflow_schema.json", "title": "nf-core/stableexpression pipeline parameters", - "description": "\nThis pipeline is dedicated to finding the most stable genes across count datasets\n", + "description": "This pipeline is dedicated to finding the most stable genes across count datasets", "type": "object", "$defs": { "input_output_options": { @@ -14,15 +14,16 @@ "properties": { "species": { "type": "string", - "description": "Species name.", + "description": "Scientifc species name (genus and species)", "fa_icon": "fas fa-hippo", - "pattern": "([a-zA-Z]+)[_ ]([a-zA-Z]+)", - "help_text": "e.g. `--species 'Arabidopsis thaliana'` or `--species 'homo_sapiens'`" + "pattern": "^([a-zA-Z]+)[_ ]([a-zA-Z]+)[_ a-zA-Z]*$", + "help_text": "At least genus and species name should be supplied. Words should be separated by ` ` or `_`. Note that character case is ignored. Examples: `--species 'Arabidopsis thaliana'`, `--species 'homo_sapiens' or `--species MARMOTA_MARMOTA_MARMOTA`." }, "outdir": { "type": "string", "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "description": "Output directory", + "help_text": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, "datasets": { @@ -30,12 +31,37 @@ "format": "file-path", "exists": true, "schema": "assets/schema_datasets.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the input count datasets and their related experimental design.", - "help_text": "The dataset file should be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/stableexpression/usage#samplesheet-input) for more information. Before running the pipeline, you will need to create a design file with information about the samples in your experiment. Use this parameter to specify its location.", + "pattern": "^\\S+\\.(csv|yaml|yml|dat)$", + "description": "Custom datasets (counts + designs)", + "help_text": "Path to CSV / YAML file listing your own count datasets and their related experimental design. This file should be a comma-separated file with 4 columns (`counts`, `design`, `platform` and `normalised`). It must have a header row. Before running the pipeline, and for each count dataset provided by you, a design file with information about the samples in your experiment is required. Combine with --skip_fetch_eatlas_accessions if you only want to analyse your own count datasets. Otherwise, accessions from Expression Atlas and GEO will be fetched automatically. 
See [usage docs](https://nf-co.re/stableexpression/usage#samplesheet-input) for more information. ", "fa_icon": "fas fa-file-csv" }, + "keywords": { + "type": "string", + "description": "Keywords used for selecting specific Expression Atlas / GEO accessions", + "fa_icon": "fas fa-font", + "pattern": "(([a-zA-Z,]+))?", + "help_text": "Keywords (separated by commas) to use when retrieving specific experiments from Expression Atlas and / or GEO datasets. The pipeline will select all Expression Atlas experiments / GEO datasets that contain the provided keywords in their description or in one of the condition names. Example: `--keywords 'stress,flowering'`. This parameter is unused if --skip_fetch_eatlas_accessions is set and --fetch_geo_accessions is not set." + }, + "platform": { + "type": "string", + "enum": ["rnaseq", "microarray"], + "description": "Only download from this platform", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "By default, data from both RNA-seq and Microarray platforms are downloaded. Setting this parameter applies a filter to get data from only one of the two platforms. This filter is only used while fetching appropriate Expression Atlas / GEO accessions. It will not filter accessions provided directly by the user." + }, + "accessions_only": { + "type": "boolean", + "description": "Only get accessions from Expression Atlas / GEO and exit.", + "fa_icon": "far fa-stop-circle", + "help_text": "Use this option if you only want to get Expression Atlas / GEO accessions and skip the rest of the pipeline." + }, + "download_only": { + "type": "boolean", + "description": "Only get accessions from Expression Atlas / GEO and download the selected datasets.", + "fa_icon": "far fa-stop-circle", + "help_text": "Use this option if you only want to get Expression Atlas / GEO accessions, download the selected data, and skip the rest of the pipeline." + }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -50,31 +76,53 @@ } } }, - "expression_atlas_options": { - "title": "Expression Atlas options", + "public_data_options": { + "title": "Public data options", "type": "object", "fa_icon": "fas fa-book-atlas", - "description": "Options for fetching datasets from Expression Atlas.", + "description": "Options for fetching experiment data from Expression Atlas / GEO.", "properties": { - "fetch_eatlas_accessions": { + "skip_fetch_eatlas_accessions": { "type": "boolean", "fa_icon": "fas fa-cloud-arrow-down", - "description": "Automatically etches Expression Atlas accessions for this species. and downloads the corresponding count datasets and experimental designs.", - "help_text": "If no Expression Atlas keywords are provided (with `--eatlas_keywords`) and if you want to get all Expression Atlas accessions for this species, provide this parameter." + "description": "Skip fetching Expression Atlas accessions", + "help_text": "Expression Atlas accessions are automatically fetched by default. Set this parameter to skip this step." }, - "eatlas_keywords": { + "fetch_geo_accessions": { + "type": "boolean", + "fa_icon": "fas fa-cloud-arrow-down", + "description": "Fetch GEO accessions from NCBI", + "help_text": "Set this parameter to fetch GEO accessions from NCBI. This feature is experimental and may not work as expected. Please report any issues to https://github.com/nf-core/stableexpression."
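Putting the public-data options above together, a minimal invocation might look like the following. This is an illustrative sketch, not taken from this diff: the flag names come from the schema hunks above, while the species, keywords and output directory are placeholder values.

```bash
# Sketch: list matching RNA-seq accessions from Expression Atlas and GEO,
# then exit without downloading anything (--accessions_only).
nextflow run nf-core/stableexpression \
    -profile docker \
    --species 'arabidopsis_thaliana' \
    --keywords 'stress,flowering' \
    --platform rnaseq \
    --fetch_geo_accessions \
    --accessions_only \
    --outdir results
```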
+ }, + "accessions": { "type": "string", - "description": "Keywords (separated by commas) to use when retrieving specific experiments from Expression Atlas.", - "fa_icon": "fas fa-highlighter", - "pattern": "([a-zA-Z,]+)", - "help_text": "The pipeline will select all Expression Atlas experiments that contain the provided keywords in their description of in one of the condition names. Example: `--eatlas_keywords 'stress,flowering'`" + "pattern": "([A-Z0-9-]+,?)+", + "description": "Expression Atlas / GEO accession(s) to include", + "fa_icon": "fas fa-address-card", + "help_text": "Provide Expression Atlas / GEO accession(s) that you want to download. The accessions should be comma-separated. Example: `--accessions E-MTAB-552,E-GEOD-61690,GSE8165,GSE8161`. Combine with --skip_fetch_eatlas_accessions if you want only these accessions to be used. User provided accessions are prioritised over excluded accessions." }, - "eatlas_accessions": { + "accessions_file": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "File containing Expression Atlas / GEO accession(s) to download", + "fa_icon": "fas fa-file", + "help_text": "File containing Expression Atlas / GEO accession(s) that you want to download. One accession per line. Example: `--accessions_file included_accessions.txt`. Combine with --skip_fetch_eatlas_accessions if you want only these accessions to be used. User provided accessions are prioritised over excluded accessions." + }, + "excluded_accessions": { "type": "string", "pattern": "([A-Z0-9-]+,?)+", - "description": "Provide directly Expression Atlas accession(s) (separated by commas) that you want to download.", + "description": "Expression Atlas / GEO accession(s) to exclude", "fa_icon": "fas fa-id-card", - "help_text": "Example: `--eatlas_accessions 'E-MTAB-552,E-GEOD-61690'`" + "help_text": "Provide Expression Atlas / GEO accession(s) that you want to exclude. The accessions should be comma-separated. Example: `--excluded_accessions E-MTAB-552,E-GEOD-61690`" + }, + "excluded_accessions_file": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "File containing Expression Atlas / GEO accession(s) to exclude", + "fa_icon": "fas fa-file", + "help_text": "File containing Expression Atlas / GEO accession(s) that you want to exclude. One accession per line. Example: `--excluded_accessions_file excluded_accessions.txt`." } } }, @@ -84,11 +132,19 @@ "fa_icon": "fas fa-map", "description": "Options for mapping gene IDs.", "properties": { - "skip_gprofiler": { + "skip_id_mapping": { "type": "boolean", - "description": "Skip g:Profiler ID mapping step.", + "description": "Skip g:Profiler ID mapping step", "fa_icon": "fas fa-ban", - "help": "If you don't want to map gene IDs with g:Profiler, you can skip this step by providing `--skip_gprofiler`. It can be in particular useful if the g:Profiler is down and if you already have a custom mapping file." + "help": "If you don't want to map gene IDs with g:Profiler, you can skip this step by providing `--skip_id_mapping`. It can be particularly useful if g:Profiler is down or if you already have a custom mapping file." + }, + "gprofiler_target_db": { + "type": "string", + "description": "Target database for g:Profiler", + "fa_icon": "fas fa-divide", + "enum": ["ENSG", "ENTREZGENE", "UNIPROTSPTREMBL", "UNIPROTSWISSPROT"], + "default": "ENSG", + "help_text": "Target database for g:Profiler. You can see the full list of available target databases at https://biit.cs.ut.ee/gprofiler/convert."
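For the accession include / exclude parameters just above, here is a hedged sketch of how they might be combined; the accession IDs are the ones already used in the help texts, and the file name is arbitrary.

```bash
# Sketch: restrict the run to an explicit accession list and exclude two datasets.
printf 'E-MTAB-552\nGSE8165\n' > included_accessions.txt
nextflow run nf-core/stableexpression \
    -profile docker \
    --species 'homo_sapiens' \
    --skip_fetch_eatlas_accessions \
    --accessions_file included_accessions.txt \
    --excluded_accessions 'E-GEOD-61690,GSE8161' \
    --outdir results
```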
}, "gene_id_mapping": { "type": "string", @@ -96,10 +152,10 @@ "exists": true, "schema": "assets/schema_gene_id_mapping.json", "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing custom gene id mappings. Each row represents a mapping from the original gene ID in your count datasets to the ensembl ID in g:Profiler.", - "help_text": "The mapping file should be a comma-separated file with 2 columns (original_gene_id and ensembl_gene_id) and a header row.", - "fa_icon": "fas fa-file-csv" + "pattern": "^\\S+\\.(csv|dat)$", + "description": "Custom gene id mapping file", + "help_text": "Path to comma-separated file containing custom gene id mappings. Each row represents a mapping from the original gene ID in your count datasets to a preferred gene ID. The mapping file should be a comma-separated file with 2 columns (original_gene_id and gene_id) and a header row.", + "fa_icon": "fas fa-file" }, "gene_metadata": { "type": "string", @@ -107,10 +163,28 @@ "exists": true, "schema": "assets/schema_gene_metadata.json", "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing custom gene metadata information. Each row represents a gene and links its ensembl gene ID to its name and description.", - "help_text": "The metadata file should be a comma-separated file with 3 columns (ensembl_gene_id, name and description) and a header row.", - "fa_icon": "fas fa-file-csv" + "pattern": "^\\S+\\.(csv|dat)$", + "description": "Custom gene metadata file", + "help_text": "Path to comma-separated file containing custom gene metadata information. Each row represents a gene and links its gene ID to its name and description. The metadata file should be a comma-separated file with 3 columns (gene_id, name and description) and a header row.", + "fa_icon": "fas fa-file" + }, + "min_occurrence_quantile": { + "type": "number", + "description": "Minimum quantile for the frequency of occurrence", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.2, + "help_text": "To avoid genes that are rarely observed, genes represented less often than the specified quantile will be filtered out. For example, a value of 0.2 means that the 20% least-represented genes will be filtered out. This filter is applied before using the absolute filter `--min_occurrence_freq`." + }, + "min_occurrence_freq": { + "type": "number", + "description": "Minimum frequency of occurrence among all datasets", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.1, + "help_text": "To avoid genes that are rarely observed, genes showing a frequency of occurrence below this threshold will be filtered out." } } }, @@ -122,26 +196,97 @@ "properties": { "normalisation_method": { "type": "string", - "description": "Tool to use for normalisation.", - "fa_icon": "fas fa-chart-simple", - "enum": ["deseq2", "edger"], - "default": "deseq2" + "description": "Count normalisation method", + "fa_icon": "fas fa-divide", + "enum": ["tpm", "cpm"], + "default": "tpm", + "help_text": "Raw RNA-seq data must be normalised before further processing. `tpm` offers a more accurate representation of gene expression levels as it is unbiased toward gene length. However, you can choose `cpm` if you do not have access to a genome annotation."
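To make the two file formats above concrete, here is a sketch of what the mapping and metadata files could contain, using the column names from the help texts; the Arabidopsis IDs and file names are illustrative only.

```bash
# Hypothetical gene_id_mapping.csv: original_gene_id,gene_id
cat > gene_id_mapping.csv << 'EOF'
original_gene_id,gene_id
AT1G01010.1,AT1G01010
AT1G01020.2,AT1G01020
EOF

# Hypothetical gene_metadata.csv: gene_id,name,description
cat > gene_metadata.csv << 'EOF'
gene_id,name,description
AT1G01010,NAC001,NAC domain containing protein 1
EOF

# Skip g:Profiler and use the custom files instead (flags from the schema above).
nextflow run nf-core/stableexpression \
    --species 'arabidopsis_thaliana' \
    --skip_id_mapping \
    --gene_id_mapping gene_id_mapping.csv \
    --gene_metadata gene_metadata.csv \
    --outdir results
```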
+ }, + "gene_length": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_length.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|tsv|dat)$", + "description": "Gene length file", + "help_text": "Path to comma-separated file containing gene lengths. Each row represents a gene and gives the length of its longest transcript. The file should be a comma-separated file with 2 columns (gene_id and length) and a header row.", + "fa_icon": "fas fa-file" + }, + "quantile_norm_target_distrib": { + "type": "string", + "description": "Target distribution for quantile normalisation", + "fa_icon": "fas fa-chart-bar", + "enum": ["uniform", "normal"], + "default": "uniform", + "help_text": "In order to compare counts between samples and different datasets, all normalised counts are quantile normalised and mapped to a specific distribution. The pipeline uses scikit-learn's quantile_transform function. You can select the target distribution to map counts to." + }, + "min_expr_threshold": { + "type": "number", + "description": "Minimum percentage of quantile expression level", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.2, + "help_text": "To prevent genes with low expression levels from being considered for stability scoring, a threshold is applied. For example, a value of 0.2 means that the pipeline will reject genes showing an average expression level in the bottom 20% of all genes." + } + } + }, + "stability_scoring_options": { + "title": "Stability scoring options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Options relating to the assessment of stability for each gene.", + "properties": { + "candidate_selection_descriptor": { + "type": "string", + "description": "Statistic descriptor for prior gene candidate selection. Either coefficient of variation (cv), or robust coefficient of variation on the median (rcvm).", + "fa_icon": "far fa-chart-bar", + "enum": ["cv", "rcvm"], + "default": "cv", + "help_text": "Candidate genes are chosen based on a certain statistical descriptor. Set this parameter to modify the descriptor used." }, "nb_top_gene_candidates": { "type": "integer", - "description": "Number of candidate genes to keep in the final list.", - "fa_icon": "fas fa-chart-simple", + "description": "Number of candidate genes to keep for stability scoring", + "fa_icon": "fas fa-sort-numeric-up-alt", "minimum": 1, - "default": 1000, - "help_text": "Number of candidate genes to keep in the final list. These candidates genes are chosen as the ones showing the least standard variation. Default is 1000." + "help_text": "Number of candidate genes to keep in the final list. These candidate genes are chosen as the ones showing the least variation according to the selected descriptor." }, - "ks_pvalue_threshold": { - "type": "number", - "description": "Threshold for KS p-value for considering samples counts as a uniform distribution.", - "fa_icon": "fas fa-chart-simple", - "maximum": 1, - "default": 0, - "help_text": "P-value threshold for the Kolmogorov-Smirnov test of samples counts against a uniform distribution. Samples showing a p-value equal or below this threshold are considered not uniform and will therefore not be considered for computation of the stability score. Examples: `0`, `'0.05'`, `'1E-27'`. Provide a negative value to disable this filter. By default, all samples showing a pvalue of 0 will be discarded."
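As a usage note for the statistical and candidate-selection parameters above, here is a speculative command combining them; only the flag names are taken from the schema, the species and values are arbitrary.

```bash
# Sketch: CPM normalisation (no genome annotation needed), a stricter expression
# filter, and rcvm-based candidate selection with a smaller candidate pool.
nextflow run nf-core/stableexpression \
    -profile docker \
    --species 'marmota_marmota' \
    --normalisation_method cpm \
    --quantile_norm_target_distrib normal \
    --min_expr_threshold 0.3 \
    --candidate_selection_descriptor rcvm \
    --nb_top_gene_candidates 2000 \
    --outdir results
```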
+ "run_genorm": { + "type": "boolean", + "description": "Run Genorm", + "fa_icon": "fas fa-check", + "help": "Genorm is not run by default. To run and get additional information about gene stability, set this parameter to true. Moreover, you can integrate the Genorm M-measure in the stability score by modifying the score weights with --stability_score_weights." + }, + "stability_score_weights": { + "type": "string", + "description": "Weights for stability score calculation", + "fa_icon": "fas fa-balance-scale", + "help_text": "Weights for Coefficient of Variation / Robust Coefficient of Variation on Median / Normfinder / Genorm respectively. Must be a comma-separated string. Example: 0.8,0.1,0.1,0", + "pattern": "^\\d+(\\.\\d+)?,\\d+(\\.\\d+)?,\\d+(\\.\\d+)?,\\d+(\\.\\d+)?$" + } + } + }, + "scalability_options": { + "title": "Scalability options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Options to improve pipeline scalability and robustness", + "properties": { + "random_sampling_size": { + "type": "integer", + "description": "Number of public dataset samples to choose randomly before downloading.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "When dealing with species for which there is a large number (eg. >10000) of samples considering all the downloaded datasets, users may encounter RAM issues (eg. errors with `137` exit codes). In such cases, it is recommended to sample a random subset of these datasets to reduce the computational load. A first subsampling is performedduring the search for Expression Atlas accessions. In case there is still room for datasets and if the `--fetch_geo_accessions` flag was set, a second ssubsampling is performed during the search for NCBI GEO accessions." + }, + "random_sampling_seed": { + "type": "integer", + "description": "Seed for dataset random sampling.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 0, + "help_text": "Seed for dataset random sampling. This ensures reproducibility of the random sampling process. Changing the seed will result in a different random sample being selected." } } }, @@ -233,6 +378,7 @@ "type": "string", "description": "File size limit when attaching MultiQC reports to summary emails.", "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", "fa_icon": "fas fa-file-upload", "hidden": true }, @@ -286,6 +432,18 @@ "fa_icon": "far calendar", "description": "Suffix to add to the trace report filename. Default is the date and time in the format yyyy-MM-dd_HH-mm-ss.", "hidden": true + }, + "help": { + "type": ["boolean", "string"], + "description": "Display the help message." + }, + "help_full": { + "type": "boolean", + "description": "Display the full detailed help message." + }, + "show_hidden": { + "type": "boolean", + "description": "Display hidden parameters in the help message (only works when --help or --help_full are provided)." } } } @@ -295,7 +453,7 @@ "$ref": "#/$defs/input_output_options" }, { - "$ref": "#/$defs/expression_atlas_options" + "$ref": "#/$defs/public_data_options" }, { "$ref": "#/$defs/idmapping_options" @@ -303,6 +461,12 @@ { "$ref": "#/$defs/statistical_options" }, + { + "$ref": "#/$defs/stability_scoring_options" + }, + { + "$ref": "#/$defs/scalability_options" + }, { "$ref": "#/$defs/institutional_config_options" }, diff --git a/nf-test.config b/nf-test.config index 56ad9f69..a0a009fd 100644 --- a/nf-test.config +++ b/nf-test.config @@ -1,13 +1,30 @@ config { + // location for all nf-test tests + testsDir "." 
+ // nf-test directory including temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // location of an optional nextflow.config file specific for executing tests testsDir "tests" workDir ".nf-test" configFile "tests/nextflow.config" - profile "docker" + + // ignore tests coming from the nf-core/modules repo + ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' + + // run all tests with defined profile(s) from the main nextflow.config + //profile "apptainer" + + // list of filenames or patterns that should trigger a full test run + triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + + // load the necessary plugins requires ( - "nf-test": "0.9.2" + "nf-test": "0.9.3" ) plugins { + load "nft-utils@0.0.3" load "nft-csv@0.1.0" } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..5fbcbac4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.ruff.lint] +# Avoid enforcing line-length violations (`E501`) +ignore = ["E501"] + +[tool.ruff.format] +# Use double quotes when formatting. +quote-style = "double" +indent-style = "space" + +[tool.basedpyright] +reportUnusedCallResult = "none" +reportUnknownMemberType = "none" +reportUnknownVariableType = "none" +reportUnknownParameterType = "none" +reportUnknownArgumentType = "none" +reportAny = "none" +reportImplicitRelativeImport = "none" diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index e33dbd05..7532dcc9 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,8 +22,8 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "InProgress", - "datePublished": "2025-05-08T08:00:57+00:00", - "description": "

\n \n \n \"nf-core/stableexpression\"\n \n

\n\n[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A525.04.00-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/stableexpression** is a bioinformatics pipeline that ...\n\n\n\n\n1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\n\nNow, you can run the pipeline using:\n\n\n\n```bash\nnextflow run nf-core/stableexpression \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/stableexpression/usage) and the [parameter documentation](https://nf-co.re/stableexpression/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/stableexpression/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/stableexpression/output).\n\n## Credits\n\nnf-core/stableexpression was originally written by Olivier Coen.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#stableexpression` channel](https://nfcore.slack.com/channels/stableexpression) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "datePublished": "2025-12-08T14:37:14+00:00", + "description": "

\n \n \n \"nf-core/stableexpression\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/stableexpression)\n[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with apptainer](https://custom-icon-badges.demolab.com/badge/run%20with-apptainer-4545?logo=apptainer&color=teal&labelColor=000000)](https://apptainer.org/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/stableexpression** is a bioinformatics pipeline aiming to aggregate multiple count datasets (public / provided by the user) for a specific species and find the most stable genes.\n\n

\n \n

\n\nIt takes as main inputs:\n\n- a species name (mandatory)\n- keywords for Expression Atlas / GEO search (optional)\n- a CSV input file listing your own raw / normalised count datasets (optional).\n\n**Use cases**:\n\n- **find the most suitable genes as RT-qPCR reference genes for a specific species (and optionally specific conditions)**\n- download all Expression Atlas and / or NCBI GEO datasets for a species (and optionally keywords)\n\n## Basic usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nTo search for the most stable genes in a species considering all public datasets, simply run:\n\n```bash\nnextflow run nf-core/stableexpression \\\n -profile \\\n --species \\\n --outdir \n```\n\n## More advanced usage\n\nFor more specific scenarios, like:\n\n- **fetching only specific conditions**\n- **using your own expression dataset(s)**\n\nplease refer to the [usage documentation](https://nf-co.re/stableexpression/usage).\n\n## Profiles\n\nSee [here](https://nf-co.re/stableexpression/usage#profiles) for more information about profiles.\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/stableexpression/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/stableexpression/output).\n\n## Support us\n\nIf you like nf-core/stableexpression, please make sure you give it a star on GitHub.\n\n[![stars - stableexpression](https://img.shields.io/github/stars/nf-core/stableexpression?style=social)](https://github.com/nf-core/stableexpression)\n\n## Credits\n\nnf-core/stableexpression was originally written by Olivier Coen.\n\n\n\n\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#stableexpression` channel](https://nfcore.slack.com/channels/stableexpression) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -31,6 +31,9 @@ { "@id": "assets/" }, + { + "@id": "bin/" + }, { "@id": "conf/" }, @@ -99,7 +102,7 @@ }, "mentions": [ { - "@id": "#ea61203f-47c5-4f0f-bd91-43195b3225f1" + "@id": "#b88c0077-fb2a-4dd6-93f0-ba79d2516560" } ], "name": "nf-core/stableexpression" @@ -132,7 +135,7 @@ } ], "dateCreated": "", - "dateModified": "2025-05-08T10:00:57Z", + "dateModified": "2025-12-08T15:37:14Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nf-core", @@ -176,14 +179,14 @@ "url": { "@id": "https://www.nextflow.io/" }, - "version": "!>=25.04.00" + "version": "!>=25.04.0" }, { - "@id": "#ea61203f-47c5-4f0f-bd91-43195b3225f1", + "@id": "#b88c0077-fb2a-4dd6-93f0-ba79d2516560", "@type": "TestSuite", "instance": [ { - "@id": "#6c536155-37c4-45b0-87c2-977dd7e02196" + "@id": "#2468a657-9383-474d-8030-d66d92db7f20" } ], "mainEntity": { @@ -192,10 +195,10 @@ "name": "Test suite for nf-core/stableexpression" }, { - "@id": "#6c536155-37c4-45b0-87c2-977dd7e02196", + "@id": "#2468a657-9383-474d-8030-d66d92db7f20", "@type": "TestInstance", "name": "GitHub Actions workflow for testing nf-core/stableexpression", - "resource": "repos/nf-core/stableexpression/actions/workflows/ci.yml", + "resource": "repos/nf-core/stableexpression/actions/workflows/nf-test.yml", "runsOn": { "@id": "https://w3id.org/ro/terms/test#GithubService" }, @@ -214,6 +217,11 @@ "@type": "Dataset", "description": "Additional files" }, + { + "@id": "bin/", + "@type": "Dataset", + "description": "Scripts that must be callable from a pipeline process" + }, { "@id": "conf/", "@type": "Dataset", @@ -327,4 +335,4 @@ "name": "Olivier Coen" } ] -} +} \ No newline at end of file diff --git a/subworkflows/local/base_statistics/main.nf b/subworkflows/local/base_statistics/main.nf new file mode 100644 index 00000000..8ac6ff15 --- /dev/null +++ b/subworkflows/local/base_statistics/main.nf @@ -0,0 +1,35 @@ +include { COMPUTE_BASE_STATISTICS as COMPUTE_GLOBAL_STATISTICS } from '../../../modules/local/compute_base_statistics' +include { COMPUTE_BASE_STATISTICS as COMPUTE_PLATFORM_STATISTICS } from '../../../modules/local/compute_base_statistics' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE GLOBAL AND PLATFORM-SPECIFIC BASE STATISTICS +======================================================================================== +*/ + +workflow BASE_STATISTICS { + + take: + ch_all_counts // [ [ platform: platform, dataset_size: size], file ] + ch_platform_counts // [ [ platform: platform, dataset_size: size], file ] + + main: + + // ----------------------------------------------------------------- + // PLATFORM-SPECIFIC STATISTICS + // ----------------------------------------------------------------- + + COMPUTE_PLATFORM_STATISTICS( ch_platform_counts ) + + + // ----------------------------------------------------------------- + // ALL DATA + // ----------------------------------------------------------------- + + COMPUTE_GLOBAL_STATISTICS( ch_all_counts ) + + emit: + stats = COMPUTE_GLOBAL_STATISTICS.out.stats + platform_stats = COMPUTE_PLATFORM_STATISTICS.out.stats + +} diff --git a/subworkflows/local/download_public_datasets/main.nf b/subworkflows/local/download_public_datasets/main.nf new file mode 100644 index 00000000..6ea3f11f --- /dev/null +++ b/subworkflows/local/download_public_datasets/main.nf @@ -0,0 +1,66 @@ +include
{ EXPRESSIONATLAS_GETDATA as EXPRESSION_ATLAS } from '../../../modules/local/expressionatlas/getdata' +include { GEO_GETDATA as GEO } from '../../../modules/local/geo/getdata' + +include { addDatasetIdToMetadata } from '../utils_nfcore_stableexpression_pipeline' +include { groupFilesByDatasetId } from '../utils_nfcore_stableexpression_pipeline' +include { augmentMetadata } from '../utils_nfcore_stableexpression_pipeline' + +/* +======================================================================================== + SUBWORKFLOW TO DOWNLOAD EXPRESSION ATLAS AND GEO DATASETS +======================================================================================== +*/ + +workflow DOWNLOAD_PUBLIC_DATASETS { + + take: + species + ch_accessions + + + main: + + ch_datasets = channel.empty() + ch_fetched_accessions = channel.empty() + + ch_accessions = ch_accessions + .branch { acc -> + eatlas: acc.startsWith('E-') + geo: acc.startsWith('GSE') + } + + // ------------------------------------------------------------------------------------ + // DOWNLOAD EXPRESSION ATLAS DATASETS + // ------------------------------------------------------------------------------------ + + // Downloading Expression Atlas data for each accession in ch_accessions + EXPRESSION_ATLAS( ch_accessions.eatlas ) + + // ------------------------------------------------------------------------------------ + // DOWNLOAD GEO DATASETS + // ------------------------------------------------------------------------------------ + + // Downloading GEO datasets for each accession in ch_accessions + GEO( + ch_accessions.geo, + species + ) + + ch_downloaded_counts = EXPRESSION_ATLAS.out.counts.mix ( GEO.out.counts ) + ch_downloaded_design = EXPRESSION_ATLAS.out.design.mix ( GEO.out.design ) + + // adding dataset id (accession + data_type) in the file meta + // flattening in case multiple files are returned at once + ch_counts = addDatasetIdToMetadata( ch_downloaded_counts.flatten() ) + ch_design = addDatasetIdToMetadata( ch_downloaded_design.flatten() ) + + // adding design files to the meta of their respective count files + ch_datasets = groupFilesByDatasetId( ch_design, ch_counts ) + + // adding normalisation state in the meta + ch_datasets = augmentMetadata( ch_datasets ) + + emit: + datasets = ch_datasets + +} diff --git a/subworkflows/local/expression_normalisation/main.nf b/subworkflows/local/expression_normalisation/main.nf index 34249204..0223d8c1 100644 --- a/subworkflows/local/expression_normalisation/main.nf +++ b/subworkflows/local/expression_normalisation/main.nf @@ -1,17 +1,8 @@ -// -// Subworkflow with functionality specific to the nf-core/stableexpression pipeline -// +include { NORMALISATION_COMPUTE_CPM as COMPUTE_CPM } from '../../../modules/local/normalisation/compute_cpm' +include { NORMALISATION_COMPUTE_TPM as COMPUTE_TPM } from '../../../modules/local/normalisation/compute_tpm' +include { QUANTILE_NORMALISATION } from '../../../modules/local/quantile_normalisation' -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { DESEQ2_NORMALISE } from '../../../modules/local/deseq2/normalise/main' -include { EDGER_NORMALISE } from '../../../modules/local/edger/normalise/main' -include { QUANTILE_NORMALISE } from '../../../modules/local/quantile_normalisation/main' -include { DATASET_STATISTICS } from
'../../../modules/local/dataset_statistics/main' +include { GET_TRANSCRIPT_LENGTHS } from '../../../subworkflows/local/get_transcript_lengths' /* ======================================================================================== @@ -22,9 +13,11 @@ include { DATASET_STATISTICS } from '../../../modules/local/da workflow EXPRESSION_NORMALISATION { take: + species ch_datasets normalisation_method - + quantile_norm_target_distrib + gene_length main: @@ -39,15 +32,34 @@ workflow EXPRESSION_NORMALISATION { normalised: meta.normalised == true } - ch_raw_rnaseq_datasets = ch_datasets.raw.filter { meta, file -> meta.platform == 'rnaseq' } + ch_raw_rnaseq_datasets_to_normalise = ch_datasets.raw.filter { meta, file -> meta.platform == 'rnaseq' } + + if ( normalisation_method == 'tpm' ) { + + if ( params.gene_length ) { + + ch_gene_length_file = channel.fromPath( params.gene_length, checkIfExists: true ) + + } else { + + // download the genome annotation + // and compute the length of the longest transcript per gene + GET_TRANSCRIPT_LENGTHS (species) + ch_gene_length_file = GET_TRANSCRIPT_LENGTHS.out.csv + + } + + COMPUTE_TPM( + ch_raw_rnaseq_datasets_to_normalise, + ch_gene_length_file + ) + ch_raw_rnaseq_datasets_normalised = COMPUTE_TPM.out.counts + + } else { // 'cpm' - if ( normalisation_method == 'deseq2' ) { - DESEQ2_NORMALISE( ch_raw_rnaseq_datasets ) - ch_raw_rnaseq_datasets_normalised = DESEQ2_NORMALISE.out.cpm + COMPUTE_CPM( ch_raw_rnaseq_datasets_to_normalise ) + ch_raw_rnaseq_datasets_normalised = COMPUTE_CPM.out.counts - } else { // 'edger' - EDGER_NORMALISE( ch_raw_rnaseq_datasets ) - ch_raw_rnaseq_datasets_normalised = EDGER_NORMALISE.out.cpm } // @@ -55,21 +67,13 @@ workflow EXPRESSION_NORMALISATION { // // putting all normalised count datasets together and performing quantile normalisation - ch_datasets.normalised.concat( ch_raw_rnaseq_datasets_normalised ) | QUANTILE_NORMALISE - ch_quantile_normalised_datasets = QUANTILE_NORMALISE.out.counts - - // - // MODULE: Dataset statistics - // + QUANTILE_NORMALISATION ( + ch_datasets.normalised.mix( ch_raw_rnaseq_datasets_normalised ), + quantile_norm_target_distrib + ) - DATASET_STATISTICS( ch_quantile_normalised_datasets ) emit: - normalised_counts = ch_quantile_normalised_datasets - dataset_statistics = DATASET_STATISTICS.out.stats + counts = QUANTILE_NORMALISATION.out.counts } - - - - diff --git a/subworkflows/local/expressionatlas_fetchdata/main.nf b/subworkflows/local/expressionatlas_fetchdata/main.nf deleted file mode 100644 index 439ebd86..00000000 --- a/subworkflows/local/expressionatlas_fetchdata/main.nf +++ /dev/null @@ -1,140 +0,0 @@ -// -// Subworkflow with functionality specific to the nf-core/stableexpression pipeline -// - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -include { EXPRESSIONATLAS_GETACCESSIONS } from '../../../modules/local/expressionatlas/getaccessions/main' -include { EXPRESSIONATLAS_GETDATA } from '../../../modules/local/expressionatlas/getdata/main' - -/* -======================================================================================== - SUBWORKFLOW TO DOWNLOAD EXPRESSIONATLAS ACCESSIONS AND DATASETS -======================================================================================== -*/ - -workflow EXPRESSIONATLAS_FETCHDATA { - - take: - ch_species - eatlas_accessions - eatlas_keywords - 
fetch_eatlas_accessions - - - main: - - ch_accessions = Channel.fromList( eatlas_accessions.tokenize(',') ) - - // fetching Expression Atlas accessions if applicable - if ( fetch_eatlas_accessions || eatlas_keywords ) { - - // - // MODULE: Expression Atlas - Get accessions - // - ch_eatlas_keywords = Channel.value( eatlas_keywords ) - - // getting Expression Atlas accessions given a species name and keywords - // keywords can be an empty string - EXPRESSIONATLAS_GETACCESSIONS( ch_species, ch_eatlas_keywords ) - - // appending to accessions provided by the user - // ensures that no accessions is present twice (provided by the user and fetched from E. Atlas) - // removing E-PROT- accessions - ch_accessions = ch_accessions - .concat( EXPRESSIONATLAS_GETACCESSIONS.out.txt.splitText() ) - .unique() - .map { it -> it.trim() } - .filter { it.startsWith('E-') && !it.startsWith('E-PROT-') } - } - - // - // MODULE: Expression Atlas - Get data - // - - // Downloading Expression Atlas data for each accession in ch_accessions - EXPRESSIONATLAS_GETDATA( ch_accessions ) - - // adding dataset id (accession + data_type) in the file meta - ch_etlas_design = addDatasetIdToMetadata( EXPRESSIONATLAS_GETDATA.out.design.flatten() ) - ch_eatlas_counts = addDatasetIdToMetadata( EXPRESSIONATLAS_GETDATA.out.counts.flatten() ) - - // adding design files to the meta of their respective count files - ch_eatlas_datasets = groupFilesByDatasetId( ch_etlas_design, ch_eatlas_counts ) - - // adding normalisation state in the meta - augmentToMetadata( ch_eatlas_datasets ) - - emit: - downloaded_datasets = ch_eatlas_datasets - -} - - - -/* -======================================================================================== - FUNCTIONS -======================================================================================== -*/ - - -// -// Get Expression Atlas Batch ID (accession + data_type) from file stem -// -def addDatasetIdToMetadata( ch_files ) { - return ch_files - .map { - file -> - def meta = [dataset: file.getSimpleName()] - [meta, file] - } -} - -// -// Groups design and data files by accession and data_type -// Design and count files have necessarily the same dataset ID (same file stem) -// -def groupFilesByDatasetId(ch_design, ch_counts) { - return ch_design - .concat( ch_counts ) // puts counts at the end of the resulting channel - .groupTuple() // groups by dataset ID; design files are necessarily BEFORE count files - .filter { - it.get(1).size() == 2 // only groups with two files - } - .filter { // only groups with first file as design file and second one as count file - meta, files -> - files.get(0).name.endsWith('.design.csv') && !files.get(1).name.endsWith('.design.csv') - } - .map { // putting design file in meta - meta, files -> - def new_meta = meta + [design: files[0]] - [new_meta, files[1]] - } -} - -def getNthPartFromEnd(String s, int n) { - def tokens = s.tokenize('.') - return tokens[tokens.size() - n] -} - -// -// Add normalised: true / false in meta -// -def augmentToMetadata( ch_files ) { - return ch_files - .map { - meta, file -> - if ( getNthPartFromEnd(file.name, 3) == 'raw' ) { - meta.normalised = false - } else { - meta.normalised = true - } - meta.platform = getNthPartFromEnd(file.name, 4) - [meta, file] - } -} diff --git a/subworkflows/local/filter_datasets/main.nf b/subworkflows/local/filter_datasets/main.nf new file mode 100644 index 00000000..fbe44e88 --- /dev/null +++ b/subworkflows/local/filter_datasets/main.nf @@ -0,0 +1,26 @@ +include { REMOVE_SAMPLES_NOT_VALID } from 
'../../../modules/local/remove_samples_not_valid' + + /* +======================================================================================== + SUBWORKFLOW TO FILTER DATASETS +======================================================================================== +*/ + +workflow FILTER_DATASETS { + + take: + ch_counts + + main: + + // ----------------------------------------------------------------- + // REMOVE SAMPLES WITH TOO MANY ZEROS + // ----------------------------------------------------------------- + + REMOVE_SAMPLES_NOT_VALID ( ch_counts ) + + emit: + counts = REMOVE_SAMPLES_NOT_VALID.out.counts + +} diff --git a/subworkflows/local/genorm/main.nf b/subworkflows/local/genorm/main.nf new file mode 100644 index 00000000..2a5a061b --- /dev/null +++ b/subworkflows/local/genorm/main.nf @@ -0,0 +1,80 @@ +// +// Subworkflow with functionality specific to the nf-core/stableexpression pipeline +// + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { MAKE_CHUNKS } from '../../../modules/local/genorm/make_chunks' +include { CROSS_JOIN } from '../../../modules/local/genorm/cross_join' +include { EXPRESSION_RATIO } from '../../../modules/local/genorm/expression_ratio' +include { RATIO_STANDARD_VARIATION } from '../../../modules/local/genorm/ratio_standard_variation' +include { COMPUTE_M_MEASURE } from '../../../modules/local/genorm/compute_m_measure' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE PAIRWISE GENE VARIATION +======================================================================================== +*/ + +workflow GENORM { + + take: + ch_counts + + + main: + + MAKE_CHUNKS( ch_counts ) + + // we need to flatten to set each chunk file as a separate item in the channel + ch_count_chunks = MAKE_CHUNKS.out.chunks.flatten() + getUniqueFilePairs( ch_count_chunks ) | CROSS_JOIN + + CROSS_JOIN.out.data | EXPRESSION_RATIO + + EXPRESSION_RATIO.out.data | RATIO_STANDARD_VARIATION + + COMPUTE_M_MEASURE( + ch_counts, + RATIO_STANDARD_VARIATION.out.data.collect( sort: true ) + ) + + emit: + m_measures = COMPUTE_M_MEASURE.out.m_measures + +} + + + +/* +======================================================================================== + FUNCTIONS +======================================================================================== +*/ + +// +// Generate channels consisting of unique pairs of chunk files +// +def getUniqueFilePairs( ch_count_chunks ) { + + def ch_count_chunks_with_indexes = ch_count_chunks + .map { file -> [file.name.tokenize('.')[1], file] } // extract file index + + return ch_count_chunks_with_indexes + .combine( ch_count_chunks_with_indexes ) // full cartesian product with itself + .map { // steps not mandatory but helps to make the filter clearer + index_1, file_1, index_2, file_2 -> + [index_1: index_1, index_2: index_2, file_1: file_1, file_2: file_2] + } + .filter { it -> it.index_1 <= it.index_2 } // keeps only pairs where i <= j + .map { + it -> + def meta = [index_1: it.index_1, index_2: it.index_2] // puts indexes in a meta tuple + [ meta, it.file_1, it.file_2 ] + } +} diff --git a/subworkflows/local/get_public_accessions/main.nf b/subworkflows/local/get_public_accessions/main.nf new file mode 100644 index 00000000..debb29e0 --- +++ 
b/subworkflows/local/get_public_accessions/main.nf @@ -0,0 +1,137 @@ +include { EXPRESSIONATLAS_GETACCESSIONS as EXPRESSION_ATLAS } from '../../../modules/local/expressionatlas/getaccessions' +include { GEO_GETACCESSIONS as GEO } from '../../../modules/local/geo/getaccessions' + +/* +======================================================================================== + SUBWORKFLOW TO FETCH PUBLIC ACCESSIONS FROM EXPRESSION ATLAS AND GEO +======================================================================================== +*/ + +workflow GET_PUBLIC_ACCESSIONS { + + take: + species + skip_fetch_eatlas_accessions + fetch_geo_accessions + platform + keywords + ch_accessions + ch_accessions_file + ch_excluded_accessions + ch_excluded_accessions_file + random_sampling_size + random_sampling_seed + outdir + + main: + + ch_fetched_eatlas_accessions = channel.empty() + ch_fetched_geo_accessions = channel.empty() + ch_sampling_quota = channel.of( "ok" ) + + // ----------------------------------------------------------------- + // GET EATLAS ACCESSIONS + // ----------------------------------------------------------------- + + // fetching Expression Atlas accessions if applicable + if ( !skip_fetch_eatlas_accessions ) { + + // getting Expression Atlas accessions given a species name and keywords + // keywords can be an empty string + EXPRESSION_ATLAS( + species, + keywords, + platform?: [], + random_sampling_size?: [], + random_sampling_seed?: [] + ) + + ch_fetched_eatlas_accessions = EXPRESSION_ATLAS.out.accessions.splitText() + ch_sampling_quota = EXPRESSION_ATLAS.out.sampling_quota + + } + + // ------------------------------------------------------------------------------------ + // GET GEO ACCESSIONS + // ------------------------------------------------------------------------------------ + + // fetching GEO accessions if applicable + if ( fetch_geo_accessions ) { + + // all Expression Atlas accessions starting with E-GEOD- are imported from GEO + // we do not want to collect these GEO datasets again if we already get them from Expression Atlas + ch_excluded_eatlas_accessions_file = ch_fetched_eatlas_accessions + .filter { accession -> accession.startsWith("E-GEOD-") } + .map { accession -> accession.replace("E-GEOD-", "GSE") } + .collectFile( + name: 'excluded_geo_accessions.txt', + storeDir: "${outdir}/geo/", + sort: true, + newLine: true + ) + .ifEmpty( [] ) + + // trick to avoid fetching accessions from GEO when the sampling quota is already exceeded + ch_species = channel.of( species ) + .combine( ch_sampling_quota ) + .filter { species_name, quota -> quota == "ok" } + .map { species_name, quota -> species_name } + + // getting GEO accessions given a species name and keywords + // keywords can be an empty string + GEO( + ch_species, + keywords, + platform?: [], + ch_excluded_eatlas_accessions_file, + random_sampling_size?: [], + random_sampling_seed?: [] + ) + + ch_fetched_geo_accessions = GEO.out.accessions.splitText() + } + + // ----------------------------------------------------------------- + // MERGING AND EXCLUDING UNWANTED ACCESSIONS + // ----------------------------------------------------------------- + + // getting accessions to exclude and preparing them in the right format + ch_excluded_accessions = ch_excluded_accessions + .mix( ch_excluded_accessions_file.splitText() ) + .unique() + .map { acc -> acc.trim() } + .toList() + .map { lst -> [lst] } // list of lists: mandatory for the combine in the next step + + ch_fetched_public_accessions = ch_fetched_eatlas_accessions + .mix(
ch_fetched_geo_accessions ) + .map { acc -> acc.trim() } + .filter { acc -> + (acc.startsWith('E-') || acc.startsWith('GSE')) && !acc.startsWith('E-PROT-') + } + .combine ( ch_excluded_accessions ) + .filter { accession, excluded_accessions -> !(accession in excluded_accessions) } + .map { accession, excluded_accessions -> accession } + + // ----------------------------------------------------------------- + // ADDING USER PROVIDED ACCESSIONS + // ----------------------------------------------------------------- + + ch_input_accessions = ch_accessions + .mix( ch_accessions_file.splitText() ) + .unique() + .map { acc -> acc.trim() } + + // appending fetched public accessions to the accessions provided by the user + // ensures that no accession is present twice (provided by the user and fetched from Expression Atlas / GEO) + // (E-PROT- accessions, which are not supported in subsequent steps, and excluded accessions + // have already been filtered out of the fetched channel above) + ch_all_accessions = ch_input_accessions + .mix( ch_fetched_public_accessions ) + .unique() + .map { acc -> acc.trim() } + + emit: + accessions = ch_all_accessions + +} diff --git a/subworkflows/local/get_transcript_lengths/main.nf b/subworkflows/local/get_transcript_lengths/main.nf new file mode 100644 index 00000000..16c73707 --- /dev/null +++ b/subworkflows/local/get_transcript_lengths/main.nf @@ -0,0 +1,30 @@ +include { COMPUTE_GENE_TRANSCRIPT_LENGTHS } from '../../../modules/local/compute_gene_transcript_lengths' +include { DOWNLOAD_ENSEMBL_ANNOTATION } from '../../../modules/local/download_ensembl_annotation' + + +/* +======================================================================================== + SUBWORKFLOW TO DOWNLOAD ENSEMBL ANNOTATIONS AND COMPUTE GENE TRANSCRIPT LENGTHS +======================================================================================== +*/ + +workflow GET_TRANSCRIPT_LENGTHS { + + take: + species + + main: + + DOWNLOAD_ENSEMBL_ANNOTATION (species) + ch_annotation = DOWNLOAD_ENSEMBL_ANNOTATION.out.gff3 + + COMPUTE_GENE_TRANSCRIPT_LENGTHS (ch_annotation) + + + + emit: + csv = COMPUTE_GENE_TRANSCRIPT_LENGTHS.out.csv + + + +} diff --git a/subworkflows/local/idmapping/main.nf b/subworkflows/local/idmapping/main.nf new file mode 100644 index 00000000..cce21a54 --- /dev/null +++ b/subworkflows/local/idmapping/main.nf @@ -0,0 +1,138 @@ +include { CLEAN_GENE_IDS } from '../../../modules/local/clean_gene_ids' +include { COLLECT_GENE_IDS } from '../../../modules/local/collect_gene_ids' +include { GPROFILER_IDMAPPING } from '../../../modules/local/gprofiler/idmapping' +include { DETECT_RARE_GENES } from '../../../modules/local/detect_rare_genes' +include { FILTER_AND_RENAME_GENES } from '../../../modules/local/filter_and_rename_genes' + +/* +======================================================================================== + SUBWORKFLOW TO MAP GENE IDS TO A COMMON TARGET DATABASE +======================================================================================== +*/ + +workflow ID_MAPPING { + + take: + ch_counts + species + skip_id_mapping + gprofiler_target_db + custom_gene_id_mapping + custom_gene_metadata + min_occurrence_freq + min_occurrence_quantile + outdir + + main: + + ch_gene_id_mapping = channel.empty() + ch_gene_metadata = channel.empty() + ch_valid_gene_ids = channel.empty() + + if ( !skip_id_mapping ) { + + // ----------------------------------------------------------------- + // CLEANING GENE IDS + // ----------------------------------------------------------------- + + CLEAN_GENE_IDS ( ch_counts ) + ch_counts = CLEAN_GENE_IDS.out.counts +
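// [editor's note, illustrative only] reassigning ch_counts here swaps the cleaned counts in for + // every downstream step of this subworkflow; judging from the other subworkflows in this diff, each + // item should be a tuple like [ [dataset: 'E-MTAB-1234', platform: 'rnaseq', normalised: false, design: ...], counts_file ]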
ch_cleaned_gene_ids = CLEAN_GENE_IDS.out.gene_ids + + // ----------------------------------------------------------------- + // COLLECTING ALL CLEANED GENE IDS FROM ALL DATASETS + // ----------------------------------------------------------------- + + // sorting files in order to have a consistent input and be able to retry + COLLECT_GENE_IDS( + ch_cleaned_gene_ids.toSortedList() + ) + + // ----------------------------------------------------------------- + // MAPPING THESE GENE IDS TO THE CHOSEN TARGET DB + // ----------------------------------------------------------------- + + GPROFILER_IDMAPPING( + COLLECT_GENE_IDS.out.unique_gene_ids, + species, + gprofiler_target_db + ) + ch_gene_id_mapping = GPROFILER_IDMAPPING.out.mapping + ch_gene_metadata = GPROFILER_IDMAPPING.out.metadata + + // ----------------------------------------------------------------- + // FILTERING OUT GENE IDS THAT DO NOT HAVE ENOUGH OCCURRENCES + // ----------------------------------------------------------------- + + DETECT_RARE_GENES( + ch_gene_id_mapping, + COLLECT_GENE_IDS.out.gene_id_occurrences, + ch_counts.count(), + min_occurrence_freq, + min_occurrence_quantile + ) + ch_valid_gene_ids = DETECT_RARE_GENES.out.valid_gene_ids + } + + // ----------------------------------------------------------------- + // COLLECTING GLOBAL GENE ID MAPPING AND METADATA + // ----------------------------------------------------------------- + + ch_global_gene_id_mapping = ch_gene_id_mapping + .mix( + custom_gene_id_mapping ? + channel.fromPath( custom_gene_id_mapping, checkIfExists: true ) : + channel.empty() + ) + .splitCsv( header: true ) + .unique() + .collectFile( + name: 'global_gene_id_mapping.csv', + seed: "original_gene_id,gene_id", + newLine: true, + storeDir: "${outdir}/idmapping/", + sort: true + ) { + item -> "${item["original_gene_id"]},${item["gene_id"]}" + } + + ch_global_gene_metadata = ch_gene_metadata + .mix( + custom_gene_metadata ? 
+ channel.fromPath( custom_gene_metadata, checkIfExists: true ) : + channel.empty() + ) + .splitCsv( header: true ) + .unique() + .collectFile( + name: 'global_gene_metadata.csv', + seed: "gene_id,name,description", + newLine: true, + storeDir: "${outdir}/idmapping/", + sort: true + ) { + item -> "${item["gene_id"]},${item["name"]},${item["description"]}" + } + + // ----------------------------------------------------------------- + // RENAMING GENE IDS IN ALL COUNT DATASETS (ONLY IF NECESSARY) + // ----------------------------------------------------------------- + + if ( !skip_id_mapping || custom_gene_id_mapping ) { + + FILTER_AND_RENAME_GENES( + ch_counts, + ch_global_gene_id_mapping.first(), + ch_valid_gene_ids.collect() + ) + ch_counts = FILTER_AND_RENAME_GENES.out.counts + + } + + + emit: + counts = ch_counts + mapping = ch_global_gene_id_mapping + metadata = ch_global_gene_metadata + +} diff --git a/subworkflows/local/merge_data/main.nf b/subworkflows/local/merge_data/main.nf new file mode 100644 index 00000000..5941757e --- /dev/null +++ b/subworkflows/local/merge_data/main.nf @@ -0,0 +1,126 @@ +include { MERGE_COUNTS as MERGE_PLATFORM_COUNTS } from '../../../modules/local/merge_counts' +include { MERGE_COUNTS as MERGE_ALL_COUNTS } from '../../../modules/local/merge_counts' + +/* +======================================================================================== + SUBWORKFLOW TO MERGE COUNT DATASETS, DESIGNS AND GENE ID MAPPINGS +======================================================================================== +*/ + +workflow MERGE_DATA { + + take: + ch_normalised_counts + ch_gene_id_mapping + ch_gene_metadata + outdir + + main: + + // ----------------------------------------------------------------- + // MERGE COUNTS FOR EACH PLATFORM SEPARATELY + // ----------------------------------------------------------------- + + + ch_normalised_rnaseq_counts = ch_normalised_counts.filter { meta, file -> meta.platform == "rnaseq" } + ch_normalised_microarray_counts = ch_normalised_counts.filter { meta, file -> meta.platform == "microarray" } + + ch_collected_rnaseq_counts = ch_normalised_rnaseq_counts + .map { meta, file -> file } + .collect( sort: true ) + .map { files -> [ [ platform: "rnaseq" ], files ] } + + ch_collected_microarray_counts = ch_normalised_microarray_counts + .map { meta, file -> file } + .collect( sort: true ) + .map { files -> [ [ platform: "microarray" ], files ] } + + MERGE_PLATFORM_COUNTS ( + ch_collected_rnaseq_counts.concat( ch_collected_microarray_counts ) + ) + + ch_platform_counts = MERGE_PLATFORM_COUNTS.out.counts + + // ----------------------------------------------------------------- + // MERGE ALL COUNTS + // ----------------------------------------------------------------- + + ch_collected_merged_counts = ch_platform_counts + .map { meta, file -> file } + .collect( sort: true ) + .map { files -> [ [ platform: "all" ], files ] } + + MERGE_ALL_COUNTS( ch_collected_merged_counts ) + + // ----------------------------------------------------------------- + // MERGE ALL DESIGNS INTO A SINGLE TABLE + // ----------------------------------------------------------------- + + ch_whole_design = ch_normalised_counts + .map { + meta, file -> // extracts design file and adds batch column whenever missing (for custom datasets) + def design_content = meta.design.splitCsv( header: true ) + // if there is no batch, it is custom data + def updated_design_content = design_content.collect { row -> + row.batch = row.batch ?: "custom_${meta.dataset}" + return row + } + [
updated_design_content ] + } + .flatten() + .unique() + .collectFile( + name: 'whole_design.csv', + seed: "batch,condition,sample", + newLine: true, + sort: true, + storeDir: "${outdir}/merged_datasets/" + ) { + item -> "${item.batch},${item.condition},${item.sample}" + } + + // ----------------------------------------------------------------- + // MERGE ALL GENE ID MAPPINGS + // ----------------------------------------------------------------- + + ch_whole_gene_id_mapping = ch_gene_id_mapping + .filter { it != [] } // handle case where there are no mappings + .splitCsv( header: true ) + .unique() + .collectFile( + name: 'whole_gene_id_mapping.csv', + seed: "original_gene_id,gene_id", + newLine: true, + sort: true, + storeDir: "${outdir}/idmapping/" + ) { + item -> "${item.original_gene_id},${item.gene_id}" + } + .ifEmpty([]) // handle case where there are no mappings + + // ----------------------------------------------------------------- + // MERGE ALL GENE METADATA + // ----------------------------------------------------------------- + + ch_whole_gene_metadata = ch_gene_metadata + .filter { it != [] } // handle case where there are no mappings + .splitCsv( header: true ) + .unique() + .collectFile( + name: 'whole_gene_metadata.csv', + seed: "gene_id,name,description", + newLine: true, + sort: true, + storeDir: "${outdir}/idmapping/" + ) { + item -> "${item.gene_id},${item.name},${item.description}" + } + .ifEmpty([]) // handle case where there are no mappings + + emit: + all_counts = MERGE_ALL_COUNTS.out.counts + platform_counts = ch_platform_counts + whole_design = ch_whole_design + whole_gene_id_mapping = ch_whole_gene_id_mapping + whole_gene_metadata = ch_whole_gene_metadata +} diff --git a/subworkflows/local/multiqc/main.nf b/subworkflows/local/multiqc/main.nf new file mode 100644 index 00000000..c23d46ce --- /dev/null +++ b/subworkflows/local/multiqc/main.nf @@ -0,0 +1,285 @@ +include { MULTIQC } from '../../../modules/nf-core/multiqc' +include { COLLECT_STATISTICS } from '../../../modules/local/collect_statistics' + +include { methodsDescriptionText } from '../utils_nfcore_stableexpression_pipeline' +include { paramsSummaryMultiqc } from '../../nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../../nf-core/utils_nfcore_pipeline' +include { paramsSummaryMap } from 'plugin/nf-schema' + +/* +======================================================================================== + SUBWORKFLOW TO GATHER PIPELINE STATISTICS AND GENERATE THE MULTIQC REPORT +======================================================================================== +*/ + +workflow MULTIQC_WORKFLOW { + + take: + ch_multiqc_files + ch_versions + multiqc_config + multiqc_logo + multiqc_methods_description + outdir + + main: + + // ------------------------------------------------------------------------------------ + // STATS + // ------------------------------------------------------------------------------------ + + ch_id_mapping_stats = channel.topic('id_mapping_stats') + .collectFile( + name: 'id_mapping_stats.csv', + seed: "dataset,final,merged,not_valid,unmapped", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> "${item[0]},${item[1]},${item[2]},${item[3]},${item[4]}" + } + + ch_skewness = channel.topic('skewness') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with skewness values + .collectFile( + name: 'skewness.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + + ch_ratio_zeros =
channel.topic('ratio_zeros') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with ratio values + .collectFile( + name: 'ratio_zeros.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + COLLECT_STATISTICS( + ch_skewness.mix( ch_ratio_zeros ) + ) + + // ------------------------------------------------------------------------------------ + // FAILURE / WARNING REPORTS + // ------------------------------------------------------------------------------------ + + ch_eatlas_failure_reasons = channel.topic('eatlas_failure_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'eatlas_failure_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/", + ) { + item -> "${item[0]},${item[1]}" + } + + ch_eatlas_warning_reasons = channel.topic('eatlas_warning_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'eatlas_warning_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]},${item[1]}" + } + + ch_geo_failure_reasons = channel.topic('geo_failure_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'geo_failure_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]},${item[1]}" + } + + + ch_geo_warning_reasons = channel.topic('geo_warning_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'geo_warning_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]},${item[1]}" + } + + ch_id_cleaning_failure_reasons = channel.topic('id_cleaning_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'id_cleaning_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_id_mapping_warning_reasons = channel.topic('renaming_warning_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'renaming_warning_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_id_mapping_failure_reasons = channel.topic('renaming_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'renaming_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_normalisation_warning_reasons = channel.topic('normalisation_warning_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'normalisation_warning_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_normalisation_failure_reasons = channel.topic('normalisation_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'normalisation_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + + // 
------------------------------------------------------------------------------------ + // MULTIQC FILES + // ------------------------------------------------------------------------------------ + + ch_multiqc_files = ch_multiqc_files + .mix( channel.topic('eatlas_all_datasets').collect() ) // single item + .mix( channel.topic('eatlas_selected_datasets').collect() ) // single item + .mix( channel.topic('geo_all_datasets').collect() ) // single item + .mix( channel.topic('geo_selected_datasets').collect() ) // single item + .mix( channel.topic('geo_rejected_datasets').collect() ) // single item + .mix( COLLECT_STATISTICS.out.csv ) + .mix( ch_id_mapping_stats ) + .mix( channel.topic('total_gene_id_occurrence_quantiles').collect() ) // single item + .mix( ch_eatlas_failure_reasons ) + .mix( ch_eatlas_warning_reasons ) + .mix( ch_geo_failure_reasons ) + .mix( ch_geo_warning_reasons ) + .mix( ch_id_cleaning_failure_reasons ) + .mix( ch_id_mapping_warning_reasons ) + .mix( ch_id_mapping_failure_reasons ) + .mix( ch_normalisation_failure_reasons ) + .mix( ch_normalisation_warning_reasons ) + + // ------------------------------------------------------------------------------------ + // VERSIONS + // ------------------------------------------------------------------------------------ + + // Collate and save software versions + // + def topic_versions = channel.topic("versions") + .distinct() + .branch { entry -> + versions_file: entry instanceof Path + versions_tuple: true + } + + def topic_versions_string = topic_versions.versions_tuple + .map { process, tool, version -> + [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] + } + .groupTuple(by:0) + .map { process, tool_versions -> + tool_versions.unique().sort() + "${process}:\n${tool_versions.join('\n')}" + } + + ch_collated_versions = softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) + .mix(topic_versions_string) + .collectFile( + storeDir: "${outdir}/pipeline_info", + name: 'nf_core_' + 'stableexpression_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ) + + // ------------------------------------------------------------------------------------ + // CONFIG + // ------------------------------------------------------------------------------------ + + ch_multiqc_config = channel.fromPath( + "$projectDir/assets/multiqc_config.yml", checkIfExists: true) + + ch_multiqc_custom_config = multiqc_config ? + channel.fromPath(multiqc_config, checkIfExists: true) : + channel.empty() + + ch_multiqc_logo = multiqc_logo ? + channel.fromPath(multiqc_logo, checkIfExists: true) : + channel.empty() + + summary_params = paramsSummaryMap( + workflow, + parameters_schema: "nextflow_schema.json" + ) + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) + + ch_multiqc_files = ch_multiqc_files + .mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ) + + ch_multiqc_custom_methods_description = multiqc_methods_description ? 
+ file(multiqc_methods_description, checkIfExists: true) : + file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + ch_methods_description = channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description) + ) + + ch_multiqc_files = ch_multiqc_files + .mix( ch_collated_versions ) + .mix( + ch_methods_description.collectFile( + name: 'methods_description_mqc.yaml', + sort: true + ) + ) + + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList(), + [], + [] + ) + + emit: + report = MULTIQC.out.report +} diff --git a/subworkflows/local/stability_scoring/main.nf b/subworkflows/local/stability_scoring/main.nf new file mode 100644 index 00000000..efdb0c8b --- /dev/null +++ b/subworkflows/local/stability_scoring/main.nf @@ -0,0 +1,74 @@ +include { GET_CANDIDATE_GENES } from '../../../modules/local/get_candidate_genes' +include { NORMFINDER } from '../../../modules/local/normfinder' +include { COMPUTE_STABILITY_SCORES } from '../../../modules/local/compute_stability_scores' + +include { GENORM } from '../genorm' +/* +======================================================================================== + COMPUTE STABILITY SCORES +======================================================================================== +*/ + +workflow STABILITY_SCORING { + + take: + ch_counts + ch_design + ch_stats + candidate_selection_descriptor + nb_top_gene_candidates + min_expr_threshold + run_genorm + stability_score_weights + + main: + + // ----------------------------------------------------------------- + // GETTING CANDIDATE GENES + // ----------------------------------------------------------------- + + GET_CANDIDATE_GENES( + ch_counts.collect(), // single item + ch_stats.collect(), // single item + candidate_selection_descriptor, + nb_top_gene_candidates, + min_expr_threshold + ) + ch_candidate_gene_counts = GET_CANDIDATE_GENES.out.counts + + // ----------------------------------------------------------------- + // NORMFINDER + // ----------------------------------------------------------------- + + NORMFINDER ( + ch_candidate_gene_counts.collect(), // single item + ch_design.collect() // single item + ) + + // ----------------------------------------------------------------- + // GENORM + // ----------------------------------------------------------------- + + if ( run_genorm ) { + GENORM ( ch_candidate_gene_counts ) + ch_genorm_stability = GENORM.out.m_measures + } else { + ch_genorm_stability = channel.value([]) + } + + // ----------------------------------------------------------------- + // AGGREGATION AND FINAL STABILITY SCORE + // ----------------------------------------------------------------- + + COMPUTE_STABILITY_SCORES ( + ch_stats.collect(), // single item + stability_score_weights, + NORMFINDER.out.stability_values, + ch_genorm_stability + ) + + + emit: + summary_statistics = COMPUTE_STABILITY_SCORES.out.stats_with_stability_scores + +} diff --git a/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf b/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf index 9e726b40..9e238a2e 100644 --- a/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf @@ -2,8 +2,6 @@ // Subworkflow with functionality specific to the nf-core/stableexpression pipeline // -import org.yaml.snakeyaml.Yaml - /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT FUNCTIONS / MODULES / SUBWORKFLOWS @@ -12,17 +10,17 @@ import org.yaml.snakeyaml.Yaml include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' -include { workflowVersionToYAML } from '../../nf-core/utils_nfcore_pipeline' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW TO INITIALISE PIPELINE -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow PIPELINE_INITIALISATION { @@ -32,6 +30,11 @@ workflow PIPELINE_INITIALISATION { validate_params // boolean: Boolean whether to validate parameters against the schema at runtime monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args + outdir // string: The output directory where the results will be saved + input // string: Path to input samplesheet + help // boolean: Display help message and exit + help_full // boolean: Show the full help message + show_hidden // boolean: Show hidden parameters in the help message main: @@ -41,17 +44,42 @@ workflow PIPELINE_INITIALISATION { UTILS_NEXTFLOW_PIPELINE ( version, true, - params.outdir, + outdir, workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1 ) // // Validate parameters and generate parameter summary to stdout // + before_text = """ +-\033[2m----------------------------------------------------\033[0m- + \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m +\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m +\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m +\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m + \033[0;32m`._,._,\'\033[0m +\033[0;35m nf-core/stableexpression ${workflow.manifest.version}\033[0m +-\033[2m----------------------------------------------------\033[0m- +""" + after_text = """${workflow.manifest.doi ? "\n* The pipeline\n" : ""}${workflow.manifest.doi.tokenize(",").collect { doi -> " https://doi.org/${doi.trim().replace('https://doi.org/','')}"}.join("\n")}${workflow.manifest.doi ? 
"\n" : ""} +* The nf-core framework + https://doi.org/10.1038/s41587-020-0439-x + +* Software dependencies + https://github.com/nf-core/stableexpression/blob/main/CITATIONS.md +""" + command = "nextflow run ${workflow.manifest.name} -profile --species --outdir " + UTILS_NFSCHEMA_PLUGIN ( workflow, validate_params, - null + null, + help, + help_full, + show_hidden, + before_text, + after_text, + command ) // @@ -73,7 +101,7 @@ workflow PIPELINE_INITIALISATION { ch_input_datasets = parseInputDatasets( params.datasets ) validateInputSamplesheet( ch_input_datasets ) } else { - ch_input_datasets = Channel.empty() + ch_input_datasets = channel.empty() } emit: @@ -82,9 +110,9 @@ workflow PIPELINE_INITIALISATION { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW FOR PIPELINE COMPLETION -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow PIPELINE_COMPLETION { @@ -129,16 +157,40 @@ workflow PIPELINE_COMPLETION { } } - /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // // Check and validate pipeline parameters // + +def check_accession(accession) { + if ( !( accession.startsWith('E-') || accession.startsWith('GSE') ) ) { + error('Accession ' + accession + ' is not well formated. All accessions should start with "E-" or "GSE".') + } +} + + +def check_accession_string(accessions_str) { + if ( accessions_str != null && accessions_str != "" ) { + accessions_str.tokenize(',').each { accession -> + check_accession(accession) + } + } +} + +def check_accession_file(accession_file) { + if ( accession_file != null ) { + def lines = new File(accession_file).readLines() + lines.each { accession -> + check_accession(accession) + } + } +} + def validateInputParameters(params) { // checking that a species has been provided @@ -146,23 +198,15 @@ def validateInputParameters(params) { error('You must provide a species name') } - // checking that the user has provided at least one dataset and / or expression atlas arguments - if ( - !params.datasets - && !params.eatlas_accessions - && !params.fetch_eatlas_accessions - && !params.eatlas_keywords - ) { - error('You must provide at least either --datasets or --fetch_eatlas_accessions or --eatlas_accessions or --eatlas_keywords') - } + // if accessions are provided or excluded, checking that they are well formated + check_accession_string( params.accessions ) + check_accession_string( params.excluded_accessions ) - // if expression atlas accessions are provided, checking that they are well formated - if ( params.eatlas_accessions ) { - for ( accession in params.eatlas_accessions.tokenize(',') ) { - if ( !accession.startsWith('E-') ) { - error('Expression Atlas accession ' + accession + ' is not well formated. 
All accessions should start with "E-".') - } - } + check_accession_file( params.accessions_file ) + check_accession_file( params.excluded_accessions_file ) + + if ( params.keywords && params.skip_fetch_eatlas_accessions && !params.fetch_geo_accessions ) { + log.warn "Ignoring keywords as accessions will not be fetched from Expression Atlas or GEO" } } @@ -171,11 +215,11 @@ // Parses files from input dataset and creates two subchannels raw and normalized // with elements like [meta, count_file, normalised] def parseInputDatasets(samplesheet) { - return Channel.fromList( samplesheetToList(samplesheet, "assets/schema_datasets.json") ) + return channel.fromList( samplesheetToList(samplesheet, "assets/schema_datasets.json") ) .map { item -> def (meta, count_file) = item - new_meta = meta + [dataset: count_file.getBaseName()] + def new_meta = meta + [dataset: count_file.getBaseName()] [new_meta, count_file] } } @@ -184,27 +228,39 @@ // // Validate channels from input samplesheet // -def validateInputSamplesheet(input) { +def validateInputSamplesheet( ch_datasets ) { // checking that all microarray datasets (if any) are normalised - input.filter { - meta, file -> - meta.platform == 'microarray' && !meta.normalised - } - .count() - .map { count -> - if (count > 0) { - def error_text = [ - "Error: You provided at least one microarray dataset that is not normalised. ", - "Microarray datasets must already be normalised before being submitted. ", - "Please perform normalisation (typically using RMA for one-colour intensities / LOESS (limma) for two-colour intensities) and run again." - ].join(' ').trim() - error(error_text) + ch_datasets + .filter { + meta, file -> + meta.platform == 'microarray' && !meta.normalised + } + .count() + .map { count -> + if (count > 0) { + def error_text = [ + "Error: You provided at least one microarray dataset that is not normalised. ", + "Microarray datasets must already be normalised before being submitted. ", + "Please perform normalisation (typically using RMA for one-colour intensities / LOESS (limma) for two-colour intensities) and run again." + ].join(' ').trim() + error(error_text) + } } - } -} + // checking that all count files are well formatted (same number of columns in header and rows) + ch_datasets + .map { meta, file -> + def header = file.withReader { reader -> reader.readLine() } + def separator = header.contains(',') ? "," : + header.contains('\t') ? "\t" : + " " + def first_row = file.splitCsv( header: false, skip: 1, limit: 1, sep: separator ) + + assert header.split(separator).size() == first_row[0].size() : "Header and first row do not have the same number of columns in file ${file}" + } +} // -// Get channel of software versions used in pipeline in YAML format +// Generate methods description for MultiQC // def toolCitationText() { // TODO nf-core: Optionally add in-text citation tools to this list.
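// [editor's note: illustrative sketch only, not part of the diff] the count-file format check in + // validateInputSamplesheet above boils down to the following, assuming a hypothetical 3-column file: + // def header = "gene_id,sample1,sample2" + // def separator = header.contains(',') ? "," : header.contains('\t') ? "\t" : " " + // assert header.split(separator).size() == 3 // must equal the field count of the first data row + // i.e. a file whose first data row has more or fewer fields than its header fails fast with a clear message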
@@ -267,37 +323,104 @@ return description_html.toString() } + +/* +======================================================================================== + FUNCTIONS FOR FORMATTING DATA FETCHED FROM EXPRESSION ATLAS / GEO +======================================================================================== +*/ + // -// Get software versions for pipeline -// temporary replacements of the native processVersionsFromYAML +// Get Expression Atlas Batch ID (accession + data_type) from file stem // -def customProcessVersionsFromYAML(yaml_file) { - Yaml yaml = new Yaml() - versions = yaml.load(yaml_file) - return yaml.dumpAsMap(versions).trim() +def addDatasetIdToMetadata( ch_files ) { + return ch_files + .map { + file -> + def meta = [ dataset: file.getSimpleName() ] + [meta, file] + } } // -// Get channel of software versions used in pipeline in YAML format -// temporary replacements of the native softwareVersionsToYAML +// Groups design and data files by accession and data_type +// Design and count files have necessarily the same dataset ID (same file stem) // -def customSoftwareVersionsToYAML(versions) { - return Channel.of(workflowVersionToYAML()) - .concat( - versions - .unique() - .map { - name, tool, version -> [ name.tokenize(':').last(), [ tool, version ] ] - } - .groupTuple() - .map { - processName, toolInfo -> - def toolVersions = toolInfo.collect { tool, version -> " ${tool}: ${version}" }.join('\n') - "${processName}:\n${toolVersions}\n" - } - .map { customProcessVersionsFromYAML(it) } - ) +def groupFilesByDatasetId(ch_design, ch_counts) { + return ch_design + .concat( ch_counts ) // puts counts at the end of the resulting channel + .groupTuple() // groups by dataset ID; design files are necessarily BEFORE count files + .filter { + it.get(1).size() == 2 // only groups with two files + } + .filter { // only groups with first file as design file and second one as count file + meta, files -> + files.get(0).name.endsWith('.design.csv') && !files.get(1).name.endsWith('.design.csv') + } + .map { // putting design file in meta + meta, files -> + def new_meta = meta + [design: files[0]] + [new_meta, files[1]] + } } +def getNthPartFromEnd(String s, int n) { + def tokens = s.tokenize('.') + return tokens[tokens.size() - n] +} +// +// Add normalised: true / false in meta +// +def augmentMetadata( ch_files ) { + return ch_files + .map { + meta, file -> + def norm_state = getNthPartFromEnd(file.name, 3) + def normalised = false + if ( norm_state == 'normalised' ) { + normalised = true + } else if ( norm_state == 'raw' ) { + normalised = false + } else { + error("Invalid normalisation state: ${norm_state}") + } + + def platform = getNthPartFromEnd(file.name, 4) + def new_meta = meta + [normalised: normalised, platform: platform] + [new_meta, file] + } +} + +/* +======================================================================================== + FUNCTIONS FOR CHECKING NB OF DATASETS +======================================================================================== +*/ + +def checkCounts(ch_counts) { + + ch_counts.count().map { n -> + if( n == 0 ) { + // raise an error if no datasets are found + def msg_lst = [] + if ( !params.fetch_geo_accessions ) { + msg_lst = [ + "Could not find any readily usable public dataset.", + "Please set the --fetch_geo_accessions flag and run again."
+ ] + } else { + msg_lst = [ + "Could not find any readily usable public dataset.", + "You can check directly on NCBI GEO if there are datasets for this species that you can prepare yourself:", + "https://www.ncbi.nlm.nih.gov/gds", + "Once you have prepared your own data, you can relaunch the pipeline and provide your prepared count datasets using the --datasets parameter.", + "For more information, see the online documentation at https://nf-co.re/stableexpression." + ] + } + def msg = msg_lst.join("\n").trim() + error(msg) + } + } +} diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f8476112..00000000 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index 267f062a..bfd25876 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -56,21 +56,6 @@ def checkProfileProvided(nextflow_cli_args) { } } -// -// Citation string for pipeline -// -def workflowCitation() { - def temp_doi_ref = "" - def manifest_doi = workflow.manifest.doi.tokenize(",") - // Handling multiple DOIs - // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers - // Removing ` ` since the manifest.doi is a string and not a proper list - manifest_doi.each { doi_ref -> - temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" - } - return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + temp_doi_ref + "\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" -} - // // Generate workflow version string // diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/nextflow.config b/subworkflows/nf-core/utils_nfcore_pipeline/tests/nextflow.config index 9d366ee2..d0a926bf 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfcore_pipeline/tests/nextflow.config @@ -3,7 +3,7 @@ manifest { author = """nf-core""" homePage = 'https://127.0.0.1' description = """Dummy pipeline""" - nextflowVersion = '!>=25.04.00' + nextflowVersion = '!>=23.04.0' version = '9.9.9' doi = 'https://doi.org/10.5281/zenodo.5070524' } diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c9..00000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 4994303e..acb39724 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -4,6 +4,7 @@ include { paramsSummaryLog } from 'plugin/nf-schema' include { validateParameters } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' workflow UTILS_NFSCHEMA_PLUGIN { @@ -15,32 +16,58 @@ // when this input is empty
it will automatically use the configured schema or // "${projectDir}/nextflow_schema.json" as default. This input should not be empty // for meta pipelines + help // boolean: show help message + help_full // boolean: show full help message + show_hidden // boolean: show hidden parameters in help message + before_text // string: text to show before the help message and parameters summary + after_text // string: text to show after the help message and parameters summary + command // string: an example command of the pipeline main: + if(help || help_full) { + help_options = [ + beforeText: before_text, + afterText: after_text, + command: command, + showHidden: show_hidden, + fullHelp: help_full, + ] + if(parameters_schema) { + help_options << [parametersSchema: parameters_schema] + } + log.info paramsHelp( + help_options, + params.help instanceof String ? params.help : "", + ) + exit 0 + } + // // Print parameter summary to stdout. This will display the parameters // that differ from the default given in the JSON schema // + + summary_options = [:] if(parameters_schema) { - log.info paramsSummaryLog(input_workflow, parameters_schema:parameters_schema) - } else { - log.info paramsSummaryLog(input_workflow) + summary_options << [parametersSchema: parameters_schema] } + log.info before_text + log.info paramsSummaryLog(summary_options, input_workflow) + log.info after_text // // Validate the parameters using nextflow_schema.json or the schema // given via the validation.parametersSchema configuration option // if(validate_params) { + validateOptions = [:] if(parameters_schema) { - validateParameters(parameters_schema:parameters_schema) - } else { - validateParameters() + validateOptions << [parametersSchema: parameters_schema] } + validateParameters(validateOptions) } emit: dummy_emit = true } - diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index 8fb30164..c977917a 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -25,6 +25,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -51,6 +57,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -77,6 +89,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -103,6 +121,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -114,4 +138,36 @@ nextflow_workflow { ) } } + + test("Should create a help message") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = true + input[4] = false + input[5] = false + 
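// [editor's note] inputs 3 to 8 below map onto the new take parameters of UTILS_NFSCHEMA_PLUGIN: + // help, help_full, show_hidden, before_text, after_text and command (see the take block in main.nf above)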
input[6] = "Before" + input[7] = "After" + input[8] = "nextflow run test/test" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index 0907ac58..8d8c7371 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,8 +1,8 @@ plugins { - id "nf-schema@2.1.0" + id "nf-schema@2.5.1" } validation { parametersSchema = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" monochromeLogs = true -} \ No newline at end of file +} diff --git a/test/nb_samples.ipynb b/test/nb_samples.ipynb new file mode 100644 index 00000000..cea98f7a --- /dev/null +++ b/test/nb_samples.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "id": "2054a417", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "37fbbdec", + "metadata": {}, + "outputs": [], + "source": [ + "file = \"/home/olivier/repositories/nf-core-stableexpression/test/selected_accession_to_nb_samples.csv\"\n", + "df = pd.read_csv(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2f72632c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
accessionnb_samples
0E-MTAB-3578122
1E-GEOD-5256417
2E-GEOD-4990611
3E-MTAB-1144212
4E-MTAB-258212
.........
1278E-TABM-7558
1279E-TABM-77816
1280E-TABM-86523
1281E-TABM-8930
1282E-TABM-90356
\n", + "

1283 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " accession nb_samples\n", + "0 E-MTAB-3578 122\n", + "1 E-GEOD-52564 17\n", + "2 E-GEOD-49906 11\n", + "3 E-MTAB-11442 12\n", + "4 E-MTAB-2582 12\n", + "... ... ...\n", + "1278 E-TABM-755 8\n", + "1279 E-TABM-778 16\n", + "1280 E-TABM-865 23\n", + "1281 E-TABM-89 30\n", + "1282 E-TABM-903 56\n", + "\n", + "[1283 rows x 2 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "84325c97", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nb_samples
count1283.000000
mean20.771629
std79.120320
min4.000000
25%7.000000
50%12.000000
75%18.000000
max2660.000000
\n", + "
" + ], + "text/plain": [ + " nb_samples\n", + "count 1283.000000\n", + "mean 20.771629\n", + "std 79.120320\n", + "min 4.000000\n", + "25% 7.000000\n", + "50% 12.000000\n", + "75% 18.000000\n", + "max 2660.000000" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e47bc3b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(26650)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"nb_samples\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bcb812c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 00000000..4d022cb7 --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,6 @@ +.DS_Store +pipeline_info/*.{html,json,txt,yml} +multiqc/** +**.parquet +**.metadata.tsv +**.py diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 00000000..5213b474 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,330 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + + test("-profile test") { + + tag "test" + + when { + params { + species = 'beta vulgaris' + keywords = "leaf" + datasets = "https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_beta_vulgaris.csv" + fetch_geo_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_dataset_only") { + tag "test_dataset_only" + + when { + params { + species = 'mus musculus' + datasets = "https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_big.yaml" + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_accessions_only") { + tag "test_accessions_only" + + when { + params { + species = 'beta vulgaris' + accessions_only = true + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on 
multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + + test("-profile test_download_only") { + tag "test_download_only" + + when { + params { + species = 'beta vulgaris' + download_only = true + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + + test("-profile test_one_accession_low_gene_count") { + tag "test_one_accession_low_gene_count" + + when { + params { + species = 'arabidopsis thaliana' + accessions = "E-GEOD-51720" + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_eatlas_only_with_keywords") { + tag "test_eatlas_only_with_keywords" + + when { + params { + species = 'beta vulgaris' + keywords = "leaf" + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_skip_id_mapping") { + tag "test_skip_id_mapping" + + when { + params { + species = 'solanum tuberosum' + datasets = "${projectDir}/tests/test_data/input_datasets/input.csv" + skip_id_mapping = true + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + + test("-profile test_dataset_custom_mapping_and_gene_length") { + tag "test_dataset_custom_mapping_and_gene_length" + + when { + params { + species = 'solanum tuberosum' + datasets = "${projectDir}/tests/test_data/input_datasets/input.csv" + skip_id_mapping = true + 
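// [editor's note] skip_id_mapping only disables the gProfiler lookup; since gene_id_mapping is + // set below (and assuming the main workflow passes it through as custom_gene_id_mapping), + // FILTER_AND_RENAME_GENES still runs: the idmapping subworkflow guards it with + // "if ( !skip_id_mapping || custom_gene_id_mapping )"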
skip_fetch_eatlas_accessions = true + gene_id_mapping = "${projectDir}/tests/test_data/input_datasets/mapping.csv" + gene_metadata = "${projectDir}/tests/test_data/input_datasets/metadata.csv" + gene_length = "${projectDir}/tests/test_data/input_datasets/gene_lengths.csv" + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + /* + test("-profile test_no_dataset_found") { + tag "test_no_dataset_found" + + when { + params { + species = 'marmota_marmota_marmota' + outdir = "$outputDir" + } + } + + then { + assert !workflow.success + } + } + */ + + test("-profile test_included_and_excluded_accessions") { + tag "test_included_and_excluded_accessions" + + when { + params { + species = "solanum tuberosum" + accessions = "E-MTAB-552,E-GEOD-61690" + excluded_accessions = "E-MTAB-4251" + accessions_file = "${projectDir}/tests/test_data/misc/accessions_to_include.txt" + excluded_accessions_file = "${projectDir}/tests/test_data/misc/excluded_accessions.txt" + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + + test("-profile test_gprofiler_target_database_entrez") { + + when { + params { + species = 'beta vulgaris' + gprofiler_target_database = 'ENTREZGENE' + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_bigger_with_genorm") { + tag "test_bigger_with_genorm" + + when { + params { + species = 'arabidopsis_lyrata' + run_genorm = true + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + 
+                    removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"),
+                    // All stable path names, with relative paths
+                    stable_name,
+                    // All files with stable contents
+                    stable_path
+                ).match() }
+            )
+        }
+    }
+}
diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap
new file mode 100644
index 00000000..12c6970b
--- /dev/null
+++ b/tests/default.nf.test.snap
@@ -0,0 +1,1908 @@
+{
+    "-profile test_dataset_only": {
+        "content": [
+            {
+                "AGGREGATE_RESULTS": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "CLEAN_GENE_IDS": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "COLLECT_GENE_IDS": {
+                    "python": "3.14.2",
+                    "tqdm": "4.67.1"
+                },
+                "COLLECT_STATISTICS": {
+                    "pandas": "2.3.3",
+                    "python": "3.13.7"
+                },
+                "COMPUTE_DATASET_STATISTICS": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "COMPUTE_GENE_TRANSCRIPT_LENGTHS": {
+                    "pandas": "2.3.3",
+                    "python": "3.13.7"
+                },
+                "COMPUTE_GLOBAL_STATISTICS": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "COMPUTE_PLATFORM_STATISTICS": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "COMPUTE_STABILITY_SCORES": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "COMPUTE_TPM": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "DASH_APP": {
+                    "python": "3.13.8",
+                    "dash": "3.2.0",
+                    "dash-extensions": "2.0.4",
+                    "dash-mantine-components": "2.3.0",
+                    "dash-ag-grid": "32.3.2",
+                    "polars": "1.35.0",
+                    "pandas": "2.3.3",
+                    "pyarrow": "22.0.0",
+                    "scipy": "1.16.3"
+                },
+                "DETECT_RARE_GENES": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "DOWNLOAD_ENSEMBL_ANNOTATION": {
+                    "bs4": "4.14.2",
+                    "pandas": "2.3.3",
+                    "python": "3.14.0",
+                    "requests": "2.32.5",
+                    "tqdm": "4.67.1"
+                },
+                "FILTER_AND_RENAME_GENES": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "GET_CANDIDATE_GENES": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "GPROFILER_IDMAPPING": {
+                    "pandas": "2.3.1",
+                    "python": "3.13.5",
+                    "requests": "2.32.4"
+                },
+                "MERGE_ALL_COUNTS": {
+                    "polars": "1.34.0",
+                    "python": "3.14.0",
+                    "tqdm": "4.67.1"
+                },
+                "MERGE_PLATFORM_COUNTS": {
+                    "polars": "1.34.0",
+                    "python": "3.14.0",
+                    "tqdm": "4.67.1"
+                },
+                "NORMFINDER": {
+                    "polars": "1.33.1",
+                    "python": "3.13.7"
+                },
+                "QUANTILE_NORMALISATION": {
+                    "polars": "1.36.1",
+                    "python": "3.14.2",
+                    "scikit-learn": "1.8.0"
+                },
+                "REMOVE_SAMPLES_NOT_VALID": {
+                    "polars": "1.17.1",
+                    "python": "3.12.8"
+                },
+                "Workflow": {
+                    "nf-core/stableexpression": "v1.0dev"
+                }
+            },
+            [
+                "aggregated",
+                "aggregated/all_counts_filtered.parquet",
+                "aggregated/all_genes_summary.csv",
+                "aggregated/most_stable_genes_summary.csv",
+                "aggregated/most_stable_genes_transposed_counts_filtered.csv",
+                "dash_app",
+                "dash_app/app.py",
+                "dash_app/assets",
+                "dash_app/assets/style.css",
+                "dash_app/data",
+                "dash_app/data/all_counts.parquet",
+                "dash_app/data/all_genes_summary.csv",
+                "dash_app/data/whole_design.csv",
+                "dash_app/environment.yml",
+                "dash_app/src",
+                "dash_app/src/callbacks",
+                "dash_app/src/callbacks/common.py",
+                "dash_app/src/callbacks/genes.py",
+                "dash_app/src/callbacks/samples.py",
+                "dash_app/src/components",
+                "dash_app/src/components/graphs.py",
+                "dash_app/src/components/icons.py",
+                "dash_app/src/components/right_sidebar.py",
+                "dash_app/src/components/settings",
+                "dash_app/src/components/settings/genes.py",
+                "dash_app/src/components/settings/samples.py",
+                "dash_app/src/components/stores.py",
+                "dash_app/src/components/tables.py",
+                "dash_app/src/components/tooltips.py",
+                "dash_app/src/components/top.py",
+                "dash_app/src/utils",
+ "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "idmapping", + "idmapping/collected_gene_ids", + "idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_expression_distributions_most_stable_genes.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ranked_most_stable_genes_summary.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_renaming_warning_reasons.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/expression_distributions_most_stable_genes.pdf", + "multiqc/multiqc_plots/pdf/gene_statistics.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ranked_most_stable_genes_summary.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/renaming_warning_reasons.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/expression_distributions_most_stable_genes.png", + "multiqc/multiqc_plots/png/gene_statistics.png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ranked_most_stable_genes_summary.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/renaming_warning_reasons.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/expression_distributions_most_stable_genes.svg", + "multiqc/multiqc_plots/svg/gene_statistics.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ranked_most_stable_genes_summary.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/renaming_warning_reasons.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay", + "normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay/quantile_normalised", + 
"normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay/quantile_normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay/tpm", + "normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay/tpm/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings", + "warnings/renaming_warning_reasons.tsv" + ], + [ + "all_genes_summary.csv:md5,09c1c8807fbb69ae7baf5c4a7772c8d2", + "most_stable_genes_summary.csv:md5,09c1c8807fbb69ae7baf5c4a7772c8d2", + "most_stable_genes_transposed_counts_filtered.csv:md5,68fa221be589ee1a5970ae1461a743ac", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,09c1c8807fbb69ae7baf5c4a7772c8d2", + "whole_design.csv:md5,f29515bc2c783e593fb9028127342593", + "environment.yml:md5,f9b192ef98a67f2084ad2fed6da01bc1", + "gene_id_occurrences.csv:md5,72c8f7ffe8413be06419c10ae66c35e5", + "unique_gene_ids.txt:md5,6b2ece983fd9da133e719914216852b0", + "global_gene_id_mapping.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "global_gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "mapped_gene_ids.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "whole_gene_id_mapping.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "whole_gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "whole_design.csv:md5,f29515bc2c783e593fb9028127342593", + "id_mapping_stats.csv:md5,b47d6ebd34e3fb11a40665b0a38db3da", + "ratio_zeros.csv:md5,2272ebcf58ac8bb283d238f87d508b96", + "skewness.csv:md5,2ef6f5a2aa5834110fda06e705adcbf8", + "renaming_warning_reasons.tsv:md5,0a11a59b5b547a39ab7a0e4dac622173" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:10:38.553975658" + }, + "-profile test_eatlas_only_with_keywords": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "CLEAN_GENE_IDS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COLLECT_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_DATASET_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GLOBAL_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_PLATFORM_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_TPM": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DASH_APP": { + "python": "3.13.8", + "dash": "3.2.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.3.0", + "dash-ag-grid": "32.3.2", + "polars": "1.35.0", + "pandas": "2.3.3", + "pyarrow": "22.0.0", + "scipy": "1.16.3" + }, + "DETECT_RARE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.2", + "pandas": "2.3.3", + "python": "3.14.0", + "requests": "2.32.5", + "tqdm": "4.67.1" + }, + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.30.0", + "R": "4.3.3 (2024-02-29)", + "nltk": "3.9.1", + "pandas": "2.3.0", + "python": "3.13.5", + "pyyaml": 
"6.0.2", + "requests": "2.32.4" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GPROFILER_IDMAPPING": { + "pandas": "2.3.1", + "python": "3.13.5", + "requests": "2.32.4" + }, + "MERGE_ALL_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "MERGE_PLATFORM_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "NORMFINDER": { + "polars": "1.33.1", + "python": "3.13.7" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.36.1", + "python": "3.14.2", + "scikit-learn": "1.8.0" + }, + "REMOVE_SAMPLES_NOT_VALID": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0dev" + } + }, + [ + "aggregated", + "aggregated/all_counts_filtered.parquet", + "aggregated/all_genes_summary.csv", + "aggregated/most_stable_genes_summary.csv", + "aggregated/most_stable_genes_transposed_counts_filtered.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "idmapping", + "idmapping/collected_gene_ids", + "idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_expression_distributions_most_stable_genes.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ranked_most_stable_genes_summary.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", 
+ "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/expression_distributions_most_stable_genes.pdf", + "multiqc/multiqc_plots/pdf/gene_statistics.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ranked_most_stable_genes_summary.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/png/expression_distributions_most_stable_genes.png", + "multiqc/multiqc_plots/png/gene_statistics.png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ranked_most_stable_genes_summary.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/expression_distributions_most_stable_genes.svg", + "multiqc/multiqc_plots/svg/gene_statistics.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ranked_most_stable_genes_summary.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/E_MTAB_8187_rnaseq", + "normalised/E_MTAB_8187_rnaseq/quantile_normalised", + "normalised/E_MTAB_8187_rnaseq/quantile_normalised/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_8187_rnaseq/tpm", + "normalised/E_MTAB_8187_rnaseq/tpm/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,829fd6221705fc1588a11bfdb1a37210", + "most_stable_genes_summary.csv:md5,73cbb4d6462e01ed7778f076726613fd", + "most_stable_genes_transposed_counts_filtered.csv:md5,4f50fafaa96ffd7a7b0d98d9c8d6beb4", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,829fd6221705fc1588a11bfdb1a37210", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "environment.yml:md5,f9b192ef98a67f2084ad2fed6da01bc1", + "gene_id_occurrences.csv:md5,b3ee7b1c575f83d247c5bce88382fb2b", + 
"unique_gene_ids.txt:md5,ba79f5609df755c0f75de41357319c84", + "global_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "id_mapping_stats.csv:md5,17ccaa8e70c67c7d0de4ec3c630c2e5b", + "ratio_zeros.csv:md5,32889cf6de2af6413c42b8810a99a2df", + "skewness.csv:md5,178449bbd2361aa1e804e3f18e092ef1" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:14:45.841673917" + }, + "-profile test_included_and_excluded_accessions": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "CLEAN_GENE_IDS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COLLECT_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_DATASET_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GLOBAL_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_PLATFORM_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_TPM": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DASH_APP": { + "python": "3.13.8", + "dash": "3.2.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.3.0", + "dash-ag-grid": "32.3.2", + "polars": "1.35.0", + "pandas": "2.3.3", + "pyarrow": "22.0.0", + "scipy": "1.16.3" + }, + "DETECT_RARE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.2", + "pandas": "2.3.3", + "python": "3.14.0", + "requests": "2.32.5", + "tqdm": "4.67.1" + }, + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.30.0", + "R": "4.3.3 (2024-02-29)", + "nltk": "3.9.1", + "pandas": "2.3.0", + "python": "3.13.5", + "pyyaml": "6.0.2", + "requests": "2.32.4" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GPROFILER_IDMAPPING": { + "pandas": "2.3.1", + "python": "3.13.5", + "requests": "2.32.4" + }, + "MERGE_ALL_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "MERGE_PLATFORM_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "NORMFINDER": { + "polars": "1.33.1", + "python": "3.13.7" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.36.1", + "python": "3.14.2", + "scikit-learn": "1.8.0" + }, + "REMOVE_SAMPLES_NOT_VALID": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0dev" + } + }, + [ + "aggregated", + "aggregated/all_counts_filtered.parquet", + "aggregated/all_genes_summary.csv", + "aggregated/most_stable_genes_summary.csv", + "aggregated/most_stable_genes_transposed_counts_filtered.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + 
"dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "errors/eatlas_failure_reasons.csv", + "errors/renaming_failure_reasons.tsv", + "idmapping", + "idmapping/collected_gene_ids", + "idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_failure_reasons.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_expression_distributions_most_stable_genes.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ranked_most_stable_genes_summary.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_renaming_failure_reasons.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/eatlas_failure_reasons.pdf", + "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/expression_distributions_most_stable_genes.pdf", + "multiqc/multiqc_plots/pdf/gene_statistics.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ranked_most_stable_genes_summary.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/renaming_failure_reasons.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_failure_reasons.png", + 
"multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/png/expression_distributions_most_stable_genes.png", + "multiqc/multiqc_plots/png/gene_statistics.png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ranked_most_stable_genes_summary.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/renaming_failure_reasons.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_failure_reasons.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/expression_distributions_most_stable_genes.svg", + "multiqc/multiqc_plots/svg/gene_statistics.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ranked_most_stable_genes_summary.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/renaming_failure_reasons.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/E_GEOD_61690_rnaseq", + "normalised/E_GEOD_61690_rnaseq/quantile_normalised", + "normalised/E_GEOD_61690_rnaseq/quantile_normalised/E_GEOD_61690_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_GEOD_61690_rnaseq/tpm", + "normalised/E_GEOD_61690_rnaseq/tpm/E_GEOD_61690_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "normalised/E_GEOD_77826_rnaseq", + "normalised/E_GEOD_77826_rnaseq/quantile_normalised", + "normalised/E_GEOD_77826_rnaseq/quantile_normalised/E_GEOD_77826_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_GEOD_77826_rnaseq/tpm", + "normalised/E_GEOD_77826_rnaseq/tpm/E_GEOD_77826_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "normalised/E_MTAB_5038_rnaseq", + "normalised/E_MTAB_5038_rnaseq/quantile_normalised", + "normalised/E_MTAB_5038_rnaseq/quantile_normalised/E_MTAB_5038_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_5038_rnaseq/tpm", + "normalised/E_MTAB_5038_rnaseq/tpm/E_MTAB_5038_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "normalised/E_MTAB_5215_rnaseq", + "normalised/E_MTAB_5215_rnaseq/quantile_normalised", + "normalised/E_MTAB_5215_rnaseq/quantile_normalised/E_MTAB_5215_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_5215_rnaseq/tpm", + "normalised/E_MTAB_5215_rnaseq/tpm/E_MTAB_5215_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "normalised/E_MTAB_552_rnaseq", + "normalised/E_MTAB_552_rnaseq/quantile_normalised", + "normalised/E_MTAB_552_rnaseq/quantile_normalised/E_MTAB_552_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_552_rnaseq/tpm", + "normalised/E_MTAB_552_rnaseq/tpm/E_MTAB_552_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "normalised/E_MTAB_7711_rnaseq", + "normalised/E_MTAB_7711_rnaseq/quantile_normalised", + 
"normalised/E_MTAB_7711_rnaseq/quantile_normalised/E_MTAB_7711_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_7711_rnaseq/tpm", + "normalised/E_MTAB_7711_rnaseq/tpm/E_MTAB_7711_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_GEOD_61690_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_GEOD_61690_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/E_GEOD_77826_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_GEOD_77826_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/E_MTAB_4252_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_4252_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/E_MTAB_5038_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_5038_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/E_MTAB_5215_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_5215_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/E_MTAB_552_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_552_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/E_MTAB_7711_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_7711_rnaseq.rnaseq.raw.counts.csv", + "public_data/expression_atlas/datasets/failure_reason.txt", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,b8661916743026427c8df986722c38ea", + "most_stable_genes_summary.csv:md5,e281330972fdc9cce06be4e079af8913", + "most_stable_genes_transposed_counts_filtered.csv:md5,be29a5a60aa6785bcfe06d93cef0cdf8", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,b8661916743026427c8df986722c38ea", + "whole_design.csv:md5,cc24405dce8d22b93b9999a2287113ef", + "environment.yml:md5,f9b192ef98a67f2084ad2fed6da01bc1", + "eatlas_failure_reasons.csv:md5,2a8cd0ed795e82647d19c484a79acde6", + "renaming_failure_reasons.tsv:md5,af783264770a861c263480141fdd8bf6", + "gene_id_occurrences.csv:md5,5f07f7504156cf5dd6db26f230eb73a2", + "unique_gene_ids.txt:md5,e9681582a09fe58f5258977db1d9da3f", + "global_gene_id_mapping.csv:md5,a86823539deb80c0aa44378d3078969d", + "global_gene_metadata.csv:md5,e33e0ed63a3dec26bc95fe422f02844c", + "gene_metadata.csv:md5,e33e0ed63a3dec26bc95fe422f02844c", + "mapped_gene_ids.csv:md5,a86823539deb80c0aa44378d3078969d", + "whole_gene_id_mapping.csv:md5,a86823539deb80c0aa44378d3078969d", + "whole_gene_metadata.csv:md5,e33e0ed63a3dec26bc95fe422f02844c", + "whole_design.csv:md5,cc24405dce8d22b93b9999a2287113ef", + "accessions.txt:md5,e38a0aaf5191ba5f94cb7a96b8d30aa7", + "E_GEOD_61690_rnaseq.design.csv:md5,ba807caf74e0b55f5c3fe23810a89560", + "E_GEOD_61690_rnaseq.rnaseq.raw.counts.csv:md5,2e3f1a125b3d41d622e2d24447620eb3", + "E_GEOD_77826_rnaseq.design.csv:md5,5aa61df754aa9c6c107b247c642d2e53", + 
"E_GEOD_77826_rnaseq.rnaseq.raw.counts.csv:md5,85cea79c602a9924d5a4d6b597ef5530", + "E_MTAB_4252_rnaseq.design.csv:md5,5aef2d1f8b78b3e60225855a6aafe6ad", + "E_MTAB_4252_rnaseq.rnaseq.raw.counts.csv:md5,80d4fdb7f02fc7875827b61e104da56e", + "E_MTAB_5038_rnaseq.design.csv:md5,352ed3163d7deef2be35d899418d5ad4", + "E_MTAB_5038_rnaseq.rnaseq.raw.counts.csv:md5,b4acb3d7c39cdb2bd6cef6c9314c5b2a", + "E_MTAB_5215_rnaseq.design.csv:md5,2741dcd5b45bacce865db632f626a273", + "E_MTAB_5215_rnaseq.rnaseq.raw.counts.csv:md5,273704bdf762c342271b33958a84d1e7", + "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c", + "E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f", + "E_MTAB_7711_rnaseq.design.csv:md5,3e7748b54a0c25c008d9bd2ddbf1bf00", + "E_MTAB_7711_rnaseq.rnaseq.raw.counts.csv:md5,3c02cf432c29d3751c978439539df388", + "failure_reason.txt:md5,bf97c58555bcb575f0e36df513e1e4c4", + "id_mapping_stats.csv:md5,ca5e05936cbc8a1e8ea7c942752c883a", + "ratio_zeros.csv:md5,b02e3ef082c377aa69dafb8dc894f754", + "skewness.csv:md5,b7bf553b6c85f5d09612fe2630c06aa9" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:18:59.960782988" + }, + "-profile test_skip_id_mapping": { + "content": [ + [ + "errors", + "idmapping", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/microarray.normalised", + "normalised/microarray.normalised/quantile_normalised", + "normalised/microarray.normalised/quantile_normalised/microarray.normalised.filtered.quant_norm.parquet", + "normalised/rnaseq.raw", + "normalised/rnaseq.raw/quantile_normalised", + "normalised/rnaseq.raw/quantile_normalised/rnaseq.raw.filtered.tpm.quant_norm.parquet", + "normalised/rnaseq.raw/tpm", + "normalised/rnaseq.raw/tpm/rnaseq.raw.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings" + ], + [ + "whole_design.csv:md5,70d6c2673e619ca52d2774fb3e368382", + "ratio_zeros.csv:md5,3d3fca62a7d1067f0ae0980c8ff570b9", + "skewness.csv:md5,3380ed4915bdbb1be1e5354c6cd99e8c" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:15:27.393976699" + }, + "-profile test": { + "content": [ + [ + "aggregated", + "aggregated/all_counts_filtered.parquet", + "aggregated/all_genes_summary.csv", + "aggregated/most_stable_genes_summary.csv", + "aggregated/most_stable_genes_transposed_counts_filtered.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + 
"dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "geo", + "idmapping", + "idmapping/collected_gene_ids", + "idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_expression_distributions_most_stable_genes.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_geo_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_geo_rejected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_geo_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_geo_warning_reasons.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ranked_most_stable_genes_summary.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/expression_distributions_most_stable_genes.pdf", + "multiqc/multiqc_plots/pdf/gene_statistics.pdf", + "multiqc/multiqc_plots/pdf/geo_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/geo_rejected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/geo_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/geo_warning_reasons.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ranked_most_stable_genes_summary.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + 
"multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/png/expression_distributions_most_stable_genes.png", + "multiqc/multiqc_plots/png/gene_statistics.png", + "multiqc/multiqc_plots/png/geo_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/geo_rejected_experiments_metadata.png", + "multiqc/multiqc_plots/png/geo_selected_experiments_metadata.png", + "multiqc/multiqc_plots/png/geo_warning_reasons.png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ranked_most_stable_genes_summary.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/expression_distributions_most_stable_genes.svg", + "multiqc/multiqc_plots/svg/gene_statistics.svg", + "multiqc/multiqc_plots/svg/geo_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/geo_rejected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/geo_selected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/geo_warning_reasons.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ranked_most_stable_genes_summary.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/E_MTAB_8187_rnaseq", + "normalised/E_MTAB_8187_rnaseq/quantile_normalised", + "normalised/E_MTAB_8187_rnaseq/quantile_normalised/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_8187_rnaseq/tpm", + "normalised/E_MTAB_8187_rnaseq/tpm/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "normalised/beta_vulgaris.rnaseq.raw.counts", + "normalised/beta_vulgaris.rnaseq.raw.counts/quantile_normalised", + "normalised/beta_vulgaris.rnaseq.raw.counts/quantile_normalised/beta_vulgaris.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/beta_vulgaris.rnaseq.raw.counts/tpm", + "normalised/beta_vulgaris.rnaseq.raw.counts/tpm/beta_vulgaris.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "public_data/geo", + "public_data/geo/accessions", + "public_data/geo/accessions/accessions.txt", + "public_data/geo/accessions/geo_all_datasets.metadata.tsv", + 
"public_data/geo/accessions/geo_rejected_datasets.metadata.tsv", + "public_data/geo/accessions/geo_selected_datasets.metadata.tsv", + "public_data/geo/datasets", + "public_data/geo/datasets/warning_reason.txt", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings", + "warnings/geo_warning_reasons.csv" + ], + [ + "all_genes_summary.csv:md5,bea9d7cee88136a4a64812558d3b3119", + "most_stable_genes_summary.csv:md5,9d5b15d996ab2d9064174bfdfeb8d256", + "most_stable_genes_transposed_counts_filtered.csv:md5,70412588c290cac3cd85b189cdee2db6", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,bea9d7cee88136a4a64812558d3b3119", + "whole_design.csv:md5,3c1e14c9bd7ad250326b070a0dd4d81f", + "environment.yml:md5,f9b192ef98a67f2084ad2fed6da01bc1", + "gene_id_occurrences.csv:md5,fab6c66d7793c245f67b0cd6d5053cdd", + "unique_gene_ids.txt:md5,ba79f5609df755c0f75de41357319c84", + "global_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "whole_design.csv:md5,3c1e14c9bd7ad250326b070a0dd4d81f", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "accessions.txt:md5,63a651d9df354aef24400cebe56dd5ec", + "warning_reason.txt:md5,603e30b732b7a6b501b59adf9d0e8837", + "id_mapping_stats.csv:md5,dc2d9d7f34e570411c8cf5885b447719", + "ratio_zeros.csv:md5,a6ca9ce8ab585102df150ae182af68b6", + "skewness.csv:md5,3fecd4b1fa61e11361d6d871763c8e48", + "geo_warning_reasons.csv:md5,0a77f9268abb1084fde8cb4c5cc96eca" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:09:19.670861344" + }, + "-profile test_dataset_custom_mapping_and_gene_length": { + "content": [ + { + "Workflow": { + "nf-core/stableexpression": "v1.0dev" + } + }, + [ + "errors", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "warnings" + ], + [ + "global_gene_id_mapping.csv:md5,187a86074197044846bb8565e122eb8e", + "global_gene_metadata.csv:md5,5ae2d701ca0cb6384d9e1e08a345e452", + "whole_gene_id_mapping.csv:md5,187a86074197044846bb8565e122eb8e", + "whole_gene_metadata.csv:md5,5ae2d701ca0cb6384d9e1e08a345e452" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:24:45.471369388" + }, + "-profile test_accessions_only": { + "content": [ + { + "EXPRESSION_ATLAS": { + "nltk": "3.9.1", + "pandas": "2.3.0", + "python": "3.13.5", + 
"pyyaml": "6.0.2", + "requests": "2.32.4" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0dev" + } + }, + [ + "errors", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "statistics", + "warnings" + ], + [ + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:11:02.671079693" + }, + "-profile test_one_accession_low_gene_count": { + "content": [ + { + "CLEAN_GENE_IDS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COLLECT_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_DATASET_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GLOBAL_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_PLATFORM_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_TPM": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DETECT_RARE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.2", + "pandas": "2.3.3", + "python": "3.14.0", + "requests": "2.32.5", + "tqdm": "4.67.1" + }, + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.30.0", + "R": "4.3.3 (2024-02-29)" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GPROFILER_IDMAPPING": { + "pandas": "2.3.1", + "python": "3.13.5", + "requests": "2.32.4" + }, + "MERGE_ALL_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "MERGE_PLATFORM_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "NORMFINDER": { + "polars": "1.33.1", + "python": "3.13.7" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.36.1", + "python": "3.14.2", + "scikit-learn": "1.8.0" + }, + "REMOVE_SAMPLES_NOT_VALID": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "Workflow": { + 
"nf-core/stableexpression": "v1.0dev" + } + }, + [ + "errors", + "idmapping", + "idmapping/collected_gene_ids", + "idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/E_GEOD_51720_rnaseq", + "normalised/E_GEOD_51720_rnaseq/quantile_normalised", + "normalised/E_GEOD_51720_rnaseq/quantile_normalised/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_GEOD_51720_rnaseq/tpm", + "normalised/E_GEOD_51720_rnaseq/tpm/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_GEOD_51720_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_GEOD_51720_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings" + ], + [ + "gene_id_occurrences.csv:md5,279e38e052661017d28451c348f39f21", + "unique_gene_ids.txt:md5,9c463e6c1754d4f4c7ea684578aa6849", + "global_gene_id_mapping.csv:md5,42491ef436cce231258c0358e1af5745", + "global_gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + "gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + "mapped_gene_ids.csv:md5,42491ef436cce231258c0358e1af5745", + "whole_gene_id_mapping.csv:md5,42491ef436cce231258c0358e1af5745", + "whole_gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + 
"whole_design.csv:md5,d3aa542c4ad07d0051a84482fe6cd81c", + "E_GEOD_51720_rnaseq.design.csv:md5,80805afb29837b6fbb73a6aa6f3a461b", + "E_GEOD_51720_rnaseq.rnaseq.raw.counts.csv:md5,07cd448196fc2fea4663bd9705da2b98", + "id_mapping_stats.csv:md5,cd17a5d4afa6b86a48adb03868d3073f", + "ratio_zeros.csv:md5,57f747774c59abc441a353544b7c11be", + "skewness.csv:md5,14ee7163a228d70c26097fa0f9a793ec" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:12:38.322522517" + }, + "-profile test_bigger_with_genorm": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "CLEAN_GENE_IDS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COLLECT_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_DATASET_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GLOBAL_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_M_MEASURE": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_PLATFORM_STATISTICS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "COMPUTE_TPM": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "CROSS_JOIN": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DASH_APP": { + "python": "3.13.8", + "dash": "3.2.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.3.0", + "dash-ag-grid": "32.3.2", + "polars": "1.35.0", + "pandas": "2.3.3", + "pyarrow": "22.0.0", + "scipy": "1.16.3" + }, + "DETECT_RARE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.2", + "pandas": "2.3.3", + "python": "3.14.0", + "requests": "2.32.5", + "tqdm": "4.67.1" + }, + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.30.0", + "R": "4.3.3 (2024-02-29)", + "nltk": "3.9.1", + "pandas": "2.3.0", + "python": "3.13.5", + "pyyaml": "6.0.2", + "requests": "2.32.4" + }, + "EXPRESSION_RATIO": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "GPROFILER_IDMAPPING": { + "pandas": "2.3.1", + "python": "3.13.5", + "requests": "2.32.4" + }, + "MAKE_CHUNKS": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "MERGE_ALL_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "MERGE_PLATFORM_COUNTS": { + "polars": "1.34.0", + "python": "3.14.0", + "tqdm": "4.67.1" + }, + "NORMFINDER": { + "polars": "1.33.1", + "python": "3.13.7" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.36.1", + "python": "3.14.2", + "scikit-learn": "1.8.0" + }, + "RATIO_STANDARD_VARIATION": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "REMOVE_SAMPLES_NOT_VALID": { + "polars": "1.17.1", + "python": "3.12.8" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0dev" + } + }, + [ + "aggregated", + "aggregated/all_counts_filtered.parquet", + "aggregated/all_genes_summary.csv", + "aggregated/most_stable_genes_summary.csv", + "aggregated/most_stable_genes_transposed_counts_filtered.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + 
"dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "idmapping", + "idmapping/collected_gene_ids", + "idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_expression_distributions_most_stable_genes.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ranked_most_stable_genes_summary.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/expression_distributions_most_stable_genes.pdf", + "multiqc/multiqc_plots/pdf/gene_statistics.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ranked_most_stable_genes_summary.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/png/expression_distributions_most_stable_genes.png", + "multiqc/multiqc_plots/png/gene_statistics.png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ranked_most_stable_genes_summary.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + 
"multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/expression_distributions_most_stable_genes.svg", + "multiqc/multiqc_plots/svg/gene_statistics.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ranked_most_stable_genes_summary.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/E_MTAB_5072_rnaseq", + "normalised/E_MTAB_5072_rnaseq/quantile_normalised", + "normalised/E_MTAB_5072_rnaseq/quantile_normalised/E_MTAB_5072_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_5072_rnaseq/tpm", + "normalised/E_MTAB_5072_rnaseq/tpm/E_MTAB_5072_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_5072_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_5072_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,a0d04bcad1ead65f551d466b23a5a6fb", + "most_stable_genes_summary.csv:md5,de1cfd50d23435117f6643c86f922c47", + "most_stable_genes_transposed_counts_filtered.csv:md5,5ebf832e94abb4e1bac0b9fad41c6dbb", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,a0d04bcad1ead65f551d466b23a5a6fb", + "whole_design.csv:md5,c9bfd7bc8ca365222e03e67eb24b9a76", + "environment.yml:md5,f9b192ef98a67f2084ad2fed6da01bc1", + "gene_id_occurrences.csv:md5,631317db987951a12f15e1c3e76068cd", + "unique_gene_ids.txt:md5,13ae1b52833134f8ed6d982c00487927", + "global_gene_id_mapping.csv:md5,efc95a8e276be1eb0af9639f72e48145", + "global_gene_metadata.csv:md5,1d342577587bc48c1eff077a594929fa", + "gene_metadata.csv:md5,1d342577587bc48c1eff077a594929fa", + "mapped_gene_ids.csv:md5,efc95a8e276be1eb0af9639f72e48145", + "whole_gene_id_mapping.csv:md5,efc95a8e276be1eb0af9639f72e48145", + "whole_gene_metadata.csv:md5,1d342577587bc48c1eff077a594929fa", + "whole_design.csv:md5,c9bfd7bc8ca365222e03e67eb24b9a76", + "accessions.txt:md5,561a967c16b2ef6c29fb643cd4002947", + "E_MTAB_5072_rnaseq.design.csv:md5,a1f33d126dde387a2d542381c44bc1f3", + "E_MTAB_5072_rnaseq.rnaseq.raw.counts.csv:md5,c41c84899a380515d759b99eeccfe43e", + "id_mapping_stats.csv:md5,70d0c1eacf06cd1312caaefb2f614811", + "ratio_zeros.csv:md5,b541c01e58e21cc28707534147aff80d", + "skewness.csv:md5,74dd30b70c807032d6bacdb669908dc1" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:25:48.350670705" + }, + "-profile test_download_only": { + "content": [ + { + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.30.0", + "R": "4.3.3 (2024-02-29)", + "nltk": "3.9.1", + "pandas": "2.3.0", + "python": "3.13.5", + 
"pyyaml": "6.0.2", + "requests": "2.32.4" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0dev" + } + }, + [ + "errors", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "warnings" + ], + [ + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:11:30.403331158" + }, + "-profile test_gprofiler_target_database_entrez": { + "content": [ + [ + "aggregated", + "aggregated/all_counts_filtered.parquet", + "aggregated/all_genes_summary.csv", + "aggregated/most_stable_genes_summary.csv", + "aggregated/most_stable_genes_transposed_counts_filtered.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "idmapping", + "idmapping/collected_gene_ids", + 
"idmapping/collected_gene_ids/gene_id_occurrences.csv", + "idmapping/collected_gene_ids/unique_gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/whole_gene_id_mapping.csv", + "idmapping/whole_gene_metadata.csv", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_expression_distributions_most_stable_genes.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_ranked_most_stable_genes_summary.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/eatlas_all_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/eatlas_selected_experiments_metadata.pdf", + "multiqc/multiqc_plots/pdf/expression_distributions_most_stable_genes.pdf", + "multiqc/multiqc_plots/pdf/gene_statistics.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-cnt.pdf", + "multiqc/multiqc_plots/pdf/id_mapping_stats-pct.pdf", + "multiqc/multiqc_plots/pdf/ranked_most_stable_genes_summary.pdf", + "multiqc/multiqc_plots/pdf/ratio_zeros.pdf", + "multiqc/multiqc_plots/pdf/skewness.pdf", + "multiqc/multiqc_plots/pdf/total_gene_id_occurrence_quantiles.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/eatlas_all_experiments_metadata.png", + "multiqc/multiqc_plots/png/eatlas_selected_experiments_metadata.png", + "multiqc/multiqc_plots/png/expression_distributions_most_stable_genes.png", + "multiqc/multiqc_plots/png/gene_statistics.png", + "multiqc/multiqc_plots/png/id_mapping_stats-cnt.png", + "multiqc/multiqc_plots/png/id_mapping_stats-pct.png", + "multiqc/multiqc_plots/png/ranked_most_stable_genes_summary.png", + "multiqc/multiqc_plots/png/ratio_zeros.png", + "multiqc/multiqc_plots/png/skewness.png", + "multiqc/multiqc_plots/png/total_gene_id_occurrence_quantiles.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/eatlas_all_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/eatlas_selected_experiments_metadata.svg", + "multiqc/multiqc_plots/svg/expression_distributions_most_stable_genes.svg", + "multiqc/multiqc_plots/svg/gene_statistics.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-cnt.svg", + "multiqc/multiqc_plots/svg/id_mapping_stats-pct.svg", + "multiqc/multiqc_plots/svg/ranked_most_stable_genes_summary.svg", + "multiqc/multiqc_plots/svg/ratio_zeros.svg", + "multiqc/multiqc_plots/svg/skewness.svg", + "multiqc/multiqc_plots/svg/total_gene_id_occurrence_quantiles.svg", + "multiqc/multiqc_report.html", + "multiqc/versions.yml", + "normalised", + "normalised/E_MTAB_8187_rnaseq", + "normalised/E_MTAB_8187_rnaseq/quantile_normalised", + 
"normalised/E_MTAB_8187_rnaseq/quantile_normalised/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.quant_norm.parquet", + "normalised/E_MTAB_8187_rnaseq/tpm", + "normalised/E_MTAB_8187_rnaseq/tpm/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,829fd6221705fc1588a11bfdb1a37210", + "most_stable_genes_summary.csv:md5,73cbb4d6462e01ed7778f076726613fd", + "most_stable_genes_transposed_counts_filtered.csv:md5,4f50fafaa96ffd7a7b0d98d9c8d6beb4", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,829fd6221705fc1588a11bfdb1a37210", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "environment.yml:md5,f9b192ef98a67f2084ad2fed6da01bc1", + "gene_id_occurrences.csv:md5,b3ee7b1c575f83d247c5bce88382fb2b", + "unique_gene_ids.txt:md5,ba79f5609df755c0f75de41357319c84", + "global_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "id_mapping_stats.csv:md5,17ccaa8e70c67c7d0de4ec3c630c2e5b", + "ratio_zeros.csv:md5,32889cf6de2af6413c42b8810a99a2df", + "skewness.csv:md5,178449bbd2361aa1e804e3f18e092ef1" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-13T13:20:59.702133972" + } +} \ No newline at end of file diff --git a/tests/modules/local/aggregate_results/main.nf.test b/tests/modules/local/aggregate_results/main.nf.test new file mode 100644 index 00000000..20016d9b --- /dev/null +++ b/tests/modules/local/aggregate_results/main.nf.test @@ -0,0 +1,57 @@ +nextflow_process { + + name "Test Process AGGREGATE_RESULTS" + script "modules/local/aggregate_results/main.nf" + process "AGGREGATE_RESULTS" + tag "aggregate_results" + + test("Without microarray") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file( '$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = [ file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true) ] + input[3] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) 
+ input[4] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With microarray") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file( '$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = [ + file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[4] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/aggregate_results/main.nf.test.snap b/tests/modules/local/aggregate_results/main.nf.test.snap new file mode 100644 index 00000000..554a48e6 --- /dev/null +++ b/tests/modules/local/aggregate_results/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "Without microarray": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "all_counts_filtered": [ + + ], + "all_genes_summary": [ + + ], + "most_stable_genes_summary": [ + + ], + "most_stable_genes_transposed_counts_filtered": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:51:31.019751859" + }, + "With microarray": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "all_counts_filtered": [ + + ], + "all_genes_summary": [ + + ], + "most_stable_genes_summary": [ + + ], + "most_stable_genes_transposed_counts_filtered": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:51:42.230203404" + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_base_statistics/main.nf.test b/tests/modules/local/compute_base_statistics/main.nf.test new file mode 100644 index 00000000..5a2ed2c3 --- /dev/null +++ b/tests/modules/local/compute_base_statistics/main.nf.test @@ -0,0 +1,52 @@ +nextflow_process { + + name "Test Process COMPUTE_BASE_STATISTICS" + script "modules/local/compute_base_statistics/main.nf" + process "COMPUTE_BASE_STATISTICS" + tag "base_stats" + + test("No platform") { + + when { + process { + """ + input[0] = [ + [ platform: 'all' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("RNAseq platform") { + + when { + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_base_statistics/main.nf.test.snap b/tests/modules/local/compute_base_statistics/main.nf.test.snap new file mode 100644 
index 00000000..d57408ca --- /dev/null +++ b/tests/modules/local/compute_base_statistics/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "No platform": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "stats": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:51:55.581816062" + }, + "RNAseq platform": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "stats": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:52:06.262096663" + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_dataset_statistics/main.nf.test b/tests/modules/local/compute_dataset_statistics/main.nf.test new file mode 100644 index 00000000..2504fba8 --- /dev/null +++ b/tests/modules/local/compute_dataset_statistics/main.nf.test @@ -0,0 +1,30 @@ +nextflow_process { + + name "Test Process COMPUTE_DATASET_STATISTICS" + script "modules/local/compute_dataset_statistics/main.nf" + process "COMPUTE_DATASET_STATISTICS" + tag "COMPUTE_DATASET_STATISTICS" + + test("Should not fail") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_dataset_statistics/main.nf.test.snap b/tests/modules/local/compute_dataset_statistics/main.nf.test.snap new file mode 100644 index 00000000..20ceb9b7 --- /dev/null +++ b/tests/modules/local/compute_dataset_statistics/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "Should not fail": { + "content": [ + { + "0": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "skewness.txt:md5,3f2c6b786ec7344d8d21444cfd3714c5" + ] + ], + "1": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "ratio_zeros.txt:md5,2fc93fe393cf18725bbc6d575e3d2d89" + ] + ], + "2": [ + [ + "COMPUTE_DATASET_STATISTICS", + "python", + "3.12.8" + ] + ], + "3": [ + [ + "COMPUTE_DATASET_STATISTICS", + "polars", + "1.17.1" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:37:06.331244835" + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_stability_scores/main.nf.test b/tests/modules/local/compute_stability_scores/main.nf.test new file mode 100644 index 00000000..c0f5b44c --- /dev/null +++ b/tests/modules/local/compute_stability_scores/main.nf.test @@ -0,0 +1,52 @@ +nextflow_process { + + name "Test Process COMPUTE_STABILITY_SCORES" + script "modules/local/compute_stability_scores/main.nf" + process "COMPUTE_STABILITY_SCORES" + tag "stability_scores" + + test("Without Genorm") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/compute_stability_scores/input/stats_all_genes.csv', checkIfExists: true) + input[1] = "0.8,0.1,0.1,0" + input[2] = file( '$projectDir/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv', checkIfExists: true) + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With Genorm") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/compute_stability_scores/input/stats_all_genes.csv', checkIfExists: true) + input[1] = "0.8,0.1,0.1,0.1" + input[2] = file( 
'$projectDir/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv', checkIfExists: true) + input[3] = file( '$projectDir/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_stability_scores/main.nf.test.snap b/tests/modules/local/compute_stability_scores/main.nf.test.snap new file mode 100644 index 00000000..53455505 --- /dev/null +++ b/tests/modules/local/compute_stability_scores/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "With Genorm": { + "content": [ + { + "0": [ + "stats_with_scores.csv:md5,d1e74b628b6dd02635e07dec414fded0" + ], + "1": [ + [ + "COMPUTE_STABILITY_SCORES", + "python", + "3.12.8" + ] + ], + "2": [ + [ + "COMPUTE_STABILITY_SCORES", + "polars", + "1.17.1" + ] + ], + "stats_with_stability_scores": [ + "stats_with_scores.csv:md5,d1e74b628b6dd02635e07dec414fded0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:52:21.178734391" + }, + "Without Genorm": { + "content": [ + { + "0": [ + "stats_with_scores.csv:md5,bf305bce944291ce723ef41b68bac0fc" + ], + "1": [ + [ + "COMPUTE_STABILITY_SCORES", + "python", + "3.12.8" + ] + ], + "2": [ + [ + "COMPUTE_STABILITY_SCORES", + "polars", + "1.17.1" + ] + ], + "stats_with_stability_scores": [ + "stats_with_scores.csv:md5,bf305bce944291ce723ef41b68bac0fc" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:52:14.104078273" + } +} \ No newline at end of file diff --git a/tests/modules/local/dataset_statistics/main.nf.test b/tests/modules/local/dataset_statistics/main.nf.test deleted file mode 100644 index 21b9adb6..00000000 --- a/tests/modules/local/dataset_statistics/main.nf.test +++ /dev/null @@ -1,30 +0,0 @@ -nextflow_process { - - name "Test Process DATASET_STATISTICS" - script "modules/local/dataset_statistics/main.nf" - process "DATASET_STATISTICS" - tag "dataset_statistics" - tag "module" - - test("Should run without failures") { - - when { - process { - """ - meta = [dataset: 'test'] - count_file = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) - input[0] = [meta, count_file] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/tests/modules/local/dataset_statistics/main.nf.test.snap b/tests/modules/local/dataset_statistics/main.nf.test.snap deleted file mode 100644 index 1bbb014b..00000000 --- a/tests/modules/local/dataset_statistics/main.nf.test.snap +++ /dev/null @@ -1,57 +0,0 @@ -{ - "Should run without failures": { - "content": [ - { - "0": [ - [ - { - "dataset": "test" - }, - "count.raw.cpm.dataset_stats.csv:md5,93290e92d7ac9c0abdd2d9e76a7aff6b" - ] - ], - "1": [ - [ - "DATASET_STATISTICS", - "python", - "3.12.8" - ] - ], - "2": [ - [ - "DATASET_STATISTICS", - "pandas", - "2.2.3" - ] - ], - "3": [ - [ - "DATASET_STATISTICS", - "scipy", - "1.15.0" - ] - ], - "4": [ - [ - "DATASET_STATISTICS", - "pyarrow", - "19.0.0" - ] - ], - "stats": [ - [ - { - "dataset": "test" - }, - "count.raw.cpm.dataset_stats.csv:md5,93290e92d7ac9c0abdd2d9e76a7aff6b" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.4" - }, - "timestamp": "2025-02-08T19:39:30.385303512" - } -} \ No newline at end of file diff --git 
a/tests/modules/local/deseq2/normalise/main.nf.test b/tests/modules/local/deseq2/normalise/main.nf.test deleted file mode 100644 index 76418c22..00000000 --- a/tests/modules/local/deseq2/normalise/main.nf.test +++ /dev/null @@ -1,75 +0,0 @@ -nextflow_process { - - name "Test Process DESEQ2_NORMALISE" - script "modules/local/deseq2/normalise/main.nf" - process "DESEQ2_NORMALISE" - tag "deseq2" - tag "normalise" - tag "module" - - test("Very small dataset") { - - when { - - process { - """ - meta = [accession: "accession", design: file('$projectDir/tests/test_data/normalise/base/design.csv')] - input[0] = [meta, file('$projectDir/tests/test_data/normalise/base/counts.csv')] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.cpm).match() } - ) - } - - } - - test("Rows with many zeros") { - - tag "deseq2_many_zeros" - - when { - - process { - """ - meta = [accession: "accession", design: file('$projectDir/tests/test_data/normalise/many_zeros/design.csv')] - input[0] = [meta, file('$projectDir/tests/test_data/normalise/many_zeros/counts.csv')] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.cpm).match() } - ) - } - - } - - test("One group") { - - when { - - process { - """ - meta = [accession: "accession", design: file('$projectDir/tests/test_data/normalise/one_group/design.csv')] - input[0] = [meta, file('$projectDir/tests/test_data/normalise/one_group/counts.csv')] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.cpm).match() } - ) - } - - } - -} diff --git a/tests/modules/local/deseq2/normalise/main.nf.test.snap b/tests/modules/local/deseq2/normalise/main.nf.test.snap deleted file mode 100644 index 8f6af170..00000000 --- a/tests/modules/local/deseq2/normalise/main.nf.test.snap +++ /dev/null @@ -1,56 +0,0 @@ -{ - "Very small dataset": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,a83dd6a15463b51d94f0a42c196d7933" - }, - "counts.cpm.csv:md5,df001c189c61c11dfab04d1bb47f7511" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-29T14:11:50.643097087" - }, - "One group": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,28cf54802df5df4a9dc406003623c6a7" - }, - "counts.cpm.csv:md5,568be72a338ad95037007bea5d552f86" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-29T14:20:12.392982447" - }, - "Rows with many zeros": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,3cdf98e2e202b4af2687eaefd9bdd8e9" - }, - "counts.cpm.csv:md5,f4741deb2b94a7898fe4e5bcf2135a63" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-29T14:20:00.462942337" - } -} \ No newline at end of file diff --git a/tests/modules/local/edger/normalise/main.nf.test b/tests/modules/local/edger/normalise/main.nf.test deleted file mode 100644 index 31dc23c7..00000000 --- a/tests/modules/local/edger/normalise/main.nf.test +++ /dev/null @@ -1,75 +0,0 @@ -nextflow_process { - - name "Test Process EDGER_NORMALISE" - script "modules/local/edger/normalise/main.nf" - process "EDGER_NORMALISE" - tag "edger" - tag "normalisation" - tag "module" - - test("Very small dataset") { - - when { - - process { - """ - meta = [accession: "accession", design: file('$projectDir/tests/test_data/normalise/base/design.csv')] - input[0] = [meta, 
file('$projectDir/tests/test_data/normalise/base/counts.csv')] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.cpm).match() } - ) - } - - } - - test("Rows with many zeros") { - - tag "rows_many_zeros" - - when { - - process { - """ - meta = [accession: "accession", design: file('$projectDir/tests/test_data/normalise/many_zeros/design.csv')] - input[0] = [meta, file('$projectDir/tests/test_data/normalise/many_zeros/counts.csv')] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.cpm).match() } - ) - } - - } - - test("One group") { - - when { - - process { - """ - meta = [accession: "accession", design: file('$projectDir/tests/test_data/normalise/one_group/design.csv')] - input[0] = [meta, file('$projectDir/tests/test_data/normalise/one_group/counts.csv')] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.cpm).match() } - ) - } - - } - -} diff --git a/tests/modules/local/edger/normalise/main.nf.test.snap b/tests/modules/local/edger/normalise/main.nf.test.snap deleted file mode 100644 index e4bc5c2e..00000000 --- a/tests/modules/local/edger/normalise/main.nf.test.snap +++ /dev/null @@ -1,56 +0,0 @@ -{ - "Very small dataset": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,a83dd6a15463b51d94f0a42c196d7933" - }, - "counts.cpm.csv:md5,537bffb095e79d9667c955accd81e3a2" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-29T14:20:44.011465207" - }, - "One group": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,28cf54802df5df4a9dc406003623c6a7" - }, - "counts.cpm.csv:md5,680e59fd500ddd83ead508a63f3f9b48" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-29T14:20:58.681438521" - }, - "Rows with many zeros": { - "content": [ - [ - [ - { - "accession": "accession", - "design": "design.csv:md5,3cdf98e2e202b4af2687eaefd9bdd8e9" - }, - "counts.cpm.csv:md5,f4624bdcb195b95f6e8bd9ee08470d3d" - ] - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-29T14:20:51.214114937" - } -} \ No newline at end of file diff --git a/tests/modules/local/expressionatlas/getaccessions/main.nf.test b/tests/modules/local/expressionatlas/getaccessions/main.nf.test index be6c941d..a5e4f860 100644 --- a/tests/modules/local/expressionatlas/getaccessions/main.nf.test +++ b/tests/modules/local/expressionatlas/getaccessions/main.nf.test @@ -3,35 +3,72 @@ nextflow_process { name "Test Process EXPRESSIONATLAS_GETACCESSIONS" script "modules/local/expressionatlas/getaccessions/main.nf" process "EXPRESSIONATLAS_GETACCESSIONS" - tag "getaccessions" - tag "module" + tag "eatlas_getaccessions" - test("Solanum tuberosum one keyword") { + test("Beta vulgaris one keyword - no platform") { - tag "potato_two_kw" + when { + + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = [] + input[3] = 100 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris no keyword - rnaseq platform") { when { process { """ - input[0] = "solanum_tuberosum" - input[1] = "potato" + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = "rnaseq" + input[3] = 100 + input[4] = 42 """ } } then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) + assert process.success } } - test('Solanum tuberosum two 
keywords') { + test("Beta vulgaris - no experiments left after random sampling") { - tag "potato_two_kw" + when { + + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = [] + input[3] = 1 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test('Solanum tuberosum two keywords - microarray') { when { @@ -39,6 +76,9 @@ nextflow_process { """ input[0] = "solanum_tuberosum" input[1] = "potato,phloem" + input[2] = "microarray" + input[3] = 10000 + input[4] = 42 """ } } @@ -54,14 +94,15 @@ nextflow_process { test('Solanum tuberosum no keyword') { - tag "potato_no_kw" - when { process { """ input[0] = "solanum_tuberosum" input[1] = "" + input[2] = "microarray" + input[3] = 100 + input[4] = 42 """ } } diff --git a/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap b/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap index 1bcaf1d0..54c55c29 100644 --- a/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap +++ b/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap @@ -1,116 +1,130 @@ { - "Solanum tuberosum no keyword": { + "Solanum tuberosum two keywords - microarray": { "content": [ { "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], "1": [ + "ok" + ], + "2": [ + + ], + "3": [ + "species_experiments.metadata.tsv:md5,68b329da9893e34099c7d8ad5cb9c940" + ], + "4": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "python", - "3.12.8" + "3.13.5" ] ], - "2": [ + "5": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "requests", - "2.32.3" + "2.32.4" ] ], - "3": [ + "6": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "nltk", "3.9.1" ] ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-01T14:25:08.425684541" - }, - "Solanum tuberosum one keyword": { - "content": [ - { - "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" - ], - "1": [ + "7": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "python", - "3.12.8" + "pyyaml", + "6.0.2" ] ], - "2": [ + "8": [ [ "EXPRESSIONATLAS_GETACCESSIONS", - "requests", - "2.32.3" + "pandas", + "2.3.0" ] ], - "3": [ - [ - "EXPRESSIONATLAS_GETACCESSIONS", - "nltk", - "3.9.1" - ] + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" + "sampling_quota": [ + "ok" ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-01-01T14:23:43.573176854" + "timestamp": "2025-12-11T10:59:26.647904855" }, - "Solanum tuberosum two keywords": { + "Solanum tuberosum no keyword": { "content": [ { "0": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], "1": [ + "ok" + ], + "2": [ + + ], + "3": [ + "species_experiments.metadata.tsv:md5,68b329da9893e34099c7d8ad5cb9c940" + ], + "4": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "python", - "3.12.8" + "3.13.5" ] ], - "2": [ + "5": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "requests", - "2.32.3" + "2.32.4" ] ], - "3": [ + "6": [ [ "EXPRESSIONATLAS_GETACCESSIONS", "nltk", "3.9.1" ] ], - "txt": [ - "accessions.txt:md5,dad63ac7a1715277fa44567bc40b5872" + "7": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "pyyaml", + "6.0.2" + ] + ], + "8": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "pandas", + "2.3.0" + ] + ], + "accessions": [ + 
"accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "sampling_quota": [ + "ok" ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-01-01T14:24:29.42124266" + "timestamp": "2025-12-11T10:59:38.871741855" } } \ No newline at end of file diff --git a/tests/modules/local/expressionatlas/getdata/main.nf.test b/tests/modules/local/expressionatlas/getdata/main.nf.test index c55adf41..4558f426 100644 --- a/tests/modules/local/expressionatlas/getdata/main.nf.test +++ b/tests/modules/local/expressionatlas/getdata/main.nf.test @@ -3,8 +3,7 @@ nextflow_process { name "Test Process EXPRESSIONATLAS_GETDATA" script "modules/local/expressionatlas/getdata/main.nf" process "EXPRESSIONATLAS_GETDATA" - tag "getdata" - tag "module" + tag "eatlas_getdata" test("Transcriptome Analysis of the potato (rnaseq)") { @@ -20,8 +19,10 @@ nextflow_process { } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) } } @@ -40,8 +41,10 @@ nextflow_process { } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) } } @@ -60,8 +63,10 @@ nextflow_process { } then { - assert process.success - assert snapshot(process.out).match() + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) } } @@ -79,12 +84,13 @@ nextflow_process { } } - // check for the absence of expected output (the error is ignored but no output is produced) + // must be successful without any output then { - assert process.success - assert process.trace.succeeded().size() == 0 - assert process.trace.failed().size() == 1 - assert process.out.design.size() == 0 + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) } } @@ -104,10 +110,11 @@ nextflow_process { // check for the absence of expected output (the error is ignored but no output is produced) then { - assert process.success - assert process.trace.succeeded().size() == 0 - assert process.trace.failed().size() == 1 - assert process.out.design.size() == 0 + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) } } @@ -127,10 +134,11 @@ nextflow_process { // check for the absence of expected output (the error is ignored but no output is produced) then { - assert process.success - assert process.trace.succeeded().size() == 0 - assert process.trace.failed().size() == 1 - assert process.out.design.size() == 0 + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) } } @@ -150,10 +158,35 @@ nextflow_process { // check for the absence of expected output (the error is ignored but no output is produced) then { - assert process.success - assert process.trace.succeeded().size() == 0 - assert process.trace.failed().size() == 1 - assert process.out.design.size() == 0 + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-MTAB-3578 :: serverside error 550") { + + tag "getdata_error_550" + + when { + + process { + """ + input[0] = "E-MTAB-3578" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + 
then { + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) } } diff --git a/tests/modules/local/expressionatlas/getdata/main.nf.test.snap b/tests/modules/local/expressionatlas/getdata/main.nf.test.snap index b3a5f521..f624143d 100644 --- a/tests/modules/local/expressionatlas/getdata/main.nf.test.snap +++ b/tests/modules/local/expressionatlas/getdata/main.nf.test.snap @@ -3,19 +3,25 @@ "content": [ { "0": [ - "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c" + "E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" ], "1": [ - "E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" + "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c" ], "2": [ + + ], + "3": [ + + ], + "4": [ [ "EXPRESSIONATLAS_GETDATA", "R", "4.3.3 (2024-02-29)" ] ], - "3": [ + "5": [ [ "EXPRESSIONATLAS_GETDATA", "ExpressionAtlas", @@ -32,27 +38,33 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.04.0" + "nextflow": "25.04.8" }, - "timestamp": "2025-05-12T12:17:16.305872196" + "timestamp": "2025-11-17T11:22:31.479739243" }, "Arabidopsis Geo dataset": { "content": [ { "0": [ - "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" + "E_GEOD_62537_A_AFFY_2.microarray.normalised.counts.csv:md5,673c55171d0ccfc1d036bf43c49ae320" ], "1": [ - "E_GEOD_62537_A_AFFY_2.microarray.normalised.counts.csv:md5,673c55171d0ccfc1d036bf43c49ae320" + "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" ], "2": [ + + ], + "3": [ + + ], + "4": [ [ "EXPRESSIONATLAS_GETDATA", "R", "4.3.3 (2024-02-29)" ] ], - "3": [ + "5": [ [ "EXPRESSIONATLAS_GETDATA", "ExpressionAtlas", @@ -69,27 +81,33 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.04.0" + "nextflow": "25.04.8" }, - "timestamp": "2025-05-12T12:17:49.460792842" + "timestamp": "2025-11-17T11:22:51.038290522" }, "Transcription profiling by array of Arabidopsis mutant for fis2 (microarray)": { "content": [ { "0": [ - "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1" + "E_TABM_1007_A_AFFY_2.microarray.normalised.counts.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" ], "1": [ - "E_TABM_1007_A_AFFY_2.microarray.normalised.counts.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" + "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1" ], "2": [ + + ], + "3": [ + + ], + "4": [ [ "EXPRESSIONATLAS_GETDATA", "R", "4.3.3 (2024-02-29)" ] ], - "3": [ + "5": [ [ "EXPRESSIONATLAS_GETDATA", "ExpressionAtlas", @@ -106,8 +124,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.04.0" + "nextflow": "25.04.8" }, - "timestamp": "2025-05-12T12:17:33.926612754" + "timestamp": "2025-11-17T11:22:41.604691764" } } \ No newline at end of file diff --git a/tests/modules/local/filter_and_rename_genes/main.nf.test b/tests/modules/local/filter_and_rename_genes/main.nf.test new file mode 100644 index 00000000..a48b2fba --- /dev/null +++ b/tests/modules/local/filter_and_rename_genes/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + + name "Test Process FILTER_AND_RENAME_GENES" + script "modules/local/filter_and_rename_genes/main.nf" + process "FILTER_AND_RENAME_GENES" + tag "filter_and_rename_genes" + + test("Map Ensembl IDs") { + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) + ] + ) + input[1] = 
file("$projectDir/tests/test_data/idmapping/mapped/mapped_gene_ids.csv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/mapped/valid_gene_ids.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("No valid gene") { + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) + ] + ) + input[1] = file("$projectDir/tests/test_data/idmapping/mapped/mapped_gene_ids.csv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/mapped/no_valid_gene_id.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Custom mapping - TSV") { + + tag "custom_mapping_tsv" + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv", checkIfExists: true) + ] + ) + input[1] = file( "$projectDir/tests/test_data/idmapping/tsv/mapping.tsv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/tsv/valid_gene_ids.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/filter_and_rename_genes/main.nf.test.snap b/tests/modules/local/filter_and_rename_genes/main.nf.test.snap new file mode 100644 index 00000000..0f7c284b --- /dev/null +++ b/tests/modules/local/filter_and_rename_genes/main.nf.test.snap @@ -0,0 +1,170 @@ +{ + "Custom mapping - TSV": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "failure_reason.txt:md5,0eea8256c81d0362f3f10979ab2de23e" + ] + ], + "2": [ + + ], + "3": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "0", + "0", + "3", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.12.8" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.17.1" + ] + ], + "counts": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T19:01:46.153372095" + }, + "Map Ensembl IDs": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.ensembl_ids.renamed.parquet:md5,dcdec4c4a0bdcc5802a6d4c3c24d0af6" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "2", + "1", + "1", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.12.8" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.ensembl_ids.renamed.parquet:md5,dcdec4c4a0bdcc5802a6d4c3c24d0af6" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T19:00:32.790897635" + }, + "No valid gene": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "failure_reason.txt:md5,0eea8256c81d0362f3f10979ab2de23e" + ] + ], + "2": [ + + ], + "3": [ + [ + [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ], + "0", + "0", + "3", + "0" + ] + ], + 
"4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.12.8" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.17.1" + ] + ], + "counts": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T19:00:41.34477175" + } +} \ No newline at end of file diff --git a/tests/modules/local/gene_statistics/main.nf.test b/tests/modules/local/gene_statistics/main.nf.test deleted file mode 100644 index e3fd5226..00000000 --- a/tests/modules/local/gene_statistics/main.nf.test +++ /dev/null @@ -1,38 +0,0 @@ -nextflow_process { - - name "Test Process GENE_STATISTICS" - script "modules/local/gene_statistics/main.nf" - process "GENE_STATISTICS" - tag "genestats" - tag "module" - - test("Should run without failures") { - - when { - - process { - """ - ch_counts = Channel.fromPath( '$projectDir/tests/test_data/merge_data/output/all_counts.parquet', checkIfExists: true) - ch_metadata = Channel.fromPath( '$projectDir/tests/test_data/gene_statistics/input/metadata*.csv', checkIfExists: true ).collect() - ch_mapping = Channel.fromPath( '$projectDir/tests/test_data/gene_statistics/input/mapping*.csv', checkIfExists: true ).collect() - ch_ks_stat_file = Channel.fromPath( '$projectDir/tests/test_data/gene_statistics/input/ks_stats.csv', checkIfExists: true ) - input[0] = ch_counts - input[1] = ch_metadata - input[2] = ch_mapping - input[3] = 3 - input[4] = ch_ks_stat_file - input[5] = 0 - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/tests/modules/local/gene_statistics/main.nf.test.snap b/tests/modules/local/gene_statistics/main.nf.test.snap deleted file mode 100644 index ce43fa5d..00000000 --- a/tests/modules/local/gene_statistics/main.nf.test.snap +++ /dev/null @@ -1,51 +0,0 @@ -{ - "Should run without failures": { - "content": [ - { - "0": [ - "top_stable_genes_summary.csv:md5,ad6f15d03a4465551e689a7ef79a9490" - ], - "1": [ - "stats_all_genes.csv:md5,7e62d9c35f65c30059dd415c67bf258c" - ], - "2": [ - "all_counts_filtered.parquet:md5,c1d7f14d04ab280b9ff875599127091d" - ], - "3": [ - "top_stable_genes_transposed_counts_filtered.csv:md5,6662ad1c9bbccdce113d7e574bf7e45c" - ], - "4": [ - [ - "GENE_STATISTICS", - "python", - "3.12.8" - ] - ], - "5": [ - [ - "GENE_STATISTICS", - "polars", - "1.17.1" - ] - ], - "all_counts": [ - "all_counts_filtered.parquet:md5,c1d7f14d04ab280b9ff875599127091d" - ], - "all_statistics": [ - "stats_all_genes.csv:md5,7e62d9c35f65c30059dd415c67bf258c" - ], - "top_stable_genes_summary": [ - "top_stable_genes_summary.csv:md5,ad6f15d03a4465551e689a7ef79a9490" - ], - "top_stable_genes_transposed_counts": [ - "top_stable_genes_transposed_counts_filtered.csv:md5,6662ad1c9bbccdce113d7e574bf7e45c" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "25.04.0" - }, - "timestamp": "2025-05-11T19:30:08.395767893" - } -} \ No newline at end of file diff --git a/tests/modules/local/genorm/compute_m_measure/main.nf.test b/tests/modules/local/genorm/compute_m_measure/main.nf.test new file mode 100644 index 00000000..bbff428e --- /dev/null +++ b/tests/modules/local/genorm/compute_m_measure/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process COMPUTE_M_MEASURE" + script "modules/local/genorm/compute_m_measure/main.nf" + process "COMPUTE_M_MEASURE" + tag "m_measure" + + test("Four initial chunk files") { + + when { + process { + """ + ch_count_file = channel.fromPath( 
'$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + ch_ratio_std_files = channel.fromPath( '$projectDir/tests/test_data/genorm/compute_m_measure/input/std.*.parquet', checkIfExists: true).collect() + input[0] = ch_count_file + input[1] = ch_ratio_std_files + """ + } + } + + then { + assert process.success + // assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap b/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap new file mode 100644 index 00000000..64909557 --- /dev/null +++ b/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Four initial chunk files": { + "content": [ + { + "0": [ + "m_measures.csv:md5,8d6b01ed9f1dd7767f9ac370cec263ad" + ], + "1": [ + [ + "COMPUTE_M_MEASURE", + "python", + "3.12.8" + ] + ], + "2": [ + [ + "COMPUTE_M_MEASURE", + "polars", + "1.17.1" + ] + ], + "m_measures": [ + "m_measures.csv:md5,8d6b01ed9f1dd7767f9ac370cec263ad" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-30T16:16:59.347457078" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/cross_join/main.nf.test b/tests/modules/local/genorm/cross_join/main.nf.test new file mode 100644 index 00000000..fd154604 --- /dev/null +++ b/tests/modules/local/genorm/cross_join/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process CROSS_JOIN" + script "modules/local/genorm/cross_join/main.nf" + process "CROSS_JOIN" + tag "cross_join" + + test("Should run without failures") { + + when { + process { + """ + meta = [index_1: 0, index_2: 1] + file_1 = file( '$projectDir/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet', checkIfExists: true) + file_2 = file( '$projectDir/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet', checkIfExists: true) + input[0] = [meta, file_1, file_2] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/cross_join/main.nf.test.snap b/tests/modules/local/genorm/cross_join/main.nf.test.snap new file mode 100644 index 00000000..4186e075 --- /dev/null +++ b/tests/modules/local/genorm/cross_join/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "cross_join.0.1.parquet:md5,ef1d4777f59bf8c52f05fa37d638989f" + ], + "1": [ + [ + "CROSS_JOIN", + "python", + "3.12.8" + ] + ], + "2": [ + [ + "CROSS_JOIN", + "polars", + "1.17.1" + ] + ], + "data": [ + "cross_join.0.1.parquet:md5,ef1d4777f59bf8c52f05fa37d638989f" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-29T15:29:44.996421136" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/expression_ratio/main.nf.test b/tests/modules/local/genorm/expression_ratio/main.nf.test new file mode 100644 index 00000000..5630b93a --- /dev/null +++ b/tests/modules/local/genorm/expression_ratio/main.nf.test @@ -0,0 +1,26 @@ 
+nextflow_process { + + name "Test Process EXPRESSION_RATIO" + script "modules/local/genorm/expression_ratio/main.nf" + process "EXPRESSION_RATIO" + tag "expression_ratio" + + test("Should run without failures") { + + when { + process { + """ + file = file( '$projectDir/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet', checkIfExists: true) + input[0] = file + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/expression_ratio/main.nf.test.snap b/tests/modules/local/genorm/expression_ratio/main.nf.test.snap new file mode 100644 index 00000000..1cf86e59 --- /dev/null +++ b/tests/modules/local/genorm/expression_ratio/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "data": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:55:17.991614786" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/make_chunks/main.nf.test b/tests/modules/local/genorm/make_chunks/main.nf.test new file mode 100644 index 00000000..bb6c00ba --- /dev/null +++ b/tests/modules/local/genorm/make_chunks/main.nf.test @@ -0,0 +1,26 @@ +nextflow_process { + + name "Test Process MAKE_CHUNKS" + script "modules/local/genorm/make_chunks/main.nf" + process "MAKE_CHUNKS" + tag "make_chunks" + + test("Should run without failures") { + + when { + process { + """ + ch_counts = channel.fromPath( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + input[0] = ch_counts + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/make_chunks/main.nf.test.snap b/tests/modules/local/genorm/make_chunks/main.nf.test.snap new file mode 100644 index 00000000..4b06bf77 --- /dev/null +++ b/tests/modules/local/genorm/make_chunks/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "chunks": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:55:28.842433752" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/ratio_standard_variation/main.nf.test b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test new file mode 100644 index 00000000..39b52373 --- /dev/null +++ b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test @@ -0,0 +1,26 @@ +nextflow_process { + + name "Test Process RATIO_STANDARD_VARIATION" + script "modules/local/genorm/ratio_standard_variation/main.nf" + process "RATIO_STANDARD_VARIATION" + tag "ratio_std" + + test("Should run without failures") { + + when { + process { + """ + file = file( '$projectDir/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet', checkIfExists: true) + input[0] = file + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap new file mode 100644 index 00000000..087dae8c --- /dev/null +++ b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "data": [ + + ] + } + ], + 
"meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:55:39.031631857" + } +} \ No newline at end of file diff --git a/tests/modules/local/geo/getaccessions/main.nf.test b/tests/modules/local/geo/getaccessions/main.nf.test new file mode 100644 index 00000000..deddb432 --- /dev/null +++ b/tests/modules/local/geo/getaccessions/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + + name "Test Process GEO_GETACCESSIONS" + script "modules/local/geo/getaccessions/main.nf" + process "GEO_GETACCESSIONS" + tag "geo_getaccession" + + test("Beta vulgaris - exclude two accessions") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = [] + input[3] = file( '$projectDir/tests/test_data/public_accessions/exclude_one_geo_accession.txt', checkIfExists: true ) + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris - leaf / microarray") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = "microarray" + input[3] = [] + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris - leaf / microarray") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = "microarray" + input[3] = [] + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + +} diff --git a/tests/modules/local/geo/getaccessions/main.nf.test.snap b/tests/modules/local/geo/getaccessions/main.nf.test.snap new file mode 100644 index 00000000..17bfedde --- /dev/null +++ b/tests/modules/local/geo/getaccessions/main.nf.test.snap @@ -0,0 +1,87 @@ +{ + "Beta vulgaris": { + "content": [ + { + "0": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "1": [ + [ + "filtered_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "rejected_datasets.metadata.tsv:md5,b7382bbefa84d5bb60089b057e75c09b", + "selected_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "species_datasets.metadata.tsv:md5,c36fd625541112de75d4d4ab38ec68e5" + ] + ], + "2": [ + "selected_datasets.keywords.yaml:md5,f7726c8e3b07ed20e5572d79fb7f575e" + ], + "3": [ + [ + "GEO_GETACCESSIONS", + "python", + "3.13.7" + ] + ], + "4": [ + [ + "GEO_GETACCESSIONS", + "requests", + "2.32.5" + ] + ], + "5": [ + [ + "GEO_GETACCESSIONS", + "nltk", + "3.9.1" + ] + ], + "6": [ + [ + "GEO_GETACCESSIONS", + "pyyaml", + "6.0.2" + ] + ], + "7": [ + [ + "GEO_GETACCESSIONS", + "pandas", + "2.3.2" + ] + ], + "8": [ + [ + "GEO_GETACCESSIONS", + "xmltodict", + "0.14.2" + ] + ], + "9": [ + [ + "GEO_GETACCESSIONS", + "biopython", + "1.85" + ] + ], + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "metadata": [ + [ + "filtered_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "rejected_datasets.metadata.tsv:md5,b7382bbefa84d5bb60089b057e75c09b", + "selected_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "species_datasets.metadata.tsv:md5,c36fd625541112de75d4d4ab38ec68e5" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-10-18T11:17:44.966423003" + } +} \ No newline at end of file diff --git a/tests/modules/local/geo/getdata/main.nf.test b/tests/modules/local/geo/getdata/main.nf.test new file mode 100644 index 00000000..c7cd11f9 --- /dev/null +++ b/tests/modules/local/geo/getdata/main.nf.test @@ -0,0 +1,226 @@ +nextflow_process { + + name "Test Process GEO_GETDATA" + script 
"modules/local/geo/getdata/main.nf" + process "GEO_GETDATA" + tag "geo_getdata" + + test("Beta vulgaris - Small RNA of sugar beet in response to drought stress") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE205328" + ] + input[1] = "beta vulgaris" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Accession does not exist") { + + when { + + process { + """ + input[0] = [ + [ ], + "GSE568945478" + ] + input[1] = "blabla" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +/* + test("Drosophila simulans - Only one sample among several") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE59707" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +*/ + test("Drosophila simulans - No data found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE124142" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by array") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE43665" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by high throughput sequencing / Some raw counts found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE59707" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by high throughput sequencing / One raw count found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE100837" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Only series suppl data but multiple species") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE274048" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Mismatch in suppl data colnames / design") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE49127" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.counts.size() == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/geo/getdata/main.nf.test.snap b/tests/modules/local/geo/getdata/main.nf.test.snap new file mode 100644 index 00000000..79e5e54a --- /dev/null +++ b/tests/modules/local/geo/getdata/main.nf.test.snap @@ -0,0 +1,450 @@ +{ + "Drosophila simulans - Expression profiling by high throughput sequencing / One raw count found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + 
"GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:57:14.702700023" + }, + "Drosophila simulans - No data found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:56:37.133179219" + }, + "Drosophila simulans - Mismatch in suppl data colnames / design": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:57:39.828336686" + }, + "Accession does not exist": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:56:24.827278212" + }, + "Drosophila simulans - Expression profiling by array": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:56:49.691858912" + }, + "Drosophila simulans - Expression profiling by high throughput sequencing / Some raw counts found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:57:02.033741199" + }, + "Drosophila simulans - Only series suppl data but multiple species": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:57:27.287712988" + 
}, + "Beta vulgaris - Small RNA of sugar beet in response to drought stress": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.74.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.1.4" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T15:56:12.348217731" + } +} \ No newline at end of file diff --git a/tests/modules/local/get_candidate_genes/main.nf.test b/tests/modules/local/get_candidate_genes/main.nf.test new file mode 100644 index 00000000..54c1c1b0 --- /dev/null +++ b/tests/modules/local/get_candidate_genes/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process GET_CANDIDATE_GENES" + script "modules/local/get_candidate_genes/main.nf" + process "GET_CANDIDATE_GENES" + tag "get_candidate_genes" + + test("With coefficient of variation") { + + when { + + process { + """ + input[0] = file('$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file('$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = "cv" + input[3] = 10 + input[4] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With RCVm and no filter on expression level") { + + when { + + process { + """ + input[0] = file('$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file('$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = "rcvm" + input[3] = 10 + input[4] = 0 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/get_candidate_genes/main.nf.test.snap b/tests/modules/local/get_candidate_genes/main.nf.test.snap new file mode 100644 index 00000000..91c17521 --- /dev/null +++ b/tests/modules/local/get_candidate_genes/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "With coefficient of variation": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "counts": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:57:33.822736894" + }, + "With RCVm and no filter on expression level": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "counts": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:57:44.20986199" + } +} \ No newline at end of file diff --git a/tests/modules/local/gprofiler/idmapping/main.nf.test b/tests/modules/local/gprofiler/idmapping/main.nf.test index a19f98a6..e754ba0f 100644 --- a/tests/modules/local/gprofiler/idmapping/main.nf.test +++ b/tests/modules/local/gprofiler/idmapping/main.nf.test @@ -3,40 +3,17 @@ nextflow_process { name "Test Process GPROFILER_IDMAPPING" script "modules/local/gprofiler/idmapping/main.nf" process "GPROFILER_IDMAPPING" - tag "idmapping" - tag "module" + tag "gprofiler_idmapping" - test("Map Ensembl IDs to themselves") { + test("ENSG - Mapping found") { when { - process { - """ - meta = [] - count_file = file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) - 
input[0] = [meta, count_file, "Solanum tuberosum"] - input[1] = '' - """ - } - } - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("Map NCBI IDs") { - - when { process { """ - meta = [] - count_file = file("$projectDir/tests/test_data/idmapping/base/counts.ncbi_ids.csv", checkIfExists: true) - input[0] = [meta, count_file, "Arabidopsis thaliana"] - input[1] = '' + input[0] = file("$projectDir/tests/test_data/idmapping/gene_ids/gene_ids.txt", checkIfExists: true) + input[1] = "Solanum tuberosum" + input[2] = "ENSG" """ } } @@ -44,116 +21,31 @@ nextflow_process { then { assertAll( { assert process.success }, + { assert process.out.mapping.size() == 1 }, { assert snapshot(process.out).match() } ) } - } - - - test("Map Uniprot IDs") { + /* + test("Entrez - No mapping found") { when { - process { - """ - meta = [] - count_file = file("$projectDir/tests/test_data/idmapping/base/counts.uniprot_ids.csv", checkIfExists: true) - input[0] = [meta, count_file, "Arabidopsis thaliana"] - input[1] = '' - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - test("Empty count file - ignore error") { - - tag "idmapping_empty" - - when { process { """ - meta = [] - count_file = file("$projectDir/tests/test_data/idmapping/empty/counts.csv", checkIfExists: true) - input[0] = [meta, count_file, "Arabidopsis thaliana"] - input[1] = '' + input[0] = file("$projectDir/tests/test_data/idmapping/gene_ids/gene_ids.txt", checkIfExists: true) + input[1] = "Solanum tuberosum" + input[2] = "ENTREZGENE" """ } } - // check for the absence of expected output (the error is ignored but no output is produced) then { assertAll( - { assert process.success }, - { assert process.trace.succeeded().size() == 0 }, - { assert process.trace.failed().size() == 1 }, - { assert process.out.renamed.size() == 0 }, - { assert process.out.metadata.size() == 0 }, - { assert process.out.mapping.size() == 0 } + { assert !process.success } ) } - - } - - test("Mapping not found - ignore error") { - - tag "idmapping_not_found" - - when { - process { - """ - meta = [] - count_file = file("$projectDir/tests/test_data/idmapping/not_found/counts.csv", checkIfExists: true) - input[0] = [meta, count_file, "Homo sapiens"] - input[1] = '' - """ - } - } - - // check for the absence of expected output (the error is ignored but no output is produced) - then { - assertAll( - { assert process.success }, - { assert process.trace.succeeded().size() == 0 }, - { assert process.trace.failed().size() == 1 }, - { assert process.out.renamed.size() == 0 }, - { assert process.out.metadata.size() == 0 }, - { assert process.out.mapping.size() == 0 } - ) - } - - } - - test("Custom mapping") { - - tag "custom_mapping" - - when { - process { - """ - meta = [] - count_file = file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) - custom_mapping_file = file("$projectDir/tests/test_data/idmapping/custom/mapping.csv", checkIfExists: true) - input[0] = [meta, count_file, "Solanum tuberosum"] - input[1] = custom_mapping_file - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } + */ } diff --git a/tests/modules/local/gprofiler/idmapping/main.nf.test.snap b/tests/modules/local/gprofiler/idmapping/main.nf.test.snap index 649c7294..5c458b7a 100644 --- a/tests/modules/local/gprofiler/idmapping/main.nf.test.snap +++ 
b/tests/modules/local/gprofiler/idmapping/main.nf.test.snap @@ -1,230 +1,43 @@ { - "Custom mapping": { + "ENSG - Mapping found": { "content": [ { "0": [ - [ - [ - - ], - "counts.ensembl_ids.renamed.csv:md5,b2f45fd17fcb2688ea08b94527f70c1f" - ] + "mapped_gene_ids.csv:md5,c4ef4df6530509b486662a107ba8de44" ], "1": [ - + "gene_metadata.csv:md5,f4dad0185e6f2d780f561d3efc301562" ], "2": [ - "counts.ensembl_ids.mapping.csv:md5,6ff8d8f71b9df7a1b08ff0bfda8da755" - ], - "3": [ [ "GPROFILER_IDMAPPING", "python", - "3.12.8" - ] - ], - "4": [ - [ - "GPROFILER_IDMAPPING", - "pandas", - "2.2.3" - ] - ], - "5": [ - [ - "GPROFILER_IDMAPPING", - "requests", - "2.32.3" - ] - ], - "metadata": [ - - ], - "renamed": [ - [ - [ - - ], - "counts.ensembl_ids.renamed.csv:md5,b2f45fd17fcb2688ea08b94527f70c1f" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-02-05T12:00:14.408028923" - }, - "Map Ensembl IDs to themselves": { - "content": [ - { - "0": [ - [ - [ - - ], - "counts.ensembl_ids.renamed.csv:md5,ef96059e3283d4305b2c004d649ae648" - ] - ], - "1": [ - "counts.ensembl_ids.metadata.csv:md5,608af2e5f8df9d8d2bb5829c04d6f9eb" - ], - "2": [ - "counts.ensembl_ids.mapping.csv:md5,a2d01dbbe84a30640825befdda3e1963" - ], - "3": [ - [ - "GPROFILER_IDMAPPING", - "python", - "3.12.8" - ] - ], - "4": [ - [ - "GPROFILER_IDMAPPING", - "pandas", - "2.2.3" - ] - ], - "5": [ - [ - "GPROFILER_IDMAPPING", - "requests", - "2.32.3" - ] - ], - "metadata": [ - "counts.ensembl_ids.metadata.csv:md5,608af2e5f8df9d8d2bb5829c04d6f9eb" - ], - "renamed": [ - [ - [ - - ], - "counts.ensembl_ids.renamed.csv:md5,ef96059e3283d4305b2c004d649ae648" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-02-05T11:59:32.564006233" - }, - "Map Uniprot IDs": { - "content": [ - { - "0": [ - [ - [ - - ], - "counts.uniprot_ids.renamed.csv:md5,a93d7145f8b35f6074cdf21c7c97de7c" + "3.13.5" ] ], - "1": [ - "counts.uniprot_ids.metadata.csv:md5,b87d6533848a3ae07b289ec6b0c4a1ff" - ], - "2": [ - "counts.uniprot_ids.mapping.csv:md5,fe88c79c45d45825d28f325f7a2f383e" - ], "3": [ - [ - "GPROFILER_IDMAPPING", - "python", - "3.12.8" - ] - ], - "4": [ [ "GPROFILER_IDMAPPING", "pandas", - "2.2.3" - ] - ], - "5": [ - [ - "GPROFILER_IDMAPPING", - "requests", - "2.32.3" - ] - ], - "metadata": [ - "counts.uniprot_ids.metadata.csv:md5,b87d6533848a3ae07b289ec6b0c4a1ff" - ], - "renamed": [ - [ - [ - - ], - "counts.uniprot_ids.renamed.csv:md5,a93d7145f8b35f6074cdf21c7c97de7c" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-02-05T11:59:50.805226174" - }, - "Map NCBI IDs": { - "content": [ - { - "0": [ - [ - [ - - ], - "counts.ncbi_ids.renamed.csv:md5,a93d7145f8b35f6074cdf21c7c97de7c" - ] - ], - "1": [ - "counts.ncbi_ids.metadata.csv:md5,b87d6533848a3ae07b289ec6b0c4a1ff" - ], - "2": [ - "counts.ncbi_ids.mapping.csv:md5,fe4fd9005dce99b7722b84134f51badd" - ], - "3": [ - [ - "GPROFILER_IDMAPPING", - "python", - "3.12.8" + "2.3.1" ] ], "4": [ - [ - "GPROFILER_IDMAPPING", - "pandas", - "2.2.3" - ] - ], - "5": [ [ "GPROFILER_IDMAPPING", "requests", - "2.32.3" + "2.32.4" ] ], "metadata": [ - "counts.ncbi_ids.metadata.csv:md5,b87d6533848a3ae07b289ec6b0c4a1ff" - ], - "renamed": [ - [ - [ - - ], - "counts.ncbi_ids.renamed.csv:md5,a93d7145f8b35f6074cdf21c7c97de7c" - ] + "gene_metadata.csv:md5,f4dad0185e6f2d780f561d3efc301562" ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - 
"timestamp": "2025-02-05T11:59:41.467446589" + "timestamp": "2026-01-07T10:30:46.747985228" } } \ No newline at end of file diff --git a/tests/modules/local/merge_counts/main.nf.test b/tests/modules/local/merge_counts/main.nf.test new file mode 100644 index 00000000..a8f55ea1 --- /dev/null +++ b/tests/modules/local/merge_counts/main.nf.test @@ -0,0 +1,84 @@ +nextflow_process { + + name "Test Process MERGE_COUNTS" + script "modules/local/merge_counts/main.nf" + process "MERGE_COUNTS" + tag "merge_counts" + + test("2 files") { + + when { + + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + [ + file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet", checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("2 identical files") { + + when { + + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + [ + file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("1 file") { + + when { + + process { + """ + input[0] = [ + [ platform: 'microarray' ], + [ + file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/tests/modules/local/merge_counts/main.nf.test.snap b/tests/modules/local/merge_counts/main.nf.test.snap new file mode 100644 index 00000000..bdd077da --- /dev/null +++ b/tests/modules/local/merge_counts/main.nf.test.snap @@ -0,0 +1,146 @@ +{ + "2 files": { + "content": [ + { + "0": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,ea386983967ba07c233245b530c3edd0" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.0" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.34.0" + ] + ], + "3": [ + [ + "MERGE_COUNTS", + "tqdm", + "4.67.1" + ] + ], + "counts": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,ea386983967ba07c233245b530c3edd0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T18:55:36.577938151" + }, + "2 identical files": { + "content": [ + { + "0": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,a83c94a90be32af4fc3bcc4909f6f1d2" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.0" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.34.0" + ] + ], + "3": [ + [ + "MERGE_COUNTS", + "tqdm", + "4.67.1" + ] + ], + "counts": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,a83c94a90be32af4fc3bcc4909f6f1d2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T18:55:45.369797346" + }, + "1 file": { + "content": [ + { + "0": [ + [ + { + "platform": "microarray" + }, + "all_counts.parquet:md5,57629ccf12df0e16a39281dfe02df4bc" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.0" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.34.0" + ] + ], + "3": [ + [ + "MERGE_COUNTS", + "tqdm", + 
"4.67.1" + ] + ], + "counts": [ + [ + { + "platform": "microarray" + }, + "all_counts.parquet:md5,57629ccf12df0e16a39281dfe02df4bc" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T18:55:54.187807965" + } +} \ No newline at end of file diff --git a/tests/modules/local/merge_data/main.nf.test b/tests/modules/local/merge_data/main.nf.test deleted file mode 100644 index 540e0086..00000000 --- a/tests/modules/local/merge_data/main.nf.test +++ /dev/null @@ -1,59 +0,0 @@ -nextflow_process { - - name "Test Process MERGE_DATA" - script "modules/local/merge_data/main.nf" - process "MERGE_DATA" - tag "merge" - tag "module" - - test("Merge counts - filter out genes with zero counts") { - - when { - process { - """ - ch_counts = Channel.fromPath( '$projectDir/tests/test_data/merge_data/input/counts*.parquet', checkIfExists: true).collect() - ch_designs = Channel.fromPath( '$projectDir/tests/test_data/merge_data/input/design*.csv', checkIfExists: true).collect() - ch_dataset_stat = Channel.fromPath( '$projectDir/tests/test_data/merge_data/input/dataset_stat*.csv', checkIfExists: true).collect() - nb_candidate_genes = 3 - input[0] = ch_counts - input[1] = ch_designs - input[2] = ch_dataset_stat - input[3] = nb_candidate_genes - """ - } - - } - - then { - assertAll( - { assert process.success }, - { assert path(process.out.all_counts[0]).exists() }, - { with( path(process.out.all_designs[0]).csv( header: true ) ) { - assert columnNames == ["batch", "condition", "sample"] - assert rowCount == 27 - }}, - { assert path(process.out.candidate_gene_counts[0]).exists() }, - { with( path(process.out.distribution_correlations[0]).csv( header: false ) ) { - assert rowCount == 27 - assert columnCount == 2 - }}, - { with( path(process.out.gene_count_statistics[0]).csv( header: false ) ) { - assert rowCount == 27 - assert columnCount == 2 - }}, - { with( path(process.out.skewness_statistics[0]).csv( header: false ) ) { - assert rowCount == 27 - assert columnCount == 2 - }}, - { with( path(process.out.ks_test_statistics[0]).csv( header: false ) ) { - assert rowCount == 27 - assert columnCount == 2 - }} - ) - - - } - - } - -} diff --git a/tests/modules/local/normalisation/compute_cpm/main.nf.test b/tests/modules/local/normalisation/compute_cpm/main.nf.test new file mode 100644 index 00000000..c32312b8 --- /dev/null +++ b/tests/modules/local/normalisation/compute_cpm/main.nf.test @@ -0,0 +1,101 @@ +nextflow_process { + + name "Test Process NORMALISATION_COMPUTE_CPM" + script "modules/local/normalisation/compute_cpm/main.nf" + process "NORMALISATION_COMPUTE_CPM" + tag "cpm_norm" + + + test("Very small dataset") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Rows with many zeros") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test"], + file('$projectDir/tests/test_data/normalisation/many_zeros/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One group") { + + when { + + process { + """ + input[0] = [ + [ dataset: "accession" ], + file('$projectDir/tests/test_data/normalisation/one_group/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + 
+ } + + test("TSV files") { + + when { + + process { + """ + input[0] = [ + [ dataset: "accession" ], + file('$projectDir/tests/test_data/normalisation/base/counts.tsv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap b/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap new file mode 100644 index 00000000..6948468f --- /dev/null +++ b/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap @@ -0,0 +1,198 @@ +{ + "Very small dataset": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.cpm.parquet:md5,5f9f89a0711ea45a216dcd29805d806a" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.cpm.parquet:md5,5f9f89a0711ea45a216dcd29805d806a" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:46:20.91627774" + }, + "One group": { + "content": [ + { + "0": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,0af81a4e4e335bb2be6c4fa2e375696c" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,0af81a4e4e335bb2be6c4fa2e375696c" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:46:38.175795079" + }, + "TSV files": { + "content": [ + { + "0": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,5f9f89a0711ea45a216dcd29805d806a" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,5f9f89a0711ea45a216dcd29805d806a" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:46:46.581965221" + }, + "Rows with many zeros": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.cpm.parquet:md5,a342cf59dee7ab9eadbe8df3420e3477" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.cpm.parquet:md5,a342cf59dee7ab9eadbe8df3420e3477" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:46:29.367031711" + } +} \ No newline at end of file diff --git a/tests/modules/local/normalisation/compute_tpm/main.nf.test b/tests/modules/local/normalisation/compute_tpm/main.nf.test new file mode 100644 index 00000000..23463223 --- /dev/null +++ b/tests/modules/local/normalisation/compute_tpm/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process NORMALISATION_COMPUTE_TPM" + script 
"modules/local/normalisation/compute_tpm/main.nf" + process "NORMALISATION_COMPUTE_TPM" + tag "tpm_norm" + + + test("Very small dataset") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/base/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Rows with many zeros") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test"], + file('$projectDir/tests/test_data/normalisation/many_zeros/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/many_zeros/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One group") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/one_group/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/one_group/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("TSV files") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.tsv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/base/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + +} diff --git a/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap b/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap new file mode 100644 index 00000000..5d11c3cb --- /dev/null +++ b/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap @@ -0,0 +1,206 @@ +{ + "Very small dataset": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,2fe2ac9557f7d3955d5104563185cb31" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,2fe2ac9557f7d3955d5104563185cb31" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:46:55.157210607" + }, + "One group": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,7cc642e81f82432bf38690f105e5d2de" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,7cc642e81f82432bf38690f105e5d2de" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:47:12.492274549" + }, + "TSV files": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,2fe2ac9557f7d3955d5104563185cb31" + ] + ], + "1": [ + + ], + "2": [ 
+ + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,2fe2ac9557f7d3955d5104563185cb31" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:47:20.896198862" + }, + "Rows with many zeros": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,72d0424d7465443a882963b3a77a2162" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.12.8" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.17.1" + ] + ], + "counts": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "counts.tpm.parquet:md5,72d0424d7465443a882963b3a77a2162" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:47:03.903930502" + } +} \ No newline at end of file diff --git a/tests/modules/local/normfinder/main.nf.test b/tests/modules/local/normfinder/main.nf.test new file mode 100644 index 00000000..b3ee40cb --- /dev/null +++ b/tests/modules/local/normfinder/main.nf.test @@ -0,0 +1,44 @@ +nextflow_process { + + name "Test Process NORMFINDER" + script "modules/local/normfinder/main.nf" + process "NORMFINDER" + tag "normfinder" + + test("Very small dataset - Cq values") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet', checkIfExists: true) + input[1] = file( '$projectDir/tests/test_data/normfinder/very_small_cq/design.csv', checkIfExists: true) + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + + test("Small dataset - Real expression values") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet', checkIfExists: true) + input[1] = file( '$projectDir/tests/test_data/normfinder/small_normalised/design.csv', checkIfExists: true) + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/normfinder/main.nf.test.snap b/tests/modules/local/normfinder/main.nf.test.snap new file mode 100644 index 00000000..e54a6eb4 --- /dev/null +++ b/tests/modules/local/normfinder/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "Small dataset - Real expression values": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "stability_values": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:59:42.999801136" + }, + "Very small dataset - Cq values": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "stability_values": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T18:59:29.818592345" + } +} \ No newline at end of file diff --git a/tests/modules/local/quantile_normalisation/main.nf.test b/tests/modules/local/quantile_normalisation/main.nf.test index bba28046..469c52bb 100644 --- a/tests/modules/local/quantile_normalisation/main.nf.test +++ b/tests/modules/local/quantile_normalisation/main.nf.test @@ -1,19 +1,43 @@ nextflow_process { - name "Test Process QUANTILE_NORMALISE" + name 
"Test Process QUANTILE_NORMALISATION" script "modules/local/quantile_normalisation/main.nf" - process "QUANTILE_NORMALISE" + process "QUANTILE_NORMALISATION" tag "quant_norm" - tag "module" - test("Should run without failures") { + test("Uniform target distribution") { when { process { """ - meta = [dataset: 'test'] - count_file = file( '$projectDir/tests/test_data/quantile_normalise/count.raw.cpm.csv', checkIfExists: true) - input[0] = [meta, count_file] + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/quantile_normalisation/count.raw.cpm.csv', checkIfExists: true) + ] + input[1] = "uniform" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Normal target distribution") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/quantile_normalisation/count.raw.cpm.csv', checkIfExists: true) + ] + input[1] = "normal" """ } } diff --git a/tests/modules/local/quantile_normalisation/main.nf.test.snap b/tests/modules/local/quantile_normalisation/main.nf.test.snap index 5fff9838..69d066a3 100644 --- a/tests/modules/local/quantile_normalisation/main.nf.test.snap +++ b/tests/modules/local/quantile_normalisation/main.nf.test.snap @@ -1,57 +1,106 @@ { - "Should run without failures": { + "Uniform target distribution": { "content": [ { "0": [ [ { - "dataset": "test" + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] }, - "count.raw.cpm.quant_norm.parquet:md5,6c49b5fe4e23e64dbc6cad6355432b49" + "count.raw.cpm.quant_norm.parquet:md5,57629ccf12df0e16a39281dfe02df4bc" ] ], "1": [ [ - "QUANTILE_NORMALISE", + "QUANTILE_NORMALISATION", "python", - "3.12.8" + "3.14.2" ] ], "2": [ [ - "QUANTILE_NORMALISE", - "pandas", - "2.2.3" + "QUANTILE_NORMALISATION", + "polars", + "1.36.1" ] ], "3": [ [ - "QUANTILE_NORMALISE", + "QUANTILE_NORMALISATION", "scikit-learn", - "1.6.1" + "1.8.0" ] ], - "4": [ + "counts": [ [ - "QUANTILE_NORMALISE", - "pyarrow", - "19.0.0" + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "count.raw.cpm.quant_norm.parquet:md5,57629ccf12df0e16a39281dfe02df4bc" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:48:09.829197251" + }, + "Normal target distribution": { + "content": [ + { + "0": [ + [ + { + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] + }, + "count.raw.cpm.quant_norm.parquet:md5,93484b73e81c9e3a6138aaddc1f79c41" + ] + ], + "1": [ + [ + "QUANTILE_NORMALISATION", + "python", + "3.14.2" + ] + ], + "2": [ + [ + "QUANTILE_NORMALISATION", + "polars", + "1.36.1" + ] + ], + "3": [ + [ + "QUANTILE_NORMALISATION", + "scikit-learn", + "1.8.0" ] ], "counts": [ [ { - "dataset": "test" + "dataset": [ + "nb_samples.ipynb:md5,eaea35c0b57650ecf2f88322e6060926" + ] }, - "count.raw.cpm.quant_norm.parquet:md5,6c49b5fe4e23e64dbc6cad6355432b49" + "count.raw.cpm.quant_norm.parquet:md5,93484b73e81c9e3a6138aaddc1f79c41" ] ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.6" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-05-08T14:38:27.184977502" + "timestamp": "2026-01-06T17:48:21.228107331" } } \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index b828e7a6..70fa952f 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -1,7 +1,15 @@ /* ======================================================================================== 
- Nextflow config file for running tests + Nextflow config file for running nf-test tests ======================================================================================== */ +// TODO nf-core: Specify any additional parameters here +// or any resource requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/stableexpression' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners enable.moduleBinaries = true diff --git a/tests/subworkflows/local/download_public_datasets/main.nf.test b/tests/subworkflows/local/download_public_datasets/main.nf.test new file mode 100644 index 00000000..54391b25 --- /dev/null +++ b/tests/subworkflows/local/download_public_datasets/main.nf.test @@ -0,0 +1,52 @@ +nextflow_workflow { + + name "Test Workflow DOWNLOAD_PUBLIC_DATASETS" + script "subworkflows/local/download_public_datasets/main.nf" + workflow "DOWNLOAD_PUBLIC_DATASETS" + tag "download_public_datasets" + + test("Beta vulgaris - Eatlas + GEO - all accessions") { + + when { + params { + species = 'beta vulgaris' + } + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = channel.fromList(['E-MTAB-8187', 'GSE107627', 'GSE114968', 'GSE135555', 'GSE205413', 'GSE269454', 'GSE281272', 'GSE55951', 'GSE79526', 'GSE92859']) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + test("Beta vulgaris - Eatlas only") { + + when { + params { + species = 'beta vulgaris' + } + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = channel.fromList(['E-MTAB-8187']) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + + +} diff --git a/tests/subworkflows/local/download_public_datasets/main.nf.test.snap b/tests/subworkflows/local/download_public_datasets/main.nf.test.snap new file mode 100644 index 00000000..3e299483 --- /dev/null +++ b/tests/subworkflows/local/download_public_datasets/main.nf.test.snap @@ -0,0 +1,86 @@ +{ + "Beta vulgaris - Eatlas only": { + "content": [ + { + "0": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ], + "datasets": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-16T15:18:21.726044151" + }, + "Beta vulgaris - Eatlas + GEO - all accessions": { + "content": [ + { + "0": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ], + [ + { + "dataset": "GSE55951_GPL18429", + "design": "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "normalised": true, + "platform": "microarray" + }, + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144" + ] 
+ ], + "datasets": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ], + [ + { + "dataset": "GSE55951_GPL18429", + "design": "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "normalised": true, + "platform": "microarray" + }, + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-16T15:18:08.622422246" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/expression_normalisation/main.nf.test b/tests/subworkflows/local/expression_normalisation/main.nf.test index d5407534..b5f5b689 100644 --- a/tests/subworkflows/local/expression_normalisation/main.nf.test +++ b/tests/subworkflows/local/expression_normalisation/main.nf.test @@ -6,22 +6,30 @@ nextflow_workflow { tag "subworkflow_expression_normalisation" tag "subworkflow" - test("DESeq2 Normalisation") { + test("TPM Normalisation") { when { workflow { """ - rnaseq_raw_file = file( '$projectDir/tests/test_data/custom_datasets/rnaseq.raw.csv', checkIfExists: true ) - rnaseq_raw_design_file = file( '$projectDir/tests/test_data/custom_datasets/rnaseq.raw.design.csv', checkIfExists: true ) - microarray_normalised_file = file( '$projectDir/tests/test_data/custom_datasets/microarray.normalised.csv', checkIfExists: true ) - microarray_normalised_design_file = file( '$projectDir/tests/test_data/custom_datasets/microarray.normalised.design.csv', checkIfExists: true ) - ch_datasets = Channel.of( - [ [normalised: false, design: rnaseq_raw_design_file, dataset: "rnaseq_raw", platform: "rnaseq"], rnaseq_raw_file], - [ [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], microarray_normalised_file ] + rnaseq_raw_file = file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + rnaseq_raw_design_file = file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ) + microarray_normalised_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + microarray_normalised_design_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ) + ch_datasets = channel.of( + [ + [normalised: false, design: rnaseq_raw_design_file, dataset: "rnaseq_raw", platform: "rnaseq"], + rnaseq_raw_file + ], + [ + [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], + microarray_normalised_file + ] ) - normalisation_method = "deseq2" - input[0] = ch_datasets - input[1] = normalisation_method + input[0] = "solanum_tuberosum" + input[1] = ch_datasets + input[2] = "tpm" + input[3] = "uniform" + input[4] = null """ } } @@ -35,22 +43,68 @@ nextflow_workflow { } - test("EdgeR Normalisation") { + test("TPM Normalisation with gene length") { when { workflow { """ - rnaseq_raw_file = file( '$projectDir/tests/test_data/custom_datasets/rnaseq.raw.csv', checkIfExists: true ) - rnaseq_raw_design_file = file( '$projectDir/tests/test_data/custom_datasets/rnaseq.raw.design.csv', checkIfExists: true ) - microarray_normalised_file = file( 
'$projectDir/tests/test_data/custom_datasets/microarray.normalised.csv', checkIfExists: true ) - microarray_normalised_design_file = file( '$projectDir/tests/test_data/custom_datasets/microarray.normalised.design.csv', checkIfExists: true ) - ch_datasets = Channel.of( - [ [normalised: false, design: rnaseq_raw_design_file, dataset: "rnaseq_raw", platform: "rnaseq"], rnaseq_raw_file], - [ [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], microarray_normalised_file ] + rnaseq_raw_file = file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + rnaseq_raw_design_file = file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ) + microarray_normalised_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + microarray_normalised_design_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ) + gene_length_file = file( '$projectDir/tests/test_data/input_datasets/gene_lengths.csv', checkIfExists: true ) + ch_datasets = channel.of( + [ + [normalised: false, design: rnaseq_raw_design_file, dataset: "rnaseq_raw", platform: "rnaseq"], + rnaseq_raw_file + ], + [ + [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], + microarray_normalised_file + ] ) - normalisation_method = "edger" - input[0] = ch_datasets - input[1] = normalisation_method + input[0] = "solanum_tuberosum" + input[1] = ch_datasets + input[2] = "tpm" + input[3] = "uniform" + input[4] = gene_length_file + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("CPM Normalisation") { + + when { + workflow { + """ + rnaseq_raw_file = file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + rnaseq_raw_design_file = file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ) + microarray_normalised_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + microarray_normalised_design_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ) + ch_datasets = channel.of( + [ + [normalised: false, design: rnaseq_raw_design_file, dataset: "rnaseq_raw", platform: "rnaseq"], + rnaseq_raw_file + ], + [ + [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], + microarray_normalised_file + ] + ) + input[0] = "solanum_tuberosum" + input[1] = ch_datasets + input[2] = "cpm" + input[3] = "uniform" + input[4] = null """ } } @@ -69,14 +123,19 @@ nextflow_workflow { when { workflow { """ - microarray_normalised_file = file( '$projectDir/tests/test_data/custom_datasets/microarray.normalised.csv', checkIfExists: true ) - microarray_normalised_design_file = file( '$projectDir/tests/test_data/custom_datasets/microarray.normalised.design.csv', checkIfExists: true ) - ch_datasets = Channel.of( - [ [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], microarray_normalised_file ] + microarray_normalised_file = file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + microarray_normalised_design_file = file( 
'$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ) + ch_datasets = channel.of( + [ + [normalised: true, design: microarray_normalised_design_file, dataset: "microarray_normalised", platform: "microarray"], + microarray_normalised_file + ] ) - normalisation_method = "deseq2" - input[0] = ch_datasets - input[1] = normalisation_method + input[0] = "solanum_tuberosum" + input[1] = ch_datasets + input[2] = "tpm" + input[3] = "uniform" + input[4] = null """ + } + } diff --git a/tests/subworkflows/local/expression_normalisation/main.nf.test.snap b/tests/subworkflows/local/expression_normalisation/main.nf.test.snap index 03373f2a..ef5a5f99 100644 --- a/tests/subworkflows/local/expression_normalisation/main.nf.test.snap +++ b/tests/subworkflows/local/expression_normalisation/main.nf.test.snap @@ -1,60 +1,5 @@ { - "No rnaseq normalisation": { - "content": [ - { - "0": [ - [ - { - "normalised": true, - "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", - "dataset": "microarray_normalised", - "platform": "microarray" - }, - "microarray.normalised.quant_norm.parquet:md5,eabdc05374b0c21ebcfd83b01efcedd4" - ] - ], - "1": [ - [ - { - "normalised": true, - "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", - "dataset": "microarray_normalised", - "platform": "microarray" - }, - "microarray.normalised.dataset_stats.csv:md5,5b23ee1b631b5068d6845ae7a85382f5" - ] - ], - "dataset_statistics": [ - [ - { - "normalised": true, - "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", - "dataset": "microarray_normalised", - "platform": "microarray" - }, - "microarray.normalised.dataset_stats.csv:md5,5b23ee1b631b5068d6845ae7a85382f5" - ] - ], - "normalised_counts": [ - [ - { - "normalised": true, - "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", - "dataset": "microarray_normalised", - "platform": "microarray" - }, - "microarray.normalised.quant_norm.parquet:md5,eabdc05374b0c21ebcfd83b01efcedd4" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.6" - }, - "timestamp": "2025-05-08T14:41:25.339980078" - }, - "EdgeR Normalisation": { + "CPM Normalisation": { "content": [ { "0": [ @@ -65,7 +10,7 @@ "dataset": "rnaseq_raw", "platform": "rnaseq" }, - "rnaseq.raw.cpm.quant_norm.parquet:md5,5c9765f2ffbc78fb90b5591a28e68fe4" + "rnaseq.raw.cpm.quant_norm.parquet:md5,447d804d600b61f0bc86326d3e0972cc" ], [ { @@ -74,10 +19,10 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.quant_norm.parquet:md5,eabdc05374b0c21ebcfd83b01efcedd4" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] ], - "1": [ + "counts": [ [ { "normalised": false, @@ -85,7 +30,7 @@ "dataset": "rnaseq_raw", "platform": "rnaseq" }, - "rnaseq.raw.cpm.dataset_stats.csv:md5,d5ab6a5663b62b9869cfddf31bfdac74" + "rnaseq.raw.cpm.quant_norm.parquet:md5,447d804d600b61f0bc86326d3e0972cc" ], [ { @@ -94,19 +39,21 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.dataset_stats.csv:md5,5b23ee1b631b5068d6845ae7a85382f5" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] - ], - "dataset_statistics": [ - [ - { - "normalised": false, - "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", - "dataset": "rnaseq_raw", - "platform": "rnaseq" - }, - "rnaseq.raw.cpm.dataset_stats.csv:md5,d5ab6a5663b62b9869cfddf31bfdac74" - ], + ] + } 
+ ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:50:09.532650251" + }, + "No rnaseq normalisation": { + "content": [ + { + "0": [ [ { "normalised": true, @@ -114,19 +61,10 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.dataset_stats.csv:md5,5b23ee1b631b5068d6845ae7a85382f5" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] ], - "normalised_counts": [ - [ - { - "normalised": false, - "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", - "dataset": "rnaseq_raw", - "platform": "rnaseq" - }, - "rnaseq.raw.cpm.quant_norm.parquet:md5,5c9765f2ffbc78fb90b5591a28e68fe4" - ], + "counts": [ [ { "normalised": true, @@ -134,18 +72,18 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.quant_norm.parquet:md5,eabdc05374b0c21ebcfd83b01efcedd4" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.6" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-05-08T14:40:40.452345993" + "timestamp": "2026-01-06T17:50:21.467295296" }, - "DESeq2 Normalisation": { + "TPM Normalisation with gene length": { "content": [ { "0": [ @@ -156,7 +94,7 @@ "dataset": "rnaseq_raw", "platform": "rnaseq" }, - "rnaseq.raw.cpm.quant_norm.parquet:md5,5c9765f2ffbc78fb90b5591a28e68fe4" + "rnaseq.raw.tpm.quant_norm.parquet:md5,f7e75c14dde78849897a89f6e2d6ef65" ], [ { @@ -165,10 +103,10 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.quant_norm.parquet:md5,eabdc05374b0c21ebcfd83b01efcedd4" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] ], - "1": [ + "counts": [ [ { "normalised": false, @@ -176,7 +114,7 @@ "dataset": "rnaseq_raw", "platform": "rnaseq" }, - "rnaseq.raw.cpm.dataset_stats.csv:md5,d5ab6a5663b62b9869cfddf31bfdac74" + "rnaseq.raw.tpm.quant_norm.parquet:md5,f7e75c14dde78849897a89f6e2d6ef65" ], [ { @@ -185,10 +123,21 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.dataset_stats.csv:md5,5b23ee1b631b5068d6845ae7a85382f5" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] - ], - "dataset_statistics": [ + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-06T17:49:56.051333397" + }, + "TPM Normalisation": { + "content": [ + { + "0": [ [ { "normalised": false, @@ -196,7 +145,7 @@ "dataset": "rnaseq_raw", "platform": "rnaseq" }, - "rnaseq.raw.cpm.dataset_stats.csv:md5,d5ab6a5663b62b9869cfddf31bfdac74" + "rnaseq.raw.tpm.quant_norm.parquet:md5,f7e75c14dde78849897a89f6e2d6ef65" ], [ { @@ -205,10 +154,10 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.dataset_stats.csv:md5,5b23ee1b631b5068d6845ae7a85382f5" + "microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] ], - "normalised_counts": [ + "counts": [ [ { "normalised": false, @@ -216,7 +165,7 @@ "dataset": "rnaseq_raw", "platform": "rnaseq" }, - "rnaseq.raw.cpm.quant_norm.parquet:md5,5c9765f2ffbc78fb90b5591a28e68fe4" + "rnaseq.raw.tpm.quant_norm.parquet:md5,f7e75c14dde78849897a89f6e2d6ef65" ], [ { @@ -225,15 +174,15 @@ "dataset": "microarray_normalised", "platform": "microarray" }, - "microarray.normalised.quant_norm.parquet:md5,eabdc05374b0c21ebcfd83b01efcedd4" + 
"microarray.normalised.quant_norm.parquet:md5,9c3aec01cdb7ac94b0c28acd711a12a0" ] ] } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.6" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-05-08T14:39:51.31785655" + "timestamp": "2026-01-06T17:49:33.232295092" } } \ No newline at end of file diff --git a/tests/subworkflows/local/expressionatlas_fetchdata/main.nf.test b/tests/subworkflows/local/expressionatlas_fetchdata/main.nf.test deleted file mode 100644 index 421785b0..00000000 --- a/tests/subworkflows/local/expressionatlas_fetchdata/main.nf.test +++ /dev/null @@ -1,36 +0,0 @@ -nextflow_workflow { - - name "Test Workflow EXPRESSIONATLAS_FETCHDATA" - script "subworkflows/local/expressionatlas_fetchdata/main.nf" - workflow "EXPRESSIONATLAS_FETCHDATA" - tag "expressionatlas_fetchdata" - tag "subworkflow" - - test("Should run without failures") { - - when { - workflow { - """ - species = 'solanum tuberosum' - eatlas_accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" - eatlas_keywords = "potato,stress" - fetch_eatlas_accessions = false // no impact since we define keywords - - input[0] = Channel.value( species.split(' ').join('_') ) - input[1] = eatlas_accessions - input[2] = eatlas_keywords - input[3] = fetch_eatlas_accessions - """ - } - } - - then { - assertAll( - { assert workflow.success }, - { assert snapshot(workflow.out).match() } - ) - } - - } - -} diff --git a/tests/subworkflows/local/expressionatlas_fetchdata/main.nf.test.snap b/tests/subworkflows/local/expressionatlas_fetchdata/main.nf.test.snap deleted file mode 100644 index e43779d1..00000000 --- a/tests/subworkflows/local/expressionatlas_fetchdata/main.nf.test.snap +++ /dev/null @@ -1,101 +0,0 @@ -{ - "Should run without failures": { - "content": [ - { - "0": [ - [ - { - "dataset": "E_GEOD_77826_rnaseq", - "design": "E_GEOD_77826_rnaseq.design.csv:md5,5aa61df754aa9c6c107b247c642d2e53" - }, - "E_GEOD_77826_rnaseq.rnaseq.raw.counts.csv:md5,85cea79c602a9924d5a4d6b597ef5530" - ], - [ - { - "dataset": "E_MTAB_4251_rnaseq", - "design": "E_MTAB_4251_rnaseq.design.csv:md5,4f3ef7b76ca6ed1ec3157295bc4a8d84" - }, - "E_MTAB_4251_rnaseq.rnaseq.raw.counts.csv:md5,5cf27be0e00b93d5d431754ba8058687" - ], - [ - { - "dataset": "E_MTAB_4301_rnaseq", - "design": "E_MTAB_4301_rnaseq.design.csv:md5,165eeef7d612c01fd62baae1a3f296ae" - }, - "E_MTAB_4301_rnaseq.rnaseq.raw.counts.csv:md5,1ab49feea238e7b1419937b5037952b5" - ], - [ - { - "dataset": "E_MTAB_5038_rnaseq", - "design": "E_MTAB_5038_rnaseq.design.csv:md5,352ed3163d7deef2be35d899418d5ad4" - }, - "E_MTAB_5038_rnaseq.rnaseq.raw.counts.csv:md5,b4acb3d7c39cdb2bd6cef6c9314c5b2a" - ], - [ - { - "dataset": "E_MTAB_5215_rnaseq", - "design": "E_MTAB_5215_rnaseq.design.csv:md5,2741dcd5b45bacce865db632f626a273" - }, - "E_MTAB_5215_rnaseq.rnaseq.raw.counts.csv:md5,273704bdf762c342271b33958a84d1e7" - ], - [ - { - "dataset": "E_MTAB_7711_rnaseq", - "design": "E_MTAB_7711_rnaseq.design.csv:md5,3e7748b54a0c25c008d9bd2ddbf1bf00" - }, - "E_MTAB_7711_rnaseq.rnaseq.raw.counts.csv:md5,3c02cf432c29d3751c978439539df388" - ] - ], - "downloaded_datasets": [ - [ - { - "dataset": "E_GEOD_77826_rnaseq", - "design": "E_GEOD_77826_rnaseq.design.csv:md5,5aa61df754aa9c6c107b247c642d2e53" - }, - "E_GEOD_77826_rnaseq.rnaseq.raw.counts.csv:md5,85cea79c602a9924d5a4d6b597ef5530" - ], - [ - { - "dataset": "E_MTAB_4251_rnaseq", - "design": "E_MTAB_4251_rnaseq.design.csv:md5,4f3ef7b76ca6ed1ec3157295bc4a8d84" - }, - "E_MTAB_4251_rnaseq.rnaseq.raw.counts.csv:md5,5cf27be0e00b93d5d431754ba8058687" - ], - [ - { - 
"dataset": "E_MTAB_4301_rnaseq", - "design": "E_MTAB_4301_rnaseq.design.csv:md5,165eeef7d612c01fd62baae1a3f296ae" - }, - "E_MTAB_4301_rnaseq.rnaseq.raw.counts.csv:md5,1ab49feea238e7b1419937b5037952b5" - ], - [ - { - "dataset": "E_MTAB_5038_rnaseq", - "design": "E_MTAB_5038_rnaseq.design.csv:md5,352ed3163d7deef2be35d899418d5ad4" - }, - "E_MTAB_5038_rnaseq.rnaseq.raw.counts.csv:md5,b4acb3d7c39cdb2bd6cef6c9314c5b2a" - ], - [ - { - "dataset": "E_MTAB_5215_rnaseq", - "design": "E_MTAB_5215_rnaseq.design.csv:md5,2741dcd5b45bacce865db632f626a273" - }, - "E_MTAB_5215_rnaseq.rnaseq.raw.counts.csv:md5,273704bdf762c342271b33958a84d1e7" - ], - [ - { - "dataset": "E_MTAB_7711_rnaseq", - "design": "E_MTAB_7711_rnaseq.design.csv:md5,3e7748b54a0c25c008d9bd2ddbf1bf00" - }, - "E_MTAB_7711_rnaseq.rnaseq.raw.counts.csv:md5,3c02cf432c29d3751c978439539df388" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "25.04.0" - }, - "timestamp": "2025-05-12T16:09:37.515591183" - } -} \ No newline at end of file diff --git a/tests/subworkflows/local/genorm/main.nf.test b/tests/subworkflows/local/genorm/main.nf.test new file mode 100644 index 00000000..1b44b4a7 --- /dev/null +++ b/tests/subworkflows/local/genorm/main.nf.test @@ -0,0 +1,49 @@ +nextflow_workflow { + + name "Test Workflow genorm" + script "subworkflows/local/genorm/main.nf" + workflow "GENORM" + tag "subworkflow_genorm" + tag "subworkflow" + + test("10 genes") { + + tag "subworkflow_genorm_10_genes" + + when { + workflow { + """ + ch_counts = channel.fromPath( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.head.parquet', checkIfExists: true) + input[0] = ch_counts + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + test("1000 genes") { + + tag "subworkflow_genorm_1000_genes" + + when { + workflow { + """ + ch_counts = channel.fromPath( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + input[0] = ch_counts + """ + } + } + + then { + assert workflow.success + // assert snapshot(workflow.out).match() + } + + } + +} diff --git a/tests/subworkflows/local/genorm/main.nf.test.snap b/tests/subworkflows/local/genorm/main.nf.test.snap new file mode 100644 index 00000000..0f1f8714 --- /dev/null +++ b/tests/subworkflows/local/genorm/main.nf.test.snap @@ -0,0 +1,19 @@ +{ + "10 genes": { + "content": [ + { + "0": [ + + ], + "m_measures": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-12-03T19:01:05.843822263" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/genorm/run_genorm.py b/tests/subworkflows/local/genorm/run_genorm.py new file mode 100644 index 00000000..9704d7dc --- /dev/null +++ b/tests/subworkflows/local/genorm/run_genorm.py @@ -0,0 +1,44 @@ +import sys + +import numpy as np +import pandas as pd + +file = sys.argv[1] +# Expression data for three control genes. +counts = pd.read_parquet(file) +counts.set_index("gene_id", inplace=True) +counts = counts.T.replace(0, 1e-8) + + +def _m_numpy(gene_expression: np.ndarray) -> np.ndarray: + """Internal control gene-stability measure `M`. + + Computes Eq. (4) in Ref. [1]. + + [1]: Vandesompele, Jo, et al. "Accurate normalization of real-time quantitative + RT-PCR data by geometric averaging of multiple internal control genes." Genome + biology 3.7 (2002): 1-12. + """ + + if not (gene_expression > 0).all(): + raise ValueError( + "Expression domain error: not all expression data are strictly positive!" 
+ ) + + a = gene_expression + # Eq. (2): A_{jk}^{(i)} = log_2 (a_{ij} / a_{ik}) + A = np.log2(np.einsum("ij,ik->ijk", a, 1 / a)) + # Eq. (3) + V = np.std(A, axis=0) + # Eq. (4). N.B.: V_{j=k} is zero, so the diagonal does not + # contribute to the sum and can safely be ignored. + n = V.shape[1] + return np.sum(V, axis=1) / (n - 1) + + +def m_measure(gene_expression): + m_values = _m_numpy(gene_expression.to_numpy()) + return pd.Series(m_values, index=gene_expression.columns) + + +print(m_measure(counts).sort_values()) diff --git a/tests/subworkflows/local/get_public_accessions/main.nf.test b/tests/subworkflows/local/get_public_accessions/main.nf.test new file mode 100644 index 00000000..d1d0e64b --- /dev/null +++ b/tests/subworkflows/local/get_public_accessions/main.nf.test @@ -0,0 +1,192 @@ +nextflow_workflow { + + name "Test Workflow GET_PUBLIC_ACCESSIONS" + script "subworkflows/local/get_public_accessions/main.nf" + workflow "GET_PUBLIC_ACCESSIONS" + tag "get_public_accessions" + + test("Fetch eatlas accessions without keywords") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("Fetch public accessions with keywords") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = true + platform = null + keywords = "leaf" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ?
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("No GEO + accessions provided") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("Accessions file + Excluded accessions file") { + + when { + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = file( '$projectDir/tests/test_data/public_accessions/exclude_one_two_accessions.txt', checkIfExists: true ) + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + +} diff --git a/tests/subworkflows/local/get_public_accessions/main.nf.test.snap b/tests/subworkflows/local/get_public_accessions/main.nf.test.snap new file mode 100644 index 00000000..85090531 --- /dev/null +++ b/tests/subworkflows/local/get_public_accessions/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "Accessions file + Excluded accessions file": { + "content": [ + { + "0": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-PROT-138" + ], + "accessions": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-PROT-138" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T11:13:39.15754592" + }, + "No GEO + accessions provided": { + "content": [ + { + "0": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-PROT-138" + ], + "accessions": [ + "E-GEOD-61690", + "E-MTAB-552", + "E-PROT-138" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T11:13:13.734553188" + }, + "Fetch eatlas accessions without keywords": { + "content": [ + { + "0": [ + + ], + "accessions": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T11:12:37.434155203" + }, + "Fetch public accessions with keywords": { + "content": [ + { + "0": [ + + ], + "accessions": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-11T14:12:14.94130631" + } +} \ No newline at end of file diff --git a/tests/test_data/aggregate_results/mapping.csv b/tests/test_data/aggregate_results/mapping.csv new file mode 100644 index 00000000..b3c00132 --- /dev/null +++ b/tests/test_data/aggregate_results/mapping.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,ENSRNA049454747 +ENSRNA049434246,ENSRNA049454887 +ENSRNA049434252,SNSRNA049434252 diff --git a/tests/test_data/aggregate_results/metadata.csv b/tests/test_data/aggregate_results/metadata.csv new file mode 100644 index 00000000..5e8f3142 --- /dev/null +++ b/tests/test_data/aggregate_results/metadata.csv @@ -0,0 +1,4 @@ +gene_id,name,description +ENSRNA049454747,geneA,descriptionA +ENSRNA049454887,geneB,descriptionB +ENSRNA049454747,geneC,descriptionC diff --git a/tests/test_data/aggregate_results/microarray_stats_all_genes.csv b/tests/test_data/aggregate_results/microarray_stats_all_genes.csv new file mode 100644 index 00000000..0fe7d08f --- /dev/null +++ b/tests/test_data/aggregate_results/microarray_stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,microarray_mean,microarray_standard_deviation,microarray_median,microarray_median_absolute_deviation,microarray_coefficient_of_variation,microarray_robust_coefficient_of_variation_median,microarray_ratio_nulls_in_all_samples,microarray_ratio_nulls_in_valid_samples,microarray_ratio_zeros,microarray_expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 
+ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv b/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv new file mode 100644 index 00000000..de9af372 --- /dev/null +++ b/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,rnaseq_mean,rnaseq_standard_deviation,rnaseq_median,rnaseq_median_absolute_deviation,rnaseq_coefficient_of_variation,rnaseq_robust_coefficient_of_variation_median,rnaseq_ratio_nulls_in_all_samples,rnaseq_ratio_nulls_in_valid_samples,rnaseq_ratio_zeros,rnaseq_expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/base_statistics/output/stats_all_genes.csv b/tests/test_data/base_statistics/output/stats_all_genes.csv new file mode 100644 index 00000000..85153b9e --- /dev/null +++ b/tests/test_data/base_statistics/output/stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/compute_gene_statistics/input/design.csv b/tests/test_data/compute_gene_statistics/input/design.csv new file mode 100644 index 
00000000..d3e8694c --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/design.csv @@ -0,0 +1,28 @@ +sample,condition,batch +ARR029909,g1,A +ARR029910,g1,A +ARR029911,g1,A +ARR029912,g2,A +ARR029913,g2,A +ARR029914,g2,A +ARR029915,g3,A +ARR029916,g3,A +ARR029917,g3,A +URR029909,g1,B +URR029910,g1,B +URR029911,g1,B +URR029912,g2,B +URR029913,g2,B +URR029914,g2,B +URR029915,g3,B +URR029916,g3,B +URR029917,g3,B +ERR029909,g1,C +ERR029910,g1,C +ERR029911,g1,C +ERR029912,g2,C +ERR029913,g2,C +ERR029914,g2,C +ERR029915,g3,C +ERR029916,g3,C +ERR029917,g3,C diff --git a/tests/test_data/gene_statistics/input/gene_counts.csv b/tests/test_data/compute_gene_statistics/input/gene_counts.csv similarity index 100% rename from tests/test_data/gene_statistics/input/gene_counts.csv rename to tests/test_data/compute_gene_statistics/input/gene_counts.csv diff --git a/tests/test_data/gene_statistics/input/ks_stats.csv b/tests/test_data/compute_gene_statistics/input/ks_stats.csv similarity index 100% rename from tests/test_data/gene_statistics/input/ks_stats.csv rename to tests/test_data/compute_gene_statistics/input/ks_stats.csv diff --git a/tests/test_data/gene_statistics/input/mapping1.csv b/tests/test_data/compute_gene_statistics/input/mapping1.csv similarity index 80% rename from tests/test_data/gene_statistics/input/mapping1.csv rename to tests/test_data/compute_gene_statistics/input/mapping1.csv index d8abe730..8c5865b4 100644 --- a/tests/test_data/gene_statistics/input/mapping1.csv +++ b/tests/test_data/compute_gene_statistics/input/mapping1.csv @@ -1,4 +1,4 @@ -original_gene_id,ensembl_gene_id +original_gene_id,gene_id Q8VWG3,AT1G34790 Q9FJA2,AT5G35550 Q8RYD9,AT5G23260 diff --git a/tests/test_data/gene_statistics/input/mapping2.csv b/tests/test_data/compute_gene_statistics/input/mapping2.csv similarity index 80% rename from tests/test_data/gene_statistics/input/mapping2.csv rename to tests/test_data/compute_gene_statistics/input/mapping2.csv index 305ccbea..080dbefd 100644 --- a/tests/test_data/gene_statistics/input/mapping2.csv +++ b/tests/test_data/compute_gene_statistics/input/mapping2.csv @@ -1,4 +1,4 @@ -original_gene_id,ensembl_gene_id +original_gene_id,gene_id Q8VWG3,AT1G34790 Q9FJA2,AT5G35550 Q8RYD9,AT5G23260 diff --git a/tests/test_data/gene_statistics/input/mapping3.csv b/tests/test_data/compute_gene_statistics/input/mapping3.csv similarity index 67% rename from tests/test_data/gene_statistics/input/mapping3.csv rename to tests/test_data/compute_gene_statistics/input/mapping3.csv index e20257b0..c8fbe3f9 100644 --- a/tests/test_data/gene_statistics/input/mapping3.csv +++ b/tests/test_data/compute_gene_statistics/input/mapping3.csv @@ -1,4 +1,4 @@ -original_gene_id,ensembl_gene_id +original_gene_id,gene_id Q8VWG3,AT1G34790 Q9FJA2,AT5G35550 Q8RYD9,AT5G23260 diff --git a/tests/test_data/gene_statistics/input/metadata1.csv b/tests/test_data/compute_gene_statistics/input/metadata1.csv similarity index 87% rename from tests/test_data/gene_statistics/input/metadata1.csv rename to tests/test_data/compute_gene_statistics/input/metadata1.csv index ea4db477..399628bf 100644 --- a/tests/test_data/gene_statistics/input/metadata1.csv +++ b/tests/test_data/compute_gene_statistics/input/metadata1.csv @@ -1,4 +1,4 @@ -ensembl_gene_id,name,description +gene_id,name,description AT1G34790,TT1,C2H2 and C2HC zinc fingers superfamily protein AT5G35550,TT2,Duplicated homeodomain-like superfamily protein AT5G23260,TT16,K-box region and MADS-box transcription factor family protein diff --git 
a/tests/test_data/gene_statistics/input/metadata2.csv b/tests/test_data/compute_gene_statistics/input/metadata2.csv similarity index 85% rename from tests/test_data/gene_statistics/input/metadata2.csv rename to tests/test_data/compute_gene_statistics/input/metadata2.csv index b5890d89..69fadca4 100644 --- a/tests/test_data/gene_statistics/input/metadata2.csv +++ b/tests/test_data/compute_gene_statistics/input/metadata2.csv @@ -1,4 +1,4 @@ -ensembl_gene_id,name,description +gene_id,name,description AT1G34790,TT1,C2H2 and C2HC zinc fingers superfamily protein AT5G35550,TT2,Duplicated homeodomain-like superfamily protein AT5G23260,TT16,K-box region and MADS-box transcription factor family protein diff --git a/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv b/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv new file mode 100644 index 00000000..e40090d6 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv @@ -0,0 +1,8 @@ +gene_id,microarray_mean,microarray_standard_deviation,microarray_median,microarray_median_absolute_deviation,microarray_variation_coefficient,microarray_total_nb_nulls,microarray_nb_nulls_valid_samples,microarray_stability_score,microarray_expression_level_quantile_interval +AT1G34790,0.6041722385984585,0.2965945020346847,0.8210634736950527,0.07852066041592076,0.49091051042450434,678,643,1.4392880915454482,71 +AT5G35550,0.04211885958141837,0.017403154131542625,0.04081717758449555,0.00889668425147598,0.41319148487155133,678,643,1.3615690659924953,0 +AT5G23260,0.3265572056851324,0.12636844695328353,0.2977133397782717,0.09861099799987358,0.3869718528738528,678,643,1.3353494339947967,35 +AT5G23261,0.05948100952172446,0.0268768665570047,0.049569984365840696,0.021228253513649518,0.4518562608993441,678,643,1.400233842020288,1 +AT1G34790,0.5791984846868644,0.16532007773816776,0.5865184277282238,0.13319224137108376,0.28542905775650596,70,35,0.337051476635562,68 +AT5G35550,0.4069181057633956,0.2662419700433056,0.26506770843115524,0.13156965253473574,0.6542888268484007,678,643,1.6026664079693447,46 +AT5G23260,0.12079194562039748,0.060559689529495545,0.10818687095210754,0.0368400391021249,0.5013553612242599,678,643,1.449732942345204,7 diff --git a/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv b/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv new file mode 100644 index 00000000..e4c7327d --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv @@ -0,0 +1,8 @@ +gene_id,rnaseq_mean,rnaseq_standard_deviation,rnaseq_median,rnaseq_median_absolute_deviation,rnaseq_variation_coefficient,rnaseq_total_nb_nulls,rnaseq_nb_nulls_valid_samples,rnaseq_stability_score,rnaseq_expression_level_quantile_interval +AT1G34790,0.029004004004004002,0.061217504567865136,0.0,0.0,2.110657016852365,345,336,3.0544772415714663,0 +AT5G35550,0.2921254587921254,0.028005675342417956,0.28128128128128127,0.025025025025025016,0.09586865676896245,356,347,1.070587757892558,41 +AT5G23260,0.051621388830691145,0.04715133948046024,0.04154154154154154,0.027027027027027035,0.9134070304677029,322,313,1.7926205136137703,3 +AT5G23261,0.06000444889333778,0.0796183056079376,0.030030030030030026,0.030030030030030026,1.3268733748303374,356,347,2.301592475953933,5 +AT1G34790,0.027638749860972082,0.019581626793675158,0.025525525525525526,0.014014014014014014,0.7084845332069752,356,347,1.6832036343305707,0 
+AT5G35550,0.07687920478618152,0.05023977809403856,0.06906906906906907,0.03603603603603604,0.6534898251583997,322,313,1.532703308304467,8 +AT5G23260,0.05421550582840906,0.0785887308235655,0.0,0.0,1.4495618849761762,303,294,2.2754045816053896,4 diff --git a/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv b/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv new file mode 100644 index 00000000..b4b6ae10 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv @@ -0,0 +1,10 @@ +gene_id,genorm_m_measure +ENSRNA049454747,0.16034699963469335 +ENSRNA049454887,0.525024672172669794 +ENSRNA049454931,0.264017707597323344 +ENSRNA049454947,0.037074358179388235 +ENSRNA049454955,0.65294154739420848 +ENSRNA049454963,0.213698246698642331 +ENSRNA049454974,0.16807095772646336 +ENSRNA049455639,0.02698654413301954 +ENSRNA049455690,0.57785261216485885 diff --git a/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv b/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv new file mode 100644 index 00000000..238572e2 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv @@ -0,0 +1,10 @@ +gene_id,normfinder_stability_value +ENSRNA049454747,0.036034699963469335 +ENSRNA049454887,0.05024672172669794 +ENSRNA049454931,0.014017707597323344 +ENSRNA049454947,0.037074358179388235 +ENSRNA049454955,0.03294154739420848 +ENSRNA049454963,0.03698246698642331 +ENSRNA049454974,0.06807095772646336 +ENSRNA049455639,0.02698654413301954 +ENSRNA049455690,0.07785261216485885 diff --git a/tests/test_data/compute_stability_scores/input/stats_all_genes.csv b/tests/test_data/compute_stability_scores/input/stats_all_genes.csv new file mode 100644 index 00000000..85153b9e --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/custom_datasets/input.csv b/tests/test_data/custom_datasets/input.csv deleted file mode 100644 index 697e034b..00000000 --- a/tests/test_data/custom_datasets/input.csv +++ /dev/null @@ -1,3 +0,0 @@ -counts,design,platform,normalised -tests/test_data/custom_datasets/microarray.normalised.csv,tests/test_data/custom_datasets/microarray.normalised.design.csv,microarray,true 
-tests/test_data/custom_datasets/rnaseq.raw.csv,tests/test_data/custom_datasets/rnaseq.raw.design.csv,rnaseq,false diff --git a/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet b/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet new file mode 100644 index 00000000..5e09a87a Binary files /dev/null and b/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet differ diff --git a/tests/test_data/dataset_statistics/output/test.dataset_stats.csv b/tests/test_data/dataset_statistics/output/test.dataset_stats.csv new file mode 100644 index 00000000..ead888ce --- /dev/null +++ b/tests/test_data/dataset_statistics/output/test.dataset_stats.csv @@ -0,0 +1,9 @@ +sample,count,mean,std,min,25%,50%,75%,max,skewness,kolmogorov_smirnov_pvalue +sample_63,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_64,9.0,0.5,0.34089725358236606,0.0,0.25,0.5625,0.75,1.0,-0.0059425832940604335,0.013238665147108418 +sample_65,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_66,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_67,9.0,0.4861111111111111,0.361444824435364,0.0,0.25,0.5,0.75,1.0,-0.09766083489340871,0.013238665147108418 +sample_68,9.0,0.4861111111111111,0.361444824435364,0.0,0.25,0.5,0.75,1.0,-0.09766083489340871,0.013238665147108418 +sample_69,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_70,9.0,0.5,0.34089725358236606,0.0,0.3125,0.5,0.75,1.0,0.0178277498821813,0.013238665147108418 diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet new file mode 100644 index 00000000..3421c129 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet new file mode 100644 index 00000000..ca1bcfb2 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet new file mode 100644 index 00000000..44d8b7fb Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet new file mode 100644 index 00000000..bda5baec Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet new file mode 100644 index 00000000..b7c2f8e3 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet new file mode 100644 index 00000000..7a388549 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet new file mode 100644 index 00000000..83e7970d Binary files /dev/null and 
b/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet new file mode 100644 index 00000000..3e96e691 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet new file mode 100644 index 00000000..36ae146c Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet new file mode 100644 index 00000000..a38417a7 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet differ diff --git a/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet b/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet new file mode 100644 index 00000000..bc08ec7b Binary files /dev/null and b/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet differ diff --git a/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet b/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet new file mode 100644 index 00000000..2ecf4c28 Binary files /dev/null and b/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet differ diff --git a/tests/test_data/genorm/make_chunks/input/counts.head.parquet b/tests/test_data/genorm/make_chunks/input/counts.head.parquet new file mode 100644 index 00000000..d01f1c63 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/input/counts.head.parquet differ diff --git a/tests/test_data/genorm/make_chunks/input/counts.parquet b/tests/test_data/genorm/make_chunks/input/counts.parquet new file mode 100644 index 00000000..8eb2cc91 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/input/counts.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet new file mode 100644 index 00000000..90515d2e Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet new file mode 100644 index 00000000..4661c425 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet new file mode 100644 index 00000000..f2b055f3 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet differ diff --git a/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet b/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet new file mode 100644 index 00000000..b3dbda97 Binary files /dev/null and b/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet differ diff --git a/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet b/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet new file mode 100644 index 00000000..082ad2d6 Binary files /dev/null and b/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet differ diff 
--git a/tests/test_data/idmapping/base/counts.ensembl_ids.csv b/tests/test_data/idmapping/base/counts.ensembl_ids.csv index 0a9dbca4..a093ec4b 100644 --- a/tests/test_data/idmapping/base/counts.ensembl_ids.csv +++ b/tests/test_data/idmapping/base/counts.ensembl_ids.csv @@ -1,4 +1,4 @@ -ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 +gene_id,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 ENSRNA049434199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ENSRNA049434246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ENSRNA049434252,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/tests/test_data/idmapping/custom/mapping.csv b/tests/test_data/idmapping/custom/mapping.csv index 9cb9aee4..cd43e30f 100644 --- a/tests/test_data/idmapping/custom/mapping.csv +++ b/tests/test_data/idmapping/custom/mapping.csv @@ -1,4 +1,4 @@ -original_gene_id,ensembl_gene_id +original_gene_id,gene_id ENSRNA049434199,SNSRNA049434199 ENSRNA049434246,SNSRNA049434246 ENSRNA049434252,SNSRNA049434252 diff --git a/tests/test_data/idmapping/custom/metadata.csv b/tests/test_data/idmapping/custom/metadata.csv new file mode 100644 index 00000000..0c4095a9 --- /dev/null +++ b/tests/test_data/idmapping/custom/metadata.csv @@ -0,0 +1,4 @@ +gene_id,name,description +SNSRNA049434199,geneA,descriptionA +SNSRNA049434246,geneB,descriptionB +SNSRNA049434252,geneC,descriptionC diff --git a/tests/test_data/idmapping/gene_ids/gene_ids.txt b/tests/test_data/idmapping/gene_ids/gene_ids.txt new file mode 100644 index 00000000..94233419 --- /dev/null +++ b/tests/test_data/idmapping/gene_ids/gene_ids.txt @@ -0,0 +1,9 @@ +ENSRNA049434199 +ENSRNA049434246 +ENSRNA049434252 +840386 +833520 +832390 +Q8VWG3 +Q9FJA2 +Q8RYD9 diff --git a/tests/test_data/idmapping/mapped/mapped_gene_ids.csv b/tests/test_data/idmapping/mapped/mapped_gene_ids.csv new file mode 100644 index 00000000..84561688 --- /dev/null +++ b/tests/test_data/idmapping/mapped/mapped_gene_ids.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,ENSRNA049434199 +ENSRNA049434246,ENSRNA049434246 +ENSRNA049434252,ENSRNA049434252 diff --git a/tests/test_data/idmapping/mapped/no_valid_gene_id.txt b/tests/test_data/idmapping/mapped/no_valid_gene_id.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_data/idmapping/mapped/valid_gene_ids.txt b/tests/test_data/idmapping/mapped/valid_gene_ids.txt new file mode 100644 index 00000000..4fc4b319 --- /dev/null +++ b/tests/test_data/idmapping/mapped/valid_gene_ids.txt @@ -0,0 +1,2 @@ +ENSRNA049434199 +ENSRNA049434246 diff --git a/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv b/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv new file mode 100644 index 00000000..b1e1511d --- /dev/null +++ b/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv @@ -0,0 +1,4 @@ +gene_id ERR029909 ERR029910 ERR029911 ERR029912 ERR029913 ERR029914 ERR029915 ERR029916 ERR029917 ERR029918 ERR029920 ERR029921 ERR029922 ERR029923 ERR029924 +ENSRNA049434199 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ENSRNA049434246 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ENSRNA049434252 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/test_data/idmapping/tsv/mapping.tsv b/tests/test_data/idmapping/tsv/mapping.tsv new file mode 100644 index 00000000..c425f89f --- /dev/null +++ b/tests/test_data/idmapping/tsv/mapping.tsv @@ -0,0 +1,4 @@ +original_gene_id gene_id +ENSRNA049434199 SNSRNA049434199 +ENSRNA049434246
SNSRNA049434246 +ENSRNA049434252 SNSRNA049434252 diff --git a/tests/test_data/idmapping/tsv/metadata.tsv b/tests/test_data/idmapping/tsv/metadata.tsv new file mode 100644 index 00000000..11eae353 --- /dev/null +++ b/tests/test_data/idmapping/tsv/metadata.tsv @@ -0,0 +1,4 @@ +gene_id name description +SNSRNA049434199 geneA descriptionA +SNSRNA049434246 geneB descriptionB +SNSRNA049434252 geneC descriptionC diff --git a/tests/test_data/idmapping/tsv/valid_gene_ids.txt b/tests/test_data/idmapping/tsv/valid_gene_ids.txt new file mode 100644 index 00000000..4fc4b319 --- /dev/null +++ b/tests/test_data/idmapping/tsv/valid_gene_ids.txt @@ -0,0 +1,2 @@ +ENSRNA049434199 +ENSRNA049434246 diff --git a/tests/test_data/input_datasets/gene_lengths.csv b/tests/test_data/input_datasets/gene_lengths.csv new file mode 100644 index 00000000..03ffba68 --- /dev/null +++ b/tests/test_data/input_datasets/gene_lengths.csv @@ -0,0 +1,10 @@ +gene_id,length +ENSRNA049453121,100 +ENSRNA049453138,200 +ENSRNA049454388,300 +ENSRNA049454416,400 +ENSRNA049454647,500 +ENSRNA049454661,600 +ENSRNA049454747,700 +ENSRNA049454887,800 +ENSRNA049454931,900 diff --git a/tests/test_data/input_datasets/input.csv b/tests/test_data/input_datasets/input.csv new file mode 100644 index 00000000..73278d53 --- /dev/null +++ b/tests/test_data/input_datasets/input.csv @@ -0,0 +1,3 @@ +counts,design,platform,normalised +https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/microarray.normalised.csv,https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/microarray.normalised.design.csv,microarray,true +https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq.raw.csv,https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq.raw.design.csv,rnaseq,false diff --git a/tests/test_data/input_datasets/input_big.yaml b/tests/test_data/input_datasets/input_big.yaml new file mode 100644 index 00000000..f54577bb --- /dev/null +++ b/tests/test_data/input_datasets/input_big.yaml @@ -0,0 +1,4 @@ +- counts: https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/modules_testdata/SRP254919.salmon.merged.gene_counts.top1000cov.assay.tsv + design: https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq_big.design.csv + platform: rnaseq + normalised: false diff --git a/tests/test_data/input_datasets/mapping.csv b/tests/test_data/input_datasets/mapping.csv new file mode 100644 index 00000000..04489426 --- /dev/null +++ b/tests/test_data/input_datasets/mapping.csv @@ -0,0 +1,10 @@ +original_gene_id,gene_id +ENSRNA049453121,SNSRNA049434199 +ENSRNA049453138,SNSRNA049434246 +ENSRNA049454388,SNSRNA049434252 +ENSRNA049454416,SNSRNA049434253 +ENSRNA049454647,SNSRNA049434254 +ENSRNA049454661,SNSRNA049434255 +ENSRNA049454747,SNSRNA049434256 +ENSRNA049454887,SNSRNA049434257 +ENSRNA049454931,SNSRNA049434258 diff --git a/tests/test_data/input_datasets/metadata.csv b/tests/test_data/input_datasets/metadata.csv new file mode 100644 index 00000000..fcccf222 --- /dev/null +++ b/tests/test_data/input_datasets/metadata.csv @@ -0,0 +1,10 @@ +gene_id,name,description +ENSRNA049453121,geneA,descriptionA +ENSRNA049453138,geneB,descriptionB +ENSRNA049454388,geneC,descriptionC +ENSRNA049454416,geneD,descriptionD +ENSRNA049454647,geneE,descriptionE +ENSRNA049454661,geneF,descriptionF +ENSRNA049454747,geneG,descriptionG 
+ENSRNA049454887,geneH,descriptionH +ENSRNA049454931,geneI,descriptionI diff --git a/tests/test_data/input_datasets/microarray.normalised.csv b/tests/test_data/input_datasets/microarray.normalised.csv new file mode 100644 index 00000000..81f3f904 --- /dev/null +++ b/tests/test_data/input_datasets/microarray.normalised.csv @@ -0,0 +1,10 @@ +gene_id,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 +ENSRNA049453121,20925.1255070264,136184.261516502,144325.370645564,89427.0987612997,164143.182734208,34178.6378088171,28842.7323281157,76973.395782103,41906.9367255656,44756.5602263121,252562.049703724,6953.65643340122 +ENSRNA049453138,196173.051628372,16607.8367703051,344972.83715281,22602.4535330758,13678.598561184,104546.421532852,15451.4637472048,71664.8857281649,160643.257448002,91459.0578537683,88396.7173963033,281623.08555275 +ENSRNA049454388,91547.4240932405,11625.4857392136,84483.143792525,80582.6604222701,218857.576978944,58304.7350856292,42234.0009090266,88475.1675656357,87306.1181782617,17513.436610296,90922.3378933406,76490.2207674135 +ENSRNA049454416,20925.1255070264,106290.155329953,193607.204524536,47170.3378081581,392119.825420608,190998.270108096,90648.5873169351,81397.1541603848,83813.8734511313,165404.67909724,111127.301869638,194702.380135234 +ENSRNA049454647,99394.3461583754,91343.1022366783,3520.13099135521,71738.2220832404,118547.854196928,20105.0810640101,81377.7090686122,15040.7784861581,66352.6498154789,110918.431865208,55563.6509348192,111258.50293442 +ENSRNA049454661,175247.926121346,66431.3470812206,24640.9169394865,52083.9146631746,360203.095444512,36189.1459152181,70046.6356539953,85820.9125386666,13968.9789085219,50594.3724297441,25256.2049703724,52152.4232505092 +ENSRNA049454747,117703.830977024,154452.881963838,281610.479308417,29481.4611300988,191500.379856576,152798.616086476,53565.0743236435,14156.0268105017,293348.557078959,155674.99209152,63140.5124259309,243377.975169043 +ENSRNA049454887,2615.6406883783,164417.584026021,28161.0479308417,82548.0911642767,50154.861391008,136714.551235268,97859.270398964,64586.872322914,328271.004350264,159566.866893808,151537.229822234,86920.7054175153 +ENSRNA049454931,177863.566809724,81378.4001744952,235848.776420799,88444.3833902964,18238.131414912,120630.48638406,82407.8066517592,50430.8455124123,118736.320722436,68107.8090400402,232357.085727426,163410.926184929 diff --git a/tests/test_data/input_datasets/microarray.normalised.design.csv b/tests/test_data/input_datasets/microarray.normalised.design.csv new file mode 100644 index 00000000..d31e5cef --- /dev/null +++ b/tests/test_data/input_datasets/microarray.normalised.design.csv @@ -0,0 +1,13 @@ +sample,condition +GSM1528575,g1 +GSM1528576,g1 +GSM1528579,g1 +GSM1528583,g2 +GSM1528584,g2 +GSM1528585,g2 +GSM1528580,g3 +GSM1528586,g3 +GSM1528582,g3 +GSM1528578,g4 +GSM1528581,g4 +GSM1528577,g4 diff --git a/tests/test_data/input_datasets/rnaseq.raw.csv b/tests/test_data/input_datasets/rnaseq.raw.csv new file mode 100644 index 00000000..5688c066 --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq.raw.csv @@ -0,0 +1,10 @@ +gene_id,ESM1528575,ESM1528576,ESM1528579,ESM1528583,ESM1528584,ESM1528585,ESM1528580,ESM1528586,ESM1528582,ESM1528578,ESM1528581,ESM1528577 +ENSRNA049453121,1,82,8,82,4,68,88,73,46,57,25,22 +ENSRNA049453138,68,93,41,84,36,18,28,92,84,85,92,32 +ENSRNA049454388,38,10,0,23,11,17,95,57,25,82,10,70 +ENSRNA049454416,75,55,7,30,79,60,15,97,12,35,60,56 
+ENSRNA049454647,35,64,55,91,48,95,68,100,24,26,100,47 +ENSRNA049454661,8,99,80,48,86,29,80,17,19,9,44,2 +ENSRNA049454747,67,7,98,53,3,10,52,87,4,80,22,15 +ENSRNA049454887,8,40,24,90,42,52,79,81,94,23,35,81 +ENSRNA049454931,45,49,67,73,26,76,41,16,34,47,36,25 diff --git a/tests/test_data/input_datasets/rnaseq.raw.design.csv b/tests/test_data/input_datasets/rnaseq.raw.design.csv new file mode 100644 index 00000000..469751d2 --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq.raw.design.csv @@ -0,0 +1,13 @@ +sample,condition +ESM1528575,g1 +ESM1528576,g1 +ESM1528579,g1 +ESM1528583,g2 +ESM1528584,g2 +ESM1528585,g2 +ESM1528580,g3 +ESM1528586,g3 +ESM1528582,g3 +ESM1528578,g4 +ESM1528581,g4 +ESM1528577,g4 diff --git a/tests/test_data/input_datasets/rnaseq_big.design.csv b/tests/test_data/input_datasets/rnaseq_big.design.csv new file mode 100644 index 00000000..e8de12df --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq_big.design.csv @@ -0,0 +1,7 @@ +sample,condition +SRX8042381,control +SRX8042382,control +SRX8042383,control +SRX8042384,treatment +SRX8042385,treatment +SRX8042386,treatment diff --git a/tests/test_data/merge_data/output/all_counts.csv b/tests/test_data/merge_data/output/all_counts.csv index 0ba6456a..527a2205 100644 --- a/tests/test_data/merge_data/output/all_counts.csv +++ b/tests/test_data/merge_data/output/all_counts.csv @@ -1,4 +1,4 @@ -ensembl_gene_id,URR029909,URR029910,URR029911,URR029912,URR029913,URR029914,URR029915,URR029916,URR029917,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ARR029909,ARR029910,ARR029911,ARR029912,ARR029913,ARR029914,ARR029915,ARR029916,ARR029917 +gene_id,URR029909,URR029910,URR029911,URR029912,URR029913,URR029914,URR029915,URR029916,URR029917,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ARR029909,ARR029910,ARR029911,ARR029912,ARR029913,ARR029914,ARR029915,ARR029916,ARR029917 AT1G34790,0.60113057,0.64080682,0.6,0.6197164000000003,0.60115891,0.63052843,0.61002869,0.65849011,0.66239896,0.60113057,0.64080682,0.6348181099999999,0.6519716400000001,0.60115891,0.63052843,0.61002869,0.65849011,0.66239896,0.60113057,0.64080682,0.6348181099999999,0.6519716400000001,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 AT5G35550,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.0,0.8336608,0.00340416,0.23179154000000002,0.0,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 AT5G23260,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 diff --git a/tests/test_data/misc/accessions_to_include.txt b/tests/test_data/misc/accessions_to_include.txt new file mode 100644 index 00000000..7020d409 --- /dev/null +++ b/tests/test_data/misc/accessions_to_include.txt @@ -0,0 +1,2 @@ +E-MTAB-4252 +E-MTAB-4253 diff --git a/tests/test_data/misc/excluded_accessions.txt b/tests/test_data/misc/excluded_accessions.txt new file mode 100644 index 00000000..6c403a93 --- /dev/null +++ b/tests/test_data/misc/excluded_accessions.txt @@ -0,0 +1,2 @@ +E-MTAB-4251 +E-MTAB-4301 diff --git 
a/tests/test_data/normalise/base/counts.csv b/tests/test_data/normalisation/base/counts.csv similarity index 100% rename from tests/test_data/normalise/base/counts.csv rename to tests/test_data/normalisation/base/counts.csv diff --git a/tests/test_data/normalisation/base/counts.tsv b/tests/test_data/normalisation/base/counts.tsv new file mode 100644 index 00000000..17db2d66 --- /dev/null +++ b/tests/test_data/normalisation/base/counts.tsv @@ -0,0 +1,13 @@ + E_MTAB_5038_rnaseq_SRR1586392 E_MTAB_5038_rnaseq_SRR1586393 E_MTAB_5038_rnaseq_SRR1586394 E_MTAB_5038_rnaseq_SRR1586395 E_MTAB_5038_rnaseq_SRR1586396 E_MTAB_5038_rnaseq_SRR1586397 E_MTAB_5038_rnaseq_SRR1586400 E_MTAB_5038_rnaseq_SRR1586401 E_MTAB_5038_rnaseq_SRR1586402 +ENSRNA549434199 14 25 27 47 39 34 38 19 64 +ENSRNA549434200 91 37 78 84 6 51 18 2 57 +ENSRNA549434201 98 48 69 7 73 48 57 92 36 +ENSRNA549434202 52 15 41 19 8 100 85 83 97 +ENSRNA549434203 86 71 53 16 66 23 12 42 33 +ENSRNA549434204 62 2 25 89 74 32 45 56 26 +ENSRNA549434205 98 42 79 76 74 85 3 91 56 +ENSRNA549434206 42 49 4 88 82 34 27 83 98 +ENSRNA549434207 82 93 85 14 38 8 98 97 30 +ENSRNA549434208 72 36 4 60 25 7 14 76 47 +ENSRNA549434209 65 12 99 82 72 52 24 79 31 +ENSRNA549434210 0 0 0 0 0 0 0 0 0 diff --git a/tests/test_data/normalise/base/design.csv b/tests/test_data/normalisation/base/design.csv similarity index 100% rename from tests/test_data/normalise/base/design.csv rename to tests/test_data/normalisation/base/design.csv diff --git a/tests/test_data/normalisation/base/design.tsv b/tests/test_data/normalisation/base/design.tsv new file mode 100644 index 00000000..fca7e731 --- /dev/null +++ b/tests/test_data/normalisation/base/design.tsv @@ -0,0 +1,10 @@ +batch condition sample +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586392 +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586393 +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586394 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586395 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586396 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586397 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586400 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586401 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586402 diff --git a/tests/test_data/normalisation/base/gene_lengths.csv b/tests/test_data/normalisation/base/gene_lengths.csv new file mode 100644 index 00000000..67b05cee --- /dev/null +++ b/tests/test_data/normalisation/base/gene_lengths.csv @@ -0,0 +1,13 @@ +gene_id,length +ENSRNA549434199,100 +ENSRNA549434200,200 +ENSRNA549434201,300 +ENSRNA549434202,400 +ENSRNA549434203,500 +ENSRNA549434204,600 +ENSRNA549434205,700 +ENSRNA549434206,800 +ENSRNA549434207,900 +ENSRNA549434208,1000 +ENSRNA549434209,1100 +ENSRNA549434210,1200 diff --git a/tests/test_data/normalise/many_zeros/counts.csv b/tests/test_data/normalisation/many_zeros/counts.csv similarity index 100% rename from tests/test_data/normalise/many_zeros/counts.csv rename to tests/test_data/normalisation/many_zeros/counts.csv diff --git a/tests/test_data/normalise/many_zeros/design.csv b/tests/test_data/normalisation/many_zeros/design.csv similarity index 100% rename from tests/test_data/normalise/many_zeros/design.csv rename to tests/test_data/normalisation/many_zeros/design.csv diff --git a/tests/test_data/normalisation/many_zeros/gene_lengths.csv b/tests/test_data/normalisation/many_zeros/gene_lengths.csv new file mode 100644 index 00000000..923e2d65 --- /dev/null +++ b/tests/test_data/normalisation/many_zeros/gene_lengths.csv @@ -0,0 +1,6 @@ +gene_id,length 
+AT1G80990,100 +AT2G01008,200 +AT2G01010,300 +AT2G01020,400 +AT2G01021,500 diff --git a/tests/test_data/normalise/one_group/counts.csv b/tests/test_data/normalisation/one_group/counts.csv similarity index 100% rename from tests/test_data/normalise/one_group/counts.csv rename to tests/test_data/normalisation/one_group/counts.csv diff --git a/tests/test_data/normalise/one_group/design.csv b/tests/test_data/normalisation/one_group/design.csv similarity index 100% rename from tests/test_data/normalise/one_group/design.csv rename to tests/test_data/normalisation/one_group/design.csv diff --git a/tests/test_data/normalisation/one_group/gene_lengths.csv b/tests/test_data/normalisation/one_group/gene_lengths.csv new file mode 100644 index 00000000..73eb9655 --- /dev/null +++ b/tests/test_data/normalisation/one_group/gene_lengths.csv @@ -0,0 +1,6 @@ +gene_id,length +ENSG00000000003,100 +ENSG00000000005,200 +ENSG00000000419,300 +ENSG00000000457,400 +ENSG00000000460,500 diff --git a/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet b/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet new file mode 100644 index 00000000..23f39583 Binary files /dev/null and b/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet differ diff --git a/tests/test_data/normfinder/small_normalised/design.csv b/tests/test_data/normfinder/small_normalised/design.csv new file mode 100644 index 00000000..6a212658 --- /dev/null +++ b/tests/test_data/normfinder/small_normalised/design.csv @@ -0,0 +1,12 @@ +batch,condition,sample +E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883576 +E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883577 +E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883578 +E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883579 +E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883580 +E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883581 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948460 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948461 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948462 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948463 +E_MTAB_4789_rnaseq,g9,E_MTAB_4789_rnaseq_SRR948464 diff --git a/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet b/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet new file mode 100644 index 00000000..ba20a3e5 Binary files /dev/null and b/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet differ diff --git a/tests/test_data/normfinder/very_small_cq/design.csv b/tests/test_data/normfinder/very_small_cq/design.csv new file mode 100644 index 00000000..221601a5 --- /dev/null +++ b/tests/test_data/normfinder/very_small_cq/design.csv @@ -0,0 +1,7 @@ +sample,condition,batch +S1,control,A +S2,treated,A +S3,control,A +S4,treated,A +S5,control,A +S6,treated,A diff --git a/tests/test_data/normfinder/very_small_cq/normfinder.R b/tests/test_data/normfinder/very_small_cq/normfinder.R new file mode 100644 index 00000000..f415f95f --- /dev/null +++ b/tests/test_data/normfinder/very_small_cq/normfinder.R @@ -0,0 +1,298 @@ +library(optparse) +library(dplyr) +library(tidyr) + + +get_args <- function() { + option_list <- list( + make_option("--data", type = "character") + ) + + args <- parse_args(OptionParser( + option_list = option_list, + description = "Normfinder" + )) + return(args) +} + + +normfinder<-function(data, group = TRUE, ctVal=FALSE, pStabLim=0.3, sample = "sample", gene = "gene", groups = "group", cq = "cq"){ + + # Group & sample ID + sample_group <- 
unique(data[,c(sample, groups)]) + + tmp <- data.frame(sample = as.character(data[, sample]), + gene = as.character(data[, gene]), + cq = as.numeric(data[, cq])) + tmp <- tmp %>% + dplyr::group_by(sample, gene) %>% + dplyr::summarise(cq=mean(cq, na.rm=T)) %>% + tidyr::spread(sample, cq) + + ntotal<-length(sample_group[,1]) + + if (group == TRUE){ + ngenes <- length(tmp$gene) # number of genes + genenames <- as.character(tmp$gene) + grId <- factor(sample_group[,2]) + } else { + ngenes <- length(tmp$gene) # number of genes + genenames <- as.character(tmp$gene) + grId <- rep(1,ntotal) + } + + tmp <- data.matrix(tmp[,sample_group[,1]]) + + if (!ctVal){tmp<-log2(tmp)} + + + groupnames <- levels(grId) + ngr <- length(levels(grId)) + + # Number of samples in each group: + nsamples <- rep(0,ngr) + for (group in 1:ngr){nsamples[group] <- sum(grId==groupnames[group])} + + + + MakeStab <- function(da){ + + ngenes <- dim(da)[1] + # Sample averages + sampleavg <- apply(da,2,mean) + # Gene averages within group + genegroupavg <- matrix(0,ngenes,ngr) + + for (group in 1:ngr){ + genegroupavg[,group] <- apply(da[,grId==groupnames[group]],1,mean)} + + # Group averages + groupavg=rep(0,ngr) + for (group in 1:ngr){groupavg[group] <- mean(da[,grId==groupnames[group]])} + + # Variances + GGvar=matrix(0,ngenes,ngr) + for (group in 1:ngr){ + grset <- (grId==groupnames[group]) + a=rep(0,ngenes) + for (gene in 1:ngenes){ + a[gene] <- sum((da[gene,grset]-genegroupavg[gene,group]- + sampleavg[grset]+groupavg[group])^2)/(nsamples[group]-1) + } + GGvar[,group] <- (a-sum(a)/(ngenes*ngenes-ngenes))/(1-2/ngenes) + } + + print("GGvar") + print(GGvar) + + # + # Change possible negative values + genegroupMinvar <- matrix(0, ngenes, ngr) + for (group in 1:ngr){ + grset <- (grId == groupnames[group]) + z <- da[,grset] + for (gene in 1:ngenes){ + varpair <- rep(0,ngenes) + for (gene1 in 1:ngenes){varpair[gene1] <- var(z[gene,] - z[gene1,])} + genegroupMinvar[gene,group] <- min(varpair[-gene])/4 + } + } + # + # Final variances + GGvar <- ifelse(GGvar < 0, genegroupMinvar, GGvar) + print("GGvar") + print(GGvar) + # + # Old stability measure for each gene is calculated: + # + dif <- genegroupavg + difgeneavg <- apply(dif, 1, mean) + difgroupavg <- apply(dif, 2, mean) + difavg <- mean(dif) + for (gene in 1:ngenes){ + for (group in 1:ngr){ + dif[gene,group] <- dif[gene, group] - difgeneavg[gene] - difgroupavg[group] + difavg + } + } + # + nsampMatrix <- matrix(rep(nsamples,ngenes),ngenes,ngr,byrow=T) + vardif <- GGvar/nsampMatrix + gamma <- sum(dif * dif) / ((ngr-1) * (ngenes-1)) -sum (vardif) / (ngenes*ngr) + gamma <- ifelse(gamma<0,0,gamma) + # + difnew <- dif * gamma / (gamma+vardif) + varnew <- vardif + gamma * vardif / (gamma+vardif) + Ostab0 <- abs(difnew) + sqrt(varnew) + Ostab <- apply(Ostab0, 1, mean) + + # + # Measure of group differences: + mud <- rep(0,ngenes) + for (gene in 1:ngenes){ + mud[gene] <- 2*max(abs(dif[gene,])) + } + # Common variance: + genevar <- rep(0,ngenes) + for (gene in 1:ngenes){ + genevar[gene] <- sum((nsamples-1) * GGvar[gene,]) / (sum(nsamples)-ngr) + } + Gsd <- sqrt(genevar) + # + # Return results: + # + return(cbind(mud, Gsd, Ostab, rep(gamma,ngenes), GGvar,dif)) + } # End of function MakeStab + # + # + MakeComb2 <- function(g1, g2, res){ + gam <- res[1,4] + d1 <- res[g1,(4 + ngr + 1):(4 + ngr + ngr)]; d2 <- res[g2, (4 + ngr + 1):(4+ngr+ngr)] + s1 <- res[g1, (4+1):(4+ngr)]; s2 <- res[g2, (4+1):(4+ngr)] + rho <- abs(gam * d1 / (gam + s1 / nsamples) + gam * d2 / (gam + s2 / nsamples)) * sqrt(ngenes 
/ (ngenes-2)) / 2 + rho <- rho + sqrt(s1 / nsamples + gam * s1 / (nsamples*gam+s1) + s2 / nsamples + gam * s2 / (nsamples*gam+s2))/2 + return(mean(rho)) + } + # + # + MakeStabOne <- function(da){ + ngenes <- dim(da)[1] + # Sample averages + sampleavg <- apply(da, 2, mean) + # Gene averages + geneavg <- apply(da, 1, mean) + totalavg <- mean(da) + # + # Variances + genevar0 <- rep(0, ngenes) + for (gene in 1:ngenes){ + genevar0[gene] <- sum((da[gene,] - geneavg[gene] - sampleavg + totalavg)^2) / ((ntotal-1) * (1-2/ngenes)) + } + genevar <- genevar0 - sum(genevar0) / (ngenes*ngenes-ngenes) + # + # Change possible negative values + geneMinvar <- rep(0,ngenes) + z <- da + for (gene in 1:ngenes){ + varpair <- rep(0, ngenes) + for (gene1 in 1:ngenes){varpair[gene1] <- var(z[gene,] - z[gene1,])} + geneMinvar[gene] <- min(varpair[-gene]) / 4 + } + # Final variances + genevar <- ifelse(genevar<0, geneMinvar, genevar) + # + return(genevar) + } + # End of function MakeStabOne + + #### Main function #### + if (ngr>1){ # More than one group. + # + res <- MakeStab(tmp) + # + gcand <- c(1:ngenes)[res[,3] < pStabLim] + ncand <- length(gcand) + if (ncand<4){ + if (ngenes>3){ + li <- sort(res[,3])[4] + gcand <- c(1:ngenes)[res[,3]<=li] + ncand <- length(gcand) + } else { + gcand <- c(1:ngenes) + ncand <- length(gcand) + } + } + # + vv2 <- c() + # + for (g1 in 1:(ncand-1)){ + for (g2 in (g1+1):ncand){ + qmeas <- MakeComb2(gcand[g1], gcand[g2], res) + vv2 <- rbind(vv2, c(gcand[g1], gcand[g2], qmeas)) + }} + # + ord <- order(res[,3]) + FinalRes <- list(Ordered = data.frame("GroupDif" = round(res[ord,1],3), + "GroupSD" = round(res[ord,2],3), + "Stability" = round(res[ord,3],3), + row.names = genenames[ord]), + UnOrdered = data.frame("GroupDif" = round(res[,1],3), + "GroupSD" = round(res[,2],3), + "Stability" = round(res[,3],3), + "IGroupSD" = round(sqrt(res[,(4+1):(4+ngr)]),3), + "IGroupDif" = round(res[,(4+ngr+1):(4+ngr+ngr)],3), + row.names = genenames), + PairOfGenes = data.frame("Gene1" = genenames[vv2[,1]], + "Gene2" = genenames[vv2[,2]], + "Stability" = round(vv2[,3],3))) + # + return(FinalRes) + # + } else { # End of more than one group: next is for one group only.
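+ # One-group case: stability reduces to the gene-wise standard deviation + # estimated by MakeStabOne. Genes are ranked by this SD, and pairs of + # candidate genes are additionally scored by applying MakeStabOne to the + # averaged expression profile of each pair.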
+ # + # + sigma <- sqrt(MakeStabOne(tmp)) + # + siglim <- (min(sigma)+0.1) + gcand <- c(1:ngenes)[sigma<siglim] + ncand <- length(gcand) + if ((ncand>=2) & (ngenes>3)){ + # + vv2 <- c() + # + for (g1 in 1:(ncand-1)){ + for (g2 in (g1+1):ncand){ + dat1 <- rbind(tmp[-c(gcand[g1], gcand[g2]),], + apply(tmp[c(gcand[g1], gcand[g2]),], 2, mean)) + qmeas <- sqrt(MakeStabOne(dat1)) + vv2 <- rbind(vv2, c(gcand[g1], gcand[g2], qmeas[ngenes-1])) + }} + ord <- order(sigma) + FinalRes <- list(Ordered = data.frame("GroupSD" = round(sigma[ord],3), + row.names = genenames[ord]), + PairOfGenes = data.frame("Gene1" = genenames[vv2[,1]], + "Gene2" = genenames[vv2[,2]], + "GroupSD" = round(vv2[,3],3))) + } else { # No combined genes to consider + ord <- order(sigma) + FinalRes <- list(Ordered = data.frame("GroupSD" = round(sigma[ord],3), + row.names = genenames[ord])) + } # End ncand<2 or ngenes<=3 + # + return(FinalRes) + # + } # End one group only + +} ##### + +# Read the counts file +counts <- read.csv("all_counts.normfinder.csv") + +# Build design (conditions per sample) +design <- data.frame( + sample = c("S1","S2","S3","S4","S5","S6"), + group = c("control","treated","control","treated","control","treated") +) + +# Convert counts wide → long +library(tidyr) +library(dplyr) + +data <- counts %>% + tidyr::pivot_longer( + cols = -gene_id, + names_to = "sample", + values_to = "cq" + ) %>% + dplyr::rename(gene = gene_id) %>% + dplyr::left_join(design, by = "sample") + +# Inspect +#print(data) + +data <- as.data.frame(data) + + +res <- normfinder(data, ctVal = TRUE) +print("res") +print(res) diff --git a/tests/test_data/public_accessions/exclude_one_geo_accession.txt b/tests/test_data/public_accessions/exclude_one_geo_accession.txt new file mode 100644 index 00000000..c6978b9b --- /dev/null +++ b/tests/test_data/public_accessions/exclude_one_geo_accession.txt @@ -0,0 +1 @@ +GSE55951 diff --git a/tests/test_data/public_accessions/exclude_two_geo_accessions.txt b/tests/test_data/public_accessions/exclude_two_geo_accessions.txt new file mode 100644 index 00000000..0ef19a43 --- /dev/null +++ b/tests/test_data/public_accessions/exclude_two_geo_accessions.txt @@ -0,0 +1,2 @@ +GSE79526 +GSE55951 diff --git a/tests/test_data/quantile_normalise/count.raw.cpm.csv b/tests/test_data/quantile_normalisation/count.raw.cpm.csv similarity index 100% rename from tests/test_data/quantile_normalise/count.raw.cpm.csv rename to tests/test_data/quantile_normalisation/count.raw.cpm.csv diff --git a/tests/workflows/nextflow.config b/tests/workflows/nextflow.config deleted file mode 100644 index 20ab42b2..00000000 --- a/tests/workflows/nextflow.config +++ /dev/null @@ -1,3 +0,0 @@ -params { - outdir = "results" -} diff --git a/tests/workflows/stableexpression.nf.test b/tests/workflows/stableexpression.nf.test deleted file mode 100644 index 63bf3b26..00000000 --- a/tests/workflows/stableexpression.nf.test +++ /dev/null @@ -1,174 +0,0 @@ -nextflow_workflow { - - name "Test Workflow STABLEEXPRESSION" - script "workflows/stableexpression.nf" - workflow "STABLEEXPRESSION" - config "./nextflow.config" - tag "workflow" - - test("Two Expression Atlas accessions provided") { - - tag "workflow_eatlas_accessions" - - when { - params { - species = "solanum tuberosum" - eatlas_accessions = "E-MTAB-552,E-GEOD-61690" - } - workflow { - """ - input[0] = Channel.empty() - """ - } - } - - then { - assert workflow.success - with(workflow.out.multiqc_report[0]) { - assertAll( - { assert path(get(0)).readLines().any { it.contains('MultiQC: A modular tool') } }, - { assert path(get(0)).readLines().any {
it.contains('Data was processed using nf-core/stableexpression') } } - ) - } - } - } - - test("Expression Atlas accession - two output datasets") { - - tag "workflow_eatlas_accession_two_datasets" - - when { - params { - species = "homo sapiens" - eatlas_accessions = "E-GEOD-1615" - } - workflow { - """ - input[0] = Channel.empty() - """ - } - } - - then { - assert workflow.success - with(workflow.out.multiqc_report[0]) { - assertAll( - { assert path(get(0)).readLines().any { it.contains('MultiQC: A modular tool') } }, - { assert path(get(0)).readLines().any { it.contains('Data was processed using nf-core/stableexpression') } } - ) - } - } - } - - test("Two Expression Atlas no keyword (whole species)") { - - tag "workflow_eatlas_no_kw" - - when { - params { - species = "solanum tuberosum" - fetch_eatlas_accessions = true - } - workflow { - """ - input[0] = Channel.empty() - """ - } - } - - then { - assert workflow.success - with(workflow.out.multiqc_report[0]) { - assertAll( - { assert path(get(0)).readLines().any { it.contains('MultiQC: A modular tool') } }, - { assert path(get(0)).readLines().any { it.contains('Data was processed using nf-core/stableexpression') } } - ) - } - } - } - - test("Two Expression Atlas keywords provided") { - - tag "workflow_eatlas_kw" - - when { - params { - species = "solanum tuberosum" - eatlas_keywords = "potato,stress" - } - workflow { - """ - input[0] = Channel.empty() - """ - } - } - - then { - assert workflow.success - with(workflow.out.multiqc_report[0]) { - assertAll( - { assert path(get(0)).readLines().any { it.contains('MultiQC: A modular tool') } }, - { assert path(get(0)).readLines().any { it.contains('Data was processed using nf-core/stableexpression') } } - ) - } - } - } - - test("Two Expression Atlas accessions provided - normalisation with EdgeR") { - - tag "workflow_eatlas_accessions_edger" - - when { - params { - species = "solanum tuberosum" - eatlas_accessions = "E-MTAB-552,E-GEOD-61690" - normalisation_method = "edger" - } - workflow { - """ - input[0] = Channel.empty() - """ - } - } - - then { - assert workflow.success - with(workflow.out.multiqc_report[0]) { - assertAll( - { assert path(get(0)).readLines().any { it.contains('MultiQC: A modular tool') } }, - { assert path(get(0)).readLines().any { it.contains('Data was processed using nf-core/stableexpression') } } - ) - } - } - } - - test("Test workflow - common accession (E-MTAB-552) between manual and auto") { - - tag "workflow_accession_E-MTAB-552" - - when { - params { - species = "solanum tuberosum" - eatlas_accessions = "E-MTAB-552,E-GEOD-61690" - eatlas_keywords = "phloem" - } - workflow { - """ - input[0] = Channel.empty() - """ - } - } - - then { - assert workflow.success - with(workflow.out.multiqc_report[0]) { - assertAll( - { assert path(get(0)).readLines().any { it.contains('MultiQC: A modular tool') } }, - { assert path(get(0)).readLines().any { it.contains('Data was processed using nf-core/stableexpression') } } - ) - } - } - - } - -} diff --git a/tests/workflows/stableexpression.nf.test.snap b/tests/workflows/stableexpression.nf.test.snap deleted file mode 100644 index 20a4fb3d..00000000 --- a/tests/workflows/stableexpression.nf.test.snap +++ /dev/null @@ -1,113 +0,0 @@ -{ - "Expression Atlas accession - two output datasets": { - "content": [ - [ - "stats_most_stable_genes.csv:md5,24892b65e3569872371f79c3afcb863d" - ], - [ - "stats_all_genes.csv:md5,7867fdf5168199d20dc16c47006ef8ce" - ], - [ - "count_summary.csv:md5,3ce9f092b0863842fc08db3dd0bc947e" - ] - ], - "meta": { 
- "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-09T10:46:57.423487611" - }, - "Two Expression Atlas keywords provided": { - "content": [ - [ - "stats_most_stable_genes.csv:md5,a06dceee03c9d413d6c8ec22329f5262" - ], - [ - "stats_all_genes.csv:md5,ef718e06c30989d1e8341cbd2f5b8b44" - ], - [ - "count_summary.csv:md5,42fd89bef2f7a1e6f43ae91bc443e584" - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-09T10:55:01.778957099" - }, - "Two Expression Atlas no keyword (whole species)": { - "content": [ - [ - "stats_most_stable_genes.csv:md5,efb81b25c691f8758da1c1f44f315016" - ], - [ - "stats_all_genes.csv:md5,0ec1c7005ad2c0ef43d4b43787293cfc" - ], - [ - "count_summary.csv:md5,7bffef35d4d295abd25391ff42def670" - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-09T10:50:56.83921757" - }, - "Test workflow - common accession (E-MTAB-552) between manual and auto": { - "content": [ - [ - "stats_most_stable_genes.csv:md5,b311ed6e0abab127f180ece1737c4835" - ], - [ - "stats_all_genes.csv:md5,a1ee2aa140ec3f2968b662e46e65a7ae" - ], - [ - "count_summary.csv:md5,ff830820097999847752e1e553628c18" - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-09T15:21:40.286621572" - }, - "Two Expression Atlas accessions provided - normalisation with EdgeR": { - "content": [ - [ - "stats_most_stable_genes.csv:md5,9af4528cb8ea2695e0aff66c1ea464a7" - ], - [ - "stats_all_genes.csv:md5,7339175b11aeaeac1a9552a294f9f7b3" - ], - [ - "count_summary.csv:md5,06ba6597605876cc7152c67db0545075" - ] - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-01-09T10:56:26.563087788" - }, - "Two Expression Atlas accessions provided": { - "content": [ - { - "0": [ - [ - "multiqc_report.html:md5,eb870e24fa1d767bdddc89be9add54b6" - ] - ], - "multiqc_report": [ - [ - "multiqc_report.html:md5,eb870e24fa1d767bdddc89be9add54b6" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "25.04.0" - }, - "timestamp": "2025-05-11T22:54:00.699548935" - } -} \ No newline at end of file diff --git a/workflows/stableexpression.nf b/workflows/stableexpression.nf index 9e2162c7..8e7985cc 100644 --- a/workflows/stableexpression.nf +++ b/workflows/stableexpression.nf @@ -4,20 +4,21 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { EXPRESSIONATLAS_FETCHDATA } from '../subworkflows/local/expressionatlas_fetchdata/main' -include { EXPRESSION_NORMALISATION } from '../subworkflows/local/expression_normalisation/main.nf' - -include { GPROFILER_IDMAPPING } from '../modules/local/gprofiler/idmapping/main' -include { MERGE_DATA } from '../modules/local/merge_data/main' -include { GENE_STATISTICS } from '../modules/local/gene_statistics/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' - -include { parseInputDatasets } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' -include { customSoftwareVersionsToYAML } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' -include { validateInputParameters } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { paramsSummaryMap } from 'plugin/nf-schema' +include { GET_PUBLIC_ACCESSIONS } from 
'../subworkflows/local/get_public_accessions' +include { DOWNLOAD_PUBLIC_DATASETS } from '../subworkflows/local/download_public_datasets' +include { ID_MAPPING } from '../subworkflows/local/idmapping' +include { FILTER_DATASETS } from '../subworkflows/local/filter_datasets' +include { EXPRESSION_NORMALISATION } from '../subworkflows/local/expression_normalisation' +include { MERGE_DATA } from '../subworkflows/local/merge_data' +include { BASE_STATISTICS } from '../subworkflows/local/base_statistics' +include { STABILITY_SCORING } from '../subworkflows/local/stability_scoring' +include { MULTIQC_WORKFLOW } from '../subworkflows/local/multiqc' + +include { COMPUTE_DATASET_STATISTICS } from '../modules/local/compute_dataset_statistics' +include { AGGREGATE_RESULTS } from '../modules/local/aggregate_results' +include { DASH_APP } from '../modules/local/dash_app' + +include { checkCounts } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -32,170 +33,196 @@ workflow STABLEEXPRESSION { main: - ch_multiqc_files = Channel.empty() - ch_species = Channel.value( params.species.split(' ').join('_') ) - - // - // SUBWORKFLOW: fetching Expression Atlas datasets if needed - // - - EXPRESSIONATLAS_FETCHDATA( - ch_species, - params.eatlas_accessions, - params.eatlas_keywords, - params.fetch_eatlas_accessions - ) - // putting all datasets together (local datasets + Expression Atlas datasets) - ch_datasets = ch_input_datasets.concat( EXPRESSIONATLAS_FETCHDATA.out.downloaded_datasets ) + ch_accessions = channel.empty() + ch_downloaded_datasets = channel.empty() + + ch_versions = channel.empty() + ch_multiqc_files = channel.empty() + + ch_most_stable_genes_summary = channel.empty() + ch_all_genes_statistics = channel.empty() + ch_most_stable_genes_transposed_counts = channel.empty() + + def species = params.species.split(' ').join('_').toLowerCase() + + // ----------------------------------------------------------------- + // FETCH PUBLIC ACCESSIONS + // ----------------------------------------------------------------- + + GET_PUBLIC_ACCESSIONS( + species, + params.skip_fetch_eatlas_accessions, + params.fetch_geo_accessions, + params.platform, + params.keywords, + channel.fromList( params.accessions.tokenize(',') ), + params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty(), + channel.fromList( params.excluded_accessions.tokenize(',') ), + params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty(), + params.random_sampling_size, + params.random_sampling_seed, + params.outdir + ) + ch_accessions = GET_PUBLIC_ACCESSIONS.out.accessions + + // ----------------------------------------------------------------- + // DOWNLOAD PUBLIC DATASETS IF NEEDED + // ----------------------------------------------------------------- + + if ( !params.accessions_only ) { - // - // MODULE: ID Mapping - // + DOWNLOAD_PUBLIC_DATASETS ( + species, + ch_accessions + ) + ch_downloaded_datasets = DOWNLOAD_PUBLIC_DATASETS.out.datasets - ch_gene_metadata = Channel.empty() - if ( params.gene_metadata ) { - ch_gene_metadata = Channel.fromPath( params.gene_metadata, checkIfExists: true ) } - if ( params.skip_gprofiler ) { + } + + if ( !params.accessions_only && !params.download_only ) { + + ch_counts = ch_input_datasets.mix( ch_downloaded_datasets ) + // returns an error with a message if no dataset was found + checkCounts( ch_counts ) - ch_gene_id_mapping = Channel.empty() - if ( params.gene_id_mapping ) { - // the gene id mappings will only be those provided by the user - ch_gene_id_mapping = Channel.fromPath( params.gene_id_mapping, checkIfExists: true ) - } + // ----------------------------------------------------------------- + // ID MAPPING + // ----------------------------------------------------------------- - } else { // tries to map gene IDs to Ensembl IDs whenever possible - GPROFILER_IDMAPPING( - ch_datasets.combine( ch_species ), - params.gene_id_mapping ? Channel.fromPath( params.gene_id_mapping, checkIfExists: true ) : 'none' + ID_MAPPING( + ch_counts, + species, + params.skip_id_mapping, + params.gprofiler_target_db, + params.gene_id_mapping, + params.gene_metadata, + params.min_occurrence_freq, + params.min_occurrence_quantile, + params.outdir + ) + ch_counts = ID_MAPPING.out.counts + ch_gene_id_mapping = ID_MAPPING.out.mapping + ch_gene_metadata = ID_MAPPING.out.metadata + + // ----------------------------------------------------------------- + // FILTER OUT INVALID SAMPLES + // ----------------------------------------------------------------- + + FILTER_DATASETS ( ch_counts ) + + // ----------------------------------------------------------------- + // NORMALISATION OF RAW COUNT DATASETS (INCLUDING RNA-SEQ DATASETS) + // ----------------------------------------------------------------- + + EXPRESSION_NORMALISATION( + species, + FILTER_DATASETS.out.counts, + params.normalisation_method, + params.quantile_norm_target_distrib, + params.gene_length ) - ch_datasets = GPROFILER_IDMAPPING.out.renamed - ch_gene_metadata = ch_gene_metadata.mix( GPROFILER_IDMAPPING.out.metadata ) - // the gene id mappings are the sum - // of those provided by the user and those fetched from g:Profiler - ch_gene_id_mapping = GPROFILER_IDMAPPING.out.mapping - } - // - // SURBWORKFLOW: normalisation of raw count datasets (including RNA-seq datasets) - // + // ----------------------------------------------------------------- + // COMPUTE VARIOUS STATISTICS AT THE SAMPLE LEVEL + // ----------------------------------------------------------------- - EXPRESSION_NORMALISATION( - ch_datasets, - params.normalisation_method - ) - ch_normalised_counts = EXPRESSION_NORMALISATION.out.normalised_counts - ch_dataset_statistics = EXPRESSION_NORMALISATION.out.dataset_statistics - - // - // MODULE: Merge count files and design files and filter out zero counts - // - - ch_normalised_counts - .map { meta, file -> [file] } - .collect() - .set { ch_count_files } - -
ch_normalised_counts - .map { meta, file -> [meta.design] } - .collect() - .set { ch_design_files } - - ch_dataset_statistics - .map { meta, file -> [file] } - .collect() - .set { ch_dataset_stat_files } - - MERGE_DATA( - ch_count_files, - ch_design_files, - ch_dataset_stat_files, - params.nb_top_gene_candidates - ) + COMPUTE_DATASET_STATISTICS ( ch_counts ) - ch_candidate_gene_counts = MERGE_DATA.out.candidate_gene_counts - ch_ks_stats = MERGE_DATA.out.ks_test_statistics - - // - // MODULE: Gene statistics - // - GENE_STATISTICS( - MERGE_DATA.out.all_counts, - ch_gene_metadata.collect(), - ch_gene_id_mapping.collect(), - params.nb_top_gene_candidates, - ch_ks_stats, - params.ks_pvalue_threshold - ) + // ----------------------------------------------------------------- + // MERGE DATA + // ----------------------------------------------------------------- + + MERGE_DATA ( + EXPRESSION_NORMALISATION.out.counts, + ch_gene_id_mapping, + ch_gene_metadata, + params.outdir + ) - ch_multiqc_files = ch_multiqc_files - .mix( GENE_STATISTICS.out.top_stable_genes_summary.collect() ) - .mix( GENE_STATISTICS.out.all_statistics.collect() ) - .mix( GENE_STATISTICS.out.top_stable_genes_transposed_counts.collect() ) - .mix( MERGE_DATA.out.gene_count_statistics.collect() ) - .mix( MERGE_DATA.out.skewness_statistics.collect() ) - .mix( ch_ks_stats.collect() ) - .mix ( MERGE_DATA.out.distribution_correlations.collect() ) - - - // - // Collate and save software versions - // TODO: use the nf-core functions when they are adapted to channel topics - // - - ch_collated_versions = customSoftwareVersionsToYAML( Channel.topic('versions') ) - .collectFile( - storeDir: "${params.outdir}/pipeline_info", - name: 'nf_core_' + 'stableexpression_software_' + 'mqc_' + 'versions.yml', - sort: true, - newLine: true + ch_all_counts = MERGE_DATA.out.all_counts + ch_whole_design = MERGE_DATA.out.whole_design + + // ----------------------------------------------------------------- + // COMPUTE BASE STATISTICS FOR ALL GENES + // ----------------------------------------------------------------- + + BASE_STATISTICS ( + ch_all_counts, + MERGE_DATA.out.platform_counts ) - // - // MODULE: MultiQC - // - ch_multiqc_config = Channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
- file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true + ch_all_datasets_stats = BASE_STATISTICS.out.stats + + // ----------------------------------------------------------------- + // SELECT REFERENCE GENE CANDIDATES AND COMPUTE VARIOUS STABILITY VALUES + // ----------------------------------------------------------------- + + STABILITY_SCORING ( + ch_all_counts.map{ meta, file -> file }, + ch_whole_design, + ch_all_datasets_stats, + params.candidate_selection_descriptor, + params.nb_top_gene_candidates, + params.min_expr_threshold, + params.run_genorm, + params.stability_score_weights ) - ) - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList(), - [], - [] + ch_stats_all_genes_with_scores = STABILITY_SCORING.out.summary_statistics + + // ----------------------------------------------------------------- + // AGGREGATE ALL RESULTS FOR MULTIQC + // ----------------------------------------------------------------- + + AGGREGATE_RESULTS ( + ch_all_counts.map{ meta, file -> file }.collect(), + ch_stats_all_genes_with_scores.collect(), + BASE_STATISTICS.out.platform_stats.collect(), + MERGE_DATA.out.whole_gene_metadata.collect(), + MERGE_DATA.out.whole_gene_id_mapping.collect() + ) + + ch_all_genes_summary = AGGREGATE_RESULTS.out.all_genes_summary + ch_most_stable_genes_summary = AGGREGATE_RESULTS.out.most_stable_genes_summary + ch_most_stable_genes_transposed_counts = AGGREGATE_RESULTS.out.most_stable_genes_transposed_counts_filtered + + // ----------------------------------------------------------------- + // DASH APPLICATION + // ----------------------------------------------------------------- + + DASH_APP( + ch_all_counts.map{ meta, file -> file }.collect(), + ch_whole_design.collect(), + ch_all_genes_summary.collect() + ) + ch_versions = ch_versions.mix( DASH_APP.out.versions ) + + ch_multiqc_files = ch_multiqc_files + .mix( ch_most_stable_genes_summary.collect() ) + .mix( ch_all_genes_summary.collect() ) + .mix( ch_most_stable_genes_transposed_counts.collect() ) + + } + + // ----------------------------------------------------------------- + // MULTIQC + // ----------------------------------------------------------------- + + MULTIQC_WORKFLOW( + ch_multiqc_files, + ch_versions, + params.multiqc_config, + params.multiqc_logo, + params.multiqc_methods_description, + params.outdir ) + emit: - multiqc_report = MULTIQC_WORKFLOW.out.report.toList() + multiqc_report = MULTIQC_WORKFLOW.out.report.toList() + most_stable_genes_summary = ch_most_stable_genes_summary }
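With the reworked workflow above, a typical invocation might look as follows. This is a sketch based only on parameters referenced in this diff (species and accession values are borrowed from the removed tests; the docker profile is an assumption):

    nextflow run nf-core/stableexpression \
        -profile docker \
        --species 'solanum tuberosum' \
        --accessions E-MTAB-552,E-GEOD-61690 \
        --keywords potato,stress \
        --normalisation_method edger \
        --outdir results

Passing --accessions_only stops the run after accession discovery, while --download_only stops it once the datasets have been downloaded, matching the two guards in the workflow body.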