diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a30dc58..acf05fe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,7 +2,7 @@ name: Build and Test on: push: - branches: [main, master, develop] + branches: [main, master] pull_request: branches: [main, master] @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 @@ -50,13 +50,13 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-13, macos-14] # macos-13=x86, macos-14=arm64 + os: [ubuntu-latest, macos-15-intel, macos-14] # macos-15-intel=x86, macos-14=arm64 steps: - uses: actions/checkout@v4 - name: Build wheels - uses: pypa/cibuildwheel@v2.21.3 + uses: pypa/cibuildwheel@v3.3.1 # Config is read from pyproject.toml [tool.cibuildwheel] - uses: actions/upload-artifact@v4 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8ad5979..1170dae 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-13, macos-14] + os: [ubuntu-latest, macos-15-intel, macos-14] steps: - uses: actions/checkout@v4 @@ -48,7 +48,9 @@ jobs: name: Publish to PyPI needs: [build-wheels, build-sdist] runs-on: ubuntu-latest - environment: pypi + environment: + name: pypi + url: https://pypi.org/project/smudgeplot permissions: id-token: write # Required for trusted publishing diff --git a/Makefile b/Makefile index bb4da2b..4834060 100644 --- a/Makefile +++ b/Makefile @@ -8,11 +8,14 @@ endif HET_KMERS_INST = $(INSTALL_PREFIX)/bin/hetmers $(INSTALL_PREFIX)/bin/extract_kmer_pairs .PHONY : default -default: exec/hetmers exec/extract_kmer_pairs +default: exec/ exec/hetmers exec/extract_kmer_pairs .PHONY : install install : $(HET_KMERS_INST) +exec/: + 
mkdir -p exec + $(INSTALL_PREFIX)/bin/% : exec/% install -C $< $(INSTALL_PREFIX)/bin diff --git a/README.md b/README.md index 10bd1d0..c82d04a 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,12 @@ # Smudgeplot -**_Version: 0.5.1 Skylight_** +**_Version: 0.5.3 Skylight_** **_Authors: Sam Ebdon, [Gene W Myers](https://github.com/thegenemyers) and [Kamil S. Jaron](https://github.com/KamilSJaron), Tianyi Ma._** -We keep the same pythonic interface; the interface of this and previous versions are very similar and largely compatible. - -Current state: RUNNING; beta-testing; - ## Installation -This version of smudgeplot operates on FastK k-mer databases. The smudgeplot installation consists of a python package and C-backend to search for all the k-mer pairs (hetmers) and extract sequences of k-mer pairs (extract_kmer_pairs). +This version of smudgeplot operates on FastK k-mer databases. The smudgeplot installation consists of a python package and C-backend to search for all the k-mer pairs (smudgeplot hetmers) and extract sequences of k-mer pairs (smudgeplot extract). We recommend installing smudgeplot within a [conda](https://conda-forge.org/download/) environment. @@ -19,7 +15,10 @@ We recommend installing smudgeplot within a [conda](https://conda-forge.org/down conda create -n smudgeplot && conda activate smudgeplot conda install pip -#download and install +# install via pypi +pip install smudgeplot + +# or download and install directly. See below if you need to compile the C dependencies. git clone https://github.com/KamilSJaron/smudgeplot.git cd smudgeplot && pip install . smudgeplot -h # check installation succeeded @@ -31,7 +30,7 @@ Note the smudgeplot version downloadable from conda itself is not currently up t ### Compiling the C code -The process above install everything including compilation of the C backend. 
If you would like to know how to compile the code yourself you can simply run +The process above should install everything, including compilation of the C backend. If you need to compile the code yourself, or would like to know how, you can simply run ``` make @@ -39,6 +38,14 @@ make This will not, however, install the smudgeplot python package. +### PyPI installation [EXPERIMENTAL] + +We are working on packaging smudgeplot for PyPI. You are welcome to try installing from PyPI; please open an issue if you run into problems. If the installation fails, please install from the git repository as described in the main instructions above for now. + +``` +pip install smudgeplot +``` + ## Example run on Saccharomyces data Requires ~2.1GB of space and `FastK` and `smudgeplot` installed. diff --git a/playground/alternative_fitting/pair_clustering.py b/playground/alternative_fitting/pair_clustering.py index 1cd3eb3..059b40c 100644 --- a/playground/alternative_fitting/pair_clustering.py +++ b/playground/alternative_fitting/pair_clustering.py @@ -13,7 +13,7 @@ #### parser = argparse.ArgumentParser() -parser.add_argument('infile', nargs='?', help='name of the input tsv file with covarages and frequencies.') +parser.add_argument('infile', nargs='?', help='name of the input tsv file with coverages and frequencies.') parser.add_argument('-nf', '-noise_filter', help='Do not agregate into smudge k-mer pairs with frequency lower than this parameter', type=int, default=50) parser.add_argument('-d', '-distance', help='Manthattan distance of k-mer pairs that are considered neioboring for the local agregation purposes.', type=int, default=5) parser.add_argument('--mask_errors', help='instead of reporting assignments to individual smudges, just remove all monotonically decreasing points from the error line', action="store_true", default = False) diff --git a/pyproject.toml b/pyproject.toml index 7220e30..766db9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,13 @@ [build-system] -requires = 
["setuptools>=61.0", "wheel"] +requires = [ + "setuptools>=61.0", + "wheel", +] build-backend = "setuptools.build_meta" [project] name = "smudgeplot" -version = "0.5.1" +version = "0.5.3" description = "Inference of ploidy and heterozygosity structure using whole genome sequencing data" readme = "README.md" license = "Apache-2.0" @@ -38,6 +41,8 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: POSIX :: Linux", "Operating System :: MacOS", "Intended Audience :: Science/Research", @@ -66,7 +71,8 @@ smudgeplot = ["bin/*"] [tool.cibuildwheel] # Skip 32-bit, PyPy, musllinux, and Windows (no Windows support for now) skip = ["*-win32", "*-win_amd64", "*-manylinux_i686", "pp*", "*-musllinux*"] -build = ["cp39-*", "cp310-*", "cp311-*", "cp312-*"] +build = ["cp39-*", "cp310-*", "cp311-*", "cp312-*", "cp313-*", "cp314-*"] +environment = { MACOSX_DEPLOYMENT_TARGET = "11.0" } # Test that the CLI works after building test-command = "smudgeplot --version" diff --git a/setup.py b/setup.py index add6518..de1df16 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ from setuptools import setup from setuptools.command.build_py import build_py from setuptools.command.develop import develop - +from setuptools.dist import Distribution class CompilationError(Exception): """Raised when C binary compilation fails.""" @@ -132,10 +132,14 @@ def run(self): print("Continuing with editable install, but binaries will not work.", file=sys.stderr) super().run() +class BinaryDistribution(Distribution): + def has_ext_modules(self): + return True setup( cmdclass={ "build_py": BuildPyWithBinaries, "develop": DevelopWithBinaries, - } + }, + distclass=BinaryDistribution ) diff --git a/src/lib/PloidyList.c b/src/lib/PloidyList.c index 6f1d9e7..55410fa 100644 --- a/src/lib/PloidyList.c +++ 
b/src/lib/PloidyList.c @@ -1222,7 +1222,7 @@ int main(int argc, char *argv[]) int flags[128]; char *eptr; - ARG_INIT("PloidyList"); + ARG_INIT("extract_kmer_pairs"); OUT = NULL; ETHRESH = 4; diff --git a/src/lib/PloidyPlot.c b/src/lib/PloidyPlot.c index 298ddd3..34953cd 100644 --- a/src/lib/PloidyPlot.c +++ b/src/lib/PloidyPlot.c @@ -1247,7 +1247,7 @@ int main(int argc, char *argv[]) int flags[128]; char *eptr; - ARG_INIT("PloidyPlot"); + ARG_INIT("hetmers"); OUT = NULL; ETHRESH = 4; diff --git a/src/smudgeplot/cli.py b/src/smudgeplot/cli.py index 086e851..30f0cec 100755 --- a/src/smudgeplot/cli.py +++ b/src/smudgeplot/cli.py @@ -2,10 +2,13 @@ import argparse import os +import shlex import shutil +import subprocess import sys from importlib.metadata import version from pathlib import Path +from typing import Any import numpy as np @@ -41,30 +44,32 @@ def get_binary_path(name: str) -> str: if system_binary: return system_binary - raise FileNotFoundError( - f"Binary '{name}' not found. Please ensure smudgeplot is properly installed. " + msg = ( + f"Binary '{name}' not found. Please ensure smudgeplot is properly installed.\n" f"Checked locations:\n" - f" - Package: {bundled_binary}\n" - f" - System PATH: (not found)\n" + f" - Package: {bundled_binary.parent}\n" + f" - System PATH: {os.get_exec_path()}\n" f"\nYou may need to reinstall smudgeplot or install the binaries manually." ) + raise FileNotFoundError(msg) -def run_binary(name: str, args: str) -> int: +def run_binary(name: str, args: list[Any]) -> None: """ Run a binary with the given arguments. 
Args: name: Name of the binary - args: Space-separated argument string + args: List of (stringify-able) arguments - Returns: - Return code from the binary + Throws: + subprocess.CalledProcessError on non-zero exit of the command """ - binary_path = get_binary_path(name) - cmd = f"{binary_path} {args}" - sys.stderr.write(f"Calling: {name} {args}\n") - return os.system(cmd) + cmd_line = [get_binary_path(name)] + for x in args: + cmd_line.append(str(x)) + sys.stderr.write(f"Calling: {shlex.join(cmd_line)}\n") + subprocess.run(cmd_line, check=True) class Parser: @@ -78,7 +83,7 @@ def __init__(self): tasks: cutoff Calculate meaningful values for lower kmer histogram cutoff. hetmers Calculate unique kmer pairs from a FastK k-mer database. peak_aggregation Agregates smudges using local aggregation algorithm; prints assignments to stdout. - plot Generate 2d histogram; infere ploidy and plot a smudgeplot. + plot Generate 2d histogram; infer ploidy and plot a smudgeplot. all Runs all the steps (with default options) extract Extract kmer pair sequences from a FastK k-mer database. 
""" @@ -94,6 +99,7 @@ def __init__(self): default=False, help="Print the version and exit.", ) + # print version is a special case if len(sys.argv) > 1: if sys.argv[1] in ["-v", "--version"]: @@ -144,6 +150,7 @@ def hetmers(self): "-L", help="Count threshold below which k-mers are considered erroneous.", type=int, + required=True, ) argparser.add_argument("-t", help="Number of threads (default 4).", type=int, default=4) argparser.add_argument( @@ -157,6 +164,13 @@ def hetmers(self): default=".", ) argparser.add_argument("--verbose", action="store_true", default=False, help="Verbose mode.") + argparser.add_argument( + "--json_report", + action="store_true", + default=False, + help="Write a JSON format report recording the selected parameters (default False)", + ) + self.arguments = argparser.parse_args(sys.argv[2:]) def peak_aggregation(self): @@ -168,7 +182,7 @@ def peak_aggregation(self): description="Aggregates smudges using local aggregation algorithm.") argparser.add_argument( "infile", - help="Name of the input smu file with covarages and frequencies.", + help="Name of the input smu file with coverages and frequencies.", ) argparser.add_argument( "-nf", @@ -247,7 +261,7 @@ def all(self): description="Runs all the steps (with default options).") argparser.add_argument( "infile", - help="Name of the input tsv file with covarages and frequencies.", + help="Name of the input tsv file with coverages and frequencies.", ) argparser.add_argument( "-o", @@ -301,7 +315,7 @@ def add_plotting_arguments(self, argparser): "--format", default="png", help="Output format for the plots (default png)", - choices=["pdf", "png"], + choices=["pdf", "png", "svg"], ) argparser.add_argument( "--json_report", @@ -332,47 +346,48 @@ def main(): fin() if _parser.task == "hetmers": - # PloidyPlot is expected to be installed in the system as well as the R library supporting it - plot_args = " -o" + str(args.o) - plot_args += " -e" + str(args.L) - plot_args += " -T" + str(args.t) + + 
hetmer_args = [ + f"-o{args.o}", + f"-e{args.L}", + f"-T{args.t}", + ] if args.verbose: - plot_args += " -v" + hetmer_args.append("-v") if args.tmp != ".": - plot_args += " -P" + args.tmp - plot_args += " " + args.infile + hetmer_args.append(f"-P{args.tmp}") + hetmer_args.append(args.infile) + + run_binary("hetmers", hetmer_args) - run_binary("hetmers", plot_args) + if args.json_report: + smg.save_hetmers_json_report(args.o, input_params=vars(args)) fin() if _parser.task == "extract": - plot_args = " -o" + str(args.o) - plot_args += " -T" + str(args.t) + extract_args = [ + f"-o{args.o}", + f"-T{args.t}", + ] if args.verbose: - plot_args += " -v" + extract_args.append("-v") if args.tmp != ".": - plot_args += " -P" + args.tmp - plot_args += " " + args.infile - if args.sma.endswith(".sma"): - plot_args += " " + args.sma.removesuffix(".sma") - else: - plot_args += " " + args.sma + extract_args.append(f"-P{args.tmp}") + extract_args.append(args.infile) + extract_args.append(args.sma.removesuffix(".sma")) - run_binary("extract_kmer_pairs", plot_args) + run_binary("extract_kmer_pairs", extract_args) fin() - if args.title: - title=args.title - else: - title = ".".join(args.infile.split("/")[-1].split(".")[0:2]) + title = args.title or Path(args.infile).stem if _parser.task == "plot": smudge_tab = smg.read_csv(args.smudgefile, sep="\t", names=["structure", "size", "rel_size"]) cov_tab = smg.load_hetmers(args.infile) smudgeplot_data = smg.SmudgeplotData(cov_tab, smudge_tab, args.n) - smg.prepare_smudgeplot_data_for_plotting(smudgeplot_data, args.o, title, upper_ylim=args.ylim) + smg.prepare_smudgeplot_data_for_plotting(smudgeplot_data, args.o, title, upper_ylim=args.ylim, fmt=args.format) smg.smudgeplot(smudgeplot_data, log=False, palette=args.col_ramp, invert_cols=args.invert_cols) smg.smudgeplot(smudgeplot_data, log=True, palette=args.col_ramp, invert_cols=args.invert_cols) @@ -396,12 +411,11 @@ def main(): coverages.local_aggregation(distance=args.d, noise_filter=1000, 
mask_errors=True) coverages.count_kmers() sys.stderr.write( - f"\t\ - Total kmers: {coverages.total_kmers}\n\t \ - Genomic kmers: {coverages.total_genomic_kmers}\n\t \ - Genomic kmers in smudges: {coverages.total_genomic_kmers_in_smudges}\n\t \ - Sequencing errors: {coverages.total_error_kmers}\n\t \ - Fraction of errors: {round(coverages.total_error_kmers/coverages.total_kmers, 3)}" + f"\nTotal kmers: {coverages.total_kmers}\n" + f"Genomic kmers: {coverages.total_genomic_kmers}\n" + f"Genomic kmers in smudges: {coverages.total_genomic_kmers_in_smudges}\n" + f"Sequencing errors: {coverages.total_error_kmers}\n" + f"Fraction of errors: {coverages.error_fraction:.3f}\n" ) smudge_size_cutoff = ( @@ -420,11 +434,7 @@ def main(): delimiter="\t", ) - limit = 0.7 - if coverages.error_fraction < limit: - cov = smudges.cov - else: - cov = 0 + cov = smudges.cov if coverages.error_fraction < 0.7 else 0 sys.stderr.write("\nCreating centrality plot\n") smudges.centrality_plot(args.o, args.format) @@ -459,10 +469,11 @@ def main(): json_report=args.json_report, input_params=vars(args), palette=args.col_ramp, - invert_cols=args.invert_cols + invert_cols=args.invert_cols, ) fin() + if __name__ == "__main__": main() diff --git a/src/smudgeplot/smudgeplot.py b/src/smudgeplot/smudgeplot.py index 59a123d..f7aa7b5 100644 --- a/src/smudgeplot/smudgeplot.py +++ b/src/smudgeplot/smudgeplot.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 import json +import shlex import sys from collections import defaultdict from importlib.metadata import version from math import ceil, log +from pathlib import Path from statistics import fmean import matplotlib as mpl @@ -12,7 +14,7 @@ import numpy as np from matplotlib.collections import PatchCollection from numpy import arange, argmin, concatenate -from pandas import DataFrame, Series, concat, read_csv # type: ignore +from pandas import DataFrame, Series, concat, read_csv class Coverages: @@ -376,11 +378,14 @@ def generate_plots( if json_report: 
write_json_report(smudgeplot_data, input_params) + def write_json_report(smg_data, input_params=None, min_size=0.03): + hetmers_report = read_hetmers_report_json(input_params["infile"]) if input_params else None report = { "version": version("smudgeplot"), - "commandline_arguments": sys.argv[1:], + "commandline_arguments": shlex.join(sys.argv[1:]), "input_parameters": input_params, + "hetmers_input": hetmers_report, "haploid_coverage": float(f"{smg_data.cov:.3f}"), "error_fraction": smg_data.error_fraction, "top_smudges": [ @@ -400,8 +405,37 @@ def write_json_report(smg_data, input_params=None, min_size=0.03): for row in smg_data.smudge_tab.itertuples(index=False) ], } - with open(smg_data.json_report_file, "w") as fh: - fh.write(json.dumps(report, indent=2) + "\n") + write_json_file(smg_data.json_report_file, report) + + +def save_hetmers_json_report(outfile, input_params=None): + report = { + "version": version("smudgeplot"), + "commandline_arguments": shlex.join(sys.argv[1:]), + "input_parameters": input_params, + } + write_json_file(f"{outfile}_report.json", report) + + +def write_json_file(filename: str, data): + Path(filename).write_text(json.dumps(data, indent=2) + "\n") + + +def read_hetmers_report_json(hetmers: str): + """ + Returns the parsed contents of the hetmers report JSON file, looked up in + the same directory as the hetmers file, if it exists and is at least as + recent as the hetmers file itself; returns None otherwise. + """ + hetmers_file = Path(hetmers) + report_file = hetmers_file.with_name(f"{hetmers_file.stem}_report.json") + + if ( + report_file.exists() + and report_file.stat().st_mtime >= hetmers_file.stat().st_mtime + ): + return json.loads(report_file.read_text()) + return None def prepare_smudgeplot_data_for_plotting(smudgeplot_data, output, title, fmt=None, upper_ylim=None):