diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a30dc58..acf05fe 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -2,7 +2,7 @@ name: Build and Test
on:
push:
- branches: [main, master, develop]
+ branches: [main, master]
pull_request:
branches: [main, master]
@@ -14,7 +14,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
- python-version: ["3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
steps:
- uses: actions/checkout@v4
@@ -50,13 +50,13 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: [ubuntu-latest, macos-13, macos-14] # macos-13=x86, macos-14=arm64
+ os: [ubuntu-latest, macos-15-intel, macos-14] # macos-15-intel=x86, macos-14=arm64
steps:
- uses: actions/checkout@v4
- name: Build wheels
- uses: pypa/cibuildwheel@v2.21.3
+ uses: pypa/cibuildwheel@v3.3.1
# Config is read from pyproject.toml [tool.cibuildwheel]
- uses: actions/upload-artifact@v4
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 8ad5979..1170dae 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: [ubuntu-latest, macos-13, macos-14]
+ os: [ubuntu-latest, macos-15-intel, macos-14]
steps:
- uses: actions/checkout@v4
@@ -48,7 +48,9 @@ jobs:
name: Publish to PyPI
needs: [build-wheels, build-sdist]
runs-on: ubuntu-latest
- environment: pypi
+ environment:
+ name: pypi
+ url: https://pypi.org/project/smudgeplot
permissions:
id-token: write # Required for trusted publishing
diff --git a/Makefile b/Makefile
index bb4da2b..4834060 100644
--- a/Makefile
+++ b/Makefile
@@ -8,11 +8,14 @@ endif
HET_KMERS_INST = $(INSTALL_PREFIX)/bin/hetmers $(INSTALL_PREFIX)/bin/extract_kmer_pairs
.PHONY : default
-default: exec/hetmers exec/extract_kmer_pairs
+default: exec/ exec/hetmers exec/extract_kmer_pairs
.PHONY : install
install : $(HET_KMERS_INST)
+exec/:
+ mkdir -p exec
+
$(INSTALL_PREFIX)/bin/% : exec/%
install -C $< $(INSTALL_PREFIX)/bin
diff --git a/README.md b/README.md
index 10bd1d0..c82d04a 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,12 @@
# Smudgeplot
-**_Version: 0.5.1 Skylight_**
+**_Version: 0.5.3 Skylight_**
**_Authors: Sam Ebdon, [Gene W Myers](https://github.com/thegenemyers) and [Kamil S. Jaron](https://github.com/KamilSJaron), Tianyi Ma._**
-We keep the same pythonic interface; the interface of this and previous versions are very similar and largely compatible.
-
-Current state: RUNNING; beta-testing;
-
## Installation
-This version of smudgeplot operates on FastK k-mer databases. The smudgeplot installation consists of a python package and C-backend to search for all the k-mer pairs (hetmers) and extract sequences of k-mer pairs (extract_kmer_pairs).
+This version of smudgeplot operates on FastK k-mer databases. The smudgeplot installation consists of a Python package and a C backend that searches for all the k-mer pairs (`smudgeplot hetmers`) and extracts the sequences of k-mer pairs (`smudgeplot extract`).
We recommend installing smudgeplot within a [conda](https://conda-forge.org/download/) environment.
@@ -19,7 +15,10 @@ We recommend installing smudgeplot within a [conda](https://conda-forge.org/down
conda create -n smudgeplot && conda activate smudgeplot
conda install pip
-#download and install
+# install via pypi
+pip install smudgeplot
+
+# or download and install directly. See below if you need to compile the C dependencies.
git clone https://github.com/KamilSJaron/smudgeplot.git
cd smudgeplot && pip install .
smudgeplot -h # check installation succeeded
@@ -31,7 +30,7 @@ Note the smudgeplot version downloadable from conda itself is not currently up t
### Compiling the C code
-The process above install everything including compilation of the C backend. If you would like to know how to compile the code yourself you can simply run
+The process above should install everything, including compiling the C backend. If you need to compile the code yourself, or would just like to know how, you can simply run
```
make
@@ -39,6 +38,14 @@ make
This will not, however, install the smudgeplot python package.
+### PyPI installation [EXPERIMENTAL]
+
+We are working on packaging smudgeplot for PyPI. You are welcome to try installing from PyPI, and please open an issue if you run into problems. If the PyPI installation fails, please install from the cloned repository (`git clone` followed by `pip install .`) as described in the main instructions above for now.
+
+```
+pip install smudgeplot
+```
+
## Example run on Saccharomyces data
Requires ~2.1GB of space and `FastK` and `smudgeplot` installed.
diff --git a/playground/alternative_fitting/pair_clustering.py b/playground/alternative_fitting/pair_clustering.py
index 1cd3eb3..059b40c 100644
--- a/playground/alternative_fitting/pair_clustering.py
+++ b/playground/alternative_fitting/pair_clustering.py
@@ -13,7 +13,7 @@
####
parser = argparse.ArgumentParser()
-parser.add_argument('infile', nargs='?', help='name of the input tsv file with covarages and frequencies.')
+parser.add_argument('infile', nargs='?', help='name of the input tsv file with coverages and frequencies.')
parser.add_argument('-nf', '-noise_filter', help='Do not agregate into smudge k-mer pairs with frequency lower than this parameter', type=int, default=50)
parser.add_argument('-d', '-distance', help='Manthattan distance of k-mer pairs that are considered neioboring for the local agregation purposes.', type=int, default=5)
parser.add_argument('--mask_errors', help='instead of reporting assignments to individual smudges, just remove all monotonically decreasing points from the error line', action="store_true", default = False)
diff --git a/pyproject.toml b/pyproject.toml
index 7220e30..766db9a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,13 @@
[build-system]
-requires = ["setuptools>=61.0", "wheel"]
+requires = [
+ "setuptools>=61.0",
+ "wheel",
+]
build-backend = "setuptools.build_meta"
[project]
name = "smudgeplot"
-version = "0.5.1"
+version = "0.5.3"
description = "Inference of ploidy and heterozygosity structure using whole genome sequencing data"
readme = "README.md"
license = "Apache-2.0"
@@ -38,6 +41,8 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3.14",
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS",
"Intended Audience :: Science/Research",
@@ -66,7 +71,8 @@ smudgeplot = ["bin/*"]
[tool.cibuildwheel]
# Skip 32-bit, PyPy, musllinux, and Windows (no Windows support for now)
skip = ["*-win32", "*-win_amd64", "*-manylinux_i686", "pp*", "*-musllinux*"]
-build = ["cp39-*", "cp310-*", "cp311-*", "cp312-*"]
+build = ["cp39-*", "cp310-*", "cp311-*", "cp312-*", "cp313-*", "cp314-*"]
+environment = { MACOSX_DEPLOYMENT_TARGET = "11.0" }
# Test that the CLI works after building
test-command = "smudgeplot --version"
diff --git a/setup.py b/setup.py
index add6518..de1df16 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
from setuptools import setup
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop
-
+from setuptools.dist import Distribution
class CompilationError(Exception):
"""Raised when C binary compilation fails."""
@@ -132,10 +132,14 @@ def run(self):
print("Continuing with editable install, but binaries will not work.", file=sys.stderr)
super().run()
+class BinaryDistribution(Distribution):  # Distribution subclass handed to setup() through distclass
+ def has_ext_modules(self):  # NOTE(review): presumably forces a platform-specific wheel tag for the bundled C binaries — confirm
+ return True
setup(
cmdclass={
"build_py": BuildPyWithBinaries,
"develop": DevelopWithBinaries,
- }
+ },
+ distclass=BinaryDistribution
)
diff --git a/src/lib/PloidyList.c b/src/lib/PloidyList.c
index 6f1d9e7..55410fa 100644
--- a/src/lib/PloidyList.c
+++ b/src/lib/PloidyList.c
@@ -1222,7 +1222,7 @@ int main(int argc, char *argv[])
int flags[128];
char *eptr;
- ARG_INIT("PloidyList");
+ ARG_INIT("extract_kmer_pairs");
OUT = NULL;
ETHRESH = 4;
diff --git a/src/lib/PloidyPlot.c b/src/lib/PloidyPlot.c
index 298ddd3..34953cd 100644
--- a/src/lib/PloidyPlot.c
+++ b/src/lib/PloidyPlot.c
@@ -1247,7 +1247,7 @@ int main(int argc, char *argv[])
int flags[128];
char *eptr;
- ARG_INIT("PloidyPlot");
+ ARG_INIT("hetmers");
OUT = NULL;
ETHRESH = 4;
diff --git a/src/smudgeplot/cli.py b/src/smudgeplot/cli.py
index 086e851..30f0cec 100755
--- a/src/smudgeplot/cli.py
+++ b/src/smudgeplot/cli.py
@@ -2,10 +2,13 @@
import argparse
import os
+import shlex
import shutil
+import subprocess
import sys
from importlib.metadata import version
from pathlib import Path
+from typing import Any
import numpy as np
@@ -41,30 +44,32 @@ def get_binary_path(name: str) -> str:
if system_binary:
return system_binary
- raise FileNotFoundError(
- f"Binary '{name}' not found. Please ensure smudgeplot is properly installed. "
+ msg = (
+ f"Binary '{name}' not found. Please ensure smudgeplot is properly installed.\n"
f"Checked locations:\n"
- f" - Package: {bundled_binary}\n"
- f" - System PATH: (not found)\n"
+ f" - Package: {bundled_binary.parent}\n"
+ f" - System PATH: {os.get_exec_path()}\n"
f"\nYou may need to reinstall smudgeplot or install the binaries manually."
)
+ raise FileNotFoundError(msg)
-def run_binary(name: str, args: str) -> int:
+def run_binary(name: str, args: list[Any]) -> None:
"""
Run a binary with the given arguments.
Args:
name: Name of the binary
- args: Space-separated argument string
+ args: List of (stringify-able) arguments
- Returns:
- Return code from the binary
+ Raises:
+ subprocess.CalledProcessError on non-zero exit of the command
"""
- binary_path = get_binary_path(name)
- cmd = f"{binary_path} {args}"
- sys.stderr.write(f"Calling: {name} {args}\n")
- return os.system(cmd)
+ cmd_line = [get_binary_path(name)]
+ for x in args:
+ cmd_line.append(str(x))
+ sys.stderr.write(f"Calling: {shlex.join(cmd_line)}\n")
+ subprocess.run(cmd_line, check=True)
class Parser:
@@ -78,7 +83,7 @@ def __init__(self):
tasks: cutoff Calculate meaningful values for lower kmer histogram cutoff.
hetmers Calculate unique kmer pairs from a FastK k-mer database.
peak_aggregation Agregates smudges using local aggregation algorithm; prints assignments to stdout.
- plot Generate 2d histogram; infere ploidy and plot a smudgeplot.
+ plot Generate 2d histogram; infer ploidy and plot a smudgeplot.
all Runs all the steps (with default options)
extract Extract kmer pair sequences from a FastK k-mer database.
"""
@@ -94,6 +99,7 @@ def __init__(self):
default=False,
help="Print the version and exit.",
)
+
# print version is a special case
if len(sys.argv) > 1:
if sys.argv[1] in ["-v", "--version"]:
@@ -144,6 +150,7 @@ def hetmers(self):
"-L",
help="Count threshold below which k-mers are considered erroneous.",
type=int,
+ required=True,
)
argparser.add_argument("-t", help="Number of threads (default 4).", type=int, default=4)
argparser.add_argument(
@@ -157,6 +164,13 @@ def hetmers(self):
default=".",
)
argparser.add_argument("--verbose", action="store_true", default=False, help="Verbose mode.")
+ argparser.add_argument(
+ "--json_report",
+ action="store_true",
+ default=False,
+ help="Write a JSON format report recording the selected parameters (default False)",
+ )
+
self.arguments = argparser.parse_args(sys.argv[2:])
def peak_aggregation(self):
@@ -168,7 +182,7 @@ def peak_aggregation(self):
description="Aggregates smudges using local aggregation algorithm.")
argparser.add_argument(
"infile",
- help="Name of the input smu file with covarages and frequencies.",
+ help="Name of the input smu file with coverages and frequencies.",
)
argparser.add_argument(
"-nf",
@@ -247,7 +261,7 @@ def all(self):
description="Runs all the steps (with default options).")
argparser.add_argument(
"infile",
- help="Name of the input tsv file with covarages and frequencies.",
+ help="Name of the input tsv file with coverages and frequencies.",
)
argparser.add_argument(
"-o",
@@ -301,7 +315,7 @@ def add_plotting_arguments(self, argparser):
"--format",
default="png",
help="Output format for the plots (default png)",
- choices=["pdf", "png"],
+ choices=["pdf", "png", "svg"],
)
argparser.add_argument(
"--json_report",
@@ -332,47 +346,48 @@ def main():
fin()
if _parser.task == "hetmers":
- # PloidyPlot is expected to be installed in the system as well as the R library supporting it
- plot_args = " -o" + str(args.o)
- plot_args += " -e" + str(args.L)
- plot_args += " -T" + str(args.t)
+
+ hetmer_args = [
+ f"-o{args.o}",
+ f"-e{args.L}",
+ f"-T{args.t}",
+ ]
if args.verbose:
- plot_args += " -v"
+ hetmer_args.append("-v")
if args.tmp != ".":
- plot_args += " -P" + args.tmp
- plot_args += " " + args.infile
+ hetmer_args.append(f"-P{args.tmp}")
+ hetmer_args.append(args.infile)
+
+ run_binary("hetmers", hetmer_args)
- run_binary("hetmers", plot_args)
+ if args.json_report:
+ smg.save_hetmers_json_report(args.o, input_params=vars(args))
fin()
if _parser.task == "extract":
- plot_args = " -o" + str(args.o)
- plot_args += " -T" + str(args.t)
+ extract_args = [
+ f"-o{args.o}",
+ f"-T{args.t}",
+ ]
if args.verbose:
- plot_args += " -v"
+ extract_args.append("-v")
if args.tmp != ".":
- plot_args += " -P" + args.tmp
- plot_args += " " + args.infile
- if args.sma.endswith(".sma"):
- plot_args += " " + args.sma.removesuffix(".sma")
- else:
- plot_args += " " + args.sma
+ extract_args.append(f"-P{args.tmp}")
+ extract_args.append(args.infile)
+ extract_args.append(args.sma.removesuffix(".sma"))
- run_binary("extract_kmer_pairs", plot_args)
+ run_binary("extract_kmer_pairs", extract_args)
fin()
- if args.title:
- title=args.title
- else:
- title = ".".join(args.infile.split("/")[-1].split(".")[0:2])
+ title = args.title or Path(args.infile).stem
if _parser.task == "plot":
smudge_tab = smg.read_csv(args.smudgefile, sep="\t", names=["structure", "size", "rel_size"])
cov_tab = smg.load_hetmers(args.infile)
smudgeplot_data = smg.SmudgeplotData(cov_tab, smudge_tab, args.n)
- smg.prepare_smudgeplot_data_for_plotting(smudgeplot_data, args.o, title, upper_ylim=args.ylim)
+ smg.prepare_smudgeplot_data_for_plotting(smudgeplot_data, args.o, title, upper_ylim=args.ylim, fmt=args.format)
smg.smudgeplot(smudgeplot_data, log=False, palette=args.col_ramp, invert_cols=args.invert_cols)
smg.smudgeplot(smudgeplot_data, log=True, palette=args.col_ramp, invert_cols=args.invert_cols)
@@ -396,12 +411,11 @@ def main():
coverages.local_aggregation(distance=args.d, noise_filter=1000, mask_errors=True)
coverages.count_kmers()
sys.stderr.write(
- f"\t\
- Total kmers: {coverages.total_kmers}\n\t \
- Genomic kmers: {coverages.total_genomic_kmers}\n\t \
- Genomic kmers in smudges: {coverages.total_genomic_kmers_in_smudges}\n\t \
- Sequencing errors: {coverages.total_error_kmers}\n\t \
- Fraction of errors: {round(coverages.total_error_kmers/coverages.total_kmers, 3)}"
+ f"\nTotal kmers: {coverages.total_kmers}\n"
+ f"Genomic kmers: {coverages.total_genomic_kmers}\n"
+ f"Genomic kmers in smudges: {coverages.total_genomic_kmers_in_smudges}\n"
+ f"Sequencing errors: {coverages.total_error_kmers}\n"
+ f"Fraction of errors: {coverages.error_fraction:.3f}\n"
)
smudge_size_cutoff = (
@@ -420,11 +434,7 @@ def main():
delimiter="\t",
)
- limit = 0.7
- if coverages.error_fraction < limit:
- cov = smudges.cov
- else:
- cov = 0
+ cov = smudges.cov if coverages.error_fraction < 0.7 else 0
sys.stderr.write("\nCreating centrality plot\n")
smudges.centrality_plot(args.o, args.format)
@@ -459,10 +469,11 @@ def main():
json_report=args.json_report,
input_params=vars(args),
palette=args.col_ramp,
- invert_cols=args.invert_cols
+ invert_cols=args.invert_cols,
)
fin()
+
if __name__ == "__main__":
main()
diff --git a/src/smudgeplot/smudgeplot.py b/src/smudgeplot/smudgeplot.py
index 59a123d..f7aa7b5 100644
--- a/src/smudgeplot/smudgeplot.py
+++ b/src/smudgeplot/smudgeplot.py
@@ -1,10 +1,12 @@
#!/usr/bin/env python3
import json
+import shlex
import sys
from collections import defaultdict
from importlib.metadata import version
from math import ceil, log
+from pathlib import Path
from statistics import fmean
import matplotlib as mpl
@@ -12,7 +14,7 @@
import numpy as np
from matplotlib.collections import PatchCollection
from numpy import arange, argmin, concatenate
-from pandas import DataFrame, Series, concat, read_csv # type: ignore
+from pandas import DataFrame, Series, concat, read_csv
class Coverages:
@@ -376,11 +378,14 @@ def generate_plots(
if json_report:
write_json_report(smudgeplot_data, input_params)
+
def write_json_report(smg_data, input_params=None, min_size=0.03):
+ hetmers_report = read_hetmers_report_json(input_params["infile"])
report = {
"version": version("smudgeplot"),
- "commandline_arguments": sys.argv[1:],
+ "commandline_arguments": shlex.join(sys.argv[1:]),
"input_parameters": input_params,
+ "hetmers_input": hetmers_report,
"haploid_coverage": float(f"{smg_data.cov:.3f}"),
"error_fraction": smg_data.error_fraction,
"top_smudges": [
@@ -400,8 +405,37 @@ def write_json_report(smg_data, input_params=None, min_size=0.03):
for row in smg_data.smudge_tab.itertuples(index=False)
],
}
- with open(smg_data.json_report_file, "w") as fh:
- fh.write(json.dumps(report, indent=2) + "\n")
+ write_json_file(smg_data.json_report_file, report)
+
+
+def save_hetmers_json_report(outfile, input_params=None):
+    """Write <outfile>_report.json with the version, CLI arguments and parameters."""
+    write_json_file(f"{outfile}_report.json", {
+        "version": version("smudgeplot"),
+        "commandline_arguments": shlex.join(sys.argv[1:]),
+        "input_parameters": input_params,
+    })
+
+
+def write_json_file(filename: str, data):
+    Path(filename).write_text(f"{json.dumps(data, indent=2)}\n")  # 2-space indented JSON plus a trailing newline
+
+
+def read_hetmers_report_json(hetmers: str):
+    """
+    Return the parsed hetmers report JSON stored next to the hetmers file,
+    or None if the report is missing or older than the hetmers file (stale).
+    """
+    hetmers_file = Path(hetmers)
+    # Look beside the hetmers file, not in the current working directory.
+    report_file = hetmers_file.with_name(f"{hetmers_file.stem}_report.json")
+
+    if (
+        report_file.exists()
+        and report_file.stat().st_mtime >= hetmers_file.stat().st_mtime
+    ):
+        return json.loads(report_file.read_text())
+    return None
def prepare_smudgeplot_data_for_plotting(smudgeplot_data, output, title, fmt=None, upper_ylim=None):