64 changes: 35 additions & 29 deletions Dockerfile
@@ -1,44 +1,50 @@
FROM kbase/sdkpython:3.8.10
MAINTAINER KBase Developer
# -----------------------------------------
# System deps required to build QUAST bundled tools
RUN apt-get update --fix-missing && \
apt-get install -y --no-install-recommends \
build-essential perl wget ca-certificates zlib1g-dev && \
rm -rf /var/lib/apt/lists/*

# Insert apt-get instructions here to install
# any required dependencies for your module.

# libz is for Minimap2 in Quast
RUN apt update && apt install -y g++ nano tree wget libz-dev

RUN python -m pip install --upgrade pip \
&& pip install \
psutil==6.0.0 \
matplotlib==3.7.5 \
quast==5.2.0 \
biopython==1.83 \
&& rm -r /opt/conda3/lib/python3.8/site-packages/quast_libs/genemark \
&& rm -r /opt/conda3/lib/python3.8/site-packages/quast_libs/genemark-es

# Genemark is not open source for non-academic use, and so can't be used in KBase

# Precompile dependencies vs compiling at first use
# In particular, this allows the container to run @ NERSC
RUN mkdir /quasttemp \
&& cd /quasttemp \
&& wget quast.sf.net/test_data.tar.gz \
&& tar xzf test_data.tar.gz \
&& quast.py --test --debug \
&& cd - \
&& rm -r /quasttemp
# Optional: enable static/PDF plots
# RUN apt-get update && apt-get install -y --no-install-recommends \
# pkg-config libfreetype6-dev libpng-dev python3-matplotlib && \
# rm -rf /var/lib/apt/lists/*

# -----------------------------------------
# Python tooling (upgrade packaging to avoid canonicalize_version error)
RUN python3 -m pip install -U pip setuptools wheel packaging && python --version

# If you keep these utilities, install them here
RUN pip install coverage==5.5 in_place==1.0.1 pathos==0.3.4

# -----------------------------------------
# Install QUAST 5.3.0 from source (PyPI tops out at 5.2.0)
ENV QUAST_VER=5.3.0
WORKDIR /opt
RUN wget -qO quast-${QUAST_VER}.tar.gz \
"https://sourceforge.net/projects/quast/files/quast-${QUAST_VER}.tar.gz/download" \
&& tar -xzf quast-${QUAST_VER}.tar.gz \
&& cd quast-${QUAST_VER} \
&& python3 -m pip install . --no-cache-dir --root-user-action=ignore \
&& quast.py --version

RUN pip install biopython==1.81

# Python tooling (upgrade packaging, etc.)
RUN python3 -m pip install -U pip setuptools wheel packaging && python --version
# Add psutil (and keep your existing utilities)
RUN pip install psutil coverage==5.5 in_place==1.0.1 pathos==0.3.4

# -----------------------------------------
# Your module
COPY ./ /kb/module
RUN mkdir -p /kb/module/work
RUN chmod -R 777 /kb/module
RUN chmod -R a+rw /kb/module

WORKDIR /kb/module

RUN make all

ENTRYPOINT [ "./scripts/entrypoint.sh" ]

CMD [ ]
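As a quick check of the image this Dockerfile produces, here is a minimal smoke-test sketch (an editorial illustration, not part of the diff). It assumes the image has been built locally and tagged kb_quast:dev, and simply confirms that the QUAST on the container's PATH is the 5.3.0 build installed from source above.

import subprocess

def quast_version(image: str = "kb_quast:dev") -> str:
    # Override the entrypoint so only `quast.py --version` runs inside the container.
    result = subprocess.run(
        ["docker", "run", "--rm", "--entrypoint", "quast.py", image, "--version"],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

if __name__ == "__main__":
    version = quast_version()
    print(version)  # e.g. "QUAST v5.3.0"
    assert "5.3.0" in version, f"unexpected QUAST version: {version}"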
10 changes: 9 additions & 1 deletion RELEASE_NOTES.md
@@ -1,3 +1,11 @@
### 1.2.0

- Updated QUAST to 5.3.0
- Added support for large genomes
- Added ONP support
- Added AssemblySet inputs
- 1.1.0 update

### 1.1.0

- Updated base image to `kbase/sdkpython:3.8.10` to fix build
@@ -11,4 +19,4 @@

### 0.0.7
- Updated zip file size
- Added github action
- Added github action
2 changes: 1 addition & 1 deletion kbase.yml
@@ -11,7 +11,7 @@ service-language:
python

module-version:
1.1.0
1.2.0

owners:
[gaprice]
178 changes: 156 additions & 22 deletions lib/kb_quast/kb_quastImpl.py
@@ -5,6 +5,8 @@
import subprocess as _subprocess
import time as _time
import uuid as _uuid
import shlex # kept from original (not used directly but harmless)
from pathlib import Path # kept from original (not used directly but harmless)

import psutil
from Bio import SeqIO as _SeqIO
@@ -17,10 +19,10 @@
from installed_clients.baseclient import ServerError as _DFUError
from installed_clients.baseclient import ServerError as _RepError
from installed_clients.baseclient import ServerError as _WSError
#END_HEADER


class ObjInfo(object):

def __init__(self, obj_info):
self.id = obj_info[0]
self.name = obj_info[1]
@@ -34,7 +36,6 @@ def __init__(self, obj_info):
self.size = obj_info[9]
self.meta = obj_info[10]
self.ref = str(self.wsid) + '/' + str(self.id) + '/' + str(self.version)
#END_HEADER


class kb_quast:
@@ -95,7 +96,7 @@ def get_min_contig_length(self, params):
def get_assemblies(self, target_dir, object_infos):
filepaths = []
asscli = _AssClient(self.callback_url)
# would be nice the the assembly utils had bulk download...
# would be nice if the assembly utils had bulk download...
for i in object_infos:
fn = _os.path.join(target_dir, i.ref.replace('/', '_'))
filepaths.append(fn)
@@ -105,28 +106,77 @@ def get_assemblies(self, target_dir, object_infos):
except _AssError as asserr:
self.log('Logging assembly downloader exception')
self.log(str(asserr))
raise
raise asserr
return filepaths

def get_assembly_object_info(self, assemblies, token):
# Prefer the top-level ObjInfo; fall back to a local definition if missing
ObjInfoClass = globals().get('ObjInfo')
if ObjInfoClass is None:
class ObjInfoClass(object):
def __init__(self, obj_info):
self.id = obj_info[0]
self.name = obj_info[1]
t_full = obj_info[2]
self.type, self.type_ver = (t_full.split('-', 1) + [''])[:2] if '-' in t_full else (t_full, '')
self.time = obj_info[3]
self.version = obj_info[4]
self.saved_by = obj_info[5]
self.wsid = obj_info[6]
self.workspace = obj_info[7]
self.chsum = obj_info[8]
self.size = obj_info[9]
self.meta = obj_info[10]
self.ref = f"{self.wsid}/{self.id}/{self.version}"

refs = [{'ref': x} for x in assemblies]
ws = _WSClient(self.ws_url, token=token)
self.log('Getting object information from workspace')
# TODO: this is used often enough that it should be added to DFU, but returning a dict rather than a list

try:
info = [ObjInfo(i) for i in ws.get_object_info3({'objects': refs})['infos']]
infos = ws.get_object_info3({'objects': refs})['infos']
except _WSError as wse:
self.log('Logging workspace exception')
self.log(str(wse))
raise
raise wse

info = [ObjInfoClass(i) for i in infos]

self.log('Object list:')
for i in info: # don't check type - assemblyutils should handle that
self.log('{}/{} {} {}'.format(i.workspace, i.name, i.ref, i.type))
absrefs = [i.ref for i in info]
for o in info:
self.log(f'{o.workspace}/{o.name} {o.ref} {o.type}')

absrefs = [o.ref for o in info]
if len(set(absrefs)) != len(absrefs):
raise ValueError('Duplicate objects detected in input') # could list objs later
raise ValueError('Duplicate objects detected in input')
return info

def expand_assembly_sets(self, set_refs, token):
"""
Given a list of KBaseSets.AssemblySet refs, return a flat list of assembly refs.
Supports common schemas: {'items': [{'ref': ...}, ...]} or {'elements': [{'ref': ...}, ...]}.
"""
if not set_refs:
return []
ws = _WSClient(self.ws_url, token=token)
objs = ws.get_objects2({'objects': [{'ref': r} for r in set_refs]})['data']
out = []
for od in objs:
data = od.get('data', {}) or {}
items = data.get('items')
if isinstance(items, list):
for it in items:
if isinstance(it, dict) and it.get('ref'):
out.append(it['ref'])
continue
elements = data.get('elements')
if isinstance(elements, list):
for el in elements:
if isinstance(el, dict) and el.get('ref'):
out.append(el['ref'])
continue
return out
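For illustration, the two set payload shapes this helper accepts (as listed in its docstring) look roughly like the sketch below; the refs and labels are placeholders rather than real workspace objects.

# Shape with an 'items' list; each entry carries a 'ref' and typically a 'label'.
set_with_items = {
    "description": "example set",
    "items": [{"ref": "1/2/3", "label": "asm_a"}, {"ref": "1/4/1", "label": "asm_b"}],
}

# Shape with an 'elements' list of {'ref': ...} dicts.
set_with_elements = {
    "description": "example set",
    "elements": [{"ref": "1/2/3"}, {"ref": "1/4/1"}],
}

# expand_assembly_sets() flattens either shape to ["1/2/3", "1/4/1"].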

def run_quast_exec(self, outdir, filepaths, labels, min_contig_length, skip_glimmer=False):
threads = psutil.cpu_count() * self.THREADS_PER_CORE
# DO NOT use genemark instead of glimmer, not open source
@@ -163,6 +213,68 @@ def run_quast_exec(self, outdir, filepaths, labels, min_contig_length, skip_glim
self.log(err)
raise ValueError(err)


# ---------- NEW HELPERS (instance methods) ----------
def _ws_batch_get_info(self, ws_client, refs):

"""
Batched get_object_info3 to avoid repeated roundtrips.
Returns list of dicts with keys: ref, type
"""
if not refs:
return []
info = ws_client.get_object_info3({"objects": [{"ref": r} for r in refs]})["infos"]
out = []
for r, i in zip(refs, info):
# i[2] is the full type string, e.g. "KBaseSets.AssemblySet-2.0"
out.append({"ref": r, "type": i[2]})
return out

def _classify_input_refs(self, ws_client, raw_refs):
"""
Split mixed refs into assemblies vs assembly_sets based on WS type.
Returns (assemblies, assembly_sets).
"""
assemblies, assembly_sets = [], []
for item in self._ws_batch_get_info(ws_client, raw_refs):
t = item["type"] or ""
t_base = t.split("-", 1)[0] # strip version suffix
if t_base in ("KBaseGenomes.Assembly", "KBaseGenomeAnnotations.Assembly"):
assemblies.append(item["ref"])
elif t_base == "KBaseSets.AssemblySet":
assembly_sets.append(item["ref"])
else:
raise ValueError(
f"Unsupported input ref type: {t} for {item['ref']}. "
"Expected KBaseGenomes.Assembly or KBaseSets.AssemblySet."
)
return assemblies, assembly_sets

def _gather_inputs(self, params, ws_client):
"""
Accepts any combo of:
- params['assemblies'] (list of refs)
- params['assembly_sets'] (list of refs)
- params['input_refs'] (list of refs, mixed)
- params['files'] (list of paths/handles)
Classifies refs by WS type and returns normalized dict.
"""
assemblies_param = params.get("assemblies") or []
assembly_sets_param = params.get("assembly_sets") or []
mixed_param = params.get("input_refs") or []
files = params.get("files") or []

cls_assemblies, cls_sets = self._classify_input_refs(ws_client, mixed_param)
exp_assemblies, exp_sets = self._classify_input_refs(
ws_client, assemblies_param + assembly_sets_param
)

assemblies = list(dict.fromkeys(cls_assemblies + exp_assemblies))
assembly_sets = list(dict.fromkeys(cls_sets + exp_sets))

return {"assemblies": assemblies, "assembly_sets": assembly_sets, "files": files}
# ---------- END HELPERS ----------
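To show how these helpers interact, here is a hypothetical params dict mixing the supported keys (all refs are placeholders), together with the normalization _gather_inputs would produce, assuming the workspace reports 7/1/1 and 7/3/1 as Assembly objects and 7/2/1 and 7/4/1 as KBaseSets.AssemblySet objects.

params = {
    "input_refs": ["7/1/1", "7/2/1"],   # mixed refs, classified by workspace type
    "assemblies": ["7/3/1"],            # explicit Assembly refs
    "assembly_sets": ["7/4/1"],         # explicit AssemblySet refs
    "files": [],
}

# _gather_inputs(params, ws) would return:
# {
#     "assemblies": ["7/1/1", "7/3/1"],
#     "assembly_sets": ["7/2/1", "7/4/1"],
#     "files": [],
# }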

def check_large_input(self, filepaths):
skip_glimmer = False
basecount = 0
@@ -187,7 +299,6 @@ def __init__(self, config):
#END_CONSTRUCTOR
pass


def run_QUAST_app(self, ctx, params):
"""
Run QUAST and save a KBaseReport with the output.
@@ -237,7 +348,7 @@ def run_QUAST_app(self, ctx, params):
self.log('Logging exception from creating report object')
self.log(str(re))
# TODO delete shock node
raise
raise re
output = {'report_name': repout['name'],
'report_ref': repout['ref']
}
@@ -253,6 +364,11 @@
def run_QUAST(self, ctx, params):
"""
Run QUAST and return a shock node containing the zipped QUAST output.
Supports:
- files: list of {'path', 'label'}
- assemblies: list<ref> to Assembly/ContigSet
- assembly_sets: list<ref> to KBaseSets.AssemblySet
Exactly one of (files) or (assemblies/assembly_sets) must be provided.
:param params: instance of type "QUASTParams" (Input for running
QUAST. assemblies - the list of assemblies upon which QUAST will
be run. -OR- files - the list of FASTA files upon which QUAST will
@@ -296,18 +412,36 @@ def run_QUAST(self, ctx, params):
#BEGIN run_QUAST
self.log('Starting QUAST run. Parameters:')
self.log(str(params))
assemblies = params.get('assemblies')
files = params.get('files')

# Normalize inputs (classify mixed refs into assemblies vs assembly_sets)
ws = _WSClient(self.ws_url, token=ctx['token'])
normalized = self._gather_inputs(params, ws)
assemblies = normalized["assemblies"]
assembly_sets = normalized["assembly_sets"]
files = normalized["files"]

min_contig_length = self.get_min_contig_length(params) # fail early if param is bad
if not self.xor(assemblies, files):
raise ValueError(
'One and only one of a list of assembly references or files is required')

has_obj_inputs = bool(assemblies or assembly_sets)
if bool(files) == has_obj_inputs:
raise ValueError('One and only one of a list of assembly references or files is required')

tdir = _os.path.join(self.scratch, str(_uuid.uuid4()))
self.mkdir_p(tdir)
if assemblies:

if has_obj_inputs:
if type(assemblies) != list:
raise ValueError('assemblies must be a list')
info = self.get_assembly_object_info(assemblies, ctx['token'])
if type(assembly_sets) != list:
raise ValueError('assembly_sets must be a list')

# Expand sets to assembly refs and combine
set_expanded = self.expand_assembly_sets(assembly_sets, ctx['token']) if assembly_sets else []
all_ass_refs = list(assemblies) + set_expanded
if not all_ass_refs:
raise ValueError('Provided assembly_sets expand to zero assemblies')

info = self.get_assembly_object_info(all_ass_refs, ctx['token'])
filepaths = self.get_assemblies(tdir, info)
labels = [i.name for i in info]
else:
@@ -339,10 +473,9 @@
'make_handle': 1 if mh else 0,
'pack': 'zip'})
except _DFUError as dfue:
# not really any way to test this block
self.log('Logging exception loading results to shock')
self.log(str(dfue))
raise
raise dfue
output['quast_path'] = out
#END run_QUAST

@@ -352,6 +485,7 @@
'output is not type dict as required.')
# return the results
return [output]
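For reference, two hedged example parameter sets for run_QUAST under the input rule enforced above, one object-based and one file-based; the refs, paths, and labels are placeholders.

# Object-based input: any mix of Assembly refs and AssemblySet refs.
params_objects = {
    "assemblies": ["7/3/1"],
    "assembly_sets": ["7/4/1"],   # each set is expanded to its member assemblies
    "make_handle": 1,
}

# File-based input: local FASTA paths with display labels.
params_files = {
    "files": [
        {"path": "/kb/module/test/data/asm_a.fa", "label": "asm_a"},
        {"path": "/kb/module/test/data/asm_b.fa", "label": "asm_b"},
    ],
    "make_handle": 0,
}

# Supplying both object refs and files in one call raises ValueError, as does supplying neither.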

def status(self, ctx):
#BEGIN_STATUS
del ctx