64 changes: 35 additions & 29 deletions Dockerfile
@@ -1,44 +1,50 @@
FROM kbase/sdkpython:3.8.10
MAINTAINER KBase Developer
# -----------------------------------------
# System deps required to build QUAST bundled tools
RUN apt-get update --fix-missing && \
apt-get install -y --no-install-recommends \
build-essential perl wget ca-certificates zlib1g-dev && \
rm -rf /var/lib/apt/lists/*

# Insert apt-get instructions here to install
# any required dependencies for your module.

# libz is for Minimap2 in Quast
RUN apt update && apt install -y g++ nano tree wget libz-dev

RUN python -m pip install --upgrade pip \
&& pip install \
psutil==6.0.0 \
matplotlib==3.7.5 \
quast==5.2.0 \
biopython==1.83 \
&& rm -r /opt/conda3/lib/python3.8/site-packages/quast_libs/genemark \
&& rm -r /opt/conda3/lib/python3.8/site-packages/quast_libs/genemark-es

# Genemark is not open source for non-academic use, and so can't be used in KBase

# Precompile dependencies vs compiling at first use
# In particular, this allows the container to run @ NERSC
RUN mkdir /quasttemp \
&& cd /quasttemp \
&& wget quast.sf.net/test_data.tar.gz \
&& tar xzf test_data.tar.gz \
&& quast.py --test --debug \
&& cd - \
&& rm -r /quasttemp
# Optional: enable static/PDF plots
# RUN apt-get update && apt-get install -y --no-install-recommends \
# pkg-config libfreetype6-dev libpng-dev python3-matplotlib && \
# rm -rf /var/lib/apt/lists/*

# -----------------------------------------
# Python tooling (upgrade packaging to avoid canonicalize_version error)
RUN python3 -m pip install -U pip setuptools wheel packaging && python --version

# If you keep these utilities, install them here
RUN pip install coverage==5.5 in_place==1.0.1 pathos==0.3.4

# -----------------------------------------
# Install QUAST 5.3.0 from source (PyPI tops out at 5.2.0)
ENV QUAST_VER=5.3.0
WORKDIR /opt
RUN wget -qO quast-${QUAST_VER}.tar.gz \
"https://sourceforge.net/projects/quast/files/quast-${QUAST_VER}.tar.gz/download" \
&& tar -xzf quast-${QUAST_VER}.tar.gz \
&& cd quast-${QUAST_VER} \
&& python3 -m pip install . --no-cache-dir --root-user-action=ignore \
&& quast.py --version

RUN pip install biopython==1.81

# Python tooling (upgrade packaging, etc.)
RUN python3 -m pip install -U pip setuptools wheel packaging && python --version
# Add psutil (and keep your existing utilities)
RUN pip install psutil coverage==5.5 in_place==1.0.1 pathos==0.3.4

# -----------------------------------------
# Your module
COPY ./ /kb/module
RUN mkdir -p /kb/module/work
RUN chmod -R 777 /kb/module
RUN chmod -R a+rw /kb/module

WORKDIR /kb/module

RUN make all

ENTRYPOINT [ "./scripts/entrypoint.sh" ]

CMD [ ]
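As a quick check of the image this Dockerfile produces, here is a minimal smoke-test sketch (an editorial illustration, not part of the diff). It assumes the image has been built locally and tagged kb_quast:dev, and simply confirms that the QUAST on the container's PATH is the 5.3.0 build installed from source above.

import subprocess

def quast_version(image: str = "kb_quast:dev") -> str:
    # Override the entrypoint so only `quast.py --version` runs inside the container.
    result = subprocess.run(
        ["docker", "run", "--rm", "--entrypoint", "quast.py", image, "--version"],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

if __name__ == "__main__":
    version = quast_version()
    print(version)  # e.g. "QUAST v5.3.0"
    assert "5.3.0" in version, f"unexpected QUAST version: {version}"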
10 changes: 9 additions & 1 deletion RELEASE_NOTES.md
@@ -1,3 +1,11 @@
### 1.2.0

- Updated QUAST to 5.3.0
- Added support for large genomes
- Added ONP support
- Added AssemblySet inputs
- 1.1.0 update

### 1.1.0

- Updated base image to `kbase/sdkpython:3.8.10` to fix build
@@ -11,4 +19,4 @@

### 0.0.7
- Updated zip file size
- Added github action
- Added github action
2 changes: 1 addition & 1 deletion kbase.yml
@@ -11,7 +11,7 @@ service-language:
python

module-version:
1.1.0
1.2.0

owners:
[gaprice]
178 changes: 156 additions & 22 deletions lib/kb_quast/kb_quastImpl.py
@@ -5,6 +5,8 @@
import subprocess as _subprocess
import time as _time
import uuid as _uuid
import shlex # kept from original (not used directly but harmless)
from pathlib import Path # kept from original (not used directly but harmless)

import psutil
from Bio import SeqIO as _SeqIO
@@ -17,10 +19,10 @@
from installed_clients.baseclient import ServerError as _DFUError
from installed_clients.baseclient import ServerError as _RepError
from installed_clients.baseclient import ServerError as _WSError
#END_HEADER


class ObjInfo(object):

def __init__(self, obj_info):
self.id = obj_info[0]
self.name = obj_info[1]
@@ -34,7 +36,6 @@ def __init__(self, obj_info):
self.size = obj_info[9]
self.meta = obj_info[10]
self.ref = str(self.wsid) + '/' + str(self.id) + '/' + str(self.version)
#END_HEADER


class kb_quast:
@@ -95,7 +96,7 @@ def get_min_contig_length(self, params):
def get_assemblies(self, target_dir, object_infos):
filepaths = []
asscli = _AssClient(self.callback_url)
# would be nice the the assembly utils had bulk download...
# would be nice if the assembly utils had bulk download...
for i in object_infos:
fn = _os.path.join(target_dir, i.ref.replace('/', '_'))
filepaths.append(fn)
@@ -105,28 +106,77 @@ def get_assemblies(self, target_dir, object_infos):
except _AssError as asserr:
self.log('Logging assembly downloader exception')
self.log(str(asserr))
raise
raise asserr
return filepaths

def get_assembly_object_info(self, assemblies, token):
# Prefer the top-level ObjInfo; fall back to a local definition if missing
ObjInfoClass = globals().get('ObjInfo')
if ObjInfoClass is None:
class ObjInfoClass(object):
def __init__(self, obj_info):
self.id = obj_info[0]
self.name = obj_info[1]
t_full = obj_info[2]
self.type, self.type_ver = (t_full.split('-', 1) + [''])[:2] if '-' in t_full else (t_full, '')
self.time = obj_info[3]
self.version = obj_info[4]
self.saved_by = obj_info[5]
self.wsid = obj_info[6]
self.workspace = obj_info[7]
self.chsum = obj_info[8]
self.size = obj_info[9]
self.meta = obj_info[10]
self.ref = f"{self.wsid}/{self.id}/{self.version}"

refs = [{'ref': x} for x in assemblies]
ws = _WSClient(self.ws_url, token=token)
self.log('Getting object information from workspace')
# TODO: this is used often enough that it should be added to DFU, but returning a dict rather than a list

try:
info = [ObjInfo(i) for i in ws.get_object_info3({'objects': refs})['infos']]
infos = ws.get_object_info3({'objects': refs})['infos']
except _WSError as wse:
self.log('Logging workspace exception')
self.log(str(wse))
raise
raise wse

info = [ObjInfoClass(i) for i in infos]

self.log('Object list:')
for i in info: # don't check type - assemblyutils should handle that
self.log('{}/{} {} {}'.format(i.workspace, i.name, i.ref, i.type))
absrefs = [i.ref for i in info]
for o in info:
self.log(f'{o.workspace}/{o.name} {o.ref} {o.type}')

absrefs = [o.ref for o in info]
if len(set(absrefs)) != len(absrefs):
raise ValueError('Duplicate objects detected in input') # could list objs later
raise ValueError('Duplicate objects detected in input')
return info

def expand_assembly_sets(self, set_refs, token):
"""
Given a list of KBaseSets.AssemblySet refs, return a flat list of assembly refs.
Supports common schemas: {'items': [{'ref': ...}, ...]} or {'elements': [{'ref': ...}, ...]}.
"""
if not set_refs:
return []
ws = _WSClient(self.ws_url, token=token)
objs = ws.get_objects2({'objects': [{'ref': r} for r in set_refs]})['data']
out = []
for od in objs:
data = od.get('data', {}) or {}
items = data.get('items')
if isinstance(items, list):
for it in items:
if isinstance(it, dict) and it.get('ref'):
out.append(it['ref'])
continue
elements = data.get('elements')
if isinstance(elements, list):
for el in elements:
if isinstance(el, dict) and el.get('ref'):
out.append(el['ref'])
continue
return out
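For illustration, the two set payload shapes this helper accepts (as listed in its docstring) look roughly like the sketch below; the refs and labels are placeholders rather than real workspace objects.

# Shape with an 'items' list; each entry carries a 'ref' and typically a 'label'.
set_with_items = {
    "description": "example set",
    "items": [{"ref": "1/2/3", "label": "asm_a"}, {"ref": "1/4/1", "label": "asm_b"}],
}

# Shape with an 'elements' list of {'ref': ...} dicts.
set_with_elements = {
    "description": "example set",
    "elements": [{"ref": "1/2/3"}, {"ref": "1/4/1"}],
}

# expand_assembly_sets() flattens either shape to ["1/2/3", "1/4/1"].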

def run_quast_exec(self, outdir, filepaths, labels, min_contig_length, skip_glimmer=False):
threads = psutil.cpu_count() * self.THREADS_PER_CORE
# DO NOT use genemark instead of glimmer, not open source
@@ -163,6 +213,68 @@ def run_quast_exec(self, outdir, filepaths, labels, min_contig_length, skip_glim
self.log(err)
raise ValueError(err)


# ---------- NEW HELPERS (instance methods) ----------
def _ws_batch_get_info(self, ws_client, refs):

"""
Batched get_object_info3 to avoid repeated roundtrips.
Returns list of dicts with keys: ref, type
"""
if not refs:
return []
info = ws_client.get_object_info3({"objects": [{"ref": r} for r in refs]})["infos"]
out = []
for r, i in zip(refs, info):
# i[2] is the full type string, e.g. "KBaseSets.AssemblySet-2.0"
out.append({"ref": r, "type": i[2]})
return out

def _classify_input_refs(self, ws_client, raw_refs):
"""
Split mixed refs into assemblies vs assembly_sets based on WS type.
Returns (assemblies, assembly_sets).
"""
assemblies, assembly_sets = [], []
for item in self._ws_batch_get_info(ws_client, raw_refs):
t = item["type"] or ""
t_base = t.split("-", 1)[0] # strip version suffix
if t_base in ("KBaseGenomes.Assembly", "KBaseGenomeAnnotations.Assembly"):
assemblies.append(item["ref"])
elif t_base == "KBaseSets.AssemblySet":
assembly_sets.append(item["ref"])
else:
raise ValueError(
f"Unsupported input ref type: {t} for {item['ref']}. "
"Expected KBaseGenomes.Assembly or KBaseSets.AssemblySet."
)
return assemblies, assembly_sets

def _gather_inputs(self, params, ws_client):
"""
Accepts any combo of:
- params['assemblies'] (list of refs)
- params['assembly_sets'] (list of refs)
- params['input_refs'] (list of refs, mixed)
- params['files'] (list of paths/handles)
Classifies refs by WS type and returns normalized dict.
"""
assemblies_param = params.get("assemblies") or []
assembly_sets_param = params.get("assembly_sets") or []
mixed_param = params.get("input_refs") or []
files = params.get("files") or []

cls_assemblies, cls_sets = self._classify_input_refs(ws_client, mixed_param)
exp_assemblies, exp_sets = self._classify_input_refs(
ws_client, assemblies_param + assembly_sets_param
)

assemblies = list(dict.fromkeys(cls_assemblies + exp_assemblies))
assembly_sets = list(dict.fromkeys(cls_sets + exp_sets))

return {"assemblies": assemblies, "assembly_sets": assembly_sets, "files": files}
# ---------- END HELPERS ----------
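To show how these helpers interact, here is a hypothetical params dict mixing the supported keys (all refs are placeholders), together with the normalization _gather_inputs would produce, assuming the workspace reports 7/1/1 and 7/3/1 as Assembly objects and 7/2/1 and 7/4/1 as KBaseSets.AssemblySet objects.

params = {
    "input_refs": ["7/1/1", "7/2/1"],   # mixed refs, classified by workspace type
    "assemblies": ["7/3/1"],            # explicit Assembly refs
    "assembly_sets": ["7/4/1"],         # explicit AssemblySet refs
    "files": [],
}

# _gather_inputs(params, ws) would return:
# {
#     "assemblies": ["7/1/1", "7/3/1"],
#     "assembly_sets": ["7/2/1", "7/4/1"],
#     "files": [],
# }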

def check_large_input(self, filepaths):
skip_glimmer = False
basecount = 0
@@ -187,7 +299,6 @@ def __init__(self, config):
#END_CONSTRUCTOR
pass


def run_QUAST_app(self, ctx, params):
"""
Run QUAST and save a KBaseReport with the output.
@@ -237,7 +348,7 @@ def run_QUAST_app(self, ctx, params):
self.log('Logging exception from creating report object')
self.log(str(re))
# TODO delete shock node
raise
raise re
output = {'report_name': repout['name'],
'report_ref': repout['ref']
}
@@ -253,6 +364,11 @@
def run_QUAST(self, ctx, params):
"""
Run QUAST and return a shock node containing the zipped QUAST output.
Supports:
- files: list of {'path', 'label'}
- assemblies: list<ref> to Assembly/ContigSet
- assembly_sets: list<ref> to KBaseSets.AssemblySet
Exactly one of (files) or (assemblies/assembly_sets) must be provided.
:param params: instance of type "QUASTParams" (Input for running
QUAST. assemblies - the list of assemblies upon which QUAST will
be run. -OR- files - the list of FASTA files upon which QUAST will
@@ -296,18 +412,36 @@ def run_QUAST(self, ctx, params):
#BEGIN run_QUAST
self.log('Starting QUAST run. Parameters:')
self.log(str(params))
assemblies = params.get('assemblies')
files = params.get('files')

# Normalize inputs (classify mixed refs into assemblies vs assembly_sets)
ws = _WSClient(self.ws_url, token=ctx['token'])
normalized = self._gather_inputs(params, ws)
assemblies = normalized["assemblies"]
assembly_sets = normalized["assembly_sets"]
files = normalized["files"]

min_contig_length = self.get_min_contig_length(params) # fail early if param is bad
if not self.xor(assemblies, files):
raise ValueError(
'One and only one of a list of assembly references or files is required')

has_obj_inputs = bool(assemblies or assembly_sets)
if bool(files) == has_obj_inputs:
raise ValueError('One and only one of a list of assembly references or files is required')

tdir = _os.path.join(self.scratch, str(_uuid.uuid4()))
self.mkdir_p(tdir)
if assemblies:

if has_obj_inputs:
if type(assemblies) != list:
raise ValueError('assemblies must be a list')
info = self.get_assembly_object_info(assemblies, ctx['token'])
if type(assembly_sets) != list:
raise ValueError('assembly_sets must be a list')

# Expand sets to assembly refs and combine
set_expanded = self.expand_assembly_sets(assembly_sets, ctx['token']) if assembly_sets else []
all_ass_refs = list(assemblies) + set_expanded
if not all_ass_refs:
raise ValueError('Provided assembly_sets expand to zero assemblies')

info = self.get_assembly_object_info(all_ass_refs, ctx['token'])
filepaths = self.get_assemblies(tdir, info)
labels = [i.name for i in info]
else:
@@ -339,10 +473,9 @@
'make_handle': 1 if mh else 0,
'pack': 'zip'})
except _DFUError as dfue:
# not really any way to test this block
self.log('Logging exception loading results to shock')
self.log(str(dfue))
raise
raise dfue
output['quast_path'] = out
#END run_QUAST

@@ -352,6 +485,7 @@
'output is not type dict as required.')
# return the results
return [output]
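For reference, two hedged example parameter sets for run_QUAST under the input rule enforced above, one object-based and one file-based; the refs, paths, and labels are placeholders.

# Object-based input: any mix of Assembly refs and AssemblySet refs.
params_objects = {
    "assemblies": ["7/3/1"],
    "assembly_sets": ["7/4/1"],   # each set is expanded to its member assemblies
    "make_handle": 1,
}

# File-based input: local FASTA paths with display labels.
params_files = {
    "files": [
        {"path": "/kb/module/test/data/asm_a.fa", "label": "asm_a"},
        {"path": "/kb/module/test/data/asm_b.fa", "label": "asm_b"},
    ],
    "make_handle": 0,
}

# Supplying both object refs and files in one call raises ValueError, as does supplying neither.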

def status(self, ctx):
#BEGIN_STATUS
del ctx