From f30d17d7fe0e51f9e8e883d6470f1cf193a059f2 Mon Sep 17 00:00:00 2001 From: wolfey Date: Tue, 23 Jun 2020 16:47:20 -0400 Subject: [PATCH 01/10] linux-ready on my machine --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c0ef96a..30ca837 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,8 @@ module = Extension('clustalo', sources = ['clustalo.c'], - include_dirs=['/usr/include/clustalo', '/usr/local/include/clustalo'], + include_dirs=['/usr/local/sci/clustalo/current/include/clustalo'], + library_dirs=['/usr/local/sci/clustalo/current/lib'], libraries=libraries, extra_compile_args=extra_compile_args) From 002d6c81680ce06689ebfc753a02dda637d0cf32 Mon Sep 17 00:00:00 2001 From: wolfey Date: Tue, 23 Jun 2020 16:47:25 -0400 Subject: [PATCH 02/10] linux-ready on my machine --- venv/pyvenv.cfg | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 venv/pyvenv.cfg diff --git a/venv/pyvenv.cfg b/venv/pyvenv.cfg new file mode 100644 index 0000000..e7c42a0 --- /dev/null +++ b/venv/pyvenv.cfg @@ -0,0 +1,8 @@ +home = /usr +implementation = CPython +version_info = 3.8.2.final.0 +virtualenv = 20.0.20 +include-system-site-packages = false +base-prefix = /usr +base-exec-prefix = /usr +base-executable = /usr/bin/python3 From 29d346b635a9ac1a20bc86bd64093d090274d038 Mon Sep 17 00:00:00 2001 From: wolfey Date: Wed, 24 Jun 2020 12:16:00 -0400 Subject: [PATCH 03/10] Updated so that properly maintains the alignment order, but requires a modified version of clustal-omega (see my other repository) --- clustalo.c | 39 +++++++++++++++++++++++++++++++-------- setup.py | 2 +- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/clustalo.c b/clustalo.c index d24ad0a..570e081 100644 --- a/clustalo.c +++ b/clustalo.c @@ -27,6 +27,7 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) int maxGuidetreeIterations = rAlnOpts.iMaxGuidetreeIterations; int maxHMMIterations = rAlnOpts.iMaxHMMIterations; int 
numThreads = 1; + int outOrder = 1; static char *kwlist[] = { "seqs", "seqtype", @@ -36,9 +37,10 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) "max_guidetree_iterations", "max_hmm_iterations", "num_threads", + "output_order", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "O!|iOOiiii", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, keywds, "O!|iOOiiiii", kwlist, &PyDict_Type, &inputDict, &seqtype, &mbedGuideTree, @@ -46,7 +48,8 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) &numCombinedIterations, &maxGuidetreeIterations, &maxHMMIterations, - &numThreads)) + &numThreads, + &outOrder)) return NULL; if (PyObject_Not(inputDict)) @@ -71,6 +74,7 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) rAlnOpts.iNumIterations = numCombinedIterations; rAlnOpts.iMaxGuidetreeIterations = maxGuidetreeIterations; rAlnOpts.iMaxHMMIterations = maxHMMIterations; + rAlnOpts.iOutputOrder = outOrder; // Read in sequences from input. PyObject *key, *value; @@ -120,14 +124,33 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) // Return the aligned results in a dict. 
PyObject *returnDict = PyDict_New(); int idx; - for (idx = 0; idx < prMSeq->nseqs; idx++) { - const char *key = prMSeq->sqinfo[idx].name; - #if PY_MAJOR_VERSION >= 3 + if (outOrder == 1){ + for (idx = 0; idx < prMSeq->nseqs; idx++) { + //printf("NAME OF SEQUENCE: %s, %i \n", prMSeq->sqinfo[prMSeq->tree_order[idx]].name, prMSeq->tree_order[idx]); + const char *key = prMSeq->sqinfo[prMSeq->tree_order[idx]].name; + +#if PY_MAJOR_VERSION >= 3 + PyObject *value = PyUnicode_FromString(prMSeq->seq[prMSeq->tree_order[idx]]); +#else + PyObject *value = PyString_FromString(prMSeq->seq[prMSeq->tree_order[idx]]); + +#endif + PyDict_SetItemString(returnDict, key, value); + } + + + } + else { + for (idx = 0; idx < prMSeq->nseqs; idx++) { + const char *key = prMSeq->sqinfo[idx].name; + +#if PY_MAJOR_VERSION >= 3 PyObject *value = PyUnicode_FromString(prMSeq->seq[idx]); - #else +#else PyObject *value = PyString_FromString(prMSeq->seq[idx]); - #endif - PyDict_SetItemString(returnDict, key, value); +#endif + PyDict_SetItemString(returnDict, key, value); + } } return returnDict; } diff --git a/setup.py b/setup.py index 30ca837..17b2530 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ extra_compile_args=extra_compile_args) setup(name='clustalo', - version='0.1.2', + version='0.1.3', description='Python wrapper around libclustalo', author='Benchling Engineering', author_email='eng@benchling.com', From 006923709ed117ff06da0de8c08f95f6ba796f90 Mon Sep 17 00:00:00 2001 From: wolfey Date: Wed, 24 Jun 2020 12:16:38 -0400 Subject: [PATCH 04/10] updated gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b09d791..f48057a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.idea *.py[cod] # C extensions From 9c3d5cf85bd2f842869bd8016945b2fe0e7e4f39 Mon Sep 17 00:00:00 2001 From: wolfey Date: Thu, 25 Jun 2020 08:48:17 -0400 Subject: [PATCH 05/10] This allows for accessing the tree_order without modifying the source of ClustalO --- 
clustalo.c | 42 +++++++++++++++++++++--------------------- setup.py | 4 ++-- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/clustalo.c b/clustalo.c index 570e081..584d68e 100644 --- a/clustalo.c +++ b/clustalo.c @@ -74,6 +74,11 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) rAlnOpts.iNumIterations = numCombinedIterations; rAlnOpts.iMaxGuidetreeIterations = maxGuidetreeIterations; rAlnOpts.iMaxHMMIterations = maxHMMIterations; + + // allocating the tree_order of prMSeq is enough to capture the tree_order information + if (outOrder == 1) { + prMSeq->tree_order = (int *)CKMALLOC(prMSeq->nseqs * sizeof(int)); + } rAlnOpts.iOutputOrder = outOrder; // Read in sequences from input. @@ -128,22 +133,17 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) for (idx = 0; idx < prMSeq->nseqs; idx++) { //printf("NAME OF SEQUENCE: %s, %i \n", prMSeq->sqinfo[prMSeq->tree_order[idx]].name, prMSeq->tree_order[idx]); const char *key = prMSeq->sqinfo[prMSeq->tree_order[idx]].name; - #if PY_MAJOR_VERSION >= 3 PyObject *value = PyUnicode_FromString(prMSeq->seq[prMSeq->tree_order[idx]]); #else PyObject *value = PyString_FromString(prMSeq->seq[prMSeq->tree_order[idx]]); - #endif PyDict_SetItemString(returnDict, key, value); } - - } else { for (idx = 0; idx < prMSeq->nseqs; idx++) { const char *key = prMSeq->sqinfo[idx].name; - #if PY_MAJOR_VERSION >= 3 PyObject *value = PyUnicode_FromString(prMSeq->seq[idx]); #else @@ -154,7 +154,6 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) } return returnDict; } - #if PY_MAJOR_VERSION >= 3 #define MOD_ERROR_VAL NULL #define MOD_SUCCESS_VAL(val) val @@ -174,21 +173,22 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) static PyMethodDef ClustaloMethods[] = { {"clustalo", (PyCFunction)clustalo_clustalo, METH_VARARGS | METH_KEYWORDS, - "Runs clustal omega." 
- "" - "Args:" - " data (dict): dictionary of sequence_name => bases" - "" - "Kwargs:" - " seqtype (int): should be one of clustalo.DNA, clustalo.RNA, or clustalo.PROTEIN" - " mbed_guide_tree (bool): whether mBed-like clustering guide tree should be used" - " mbed_iteration (bool): whether mBed-like clustering iteration should be used" - " num_combined_iterations (int): number of (combined guide-tree/HMM) iterations" - " max_guidetree_iterations (int): max guide tree iterations within combined iterations" - " max_hmm_iterations (int): max HMM iterations within combined iterations" - " num_threads (int): number of threads to use (requires libclustalo compiled with OpenMP)" - "" - "Returns dict of sequence_named => aligned_bases ('_' for gaps)"}, + "Runs clustal omega.\n" + "\n" + "Args:\n" + " data (dict): dictionary of sequence_name => bases\n" + "\n" + "Kwargs:\n" + " seqtype (int): should be one of clustalo.DNA, clustalo.RNA, or clustalo.PROTEIN\n" + " mbed_guide_tree (bool): whether mBed-like clustering guide tree should be used\n" + " mbed_iteration (bool): whether mBed-like clustering iteration should be used\n" + " num_combined_iterations (int): number of (combined guide-tree/HMM) iterations\n" + " max_guidetree_iterations (int): max guide tree iterations within combined iterations\n" + " max_hmm_iterations (int): max HMM iterations within combined iterations\n" + " num_threads (int): number of threads to use (requires libclustalo compiled with OpenMP)\n" + " output_order (int): return the alignment with either the input order (0) or alignment tree order (1)\n" + "\n" + "Returns dict of sequence_names:aligned_bases ('-' for gaps)\n"}, {NULL, NULL, 0, NULL} }; diff --git a/setup.py b/setup.py index 17b2530..a309ca4 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,8 @@ module = Extension('clustalo', sources = ['clustalo.c'], - include_dirs=['/usr/local/sci/clustalo/current/include/clustalo'], - library_dirs=['/usr/local/sci/clustalo/current/lib'], + 
include_dirs=['/usr/local/sci/clustalo/v1.2.4/include/clustalo'], + library_dirs=['/usr/local/sci/clustalo/v1.2.4/lib'], libraries=libraries, extra_compile_args=extra_compile_args) From 78476d41792b101acbb63a1b2a9d283065c2fcae Mon Sep 17 00:00:00 2001 From: wolfey Date: Thu, 25 Jun 2020 09:12:02 -0400 Subject: [PATCH 06/10] added fix for if there are only 2 sequences --- clustalo.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/clustalo.c b/clustalo.c index 584d68e..81a12e8 100644 --- a/clustalo.c +++ b/clustalo.c @@ -75,12 +75,6 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) rAlnOpts.iMaxGuidetreeIterations = maxGuidetreeIterations; rAlnOpts.iMaxHMMIterations = maxHMMIterations; - // allocating the tree_order of prMSeq is enough to capture the tree_order information - if (outOrder == 1) { - prMSeq->tree_order = (int *)CKMALLOC(prMSeq->nseqs * sizeof(int)); - } - rAlnOpts.iOutputOrder = outOrder; - // Read in sequences from input. PyObject *key, *value; Py_ssize_t pos = 0; @@ -115,6 +109,14 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) return PyDict_Copy(inputDict); } + // allocating the tree_order of prMSeq is enough to capture the tree_order information + if (prMSeq->nseqs > 2 && outOrder == 1) { + prMSeq->tree_order = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); + } + else { + outOrder = 0; + } + // Perform the alignment. 
int rv; Py_BEGIN_ALLOW_THREADS @@ -131,7 +133,6 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) int idx; if (outOrder == 1){ for (idx = 0; idx < prMSeq->nseqs; idx++) { - //printf("NAME OF SEQUENCE: %s, %i \n", prMSeq->sqinfo[prMSeq->tree_order[idx]].name, prMSeq->tree_order[idx]); const char *key = prMSeq->sqinfo[prMSeq->tree_order[idx]].name; #if PY_MAJOR_VERSION >= 3 PyObject *value = PyUnicode_FromString(prMSeq->seq[prMSeq->tree_order[idx]]); From b355caeabcd9c88a35e9ba82d38bea977d92aa75 Mon Sep 17 00:00:00 2001 From: wolfey Date: Thu, 25 Jun 2020 09:14:38 -0400 Subject: [PATCH 07/10] more comments --- clustalo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clustalo.c b/clustalo.c index 81a12e8..2f15618 100644 --- a/clustalo.c +++ b/clustalo.c @@ -110,6 +110,7 @@ clustalo_clustalo(PyObject *self, PyObject *args, PyObject *keywds) } // allocating the tree_order of prMSeq is enough to capture the tree_order information + // Segfaults if only 2 sequences though (the program refuses to calculate a tree) if (prMSeq->nseqs > 2 && outOrder == 1) { prMSeq->tree_order = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); } From bb3229648bc61263a6debc63edd4365ae424c3a8 Mon Sep 17 00:00:00 2001 From: wolfey Date: Thu, 25 Jun 2020 09:18:51 -0400 Subject: [PATCH 08/10] docs updated --- clustalo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clustalo.c b/clustalo.c index 2f15618..8adcff0 100644 --- a/clustalo.c +++ b/clustalo.c @@ -188,7 +188,7 @@ static PyMethodDef ClustaloMethods[] = { " max_guidetree_iterations (int): max guide tree iterations within combined iterations\n" " max_hmm_iterations (int): max HMM iterations within combined iterations\n" " num_threads (int): number of threads to use (requires libclustalo compiled with OpenMP)\n" - " output_order (int): return the alignment with either the input order (0) or alignment tree order (1)\n" + " output_order (int): return the alignment with either the input order (0) or 
alignment tree order (1). Only works on >Python 3.6, where dictionaries are ordered.\n" "\n" "Returns dict of sequence_names:aligned_bases ('-' for gaps)\n"}, {NULL, NULL, 0, NULL} From d85d3050a459e01705ffb1d5f116fa6a39abba41 Mon Sep 17 00:00:00 2001 From: Aaron Wolfe Date: Thu, 25 Jun 2020 10:35:22 -0400 Subject: [PATCH 09/10] Removed venv folder --- .gitignore | 1 + venv/pyvenv.cfg | 8 -------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 venv/pyvenv.cfg diff --git a/.gitignore b/.gitignore index f48057a..ba731fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +venv .idea *.py[cod] diff --git a/venv/pyvenv.cfg b/venv/pyvenv.cfg deleted file mode 100644 index e7c42a0..0000000 --- a/venv/pyvenv.cfg +++ /dev/null @@ -1,8 +0,0 @@ -home = /usr -implementation = CPython -version_info = 3.8.2.final.0 -virtualenv = 20.0.20 -include-system-site-packages = false -base-prefix = /usr -base-exec-prefix = /usr -base-executable = /usr/bin/python3 From 24cafa5da7842ef64208bc105d290f88d807b1ce Mon Sep 17 00:00:00 2001 From: beowulfey Date: Tue, 29 Dec 2020 07:31:32 -0500 Subject: [PATCH 10/10] Update README.rst --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index c83877b..a84498a 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,7 @@ +### Beowulfey's notes: + +I've forked this repository in order to add the ability to return the sequences of the alignment in the aligned tree order. Although there isn't an option in the clustalo API I figured out how it knows whether to do so (it's dependent on whether memory is allocated for the tree order, I guess). Thanks to ordered dictionaries in Python 3 this actually works, too! + clustalo-python ===============