From 34fa2b1009ea5d9fc01924810337a54116cf1459 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 10:40:55 +0200 Subject: [PATCH 01/31] first working module --- .gitignore | 1 + pystarcode/pystarcode.c | 75 +++++++++++++++++++++++++++++++++++++++++ pystarcode/setup.py | 7 ++++ 3 files changed, 83 insertions(+) create mode 100644 pystarcode/pystarcode.c create mode 100644 pystarcode/setup.py diff --git a/.gitignore b/.gitignore index ff62139..3e66ee5 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ test/.inspect.gdb test/runtests doc/*.log *\#* +*.txt diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c new file mode 100644 index 0000000..9c44366 --- /dev/null +++ b/pystarcode/pystarcode.c @@ -0,0 +1,75 @@ +#include +#include +#include "../src/starcode.h" + +#define MAX_STR_LENGTH 2048 + +static char module_docstring[] = + "This module is a Python interface to Starcode"; +static char starcode_docstring[] = + "Starcode invocation"; + +static PyObject *pystarcode_starcode(PyObject *self, PyObject *args); + +static PyMethodDef module_methods[] = { + {"starcode", pystarcode_starcode, METH_VARARGS, starcode_docstring}, + {NULL, NULL, 0, NULL} +}; + +PyMODINIT_FUNC initpystarcode(void) +{ + PyObject *m = Py_InitModule3("pystarcode", module_methods, module_docstring); + if (m == NULL) + return; +} + +static PyObject *pystarcode_starcode(PyObject *self, PyObject *args) +{ + // Parse the input + char *in_filename, *out_filename; + int tau, cluster_ratio; + if (!PyArg_ParseTuple(args, "sshh", + &in_filename, + &out_filename, + &tau, + &cluster_ratio)) + return NULL; + + FILE *inputf1 = fopen(in_filename,"r"); + if (inputf1==NULL) + { + char error_message[MAX_STR_LENGTH]; + sprintf(error_message, "Cannot open %s for reading", in_filename); + PyErr_SetString(PyExc_IOError, error_message); + return NULL; + } + FILE *inputf2 = NULL; + FILE *outputf1 = fopen(out_filename, "w"); + FILE *outputf2 = NULL; + const int verbose = 1; + int thrmax = 4; + const int clusteralg = 0; + const int showclusters = 1; + const int showids = 0; + const int outputt = 0; + + int out = starcode( + inputf1, + inputf2, + outputf1, + outputf2, + tau, + verbose, + thrmax, + clusteralg, + cluster_ratio, + showclusters, + showids, + outputt + ); + + + PyObject *ret = Py_BuildValue("h", out); + + return ret; +} diff --git a/pystarcode/setup.py b/pystarcode/setup.py new file mode 100644 index 0000000..4ffc6c9 --- /dev/null +++ b/pystarcode/setup.py @@ -0,0 +1,7 @@ +from distutils.core import setup, Extension + +setup( + ext_modules = [Extension("pystarcode", ["pystarcode.c", + "../src/starcode.c", + "../src/trie.c"])], +) From 65568e3d905d7ce568ad6f30940b508ec49d5e30 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 11:10:09 +0200 Subject: [PATCH 02/31] throws exception for non existent files --- pystarcode/pystarcode.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 9c44366..817061c 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -23,6 +23,20 @@ PyMODINIT_FUNC initpystarcode(void) return; } +FILE *py_fopen(const char *fname, const char *mode) +{ + FILE *f = fopen(fname, mode); + if (f==NULL) + { + char error_message[MAX_STR_LENGTH]; + sprintf(error_message, "Cannot open file %s", fname); + PyErr_SetString(PyExc_IOError, error_message); + return NULL; + } + else + return f; +} + static PyObject *pystarcode_starcode(PyObject *self, PyObject *args) { // Parse the input @@ -35,17 +49,14 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args) &cluster_ratio)) return NULL; - FILE *inputf1 = fopen(in_filename,"r"); - if (inputf1==NULL) - { - char error_message[MAX_STR_LENGTH]; - sprintf(error_message, "Cannot open %s for reading", in_filename); - PyErr_SetString(PyExc_IOError, error_message); - return NULL; - } + // open input and output files + FILE *inputf1 = py_fopen(in_filename,"r"); + if (inputf1 == NULL) return NULL; FILE *inputf2 = NULL; - FILE *outputf1 = fopen(out_filename, "w"); + FILE *outputf1 = py_fopen(out_filename, "w"); + if (outputf1 == NULL) return NULL; FILE *outputf2 = NULL; + const int verbose = 1; int thrmax = 4; const int clusteralg = 0; From 997594747a6d6ace285bb778a1b155f6ca37c65f Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 11:57:51 +0200 Subject: [PATCH 03/31] added keywords to the main "starcode" method --- pystarcode/pystarcode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 817061c..fe795a9 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -9,10 +9,10 @@ static char module_docstring[] = static char starcode_docstring[] = "Starcode invocation"; -static PyObject *pystarcode_starcode(PyObject *self, PyObject *args); +static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs); static PyMethodDef module_methods[] = { - {"starcode", pystarcode_starcode, METH_VARARGS, starcode_docstring}, + {"starcode", pystarcode_starcode, METH_VARARGS|METH_KEYWORDS, starcode_docstring}, {NULL, NULL, 0, NULL} }; @@ -37,16 +37,20 @@ FILE *py_fopen(const char *fname, const char *mode) return f; } -static PyObject *pystarcode_starcode(PyObject *self, PyObject *args) +static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) { // Parse the input char *in_filename, *out_filename; int tau, cluster_ratio; - if (!PyArg_ParseTuple(args, "sshh", + int clusteralg = 0; + + static char *kwlist[] = {"input","output","dist","cluster_ratio","clusteralg",NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "sshh|h", kwlist, &in_filename, &out_filename, &tau, - &cluster_ratio)) + &cluster_ratio, + &clusteralg)) return NULL; // open input and output files @@ -59,7 +63,6 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args) const int verbose = 1; int thrmax = 4; - const int clusteralg = 0; const int showclusters = 1; const int showids = 0; const int outputt = 0; From b3fd55801cdd824c3188d865119f4241e9669b9c Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 12:29:46 +0200 Subject: [PATCH 04/31] added "verbose" kwarg --- pystarcode/pystarcode.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index fe795a9..b90390e 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -43,14 +43,23 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k char *in_filename, *out_filename; int tau, cluster_ratio; int clusteralg = 0; + int verbose = 1; - static char *kwlist[] = {"input","output","dist","cluster_ratio","clusteralg",NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "sshh|h", kwlist, + static char *kwlist[] = { + "input", + "output", + "dist", + "cluster_ratio", + "clusteralg", + "verbose", + NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ssii|ii", kwlist, &in_filename, &out_filename, &tau, &cluster_ratio, - &clusteralg)) + &clusteralg, + &verbose)) return NULL; // open input and output files @@ -61,12 +70,13 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k if (outputf1 == NULL) return NULL; FILE *outputf2 = NULL; - const int verbose = 1; int thrmax = 4; const int showclusters = 1; const int showids = 0; const int outputt = 0; + // printf("%s %s %d %d %d %d\n",in_filename,out_filename,tau,cluster_ratio,clusteralg,verbose); + int out = starcode( inputf1, inputf2, @@ -82,6 +92,7 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k outputt ); + // int out = 0; PyObject *ret = Py_BuildValue("h", out); From 14d1a18c45f49cf6a7f9089b49d05ae2bb5b7bc1 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 12:35:03 +0200 Subject: [PATCH 05/31] added "threads" kwarg --- pystarcode/pystarcode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index b90390e..18de3ad 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -44,6 +44,7 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k int tau, cluster_ratio; int clusteralg = 0; int verbose = 1; + int thrmax = 4; static char *kwlist[] = { "input", @@ -52,14 +53,16 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k "cluster_ratio", "clusteralg", "verbose", + "threads", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ssii|ii", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ssii|iii", kwlist, &in_filename, &out_filename, &tau, &cluster_ratio, &clusteralg, - &verbose)) + &verbose, + &thrmax)) return NULL; // open input and output files @@ -70,7 +73,6 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k if (outputf1 == NULL) return NULL; FILE *outputf2 = NULL; - int thrmax = 4; const int showclusters = 1; const int showids = 0; const int outputt = 0; From b033f9ae949828b4cab4d6bc05a80f33ea3b31da Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 12:50:01 +0200 Subject: [PATCH 06/31] finalized kwargs --- pystarcode/pystarcode.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 18de3ad..8ac8adf 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -45,6 +45,9 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k int clusteralg = 0; int verbose = 1; int thrmax = 4; + int showclusters = 1; + int showids = 0; + int outputt = 0; static char *kwlist[] = { "input", @@ -54,15 +57,21 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k "clusteralg", "verbose", "threads", + "showclusters", + "showids", + "outputt", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ssii|iii", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ssii|iiiiii", kwlist, &in_filename, &out_filename, &tau, &cluster_ratio, &clusteralg, &verbose, - &thrmax)) + &thrmax, + &showclusters, + &showids, + &outputt)) return NULL; // open input and output files @@ -73,10 +82,6 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k if (outputf1 == NULL) return NULL; FILE *outputf2 = NULL; - const int showclusters = 1; - const int showids = 0; - const int outputt = 0; - // printf("%s %s %d %d %d %d\n",in_filename,out_filename,tau,cluster_ratio,clusteralg,verbose); int out = starcode( From 67e0b08f8333d030e367ea77214d697a52b06e7f Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 16:30:06 +0200 Subject: [PATCH 07/31] transferred output to separate function in starcode --- Makefile | 4 +- src/starcode.c | 248 +++++++++++++++++++++++++++++-------------------- 2 files changed, 149 insertions(+), 103 deletions(-) diff --git a/Makefile b/Makefile index 4d1f69d..b949e9b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ SOURCES= $(addprefix $(SRC_DIR)/,$(SOURCE_FILES)) INCLUDES= $(addprefix -I, $(INC_DIR)) # Development and debug flags. -#CFLAGS= -std=c99 -g -O0 -Wunused-parameter -Wredundant-decls \ +CFLAGS= -std=c99 -g -O0 -Wunused-parameter -Wredundant-decls \ -Wreturn-type -Wswitch-default -Wunused-value -Wimplicit \ -Wimplicit-function-declaration -Wimplicit-int -Wimport \ -Wunused -Wunused-function -Wunused-label -Wbad-function-cast \ @@ -19,7 +19,7 @@ INCLUDES= $(addprefix -I, $(INC_DIR)) -Wunused-variable -Wformat-nonliteral -Wparentheses -Wundef \ -Wsequence-point -Wuninitialized -Wbad-function-cast # Release flags. -CFLAGS= -std=c99 -O3 -Wall +#CFLAGS= -std=c99 -O3 -Wall LDLIBS= -lpthread -lm CC= gcc diff --git a/src/starcode.c b/src/starcode.c index f8371fe..2368be3 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -398,22 +398,17 @@ warn_about_missing_sequences return; } - -int -starcode +void +print_starcode_output ( - FILE *inputf1, - FILE *inputf2, FILE *outputf1, FILE *outputf2, - int tau, - const int verbose, - int thrmax, + gstack_t *clusters, const int clusteralg, - int parent_to_child, const int showclusters, const int showids, - const int outputt + const int outputt, + const int verbose ) { @@ -421,54 +416,6 @@ starcode OUTPUTF2 = outputf2; OUTPUTT = outputt; CLUSTERALG = clusteralg; - CLUSTER_RATIO = parent_to_child; - - if (verbose) { - fprintf(stderr, "running starcode with %d thread%s\n", - thrmax, thrmax > 1 ? "s" : ""); - fprintf(stderr, "reading input files\n"); - } - gstack_t *uSQ = read_file(inputf1, inputf2, verbose); - if (uSQ == NULL || uSQ->nitems < 1) { - fprintf(stderr, "input file empty\n"); - return 1; - } - - // Sort/reduce. - if (verbose) fprintf(stderr, "sorting\n"); - uSQ->nitems = seqsort((useq_t **) uSQ->items, uSQ->nitems, thrmax); - - // Get number of tries. - int ntries = 3 * thrmax + (thrmax % 2 == 0); - if (uSQ->nitems < ntries) { - ntries = 1; - thrmax = 1; - } - - // Pad sequences (and return the median size). - // Compute 'tau' from it in "auto" mode. - int med = -1; - int height = pad_useq(uSQ, &med); - if (tau < 0) { - tau = med > 160 ? 8 : 2 + med/30; - if (verbose) { - fprintf(stderr, "setting dist to %d\n", tau); - } - } - - // Make multithreading plan. - mtplan_t *mtplan = plan_mt(tau, height, med, ntries, uSQ); - - // Run the query. - run_plan(mtplan, verbose, thrmax); - if (verbose) fprintf(stderr, "progress: 100.00%%\n"); - - // Remove padding characters. - unpad_useq(uSQ); - - // - // MESSAGE PASSING ALGORITHM - // propt_t propt = { .first = {0}, @@ -479,24 +426,16 @@ starcode if (CLUSTERALG == MP_CLUSTER) { - if (verbose) fprintf(stderr, "message passing clustering\n"); - // User must be warned when sequences without // canonical are removed from the output. int user_warned_about_missing_sequences = 0; - // Cluster the pairs. - message_passing_clustering(uSQ, showids); - // Sort in canonical order. - qsort(uSQ->items, uSQ->nitems, sizeof(useq_t *), canonical_order); - if (OUTPUTT == DEFAULT_OUTPUT) { - - useq_t *first = (useq_t *) uSQ->items[0]; + useq_t *first = (useq_t *) clusters->items[0]; useq_t *canonical = first->canonical; // If the first canonical is NULL, then they all are. - if (first->canonical == NULL) return 0; + if (first->canonical == NULL) return; head_default(first, propt); @@ -504,8 +443,8 @@ starcode memcpy(propt.first, "\n", 3); // Run through the clustered items. - for (int i = 1 ; i < uSQ->nitems ; i++) { - useq_t *u = (useq_t *) uSQ->items[i]; + for (int i = 1 ; i < clusters->nitems ; i++) { + useq_t *u = (useq_t *) clusters->items[i]; if (u->canonical == NULL) { // Sequences without canonical are not printed // at all (but their counts are transferred). @@ -538,18 +477,11 @@ starcode // SPHERES ALGORITHM // - } else if (CLUSTERALG == SPHERES_CLUSTER) { - - if (verbose) fprintf(stderr, "spheres clustering\n"); - // Cluster the pairs. - sphere_clustering(uSQ, showids); - // Sort in count order. - qsort(uSQ->items, uSQ->nitems, sizeof(useq_t *), count_order); - + } else if (clusteralg == SPHERES_CLUSTER) { // Default output. if (OUTPUTT == DEFAULT_OUTPUT) { - for (int i = 0 ; i < uSQ->nitems ; i++) { - useq_t *u = (useq_t *) uSQ->items[i]; + for (int i = 0 ; i < clusters->nitems ; i++) { + useq_t *u = (useq_t *) clusters->items[i]; if (u->canonical != u) break; fprintf(OUTPUTF1, "%s\t", u->seq); @@ -565,7 +497,7 @@ starcode for (int k = 0 ; k < hits->nitems ; k++) { useq_t *match = (useq_t *) hits->items[k]; if (match->canonical != u) continue; - fprintf(OUTPUTF1, ",%s", match->seq); + fprintf(outputf1, ",%s", match->seq); } } } @@ -589,14 +521,7 @@ starcode * CONNECTED COMPONENTS ALGORITHM */ - } else if (CLUSTERALG == COMPONENTS_CLUSTER) { - if (verbose) fprintf(stderr, "connected components clustering\n"); - // Cluster connected components. - // Returns a stack containing stacks of clusters, where clusters->item[i]->item[0] is - // the centroid of the i-th cluster. The output is sorted by cluster count, which is - // stored in centroid->count. - gstack_t * clusters = compute_clusters(uSQ); - + } else if (clusteralg == COMPONENTS_CLUSTER) { // Default output. if (OUTPUTT == DEFAULT_OUTPUT) { for (int i = 0; i < clusters->nitems; i++) { @@ -604,20 +529,15 @@ starcode // Get canonical. useq_t * canonical = (useq_t *) cluster->items[0]; // Print canonical and cluster count. - fprintf(OUTPUTF1, "%s\t%d", canonical->seq, canonical->count); + fprintf(outputf1, "%s\t%d", canonical->seq, canonical->count); if (showclusters) { - fprintf (OUTPUTF1, "\t%s", canonical->seq); + fprintf (outputf1, "\t%s", canonical->seq); for (int k = 1; k < cluster->nitems; k++) { - fprintf (OUTPUTF1, ",%s", ((useq_t *)cluster->items[k])->seq); + fprintf (outputf1, ",%s", ((useq_t *)cluster->items[k])->seq); } } - fprintf(OUTPUTF1, "\n"); + fprintf(outputf1, "\n"); } - } else if (OUTPUTT == NRED_OUTPUT) { - uSQ->nitems = 0; - // Fill uSQ with cluster centroids. - for (int i = 0 ; i < clusters->nitems ; i++) - push(((gstack_t *)clusters->items[i])->items[0], &uSQ); } } @@ -625,7 +545,7 @@ starcode * ALTERNATIVE OUTPUT FORMAT: NON-REDUNDANT */ - if (OUTPUTT == NRED_OUTPUT) { + if (outputt == NRED_OUTPUT) { if (verbose) fprintf(stderr, "non-redundant output\n"); // If print non redundant sequences, just print the @@ -637,8 +557,8 @@ starcode else if (FORMAT == FASTQ) print_nr = print_nr_fastq; else if (FORMAT == PE_FASTQ) print_nr = print_nr_pe_fastq; - for (int i = 0 ; i < uSQ->nitems ; i++) { - useq_t *u = (useq_t *) uSQ->items[i]; + for (int i = 0 ; i < clusters->nitems ; i++) { + useq_t *u = (useq_t *) clusters->items[i]; if (u->canonical == NULL) break; if (u->canonical != u) continue; print_nr(u, propt); @@ -649,6 +569,132 @@ starcode // Do not free anything. OUTPUTF1 = NULL; OUTPUTF2 = NULL; +} + +int +starcode +( + FILE *inputf1, + FILE *inputf2, + FILE *outputf1, + FILE *outputf2, + int tau, + const int verbose, + int thrmax, + const int clusteralg, + int parent_to_child, + const int showclusters, + const int showids, + const int outputt +) +{ + + OUTPUTF1 = outputf1; + OUTPUTF2 = outputf2; + OUTPUTT = outputt; + CLUSTERALG = clusteralg; + CLUSTER_RATIO = parent_to_child; + + if (verbose) { + fprintf(stderr, "running starcode with %d thread%s\n", + thrmax, thrmax > 1 ? "s" : ""); + fprintf(stderr, "reading input files\n"); + } + gstack_t *uSQ = read_file(inputf1, inputf2, verbose); + if (uSQ == NULL || uSQ->nitems < 1) { + fprintf(stderr, "input file empty\n"); + return 1; + } + + // Sort/reduce. + if (verbose) fprintf(stderr, "sorting\n"); + uSQ->nitems = seqsort((useq_t **) uSQ->items, uSQ->nitems, thrmax); + + // Get number of tries. + int ntries = 3 * thrmax + (thrmax % 2 == 0); + if (uSQ->nitems < ntries) { + ntries = 1; + thrmax = 1; + } + + // Pad sequences (and return the median size). + // Compute 'tau' from it in "auto" mode. + int med = -1; + int height = pad_useq(uSQ, &med); + if (tau < 0) { + tau = med > 160 ? 8 : 2 + med/30; + if (verbose) { + fprintf(stderr, "setting dist to %d\n", tau); + } + } + + // Make multithreading plan. + mtplan_t *mtplan = plan_mt(tau, height, med, ntries, uSQ); + + // Run the query. + run_plan(mtplan, verbose, thrmax); + if (verbose) fprintf(stderr, "progress: 100.00%%\n"); + + // Remove padding characters. + unpad_useq(uSQ); + + // + // MESSAGE PASSING ALGORITHM + // + + propt_t propt = { + .first = {0}, + .showclusters = showclusters, + .showids = showids, + .pe_fastq = PE_FASTQ == FORMAT, + }; + + if (CLUSTERALG == MP_CLUSTER) { + + if (verbose) fprintf(stderr, "message passing clustering\n"); + + // Cluster the pairs. + message_passing_clustering(uSQ, showids); + // Sort in canonical order. + qsort(uSQ->items, uSQ->nitems, sizeof(useq_t *), canonical_order); + + // print output + print_starcode_output(outputf1, outputf2, + uSQ, CLUSTERALG, showclusters, showids, OUTPUTT, verbose); + + // + // SPHERES ALGORITHM + // + + } else if (CLUSTERALG == SPHERES_CLUSTER) { + + if (verbose) fprintf(stderr, "spheres clustering\n"); + // Cluster the pairs. + sphere_clustering(uSQ, showids); + // Sort in count order. + qsort(uSQ->items, uSQ->nitems, sizeof(useq_t *), count_order); + + // print output + print_starcode_output(outputf1, outputf2, + uSQ, CLUSTERALG, showclusters, showids, OUTPUTT, verbose); + + /* + * CONNECTED COMPONENTS ALGORITHM + */ + + } else if (CLUSTERALG == COMPONENTS_CLUSTER) { + if (verbose) fprintf(stderr, "connected components clustering\n"); + // Cluster connected components. + // Returns a stack containing stacks of clusters, where clusters->item[i]->item[0] is + // the centroid of the i-th cluster. The output is sorted by cluster count, which is + // stored in centroid->count. + gstack_t * clusters = compute_clusters(uSQ); + + // print output + print_starcode_output(outputf1, outputf2, + clusters, CLUSTERALG, showclusters, showids, OUTPUTT, verbose); + } + return 0; } From 8ff847b59507e2928b897f91082b388ca923914b Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 24 May 2018 16:47:22 +0200 Subject: [PATCH 08/31] read files now in main --- src/main-starcode.c | 15 +++++++++++++-- src/starcode.c | 14 +------------- src/starcode.h | 12 ++++++++++-- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/main-starcode.c b/src/main-starcode.c index d53ad20..6444eb0 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -28,6 +28,7 @@ #include #include #include "starcode.h" +#include "trie.h" #define ERRM "starcode error:" @@ -486,10 +487,20 @@ main( if (threads < 0) threads = 1; if (cluster_ratio < 0) cluster_ratio = 5; + if (vb_flag) { + fprintf(stderr, "running starcode with %d thread%s\n", + threads, threads > 1 ? "s" : ""); + fprintf(stderr, "reading input files\n"); + } + gstack_t *uSQ = read_file(inputf1, inputf2, vb_flag); + if (uSQ == NULL || uSQ->nitems < 1) { + fprintf(stderr, "input file empty\n"); + return 1; + } + int exitcode = starcode( - inputf1, - inputf2, + uSQ, outputf1, outputf2, dist, diff --git a/src/starcode.c b/src/starcode.c index 2368be3..a14523b 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -574,8 +574,7 @@ print_starcode_output int starcode ( - FILE *inputf1, - FILE *inputf2, + gstack_t *uSQ, FILE *outputf1, FILE *outputf2, int tau, @@ -595,17 +594,6 @@ starcode CLUSTERALG = clusteralg; CLUSTER_RATIO = parent_to_child; - if (verbose) { - fprintf(stderr, "running starcode with %d thread%s\n", - thrmax, thrmax > 1 ? "s" : ""); - fprintf(stderr, "reading input files\n"); - } - gstack_t *uSQ = read_file(inputf1, inputf2, verbose); - if (uSQ == NULL || uSQ->nitems < 1) { - fprintf(stderr, "input file empty\n"); - return 1; - } - // Sort/reduce. if (verbose) fprintf(stderr, "sorting\n"); uSQ->nitems = seqsort((useq_t **) uSQ->items, uSQ->nitems, thrmax); diff --git a/src/starcode.h b/src/starcode.h index 8705faf..0ca08ff 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -26,6 +26,7 @@ #define _GNU_SOURCE #include +#include "trie.h" #define VERSION "starcode-v1.2 20-04-2018" #define STARCODE_MAX_TAU 8 @@ -42,9 +43,16 @@ typedef enum { COMPONENTS_CLUSTER } cluster_t; +gstack_t * +read_file +( + FILE * inputf1, + FILE * inputf2, + const int verbose +); + int starcode( - FILE *inputf1, - FILE *inputf2, + gstack_t *uSQ, FILE *outputf1, FILE *outputf2, int tau, From a1405424ccc020b0cf17f3ab275ca07325a9ea03 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Fri, 1 Jun 2018 09:30:00 +0200 Subject: [PATCH 09/31] "starcode" function takes gstack_t as argument --- pystarcode/pystarcode.c | 16 ++++++++++------ src/starcode.c | 8 -------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 8ac8adf..a0eeac6 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -1,6 +1,7 @@ #include #include #include "../src/starcode.h" +#include "../src/trie.h" #define MAX_STR_LENGTH 2048 @@ -9,10 +10,10 @@ static char module_docstring[] = static char starcode_docstring[] = "Starcode invocation"; -static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs); +static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs); static PyMethodDef module_methods[] = { - {"starcode", pystarcode_starcode, METH_VARARGS|METH_KEYWORDS, starcode_docstring}, + {"starcode", (PyCFunction) pystarcode_starcode, METH_VARARGS|METH_KEYWORDS, starcode_docstring}, {NULL, NULL, 0, NULL} }; @@ -37,7 +38,7 @@ FILE *py_fopen(const char *fname, const char *mode) return f; } -static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) +static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) { // Parse the input char *in_filename, *out_filename; @@ -82,11 +83,14 @@ static PyObject *pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k if (outputf1 == NULL) return NULL; FILE *outputf2 = NULL; - // printf("%s %s %d %d %d %d\n",in_filename,out_filename,tau,cluster_ratio,clusteralg,verbose); + printf("%s %s %d %d %d %d\n",in_filename,out_filename,tau,cluster_ratio,clusteralg,verbose); + + // init the "gstack_t" data structure, which will contain the information on + // the sequences to analyze + gstack_t *uSQ = read_file(inputf1, inputf2, verbose); int out = starcode( - inputf1, - inputf2, + uSQ, outputf1, outputf2, tau, diff --git a/src/starcode.c b/src/starcode.c index a14523b..c079b20 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -204,7 +204,6 @@ void run_plan (mtplan_t *, int, int); gstack_t * read_rawseq (FILE *, gstack_t *); gstack_t * read_fasta (FILE *, gstack_t *); gstack_t * read_fastq (FILE *, gstack_t *); -gstack_t * read_file (FILE *, FILE *, int); gstack_t * read_PE_fastq (FILE *, FILE *, gstack_t *); int seq2id (char *, int); gstack_t * seq2useq (gstack_t*, int); @@ -630,13 +629,6 @@ starcode // MESSAGE PASSING ALGORITHM // - propt_t propt = { - .first = {0}, - .showclusters = showclusters, - .showids = showids, - .pe_fastq = PE_FASTQ == FORMAT, - }; - if (CLUSTERALG == MP_CLUSTER) { if (verbose) fprintf(stderr, "message passing clustering\n"); From 33cb2a12d073021398f4b0502701d6ecb5b9908c Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Tue, 12 Jun 2018 14:26:23 +0200 Subject: [PATCH 10/31] put flag check in separate function --- src/main-starcode.c | 48 ++--------- src/starcode.c | 188 ++++++++++++++++++++++++++++++++++++++++++++ src/starcode.h | 43 +++++++++- 3 files changed, 235 insertions(+), 44 deletions(-) diff --git a/src/main-starcode.c b/src/main-starcode.c index 6444eb0..87b4db9 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -30,8 +30,6 @@ #include "starcode.h" #include "trie.h" -#define ERRM "starcode error:" - // Prototypes for utilities of the main. char * outname (char *); void say_usage (void); @@ -355,43 +353,11 @@ main( } } - // Check options compatibility. // - if (nr_flag && (cl_flag || id_flag)) { - fprintf(stderr, - "%s --non-redundant flag is incompatible with " - "--print-clusters and --seq-id\n", ERRM); - say_usage(); - return EXIT_FAILURE; - } - if (input != UNSET && (input1 != UNSET || input2 != UNSET)) { - fprintf(stderr, - "%s --input and --input1/2 are incompatible\n", ERRM); - say_usage(); - return EXIT_FAILURE; - } - if (input1 == UNSET && input2 != UNSET) { - fprintf(stderr, "%s --input2 set without --input1\n", ERRM); - say_usage(); - return EXIT_FAILURE; - } - if (input2 == UNSET && input1 != UNSET) { - fprintf(stderr, "%s --input1 set without --input2\n", ERRM); - say_usage(); - return EXIT_FAILURE; - } - if (nr_flag && output != UNSET && - (input1 != UNSET || input2 != UNSET)) { - fprintf(stderr, "%s cannot specify --output for paired-end " - "fastq file with --non-redundant\n", ERRM); - say_usage(); - return EXIT_FAILURE; - } - if (sp_flag && cp_flag) { - fprintf(stderr, "%s --sphere and --connected-comp are " - "incompatible\n", ERRM); - say_usage(); - return EXIT_FAILURE; - } + // set default input and check flag compatibility + input_compatibility_t ic = check_input (nr_flag,cl_flag,id_flag,sp_flag,cp_flag,vb_flag, + &threads,&cluster_ratio,input1,input2,input,output); + if (ic != INPUT_OK) return EXIT_FAILURE; + // Set output type. // int output_type; @@ -483,10 +449,6 @@ main( outputf1 = stdout; } - // Set remaining default options. - if (threads < 0) threads = 1; - if (cluster_ratio < 0) cluster_ratio = 5; - if (vb_flag) { fprintf(stderr, "running starcode with %d thread%s\n", threads, threads > 1 ? "s" : ""); diff --git a/src/starcode.c b/src/starcode.c index c079b20..71afc92 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -570,6 +570,194 @@ print_starcode_output OUTPUTF2 = NULL; } +input_compatibility_t +check_input +( + int nr_flag, + int cl_flag, + int id_flag, + int sp_flag, + int cp_flag, + int vb_flag, + int * threads, + int * cluster_ratio, + char *input1, + char *input2, + char *input, + char *output +) +{ + char * const UNSET = "unset"; + + // Check options compatibility. // + if (nr_flag && (cl_flag || id_flag)) { + fprintf(stderr, + "%s --non-redundant flag is incompatible with " + "--print-clusters and --seq-id\n", ERRM); + say_usage(); + return NR_CL_ID_INCOMPATIBILITY; + } + if (input != UNSET && (input1 != UNSET || input2 != UNSET)) { + fprintf(stderr, + "%s --input and --input1/2 are incompatible\n", ERRM); + say_usage(); + return INPUT_INPUT12_INCOMPATIBILITY; + } + if (input1 == UNSET && input2 != UNSET) { + fprintf(stderr, "%s --input2 set without --input1\n", ERRM); + say_usage(); + return ONLY_INPUT2_INCOMPATIBILITY; + } + if (input2 == UNSET && input1 != UNSET) { + fprintf(stderr, "%s --input1 set without --input2\n", ERRM); + say_usage(); + return ONLY_INPUT1_INCOMPATIBILITY; + } + if (nr_flag && output != UNSET && + (input1 != UNSET || input2 != UNSET)) { + fprintf(stderr, "%s cannot specify --output for paired-end " + "fastq file with --non-redundant\n", ERRM); + say_usage(); + return NR_OUTPUT_INCOMPATIBILITY; + } + if (sp_flag && cp_flag) { + fprintf(stderr, "%s --sphere and --connected-comp are " + "incompatible\n", ERRM); + say_usage(); + return SP_CP_INCOMPATIBILITY; + } + + // Set remaining default options. + if (*threads < 0) *threads = 1; + if (*cluster_ratio < 0) *cluster_ratio = 5; + + // all went well: return + return INPUT_OK; +} + + +output_t +set_output_type +( + int nr_flag +) +{ + // Set output type. // + int output_type; + if (nr_flag) output_type = NRED_OUTPUT; + else output_type = DEFAULT_OUTPUT; + return output_type; +} + +cluster_t set_cluster_alg +( + int cp_flag, + int sp_flag +) +{ + int cluster_alg; + if (cp_flag) cluster_alg = COMPONENTS_CLUSTER; + else if (sp_flag) cluster_alg = SPHERES_CLUSTER; + else cluster_alg = MP_CLUSTER; + return cluster_alg; +} + + +starcode_io_check +set_input_and_output +( + starcode_io_t *io, + char * input1, + char * input2, + char * input, + char * output1, + char * output2, + char * output, + int nr_flag +) +{ + char * const UNSET = "unset"; + + // Set input file(s). // + io->inputf1 = NULL; + io->inputf2 = NULL; + + // Set output file(s). // + io->outputf1 = NULL; + io->outputf2 = NULL; + + if (input != UNSET) { + io->inputf1 = fopen(input, "r"); + if (io->inputf1 == NULL) { + fprintf(stderr, "%s cannot open file %s\n", ERRM, input); + say_usage(); + return IO_FILERR; + } + } + else if (input1 != UNSET) { + io->inputf1 = fopen(input1, "r"); + if (io->inputf1 == NULL) { + fprintf(stderr, "%s cannot open file %s\n", ERRM, input1); + say_usage(); + return IO_FILERR; + } + io->inputf2 = fopen(input2, "r"); + if (io->inputf2 == NULL) { + fprintf(stderr, "%s cannot open file %s\n", ERRM, input2); + say_usage(); + return IO_FILERR; + } + } + else { + io->inputf1 = stdin; + } + + if (output != UNSET) { + io->outputf1 = fopen(output, "w"); + if (io->outputf1 == NULL) { + fprintf(stderr, "%s cannot write to file %s\n", ERRM, output); + say_usage(); + return IO_FILERR; + } + } + else if (nr_flag && input1 != UNSET && input2 != UNSET) { + // Set default names as inputX-starcode.fastq + if (output1 == UNSET) { + output1 = outname(input1); + io->outputf1 = fopen(output1, "w"); + free(output1); + } else { + io->outputf1 = fopen(output1, "w"); + } + + if (io->outputf1 == NULL) { + fprintf(stderr, + "%s cannot write to file %s\n", ERRM, outname(input1)); + say_usage(); + return IO_FILERR; + } + + if (output2 == UNSET) { + output2 = outname(input2); + io->outputf2 = fopen(output2, "w"); + free(output2); + } else { + io->outputf2 = fopen(output2, "w"); + } + + if (io->outputf2 == NULL) { + fprintf(stderr, + "%s cannot write to file %s\n", ERRM, outname(input2)); + say_usage(); + return IO_FILERR; + } + } + else { + io->outputf1 = stdout; + } + return IO_OK; +} + int starcode ( diff --git a/src/starcode.h b/src/starcode.h index 0ca08ff..d81ab1d 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -31,6 +31,8 @@ #define VERSION "starcode-v1.2 20-04-2018" #define STARCODE_MAX_TAU 8 +#define ERRM "starcode error:" + typedef enum { DEFAULT_OUTPUT, CLUSTER_OUTPUT, @@ -50,7 +52,46 @@ read_file FILE * inputf2, const int verbose ); - + +typedef enum { + INPUT_OK, + NR_CL_ID_INCOMPATIBILITY, + INPUT_INPUT12_INCOMPATIBILITY, + ONLY_INPUT2_INCOMPATIBILITY, + ONLY_INPUT1_INCOMPATIBILITY, + NR_OUTPUT_INCOMPATIBILITY, + SP_CP_INCOMPATIBILITY, +} input_compatibility_t; + +input_compatibility_t +check_input +( + int nr_flag, + int cl_flag, + int id_flag, + int sp_flag, + int cp_flag, + int vb_flag, + int * threads, + int * cluster_ratio, + char *input1, + char *input2, + char *input, + char *output +); + +typedef struct { + FILE *inputf1; + FILE *inputf2; + FILE *outputf1; + FILE *outputf2; +} starcode_io_t; + +typedef enum { + IO_OK, + IO_FILERR +} starcode_io_check; + int starcode( gstack_t *uSQ, FILE *outputf1, From af373049177fe6a3066b8a4e87efde7e2bd1c768 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Tue, 12 Jun 2018 14:50:35 +0200 Subject: [PATCH 11/31] fixed all compiler warnings --- src/main-starcode.c | 69 +------------------------------- src/starcode.c | 96 +++++++++++++++++++++++++++++++++------------ src/starcode.h | 88 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 158 insertions(+), 95 deletions(-) diff --git a/src/main-starcode.c b/src/main-starcode.c index 87b4db9..22ff920 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -31,47 +31,9 @@ #include "trie.h" // Prototypes for utilities of the main. -char * outname (char *); -void say_usage (void); void say_version (void); void SIGSEGV_handler (int); -char *USAGE = -"\n" -"Usage:" -" starcode [options]\n" -"\n" -" general options:\n" -" -d --dist: maximum Levenshtein distance (default auto)\n" -" -t --threads: number of concurrent threads (default 1)\n" -" -q --quiet: quiet output (default verbose)\n" -" -v --version: display version and exit\n" -"\n" -" cluster options: (default algorithm: message passing)\n" -" -r --cluster-ratio: min size ratio for merging clusters in\n" -" message passing (default 5)\n" -" -s --sphere: use sphere clustering algorithm\n" -" -c --connected-comp: cluster connected components\n" -"\n" -" input/output options (single file, default)\n" -" -i --input: input file (default stdin)\n" -" -o --output: output file (default stdout)\n" -"\n" -" input options (paired-end fastq files)\n" -" -1 --input1: input file 1\n" -" -2 --input2: input file 2\n" -"\n" -" output options (paired-end fastq files, --non-redundant only)\n" -" --output1: output file1 (default input1-starcode.fastq)\n" -" --output2: output file2 (default input2-starcode.fastq)\n" -"\n" -" output format options\n" -" --non-redundant: remove redundant sequences from input file(s)\n" -" --print-clusters: outputs cluster compositions\n" -" --seq-id: print sequence id numbers (1-based)\n"; - - -void say_usage(void) { fprintf(stderr, "%s\n", USAGE); } void say_version(void) { fprintf(stderr, VERSION "\n"); } void SIGSEGV_handler(int sig) { @@ -88,35 +50,6 @@ void SIGSEGV_handler(int sig) { } -char * -outname -( - char *path -) -{ - - char * name = calloc(320,1); - if (strlen(path) > 310) { - fprintf(stderr, "input file name too long (%s)\n", path); - abort(); - } - - // Find final dot, append "-starcode" just before. - // If no final dot, just append starcode as suffix. - char *c = strrchr(path, '.'); - if (c == NULL) { - sprintf(name, "%s-starcode", path); - } - else { - *c = '\0'; - sprintf(name, "%s-starcode.%s", path, c+1); - *c = '.'; - } - - return (char *) name; - -} - int main( @@ -354,7 +287,7 @@ main( } // set default input and check flag compatibility - input_compatibility_t ic = check_input (nr_flag,cl_flag,id_flag,sp_flag,cp_flag,vb_flag, + input_compatibility_t ic = check_input (nr_flag,cl_flag,id_flag,sp_flag,cp_flag, &threads,&cluster_ratio,input1,input2,input,output); if (ic != INPUT_OK) return EXIT_FAILURE; diff --git a/src/starcode.c b/src/starcode.c index 71afc92..9ea654a 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -80,7 +80,6 @@ static const char capitalize[128] = { 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127 }; -struct useq_t; struct c_t; struct match_t; @@ -92,33 +91,16 @@ typedef enum { UNSET, } format_t; -typedef struct useq_t useq_t; typedef struct c_t ustack_t; typedef struct match_t match_t; typedef struct mtplan_t mtplan_t; typedef struct mttrie_t mttrie_t; typedef struct mtjob_t mtjob_t; typedef struct lookup_t lookup_t; -typedef struct propt_t propt_t; typedef struct sortargs_t sortargs_t; -// The field 'seqid' is either an id number for -// the unique sequence or a pointer to a struct -// containing information about the matches. This -// creates some confusion in the code at times. -// See function 'transfer_useq_ids()'. -struct useq_t { - int count; // Number of sequences - unsigned int nids; // Number of associated IDs - char * seq; // Sequence - char * info; // Multi-function text field - gstack_t ** matches; // Matches stratified by distance - struct useq_t * canonical; // Pointer to canonical sequence - int * seqid; // Unique ID / pointer (see above). -}; - struct lookup_t { int slen; int kmers; @@ -170,13 +152,6 @@ struct mtjob_t { char * active; }; -struct propt_t { - char first[5]; - int pe_fastq; - int showclusters; - int showids; -}; - int size_order (const void *a, const void *b); int addmatch (useq_t*, useq_t*, int, int); @@ -224,6 +199,71 @@ static output_t OUTPUTT = DEFAULT_OUTPUT; // output type static cluster_t CLUSTERALG = MP_CLUSTER; // cluster algorithm static int CLUSTER_RATIO = 5; // min parent/child ratio // to link clusters +char * +outname +( + char *path +) +{ + + char * name = calloc(320,1); + if (strlen(path) > 310) { + fprintf(stderr, "input file name too long (%s)\n", path); + abort(); + } + + // Find final dot, append "-starcode" just before. + // If no final dot, just append starcode as suffix. + char *c = strrchr(path, '.'); + if (c == NULL) { + sprintf(name, "%s-starcode", path); + } + else { + *c = '\0'; + sprintf(name, "%s-starcode.%s", path, c+1); + *c = '.'; + } + + return (char *) name; + +} + +void say_usage(void) { + char *USAGE = + "\n" + "Usage:" + " starcode [options]\n" + "\n" + " general options:\n" + " -d --dist: maximum Levenshtein distance (default auto)\n" + " -t --threads: number of concurrent threads (default 1)\n" + " -q --quiet: quiet output (default verbose)\n" + " -v --version: display version and exit\n" + "\n" + " cluster options: (default algorithm: message passing)\n" + " -r --cluster-ratio: min size ratio for merging clusters in\n" + " message passing (default 5)\n" + " -s --sphere: use sphere clustering algorithm\n" + " -c --connected-comp: cluster connected components\n" + "\n" + " input/output options (single file, default)\n" + " -i --input: input file (default stdin)\n" + " -o --output: output file (default stdout)\n" + "\n" + " input options (paired-end fastq files)\n" + " -1 --input1: input file 1\n" + " -2 --input2: input file 2\n" + "\n" + " output options (paired-end fastq files, --non-redundant only)\n" + " --output1: output file1 (default input1-starcode.fastq)\n" + " --output2: output file2 (default input2-starcode.fastq)\n" + "\n" + " output format options\n" + " --non-redundant: remove redundant sequences from input file(s)\n" + " --print-clusters: outputs cluster compositions\n" + " --seq-id: print sequence id numbers (1-based)\n"; + fprintf(stderr, "%s\n", USAGE); +} void @@ -293,6 +333,7 @@ print_ids propt_t propt ) { + (void) propt; // If there are more than one ID then 'u->seqid' is // a pointer to the IDs. @@ -320,6 +361,7 @@ print_nr_raw propt_t propt ) { + (void) propt; fprintf(OUTPUTF1, "%s\n", u->seq); } @@ -331,6 +373,7 @@ print_nr_fasta propt_t propt ) { + (void) propt; fprintf(OUTPUTF1, "%s\n%s\n", u->info, u->seq); } @@ -342,6 +385,7 @@ print_nr_fastq propt_t propt ) { + (void) propt; char header[M] = {0}; char quality[M] = {0}; sscanf(u->info, "%s\n%s", header, quality); @@ -357,6 +401,7 @@ print_nr_pe_fastq propt_t propt ) { + (void) propt; char head1[M] = {0}; char head2[M] = {0}; char qual1[M] = {0}; @@ -578,7 +623,6 @@ check_input int id_flag, int sp_flag, int cp_flag, - int vb_flag, int * threads, int * cluster_ratio, char *input1, diff --git a/src/starcode.h b/src/starcode.h index d81ab1d..c60ed84 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -33,6 +33,33 @@ #define ERRM "starcode error:" +char * outname (char *); + +struct useq_t; +typedef struct useq_t useq_t; +// The field 'seqid' is either an id number for +// the unique sequence or a pointer to a struct +// containing information about the matches. This +// creates some confusion in the code at times. +// See function 'transfer_useq_ids()'. +struct useq_t { + int count; // Number of sequences + unsigned int nids; // Number of associated IDs + char * seq; // Sequence + char * info; // Multi-function text field + gstack_t ** matches; // Matches stratified by distance + struct useq_t * canonical; // Pointer to canonical sequence + int * seqid; // Unique ID / pointer (see above). +}; + +typedef struct propt_t propt_t; +struct propt_t { + char first[5]; + int pe_fastq; + int showclusters; + int showids; +}; + typedef enum { DEFAULT_OUTPUT, CLUSTER_OUTPUT, @@ -71,7 +98,6 @@ check_input int id_flag, int sp_flag, int cp_flag, - int vb_flag, int * threads, int * cluster_ratio, char *input1, @@ -87,11 +113,71 @@ typedef struct { FILE *outputf2; } starcode_io_t; +output_t set_output_type (int); + typedef enum { IO_OK, IO_FILERR } starcode_io_check; +starcode_io_check +set_input_and_output +( + starcode_io_t *io, + char * input1, + char * input2, + char * input, + char * output1, + char * output2, + char * output, + int nr_flag +); + +cluster_t set_cluster_alg (int, int); + +void say_usage (void); + +void +head_default +( + useq_t * u, + propt_t propt +); + +void +members_mp_default +( + useq_t * u, + propt_t propt +); + +void +members_sc_default +( + useq_t * u, + propt_t propt +); + +// print functions +void print_ids (useq_t *, propt_t); +void print_nr_raw (useq_t *, propt_t); +void print_nr_fasta (useq_t *, propt_t); +void print_nr_fastq (useq_t *, propt_t); +void print_nr_pe_fastq (useq_t *, propt_t); + +void +print_starcode_output +( + FILE *outputf1, + FILE *outputf2, + gstack_t *clusters, + const int clusteralg, + const int showclusters, + const int showids, + const int outputt, + const int verbose +); + int starcode( gstack_t *uSQ, FILE *outputf1, From 35f709679e11b9eb640c92de25b82c4f86834106 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Tue, 12 Jun 2018 16:11:27 +0200 Subject: [PATCH 12/31] new function "starcode" takes list as input --- pystarcode/pystarcode.c | 107 +++++++++++++++++++++++++++++++++++++++- src/starcode.c | 20 -------- src/starcode.h | 21 ++++++++ 3 files changed, 126 insertions(+), 22 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index a0eeac6..5e444cf 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -5,18 +5,28 @@ #define MAX_STR_LENGTH 2048 +// module docstrings static char module_docstring[] = "This module is a Python interface to Starcode"; +static char starcode_c_docstring[] = + "Starcode invocation"; static char starcode_docstring[] = "Starcode invocation"; +// the module methods declaration +static PyObject* pystarcode_starcode_c(PyObject *self, PyObject *args, PyObject *kwargs); + static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs); static PyMethodDef module_methods[] = { - {"starcode", (PyCFunction) pystarcode_starcode, METH_VARARGS|METH_KEYWORDS, starcode_docstring}, + {"starcode_c", (PyCFunction) pystarcode_starcode_c, + METH_VARARGS|METH_KEYWORDS, starcode_c_docstring}, + {"starcode", (PyCFunction) pystarcode_starcode, + METH_VARARGS|METH_KEYWORDS, starcode_docstring}, {NULL, NULL, 0, NULL} }; +// init module method PyMODINIT_FUNC initpystarcode(void) { PyObject *m = Py_InitModule3("pystarcode", module_methods, module_docstring); @@ -24,6 +34,7 @@ PyMODINIT_FUNC initpystarcode(void) return; } +// a python-friendly file opener FILE *py_fopen(const char *fname, const char *mode) { FILE *f = fopen(fname, mode); @@ -38,7 +49,9 @@ FILE *py_fopen(const char *fname, const char *mode) return f; } -static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) +// this method is used to invoke exactly starcode from within python, without +// any system call +static PyObject* pystarcode_starcode_c(PyObject *self, PyObject *args, PyObject *kwargs) { // Parse the input char *in_filename, *out_filename; @@ -109,3 +122,93 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k return ret; } + +static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) +{ + // Parse the input + int tau, cluster_ratio; + int clusteralg = 0; + int verbose = 1; + int thrmax = 4; + int showclusters = 1; + int showids = 0; + int outputt = 0; + PyObject * in_list; + + static char *kwlist[] = { + "input_list", + "dist", + "cluster_ratio", + "clusteralg", + "verbose", + "threads", + "showclusters", + "showids", + "outputt", + NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!ii|iiiiii", kwlist, + &PyList_Type, + &in_list, + &tau, + &cluster_ratio, + &clusteralg, + &verbose, + &thrmax, + &showclusters, + &showids, + &outputt)) + return NULL; + + // get the number of sequence elements in the input list + int numLines = PyList_Size(in_list); + + // init the return object + PyObject * retList = PyList_New(numLines); /* the list to return */ + + // should raise an error here + if (numLines < 0) { + PyErr_SetString(PyExc_ValueError, "Input element is not list."); + return NULL; + } + + // init the structure that we will pass to the starcode core function + gstack_t *uSQ = new_gstack(); + + // build the list of sequences to pass to the starcode core function + for (size_t i=0; inids = 1; + new->seqid = (void *)(unsigned long)uSQ->nitems+1; + push(new, &uSQ); + + if (PyList_SetItem(retList, i, strObj) == -1) + Py_RETURN_FALSE; + } + + return retList; +} diff --git a/src/starcode.c b/src/starcode.c index 9ea654a..f86464a 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -50,25 +50,6 @@ #define min(a,b) (((a) < (b)) ? (a) : (b)) #define max(a,b) (((a) > (b)) ? (a) : (b)) -static const int valid_DNA_char[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0, - 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, - 0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0, - 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -}; - static const char capitalize[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, @@ -172,7 +153,6 @@ int lut_insert (lookup_t *, useq_t *); int lut_search (lookup_t *, useq_t *); void message_passing_clustering (gstack_t*, int); lookup_t * new_lookup (int, int, int); -useq_t * new_useq (int, char *, char *); int pad_useq (gstack_t*, int*); mtplan_t * plan_mt (int, int, int, int, gstack_t *); void run_plan (mtplan_t *, int, int); diff --git a/src/starcode.h b/src/starcode.h index c60ed84..57efc4d 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -33,6 +33,25 @@ #define ERRM "starcode error:" +static const int valid_DNA_char[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + char * outname (char *); struct useq_t; @@ -52,6 +71,8 @@ struct useq_t { int * seqid; // Unique ID / pointer (see above). }; +useq_t * new_useq (int, char *, char *); + typedef struct propt_t propt_t; struct propt_t { char first[5]; From bbca4abb27e1df54b58ac042505141a2a51315b9 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Tue, 12 Jun 2018 17:40:19 +0200 Subject: [PATCH 13/31] new layout of output, doesn't work yet --- pystarcode/pystarcode.c | 57 +++++++++++++++++++++++++++++++---------- src/main-starcode.c | 15 ++++++----- src/starcode.c | 28 +++++--------------- src/starcode.h | 8 ++---- 4 files changed, 59 insertions(+), 49 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 5e444cf..d439eec 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -102,23 +102,21 @@ static PyObject* pystarcode_starcode_c(PyObject *self, PyObject *args, PyObject // the sequences to analyze gstack_t *uSQ = read_file(inputf1, inputf2, verbose); - int out = starcode( + gstack_t *result = starcode( uSQ, - outputf1, - outputf2, tau, verbose, thrmax, clusteralg, cluster_ratio, - showclusters, - showids, - outputt + showids ); - // int out = 0; + // print output + print_starcode_output(outputf1, outputf2, + result, clusteralg, showclusters, showids, outputt, verbose); - PyObject *ret = Py_BuildValue("h", out); + PyObject *ret = Py_BuildValue("h", 0); return ret; } @@ -162,9 +160,6 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k // get the number of sequence elements in the input list int numLines = PyList_Size(in_list); - // init the return object - PyObject * retList = PyList_New(numLines); /* the list to return */ - // should raise an error here if (numLines < 0) { PyErr_SetString(PyExc_ValueError, "Input element is not list."); @@ -205,10 +200,44 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k new->nids = 1; new->seqid = (void *)(unsigned long)uSQ->nitems+1; push(new, &uSQ); + } + + // we're now ready to invoke the main starcode core function + gstack_t *clusters = starcode( + uSQ, + tau, + verbose, + thrmax, + clusteralg, + cluster_ratio, + showids + ); - if (PyList_SetItem(retList, i, strObj) == -1) - Py_RETURN_FALSE; + // init the return object + PyObject * d = PyDict_New(); + + // fill in the dictionary + PyObject *seq_list = PyList_New(0); + useq_t *first = (useq_t *) clusters->items[0]; + useq_t *canonical = first->canonical; + for (size_t i = 0 ; i < clusters->nitems ; i++) { + useq_t *u = (useq_t *) clusters->items[i]; + printf("%s\n", u->seq); + if (u->canonical != canonical) { + // Update canonical and set key of dictionary, reset list + // printf("Sequence %lu: Updating canonical: old = %s", i, canonical->seq); + canonical = u->canonical; + // printf(" new = %s\n", canonical->seq); + PyDict_SetItemString(d, canonical->seq, seq_list); + seq_list = PyList_New(0); + } + else { + // printf ("Sequence %lu: canonical = %s, current = %s\n", i, canonical->seq, u->seq); + PyObject *val = PyString_FromString(u->seq); + PyList_Append(seq_list, val); + } } - return retList; + // return the created dictionary + return d; } diff --git a/src/main-starcode.c b/src/main-starcode.c index 22ff920..a381e7a 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -393,26 +393,27 @@ main( return 1; } - int exitcode = + // invoke the main starcode routine + gstack_t *result = starcode( uSQ, - outputf1, - outputf2, dist, vb_flag, threads, cluster_alg, cluster_ratio, - cl_flag, - id_flag, - output_type + id_flag ); + // print output + print_starcode_output(outputf1, outputf2, + result, cluster_alg, cl_flag, id_flag, output_type, vb_flag); + if (inputf1 != stdin) fclose(inputf1); if (inputf2 != NULL) fclose(inputf2); if (outputf1 != stdout) fclose(outputf1); if (outputf2 != NULL) fclose(outputf2); - return exitcode; + return 0; } diff --git a/src/starcode.c b/src/starcode.c index f86464a..de73823 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -782,26 +782,19 @@ set_input_and_output return IO_OK; } -int +gstack_t * starcode ( gstack_t *uSQ, - FILE *outputf1, - FILE *outputf2, int tau, const int verbose, int thrmax, const int clusteralg, int parent_to_child, - const int showclusters, - const int showids, - const int outputt + const int showids ) { - OUTPUTF1 = outputf1; - OUTPUTF2 = outputf2; - OUTPUTT = outputt; CLUSTERALG = clusteralg; CLUSTER_RATIO = parent_to_child; @@ -850,9 +843,7 @@ starcode // Sort in canonical order. qsort(uSQ->items, uSQ->nitems, sizeof(useq_t *), canonical_order); - // print output - print_starcode_output(outputf1, outputf2, - uSQ, CLUSTERALG, showclusters, showids, OUTPUTT, verbose); + return uSQ; // // SPHERES ALGORITHM @@ -866,9 +857,7 @@ starcode // Sort in count order. qsort(uSQ->items, uSQ->nitems, sizeof(useq_t *), count_order); - // print output - print_starcode_output(outputf1, outputf2, - uSQ, CLUSTERALG, showclusters, showids, OUTPUTT, verbose); + return uSQ; /* * CONNECTED COMPONENTS ALGORITHM @@ -880,15 +869,10 @@ starcode // Returns a stack containing stacks of clusters, where clusters->item[i]->item[0] is // the centroid of the i-th cluster. The output is sorted by cluster count, which is // stored in centroid->count. - gstack_t * clusters = compute_clusters(uSQ); - - // print output - print_starcode_output(outputf1, outputf2, - clusters, CLUSTERALG, showclusters, showids, OUTPUTT, verbose); + return compute_clusters(uSQ); } - return 0; - + return NULL; } void diff --git a/src/starcode.h b/src/starcode.h index 57efc4d..f97cdb8 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -199,18 +199,14 @@ print_starcode_output const int verbose ); -int starcode( +gstack_t * starcode( gstack_t *uSQ, - FILE *outputf1, - FILE *outputf2, int tau, const int verbose, int thrmax, const int clusteralg, int parent_to_child, - const int showclusters, - const int showids, - const int outputt + const int showids ); #endif From aef676497c45e44536c225974fd751c2fdcc21a9 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Wed, 13 Jun 2018 17:04:48 +0200 Subject: [PATCH 14/31] added test.py to test --- pystarcode/test.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 pystarcode/test.py diff --git a/pystarcode/test.py b/pystarcode/test.py new file mode 100644 index 0000000..d97ce38 --- /dev/null +++ b/pystarcode/test.py @@ -0,0 +1,8 @@ +import pystarcode + +seq_list = [] +with open('iPCR_rep1_filtered.txt','r') as f : + for line in f : + seq_list.append(line.strip('\n')) +d = pystarcode.starcode(seq_list,2,5) +# print d From a7778101b30a84f3ed072d6c332bdafce7d64a3f Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Wed, 13 Jun 2018 17:53:40 +0200 Subject: [PATCH 15/31] fixed check_input function, takes flags not chars --- src/main-starcode.c | 57 +++++++++++++++++++++++++++------------------ src/starcode.c | 53 +++++++++++++++++++++-------------------- src/starcode.h | 26 +++++++++++++-------- 3 files changed, 78 insertions(+), 58 deletions(-) diff --git a/src/main-starcode.c b/src/main-starcode.c index a381e7a..99f3f4d 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -72,15 +72,20 @@ main( int dist = -1; int threads = -1; int cluster_ratio = -1; - - // Unset options (value 'UNSET'). - char * const UNSET = "unset"; - char * input = UNSET; - char * input1 = UNSET; - char * input2 = UNSET; - char * output = UNSET; - char * output1 = UNSET; - char * output2 = UNSET; + int input_set = 0; + int input1_set = 0; + int input2_set = 0; + int output_set = 0; + int output1_set = 0; + int output2_set = 0; + + // file names + char * input; + char * input1; + char * input2; + char * output; + char * output1; + char * output2; if (argc == 1 && isatty(0)) { @@ -125,8 +130,9 @@ main( break; case '1': - if (input1 == UNSET) { + if (!input1_set) { input1 = optarg; + input1_set = 1; } else { fprintf(stderr, "%s --input1 set more than once\n", ERRM); @@ -136,8 +142,9 @@ main( break; case '2': - if (input2 == UNSET) { + if (!input2_set) { input2 = optarg; + input2_set = 1; } else { fprintf(stderr, "%s --input2 set more than once\n", ERRM); @@ -147,8 +154,9 @@ main( break; case '3': - if (output1 == UNSET) { + if (!output1_set) { output1 = optarg; + output1_set = 1; } else { fprintf(stderr, "%s --output1 set more than once\n", ERRM); @@ -158,8 +166,9 @@ main( break; case '4': - if (output2 == UNSET) { + if (!output2_set) { output2 = optarg; + output2_set = 1; } else { fprintf(stderr, "%s --output2 set more than once\n", ERRM); @@ -192,8 +201,9 @@ main( return 0; case 'i': - if (input == UNSET) { + if (!input_set) { input = optarg; + input_set = 1; } else { fprintf(stderr, "%s --input set more than once\n", ERRM); @@ -203,8 +213,9 @@ main( break; case 'o': - if (output == UNSET) { + if (!output_set) { output = optarg; + output_set = 1; } else { fprintf(stderr, "%s --output set more than once\n", ERRM); @@ -276,7 +287,7 @@ main( if (optind < argc) { // If no input is specified, assume first positional argument // is the name of the input file. - if ((optind == argc-1) && (input == UNSET && input1 == UNSET)) { + if ((optind == argc-1) && (!input_set && !input1_set)) { input = argv[optind]; } else { @@ -288,7 +299,7 @@ main( // set default input and check flag compatibility input_compatibility_t ic = check_input (nr_flag,cl_flag,id_flag,sp_flag,cp_flag, - &threads,&cluster_ratio,input1,input2,input,output); + &threads,&cluster_ratio,input_set,input1_set,input2_set,output_set); if (ic != INPUT_OK) return EXIT_FAILURE; @@ -312,7 +323,7 @@ main( FILE *outputf1 = NULL; FILE *outputf2 = NULL; - if (input != UNSET) { + if (input_set) { inputf1 = fopen(input, "r"); if (inputf1 == NULL) { fprintf(stderr, "%s cannot open file %s\n", ERRM, input); @@ -320,7 +331,7 @@ main( return EXIT_FAILURE; } } - else if (input1 != UNSET) { + else if (input1_set) { inputf1 = fopen(input1, "r"); if (inputf1 == NULL) { fprintf(stderr, "%s cannot open file %s\n", ERRM, input1); @@ -338,7 +349,7 @@ main( inputf1 = stdin; } - if (output != UNSET) { + if (output_set) { outputf1 = fopen(output, "w"); if (outputf1 == NULL) { fprintf(stderr, "%s cannot write to file %s\n", ERRM, output); @@ -346,9 +357,9 @@ main( return EXIT_FAILURE; } } - else if (nr_flag && input1 != UNSET && input2 != UNSET) { + else if (nr_flag && input1_set && input2_set) { // Set default names as inputX-starcode.fastq - if (output1 == UNSET) { + if (!output1_set) { output1 = outname(input1); outputf1 = fopen(output1, "w"); free(output1); @@ -363,7 +374,7 @@ main( return EXIT_FAILURE; } - if (output2 == UNSET) { + if (!output2_set) { output2 = outname(input2); outputf2 = fopen(output2, "w"); free(output2); diff --git a/src/starcode.c b/src/starcode.c index de73823..b8c9bd2 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -605,14 +605,12 @@ check_input int cp_flag, int * threads, int * cluster_ratio, - char *input1, - char *input2, - char *input, - char *output + int input_set, + int input1_set, + int input2_set, + int output_set ) { - char * const UNSET = "unset"; - // Check options compatibility. // if (nr_flag && (cl_flag || id_flag)) { fprintf(stderr, @@ -621,24 +619,25 @@ check_input say_usage(); return NR_CL_ID_INCOMPATIBILITY; } - if (input != UNSET && (input1 != UNSET || input2 != UNSET)) { + if (input_set && (input1_set || input2_set)) { fprintf(stderr, "%s --input and --input1/2 are incompatible\n", ERRM); say_usage(); return INPUT_INPUT12_INCOMPATIBILITY; } - if (input1 == UNSET && input2 != UNSET) { + if (!input1_set && input2_set) { fprintf(stderr, "%s --input2 set without --input1\n", ERRM); + printf ("input1_set = %d, input2_set = %d\n", input1_set, input2_set); say_usage(); return ONLY_INPUT2_INCOMPATIBILITY; } - if (input2 == UNSET && input1 != UNSET) { + if (!input2_set && input1_set) { fprintf(stderr, "%s --input1 set without --input2\n", ERRM); say_usage(); return ONLY_INPUT1_INCOMPATIBILITY; } - if (nr_flag && output != UNSET && - (input1 != UNSET || input2 != UNSET)) { + if (nr_flag && output_set && + (input1_set || input2_set)) { fprintf(stderr, "%s cannot specify --output for paired-end " "fastq file with --non-redundant\n", ERRM); say_usage(); @@ -691,17 +690,21 @@ starcode_io_check set_input_and_output ( starcode_io_t *io, - char * input1, - char * input2, - char * input, - char * output1, - char * output2, - char * output, + char *input, + char *input1, + char *input2, + char *output, + char *output1, + char *output2, + int input1_set, + int input2_set, + int input_set, + int output1_set, + int output2_set, + int output_set, int nr_flag ) { - char * const UNSET = "unset"; - // Set input file(s). // io->inputf1 = NULL; io->inputf2 = NULL; @@ -710,7 +713,7 @@ set_input_and_output io->outputf1 = NULL; io->outputf2 = NULL; - if (input != UNSET) { + if (input_set) { io->inputf1 = fopen(input, "r"); if (io->inputf1 == NULL) { fprintf(stderr, "%s cannot open file %s\n", ERRM, input); @@ -718,7 +721,7 @@ set_input_and_output return IO_FILERR; } } - else if (input1 != UNSET) { + else if (input1_set) { io->inputf1 = fopen(input1, "r"); if (io->inputf1 == NULL) { fprintf(stderr, "%s cannot open file %s\n", ERRM, input1); @@ -736,7 +739,7 @@ set_input_and_output io->inputf1 = stdin; } - if (output != UNSET) { + if (output_set) { io->outputf1 = fopen(output, "w"); if (io->outputf1 == NULL) { fprintf(stderr, "%s cannot write to file %s\n", ERRM, output); @@ -744,9 +747,9 @@ set_input_and_output return IO_FILERR; } } - else if (nr_flag && input1 != UNSET && input2 != UNSET) { + else if (nr_flag && input1_set && input2_set) { // Set default names as inputX-starcode.fastq - if (output1 == UNSET) { + if (!output1_set) { output1 = outname(input1); io->outputf1 = fopen(output1, "w"); free(output1); @@ -761,7 +764,7 @@ set_input_and_output return IO_FILERR; } - if (output2 == UNSET) { + if (!output2_set) { output2 = outname(input2); io->outputf2 = fopen(output2, "w"); free(output2); diff --git a/src/starcode.h b/src/starcode.h index f97cdb8..48e90bd 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -121,10 +121,10 @@ check_input int cp_flag, int * threads, int * cluster_ratio, - char *input1, - char *input2, - char *input, - char *output + int input_set, + int input1_set, + int input2_set, + int output_set ); typedef struct { @@ -145,12 +145,18 @@ starcode_io_check set_input_and_output ( starcode_io_t *io, - char * input1, - char * input2, - char * input, - char * output1, - char * output2, - char * output, + char *input, + char *input1, + char *input2, + char *output, + char *output1, + char *output2, + int input1_set, + int input2_set, + int input_set, + int output1_set, + int output2_set, + int output_set, int nr_flag ); From cbdf794b5a6ff25ddefb1b5d119ce83f05e0b6e8 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Wed, 13 Jun 2018 18:28:15 +0200 Subject: [PATCH 16/31] pystarcode.starcode returns a dictionary with starcoded barcodes --- pystarcode/pystarcode.c | 20 +++----------------- pystarcode/test.py | 5 +++-- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index d439eec..9b751d0 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -217,27 +217,13 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k PyObject * d = PyDict_New(); // fill in the dictionary - PyObject *seq_list = PyList_New(0); useq_t *first = (useq_t *) clusters->items[0]; useq_t *canonical = first->canonical; - for (size_t i = 0 ; i < clusters->nitems ; i++) { + PyDict_SetItemString(d, first->seq, PyString_FromString(canonical->seq)); + for (size_t i = 1 ; i < clusters->nitems ; i++) { useq_t *u = (useq_t *) clusters->items[i]; - printf("%s\n", u->seq); - if (u->canonical != canonical) { - // Update canonical and set key of dictionary, reset list - // printf("Sequence %lu: Updating canonical: old = %s", i, canonical->seq); - canonical = u->canonical; - // printf(" new = %s\n", canonical->seq); - PyDict_SetItemString(d, canonical->seq, seq_list); - seq_list = PyList_New(0); - } - else { - // printf ("Sequence %lu: canonical = %s, current = %s\n", i, canonical->seq, u->seq); - PyObject *val = PyString_FromString(u->seq); - PyList_Append(seq_list, val); - } + PyDict_SetItemString(d, u->seq, PyString_FromString(u->canonical->seq)); } - // return the created dictionary return d; } diff --git a/pystarcode/test.py b/pystarcode/test.py index d97ce38..b792dc8 100644 --- a/pystarcode/test.py +++ b/pystarcode/test.py @@ -1,8 +1,9 @@ import pystarcode seq_list = [] -with open('iPCR_rep1_filtered.txt','r') as f : +with open('small.txt','r') as f : for line in f : seq_list.append(line.strip('\n')) d = pystarcode.starcode(seq_list,2,5) -# print d +for key, l in d.iteritems() : + print key,l From c68dabd08095cdd7d1b0d47b8f831414b27708a0 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 14 Jun 2018 10:52:38 +0200 Subject: [PATCH 17/31] pystarcode.starcode outputs all the information that starcode does --- pystarcode/pystarcode.c | 36 ++++++++++++++++++++++++++++-------- pystarcode/test.py | 13 +++++++++++-- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 9b751d0..373c2da 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -213,17 +213,37 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k showids ); - // init the return object + // init the return object: a python dictionary for the sequence -> canonical + // association, and another dictionary for the canonical -> counts association PyObject * d = PyDict_New(); + PyObject * counts = PyDict_New(); - // fill in the dictionary - useq_t *first = (useq_t *) clusters->items[0]; - useq_t *canonical = first->canonical; - PyDict_SetItemString(d, first->seq, PyString_FromString(canonical->seq)); - for (size_t i = 1 ; i < clusters->nitems ; i++) { + // init the canonical sequence + useq_t *canonical = ((useq_t *)clusters->items[0])->canonical; + PyDict_SetItemString(counts, canonical->seq, PyInt_FromSize_t(canonical->count)); + + // fill in the dictionaries + for (size_t i = 0 ; i < clusters->nitems ; i++) { useq_t *u = (useq_t *) clusters->items[i]; PyDict_SetItemString(d, u->seq, PyString_FromString(u->canonical->seq)); + + // let's see if the current canonical is the same as the last one + if (u->canonical != canonical) { + + // in this case we update the canonical + canonical = u->canonical; + + // and we write the new element in the dictionary of counts + PyDict_SetItemString(counts, canonical->seq, PyInt_FromSize_t(canonical->count)); + } } - // return the created dictionary - return d; + + // let's now build the return object, which will be a tuple of the two + // dictionaries + PyObject *ret = PyTuple_New(2); + PyTuple_SetItem(ret, 0, counts); + PyTuple_SetItem(ret, 1, d); + + // return the created tuple + return ret; } diff --git a/pystarcode/test.py b/pystarcode/test.py index b792dc8..41924ea 100644 --- a/pystarcode/test.py +++ b/pystarcode/test.py @@ -1,9 +1,18 @@ import pystarcode +# parse file seq_list = [] with open('small.txt','r') as f : for line in f : seq_list.append(line.strip('\n')) -d = pystarcode.starcode(seq_list,2,5) + +# invoke starcode +counts, d = pystarcode.starcode(seq_list,2,5) + +# print counts output +for key, l in counts.iteritems() : + print '%s -> %d'%(key,l) + +# print output for key, l in d.iteritems() : - print key,l + print '%s -> %s'%(key,l) From 8efbd0c8c2da0746e9cc7739c1b1035cff30cfc9 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 14 Jun 2018 11:12:44 +0200 Subject: [PATCH 18/31] main starcode is now mostly a sequence of encapsulated functions --- src/main-starcode.c | 118 +++++++++----------------------------------- 1 file changed, 23 insertions(+), 95 deletions(-) diff --git a/src/main-starcode.c b/src/main-starcode.c index 99f3f4d..644fe3c 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -298,107 +298,35 @@ main( } // set default input and check flag compatibility - input_compatibility_t ic = check_input (nr_flag,cl_flag,id_flag,sp_flag,cp_flag, + int ic = check_input (nr_flag,cl_flag,id_flag,sp_flag,cp_flag, &threads,&cluster_ratio,input_set,input1_set,input2_set,output_set); if (ic != INPUT_OK) return EXIT_FAILURE; + // Set output type + int output_type = set_output_type(nr_flag); - // Set output type. // - int output_type; - if (nr_flag) output_type = NRED_OUTPUT; - else output_type = DEFAULT_OUTPUT; + // Set clustering algorithm + int cluster_alg = set_cluster_alg(cp_flag, sp_flag); - int cluster_alg; - if (cp_flag) cluster_alg = COMPONENTS_CLUSTER; - else if (sp_flag) cluster_alg = SPHERES_CLUSTER; - else cluster_alg = MP_CLUSTER; - - - - // Set input file(s). // - FILE *inputf1 = NULL; - FILE *inputf2 = NULL; - - // Set output file(s). // - FILE *outputf1 = NULL; - FILE *outputf2 = NULL; - - if (input_set) { - inputf1 = fopen(input, "r"); - if (inputf1 == NULL) { - fprintf(stderr, "%s cannot open file %s\n", ERRM, input); - say_usage(); - return EXIT_FAILURE; - } - } - else if (input1_set) { - inputf1 = fopen(input1, "r"); - if (inputf1 == NULL) { - fprintf(stderr, "%s cannot open file %s\n", ERRM, input1); - say_usage(); - return EXIT_FAILURE; - } - inputf2 = fopen(input2, "r"); - if (inputf2 == NULL) { - fprintf(stderr, "%s cannot open file %s\n", ERRM, input2); - say_usage(); - return EXIT_FAILURE; - } - } - else { - inputf1 = stdin; - } - - if (output_set) { - outputf1 = fopen(output, "w"); - if (outputf1 == NULL) { - fprintf(stderr, "%s cannot write to file %s\n", ERRM, output); - say_usage(); - return EXIT_FAILURE; - } - } - else if (nr_flag && input1_set && input2_set) { - // Set default names as inputX-starcode.fastq - if (!output1_set) { - output1 = outname(input1); - outputf1 = fopen(output1, "w"); - free(output1); - } else { - outputf1 = fopen(output1, "w"); - } - - if (outputf1 == NULL) { - fprintf(stderr, - "%s cannot write to file %s\n", ERRM, outname(input1)); - say_usage(); - return EXIT_FAILURE; - } - - if (!output2_set) { - output2 = outname(input2); - outputf2 = fopen(output2, "w"); - free(output2); - } else { - outputf2 = fopen(output2, "w"); - } - - if (outputf2 == NULL) { - fprintf(stderr, - "%s cannot write to file %s\n", ERRM, outname(input2)); - say_usage(); - return EXIT_FAILURE; - } - } - else { - outputf1 = stdout; - } + // Set starcode input and output files + starcode_io_t io; + int io_ok = set_input_and_output(&io, + input, input1, input2, + output, output1, output2, + input1_set, input2_set, input_set, + output1_set, output2_set, output_set, + nr_flag); + if (io_ok != IO_OK) return EXIT_FAILURE; + // if verbose flag is set, print some information for the user if (vb_flag) { fprintf(stderr, "running starcode with %d thread%s\n", threads, threads > 1 ? "s" : ""); fprintf(stderr, "reading input files\n"); } - gstack_t *uSQ = read_file(inputf1, inputf2, vb_flag); + + // initialize the "uSQ" stack with the input sequences + gstack_t *uSQ = read_file(io.inputf1, io.inputf2, vb_flag); if (uSQ == NULL || uSQ->nitems < 1) { fprintf(stderr, "input file empty\n"); return 1; @@ -417,13 +345,13 @@ main( ); // print output - print_starcode_output(outputf1, outputf2, + print_starcode_output(io.outputf1, io.outputf2, result, cluster_alg, cl_flag, id_flag, output_type, vb_flag); - if (inputf1 != stdin) fclose(inputf1); - if (inputf2 != NULL) fclose(inputf2); - if (outputf1 != stdout) fclose(outputf1); - if (outputf2 != NULL) fclose(outputf2); + if (io.inputf1 != stdin) fclose(io.inputf1); + if (io.inputf2 != NULL) fclose(io.inputf2); + if (io.outputf1 != stdout) fclose(io.outputf1); + if (io.outputf2 != NULL) fclose(io.outputf2); return 0; From 8010479d079c643ce865c690d6664caa6a78a5eb Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 14 Jun 2018 11:18:26 +0200 Subject: [PATCH 19/31] removed starcode_c function from python module --- pystarcode/pystarcode.c | 80 +---------------------------------------- 1 file changed, 1 insertion(+), 79 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 373c2da..e8fbfa5 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -8,19 +8,13 @@ // module docstrings static char module_docstring[] = "This module is a Python interface to Starcode"; -static char starcode_c_docstring[] = - "Starcode invocation"; static char starcode_docstring[] = - "Starcode invocation"; + "Starcode list of input sequences"; // the module methods declaration -static PyObject* pystarcode_starcode_c(PyObject *self, PyObject *args, PyObject *kwargs); - static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs); static PyMethodDef module_methods[] = { - {"starcode_c", (PyCFunction) pystarcode_starcode_c, - METH_VARARGS|METH_KEYWORDS, starcode_c_docstring}, {"starcode", (PyCFunction) pystarcode_starcode, METH_VARARGS|METH_KEYWORDS, starcode_docstring}, {NULL, NULL, 0, NULL} @@ -49,78 +43,6 @@ FILE *py_fopen(const char *fname, const char *mode) return f; } -// this method is used to invoke exactly starcode from within python, without -// any system call -static PyObject* pystarcode_starcode_c(PyObject *self, PyObject *args, PyObject *kwargs) -{ - // Parse the input - char *in_filename, *out_filename; - int tau, cluster_ratio; - int clusteralg = 0; - int verbose = 1; - int thrmax = 4; - int showclusters = 1; - int showids = 0; - int outputt = 0; - - static char *kwlist[] = { - "input", - "output", - "dist", - "cluster_ratio", - "clusteralg", - "verbose", - "threads", - "showclusters", - "showids", - "outputt", - NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ssii|iiiiii", kwlist, - &in_filename, - &out_filename, - &tau, - &cluster_ratio, - &clusteralg, - &verbose, - &thrmax, - &showclusters, - &showids, - &outputt)) - return NULL; - - // open input and output files - FILE *inputf1 = py_fopen(in_filename,"r"); - if (inputf1 == NULL) return NULL; - FILE *inputf2 = NULL; - FILE *outputf1 = py_fopen(out_filename, "w"); - if (outputf1 == NULL) return NULL; - FILE *outputf2 = NULL; - - printf("%s %s %d %d %d %d\n",in_filename,out_filename,tau,cluster_ratio,clusteralg,verbose); - - // init the "gstack_t" data structure, which will contain the information on - // the sequences to analyze - gstack_t *uSQ = read_file(inputf1, inputf2, verbose); - - gstack_t *result = starcode( - uSQ, - tau, - verbose, - thrmax, - clusteralg, - cluster_ratio, - showids - ); - - // print output - print_starcode_output(outputf1, outputf2, - result, clusteralg, showclusters, showids, outputt, verbose); - - PyObject *ret = Py_BuildValue("h", 0); - - return ret; -} - static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) { // Parse the input From c41f3515fcd9756cb5869de5023e0898791cc6b9 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 14 Jun 2018 11:26:10 +0200 Subject: [PATCH 20/31] cleaned input keywords passed to pystarcode.starcode --- pystarcode/pystarcode.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index e8fbfa5..788e264 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -45,15 +45,16 @@ FILE *py_fopen(const char *fname, const char *mode) static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *kwargs) { - // Parse the input - int tau, cluster_ratio; - int clusteralg = 0; + // Input variables: required arguments + int tau; + PyObject * in_list; + + // Input variables: keyword arguments with default values + int cluster_ratio = 5; + int clusteralg = MP_CLUSTER; int verbose = 1; int thrmax = 4; - int showclusters = 1; int showids = 0; - int outputt = 0; - PyObject * in_list; static char *kwlist[] = { "input_list", @@ -62,11 +63,9 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k "clusteralg", "verbose", "threads", - "showclusters", "showids", - "outputt", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!ii|iiiiii", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!i|iiiii", kwlist, &PyList_Type, &in_list, &tau, @@ -74,9 +73,7 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k &clusteralg, &verbose, &thrmax, - &showclusters, - &showids, - &outputt)) + &showids)) return NULL; // get the number of sequence elements in the input list From 3083df8788eaff5cf31de7b8ffebb6c054c6e86f Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 14 Jun 2018 11:35:05 +0200 Subject: [PATCH 21/31] updated test.py --- pystarcode/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pystarcode/test.py b/pystarcode/test.py index 41924ea..ba566ad 100644 --- a/pystarcode/test.py +++ b/pystarcode/test.py @@ -2,12 +2,12 @@ # parse file seq_list = [] -with open('small.txt','r') as f : +with open('iPCR_rep1_filtered.txt','r') as f : for line in f : seq_list.append(line.strip('\n')) # invoke starcode -counts, d = pystarcode.starcode(seq_list,2,5) +counts, d = pystarcode.starcode(seq_list,2) # print counts output for key, l in counts.iteritems() : From 2859e4c73fdd9e9fb9b340fa3645f5d1973cf171 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Fri, 15 Jun 2018 12:18:26 +0200 Subject: [PATCH 22/31] draft output for different algorithms --- pystarcode/pystarcode.c | 77 ++++++++++++++++++++++++++++++++--------- pystarcode/test.py | 19 +++++++--- 2 files changed, 75 insertions(+), 21 deletions(-) diff --git a/pystarcode/pystarcode.c b/pystarcode/pystarcode.c index 788e264..6df4885 100644 --- a/pystarcode/pystarcode.c +++ b/pystarcode/pystarcode.c @@ -1,5 +1,6 @@ #include #include +#include #include "../src/starcode.h" #include "../src/trie.h" @@ -51,7 +52,8 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k // Input variables: keyword arguments with default values int cluster_ratio = 5; - int clusteralg = MP_CLUSTER; + int clusteralg_code; + char *clusteralg = NULL; int verbose = 1; int thrmax = 4; int showids = 0; @@ -65,7 +67,7 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k "threads", "showids", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!i|iiiii", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!i|isiii", kwlist, &PyList_Type, &in_list, &tau, @@ -85,6 +87,27 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k return NULL; } + // evaluate the cluster algorithm to use + if (clusteralg == NULL) { + // if clusteralg is still NULL, then use the default MP clustering + clusteralg_code = MP_CLUSTER; + } + else { + if (strcmp(clusteralg, "mp")==0) { + clusteralg_code = MP_CLUSTER; + } + else if (strcmp(clusteralg, "spheres")==0) { + clusteralg_code = SPHERES_CLUSTER; + } + else if (strcmp(clusteralg, "components")==0) { + clusteralg_code = COMPONENTS_CLUSTER; + } + else { + PyErr_SetString(PyExc_ValueError, "Unrecognized clustering method"); + return NULL; + } + } + // init the structure that we will pass to the starcode core function gstack_t *uSQ = new_gstack(); @@ -127,7 +150,7 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k tau, verbose, thrmax, - clusteralg, + clusteralg_code, cluster_ratio, showids ); @@ -137,25 +160,47 @@ static PyObject* pystarcode_starcode(PyObject *self, PyObject *args, PyObject *k PyObject * d = PyDict_New(); PyObject * counts = PyDict_New(); - // init the canonical sequence - useq_t *canonical = ((useq_t *)clusters->items[0])->canonical; - PyDict_SetItemString(counts, canonical->seq, PyInt_FromSize_t(canonical->count)); + //////////////////////////////// + // WRITE OUTPUT + //////////////////////////////// - // fill in the dictionaries - for (size_t i = 0 ; i < clusters->nitems ; i++) { - useq_t *u = (useq_t *) clusters->items[i]; - PyDict_SetItemString(d, u->seq, PyString_FromString(u->canonical->seq)); + // case of MP clustering + if (clusteralg == MP_CLUSTER) { - // let's see if the current canonical is the same as the last one - if (u->canonical != canonical) { + // init the canonical sequence + useq_t *canonical = ((useq_t *)clusters->items[0])->canonical; + PyDict_SetItemString(counts, + canonical->seq, + PyInt_FromSize_t(canonical->count)); - // in this case we update the canonical - canonical = u->canonical; + // fill in the dictionaries + for (size_t i = 0 ; i < clusters->nitems ; i++) { + useq_t *u = (useq_t *) clusters->items[i]; + PyDict_SetItemString(d, + u->seq, + PyString_FromString(u->canonical->seq)); - // and we write the new element in the dictionary of counts - PyDict_SetItemString(counts, canonical->seq, PyInt_FromSize_t(canonical->count)); + // let's see if the current canonical is the same as the last one + if (u->canonical != canonical) { + + // in this case we update the canonical + canonical = u->canonical; + + // and we write the new element in the dictionary of counts + PyDict_SetItemString(counts, + canonical->seq, + PyInt_FromSize_t(canonical->count)); + } } } + else if (clusteralg_code == SPHERES_CLUSTER) { + PyErr_SetString(PyExc_ValueError, "Output for spheres not implemented\n"); + return NULL; + } + else if (clusteralg_code == COMPONENTS_CLUSTER) { + PyErr_SetString(PyExc_ValueError, "Output for components not implemented\n"); + return NULL; + } // let's now build the return object, which will be a tuple of the two // dictionaries diff --git a/pystarcode/test.py b/pystarcode/test.py index ba566ad..a5b93ea 100644 --- a/pystarcode/test.py +++ b/pystarcode/test.py @@ -7,12 +7,21 @@ seq_list.append(line.strip('\n')) # invoke starcode -counts, d = pystarcode.starcode(seq_list,2) +clusteralg = "components" +canonical_counts, d = pystarcode.starcode(seq_list,2,clusteralg=clusteralg) # print counts output -for key, l in counts.iteritems() : - print '%s -> %d'%(key,l) +# for key, l in canonical_counts.iteritems() : + # print '%s -> %d'%(key,l) # print output -for key, l in d.iteritems() : - print '%s -> %s'%(key,l) +# for key, l in d.iteritems() : + # print '%s -> %s'%(key,l) + +with open('test-%s.txt.true'%(clusteralg), 'r') as f : + for line in f : + canonical, counts = line.split('\t') + counts = int(counts) + if canonical_counts[canonical] != counts : + print "Discrepancy between counts of %s"%(canonical) + break From 8047b132adedd68561dd70a789c0066f0ff697a3 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Mon, 18 Jun 2018 16:41:13 +0200 Subject: [PATCH 23/31] moved source files of pystarcode --- pystarcode/pystarcode.c => pystarcode.c | 0 pystarcode/setup.py => setup.py | 0 pystarcode/test.py => test.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename pystarcode/pystarcode.c => pystarcode.c (100%) rename pystarcode/setup.py => setup.py (100%) rename pystarcode/test.py => test.py (100%) diff --git a/pystarcode/pystarcode.c b/pystarcode.c similarity index 100% rename from pystarcode/pystarcode.c rename to pystarcode.c diff --git a/pystarcode/setup.py b/setup.py similarity index 100% rename from pystarcode/setup.py rename to setup.py diff --git a/pystarcode/test.py b/test.py similarity index 100% rename from pystarcode/test.py rename to test.py From e6542adda25c0cfcd6522f0c0c54fc302cf60ad9 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Mon, 18 Jun 2018 16:42:36 +0200 Subject: [PATCH 24/31] updated test.py with example sequence file --- example_seqs.txt | 2500 ++++++++++++++++++++++++++++++++++++++++++++++ test.py | 20 +- 2 files changed, 2506 insertions(+), 14 deletions(-) create mode 100644 example_seqs.txt diff --git a/example_seqs.txt b/example_seqs.txt new file mode 100644 index 0000000..e6f4cc4 --- /dev/null +++ b/example_seqs.txt @@ -0,0 +1,2500 @@ +TAGAANGAACGGTAGTCGGA +GCAATNTTCTAAGTCCTAAG +GACTCNTATCTAAACCAGTC +CAGCTNGCGGCAAAACTGCG +GTTGCNCTGGCGATTTTGTA +GGGGGNGGGGGGGGGGGGGG +AAGGTNCTGAATCTCTTTAG +GTAAGNTTGGGTAATTAAAT +ACAGTNTTGTTATCGGTAGC +AATCTNCTCAAATTTATGCG +CGTGTNATTATTGTGGAAGC +ATCGANCATCACTCGAGGTA +TAGCGNGTATAAGGGAGGCG +GTATCNGGTCACAAATAATC +CGTATNCCGTCTTCTGCTTG +CTCAGNTAGTAATCCACGCT +AACGTNCGTCAAAAATTACG +CGACCNAAATTAGGGTCAAC +TTACCNTTATGCTATCTTTC +GTTAANTACTTGGATCAAGG +ATTGCNGGAGGCCTCCACTA +GTAGCNGGGATGGCTTTTAT +AATTANATCGAAGTGGACTG +GCGTCNTGGGGTATGAACCG +CGTATNCCGTCTTCTGCTTG +AATGGNTCGACAGATGTAAT +TCTGTNAGTAGTGACGGCAT +GATTANGTCGTGCGCGGACT +ATTTTNTGAAACGATATAGG +TTAAANTCCTAAGCAGAAAA +AAGGGNTATGGACTGTCATG +CTTCTNCATAGGCACAGATA +GGACGNTTCGTTGTAATTGG +GGTGCNATATTGTCGGACGG +TTAGTNAGGCACATCTCGGT +CGTATNCCGTCTTCTGCTTG +TCCTCNCTAAGTATTCTTCA +ACTGGNTTAATAGAAGTGTG +AACCTNCAGAGTTTTATCGC +GCCTTNAGTACCTCGCAACG +ATCAANAGCAATATCAGCAC +GACGTNCCTGGTATGTTAGC +CACGGNGGGTGTTAAAACGC +AAATCNGCTCAAATTTATGC +CATGANCAGATCGACTCGGC +TGTCANCTGTGTATCTAGGA +TAGAGNAGATGGTGTCAGGG +GGAACNAAAAGTCCGGTATT +CTCGANCAAACCCCTGAATG +TCACCNTTAGCATCAACAGG +TTGTTNAGGGTCGGACACCT +TTCCTNCTTTATCAAGATAA +GGGACNTAAAAAGTAAAAAT +TAGCGNTAAGGTACTGAATC +AGTCTNGAGTTAGGGAACTA +CGTATNCCGTCTTCTGCTTG +AAACGATAAACCAACCATCA +AACTTAATTGGTCAGATAAA +TCTGACGTCAGGTATAGTAG +AAAAATTAAAATTTTTACCG +AGGTCTAAAGCTATGGTACT +TCTCTTTAATAACCTGATTC +GACGTTCCTGGTATGTTAGC +CGTATGCCGTCTTCTGCTTG +CATGATTACACCAAGCGCGA +AATGGCTCGACAGATGTAAT +ATTAAGACAACGCGGGATAA +GTTGCCTTCGAATGAGTTGG +CGTATGCCGTCTTCTGCTTG +AGTAGTATGGAGGTATGACG +GAACTGGTAGAAACAGAGGG +ACAAGTTGAGTTAAAGATGT +GGTGCGCAAAAACTTTATTA +ATGGTATTGGCTCTAATTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CATGGTCTGGGGAGTTCAAA +ATCGTGAAGAGGTTTTAATA +GAGTGCGTCCCTTTAAACAC +AAATGAGATGCTTGCTTATC +TGTCCTCCCGCCTAGGTAGA +GGTACACAATCAGCACGTTG +TAGTCTACGATCTTAATGTG +ATGTGTCTTCGCGACGAGTG +CAGTAGAATCCTCTCGAGCG +CGTATGCCGTCTTCTGCTTG +TAACCAGTAGTGTTAACAGT +TTTGTTTATTAGTCCTGGGT +GCTTAACTCATACCTAAGCT +GCAAGCCTCAACGCAGCGAC +TATTGAAGCAGAACGCAAAA +TGAATGAGCTTAATAGAGGC +CGTATGCCGTCTTCTGCTTG +AGTAAACCACGTAATAATAT +AGGTCTAAAGCTATGGTACT +TTTTTAATTTTTGCCGCTGA +GCGGACTAACGCCCAGATTA +CGGCTGACTTATGTTATCCA +CAGCTGAAACTGTGTTTAAG +GGTTTATCGTTTTTGACACT +TTAAGGATGAGTGTTCAAGA +AGATTCTGTCTTTTCGTATG +GACTTCTCAGATATGGGACA +ACTGGATTAATAGAAGTGTG +GTTATGGTTTCCGTTGCTGC +AAAACAAAACGCCCTCAGGA +AAAGTTTTGGTTACGCAGAG +GGCAGATGAGAAATGTATAC +TTCAGGGTTATTTGAATATC +AGTGGTAGAGGTCTCTTAAC +TGTATTTACCATAATTCGGT +TGACGGTTAATGCTGGTAAT +CCTATCAGGGGCCCAATTAT +CACCAGCAAGAGCAGAAGCA +CAAAGTCAAAATAATCAGCG +GAGGATACGTGGTACTCATG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GTTATTTCCTAGACAAATTA +TGGTATGGTTGACGCCGGAT +CGTATGCCGTCTTCTGCTTG +TTGGTTAGGAAGTTATAAAA +TGATTGTCCAGTTGCATTTT +ATTTGAAGTAGGTTAGATAG +TAGTTGTTGCATACGTAATA +TCTTAGGCACGATGTATATC +GGTTCAAAGGAAAATCCGAT +ATACACGGTTAATAGGTTAG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GCTTACTAAAATGCAACTGG +CGTATGCCGTCTTCTGCTTG +TAGTTGTTGCATACGTAATA +GATTTTGTGTGAAGAACAGG +AGTCCCGACCGGCCAGTGGG +AGACTGTTTGAAGTCGGAAG +CGTATGCCGTCTTCTGCTTG +TACCCGACGACCAAAATTAG +GTTTTCTGTCGTAATTATAA +CATGGTAGAGATTCTCTTGT +TTTCTGACGAGTAACAAAGT +TTACCTTTATGCTATCTTTC +GTTCGTGATGAGTTTGTATC +ATTTCTGGAAAGACGGTAAA +TCAGAAAGAGATTGCCGAGA +TTTGAGATGCTGATCGGTCT +GATCTCCTTAGTGCATGATC +GGATTCCCATGCTGGCAAAA +AGTGACTTAAAATCTCGCTG +CGTATGCCGTCTTCTGCTTG +CTAAAAACCAAGCTGTCGCT +CGTATGCCGTCTTCTGCTTG +CGTTCTAGCTAAGGAATAGG +CAGTCTCAGGAGGAAGCGGA +GTACAGGAATGAATCCTGAG +GGGCGGACGTAGGTCTATGG +TAAACATAGGATATGAAGAA +TATGCTAGAATGGACATGTG +CGTATGCCGTCTTCTGCTTG +TATCTCGTAATGATAGGTAC +GAGGATACGTGGTACTCATG +CCAGGTGAGTGACAACAAAT +TGTATTTACCATAATTCGGT +TGAGCTGGCGAAAGATAGAA +ATTACACGAGGGATACTAAG +CGTATGCCGTCTTCTGCTTG +GCGAGTTTTATAACTAAGGG +CTTATTCGTTGGTTCAACAG +GGGATGAAAATGCTCACAAT +CGTGGCAAATGAACCATAAG +CTTAGTTGCATGAGCATACA +GAGAAAGGTTTTTCAATTCC +CAGCTTCTGCGCGGGGCGAG +GGCTTGTAACCGTGTTAGGA +ATAAACACCAGTATAGATAA +GTTCCATCAACATCATAGCC +CCACAAGCCTCAATAGCAGG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GCTCATTCAGGCTTCTGCCG +GACGTTCCTGGTATGTTAGC +GCTAGGCAATCTCTTCTAGG +CGTATGCCGTCTTCTGCTTG +GTTGCGGCCCGCGATAAGAT +CCAGAAGCAGCATCAGTGAC +CGATTTACCTAATAGCAGTT +AATACCTACTACTAGTAGGC +TTAATTATAAGCGGGGTTAG +GGTGCGCCGCGAGAATACCC +TTTCCGGAGTAATATCATGA +GAGTTGAGCTAACTAGAGTC +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +ATAGTGCAGTAAGCTAGAGG +CGTATGCCGTCTTCTGCTTG +ATGGTCGGAAAATGTAAGGG +CGTATGCCGTCTTCTGCTTG +TACCGGAGATAGGAGGATCA +CACGTAATTTTTGACGCACG +GAGGTTTTACCTCCAAATGA +TAGTTTAGACTCAAATCCGA +CGTATGCCGTCTTCTGCTTG +CAACCAATCATCAATTTATG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CCCCTTCGGGGCGGTGGTCT +CGTATGCCGTCTTCTGCTTG +ATTCTCGCGCGTAGTCCACC +TATTGTGTGGAAACTTCGAG +GTACTCCATGCTCTATATAC +ACGAGTATCCTTTCCTTTAT +TAAACATAGGATATGAAGAA +TCTAACGTCGATTTCACTTG +TATTATGAGTCGAGTAAGGC +GTTATATTTTGATAGTTTGA +TTCCAATTAACCCGAACCTA +CAGTCGGCCTCAGTCTGAAC +ACCTGCCTTGTAAGTAAACG +CGTATTATGTTATAGGATTG +GATGTAGCTGGAAACGTTGA +TTGTTCAGGCTGTGAAAGCA +CCATCTCAAAAACATTTGGA +GATGATTCTCGAATTGTGAG +AAATAATCTCTTTAATAACC +TTAAACAGGCCCTGCTATTA +GAGCGGTCAGTAGCAATCCA +CTCCCTTTGGTAAAACGAAG +GTCAGTTCCATCAACATCAT +AGTCCCGTGATGCGTTGCTG +GGTAGCGATTTTTGAATTGC +GTGGCATTCTAGCAGTCAAC +TATTAGGGAAGCCAAAGACT +CAACACTGACATCATCTAGA +CTGGACGTGTGAGTTGTAAC +TGGCATAGGGGACGTTGGGG +ATATCGATAGGTGACTTAGG +AGGAATACAAAAAGAAAAGT +GAGGGGGGAATAACGATTCT +GATGGCGTTCAATAGGAGGG +GGTGAAATCAAGAGATAGAG +CGTATGCCGTCTTCTGCTTG +TATTTAAGAACGAAATAATG +GCATTTAGTAGCGGTAAAGT +CAGCTTCTGCGCGGGGCGAG +GGATCTAGTAAATGGAAACA +AACATACATATCACCATTAT +GGAGGTAAAACCTCTTATGA +TACTACGAATGACATCCCGG +TGCAACTGGACAATCAGAAA +CTTTGGTGATCGGTGCGATG +AAAATACGTCTTACGTCAGT +GACTCATCAGAAATATCCGA +AAAGCGCCGTGGATGCCTGA +CGTATGCCGTCTTCTGCTTG +TAGGAGTTATGATGGGGCAA +TAGACCAGGATTATGGGTTA +ATACGTGCACACAATTTTAG +GCAGTCAAAATAGAGTTTAC +TTTGAAGCAGGGGAATAAAA +GCGGGAACTCTTGTTTCTTG +GGTTCCCGACTCTCACATTT +GCTGTAGACATGACTACACC +ACAAAGCTCCTGCTGCGAAT +TAGATATGAGTCACATTTTG +CGTATGCCGTCTTCTGCTTG +TGGTACGTCCCGGTAAGTTC +GTAAAAATTTTAATTTTTGC +ATCATACCTCAATAGAACAC +TGGACTCAATAAGCTAACCT +GTTAAACGGGGTTGTCTGGC +AGAAGTGAGAACCAGCTTAT +GAGTACCCAGTTCAATATAG +GAATTAAATCGAAGTGGACT +GGCTTGTAACCGTGTTAGGA +TACCGGACCACTGAAAATCA +TACTACGAATGACATCCCGG +CCTCCAAGATTTGGAGGCAT +CATACTAGGCTGTTGTATCT +GGACTAGTCAAGCATGAACT +AAGATTGCTGGAGGCCTCCA +GCCTGGTAGTGGATTTAAGC +AAAATGAGAAAATTCGACCT +CAAATTCAACTTCTCTGGGG +GACGTTCCTGGTATGTTAGC +GGTGCTATATTGTCGGACGG +ATTTGAAGTAGGTTAGATAG +TTAATGGTTACAATCAAAAG +CCTATTTTAAGATTGTAAAG +ACTGCGTAACCGTCTTCTCG +TGTTGAAAACATGTTTACAC +CGTGCCCTGTACCATTATAT +GGCCCGGGGCCCCCGGCCGG +TGACCCAGATAGACTGCCAT +GGCTTGTAACCGTGTTAGGA +CGTATGCCGTCTTCTGCTTG +TGTGTATTTGACAGCTCGTG +AAAGGAAAGGATACTCGTGA +ATGGATGAATTGGCACAATG +AATGGCTCGACAGATGTAAT +TGATAAAAGATTGAGTGTGA +CAAAATTGTTAGAGAATGAA +CGTATGCCGTCTTCTGCTTG +TAGAGAAGATGGTGTCAGGG +AGTGTGACGTGTTTTGTGGG +GGAGATGGTGAAATGCCTAA +GATGCCGACCCTAAATTTTT +GCGGTTATGGATGAGTCGCC +GGGTAGAAATGTGCGGAGGG +TGTTCTTCGCATGGTATACA +CGTATGCCGTCTTCTGCTTG +CACGTTTATGGTGAACAGTG +CGTATGCCGTCTTCTGCTTG +CGTGCATAACGACGATAGGT +AGATGGCACAGTATTCTACG +GGTTTAGATTTTGTTGAGTC +AAGTATCTCATTAGATCTTT +TGATGCTCTTTTTGATTACA +CGTATGCCGTCTTCTGCTTG +CTTGGCACGTTCGTCAAGGA +CAGGAAACACTGACGTTCTT +CGTATGCCGTCTTCTGCTTG +GACTTGACTTAGAATGTACT +GACATGTACAGTTGAAGTCG +CGCGATGAACTGTGGCAAAT +AATTTTGCGAAACTGAAAAA +GTCGTCATACCTATAGAGGC +TCTTGCTGGTGGCGCCATGT +GATCTTGAGGACTGGTCCAT +GCGATATACCCAAATGTGGA +CAGATATGGACCTTGCTGCT +ACAGTTGAAGTCGGAAGTTT +TACAAAACAGGGTCGCCAGC +CGCAGCTCGAGAAGCTCTTA +TGTTTAGGCGATCTACATCG +TTAAAGGGTAAACCTTCATG +CTTCCTCTTTTCATAAATAA +AAATTCAGGAGATTTAAGGA +AACGTGCAGAAGGAGAGAAG +GCATCTTGAAACTGTATTTC +ATTATGGCGAGAAATAAAAG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GAATGGCAGATTTAATACCA +GTAATTAATCTGTCACGGAG +TGGAGACTCCATCTTTAGGG +ACATCACCTTGAATGCCACC +ATCGACCATCACTCGAGGTA +CCAAATCTTGGAGGCTTTTT +AGGCTTTGCTATTCAGCGTT +GTAGCTGCATGACCGGATGC +CAGGGGCCACAGTGGAGTAG +CGCAACTTCGGGATGAAAAT +TATGCCAAGAAGCAGCGTTT +GTATGATGGGCCAATTCGGG +CGTATGCCGTCTTCTGCTTG +TAAGGGGTCAGGGAATAAAT +TGTATTTACCATAATTCGGT +CGTATGCCGTCTTCTGCTTG +TGTATTTACCATAATTCGGT +CAGAGTGGAAGCTTAATACG +ATTGATGATTCGACATTTCT +GATTTTGTGTGAAGAACAGG +CGTATGCCGTCTTCTGCTTG +AAAAATTAAAATTTTTACCG +ACTGGATTAATAGAAGTGTG +GAGGATGGACTCGCGGTTAA +AAAGGATAAACATCATAGGC +CTAAGATGAGGCAAAGTTGG +GCCCTGGTCGTCCGCAGCCG +GCTTCTTGAAACTGTATTTC +TCTAATAGTTATTGCCATTG +GCGTCTGGATGTGCTCGGAG +GACGTTCCTGGTATGTTAGC +CTCGACGGCCTTTCGGCTTG +ATCTTGAACACTCATCCTTA +TGATGCTCTTTTTGATTACA +ACCGCTCTCGTGCTCGTCGC +GAACCCTGGGGGGCCCGACC +TATATTACGTGAAGATTCAG +CGTATGCCGTCTTCTGCTTG +GTTATGCGGGGGCGCAAGCA +GGCTTGTAACCGTGTTAGGA +CGTATGCCGTCTTCTGCTTG +CGCCAAAACGTCGGCTACAG +CTGTATCCTCGAGTAACCTC +CAGTAGGCGGAAAACGAACA +GATCTTCCGAGCTGACGTGA +TCTCCAGCCACTTAAGTGAG +AAAATACGTCTTACGTCAGT +TATTAGGGGATCAGGGGAGG +GAGTTGAGCTAACTAGAGTC +CGTATGCCGTCTTCTGCTTG +AGTAAAAATGTCTACAGTAG +GGTCGCCTCCCCCTCTAAAC +AAAGATTGAGTGTGAGGTTA +AATTTTTTGCCTGTTTGGTT +CACATCGCGTCGGCGCCGAT +CTGCTAAAGGTCTAGGAGCT +CGTATGCCGTCTTCTGCTTG +AAATGAGATGCTTGCTTATC +TTTCCGGAGTAATATCATGA +CGTATGCCGTCTTCTGCTTG +AGTTGTTCCATTCTTTAGCT +CATACATATCACCATTATCG +AGTTGTAAAAACGTGAAAGT +AACCATAAAAAAGCCTCCAA +CGATCTATGCTGTAGTTCAT +CGTATGCCGTCTTCTGCTTG +ACGCTTAATCATCCAATAAG +GGGCTCGTCGGTCTCGCTAC +TCACTCGAGGGAAAGTGCGG +AATCTAAAATAAGAAGGATC +GACGTTCCTGGTATGTTAGC +GACGTTCCTGGTATGTTAGC +GAAGCCGATCAATGTAAGAA +CGTATGCCGTCTTCTGCTTG +CGGTGGATAGGACTCCCTCT +CGTATGCCGTCTTCTGCTTG +CTCAAAGCGAACCAAACAGG +GAGGGTCCGGGGTGAAAGGA +ACTGTTGTCGAGGTATATCG +GTTAGTCAGACAATGTAAAG +GGGAAGCCTTCAAGAAGGTG +GTTTGTCGTTAAAGTGTTTG +TGAATGTAGGAACCTTTTTA +AAGTTCCGGATCTCTCCGTA +TATCGTTATGCGCCTTCGTA +GGTCGTGTTAGTAATGAGAT +CGTATGCCGTCTTCTGCTTG +AAAACAGGGTCGCCAGCAAT +AAAGTCATGTGTGTGTTTAA +AAATGCCGCGGATTGGTTTC +ATTTGCTAAGGGATTGCTAC +CCATTGTGGGATTGAATTAG +CGTATGCCGTCTTCTGCTTG +AATGAAATCTGGAGTTTCCA +CGTATGCCGTCTTCTGCTTG +TGTCCCGGTATTAGCGTGGA +CGTATGCCGTCTTCTGCTTG +AGAGATTAGAGCGCATGACA +AATTCCACTCATAGGGTCCG +GGATCTAAAAACATATACTG +AGAGTGTCAAAAACGATAAA +CGATGCTCTCGATAATAATG +GAGTGAAGTCAGTTGTTCGG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TGGGCAGCAGTGATGATAGT +CGTGATTATCTTGCTGCTGC +AGCAGAAAACCTACCGCGCT +TGGATGAGGAGAAGTGGCTT +ATAGCAAGGCCACGACGCAA +TAGACGGTGAACATAATTTT +CGTATGCCGTCTTCTGCTTG +AGCGCAGATTAGTAGAATAT +AGTCAGTTTTAATTATGAAG +TTAGCTGTACCATACTCAGG +CGTATGCCGTCTTCTGCTTG +CATGTTGCAGTTGTTAAGAT +CGGCACCTGTTTTACAGACA +CAACCAGAACGTGAAAAAGC +GGAAAGGATACTCGTGATTA +CGAATGGTGCACATACGAGG +TCAACAGGGCAGAATCCGTG +AATCTAAAATAAGAAGGATC +GCATCTTGAAACTGTATTTC +TAGGTCAACATGATTACACC +CGTATGCCGTCTTCTGCTTG +TGATGCTCTTTTTGATTACA +TCGTGTGGCGCAACTGATAG +CAGCGACGAGCACGAGAGCG +GCGCCAGTTTGAATATTAGA +CGTATGCCGTCTTCTGCTTG +TCATCCCGTCAACATTCAAA +AATCTCATCTCTCTTTTTGC +CAGTTTTTGACAGAATCGTT +TTGTTTTGCGCCGCGTGGAG +GAGGTATTTAGGGCTGATAT +GGCAAATCGTTGCAGTTAGC +GCATCTTTGATACGGGTTTG +GTTTTTGACAGAATCGTTAG +GAGAGAGGTTCGTCGGCCTT +TACTAACGCACAAGATTACG +TAACCATAAGGCCACGTATT +CGTATGCCGTCTTCTGCTTG +TTTTATGGTTGCGTGGATCC +ATACACGGTTAATAGGTTAG +CTTAGTTGCATGAGCATACA +GGTACTCGCACCTCCCAGAT +GAGTGAAGTCAGTTGTTCGG +AGGTCTAAAGCTATGGTACT +CGTGGCAAATGAACCATAAG +GTTATAACCTCACACTCAAT +GACGTTCCTGGTATGTTAGC +AGGGCTTTCTAATAAGGGAA +GTAATAAGAACGAACCATAA +CGTATGCCGTCTTCTGCTTG +CCCCTCAAGTAGTTTAGGCA +TTTGGGGCTATAAAGGATGG +CGTATGCCGTCTTCTGCTTG +ATCGTGAAGAGGTTTTAATA +GGCTTGTAACCGTGTTAGGA +GATTGAATGGTGGGACAAAT +TATGCCAAGAAGCAGCGTTT +GACGTTCCTGGTATGTTAGC +GTGTATCCCTCTTGGCTGTC +TATAGGCTTCAGCGATTACA +CGTATGCCGTCTTCTGCTTG +TTACGCAGTTTTGCCGCAAG +TGCATACTGGAGCAGAGAGA +AAAATACGTCTTACGTCAGT +GTCTCTATTAAGTTCCGAGG +TCACCATAAACGTGACGATG +TGCATACTGGAGCAGAGAGA +GTCAGCAGGCGAAGTTTAAG +CGTATGCCGTCTTCTGCTTG +GCAACCTGTGACGACAAATC +TTGTACCCATTTCAATAACA +GGCGGCGGTTCAGGTCGAAC +AAACTGCCTTCGCGTGCGAA +TTATGCGACAGCGTTAGACT +TCTGATAAGTTGCTTGATTT +CATGACCAGATCGACTCGGC +TTATGACGCTGACAACCGTC +CACTCAGGCCGAAATTTACC +GGCATACGCTCGGCGCCAGT +AGAAAACGGCTGGTCCACTG +CTTGCGTGTCAGACTATGGG +CTGTCTGAGGTGGCTTGGGG +AAGATCATTGGGTGTTGGAA +GCGATGAGAGTATAAGGGAT +ATGCTCAGGAACAAAGAAAC +CGTATGCCGTCTTCTGCTTG +AATACCAGGGACTTGAGGAT +CGTATGCCGTCTTCTGCTTG +TAGACGGCGGAGACTTTTCG +TAAATGAACAGATAAGATAG +GTGCTATGGCTAAAGCTGGT +GTTAACTTCGGAGTCTAGGC +CCATTGTGGGATTGAATTAG +ATTGTTAAGCGGTAGTTGTG +AAAGTTAGGTTCGGTGTCTA +GAGGGGGGAATAACGATTCT +GACGTTCCTGGTATGTTAGC +TAAATGAACAGATAAGATAG +TCATGGATGTAAATCAAGTT +AACACTTTCGGATATTTCTG +ATTCTAAATTCATACAGCAA +CGGCCATTAGCTGTACCATA +TCCAGCTGCAAACTGGTTAT +TGGCTACAGCACCTGGGACC +GATGGCGTTCAATAGGAGGG +ACTGGATTAATAGAAGTGTG +CGTATGCCGTCTTCTGCTTG +ACCCCGGTTTAGGCATGATT +ACCGGAGGCGGCTTTTTGAC +CATAAATTTGAGCAGATTTG +AGTCGTACATAGGTTCTGTA +GATAGAGGAGACGCGGGGGG +CTGCCCCTTGTAGGTTTCAG +TCATTAGCCTTGCGACCCTC +TATATGGCTGTTGGTTTCTA +TTTCCCAAGCACGGAACCAT +ACAGAATGTTTATAGGTCTG +GCGGTTATGGATGAGTCGCC +CGTATGCCGTCTTCTGCTTG +CACGCTCTTTTAAAATGTCA +GCATCTTTGATACGGGTTTG +GTAAACGTATATAGCAGTCA +TAGGTTTCGTAGGTAGTAAT +TGGAAGACTTGATAAATTGA +CCATTGTGGGATTGAATTAG +GCAGGTTTAAGAGCCTCGAT +TGATAAAATAAAGTAGGAGC +ATTAACCTGTACTACCTTTA +CCATGATGAGACAGGCCGTT +AGATGCTTGCTTATCAACAG +GGTTCTGTAGATGTTATCTC +CGGGCAATAATGTTTATGTT +CAATAAGAGTACTTTTCGTG +CGTATGCCGTCTTCTGCTTG +GATAATAAGATACTGTGAGG +TTGATAAGCAAGCATCTCAT +TGTATTTACCATAATTCGGT +AACTATTACAGAACTTCGGA +CTTAGTTGCATGAGCATACA +GACGTTCCTGGTATGTTAGC +AATGGCTCGACAGATGTAAT +ATGCCGTGTGTTAGAGGATA +GGCTTGTAACCGTGTTAGGA +TAATCCTCGTACATTCTACC +ATGAGTGTTCAAGATTGCTG +AACATAAACATTATTGCCCG +CGTATGCCGTCTTCTGCTTG +ACAGAAGGAGTCTACTGCTC +ACGCTGATTATTTTGACTTT +AGATGGCACAGTATTCTACG +ACATTCAAACGGCCTGTCTC +TTGAGAGGACAGTTGATGGA +TTTCCGGAGTAATATCATGA +CGTATGCCGTCTTCTGCTTG +CGTTTCTCAGTAGTCAGTTG +AATGGAGAAAGACGGAGAGC +AATTTACATCAAGTTGGGGC +GAATTAAGAACATGTGCAGC +TTAACCCTGTGTAATCTGAG +ATCATACCTCAATAGAACAC +TAGACAATAAGTTGAGGATC +CGTATGCCGTCTTCTGCTTG +CGTTTATGACAAGGATAAAG +AGTCGCGATTTGTATTGGTT +CCCTCGGCAGCAAGAACCAT +CGTATGCCGTCTTCTGCTTG +TTCATATGTATACTGTGGTC +GATCACCTCAGGTGGCGGGA +CGTATGCCGTCTTCTGCTTG +ACGCGGGAGGGAGATGGCGG +CGGGGCTGAGTCGGCTAGAC +GATTTTGTGTGAAGAACAGG +AAGATAATTTTTCGACTCAT +TGGGCCGTAACTTGTACGTA +AGCGTTACCATGATGTTATT +CTCCAAATCTTGGAGGCTTT +CGTATGCCGTCTTCTGCTTG +CGATAAACCAACCATCAGCA +TACGAAAAGACAGAATCTCT +GGGGTGTACCAGTGATTAGC +TAAAGCCGCTGAATTGTTCG +AGGTACGTTGTTCCCACTGA +ACAGTTGAAGTCGGAAGTTT +ACGAATTAAGAGCAAGCTAT +CGTATGCCGTCTTCTGCTTG +TAGTCTACGATCTTAATGTG +CTGTATGACTAGTGTTATCA +TGTCATGCGCTCTAATCTCT +TGATGCTCTTTTTGATTACA +TAGAGAACCGACGAGCTGTG +GAGTCTTGGCACGGTGGGAG +CGTATGCCGTCTTCTGCTTG +ACTCTGCGTCATAAGCCGAG +CAACTCACTAAAAACCAAGC +TCGACTCATCAGAAATATCC +CGCCAAATGCTTACTCAAGC +ATGTATCCATCTGAATGCAA +CGTACGGATTGAAATCTGAA +GTGTGACGATAAAAGAATCA +TGAATTCGGGACCAACAGGC +CGCTCAAAGTCAAAATAATC +TATTAGGGGATCAGGGGAGG +TAGTGACGTTAGAACGATGC +CCATTGTGGGATTGAATTAG +CCCTGTGTAAACGGGTGAGA +ACTGGATTAATAGAAGTGTG +GCGGAATTAGTTCAATATCA +TGTCACGCTGATTATTTTGA +ATGGTCAGAAAATGTAAGGG +TGAGCTGGCGAAAGATAGAA +TATTCGCCACCATGATTATG +CGTATGCCGTCTTCTGCTTG +ACCCCGGTTTAGGCATGATT +AATTTGGCCCTAAGTCCTGG +TGACAATCGGGTTTCGACAG +GCTGCATGAAGTAATCACGT +TCTGTTAGTAGTGACGGCAT +AGTCGCGATTTGTATTGGTT +GATCCCGGACTTTATTGCGG +AAGTGGACAGCGGGTTTATG +GCAGCCACGAGTTTTAATTC +ACAAGTTGAGTTAAAGATGT +TGATGCTCTTTTTGATTACA +GAGTAGAATAACTTCGTCGA +ACTATTTTAAAGCGCCGTGG +CTCTGAGGGAAGGGTGGTGC +GCTAATTTGCATACTGACCA +GTTGGATGAGGAGAAGTGGC +TTAACCCTGTGTAATCTGAG +TGGCTACAGCACCTGGGACC +TGGAAATTAGATTGTTGAGC +TGGGCTGCCTTCGTGAATAC +GACGACACTTCCGCTTAGTC +GAGTTGAGCTAACTAGAGTC +GAACTATAAGAAATACTTCA +CGTATGCCGTCTTCTGCTTG +TGTTCCTATCCGCAGTTTGT +TAACCTGTAGACAGCATGAT +TAGTCTACGATCTTAATGTG +GGAGGTAAAACCTCTTATGA +GTGGTGTAGTGTTTTAAAGG +ATTTGAAGTAGGTTAGATAG +ACAAGTTGAGTTAAAGATGT +TATGGCTAAAGCTGGTAAAG +GGGTACTTACGGCGATTTTA +TGCGAGAACTGGGCACTTTG +GACGACACTTCCGCTTAGTC +GCATCTTTGATACGGGTTTG +CAGTGAAGTCAGTTGTTCGG +CTTTATCAGCGGCAGACTTG +TTCAATTTCGTCAAGGGAGG +TAACCTGATTCAGCGAAACC +GGAACTAAAAGTCCGGTATT +GGTTCTAACCGCAGAGGTCA +GGCTTACGTCACGCTTCGTT +GCCCCTATGTGATGGGGGAT +GTCTGCTCGTCGAAATCAAG +AAGTGACTACAGAAAGCATC +AAAATAGTTGTTATAGATAT +CAGCCCGACCAGATACGGTA +TCGTAGTTAGGTCTTAGCGT +ATGCCTACAGTATTGTTATC +GGTTTGCGACTCTATTTGTA +ATGGCCGTCAACATACATAT +GTTTCAAAGGTCATGACGTC +TAACCGTCAAACTATCAAAA +ATTGTACCGTTCGATTGACA +TCTGACGTCAGGTATAGTAG +CTGCTTGTTCGAAGCTAAAA +CGTATGCCGTCTTCTGCTTG +TGATGATGAAGGAACTATAC +CAAAGCTGGATATCTTTATT +AAAAAAGTTAACACGGTGGC +CGTATGCCGTCTTCTGCTTG +CGCATGGAAATGAAGACGGC +CAGCTTCTGCGCGGGGCGAG +GCAGCAATAAACTCAACAGG +TTTTGATAGTTTGACGGTTA +CGTATGCCGTCTTCTGCTTG +CAGCTTCTGCGCGGGGCGAG +GGGGCTGTGGCTGCGTCAGT +TGCTGGCATTCAGTCGGCGA +TTCTCAGTAACAGATACAAA +GTGATTCGTCTAAGAAGTTT +ACTAATTCAAACTAGGGGGG +TTAGCCTTGCGCCCCTCGGC +AGAGGCGTTTTATGATAATC +GTACTCCATGCTCTATATAC +AGCCTTACGATTTCATGTGT +TTATGGTACGCTGGACTTTG +CCTTCCGAACATAAGAATCT +TGGGCCGCAACTACGTATAA +TCTGTTAGTAGTGACGGCAT +ATTGTTAAGCGGTAGTTGTG +TCATGATTGAATCGCGAGTG +GTGCTTTATGGTTGTGGGTA +CCCCACTCTTACAGACGTGC +TACCTTTAGACATTACATCA +TGTATTTACCATAATTCGGT +ACACTCCGGGTGCTCGGAAT +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +AATGTTCTAGGAGTAAAGGG +CGTATGCCGTCTTCTGCTTG +CCTGAAACAAATGCTTAGGG +CGAGTATCCTTTCCTTTATC +CTTAGTTGCATGAGCATACA +GTGGCGCCATGTCTAAATTG +CTGTCAGGAGAATTACCGGG +AAAGAGGACCCATTTGGTTT +CGAAAATGACGACATACAAC +GTCGCTCCATTAGGTAGGCT +CCGACTGCTATCAGTATTTT +TGGCATAGGGGACGTTGGGG +GTTCTGCTTCAATATCTGGT +GTGCTGCACGCAGCCTCTTG +ATGTTCATCCCGTCAACATT +TGGGTACCTTAGTATTGTTG +AGAAATATCCGAAAGTGCTA +GTTTTCCTTTCTCCGTGAGC +ATGTACACGTTGAGAATGTC +GACGTTCCTGGTATGTTAGC +TGCGTTCTGCTTCAATATCT +GGATCTAGTAAATGGAAACA +CCTGAAACAAATGCTTAGGG +GGATTCCCATGTACAGTTGA +GGCTGACGGTTACATTGTGG +TCGACCTATCCTTGCGCAGC +AAGCATTAAGCTCAGGAAAT +AGTCGCGATTTGTATTGGTT +TGGGCCGCAACTACGTATAA +ACGTAAAAAAATATAGTCCT +GCTTAACTCATACCTAAGCT +CTGGGACTGGGTATAATGTG +ATTATGGGACGCGGCGCGAG +GAAACCGAGATGTCACACAG +TGTCCTGAATTCTATTTTTC +CCAGCCACTTAAGTGAGGTG +AGGGACTGAGGGGCGCCTTG +ATGACGGCAGCAATAAACTC +AGTCGCGATTTGTATTGGTT +GACGTTCCTGGTATGTTAGC +AGCGGTCTGGAAACGTACGG +TAATCACGTTCTTGGTCAGT +CGTATGCCGTCTTCTGCTTG +AATGCCACCGGAGGCGGCTT +CGTATGCCGTCTTCTGCTTG +TATTAGGGGATCAGGGGAGG +GTAATTAATCTGTCACGGAG +ATCATCTTGATTAAGCTCAT +TCATGATTGAATCGCGAGTG +TTGATAGGCTACGTCGTCCG +TTTTTAGTGAGTTGTTCCAT +CGTATGCCGTCTTCTGCTTG +TTACCTTTATGCTATCTTTC +AATGCAAGGGACACTTAGTG +CATTGGGGATTGAGAAAGAG +TAAACATTGGGCAATTCTCG +CGTATGCCGTCTTCTGCTTG +TTGCATACTGACCAAGAACG +AGCCACTTCTCCTCATCCAA +AGACTGTTTGAAGTCGGAAG +CGTTTGAGCTTGAGTAAGCA +GTAGGCCGTCATAGGGATCG +CGTATGCCGTCTTCTGCTTG +TCATTCGGCTGTGTAAAGAA +TGCGTAACCGTCTTCTCGTT +AGTATGCAAATTAGCATAAG +TTCTTGATCAAAACCTTAGC +CGTATGCCGTCTTCTGCTTG +GAGGGGGGAATAACGATTCT +CCTAAGCCCTCTTCACCAAG +CAAACTCAGGCACACAAAAA +CCTCAGCAATCTTAAACTTC +TATGGTTCGTTCTTATTACC +GATTTTACTGAGCACTGACG +GCAACAACTGAACGGACTGG +ACAACAGAAAGTAGCAAAGG +CGTATGCCGTCTTCTGCTTG +GATTTTATTGGTATCAGGGT +AATGGCTCGACAGATGTAAT +CGTATGCCGTCTTCTGCTTG +ATTCAATTTACGGGGGGGAG +TAGTCCATTCGTCCTGGACA +CCGTCAAACTATCAAAATAT +TGTGTGCCTGAGTATGGTAC +TGAGAAGGCGTGAAGAAGGT +TAGACTCTAGAGGGAAAACG +CGTATGCCGTCTTCTGCTTG +TCAGGTTAAATTTAATGTGA +GTCCAAGAAAATCCACGTGT +TAGTGAGATAAGTCATTAAT +GAAACGCGGCACAGAATGTT +GAACGTGAAAAAGCGTCCTG +GACGTTCCTGGTATGTTAGC +CATTTGGCGAGAAAGCTCAG +CCGATAACAATACTGTAGGC +GGCTTGTAACCGTGTTAGGA +TTGCCTTTATGCTATCTTTC +CAGAATTGAGTTATAGGTCA +CGAGGACACAAGTTTCAAGG +TGATTTATTGGTGCAGTTGT +TGTCAAAAACGATAAACCAA +CGAGCGCAGGATCCGATATA +TAATTGCAGGGGCTTCGGCC +TAGGTAAGATTGGCAGGCGG +GTGCGGGAGGAACAGACAAG +TGTAGCTTTAGGTGTCTGTA +AACCCTGAAACAAATGCTTA +GGCTTGTAACCGTGTTAGGA +AATTGATGCAGTAGCAGTAT +TTTATTGCTGCCGTCATTGC +GCTTCTGCTCTTGCTGGTGG +GGACGTCAATAGTCACACAG +CGTATGCCGTCTTCTGCTTG +TGCTCTTGCTGGTGGCGCCA +ATTGTTAAGCGGTAGTTGTG +CTGTCTCATAAGCTAATTGG +GTTATGCGGGGGCGCAAGCA +CGTATGCCGTCTTCTGCTTG +AATGTGTATGAAGTGCAATG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GAATAATGAGATTAGTAAAC +TAGTTGTTGCATACGTAATA +CGTATGCCGTCTTCTGCTTG +CGATTTACCTAATAGCAGTT +GGGGGGGGGGGGGGGGGGGG +GGTTGGAGCCCATGTTAAAA +TGTGAAGACGCCAGTCGTTC +GTGATTTATGTTTGGTGCTA +CGTATGCCGTCTTCTGCTTG +AGGAAGTTCTAATATCCCTG +AGACCCATAATGTCAATAGA +CGTATGCCGTCTTCTGCTTG +TCCATGATTACACCAAGCGC +TGGTTTTTAGTGAGTTGTTC +CTTCTCACACTAACAAAGAA +GTTTAAGATTGCTGAGGGTC +GTGTCCCCCGTTTTGCAACG +CACTTTCGGATATTTCTGAT +CGTATGCCGTCTTCTGCTTG +GCGGAGCAGTCCAAATGTTT +GAGCCGCAACTTCGGGATGA +GGGTTTACTACGAATTATAT +CCCCTCAAGTAGTTTAGGCA +TTATCCATCTGCTTATGGAA +GAGGCGTATCGAAAGCGTGC +GACGTTCCTGGTATGTTAGC +CCCCTCAAGTAGTTTAGGCA +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GTCGGGAGGGTAGTCGGAAC +GTGCTTGTACTTATGAATGA +ATGGTCGGAAAATGTAAGGG +CGTATGCCGTCTTCTGCTTG +TCATAGCGTATAACTTAACA +TAGAGAAGATGGTGTCAGGG +GTTCCAGCAGATAATAATAA +TAGAGAAGATGGTGTCAGGG +CGTATGCCGTCTTCTGCTTG +AATCTTGGAGGCTTTTTTAT +TTTTATGATTTATGAGAAAC +TTGCCTGGATAGGTGTGTCG +GGCTTCGTAGAATCAAAGCG +TTGGATACGCCAATCATTTT +CGTCAGTAAGAACGTCAGTG +TGGGGTACGGCGAGATATCT +CGTATGCCGTCTTCTGCTTG +CGCTCTAATCTCTGGGCATC +CAGTGAGCCGATTTACAAGT +TAGTTGTTGCATACGTAATA +TAGGGTAGGTTCGGAGCATG +TTACCTTTATGCTATCTTTC +CGTATGCCGTCTTCTGCTTG +CCATTGTGGGATTGAATTAG +AAAGGAACTAATAATAAGTA +GATTCCTGTCCCTATGTAAA +GGTCGGCAGATTGCGATAAA +CGTATGCCGTCTTCTGCTTG +TGGCATAGGGGACGTTGGGG +CCATTGTGGGATTGAATTAG +AATCAGCTGATGCGCGTGAG +GATTACAATTATTTGGAGTA +GAAGTTATGATGTATCAAGT +CGTATGCCGTCTTCTGCTTG +CATAGAAACCAACAGCCATA +GTAATTAATCTGTCACGGAG +GCCGTTGCGAGGTACTAAAG +ACAAGTTGAGTTAAAGATGT +CGTATGCCGTCTTCTGCTTG +CCAGCAATCTCTTTTTGAGT +AAACCAAGCTGTCGCTACTT +GGATCTAGTAAATGGAAACA +GGCTTGTAACCGTGTTAGGA +CGTATGCCGTCTTCTGCTTG +TAGACGTCAGGAGCCCGCCG +AATGGCTCGACAGATGTAAT +CCCATGCCTACAGTATTGTT +CGTATGCCGTCTTCTGCTTG +CAGGCCAGGTCGTGATTAGG +CGTATGCCGTCTTCTGCTTG +CCAGCAGAGGAAGCATCAGC +ACCATAAGGCCACGTATTTT +AAGGTCTAGGAGCTAAAGAA +GACGTTCCTGGTATGTTAGC +CAGACGCCAGCTCCAGTTGT +GACGTTCCTGGTATGTTAGC +GTGTTGCATTTTGGGGCGGG +AATATCACGAAAATAGTCAC +GGCTTGTAACCGTGTTAGGA +CGTATGCCGTCTTCTGCTTG +AGTGGGTGTGATTAATGAAC +CATGACCAGATCGACTCGGC +TCGTCTGCGCCCGAGTGGAG +TGTTCTTCGCATGGTATACA +GTTCTCCGCCTGTACCCGCT +GATCAGATGAAAGGAGCATC +TTAAAGGGTAAACCTTCATG +GTCCGGTTAAAGCCGCTGAA +AGCTGCTTATGCTAATTTGC +GACGAAAAATGGTTTTTAGA +TAGTCCATTCGTCCTGGACA +TGATTGTCCAGTTGCATTTT +TAGACGTCAGGAGCCCGCCG +CGTATGCCGTCTTCTGCTTG +AGTTCTCGTAAACATAAGTC +ACGTCCGCTTACTGGTCTTA +TGAATGTAGGAACCTTTTTA +CGTATGCCGTCTTCTGCTTG +TCACACAGCGAATCTCTTCT +TAGAGAAGATGGTGTCAGGG +GCTATTAATCTCTTAATTAA +AATAACCTGATTCAGCGAAA +CGTATGCCGTCTTCTGCTTG +TGGAGGCTTTTTTATGGTTC +TGGAACTCGCAAGTGTTATT +TTGACTTATACCGATATTGC +CGTATGCCGTCTTCTGCTTG +CAGCGTTACCATGATGTTAT +TTAGTTGAAGTCGGAAGTTT +ATGGCAGCAACGGAAACCAT +ATAAAGTCATGAGCTGCAAG +AATGCGACAGGCTCATGCTG +CGTATGCCGTCTTCTGCTTG +CTTTAGGGTCTAGATTGAAC +CATGCACTCTCTAGCTAAAT +AAACGATAAACCAACCATCA +TACCAAGCTGGGTTACGACG +TGCGAGAACTGGGCACTTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GGCTTGTAACCGTGTTAGGA +GGGATCACACATGTTGCAGA +TGGGCAGCAGTGATGATAGT +TTTACCTAAGGAAACGAGAG +CGTATGCCGTCTTCTGCTTG +CACGCTGATTATTTTGACTT +CGCACAAGTTGCACTGTGGG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +ATTCCTTAAGAGGCCAACAC +AAAATGAGATGCTTGCTTAT +AGGTCATGCGGCATACGCTC +CGTATGCCGTCTTCTGCTTG +GAACGGACTGGAAACACTGG +TTGAACGACTGGAGACTTAG +CGTATGCCGTCTTCTGCTTG +AATTCTAATCAATAATCGAA +GATTACAATTATTTGGAGTA +GCAGGGCGTTGAGTTCGATA +GCTCAGGAACAAAGAAACGC +ACTACTGCTTGTTTACGAAT +CCAACATCACAGTTAGGGTG +GACTGGAAACACTGGTCATA +CGTATGCCGTCTTCTGCTTG +AAGGGGCCGAAGCCCCTGCA +CGTATGCCGTCTTCTGCTTG +GCGGTTATGGATGAGTCGCC +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TGATGCTCTTTTTGATTACA +CGTATGCCGTCTTCTGCTTG +GATACCAGAGATCTTTATCA +GCCATGCTCAGGAACAAAGA +CGTATGCCGTCTTCTGCTTG +CGCCTCCAAACAATTTAGAC +TGTCAACTGTGTATCTAGGA +CGTATGCCGTCTTCTGCTTG +AATCCCCAATGCTTGGCTTC +TGAGTTTATCTTCAGACTTG +ATCACTCCTTCTGCACGTAA +TGATGCTCTTTTTGATTACA +CCAGCCGCTTGTCTGGGGTA +GAGGATACGTGGTACTCATG +CGGTGCGATGGGGGTCTTGA +CCATTATTACCCACCGTTCA +AACGTCGCGTTGTAAAGTCG +CCATTGTGGGATTGAATTAG +GACGTTTGGTCAGTTCCATC +GGACGTCAATAGTCACACAG +TTTGTGTAATCTCACGTTGA +TCATAGCCAGATGCCCAGAG +ACAAATCTGCTCAAATTTAT +AATTTTGCGAAACTGAAAAA +GACGACCAATCTGACCAGCA +AAATTGGGCTTCTCAGGCGA +ATCTTGATTAAGCTCATTAG +CGTGCCAAGAAAAGCGGCAT +TAGTTCTTTTCAAGATGTGT +CCATTGTGGGATTGAATTAG +CGTATGCCGTCTTCTGCTTG +TGAATTCGGGACCAACAGGC +CGGTGGATAGGACTCCCTCT +GTTCATCCCGTCAACATTCA +TGGGATTATCATAAAACGCC +GTTTTGGTCTTTAATGATAC +GAATTAAGAACATGTGCAGC +AGGTCCATATCTGACTTTTT +CGCAAGCATTCAAAAGGTGG +TCTACGTTGAGAGTTATGCA +TATAGGTCTGTTGAACACGA +ATTAAATCGAAGTGGACTGC +CGAGGTACTAAAGGCAAGCG +TACTCAGGCACACAAAAATA +GCTGTTGCTTGGAAAGATTG +CAGAATGAGCCGCAACTTCG +TGGCTGCTGAACGCCCTCTT +TTCTGGTTTCTTACAAGTCA +CGTATGCCGTCTTCTGCTTG +GACGTTCCTGGTATGTTAGC +AAACTGGGTCTGTAGAGCAT +CGTATGCCGTCTTCTGCTTG +TAACCTGTAGACAGCATGAG +CTCTACTCAAGAACTCCAGG +AATATGGTAAACGTCTTAAT +CCTATGGGATCATGTCAGCG +CGTATGCCGTCTTCTGCTTG +AAACGCTGAATAGCAAAGCC +AATGGGAGGAAACTTAAATC +CGTATGCCGTCTTCTGCTTG +CATAAATCACCTCACTTAAG +AGGGACGGCCTCATCAGGGT +TAGGAGTTATGATGGGGCAA +TAGTAAGTCCGACCCGCTCA +ACAATAAGGTTAACGAGGAC +CGCACAAGTTGCACTGTGGG +TGGATAGTTTTAGGGGGAGT +GGTCCAATGCAGGTATAACA +TAGGAGTTATGATGGGGCAA +GTCCCTCATCGTCACGTTTA +GTGCCTCTGAGTCGCTAAGC +GTAATTAATCTGTCACGGAG +GAGCAGATTTGTCGTCACAG +AGAGGCGTTTTATGATAATC +GAGTAGGGCGGTGAGTAGGA +GACTTATACCGATATTGCTG +TTATACTCATAGATTACGTG +GCTCAGGAACAAAGAAACGC +GATTTTGTGTGAAGAACAGG +CGTATGCCGTCTTCTGCTTG +ACGCTATTAATTTGCTGCCA +GATACAAAGAGGCCTATCGT +GGCTTGTAACCGTGTTAGGA +TTATAACTTCATAGGTGCGA +GGAATAGTCAGGTTAAATTT +TTAAAGGGTAAACCTTCATG +CGTATGCCGTCTTCTGCTTG +GGACTGCTGGCGGAAAATGA +ATCATAGCCAGATGCCCAGA +CTACCAGTTATATGGCTGTT +AATAGTCACACAGTCCTTGA +AACAATTTAGACATGGCGCC +CAATTAAAAGTATTCAGGGG +CGTCGTTAGGGTATATATAC +GACTGTAACGATACGAGCGG +GTAAACGCGAACAATTCAGC +AGGTTGGTAGATTCGGGTTG +TCTGTTGAACACGACCAGAA +ATACGTGGCCTTATGGTTAC +TTAACCGACTACACTTCGTT +CGTATGCCGTCTTCTGCTTG +TTGTTATAGATATTCAAATA +GTCGAGTTCATTCCGTGATG +GCGGCGGCAAGTTGCCATAC +CGTATGCCGTCTTCTGCTTG +TTTGGGGCTATAAAGGATGG +GTTTTATGATAATCCCAATG +CGTATGCCGTCTTCTGCTTG +TGCTGCTTCTGACGTTCGTG +GTCATGACCCCCCACGTAGG +CGTAGTCTATAGCGTCTGAA +TTCAGTATCGAGGTGCCGTG +TAATCCGGGTTAAGCTGGCT +CGTATGCCGTCTTCTGCTTG +TGTATTTACCATAATTCGGT +TAGTTGTTGCATACGTAATA +CGTATGCCGTCTTCTGCTTG +GGATTCCCATGTACAGTTGA +TGATGCTCTTTTTGATTACA +AGTTGTATGTGCAAGTAGAA +CGTGTCATTATTGTGGAAGC +CTGGTTGGTTGTGGCCTGTT +AAATCTCACGAAAGCAAATC +CGTATGCCGTCTTCTGCTTG +ACGTTCCGTCTGAAACATCA +TGGTTTGGTCTAACTTTACC +AGCGATAGTCGCAGTATTAA +CATCATGGAAGGCGCTGAAT +TATTTTCGTGATATTGGTCG +CGATAGTCATGTACAGTTGA +CCTTTCGCATCGGAGATGTG +GAAGTAAAAGACATCCGTTA +GCCTAATCGCGTCAATTTGG +TATAATCGGAATCGGAATAA +CGTATGCCGTCTTCTGCTTG +GGAATCTATCAGGTAAGCCG +CGTATGCCGTCTTCTGCTTG +GTGTTCAAGATTGCTGGAGG +GATTCAATCATGACTTCGTG +CTGCTAAAGGACTTCTTGAA +TCACCAGAACGGAAAACATC +AAAGTCATGTGTGTGTTTAA +CTGAATGCAATGAAGAAAAC +CGTCAACATTCAAACGGCCT +GCATGACAAGTAAAGGACGG +AACTGCGTAACCGTCTTCTC +CGTATGCCGTCTTCTGCTTG +GGTAAAAATTTTAATTTTTG +TGTATCGTAAAACTCGTTAC +CGTGACATTCAGAAGGGTAA +GTTTACAAAAATTGACATGT +CCCAAAACATCGCCCAAATA +TGCGGATGGACAGATTCTCG +TTTGGATTGCTACTGACCGC +GCCGAGTCAGTGGTGTAAAT +ATCGTGAAGAGGTTTTAATA +GCGCTGCCTACAAAGGAGGC +CTTGAATGCCACCGGAGGCG +CGGCATCAAAAGCAATATCA +CATTTGCGATTGATGCTCTA +GTTCTTGTCCATTTGCAATT +AATGTTTTTGAGATGGCAGC +ACGTGCGTCAAAAATTACGT +TGTATTTACCATAATTCGGT +CGCACAAGTTGCACTGTGGG +CGTATGCCGTCTTCTGCTTG +TAGTAAGTCCGACCCGCTCA +TGTCCCGGTATTAGCGTGGA +GGAGAAAGCGTAAAGATATC +GGAGTAGTGTCGCTAACGGG +CCAGCTTATCAGAAAAAAAG +ACAGTATGCGGCCGGCTGGG +AAAGTTTTGGTTACGCAGAG +CGTATGCCGTCTTCTGCTTG +CAGAAGAAAACGTGCGTCAA +AAGGATGGTGTTAATGCCAC +CAAGCTTTGTCTTCCCGGGC +CGTATGCCGTCTTCTGCTTG +TGCGAGAACTGGGCACTTTG +TAACTTCTCAGTAACAGATA +GGGTTTACTACGAATTATAT +CCAACGCCAGGCAGAAAGTG +AAGTGGACAGCGGGTTTATG +ACTCCTAAGCAGAAAACCTA +ACTATAGACCACCGCCCCGA +ACAAAGACTTAGGGATTGTG +TAGCGTGTGAATAAGTGAGA +GCACTCCGTGGACAGATTTG +TACGAGTTCTAGACCTCAAA +CGTATGCCGTCTTCTGCTTG +GATTATGCGCCAAATGCTTA +TGGCATAGGGGACGTTGGGG +CAGTTAAATAGCTTGCAAAA +AAGGTCTATTTAAATAAAGG +CGGAAAACGAACAAGCGCAA +TAGACTGTGATCTTGAGGTG +CGTATGCCGTCTTCTGCTTG +GGAACAAAGAAACGCGGCAC +CAGTTGAAGTCGGAAGTTTA +TTAAAGGGTAAACCTTCATG +CTAATAGAAAAAAGCTTGGC +TCGCTTTCCTGCTCCTGTTG +AGTGGAACAGTTTTATGAGA +CGTATGCCGTCTTCTGCTTG +GGAACATTAGAGCCTTGAAT +CTAGGAGTTTATGTGACCTG +CGTATGCCGTCTTCTGCTTG +ACAAGTTGAGTTAAAGATGT +CCGTACCCATGAAGGATATC +CGTATGCCGTCTTCTGCTTG +CACGAAGTCATGATTGAATC +GCACAGAAAATCGTTGTAGA +GGACTTCCAAGTAGTGAGGA +GCGAGTCATTCCCACCGTGA +GGTACACAATCAGCACGTTG +TCATAGAGATATGTAAAGGA +TGTTCTTCGCATGGTATACA +CGTATGCCGTCTTCTGCTTG +CTGCCAATGCCCGACAACAA +AAAATACGTCTTACGTCAGT +CAGGCCGCAGGAAAGTCACG +CGTATGCCGTCTTCTGCTTG +GTGAAATGACTTCTTGCAAA +TGAATGTCGAGAATACTGTA +CTACCTGTAGGAAGTGTCCG +CGTATGCCGTCTTCTGCTTG +TAAACATAGGATATGAAGAA +GTAAGCCTAGACCCACAGGT +CGTATGCCGTCTTCTGCTTG +CCTTATGGTTACAGTATGCC +GGAGATTGATGAGACCATAG +CATTTCCGGGACAAGGATAA +GGTACGATGTCAGCGAAGTA +ATCACTCGGGGGTTTCGACG +GCCAAAACGTCGGCTACAGT +TTTGGTTCTAATATATATAG +CAGGCTTCTGCCGTTTTGGA +CGTATGCCGTCTTCTGCTTG +AACCTACCGCGCTTCGCTTG +TGATGCTCTTTTTGATTACA +CGCGTCATTATTGTGGAAGC +GCTGAATCAGGTTATTAAAG +ATTGCAAAATGTTATTAGCG +GTTTCAAAGGTCATGACGTC +TAATTATACAGTAATAATAA +GGGGGGGGGGGGGGGGGGGG +ATTAACTTCTCAGTAACAGA +TAGTCCATTCGTCCTGGACA +ATTTCTGATGAGTCGAAAAA +CAGCACCAACAGAAACAACC +AGCTGGGTTACGACGCGACG +TTAACCCTGTGTAATCTGAG +AAGCGGTTCTCCACGTGCAA +AGCTAATGGCCGTCTTCATT +AAGGATGGGGGGACAAACAT +TGATGCTCTTTTTGATTACA +TGAAGCGGATAGATGGTTGG +ATTATGACCAGTGTTTCCAG +ATCGTGAAGAGGTTTTAATA +CGTATGCCGTCTTCTGCTTG +AGCTTTATCAATACCATGAA +CACGTATTTTGCAAGCTATT +TCCGCATAAAGTGCACCGCA +CGTATGCCGTCTTCTGCTTG +ATATGAATTTGTTGACTGGG +TAAATGAACAGATAAGATAG +AAGGATGAATAAAAAGTCAG +TAAACGCGAACAATTCAGCG +GAAGGTGCGAGTCGGCATGC +GAGTAAGCGTCAACTGATGG +GACGTTCCTGGTATGTTAGC +AGAGTCCCAAAGTTTCACAG +GGAGGGTGTCAATCCTGACG +GATTGAATGGTGGGACAAAT +TGCTTACATAAATTAGCGAC +CAAGCTGCTTACGAGTAAAT +GGCTTGTAACCGTGTTAGGA +ATATCGGTATCAGGACATTG +AAGAAACGCGGCACAGAATG +GATGAATAAAGTGATGGAGA +TGTGTGTTGCCCAGAAGGAG +TCATGCCCTTTTTTTAGGTA +CGTATGCCGTCTTCTGCTTG +AAAAGTTCATCTAAGAGTCC +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GCGATTATTAGGAGATGTTA +TGCTTGGCTTCCATAAGCAG +CGTATGCCGTCTTCTGCTTG +ACTGAACAATCCGTACGTTT +TTCTGAACAGCTTCTTGGGA +CCGAGACCACTAGCGTACAT +TGTTTATAGGTCTGTTGAAC +TTTGCGAAAAAGAGGAGACA +AACCATCAGCATGAGCCTGT +AATTAGGTGGCGAGAGAGTT +GACGTTCCTGGTATGTTAGC +TGATGCTCTTTTTGATTACA +ACCGTTTATCGCAATCTGCC +CGTACGGGGAAGGACGTCAA +GACGTTCCTGGTATGTTAGC +CGTATGCCGTCTTCTGCTTG +ACCAGCACGCTCCCAAGCAT +ATGAATGCAATGCGACAGGC +TTTATCGAAGCGCGCATAAA +TGGGCTGCCTTCGCGAATAC +CTGGCCTGTAGCGGGGTTGA +GACGTTCCTGGTATGTTAGC +AAGAAGGTGATAAGCAGGAG +GGATCTAGTAAATGGAAACA +TATGTTAGAGTCTTTAGGAT +GTCGAAGCTTGCCACAGTTG +CGTATGCCGTCTTCTGCTTG +GTTTTGGGATGGTCTGGGCT +TTACCTTTATGCTATCTTTC +TATGATAAAAGAGAGATAGG +GATGGCGAAAGGTCGCAAAG +GCAAAAGTTGGTGCACTCTA +CGTATGCCGTCTTCTGCTTG +GTCGAAGCTTGCCACAGTTG +AGCTTGCAGACCCATAATGT +GGCTTGTAACCGTGTTAGGA +GACGTTCCTGGTATGTTAGC +GATGGCGTTCAATAGGAGGG +GCTGACCAGCATTAAAATAG +CGTATGCCGTCTTCTGCTTG +GAAGGAAGCACCGAAGTTCG +TTTCTCACGGACAGGGTTAG +GACGTTCCTGGTATGTTAGC +GCTGCCTGAGCAAAAATAGA +CGTATGCCGTCTTCTGCTTG +TAGTGATCAGAGGTGACAAT +TAACCCTAATGAGCTTAATC +CCGGCCATACAAGGGAAGTG +CGTATGCCGTCTTCTGCTTG +AGGCACGAAGGATTATTTCG +GGCTTCGGGTAGTTTGGGAA +TTCATGAAGGATGGTGTTAA +CTTGACGAACGTGCCAAGCA +CGTATGCCGTCTTCTGCTTG +CAGTTATATGGCTGTTGGTT +ACGCTCTCTGAGAAACATAA +ACTCAAGCTCAAACGGCTGG +TCCAGACCGCTTTGGCCTCT +ATCGTGAAGAGGTTTTAATA +AACGTGATTACTTCATGCAG +CCAATAAAATCCCTAAGCAT +CTATGGATTTAAACATCCTC +GTAAGGGCCCCCTGAGTTAT +GACGTTCCTGGTATGTTAGC +ACCCCGGTTTAGGCATGATT +AAACGTCGGCTACAGTAACT +GACAAATAATCTCTTTAATA +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +ACAAAGTTCGCTTCGGTGGC +CTGACGTTCTTACTGACGCA +TTGCCTGGATAGGTGTGTCA +TTCCTGCTCCTGTTGAGTTT +ACATCCTTCATAGAAATTTC +TAGTTGTTGCATACGTAATA +CTCTAATTTGTCTAGGAAAT +AGGTTTGTTCTCGGGAGGAC +ATTGTCGAATGCCCAGAGTC +GACGTTCCTGGTATGTTAGC +GATGCGGTTATCCATCTGCT +GAAGCCGATCAATGTAAGAA +CTTAGAGTCTAGAGTCTCCG +ATGGTCGGAAAATGTAAGGG +CGTATGCCGTCTTCTGCTTG +AAAAAAGTTAACACGGTGGC +TTTGCGGGTATTCCTCACGG +TATTAGGGGATCAGGGGAGG +ACTGGATTAATAGAAGTGTG +TGCGACCCTCGGCAGCAAGA +AGTTGTTATAGATATTCAAA +GCATGGTACAGTTGAAGTCG +GCAATAAGTCGTAACCATGT +GAATCAGGTTATTAAAGAGA +CTAACGGTATTATTTAGGGG +GTTTATCGAAGTAGTTGGGT +TCTAATAGTTATTGCCATTG +GGATCTAGTAAATGGAAACA +GGCTTGTAACCGTGTTAGGA +GGTACGATGTCAGCGAAGTA +ACGATTTCTGAGATCGGATC +CGTATGCCGTCTTCTGCTTG +ATAACAGGTTTTGCGATATA +TGAAGAAATAACATCATGGT +GTGCGTCAAAAATTACGTGC +TTGTTCCATTCTTTAGCTCC +GGCTTACAATCTAGGTAGTA +GTTATACCGGTGCATCATAC +ATCCATCTGCTTATGGAAGC +AAAATGATCACAATGACAAA +TCTTTCCAGAAATTGTTCCA +AGACTGTTTGAAGTCGGAAG +TGCGTCGTAATGTAAAGAGA +CGTATGCCGTCTTCTGCTTG +TAGTCCATTCGTCCTGGACA +GTTCTCCGCCTGTACCCGCT +CGTATGCCGTCTTCTGCTTG +TGACCGTACCGAGGCTAACC +ACGAATTAAGAGCAAGCTAT +TTTACGAATTAAATCGAAGT +GTAAGAAATCATGAGTCAAG +CGTATGCCGTCTTCTGCTTG +AAGTTGAGCATGCTAGTTGT +TAGTTGTTGCATACGTAATA +GTGGAGCACATATATATGAA +TACAAACTCATCACGAACGT +TGCGAGAACTGGGCACTTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +AGGGGAGAAGTCTTAGTACG +TTCCCCGTACGCCGGGCAAT +TTAGTCAGGCACATCTCGGT +GAGTTCGAAACTCTGGAGTG +AATCCCATGTCGAATTGACT +TTAAAGGGTAAACCTTCATG +TTATCCGCAAGGTTACTGAT +GCATCTTTGATACGGGTTTG +CGACTTCTACCACATCTATT +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +ACGTACAAATGATGGGCGAG +TATCAGGGTTAATCGTGCCA +TATGGCCGTCAACATACATA +TTATGTAGGGGTCAACAGGG +CTGTGATGACACAGTCTTAG +CTATCTGGTTAATGAGTTAG +TTTCATTGATACAGTAAGCA +TGGACAATCAGGTTGCTTAG +AAGGCTGAGAATACGGACGG +AATTGGGTGAAAATTGAGAG +AACCCTGAAACAAATGCTTA +ATCCCTCTGTATTTCGAGAG +CGTATGCCGTCTTCTGCTTG +GTAATTAATCTGTCACGGAG +CGTGCGTCAAAAATTACGTG +TGACGCAGGCCTGAGTATAG +ATAAGAGGTTTTACCTCCAA +ATGATCAGTCAGCATGCGAA +TTCTGTTTAGCCAATTTGTG +TTACAGAGCGGGTCGAGGTG +CTGCGACGCGCGTGTCTTAA +GTCTCTATCATGGGATGAGT +GTACTCCATGCTCTATATAC +TCTTCTGGACGACGAAACAA +GACGTTCCTGGTATGTTAGC +GACGTTCCTGGTATGTTAGC +CAATTTTGAGATTCGTAGGG +CACAAATTAGATATCAAAAT +GACTATTTTCGTGATATTGG +ACATACAATTGGGAGGGTGT +ATAATCTCGGAAACCTGCTG +TGACGCGTTGGATGAGGAGA +GAAACAAAGCAACGCGGGGT +ATTCAAACTTTTTTTCTGAT +CGTATGCCGTCTTCTGCTTG +AATTACGAGCGGATACATAG +CGTATGCCGTCTTCTGCTTG +CTCTTTTAAAATGTCAACAA +GGCTTGTAACCGTGTTAGGA +CCACTGGTCGGTGATAATAA +AGTGCACCTAAAAAATGGAC +TGCTGCATAAAAAGGGGCTA +GGAGGTAGATGGCACATAAG +TAGCGGGTATAAGGGAGGCG +CCATTGTGGGATTGAATTAG +CGTATGCCGTCTTCTGCTTG +AGCGCAGATTAGTAGAATAT +AAAATACGTCTTACGTCAGT +ATTACATCACTCCTTCTGCA +AGCAGGAGAAACATACGAAG +GCATCTTGAAACTGTATTTC +ACTGCCAGTAAGTATGGGTC +CTATGTCCTCTGAGGAGCCT +TCCCAGACCTTTCCCGGTCC +GCATCTTGAAACTGTATTTC +GCTTGAGTAAGCATTTGGCG +AGTACGTAATATCTTCACAC +CCCAGTGGGGACAGTGTACG +GATCAGAGGTGATGGAATGG +AACATTATTGCCCGGCGTAC +TGATGCTCTTTTTGATTACA +CGTATGCCGTCTTCTGCTTG +TTTCTCAATCCCCAATGCTT +CGTATGCCGTCTTCTGCTTG +TACGAAAGACCAGGTATATG +GAGTTGTTCCATTCTTTAGC +GGTGAAAGGTGATGCGCTCC +CGTCTACCGGGTACTTGGAG +TTTGAAAAATACTACCCATC +GCTGGTAAAGGACTTCTTGA +AAGGAGTTTAAGTTATATGT +CGTATGCCGTCTTCTGCTTG +GGAACTAAAAGTCCGGTATT +TTATCCGAAAGTGCTGCTGG +GTTTGTTGCCTACAGCCTGA +TGCTTGGGAGCGTGCTGGTG +TATATGGAGGCACTGTGTTG +GGTAACAGATTACCCGCTCG +AAAAAAAATATTAGTGAAGG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +ATTTTAGTAGTGGTAGGGGT +GGTCGTATGGTTCTTGCTGC +AATACCATTTCAACTACTCC +TGAGTAAGACGGATCCACTG +GAATTTACGGAAAACATTAT +AATGAGGTATAAAAGTCGCA +CTGATGAGGCCGTCCCTAGT +GTACAGGGATGAATCCTGAG +TGCCGAGATGCAAAATGAGA +TTGTAAATGTATATGTAATG +AAGTGAGGTGATTTATGTTT +CTTGCCGGGTACGTACAGGC +TTTAGCGGGCAATGGTGTGG +CGTATGCCGTCTTCTGCTTG +ATAATTAGGGCGAGCTATAA +CGTATGCCGTCTTCTGCTTG +GAACTGTGAAACATCTAAAA +CGTCTACCGGGTACTTGGAG +CCGTCAAGATATTTATTGTG +CGGAGTAGTTGAAATGGTAA +TAATCATGTTTCAGACTTTT +GAACGGAAAACATCCTTCAT +AAGCTCCATACTGTCGTTCC +CAACCAATCATCAATTTATG +GGCTTGTAACCGTGTTAGGA +AAAGGATATTTCTAATGTCG +GGTTTTTAGAGAACGAGAAG +GCACATAACCAGATAGATGG +CGTATGCCGTCTTCTGCTTG +AAACGGGTTGAGGCCTCAAG +GGTGGGGTTGGTGCGTCGTC +CGTATGCCGTCTTCTGCTTG +TATTAGGGGATCAGGGGAGG +CGTATGCCGTCTTCTGCTTG +CGATTTACCTAATAGCAGTT +GTGGTTTTTAAGGATCTAGT +CGCTGACAACCGTCCTTTAC +AATGAAGAAATAACATCATG +GGTCGTATGGTTCTTGCTGC +TGCGTGTAGCGAACTGCGAT +CGTATGCCGTCTTCTGCTTG +GAACGTCAGAAGCAGCCTTA +CGTATGCCGTCTTCTGCTTG +GTCCAAGAAAATCCACGTGT +GCAGCTCGAGAAGCTCTTAC +TCCTAAGCAGAAAACCTACC +ACGCCGTTGGCGCTCTCCGT +CACGAAGTCATGATTGAATC +ACTGTTAAGGTAACGAGGAC +TAAATTTCATGCCAGCGAGG +ATGCTCCGCTGGGGGTTCTT +ACTGCCCAAAACTTAGCCCC +GCATCTTTGATACGGGTTTG +CCCCCATATTAGAGAGACTT +CGCTACCTGTAGGAAGTGTC +TTACCTTTATGCTATCTTTC +TATGATAATCCCAATGCTTT +GCATTTATACAGGGATAGGG +ATTTTTTATGAAGACAAGAT +CGTATGCCGTCTTCTGCTTG +TAGTCCATTCGTCCTGGACA +GAGGAGAAGAATGTACATGC +TTACCGCTTCGGCGTTATAA +AAGAACGTGATTACTTCATG +ATAGGTGACAGACCTTCTCA +TTTCTCAATCCCCAATGCTT +ACTCCTAAGCAGAAAACCTA +CGTAAATTATAATTCATCAT +AAGTATGTCATTCACAATTA +GCGTGATATTGCTATCTTAG +CGCTCAAAGTCAAAATAATC +CAAATGTTTTTGAGATGGCA +CGATAAACGGTCACATTAAA +TGTTATAGATATTCAAATAA +GACGTTCCTGGTATGTTAGC +TGAGTGTTCAAGATTGCTGG +AGTTTGACCATCAGATCGTT +AAAATACGTCTTACGTCAGT +CAAACATATAAGAAAGTCTA +CCGACTGAATGCCAGCAATC +GAGGATACGTGGTACTCATG +TGTATTTACCATAATTCGGT +AATGTTAAATCAGAAGCACA +AGGCTGATAGAATTAGGGCT +TTCAGGCTTCTGCCGTTTTG +TCATTCGGCTGTGTAAAGAA +CGATTTACCTAATAGCAGTT +GAAGCTTACAGCTCAAACTG +GTAAACGGAAAAGTCCTTTC +TTAAACAGGCCCTGCTATTA +AATCCCATGTCGAATTGACT +ACAGGCAAAAAATTTAGGGT +GACGTTCCTGGTATGTTAGC +CGTATGCCGTCTTCTGCTTG +TATTTAAGAACGAAATAATG +ATTCTCCAGGTGCCCAGAAA +CGTATGCCGTCTTCTGCTTG +AGTAACAGAAGTGAGAACCA +AGGTTTACTAGCGGAAGGGG +AATACCATCAGCTTTACCGT +CGTATGCCGTCTTCTGCTTG +TGAATGCCAGCAATCTCTTT +GAGCTTGATGCGGTTATCCA +AATCAGGTTATTAAAGAGAT +GATACCAGAGATCTTTATCA +CGTATGCCGTCTTCTGCTTG +GGGGGGGGGGGGGGGGGGGG +CGTATGCCGTCTTCTGCTTG +CATAGTGCCATGCTCAGGAA +ATTTATGTATTCGAACTTCA +GATTCCCTACGGTAATAGTA +CGTATGCCGTCTTCTGCTTG +GACGTTCCTGGTATGTTAGC +CTTCCTATCGATGATATACG +CGTATGCCGTCTTCTGCTTG +CAACTGCTCTGCCCCTGGAA +GGTCCCGTTAACTGCTATAG +CGTATGCCGTCTTCTGCTTG +TTTCCATGCGGTGCACTTTA +GAACACATTAGAGGTGCATG +GACCCGTAATAGTGGTATGG +CGTATGCCGTCTTCTGCTTG +CCAGCCGCTTGTCTGGGGTA +GGTCTGTGATATGTTTAGAA +AATATCAACCACACCAGAAG +TAAACATAGGATATGAAGAA +GCAGGAGAAACATACGAAGG +AATGGCTCGACAGATGTAAT +GTTATAACCTCACACTCAAT +GACGACACTTCCGCTTAGTC +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CATCAGTGACGACATTAGAA +TTAGCCACTAAGATACTTCC +ATCAACAGAACTGAAAAAAT +CCATTGTGGGATTGAATTAG +CCTGATACCAATAAAATCCC +GTCGTATGGTTCTTGCTGCC +GGAGTGTCTACGCTGTTTTC +ATTGCATTCAGATGGATACA +GGAGCACATTGTAGCATTGT +GGATCTAGTAAATGGAAACA +GTAGGCCGTCATAGGGATAG +GACGGTTATTTCCTAGACAA +CGTATGCCGTCTTCTGCTTG +CCTTGCGCAGCTCGAGAAGC +CATTAACCGTCAAACTATCA +CGTATGCCGTCTTCTGCTTG +TGTTGAAAACATGTTTACAC +CTTACATTTGAGCTGAAGGT +AATGAGCTTAATAGAGGCCA +ACCGTCAGGATTGACACCCT +GGTTAAATCCAAAACGGCAG +GGTTTCCACGTTTAACCAGA +TAATCGAAAAGAGTACCCAG +TAGTCCATTCGTCCTGGACA +ATGGTCGGAAAATGTAAGGG +CAGAATCGAACTAAGTAACA +CTAAGTCACATAACTTTGGT +CGTATGCCGTCTTCTGCTTG +GGTCCAGCCTCATGATAAAG +CGCCATTAATAATGTTTTCC +AACCATAACGAGCATCATCT +ACATCCTCTGGGTTCGCCAG +TACACGCAGGACGCTTTTTC +CTTCGAGATATTGGTCACGC +GACGTTCCTGGTATGTTAGC +GAGGTATTTAGGGCTGATAT +GTACTCAATATAACCCTTTC +CCTGAGTATGGTACAGCTAA +CCGTGTTCCTAAGAGGCCGG +GGTTTCAAAGAGAGTGGAAA +CTACCGGAAATTTTTATTGG +GGTAAAACACGGGGGCCGTG +CCAGTTGCATTTTAGTAAGC +AGTGAGTTGTTCCATTCTTT +AGTGGGTGTGATTAATGAAC +TTCCAGACCGCTTTGGCCTC +CGTATGCCGTCTTCTGCTTG +CGTCCTGCGTGTAGCGAACT +TGTTCCTATCCGCAGTTTGT +GTTGGTGCACTGCTTGTGAC +AGGCGGTCAAAAAGCCGCCT +GGTCGCCTCCCCCTCTAAAC +AGTTGCTAGCCCGGTAACCT +ATGTTTAACTTCTATAATAC +TCATAGAGATATGTAAAGGA +AGTGGCCAGGATATTCAAGT +CTTCGCGTAGTCTGCTGGGG +GGATCCACACGGAGCCCCCT +CGTATGCCGTCTTCTGCTTG +CATTTCTACTCTTTCTCAAT +CGTATGCCGTCTTCTGCTTG +AGTGTTTCCAGTCCGTTCAG +CGTATGCCGTCTTCTGCTTG +TGAGCTTGAGTAAGCATTTG +CGTATTATGTTATAGGATTG +CATGGTGGCGAATAAGTACG +CGTATGCCGTCTTCTGCTTG +GCCAGTTGAAGTCGGAAGTT +GAATGGGTTAACATAATGGT +GGATCTAGTAAATGGAAACA +TAGATACCCCGTAAGCCGAA +GATCCCGGACTTTATTGCGG +TGTTCTTCGCATGGTATACA +TAGCTCTCATAGAGATGGGA +CTAATAGGTAAGAAATCATG +GCGTCGTAATGTCAAAGGTC +CGTATGCCGTCTTCTGCTTG +CTTTACCGTCTTTCCAGAAA +TCGGCCAGATTAAAGGATAC +CGTATGCCGTCTTCTGCTTG +ATCATACCTCAATAGAACAC +TAGCGGGTATAAGGGAGGCG +CCAATCGAGGCGTACCAGGG +TGACGCCGGATTTGAGAATC +TGTTTGGTGCTATTGCTGGC +AGAGTTTTGAAAAGCGATAA +CTAAGATGAGGCAAAGTTGG +AACGTTAGGATTTTAGCGCA +ATATTCTCTCGGACTTAGGA +ATGGTGCACGACGTCTGTGC +CTTCTGAATGTCACGCTGAT +AGCATCAGTGACGACATTAG +TCACGTTCTGGTTGGTTGTG +TAAATGAACAGATAAGATAG +GCTTAGGGATTTTATTGGTA +CTTCGCGTAGTCTGCTGGGG +TTGTTCATGGTAGAGATTCT +TTGGATTAAGCACTCCGTGG +CGGTTAAAGCCGCTGAATTG +CTTCCGAGTGAGCTTACGAT +CCATGTCTAAATTGTTTGGA +ACAACCAACCAGAACGTGAA +CAGCTTCTGCGCGGGGCGAG +TATAGACCACCGCCCCGAAG +CAGCAGTCCACTTCGATTTA +GAGGTATTTAGGGCTGATAT +TGAAGGCCGGAATTATAGTT +CGTATGCCGTCTTCTGCTTG +GTACTCCATGCTCTATATAC +CGTATGCCGTCTTCTGCTTG +GGATTCCCATGTACAGTTGA +TCGGTACAGTATCAGAGGTA +TGGATCAAGGGCTAAGGGAT +GCTACACGCAGGACGCTTTT +TTTCTCCCGGAAAAAACGGC +ACTGGATTAATAGAAGTGTG +TTAGTTGAAGTCGGAAGTTT +GTGCTTAGAGTGGGGGAACA +GTTGTGCAAATAAAAGATAG +TGTCTTCTAGGTGATGAAGA +GCCCTGCATACGAAAAGACA +TATTTGACATTTGGTGAAGA +CCTATCCCCTCTACAGCATG +ATACACGGTTAATAGGTTAG +CTCTTGGGGAAGTAATAAAC +ATACTGGCAGCAGCCAGGCC +GATCAGATGAAAGGAGCATC +CTTAAAGCTACCAGTTATAT +CCATTGTGGGATTGAATTAG +AACGTCGTTAGGCCAGTTTT +CGTATGCCGTCTTCTGCTTG +ACAACAGAAAGTAGCAAAGG +TCGGTAATTGCATCAGTCAA +CCTAACTTCTCCTCGCGCCA +TACTGTAGCCGACGTTTTGG +GGACTAGTCAAGCATGAACT +ACTAGTTTAACCCCCGAGGG +AAAGTCATGCCGCACTGCCG +GTTATGGTTTCCGTTGCTGC +AGCTTGCGGCAAAACTGCGT +TAGACAAATTAGAGCCAATA +GTGGTCCGTTAGCAATATAG +ATAGTCCGCTTTGCCTAGCC +ACTGCTCGCGTTGCGTCTAT +GTTTATCGAAGTAGTTGGGT +GAACTTAATCCACTGTTCAC +TGGTATTGGCTCTAATTTGT +AGGTTCAGTGTACCAGGGAG +CGTGATTACTTCATGCAGCG +CGTATGCCGTCTTCTGCTTG +AAGGATGAGTGTTCAAGATT +AGTCGCGATTTGTATTGGTT +GAGCTGCTAAATCAGAGTAC +AATGTTTATAGGTCTGTTGA +CGTATGCCGTCTTCTGCTTG +AGCTCAGTCTCAGGAGGAAG +GAATCTCTTCCAAGAGCTTG +TCTCTTAATTATTGTCATGT +AAATAAGCGAAACGAATAAT +AGGTTTGTTCTCGGGAGGAC +TTTCCGGAGTAATATCATGA +GACCCTCGGCAGCAAGAACC +TGTTGCGGGTGTGTGGGGGA +TACTACGAATGACATCCCGG +GATTTTGTGTGAAGAACAGG +CGTATGCCGTCTTCTGCTTG +AGGGTTGGGGAGCCATAAAG +GGGTCACTCATTGGGGTTTA +CGTTAATGCTGAAGGCTGAC +TACGAATTGGGTGTAGCTTC +TGAGAAGGCGTGAAGAAGGT +CGTATGCCGTCTTCTGCTTG +GGCTTGTAACCGTGTTAGGA +TGTTCTTCGCATGGTATACA +AAGGCGCTGAATTTACGGAA +GGTAGCGATTTTTGAATTGC +TCTGCAAAACGCGCTATTTA +GACCGTAAATAGGAGAAATT +GAGGGGGGAATAACGATTCT +TTACCTTTATGCTATCTTTC +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TGGAGTAATAGCAAAGAATA +GTAAAATACTGACCAGCCGT +TAGACATTACATCACTCCTT +ACCTGCCTTGTAAGTAAACG +TCCCAAGAAGCTGTTCAGAA +CGTATGCCGTCTTCTGCTTG +ATTGTTAAGCGGTAGTTGTG +GTAGGTTAGTAGCGTATGCA +TTTGAGTATATTGCTTCAGA +ACGCTTGCGGCCGAGTCGAC +TACACCGGTCCAGGTACTTG +GTTCAGCAGCCAGCTTGCGG +TCGGGGCGGTGGTCTATAGT +CGAGACGCTGACTGGTAATC +ATCCTTAAGAGGGCGTTCAG +TTAATTATAAGCGGGGTTAG +ATTTAAAATTCGTACCGGAC +CGTATGCCGTCTTCTGCTTG +CCCGAAGTTGCGGCTCATTC +ACTTCGAAGGTATGACAGTA +GTGTTCACAGAGTCAACATA +ATTGCTGGCATTCAGTCGGC +TGAGGCTTGCGTTTATGGTA +ACAAGTTGAGTTAAAGATGT +CGTATGCCGTCTTCTGCTTG +CGGAACCGAAGAAGACTCAA +CGATAAGGGGGACAAAGGGC +TCTTAGTGATCTAGATTTCT +CGTATGCCGTCTTCTGCTTG +AAGGCTCGACGATAATTTGG +AGGACTGGACGGATGGAAAC +AATGATACATCATATTAGGG +AGACTGTTTGAAGTCGGAAG +CGTATGCCGTCTTCTGCTTG +TGTTACCCTTAGGAGTAGTG +CTGCTCGTATGGTCTCGGAA +TTTTTCACGTTCTGGTTGGT +GTGTATGAGAATCCATACAA +AAAGTGTAAATGAGAGACGA +TGTTTGAATCATAAGCTTGG +AAAAAGCTCAATTGTACATG +AACAGGGTCGCCAGCAATAT +ATGTAATGTCTAAAGGTAAA +CTAACGATTCTGTCAAAAAC +CCATTGTGGGATTGAATTAG +GCGGAATTAGTTCAATATCA +TGGAGCTATAGTGTCGTAGA +TGATGCTCTTTTTGATTACA +CCATTGTGGGATTGAATTAG +TCAATAGTCACACAGTCCTA +AGGTATGGCCGGGATGTCGA +GATGGGAAAGGTCATGCGGC +CCGTCGCGTTCCAAATCCGT +CTTATCCATCGGTTGAGGGT +ATACGCTCGGCGCCAGTTTG +TAGTCTACGATCTTAATGTG +CGTATGCCGTCTTCTGCTTG +GATAATAAGATACTGTGAGG +ATCGGCAACAGCTTTATCAA +CTTCCGGTAGCTTATTCTGG +GATTTTCGCGTGTTGTGGTA +CGTATGCCGTCTTCTGCTTG +CACCATTACCAGCATTAACC +ATTAATCTAGCAACAAAATA +CGTATGCCGTCTTCTGCTTG +ACGATACCACTGACCCTCAG +TTTCCGCCAGCAGTCCACTT +CAACCAATCATCAATTTATG +GCACCTTCAAATAGGTAGAG +ACGGGATTAGACATGATTAC +TGCCATACAAAACAGGGTCG +CCACTGCAACAACTGAACGG +TATTTATCTCGATCGAGTTC +TGAAGAAAACAGGTGTCAAA +CAATATCACGAAAATAGTCA +ACATCACCTTGAATGCCACC +ACTGGATTAATAGAAGTGTG +GCGGGACGGCTGGCTAGCGA +CAAAAATTAATGCTAGTGGA +TGTTTTATACATGTATACTG +TGATTTATTGGTGCAGTTGT +AAAATACGTCTTACGTCAGT +GCGCAGCCACAGGCGAGACT +CTGGCGCTCGCCCTGGTCGT +TTACTATTCTAGACAAGACG +CGTATGCCGTCTTCTGCTTG +AATCAGAAAGAGATTGCCGA +TTTGGAGGTAAAACCTCTTA +GCGTCTGGATGTGCTCGGAG +ACCACTTGAAATATTCCATA +TTACCTTTATGCTATCTTTC +TATCTTGATAAAGCAGGAAT +TAAAAATGGTGATAGCAGAC +CGCCTTCATCCGAGAATCTA +ACGTCTAGTGGGGTTGAGGT +AAAGGACGGTTGTCAGCGTC +TGATGCTCTTTTTGATTACA +TGAGAAGGCGTGAAGAAGGT +CATCTTGGCTTCCTTGCTGG +CGTATGCCGTCTTCTGCTTG +GCTAAAGCTGGTAAAGGACT +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TTGGTTTTTAGTGAGTTGTT +GATACGACGTGCTCAGTGTG +TGATGCTCTTTTTGATTACA +CTGGATGACCGACGGGATCA +GACGATGAGGGACATAAAAA +TCTTGCTGTCGGGCCAGTTG +TTTGCTCTTCTTGGACACGG +GCAAATTTATAGAAATACCG +AAGTAGGATGCGGTCTAACA +CACAAATTAGATATCAAAAT +GCATACGAAAAGACAGAATC +ACTGAACAATCCGTACGTTT +CGTATGCCGTCTTCTGCTTG +GCGATGGGCATACTGTAACC +CAATGCCACAATGTAAGGAA +ATTATCTTGATAAAGCAGGA +AAACACTGGTCATAATCATG +CGTATGCCGTCTTCTGCTTG +ACTGGATTAATAGAAGTGTG +ATATCCGAAAGTGTTAACTT +CGTATGCCGTCTTCTGCTTG +AGACCCATAATGTCAATAGA +CGTATGCCGTCTTCTGCTTG +CGCCATGATGGTGGTTATTA +GCTTTCTCGCCAAATGACGA +TGATGCTCTTTTTGATTACA +CCCATGCCTACAGTATTGTT +GTTTGAATATTAGACATAAT +CGTATGCCGTCTTCTGCTTG +TAGTCCATTCGTCCTGGACA +CGTATGCCGTCTTCTGCTTG +ACTGTGATGTCGCATGAACT +AAATGCATAGCATTAAAAAA +GCTATCAGTATTTTTGTGTG +CGTATGCCGTCTTCTGCTTG +CGAAAAACAGCGAAGTCGGT +CGTATGCCGTCTTCTGCTTG +AAGTAAAGGACGGTTGTCAG +AGCCTTGCGACCCTCGGCAG +TCCATGCCGGAATCTACATA +CACCTTGTGAACACACACCA +GTTATACCGGTGCATCATAC +CAGGAGAGGCGACTTCTAGA +CGTATGCCGTCTTCTGCTTG +GCACACCCATCGTCGCATTC +CGTATGCCGTCTTCTGCTTG +GGTAGACAATAGCAAGGTAT +GGTGAAAGGTGATGCGCTCC +TCTTTAGAATTACGTATCAG +ACTCTAGGCTTTACGTAGAG +GGCTTGTAACCGTGTTAGGA +CGTATGCCGTCTTCTGCTTG +AATCTAAAATAAGAAGGATC +TAAGTGAATTGTATATAGAG +GTATGTTTATAGCGAGGGAT +TCTGTTAGTAGTGACGGCAT +ATATGTATGTTGACGGCCAT +TGGCGGCGCAACCTGTGACG +TGTATTTACCATAATTCGGT +AAGTTTCAAGAAAGAAAGAA +GGTTAAAGAGAGTCATAAAC +CTTGCTGCCGAGGGTCGCAA +CGTATGCCGTCTTCTGCTTG +GACGTTCCTGGTATGTTAGC +GTGGACTATCGCTATTGAAG +CGTATGCCGTCTTCTGCTTG +ATTACCAAGTTTGTTGGAGG +TTTTGTATCCTGCTGTTGCT +TAGACCTTTAGCAGCAAGGT +AGAACCATACGACCAATATC +TTGGTCATTGAGTTGAAAGG +TGTATTTACCATAATTCGGT +AACCCTACCGCCGAAGGTTT +TGCGAGAACTGGGCACTTTG +GCGCGGGAAATCTTAAGTTG +TACTTTAAGTGGCTGAGACA +TCACTCGAGGGAAAGTGCGG +AATGGCTCGACAGATGTAAT +CGTATGCCGTCTTCTGCTTG +ACAATTAACACAATCGCCAT +AACGCAGCGACGAGCACGAG +AATTCCACTCATAGGGTCCG +GACGTTCCTGGTATGTTAGC +TCCTTTTTCTGTCTTGTGCG +CGTATGCCGTCTTCTGCTTG +TGTCCCGGTATTAGCGTGGA +AAACTCCTAAGCAGAAAACC +GCATCTTGAAACTGTATTTC +GAGCCACATCAGAATGATTA +TTACAGAGCGGGTCGAGGTG +AGAAGCAGCATCAGTGACGA +CGTATGCCGTCTTCTGCTTG +AGTGGCTGGAGACAAATAAT +TATGGATAGACTGGTTTCGT +GTACAGAACTAAGCGGCGTA +CTAACGACGTTTGGTCAGTT +CTTTTTCTAATTAACATGAT +TAAGAGGGAAAGCTTTTAGT +GAGTTGAGCTAACTAGAGTC +TAAATGAACAGATAAGATAG +TGATGCTCTTTTTGATTACA +CGTATGCCGTCTTCTGCTTG +GGTACTCGCACCTCCCAGAT +TTATATAGTTTGCGAGATCA +CCCAAAACATCGCCCAAATA +ATAAATCACCTCACTTAAGT +GGAACTAAAAGTCCGGTATT +TTATAGTGTTGACAGCATGT +TTTGGAGGCATGAAAACATA +CGTATGCCGTCTTCTGCTTG +AGAAGCAGCATCAGTGACGA +TTTCCAGACCGCTTTGGCCT +AAGCTCATTAGGGTTAGCCT +TGGCACTTCTGCCGTTTCTG +CGTCATTTCAACAGTATAGG +CTAGATCGAGCGGCTGGAAA +TATTAGGGGATCAGGGGAGG +AACCGTCAGTCGGTTCATCG +TCGTCGAGGAATAATCCAGG +TCTGACGTCAGGTATAGTAG +ACTGGATTAATAGAAGTGTG +ACCCCGGTTTAGGCATGATT +GACCAGCAAGGAAGCCAAGA +GGTCCATATCTGACTTTTTG +CTGACCGCTCTCGTGCTCGT +ACCGCATGGAAATGAAGACG +TGCGAGAACTGGGCACTTTG +CGTATGCCGTCTTCTGCTTG +TTTCCGGAGTAATATCATGA +TAAACATCATTCGTCACTCT +GACGTTCCTGGTATGTTAGC +GTAATTAATCTGTCACGGAG +TCAGGCTTCTGCCGTTTTGG +CGTATGCCGTCTTCTGCTTG +CAGCAATCCGGCCGGAGGCA +TACTACGAATGACATCCCGG +GTACGCCGGGCAATAATGTT +ACAGGAACTGGATTCCAAGG +CGTATGCCGTCTTCTGCTTG +AGCGTGCTGGTGCTGATGCT +CGTATGCCGTCTTCTGCTTG +GTTACAAGCTCATAAGGGTG +CGTATGCCGTCTTCTGCTTG +TAGTCTACGATCTTAATGTG +TGATTGTACTGATGGAGGTA +CGTATGCCCTCTTCAGCTTG +AAACATCAGCAGGTAAATGG +GTTAATGGATGAATTGGCAC +GACGTTCCTGGTATGTTAGC +CGTATGCCGTCTTCTGCTTG +GACGACCGATTAGAGGCGTT +GAAGCCGATCAATGTAAGAA +TTTTGTATCCTGCTGTTGCT +AGGGGAAGTAGCATTTGCGG +GGGAGTACAGCCTCTCGCAC +GTTCGAGCAGGTTCGCGCCA +TTATCAGAGACAAGGCTAGG +AGTCGGGAGAGGAGTGGCAT +CGTATGCCGTCTTCTGCTTG +ATGAGGGACATAAAAAGTAA +TTTGGCGACGTCGACGAGTA +CCATTGTGGGATTGAATTAG +TAGTTGTTGCATACGTAATA +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GCGAGCGCCAGAACGTTTTT +AGTAGCGACAGCTTGGTTTT +GGCTTGTAACCGTGTTAGGA +TGTCCCGGTATTAGCGTGGA +TGCTGATGCTTCCTCTGCTG +CCACCGCCCCGAAGGGGACG +TGATTGAATCGCGAGTGGTC +GTGCCAAGAAAAGCGGCATG +TGTTGCAGTGGAATAGTCAG +AACATATAGGATAAAAATTG +TTTACCCTAGCTTTTACACG +GGATCCACACGGAGCCCCCT +CTACAGTAGAGTCAATAGCA +AATATTAGACATAATTTATC +GAGCCAATACCATCAGCTTT +AATCCCATGTCGAATTGACT +GCACATCACCTTGAATGCCA +NAGTTATGTGTTATTTCTAT +CCGGCGTACGGGGAAGGACG +AGGATCACTGGGTCATTGGA +CTTAGTTGCATGAGCATACA +AAGTATAAGGGTGAGTATGG +TCTAAAGTTGGGCGTTAGGA +GAGCTTTCTCGCCAAATGAC +GTCGTAACCCAGCTTGGTAA +CGTATGCCGTCTTCTGCTTG +TCGACAAATTCGATGTTAGA +CCATTGATAAAATCGGAGAA +TGGCCTATTTGATCGGAGGC +AGGTCTAAAGCTATGGTACT +GTAGCCCATAATCAAAGTAG +TGCCCATTGAGCGTGGATGA +GGAGTATTGTGTCGTCCAGG +AGATGGGGGCAGTATATGAG +GGAAACACTGGTCATAATCA +GCTGTAGTAGCTCGTATTAA +CCAGCCGCTTGTCTGGGGTA +TGATGCTCTTTTTGATTACA +TGGTATTCCCATCAAGGCAA +TAAATCACCTCACTTAAGTG +AGATTGCTGGCATTCAGTCG +TCTCTTTCTGATTGTCCAGT +TTTATGGTACGCTGGACTTT +TGGCATAGGGGACGTTGGGG +TGGATCAAGGGCTAAGGGAT +GCGACTGTACAATAATGTAT +GTTATGCGCCTTCGTATGTT +GCAAACATCCGTGGAGCCGA +TAATAGGGTTAAACGAGGGC +AACGATCCGGCTAGCCTGGA +AAATTTCACGCGGCGGCAAG +TGCGCTTTCGAGGGTCGTTG +TAACACGATACGGGGCCTGC +AGAAAACCTACCGCGCTTCG +CGTATGCCGTCTTCTGCTTG +GAGGGCAGAAATAGTAACCG +CCTGTTTGGTTCGCTTTGAG +CGTATGCCGTCTTCTGCTTG +GATACTCGTGATTATCTTGC +TAGTTGTTATAGATATTCAA +CTTCGCGTAGTCTGCTGGGG +AGTCGCGATTTGTATTGGTT +GACGTTCCTGGTATGTTAGC +CAAGCTATTTAACTGGCGGC +CGTATGCCGTCTTCTGCTTG +AGGGACTGAGGGGCGCCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TTGGAGCTCGACATCGTTTG +AATTTAGGGTCGGCATCAAA +CGCCTTTACGCTTGCCTTTA +CGTATGCCGTCTTCTGCTTG +CTGTCCCACAAGCTGATAGG +TTCTGCGCCCCATTGGCATA +CTTATTACCATTTCAACTAC +CGTATGCCGTCTTCTGCTTG +TGATGCTCTTTTTGATTACA +CGTATGCCGTCTTCTGCTTG +TTTAGCGGGCAATGGTGTGG +TGTGGTTTTCGGAGTAAAAG +TGGAATCAGGGCCGGCGAAG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +AGATTAGAATGGGTTAGGAT +AGGTCTGCCGAGCGGTACTG +CGTATGCCGTCTTCTGCTTG +GATTTTGTGTGAAGAACAGG +CATGACCAGATCGACTCGGC +CGAGTGGGGTGTAGCGAGAG +CCATGTCTAAATTGTTTGGA +GTTCTATTGTTCTATCTGAG +GAGCAGAAGCAATACCGCCA +CAGATTTAATACCAGCATCA +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +AATGGCTCGACAGATGTAAT +TTTAGTCGCAGTAGGCGGAA +CGTATGCCGTCTTCTGCTTG +GTGTACATAGATTATTGGGC +CATCGCAGTTCGCTACACGC +TGCCCATTGAGCGTGGATGA +TGAATATCTATAACAACTAT +TGGCTTATCATTAACTGAGA +AGTAAGAATGCAGCCACATT +CGTATGCCGTCTTCTGCTTG +CTGAGGTTGACTTAGTTCAT +CGTTCGTCAAAACCGGTGCA +TAAGATGGGTAGGTTGATTA +CGCTGCGTTGAGGCTTGCGT +ACGAATTAAGAGCAAGCTAT +CTGCAATTAAAATTGTTGAC +CGTATGCCGTCTTCTGCTTG +GATCTTGAGGACTGGTCCAT +TTAGTGTATAATTGAGCAGG +GATTATTTAAAATACATTGT +ATGTATAGTACATGATTACA +ATAATCTCGAAATGTTAGTT +CGACTGCCTATGATGTTTAT +GCGCGTTCTTGCAAATCACC +CTCAGGAGGAAGCGGAGCAG +AAACGCGAACAATTCAGCGG +TCATGTTACTAGACAGGCCC +GGATCTAAAAACATATACTG +GTAGTGCGGATTGATCTGCA +CCAGAAATTGTTCCAAGTAT +GAGGTATTTAGGGCTGATAT +TATGGTGAACAGTGGATTAA +GAGGTATGGATGTACTAAGC +CGTATGCCGTCTTCTGCTTG +GATAGGATACATTGGCTGTA +CGTTTTACCCCCGGCGGGTA +CTGGTGTGGATGACACAAGG +TGCTAAGACAACGGTATACA +TTGAGAGGACAGTTGATGGA +GATTGGCGTATCCAACCTGC +TGCGAGAACTGGGCACTTTG +GCTTTTTGGTACGCAATGGT +CAGTAGTGTTAACAGTCGGG +AGACTGTTTGAAGTCGGAAG +CGTATGCCGTCTTCTGCTTG +GCAATAAACTCAACAGGAGC +CCTGCGGCGACGGTGATTAG +GTGGGATGTGACTTTATCTA +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TCGCCCTGGTCGTCCGCAGC +TTGTCTTCAGAAGCATATAG +TGCATACTGACCAAGAACGT +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +ATTGTTAAGCGGTAGTTGTG +AGGCACTGGGTGAGGTGTAG +AATTGATCAGGTTAGTTCGC +GAGAGGAGTGGCATTAACAC +AGTGTGAGGTTATAACGCCG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +TAAATGAACAGATAAGATAG +TGGTCAACAATTTTAATTGC +AGCACCAGAAACAAAACTAG +TTGGTTTCGCTGAATCAGGT +ATAGAAAACTTACCGGCCGT +AGCGCAGATTAGTAGAATAT +TGAGTAAGACGGATCCACTG +TAGTCCATTCGTCCTGGACA +GGTTTGTGTGTGAGTTTGAT +TATAGATTAGGGTATGTTGG +GTGCTGGTGCTGATGCTTCC +CGTATGCCGTCTTCTGCTTG +TCATAGAGATATGTAAAGGA +TAGGAAGTGTCCGCATAAAG +AACTCGGAGTATTAGATTGG +TCATGAACTTAATCCACTGT +TGATGAGGCCGTCCCTAGTT +TAGCGAACTGCGATGGGCAT +CATCTGGATAAATCTTTTGT +TGCTCACAATGACAAATCTG +TCTTCGGGCATGGTCGGCAC +CGTATGCCGTCTTCTGCTTG +AAAGCAATATCAGCACCAAC +CGTATGCCGTCTTCTGCTTG +TGTCCCGGTATTAGCGTGGA +CGCGGCGGCAAGTTGCCATA +GGTGTACGGGATATCGCGAG +CGTATGCCGTCTTCTGCTTG +CGTTTATGACAAGGATAAAG +CGTATGCCGTCTTCTGCTTG +AGTGGCTCCTTGGACAAGTC +AGACGTGCTCAGATGAGTGA +TCTTATTGCGGAAATTGGGA +TGTTACCCTTAGGAGTAGTG +TCTAAACCAGTCCTTGACGA +AATTACATCATGGTATGCAG +GTGAAATGACTTCTTGCAAA +CGAACTCAACGCCCTGCATA +GAGTTGAGCTAACTAGAGTC +CGTATGCCGTCTTCTGCTTG +CCTAAGTCTAAGTAGGGGAA +CGGTGGATAGGACTCCCTCT +TTTAAGGGTAACCTTGGTGT +ATAGATTTAATTCGGGGGCC +CTTAGTTGCATGAGCATACA +TAGCATCAACAGGCCACAAC +ATCAGCACCAACAGAAACAA +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +CAAATGCTCTATCTAAAATA +TGACGATGAGGGACATAAAA +TCCAAACTTTGTTACTCGTC +CGTATGCCGTCTTCTGCTTG +CAGCTTGCAGACCCATAATG +TGGCTGCTGAACGCCCTCTT +GACTCGACGGAGTGGCGGGG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GACGTTCCTGGTATGTTAGC +CGTATGCCGTCTTCTGCTTG +CTGTTCTTAGCGGGAAACAG +TTCATATGTATACTGTGGTC +TTGTATGAAAAAAACAAGTT +AAAATGTCAACAAGAGAATC +CGTATGCCGTCTTCTGCTTG +CGGGTGTAAATTGCGGTGGT +GCCCCTATGTGATGGGGGAT +ATCATTTTACTAGACATGTT +GGTTGGTTGTGGCCTGTTGA +CGTGTGAATCATTAGCCTTG +ACTGTTACCTCTGAAATTTG +CATTGCATTCAGATGGATAC +GCCGTCAACATACATATCAC +GCTAGTTTTACAGACGTTAA +GAGGCGTATCGAAAGCGTGC +AATAACCCTGAAACAAATGC +CTCGAGCAAACCCCTGAATG +CAAATTCTTAGGGATTTTAT +TATAAACATTCTGTGCCGCG +GAAATGCCACAAGCCTCAAT +AAAGAGATTATTTGTCTCCA +CGTATGCCGTCTTCTGCTTG +ATTAAGGATGAGTGTTCAAG +GAAAACGAACAAGCGCAAGA +CTCGAGCAAACCCCTGAATG +CTGCGCTTAGATCCGTAAGA +GGTATCGTTATGCGCCTTCG +CCATTGTGGGATTGAATTAG +CGTATGCCGTCTTCTGCTTG +TTCGTATGCAGGGCGTTGAG +CGTATGCCGTCTTCTGCTTG +CTAATTTTGGTCGTCGGGTA +GACGTTCCTGGTATGTTAGC +CCAGCCGCTTGTCTGGGGTA +GAGATGGCAGCAACGGAAAC +CCATTGTGGGATTGAATTAG +CGTATGCCGTCTTCTGCTTG +TGTCTTAATGTAGAGTTAAG +CAGGAGAGATGGACGGCATA +CGTATGCCGTCTTCTGCTTG +AAGCTGGTTCTCACTTCTGT +GCTTAACTCATACCTAAGCT +AAACTGACGCGTTGGATGAG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GGTTTGTATTTGATCCATAT +AAGGCCACGACGCAATGGAG +TTCATAGTGGAGGCCTCCAG +TTAGTCATGCGGGAGGAGCA +CGTATGCCGTCTTCTGCTTG +ATCCTTCATAGAAATTTCAC +TGATTTGCAGAATTTATTGA +CGCCCCGAAGGGGACGAAAA +GCTTTCCTGCTCCTGTTGAG +ACTGAGATTTAATGGTCTTT +GAAAACCACCATTACCAGCA +CAGTATGCAAATTAGCATAA +GGATAGGTTATAAGTATAGG +GGGTGTGCGTGCTCTGAGGC +AGGCTGAAGAATTCTGACAC +CGTATGCCGTCTTCTGCTTG +GATGTATCCATCTGAATGCA +TGATTTATTGGTGCAGTTGT +AGGAATGATTGAAGAGGGCG +TCACTACGAGTTGATTGCCC +GTGTGACGATAAAAGAATCA +AGCGACGAGCACGAGAGCGG +GGACTAGTCAAGCATGAACT +TATAGCAGGAACGATAATTG +ACCCCGGTTTAGGCATGATT +CGTATGCCGTCTTCTGCTTG +CCTGTAAGTAACGCTATAAA +CGTATGCCGTCTTCTGCTTG +TGAGACTACTTTCGTGAAGT +GAGGTATTTAGGGCTGATAT +TTGCGTGTACGCGCAGGAAA +CCCCTGGCAAGAATAGTCGA +CGTATGCCGTCTTCTGCTTG +GGCTCCTATTCTCAGGTTAG +AGAAGACCAAGGAAGGAGAG +CGTATGCCGTCTTCTGCTTG +GTTAGTCAGACAATGTAAAG +TGTATCGTAAAACTCGTTAC +GATCTCGTTTAGCTTGGTCA +GCTTAACTAAGCCAGAATGA +GGTAAGAAATCATGAGTCAA +GGCTTGTAACCGTGTTAGGA +GCCTTCGTATGTTTCTCCTG +CGCGTTGGTTATGGTGCACT +CCCACCAAATGTCGTTAGAG +CGTATGCCGTCTTCTGCTTG +CGTATGCCGTCTTCTGCTTG +GCAGTAAAGGTTGACGAGCC +GACTGCTGGCGGAAAATGAG +CGTATGCCGTCTTCTGCTTG +TGTGACATTTGACAAGTATG +GGAGGTCAATGAGATAGTAA +GTGTGACTATTGACGTCCTT +CTCGAGCAAACCCCTGAATG +ATACACGGTTAATAGGTTAG +CAGCAATCTTAAACTTCTTA +CGTATGCCGTCTTCTGCTTG +TCTGACGTCAGGTATAGTAG +CGTATGCCGTCTTCTGCTTG +CCAAGATTTGGAGGCATGAA +TAGTTGTTGCATACGTAATA +CTTCGCGTAGTCTGCTGGGG +TGGTTTTCTTCATTGCATTC +CGTATGCCGTCTTCTGCTTG +TGATGTTTTAGAGAGCGAGG +TAGCGGGTATAAGGGAGGCG +ATTCCCAGGAGACCGCGTTG +GTGAGGTGATTTATGTTTGG +AATTCAACGTGTGCTGACGA +CGGGGCTGAGTCGGCTAGAC +TCAGCAAGGTGCGCCGAAGA +CTTGGCTTCCATAAGCAGAT +CGTATGCCGTCTTCTGCTTG +AGTCAAAGCACCTTTAGCGT +TGCTAAGTTCAGGAGAAAAT +TAGCAAAGCCTCTACGCGAT +ATTTTATTGGTATCAGGGTT +GATCTTGTTGCGTCCGGTCA +CGTATGCCGTCTTCTGCTTG +ACTCTCGGATATGACAGGTC +AATTTACGGAAAACATTATT +TCGCAAGGCTAATGATTCAC +TAAATAGCTTAGTTATAATG +CTCGCAACGGCTGCGGACGA +GGTTATAGCGGCATTAAAAA +ATTGACTCTACTGTAGACAT +ATGACAAATCTGTCCACGGA +ATATCACGAAAATAGTCACG +CGTATGCCGTCTTCTGCTTG +TAGTGATCAGAGGTGACAAT +CTTTTACTATTGCATAATGT +TGGCGAGAAATAAAAGTCTG +CAATAAGAGTACTTTTCGTG +TTGAATAACGAGCGAAACTA +AGCAACGGAAACCATAACGA +TCGAATCCAAACAAGAAGGC +GAGGGTCCGGGGTGAAAGGA +TAAATGAACAGATAAGATAG +TGCTTTGTACGACACTATAC +TACATTAGTGAATTCAGGCT +GGTCAGTAGCAATCCAAACT +GACGTTCCTGGTATGTTAGC +TACGGTCAGGCATCCACGGC +GCCGATACTATCTGTATGTA +ATTAACCGTCAAACTATCAA +TAGGTATGTCAGGCACAAAA +CTTGGTTAGAAGTAGTTAGG +CACGTTTTCTTCTGCGTCAG +ATGATACAGACAGATTTTGA +TGCGCTTGTTCGTTTTCCGC +CGTATGCCGTCTTCTGCTTG +TTCGTCCCCTTCGGGGCGGT +TAGTCTACGATCTTAATGTG +GTACAAGCCGAGTCAAATGA +CAAGCTGGGTTACGACGCGA +ATTGCCAGTTGCCGGGGAAG diff --git a/test.py b/test.py index a5b93ea..4e61404 100644 --- a/test.py +++ b/test.py @@ -2,26 +2,18 @@ # parse file seq_list = [] -with open('iPCR_rep1_filtered.txt','r') as f : +with open('example_seqs.txt','r') as f : for line in f : seq_list.append(line.strip('\n')) # invoke starcode -clusteralg = "components" +clusteralg = "mp" canonical_counts, d = pystarcode.starcode(seq_list,2,clusteralg=clusteralg) # print counts output -# for key, l in canonical_counts.iteritems() : - # print '%s -> %d'%(key,l) +for key, l in canonical_counts.iteritems() : + print '%s -> %d'%(key,l) # print output -# for key, l in d.iteritems() : - # print '%s -> %s'%(key,l) - -with open('test-%s.txt.true'%(clusteralg), 'r') as f : - for line in f : - canonical, counts = line.split('\t') - counts = int(counts) - if canonical_counts[canonical] != counts : - print "Discrepancy between counts of %s"%(canonical) - break +for key, l in d.iteritems() : + print '%s -> %s'%(key,l) From 52a9fa936c67562e963d4a2754a2e6cf817ac8a0 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Mon, 18 Jun 2018 16:52:53 +0200 Subject: [PATCH 25/31] updated sources and setup.py --- pystarcode.c | 4 ++-- setup.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pystarcode.c b/pystarcode.c index 6df4885..8bc0dde 100644 --- a/pystarcode.c +++ b/pystarcode.c @@ -1,8 +1,8 @@ #include #include #include -#include "../src/starcode.h" -#include "../src/trie.h" +#include "src/starcode.h" +#include "src/trie.h" #define MAX_STR_LENGTH 2048 diff --git a/setup.py b/setup.py index 4ffc6c9..ffb187c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,12 @@ from distutils.core import setup, Extension -setup( - ext_modules = [Extension("pystarcode", ["pystarcode.c", - "../src/starcode.c", - "../src/trie.c"])], -) +module1 = Extension('pystarcode', + define_macros = [('MAJOR_VERSION', '1'), ('MINOR_VERSION', '0')], + include_dirs = ['src/'], + sources = ['pystarcode.c', 'src/starcode.c', 'src/trie.c'], + extra_compile_args = ['-std=c99']) + +setup(name = 'pystarcode', + version = '1.0', + description = 'Starcode library for Python', + ext_modules = [module1]) From d8083121b266decae3dadf7277a2fc137fb1e2e8 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Mon, 25 Jun 2018 09:52:24 +0200 Subject: [PATCH 26/31] set input_flag = 1 in the case where input file is not specified with -i --- src/main-starcode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main-starcode.c b/src/main-starcode.c index 644fe3c..6ba5e7e 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -289,6 +289,7 @@ main( // is the name of the input file. if ((optind == argc-1) && (!input_set && !input1_set)) { input = argv[optind]; + input_set = 1; } else { fprintf(stderr, "%s too many options\n", ERRM); From 1872a3e43bdb718a1e0f2333de24a9ced6f9c340 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 28 Jun 2018 15:32:30 +0200 Subject: [PATCH 27/31] fixed output type setting --- src/main-starcode.c | 2 +- src/starcode.c | 38 ++++++++++++++++++++------------------ src/starcode.h | 3 ++- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/main-starcode.c b/src/main-starcode.c index 6ba5e7e..929f75e 100644 --- a/src/main-starcode.c +++ b/src/main-starcode.c @@ -327,7 +327,7 @@ main( } // initialize the "uSQ" stack with the input sequences - gstack_t *uSQ = read_file(io.inputf1, io.inputf2, vb_flag); + gstack_t *uSQ = read_file(io.inputf1, io.inputf2, vb_flag, output_type); if (uSQ == NULL || uSQ->nitems < 1) { fprintf(stderr, "input file empty\n"); return 1; diff --git a/src/starcode.c b/src/starcode.c index b8c9bd2..0ddadc4 100644 --- a/src/starcode.c +++ b/src/starcode.c @@ -157,9 +157,9 @@ int pad_useq (gstack_t*, int*); mtplan_t * plan_mt (int, int, int, int, gstack_t *); void run_plan (mtplan_t *, int, int); gstack_t * read_rawseq (FILE *, gstack_t *); -gstack_t * read_fasta (FILE *, gstack_t *); -gstack_t * read_fastq (FILE *, gstack_t *); -gstack_t * read_PE_fastq (FILE *, FILE *, gstack_t *); +gstack_t * read_fasta (FILE *, gstack_t *, output_t); +gstack_t * read_fastq (FILE *, gstack_t *, output_t); +gstack_t * read_PE_fastq (FILE *, FILE *, gstack_t *, output_t); int seq2id (char *, int); gstack_t * seq2useq (gstack_t*, int); int seqsort (useq_t **, int, int); @@ -175,7 +175,6 @@ void warn_about_missing_sequences (void); static FILE * OUTPUTF1 = NULL; // output file 1 static FILE * OUTPUTF2 = NULL; // output file 2 static format_t FORMAT = UNSET; // input format -static output_t OUTPUTT = DEFAULT_OUTPUT; // output type static cluster_t CLUSTERALG = MP_CLUSTER; // cluster algorithm static int CLUSTER_RATIO = 5; // min parent/child ratio // to link clusters @@ -438,7 +437,6 @@ print_starcode_output OUTPUTF1 = outputf1; OUTPUTF2 = outputf2; - OUTPUTT = outputt; CLUSTERALG = clusteralg; propt_t propt = { @@ -454,7 +452,7 @@ print_starcode_output // canonical are removed from the output. int user_warned_about_missing_sequences = 0; - if (OUTPUTT == DEFAULT_OUTPUT) { + if (outputt == DEFAULT_OUTPUT) { useq_t *first = (useq_t *) clusters->items[0]; useq_t *canonical = first->canonical; @@ -503,7 +501,7 @@ print_starcode_output } else if (clusteralg == SPHERES_CLUSTER) { // Default output. - if (OUTPUTT == DEFAULT_OUTPUT) { + if (outputt == DEFAULT_OUTPUT) { for (int i = 0 ; i < clusters->nitems ; i++) { useq_t *u = (useq_t *) clusters->items[i]; if (u->canonical != u) break; @@ -547,7 +545,7 @@ print_starcode_output } else if (clusteralg == COMPONENTS_CLUSTER) { // Default output. - if (OUTPUTT == DEFAULT_OUTPUT) { + if (outputt == DEFAULT_OUTPUT) { for (int i = 0; i < clusters->nitems; i++) { gstack_t * cluster = (gstack_t *) clusters->items[i]; // Get canonical. @@ -1647,7 +1645,8 @@ gstack_t * read_fasta ( FILE * inputf, - gstack_t * uSQ + gstack_t * uSQ, + output_t outputt ) { @@ -1662,7 +1661,7 @@ read_fasta char *header = NULL; int lineno = 0; - int const readh = OUTPUTT == NRED_OUTPUT; + int const readh = outputt == NRED_OUTPUT; while ((nread = getline(&line, &nchar, inputf)) != -1) { lineno++; // Strip newline character. @@ -1711,7 +1710,8 @@ gstack_t * read_fastq ( FILE * inputf, - gstack_t * uSQ + gstack_t * uSQ, + output_t outputt ) { @@ -1728,7 +1728,7 @@ read_fastq char info[2*M+2] = {0}; int lineno = 0; - int const readh = OUTPUTT == NRED_OUTPUT; + int const readh = outputt == NRED_OUTPUT; while ((nread = getline(&line, &nchar, inputf)) != -1) { lineno++; // Strip newline character. @@ -1784,7 +1784,8 @@ read_PE_fastq ( FILE * inputf1, FILE * inputf2, - gstack_t * uSQ + gstack_t * uSQ, + output_t outputt ) { @@ -1816,7 +1817,7 @@ read_PE_fastq char info[4*M] = {0}; int lineno = 0; - int const readh = OUTPUTT == NRED_OUTPUT; + int const readh = outputt == NRED_OUTPUT; char sep[STARCODE_MAX_TAU+2] = {0}; memset(sep, '-', STARCODE_MAX_TAU+1); @@ -1913,7 +1914,8 @@ read_file ( FILE * inputf1, FILE * inputf2, - const int verbose + const int verbose, + output_t outputt ) { @@ -1952,9 +1954,9 @@ read_file } if (FORMAT == RAW) return read_rawseq(inputf1, uSQ); - if (FORMAT == FASTA) return read_fasta(inputf1, uSQ); - if (FORMAT == FASTQ) return read_fastq(inputf1, uSQ); - if (FORMAT == PE_FASTQ) return read_PE_fastq(inputf1, inputf2, uSQ); + if (FORMAT == FASTA) return read_fasta(inputf1, uSQ, outputt); + if (FORMAT == FASTQ) return read_fastq(inputf1, uSQ, outputt); + if (FORMAT == PE_FASTQ) return read_PE_fastq(inputf1, inputf2, uSQ, outputt); return NULL; diff --git a/src/starcode.h b/src/starcode.h index 48e90bd..214a7dd 100644 --- a/src/starcode.h +++ b/src/starcode.h @@ -98,7 +98,8 @@ read_file ( FILE * inputf1, FILE * inputf2, - const int verbose + const int verbose, + output_t outputt ); typedef enum { From 609c3b1fd49dd678cf9d79bfcc60c222f1c88281 Mon Sep 17 00:00:00 2001 From: Ruggero Cortini Date: Thu, 28 Jun 2018 15:49:52 +0200 Subject: [PATCH 28/31] fixed 'read_file' calls in test_starcode --- test/tests_starcode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/tests_starcode.c b/test/tests_starcode.c index 8679523..7fd4543 100644 --- a/test/tests_starcode.c +++ b/test/tests_starcode.c @@ -702,7 +702,7 @@ test_starcode_10 // Read raw file. FILE *f = fopen("test_file.txt", "r"); - gstack_t *useqS = read_file(f, NULL, 0); + gstack_t *useqS = read_file(f, NULL, 0, DEFAULT_OUTPUT); test_assert(useqS->nitems == 35); for (int i = 0 ; i < useqS->nitems ; i++) { useq_t * u = (useq_t *) useqS->items[i]; @@ -719,7 +719,7 @@ test_starcode_10 // Read fasta file. f = fopen("test_file.fasta", "r"); - useqS = read_file(f, NULL, 0); + useqS = read_file(f, NULL, 0, DEFAULT_OUTPUT); test_assert(useqS->nitems == 5); for (int i = 0 ; i < useqS->nitems ; i++) { useq_t * u = (useq_t *) useqS->items[i]; @@ -736,7 +736,7 @@ test_starcode_10 // Read fastq file. f = fopen("test_file1.fastq", "r"); - useqS = read_file(f, NULL, 0); + useqS = read_file(f, NULL, 0, DEFAULT_OUTPUT); test_assert(useqS->nitems == 5); for (int i = 0 ; i < useqS->nitems ; i++) { useq_t * u = (useq_t *) useqS->items[i]; @@ -770,7 +770,7 @@ test_starcode_10 // Read paired-end fastq file. FILE *f1 = fopen("test_file1.fastq", "r"); FILE *f2 = fopen("test_file2.fastq", "r"); - useqS = read_file(f1, f2, 0); + useqS = read_file(f1, f2, 0, DEFAULT_OUTPUT); test_assert(useqS->nitems == 5); for (int i = 0 ; i < useqS->nitems ; i++) { useq_t * u = (useq_t *) useqS->items[i]; From 31eb8a30f282b77512a57f1f7c450af667b0e43b Mon Sep 17 00:00:00 2001 From: Eduard Valera i Zorita Date: Fri, 29 Jun 2018 12:03:07 +0200 Subject: [PATCH 29/31] Moves new files to their folders. Adds pystarcode tests to main test routine. --- setup.py | 2 +- pystarcode.c => src/pystarcode.c | 4 ++-- test/Makefile | 1 + example_seqs.txt => test/example_seqs.txt | 0 test.py => test/test.py | 0 5 files changed, 4 insertions(+), 3 deletions(-) rename pystarcode.c => src/pystarcode.c (99%) rename example_seqs.txt => test/example_seqs.txt (100%) rename test.py => test/test.py (100%) diff --git a/setup.py b/setup.py index ffb187c..4f8c1d3 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ module1 = Extension('pystarcode', define_macros = [('MAJOR_VERSION', '1'), ('MINOR_VERSION', '0')], include_dirs = ['src/'], - sources = ['pystarcode.c', 'src/starcode.c', 'src/trie.c'], + sources = ['src/pystarcode.c', 'src/starcode.c', 'src/trie.c'], extra_compile_args = ['-std=c99']) setup(name = 'pystarcode', diff --git a/pystarcode.c b/src/pystarcode.c similarity index 99% rename from pystarcode.c rename to src/pystarcode.c index 8bc0dde..f1af35c 100644 --- a/pystarcode.c +++ b/src/pystarcode.c @@ -1,8 +1,8 @@ #include #include #include -#include "src/starcode.h" -#include "src/trie.h" +#include "starcode.h" +#include "trie.h" #define MAX_STR_LENGTH 2048 diff --git a/test/Makefile b/test/Makefile index 108d1bc..73c8981 100644 --- a/test/Makefile +++ b/test/Makefile @@ -24,6 +24,7 @@ libunittest.so: unittest.c test: $(P) ./$(P) sh extratests.sh + python test.py inspect: $(P) gdb --command=.inspect.gdb --args $(P) diff --git a/example_seqs.txt b/test/example_seqs.txt similarity index 100% rename from example_seqs.txt rename to test/example_seqs.txt diff --git a/test.py b/test/test.py similarity index 100% rename from test.py rename to test/test.py From 47dbe16e7b8a07e5bae3aaf3523c8babbcbd46ee Mon Sep 17 00:00:00 2001 From: Eduard Valera i Zorita Date: Fri, 29 Jun 2018 12:13:07 +0200 Subject: [PATCH 30/31] test: installs pystarcode before running travis tests. --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 00ab16c..6077b3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ branches: before_install: - sudo pip install cpp-coveralls + - sudo python setup.py install script: make && cd test && make test From 9b20038cbf7b81fc463c59873b21630867623efa Mon Sep 17 00:00:00 2001 From: Eduard Valera i Zorita Date: Fri, 29 Jun 2018 12:18:03 +0200 Subject: [PATCH 31/31] travis.yml: installs pystarcode in main routine to avoid clang failure. --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6077b3d..5535509 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,9 +8,8 @@ branches: before_install: - sudo pip install cpp-coveralls - - sudo python setup.py install -script: make && cd test && make test +script: sudo python setup.py install && make && cd test && make test after_success: - coveralls --verbose --gcov-options '\-lp' -r .. -E '.*test*'