From 801835cce171252bb5a6eb24a3f25afa3727bdec Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Wed, 17 May 2017 22:59:55 -0700 Subject: [PATCH 1/3] Convert to python3 --- backspinpy/Cef_tools.py | 24 +- backspinpy/Cef_tools.py.bak | 161 +++++++ backspinpy/__init__.py | 2 +- backspinpy/__init__.py.bak | 3 + backspinpy/backSPIN.py | 2 +- backspinpy/backSPIN.py.bak | 932 ++++++++++++++++++++++++++++++++++++ 6 files changed, 1110 insertions(+), 14 deletions(-) create mode 100644 backspinpy/Cef_tools.py.bak create mode 100644 backspinpy/__init__.py.bak create mode 100755 backspinpy/backSPIN.py.bak diff --git a/backspinpy/Cef_tools.py b/backspinpy/Cef_tools.py index 9d56f8a..c9fbfe4 100644 --- a/backspinpy/Cef_tools.py +++ b/backspinpy/Cef_tools.py @@ -40,7 +40,7 @@ # attribute_values_list2 = cef.col_attr_values[2] # ... -from builtins import str as unicode +from builtins import str as str from io import open from builtins import range @@ -73,7 +73,7 @@ def update(self): self.rows = len(self.matrix) self.cols = len(self.matrix[0]) self.tab_fields = max(7, self.cols + self.row_attr + 1) - self.linestr = u'%s' + u'\t%s' * (self.tab_fields-1) + u'\n' + self.linestr = '%s' + '\t%s' * (self.tab_fields-1) + '\n' def add_header(self, name, value): self.header_names.append(name) @@ -130,7 +130,7 @@ def readCEF(self, filepath, matrix_dtype = 'auto'): try: self.matrix.append( [matrix_dtype(el) for el in linelist[self.row_attr+1:] ]) except ValueError: - print(repr(el), ' is invalid') + print((repr(el), ' is invalid')) @@ -138,24 +138,24 @@ def writeCEF(self, filepath, matrix_str_fmt = '%i'): self.update() with open(filepath, 'w') as fout: #Write cef file first line - fout.write( self.linestr % ( ('CEF', unicode(self.headers), unicode(self.row_attr),\ - unicode(self.col_attr) , unicode(self.rows), unicode(self.cols), unicode(self.flags) ) +\ + fout.write( self.linestr % ( ('CEF', str(self.headers), str(self.row_attr),\ + str(self.col_attr) , str(self.rows), str(self.cols), 
str(self.flags) ) +\ ('',) * (self.tab_fields - 7) ) ) #Write header for i in range(self.headers): - fout.write(self.linestr % ( (unicode( self.header_names[i]), unicode( self.header_values[i]) ) + ('',) * (self.tab_fields - 2) )) + fout.write(self.linestr % ( (str( self.header_names[i]), str( self.header_values[i]) ) + ('',) * (self.tab_fields - 2) )) #Write col attributes for i in range( self.col_attr ): - fout.write( self.linestr % ( ('',) * (self.row_attr) + (unicode( self.col_attr_names[i] ),) + tuple( unicode(el) for el in self.col_attr_values[i] ) )) + fout.write( self.linestr % ( ('',) * (self.row_attr) + (str( self.col_attr_names[i] ),) + tuple( str(el) for el in self.col_attr_values[i] ) )) #Write headers of row attributes - fout.write( self.linestr % ( tuple(unicode(el) for el in self.row_attr_names) + ('',)*(self.tab_fields-self.row_attr) ) ) + fout.write( self.linestr % ( tuple(str(el) for el in self.row_attr_names) + ('',)*(self.tab_fields-self.row_attr) ) ) #Write rows for i in range(self.rows): for j in range(self.row_attr): - fout.write(unicode(self.row_attr_values[j][i]) + u'\t') - fout.write(u'\t') - fout.write(u'\t'.join([unicode(matrix_str_fmt % el) for el in self.matrix[i]] ) ) - fout.write(u'\n') + fout.write(str(self.row_attr_values[j][i]) + '\t') + fout.write('\t') + fout.write('\t'.join([str(matrix_str_fmt % el) for el in self.matrix[i]] ) ) + fout.write('\n') diff --git a/backspinpy/Cef_tools.py.bak b/backspinpy/Cef_tools.py.bak new file mode 100644 index 0000000..9d56f8a --- /dev/null +++ b/backspinpy/Cef_tools.py.bak @@ -0,0 +1,161 @@ +# Copyright (c) 2015 Gioele La Manno and Sten Linnarsson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# This code contains a simple parser reader for Cef files +# typical usage to write a cef_file: +# +# cef = CEF_obj() +# cef.add_header('Mynote', 'This is my message') +# cef.set_matrix(your_array) +# cef.add_col_attr(attribute_name, attribute_values_list) +# cef.add_row_attr(attribute_name, attribute_values_list) +# cef.writeCEF('path/to/your/file.cef') +# +# to read: +# cef = CEF_obj() +# cef.readCEF('path/to/your/file.cef') +# your_array = cef.matrix +# attribute_values_list2 = cef.col_attr_values[2] +# ... 
+ +from builtins import str as unicode +from io import open +from builtins import range + +class CEF_obj(object): + def __init__(self): + self.headers = 0 + self.row_attr = 0 + self.col_attr = 0 + self.rows = 0 + self.cols = 0 + self.flags = 0 + self.tab_fields = 7 + + self.header_names = [] + self.header_values = [] + + self.row_attr_names = [] + self.row_attr_values = [] + + self.col_attr_names = [] + self.col_attr_values = [] + + self.matrix = [] + + + def update(self): + self.headers = len( self.header_names) + self.row_attr = len(self.row_attr_names) + self.col_attr = len(self.col_attr_names) + self.rows = len(self.matrix) + self.cols = len(self.matrix[0]) + self.tab_fields = max(7, self.cols + self.row_attr + 1) + self.linestr = u'%s' + u'\t%s' * (self.tab_fields-1) + u'\n' + + def add_header(self, name, value): + self.header_names.append(name) + self.header_values.append(value) + + def add_row_attr(self, name, value): + self.row_attr_names.append(name) + self.row_attr_values.append(list(value)) + + def add_col_attr(self, name, value): + self.col_attr_names.append(name) + self.col_attr_values.append(list(value)) + + def set_matrix(self, matrix): + for row in matrix: + self.matrix.append(list(row)) + + def readCEF(self, filepath, matrix_dtype = 'auto'): + #Delete all the stored information + self.__init__() + #Start parsing + with open(filepath, 'rU') as fin: + # Read cef file first line + self.header, self.row_attr, self.col_attr, self.rows,\ + self.cols, self.flags = fin.readline().rstrip('\n').split('\t')[1:7] + self.header = int(self.header) + self.row_attr = int( self.row_attr ) + self.col_attr = int(self.col_attr) + self.rows = int(self.rows) + self.cols = int(self.cols) + self.flags = int(self.flags) + self.row_attr_values = [[] for _ in range(self.row_attr)] + # Read header + for i in range(self.header): + name, value = fin.readline().rstrip('\n').split('\t')[:2] + self.header_names.append(name) + self.header_values.append(value) + # Read col attr + 
for i in range(self.col_attr): + line_col_attr = fin.readline().rstrip('\n').split('\t')[self.row_attr:] + self.col_attr_names.append( line_col_attr[0] ) + self.col_attr_values.append( line_col_attr[1:] ) + #Read row attr and matrix + self.row_attr_names += fin.readline().rstrip('\n').split('\t')[:self.row_attr] + for _ in range(self.rows): + linelist = fin.readline().rstrip('\n').split('\t') + for n, entry in enumerate( linelist[:self.row_attr] ): + self.row_attr_values[n].append( entry ) + if matrix_dtype == 'auto': + if sum(('.' in i) or ('e' in i) for i in linelist[self.row_attr+1:]) != 0: + matrix_dtype = float + else: + matrix_dtype = int + try: + self.matrix.append( [matrix_dtype(el) for el in linelist[self.row_attr+1:] ]) + except ValueError: + print(repr(el), ' is invalid') + + + + def writeCEF(self, filepath, matrix_str_fmt = '%i'): + self.update() + with open(filepath, 'w') as fout: + #Write cef file first line + fout.write( self.linestr % ( ('CEF', unicode(self.headers), unicode(self.row_attr),\ + unicode(self.col_attr) , unicode(self.rows), unicode(self.cols), unicode(self.flags) ) +\ + ('',) * (self.tab_fields - 7) ) ) + #Write header + for i in range(self.headers): + fout.write(self.linestr % ( (unicode( self.header_names[i]), unicode( self.header_values[i]) ) + ('',) * (self.tab_fields - 2) )) + #Write col attributes + for i in range( self.col_attr ): + fout.write( self.linestr % ( ('',) * (self.row_attr) + (unicode( self.col_attr_names[i] ),) + tuple( unicode(el) for el in self.col_attr_values[i] ) )) + #Write headers of row attributes + fout.write( self.linestr % ( tuple(unicode(el) for el in self.row_attr_names) + ('',)*(self.tab_fields-self.row_attr) ) ) + #Write rows + for i in range(self.rows): + for j in range(self.row_attr): + fout.write(unicode(self.row_attr_values[j][i]) + u'\t') + fout.write(u'\t') + fout.write(u'\t'.join([unicode(matrix_str_fmt % el) for el in self.matrix[i]] ) ) + fout.write(u'\n') + + + diff --git 
a/backspinpy/__init__.py b/backspinpy/__init__.py index 2b92ec8..5b7711f 100644 --- a/backspinpy/__init__.py +++ b/backspinpy/__init__.py @@ -1,3 +1,3 @@ -from __future__ import absolute_import + from .backSPIN import SPIN, backSPIN, fit_CV, feature_selection from .Cef_tools import CEF_obj diff --git a/backspinpy/__init__.py.bak b/backspinpy/__init__.py.bak new file mode 100644 index 0000000..2b92ec8 --- /dev/null +++ b/backspinpy/__init__.py.bak @@ -0,0 +1,3 @@ +from __future__ import absolute_import +from .backSPIN import SPIN, backSPIN, fit_CV, feature_selection +from .Cef_tools import CEF_obj diff --git a/backspinpy/backSPIN.py b/backspinpy/backSPIN.py index 02bc3bd..baf9208 100755 --- a/backspinpy/backSPIN.py +++ b/backspinpy/backSPIN.py @@ -34,7 +34,7 @@ # pyinstaller -F backSPIN.py -n backspin-mac-64-bit # -from __future__ import division, print_function, absolute_import + from numpy import * import getopt import sys diff --git a/backspinpy/backSPIN.py.bak b/backspinpy/backSPIN.py.bak new file mode 100755 index 0000000..02bc3bd --- /dev/null +++ b/backspinpy/backSPIN.py.bak @@ -0,0 +1,932 @@ +#!/usr/bin/env python + +# Copyright (c) 2015, Amit Zeisel, Gioele La Manno and Sten Linnarsson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This .py file can be used as a library or a command-line version of BackSPIN, +# This version of BackSPIN was implemented by Gioele La Manno. +# The BackSPIN biclustering algorithm was developed by Amit Zeisel and is described +# in Zeisel et al. Cell types in the mouse cortex and hippocampus revealed by +# single-cell RNA-seq Science 2015 (PMID: 25700174, doi: 10.1126/science.aaa1934). +# +# Building using pyinstaller: +# pyinstaller -F backSPIN.py -n backspin-mac-64-bit +# + +from __future__ import division, print_function, absolute_import +from numpy import * +import getopt +import sys +import os +from .Cef_tools import CEF_obj + +class Results: + pass + +def calc_loccenter(x, lin_log_flag): + M,N = x.shape + if N==1 and M>1: + x = x.T + M,N = x.shape + loc_center = zeros(M) + min_x = x.min(1) + x = x - min_x[:,newaxis] + for i in range(M): + ind = where(x[i,:]>0)[0] + if len(ind) != 0: + if lin_log_flag == 1: + w = x[i,ind]/sum(x[i,ind], 0) + else: + w = (2**x[i,ind])/sum(2**x[i,ind], 0) + loc_center[i] = sum(w*ind, 0) + else: + loc_center[i] = 0 + + return loc_center + +def _calc_weights_matrix(mat_size, wid): + '''Calculate Weight Matrix + Parameters + ---------- + mat_size: int + dimension of the distance matrix + wid: int + parameter that controls the width of the neighbourood + Returns + ------- + weights_mat: 2-D array + the weights matrix to multiply with the distance matrix + + ''' + #calculate square distance from the 
diagonal + sqd = (arange(1,mat_size+1)[newaxis,:] - arange(1,mat_size+1)[:,newaxis])**2 + #make the distance relative to the mat_size + norm_sqd = sqd/wid + #evaluate a normal pdf + weights_mat = exp(-norm_sqd/mat_size) + #avoid useless precision that would slow down the matrix multiplication + weights_mat -= 1e-6 + weights_mat[weights_mat<0] = 0 + #normalize row and column sum + weights_mat /= sum(weights_mat,0)[newaxis,:] + weights_mat /= sum(weights_mat,1)[:, newaxis] + #fix asimmetries + weights_mat = (weights_mat + weights_mat.T) / 2. + return weights_mat + + +def _sort_neighbourhood( dist_matrix, wid ): + '''Perform a single iteration of SPIN + Parameters + ---------- + dist_matrix: 2-D array + distance matrix + wid: int + parameter that controls the width of the neighbourood + Returns + ------- + sorted_ind: 1-D array + indexes that order the matrix + + ''' + assert wid > 0, 'Parameter wid < 0 is not allowed' + mat_size = dist_matrix.shape[0] + #assert mat_size>2, 'Matrix is too small to be sorted' + weights_mat = _calc_weights_matrix(mat_size, wid) + #Calculate the dot product (can be very slow for big mat_size) + mismatch_score = dot(dist_matrix, weights_mat) + energy, target_permutation = mismatch_score.min(1), mismatch_score.argmin(1) + max_energy = max(energy) + #Avoid points that have the same target_permutation value + sort_score = target_permutation - 0.1 * sign( (mat_size/2 - target_permutation) ) * energy/max_energy + #sort_score = target_permutation - 0.1 * sign( 1-2*(int(1000*energy/max_energy) % 2) ) * energy/max_energy # Alternative + # Sorting the matrix + sorted_ind = sort_score.argsort(0)[::-1] + return sorted_ind + + +def sort_mat_by_neighborhood(dist_matrix, wid, times): + '''Perform several iterations of SPIN using a fixed wid parameter + Parameters + ---------- + dist_matrix: 2-D array + distance matrix + wid: int + parameter that controls the width of the neighbourood + times: int + number of repetitions + verbose: bool + print the 
progress + Returns + ------- + indexes: 1-D array + indexes that order the matrix + + ''' + # original indexes + indexes = arange(dist_matrix.shape[0]) + for i in range(times): + #sort the sitance matrix according the previous iteration + tmpmat = dist_matrix[indexes,:] + tmpmat = tmpmat[:,indexes] + sorted_ind = _sort_neighbourhood(tmpmat, wid); + #resort the original indexes + indexes = indexes[sorted_ind] + return indexes + + +def _generate_widlist(data, axis=1, step=0.6): + '''Generate a list of wid parameters to execute sort_mat_by_neighborhood + Parameters + ---------- + data: 2-D array + the data matrix + axis: int + the axis to take in consideration + step: float + the increment between two successive wid parameters + Returns + ------- + wid_list: list of int + list of wid parameters to run SPIN + + ''' + max_wid = data.shape[axis]*0.6 + new_wid = 1 + wid_list = [] + while new_wid < (1+step)*max_wid: + wid_list.append( new_wid ) + new_wid = int(ceil( new_wid + new_wid*(step) +1)) + return wid_list[::-1] + + + +def SPIN(dt, widlist=[10,1], iters=30, axis='both', verbose=False): + """Run the original SPIN algorithm + Parameters + ---------- + dt: 2-D array + the data matrix + widlist: float or list of int + If float is passed, it is used as step parameted of _generate_widlist, + and widlist is generated to run SPIN. + If list is passed it is used directly to run SPIN. 
+ iters: int + number of repetitions for every wid in widlist + axis: int + the axis to take in consideration (must be 0, 1 or 'both') + step: float + the increment between two successive wid parameters + Returns + ------- + indexes: 1-D array (if axis in [0,1]) or tuple of 1-D array (if axis = 'both') + indexes that sort the data matrix + Notes + ----- + Typical usage + sorted_dt0 = SPIN(dt, iters=30, axis=0) + sorted_dt1 = SPIN(dt, iters=30, axis=1) + dt = dt[sorted_dt0,:] + dt = dt[:,sorted_dt1] + """ + IXc = arange(dt.shape[1]) + IXr = arange(dt.shape[0]) + assert axis in ['both', 0,1], 'axis must be 0, 1 or \'both\' ' + #Sort both axis + if axis == 'both': + CCc = 1 - corrcoef(dt.T) + CCr = 1 - corrcoef(dt) + if type(widlist) != list: + widlist_r = _generate_widlist(dt, axis=0, step=widlist) + widlist_c = _generate_widlist(dt, axis=1, step=widlist) + if verbose: + print('\nSorting genes.') + print('Neighbourood=', end=""), + for wid in widlist_r: + if verbose: + print ('%i, ' % wid), + sys.stdout.flush() + INDr = sort_mat_by_neighborhood(CCr, wid, iters) + CCr = CCr[INDr,:][:,INDr] + IXr = IXr[INDr] + if verbose: + print ('\nSorting cells.') + print ('Neighbourood=',end="") + for wid in widlist_c: + if verbose: + print ('%i, ' % wid), + sys.stdout.flush() + INDc = sort_mat_by_neighborhood(CCc, wid, iters) + CCc = CCc[:,INDc][INDc,:] + IXc= IXc[INDc] + return IXr, IXc + #Sort rows + elif axis == 0: + CCr = 1 - corrcoef(dt) + if type(widlist) != list: + widlist = _generate_widlist(dt, axis=0, step=widlist) + if verbose: + print ('\nSorting genes.\nNeighbourood=',end="") + for wid in widlist: + if verbose: + print ('%i, ' % wid,end="") + sys.stdout.flush() + INDr = sort_mat_by_neighborhood(CCr, wid, iters) + CCr = CCr[INDr,:][:,INDr] + IXr = IXr[INDr] + return IXr + #Sort columns + elif axis == 1: + CCc = 1 - corrcoef(dt.T) + if type(widlist) != list: + widlist = _generate_widlist(dt, axis=1, step=widlist) + if verbose: + print ('\nSorting 
cells.\nNeighbourood=',end="") + for wid in widlist: + if verbose: + print ('%i, ' % wid,end="") + sys.stdout.flush() + INDc = sort_mat_by_neighborhood(CCc, wid, iters) + CCc = CCc[:,INDc][INDc,:] + IXc = IXc[INDc] + return IXc + + +def backSPIN(data, numLevels=2, first_run_iters=10, first_run_step=0.05, runs_iters=8 ,runs_step=0.25,\ + split_limit_g=2, split_limit_c=2, stop_const = 1.15, low_thrs=0.2, verbose=False): + '''Run the backSPIN algorithm + Parameters + ---------- + data: 2-D array + the data matrix, rows should be genes and columns single cells/samples + numLevels: int + the number of splits that will be tried + first_run_iters: float + the iterations of the preparatory SPIN + first_run_step: float + the step parameter passed to _generate_widlist for the preparatory SPIN + runs_iters: int + the iterations parameter passed to the _divide_to_2and_resort. + influences all the SPIN iterations except the first + runs_step: float + the step parameter passed to the _divide_to_2and_resort. 
+ influences all the SPIN iterations except the first + wid: float + the wid of every iteration of the splitting and resorting + split_limit_g: int + If the number of specific genes in a subgroup is smaller than this number + splitting of that subgrup is not allowed + split_limit_c: int + If the number cells in a subgroup is smaller than this number splitting of + that subgrup is not allowed + stop_const: float + minimum score that a breaking point has to reach to be suitable for splitting + low_thrs: float + genes with average lower than this threshold are assigned to either of the + splitting group reling on genes that are higly correlated with them + + Returns + ------- + results: Result object + The results object contain the following attributes + genes_order: 1-D array + indexes (a permutation) sorting the genes + cells_order: 1-D array + indexes (a permutation) sorting the cells + genes_gr_level: 2-D array + for each depth level contains the cluster indexes for each gene + cells_gr_level: + for each depth level contains the cluster indexes for each cell + cells_gr_level_sc: + score of the splitting + genes_bor_level: + the border index between gene clusters + cells_bor_level: + the border index between cell clusters + + Notes + ----- + Typical usage + + ''' + assert numLevels>0, '0 is not an available depth for backSPIN, use SPIN instead' + #initialize some varaibles + genes_bor_level = [[] for i in range(numLevels)] + cells_bor_level = [[] for i in range(numLevels)] + N,M = data.shape + genes_order = arange(N) + cells_order = arange(M) + genes_gr_level = zeros((N,numLevels+1)) + cells_gr_level = zeros((M,numLevels+1)) + cells_gr_level_sc = zeros((M,numLevels+1)) + + # Do a Preparatory SPIN on cells + if verbose: + print ('\nPreparatory SPIN') + ix1 = SPIN(data, widlist=_generate_widlist(data, axis=1, step=first_run_step), iters=first_run_iters, axis=1, verbose=verbose) + cells_order = cells_order[ix1] + + #For every level of depth DO: + for i in 
range(numLevels): + k=0 # initialize group id counter + # For every group generated at the parent level DO: + for j in range( len( set(cells_gr_level[:,i]) ) ): + # Extract the a data matrix of the genes at that level + g_settmp = nonzero(genes_gr_level[:,i]==j)[0] #indexes of genes in the level j + c_settmp = nonzero(cells_gr_level[:,i]==j)[0] #indexes of cells in the level j + datatmp = data[ ix_(genes_order[g_settmp], cells_order[c_settmp]) ] + # If we are not below the splitting limit for both genes and cells DO: + if (len(g_settmp)>split_limit_g) & (len(c_settmp)>split_limit_c): + # Split and SPINsort the two halves + if i == numLevels-1: + divided = _divide_to_2and_resort(datatmp, wid=runs_step, iters_spin=runs_iters,\ + stop_const=stop_const, low_thrs=low_thrs, sort_genes=True, verbose=verbose) + else: + divided = _divide_to_2and_resort(datatmp, wid=runs_step, iters_spin=runs_iters,\ + stop_const=stop_const, low_thrs=low_thrs, sort_genes=False,verbose=verbose) + # _divide_to_2and_resort retruns an empty array in gr2 if the splitting condition was not satisfied + if divided: + sorted_data_resort1, genes_resort1, cells_resort1,\ + gr1, gr2, genesgr1, genesgr2, score1, score2 = divided + # Resort from the previous level + genes_order[g_settmp] = genes_order[g_settmp[genes_resort1]] + cells_order[c_settmp] = cells_order[c_settmp[cells_resort1]] + # Assign a numerical identifier to the groups + genes_gr_level[g_settmp[genesgr1],i+1] = k + genes_gr_level[g_settmp[genesgr2],i+1] = k+1 + cells_gr_level[c_settmp[gr1],i+1] = k + cells_gr_level[c_settmp[gr2],i+1] = k+1 + # Not really clear what sc is + cells_gr_level_sc[c_settmp[gr1],i+1] = score1 + cells_gr_level_sc[c_settmp[gr2],i+1] = score2 + # Augment the counter of 2 becouse two groups were generated from one + k = k+2 + else: + # The split is not convenient, keep everithing the same + genes_gr_level[g_settmp,i+1] = k + # if it is the deepest level: perform gene sorting + if i == numLevels-1: + if 
(datatmp.shape[0] > 2 )and (datatmp.shape[1] > 2): + genes_resort1 = SPIN(datatmp, widlist=runs_step, iters=runs_iters, axis=0, verbose=verbose) + genes_order[g_settmp] = genes_order[g_settmp[genes_resort1]] + cells_gr_level[c_settmp,i+1] = k + cells_gr_level_sc[c_settmp,i+1] = cells_gr_level_sc[c_settmp,i] + # Augment of 1 becouse no new group was generated + k = k+1 + else: + # Below the splitting limit: the split is not convenient, keep everithing the same + genes_gr_level[g_settmp,i+1] = k + cells_gr_level[c_settmp,i+1] = k + cells_gr_level_sc[c_settmp,i+1] = cells_gr_level_sc[c_settmp,i] + # Augment of 1 becouse no new group was generated + k = k+1 + + # Find boundaries + genes_bor_level[i] = r_[0, nonzero(diff(genes_gr_level[:,i+1])>0)[0]+1, data.shape[0] ] + cells_bor_level[i] = r_[0, nonzero(diff(cells_gr_level[:,i+1])>0)[0]+1, data.shape[1] ] + + #dataout_sorted = data[ ix_(genes_order,cells_order) ] + + results = Results() + results.genes_order = genes_order + results.cells_order = cells_order + results.genes_gr_level = genes_gr_level + results.cells_gr_level = cells_gr_level + results.cells_gr_level_sc = cells_gr_level_sc + results.genes_bor_level = genes_bor_level + results.cells_bor_level = cells_bor_level + + return results + + + +def _divide_to_2and_resort(sorted_data, wid, iters_spin=8, stop_const = 1.15, low_thrs=0.2 , sort_genes=True, verbose=False): + '''Core function of backSPIN: split the datamatrix in two and resort the two halves + + Parameters + ---------- + sorted_data: 2-D array + the data matrix, rows should be genes and columns single cells/samples + wid: float + wid parameter to give to widlist parameter of th SPIN fucntion + stop_const: float + minimum score that a breaking point has to reach to be suitable for splitting + low_thrs: float + if the difference between the average expression of two groups is lower than threshold the algorythm + uses higly correlated gens to assign the gene to one of the two groups + verbose: bool + 
information about the split is printed + + Returns + ------- + ''' + + # Calculate correlation matrix for cells and genes + Rcells = corrcoef(sorted_data.T) + Rgenes = corrcoef(sorted_data) + # Look for the optimal breaking point + N = Rcells.shape[0] + score = zeros(N) + for i in range(2,N-2): + if i == 2: + tmp1 = sum( Rcells[:i,:i] ) + tmp2 = sum( Rcells[i:,i:] ) + score[i] = (tmp1+tmp2) / float(i**2 + (N-i)**2) + else: + tmp1 += sum(Rcells[i-1,:i]) + sum(Rcells[:i-1,i-1]); + tmp2 -= sum(Rcells[i-1:,i-1]) + sum(Rcells[i-1,i:]); + score[i] = (tmp1+tmp2) / float(i**2 + (N-i)**2) + + breakp1 = argmax(score) + score1 = Rcells[:breakp1,:breakp1] + score1 = triu(score1) + score1 = mean( score1[score1 != 0] ) + score2 = Rcells[breakp1:, breakp1:] + score2 = triu(score2) + score2 = mean( score2[score2 != 0] ) + avg_tot = triu(Rcells) + avg_tot = mean( avg_tot[avg_tot != 0] ) + + # If it is convenient to break + if (max([score1,score2])/avg_tot) > stop_const: + # Divide in two groups + gr1 = arange(N)[:breakp1] + gr2 = arange(N)[breakp1:] + # and assign the genes into the two groups + mean_gr1 = sorted_data[:, gr1].mean(1) + mean_gr2 = sorted_data[:, gr2].mean(1) + concat_loccenter_gr1 = c_[ calc_loccenter(sorted_data[:,gr1], 2), calc_loccenter(sorted_data[:,gr1][...,::-1], 2) ] + concat_loccenter_gr2 = c_[ calc_loccenter(sorted_data[:,gr2], 2), calc_loccenter(sorted_data[:,gr2][...,::-1], 2) ] + center_gr1, flip_flag1 = concat_loccenter_gr1.min(1), concat_loccenter_gr1.argmin(1) + center_gr2, flip_flag2 = concat_loccenter_gr2.max(1), concat_loccenter_gr2.argmax(1) + sorted_data_tmp = array( sorted_data ) + sorted_data_tmp[ix_(flip_flag1==1,gr1)] = sorted_data[ix_(flip_flag1==1,gr1)][...,::-1] + sorted_data_tmp[ix_(flip_flag2==1,gr2)] = sorted_data[ix_(flip_flag2==1,gr2)][...,::-1] + loc_center = calc_loccenter(sorted_data_tmp, 2) + + imax = zeros(loc_center.shape) + imax[loc_center<=breakp1] = 1 + imax[loc_center>breakp1] = 2 + + genesgr1 = where(imax==1)[0] + genesgr2 
= where(imax==2)[0] + if size(genesgr1) == 0: + IN = argmax(mean_gr1) + genesgr1 = array([IN]) + genesgr2 = setdiff1d(genesgr2, IN) + elif size(genesgr2) == 0: + IN = argmax(mean_gr2) + genesgr2 = array([IN]) + genesgr1 = setdiff1d(genesgr1, IN) + + if verbose: + print ('\nSplitting (%i, %i) ' % sorted_data.shape) + print ('in (%i,%i) ' % (genesgr1.shape[0],gr1.shape[0])) + print ('and (%i,%i)' % (genesgr2.shape[0],gr2.shape[0]),end="") + sys.stdout.flush() + + # Data of group1 + datagr1 = sorted_data[ix_(genesgr1,gr1)] + # zero center + datagr1 = datagr1 - datagr1.mean(1)[:,newaxis] + # Resort group1 + if min( datagr1.shape ) > 1: + if sort_genes: + genesorder1,cellorder1 = SPIN(datagr1, widlist=wid, iters=iters_spin, axis='both', verbose=verbose) + else: + cellorder1 = SPIN(datagr1, widlist=wid, iters=iters_spin, axis=1, verbose=verbose) + genesorder1 = arange(datagr1.shape[0]) + elif len(genesgr1) == 1: + genesorder1 = 0 + cellorder1 = argsort( datagr1[0,:] ) + elif len(gr1) == 1: + cellorder1 = 0 + genesorder1 = argsort( datagr1[:,0] ) + + # Data of group2 + datagr2 = sorted_data[ix_(genesgr2,gr2)] + # zero center + datagr2 = datagr2 - datagr2.mean(1)[:,newaxis] + # Resort group2 + if min( datagr2.shape )>1: + if sort_genes: + genesorder2, cellorder2 = SPIN(datagr2, widlist=wid, iters=iters_spin, axis='both',verbose=verbose) + else: + cellorder2 = SPIN(datagr2, widlist=wid, iters=iters_spin, axis=1,verbose=verbose) + genesorder2 = arange(datagr2.shape[0]) + elif len(genesgr2) == 1: + genesorder2 = 0 + cellorder2 = argsort(datagr2[0,:]) + elif len(gr2) == 1: + cellorder2 = 0 + genesorder2 = argsort(datagr2[:,0]) + + # contcatenate cells and genes indexes + genes_resort1 = r_[genesgr1[genesorder1], genesgr2[genesorder2] ] + cells_resort1 = r_[gr1[cellorder1], gr2[cellorder2] ] + genesgr1 = arange(len(genesgr1)) + genesgr2 = arange(len(genesgr1), len(sorted_data[:,0])) + # resort + sorted_data_resort1 = sorted_data[ix_(genes_resort1,cells_resort1)] + + return 
sorted_data_resort1, genes_resort1, cells_resort1, gr1, gr2, genesgr1, genesgr2, score1, score2 + + else: + if verbose: + print('Low splitting score was : %.4f' % (max([score1,score2])/avg_tot)) + return False + + +def fit_CV(mu, cv, fit_method='Exp', svr_gamma=0.06, x0=[0.5,0.5], verbose=False): + '''Fits a noise model (CV vs mean) + Parameters + ---------- + mu: 1-D array + mean of the genes (raw counts) + cv: 1-D array + coefficient of variation for each gene + fit_method: string + allowed: 'SVR', 'Exp', 'binSVR', 'binExp' + default: 'SVR'(requires scikit learn) + SVR: uses Support vector regression to fit the noise model + Exp: Parametric fit to cv = mu^(-a) + b + bin: before fitting the distribution of mean is normalized to be + uniform by downsampling and resampling. + Returns + ------- + score: 1-D array + Score is the relative position with respect of the fitted curve + mu_linspace: 1-D array + x coordiantes to plot (min(log2(mu)) -> max(log2(mu))) + cv_fit: 1-D array + y=f(x) coordinates to plot + pars: tuple or None + + ''' + log2_m = log2(mu) + log2_cv = log2(cv) + + if len(mu)>1000 and 'bin' in fit_method: + #histogram with 30 bins + n,xi = histogram(log2_m,30) + med_n = percentile(n,50) + for i in range(0,len(n)): + # index of genes within the ith bin + ind = where( (log2_m >= xi[i]) & (log2_m < xi[i+1]) )[0] + if len(ind)>med_n: + #Downsample if count is more than median + ind = ind[random.permutation(len(ind))] + ind = ind[:len(ind)-med_n] + mask = ones(len(log2_m), dtype=bool) + mask[ind] = False + log2_m = log2_m[mask] + log2_cv = log2_cv[mask] + elif (around(med_n/len(ind))>1) and (len(ind)>5): + #Duplicate if count is less than median + log2_m = r_[ log2_m, tile(log2_m[ind], around(med_n/len(ind))-1) ] + log2_cv = r_[ log2_cv, tile(log2_cv[ind], around(med_n/len(ind))-1) ] + else: + if 'bin' in fit_method: + print('More than 1000 input feature needed for bin correction.') + pass + + if 'SVR' in fit_method: + try: + from sklearn.svm import SVR + 
if svr_gamma == 'auto': + svr_gamma = 1000./len(mu) + #Fit the Support Vector Regression + clf = SVR(gamma=svr_gamma) + clf.fit(log2_m[:,newaxis], log2_cv) + fitted_fun = clf.predict + score = log2(cv) - fitted_fun(log2(mu)[:,newaxis]) + params = None + #The coordinates of the fitted curve + mu_linspace = linspace(min(log2_m),max(log2_m)) + cv_fit = fitted_fun(mu_linspace[:,newaxis]) + return score, mu_linspace, cv_fit , params + + except ImportError: + if verbose: + print('SVR fit requires scikit-learn python library. Using exponential instead.') + if 'bin' in fit_method: + return fit_CV(mu, cv, fit_method='binExp', x0=x0) + else: + return fit_CV(mu, cv, fit_method='Exp', x0=x0) + elif 'Exp' in fit_method: + from scipy.optimize import minimize + #Define the objective function to fit (least squares) + fun = lambda x, log2_m, log2_cv: sum(abs( log2( (2.**log2_m)**(-x[0])+x[1]) - log2_cv )) + #Fit using Nelder-Mead algorythm + optimization = minimize(fun, x0, args=(log2_m,log2_cv), method='Nelder-Mead') + params = optimization.x + #The fitted function + fitted_fun = lambda log_mu: log2( (2.**log_mu)**(-params[0]) + params[1]) + # Score is the relative position with respect of the fitted curve + score = log2(cv) - fitted_fun(log2(mu)) + #The coordinates of the fitted curve + mu_linspace = linspace(min(log2_m),max(log2_m)) + cv_fit = fitted_fun(mu_linspace) + return score, mu_linspace, cv_fit , params + + + +def feature_selection(data,thrs, verbose=False): + if thrs>= data.shape[0]: + if verbose: + print ("Trying to select %i features but only %i genes available." 
%( thrs, data.shape[0])) + print ("Skipping feature selection") + return arange(data.shape[0]) + ix_genes = arange(data.shape[0]) + threeperK = int(ceil(3*data.shape[1]/1000.)) + zerotwoperK = int(floor(0.3*data.shape[1]/1000.)) + # is at least 1 molecule in 0.3% of thecells, is at least 2 molecules in 0.03% of the cells + condition = (sum(data>=1, 1)>= threeperK) & (sum(data>=2, 1)>=zerotwoperK) + ix_genes = ix_genes[condition] + + mu = data[ix_genes,:].mean(1) + sigma = data[ix_genes,:].std(1, ddof=1) + cv = sigma/mu + + try: + score, mu_linspace, cv_fit , params = fit_CV(mu,cv,fit_method='SVR', verbose=verbose) + except ImportError: + print ("WARNING: Feature selection was skipped becouse scipy is required. Install scipy to run feature selection.") + return arange(data.shape[0]) + + return ix_genes[argsort(score)[::-1]][:thrs] + +def usage_quick(): + + message ='''usage: backSPIN [-hbv] [-i inputfile] [-o outputfolder] [-d int] [-f int] [-t int] [-s float] [-T int] [-S float] [-g int] [-c int] [-k float] [-r float] + manual: backSPIN -h + ''' + print (message) + +def usage(): + + message=''' + backSPIN commandline tool + ------------------------- + + The options are as follows: + + -i [inputfile] + --input=[inputfile] + Path of the cef formatted tab delimited file. + Rows should be genes and columns single cells/samples. + For further information on the cef format visit: + https://github.com/linnarsson-lab/ceftools + + -o [outputfile] + --output=[outputfile] + The name of the file to which the output will be written + + -d [int] + Depth/Number of levels: The number of nested splits that will be tried by the algorithm + -t [int] + Number of the iterations used in the preparatory SPIN. + Defaults to 10 + -f [int] + Feature selection is performed before BackSPIN. Argument controls how many genes are seleceted. + Selection is based on expected noise (a curve fit to the CV-vs-mean plot). 
+ -s [float] + Controls the decrease rate of the width parameter used in the preparatory SPIN. + Smaller values will increase the number of SPIN iterations and result in higher + precision in the first step but longer execution time. + Defaults to 0.1 + -T [int] + Number of the iterations used for every width parameter. + Does not apply on the first run (use -t instead) + Defaults to 8 + -S [float] + Controls the decrease rate of the width parameter. + Smaller values will increase the number of SPIN iterations and result in higher + precision but longer execution time. + Does not apply on the first run (use -s instead) + Defaults to 0.3 + -g [int] + Minimal number of genes that a group must contain for splitting to be allowed. + Defaults to 2 + -c [int] + Minimal number of cells that a group must contain for splitting to be allowed. + Defaults to 2 + -k [float] + Minimum score that a breaking point has to reach to be suitable for splitting. + Defaults to 1.15 + -r [float] + If the difference between the average expression of two groups is lower than threshold the algorythm + uses higly correlated genes to assign the gene to one of the two groups + Defaults to 0.2 + -b [axisvalue] + Run normal SPIN instead of backSPIN. + Normal spin accepts the parameters -T -S + An axis value 0 to only sort genes (rows), 1 to only sort cells (columns) or 'both' for both + must be passed + -v + Verbose. 
Print to the stdoutput extra details of what is happening + + ''' + + print(message) + + + +if __name__ == '__main__': + print("") + #defaults arguments + input_path = None + outfiles_path = None + numLevels=2 # -d + feature_fit = False # -f + feature_genes = 2000 + first_run_iters=10 # -t + first_run_step=0.1 # -s + runs_iters=8 # -T + runs_step=0.3 # -S + split_limit_g=2 # -g + split_limit_c=2 # -c + stop_const = 1.15 # -k + low_thrs=0.2 # -r + normal_spin = False #-b + normal_spin_axis = 'both' + verbose=False # -v + + optlist, args = getopt.gnu_getopt(sys.argv[1:], "hvi:o:f:d:t:s:T:S:g:c:k:r:b:", ["help", "input=","output="]) + + if optlist== [] and args == []: + usage_quick() + sys.exit() + for opt, a in optlist: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ('-i', '--input'): + input_path = a + elif opt in ("-o", "--output"): + outfiles_path = a + elif opt == '-d': + numLevels = int(a) + elif opt == '-f': + feature_fit = True + if a != '': + feature_genes = int(a) + elif opt == '-t': + first_run_iters = int(a) + elif opt == '-s': + first_run_step = float(a) + elif opt == '-T': + runs_iters = int(a) + elif opt == '-S': + runs_step = float(a) + elif opt == '-g': + split_limit_g = int(a) + elif opt == '-c': + split_limit_c = int(a) + elif opt == '-k': + stop_const = float(a) + elif opt == '-r': + low_thrs = float(a) + elif opt == '-v': + verbose = True + elif opt == '-b': + normal_spin = True + if a != '': + if a == 'both': + normal_spin_axis = a + else: + normal_spin_axis = int(a) + else: + assert False, "%s option is not supported" % opt + + if input_path == None: + print ('No input file was provided.\nYou need to specify an input file\n(e.g. backSPIN -i path/to/your/file/foo.cef)\n') + sys.exit() + if outfiles_path == None: + print ('No output file was provided.\nYou need to specify an output file\n(e.g. 
backSPIN -o path/to/your/file/bar.cef)\n') + sys.exit() + + try: + if verbose: + print ('Loading file.') + input_cef = CEF_obj() + input_cef.readCEF(input_path) + + data = array(input_cef.matrix) + + if feature_fit: + if verbose: + print ("Performing feature selection") + ix_features = feature_selection(data, feature_genes, verbose=verbose) + if verbose: + print ("Selected %i genes" % len(ix_features)) + data = data[ix_features, :] + input_cef.matrix = data.tolist() + input_cef.row_attr_values = atleast_2d( array( input_cef.row_attr_values ))[:,ix_features].tolist() + input_cef.update() + + data = log2(data+1) + data = data - data.mean(1)[:,newaxis] + if data.shape[0] <= 3 and data.shape[1] <= 3: + print ('Input file is not correctly formatted.\n') + sys.exit() + except Exception as err: + import traceback + print ('There was an error') + print (traceback.format_exc()) + print ('Error occurred in parsing the input file.') + print ('Please check that your input file is a correctly formatted cef file.\n') + sys.exit() + + if normal_spin == False: + + print ('backSPIN started\n----------------\n') + print ('Input file:\n%s\n' % input_path) + print ('Output file:\n%s\n' % outfiles_path) + print ('numLevels: %i\nfirst_run_iters: %i\nfirst_run_step: %.3f\nruns_iters: %i\nruns_step: %.3f\nsplit_limit_g: %i\nsplit_limit_c: %i\nstop_const: %.3f\nlow_thrs: %.3f\n' % (numLevels, first_run_iters, first_run_step, runs_iters,\ + runs_step, split_limit_g, split_limit_c, stop_const, low_thrs)) + + + results = backSPIN(data, numLevels, first_run_iters, first_run_step, runs_iters, runs_step,\ + split_limit_g, split_limit_c, stop_const, low_thrs, verbose) + + sys.stdout.flush() + print ('\nWriting output.\n') + + output_cef = CEF_obj() + + for h_name, h_val in zip( input_cef.header_names, input_cef.header_values): + output_cef.add_header(h_name, h_val ) + for c_name, c_val in zip( input_cef.col_attr_names, input_cef.col_attr_values): + output_cef.add_col_attr(c_name, 
array(c_val)[results.cells_order]) + for r_name, r_val in zip( input_cef.row_attr_names, input_cef.row_attr_values): + output_cef.add_row_attr(r_name, array(r_val)[results.genes_order]) + + for level, groups in enumerate( results.genes_gr_level.T ): + output_cef.add_row_attr('Level_%i_group' % level, [int(el) for el in groups]) + for level, groups in enumerate( results.cells_gr_level.T ): + output_cef.add_col_attr('Level_%i_group' % level, [int(el) for el in groups]) + + output_cef.set_matrix(array(input_cef.matrix)[results.genes_order,:][:,results.cells_order]) + if sum(type(i)==float for i in input_cef.matrix[0]) + sum(type(i)==float for i in input_cef.matrix[-1]) == 0: + fmt = '%i' + else: + fmt ='%.6g' + output_cef.writeCEF( outfiles_path, matrix_str_fmt=fmt ) + else: + + print ('normal SPIN started\n----------------\n') + print ('Input file:\n%s\n' % input_path) + print ('Output file:\n%s\n' % outfiles_path) + + results = SPIN(data, widlist=runs_step, iters=runs_iters, axis=normal_spin_axis, verbose=verbose) + + print ('\nWriting output.\n') + + output_cef = CEF_obj() + + for h_name, h_val in zip( input_cef.header_names, input_cef.header_values): + output_cef.add_header(h_name, h_val ) + + if normal_spin_axis == 'both': + for c_name, c_val in zip( input_cef.col_attr_names, input_cef.col_attr_values): + output_cef.add_col_attr(c_name, array(c_val)[results[1]]) + for r_name, r_val in zip( input_cef.row_attr_names, input_cef.row_attr_values): + output_cef.add_row_attr(r_name, array(r_val)[results[0]]) + output_cef.set_matrix(array(input_cef.matrix)[results[0],:][:,results[1]]) + + if normal_spin_axis == 0: + for r_name, r_val in zip( input_cef.row_attr_names, input_cef.row_attr_values): + output_cef.add_row_attr(r_name, array(r_val)[results]) + output_cef.set_matrix(array(input_cef.matrix)[results,:]) + + if normal_spin_axis == 1: + for c_name, c_val in zip( input_cef.col_attr_names, input_cef.col_attr_values): + output_cef.add_col_attr(c_name, 
array(c_val)[results]) + output_cef.set_matrix(array(input_cef.matrix)[:,results]) + + + + output_cef.writeCEF( outfiles_path ) + From 43b112667710ec340ddc62eafb8e8856c8d22601 Mon Sep 17 00:00:00 2001 From: Python3pkg Date: Thu, 18 May 2017 05:14:21 -0700 Subject: [PATCH 2/3] Remove .bak --- backspinpy/Cef_tools.py.bak | 161 ------- backspinpy/__init__.py.bak | 3 - backspinpy/backSPIN.py.bak | 932 ------------------------------------ 3 files changed, 1096 deletions(-) delete mode 100644 backspinpy/Cef_tools.py.bak delete mode 100644 backspinpy/__init__.py.bak delete mode 100755 backspinpy/backSPIN.py.bak diff --git a/backspinpy/Cef_tools.py.bak b/backspinpy/Cef_tools.py.bak deleted file mode 100644 index 9d56f8a..0000000 --- a/backspinpy/Cef_tools.py.bak +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2015 Gioele La Manno and Sten Linnarsson -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -# This code contains a simple parser reader for Cef files -# typical usage to write a cef_file: -# -# cef = CEF_obj() -# cef.add_header('Mynote', 'This is my message') -# cef.set_matrix(your_array) -# cef.add_col_attr(attribute_name, attribute_values_list) -# cef.add_row_attr(attribute_name, attribute_values_list) -# cef.writeCEF('path/to/your/file.cef') -# -# to read: -# cef = CEF_obj() -# cef.readCEF('path/to/your/file.cef') -# your_array = cef.matrix -# attribute_values_list2 = cef.col_attr_values[2] -# ... 
- -from builtins import str as unicode -from io import open -from builtins import range - -class CEF_obj(object): - def __init__(self): - self.headers = 0 - self.row_attr = 0 - self.col_attr = 0 - self.rows = 0 - self.cols = 0 - self.flags = 0 - self.tab_fields = 7 - - self.header_names = [] - self.header_values = [] - - self.row_attr_names = [] - self.row_attr_values = [] - - self.col_attr_names = [] - self.col_attr_values = [] - - self.matrix = [] - - - def update(self): - self.headers = len( self.header_names) - self.row_attr = len(self.row_attr_names) - self.col_attr = len(self.col_attr_names) - self.rows = len(self.matrix) - self.cols = len(self.matrix[0]) - self.tab_fields = max(7, self.cols + self.row_attr + 1) - self.linestr = u'%s' + u'\t%s' * (self.tab_fields-1) + u'\n' - - def add_header(self, name, value): - self.header_names.append(name) - self.header_values.append(value) - - def add_row_attr(self, name, value): - self.row_attr_names.append(name) - self.row_attr_values.append(list(value)) - - def add_col_attr(self, name, value): - self.col_attr_names.append(name) - self.col_attr_values.append(list(value)) - - def set_matrix(self, matrix): - for row in matrix: - self.matrix.append(list(row)) - - def readCEF(self, filepath, matrix_dtype = 'auto'): - #Delete all the stored information - self.__init__() - #Start parsing - with open(filepath, 'rU') as fin: - # Read cef file first line - self.header, self.row_attr, self.col_attr, self.rows,\ - self.cols, self.flags = fin.readline().rstrip('\n').split('\t')[1:7] - self.header = int(self.header) - self.row_attr = int( self.row_attr ) - self.col_attr = int(self.col_attr) - self.rows = int(self.rows) - self.cols = int(self.cols) - self.flags = int(self.flags) - self.row_attr_values = [[] for _ in range(self.row_attr)] - # Read header - for i in range(self.header): - name, value = fin.readline().rstrip('\n').split('\t')[:2] - self.header_names.append(name) - self.header_values.append(value) - # Read col attr - 
for i in range(self.col_attr): - line_col_attr = fin.readline().rstrip('\n').split('\t')[self.row_attr:] - self.col_attr_names.append( line_col_attr[0] ) - self.col_attr_values.append( line_col_attr[1:] ) - #Read row attr and matrix - self.row_attr_names += fin.readline().rstrip('\n').split('\t')[:self.row_attr] - for _ in range(self.rows): - linelist = fin.readline().rstrip('\n').split('\t') - for n, entry in enumerate( linelist[:self.row_attr] ): - self.row_attr_values[n].append( entry ) - if matrix_dtype == 'auto': - if sum(('.' in i) or ('e' in i) for i in linelist[self.row_attr+1:]) != 0: - matrix_dtype = float - else: - matrix_dtype = int - try: - self.matrix.append( [matrix_dtype(el) for el in linelist[self.row_attr+1:] ]) - except ValueError: - print(repr(el), ' is invalid') - - - - def writeCEF(self, filepath, matrix_str_fmt = '%i'): - self.update() - with open(filepath, 'w') as fout: - #Write cef file first line - fout.write( self.linestr % ( ('CEF', unicode(self.headers), unicode(self.row_attr),\ - unicode(self.col_attr) , unicode(self.rows), unicode(self.cols), unicode(self.flags) ) +\ - ('',) * (self.tab_fields - 7) ) ) - #Write header - for i in range(self.headers): - fout.write(self.linestr % ( (unicode( self.header_names[i]), unicode( self.header_values[i]) ) + ('',) * (self.tab_fields - 2) )) - #Write col attributes - for i in range( self.col_attr ): - fout.write( self.linestr % ( ('',) * (self.row_attr) + (unicode( self.col_attr_names[i] ),) + tuple( unicode(el) for el in self.col_attr_values[i] ) )) - #Write headers of row attributes - fout.write( self.linestr % ( tuple(unicode(el) for el in self.row_attr_names) + ('',)*(self.tab_fields-self.row_attr) ) ) - #Write rows - for i in range(self.rows): - for j in range(self.row_attr): - fout.write(unicode(self.row_attr_values[j][i]) + u'\t') - fout.write(u'\t') - fout.write(u'\t'.join([unicode(matrix_str_fmt % el) for el in self.matrix[i]] ) ) - fout.write(u'\n') - - - diff --git 
a/backspinpy/__init__.py.bak b/backspinpy/__init__.py.bak deleted file mode 100644 index 2b92ec8..0000000 --- a/backspinpy/__init__.py.bak +++ /dev/null @@ -1,3 +0,0 @@ -from __future__ import absolute_import -from .backSPIN import SPIN, backSPIN, fit_CV, feature_selection -from .Cef_tools import CEF_obj diff --git a/backspinpy/backSPIN.py.bak b/backspinpy/backSPIN.py.bak deleted file mode 100755 index 02bc3bd..0000000 --- a/backspinpy/backSPIN.py.bak +++ /dev/null @@ -1,932 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2015, Amit Zeisel, Gioele La Manno and Sten Linnarsson -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# This .py file can be used as a library or a command-line version of BackSPIN, -# This version of BackSPIN was implemented by Gioele La Manno. -# The BackSPIN biclustering algorithm was developed by Amit Zeisel and is described -# in Zeisel et al. Cell types in the mouse cortex and hippocampus revealed by -# single-cell RNA-seq Science 2015 (PMID: 25700174, doi: 10.1126/science.aaa1934). -# -# Building using pyinstaller: -# pyinstaller -F backSPIN.py -n backspin-mac-64-bit -# - -from __future__ import division, print_function, absolute_import -from numpy import * -import getopt -import sys -import os -from .Cef_tools import CEF_obj - -class Results: - pass - -def calc_loccenter(x, lin_log_flag): - M,N = x.shape - if N==1 and M>1: - x = x.T - M,N = x.shape - loc_center = zeros(M) - min_x = x.min(1) - x = x - min_x[:,newaxis] - for i in range(M): - ind = where(x[i,:]>0)[0] - if len(ind) != 0: - if lin_log_flag == 1: - w = x[i,ind]/sum(x[i,ind], 0) - else: - w = (2**x[i,ind])/sum(2**x[i,ind], 0) - loc_center[i] = sum(w*ind, 0) - else: - loc_center[i] = 0 - - return loc_center - -def _calc_weights_matrix(mat_size, wid): - '''Calculate Weight Matrix - Parameters - ---------- - mat_size: int - dimension of the distance matrix - wid: int - parameter that controls the width of the neighbourood - Returns - ------- - weights_mat: 2-D array - the weights matrix to multiply with the distance matrix - - ''' - #calculate square distance from the diagonal - sqd = (arange(1,mat_size+1)[newaxis,:] - arange(1,mat_size+1)[:,newaxis])**2 - #make the distance relative to the mat_size - norm_sqd = sqd/wid - #evaluate a normal pdf - weights_mat = exp(-norm_sqd/mat_size) - #avoid useless precision that would slow down the matrix multiplication - weights_mat -= 1e-6 - weights_mat[weights_mat<0] = 0 - #normalize row and column sum - weights_mat /= sum(weights_mat,0)[newaxis,:] - weights_mat /= sum(weights_mat,1)[:, newaxis] - #fix asimmetries - weights_mat = (weights_mat + 
weights_mat.T) / 2. - return weights_mat - - -def _sort_neighbourhood( dist_matrix, wid ): - '''Perform a single iteration of SPIN - Parameters - ---------- - dist_matrix: 2-D array - distance matrix - wid: int - parameter that controls the width of the neighbourood - Returns - ------- - sorted_ind: 1-D array - indexes that order the matrix - - ''' - assert wid > 0, 'Parameter wid < 0 is not allowed' - mat_size = dist_matrix.shape[0] - #assert mat_size>2, 'Matrix is too small to be sorted' - weights_mat = _calc_weights_matrix(mat_size, wid) - #Calculate the dot product (can be very slow for big mat_size) - mismatch_score = dot(dist_matrix, weights_mat) - energy, target_permutation = mismatch_score.min(1), mismatch_score.argmin(1) - max_energy = max(energy) - #Avoid points that have the same target_permutation value - sort_score = target_permutation - 0.1 * sign( (mat_size/2 - target_permutation) ) * energy/max_energy - #sort_score = target_permutation - 0.1 * sign( 1-2*(int(1000*energy/max_energy) % 2) ) * energy/max_energy # Alternative - # Sorting the matrix - sorted_ind = sort_score.argsort(0)[::-1] - return sorted_ind - - -def sort_mat_by_neighborhood(dist_matrix, wid, times): - '''Perform several iterations of SPIN using a fixed wid parameter - Parameters - ---------- - dist_matrix: 2-D array - distance matrix - wid: int - parameter that controls the width of the neighbourood - times: int - number of repetitions - verbose: bool - print the progress - Returns - ------- - indexes: 1-D array - indexes that order the matrix - - ''' - # original indexes - indexes = arange(dist_matrix.shape[0]) - for i in range(times): - #sort the sitance matrix according the previous iteration - tmpmat = dist_matrix[indexes,:] - tmpmat = tmpmat[:,indexes] - sorted_ind = _sort_neighbourhood(tmpmat, wid); - #resort the original indexes - indexes = indexes[sorted_ind] - return indexes - - -def _generate_widlist(data, axis=1, step=0.6): - '''Generate a list of wid parameters to execute 
sort_mat_by_neighborhood - Parameters - ---------- - data: 2-D array - the data matrix - axis: int - the axis to take in consideration - step: float - the increment between two successive wid parameters - Returns - ------- - wid_list: list of int - list of wid parameters to run SPIN - - ''' - max_wid = data.shape[axis]*0.6 - new_wid = 1 - wid_list = [] - while new_wid < (1+step)*max_wid: - wid_list.append( new_wid ) - new_wid = int(ceil( new_wid + new_wid*(step) +1)) - return wid_list[::-1] - - - -def SPIN(dt, widlist=[10,1], iters=30, axis='both', verbose=False): - """Run the original SPIN algorithm - Parameters - ---------- - dt: 2-D array - the data matrix - widlist: float or list of int - If float is passed, it is used as step parameted of _generate_widlist, - and widlist is generated to run SPIN. - If list is passed it is used directly to run SPIN. - iters: int - number of repetitions for every wid in widlist - axis: int - the axis to take in consideration (must be 0, 1 or 'both') - step: float - the increment between two successive wid parameters - Returns - ------- - indexes: 1-D array (if axis in [0,1]) or tuple of 1-D array (if axis = 'both') - indexes that sort the data matrix - Notes - ----- - Typical usage - sorted_dt0 = SPIN(dt, iters=30, axis=0) - sorted_dt1 = SPIN(dt, iters=30, axis=1) - dt = dt[sorted_dt0,:] - dt = dt[:,sorted_dt1] - """ - IXc = arange(dt.shape[1]) - IXr = arange(dt.shape[0]) - assert axis in ['both', 0,1], 'axis must be 0, 1 or \'both\' ' - #Sort both axis - if axis == 'both': - CCc = 1 - corrcoef(dt.T) - CCr = 1 - corrcoef(dt) - if type(widlist) != list: - widlist_r = _generate_widlist(dt, axis=0, step=widlist) - widlist_c = _generate_widlist(dt, axis=1, step=widlist) - if verbose: - print('\nSorting genes.') - print('Neighbourood=', end=""), - for wid in widlist_r: - if verbose: - print ('%i, ' % wid), - sys.stdout.flush() - INDr = sort_mat_by_neighborhood(CCr, wid, iters) - CCr = CCr[INDr,:][:,INDr] - IXr = IXr[INDr] - if 
verbose: - print ('\nSorting cells.') - print ('Neighbourood=',end="") - for wid in widlist_c: - if verbose: - print ('%i, ' % wid), - sys.stdout.flush() - INDc = sort_mat_by_neighborhood(CCc, wid, iters) - CCc = CCc[:,INDc][INDc,:] - IXc= IXc[INDc] - return IXr, IXc - #Sort rows - elif axis == 0: - CCr = 1 - corrcoef(dt) - if type(widlist) != list: - widlist = _generate_widlist(dt, axis=0, step=widlist) - if verbose: - print ('\nSorting genes.\nNeighbourood=',end="") - for wid in widlist: - if verbose: - print ('%i, ' % wid,end="") - sys.stdout.flush() - INDr = sort_mat_by_neighborhood(CCr, wid, iters) - CCr = CCr[INDr,:][:,INDr] - IXr = IXr[INDr] - return IXr - #Sort columns - elif axis == 1: - CCc = 1 - corrcoef(dt.T) - if type(widlist) != list: - widlist = _generate_widlist(dt, axis=1, step=widlist) - if verbose: - print ('\nSorting cells.\nNeighbourood=',end="") - for wid in widlist: - if verbose: - print ('%i, ' % wid,end="") - sys.stdout.flush() - INDc = sort_mat_by_neighborhood(CCc, wid, iters) - CCc = CCc[:,INDc][INDc,:] - IXc = IXc[INDc] - return IXc - - -def backSPIN(data, numLevels=2, first_run_iters=10, first_run_step=0.05, runs_iters=8 ,runs_step=0.25,\ - split_limit_g=2, split_limit_c=2, stop_const = 1.15, low_thrs=0.2, verbose=False): - '''Run the backSPIN algorithm - Parameters - ---------- - data: 2-D array - the data matrix, rows should be genes and columns single cells/samples - numLevels: int - the number of splits that will be tried - first_run_iters: float - the iterations of the preparatory SPIN - first_run_step: float - the step parameter passed to _generate_widlist for the preparatory SPIN - runs_iters: int - the iterations parameter passed to the _divide_to_2and_resort. - influences all the SPIN iterations except the first - runs_step: float - the step parameter passed to the _divide_to_2and_resort. 
- influences all the SPIN iterations except the first - wid: float - the wid of every iteration of the splitting and resorting - split_limit_g: int - If the number of specific genes in a subgroup is smaller than this number - splitting of that subgrup is not allowed - split_limit_c: int - If the number cells in a subgroup is smaller than this number splitting of - that subgrup is not allowed - stop_const: float - minimum score that a breaking point has to reach to be suitable for splitting - low_thrs: float - genes with average lower than this threshold are assigned to either of the - splitting group reling on genes that are higly correlated with them - - Returns - ------- - results: Result object - The results object contain the following attributes - genes_order: 1-D array - indexes (a permutation) sorting the genes - cells_order: 1-D array - indexes (a permutation) sorting the cells - genes_gr_level: 2-D array - for each depth level contains the cluster indexes for each gene - cells_gr_level: - for each depth level contains the cluster indexes for each cell - cells_gr_level_sc: - score of the splitting - genes_bor_level: - the border index between gene clusters - cells_bor_level: - the border index between cell clusters - - Notes - ----- - Typical usage - - ''' - assert numLevels>0, '0 is not an available depth for backSPIN, use SPIN instead' - #initialize some varaibles - genes_bor_level = [[] for i in range(numLevels)] - cells_bor_level = [[] for i in range(numLevels)] - N,M = data.shape - genes_order = arange(N) - cells_order = arange(M) - genes_gr_level = zeros((N,numLevels+1)) - cells_gr_level = zeros((M,numLevels+1)) - cells_gr_level_sc = zeros((M,numLevels+1)) - - # Do a Preparatory SPIN on cells - if verbose: - print ('\nPreparatory SPIN') - ix1 = SPIN(data, widlist=_generate_widlist(data, axis=1, step=first_run_step), iters=first_run_iters, axis=1, verbose=verbose) - cells_order = cells_order[ix1] - - #For every level of depth DO: - for i in 
range(numLevels): - k=0 # initialize group id counter - # For every group generated at the parent level DO: - for j in range( len( set(cells_gr_level[:,i]) ) ): - # Extract the a data matrix of the genes at that level - g_settmp = nonzero(genes_gr_level[:,i]==j)[0] #indexes of genes in the level j - c_settmp = nonzero(cells_gr_level[:,i]==j)[0] #indexes of cells in the level j - datatmp = data[ ix_(genes_order[g_settmp], cells_order[c_settmp]) ] - # If we are not below the splitting limit for both genes and cells DO: - if (len(g_settmp)>split_limit_g) & (len(c_settmp)>split_limit_c): - # Split and SPINsort the two halves - if i == numLevels-1: - divided = _divide_to_2and_resort(datatmp, wid=runs_step, iters_spin=runs_iters,\ - stop_const=stop_const, low_thrs=low_thrs, sort_genes=True, verbose=verbose) - else: - divided = _divide_to_2and_resort(datatmp, wid=runs_step, iters_spin=runs_iters,\ - stop_const=stop_const, low_thrs=low_thrs, sort_genes=False,verbose=verbose) - # _divide_to_2and_resort retruns an empty array in gr2 if the splitting condition was not satisfied - if divided: - sorted_data_resort1, genes_resort1, cells_resort1,\ - gr1, gr2, genesgr1, genesgr2, score1, score2 = divided - # Resort from the previous level - genes_order[g_settmp] = genes_order[g_settmp[genes_resort1]] - cells_order[c_settmp] = cells_order[c_settmp[cells_resort1]] - # Assign a numerical identifier to the groups - genes_gr_level[g_settmp[genesgr1],i+1] = k - genes_gr_level[g_settmp[genesgr2],i+1] = k+1 - cells_gr_level[c_settmp[gr1],i+1] = k - cells_gr_level[c_settmp[gr2],i+1] = k+1 - # Not really clear what sc is - cells_gr_level_sc[c_settmp[gr1],i+1] = score1 - cells_gr_level_sc[c_settmp[gr2],i+1] = score2 - # Augment the counter of 2 becouse two groups were generated from one - k = k+2 - else: - # The split is not convenient, keep everithing the same - genes_gr_level[g_settmp,i+1] = k - # if it is the deepest level: perform gene sorting - if i == numLevels-1: - if 
(datatmp.shape[0] > 2 )and (datatmp.shape[1] > 2): - genes_resort1 = SPIN(datatmp, widlist=runs_step, iters=runs_iters, axis=0, verbose=verbose) - genes_order[g_settmp] = genes_order[g_settmp[genes_resort1]] - cells_gr_level[c_settmp,i+1] = k - cells_gr_level_sc[c_settmp,i+1] = cells_gr_level_sc[c_settmp,i] - # Augment of 1 becouse no new group was generated - k = k+1 - else: - # Below the splitting limit: the split is not convenient, keep everithing the same - genes_gr_level[g_settmp,i+1] = k - cells_gr_level[c_settmp,i+1] = k - cells_gr_level_sc[c_settmp,i+1] = cells_gr_level_sc[c_settmp,i] - # Augment of 1 becouse no new group was generated - k = k+1 - - # Find boundaries - genes_bor_level[i] = r_[0, nonzero(diff(genes_gr_level[:,i+1])>0)[0]+1, data.shape[0] ] - cells_bor_level[i] = r_[0, nonzero(diff(cells_gr_level[:,i+1])>0)[0]+1, data.shape[1] ] - - #dataout_sorted = data[ ix_(genes_order,cells_order) ] - - results = Results() - results.genes_order = genes_order - results.cells_order = cells_order - results.genes_gr_level = genes_gr_level - results.cells_gr_level = cells_gr_level - results.cells_gr_level_sc = cells_gr_level_sc - results.genes_bor_level = genes_bor_level - results.cells_bor_level = cells_bor_level - - return results - - - -def _divide_to_2and_resort(sorted_data, wid, iters_spin=8, stop_const = 1.15, low_thrs=0.2 , sort_genes=True, verbose=False): - '''Core function of backSPIN: split the datamatrix in two and resort the two halves - - Parameters - ---------- - sorted_data: 2-D array - the data matrix, rows should be genes and columns single cells/samples - wid: float - wid parameter to give to widlist parameter of th SPIN fucntion - stop_const: float - minimum score that a breaking point has to reach to be suitable for splitting - low_thrs: float - if the difference between the average expression of two groups is lower than threshold the algorythm - uses higly correlated gens to assign the gene to one of the two groups - verbose: bool - 
information about the split is printed - - Returns - ------- - ''' - - # Calculate correlation matrix for cells and genes - Rcells = corrcoef(sorted_data.T) - Rgenes = corrcoef(sorted_data) - # Look for the optimal breaking point - N = Rcells.shape[0] - score = zeros(N) - for i in range(2,N-2): - if i == 2: - tmp1 = sum( Rcells[:i,:i] ) - tmp2 = sum( Rcells[i:,i:] ) - score[i] = (tmp1+tmp2) / float(i**2 + (N-i)**2) - else: - tmp1 += sum(Rcells[i-1,:i]) + sum(Rcells[:i-1,i-1]); - tmp2 -= sum(Rcells[i-1:,i-1]) + sum(Rcells[i-1,i:]); - score[i] = (tmp1+tmp2) / float(i**2 + (N-i)**2) - - breakp1 = argmax(score) - score1 = Rcells[:breakp1,:breakp1] - score1 = triu(score1) - score1 = mean( score1[score1 != 0] ) - score2 = Rcells[breakp1:, breakp1:] - score2 = triu(score2) - score2 = mean( score2[score2 != 0] ) - avg_tot = triu(Rcells) - avg_tot = mean( avg_tot[avg_tot != 0] ) - - # If it is convenient to break - if (max([score1,score2])/avg_tot) > stop_const: - # Divide in two groups - gr1 = arange(N)[:breakp1] - gr2 = arange(N)[breakp1:] - # and assign the genes into the two groups - mean_gr1 = sorted_data[:, gr1].mean(1) - mean_gr2 = sorted_data[:, gr2].mean(1) - concat_loccenter_gr1 = c_[ calc_loccenter(sorted_data[:,gr1], 2), calc_loccenter(sorted_data[:,gr1][...,::-1], 2) ] - concat_loccenter_gr2 = c_[ calc_loccenter(sorted_data[:,gr2], 2), calc_loccenter(sorted_data[:,gr2][...,::-1], 2) ] - center_gr1, flip_flag1 = concat_loccenter_gr1.min(1), concat_loccenter_gr1.argmin(1) - center_gr2, flip_flag2 = concat_loccenter_gr2.max(1), concat_loccenter_gr2.argmax(1) - sorted_data_tmp = array( sorted_data ) - sorted_data_tmp[ix_(flip_flag1==1,gr1)] = sorted_data[ix_(flip_flag1==1,gr1)][...,::-1] - sorted_data_tmp[ix_(flip_flag2==1,gr2)] = sorted_data[ix_(flip_flag2==1,gr2)][...,::-1] - loc_center = calc_loccenter(sorted_data_tmp, 2) - - imax = zeros(loc_center.shape) - imax[loc_center<=breakp1] = 1 - imax[loc_center>breakp1] = 2 - - genesgr1 = where(imax==1)[0] - genesgr2 
= where(imax==2)[0] - if size(genesgr1) == 0: - IN = argmax(mean_gr1) - genesgr1 = array([IN]) - genesgr2 = setdiff1d(genesgr2, IN) - elif size(genesgr2) == 0: - IN = argmax(mean_gr2) - genesgr2 = array([IN]) - genesgr1 = setdiff1d(genesgr1, IN) - - if verbose: - print ('\nSplitting (%i, %i) ' % sorted_data.shape) - print ('in (%i,%i) ' % (genesgr1.shape[0],gr1.shape[0])) - print ('and (%i,%i)' % (genesgr2.shape[0],gr2.shape[0]),end="") - sys.stdout.flush() - - # Data of group1 - datagr1 = sorted_data[ix_(genesgr1,gr1)] - # zero center - datagr1 = datagr1 - datagr1.mean(1)[:,newaxis] - # Resort group1 - if min( datagr1.shape ) > 1: - if sort_genes: - genesorder1,cellorder1 = SPIN(datagr1, widlist=wid, iters=iters_spin, axis='both', verbose=verbose) - else: - cellorder1 = SPIN(datagr1, widlist=wid, iters=iters_spin, axis=1, verbose=verbose) - genesorder1 = arange(datagr1.shape[0]) - elif len(genesgr1) == 1: - genesorder1 = 0 - cellorder1 = argsort( datagr1[0,:] ) - elif len(gr1) == 1: - cellorder1 = 0 - genesorder1 = argsort( datagr1[:,0] ) - - # Data of group2 - datagr2 = sorted_data[ix_(genesgr2,gr2)] - # zero center - datagr2 = datagr2 - datagr2.mean(1)[:,newaxis] - # Resort group2 - if min( datagr2.shape )>1: - if sort_genes: - genesorder2, cellorder2 = SPIN(datagr2, widlist=wid, iters=iters_spin, axis='both',verbose=verbose) - else: - cellorder2 = SPIN(datagr2, widlist=wid, iters=iters_spin, axis=1,verbose=verbose) - genesorder2 = arange(datagr2.shape[0]) - elif len(genesgr2) == 1: - genesorder2 = 0 - cellorder2 = argsort(datagr2[0,:]) - elif len(gr2) == 1: - cellorder2 = 0 - genesorder2 = argsort(datagr2[:,0]) - - # contcatenate cells and genes indexes - genes_resort1 = r_[genesgr1[genesorder1], genesgr2[genesorder2] ] - cells_resort1 = r_[gr1[cellorder1], gr2[cellorder2] ] - genesgr1 = arange(len(genesgr1)) - genesgr2 = arange(len(genesgr1), len(sorted_data[:,0])) - # resort - sorted_data_resort1 = sorted_data[ix_(genes_resort1,cells_resort1)] - - return 
def fit_CV(mu, cv, fit_method='Exp', svr_gamma=0.06, x0=(0.5, 0.5), verbose=False):
    '''Fits a noise model (CV vs mean)

    Parameters
    ----------
    mu: 1-D array
        mean of the genes (raw counts)
    cv: 1-D array
        coefficient of variation for each gene
    fit_method: string
        allowed: 'SVR', 'Exp', 'binSVR', 'binExp'
        default: 'Exp'
        SVR: uses Support vector regression to fit the noise model (requires scikit-learn)
        Exp: Parametric fit to cv = mu^(-a) + b
        bin: before fitting the distribution of mean is normalized to be
             uniform by downsampling and resampling.
    svr_gamma: float or 'auto'
        gamma parameter of the SVR; 'auto' scales it as 1000/len(mu)
    x0: sequence of two floats
        initial guess (a, b) for the parametric 'Exp' fit
    verbose: bool
        if True, prints a notice when falling back from SVR to Exp

    Returns
    -------
    score: 1-D array
        Relative position of each gene with respect to the fitted curve
    mu_linspace: 1-D array
        x coordinates to plot (min(log2(mu)) -> max(log2(mu)))
    cv_fit: 1-D array
        y=f(x) coordinates to plot
    pars: tuple or None
        fitted (a, b) for 'Exp'; None for 'SVR'
    '''
    log2_m = log2(mu)
    log2_cv = log2(cv)

    if len(mu) > 1000 and 'bin' in fit_method:
        # Flatten the distribution of means: histogram with 30 bins, then
        # down/up-sample each bin towards the median bin count.
        n, xi = histogram(log2_m, 30)
        # NOTE(fix): percentile returns a float; cast to int so it can be
        # used in slicing and in the tile() repetition count below.
        med_n = int(percentile(n, 50))
        for i in range(0, len(n)):
            # indexes of the genes that fall within the i-th bin
            ind = where((log2_m >= xi[i]) & (log2_m < xi[i + 1]))[0]
            if len(ind) > med_n:
                # Downsample if count is more than median
                ind = ind[random.permutation(len(ind))]
                ind = ind[:len(ind) - med_n]
                mask = ones(len(log2_m), dtype=bool)
                mask[ind] = False
                log2_m = log2_m[mask]
                log2_cv = log2_cv[mask]
            # NOTE(fix): test len(ind) > 5 first — empty bins previously hit a
            # ZeroDivisionError in med_n/len(ind) before the guard applied.
            elif len(ind) > 5 and around(med_n / len(ind)) > 1:
                # Duplicate if count is less than median
                reps = int(around(med_n / len(ind))) - 1
                log2_m = r_[log2_m, tile(log2_m[ind], reps)]
                log2_cv = r_[log2_cv, tile(log2_cv[ind], reps)]
    else:
        if 'bin' in fit_method:
            print('More than 1000 input feature needed for bin correction.')

    if 'SVR' in fit_method:
        try:
            from sklearn.svm import SVR
            if svr_gamma == 'auto':
                svr_gamma = 1000. / len(mu)
            # Fit the Support Vector Regression
            clf = SVR(gamma=svr_gamma)
            clf.fit(log2_m[:, newaxis], log2_cv)
            fitted_fun = clf.predict
            score = log2(cv) - fitted_fun(log2(mu)[:, newaxis])
            params = None
            # The coordinates of the fitted curve
            mu_linspace = linspace(min(log2_m), max(log2_m))
            cv_fit = fitted_fun(mu_linspace[:, newaxis])
            return score, mu_linspace, cv_fit, params
        except ImportError:
            if verbose:
                print('SVR fit requires scikit-learn python library. Using exponential instead.')
            # Fall back to the parametric fit, preserving the bin correction flag.
            if 'bin' in fit_method:
                return fit_CV(mu, cv, fit_method='binExp', x0=x0)
            return fit_CV(mu, cv, fit_method='Exp', x0=x0)
    elif 'Exp' in fit_method:
        from scipy.optimize import minimize
        # Objective: least absolute deviations of log2(cv) from log2(mu^-a + b)
        fun = lambda x, log2_m, log2_cv: sum(abs(log2((2. ** log2_m) ** (-x[0]) + x[1]) - log2_cv))
        # Fit using the Nelder-Mead algorithm
        optimization = minimize(fun, x0, args=(log2_m, log2_cv), method='Nelder-Mead')
        params = optimization.x
        # The fitted function
        fitted_fun = lambda log_mu: log2((2. ** log_mu) ** (-params[0]) + params[1])
        # Score is the relative position with respect to the fitted curve
        score = log2(cv) - fitted_fun(log2(mu))
        # The coordinates of the fitted curve
        mu_linspace = linspace(min(log2_m), max(log2_m))
        cv_fit = fitted_fun(mu_linspace)
        return score, mu_linspace, cv_fit, params


def feature_selection(data, thrs, verbose=False):
    '''Select the `thrs` most variable genes relative to the fitted noise model.

    Parameters
    ----------
    data: 2-D array (genes x cells) of raw counts
    thrs: int, number of genes to select
    verbose: bool, print progress details

    Returns
    -------
    1-D array of row indexes of the selected genes (all rows if selection
    is skipped).
    '''
    if thrs >= data.shape[0]:
        if verbose:
            print("Trying to select %i features but only %i genes available." % (thrs, data.shape[0]))
            print("Skipping feature selection")
        return arange(data.shape[0])
    ix_genes = arange(data.shape[0])
    threeperK = int(ceil(3 * data.shape[1] / 1000.))
    zerotwoperK = int(floor(0.3 * data.shape[1] / 1000.))
    # keep genes with at least 1 molecule in 0.3% of the cells AND
    # at least 2 molecules in 0.03% of the cells
    condition = (sum(data >= 1, 1) >= threeperK) & (sum(data >= 2, 1) >= zerotwoperK)
    ix_genes = ix_genes[condition]

    mu = data[ix_genes, :].mean(1)
    sigma = data[ix_genes, :].std(1, ddof=1)
    cv = sigma / mu

    try:
        score, mu_linspace, cv_fit, params = fit_CV(mu, cv, fit_method='SVR', verbose=verbose)
    except ImportError:
        # fit_CV falls back from sklearn to scipy internally; reaching here
        # means scipy itself is missing.
        print("WARNING: Feature selection was skipped because scipy is required. Install scipy to run feature selection.")
        return arange(data.shape[0])

    # highest scores = most variable above the noise model
    return ix_genes[argsort(score)[::-1]][:thrs]
def usage_quick():
    '''Print the one-line synopsis shown when backSPIN is run with no arguments.'''
    print('''usage: backSPIN [-hbv] [-i inputfile] [-o outputfolder] [-d int] [-f int] [-t int] [-s float] [-T int] [-S float] [-g int] [-c int] [-k float] [-r float]
    manual: backSPIN -h
    ''')


def usage():
    '''Print the full command-line manual (shown for -h/--help).'''
    msg = '''
    backSPIN commandline tool
    -------------------------

    The options are as follows:

    -i [inputfile]
    --input=[inputfile]
            Path of the cef formatted tab delimited file.
            Rows should be genes and columns single cells/samples.
            For further information on the cef format visit:
            https://github.com/linnarsson-lab/ceftools

    -o [outputfile]
    --output=[outputfile]
            The name of the file to which the output will be written

    -d [int]
            Depth/Number of levels: The number of nested splits that will be tried by the algorithm
    -t [int]
            Number of the iterations used in the preparatory SPIN.
            Defaults to 10
    -f [int]
            Feature selection is performed before BackSPIN. Argument controls how many genes are seleceted.
            Selection is based on expected noise (a curve fit to the CV-vs-mean plot).
    -s [float]
            Controls the decrease rate of the width parameter used in the preparatory SPIN.
            Smaller values will increase the number of SPIN iterations and result in higher
            precision in the first step but longer execution time.
            Defaults to 0.1
    -T [int]
            Number of the iterations used for every width parameter.
            Does not apply on the first run (use -t instead)
            Defaults to 8
    -S [float]
            Controls the decrease rate of the width parameter.
            Smaller values will increase the number of SPIN iterations and result in higher
            precision but longer execution time.
            Does not apply on the first run (use -s instead)
            Defaults to 0.3
    -g [int]
            Minimal number of genes that a group must contain for splitting to be allowed.
            Defaults to 2
    -c [int]
            Minimal number of cells that a group must contain for splitting to be allowed.
            Defaults to 2
    -k [float]
            Minimum score that a breaking point has to reach to be suitable for splitting.
            Defaults to 1.15
    -r [float]
            If the difference between the average expression of two groups is lower than threshold the algorythm
            uses higly correlated genes to assign the gene to one of the two groups
            Defaults to 0.2
    -b [axisvalue]
            Run normal SPIN instead of backSPIN.
            Normal spin accepts the parameters -T -S
            An axis value 0 to only sort genes (rows), 1 to only sort cells (columns) or 'both' for both
            must be passed
    -v
            Verbose. Print to the stdoutput extra details of what is happening

    '''
    print(msg)
if __name__ == '__main__':
    print("")
    # Default arguments; each is overridden by the matching command-line flag.
    input_path = None
    outfiles_path = None
    numLevels = 2           # -d
    feature_fit = False     # -f
    feature_genes = 2000
    first_run_iters = 10    # -t
    first_run_step = 0.1    # -s
    runs_iters = 8          # -T
    runs_step = 0.3         # -S
    split_limit_g = 2       # -g
    split_limit_c = 2       # -c
    stop_const = 1.15       # -k
    low_thrs = 0.2          # -r
    normal_spin = False     # -b
    normal_spin_axis = 'both'
    verbose = False         # -v

    optlist, args = getopt.gnu_getopt(sys.argv[1:], "hvi:o:f:d:t:s:T:S:g:c:k:r:b:", ["help", "input=", "output="])

    # No arguments at all: show the short usage and exit.
    if not optlist and not args:
        usage_quick()
        sys.exit()
    for opt, a in optlist:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ('-i', '--input'):
            input_path = a
        elif opt in ("-o", "--output"):
            outfiles_path = a
        elif opt == '-d':
            numLevels = int(a)
        elif opt == '-f':
            feature_fit = True
            if a != '':
                feature_genes = int(a)
        elif opt == '-t':
            first_run_iters = int(a)
        elif opt == '-s':
            first_run_step = float(a)
        elif opt == '-T':
            runs_iters = int(a)
        elif opt == '-S':
            runs_step = float(a)
        elif opt == '-g':
            split_limit_g = int(a)
        elif opt == '-c':
            split_limit_c = int(a)
        elif opt == '-k':
            stop_const = float(a)
        elif opt == '-r':
            low_thrs = float(a)
        elif opt == '-v':
            verbose = True
        elif opt == '-b':
            normal_spin = True
            # -b accepts 0 (sort rows), 1 (sort columns) or 'both'
            if a != '':
                normal_spin_axis = a if a == 'both' else int(a)
        else:
            assert False, "%s option is not supported" % opt

    # fixed: identity comparison with None instead of ==
    if input_path is None:
        print('No input file was provided.\nYou need to specify an input file\n(e.g. backSPIN -i path/to/your/file/foo.cef)\n')
        sys.exit()
    if outfiles_path is None:
        print('No output file was provided.\nYou need to specify an output file\n(e.g. backSPIN -o path/to/your/file/bar.cef)\n')
        sys.exit()

    try:
        if verbose:
            print('Loading file.')
        input_cef = CEF_obj()
        input_cef.readCEF(input_path)

        data = array(input_cef.matrix)

        if feature_fit:
            if verbose:
                print("Performing feature selection")
            ix_features = feature_selection(data, feature_genes, verbose=verbose)
            if verbose:
                print("Selected %i genes" % len(ix_features))
            # Keep only the selected genes in both the matrix and row attributes.
            data = data[ix_features, :]
            input_cef.matrix = data.tolist()
            input_cef.row_attr_values = atleast_2d(array(input_cef.row_attr_values))[:, ix_features].tolist()
            input_cef.update()

        # log-transform and center every gene (row)
        data = log2(data + 1)
        data = data - data.mean(1)[:, newaxis]
        if data.shape[0] <= 3 and data.shape[1] <= 3:
            print('Input file is not correctly formatted.\n')
            sys.exit()
    except Exception:
        import traceback
        print('There was an error')
        print(traceback.format_exc())
        print('Error occurred in parsing the input file.')
        print('Please check that your input file is a correctly formatted cef file.\n')
        sys.exit()

    if not normal_spin:
        # ---- hierarchical backSPIN biclustering ----
        print('backSPIN started\n----------------\n')
        print('Input file:\n%s\n' % input_path)
        print('Output file:\n%s\n' % outfiles_path)
        print('numLevels: %i\nfirst_run_iters: %i\nfirst_run_step: %.3f\nruns_iters: %i\nruns_step: %.3f\nsplit_limit_g: %i\nsplit_limit_c: %i\nstop_const: %.3f\nlow_thrs: %.3f\n' % (numLevels, first_run_iters, first_run_step, runs_iters,
              runs_step, split_limit_g, split_limit_c, stop_const, low_thrs))

        results = backSPIN(data, numLevels, first_run_iters, first_run_step, runs_iters, runs_step,
                           split_limit_g, split_limit_c, stop_const, low_thrs, verbose)

        sys.stdout.flush()
        print('\nWriting output.\n')

        output_cef = CEF_obj()

        # Copy headers and reorder the attributes to match the sorted matrix.
        for h_name, h_val in zip(input_cef.header_names, input_cef.header_values):
            output_cef.add_header(h_name, h_val)
        for c_name, c_val in zip(input_cef.col_attr_names, input_cef.col_attr_values):
            output_cef.add_col_attr(c_name, array(c_val)[results.cells_order])
        for r_name, r_val in zip(input_cef.row_attr_names, input_cef.row_attr_values):
            output_cef.add_row_attr(r_name, array(r_val)[results.genes_order])

        # One group-membership attribute per split level, for genes and cells.
        for level, groups in enumerate(results.genes_gr_level.T):
            output_cef.add_row_attr('Level_%i_group' % level, [int(el) for el in groups])
        for level, groups in enumerate(results.cells_gr_level.T):
            output_cef.add_col_attr('Level_%i_group' % level, [int(el) for el in groups])

        output_cef.set_matrix(array(input_cef.matrix)[results.genes_order, :][:, results.cells_order])
        # Use integer formatting only when the first and last rows contain no floats.
        if sum(type(i) == float for i in input_cef.matrix[0]) + sum(type(i) == float for i in input_cef.matrix[-1]) == 0:
            fmt = '%i'
        else:
            fmt = '%.6g'
        output_cef.writeCEF(outfiles_path, matrix_str_fmt=fmt)
    else:
        # ---- plain SPIN sorting (no hierarchical splitting) ----
        print('normal SPIN started\n----------------\n')
        print('Input file:\n%s\n' % input_path)
        print('Output file:\n%s\n' % outfiles_path)

        results = SPIN(data, widlist=runs_step, iters=runs_iters, axis=normal_spin_axis, verbose=verbose)

        print('\nWriting output.\n')

        output_cef = CEF_obj()

        for h_name, h_val in zip(input_cef.header_names, input_cef.header_values):
            output_cef.add_header(h_name, h_val)

        # SPIN returns (genes_order, cells_order) for 'both', a single order otherwise.
        if normal_spin_axis == 'both':
            for c_name, c_val in zip(input_cef.col_attr_names, input_cef.col_attr_values):
                output_cef.add_col_attr(c_name, array(c_val)[results[1]])
            for r_name, r_val in zip(input_cef.row_attr_names, input_cef.row_attr_values):
                output_cef.add_row_attr(r_name, array(r_val)[results[0]])
            output_cef.set_matrix(array(input_cef.matrix)[results[0], :][:, results[1]])

        if normal_spin_axis == 0:
            for r_name, r_val in zip(input_cef.row_attr_names, input_cef.row_attr_values):
                output_cef.add_row_attr(r_name, array(r_val)[results])
            output_cef.set_matrix(array(input_cef.matrix)[results, :])

        if normal_spin_axis == 1:
            for c_name, c_val in zip(input_cef.col_attr_names, input_cef.col_attr_values):
                output_cef.add_col_attr(c_name, array(c_val)[results])
            output_cef.set_matrix(array(input_cef.matrix)[:, results])

        output_cef.writeCEF(outfiles_path)