From d75d37fd406065f26b9b2fc5e20c1b7396cbaf7a Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Fri, 11 Feb 2022 13:30:04 +0000 Subject: [PATCH 1/8] update __init__.py --- SCCAF/__init__.py | 336 ++++++++++++++++++++++++++-------------------- 1 file changed, 193 insertions(+), 143 deletions(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index f58c79c..1df6d35 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -19,6 +19,8 @@ # # +import h5py +from scipy.sparse import csr_matrix import pandas as pd import numpy as np from scipy.sparse import issparse @@ -51,22 +53,22 @@ from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.svm import SVC -color_long = ['#e6194b','#3cb44b','#ffe119','#0082c8','#f58231','#911eb4',\ - '#46f0f0','#f032e6','#d2f53c','#fabebe','#008080','#e6beff',\ - '#aa6e28','#800000','#aaffc3','#808000','#ffd8b1','#000080',\ - '#808080','#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", \ - "#008941", "#006FA6", "#A30059","#FFDBE5", "#7A4900", "#0000A6", \ - "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87","#5A0007", \ - "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF","#4A3B53", \ - "#FF2F80","#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",\ - "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",\ - "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",\ - "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",\ - "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",\ - "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",\ - "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",\ - "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",\ - "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",\ +color_long = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', + '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#008080', '#e6beff', + '#aa6e28', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000080', + '#808080', '#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", + "#008941", "#006FA6", "#A30059", "#FFDBE5", "#7A4900", "#0000A6", + "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", "#5A0007", + "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", + "#FF2F80", "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", + "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", + "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", + "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", + "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", + "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", + "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", + "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", + "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72"] @@ -110,7 +112,8 @@ def bhattacharyya_matrix(prob, flags=None): else: for i in np.arange(ndim): for j in np.arange(i + 1, ndim): - df = 
pd.DataFrame({'i': prob[:, i], 'j': prob[:, j], 'f': flags}) + df = pd.DataFrame( + {'i': prob[:, i], 'j': prob[:, j], 'f': flags}) df = df[df['f']] val = -bhattacharyya_distance(df['i'], df['j']) m[i, j] = val @@ -143,9 +146,11 @@ def normalize_confmat1(cmat, mod='1'): for i in range(dim): for j in range(i + 1, dim): if mod is '1': - xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[j], dmat[j, i] / smat[i]) + xmat[i, j] = xmat[j, i] = max( + dmat[i, j] / smat[j], dmat[j, i] / smat[i]) else: - xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[i], dmat[j, i] / smat[j]) + xmat[i, j] = xmat[j, i] = max( + dmat[i, j] / smat[i], dmat[j, i] / smat[j]) return xmat @@ -193,7 +198,8 @@ def cluster_adjmat(xmat, ----- new group names. """ - g = sc._utils.get_igraph_from_adjacency((xmat > cutoff).astype(int), directed=False) + g = sc._utils.get_igraph_from_adjacency( + (xmat > cutoff).astype(int), directed=False) print(g) part = louvain.find_partition(g, louvain.RBConfigurationVertexPartition, resolution_parameter=resolution) @@ -217,7 +223,8 @@ def msample(x, n, frac): sampled selection. """ if len(x) <= np.floor(n / frac): - if len(x) < 10: frac = 0.9 + if len(x) < 10: + frac = 0.9 return x.sample(frac=frac) else: return x.sample(n=n) @@ -246,7 +253,8 @@ def train_test_split_per_type(X, y, n=100, frac=0.8): df = pd.DataFrame(y) df.index = np.arange(len(y)) df.columns = ['class'] - c_idx = df.groupby('class').apply(lambda x: msample(x, n=n, frac=frac)).index.get_level_values(None) + c_idx = df.groupby('class').apply(lambda x: msample( + x, n=n, frac=frac)).index.get_level_values(None) d_idx = ~np.isin(np.arange(len(y)), c_idx) return X[c_idx, :], X[d_idx, :], y[c_idx], y[d_idx] @@ -261,17 +269,17 @@ def SCCAF_assessment(*args, **kwargs): # need to check number of cells in each cluster of the training set. -def self_projection(X, +def self_projection(X, cell_types, classifier="LR", penalty='l1', sparsity=0.5, fraction=0.5, random_state=1, - solver='liblinear', + solver='liblinear', n=0, cv=5, - whole=False, + whole=False, n_jobs=None): # n = 100 should be good. 
""" @@ -326,7 +334,8 @@ def self_projection(X, stratify=cell_types, test_size=fraction) # fraction means test size # set the classifier if classifier == 'LR': - clf = LogisticRegression(random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) + clf = LogisticRegression( + random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) elif classifier == 'RF': clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs) elif classifier == 'GNB': @@ -341,11 +350,12 @@ def self_projection(X, clf = SGDClassifier(loss='perceptron', n_jobs=n_jobs) elif classifier == 'DT': clf = DecisionTreeClassifier() - + # mean cross validation score cvsm = 0 if cv > 0: - cvs = cross_val_score(clf, X_train, np.array(y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) + cvs = cross_val_score(clf, X_train, np.array( + y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) cvsm = cvs.mean() print("Mean CV accuracy: %.4f" % cvsm) # accuracy on cross validation and on test set @@ -354,18 +364,18 @@ def self_projection(X, print("Accuracy on the training set: %.4f" % accuracy) accuracy_test = clf.score(X_test, y_test) print("Accuracy on the hold-out set: %.4f" % accuracy_test) - + # accuracy of the whole dataset if whole: accuracy = clf.score(X, cell_types) print("Accuracy on the whole set: %.4f" % accuracy) - + # get predicted probability on the test set y_prob = None if not classifier in ['SH', 'PCP']: y_prob = clf.predict_proba(X_test) y_pred = clf.predict(X_test) - + return y_prob, y_pred, y_test, clf, cvsm, accuracy_test @@ -404,8 +414,10 @@ def confusion_matrix(y_test, y_pred, clf, labels=None): ----- the confusion matrix """ - if labels is None: labels = clf.classes_ - df = pd.DataFrame.from_records(metrics.confusion_matrix(y_test, y_pred, labels=labels)) + if labels is None: + labels = clf.classes_ + df = pd.DataFrame.from_records( + metrics.confusion_matrix(y_test, y_pred, labels=labels)) df.index = labels df.columns = labels df.index.name = 'Real' @@ -443,7 +455,8 @@ def per_cluster_accuracy(mtx, ad=None, clstr_name='louvain'): def per_cell_accuracy(X, cell_types, clf): y_prob = clf.predict_proba(X) - df = pd.DataFrame(y_prob, index=cell_types, columns=clf.classes_).sort_index().T + df = pd.DataFrame(y_prob, index=cell_types, + columns=clf.classes_).sort_index().T df.index.name = 'Predicted' dy = np.empty([0]) for cell in df.index: @@ -476,13 +489,14 @@ def get_topmarkers(clf, names, topn=10): top_markers = \ marker_genes \ - .query('weight > 0.') \ - .sort_values('weight', ascending=False) \ - .groupby('cell_type') \ - .head(topn) \ - .sort_values(['cell_type', 'weight'], ascending=[True, False]) + .query('weight > 0.') \ + .sort_values('weight', ascending=False) \ + .groupby('cell_type') \ + .head(topn) \ + .sort_values(['cell_type', 'weight'], ascending=[True, False]) return top_markers + def eu_distance(X, gp1, gp2, cell): """ Measure the euclidean distance between two groups of cells and the third group. 
@@ -541,20 +555,22 @@ def get_distance_matrix(X, clusters, labels=None, metric='euclidean'): centers = [] if scipy.sparse.issparse(X): for cl in labels: - centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) + centers.append( + np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) else: for cl in labels: centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))) return pairwise_distances(np.array(centers), metric=metric) + def merge_cluster(ad, old_id, new_id, groups): ad.obs[new_id] = ad.obs[old_id] ad.obs[new_id] = ad.obs[new_id].astype('category') ad.obs[new_id].cat.categories = make_unique(groups.astype(str)) ad.obs[new_id] = ad.obs[new_id].str.split('_').str[0] return ad - - + + def find_high_resolution(ad, resolution=4, n=100): cut = resolution while cut > 0.5: @@ -567,18 +583,18 @@ def find_high_resolution(ad, resolution=4, n=100): def get_connection_matrix(ad_obs, key1, key2): - df = ad_obs.groupby([key1,key2]).size().to_frame().reset_index() - df.columns = [key1,key2,'counts'] + df = ad_obs.groupby([key1, key2]).size().to_frame().reset_index() + df.columns = [key1, key2, 'counts'] df2 = ad_obs[key2].value_counts() df['size'] = df2[df[key2].tolist()].tolist() df['percent'] = df['counts']/df['size'] - df = df[df['percent']>0.1] + df = df[df['percent'] > 0.1] df2 = df.groupby(key1).size() - df2 = df2[df2>1] + df2 = df2[df2 > 1] df = df[df[key1].isin(df2.index)] dim = len(ad_obs[key2].unique()) - mat = pd.DataFrame(np.zeros([dim,dim])).astype(int) + mat = pd.DataFrame(np.zeros([dim, dim])).astype(int) mat.columns = mat.columns.astype(str) mat.index = mat.index.astype(str) @@ -586,10 +602,10 @@ def get_connection_matrix(ad_obs, key1, key2): grouped = df.groupby(key1) for name, group in grouped: x = group[key2] - if len(x)>0: - for i,j in list(itertools.combinations(x.tolist(), 2)): - mat.loc[i,j] = mat.loc[j,i] = 1 - return mat + if len(x) > 0: + for i, j in list(itertools.combinations(x.tolist(), 2)): + mat.loc[i, j] = mat.loc[j, i] = 1 + return mat def SCCAF_optimize_all(ad, @@ -597,11 +613,11 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=0.5, R2norm_cutoff=0.05, R1norm_step=0.01, - R2norm_step=0.001, + R2norm_step=0.001, prefix='L1', - min_i = 3, - start = None, - start_iter = 0, + min_i=3, + start=None, + start_iter=0, *args, **kwargs): """ ad: `AnnData` @@ -626,20 +642,22 @@ def SCCAF_optimize_all(ad, The reduce step for minimum R2norm value. """ acc = 0 - #'start_iter = -1 + # 'start_iter = -1 if start is None: - start = '%s_Round%d'%(prefix, start_iter) + start = '%s_Round%d' % (prefix, start_iter) if not start in ad.obs.keys(): - raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) + raise ValueError( + "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." % (start)) else: if not start in ad.obs.keys(): - raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) - ad.obs['%s_Round%d'%(prefix, start_iter)] = ad.obs[start] - - clstr_old = len(ad.obs['%s_Round%d'%(prefix, start_iter)].unique()) - #'while acc < min_acc: + raise ValueError( + "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." 
% (start)) + ad.obs['%s_Round%d' % (prefix, start_iter)] = ad.obs[start] + + clstr_old = len(ad.obs['%s_Round%d' % (prefix, start_iter)].unique()) + # 'while acc < min_acc: for i in range(10): - if start_iter >0: + if start_iter > 0: print("start_iter: %d" % start_iter) print("R1norm_cutoff: %f" % R1norm_cutoff) print("R2norm_cutoff: %f" % R2norm_cutoff) @@ -649,7 +667,7 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=R1norm_cutoff, R2norm_cutoff=R2norm_cutoff, start_iter=start_iter, - min_acc=min_acc, + min_acc=min_acc, prefix=prefix, *args, **kwargs) print("m1: %f" % m1) @@ -657,17 +675,17 @@ def SCCAF_optimize_all(ad, print("Accuracy: %f" % acc) R1norm_cutoff = m1 - R1norm_step R2norm_cutoff = m2 - R2norm_step - - clstr_new = len(ad.obs['%s_result'%prefix].unique()) - + + clstr_new = len(ad.obs['%s_result' % prefix].unique()) + if clstr_new >= clstr_old and i >= min_i: print("converged SCCAF_all!") break - - if acc >=min_acc: + + if acc >= min_acc: break - - + + def SCCAF_optimize(ad, prefix='L1', use='raw', @@ -681,7 +699,7 @@ def SCCAF_optimize(ad, plot_dist=False, plot_cmat=False, mod='1', - low_res = None, + low_res=None, c_iter=3, n_iter=10, n_jobs=None, @@ -757,8 +775,8 @@ def SCCAF_optimize(ad, The cutoff for the euclidean distance between two clusters of cells. 8.0 means the euclidean distance between two cell types should be greater than 8.0. low_res: `str` optional - the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give - the key as the underclustering boundary. + the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give + the key as the underclustering boundary. classifier: `String` optional (default: 'LR') a machine learning model in "LR" (logistic regression), \ "RF" (Random Forest), "GNB"(Gaussion Naive Bayes), "SVM" (Support Vector Machine) and "DT"(Decision Tree). @@ -766,7 +784,7 @@ def SCCAF_optimize(ad, MatPlotLib multi-page backend object instance, previously initialised (currently the only type supported is PdfPages). min_acc: `float` - the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. + the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. return ----- @@ -779,12 +797,13 @@ def SCCAF_optimize(ad, X = ad.raw.X elif use == 'pca': if 'X_pca' not in ad.obsm.keys(): - raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") + raise ValueError( + "`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] - elif 'X_%s'%use in ad.obsm.dtype.fields: - X = ad.obsm['X_%s'%use] + elif 'X_%s' % use in ad.obsm.dtype.fields: + X = ad.obsm['X_%s' % use] else: - X = ad[:,ad.var['highly_variable']].X + X = ad[:, ad.var['highly_variable']].X for i in range(start_iter, start_iter + n_iter): print("Round%d ..." 
% (i + 1)) @@ -799,9 +818,10 @@ def SCCAF_optimize(ad, fraction=fraction, classifier=classifier, n_jobs=n_jobs) accs = [acc] ad.obs['%s_self-projection' % old_id] = clf.predict(X) - + if plot: - aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id)) + aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, + title="Self-project ROC {}".format(old_id)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() @@ -849,9 +869,12 @@ def SCCAF_optimize(ad, if plot: if plot_cmat: - plot_heatmap_gray(cmat, 'Confusion Matrix', mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(cmat, 'Confusion Matrix', + mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % + old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % + old_id, mplotlib_backend=mplotlib_backend) if R1norm_only: groups = cluster_adjmat(R1mat, cutoff=R1norm_cutoff) @@ -859,32 +882,35 @@ def SCCAF_optimize(ad, groups = cluster_adjmat(R2mat, cutoff=R2norm_cutoff) else: if not low_res is None: - conn_mat = get_connection_matrix(ad_obs = ad.obs, key1 = low_res, key2 = old_id) - zmat = np.minimum.reduce([(R1mat > R1norm_cutoff), conn_mat.values]) + conn_mat = get_connection_matrix( + ad_obs=ad.obs, key1=low_res, key2=old_id) + zmat = np.minimum.reduce( + [(R1mat > R1norm_cutoff), conn_mat.values]) groups = cluster_adjmat(zmat, cutoff=0) else: - zmat = np.maximum.reduce([(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) + zmat = np.maximum.reduce( + [(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) groups = cluster_adjmat(zmat, cutoff=0) if len(np.unique(groups)) == len(ad.obs[old_id].unique()): ad.obs['%s_result' % prefix] = ad.obs[old_id] print("Converge SCCAF_optimize no. 
cluster!") break - + merge_cluster(ad, old_id1, new_id, groups) - + if plot: sc.pl.scatter(ad, basis=basis, color=[new_id], color_map="RdYlBu_r", legend_loc='on data', show=(mplotlib_backend is None)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() - + if len(np.unique(groups)) <= 1: ad.obs['%s_result' % prefix] = ad.obs[new_id] print("no clustering!") break - + return ad, m1, m2, np.min(accs), i @@ -927,7 +953,8 @@ def plot_center(ad, groupby, ax, basis='tsne', size=20): Y_mask = Y[ad.obs[groupby] == c, :] centroids[c] = np.median(Y_mask, axis=0) for c in centroids.keys(): - ax.plot(centroids[c][0], centroids[c][1], 'wo', markersize=size, alpha=0.5) + ax.plot(centroids[c][0], centroids[c][1], + 'wo', markersize=size, alpha=0.5) return ax @@ -1021,33 +1048,34 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ - rc_aucs = [] #AUC - rp_aucs = [] # AUC from recall precision - fprs = [] #FPR - tprs = [] #TPR - prss = [] #Precision - recs = [] #Recall + rc_aucs = [] # AUC + rp_aucs = [] # AUC from recall precision + fprs = [] # FPR + tprs = [] # TPR + prss = [] # Precision + recs = [] # Recall for i, cell_type in enumerate(clf.classes_): fpr, tpr, _ = metrics.roc_curve(y_test == cell_type, y_prob[:, i]) - prs, rec, _ = metrics.precision_recall_curve(y_test == cell_type, y_prob[:, i]) + prs, rec, _ = metrics.precision_recall_curve( + y_test == cell_type, y_prob[:, i]) fprs.append(fpr) tprs.append(tpr) prss.append(prs) recs.append(rec) rc_aucs.append(metrics.auc(fpr, tpr)) rp_aucs.append(metrics.auc(rec, prs)) - + good_aucs = np.asarray(rc_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rc = np.min(good_aucs) max_auc_rc = np.max(good_aucs) - + good_aucs = np.asarray(rp_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rp = np.min(good_aucs) max_auc_rp = np.max(good_aucs) - - if plot in ['both','roc','prc']: + + if plot in ['both', 'roc', 'prc']: if colors is None: if len(clf.classes_) < 21: colors = default_20 @@ -1056,7 +1084,7 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, else: colors = default_102 if plot == 'both': - + fig, ax = plt.subplots(1, 2, sharey=True) ax[0].plot([0, 1], [0, 1], color='k', ls=':') ax[0].set_xticks([0, 1]) @@ -1076,18 +1104,26 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') ax[1].set_ylabel('Precision') - - ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) - ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) - ax[1].annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (0.5, 0.4), fontsize=fontsize) - ax[1].annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (0.5, 0.3), fontsize=fontsize) + + ax[0].annotate(r'$AUC_{min}: %.3f$' % + min_auc_rc, (0.5, 0.4), fontsize=fontsize) + ax[0].annotate(r'$AUC_{max}: %.3f$' % + max_auc_rc, (0.5, 0.3), fontsize=fontsize) + ax[1].annotate(r'$AUC_{min}: %.3f$' % + min_auc_rp, (0.5, 0.4), fontsize=fontsize) + ax[1].annotate(r'$AUC_{max}: %.3f$' % + max_auc_rp, (0.5, 0.3), fontsize=fontsize) if cvsm: - ax[0].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) - ax[1].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) + ax[0].annotate("CV: %.3f" % + cvsm, (0.5, 0.2), fontsize=fontsize) + ax[1].annotate("CV: %.3f" % + cvsm, (0.5, 0.2), fontsize=fontsize) 
if acc: - ax[0].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) - ax[1].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) - + ax[0].annotate("Test: %.3f" % + acc, (0.5, 0.1), fontsize=fontsize) + ax[1].annotate("Test: %.3f" % + acc, (0.5, 0.1), fontsize=fontsize) + else: fig, ax = plt.subplots() ax.set_xticks([0, 1]) @@ -1099,8 +1135,10 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('TPR') ax.plot([0, 1], [0, 1], color='k', ls=':') pos = 0.5 - ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % + min_auc_rc, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % + max_auc_rc, (pos, 0.3), fontsize=fontsize) else: Xs = recs Ys = prss @@ -1108,8 +1146,10 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('Precision') ax.plot([0, 1], [1, 0], color='k', ls=':') pos = 0 - ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % + min_auc_rp, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % + max_auc_rp, (pos, 0.3), fontsize=fontsize) for i, cell_type in enumerate(clf.classes_): ax.plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) @@ -1120,11 +1160,11 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, if save: plt.savefig(save) - + return rc_aucs -## pySankey +# pySankey class pySankeyException(Exception): pass @@ -1152,7 +1192,8 @@ def check_data_matches_labels(labels, data, side): msg = "Labels: " + ",".join(labels) + "\n" if len(data) < 20: msg += "Data: " + ",".join(data) - raise LabelMismatch('{0} labels and data do not match.{1}'.format(side, msg)) + raise LabelMismatch( + '{0} labels and data do not match.{1}'.format(side, msg)) def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, @@ -1258,7 +1299,8 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['left'] else: - myD['bottom'] = widths_left[leftLabels[i - 1]]['top'] + 0.02 * df.leftWeight.sum() + myD['bottom'] = widths_left[leftLabels[i - 1] + ]['top'] + 0.02 * df.leftWeight.sum() myD['top'] = myD['bottom'] + myD['left'] topEdge = myD['top'] widths_left[l] = myD @@ -1272,7 +1314,8 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['right'] else: - myD['bottom'] = widths_right[rightLabels[i - 1]]['top'] + 0.02 * df.rightWeight.sum() + myD['bottom'] = widths_right[rightLabels[i - 1] + ]['top'] + 0.02 * df.rightWeight.sum() myD['top'] = myD['bottom'] + myD['right'] topEdge = myD['top'] widths_right[l] = myD @@ -1299,12 +1342,13 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, for l in rightLabels: plt.fill_between( [xMax, 1.02 * xMax], 2 * [widths_right[l]['bottom']], - 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], + 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], color=colorDict[l], alpha=0.99 ) plt.text( - 1.05 * xMax, widths_right[l]['bottom'] + 0.5 * widths_right[l]['right'], + 1.05 * xMax, widths_right[l]['bottom'] + + 0.5 * widths_right[l]['right'], l, {'ha': 'left', 'va': 'center'}, fontsize=fontsize @@ -1318,7 +1362,8 @@ def sankey(left, right, leftWeight=None, 
rightWeight=None, colorDict=None, lc = l2 if len(df[(df.left == l) & (df.right == l2)]) > 0: # Create array of y values for each strip, half at left value, half at right, convolve - ys_d = np.array(50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) + ys_d = np.array( + 50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_u = np.array( @@ -1342,23 +1387,22 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, ############################ UTILS ##################################### -from scipy.sparse import csr_matrix -import h5py + def readHCA(fin): f = h5py.File(fin, 'r') data = f[list(f.keys())[0]] - X = csr_matrix((np.array(data.get("data")), - np.array(data.get("indices")), - np.array(data.get("indptr"))), - shape=np.array(data.get("shape"))[::-1]) + X = csr_matrix((np.array(data.get("data")), + np.array(data.get("indices")), + np.array(data.get("indptr"))), + shape=np.array(data.get("shape"))[::-1]) ad = sc.AnnData(X) ad.obs_names = np.array(data.get("barcodes")).astype(str) ad.var_names = np.array(data.get("gene_names")).astype(str) ad.var['ensembl_gene'] = np.array(data.get("genes")).astype(str) return(ad) - - + + def sc_workflow(ad, prefix='L1', resolution=1.5, n_pcs=15, do_tsne=True): sc.pp.normalize_per_cell(ad, counts_per_cell_after=1e4) filter_result = sc.pp.filter_genes_dispersion( @@ -1396,16 +1440,19 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): ------- Depening on `copy` returns or updates `adata` with the corrected data matrix. """ - logg.info('regressing out'.format(tuple(keys) if type(keys) is list else keys)) + logg.info('regressing out'.format( + tuple(keys) if type(keys) is list else keys)) if issparse(adata.X): logg.info('... sparse input is densified and may ' 'lead to huge memory consumption') adata = adata.copy() if copy else adata - if isinstance(keys, str): keys = [keys] + if isinstance(keys, str): + keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() if n_jobs is not None: - logg.warning('Parallelization is currently broke, will be restored soon. Running on 1 core.') + logg.warning( + 'Parallelization is currently broke, will be restored soon. 
Running on 1 core.') n_jobs = sett.n_jobs if n_jobs is None else n_jobs cat_keys = [] @@ -1447,14 +1494,16 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): def _regress_out(col_index, responses, regressors): try: if regressors.shape[1] - 1 == responses.shape[1]: - regressors_view = np.c_[regressors[:, 0], regressors[:, col_index + 1]] + regressors_view = np.c_[ + regressors[:, 0], regressors[:, col_index + 1]] else: regressors_view = regressors result = sm.GLM(responses[:, col_index], regressors_view, family=sm.families.Gaussian()).fit() new_column = result.resid_response except PerfectSeparationError: # this emulates R's behavior - logg.warning('Encountered PerfectSeparationError, setting to 0 as in R.') + logg.warning( + 'Encountered PerfectSeparationError, setting to 0 as in R.') new_column = np.zeros(responses.shape[0]) return new_column @@ -1475,7 +1524,8 @@ def _regress_out_chunk(chunk, responses, regressors): for i_column, column in enumerate(chunk): adata.X[:, column] = result_lst[i_column] logg.info('finished') - logg.hint('after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') + logg.hint( + 'after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') return adata if copy else None @@ -1492,8 +1542,8 @@ def regress_out(metadata, exprs, covariate_formula, design_formula='1', rcond=-1 covariate_matrix = patsy.dmatrix(covariate_formula, metadata) design_batch = np.hstack((covariate_matrix, design_matrix)) - coefficients, res, rank, s = np.linalg.lstsq(design_batch, exprs.T, rcond=rcond) + coefficients, res, rank, s = np.linalg.lstsq( + design_batch, exprs.T, rcond=rcond) beta = coefficients[covariate_matrix.shape[1]:] return exprs - design_matrix.dot(beta).T - From 5ef100921cba11c955220efce8063ca4fd34bd76 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Sun, 20 Mar 2022 17:49:31 +0000 Subject: [PATCH 2/8] add legend to plot_roc in __init__.py --- SCCAF/__init__.py | 336 ++++++++++++++++++++-------------------------- 1 file changed, 142 insertions(+), 194 deletions(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 1df6d35..71f3a6b 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -19,8 +19,6 @@ # # -import h5py -from scipy.sparse import csr_matrix import pandas as pd import numpy as np from scipy.sparse import issparse @@ -53,22 +51,22 @@ from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.svm import SVC -color_long = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', - '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#008080', '#e6beff', - '#aa6e28', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000080', - '#808080', '#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", - "#008941", "#006FA6", "#A30059", "#FFDBE5", "#7A4900", "#0000A6", - "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", "#5A0007", - "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", - "#FF2F80", "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", - "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", - "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", - "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", - "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", - "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", - "#575329", "#00FECF", "#B05B6F", 
"#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", - "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", - "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", +color_long = ['#e6194b','#3cb44b','#ffe119','#0082c8','#f58231','#911eb4',\ + '#46f0f0','#f032e6','#d2f53c','#fabebe','#008080','#e6beff',\ + '#aa6e28','#800000','#aaffc3','#808000','#ffd8b1','#000080',\ + '#808080','#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", \ + "#008941", "#006FA6", "#A30059","#FFDBE5", "#7A4900", "#0000A6", \ + "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87","#5A0007", \ + "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF","#4A3B53", \ + "#FF2F80","#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",\ + "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",\ + "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",\ + "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",\ + "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",\ + "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",\ + "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",\ + "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",\ + "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",\ "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72"] @@ -112,8 +110,7 @@ def bhattacharyya_matrix(prob, flags=None): else: for i in np.arange(ndim): for j in np.arange(i + 1, ndim): - df = pd.DataFrame( - {'i': prob[:, i], 'j': prob[:, j], 'f': flags}) + df = pd.DataFrame({'i': prob[:, i], 'j': prob[:, j], 'f': flags}) df = df[df['f']] val = -bhattacharyya_distance(df['i'], df['j']) m[i, j] = val @@ -146,11 +143,9 @@ def normalize_confmat1(cmat, mod='1'): for i in range(dim): for j in range(i + 1, dim): if mod is '1': - xmat[i, j] = xmat[j, i] = max( - dmat[i, j] / smat[j], dmat[j, i] / smat[i]) + xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[j], dmat[j, i] / smat[i]) else: - xmat[i, j] = xmat[j, i] = max( - dmat[i, j] / smat[i], dmat[j, i] / smat[j]) + xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[i], dmat[j, i] / smat[j]) return xmat @@ -198,8 +193,7 @@ def cluster_adjmat(xmat, ----- new group names. """ - g = sc._utils.get_igraph_from_adjacency( - (xmat > cutoff).astype(int), directed=False) + g = sc._utils.get_igraph_from_adjacency((xmat > cutoff).astype(int), directed=False) print(g) part = louvain.find_partition(g, louvain.RBConfigurationVertexPartition, resolution_parameter=resolution) @@ -223,8 +217,7 @@ def msample(x, n, frac): sampled selection. 
""" if len(x) <= np.floor(n / frac): - if len(x) < 10: - frac = 0.9 + if len(x) < 10: frac = 0.9 return x.sample(frac=frac) else: return x.sample(n=n) @@ -253,8 +246,7 @@ def train_test_split_per_type(X, y, n=100, frac=0.8): df = pd.DataFrame(y) df.index = np.arange(len(y)) df.columns = ['class'] - c_idx = df.groupby('class').apply(lambda x: msample( - x, n=n, frac=frac)).index.get_level_values(None) + c_idx = df.groupby('class').apply(lambda x: msample(x, n=n, frac=frac)).index.get_level_values(None) d_idx = ~np.isin(np.arange(len(y)), c_idx) return X[c_idx, :], X[d_idx, :], y[c_idx], y[d_idx] @@ -269,17 +261,17 @@ def SCCAF_assessment(*args, **kwargs): # need to check number of cells in each cluster of the training set. -def self_projection(X, +def self_projection(X, cell_types, classifier="LR", penalty='l1', sparsity=0.5, fraction=0.5, random_state=1, - solver='liblinear', + solver='liblinear', n=0, cv=5, - whole=False, + whole=False, n_jobs=None): # n = 100 should be good. """ @@ -334,8 +326,7 @@ def self_projection(X, stratify=cell_types, test_size=fraction) # fraction means test size # set the classifier if classifier == 'LR': - clf = LogisticRegression( - random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) + clf = LogisticRegression(random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) elif classifier == 'RF': clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs) elif classifier == 'GNB': @@ -350,12 +341,11 @@ def self_projection(X, clf = SGDClassifier(loss='perceptron', n_jobs=n_jobs) elif classifier == 'DT': clf = DecisionTreeClassifier() - + # mean cross validation score cvsm = 0 if cv > 0: - cvs = cross_val_score(clf, X_train, np.array( - y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) + cvs = cross_val_score(clf, X_train, np.array(y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) cvsm = cvs.mean() print("Mean CV accuracy: %.4f" % cvsm) # accuracy on cross validation and on test set @@ -364,18 +354,18 @@ def self_projection(X, print("Accuracy on the training set: %.4f" % accuracy) accuracy_test = clf.score(X_test, y_test) print("Accuracy on the hold-out set: %.4f" % accuracy_test) - + # accuracy of the whole dataset if whole: accuracy = clf.score(X, cell_types) print("Accuracy on the whole set: %.4f" % accuracy) - + # get predicted probability on the test set y_prob = None if not classifier in ['SH', 'PCP']: y_prob = clf.predict_proba(X_test) y_pred = clf.predict(X_test) - + return y_prob, y_pred, y_test, clf, cvsm, accuracy_test @@ -414,10 +404,8 @@ def confusion_matrix(y_test, y_pred, clf, labels=None): ----- the confusion matrix """ - if labels is None: - labels = clf.classes_ - df = pd.DataFrame.from_records( - metrics.confusion_matrix(y_test, y_pred, labels=labels)) + if labels is None: labels = clf.classes_ + df = pd.DataFrame.from_records(metrics.confusion_matrix(y_test, y_pred, labels=labels)) df.index = labels df.columns = labels df.index.name = 'Real' @@ -455,8 +443,7 @@ def per_cluster_accuracy(mtx, ad=None, clstr_name='louvain'): def per_cell_accuracy(X, cell_types, clf): y_prob = clf.predict_proba(X) - df = pd.DataFrame(y_prob, index=cell_types, - columns=clf.classes_).sort_index().T + df = pd.DataFrame(y_prob, index=cell_types, columns=clf.classes_).sort_index().T df.index.name = 'Predicted' dy = np.empty([0]) for cell in df.index: @@ -489,14 +476,13 @@ def get_topmarkers(clf, names, topn=10): top_markers = \ marker_genes \ - .query('weight > 0.') \ - .sort_values('weight', ascending=False) \ - 
.groupby('cell_type') \ - .head(topn) \ - .sort_values(['cell_type', 'weight'], ascending=[True, False]) + .query('weight > 0.') \ + .sort_values('weight', ascending=False) \ + .groupby('cell_type') \ + .head(topn) \ + .sort_values(['cell_type', 'weight'], ascending=[True, False]) return top_markers - def eu_distance(X, gp1, gp2, cell): """ Measure the euclidean distance between two groups of cells and the third group. @@ -555,22 +541,20 @@ def get_distance_matrix(X, clusters, labels=None, metric='euclidean'): centers = [] if scipy.sparse.issparse(X): for cl in labels: - centers.append( - np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) + centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) else: for cl in labels: centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))) return pairwise_distances(np.array(centers), metric=metric) - def merge_cluster(ad, old_id, new_id, groups): ad.obs[new_id] = ad.obs[old_id] ad.obs[new_id] = ad.obs[new_id].astype('category') ad.obs[new_id].cat.categories = make_unique(groups.astype(str)) ad.obs[new_id] = ad.obs[new_id].str.split('_').str[0] return ad - - + + def find_high_resolution(ad, resolution=4, n=100): cut = resolution while cut > 0.5: @@ -583,18 +567,18 @@ def find_high_resolution(ad, resolution=4, n=100): def get_connection_matrix(ad_obs, key1, key2): - df = ad_obs.groupby([key1, key2]).size().to_frame().reset_index() - df.columns = [key1, key2, 'counts'] + df = ad_obs.groupby([key1,key2]).size().to_frame().reset_index() + df.columns = [key1,key2,'counts'] df2 = ad_obs[key2].value_counts() df['size'] = df2[df[key2].tolist()].tolist() df['percent'] = df['counts']/df['size'] - df = df[df['percent'] > 0.1] + df = df[df['percent']>0.1] df2 = df.groupby(key1).size() - df2 = df2[df2 > 1] + df2 = df2[df2>1] df = df[df[key1].isin(df2.index)] dim = len(ad_obs[key2].unique()) - mat = pd.DataFrame(np.zeros([dim, dim])).astype(int) + mat = pd.DataFrame(np.zeros([dim,dim])).astype(int) mat.columns = mat.columns.astype(str) mat.index = mat.index.astype(str) @@ -602,10 +586,10 @@ def get_connection_matrix(ad_obs, key1, key2): grouped = df.groupby(key1) for name, group in grouped: x = group[key2] - if len(x) > 0: - for i, j in list(itertools.combinations(x.tolist(), 2)): - mat.loc[i, j] = mat.loc[j, i] = 1 - return mat + if len(x)>0: + for i,j in list(itertools.combinations(x.tolist(), 2)): + mat.loc[i,j] = mat.loc[j,i] = 1 + return mat def SCCAF_optimize_all(ad, @@ -613,11 +597,11 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=0.5, R2norm_cutoff=0.05, R1norm_step=0.01, - R2norm_step=0.001, + R2norm_step=0.001, prefix='L1', - min_i=3, - start=None, - start_iter=0, + min_i = 3, + start = None, + start_iter = 0, *args, **kwargs): """ ad: `AnnData` @@ -642,22 +626,20 @@ def SCCAF_optimize_all(ad, The reduce step for minimum R2norm value. """ acc = 0 - # 'start_iter = -1 + #'start_iter = -1 if start is None: - start = '%s_Round%d' % (prefix, start_iter) + start = '%s_Round%d'%(prefix, start_iter) if not start in ad.obs.keys(): - raise ValueError( - "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." % (start)) + raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) else: if not start in ad.obs.keys(): - raise ValueError( - "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." 
% (start)) - ad.obs['%s_Round%d' % (prefix, start_iter)] = ad.obs[start] - - clstr_old = len(ad.obs['%s_Round%d' % (prefix, start_iter)].unique()) - # 'while acc < min_acc: + raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) + ad.obs['%s_Round%d'%(prefix, start_iter)] = ad.obs[start] + + clstr_old = len(ad.obs['%s_Round%d'%(prefix, start_iter)].unique()) + #'while acc < min_acc: for i in range(10): - if start_iter > 0: + if start_iter >0: print("start_iter: %d" % start_iter) print("R1norm_cutoff: %f" % R1norm_cutoff) print("R2norm_cutoff: %f" % R2norm_cutoff) @@ -667,7 +649,7 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=R1norm_cutoff, R2norm_cutoff=R2norm_cutoff, start_iter=start_iter, - min_acc=min_acc, + min_acc=min_acc, prefix=prefix, *args, **kwargs) print("m1: %f" % m1) @@ -675,17 +657,17 @@ def SCCAF_optimize_all(ad, print("Accuracy: %f" % acc) R1norm_cutoff = m1 - R1norm_step R2norm_cutoff = m2 - R2norm_step - - clstr_new = len(ad.obs['%s_result' % prefix].unique()) - + + clstr_new = len(ad.obs['%s_result'%prefix].unique()) + if clstr_new >= clstr_old and i >= min_i: print("converged SCCAF_all!") break - - if acc >= min_acc: + + if acc >=min_acc: break - - + + def SCCAF_optimize(ad, prefix='L1', use='raw', @@ -699,7 +681,7 @@ def SCCAF_optimize(ad, plot_dist=False, plot_cmat=False, mod='1', - low_res=None, + low_res = None, c_iter=3, n_iter=10, n_jobs=None, @@ -775,8 +757,8 @@ def SCCAF_optimize(ad, The cutoff for the euclidean distance between two clusters of cells. 8.0 means the euclidean distance between two cell types should be greater than 8.0. low_res: `str` optional - the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give - the key as the underclustering boundary. + the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give + the key as the underclustering boundary. classifier: `String` optional (default: 'LR') a machine learning model in "LR" (logistic regression), \ "RF" (Random Forest), "GNB"(Gaussion Naive Bayes), "SVM" (Support Vector Machine) and "DT"(Decision Tree). @@ -784,7 +766,7 @@ def SCCAF_optimize(ad, MatPlotLib multi-page backend object instance, previously initialised (currently the only type supported is PdfPages). min_acc: `float` - the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. + the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. return ----- @@ -797,13 +779,10 @@ def SCCAF_optimize(ad, X = ad.raw.X elif use == 'pca': if 'X_pca' not in ad.obsm.keys(): - raise ValueError( - "`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") + raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] - elif 'X_%s' % use in ad.obsm.dtype.fields: - X = ad.obsm['X_%s' % use] else: - X = ad[:, ad.var['highly_variable']].X + X = ad[:,ad.var['highly_variable']].X for i in range(start_iter, start_iter + n_iter): print("Round%d ..." 
% (i + 1)) @@ -818,10 +797,9 @@ def SCCAF_optimize(ad, fraction=fraction, classifier=classifier, n_jobs=n_jobs) accs = [acc] ad.obs['%s_self-projection' % old_id] = clf.predict(X) - + if plot: - aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, - title="Self-project ROC {}".format(old_id)) + aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() @@ -869,12 +847,9 @@ def SCCAF_optimize(ad, if plot: if plot_cmat: - plot_heatmap_gray(cmat, 'Confusion Matrix', - mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % - old_id, mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % - old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(cmat, 'Confusion Matrix', mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) if R1norm_only: groups = cluster_adjmat(R1mat, cutoff=R1norm_cutoff) @@ -882,35 +857,32 @@ def SCCAF_optimize(ad, groups = cluster_adjmat(R2mat, cutoff=R2norm_cutoff) else: if not low_res is None: - conn_mat = get_connection_matrix( - ad_obs=ad.obs, key1=low_res, key2=old_id) - zmat = np.minimum.reduce( - [(R1mat > R1norm_cutoff), conn_mat.values]) + conn_mat = get_connection_matrix(ad_obs = ad.obs, key1 = low_res, key2 = old_id) + zmat = np.minimum.reduce([(R1mat > R1norm_cutoff), conn_mat.values]) groups = cluster_adjmat(zmat, cutoff=0) else: - zmat = np.maximum.reduce( - [(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) + zmat = np.maximum.reduce([(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) groups = cluster_adjmat(zmat, cutoff=0) if len(np.unique(groups)) == len(ad.obs[old_id].unique()): ad.obs['%s_result' % prefix] = ad.obs[old_id] print("Converge SCCAF_optimize no. 
cluster!") break - + merge_cluster(ad, old_id1, new_id, groups) - + if plot: sc.pl.scatter(ad, basis=basis, color=[new_id], color_map="RdYlBu_r", legend_loc='on data', show=(mplotlib_backend is None)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() - + if len(np.unique(groups)) <= 1: ad.obs['%s_result' % prefix] = ad.obs[new_id] print("no clustering!") break - + return ad, m1, m2, np.min(accs), i @@ -953,8 +925,7 @@ def plot_center(ad, groupby, ax, basis='tsne', size=20): Y_mask = Y[ad.obs[groupby] == c, :] centroids[c] = np.median(Y_mask, axis=0) for c in centroids.keys(): - ax.plot(centroids[c][0], centroids[c][1], - 'wo', markersize=size, alpha=0.5) + ax.plot(centroids[c][0], centroids[c][1], 'wo', markersize=size, alpha=0.5) return ax @@ -1048,34 +1019,33 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ - rc_aucs = [] # AUC - rp_aucs = [] # AUC from recall precision - fprs = [] # FPR - tprs = [] # TPR - prss = [] # Precision - recs = [] # Recall + rc_aucs = [] #AUC + rp_aucs = [] # AUC from recall precision + fprs = [] #FPR + tprs = [] #TPR + prss = [] #Precision + recs = [] #Recall for i, cell_type in enumerate(clf.classes_): fpr, tpr, _ = metrics.roc_curve(y_test == cell_type, y_prob[:, i]) - prs, rec, _ = metrics.precision_recall_curve( - y_test == cell_type, y_prob[:, i]) + prs, rec, _ = metrics.precision_recall_curve(y_test == cell_type, y_prob[:, i]) fprs.append(fpr) tprs.append(tpr) prss.append(prs) recs.append(rec) rc_aucs.append(metrics.auc(fpr, tpr)) rp_aucs.append(metrics.auc(rec, prs)) - + good_aucs = np.asarray(rc_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rc = np.min(good_aucs) max_auc_rc = np.max(good_aucs) - + good_aucs = np.asarray(rp_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rp = np.min(good_aucs) max_auc_rp = np.max(good_aucs) - - if plot in ['both', 'roc', 'prc']: + + if plot in ['both','roc','prc']: if colors is None: if len(clf.classes_) < 21: colors = default_20 @@ -1084,7 +1054,7 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, else: colors = default_102 if plot == 'both': - + fig, ax = plt.subplots(1, 2, sharey=True) ax[0].plot([0, 1], [0, 1], color='k', ls=':') ax[0].set_xticks([0, 1]) @@ -1098,32 +1068,24 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[0].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[0].set_xlabel('FPR') ax[0].set_ylabel('TPR') + # ysong modify add legend 10 Mar 2022 Xs = recs Ys = prss for i, cell_type in enumerate(clf.classes_): ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') - ax[1].set_ylabel('Precision') - - ax[0].annotate(r'$AUC_{min}: %.3f$' % - min_auc_rc, (0.5, 0.4), fontsize=fontsize) - ax[0].annotate(r'$AUC_{max}: %.3f$' % - max_auc_rc, (0.5, 0.3), fontsize=fontsize) - ax[1].annotate(r'$AUC_{min}: %.3f$' % - min_auc_rp, (0.5, 0.4), fontsize=fontsize) - ax[1].annotate(r'$AUC_{max}: %.3f$' % - max_auc_rp, (0.5, 0.3), fontsize=fontsize) + ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") + ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) + ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) + ax[1].annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (0.5, 0.4), fontsize=fontsize) + ax[1].annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (0.5, 0.3), fontsize=fontsize) if 
cvsm: - ax[0].annotate("CV: %.3f" % - cvsm, (0.5, 0.2), fontsize=fontsize) - ax[1].annotate("CV: %.3f" % - cvsm, (0.5, 0.2), fontsize=fontsize) + ax[0].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) + ax[1].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) if acc: - ax[0].annotate("Test: %.3f" % - acc, (0.5, 0.1), fontsize=fontsize) - ax[1].annotate("Test: %.3f" % - acc, (0.5, 0.1), fontsize=fontsize) - + ax[0].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) + ax[1].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) + else: fig, ax = plt.subplots() ax.set_xticks([0, 1]) @@ -1135,10 +1097,8 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('TPR') ax.plot([0, 1], [0, 1], color='k', ls=':') pos = 0.5 - ax.annotate(r'$AUC_{min}: %.3f$' % - min_auc_rc, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % - max_auc_rc, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (pos, 0.3), fontsize=fontsize) else: Xs = recs Ys = prss @@ -1146,10 +1106,8 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('Precision') ax.plot([0, 1], [1, 0], color='k', ls=':') pos = 0 - ax.annotate(r'$AUC_{min}: %.3f$' % - min_auc_rp, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % - max_auc_rp, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (pos, 0.3), fontsize=fontsize) for i, cell_type in enumerate(clf.classes_): ax.plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) @@ -1160,11 +1118,11 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, if save: plt.savefig(save) - + return rc_aucs -# pySankey +## pySankey class pySankeyException(Exception): pass @@ -1192,8 +1150,7 @@ def check_data_matches_labels(labels, data, side): msg = "Labels: " + ",".join(labels) + "\n" if len(data) < 20: msg += "Data: " + ",".join(data) - raise LabelMismatch( - '{0} labels and data do not match.{1}'.format(side, msg)) + raise LabelMismatch('{0} labels and data do not match.{1}'.format(side, msg)) def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, @@ -1299,8 +1256,7 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['left'] else: - myD['bottom'] = widths_left[leftLabels[i - 1] - ]['top'] + 0.02 * df.leftWeight.sum() + myD['bottom'] = widths_left[leftLabels[i - 1]]['top'] + 0.02 * df.leftWeight.sum() myD['top'] = myD['bottom'] + myD['left'] topEdge = myD['top'] widths_left[l] = myD @@ -1314,8 +1270,7 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['right'] else: - myD['bottom'] = widths_right[rightLabels[i - 1] - ]['top'] + 0.02 * df.rightWeight.sum() + myD['bottom'] = widths_right[rightLabels[i - 1]]['top'] + 0.02 * df.rightWeight.sum() myD['top'] = myD['bottom'] + myD['right'] topEdge = myD['top'] widths_right[l] = myD @@ -1342,13 +1297,12 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, for l in rightLabels: plt.fill_between( [xMax, 1.02 * xMax], 2 * [widths_right[l]['bottom']], - 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], + 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], color=colorDict[l], alpha=0.99 ) plt.text( - 
1.05 * xMax, widths_right[l]['bottom'] + - 0.5 * widths_right[l]['right'], + 1.05 * xMax, widths_right[l]['bottom'] + 0.5 * widths_right[l]['right'], l, {'ha': 'left', 'va': 'center'}, fontsize=fontsize @@ -1362,8 +1316,7 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, lc = l2 if len(df[(df.left == l) & (df.right == l2)]) > 0: # Create array of y values for each strip, half at left value, half at right, convolve - ys_d = np.array( - 50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) + ys_d = np.array(50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_u = np.array( @@ -1387,22 +1340,23 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, ############################ UTILS ##################################### - +from scipy.sparse import csr_matrix +import h5py def readHCA(fin): f = h5py.File(fin, 'r') data = f[list(f.keys())[0]] - X = csr_matrix((np.array(data.get("data")), - np.array(data.get("indices")), - np.array(data.get("indptr"))), - shape=np.array(data.get("shape"))[::-1]) + X = csr_matrix((np.array(data.get("data")), + np.array(data.get("indices")), + np.array(data.get("indptr"))), + shape=np.array(data.get("shape"))[::-1]) ad = sc.AnnData(X) ad.obs_names = np.array(data.get("barcodes")).astype(str) ad.var_names = np.array(data.get("gene_names")).astype(str) ad.var['ensembl_gene'] = np.array(data.get("genes")).astype(str) return(ad) - - + + def sc_workflow(ad, prefix='L1', resolution=1.5, n_pcs=15, do_tsne=True): sc.pp.normalize_per_cell(ad, counts_per_cell_after=1e4) filter_result = sc.pp.filter_genes_dispersion( @@ -1440,19 +1394,16 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): ------- Depening on `copy` returns or updates `adata` with the corrected data matrix. """ - logg.info('regressing out'.format( - tuple(keys) if type(keys) is list else keys)) + logg.info('regressing out'.format(tuple(keys) if type(keys) is list else keys)) if issparse(adata.X): logg.info('... sparse input is densified and may ' 'lead to huge memory consumption') adata = adata.copy() if copy else adata - if isinstance(keys, str): - keys = [keys] + if isinstance(keys, str): keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() if n_jobs is not None: - logg.warning( - 'Parallelization is currently broke, will be restored soon. Running on 1 core.') + logg.warning('Parallelization is currently broke, will be restored soon. 
Running on 1 core.') n_jobs = sett.n_jobs if n_jobs is None else n_jobs cat_keys = [] @@ -1494,16 +1445,14 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): def _regress_out(col_index, responses, regressors): try: if regressors.shape[1] - 1 == responses.shape[1]: - regressors_view = np.c_[ - regressors[:, 0], regressors[:, col_index + 1]] + regressors_view = np.c_[regressors[:, 0], regressors[:, col_index + 1]] else: regressors_view = regressors result = sm.GLM(responses[:, col_index], regressors_view, family=sm.families.Gaussian()).fit() new_column = result.resid_response except PerfectSeparationError: # this emulates R's behavior - logg.warning( - 'Encountered PerfectSeparationError, setting to 0 as in R.') + logg.warning('Encountered PerfectSeparationError, setting to 0 as in R.') new_column = np.zeros(responses.shape[0]) return new_column @@ -1524,8 +1473,7 @@ def _regress_out_chunk(chunk, responses, regressors): for i_column, column in enumerate(chunk): adata.X[:, column] = result_lst[i_column] logg.info('finished') - logg.hint( - 'after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') + logg.hint('after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') return adata if copy else None @@ -1542,8 +1490,8 @@ def regress_out(metadata, exprs, covariate_formula, design_formula='1', rcond=-1 covariate_matrix = patsy.dmatrix(covariate_formula, metadata) design_batch = np.hstack((covariate_matrix, design_matrix)) - coefficients, res, rank, s = np.linalg.lstsq( - design_batch, exprs.T, rcond=rcond) + coefficients, res, rank, s = np.linalg.lstsq(design_batch, exprs.T, rcond=rcond) beta = coefficients[covariate_matrix.shape[1]:] return exprs - design_matrix.dot(beta).T + From 7024b41fe3476790327a9887c86dc1c81de4efd7 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 Sep 2022 21:04:49 +0100 Subject: [PATCH 3/8] add legend --- SCCAF/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 71f3a6b..8f8d37c 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -781,6 +781,8 @@ def SCCAF_optimize(ad, if 'X_pca' not in ad.obsm.keys(): raise ValueError("`adata.obsm['X_pca']` doesn't exist. 
Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] + elif 'X_%s'%use in ad.obsm.dtype.fields: + X = ad.obsm['X_%s'%use] else: X = ad[:,ad.var['highly_variable']].X @@ -1068,12 +1070,14 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[0].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[0].set_xlabel('FPR') ax[0].set_ylabel('TPR') - # ysong modify add legend 10 Mar 2022 + # add legend Xs = recs Ys = prss for i, cell_type in enumerate(clf.classes_): ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') + ax[1].set_ylabel('Precision') + ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) From f841eb3a820dc1405c1204f7cafb0ef7e344b957 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 Sep 2022 21:16:42 +0100 Subject: [PATCH 4/8] add option for plotting legend in plot_roc --- SCCAF/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 8f8d37c..f8fc7ab 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -694,7 +694,8 @@ def SCCAF_optimize(ad, dist_cutoff=8, classifier="LR", mplotlib_backend=None, - min_acc=1): + min_acc=1, + legend=False): """ This is a self-projection confusion matrix directed cluster optimization function. @@ -767,6 +768,8 @@ def SCCAF_optimize(ad, PdfPages). min_acc: `float` the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. + legend: `bool` optional (default: False) + If also plot legend for the AUC curve. return ----- @@ -801,7 +804,7 @@ def SCCAF_optimize(ad, ad.obs['%s_self-projection' % old_id] = clf.predict(X) if plot: - aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id)) + aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id), legend=legend) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() @@ -1017,7 +1020,7 @@ def plot_heatmap_gray(X, title='', save=None, mplotlib_backend=None): plt.show() -def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16): +def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16, legend=None): """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ @@ -1070,15 +1073,15 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[0].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[0].set_xlabel('FPR') ax[0].set_ylabel('TPR') - # add legend Xs = recs Ys = prss for i, cell_type in enumerate(clf.classes_): ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') ax[1].set_ylabel('Precision') - - ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") + ## add legend if required + if legend == True: + ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) ax[1].annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (0.5, 0.4), fontsize=fontsize) From c2eea917d49cf0e616def44ac2202a692dfa3b80 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 
Sep 2022 21:17:42 +0100 Subject: [PATCH 5/8] add option for plotting legend in plot_roc --- SCCAF/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index f8fc7ab..5b85442 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -769,7 +769,7 @@ def SCCAF_optimize(ad, min_acc: `float` the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. legend: `bool` optional (default: False) - If also plot legend for the AUC curve. + If also plot legend for the ROC curve. return ----- From 9ea9755a432e14d331bf43454fddb3bf6cb87ff0 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 Sep 2022 21:19:51 +0100 Subject: [PATCH 6/8] add option for plotting legend in plot_roc --- SCCAF/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 5b85442..975da14 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -1020,7 +1020,7 @@ def plot_heatmap_gray(X, title='', save=None, mplotlib_backend=None): plt.show() -def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16, legend=None): +def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16, legend=False): """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ From 50efca66fa2cfc1a6c45ad4fa88f5c6f239586c3 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Thu, 26 Oct 2023 18:11:17 +0100 Subject: [PATCH 7/8] update scanpy version --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7de0c9f..fa5a715 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def readme(): setup( name='SCCAF', - version='0.0.10', + version='0.0.11', description='Single-Cell Clustering Assessment Framework', long_description=readme(), long_description_content_type='text/markdown', @@ -20,9 +20,10 @@ def readme(): 'louvain', 'scikit-learn', 'psutil', - 'scanpy==1.4.6'], + 'scanpy==1.9.2'], scripts=['cli/sccaf', 'cli/sccaf-assess', 'cli/sccaf-assess-merger', 'cli/sccaf-regress-out'], author='Chichau Miau', author_email='zmiao@ebi.ac.uk', license='MIT' ) +# modified by Yuyao Song on Oct 2023 to bump scanpy version From 05fcfcadfdbef38531a548291f2f867b197db4c4 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Fri, 27 Oct 2023 15:58:44 +0100 Subject: [PATCH 8/8] space for indent --- SCCAF/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 975da14..a54580e 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -785,7 +785,7 @@ def SCCAF_optimize(ad, raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] elif 'X_%s'%use in ad.obsm.dtype.fields: - X = ad.obsm['X_%s'%use] + X = ad.obsm['X_%s'%use] else: X = ad[:,ad.var['highly_variable']].X
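
Taken together, patches 3-6 above add an optional `legend` flag (default False) that is passed from `SCCAF_optimize` down to `plot_roc`, where it draws a per-cell-type legend beside the precision-recall panel. A minimal usage sketch follows, assuming an AnnData object with a PCA embedding and an existing clustering; the `pbmc3k_processed()` dataset and the 'louvain' key are placeholders for illustration only, not anything defined by these patches.

# Illustrative sketch only; assumes an AnnData with a PCA embedding and an
# existing clustering (scanpy's pbmc3k_processed() is used as a stand-in).
import scanpy as sc
from SCCAF import SCCAF_assessment, plot_roc

adata = sc.datasets.pbmc3k_processed()

# Self-projection assessment of the existing clustering.
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(
    adata.obsm['X_pca'], adata.obs['louvain'], n=100)

# After this series, plot_roc() accepts legend (default False); when True,
# the per-class legend is drawn next to the precision-recall panel.
plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, legend=True)

# The same flag can also be given to SCCAF_optimize_all(..., legend=True),
# which forwards it to SCCAF_optimize and on to plot_roc via **kwargs.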