From d75d37fd406065f26b9b2fc5e20c1b7396cbaf7a Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Fri, 11 Feb 2022 13:30:04 +0000 Subject: [PATCH 1/8] update __init__.py --- SCCAF/__init__.py | 336 ++++++++++++++++++++++++++-------------------- 1 file changed, 193 insertions(+), 143 deletions(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index f58c79c..1df6d35 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -19,6 +19,8 @@ # # +import h5py +from scipy.sparse import csr_matrix import pandas as pd import numpy as np from scipy.sparse import issparse @@ -51,22 +53,22 @@ from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.svm import SVC -color_long = ['#e6194b','#3cb44b','#ffe119','#0082c8','#f58231','#911eb4',\ - '#46f0f0','#f032e6','#d2f53c','#fabebe','#008080','#e6beff',\ - '#aa6e28','#800000','#aaffc3','#808000','#ffd8b1','#000080',\ - '#808080','#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", \ - "#008941", "#006FA6", "#A30059","#FFDBE5", "#7A4900", "#0000A6", \ - "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87","#5A0007", \ - "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF","#4A3B53", \ - "#FF2F80","#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",\ - "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",\ - "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",\ - "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",\ - "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",\ - "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",\ - "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",\ - "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",\ - "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",\ +color_long = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', + '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#008080', '#e6beff', + '#aa6e28', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000080', + '#808080', '#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", + "#008941", "#006FA6", "#A30059", "#FFDBE5", "#7A4900", "#0000A6", + "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", "#5A0007", + "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", + "#FF2F80", "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", + "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", + "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", + "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", + "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", + "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", + "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", + "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", + "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72"] @@ -110,7 +112,8 @@ def bhattacharyya_matrix(prob, flags=None): else: for i in np.arange(ndim): for j in np.arange(i + 1, ndim): - df = 
pd.DataFrame({'i': prob[:, i], 'j': prob[:, j], 'f': flags}) + df = pd.DataFrame( + {'i': prob[:, i], 'j': prob[:, j], 'f': flags}) df = df[df['f']] val = -bhattacharyya_distance(df['i'], df['j']) m[i, j] = val @@ -143,9 +146,11 @@ def normalize_confmat1(cmat, mod='1'): for i in range(dim): for j in range(i + 1, dim): if mod is '1': - xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[j], dmat[j, i] / smat[i]) + xmat[i, j] = xmat[j, i] = max( + dmat[i, j] / smat[j], dmat[j, i] / smat[i]) else: - xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[i], dmat[j, i] / smat[j]) + xmat[i, j] = xmat[j, i] = max( + dmat[i, j] / smat[i], dmat[j, i] / smat[j]) return xmat @@ -193,7 +198,8 @@ def cluster_adjmat(xmat, ----- new group names. """ - g = sc._utils.get_igraph_from_adjacency((xmat > cutoff).astype(int), directed=False) + g = sc._utils.get_igraph_from_adjacency( + (xmat > cutoff).astype(int), directed=False) print(g) part = louvain.find_partition(g, louvain.RBConfigurationVertexPartition, resolution_parameter=resolution) @@ -217,7 +223,8 @@ def msample(x, n, frac): sampled selection. """ if len(x) <= np.floor(n / frac): - if len(x) < 10: frac = 0.9 + if len(x) < 10: + frac = 0.9 return x.sample(frac=frac) else: return x.sample(n=n) @@ -246,7 +253,8 @@ def train_test_split_per_type(X, y, n=100, frac=0.8): df = pd.DataFrame(y) df.index = np.arange(len(y)) df.columns = ['class'] - c_idx = df.groupby('class').apply(lambda x: msample(x, n=n, frac=frac)).index.get_level_values(None) + c_idx = df.groupby('class').apply(lambda x: msample( + x, n=n, frac=frac)).index.get_level_values(None) d_idx = ~np.isin(np.arange(len(y)), c_idx) return X[c_idx, :], X[d_idx, :], y[c_idx], y[d_idx] @@ -261,17 +269,17 @@ def SCCAF_assessment(*args, **kwargs): # need to check number of cells in each cluster of the training set. -def self_projection(X, +def self_projection(X, cell_types, classifier="LR", penalty='l1', sparsity=0.5, fraction=0.5, random_state=1, - solver='liblinear', + solver='liblinear', n=0, cv=5, - whole=False, + whole=False, n_jobs=None): # n = 100 should be good. 
""" @@ -326,7 +334,8 @@ def self_projection(X, stratify=cell_types, test_size=fraction) # fraction means test size # set the classifier if classifier == 'LR': - clf = LogisticRegression(random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) + clf = LogisticRegression( + random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) elif classifier == 'RF': clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs) elif classifier == 'GNB': @@ -341,11 +350,12 @@ def self_projection(X, clf = SGDClassifier(loss='perceptron', n_jobs=n_jobs) elif classifier == 'DT': clf = DecisionTreeClassifier() - + # mean cross validation score cvsm = 0 if cv > 0: - cvs = cross_val_score(clf, X_train, np.array(y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) + cvs = cross_val_score(clf, X_train, np.array( + y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) cvsm = cvs.mean() print("Mean CV accuracy: %.4f" % cvsm) # accuracy on cross validation and on test set @@ -354,18 +364,18 @@ def self_projection(X, print("Accuracy on the training set: %.4f" % accuracy) accuracy_test = clf.score(X_test, y_test) print("Accuracy on the hold-out set: %.4f" % accuracy_test) - + # accuracy of the whole dataset if whole: accuracy = clf.score(X, cell_types) print("Accuracy on the whole set: %.4f" % accuracy) - + # get predicted probability on the test set y_prob = None if not classifier in ['SH', 'PCP']: y_prob = clf.predict_proba(X_test) y_pred = clf.predict(X_test) - + return y_prob, y_pred, y_test, clf, cvsm, accuracy_test @@ -404,8 +414,10 @@ def confusion_matrix(y_test, y_pred, clf, labels=None): ----- the confusion matrix """ - if labels is None: labels = clf.classes_ - df = pd.DataFrame.from_records(metrics.confusion_matrix(y_test, y_pred, labels=labels)) + if labels is None: + labels = clf.classes_ + df = pd.DataFrame.from_records( + metrics.confusion_matrix(y_test, y_pred, labels=labels)) df.index = labels df.columns = labels df.index.name = 'Real' @@ -443,7 +455,8 @@ def per_cluster_accuracy(mtx, ad=None, clstr_name='louvain'): def per_cell_accuracy(X, cell_types, clf): y_prob = clf.predict_proba(X) - df = pd.DataFrame(y_prob, index=cell_types, columns=clf.classes_).sort_index().T + df = pd.DataFrame(y_prob, index=cell_types, + columns=clf.classes_).sort_index().T df.index.name = 'Predicted' dy = np.empty([0]) for cell in df.index: @@ -476,13 +489,14 @@ def get_topmarkers(clf, names, topn=10): top_markers = \ marker_genes \ - .query('weight > 0.') \ - .sort_values('weight', ascending=False) \ - .groupby('cell_type') \ - .head(topn) \ - .sort_values(['cell_type', 'weight'], ascending=[True, False]) + .query('weight > 0.') \ + .sort_values('weight', ascending=False) \ + .groupby('cell_type') \ + .head(topn) \ + .sort_values(['cell_type', 'weight'], ascending=[True, False]) return top_markers + def eu_distance(X, gp1, gp2, cell): """ Measure the euclidean distance between two groups of cells and the third group. 
@@ -541,20 +555,22 @@ def get_distance_matrix(X, clusters, labels=None, metric='euclidean'): centers = [] if scipy.sparse.issparse(X): for cl in labels: - centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) + centers.append( + np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) else: for cl in labels: centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))) return pairwise_distances(np.array(centers), metric=metric) + def merge_cluster(ad, old_id, new_id, groups): ad.obs[new_id] = ad.obs[old_id] ad.obs[new_id] = ad.obs[new_id].astype('category') ad.obs[new_id].cat.categories = make_unique(groups.astype(str)) ad.obs[new_id] = ad.obs[new_id].str.split('_').str[0] return ad - - + + def find_high_resolution(ad, resolution=4, n=100): cut = resolution while cut > 0.5: @@ -567,18 +583,18 @@ def find_high_resolution(ad, resolution=4, n=100): def get_connection_matrix(ad_obs, key1, key2): - df = ad_obs.groupby([key1,key2]).size().to_frame().reset_index() - df.columns = [key1,key2,'counts'] + df = ad_obs.groupby([key1, key2]).size().to_frame().reset_index() + df.columns = [key1, key2, 'counts'] df2 = ad_obs[key2].value_counts() df['size'] = df2[df[key2].tolist()].tolist() df['percent'] = df['counts']/df['size'] - df = df[df['percent']>0.1] + df = df[df['percent'] > 0.1] df2 = df.groupby(key1).size() - df2 = df2[df2>1] + df2 = df2[df2 > 1] df = df[df[key1].isin(df2.index)] dim = len(ad_obs[key2].unique()) - mat = pd.DataFrame(np.zeros([dim,dim])).astype(int) + mat = pd.DataFrame(np.zeros([dim, dim])).astype(int) mat.columns = mat.columns.astype(str) mat.index = mat.index.astype(str) @@ -586,10 +602,10 @@ def get_connection_matrix(ad_obs, key1, key2): grouped = df.groupby(key1) for name, group in grouped: x = group[key2] - if len(x)>0: - for i,j in list(itertools.combinations(x.tolist(), 2)): - mat.loc[i,j] = mat.loc[j,i] = 1 - return mat + if len(x) > 0: + for i, j in list(itertools.combinations(x.tolist(), 2)): + mat.loc[i, j] = mat.loc[j, i] = 1 + return mat def SCCAF_optimize_all(ad, @@ -597,11 +613,11 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=0.5, R2norm_cutoff=0.05, R1norm_step=0.01, - R2norm_step=0.001, + R2norm_step=0.001, prefix='L1', - min_i = 3, - start = None, - start_iter = 0, + min_i=3, + start=None, + start_iter=0, *args, **kwargs): """ ad: `AnnData` @@ -626,20 +642,22 @@ def SCCAF_optimize_all(ad, The reduce step for minimum R2norm value. """ acc = 0 - #'start_iter = -1 + # 'start_iter = -1 if start is None: - start = '%s_Round%d'%(prefix, start_iter) + start = '%s_Round%d' % (prefix, start_iter) if not start in ad.obs.keys(): - raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) + raise ValueError( + "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." % (start)) else: if not start in ad.obs.keys(): - raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) - ad.obs['%s_Round%d'%(prefix, start_iter)] = ad.obs[start] - - clstr_old = len(ad.obs['%s_Round%d'%(prefix, start_iter)].unique()) - #'while acc < min_acc: + raise ValueError( + "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." 
% (start)) + ad.obs['%s_Round%d' % (prefix, start_iter)] = ad.obs[start] + + clstr_old = len(ad.obs['%s_Round%d' % (prefix, start_iter)].unique()) + # 'while acc < min_acc: for i in range(10): - if start_iter >0: + if start_iter > 0: print("start_iter: %d" % start_iter) print("R1norm_cutoff: %f" % R1norm_cutoff) print("R2norm_cutoff: %f" % R2norm_cutoff) @@ -649,7 +667,7 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=R1norm_cutoff, R2norm_cutoff=R2norm_cutoff, start_iter=start_iter, - min_acc=min_acc, + min_acc=min_acc, prefix=prefix, *args, **kwargs) print("m1: %f" % m1) @@ -657,17 +675,17 @@ def SCCAF_optimize_all(ad, print("Accuracy: %f" % acc) R1norm_cutoff = m1 - R1norm_step R2norm_cutoff = m2 - R2norm_step - - clstr_new = len(ad.obs['%s_result'%prefix].unique()) - + + clstr_new = len(ad.obs['%s_result' % prefix].unique()) + if clstr_new >= clstr_old and i >= min_i: print("converged SCCAF_all!") break - - if acc >=min_acc: + + if acc >= min_acc: break - - + + def SCCAF_optimize(ad, prefix='L1', use='raw', @@ -681,7 +699,7 @@ def SCCAF_optimize(ad, plot_dist=False, plot_cmat=False, mod='1', - low_res = None, + low_res=None, c_iter=3, n_iter=10, n_jobs=None, @@ -757,8 +775,8 @@ def SCCAF_optimize(ad, The cutoff for the euclidean distance between two clusters of cells. 8.0 means the euclidean distance between two cell types should be greater than 8.0. low_res: `str` optional - the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give - the key as the underclustering boundary. + the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give + the key as the underclustering boundary. classifier: `String` optional (default: 'LR') a machine learning model in "LR" (logistic regression), \ "RF" (Random Forest), "GNB"(Gaussion Naive Bayes), "SVM" (Support Vector Machine) and "DT"(Decision Tree). @@ -766,7 +784,7 @@ def SCCAF_optimize(ad, MatPlotLib multi-page backend object instance, previously initialised (currently the only type supported is PdfPages). min_acc: `float` - the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. + the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. return ----- @@ -779,12 +797,13 @@ def SCCAF_optimize(ad, X = ad.raw.X elif use == 'pca': if 'X_pca' not in ad.obsm.keys(): - raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") + raise ValueError( + "`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] - elif 'X_%s'%use in ad.obsm.dtype.fields: - X = ad.obsm['X_%s'%use] + elif 'X_%s' % use in ad.obsm.dtype.fields: + X = ad.obsm['X_%s' % use] else: - X = ad[:,ad.var['highly_variable']].X + X = ad[:, ad.var['highly_variable']].X for i in range(start_iter, start_iter + n_iter): print("Round%d ..." 
% (i + 1)) @@ -799,9 +818,10 @@ def SCCAF_optimize(ad, fraction=fraction, classifier=classifier, n_jobs=n_jobs) accs = [acc] ad.obs['%s_self-projection' % old_id] = clf.predict(X) - + if plot: - aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id)) + aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, + title="Self-project ROC {}".format(old_id)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() @@ -849,9 +869,12 @@ def SCCAF_optimize(ad, if plot: if plot_cmat: - plot_heatmap_gray(cmat, 'Confusion Matrix', mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(cmat, 'Confusion Matrix', + mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % + old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % + old_id, mplotlib_backend=mplotlib_backend) if R1norm_only: groups = cluster_adjmat(R1mat, cutoff=R1norm_cutoff) @@ -859,32 +882,35 @@ def SCCAF_optimize(ad, groups = cluster_adjmat(R2mat, cutoff=R2norm_cutoff) else: if not low_res is None: - conn_mat = get_connection_matrix(ad_obs = ad.obs, key1 = low_res, key2 = old_id) - zmat = np.minimum.reduce([(R1mat > R1norm_cutoff), conn_mat.values]) + conn_mat = get_connection_matrix( + ad_obs=ad.obs, key1=low_res, key2=old_id) + zmat = np.minimum.reduce( + [(R1mat > R1norm_cutoff), conn_mat.values]) groups = cluster_adjmat(zmat, cutoff=0) else: - zmat = np.maximum.reduce([(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) + zmat = np.maximum.reduce( + [(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) groups = cluster_adjmat(zmat, cutoff=0) if len(np.unique(groups)) == len(ad.obs[old_id].unique()): ad.obs['%s_result' % prefix] = ad.obs[old_id] print("Converge SCCAF_optimize no. 
cluster!") break - + merge_cluster(ad, old_id1, new_id, groups) - + if plot: sc.pl.scatter(ad, basis=basis, color=[new_id], color_map="RdYlBu_r", legend_loc='on data', show=(mplotlib_backend is None)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() - + if len(np.unique(groups)) <= 1: ad.obs['%s_result' % prefix] = ad.obs[new_id] print("no clustering!") break - + return ad, m1, m2, np.min(accs), i @@ -927,7 +953,8 @@ def plot_center(ad, groupby, ax, basis='tsne', size=20): Y_mask = Y[ad.obs[groupby] == c, :] centroids[c] = np.median(Y_mask, axis=0) for c in centroids.keys(): - ax.plot(centroids[c][0], centroids[c][1], 'wo', markersize=size, alpha=0.5) + ax.plot(centroids[c][0], centroids[c][1], + 'wo', markersize=size, alpha=0.5) return ax @@ -1021,33 +1048,34 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ - rc_aucs = [] #AUC - rp_aucs = [] # AUC from recall precision - fprs = [] #FPR - tprs = [] #TPR - prss = [] #Precision - recs = [] #Recall + rc_aucs = [] # AUC + rp_aucs = [] # AUC from recall precision + fprs = [] # FPR + tprs = [] # TPR + prss = [] # Precision + recs = [] # Recall for i, cell_type in enumerate(clf.classes_): fpr, tpr, _ = metrics.roc_curve(y_test == cell_type, y_prob[:, i]) - prs, rec, _ = metrics.precision_recall_curve(y_test == cell_type, y_prob[:, i]) + prs, rec, _ = metrics.precision_recall_curve( + y_test == cell_type, y_prob[:, i]) fprs.append(fpr) tprs.append(tpr) prss.append(prs) recs.append(rec) rc_aucs.append(metrics.auc(fpr, tpr)) rp_aucs.append(metrics.auc(rec, prs)) - + good_aucs = np.asarray(rc_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rc = np.min(good_aucs) max_auc_rc = np.max(good_aucs) - + good_aucs = np.asarray(rp_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rp = np.min(good_aucs) max_auc_rp = np.max(good_aucs) - - if plot in ['both','roc','prc']: + + if plot in ['both', 'roc', 'prc']: if colors is None: if len(clf.classes_) < 21: colors = default_20 @@ -1056,7 +1084,7 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, else: colors = default_102 if plot == 'both': - + fig, ax = plt.subplots(1, 2, sharey=True) ax[0].plot([0, 1], [0, 1], color='k', ls=':') ax[0].set_xticks([0, 1]) @@ -1076,18 +1104,26 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') ax[1].set_ylabel('Precision') - - ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) - ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) - ax[1].annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (0.5, 0.4), fontsize=fontsize) - ax[1].annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (0.5, 0.3), fontsize=fontsize) + + ax[0].annotate(r'$AUC_{min}: %.3f$' % + min_auc_rc, (0.5, 0.4), fontsize=fontsize) + ax[0].annotate(r'$AUC_{max}: %.3f$' % + max_auc_rc, (0.5, 0.3), fontsize=fontsize) + ax[1].annotate(r'$AUC_{min}: %.3f$' % + min_auc_rp, (0.5, 0.4), fontsize=fontsize) + ax[1].annotate(r'$AUC_{max}: %.3f$' % + max_auc_rp, (0.5, 0.3), fontsize=fontsize) if cvsm: - ax[0].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) - ax[1].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) + ax[0].annotate("CV: %.3f" % + cvsm, (0.5, 0.2), fontsize=fontsize) + ax[1].annotate("CV: %.3f" % + cvsm, (0.5, 0.2), fontsize=fontsize) 
if acc: - ax[0].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) - ax[1].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) - + ax[0].annotate("Test: %.3f" % + acc, (0.5, 0.1), fontsize=fontsize) + ax[1].annotate("Test: %.3f" % + acc, (0.5, 0.1), fontsize=fontsize) + else: fig, ax = plt.subplots() ax.set_xticks([0, 1]) @@ -1099,8 +1135,10 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('TPR') ax.plot([0, 1], [0, 1], color='k', ls=':') pos = 0.5 - ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % + min_auc_rc, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % + max_auc_rc, (pos, 0.3), fontsize=fontsize) else: Xs = recs Ys = prss @@ -1108,8 +1146,10 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('Precision') ax.plot([0, 1], [1, 0], color='k', ls=':') pos = 0 - ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % + min_auc_rp, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % + max_auc_rp, (pos, 0.3), fontsize=fontsize) for i, cell_type in enumerate(clf.classes_): ax.plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) @@ -1120,11 +1160,11 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, if save: plt.savefig(save) - + return rc_aucs -## pySankey +# pySankey class pySankeyException(Exception): pass @@ -1152,7 +1192,8 @@ def check_data_matches_labels(labels, data, side): msg = "Labels: " + ",".join(labels) + "\n" if len(data) < 20: msg += "Data: " + ",".join(data) - raise LabelMismatch('{0} labels and data do not match.{1}'.format(side, msg)) + raise LabelMismatch( + '{0} labels and data do not match.{1}'.format(side, msg)) def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, @@ -1258,7 +1299,8 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['left'] else: - myD['bottom'] = widths_left[leftLabels[i - 1]]['top'] + 0.02 * df.leftWeight.sum() + myD['bottom'] = widths_left[leftLabels[i - 1] + ]['top'] + 0.02 * df.leftWeight.sum() myD['top'] = myD['bottom'] + myD['left'] topEdge = myD['top'] widths_left[l] = myD @@ -1272,7 +1314,8 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['right'] else: - myD['bottom'] = widths_right[rightLabels[i - 1]]['top'] + 0.02 * df.rightWeight.sum() + myD['bottom'] = widths_right[rightLabels[i - 1] + ]['top'] + 0.02 * df.rightWeight.sum() myD['top'] = myD['bottom'] + myD['right'] topEdge = myD['top'] widths_right[l] = myD @@ -1299,12 +1342,13 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, for l in rightLabels: plt.fill_between( [xMax, 1.02 * xMax], 2 * [widths_right[l]['bottom']], - 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], + 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], color=colorDict[l], alpha=0.99 ) plt.text( - 1.05 * xMax, widths_right[l]['bottom'] + 0.5 * widths_right[l]['right'], + 1.05 * xMax, widths_right[l]['bottom'] + + 0.5 * widths_right[l]['right'], l, {'ha': 'left', 'va': 'center'}, fontsize=fontsize @@ -1318,7 +1362,8 @@ def sankey(left, right, leftWeight=None, 
rightWeight=None, colorDict=None, lc = l2 if len(df[(df.left == l) & (df.right == l2)]) > 0: # Create array of y values for each strip, half at left value, half at right, convolve - ys_d = np.array(50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) + ys_d = np.array( + 50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_u = np.array( @@ -1342,23 +1387,22 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, ############################ UTILS ##################################### -from scipy.sparse import csr_matrix -import h5py + def readHCA(fin): f = h5py.File(fin, 'r') data = f[list(f.keys())[0]] - X = csr_matrix((np.array(data.get("data")), - np.array(data.get("indices")), - np.array(data.get("indptr"))), - shape=np.array(data.get("shape"))[::-1]) + X = csr_matrix((np.array(data.get("data")), + np.array(data.get("indices")), + np.array(data.get("indptr"))), + shape=np.array(data.get("shape"))[::-1]) ad = sc.AnnData(X) ad.obs_names = np.array(data.get("barcodes")).astype(str) ad.var_names = np.array(data.get("gene_names")).astype(str) ad.var['ensembl_gene'] = np.array(data.get("genes")).astype(str) return(ad) - - + + def sc_workflow(ad, prefix='L1', resolution=1.5, n_pcs=15, do_tsne=True): sc.pp.normalize_per_cell(ad, counts_per_cell_after=1e4) filter_result = sc.pp.filter_genes_dispersion( @@ -1396,16 +1440,19 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): ------- Depening on `copy` returns or updates `adata` with the corrected data matrix. """ - logg.info('regressing out'.format(tuple(keys) if type(keys) is list else keys)) + logg.info('regressing out'.format( + tuple(keys) if type(keys) is list else keys)) if issparse(adata.X): logg.info('... sparse input is densified and may ' 'lead to huge memory consumption') adata = adata.copy() if copy else adata - if isinstance(keys, str): keys = [keys] + if isinstance(keys, str): + keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() if n_jobs is not None: - logg.warning('Parallelization is currently broke, will be restored soon. Running on 1 core.') + logg.warning( + 'Parallelization is currently broke, will be restored soon. 
Running on 1 core.') n_jobs = sett.n_jobs if n_jobs is None else n_jobs cat_keys = [] @@ -1447,14 +1494,16 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): def _regress_out(col_index, responses, regressors): try: if regressors.shape[1] - 1 == responses.shape[1]: - regressors_view = np.c_[regressors[:, 0], regressors[:, col_index + 1]] + regressors_view = np.c_[ + regressors[:, 0], regressors[:, col_index + 1]] else: regressors_view = regressors result = sm.GLM(responses[:, col_index], regressors_view, family=sm.families.Gaussian()).fit() new_column = result.resid_response except PerfectSeparationError: # this emulates R's behavior - logg.warning('Encountered PerfectSeparationError, setting to 0 as in R.') + logg.warning( + 'Encountered PerfectSeparationError, setting to 0 as in R.') new_column = np.zeros(responses.shape[0]) return new_column @@ -1475,7 +1524,8 @@ def _regress_out_chunk(chunk, responses, regressors): for i_column, column in enumerate(chunk): adata.X[:, column] = result_lst[i_column] logg.info('finished') - logg.hint('after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') + logg.hint( + 'after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') return adata if copy else None @@ -1492,8 +1542,8 @@ def regress_out(metadata, exprs, covariate_formula, design_formula='1', rcond=-1 covariate_matrix = patsy.dmatrix(covariate_formula, metadata) design_batch = np.hstack((covariate_matrix, design_matrix)) - coefficients, res, rank, s = np.linalg.lstsq(design_batch, exprs.T, rcond=rcond) + coefficients, res, rank, s = np.linalg.lstsq( + design_batch, exprs.T, rcond=rcond) beta = coefficients[covariate_matrix.shape[1]:] return exprs - design_matrix.dot(beta).T - From 5ef100921cba11c955220efce8063ca4fd34bd76 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Sun, 20 Mar 2022 17:49:31 +0000 Subject: [PATCH 2/8] add legend to plot_roc in __init__.py --- SCCAF/__init__.py | 336 ++++++++++++++++++++-------------------------- 1 file changed, 142 insertions(+), 194 deletions(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 1df6d35..71f3a6b 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -19,8 +19,6 @@ # # -import h5py -from scipy.sparse import csr_matrix import pandas as pd import numpy as np from scipy.sparse import issparse @@ -53,22 +51,22 @@ from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.svm import SVC -color_long = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', - '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#008080', '#e6beff', - '#aa6e28', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000080', - '#808080', '#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", - "#008941", "#006FA6", "#A30059", "#FFDBE5", "#7A4900", "#0000A6", - "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", "#5A0007", - "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", - "#FF2F80", "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", - "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", - "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", - "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", - "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", - "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", - "#575329", "#00FECF", "#B05B6F", 
"#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", - "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", - "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", +color_long = ['#e6194b','#3cb44b','#ffe119','#0082c8','#f58231','#911eb4',\ + '#46f0f0','#f032e6','#d2f53c','#fabebe','#008080','#e6beff',\ + '#aa6e28','#800000','#aaffc3','#808000','#ffd8b1','#000080',\ + '#808080','#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", \ + "#008941", "#006FA6", "#A30059","#FFDBE5", "#7A4900", "#0000A6", \ + "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87","#5A0007", \ + "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF","#4A3B53", \ + "#FF2F80","#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",\ + "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",\ + "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",\ + "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",\ + "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",\ + "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",\ + "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",\ + "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",\ + "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",\ "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72"] @@ -112,8 +110,7 @@ def bhattacharyya_matrix(prob, flags=None): else: for i in np.arange(ndim): for j in np.arange(i + 1, ndim): - df = pd.DataFrame( - {'i': prob[:, i], 'j': prob[:, j], 'f': flags}) + df = pd.DataFrame({'i': prob[:, i], 'j': prob[:, j], 'f': flags}) df = df[df['f']] val = -bhattacharyya_distance(df['i'], df['j']) m[i, j] = val @@ -146,11 +143,9 @@ def normalize_confmat1(cmat, mod='1'): for i in range(dim): for j in range(i + 1, dim): if mod is '1': - xmat[i, j] = xmat[j, i] = max( - dmat[i, j] / smat[j], dmat[j, i] / smat[i]) + xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[j], dmat[j, i] / smat[i]) else: - xmat[i, j] = xmat[j, i] = max( - dmat[i, j] / smat[i], dmat[j, i] / smat[j]) + xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[i], dmat[j, i] / smat[j]) return xmat @@ -198,8 +193,7 @@ def cluster_adjmat(xmat, ----- new group names. """ - g = sc._utils.get_igraph_from_adjacency( - (xmat > cutoff).astype(int), directed=False) + g = sc._utils.get_igraph_from_adjacency((xmat > cutoff).astype(int), directed=False) print(g) part = louvain.find_partition(g, louvain.RBConfigurationVertexPartition, resolution_parameter=resolution) @@ -223,8 +217,7 @@ def msample(x, n, frac): sampled selection. 
""" if len(x) <= np.floor(n / frac): - if len(x) < 10: - frac = 0.9 + if len(x) < 10: frac = 0.9 return x.sample(frac=frac) else: return x.sample(n=n) @@ -253,8 +246,7 @@ def train_test_split_per_type(X, y, n=100, frac=0.8): df = pd.DataFrame(y) df.index = np.arange(len(y)) df.columns = ['class'] - c_idx = df.groupby('class').apply(lambda x: msample( - x, n=n, frac=frac)).index.get_level_values(None) + c_idx = df.groupby('class').apply(lambda x: msample(x, n=n, frac=frac)).index.get_level_values(None) d_idx = ~np.isin(np.arange(len(y)), c_idx) return X[c_idx, :], X[d_idx, :], y[c_idx], y[d_idx] @@ -269,17 +261,17 @@ def SCCAF_assessment(*args, **kwargs): # need to check number of cells in each cluster of the training set. -def self_projection(X, +def self_projection(X, cell_types, classifier="LR", penalty='l1', sparsity=0.5, fraction=0.5, random_state=1, - solver='liblinear', + solver='liblinear', n=0, cv=5, - whole=False, + whole=False, n_jobs=None): # n = 100 should be good. """ @@ -334,8 +326,7 @@ def self_projection(X, stratify=cell_types, test_size=fraction) # fraction means test size # set the classifier if classifier == 'LR': - clf = LogisticRegression( - random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) + clf = LogisticRegression(random_state=1, penalty=penalty, C=sparsity, multi_class="ovr", solver=solver) elif classifier == 'RF': clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs) elif classifier == 'GNB': @@ -350,12 +341,11 @@ def self_projection(X, clf = SGDClassifier(loss='perceptron', n_jobs=n_jobs) elif classifier == 'DT': clf = DecisionTreeClassifier() - + # mean cross validation score cvsm = 0 if cv > 0: - cvs = cross_val_score(clf, X_train, np.array( - y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) + cvs = cross_val_score(clf, X_train, np.array(y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs) cvsm = cvs.mean() print("Mean CV accuracy: %.4f" % cvsm) # accuracy on cross validation and on test set @@ -364,18 +354,18 @@ def self_projection(X, print("Accuracy on the training set: %.4f" % accuracy) accuracy_test = clf.score(X_test, y_test) print("Accuracy on the hold-out set: %.4f" % accuracy_test) - + # accuracy of the whole dataset if whole: accuracy = clf.score(X, cell_types) print("Accuracy on the whole set: %.4f" % accuracy) - + # get predicted probability on the test set y_prob = None if not classifier in ['SH', 'PCP']: y_prob = clf.predict_proba(X_test) y_pred = clf.predict(X_test) - + return y_prob, y_pred, y_test, clf, cvsm, accuracy_test @@ -414,10 +404,8 @@ def confusion_matrix(y_test, y_pred, clf, labels=None): ----- the confusion matrix """ - if labels is None: - labels = clf.classes_ - df = pd.DataFrame.from_records( - metrics.confusion_matrix(y_test, y_pred, labels=labels)) + if labels is None: labels = clf.classes_ + df = pd.DataFrame.from_records(metrics.confusion_matrix(y_test, y_pred, labels=labels)) df.index = labels df.columns = labels df.index.name = 'Real' @@ -455,8 +443,7 @@ def per_cluster_accuracy(mtx, ad=None, clstr_name='louvain'): def per_cell_accuracy(X, cell_types, clf): y_prob = clf.predict_proba(X) - df = pd.DataFrame(y_prob, index=cell_types, - columns=clf.classes_).sort_index().T + df = pd.DataFrame(y_prob, index=cell_types, columns=clf.classes_).sort_index().T df.index.name = 'Predicted' dy = np.empty([0]) for cell in df.index: @@ -489,14 +476,13 @@ def get_topmarkers(clf, names, topn=10): top_markers = \ marker_genes \ - .query('weight > 0.') \ - .sort_values('weight', ascending=False) \ - 
.groupby('cell_type') \ - .head(topn) \ - .sort_values(['cell_type', 'weight'], ascending=[True, False]) + .query('weight > 0.') \ + .sort_values('weight', ascending=False) \ + .groupby('cell_type') \ + .head(topn) \ + .sort_values(['cell_type', 'weight'], ascending=[True, False]) return top_markers - def eu_distance(X, gp1, gp2, cell): """ Measure the euclidean distance between two groups of cells and the third group. @@ -555,22 +541,20 @@ def get_distance_matrix(X, clusters, labels=None, metric='euclidean'): centers = [] if scipy.sparse.issparse(X): for cl in labels: - centers.append( - np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) + centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))[0, :]) else: for cl in labels: centers.append(np.array(X[np.where(clusters == cl)[0], :].mean(0))) return pairwise_distances(np.array(centers), metric=metric) - def merge_cluster(ad, old_id, new_id, groups): ad.obs[new_id] = ad.obs[old_id] ad.obs[new_id] = ad.obs[new_id].astype('category') ad.obs[new_id].cat.categories = make_unique(groups.astype(str)) ad.obs[new_id] = ad.obs[new_id].str.split('_').str[0] return ad - - + + def find_high_resolution(ad, resolution=4, n=100): cut = resolution while cut > 0.5: @@ -583,18 +567,18 @@ def find_high_resolution(ad, resolution=4, n=100): def get_connection_matrix(ad_obs, key1, key2): - df = ad_obs.groupby([key1, key2]).size().to_frame().reset_index() - df.columns = [key1, key2, 'counts'] + df = ad_obs.groupby([key1,key2]).size().to_frame().reset_index() + df.columns = [key1,key2,'counts'] df2 = ad_obs[key2].value_counts() df['size'] = df2[df[key2].tolist()].tolist() df['percent'] = df['counts']/df['size'] - df = df[df['percent'] > 0.1] + df = df[df['percent']>0.1] df2 = df.groupby(key1).size() - df2 = df2[df2 > 1] + df2 = df2[df2>1] df = df[df[key1].isin(df2.index)] dim = len(ad_obs[key2].unique()) - mat = pd.DataFrame(np.zeros([dim, dim])).astype(int) + mat = pd.DataFrame(np.zeros([dim,dim])).astype(int) mat.columns = mat.columns.astype(str) mat.index = mat.index.astype(str) @@ -602,10 +586,10 @@ def get_connection_matrix(ad_obs, key1, key2): grouped = df.groupby(key1) for name, group in grouped: x = group[key2] - if len(x) > 0: - for i, j in list(itertools.combinations(x.tolist(), 2)): - mat.loc[i, j] = mat.loc[j, i] = 1 - return mat + if len(x)>0: + for i,j in list(itertools.combinations(x.tolist(), 2)): + mat.loc[i,j] = mat.loc[j,i] = 1 + return mat def SCCAF_optimize_all(ad, @@ -613,11 +597,11 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=0.5, R2norm_cutoff=0.05, R1norm_step=0.01, - R2norm_step=0.001, + R2norm_step=0.001, prefix='L1', - min_i=3, - start=None, - start_iter=0, + min_i = 3, + start = None, + start_iter = 0, *args, **kwargs): """ ad: `AnnData` @@ -642,22 +626,20 @@ def SCCAF_optimize_all(ad, The reduce step for minimum R2norm value. """ acc = 0 - # 'start_iter = -1 + #'start_iter = -1 if start is None: - start = '%s_Round%d' % (prefix, start_iter) + start = '%s_Round%d'%(prefix, start_iter) if not start in ad.obs.keys(): - raise ValueError( - "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." % (start)) + raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) else: if not start in ad.obs.keys(): - raise ValueError( - "`adata.obs['%s']` doesn't exist. Please assign the initial clustering first." 
% (start)) - ad.obs['%s_Round%d' % (prefix, start_iter)] = ad.obs[start] - - clstr_old = len(ad.obs['%s_Round%d' % (prefix, start_iter)].unique()) - # 'while acc < min_acc: + raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start)) + ad.obs['%s_Round%d'%(prefix, start_iter)] = ad.obs[start] + + clstr_old = len(ad.obs['%s_Round%d'%(prefix, start_iter)].unique()) + #'while acc < min_acc: for i in range(10): - if start_iter > 0: + if start_iter >0: print("start_iter: %d" % start_iter) print("R1norm_cutoff: %f" % R1norm_cutoff) print("R2norm_cutoff: %f" % R2norm_cutoff) @@ -667,7 +649,7 @@ def SCCAF_optimize_all(ad, R1norm_cutoff=R1norm_cutoff, R2norm_cutoff=R2norm_cutoff, start_iter=start_iter, - min_acc=min_acc, + min_acc=min_acc, prefix=prefix, *args, **kwargs) print("m1: %f" % m1) @@ -675,17 +657,17 @@ def SCCAF_optimize_all(ad, print("Accuracy: %f" % acc) R1norm_cutoff = m1 - R1norm_step R2norm_cutoff = m2 - R2norm_step - - clstr_new = len(ad.obs['%s_result' % prefix].unique()) - + + clstr_new = len(ad.obs['%s_result'%prefix].unique()) + if clstr_new >= clstr_old and i >= min_i: print("converged SCCAF_all!") break - - if acc >= min_acc: + + if acc >=min_acc: break - - + + def SCCAF_optimize(ad, prefix='L1', use='raw', @@ -699,7 +681,7 @@ def SCCAF_optimize(ad, plot_dist=False, plot_cmat=False, mod='1', - low_res=None, + low_res = None, c_iter=3, n_iter=10, n_jobs=None, @@ -775,8 +757,8 @@ def SCCAF_optimize(ad, The cutoff for the euclidean distance between two clusters of cells. 8.0 means the euclidean distance between two cell types should be greater than 8.0. low_res: `str` optional - the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give - the key as the underclustering boundary. + the clustering boundary for under-clustering. Set a low resolution in louvain/leiden clustering and give + the key as the underclustering boundary. classifier: `String` optional (default: 'LR') a machine learning model in "LR" (logistic regression), \ "RF" (Random Forest), "GNB"(Gaussion Naive Bayes), "SVM" (Support Vector Machine) and "DT"(Decision Tree). @@ -784,7 +766,7 @@ def SCCAF_optimize(ad, MatPlotLib multi-page backend object instance, previously initialised (currently the only type supported is PdfPages). min_acc: `float` - the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. + the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. return ----- @@ -797,13 +779,10 @@ def SCCAF_optimize(ad, X = ad.raw.X elif use == 'pca': if 'X_pca' not in ad.obsm.keys(): - raise ValueError( - "`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") + raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] - elif 'X_%s' % use in ad.obsm.dtype.fields: - X = ad.obsm['X_%s' % use] else: - X = ad[:, ad.var['highly_variable']].X + X = ad[:,ad.var['highly_variable']].X for i in range(start_iter, start_iter + n_iter): print("Round%d ..." 
% (i + 1)) @@ -818,10 +797,9 @@ def SCCAF_optimize(ad, fraction=fraction, classifier=classifier, n_jobs=n_jobs) accs = [acc] ad.obs['%s_self-projection' % old_id] = clf.predict(X) - + if plot: - aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, - title="Self-project ROC {}".format(old_id)) + aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() @@ -869,12 +847,9 @@ def SCCAF_optimize(ad, if plot: if plot_cmat: - plot_heatmap_gray(cmat, 'Confusion Matrix', - mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % - old_id, mplotlib_backend=mplotlib_backend) - plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % - old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(cmat, 'Confusion Matrix', mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) + plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % old_id, mplotlib_backend=mplotlib_backend) if R1norm_only: groups = cluster_adjmat(R1mat, cutoff=R1norm_cutoff) @@ -882,35 +857,32 @@ def SCCAF_optimize(ad, groups = cluster_adjmat(R2mat, cutoff=R2norm_cutoff) else: if not low_res is None: - conn_mat = get_connection_matrix( - ad_obs=ad.obs, key1=low_res, key2=old_id) - zmat = np.minimum.reduce( - [(R1mat > R1norm_cutoff), conn_mat.values]) + conn_mat = get_connection_matrix(ad_obs = ad.obs, key1 = low_res, key2 = old_id) + zmat = np.minimum.reduce([(R1mat > R1norm_cutoff), conn_mat.values]) groups = cluster_adjmat(zmat, cutoff=0) else: - zmat = np.maximum.reduce( - [(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) + zmat = np.maximum.reduce([(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)]) groups = cluster_adjmat(zmat, cutoff=0) if len(np.unique(groups)) == len(ad.obs[old_id].unique()): ad.obs['%s_result' % prefix] = ad.obs[old_id] print("Converge SCCAF_optimize no. 
cluster!") break - + merge_cluster(ad, old_id1, new_id, groups) - + if plot: sc.pl.scatter(ad, basis=basis, color=[new_id], color_map="RdYlBu_r", legend_loc='on data', show=(mplotlib_backend is None)) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() - + if len(np.unique(groups)) <= 1: ad.obs['%s_result' % prefix] = ad.obs[new_id] print("no clustering!") break - + return ad, m1, m2, np.min(accs), i @@ -953,8 +925,7 @@ def plot_center(ad, groupby, ax, basis='tsne', size=20): Y_mask = Y[ad.obs[groupby] == c, :] centroids[c] = np.median(Y_mask, axis=0) for c in centroids.keys(): - ax.plot(centroids[c][0], centroids[c][1], - 'wo', markersize=size, alpha=0.5) + ax.plot(centroids[c][0], centroids[c][1], 'wo', markersize=size, alpha=0.5) return ax @@ -1048,34 +1019,33 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ - rc_aucs = [] # AUC - rp_aucs = [] # AUC from recall precision - fprs = [] # FPR - tprs = [] # TPR - prss = [] # Precision - recs = [] # Recall + rc_aucs = [] #AUC + rp_aucs = [] # AUC from recall precision + fprs = [] #FPR + tprs = [] #TPR + prss = [] #Precision + recs = [] #Recall for i, cell_type in enumerate(clf.classes_): fpr, tpr, _ = metrics.roc_curve(y_test == cell_type, y_prob[:, i]) - prs, rec, _ = metrics.precision_recall_curve( - y_test == cell_type, y_prob[:, i]) + prs, rec, _ = metrics.precision_recall_curve(y_test == cell_type, y_prob[:, i]) fprs.append(fpr) tprs.append(tpr) prss.append(prs) recs.append(rec) rc_aucs.append(metrics.auc(fpr, tpr)) rp_aucs.append(metrics.auc(rec, prs)) - + good_aucs = np.asarray(rc_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rc = np.min(good_aucs) max_auc_rc = np.max(good_aucs) - + good_aucs = np.asarray(rp_aucs) good_aucs = good_aucs[~np.isnan(good_aucs)] min_auc_rp = np.min(good_aucs) max_auc_rp = np.max(good_aucs) - - if plot in ['both', 'roc', 'prc']: + + if plot in ['both','roc','prc']: if colors is None: if len(clf.classes_) < 21: colors = default_20 @@ -1084,7 +1054,7 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, else: colors = default_102 if plot == 'both': - + fig, ax = plt.subplots(1, 2, sharey=True) ax[0].plot([0, 1], [0, 1], color='k', ls=':') ax[0].set_xticks([0, 1]) @@ -1098,32 +1068,24 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[0].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[0].set_xlabel('FPR') ax[0].set_ylabel('TPR') + # ysong modify add legend 10 Mar 2022 Xs = recs Ys = prss for i, cell_type in enumerate(clf.classes_): ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') - ax[1].set_ylabel('Precision') - - ax[0].annotate(r'$AUC_{min}: %.3f$' % - min_auc_rc, (0.5, 0.4), fontsize=fontsize) - ax[0].annotate(r'$AUC_{max}: %.3f$' % - max_auc_rc, (0.5, 0.3), fontsize=fontsize) - ax[1].annotate(r'$AUC_{min}: %.3f$' % - min_auc_rp, (0.5, 0.4), fontsize=fontsize) - ax[1].annotate(r'$AUC_{max}: %.3f$' % - max_auc_rp, (0.5, 0.3), fontsize=fontsize) + ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") + ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) + ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) + ax[1].annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (0.5, 0.4), fontsize=fontsize) + ax[1].annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (0.5, 0.3), fontsize=fontsize) if 
cvsm: - ax[0].annotate("CV: %.3f" % - cvsm, (0.5, 0.2), fontsize=fontsize) - ax[1].annotate("CV: %.3f" % - cvsm, (0.5, 0.2), fontsize=fontsize) + ax[0].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) + ax[1].annotate("CV: %.3f" % cvsm, (0.5, 0.2), fontsize=fontsize) if acc: - ax[0].annotate("Test: %.3f" % - acc, (0.5, 0.1), fontsize=fontsize) - ax[1].annotate("Test: %.3f" % - acc, (0.5, 0.1), fontsize=fontsize) - + ax[0].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) + ax[1].annotate("Test: %.3f" % acc, (0.5, 0.1), fontsize=fontsize) + else: fig, ax = plt.subplots() ax.set_xticks([0, 1]) @@ -1135,10 +1097,8 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('TPR') ax.plot([0, 1], [0, 1], color='k', ls=':') pos = 0.5 - ax.annotate(r'$AUC_{min}: %.3f$' % - min_auc_rc, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % - max_auc_rc, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (pos, 0.3), fontsize=fontsize) else: Xs = recs Ys = prss @@ -1146,10 +1106,8 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax.set_ylabel('Precision') ax.plot([0, 1], [1, 0], color='k', ls=':') pos = 0 - ax.annotate(r'$AUC_{min}: %.3f$' % - min_auc_rp, (pos, 0.4), fontsize=fontsize) - ax.annotate(r'$AUC_{max}: %.3f$' % - max_auc_rp, (pos, 0.3), fontsize=fontsize) + ax.annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (pos, 0.4), fontsize=fontsize) + ax.annotate(r'$AUC_{max}: %.3f$' % max_auc_rp, (pos, 0.3), fontsize=fontsize) for i, cell_type in enumerate(clf.classes_): ax.plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) @@ -1160,11 +1118,11 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, if save: plt.savefig(save) - + return rc_aucs -# pySankey +## pySankey class pySankeyException(Exception): pass @@ -1192,8 +1150,7 @@ def check_data_matches_labels(labels, data, side): msg = "Labels: " + ",".join(labels) + "\n" if len(data) < 20: msg += "Data: " + ",".join(data) - raise LabelMismatch( - '{0} labels and data do not match.{1}'.format(side, msg)) + raise LabelMismatch('{0} labels and data do not match.{1}'.format(side, msg)) def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, @@ -1299,8 +1256,7 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['left'] else: - myD['bottom'] = widths_left[leftLabels[i - 1] - ]['top'] + 0.02 * df.leftWeight.sum() + myD['bottom'] = widths_left[leftLabels[i - 1]]['top'] + 0.02 * df.leftWeight.sum() myD['top'] = myD['bottom'] + myD['left'] topEdge = myD['top'] widths_left[l] = myD @@ -1314,8 +1270,7 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, myD['bottom'] = 0 myD['top'] = myD['right'] else: - myD['bottom'] = widths_right[rightLabels[i - 1] - ]['top'] + 0.02 * df.rightWeight.sum() + myD['bottom'] = widths_right[rightLabels[i - 1]]['top'] + 0.02 * df.rightWeight.sum() myD['top'] = myD['bottom'] + myD['right'] topEdge = myD['top'] widths_right[l] = myD @@ -1342,13 +1297,12 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, for l in rightLabels: plt.fill_between( [xMax, 1.02 * xMax], 2 * [widths_right[l]['bottom']], - 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], + 2 * [widths_right[l]['bottom'] + widths_right[l]['right']], color=colorDict[l], alpha=0.99 ) plt.text( - 
1.05 * xMax, widths_right[l]['bottom'] + - 0.5 * widths_right[l]['right'], + 1.05 * xMax, widths_right[l]['bottom'] + 0.5 * widths_right[l]['right'], l, {'ha': 'left', 'va': 'center'}, fontsize=fontsize @@ -1362,8 +1316,7 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, lc = l2 if len(df[(df.left == l) & (df.right == l2)]) > 0: # Create array of y values for each strip, half at left value, half at right, convolve - ys_d = np.array( - 50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) + ys_d = np.array(50 * [widths_left[l]['bottom']] + 50 * [widths_right[l2]['bottom']]) ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_d = np.convolve(ys_d, 0.05 * np.ones(20), mode='valid') ys_u = np.array( @@ -1387,22 +1340,23 @@ def sankey(left, right, leftWeight=None, rightWeight=None, colorDict=None, ############################ UTILS ##################################### - +from scipy.sparse import csr_matrix +import h5py def readHCA(fin): f = h5py.File(fin, 'r') data = f[list(f.keys())[0]] - X = csr_matrix((np.array(data.get("data")), - np.array(data.get("indices")), - np.array(data.get("indptr"))), - shape=np.array(data.get("shape"))[::-1]) + X = csr_matrix((np.array(data.get("data")), + np.array(data.get("indices")), + np.array(data.get("indptr"))), + shape=np.array(data.get("shape"))[::-1]) ad = sc.AnnData(X) ad.obs_names = np.array(data.get("barcodes")).astype(str) ad.var_names = np.array(data.get("gene_names")).astype(str) ad.var['ensembl_gene'] = np.array(data.get("genes")).astype(str) return(ad) - - + + def sc_workflow(ad, prefix='L1', resolution=1.5, n_pcs=15, do_tsne=True): sc.pp.normalize_per_cell(ad, counts_per_cell_after=1e4) filter_result = sc.pp.filter_genes_dispersion( @@ -1440,19 +1394,16 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): ------- Depening on `copy` returns or updates `adata` with the corrected data matrix. """ - logg.info('regressing out'.format( - tuple(keys) if type(keys) is list else keys)) + logg.info('regressing out'.format(tuple(keys) if type(keys) is list else keys)) if issparse(adata.X): logg.info('... sparse input is densified and may ' 'lead to huge memory consumption') adata = adata.copy() if copy else adata - if isinstance(keys, str): - keys = [keys] + if isinstance(keys, str): keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() if n_jobs is not None: - logg.warning( - 'Parallelization is currently broke, will be restored soon. Running on 1 core.') + logg.warning('Parallelization is currently broke, will be restored soon. 
Running on 1 core.') n_jobs = sett.n_jobs if n_jobs is None else n_jobs cat_keys = [] @@ -1494,16 +1445,14 @@ def sc_pp_regress_out(adata, keys, n_jobs=None, copy=False): def _regress_out(col_index, responses, regressors): try: if regressors.shape[1] - 1 == responses.shape[1]: - regressors_view = np.c_[ - regressors[:, 0], regressors[:, col_index + 1]] + regressors_view = np.c_[regressors[:, 0], regressors[:, col_index + 1]] else: regressors_view = regressors result = sm.GLM(responses[:, col_index], regressors_view, family=sm.families.Gaussian()).fit() new_column = result.resid_response except PerfectSeparationError: # this emulates R's behavior - logg.warning( - 'Encountered PerfectSeparationError, setting to 0 as in R.') + logg.warning('Encountered PerfectSeparationError, setting to 0 as in R.') new_column = np.zeros(responses.shape[0]) return new_column @@ -1524,8 +1473,7 @@ def _regress_out_chunk(chunk, responses, regressors): for i_column, column in enumerate(chunk): adata.X[:, column] = result_lst[i_column] logg.info('finished') - logg.hint( - 'after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') + logg.hint('after `sc.pp.regress_out`, consider rescaling the adata using `sc.pp.scale`') return adata if copy else None @@ -1542,8 +1490,8 @@ def regress_out(metadata, exprs, covariate_formula, design_formula='1', rcond=-1 covariate_matrix = patsy.dmatrix(covariate_formula, metadata) design_batch = np.hstack((covariate_matrix, design_matrix)) - coefficients, res, rank, s = np.linalg.lstsq( - design_batch, exprs.T, rcond=rcond) + coefficients, res, rank, s = np.linalg.lstsq(design_batch, exprs.T, rcond=rcond) beta = coefficients[covariate_matrix.shape[1]:] return exprs - design_matrix.dot(beta).T + From 7024b41fe3476790327a9887c86dc1c81de4efd7 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 Sep 2022 21:04:49 +0100 Subject: [PATCH 3/8] add legend --- SCCAF/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 71f3a6b..8f8d37c 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -781,6 +781,8 @@ def SCCAF_optimize(ad, if 'X_pca' not in ad.obsm.keys(): raise ValueError("`adata.obsm['X_pca']` doesn't exist. 
Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] + elif 'X_%s'%use in ad.obsm.dtype.fields: + X = ad.obsm['X_%s'%use] else: X = ad[:,ad.var['highly_variable']].X @@ -1068,12 +1070,14 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[0].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[0].set_xlabel('FPR') ax[0].set_ylabel('TPR') - # ysong modify add legend 10 Mar 2022 + # add legend Xs = recs Ys = prss for i, cell_type in enumerate(clf.classes_): ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') + ax[1].set_ylabel('Precision') + ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) From f841eb3a820dc1405c1204f7cafb0ef7e344b957 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 Sep 2022 21:16:42 +0100 Subject: [PATCH 4/8] add option for plotting legend in plot_roc --- SCCAF/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 8f8d37c..f8fc7ab 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -694,7 +694,8 @@ def SCCAF_optimize(ad, dist_cutoff=8, classifier="LR", mplotlib_backend=None, - min_acc=1): + min_acc=1, + legend=False): """ This is a self-projection confusion matrix directed cluster optimization function. @@ -767,6 +768,8 @@ def SCCAF_optimize(ad, PdfPages). min_acc: `float` the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. + legend: `bool` optional (default: False) + If also plot legend for the AUC curve. return ----- @@ -801,7 +804,7 @@ def SCCAF_optimize(ad, ad.obs['%s_self-projection' % old_id] = clf.predict(X) if plot: - aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id)) + aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id), legend=legend) if mplotlib_backend: mplotlib_backend.savefig() plt.clf() @@ -1017,7 +1020,7 @@ def plot_heatmap_gray(X, title='', save=None, mplotlib_backend=None): plt.show() -def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16): +def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16, legend=None): """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ @@ -1070,15 +1073,15 @@ def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, ax[0].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[0].set_xlabel('FPR') ax[0].set_ylabel('TPR') - # add legend Xs = recs Ys = prss for i, cell_type in enumerate(clf.classes_): ax[1].plot(Xs[i], Ys[i], c=colors[i], lw=2, label=cell_type) ax[1].set_xlabel('Recall') ax[1].set_ylabel('Precision') - - ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") + ## add legend if required + if legend == True: + ax[1].legend(bbox_to_anchor=(1.04,0.5), loc="center left") ax[0].annotate(r'$AUC_{min}: %.3f$' % min_auc_rc, (0.5, 0.4), fontsize=fontsize) ax[0].annotate(r'$AUC_{max}: %.3f$' % max_auc_rc, (0.5, 0.3), fontsize=fontsize) ax[1].annotate(r'$AUC_{min}: %.3f$' % min_auc_rp, (0.5, 0.4), fontsize=fontsize) From c2eea917d49cf0e616def44ac2202a692dfa3b80 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 
Sep 2022 21:17:42 +0100 Subject: [PATCH 5/8] add option for plotting legend in plot_roc --- SCCAF/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index f8fc7ab..5b85442 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -769,7 +769,7 @@ def SCCAF_optimize(ad, min_acc: `float` the minimum total accuracy to be achieved. Above this threshold, the optimization will stop. legend: `bool` optional (default: False) - If also plot legend for the AUC curve. + If also plot legend for the ROC curve. return ----- From 9ea9755a432e14d331bf43454fddb3bf6cb87ff0 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Wed, 21 Sep 2022 21:19:51 +0100 Subject: [PATCH 6/8] add option for plotting legend in plot_roc --- SCCAF/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 5b85442..975da14 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -1020,7 +1020,7 @@ def plot_heatmap_gray(X, title='', save=None, mplotlib_backend=None): plt.show() -def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16, legend=None): +def plot_roc(y_prob, y_test, clf, plot='both', save=None, title='', colors=None, cvsm=None, acc=None, fontsize=16, legend=False): """ y_prob, y_test, clf, plot=True, save=False, title ='', colors=None, cvsm=None, acc=None, fontsize=16): """ From 50efca66fa2cfc1a6c45ad4fa88f5c6f239586c3 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Thu, 26 Oct 2023 18:11:17 +0100 Subject: [PATCH 7/8] update scanpy version --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7de0c9f..fa5a715 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def readme(): setup( name='SCCAF', - version='0.0.10', + version='0.0.11', description='Single-Cell Clustering Assessment Framework', long_description=readme(), long_description_content_type='text/markdown', @@ -20,9 +20,10 @@ def readme(): 'louvain', 'scikit-learn', 'psutil', - 'scanpy==1.4.6'], + 'scanpy==1.9.2'], scripts=['cli/sccaf', 'cli/sccaf-assess', 'cli/sccaf-assess-merger', 'cli/sccaf-regress-out'], author='Chichau Miau', author_email='zmiao@ebi.ac.uk', license='MIT' ) +# modified by Yuyao Song on Oct 2023 to bump scanpy version From 05fcfcadfdbef38531a548291f2f867b197db4c4 Mon Sep 17 00:00:00 2001 From: YY-SONG0718 <794815059@qq.com> Date: Fri, 27 Oct 2023 15:58:44 +0100 Subject: [PATCH 8/8] space for indent --- SCCAF/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SCCAF/__init__.py b/SCCAF/__init__.py index 975da14..a54580e 100644 --- a/SCCAF/__init__.py +++ b/SCCAF/__init__.py @@ -785,7 +785,7 @@ def SCCAF_optimize(ad, raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.") X = ad.obsm['X_pca'] elif 'X_%s'%use in ad.obsm.dtype.fields: - X = ad.obsm['X_%s'%use] + X = ad.obsm['X_%s'%use] else: X = ad[:,ad.var['highly_variable']].X
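
Taken together, patches 3-6 above add an optional `legend` flag (default False) that is passed from `SCCAF_optimize` down to `plot_roc`, where it draws a per-cell-type legend beside the precision-recall panel. A minimal usage sketch follows, assuming an AnnData object with a PCA embedding and an existing clustering; the `pbmc3k_processed()` dataset and the 'louvain' key are placeholders for illustration only, not anything defined by these patches.

# Illustrative sketch only; assumes an AnnData with a PCA embedding and an
# existing clustering (scanpy's pbmc3k_processed() is used as a stand-in).
import scanpy as sc
from SCCAF import SCCAF_assessment, plot_roc

adata = sc.datasets.pbmc3k_processed()

# Self-projection assessment of the existing clustering.
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(
    adata.obsm['X_pca'], adata.obs['louvain'], n=100)

# After this series, plot_roc() accepts legend (default False); when True,
# the per-class legend is drawn next to the precision-recall panel.
plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, legend=True)

# The same flag can also be given to SCCAF_optimize_all(..., legend=True),
# which forwards it to SCCAF_optimize and on to plot_roc via **kwargs.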