From 1b58111c71c04bdc1aa67d116a8170b4f76cc7be Mon Sep 17 00:00:00 2001
From: Christian Heckendorf
Date: Tue, 1 Mar 2022 12:28:53 -0500
Subject: [PATCH] standardize whitespace and use () for print

---
 src/CalculateCoElutionScores.py | 18 +++++-----
 src/GoldStandard.py             | 62 ++++++++++++++++-----------------
 src/main.py                     | 46 ++++++++++++------------
 src/utils.py                    | 58 +++++++++++++++---------------
 4 files changed, 92 insertions(+), 92 deletions(-)

diff --git a/src/CalculateCoElutionScores.py b/src/CalculateCoElutionScores.py
index e9af9f7..6f07850 100644
--- a/src/CalculateCoElutionScores.py
+++ b/src/CalculateCoElutionScores.py
@@ -141,7 +141,7 @@ def loadElutionData(self, elutionProfileF, frac_count = 2, max_count_cutoff=1):
 				pro_list.append(protID)
 			else:
 				removed += 1
-		print "finished processing %s\n removed %i (%.2f, total: %i, after filtering: %i) proteins found in less than %i fraction" % (elutionProfileF, removed, removed/(removed + len(prot2Index)), removed + len(prot2Index), len(prot2Index), frac_count)
+		print("finished processing %s\n removed %i (%.2f, total: %i, after filtering: %i) proteins found in less than %i fraction" % (elutionProfileF, removed, removed/(removed + len(prot2Index)), removed + len(prot2Index), len(prot2Index), frac_count))
 		elutionProfileFH.close()
 		elutionMat = np.nan_to_num(np.matrix(elutionMat))
 		return elutionMat, prot2Index
@@ -609,7 +609,7 @@ def load_genemania(self):
 		for fp in self.files: #for de-bugging, I only used the first three files
 			filename = str(fp.split('/')[-1])
 			if filename.startswith(f_evidence):
-				print "Processing: %s" % (filename)
+				print("Processing: %s" % (filename))
 				fh = urllib2.urlopen(fp)
 				fh.readline()
 				for line in fh:
@@ -1082,8 +1082,8 @@ def cv_eval(self, data, targets, folds= 5):
 		this_targets = []
 		i = 1
 		for train, test in skf.split(data, targets):
-			#print "Processing fold %i" % i
-			print "Processing data..." 
+			#print("Processing fold %i" % i)
+			print("Processing data...")
 			i += 1
 			self.fit(data[train], targets[train])
 			probs.extend(self.predict_proba(data[test]))
@@ -1108,7 +1108,7 @@ class MLP_wrapper(object):

 	def __init__(self):
-		print "Using MLP with Keras/tensorflow"
+		print("Using MLP with Keras/tensorflow")
 		self.model = Sequential()

 	def fit(self, data, labels):
@@ -1133,12 +1133,12 @@ def predict(self, toPred):
 class SAE_wrapper(MLP_wrapper):

 	def __init__(self):
-		print "Using stacked autoencoder"
+		print("Using stacked autoencoder")

 	def fit(self, data, labels):
-		print data.shape
-		print len(labels)
+		print(data.shape)
+		print(len(labels))

 		num_features = data.shape[1]
 		input = Input(shape=(num_features,))
@@ -1250,7 +1250,7 @@ def get_current_string_ver(self):
 			return response.getroot()[0][0].text
 		except urllib2.HTTPError as err:
 			error_message = err.read()
-			print error_message
+			print(error_message)
 			sys.exit()
diff --git a/src/GoldStandard.py b/src/GoldStandard.py
index b40e47a..3bbbb79 100644
--- a/src/GoldStandard.py
+++ b/src/GoldStandard.py
@@ -50,19 +50,19 @@ def make_reference_data(self, db_clusters, orthmap="", found_prots=""):
 				self.complexes.addComplex("%i;%s;%s" % (i, db_clust.name, compl), tmp_clust.complexes[compl])
 				i += 1

-		print "Total number of complexes %i in %s" % (total_complexes, self.name)
-		print "Number of complexes after ortholog mapping %i complexes in %s" % (len(self.complexes.complexes), self.name)
+		print("Total number of complexes %i in %s" % (total_complexes, self.name))
+		print("Number of complexes after ortholog mapping %i complexes in %s" % (len(self.complexes.complexes), self.name))

 		if found_prots != "":
 			self.complexes.remove_proteins(found_prots)
-			print "After removing not indetified proteins %i number of complexes in % s" % (len(self.complexes.complexes), self.name)
+			print("After removing not indetified proteins %i number of complexes in % s" % (len(self.complexes.complexes), self.name))

 		self.complexes.filter_complexes()
-		print "After size filtering %i number of complexes in % s" % (len(self.complexes.complexes), self.name)
+		print("After size filtering %i number of complexes in % s" % (len(self.complexes.complexes), self.name))

 		self.complexes.merge_complexes()
 		self.complexes.filter_complexes()
-		print "After mergning %i number of complexes in % s" % (len(self.complexes.complexes), self.name)
+		print("After mergning %i number of complexes in % s" % (len(self.complexes.complexes), self.name))

 		self.make_pos_neg_ppis()

@@ -227,8 +227,8 @@ def n_fols_split(self, num_folds, number_pruning, overlap="False"):
 			round += 1

-		print "length of complex set one: " + str(len(itemindex_one))
-		print "length of complex set two: " + str(len(itemindex_zero))
+		print("length of complex set one: " + str(len(itemindex_one)))
+		print("length of complex set two: " + str(len(itemindex_zero)))

 		# randomize clusters
 		#rnd.shuffle(ref_cluster_ids)
@@ -272,16 +272,16 @@ def n_fols_split(self, num_folds, number_pruning, overlap="False"):
 			len_over_positive = len(train[0] & evaluate[0])
 			len_over_negative = len(train[1] & evaluate[1])

-			print len_train_positive
-			print len_eva_positive
-			print len_train_negative
-			print len_eva_negative
+			print(len_train_positive)
+			print(len_eva_positive)
+			print(len_train_negative)
+			print(len_eva_negative)

-			print "number of train and evaluation PPIs:"
-			print len_train_positive + len_train_negative
-			print "number of overlapped PPIs:"
-			print len_over_positive + len_over_negative
+			print("number of train and evaluation PPIs:")
+			print(len_train_positive + len_train_negative)
+			print("number of overlapped PPIs:")
+			print(len_over_positive + len_over_negative)

 			out_folds.append((training, evaluation))

@@ -346,12 +346,12 @@ def n_fols_split(self, num_folds, number_pruning, overlap="False"):
 	#		len_over_negative = len(train[1] & evaluate[1])
 	#
 	#
-	#		print len_train_positive
-	#		print len_eva_positive
-	#		print len_over_positive
-	#		print len_train_negative
-	#		print len_eva_negative
-	#		print len_over_negative
+	#		print(len_train_positive)
+	#		print(len_eva_positive)
+	#		print(len_over_positive)
+	#		print(len_train_negative)
+	#		print(len_eva_negative)
+	#		print(len_over_negative)
 	#		sys.exit()
 	#
 	#		out_folds.append((training, evaluation))
@@ -476,9 +476,9 @@ def split_into_n_fold2(self, n_fold, val_ppis, no_overlapp=False): # what is va
 			training_evaluation_dictionary["turpleKey"].append((training, evaluation))

-		print "the number of training negatives and positives for corss validation "
-		print len(training.get_negative())
-		print len(training.get_positive())
+		print("the number of training negatives and positives for corss validation ")
+		print(len(training.get_negative()))
+		print(len(training.get_positive()))

 		return training_evaluation_dictionary

@@ -747,7 +747,7 @@ def read_file(self, clusterF):
 			all_proteins_count += len(prots)
 		clusterFH.close()

-		print "Average size of predicted complexes is: " + str((all_proteins_count)/i)
+		print("Average size of predicted complexes is: " + str((all_proteins_count)/i))

 	def write_cuslter_file(self, outF):
 		outFH = open(outF, "w")
@@ -1044,9 +1044,9 @@ def get_complexes(self):
 #	go_to_prot_map = {}
 #	prot_to_go_map = {}
 #	quickgoURL = "http://www.ebi.ac.uk/QuickGO-Old/GAnnotation?goid=GO:0043234&tax=%s&format=tsv&limit=1000000000&evidence=IDA,IPI,EXP," % (self.taxid)
-#	print quickgoURL
-#	print self.taxid
-#	print "the url is: ..." 
+#	print(quickgoURL)
+#	print(self.taxid)
+#	print("the url is: ...")
 #	quickgoURL_FH = urllib2.urlopen(quickgoURL)
 #	quickgoURL_FH.readline()
 #	for line in quickgoURL_FH:
@@ -1094,7 +1094,7 @@ def __init__(self, taxid, inparanoid_cutoff=1, foundProts = set([])):
 			xmldoc = self.getXML()
 			self.orthmap, self.orthgroups = self.parseXML(xmldoc)
 		else:
-			print "Taxid:%s not supported" % taxid
+			print("Taxid:%s not supported" % taxid)

 	def mapProtein(self, prot):
 		if prot not in self.orthmap: return None
@@ -1127,7 +1127,7 @@ def mapComplexes(self, clusters):
 				if prot in self.orthmap:
 					mapped_members.add(self.orthmap[prot])
 #				else:
-#					print "No map for %s" % prot
+#					print("No map for %s" % prot)

 			if len(mapped_members)==0:
 				todel.add(clust)
@@ -1246,4 +1246,4 @@ def getids(ids_raw):
 			else:
 				outmap[protA] = protB
 		outgroups.append(orthgroup)
-	self.orthmap, self.orthgroups = outmap, outgroups
\ No newline at end of file
+	self.orthmap, self.orthgroups = outmap, outgroups
diff --git a/src/main.py b/src/main.py
index 77c0813..79b83ec 100644
--- a/src/main.py
+++ b/src/main.py
@@ -82,50 +82,50 @@ def main():
 	args.fun_anno_source = args.fun_anno_source.upper()

 	#Create feature combination
-	if args.feature_selection == "00000000": 
-		print "Select at least one feature"
+	if args.feature_selection == "00000000":
+		print("Select at least one feature")
 		sys.exit()

 	this_scores = utils.get_fs_comb(args.feature_selection)
-	print "\t".join([fs.name for fs in this_scores])
+	print("\t".join([fs.name for fs in this_scores]))

 	# Initialize CLF
-	use_rf = args.classifier == "RF" 
+	use_rf = args.classifier == "RF"
 	clf = CS.CLF_Wrapper(args.num_cores, use_rf)

 	# Load elution data
-	foundprots, elution_datas = utils.load_data(args.input_dir, this_scores, fc=args.frac_count, mfc=args.elution_max_count) 
+	foundprots, elution_datas = utils.load_data(args.input_dir, this_scores, fc=args.frac_count, mfc=args.elution_max_count)

 	# Generate reference data set
 	gs = ""
 	if ((args.taxid != "" and args.ppi != "") or (args.cluster != "" and args.ppi != "" )):
-		print "Refernce from cluster and PPI are nor compatiple. Please supply ppi or complex reference, not both!"
+		print("Refernce from cluster and PPI are nor compatiple. Please supply ppi or complex reference, not both!")
 		sys.exit()

 	if args.taxid == "" and args.ppi == "" and args.cluster == "":
-		print "Please supply a reference by setting taxid, cluster, or ppi tag"
+		print("Please supply a reference by setting taxid, cluster, or ppi tag")
 		sys.exit()

 	gs_clusters = []
 	if (args.taxid != "" and args.cluster == "" and args.ppi == ""):
-		print "Loading clusters from GO, CORUM, and Intact"
+		print("Loading clusters from GO, CORUM, and Intact")
 		gs_clusters.extend(utils.get_reference_from_net(args.taxid))

 	if args.cluster != "":
-		print "Loading complexes from file"
+		print("Loading complexes from file")
 		if args.mode == "FA":
 			gs_clusters.append(GS.FileClusters(args.cluster, "all"))
 		else:
 			gs_clusters.append(GS.FileClusters(args.cluster, foundprots))

 	if args.ppi != "":
-		print "Reading PPI file from %s" % args.reference
+		print("Reading PPI file from %s" % args.reference)
 		gs = Goldstandard_from_PPI_File(args.ppi, foundprots)

-	print gs_clusters
-	if len(gs_clusters)>0: 
+	print(gs_clusters)
+	if len(gs_clusters)>0:
 		gs = utils.create_goldstandard(gs_clusters, args.taxid, foundprots)

 	output_dir = args.output_dir + os.sep + args.output_prefix
@@ -139,26 +139,26 @@ def main():
 	if args.precalcualted_score_file == "NONE":
 		scoreCalc.calculate_coelutionDatas(gs)
 	else:
-		scoreCalc.readTable(args.precalcualted_score_file, gs) 
+		scoreCalc.readTable(args.precalcualted_score_file, gs)

-	print scoreCalc.scores.shape
+	print(scoreCalc.scores.shape)

 	functionalData = ""
 	gs.positive = set(gs.positive & set(scoreCalc.ppiToIndex.keys()))
 	gs.negative = set(gs.negative & set(scoreCalc.ppiToIndex.keys()))
 	gs.rebalance()

-	print len(gs.positive)
-	print len(gs.negative)
+	print(len(gs.positive))
+	print(len(gs.negative))

 	if args.mode != "EXP":
-		print "Loading functional data"
+		print("Loading functional data")
 		functionalData = utils.get_FA_data(args.fun_anno_source, args.taxid, args.fun_anno_file)
-		print "Dimension of fun anno " + str(functionalData.scores.shape)
+		print("Dimension of fun anno " + str(functionalData.scores.shape))

-	print "Start benchmarking"
+	print("Start benchmarking")

 	if args.mode == "EXP":
 		utils.cv_bench_clf(scoreCalc, clf, gs, output_dir, format="pdf", verbose=True, folds = 5)
@@ -172,7 +172,7 @@ def main():
 		utils.cv_bench_clf(functionalData, clf, gs, output_dir, format="pdf", verbose=True, folds= 5)

 	# PPI evaluation
-	print utils.cv_bench_clf(scoreCalc, clf, gs, args.output_dir, verbose=False, format="pdf", folds=5)
+	print(utils.cv_bench_clf(scoreCalc, clf, gs, args.output_dir, verbose=False, format="pdf", folds=5))
 	#print "I am here"

 	network = utils.make_predictions(scoreCalc, args.mode, clf, gs, fun_anno=functionalData)
@@ -197,14 +197,14 @@ def main():
 	pred_clusters = GS.Clusters(False)
 	pred_clusters.read_file("%s.clust.txt" % (output_dir))
 	overlapped_complexes_with_reference = gs.get_complexes().get_overlapped_complexes_set(pred_clusters)
-	print "# of complexes in reference dataset: " + str(len(overlapped_complexes_with_reference))
+	print("# of complexes in reference dataset: " + str(len(overlapped_complexes_with_reference)))
 	#clust_scores, header = utils.clustering_evaluation(gs.complexes, pred_clusters, "", False)
 	clust_scores, header, composite_score = utils.clustering_evaluation(gs.complexes, pred_clusters, "", False)
 	outFH = open("%s.eval.txt" % (output_dir), "w")
 	header = header.split("\t")
 	clust_scores = clust_scores.split("\t")
 	for i, head in enumerate(header):
-		print "%s\t%s" % (head, clust_scores[i])
+		print("%s\t%s" % (head, clust_scores[i]))
 		print >> outFH, "%s\t%s" % (head, clust_scores[i])
 	outFH.close()

@@ -214,4 +214,4 @@ def main():
 	except KeyboardInterrupt:
 		pass

-#11000100 (MI, Bayes, PCC+N)
\ No newline at end of file
+#11000100 (MI, Bayes, PCC+N)
diff --git a/src/utils.py b/src/utils.py
index 9851e50..8b27f0f 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -114,7 +114,7 @@ def cv_bench_clf(scoreCalc, clf, gs, outDir, verbose=False, format="pdf", folds
 	#if verbose:
 	#	val_scores = [precision, recall, fmeasure, auc_pr, auc_roc]
 	#	for i in range(len(rownames)):
-	#		print str(rownames[i]) + "\t" + str(val_scores[i])
+	#		print(str(rownames[i]) + "\t" + str(val_scores[i]))
 	return rownames, [precision, recall, fmeasure, auc_pr, auc_roc]

 def bench_clf(scoreCalc, train, eval, clf, outDir, verbose=False, format = "pdf"):
@@ -133,8 +133,8 @@ def bench_clf(scoreCalc, train, eval, clf, outDir, verbose=False, format = "pdf"
 		rownames = ["Precision", "Recall", "F-Measure", "AUC PR", "AUC ROC"]
 		val_scores = [precision, recall, fmeasure, auc_pr, auc_roc]
 		for i in range(len(rownames)):
-			print rownames[i]
-			print val_scores[i]
+			print(rownames[i])
+			print(val_scores[i])

@@ -147,8 +147,8 @@ def make_predictions_cross_validation(scoreCalc, train, eval, clf):
 	eval_names, data_eval, targets_eval = scoreCalc.toSklearnData(eval)
 	if len(eval_names) == 0: return networkDic
-	print "To pred"
-	print data_eval.shape
+	print("To pred")
+	print(data_eval.shape)
 	tmp_clf = copy.deepcopy(clf)
 	tmp_clf.fit(data_train, targets_train)
@@ -200,7 +200,7 @@ def getPredictions(scores, edges, clf):
 		pred_class = clf.predict(scores)
 		for i, prediction in enumerate(pred_class):
 			if prediction == 1:
-				out.append("%s\t%f" % (edges[i], pred_prob[i])) #Alternative code that also print label:out.append("%s\t%f\t%i" % (edges[i], pred_prob[i], prediction))
+				out.append("%s\t%f" % (edges[i], pred_prob[i])) #Alternative code that also prints label: out.append("%s\t%f\t%i" % (edges[i], pred_prob[i], prediction))
 		return out

 	out = []
@@ -209,14 +209,14 @@ def getPredictions(scores, edges, clf):
 	k = 0
 	chunk_num=1
 	scoreCalc.open()
-	print "to predict: %i" % scoreCalc.to_predict
+	print("to predict: %i" % scoreCalc.to_predict)
 	for line in range(scoreCalc.to_predict):
 		if k % 100000==0 and k != 0:
 			out.extend(getPredictions(tmpscores[0:k, :], edges[0:k], clf))
 			tmpscores = np.zeros((100000, num_features))
 			edges = [""] * 100000
 			if verbose:
-				print "Completed chunk %i" % chunk_num
+				print("Completed chunk %i" % chunk_num)
 			chunk_num += 1
 			k = 0
 		edge, edge_scores = scoreCalc.get_next()
@@ -244,7 +244,7 @@ def get_FA_data(anno_source, taxid, file="", datadir = ""):

 	elif anno_source == "FILE":
 		if file == "":
-			print "When using FILE tag please suppy path to file containing functional annotation using -F file+path"
+			print("When using FILE tag please suppy path to file containing functional annotation using -F file+path")
 			sys.exit()
 		# the supplied functional evidence data needs to have the correct header row...
 		externaldata = CS.ExternalEvidence(file)
@@ -252,7 +252,7 @@ def get_FA_data(anno_source, taxid, file="", datadir = ""):
 		functionalData = externaldata.getScoreCalc()

 	else:
-		print "EPIC only support GeneMane, STRING, and flat file input please use the followign tags for anno_source GM, STRING, FILE. Returning empty string object."
+		print("EPIC only support GeneMane, STRING, and flat file input please use the followign tags for anno_source GM, STRING, FILE. Returning empty string object.")
 	return functionalData

 def make_predictions(score_calc, mode, clf, gs, fun_anno="", verbose = False):
@@ -273,21 +273,21 @@ def get_edges_from_network(network):
 	if mode == "FA"or mode == "BR":
 		if fun_anno=="":
 			# TODO make illigal argument error
-			print "if using only functional annotation for prediction functional annotation (fun_anno param != "") must not be empty"
+			print("if using only functional annotation for prediction functional annotation (fun_anno param != "") must not be empty")
 			sys.exit()
 		networks.append(predictInteractions(fun_anno, clf, gs, True, verbose))

 	#predict using both functional annotation and exp
 	if mode == "COMB" or mode == "BR":
 		tmp_score_calc = copy.deepcopy(score_calc)
-		print tmp_score_calc.scores.shape
+		print(tmp_score_calc.scores.shape)
 		tmp_score_calc.add_fun_anno(fun_anno)
-		print tmp_score_calc.scores.shape
+		print(tmp_score_calc.scores.shape)
 		networks.append(predictInteractions(tmp_score_calc, clf, gs, True, verbose))

 	# return error when no networks is predicted
 	if len(networks) == 0:
-		print "Error no networks predicted"
+		print("Error no networks predicted")
 		sys.exit()
 	# return finised network when only one network is predicted, which happens in any mode expect final
 	elif len(networks) ==1:
@@ -382,14 +382,14 @@ def clustering_evaluation(eval_comp, pred_comp, prefix, verbose= True):
 		tmp_scores = cluster_scores.split("\t")
 		#composite_score = 0
 		for i in range(len(tmp_head)):
-			print "%s\t%s" % (tmp_head[i], tmp_scores[i])
+			print("%s\t%s" % (tmp_head[i], tmp_scores[i]))

 			# add composite score output.
 			# added by Lucas HU, a trial function.
 			if tmp_head[i] == "mmr" or tmp_head[i] == "overlapp" or tmp_head[i] == "accuracy":
 				composite_score = composite_score + float(tmp_scores[i])

-		print "composite score is: " + str(composite_score)
+		print("composite score is: " + str(composite_score))

 	return cluster_scores, head, composite_score

@@ -556,14 +556,14 @@ def stability_evaluation(n_fold, all_gs, scoreCalc, clf, output_dir, mode, anno_

 		train, eval = tmp_train_eval_container[index]

-		print "All comp:%i" % len(all_gs.complexes.complexes)
-		print "Train comp:%i" % len(train.complexes.complexes)
-		print "Eval comp:%i" % len(eval.complexes.complexes)
+		print("All comp:%i" % len(all_gs.complexes.complexes))
+		print("Train comp:%i" % len(train.complexes.complexes))
+		print("Eval comp:%i" % len(eval.complexes.complexes))

-		print "Num valid ppis in training pos: %i" % len(train.positive)
-		print "Num valid ppis in training neg: %i" % len(train.negative)
-		print "Num valid ppis in eval pos: %i" % len(eval.positive)
-		print "Num valid ppis in eval neg: %i" % len(eval.negative)
+		print("Num valid ppis in training pos: %i" % len(train.positive))
+		print("Num valid ppis in training neg: %i" % len(train.negative))
+		print("Num valid ppis in eval pos: %i" % len(eval.positive))
+		print("Num valid ppis in eval neg: %i" % len(eval.negative))

 		# Evaluate classifier
 		bench_clf(scoreCalc, train, eval, clf, output_dir, verbose=True)

@@ -571,9 +571,9 @@ def stability_evaluation(n_fold, all_gs, scoreCalc, clf, output_dir, mode, anno_
 		functionalData = ""
 		if mode != "exp":
 			functionalData = get_FA_data(anno_source, taxid, anno_F)
-			print functionalData.scores.shape
+			print(functionalData.scores.shape)

-		print "the functional evidence data shape is: "
+		print("the functional evidence data shape is: ")

 		# Predict protein interaction based on n_fold cross validation
@@ -595,7 +595,7 @@
 		complexes_dict_for_each_fold[index] = pred_clusters

-		print "fold " + str(index+1) + "is done"
+		print("fold " + str(index+1) + "is done")

 	#create a matrix for storing overlapped matrix, each element in the matrix is a zero.
 	overlapped_ratio_matrix_PPIs = np.zeros((n_fold,n_fold))
@@ -614,8 +614,8 @@ def stability_evaluation(n_fold, all_gs, scoreCalc, clf, output_dir, mode, anno_
 			overlapped_ratio_matrix_complexes[i,j] = averaged_overlapped_complexes_no / ((len(complexes_dict_for_each_fold[i].get_complexes()) + len(complexes_dict_for_each_fold[j].get_complexes())) / 2)

-	print overlapped_ratio_matrix_PPIs
-	print overlapped_ratio_matrix_complexes
+	print(overlapped_ratio_matrix_PPIs)
+	print(overlapped_ratio_matrix_complexes)

 	# create the txt file to save the overlap matrix for stabilit testing.
 	filename1 = output_dir + " n_fold_corss_validation_PPIs overlap matrix.txt"
@@ -656,4 +656,4 @@ def Goldstandard_from_cluster_File(gsF, foundprots=""):
 	gs = GS.Goldstandard_from_Complexes("All")
 	gs.complexes = clusters
 	gs.make_pos_neg_ppis()
-	return gs
\ No newline at end of file
+	return gs
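
Note: one Python 2-only print remains after this patch -- the unchanged context
line in src/main.py's evaluation loop, 'print >> outFH, "%s\t%s" % (head,
clust_scores[i])', which is a syntax error under Python 3. A minimal sketch of
the follow-up conversion, assuming the code should keep running under Python 2
as well (the repo still imports the py2-only urllib2 module elsewhere); the
file name and values below are hypothetical stand-ins for main.py's variables:

    from __future__ import print_function  # enables print(..., file=...) on Python 2

    # Hypothetical stand-ins for header, clust_scores, and outFH in main.py.
    header = ["mmr", "accuracy"]
    clust_scores = ["0.42", "0.87"]

    outFH = open("example.eval.txt", "w")
    for i, head in enumerate(header):
        print("%s\t%s" % (head, clust_scores[i]))              # console copy, as converted above
        print("%s\t%s" % (head, clust_scores[i]), file=outFH)  # replaces: print >> outFH, ...
    outFH.close()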
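
A related pitfall in a line this patch touches: the "%.2f" ratio in
loadElutionData (src/CalculateCoElutionScores.py), removed/(removed +
len(prot2Index)), divides two integers, so under Python 2 it floors to 0 and
the message always reports 0.00. A small self-contained illustration with
hypothetical counts:

    from __future__ import division  # makes / true division on Python 2, matching Python 3

    # Hypothetical counts standing in for loadElutionData's bookkeeping.
    removed = 3
    prot2Index = {"P1": 0, "P2": 1, "P3": 2, "P4": 3}

    # Without the __future__ import, Python 2 evaluates 3 / 7 as 0 and prints 0.00;
    # with it (or under Python 3) this prints 0.43.
    print("%.2f" % (removed / (removed + len(prot2Index))))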