From d0e449d122b18e64a5c14ec3b7f93cc6c6984f8e Mon Sep 17 00:00:00 2001
From: Paul Emsley
Date: Tue, 15 Nov 2022 03:38:16 +0000
Subject: [PATCH 1/2] Make a copy and rework compare_models so that it works
 with the ligands-of-the-week infrastructure

---
 .../compare_models_for_ligands_of_the_week.py | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 compare-models/compare_models_for_ligands_of_the_week.py

diff --git a/compare-models/compare_models_for_ligands_of_the_week.py b/compare-models/compare_models_for_ligands_of_the_week.py
new file mode 100644
index 0000000..58628ce
--- /dev/null
+++ b/compare-models/compare_models_for_ligands_of_the_week.py
@@ -0,0 +1,205 @@
+
+import argparse
+import datetime
+import gemmi
+import os
+import sys
+import math
+import numpy
+import json
+from scipy.stats import pearsonr
+
+def print_to_file(text,fout,mode='a'):
+    stdout = sys.stdout
+    with open(fout,mode) as sys.stdout:
+        print(text)
+        sys.stdout.flush()
+    sys.stdout = stdout
+    return
+
+def get_rscc(map_o,map_c,pos,mask_radius):
+    print("---- get_rsccs -start ---")
+    mask = gemmi.FloatGrid(numpy.array(map_o, copy=False),map_o.unit_cell,map_o.spacegroup)
+    mask.fill(0)
+    mask.set_points_around(pos, radius=mask_radius, value=1)
+    #mask.symmetrize_max()
+    maskarray = numpy.array(mask, copy=False)
+    points = numpy.argwhere(maskarray == 1)
+    print("points.size", points.size)
+    if points.size < 4:
+        return None
+    list_obs = []
+    list_calc = []
+    list_diff = []
+    for point in points:
+        obs = map_o.get_point(point[0],point[1],point[2]).value
+        calc = map_c.get_point(point[0],point[1],point[2]).value
+        list_obs.append(obs)
+        list_calc.append(calc)
+        list_diff.append(obs-calc)
+    n_points = len(list_obs)
+    sum_obs = round(sum(list_obs), 2)
+    sum_calc = round(sum(list_calc),2)
+    sum_diff = round(sum(list_diff),2)
+    sum_absdiff = round(sum(abs(el) for el in list_diff),2)
+    rmsd_obs_calc = round(math.sqrt(sum(el**2 for el in list_diff)/n_points),2)
+    cor_obs_calc,cor_obs_calc_p = pearsonr(list_obs,list_calc)
+    return cor_obs_calc
+
+def modelComparison(model1, model2, mtzin, ligand_comp_id, opt):
+
+    if model1 is None:
+        print("modelComparison: model1 is None")
+        return
+
+    if model2 is None:
+        print("modelComparison: model2 is None")
+        return
+
+    print("debug:: in modelComparison() model1", model1, "model2", model2)
+
+    try:
+        if 'sample_rate' not in opt:
+            opt['sample_rate'] = 1.0
+        if 'mask_radius' not in opt:
+            opt['mask_radius'] = 1.5
+
+        print("Reading model file:",model1)
+        st1 = gemmi.read_structure(model1)
+        print("Reading model file:",model2)
+        st2 = gemmi.read_structure(model2)
+        print("Reading MTZ file:",mtzin)
+        mtz = gemmi.read_mtz_file(mtzin)
+
+        print("Converting FWT/PHWT to map")
+        map_obs = mtz.transform_f_phi_to_map('FWT','PHWT', sample_rate=opt['sample_rate'])
+        print("Converting FC_ALL/PHIC_ALL to map")
+        map_calc = mtz.transform_f_phi_to_map('FC_ALL','PHIC_ALL', sample_rate=opt['sample_rate'])
+
+        print("Comparing equivalent atoms in all non-polymer entities")
+        result = []
+        for model in st1:
+            for chain in model:
+                for residue in chain:
+                    if residue.name != ligand_comp_id:
+                        if ligand_comp_id:
+                            continue
+                    print(" Found", residue, residue.entity_type)
+                    if residue.entity_type == gemmi.EntityType.NonPolymer:
+                        for atom in residue:
+                            if atom.element.name == "H":
+                                continue
+                            addr = gemmi.make_address(chain, residue, atom)
+                            cra1 = model.find_cra(addr)
+                            cra2 = st2[0].find_cra(addr) # This always uses the first model in st2 - probably worth checking that both st1 and st2 only have one model, otherwise fail.
+                            print("---\ncra1: ", cra1, "cra2: ", cra2)
+
+                            if cra1 and cra2:
+                                print("cra1 pos:", cra1.atom.pos)
+                                print("cra2 pos:", cra2.atom.pos)
+                                peak1 = map_obs.interpolate_value(cra1.atom.pos)
+                                peak2 = map_obs.interpolate_value(cra2.atom.pos)
+                                rscc1 = get_rscc(map_obs,map_calc,cra1.atom.pos,opt['mask_radius'])
+                                rscc2 = get_rscc(map_obs,map_calc,cra2.atom.pos,opt['mask_radius'])
+
+                                if rscc1 is None: continue
+                                if rscc2 is None: continue
+
+                                result.append({'chain':chain.name,
+                                               'residue':str(residue.seqid.num) + (residue.seqid.icode if residue.seqid.icode != ' ' else ''),
+                                               'tlc':residue.name,
+                                               'atom':atom.name,
+                                               'dist':cra1.atom.pos.dist(cra2.atom.pos),
+                                               'b1':cra1.atom.b_iso,
+                                               'b2':cra2.atom.b_iso,
+                                               'peak1':peak1,
+                                               'peak2':peak2,
+                                               'rscc1':rscc1,
+                                               'rscc2':rscc2})
+
+        print("Processed",len(result),"atoms")
+        return result
+
+    # except Exception as e:
+    except FloatingPointError as e: # I want to see the backtrace
+        print("Error: %s" % e)
+        print("Cannot continue - program terminated.")
+        sys.exit(1)
+
+def printModelComparison(analysis,output_file):
+    try:
+        output = '\t'.join(["chain","residue","TLC","atom","dist","b1","b2","peak1","peak2","rscc1","rscc2"])
+        for atom in analysis:
+            output = output + '\n' + '\t'.join([atom['chain'],
+                                                atom['residue'],
+                                                atom['tlc'],
+                                                atom['atom'],
+                                                str(round(atom['dist'],2)),
+                                                str(round(atom['b1'],2)),
+                                                str(round(atom['b2'],2)),
+                                                str(round(atom['peak1'],2)),
+                                                str(round(atom['peak2'],2)),
+                                                str(round(atom['rscc1'],2)),
+                                                str(round(atom['rscc2'],2))])
+
+        print("Writing output to file:",output_file)
+        print_to_file(output,output_file,'w')
+
+    except Exception as e:
+        print("Error: %s" % e)
+        print("Cannot continue - program terminated.")
+        sys.exit(1)
+
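+# Note: proc_using_code() below assumes the directory layout produced by
+# proc-lig.py (second patch in this series): for an accession code such as
+# 7s3x it looks for
+#
+#     <ligand-analysis-base>/refmac/s3/7S3X.info.json
+#
+# where "s3" is the middle two characters of the code, and that JSON file
+# supplies the model, refmac output, MTZ and dictionary paths used below.
+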
+def proc_using_code(accession_code, ligand_analysis_base):
+
+    two_code = accession_code[1:3]
+    upcase_code = accession_code.upper()
+    refmac_dir = os.path.join(ligand_analysis_base, 'refmac')
+    sub_dir = os.path.join(refmac_dir, two_code)
+    info_file = os.path.join(sub_dir, upcase_code + ".info.json")
+    print("debug:: in proc_using_code() accession_code", accession_code, "info_file", info_file)
+    if os.path.exists(info_file):
+        print("INFO file", info_file)
+        with open(info_file, "r") as info_file_data:
+            info = json.load(info_file_data)
+            refmac_out_fn = info['refmac_pdb_file_name']
+            ligand_comp_id = info['ligand-code']
+            if os.path.exists(refmac_out_fn):
+                pdb_file_name = info['pdb_file_name']
+                mtz = info['refmac_mtz_file_name']
+                dict = info['acedrg_cif_file_name']
+                opt = {}
+                result = modelComparison(pdb_file_name, refmac_out_fn, mtz, ligand_comp_id, opt)
+                results_output_file_name = "results.out"
+                printModelComparison(result, results_output_file_name)
+            else:
+                print('Missing refmac pdb output file "{}"'.format(refmac_out_fn))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--f1', dest='model1', help="First model (PDB or mmCIF format)")
+    parser.add_argument('--f2', dest='model2', help="Second model (PDB or mmCIF format)")
+    parser.add_argument('--mtz', dest='mtzin', help="MTZ file containing phases (columns required: FWT, PHWT, FC_ALL, PHIC_ALL). At the moment just one MTZ is used for model-map analysis... in future this should be extended to a second.")
+    parser.add_argument('-o','--output', dest='output', required=False, help="Output file containing results of comparative analysis", default='analysis.txt')
+    parser.add_argument('-s','--samplerate', dest='samplerate', required=False, help="Map sampling rate", default=1.0, type=float)
+    parser.add_argument('-r','--maskradius', dest='maskradius', required=False, help="Mask radius around atomic position", default=1.5, type=float)
+    parser.add_argument("--code", dest='accession_code')
+    parser.add_argument('--ligand-analysis-base', dest='ligand_analysis_base')
+    args = parser.parse_args()
+
+    opt = {}
+    if args.samplerate:
+        opt['sample_rate'] = args.samplerate
+    if args.maskradius:
+        opt['mask_radius'] = args.maskradius
+    if args.accession_code:
+        print("INFO:: Compare models using this accession code", args.accession_code, "with base", args.ligand_analysis_base)
+        proc_using_code(args.accession_code, args.ligand_analysis_base)
+    else:
+        time_start = datetime.datetime.now()
+        ligand_code = "" # special value, means "all residue types"
+        result = modelComparison(args.model1,args.model2,args.mtzin,ligand_code,opt)
+        printModelComparison(result,args.output)
+        time_end = datetime.datetime.now()
+        print("Job completed ("+str(round((time_end-time_start).total_seconds(),3))+" seconds)")
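
Typical invocations (illustrative only - the model and MTZ file names below are placeholders, the flags and column names come from the script above; the second command is the one used in the notes of the next patch):

    # compare two models of the same entry against one set of phases
    python3 compare_models_for_ligands_of_the_week.py --f1 input.pdb --f2 refined.pdb --mtz refined.mtz -o analysis.txt

    # drive the comparison from a ligands-of-the-week info.json
    python3 compare_models_for_ligands_of_the_week.py --code 7s3x --ligand-analysis-base .
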
From 037cc122a13a76ae80c361ed4d6b67525f9be822 Mon Sep 17 00:00:00 2001
From: Paul Emsley
Date: Wed, 16 Nov 2022 03:04:06 +0000
Subject: [PATCH 2/2] Add scripts for ligands-of-the-week analysis

---
 compare-models/compare-ligands.py     | 100 +++++++
 compare-models/ligand-hist.r          |  37 +++
 compare-models/multi-robs-analysis.py |  39 +++
 compare-models/notes                  |  40 +++
 compare-models/pool-sub-1.py          |  70 +++++
 compare-models/proc-lig.py            | 363 ++++++++++++++++++++++++++
 compare-models/r-correl.r             |  15 ++
 compare-models/refmac-log-analysis.py |  27 ++
 8 files changed, 691 insertions(+)
 create mode 100644 compare-models/compare-ligands.py
 create mode 100644 compare-models/ligand-hist.r
 create mode 100644 compare-models/multi-robs-analysis.py
 create mode 100644 compare-models/notes
 create mode 100644 compare-models/pool-sub-1.py
 create mode 100644 compare-models/proc-lig.py
 create mode 100644 compare-models/r-correl.r
 create mode 100644 compare-models/refmac-log-analysis.py

diff --git a/compare-models/compare-ligands.py b/compare-models/compare-ligands.py
new file mode 100644
index 0000000..2177249
--- /dev/null
+++ b/compare-models/compare-ligands.py
@@ -0,0 +1,100 @@
+import sys
+import os
+import json
+import gemmi
+import glob
+
+def get_ligand(ligand_tlc: str, pdb_file_name: str):
+    st = gemmi.read_pdb(pdb_file_name)
+    for model in st:
+        for chain in model:
+            for residue in chain:
+                if residue.name == ligand_tlc:
+                    return residue
+    return False
+
+def get_reso(pdb_file_name):
+    st = gemmi.read_pdb(pdb_file_name)
+    return st.resolution
+
+def compare_ligands(pdb_file_name: str, refmac_pdb_file_name: str, accession_code: str, ligand_tlc: str, acedrg_cif_file_name: str):
+
+    if not os.path.exists(pdb_file_name):
+        print("Bad PDB file name", pdb_file_name)
+    if not os.path.exists(refmac_pdb_file_name):
+        print("Bad refmac file name", refmac_pdb_file_name)
+    if not os.path.exists(acedrg_cif_file_name):
+        print("bad acedrg cif file name", acedrg_cif_file_name)
+    if len(ligand_tlc) > 3:
+        print("bad ligand tlc", ligand_tlc)
+    if len(ligand_tlc) == 0:
+        print("bad tlc")
+
+    lig_1 = get_ligand(ligand_tlc, pdb_file_name)
+    lig_2 = get_ligand(ligand_tlc, refmac_pdb_file_name)
+    reso = get_reso(pdb_file_name)
+
+    for atom_1, atom_2 in zip(lig_1, lig_2):
+        pos_1 = atom_1.pos
+        pos_2 = atom_2.pos
+        d = pos_1.dist(pos_2)
+        print("delta:", ligand_tlc, accession_code, str(reso), atom_1.name, d)
+        if d > 1.0:
+            print(refmac_pdb_file_name)
+
+    return False
+
+def multi_compare_ligands(refmac_dir):
+    for d in glob.glob(os.path.join(refmac_dir, "*")):
+        for info_file in glob.glob(os.path.join(d, "*.info.json")):
+            print(info_file)
+            with open(info_file, "r") as info_file_data:
+                info = json.load(info_file_data)
+                print(info)
+                refmac_out_fn = info['refmac_pdb_file_name']
+                # refmac_out_fn may not be there because refmac didn't properly complete
+                if os.path.exists(refmac_out_fn):
+                    compare_ligands(info['pdb_file_name'], info['refmac_pdb_file_name'], info['accession_code'], info['ligand-code'], info['acedrg_cif_file_name'])
+                else:
+                    print("missing refmac-out", refmac_out_fn, "for ligand-code", info['ligand-code'])
+
+def compare_ligands_in_table_file(this_weeks_ligands_table_file_name):
+    with open(this_weeks_ligands_table_file_name) as f:
+        lines = f.readlines()
+        for line in lines:
+            parts = line.split()
+            if parts:
+                shell_script_file_name = parts[0]
+                refmac_log_file_name = parts[1]
+                fn = os.path.basename(refmac_log_file_name)
+                dir_name = os.path.dirname(refmac_log_file_name)
+                parts = fn.split("-")
+                p = parts[1]
+                parts_2 = p.split(".")
+                code = parts_2[0]
+                json_file_name = code.upper() + ".info.json"
+                info_file = os.path.join(dir_name, json_file_name)
+                print(info_file)
+                with open(info_file, "r") as info_file_data:
+                    info = json.load(info_file_data)
+                    print(info)
+                    refmac_out_fn = info['refmac_pdb_file_name']
+                    # refmac_out_fn may not be there because refmac didn't properly complete
+                    if os.path.exists(refmac_out_fn):
+                        compare_ligands(info['pdb_file_name'], info['refmac_pdb_file_name'], info['accession_code'], info['ligand-code'], info['acedrg_cif_file_name'])
+                    else:
+                        print("missing refmac-out", refmac_out_fn, "for ligand-code", info['ligand-code'])
+
+                # compare_ligands(info['pdb_file_name'], info['refmac_pdb_file_name'], info['accession_code'], info['ligand-code'], info['acedrg_cif_file_name'])
+
+
+
+if __name__ == "__main__":
+    refmac_dir = "refmac"
+    dated_refmac_logs_table_list = []
+    for arg in sys.argv[1:]:
+        print(arg)
+        table_file = arg
+        compare_ligands_in_table_file(table_file)
+
+    # multi_compare_ligands(refmac_dir)
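
One thing to keep in mind about compare_ligands(): it pairs atoms with zip(), which assumes that the input model and the refmac output list the ligand atoms in exactly the same order. If that ever stops being true, pairing by atom name is safer. A minimal sketch of that idea (compare_ligands_by_name is a hypothetical helper, not part of this patch; it uses only gemmi calls already used above):

    import gemmi

    def compare_ligands_by_name(ligand_tlc, pdb_1, pdb_2):
        # Collect the first matching residue's atoms, keyed by atom name.
        def atoms_by_name(pdb_file_name):
            st = gemmi.read_pdb(pdb_file_name)
            for model in st:
                for chain in model:
                    for residue in chain:
                        if residue.name == ligand_tlc:
                            return {atom.name: atom for atom in residue}
            return {}

        atoms_1 = atoms_by_name(pdb_1)
        atoms_2 = atoms_by_name(pdb_2)
        # Pair atoms by name rather than by file order.
        for name, atom_1 in atoms_1.items():
            atom_2 = atoms_2.get(name)
            if atom_2 is not None:
                print("delta:", ligand_tlc, name, atom_1.pos.dist(atom_2.pos))
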
diff --git a/compare-models/ligand-hist.r b/compare-models/ligand-hist.r
new file mode 100644
index 0000000..071ab66
--- /dev/null
+++ b/compare-models/ligand-hist.r
@@ -0,0 +1,37 @@
+
+st=format(Sys.time(), '%d-%m-%Y')
+pdf(paste("acedrg-ligands_",st, ".pdf", sep = ""))
+x=rnorm(100,40,3)
+y=rnorm(100,100,5)
+
+par(mfrow=c(3,3), cex=0.5)
+
+a = read.table("15-11-refmacs-deltas.table")
+
+# hist(a$V6, breaks=30, col="lightblue", main="Overall Displacement Histogram",
+#      xlab="Displacement (A)")
+
+factors = factor(a$V3)
+levels = levels(factors)
+print(levels)
+
+for (l in levels) {
+    ligand_code = a$V2[a$V3==l][1]
+    reso = a$V4[a$V3==l][2]
+    mt = l
+    mt = paste(mt, ligand_code)
+    mt = paste(mt, reso)
+    mt = paste(mt,'\uc5')
+
+    hist(a$V6[a$V3==l], col="lightblue", main=mt,
+         xlim=c(0, 1.2), xlab="Displacement (A)")
+    correl_table_fn <- paste("correlation-tables/results-", tolower(l), sep="")
+    correl_table_fn <- paste(correl_table_fn, ".table", sep="")
+    print(correl_table_fn)
+    b <- read.table(correl_table_fn, header=TRUE)
+    plot(b$rscc1, b$rscc2, xlim=c(0.3, 1.0), ylim=c(0.3, 1.0), xlab="correl", ylab="correl", main=mt)
+    abline(a=0, b=1)
+    plot(b$dist, b$b2, xlab="dist", ylab="b2", main=mt)
+}
+
+dev.off()
diff --git a/compare-models/multi-robs-analysis.py b/compare-models/multi-robs-analysis.py
new file mode 100644
index 0000000..7bdd0aa
--- /dev/null
+++ b/compare-models/multi-robs-analysis.py
@@ -0,0 +1,39 @@
+import os
+import subprocess
+
+todays_refmacs_table = "15-11-refmacs.table"
+
+def log_name_to_code(log):
+    parts_1 = log.split(".")
+    pre = parts_1[0]
+    parts_2 = pre.split("-")
+    code = parts_2[1]
+    return code
+
+if os.path.exists("rob-proc"):
+    pass
+else:
+    os.mkdir("rob-proc")
+
+if os.path.exists("correlation-tables"):
+    pass
+else:
+    os.mkdir("correlation-tables")
+
+with open(todays_refmacs_table) as f:
+    lines = f.readlines()
+    for line in lines:
+        parts = line.split()
+        if not parts:
+            continue
+        log = parts[1]
+        accession_code = log_name_to_code(log)
+        print(log, accession_code)
+
+        rob_proc_log_file_name = os.path.join("rob-proc", accession_code + ".log")
+        args = ["python3", "robsrepo/Ligand-Tools/compare-models/compare_models_for_ligands_of_the_week.py",
+                '--code', accession_code, '--ligand-analysis-base', "."]
+        comp_proc = subprocess.run(args, capture_output=True)
+        with open(rob_proc_log_file_name, "bw") as f:
+            f.write(comp_proc.stdout)
+        if os.path.exists("results.out"):
+            new_results_out_name = os.path.join("correlation-tables", "results-" + accession_code + ".table")
+            os.rename("results.out", new_results_out_name)
diff --git a/compare-models/notes b/compare-models/notes
new file mode 100644
index 0000000..f28aec0
--- /dev/null
+++ b/compare-models/notes
@@ -0,0 +1,40 @@
+
+Notes:
+
+- python3 proc-lig.py  # download the new ligands, run acedrg, download the PDB entries, make MTZs and refmac scripts
+##- bash multisub-refmac.sh # run refmac for every script that doesn't have a log, creates "this_weeks_refmacs.table"
+##- [wait]
+- python3 pool-sub-1.py
+  the above doesn't make the refmacs table, so
+- edit recover-refmac-table.sh and put in today's date
+- run recover-refmac-table.sh to make the refmacs table for this week
+- edit refmac-log-analysis.py to set the refmac table for the analysis
+- python3 refmac-log-analysis.py  # find the log files that had an error in the refmacs table generated above
+- python3 compare-ligands.py 22_09_28_refmacs.table > 22_09_28_refmacs.log
+- grep ^delta: 22_09_28_refmacs.log > deltas.table  # finds how much the ligands have moved, makes deltas.table (an example line is shown after these notes)
+  # note: in the above, 22_09_28_refmacs.table is just an example - you can use any number of table files
+
+- Rscript ligand-hist.r  # make plots using deltas.table
+
+
+Further extensions with Coot or gemmi
+
+Coot:
+  - learn the Coot API
+  - depends on a Coot installation
+  - many functions available - most of the work is done
+
+Gemmi:
+  - learn the gemmi API
+  - gemmi is easily installed using pip3
+  - more portable
+  - fewer functions - most of the functions will need to be written from scratch
+
+
+----
+Mon 14 Nov 18:47:43 GMT 2022
+
+  Using Rob's script:
+
+python3 robsrepo/Ligand-Tools/compare-models/compare_models_for_ligands_of_the_week.py --code 7s3x --ligand-analysis-base .
+
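
For reference, each line that compare-ligands.py prints (and that the grep step above collects into deltas.table) has the form shown below; the values here are made up, only the column layout comes from the scripts:

    delta: XYZ 7s3x 1.8 C12 0.3421

i.e. V1 is the "delta:" tag, V2 the ligand three-letter code, V3 the accession code, V4 the resolution, V5 the atom name and V6 the displacement in Angstroms - V2, V3, V4 and V6 are the columns that ligand-hist.r reads.
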
diff --git a/compare-models/pool-sub-1.py b/compare-models/pool-sub-1.py
new file mode 100644
index 0000000..a0b3458
--- /dev/null
+++ b/compare-models/pool-sub-1.py
@@ -0,0 +1,70 @@
+
+from multiprocessing import Pool
+import subprocess
+import os
+import glob
+
+pool_size = 6
+
+def bash_the_script(script_file_name: str):
+    log_file_name = script_name_to_log_name(script_file_name)
+    print("### bash", script_file_name, log_file_name)
+    process = subprocess.run(["bash", script_file_name], check=True, stdout=subprocess.PIPE, text=True)
+    output = process.stdout
+    with open(log_file_name, "w") as f_out:
+        f_out.write(output)
+
+def script_name_to_log_name(script_file_name):
+    s = script_file_name[:-3] + ".log"
+    return s
+
+if __name__ == '__main__':
+
+    today = "22_10_24" # generate this
+    # per line we have the script file name and the log file name
+    this_weeks_refmacs_file_name = today + "_refmacs.table"
+
+    ls = []
+    # run refmac if there was not a log file for that script
+    refmac_subdirs = glob.glob("refmac/*")
+    for subdir in refmac_subdirs:
+        subdir_scripts = glob.glob(subdir + "/refmac-*.sh")
+        for script_file_name in subdir_scripts:
+            log_file_name = script_name_to_log_name(script_file_name)
+            if os.path.exists(log_file_name):
+                pass
+            else:
+                # print("run that bad boy", script_file_name)
+                ls.append(script_file_name)
+
+
+    with Pool(pool_size) as pool:
+        m = pool.map(bash_the_script, ls)
+
+    with open(this_weeks_refmacs_file_name, "w") as f_refmacs_table:
+        for scr in ls:
+            f_refmacs_table.write(scr)
+            f_refmacs_table.write(" ")
+            f_refmacs_table.write(script_name_to_log_name(scr))
+            f_refmacs_table.write("\n")
+
+# this is the shell script:
+#
+# today=$(date +%y_%m_%d)
+# this_weeks_refmacs=$today"_refmacs.table"
+# echo "" > $this_weeks_refmacs
+# for refmac_script in refmac/*/refmac*.sh ;
+# do
+#     stub="${refmac_script%.*}"
+#     log=$stub.log
+#     # force a rerun
+#     log=$stub.logx
+#     # echo $log
+#     if [ -e $log ] ; then
+#         :
+#     else
+#         echo run refmac script $refmac_script $log
+#         # bash $refmac_script > $log 2>&1 &
+#         # echo $refmac_script $log >> $this_weeks_refmacs
+#     fi
+# done
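
A note on pool-sub-1.py: bash_the_script() runs each refmac script with check=True, so one failing script raises CalledProcessError inside the worker and the exception is re-raised by pool.map(), stopping the whole batch. If that is not wanted, a variant along these lines (only a sketch, keeping the same script-to-log naming) records the failure and carries on:

    def bash_the_script(script_file_name: str):
        # Run the refmac script but do not raise on failure; record the
        # return code so one bad entry does not abort the whole pool.map().
        log_file_name = script_name_to_log_name(script_file_name)
        process = subprocess.run(["bash", script_file_name],
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        with open(log_file_name, "w") as f_out:
            f_out.write(process.stdout)
        if process.returncode != 0:
            print("refmac script failed:", script_file_name, "status", process.returncode)
        return process.returncode
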
diff --git a/compare-models/proc-lig.py b/compare-models/proc-lig.py
new file mode 100644
index 0000000..49aa9ea
--- /dev/null
+++ b/compare-models/proc-lig.py
@@ -0,0 +1,363 @@
+
+import os
+import json
+import urllib.request
+import time
+import subprocess
+
+def get_latest_ligands_json(json_file_name: str) -> bool:
+    # Thank you Sameer
+    s = "select?q=(q_document_type%3Alatest_chemistry)&fl=new_revised_ligand&rows=10000&omitHeader=true&wt=json"
+    s = "pdb/select?group=true&group.field=pdb_id&group.ngroups=true&facet.field=new_revised_ligand&facet=true&rows=0&facet.limit=100000&facet.mincount=1&json.nl=map&group.facet=true&q=(q_document_type%3Alatest_chemistry)&omitHeader=true&wt=json"
+    url = os.path.join("https://www.ebi.ac.uk/pdbe/search/", s)
+    urllib.request.urlretrieve(url, json_file_name)
+    # add a test that that overwrote the old one (if there was an old one)
+    success = True
+    return success
+
+def mkdir_maybe(dir: str, sub_dir: str) -> None:
+    dir_name = os.path.join(dir, sub_dir)
+    if os.path.exists(dir_name):
+        pass
+    else:
+        if os.path.exists(dir):
+            pass
+        else:
+            os.mkdir(dir)
+        os.mkdir(dir_name)
+
+def tlc_to_ccd_cif_file_name(cif_dir: str, tlc: str) -> str:
+    letter = tlc[0].upper()
+    tlc_local_file_name = tlc + ".cif"
+    fn = os.path.join(cif_dir, letter, tlc_local_file_name)
+    return fn
+
+def accession_code_to_pdb_file_name(code: str, pdb_dir: str) -> str:
+    sub_dir_local = code[1:3].lower()
+    pdb_fn_local = code.lower() + ".pdb"
+    sub_dir = os.path.join(pdb_dir, sub_dir_local)
+    fn = os.path.join(pdb_dir, sub_dir_local, pdb_fn_local)
+    if os.path.exists(pdb_dir):
+        pass
+    else:
+        os.mkdir(pdb_dir)
+    if os.path.exists(sub_dir):
+        pass
+    else:
+        os.mkdir(sub_dir)
+    return fn
+
+def fetch_ccd_cif_for(tlc: str, cif_dir: str) -> str:
+    fn = tlc_to_ccd_cif_file_name(cif_dir, tlc)
+    if os.path.exists(fn):
+        return fn
+    else:
+        url = os.path.join("https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/", tlc + ".cif")
+        letter = tlc[0].upper()
+        mkdir_maybe(cif_dir, letter)
+        urllib.request.urlretrieve(url, fn)
+        return fn
+
+def get_model_db_code(tlc_cif: str) -> str:
+    if os.path.exists(tlc_cif):
+        with open(tlc_cif, 'r') as f:
+            lines = f.readlines()
+            for line in lines:
+                if "_chem_comp.pdbx_model_coordinates_db_code" in line:
+                    parts = line.split()
+                    if len(parts) == 2:
+                        model_db_code = parts[1]
+                        if model_db_code == "?":
+                            return False
+                        return model_db_code
+        return False
+    else:
+        print("No file", tlc_cif)
+        return False
+
+def fetch_pdb_for_model_db_code(accession_code: str, pdb_dir: str) -> str:
+    # print("fetch_pdb_for_model_db_code", accession_code)
+    fn = accession_code_to_pdb_file_name(accession_code, pdb_dir)
+    if os.path.exists(fn):
+        return fn
+    else:
+        s = "https://www.ebi.ac.uk/pdbe/entry-files/download"
+        fn_server = "pdb" + accession_code.lower() + ".ent"
+        url = os.path.join(s, fn_server)
+        print("Fetching pdb", url, fn)
+        try:
+            # the PDB file might not (yet) exist on the server
+            fn_tmp = fn + ".partial_download"
+            urllib.request.urlretrieve(url, fn_tmp)
+            os.rename(fn_tmp, fn)
+            return fn
+        except urllib.error.HTTPError as e:
+            print(e, url)
+            with open("missing-models.table", "w") as f:
+                f.write(url + " " + fn)
+                f.write("\n")
+            return False
+    return False
+
+def ligand_has_only_organic_set_atoms(ccd_cif_file_name):
+
+    # the Right Way to do this is with GEMMI, but this simple
+    # hack will do now - so that I don't have to mess about
+    # with python and installing the gemmi module
+
+    def is_acceptable_element(ele):
+        goodies = ["H", "C", "N", "I", "CL", "BR", "F", "O", "P", "S", "B"]
+        return ele in goodies
+
+    with open(ccd_cif_file_name, 'r') as f:
+        lines = f.readlines()
+        atoms_block = False
+        for line in lines:
+            if "_chem_comp_atom.pdbx_ordinal" in line:
+                atoms_block = True
+            if "loop_" in line:
+                atoms_block = False
+            if atoms_block:
+                parts = line.split()
+                if len(parts) > 10:
+                    element = parts[3]
+                    if not is_acceptable_element(element):
+                        print("non-organic-set element: %s in %s" % (element, ccd_cif_file_name))
+                        return False
+    return True
+
+def run_acedrg(ccd_cif_file_name: str, ligand_tlc: str, acedrg_cif_root: str) -> bool:
+    args = ["acedrg", "-c", ccd_cif_file_name, "--coords", "-z", "-o", acedrg_cif_root]
+    print(" ", args)
+    comp_proc = subprocess.run(args, capture_output=True)
+    log_dir = "acedrg-logs"
+    if not os.path.exists(log_dir):
+        os.mkdir(log_dir)
+    log_file_name = os.path.join(log_dir, "acedrg-" + ligand_tlc + ".log")
+    with open(log_file_name, "bw") as f:
+        f.write(comp_proc.stdout)
+    return False
+
+def convert_to_mtz(accession_code: str, sub_dir: str, data_cif_file_name: str,
+                   data_mtz_file_name: str) -> bool:
+    args = ["cif2mtz", "HKLIN", data_cif_file_name, "HKLOUT", data_mtz_file_name]
+    s = b"END\n"
+    comp_proc = subprocess.run(args, capture_output=True, input=s)
+    log_file_name = os.path.join(sub_dir, "cif2mtz-" + accession_code + ".log")
+    with open(log_file_name, "bw") as f:
+        # print("writing cif2mtz log file: ", log_file_name)
+        f.write(comp_proc.stdout)
+    return comp_proc.returncode
+
+def fetch_and_make_mtz_data(accession_code: str, pdb_data_dir: str) -> str:
+    sub_dir_local = accession_code[1:3].lower()
+    data_cif_fn_local = accession_code.lower() + ".cif"
+    data_mtz_fn_local = accession_code.lower() + ".mtz"
+    data_cif_file_name = os.path.join(pdb_data_dir, sub_dir_local, data_cif_fn_local)
+    data_mtz_file_name = os.path.join(pdb_data_dir, sub_dir_local, data_mtz_fn_local)
+    data_cif_file_name_tmp = data_cif_file_name + ".partial_download"
+
+    # note to self: check here for EXPDTA in the PDB file. Don't try to download sfs
+    # for "ELECTRON MICROSCOPY" structures
+
+    if not os.path.exists(pdb_data_dir):
+        os.mkdir(pdb_data_dir)
+    if not os.path.exists(os.path.join(pdb_data_dir, sub_dir_local)):
+        os.mkdir(os.path.join(pdb_data_dir, sub_dir_local))
+    if os.path.exists(data_mtz_file_name):
+        return data_mtz_file_name
+    else:
+        sub_dir = os.path.join(pdb_data_dir, sub_dir_local)
+        server_dir = "https://www.ebi.ac.uk/pdbe/entry-files/download/"
+        sf_file_name_local = "r" + accession_code.lower() + "sf.ent"
+        url = os.path.join(server_dir, sf_file_name_local)
+        try:
+            result,headers = urllib.request.urlretrieve(url, data_cif_file_name_tmp)
+            if os.path.exists(result):
+                os.rename(data_cif_file_name_tmp, data_cif_file_name)
+                print("Download", data_cif_file_name, "created")
+                r = convert_to_mtz(accession_code, sub_dir, data_cif_file_name, data_mtz_file_name)
+                if r != 0:
+                    print("debug:: in fetch_and_make_mtz_data() with r:", r)
+                if r == 0: # shell program completed successfully
+                    return data_mtz_file_name
+        except urllib.error.HTTPError as e:
+            print("Error: in fetch_and_make_mtz_data() ", e, "for", url, data_mtz_file_name)
+    return False
+
+def write_refmac_shell_script(pdb_file_name: str, data_mtz_file_name: str, acedrg_cif_file_name: str,
+                              refmac_sub_dir: str, accession_code: str, ligand_tlc: str) -> bool:
+
+    print("making refmac script for:", pdb_file_name, data_mtz_file_name, acedrg_cif_file_name)
+    if data_mtz_file_name == False:
+        print("No data_mtz_file - nothing doing")
+        return False
+    if len(pdb_file_name) == 0:
+        return False
+    if len(data_mtz_file_name) == 0:
+        return False
+    if len(acedrg_cif_file_name) == 0:
+        return False
+    pdb_out = os.path.join(refmac_sub_dir, accession_code + "-refmac.pdb")
+    hkl_out = os.path.join(refmac_sub_dir, accession_code + "-refmac.mtz")
+    args = ["refmac5", "XYZIN", pdb_file_name, "HKLIN", data_mtz_file_name,
+            "XYZOUT", pdb_out, "HKLOUT", hkl_out, "LIBIN", acedrg_cif_file_name]
+    script_file_name_local = "refmac-" + accession_code.lower() + ".sh"
+    shell_script_file_name = os.path.join(refmac_sub_dir, script_file_name_local)
+    with open(shell_script_file_name, 'w') as f:
+        f.write("\n")
+        for arg in args:
+            f.write(arg)
+            f.write(" ")
+        f.write("<< !\n")
+        f.write("NCYCLES 5\n")
+        f.write("END\n")
+        f.write("!\n")
+
+    # now the info file: it's used for validation of the ligand
+    info_file = os.path.join(refmac_sub_dir, accession_code + ".infox")
+    with open(info_file, 'w') as f:
+        f.write("accession_code ")
+        f.write(accession_code)
+        f.write("\n")
+        f.write("ligand-code ")
+        f.write(ligand_tlc)
+        f.write("\n")
+        f.write("pdb_file_name ")
+        f.write(pdb_file_name)
+        f.write("\n")
+        f.write("data_mtz_file_name ")
+        f.write(data_mtz_file_name)
+        f.write("\n")
+        f.write("ligand-cif ")
+        f.write(acedrg_cif_file_name)
+        f.write("\n")
+
+    info_file = os.path.join(refmac_sub_dir, accession_code + ".info.json")
+    dict = {}
+    dict['accession_code'] = accession_code
+    dict['ligand-code'] = ligand_tlc
+    dict['pdb_file_name'] = pdb_file_name
+    dict['refmac_pdb_file_name'] = pdb_out
+    dict['data_mtz_file_name'] = data_mtz_file_name
+    dict['refmac_mtz_file_name'] = hkl_out
+    dict['acedrg_cif_file_name'] = acedrg_cif_file_name
+    j = json.dumps(dict, indent=4)
+    with open(info_file, 'w') as f:
+        f.write(j)
+        f.write("\n")
+
+def is_em(pdb_file_name: str) -> bool:
+    if os.path.exists(pdb_file_name):
+        f = open(pdb_file_name)
+        lines = f.readlines()
+        for line in lines:
+            if "EXPDTA" in line:
+                return "ELECTRON MICROSCOPY" in line
+    return False
+
+def proc_model_and_ligand(accession_code: str, pdb_file_name: str, ligand_tlc: str,
+                          cif_dir: str, acedrg_dir: str, refmac_dir: str) -> bool:
+
+    # Don't continue if the pdb_file_name is False or empty
+    try:
+        if len(pdb_file_name) == 0:
+            return False
+    except Exception as e:
+        print("in proc_model_and_ligand()", e)
+        return False
+
+    if os.path.exists(pdb_file_name):
+        letter = ligand_tlc[0].lower()
+        acedrg_cif_local = ligand_tlc + "-acedrg.cif"
+        acedrg_root_local = ligand_tlc + "-acedrg"
+        acedrg_cif_root = os.path.join(acedrg_dir, letter, acedrg_root_local)
+        acedrg_cif_file_name = os.path.join(acedrg_dir, letter, acedrg_cif_local)
+        if os.path.exists(acedrg_cif_file_name):
+            pass
+        else:
+            ccd_cif_file_name = tlc_to_ccd_cif_file_name(cif_dir, ligand_tlc)
+            if ligand_has_only_organic_set_atoms(ccd_cif_file_name):
+                run_acedrg(ccd_cif_file_name, ligand_tlc, acedrg_cif_root)
+        is_em_flag = is_em(pdb_file_name)
+        if is_em_flag:
+            print("EM data for", accession_code)
+        else:
+            data_mtz_file_name = fetch_and_make_mtz_data(accession_code, pdb_data_dir)
+            # print("data_mtz_file_name", data_mtz_file_name)
+            sub_dir = os.path.join(refmac_dir, accession_code[1:3].lower())
+            if not os.path.exists(refmac_dir):
+                os.mkdir(refmac_dir)
+            if not os.path.exists(sub_dir):
+                os.mkdir(sub_dir)
+
+            try:
+                # print("data_mtz_file_name: ", data_mtz_file_name, len(data_mtz_file_name))
+                if os.path.exists(acedrg_cif_file_name):
+                    if os.path.exists(pdb_file_name):
+                        if os.path.exists(data_mtz_file_name):
+                            write_refmac_shell_script(pdb_file_name, data_mtz_file_name,
+                                                      acedrg_cif_file_name,
+                                                      sub_dir, accession_code, ligand_tlc)
+                        else:
+                            print("missing mtz", data_mtz_file_name)
+            except TypeError as e:
+                print(e, "in write_refmac_shell_script")
+                return False
+    else:
+        print("pdb file does not exist: " + pdb_file_name)
+        return False
+
+def parse_latest_ligands_json(j: json, cif_dir: str, pdb_dir: str, acedrg_dir: str, refmac_dir: str):
+
+    grouped = j["grouped"]
+    fc = j['facet_counts']
+    ff = fc["facet_fields"]
+    nrl = ff["new_revised_ligand"]
+    for lig_descr in nrl:
+        s = lig_descr
+        tlc = s[:3]
+        print("Processing: -----------", tlc, "--------------")
+        tlc_cif = tlc_to_ccd_cif_file_name(cif_dir, tlc)
+        fetch_ccd_cif_for(tlc, cif_dir)
+        try:
+            model_db_code = get_model_db_code(tlc_cif)
+            pdb_fn = fetch_pdb_for_model_db_code(model_db_code, pdb_dir)
+            if len(pdb_fn) > 0:
+                if os.path.exists(pdb_fn):
+                    proc_model_and_ligand(model_db_code, pdb_fn, tlc, cif_dir, acedrg_dir, refmac_dir)
+        except TypeError as e:
+            print(e, "for", tlc)
+
+def new_ligands_from_pdbe_json(json_file_name, cif_dir, pdb_dir: str, acedrg_dir: str, refmac_dir: str):
+    try:
+        with open(json_file_name, 'r') as f:
+            j = json.load(f)
+            parse_latest_ligands_json(j, cif_dir, pdb_dir, acedrg_dir, refmac_dir)
+    except TypeError as e:
+        print(e)
+
+def need_new_json_for_ligands(json_file_name: str, cif_dir: str) -> bool:
+    if os.path.exists(json_file_name):
+        # if it's more than a day old, get a new one
+        st = os.stat(json_file_name)
+        mtime = st.st_mtime
+        t = time.time()
+        delta_time = t - mtime
+        return delta_time > 24 * 60 * 60
+    else:
+        status = get_latest_ligands_json(json_file_name)
+        return False
+
+
+if __name__ == '__main__':
+
+    cif_dir = "CCD-cifs"
+    pdb_dir = "pdb"
+    pdb_data_dir = "pdb-data"
+    acedrg_dir = "acedrg"
+    refmac_dir = "refmac"
+    json_file_name = "new_revised_ligands.json"
+
+    if need_new_json_for_ligands(json_file_name, cif_dir):
+        get_latest_ligands_json(json_file_name) # download the json file
+
+    new_ligands_from_pdbe_json(json_file_name, cif_dir, pdb_dir, acedrg_dir, refmac_dir)
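
For orientation, the <CODE>.info.json file written by write_refmac_shell_script() above (and read back by compare-ligands.py and by compare_models_for_ligands_of_the_week.py --code) looks roughly like this - the entry 7S3X and the ligand code XYZ are only placeholders, the keys and path conventions come from the script:

    {
        "accession_code": "7S3X",
        "ligand-code": "XYZ",
        "pdb_file_name": "pdb/s3/7s3x.pdb",
        "refmac_pdb_file_name": "refmac/s3/7S3X-refmac.pdb",
        "data_mtz_file_name": "pdb-data/s3/7s3x.mtz",
        "refmac_mtz_file_name": "refmac/s3/7S3X-refmac.mtz",
        "acedrg_cif_file_name": "acedrg/x/XYZ-acedrg.cif"
    }
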
diff --git a/compare-models/r-correl.r b/compare-models/r-correl.r
new file mode 100644
index 0000000..93aa5b2
--- /dev/null
+++ b/compare-models/r-correl.r
@@ -0,0 +1,15 @@
+
+table_fn = "correlation-tables/results-8ers.table"
+table_fn = "correlation-tables/results-8ewa.table"
+table_fn = "correlation-tables/results-8dus.table"
+table_fn = "correlation-tables/results-8du9.table"
+table_fn = "correlation-tables/results-8bb5.table"
+table_fn = "correlation-tables/results-8aqn.table"
+table_fn = "correlation-tables/results-8duc.table"
+table_fn = "correlation-tables/results-8a92.table"
+table_fn = "correlation-tables/results-7fr6.table"
+
+a = read.table(table_fn, header=TRUE)
+
+plot(a$rscc1, a$rscc2, xlim=c(0.3, 1.0), ylim=c(0.3, 1.0), xlab="correl", ylab="correl")
+abline(a=0, b=1)
diff --git a/compare-models/refmac-log-analysis.py b/compare-models/refmac-log-analysis.py
new file mode 100644
index 0000000..91a9e31
--- /dev/null
+++ b/compare-models/refmac-log-analysis.py
@@ -0,0 +1,27 @@
+import glob
+import os
+
+this_weeks_refmacs_table = "this_weeks_refmacs.table"
+this_weeks_refmacs_table = "15-11-refmacs.table"
+
+
+def check_for_error(refmac_log_file_name_in):
+    refmac_log_file_name = str(refmac_log_file_name_in)
+    print('checking "{}"'.format(refmac_log_file_name)) # may not be needed
+    with open(refmac_log_file_name, "r") as refmac_log_file:
+        # print("analysing log", refmac_log_file_name)
+        for line in refmac_log_file.readlines():
+            if "Error termination." in line:
+                print("This log had an error termination:", refmac_log_file_name)
+            if "Error: Problem with ligand name" in line:
+                print("This log had an error due to the ligand name:", refmac_log_file_name)
+
+
+with open(this_weeks_refmacs_table) as f:
+    lines = f.readlines()
+    for line in lines:
+        parts = line.split()
+        if parts:
+            shell_script_file_name = parts[0]
+            refmac_log_file_name = parts[1]
+            check_for_error(refmac_log_file_name)
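
Once the per-entry correlation tables exist (correlation-tables/results-<code>.table, written by multi-robs-analysis.py), a small follow-up script can flag atoms whose local map correlation around the refined position (rscc2) is noticeably lower than around the original position (rscc1). A minimal sketch - the file-name convention and column names come from the scripts above, the 0.05 threshold is arbitrary:

    import csv
    import glob

    # Flag atoms whose RSCC dropped noticeably after the refmac re-refinement.
    for table_file_name in glob.glob("correlation-tables/results-*.table"):
        with open(table_file_name) as f:
            for row in csv.DictReader(f, delimiter="\t"):
                drop = float(row["rscc1"]) - float(row["rscc2"])
                if drop > 0.05:
                    print(table_file_name, row["chain"], row["residue"], row["atom"],
                          "rscc1", row["rscc1"], "rscc2", row["rscc2"])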