MonomerLibrary · pemsley · Nov 15, 2022 · Nov 16, 2022
diff --git a/compare-models/compare-ligands.py b/compare-models/compare-ligands.py
@@ -0,0 +1,100 @@
+import sys
+import os
+import json
+import gemmi
+import glob
+
+def get_ligand(ligand_tlc: str, pdb_file_name: str):
+   st = gemmi.read_pdb(pdb_file_name)
+   for model in st:
+      for chain in model:
+         for residue in chain:
+            if residue.name == ligand_tlc:
+               return residue
+   return False
+
+def get_reso(pdb_file_name):
+    st = gemmi.read_pdb(pdb_file_name)
+    return st.resolution
+
+def compare_ligands(pdb_file_name: str, refmac_pdb_file_name: str, accession_code:str, ligand_tlc: str, acedrg_cif_file_name: str):
+
+   if not os.path.exists(pdb_file_name):
+      print("Bad PDB file name", pdb_file_name)
+   if not os.path.exists(refmac_pdb_file_name):
+      print("Bad refmac file name", refmac_pdb_file_name)
+   if not os.path.exists(acedrg_cif_file_name):
+      print("bad acedrg cif file name", acedrg_cif_file_name)
+   if len(ligand_tlc) > 3:
+      print("bad ligand tlc", ligand_tlc)
+   if len(ligand_tlc) == 0:
+      print("bad tlc")
+
+   lig_1 = get_ligand(ligand_tlc,        pdb_file_name)
+   lig_2 = get_ligand(ligand_tlc, refmac_pdb_file_name)
+   reso = get_reso(pdb_file_name)
+
+   for atom_1, atom_2 in zip(lig_1, lig_2):
+      pos_1 = atom_1.pos
+      pos_2 = atom_2.pos
+      d = pos_1.dist(pos_2)
+      print("delta:", ligand_tlc, accession_code, str(reso), atom_1.name, d)
+      if d > 1.0:
+         print(refmac_pdb_file_name)
+
+   return False
+
+def multi_compare_ligands(refmac_dir):
+   for d in glob.glob(os.path.join(refmac_dir, "*")):
+      for info_file in glob.glob(os.path.join(d, "*.info.json")):
+         print(info_file)
+         with open(info_file, "r") as info_file_data:
+            info = json.load(info_file_data)
+            print(info)
+            refmac_out_fn = info['refmac_pdb_file_name']
+            # refmac_out_fn may not be there because refmac didn't properly complete
+            if os.path.exists(refmac_out_fn):
+               compare_ligands(info['pdb_file_name'], info['refmac_pdb_file_name'], info['accession_code'], info['ligand-code'], info['acedrg_cif_file_name'])
+            else:
+               print("missing refmac-out", refmac_out_fn, "for ligand-code", info['ligand-code'])
+
+def compare_ligands_in_table_file(this_weeks_ligands_table_file_name):
+   with open(this_weeks_ligands_table_file_name) as f:
+      lines = f.readlines()
+      for line in lines:
+         parts = line.split()
+         if parts:
+               shell_script_file_name = parts[0]
+               refmac_log_file_name   = parts[1]
+               fn       = os.path.basename(refmac_log_file_name)
+               dir_name = os.path.dirname(refmac_log_file_name)
+               parts = fn.split("-")
+               p = parts[1]
+               parts_2 = p.split(".")
+               code = parts_2[0]
+               json_file_name = code.upper() + ".info.json"
+               info_file = os.path.join(dir_name, json_file_name)
+               print(info_file)
+               with open(info_file, "r") as info_file_data:
+                  info = json.load(info_file_data)
+                  print(info)
+                  refmac_out_fn = info['refmac_pdb_file_name']
+                  # refmac_out_fn may not be there because refmac didn't properly complete
+                  if os.path.exists(refmac_out_fn):
+                     compare_ligands(info['pdb_file_name'], info['refmac_pdb_file_name'], info['accession_code'], info['ligand-code'], info['acedrg_cif_file_name'])
+                  else:
+                     print("missing refmac-out", refmac_out_fn, "for ligand-code", info['ligand-code'])
+
+               # compare_ligands(info['pdb_file_name'], info['refmac_pdb_file_name'], info['accession_code'], info['ligand-code'], info['acedrg_cif_file_name'])
+
+
+
+if __name__ == "__main__":
+   refmac_dir = "refmac"
+   dated_refmac_logs_table_list = []
+   for arg in sys.argv[1:]:
+      print(arg)
+      table_file = arg
+      compare_ligands_in_table_file(table_file)
+
+   # multi_compare_ligands(refmac_dir)
diff --git a/compare-models/compare_models_for_ligands_of_the_week.py b/compare-models/compare_models_for_ligands_of_the_week.py
@@ -0,0 +1,205 @@
+
+import argparse
+import datetime
+import gemmi
+import os
+import sys
+import math
+import numpy
+import json
+from scipy.stats import pearsonr
+
+def print_to_file(text,fout,mode='a'):
+  stdout = sys.stdout
+  with open(fout,mode) as sys.stdout:
+    print(text)
+    sys.stdout.flush()
+  sys.stdout = stdout
+  return
+
+def get_rscc(map_o,map_c,pos,mask_radius):
+  print("---- get_rsccs -start ---")
+  mask = gemmi.FloatGrid(numpy.array(map_o, copy=False),map_o.unit_cell,map_o.spacegroup)
+  mask.fill(0)
+  mask.set_points_around(pos, radius=mask_radius, value=1)
+  #mask.symmetrize_max()
+  maskarray = numpy.array(mask, copy=False)
+  points = numpy.argwhere(maskarray == 1)
+  print("points.size", points.size)
+  if points.size < 4:
+    return None
+  list_obs = []
+  list_calc = []
+  list_diff = []
+  for point in points:
+    obs =  map_o.get_point(point[0],point[1],point[2]).value
+    calc = map_c.get_point(point[0],point[1],point[2]).value
+    list_obs.append(obs)
+    list_calc.append(calc)
+    list_diff.append(obs-calc)
+  n_points = len(list_obs)
+  sum_obs =  round(sum(list_obs), 2)
+  sum_calc = round(sum(list_calc),2)
+  sum_diff = round(sum(list_diff),2)
+  sum_absdiff = round(sum(abs(el) for el in list_diff),2)
+  rmsd_obs_calc = round(math.sqrt(sum(el**2 for el in list_diff)/n_points),2)
+  cor_obs_calc,cor_obs_calc_p = pearsonr(list_obs,list_calc)
+  return cor_obs_calc
+
+def modelComparison(model1, model2, mtzin, ligand_comp_id, opt):
+
+  if model1 == None:
+    print("modelComparison: model1 is None")
+    return
+
+  if model2 == None:
+    print("modelComparison: model2 is None")
+    return
+
+  print("debug:: in modelComparison() model1", model1, "model2", model2)
+
+  try:
+    if not 'sample_rate' in opt:
+      opt['sample_rate'] = 1.0
+    if not 'mask_radius' in opt:
+      opt['mask_radius'] = 1.5
+
+    print("Reading model file:",model1)
+    st1 = gemmi.read_structure(model1)
+    print("Reading model file:",model2)
+    st2 = gemmi.read_structure(model2)
+    print("Reading MTZ file:",mtzin)
+    mtz = gemmi.read_mtz_file(mtzin)
+
+    print("Converting FWT/PHWT to map")
+    map_obs = mtz.transform_f_phi_to_map('FWT','PHWT', sample_rate=opt['sample_rate'])
+    print("Converting FC_ALL/PHIC_ALL to map")
+    map_calc = mtz.transform_f_phi_to_map('FC_ALL','PHIC_ALL', sample_rate=opt['sample_rate'])
+
+    print("Comparing equivalent atoms in all polymeric entities")
+    result = []
+    for model in st1:
+      for chain in model:
+        for residue in chain:
+          if residue.name != ligand_comp_id:
+            if ligand_comp_id:
+              continue
+          print(" Found", residue, residue.entity_type)
+          if residue.entity_type == gemmi.EntityType.NonPolymer:
+            for atom in residue:
+              if atom.element.name == "H":
+                  continue
+              addr = gemmi.make_address(chain, residue, atom)
+              cra1 = model.find_cra(addr)
+              cra2 = st2[0].find_cra(addr)  # This always uses the first model in st2 - probably worth checking that both st1 and st2 only have one model, otherwise fail.
+              print("---\ncra1: ", cra1, "cra2: ", cra2)
+
+              if cra1 and cra2:
+                print("cra1 pos:", cra1.atom.pos)
+                print("cra2 pos:", cra2.atom.pos)
+                peak1 = map_obs.interpolate_value(cra1.atom.pos)
+                peak2 = map_obs.interpolate_value(cra2.atom.pos)
+                rscc1 = get_rscc(map_obs,map_calc,cra1.atom.pos,opt['mask_radius'])
+                rscc2 = get_rscc(map_obs,map_calc,cra2.atom.pos,opt['mask_radius'])
+
+                if rscc1 == None: continue
+                if rscc2 == None: continue
+
+                result.append({'chain':chain.name,
+                               'residue':str(residue.seqid.num) + (residue.seqid.icode if residue.seqid.icode != ' ' else ''),
+                               'tlc':residue.name,
+                               'atom':atom.name,
+                               'dist':cra1.atom.pos.dist(cra2.atom.pos),
+                               'b1':cra1.atom.b_iso,
+                               'b2':cra2.atom.b_iso,
+                               'peak1':peak1,
+                               'peak2':peak2,
+                               'rscc1':rscc1,
+                               'rscc2':rscc2})
+
+    print("Processed",len(result),"atoms")
+    return result
+
+  # except Exception as e:
+  except FloatingPointError as e: #  I want to see the backtrace
+    print("keyError: %s" % e)
+    print("Cannot continue - program terminated.")
+    sys.exit(1)
+
+def printModelComparison(analysis,output_file):
+  try:
+    output = '\t'.join(["chain","residue","TLC","atom","dist","b1","b2","peak1","peak2","rscc1","rscc2"])
+    for atom in analysis:
+      output = output + '\n' + '\t'.join([atom['chain'],
+                          atom['residue'],
+                          atom['tlc'],
+                          atom['atom'],
+                          str(round(atom['dist'],2)),
+                          str(round(atom['b1'],2)),
+                          str(round(atom['b2'],2)),
+                          str(round(atom['peak1'],2)),
+                          str(round(atom['peak2'],2)),
+                          str(round(atom['rscc1'],2)),
+                          str(round(atom['rscc2'],2))])
+
+    print("Writing output to file:",output_file)
+    print_to_file(output,output_file,'w')
+
+  except Exception as e:
+    print("Error: %s" % e)
+    print("Cannot continue - program terminated.")
+    sys.exit(1)
+
+def proc_using_code(accession_code, ligand_analysis_base):
+
+    two_code    = accession_code[1:3]
+    upcase_code = accession_code.upper()
+    refmac_dir  = os.path.join(ligand_analysis_base, 'refmac')
+    sub_dir     = os.path.join(refmac_dir, two_code)
+    info_file   = os.path.join(sub_dir, upcase_code + ".info.json")
+    print("debug:: in proc_using_code() accession_code", accession_code, "info_file", info_file)
+    if os.path.exists(info_file):
+        print("INFO file", info_file)
+        with open(info_file, "r") as info_file_data:
+            info = json.load(info_file_data)
+            refmac_out_fn = info['refmac_pdb_file_name']
+            ligand_comp_id = info['ligand-code']
+            if os.path.exists(refmac_out_fn):
+                pdb_file_name = info['pdb_file_name']
+                mtz  = info['refmac_mtz_file_name']
+                dict = info['acedrg_cif_file_name']
+                opt = {}
+                result = modelComparison(pdb_file_name, refmac_out_fn, mtz, ligand_comp_id, opt)
+                results_output_file_name = "results.out"
+                printModelComparison(result, results_output_file_name)
+            else:
+                print('Missing refmac pdb output file "{}"'.format(refmac_out_fn))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--f1', dest='model1', help="First model (PDB or mmCIF format)")
+    parser.add_argument('--f2', dest='model2', help="Second model (PDB or mmCIF format)")
+    parser.add_argument('--mtz', dest='mtzin', help="MTZ file containing phases (columns required: FWT, PHWT,  FC_ALL, PHIC_ALL). At the minute just one MTZ is used for model-map analysis... in future this should be extended to a second.")
+    parser.add_argument('-o','--output', dest='output', required=False, help="Output file containing results of comparative analysis", default='analysis.txt')
+    parser.add_argument('-s','--samplerate', dest='samplerate', required=False, help="Map sampling rate", default=1.0, type=float)
+    parser.add_argument('-r','--maskradius', dest='maskradius', required=False, help="Mask radius around atomic position", default=1.5, type=float)
+    parser.add_argument("--code", dest='accession_code')
+    parser.add_argument('--ligand-analysis-base', dest='ligand_analysis_base')
+    args = parser.parse_args()
+
+    opt = {}
+    if args.samplerate:
+        opt['sample_rate'] = args.samplerate
+    if args.maskradius:
+        opt['mask_radius'] = args.maskradius
+    if args.accession_code:
+        print("INFO:: Compare models using this accession code", args.accession_code, "with base", args.ligand_analysis_base)
+        proc_using_code(args.accession_code, args.ligand_analysis_base)
+    else:
+        time_start = datetime.datetime.now()
+        ligand_code = "" # special value, means "all residue types"
+        result = modelComparison(args.model1,args.model2,args.mtzin,ligand_code,opt)
+        printModelComparison(result,args.output)
+        time_end= datetime.datetime.now()
+        print("Job completed ("+str(round((time_end-time_start).total_seconds(),3))+" seconds)")
diff --git a/compare-models/ligand-hist.r b/compare-models/ligand-hist.r
@@ -0,0 +1,37 @@
+
+st=format(Sys.time(), '%d-%m-%Y')
+pdf(paste("acedrg-ligands_",st, ".pdf", sep = ""))
+x=rnorm(100,40,3)
+y=rnorm(100,100,5)
+
+par(mfrow=c(3,3), cex=0.5)
+
+a = read.table("15-11-refmacs-deltas.table")
+
+# hist(a$V6, breaks=30, col="lightblue", main="Overall Displacement Histogram",
+#     xlab="Displacement (A)")
+
+factors = factor(a$V3)
+levels = levels(factors)
+print(levels)
+
+for (l in levels) {
+    ligand_code = a$V2[a$V3==l][1]
+    reso        = a$V4[a$V3==l][2]
+    mt = l
+    mt = paste(mt, ligand_code)
+    mt = paste(mt, reso)
+    mt = paste(mt,'\uc5')
+
+    hist(a$V6[a$V3==l], col="lightblue", main=mt,
+         xlim=c(0, 1.2), xlab="Displacement (A)")
+    correl_table_fn <- paste("correlation-tables/results-", tolower(l), sep="")
+    correl_table_fn <- paste(correl_table_fn, ".table", sep="")
+    print(correl_table_fn)
+    b <- read.table(correl_table_fn, header=TRUE)
+    plot(b$rscc1, b$rscc2, xlim=c(0.3, 1.0), ylim=c(0.3, 1.0), xlab="correl", ylab="correl", main=mt)
+    abline(a=0, b=1)
+    plot(b$dist, b$b2, xlab="dist", ylab="b2", main=mt)
+}
+
+dev.off()
diff --git a/compare-models/multi-robs-analysis.py b/compare-models/multi-robs-analysis.py
@@ -0,0 +1,39 @@
+import os
+import subprocess
+
+todays_refmacs_table = "15-11-refmacs.table"
+
+def log_name_to_code(log):
+    parts_1 = log.split(".")
+    pre = parts_1[0]
+    parts_2 = pre.split("-")
+    code = parts_2[1]
+    return code
+
+if os.path.exists("rob-proc"):
+    pass
+else:
+    os.mkdir("rob-proc")
+
+if os.path.exists("correlation-tables"):
+    pass
+else:
+    os.mkdir("correlation-tables")
+
+with open(todays_refmacs_table) as f:
+    lines = f.readlines()
+    for line in lines:
+        parts = line.split()
+        log = parts[1]
+        accession_code = log_name_to_code(log)
+        print(log, accession_code)
+
+        rob_proc_log_file_name = os.path.join("rob-proc", accession_code + ".log")
+        args = ["python3", "robsrepo/Ligand-Tools/compare-models/compare_models_for_ligands_of_the_week.py",
+                '--code', accession_code, '--ligand-analysis-base', "."]
+        comp_proc = subprocess.run(args, capture_output=True)
+        with open(rob_proc_log_file_name, "bw") as f:
+            f.write(comp_proc.stdout)
+        if os.path.exists("results.out"):
+            new_results_out_name = os.path.join("correlation-tables", "results-" + accession_code + ".table")
+            os.rename("results.out", new_results_out_name)