Skip to content

Commit b3c7fe2

Browse files
Add structure JSON parsing from NERDSS
Sam just implemented a JSON formatted structure output file in NERDSS. This update involves pipeliens that imports that file and extract the structural information.
1 parent c4eebdd commit b3c7fe2

File tree

4 files changed

+339
-43
lines changed

4 files changed

+339
-43
lines changed

ionerdss/model/pdb/structure_validation.py

Lines changed: 140 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,27 @@ def _find_matching_components(
832832
return matching_components
833833

834834

835+
def _build_observed_coordinate_map(
836+
mol_names: Sequence[str],
837+
coords: Sequence[Sequence[float]],
838+
target_counts: Mapping[str, int],
839+
) -> Dict[str, Tuple[float, float, float]]:
840+
"""Build the observed coordinate map with stable per-copy labels for repeated molecule types."""
841+
observed: Dict[str, Tuple[float, float, float]] = {}
842+
type_counts: Dict[str, int] = {}
843+
for mol_name, coord in zip(mol_names, coords):
844+
copy_idx = type_counts.get(mol_name, 0)
845+
type_counts[mol_name] = copy_idx + 1
846+
key = f"{mol_name}_{copy_idx}" if target_counts.get(mol_name, 1) > 1 else mol_name
847+
observed[key] = tuple(float(value) for value in coord)
848+
849+
missing = sorted(set(target_counts) - set(type_counts))
850+
if missing:
851+
raise ValueError(f"Missing COM coordinates for molecule types: {missing}")
852+
853+
return observed
854+
855+
835856
def _restart_snapshot_sort_key(restart_path: Path) -> Tuple[Tuple[int, ...], str]:
836857
"""Sort restart snapshot files from earliest to latest by embedded numeric suffixes."""
837858
numeric_parts = tuple(int(part) for part in re.findall(r"\d+", restart_path.stem))
@@ -859,6 +880,77 @@ def _iter_restart_snapshot_candidates(primary_restart_file: Union[str, Path]) ->
859880
return candidates
860881

861882

883+
def _complex_snapshot_sort_key(snapshot_path: Path) -> Tuple[Tuple[int, ...], str]:
884+
"""Sort COMPLEXES JSON snapshots by embedded numeric suffixes."""
885+
numeric_parts = tuple(int(part) for part in re.findall(r"\d+", snapshot_path.stem))
886+
return numeric_parts, snapshot_path.name
887+
888+
889+
def _iter_complex_json_candidates(complexes_dir: Union[str, Path]) -> list[Path]:
890+
"""Return COMPLEXES JSON snapshots newest-to-oldest."""
891+
directory = Path(complexes_dir)
892+
if not directory.is_dir():
893+
return []
894+
895+
json_files = [path for path in directory.iterdir() if path.is_file() and path.suffix == ".json"]
896+
return sorted(json_files, key=_complex_snapshot_sort_key, reverse=True)
897+
898+
899+
def _extract_observed_com_coordinates_from_complex_json(
900+
complex_json_file: Union[str, Path],
901+
target_counts: Mapping[str, int],
902+
) -> Dict[str, Tuple[float, float, float]]:
903+
"""Extract one matching complex directly from a COMPLEXES JSON snapshot."""
904+
payload = json.loads(Path(complex_json_file).read_text(encoding="utf-8"))
905+
if not isinstance(payload, list):
906+
raise ValueError(f"Malformed COMPLEXES JSON snapshot: {complex_json_file}")
907+
908+
matching_complexes: list[tuple[list[str], list[Sequence[float]]]] = []
909+
for complex_record in payload:
910+
if not isinstance(complex_record, dict):
911+
continue
912+
names = complex_record.get("names")
913+
coords = complex_record.get("coords")
914+
if not isinstance(names, list) or not isinstance(coords, list) or len(names) != len(coords):
915+
continue
916+
if dict(Counter(str(name) for name in names)) == dict(target_counts):
917+
matching_complexes.append(([str(name) for name in names], coords))
918+
919+
if not matching_complexes:
920+
raise ValueError(
921+
"No complex in the COMPLEXES JSON snapshot matches the designed "
922+
f"target composition {dict(target_counts)}."
923+
)
924+
925+
selected_names, selected_coords = matching_complexes[0]
926+
if len(matching_complexes) > 1:
927+
warnings.warn(
928+
"Multiple full assemblies matching the validation target were present in the COMPLEXES "
929+
f"snapshot {Path(complex_json_file).name}; selecting the first matching complex.",
930+
RuntimeWarning,
931+
)
932+
933+
return _build_observed_coordinate_map(selected_names, selected_coords, target_counts)
934+
935+
936+
def _find_observed_com_coordinates_in_complex_json_snapshots(
937+
complexes_dir: Union[str, Path],
938+
target_counts: Mapping[str, int],
939+
) -> Tuple[Dict[str, Tuple[float, float, float]], Path]:
940+
"""Search COMPLEXES JSON snapshots newest-to-oldest for a matching full assembly."""
941+
errors: list[str] = []
942+
for candidate_json in _iter_complex_json_candidates(complexes_dir):
943+
try:
944+
observed = _extract_observed_com_coordinates_from_complex_json(candidate_json, target_counts)
945+
return observed, candidate_json
946+
except Exception as exc:
947+
errors.append(f"{candidate_json.name}: {exc}")
948+
949+
raise ValueError(
950+
"No matching assembly was found in DATA/COMPLEXES JSON snapshots. " + " | ".join(errors)
951+
)
952+
953+
862954
def _extract_observed_com_coordinates(
863955
system_psf_file: Union[str, Path],
864956
final_coords_file: Optional[Union[str, Path]],
@@ -889,23 +981,9 @@ def _extract_observed_com_coordinates(
889981
)
890982

891983
observed = {}
892-
type_counts: Dict[str, int] = {}
893-
for mol_id in selected_component:
894-
mol_name = restart_mol_names[mol_id]
895-
copy_idx = type_counts.get(mol_name, 0)
896-
type_counts[mol_name] = copy_idx + 1
897-
898-
if target_counts.get(mol_name, 1) > 1:
899-
key = f"{mol_name}_{copy_idx}"
900-
else:
901-
key = mol_name
902-
903-
observed[key] = restart_coords[mol_id]
904-
905-
missing = sorted(set(target_counts) - set(type_counts))
906-
if missing:
907-
raise ValueError(f"Missing COM coordinates for molecule types: {missing}")
908-
984+
selected_names = [restart_mol_names[mol_id] for mol_id in selected_component]
985+
selected_coords = [restart_coords[mol_id] for mol_id in selected_component]
986+
observed = _build_observed_coordinate_map(selected_names, selected_coords, target_counts)
909987
return observed
910988

911989

@@ -960,16 +1038,21 @@ def run_structure_validation_simulation(
9601038

9611039
simulation_dir = work_dir / sim_dir_name / str(sim_index)
9621040
histogram_file = simulation_dir / "DATA" / "histogram_complexes_time.dat"
1041+
complexes_dir = simulation_dir / "DATA" / "COMPLEXES"
9631042
final_coords_file = simulation_dir / "DATA" / "final_coords.xyz"
9641043
system_psf_file = simulation_dir / "DATA" / "system.psf"
9651044
restart_file = simulation_dir / "DATA" / "restart.dat"
9661045

967-
hist_times, hist_comps, hist_matrix = parse_complex_histogram(histogram_file)
1046+
hist_times = np.asarray([])
1047+
hist_comps = []
1048+
hist_matrix = None
1049+
if histogram_file.exists():
1050+
hist_times, hist_comps, hist_matrix = parse_complex_histogram(histogram_file)
9681051
target_comp = dict(artifacts.target_counts)
9691052

9701053
first_full_assembly_time = None
9711054
full_assembly_found = False
972-
if len(hist_times) > 0:
1055+
if len(hist_times) > 0 and hist_matrix is not None:
9731056
for col_idx, comp in enumerate(hist_comps):
9741057
if comp == target_comp:
9751058
counts = np.asarray(hist_matrix[:, [col_idx]].todense()).ravel()
@@ -982,26 +1065,53 @@ def run_structure_validation_simulation(
9821065
warning_message = None
9831066
observed_coordinates = None
9841067
selected_restart_file = restart_file
985-
if full_assembly_found:
1068+
complex_json_candidates = _iter_complex_json_candidates(complexes_dir)
1069+
if complex_json_candidates:
1070+
try:
1071+
observed_coordinates, selected_restart_file = _find_observed_com_coordinates_in_complex_json_snapshots(
1072+
complexes_dir=complexes_dir,
1073+
target_counts=artifacts.target_counts,
1074+
)
1075+
full_assembly_found = True
1076+
except Exception as exc:
1077+
warning_message = (
1078+
"Validation warning: no full assembly matching the designed one-copy target "
1079+
f"{target_comp} was found in {complexes_dir}."
1080+
)
1081+
warnings.warn(warning_message, RuntimeWarning)
1082+
else:
1083+
fallback_warning = (
1084+
"Validation warning: no COMPLEXES JSON snapshots were found in "
1085+
f"{complexes_dir}; falling back to restart snapshots."
1086+
)
1087+
warnings.warn(fallback_warning, RuntimeWarning)
1088+
warning_message = fallback_warning
9861089
try:
9871090
observed_coordinates, selected_restart_file = _find_observed_com_coordinates_in_restart_snapshots(
9881091
system_psf_file=system_psf_file,
9891092
final_coords_file=final_coords_file,
9901093
restart_file=restart_file,
9911094
target_counts=artifacts.target_counts,
9921095
)
993-
except Exception as exc:
994-
raise ValueError(
995-
"The target composition appeared in the histogram, but no matching connected "
996-
"assembly was found in DATA/restart.dat or any RESTART snapshot. "
997-
f"{exc}"
998-
) from exc
999-
else:
1000-
warning_message = (
1096+
full_assembly_found = True
1097+
except Exception:
1098+
full_assembly_found = False
1099+
1100+
if not full_assembly_found:
1101+
no_assembly_warning = (
10011102
"Validation warning: no full assembly matching the designed one-copy target "
1002-
f"{target_comp} was found in {histogram_file}."
1103+
f"{target_comp} was found in {complexes_dir if complex_json_candidates else restart_file}."
1104+
)
1105+
warning_message = (
1106+
no_assembly_warning
1107+
if warning_message is None
1108+
else f"{warning_message} {no_assembly_warning}"
10031109
)
1004-
warnings.warn(warning_message, RuntimeWarning)
1110+
if complex_json_candidates:
1111+
warnings.warn(no_assembly_warning, RuntimeWarning)
1112+
else:
1113+
if warning_message is not None and not warning_message.endswith("snapshots."):
1114+
warnings.warn(warning_message, RuntimeWarning)
10051115

10061116
return StructureValidationSimulationResult(
10071117
simulation_dir=simulation_dir,

ionerdss/model/pdb/template_builder.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,6 +1020,23 @@ def _record_chain_assignments(
10201020
self.chain_assigned_types[interface.chain_i].add(assigned_i)
10211021
self.chain_assigned_types[interface.chain_j].add(assigned_j)
10221022

1023+
def _homodimer_distance_threshold_angstrom(self) -> float:
1024+
"""Return the homodimer distance threshold in Angstroms.
1025+
1026+
Geometric signatures are computed from parser coordinates in Angstroms,
1027+
while the hyperparameter is configured in nanometers.
1028+
"""
1029+
return self.hyperparams.homodimer_distance_threshold * 10.0
1030+
1031+
def _has_sparse_interface_support(self, interface: InterfaceString) -> bool:
1032+
"""Return True when an interface is supported by only a minimal contact set.
1033+
1034+
Sparse same-template contacts are noisy enough that a single observation is
1035+
not strong evidence for a complementary `f`/`b` HHT family.
1036+
"""
1037+
min_contacts = min(len(interface.residue_details_i), len(interface.residue_details_j))
1038+
return min_contacts <= (self.hyperparams.interface_detect_n_residue_cutoff + 1)
1039+
10231040
def _homotypic_orientation_candidates(
10241041
self,
10251042
interface1: "InterfaceString",
@@ -1030,7 +1047,7 @@ def _homotypic_orientation_candidates(
10301047
"""Return valid HHT orientations for `interface2` against `interface1`, in preference order."""
10311048
direct_geom_ok = signature2.is_similar_to(
10321049
signature1,
1033-
distance_threshold=self.hyperparams.homodimer_distance_threshold * 10,
1050+
distance_threshold=self._homodimer_distance_threshold_angstrom(),
10341051
angle_threshold=self.hyperparams.homodimer_angle_threshold,
10351052
)
10361053

@@ -1040,7 +1057,7 @@ def _homotypic_orientation_candidates(
10401057
)
10411058
flipped_geom_ok = signature2.is_similar_to(
10421059
rev_sig1,
1043-
distance_threshold=self.hyperparams.homodimer_distance_threshold * 10,
1060+
distance_threshold=self._homodimer_distance_threshold_angstrom(),
10441061
angle_threshold=self.hyperparams.homodimer_angle_threshold,
10451062
)
10461063

@@ -1049,7 +1066,7 @@ def _homotypic_orientation_candidates(
10491066
"[HOMO-VALID] geometric check: direct=%s, flipped=%s (dist_th=%.3f Å, ang_th=%.3f rad)",
10501067
direct_geom_ok,
10511068
flipped_geom_ok,
1052-
self.hyperparams.homodimer_distance_threshold * 10,
1069+
self._homodimer_distance_threshold_angstrom(),
10531070
self.hyperparams.homodimer_angle_threshold,
10541071
)
10551072

@@ -1868,11 +1885,21 @@ def _is_interface_homotypic(self, interface: InterfaceString,
18681885

18691886
# Check geometric signature symmetry
18701887
is_geometrically_symmetric = signature.is_homotypic(
1871-
self.hyperparams.homodimer_distance_threshold,
1888+
self._homodimer_distance_threshold_angstrom(),
18721889
self.hyperparams.homodimer_angle_threshold
18731890
)
18741891

18751892
if not is_geometrically_symmetric:
1893+
if template_i == template_j and self._has_sparse_interface_support(interface):
1894+
if self.workspace_manager:
1895+
self.workspace_manager.logger.info(
1896+
"Interface %s <-> %s is geometrically asymmetric but only has sparse support (%d/%d contacting residues); treating as homotypic.",
1897+
interface.chain_i,
1898+
interface.chain_j,
1899+
len(interface.residue_details_i),
1900+
len(interface.residue_details_j),
1901+
)
1902+
return True
18761903
if self.workspace_manager:
18771904
self.workspace_manager.logger.info(
18781905
"Interface %s <-> %s failed geometric symmetry test: d_diff=%.3f, theta_diff=%.3f",
@@ -3375,7 +3402,7 @@ def _get_heterotypic_reason(self, template_i: str, template_j: str,
33753402
return "different_molecule_types"
33763403

33773404
if not signature.is_homotypic(
3378-
self.hyperparams.homodimer_distance_threshold,
3405+
self._homodimer_distance_threshold_angstrom(),
33793406
self.hyperparams.homodimer_angle_threshold
33803407
):
33813408
return "asymmetric_geometry"

ionerdss/model/pdb/validation.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,10 @@
1010
In structure_validation.py, get_structure_validation_counts() builds the expected full assembly as one representative copy of each molecule type, for example {"A": 1, "H": 1, "L": 1}.
1111
1212
2. Run the actual NERDSS validation simulation with that target in mind.
13-
run_structure_validation_simulation(...) uses parms_titrate.inp, runs NERDSS, and reads [histogram_complexes_time.dat] via the histogram parser to look for a complex whose composition exactly matches that target composition. This is the “is one full assembly present?” check. If no such complex ever appears, it returns a warning instead of pretending there is an observed structure to compare.
14-
15-
3. If a full assembly exists, extract one observed assembly from the final snapshot.
16-
The code then reads:
17-
```
18-
final_coords.xyz
19-
system.psf
20-
```
13+
run_structure_validation_simulation(...) uses parms_titrate.inp, runs NERDSS, then looks for a matching full assembly in `DATA/COMPLEXES/*.json`. These JSON snapshots are the primary source for both existence checks and observed COM extraction.
14+
15+
3. If no COMPLEXES JSON snapshots exist, fall back to restart snapshots.
16+
The code emits a warning and then scans `DATA/restart.dat` and any `RESTART/*.dat` snapshots for a connected component whose composition matches the target.
2117
2218
"""
2319

0 commit comments

Comments
 (0)