diff --git a/.coveragerc b/.coveragerc index 83705f2..da0552f 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,7 +2,8 @@ source = steamroll omit = steamroll/__main__.py - steamroll/xyz2mol.py + steamroll/xyz2mol/* + steamroll/xyz2mol_tmc/* [report] exclude_lines = diff --git a/pyproject.toml b/pyproject.toml index 623b3de..f552700 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "steamroll" description = "Package to convert 3D molecules to RDKit" license = {file = "LICENSE"} -version = "0.0.2" +version = "0.0.3" readme = "README.md" keywords = [] authors = [ @@ -28,7 +28,7 @@ dev = [ [tool.ruff] line-length = 100 -exclude = ["steamroll/xyz2mol.py"] +exclude = ["steamroll/xyz2mol", "steamroll/xyz2mol_tmc"] [tool.ruff.lint] select = [ @@ -70,11 +70,11 @@ convention = "google" [tool.pytest.ini_options] testpaths = ["tests", "steamroll"] -addopts = "--doctest-modules --ignore=steamroll/xyz2mol.py" +addopts = "--doctest-modules --ignore=steamroll/xyz2mol --ignore=steamroll/xyz2mol_tmc" doctest_optionflags = "NORMALIZE_WHITESPACE" [tool.ty.src] -exclude = ["steamroll/xyz2mol.py"] +exclude = ["steamroll/xyz2mol", "steamroll/xyz2mol_tmc"] [build-system] build-backend = "hatchling.build" diff --git a/steamroll/steamroll.py b/steamroll/steamroll.py index b5214b4..e3bfc14 100644 --- a/steamroll/steamroll.py +++ b/steamroll/steamroll.py @@ -1,16 +1,25 @@ """steamroll package.""" import logging +import os +import tempfile from typing import Iterable import numpy as np from numpy.typing import ArrayLike from rdkit import Chem -from .xyz2mol import xyz2mol +from .xyz2mol.xyz2mol import xyz2mol +from .xyz2mol_tmc.xyz2mol_local import xyz2AC_obabel as xyz2ac_obabel +from .xyz2mol_tmc.xyz2mol_tmc import TRANSITION_METALS_NUM, get_tmc_mol logger = logging.getLogger(__name__) +# Lanthanides Ce-Yb (58-70) and actinides Ac-Lr (89-103) that xyz2mol cannot +# handle. 
Molecules containing these elements bypass xyz2mol and go directly to +# the geometry-only xyz2ac_obabel fallback. +_SKIP_XYZ2MOL: frozenset[int] = frozenset(range(58, 71)) | frozenset(range(89, 104)) + class SteamrollConversionError(Exception): """Raised when a conversion error occurs.""" @@ -55,11 +64,33 @@ def fragment(molecule: Chem.rdchem.Mol) -> list[Chem.rdchem.Mol]: return Chem.GetMolFrags(molecule, asMols=True, sanitizeFrags=True) # type: ignore [return-value] +def _write_temp_xyz(atomic_numbers: list[int], coordinates: list[list[float]]) -> str: + """Write atomic numbers and coordinates to a temporary xyz file. + + Args: + atomic_numbers: atomic numbers for each atom + coordinates: Cartesian coordinates for each atom, in Å + + Returns: + path to the temporary file (caller is responsible for deletion) + """ + pt = Chem.GetPeriodicTable() + lines = [str(len(atomic_numbers)), ""] + for num, (x, y, z) in zip(atomic_numbers, coordinates, strict=True): + symbol = pt.GetElementSymbol(num) + lines.append(f"{symbol} {x} {y} {z}") + f = tempfile.NamedTemporaryFile(mode="w", suffix=".xyz", delete=False) + f.write("\n".join(lines) + "\n") + f.close() + return f.name + + def to_rdkit( atomic_numbers: Iterable[int], coordinates: ArrayLike, charge: int = 0, remove_Hs: bool = True, + fail_without_bond_order: bool = False, ) -> Chem.rdchem.Mol: """Convert a given molecular geometry to an RDKit molecule. 
@@ -68,6 +99,7 @@ def to_rdkit( coordinates: coordinates, in Å charge: charge remove_Hs: whether or not to strip hydrogens from the output molecule + fail_without_bond_order: if bond order cannot be detected, raise SteamrollConversionError Raises: ValueError: if input dimensions aren't correct @@ -89,13 +121,164 @@ def to_rdkit( ) coords = coordinates.tolist() - rdkm: Chem.rdchem.Mol - try: + has_tm = any(n in TRANSITION_METALS_NUM for n in atomic_numbers) + has_exotic = any(n in _SKIP_XYZ2MOL for n in atomic_numbers) + + rdkm: Chem.rdchem.Mol | None = None + + if not has_tm and not has_exotic: + try: + try: + rdkm = xyz2mol(atomic_numbers, coords, charge=charge)[0] + except (Exception, ValueError, IndexError): + rdkm = xyz2mol(atomic_numbers, coords, charge=charge, use_huckel=True)[0] + except Exception as e: + if fail_without_bond_order: + raise SteamrollConversionError from e + + if has_tm: + # Use the specialized TMC converter; Hs come back implicit → make explicit. + xyz_file = _write_temp_xyz(atomic_numbers, coords) try: - rdkm = xyz2mol(atomic_numbers, coords, charge=charge)[0] - except (Exception, ValueError, IndexError): - rdkm = xyz2mol(atomic_numbers, coords, charge=charge, use_huckel=True)[0] - except Exception as e: - raise SteamrollConversionError("xyz2mol conversion failed!") from e + rdkm = get_tmc_mol(xyz_file, charge) + except Exception as e: + raise SteamrollConversionError("xyz2mol_tm conversion failed") from e + finally: + os.unlink(xyz_file) + if rdkm is None: + raise SteamrollConversionError("xyz2mol_tm returned no molecule") + return Chem.AddHs(rdkm) # type: ignore [return-value] + + if rdkm is None: + # xyz2mol failed for a non-TM molecule (e.g. wrong charge, unsupported + # element). Fall back to a geometry-only mol via obabel connectivity. 
+ try: + _, rdkm = xyz2ac_obabel(atomic_numbers, coords) + except Exception as e: + raise SteamrollConversionError("xyz2mol conversion failed") from e + return rdkm # type: ignore [return-value] return remove_hydrogens(rdkm) if remove_Hs else rdkm + + +ATOMIC_NUMBERS = { + "X": 0, + "H": 1, + "He": 2, + "Li": 3, + "Be": 4, + "B": 5, + "C": 6, + "N": 7, + "O": 8, + "F": 9, + "Ne": 10, + "Na": 11, + "Mg": 12, + "Al": 13, + "Si": 14, + "P": 15, + "S": 16, + "Cl": 17, + "Ar": 18, + "K": 19, + "Ca": 20, + "Sc": 21, + "Ti": 22, + "V": 23, + "Cr": 24, + "Mn": 25, + "Fe": 26, + "Co": 27, + "Ni": 28, + "Cu": 29, + "Zn": 30, + "Ga": 31, + "Ge": 32, + "As": 33, + "Se": 34, + "Br": 35, + "Kr": 36, + "Rb": 37, + "Sr": 38, + "Y": 39, + "Zr": 40, + "Nb": 41, + "Mo": 42, + "Tc": 43, + "Ru": 44, + "Rh": 45, + "Pd": 46, + "Ag": 47, + "Cd": 48, + "In": 49, + "Sn": 50, + "Sb": 51, + "Te": 52, + "I": 53, + "Xe": 54, + "Cs": 55, + "Ba": 56, + "La": 57, + "Ce": 58, + "Pr": 59, + "Nd": 60, + "Pm": 61, + "Sm": 62, + "Eu": 63, + "Gd": 64, + "Tb": 65, + "Dy": 66, + "Ho": 67, + "Er": 68, + "Tm": 69, + "Yb": 70, + "Lu": 71, + "Hf": 72, + "Ta": 73, + "W": 74, + "Re": 75, + "Os": 76, + "Ir": 77, + "Pt": 78, + "Au": 79, + "Hg": 80, + "Tl": 81, + "Pb": 82, + "Bi": 83, + "Po": 84, + "At": 85, + "Rn": 86, + "Fr": 87, + "Ra": 88, + "Ac": 89, + "Th": 90, + "Pa": 91, + "U": 92, + "Np": 93, + "Pu": 94, + "Am": 95, + "Cm": 96, + "Bk": 97, + "Cf": 98, + "Es": 99, + "Fm": 100, + "Md": 101, + "No": 102, + "Lr": 103, + "Rf": 104, + "Db": 105, + "Sg": 106, + "Bh": 107, + "Hs": 108, + "Mt": 109, + "Ds": 110, + "Rg": 111, + "Cp": 112, + "Uut": 113, + "Uuq": 114, + "Uup": 115, + "Uuh": 116, + "Uus": 117, + "Uuo": 118, +} diff --git a/steamroll/xyz2mol/__init__.py b/steamroll/xyz2mol/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/steamroll/xyz2mol.py b/steamroll/xyz2mol/xyz2mol.py similarity index 99% rename from steamroll/xyz2mol.py rename to steamroll/xyz2mol/xyz2mol.py index 
9eb021d..6c08b08 100644 --- a/steamroll/xyz2mol.py +++ b/steamroll/xyz2mol/xyz2mol.py @@ -84,7 +84,7 @@ atomic_valence[17] = [1] atomic_valence[32] = [4] atomic_valence[35] = [1] -atomic_valence[53] = [1] +atomic_valence[53] = [1,3,5] atomic_valence_electrons = {} atomic_valence_electrons[1] = 1 diff --git a/steamroll/xyz2mol_tmc/LICENSE.txt b/steamroll/xyz2mol_tmc/LICENSE.txt new file mode 100644 index 0000000..723af02 --- /dev/null +++ b/steamroll/xyz2mol_tmc/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Jensen Group + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/steamroll/xyz2mol_tmc/__init__.py b/steamroll/xyz2mol_tmc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/steamroll/xyz2mol_tmc/xyz2mol_local.py b/steamroll/xyz2mol_tmc/xyz2mol_local.py new file mode 100644 index 0000000..f950dc1 --- /dev/null +++ b/steamroll/xyz2mol_tmc/xyz2mol_local.py @@ -0,0 +1,1270 @@ +"""Module for generating rdkit molobj/smiles/molecular graph from free atoms. + +Main implementation by Jan H. Jensen, based on the paper + + Yeonjoon Kim and Woo Youn Kim + "Universal Structure Conversion Method for Organic Molecules: From Atomic Connectivity + to Three-Dimensional Geometry" + Bull. Korean Chem. Soc. 2015, Vol. 36, 1769-1777 + DOI: 10.1002/bkcs.10334 + +Modified by Maria Harris Rasmussen 2024 +""" + +import copy +import itertools + +try: + from rdkit.Chem import rdEHTTools # requires RDKit 2019.9.1 or later +except ImportError: + rdEHTTools = None + +import sys +from collections import defaultdict + +import networkx as nx +import numpy as np +from rdkit import Chem + +global __ATOM_LIST__ +__ATOM_LIST__ = [ + "h", + "he", + "li", + "be", + "b", + "c", + "n", + "o", + "f", + "ne", + "na", + "mg", + "al", + "si", + "p", + "s", + "cl", + "ar", + "k", + "ca", + "sc", + "ti", + "v", + "cr", + "mn", + "fe", + "co", + "ni", + "cu", + "zn", + "ga", + "ge", + "as", + "se", + "br", + "kr", + "rb", + "sr", + "y", + "zr", + "nb", + "mo", + "tc", + "ru", + "rh", + "pd", + "ag", + "cd", + "in", + "sn", + "sb", + "te", + "i", + "xe", + "cs", + "ba", + "la", + "ce", + "pr", + "nd", + "pm", + "sm", + "eu", + "gd", + "tb", + "dy", + "ho", + "er", + "tm", + "yb", + "lu", + "hf", + "ta", + "w", + "re", + "os", + "ir", + "pt", + "au", + "hg", + "tl", + "pb", + "bi", + "po", + "at", + "rn", + "fr", + "ra", + "ac", + "th", + "pa", + "u", + "np", + "pu", +] + + +global atomic_valence +global atomic_valence_electrons + +atomic_valence = defaultdict(list) +atomic_valence[1] = [1] +atomic_valence[5] = [3, 4] +atomic_valence[6] = [4, 2] 
+atomic_valence[7] = [3, 4] +atomic_valence[8] = [2, 1, 3] # [2,1,3] +atomic_valence[9] = [1] +atomic_valence[13] = [3, 4] +atomic_valence[14] = [4] +atomic_valence[15] = [3, 5] # [5,4,3] +atomic_valence[16] = [2, 4, 6] # [6,3,2] +atomic_valence[17] = [1] +atomic_valence[18] = [0] +atomic_valence[32] = [4] +atomic_valence[33] = [5, 3] +atomic_valence[35] = [1] +atomic_valence[34] = [2] +atomic_valence[52] = [2] +atomic_valence[53] = [1] + +atomic_valence[21] = [20] +atomic_valence[22] = [20] +atomic_valence[23] = [20] +atomic_valence[24] = [20] +atomic_valence[25] = [20] +atomic_valence[26] = [20] +atomic_valence[27] = [20] +atomic_valence[28] = [20] +atomic_valence[29] = [20] +atomic_valence[30] = [20] + +atomic_valence[39] = [20] +atomic_valence[40] = [20] +atomic_valence[41] = [20] +atomic_valence[42] = [20] +atomic_valence[43] = [20] +atomic_valence[44] = [20] +atomic_valence[45] = [20] +atomic_valence[46] = [20] +atomic_valence[47] = [20] +atomic_valence[48] = [20] + + +atomic_valence[57] = [20] +atomic_valence[58] = [20] +atomic_valence[59] = [20] +atomic_valence[60] = [20] +atomic_valence[61] = [20] +atomic_valence[62] = [20] +atomic_valence[63] = [20] +atomic_valence[64] = [20] +atomic_valence[65] = [20] +atomic_valence[66] = [20] +atomic_valence[67] = [20] +atomic_valence[68] = [20] +atomic_valence[69] = [20] +atomic_valence[70] = [20] +atomic_valence[71] = [20] +atomic_valence[72] = [20] +atomic_valence[73] = [20] +atomic_valence[74] = [20] +atomic_valence[75] = [20] +atomic_valence[76] = [20] +atomic_valence[77] = [20] +atomic_valence[78] = [20] +atomic_valence[79] = [20] +atomic_valence[80] = [20] + +atomic_valence[89] = [20] +atomic_valence[90] = [20] +atomic_valence[91] = [20] +atomic_valence[92] = [20] +atomic_valence[93] = [20] +atomic_valence[94] = [20] +atomic_valence[95] = [20] +atomic_valence[96] = [20] +atomic_valence[97] = [20] +atomic_valence[98] = [20] +atomic_valence[99] = [20] +atomic_valence[100] = [20] +atomic_valence[101] = [20] 
+atomic_valence[102] = [20] +atomic_valence[103] = [20] + + +atomic_valence_electrons = {} +atomic_valence_electrons[1] = 1 +atomic_valence_electrons[5] = 3 +atomic_valence_electrons[6] = 4 +atomic_valence_electrons[7] = 5 +atomic_valence_electrons[8] = 6 +atomic_valence_electrons[9] = 7 +atomic_valence_electrons[13] = 3 +atomic_valence_electrons[14] = 4 +atomic_valence_electrons[15] = 5 +atomic_valence_electrons[16] = 6 +atomic_valence_electrons[17] = 7 +atomic_valence_electrons[18] = 8 +atomic_valence_electrons[32] = 4 +atomic_valence_electrons[33] = 5 +atomic_valence_electrons[35] = 7 +atomic_valence_electrons[34] = 6 +atomic_valence_electrons[52] = 6 +atomic_valence_electrons[53] = 7 + + +def str_atom(atom): + """Convert integer atom to string atom.""" + global __ATOM_LIST__ + atom = __ATOM_LIST__[atom - 1] + return atom + + +def int_atom(atom): + """Convert str atom to integer atom.""" + global __ATOM_LIST__ + # print(atom) + atom = atom.lower() + return __ATOM_LIST__.index(atom) + 1 + + +def get_UA(maxValence_list, valence_list): + """""" + UA = [] + DU = [] + for i, (maxValence, valence) in enumerate(zip(maxValence_list, valence_list)): + if not maxValence - valence > 0: + continue + UA.append(i) + DU.append(maxValence - valence) + return UA, DU + + +def get_BO(AC, UA, DU, valences, UA_pairs, use_graph=True): + """""" + BO = AC.copy() + DU_save = [] + + while DU_save != DU: + for i, j in UA_pairs: + BO[i, j] += 1 + BO[j, i] += 1 + + BO_valence = list(BO.sum(axis=1)) + DU_save = copy.copy(DU) + UA, DU = get_UA(valences, BO_valence) + UA_pairs = get_UA_pairs(UA, AC, DU, use_graph=use_graph)[0] + return BO + + +def valences_not_too_large(BO, valences): + """""" + number_of_bonds_list = BO.sum(axis=1) + for valence, number_of_bonds in zip(valences, number_of_bonds_list): + if number_of_bonds > valence: + return False + + return True + + +def charge_is_OK( + BO, + AC, + charge, + DU, + atomic_valence_electrons, + atoms, + valences, + allow_charged_fragments=True, 
+ allow_carbenes=True, +): + # total charge + Q = 0 + # charge fragment list + q_list = [] + + if allow_charged_fragments: + BO_valences = list(BO.sum(axis=1)) + for i, atom in enumerate(atoms): + q = get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[i]) + Q += q + if atom == 6: + number_of_single_bonds_to_C = list(BO[i, :]).count(1) + if not allow_carbenes and number_of_single_bonds_to_C == 2 and BO_valences[i] == 2: + print("found illegal carbene") + Q += 1 + q = 2 + if number_of_single_bonds_to_C == 3 and Q + 1 < charge: + Q += 2 + q = 1 + if q != 0: + q_list.append(q) + return charge == Q + + +def BO_is_OK( + BO, + AC, + charge, + DU, + atomic_valence_electrons, + atoms, + valences, + allow_charged_fragments=True, + allow_carbenes=True, +): + """Sanity of bond-orders. + + Args: + BO - + AC - + charge - + DU - + + + optional + allow_charges_fragments - + + + Returns: + boolean - true of molecule is OK, false if not + """ + if not valences_not_too_large(BO, valences): + return False + + check_sum = (BO - AC).sum() == sum(DU) + check_charge = charge_is_OK( + BO, + AC, + charge, + DU, + atomic_valence_electrons, + atoms, + valences, + allow_charged_fragments, + allow_carbenes=True, + ) + + if check_charge and check_sum: + return True + + return False + + +def get_atomic_charge(atom, atomic_valence_electrons, BO_valence): + """""" + if atom == 1: + charge = 1 - BO_valence + elif atom == 5: + charge = 3 - BO_valence + elif atom == 6 and BO_valence == 2: + charge = 0 + elif atom == 13: + charge = 3 - BO_valence + elif atom == 15 and BO_valence == 5: + charge = 0 + elif atom == 16 and BO_valence == 6: + charge = 0 + elif atom == 16 and BO_valence == 4: # testing for sulphur + charge = 0 + elif atom == 16 and BO_valence == 5: + charge = 1 + + else: + charge = atomic_valence_electrons - 8 + BO_valence + + return charge + + +def BO2mol( + mol, + BO_matrix, + atoms, + atomic_valence_electrons, + mol_charge, + allow_charged_fragments=True, + 
use_atom_maps=True, +): + """Based on code written by Paolo Toscani. + + From bond order, atoms, valence structure and total charge, generate an + rdkit molecule. + + Args: + mol - rdkit molecule + BO_matrix - bond order matrix of molecule + atoms - list of integer atomic symbols + atomic_valence_electrons - + mol_charge - total charge of molecule + + optional: + allow_charged_fragments - bool - allow charged fragments + + Returns: + mol - updated rdkit molecule with bond connectivity + """ + length_bo = len(BO_matrix) + length_atoms = len(atoms) + BO_valences = list(BO_matrix.sum(axis=1)) + + if length_bo != length_atoms: + raise RuntimeError( + "sizes of adjMat ({0:d}) and Atoms {1:d} differ".format(length_bo, length_atoms) + ) + + rwMol = Chem.RWMol(mol) + + bondTypeDict = { + 1: Chem.BondType.SINGLE, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE, + } + + for i in range(length_bo): + for j in range(i + 1, length_bo): + bo = int(round(BO_matrix[i, j])) + if bo == 0: + continue + bt = bondTypeDict.get(bo, Chem.BondType.SINGLE) + rwMol.RemoveBond(i, j) # added this for TMC procedure + rwMol.AddBond(i, j, bt) + + mol = rwMol.GetMol() + + if allow_charged_fragments: + mol = set_atomic_charges( + mol, + atoms, + atomic_valence_electrons, + BO_valences, + BO_matrix, + mol_charge, + use_atom_maps=use_atom_maps, + ) + else: + mol = set_atomic_radicals( + mol, + atoms, + atomic_valence_electrons, + BO_valences, + use_atom_maps=use_atom_maps, + ) + + Chem.SanitizeMol(mol) + + return mol + + +def set_atomic_charges( + mol, + atoms, + atomic_valence_electrons, + BO_valences, + BO_matrix, + mol_charge, + use_atom_maps=True, +): + """""" + q = 0 + for i, atom in enumerate(atoms): + a = mol.GetAtomWithIdx(i) + if use_atom_maps: + a.SetAtomMapNum(i + 1) + charge = get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[i]) + q += charge + if atom == 6: + number_of_single_bonds_to_C = list(BO_matrix[i, :]).count(1) + if BO_valences[i] == 2: + # q += 1 + 
a.SetNumRadicalElectrons(2) + charge = 0 + if number_of_single_bonds_to_C == 3 and q + 1 < mol_charge: + q += 2 + charge = 1 + + if abs(charge) > 0: + a.SetFormalCharge(int(charge)) + + # mol = clean_charges(mol) + + return mol + + +def set_atomic_radicals(mol, atoms, atomic_valence_electrons, BO_valences, use_atom_maps=True): + """The number of radical electrons = absolute atomic charge.""" + atomic_valence[8] = [2, 1] + atomic_valence[7] = [3, 2] + atomic_valence[6] = [4, 2] + + for i, atom in enumerate(atoms): + a = mol.GetAtomWithIdx(i) + if use_atom_maps: + a.SetAtomMapNum(i + 1) + charge = get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[i]) + + if abs(charge) > 0: + a.SetNumRadicalElectrons(abs(int(charge))) + + return mol + + +def get_bonds(UA, AC): + """""" + bonds = [] + + for k, i in enumerate(UA): + for j in UA[k + 1 :]: + if AC[i, j] == 1: + bonds.append(tuple(sorted([i, j]))) + + return bonds + + +def get_UA_pairs(UA, AC, DU, use_graph=True): + """""" + N_UA = 10000 + matching_ids = dict() + matching_ids2 = dict() + for i, du in zip(UA, DU): + if du > 1: + matching_ids[i] = N_UA + matching_ids2[N_UA] = i + N_UA += 1 + + bonds = get_bonds(UA, AC) + for i, j in bonds: + if i in matching_ids: + bonds.append(tuple(sorted([matching_ids[i], j]))) + + elif j in matching_ids: + bonds.append(tuple(sorted([i, matching_ids[j]]))) + + if len(bonds) == 0: + return [()] + + if use_graph: + G = nx.Graph() + G.add_edges_from(bonds) + UA_pairs = [list(nx.max_weight_matching(G))] + UA_pair = UA_pairs[0] + + remove_pairs = [] + add_pairs = [] + for i, j in UA_pair: + if i in matching_ids2 and j in matching_ids2: + remove_pairs.append(tuple([i, j])) + add_pairs.append(tuple([matching_ids2[i], matching_ids2[j]])) + # UA_pair.remove(tuple([i,j])) + # UA_pair.append(tuple([matching_ids2[i], matching_ids2[j]])) + elif i in matching_ids2: + # UA_pair.remove(tuple([i,j])) + remove_pairs.append(tuple([i, j])) + add_pairs.append(tuple([matching_ids2[i], j])) + 
# UA_pair.append(tuple([matching_ids2[i],j])) + elif j in matching_ids2: + remove_pairs.append(tuple([i, j])) + add_pairs.append(tuple([i, matching_ids2[j]])) + + # UA_pair.remove(tuple([i,j])) + # UA_pair.append(tuple([i,matching_ids2[j]])) + for p1, p2 in zip(remove_pairs, add_pairs): + UA_pair.remove(p1) + UA_pair.append(p2) + return [UA_pair] + + max_atoms_in_combo = 0 + UA_pairs = [()] + for combo in list(itertools.combinations(bonds, int(len(UA) / 2))): + flat_list = [item for sublist in combo for item in sublist] + atoms_in_combo = len(set(flat_list)) + if atoms_in_combo > max_atoms_in_combo: + max_atoms_in_combo = atoms_in_combo + UA_pairs = [combo] + + elif atoms_in_combo == max_atoms_in_combo: + UA_pairs.append(combo) + + return UA_pairs + + +def AC2BO(AC, atoms, charge, allow_charged_fragments=True, use_graph=True, allow_carbenes=True): + """Implemenation of algorithm shown in Figure 2. + + UA: unsaturated atoms + + DU: degree of unsaturation (u matrix in Figure) + + best_BO: Bcurr in Figure + """ + global atomic_valence + global atomic_valence_electrons + + # make a list of valences, e.g. for CO: [[4],[2,1]] + valences_list_of_lists = [] + AC_valence = list(AC.sum(axis=1)) + + for i, (atomicNum, valence) in enumerate(zip(atoms, AC_valence)): + # valence can't be smaller than number of neighbourgs + possible_valence = [x for x in atomic_valence[atomicNum] if x >= valence] + if atomicNum == 6 and valence == 1: + possible_valence.remove(2) + if atomicNum == 6 and not allow_carbenes and valence == 2: + possible_valence.remove(2) + if atomicNum == 6 and valence == 2: + possible_valence.append(3) + if atomicNum == 16 and valence == 1: + possible_valence = [1, 2] + + if not possible_valence: + print( + "Valence of atom", + i, + "is", + valence, + "which bigger than allowed max", + max(atomic_valence[atomicNum]), + ". 
Stopping", + ) + sys.exit() + valences_list_of_lists.append(possible_valence) + + # convert [[4],[2,1]] to [[4,2],[4,1]] + valences_list = itertools.product(*valences_list_of_lists) + + best_BO = AC.copy() + + O_valences = [ + v_list for v_list, atomicNum in zip(valences_list_of_lists, atoms) if atomicNum == 8 + ] + N_valences = [ + v_list for v_list, atomicNum in zip(valences_list_of_lists, atoms) if atomicNum == 7 + ] + C_valences = [ + v_list for v_list, atomicNum in zip(valences_list_of_lists, atoms) if atomicNum == 6 + ] + P_valences = [ + v_list for v_list, atomicNum in zip(valences_list_of_lists, atoms) if atomicNum == 15 + ] + S_valences = [ + v_list for v_list, atomicNum in zip(valences_list_of_lists, atoms) if atomicNum == 16 + ] + + O_sums = [] + for v_list in itertools.product(*O_valences): + O_sums.append(v_list) + # if sum(v_list) not in O_sums: + # O_sums.append(v_list)) + + N_sums = [] + for v_list in itertools.product(*N_valences): + N_sums.append(v_list) + # if sum(v_list) not in N_sums: + # N_sums.append(sum(v_list)) + + C_sums = [] + for v_list in itertools.product(*C_valences): + C_sums.append(v_list) + # if sum(v_list) not in C_sums: + # C_sums.append(sum(v_list)) + + P_sums = [] + for v_list in itertools.product(*P_valences): + P_sums.append(v_list) + + S_sums = [] + for v_list in itertools.product(*S_valences): + S_sums.append(v_list) + + order_dict = dict() + for i, v_list in enumerate(itertools.product(*[O_sums, N_sums, C_sums, P_sums, S_sums])): + order_dict[v_list] = i + + valence_order_list = [] + for valence_list in valences_list: + C_sum = [] + N_sum = [] + O_sum = [] + P_sum = [] + S_sum = [] + for v, atomicNum in zip(valence_list, atoms): + if atomicNum == 6: + C_sum.append(v) + if atomicNum == 7: + N_sum.append(v) + if atomicNum == 8: + O_sum.append(v) + if atomicNum == 15: + P_sum.append(v) + if atomicNum == 16: + S_sum.append(v) + + order_idx = order_dict[ + (tuple(O_sum), tuple(N_sum), tuple(C_sum), tuple(P_sum), tuple(S_sum)) + 
] + valence_order_list.append(order_idx) + + sorted_valences_list = [ + y + for x, y in sorted( + zip(valence_order_list, list(itertools.product(*valences_list_of_lists))) + ) + ] + + for valences in sorted_valences_list: # valences_list: + UA, DU_from_AC = get_UA(valences, AC_valence) + check_len = len(UA) == 0 + if check_len: + check_bo = BO_is_OK( + AC, + AC, + charge, + DU_from_AC, + atomic_valence_electrons, + atoms, + valences, + allow_charged_fragments=allow_charged_fragments, + allow_carbenes=allow_carbenes, + ) + else: + check_bo = None + + if check_len and check_bo: + return AC, atomic_valence_electrons + + UA_pairs_list = get_UA_pairs(UA, AC, DU_from_AC, use_graph=use_graph) + for UA_pairs in UA_pairs_list: + BO = get_BO(AC, UA, DU_from_AC, valences, UA_pairs, use_graph=use_graph) + status = BO_is_OK( + BO, + AC, + charge, + DU_from_AC, + atomic_valence_electrons, + atoms, + valences, + allow_charged_fragments=allow_charged_fragments, + allow_carbenes=allow_carbenes, + ) + charge_OK = charge_is_OK( + BO, + AC, + charge, + DU_from_AC, + atomic_valence_electrons, + atoms, + valences, + allow_charged_fragments=allow_charged_fragments, + allow_carbenes=allow_carbenes, + ) + + if status: + return BO, atomic_valence_electrons + elif BO.sum() >= best_BO.sum() and valences_not_too_large(BO, valences) and charge_OK: + best_BO = BO.copy() + + return best_BO, atomic_valence_electrons + + +def AC2mol( + mol, + AC, + atoms, + charge, + allow_charged_fragments=True, + use_graph=True, + use_atom_maps=True, + allow_carbenes=True, +): + """""" + + # convert AC matrix to bond order (BO) matrix + BO, atomic_valence_electrons = AC2BO( + AC, + atoms, + charge, + allow_charged_fragments=allow_charged_fragments, + use_graph=use_graph, + allow_carbenes=allow_carbenes, + ) + # add BO connectivity and charge info to mol object + mol = BO2mol( + mol, + BO, + atoms, + atomic_valence_electrons, + charge, + allow_charged_fragments=allow_charged_fragments, + 
use_atom_maps=use_atom_maps, + ) + + # print(Chem.GetFormalCharge(mol), charge) + # If charge is not correct don't return mol + if Chem.GetFormalCharge(mol) != charge: + return None + + # BO2mol returns an arbitrary resonance form. Let's make the rest + + # mols = rdchem.ResonanceMolSupplier(mol) + # mols = [mol for mol in mols] + # print(mols) + + return mol + + +def get_proto_mol(atoms): + """""" + mol = Chem.MolFromSmarts("[#" + str(atoms[0]) + "]") + rwMol = Chem.RWMol(mol) + for i in range(1, len(atoms)): + a = Chem.Atom(atoms[i]) + rwMol.AddAtom(a) + + mol = rwMol.GetMol() + + return mol + + +def read_xyz_file(filename, look_for_charge=True): + """""" + atomic_symbols = [] + xyz_coordinates = [] + charge = 0 + + with open(filename, "r") as file: + for line_number, line in enumerate(file): + if line_number == 0: + int(line) + elif line_number == 1: + if "charge=" in line: + charge = int(line.split("=")[1]) + else: + atomic_symbol, x, y, z = line.split() + atomic_symbols.append(atomic_symbol) + xyz_coordinates.append([float(x), float(y), float(z)]) + + atoms = [int_atom(atom) for atom in atomic_symbols] + + return atoms, charge, xyz_coordinates + + +def xyz2AC(atoms, xyz, charge, use_huckel=False, use_obabel=False): + """Atoms and coordinates to atom connectivity (AC) + + Args: + atoms - int atom types + xyz - coordinates + charge - molecule charge + + optional: + use_huckel - Use Huckel method for atom connecitivty + use_obabel - Use Opne Babel method for atom connectivity + + Returns: + ac - atom connectivity matrix + mol - rdkit molecule + """ + if use_huckel: + return xyz2AC_huckel(atoms, xyz, charge) + elif use_obabel: + return xyz2AC_obabel(atoms, xyz) + else: + return xyz2AC_vdW(atoms, xyz) + + +def xyz2AC_vdW(atoms, xyz): + # Get mol template + mol = get_proto_mol(atoms) + + # Set coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for i in range(mol.GetNumAtoms()): + conf.SetAtomPosition(i, (xyz[i][0], xyz[i][1], xyz[i][2])) + 
mol.AddConformer(conf) + + AC = get_AC(mol) + + return AC, mol + + +def get_AC(mol, covalent_factor=1.3): + """Generate adjacent matrix from atoms and coordinates. + + AC is a (num_atoms, num_atoms) matrix with 1 being covalent bond and 0 is not + + + covalent_factor - 1.3 is an arbitrary factor + + Args: + mol - rdkit molobj with 3D conformer + + optional + covalent_factor - increase covalent bond length threshold with facto + + Returns: + AC - adjacent matrix + """ + # Calculate distance matrix + dMat = Chem.Get3DDistanceMatrix(mol) + + pt = Chem.GetPeriodicTable() + num_atoms = mol.GetNumAtoms() + AC = np.zeros((num_atoms, num_atoms), dtype=int) + + for i in range(num_atoms): + a_i = mol.GetAtomWithIdx(i) + Rcov_i = pt.GetRcovalent(a_i.GetAtomicNum()) * covalent_factor + for j in range(i + 1, num_atoms): + a_j = mol.GetAtomWithIdx(j) + Rcov_j = pt.GetRcovalent(a_j.GetAtomicNum()) * covalent_factor + if dMat[i, j] <= Rcov_i + Rcov_j: + AC[i, j] = 1 + AC[j, i] = 1 + + return AC + + +def xyz2AC_huckel(atomicNumList, xyz, charge): + """Args. 
+ + atomicNumList - atom type list + xyz - coordinates + charge - molecule charge + + Returns: + ac - atom connectivity + mol - rdkit molecule + """ + # print(charge) + mol = get_proto_mol(atomicNumList) + + conf = Chem.Conformer(mol.GetNumAtoms()) + for i in range(mol.GetNumAtoms()): + conf.SetAtomPosition(i, (xyz[i][0], xyz[i][1], xyz[i][2])) + mol.AddConformer(conf) + + num_atoms = len(atomicNumList) + AC = np.zeros((num_atoms, num_atoms)).astype(int) + + mol_huckel = Chem.Mol(mol) + mol_huckel.GetAtomWithIdx(0).SetFormalCharge(charge) # mol charge arbitrarily added to 1st atom + + passed, result = rdEHTTools.RunMol(mol_huckel) + opop = result.GetReducedOverlapPopulationMatrix() + tri = np.zeros((num_atoms, num_atoms)) + tri[np.tril(np.ones((num_atoms, num_atoms), dtype=bool))] = ( + opop # lower triangular to square matrix + ) + for i in range(num_atoms): + for j in range(i + 1, num_atoms): + pair_pop = abs(tri[j, i]) + if pair_pop >= 0.2: # arbitry cutoff for bond. May need adjustment + AC[i, j] = 1 + AC[j, i] = 1 + + dMat = Chem.Get3DDistanceMatrix(mol) + pt = Chem.GetPeriodicTable() + + # filter adjacency matrix if max valence is exceeded + for i in range(num_atoms): + a_i = mol.GetAtomWithIdx(i) + N_con = np.sum(AC[i, :]) + # print(a_i.GetAtomicNum(), N_con) + while N_con > max(atomic_valence[a_i.GetAtomicNum()]): + # print("removing longest bond") + AC = remove_weakest_bond(mol, i, AC, dMat, pt) + N_con = np.sum(AC[i, :]) + + return AC, mol + + +def remove_weakest_bond(mol, atom_idx, AC, dMat, pt): + extra_bond_lengths = [] + bond_atoms = np.nonzero(AC[atom_idx, :])[0] + # print(bond_atoms) + a_i = mol.GetAtomWithIdx(atom_idx) + # print(a_i.GetAtomicNum()) + rcovi = pt.GetRcovalent(a_i.GetAtomicNum()) + for j in bond_atoms: + # print(j) + a_j = mol.GetAtomWithIdx(int(j)) + # print(a_j.GetAtomicNum()) + rcovj = pt.GetRcovalent(a_j.GetAtomicNum()) + extra_bond_length = dMat[atom_idx, j] - rcovj - rcovi + extra_bond_lengths.append(extra_bond_length) + + 
longest_bond_index = bond_atoms[np.argmax(extra_bond_lengths)] + AC[atom_idx, longest_bond_index] = 0 + AC[longest_bond_index, atom_idx] = 0 + + return AC + + +def xyz2AC_obabel(atoms, xyz, tolerance=0.45): + """Generate adjacent matrix from atoms and coordinates in a way similar to + open babels. + + AC is a (num_atoms, num_atoms) matrix with 1 being covalent bond and 0 is not + + + tolerance - 0.45Å is from the open babel paper + + Args: + mol - rdkit molobj with 3D conformer + + optional + tolerance - atoms connected if distance is shorter than sum of atomic + radii + tolerance. If too many bonds to an atom; break longest bond + + Returns: + AC - adjacency matrix + """ + global atomic_valence + # atomic_valence[8] = [2,1] + # atomic_valence[7] = [3,2] + atomic_valence[6] = [4, 2] + + # Get mol template + mol = get_proto_mol(atoms) + + # Set coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for i in range(mol.GetNumAtoms()): + conf.SetAtomPosition(i, (xyz[i][0], xyz[i][1], xyz[i][2])) + mol.AddConformer(conf) + # Calculate distance matrix + dMat = Chem.Get3DDistanceMatrix(mol) + + pt = Chem.GetPeriodicTable() + num_atoms = mol.GetNumAtoms() + AC = np.zeros((num_atoms, num_atoms), dtype=int) + + for i in range(num_atoms): + a_i = mol.GetAtomWithIdx(i) + Rcov_i = pt.GetRcovalent(a_i.GetAtomicNum()) + for j in range(i + 1, num_atoms): + a_j = mol.GetAtomWithIdx(j) + Rcov_j = pt.GetRcovalent(a_j.GetAtomicNum()) + if dMat[i, j] <= Rcov_i + Rcov_j + tolerance: + AC[i, j] = 1 + AC[j, i] = 1 + + # filter adjacency matrix if max valence is exceeded + for i in range(num_atoms): + a_i = mol.GetAtomWithIdx(i) + N_con = np.sum(AC[i, :]) + while N_con > max(atomic_valence[a_i.GetAtomicNum()]): + # print("removing longest bond") + AC = remove_weakest_bond(mol, i, AC, dMat, pt) + N_con = np.sum(AC[i, :]) + + # print(Chem.MolToSmiles(mol)) + + return AC, mol + + +def chiral_stereo_check(mol): + """Find and embed chiral information into the model based on the + coordinates. 
if __name__ == "__main__":
    # Command-line entry point: read an xyz file, perceive bonding with
    # xyz2mol and print the result as an SDF mol block or canonical SMILES.
    import argparse

    parser = argparse.ArgumentParser(usage="%(prog)s [options] molecule.xyz")
    parser.add_argument("structure", metavar="structure", type=str)
    parser.add_argument("-s", "--sdf", action="store_true", help="Dump sdf file")
    parser.add_argument("--ignore-chiral", action="store_true", help="Ignore chiral centers")
    parser.add_argument(
        "--no-charged-fragments", action="store_true", help="Allow radicals to be made"
    )
    parser.add_argument(
        "--no-graph",
        action="store_true",
        help="Run xyz2mol without networkx dependencies",
    )

    # huckel uses extended Huckel bond orders to locate bonds (requires RDKit 2019.9.1 or later)
    # otherwise van der Waals radii are used
    parser.add_argument(
        "--use-huckel",
        action="store_true",
        help="Use Huckel method for atom connectivity",
    )
    parser.add_argument(
        "--use-obabel",
        action="store_true",
        help="Use Open Babel way of obtaining atom connectivity; recommended for radicals",
    )
    parser.add_argument(
        "-o",
        "--output-format",
        action="store",
        type=str,
        # The help text used to claim sdf was the default; anything other
        # than "sdf" (including no value) has always selected SMILES.
        help="Output format [smiles,sdf] (default=smiles)",
    )
    parser.add_argument(
        "-c",
        "--charge",
        action="store",
        metavar="int",
        type=int,
        help="Total charge of the system",
    )
    parser.add_argument(
        "--use-atom-maps",
        action="store_true",
        help="Label atoms with map numbers according to their order in the .xyz file",
    )

    args = parser.parse_args()

    # Allow charged fragments; otherwise radicals are made instead.
    charged_fragments = not args.no_charged_fragments

    # The graph-based path is faster for large systems but needs networkx.
    quick = not args.no_graph

    embed_chiral = not args.ignore_chiral

    # Read atoms and coordinates; the reader also tries to find the charge.
    atoms, charge, xyz_coordinates = read_xyz_file(args.structure)

    # An explicit charge on the command line overrides the one from the file.
    if args.charge is not None:
        charge = int(args.charge)

    if not charged_fragments:
        atomic_valence[8] = [2, 1]
        atomic_valence[7] = [3, 2]
        atomic_valence[6] = [4, 2]

    mol = xyz2mol(
        atoms,
        xyz_coordinates,
        charge=charge,
        use_graph=quick,
        allow_charged_fragments=charged_fragments,
        embed_chiral=embed_chiral,
        use_huckel=args.use_huckel,
        use_obabel=args.use_obabel,
        use_atom_maps=args.use_atom_maps,
    )

    # -s/--sdf was parsed but never consulted; treat it like "-o sdf".
    if args.sdf or args.output_format == "sdf":
        print(Chem.MolToMolBlock(mol))
    else:
        smiles = Chem.MolToSmiles(mol, isomericSmiles=embed_chiral)
        # The canonical SMILES was computed and then dropped without being
        # written anywhere, so this branch produced no output at all.
        print(canonicalize_smiles(smiles))
"Y","Zr","Nb","Mo","Tc","Ru","Rh","Pd","Ag","Cd","Lu", + "Hf","Ta","W","Re","Os","Ir","Pt","Au","Hg", +] + +TRANSITION_METALS_NUM = [21,22,23,24,25,26,27,57,28,29,30,39,40,41, + 42,43,44,45,46,47,48,71,72,73,74,75,76,77,78,79,80, +] + + +ALLOWED_OXIDATION_STATES = { + "Sc": [3], + "Ti": [3, 4], + "V": [2, 3, 4, 5], + "Cr": [2, 3, 4, 6], + "Mn": [2, 3, 4, 6, 7], + "Fe": [2, 3], + "Co": [2, 3], + "Ni": [2], + "Cu": [1, 2], + "Zn": [2], + "Y": [3], + "Zr": [4], + "Nb": [3, 4, 5], + "Mo": [2, 3, 4, 5, 6], + "Tc": [2, 3, 4, 5, 6, 7], + "Ru": [2, 3, 4, 5, 6, 7, 8], + "Rh": [1, 3], + "Pd": [2, 4], + "Ag": [1], + "Cd": [2], + "La": [3], + "Hf": [4], + "Ta": [3, 4, 5], + "W": [2, 3, 4, 5, 6], + "Re": [2, 3, 4, 5, 6, 7], + "Os": [3, 4, 5, 6, 7, 8], + "Ir": [1, 3], + "Pt": [2, 4], + "Au": [1, 3], + "Hg": [1, 2], +} +# fmt: on + +logger = logging.getLogger(__name__) + +params = Chem.MolStandardize.rdMolStandardize.MetalDisconnectorOptions() +params.splitAromaticC = True +params.splitGrignards = True +params.adjustCharges = False + +MetalNon_Hg = "[#3,#11,#12,#19,#13,#21,#22,#23,#24,#25,#26,#27,#28,#29,#30,#39,#40,#41,#42,#43,#44,#45,#46,#47,#48,#57,#72,#73,#74,#75,#76,#77,#78,#79,#80]~[B,#6,#14,#15,#33,#51,#16,#34,#52,Cl,Br,I,#85]" + +pt = GetPeriodicTable + +global atomic_valence_electrons + +atomic_valence_electrons = {} +atomic_valence_electrons[1] = 1 +atomic_valence_electrons[5] = 3 +atomic_valence_electrons[6] = 4 +atomic_valence_electrons[7] = 5 +atomic_valence_electrons[8] = 6 +atomic_valence_electrons[9] = 7 +atomic_valence_electrons[13] = 3 +atomic_valence_electrons[14] = 4 +atomic_valence_electrons[15] = 5 +atomic_valence_electrons[16] = 6 +atomic_valence_electrons[17] = 7 +atomic_valence_electrons[18] = 8 +atomic_valence_electrons[32] = 4 +atomic_valence_electrons[33] = 5 # As +atomic_valence_electrons[35] = 7 +atomic_valence_electrons[34] = 6 +atomic_valence_electrons[53] = 7 + +# TMs +atomic_valence_electrons[21] = 3 # Sc +atomic_valence_electrons[22] = 4 # Ti 
def shell(cmd, shell=False):
    """Run a command and return its captured standard output.

    Args:
        cmd: the command line as a string. When ``shell`` is false the string
            is tokenised with shell-like quoting rules, so quoted arguments
            containing spaces stay together; a ready-made argv sequence is
            accepted as well.
        shell: when true, ``cmd`` is passed unchanged to the system shell.

    Returns:
        The bytes written to standard output. Standard error is captured and
        discarded, and the exit status is not checked.
    """
    import shlex  # local import so the block does not depend on file-level imports

    if shell:
        # NOTE(security): the string is interpreted by the shell; never pass
        # untrusted input on this path.
        argv = cmd
    elif isinstance(cmd, str):
        # str.split() would tear apart quoted arguments such as "a b".
        argv = shlex.split(cmd)
    else:
        argv = list(cmd)

    completed = subprocess.run(
        argv,
        shell=shell,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
    )
    return completed.stdout
+ """ + m = Chem.MolFromSmiles(smiles) + emol = Chem.RWMol(m) + patt = Chem.MolFromSmarts( + "[#8-]-[#7+0]-[#8-].[#21,#22,#23,#24,#25,#26,#27,#28,#29,#30,#39,#40,#41,#42,#43,#44,#45,#46,#47,#48,#57,#72,#73,#74,#75,#76,#77,#78,#79,#80]" + ) + matches = emol.GetSubstructMatches(patt) + for a1, a2, a3, a4 in matches: + if not emol.GetBondBetweenAtoms(a1, a4) and not emol.GetBondBetweenAtoms(a3, a4): + tm = emol.GetAtomWithIdx(a4) + o1 = emol.GetAtomWithIdx(a1) + n = emol.GetAtomWithIdx(a2) + tm_charge = tm.GetFormalCharge() + new_charge = tm_charge - 2 + tm.SetFormalCharge(new_charge) + n.SetFormalCharge(+1) + o1.SetFormalCharge(0) + emol.RemoveBond(a1, a2) + emol.AddBond(a1, a2, rdchem.BondType.DOUBLE) + + mol = emol.GetMol() + Chem.SanitizeMol(mol) + return Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(mol))) + + +def fix_equivalent_Os(smiles): + """Localizes and fixes where a neutral atom is coordinating to the metal + but connected ro a negatively charged atom through resonane. + + The charge is moved to the coordinating atom and charges fixed + accordingly. 
def get_proposed_ligand_charge(ligand_mol, cutoff=-10):
    """Suggest a formal charge for a ligand from an extended Hückel run.

    Orbitals with an energy below ``cutoff`` are counted as doubly occupied
    and the charge is the number of valence electrons minus the electrons
    placed in those orbitals. While the charge is >= 1 and the perceived LUMO
    lies below -9, two more electrons are added; while the charge is < -1 and
    the perceived HOMO lies above -10.2, two electrons are removed.

    Args:
        ligand_mol: RDKit molecule accepted by ``rdEHTTools.RunMol``
        cutoff: orbital-energy threshold below which an orbital is treated
            as occupied (the original documentation gives this as -10 eV)

    Returns:
        The suggested integer charge.

    Raises:
        KeyError: if the ligand contains an element that has no entry in
            ``atomic_valence_electrons``.
    """
    valence_electrons = sum(
        atomic_valence_electrons[atom.GetAtomicNum()] for atom in ligand_mol.GetAtoms()
    )

    # A single eHT run suffices; the identical calculation used to be
    # executed twice, with the first result thrown away.
    passed, result = rdEHTTools.RunMol(ligand_mol)
    if not passed:
        logger.warning("Extended Hückel calculation reported failure for ligand")

    energies = result.GetOrbitalEnergies()
    n_orbitals = len(energies)

    def _energy_at(index):
        """Return the orbital energy at ``index``, or NaN when out of range."""
        # NaN fails both loop comparisons below, so running off either end
        # of the orbital list stops the adjustment instead of raising
        # IndexError or wrapping around to the other end of the list.
        if 0 <= index < n_orbitals:
            return energies[index]
        return np.nan

    n_occupied = sum(1 for energy in energies if energy < cutoff)
    charge = valence_electrons - 2 * n_occupied
    perceived_homo = _energy_at(n_occupied - 1)
    perceived_lumo = _energy_at(n_occupied)

    while charge >= 1 and perceived_lumo < -9:
        n_occupied += 1
        charge -= 2
        # %-style placeholders are required: extra positional arguments
        # without them make the logging module report a formatting error.
        logger.debug("added two more electrons: %s %s", charge, perceived_lumo)
        perceived_lumo = _energy_at(n_occupied)

    while charge < -1 and perceived_homo > -10.2:
        n_occupied -= 1
        charge += 2
        logger.debug("removed two electrons: %s %s", charge, perceived_homo)
        perceived_homo = _energy_at(n_occupied - 1)

    return charge
def lig_checks(lig_mol, coordinating_atoms):
    """Summarise every resonance form of a proposed ligand.

    The resonance forms of ``lig_mol`` are enumerated; if none are produced,
    the enumeration is repeated with incomplete octets allowed. For each form
    three numbers are collected so the caller can rank the forms:

    - atoms carrying a positive formal charge,
    - atoms carrying a negative formal charge that are *not* listed in
      ``coordinating_atoms``,
    - aromatic atoms.

    The wider design ideas noted for this step (neighbouring coordinating
    atoms joined by pi/aromatic/conjugated bonds, proposing a new charge from
    partial charges) are not carried out here; only the counts are returned.

    Args:
        lig_mol: RDKit molecule of the ligand
        coordinating_atoms: atom indices (within the ligand) bound to the metal

    Returns:
        A list of ``(res_mol, n_positive, n_negative, n_aromatic)`` tuples,
        one per resonance form, in enumeration order.
    """
    forms = rdchem.ResonanceMolSupplier(lig_mol)
    if len(forms) == 0:
        forms = rdchem.ResonanceMolSupplier(lig_mol, flags=Chem.ALLOW_INCOMPLETE_OCTETS)

    summary = []
    for form in forms:
        n_positive = 0
        n_negative = 0
        n_aromatic = 0
        for atom in form.GetAtoms():
            if atom.GetIsAromatic():
                n_aromatic += 1
            formal_charge = atom.GetFormalCharge()
            if formal_charge > 0:
                n_positive += 1
            elif formal_charge < 0 and atom.GetIdx() not in coordinating_atoms:
                # Negative charge on a coordinating atom is expected and
                # therefore not counted.
                n_negative += 1
        summary.append((form, n_positive, n_negative, n_aromatic))

    return summary
+ """ + atoms = [a.GetAtomicNum() for a in mol.GetAtoms()] + AC = Chem.rdmolops.GetAdjacencyMatrix(mol) + lig_mol = AC2mol(mol, AC, atoms, charge, allow_charged_fragments=True, use_atom_maps=False) + if not lig_mol and charge >= 0: + charge += -2 + lig_mol = AC2mol(mol, AC, atoms, charge, allow_charged_fragments=True, use_atom_maps=False) + if not lig_mol: + return None, charge + if not lig_mol and charge < 0: + charge += 2 + lig_mol = AC2mol(mol, AC, atoms, charge, allow_charged_fragments=True, use_atom_maps=False) + if not lig_mol: + charge += -4 + lig_mol = AC2mol( + mol, + AC, + atoms, + charge, + allow_charged_fragments=True, + use_atom_maps=False, + ) + if not lig_mol: + return None, charge + + possible_res_mols = lig_checks(lig_mol, coordinating_atoms) + best_res_mol, lowest_pos, lowest_neg, highest_aromatic = possible_res_mols[0] + for res_mol, N_pos_atoms, N_neg_atoms, N_aromatic in possible_res_mols: + if N_aromatic > highest_aromatic: + best_res_mol, lowest_pos, lowest_neg, highest_aromatic = ( + res_mol, + N_pos_atoms, + N_neg_atoms, + N_aromatic, + ) + if N_aromatic == highest_aromatic and N_pos_atoms + N_neg_atoms < lowest_pos + lowest_neg: + best_res_mol, lowest_pos, lowest_neg = res_mol, N_pos_atoms, N_neg_atoms + if lowest_pos + lowest_neg == 0: + return best_res_mol, charge + + lig_mol_no_carbene = AC2mol( + mol, + AC, + atoms, + charge, + allow_charged_fragments=True, + use_atom_maps=False, + allow_carbenes=False, + ) + allow_carbenes = True + + if lig_mol_no_carbene: + res_mols_no_carbenes = lig_checks(lig_mol_no_carbene, coordinating_atoms) + for res_mol, N_pos_atoms, N_neg_atoms, N_aromatic in res_mols_no_carbenes: + if ( + N_aromatic > highest_aromatic + and N_pos_atoms + N_neg_atoms <= lowest_pos + lowest_neg + ): + best_res_mol, lowest_pos, lowest_neg, highest_aromatic = ( + res_mol, + N_pos_atoms, + N_neg_atoms, + N_aromatic, + ) + if ( + N_aromatic == highest_aromatic + and N_pos_atoms + N_neg_atoms < lowest_pos + lowest_neg + ): + 
def get_tmc_mol(xyz_file, overall_charge, with_stereo=False):
    """Get TMC mol object from given xyz file.

    The single-bonded molecule from ``get_basic_mol`` is split into the metal
    fragment and ligand fragments. Each ligand gets a proposed charge and a
    sanitizable structure, the metal receives the remaining charge, and the
    pieces are recombined with dative bonds from the coordinating atoms to
    the metal.

    Args:
        xyz_file (str) : Path to TMC xyz file
        overall_charge (int): Overall charge of TMC
        with_stereo (bool): Whether to perceive stereochemistry from the 3D data

    Returns:
        tmc_mol (rdkit.Chem.rdchem.Mol): TMC mol object, or ``None`` when no
        structure could be built for one of the ligands.

    Raises:
        ValueError: if no transition metal is found in the input.
    """
    mol = get_basic_mol(xyz_file, overall_charge)

    # Tag every atom with its input index so it can be tracked through
    # fragmentation and recombination; remember the (last) metal atom.
    tmc_idx = None
    for a in mol.GetAtoms():
        a.SetIntProp("__origIdx", a.GetIdx())
        if a.GetAtomicNum() in TRANSITION_METALS_NUM:
            tmc_idx = a.GetIdx()

    if tmc_idx is None:
        raise ValueError("Found no TM in the input file. Please supply an xyz file with a TM")

    # Original indices of the atoms bonded to the metal. A set of plain ints
    # keeps the repeated membership tests below cheap and unambiguous.
    metal_row = Chem.rdmolops.GetAdjacencyMatrix(mol)[tmc_idx, :]
    coordinating_atoms = frozenset(np.nonzero(metal_row)[0].tolist())

    mdis = rdMolStandardize.MetalDisconnector(params)
    mdis.SetMetalNon(Chem.MolFromSmarts(MetalNon_Hg))
    frags = mdis.Disconnect(mol)
    frag_mols = rdmolops.GetMolFrags(frags, asMols=True)

    total_lig_charge = 0
    tm_idx = None
    lig_list = []
    for frag_pos, frag in enumerate(frag_mols):
        m = Chem.Mol(frag)
        if any(atom.GetAtomicNum() in TRANSITION_METALS_NUM for atom in m.GetAtoms()):
            # Metal-containing fragment: remember it, nothing to assign here.
            tm_idx = frag_pos
            continue

        lig_charge = get_proposed_ligand_charge(frag)
        lig_coordinating_atoms = [
            a.GetIdx() for a in m.GetAtoms() if a.GetIntProp("__origIdx") in coordinating_atoms
        ]
        lig_mol, lig_charge = get_lig_mol(m, lig_charge, lig_coordinating_atoms)
        if not lig_mol:
            return None

        total_lig_charge += lig_charge
        lig_list.append(lig_mol)

    if tm_idx is None:
        raise ValueError("Found no TM in the input file. Please supply an xyz file with a TM")

    # Whatever charge the ligands do not account for goes on the metal.
    tm = Chem.RWMol(frag_mols[tm_idx])
    tm_ox = overall_charge - total_lig_charge
    for a in tm.GetAtoms():
        if a.GetAtomicNum() in TRANSITION_METALS_NUM:
            a.SetFormalCharge(tm_ox)

    for lmol in lig_list:
        tm = Chem.CombineMols(tm, lmol)

    emol = Chem.RWMol(tm)
    coordinating_atoms_idx = [
        a.GetIdx() for a in emol.GetAtoms() if a.GetIntProp("__origIdx") in coordinating_atoms
    ]
    tm_idx = [a.GetIdx() for a in emol.GetAtoms() if a.GetIntProp("__origIdx") == tmc_idx][0]
    dMat = Chem.Get3DDistanceMatrix(emol)

    # For bonded pairs of coordinating atoms whose metal distances differ by
    # 0.4 or more, drop the farther atom. combinations() snapshots its input
    # when created, so removing from the list inside this loop is safe.
    cut_atoms = []
    for i, j in combinations(coordinating_atoms_idx, 2):
        bond = emol.GetBondBetweenAtoms(int(i), int(j))
        if bond and abs(dMat[i, tm_idx] - dMat[j, tm_idx]) >= 0.4:
            logger.debug(
                "Haptic bond pattern with too great distance: %s %s",
                dMat[i, tm_idx],
                dMat[j, tm_idx],
            )
            if dMat[i, tm_idx] > dMat[j, tm_idx] and i in coordinating_atoms_idx:
                coordinating_atoms_idx.remove(i)
                cut_atoms.append(i)
            if dMat[j, tm_idx] > dMat[i, tm_idx] and j in coordinating_atoms_idx:
                coordinating_atoms_idx.remove(j)
                cut_atoms.append(j)

    # Also drop neighbours of a cut atom that are about as far from the metal
    # as the cut atom itself. Iterate over a copy: removing from the list
    # being iterated made the loop skip the element after each removal.
    for j in cut_atoms:
        for i in list(coordinating_atoms_idx):
            bond = emol.GetBondBetweenAtoms(int(i), int(j))
            if bond and dMat[i, tm_idx] - dMat[j, tm_idx] >= -0.1:
                coordinating_atoms_idx.remove(i)

    for i in coordinating_atoms_idx:
        if emol.GetBondBetweenAtoms(i, tm_idx):
            continue
        emol.AddBond(i, tm_idx, Chem.BondType.DATIVE)

    smiles = Chem.MolToSmiles(emol.GetMol())

    # Fix specific cases
    smiles = fix_equivalent_Os(smiles)
    smiles = fix_NO2(smiles)

    tmc_mol = Chem.MolFromSmiles(smiles)
    Chem.SanitizeMol(tmc_mol)
    if with_stereo:
        # NOTE(review): tmc_mol is rebuilt from SMILES just above; confirm it
        # still carries the 3D information the stereo check relies on.
        chiral_stereo_check(tmc_mol)
    return tmc_mol
xyz file", required=True) + parser.add_argument( + "--charge", + type=int, + help="The overall charge of the TMC", + required=True, + ) + parser.add_argument( + "--log_level", + type=str, + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL", "DISABLE"], + default="INFO", + help="Set the logging level", + ) + + # Parse arguments + args = parser.parse_args() + + if args.log_level == "DISABLE": + logging.disable(logging.CRITICAL) + else: + # logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") + logging.basicConfig(format="") + logger.setLevel(getattr(logging, args.log_level)) + + # Stop the function if it runs too long. + def timeout_handler(num, stack): + print("Received SIGALRM, terminating") + raise Exception("Timeout") + + signal.signal(signal.SIGALRM, timeout_handler) + + # Set timeout length + signal.alarm(300) + + tmc_mol = get_tmc_mol(args.xyz_file, args.charge, with_stereo=False) + + smiles = Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(tmc_mol))) + with open(args.xyz_file.stem + ".txt", "w") as _f: + _f.write(smiles) + + logger.info(f"Output SMILES: {Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(tmc_mol)))}") diff --git a/tests/data/Ce_18-crown-6.xyz b/tests/data/Ce_18-crown-6.xyz new file mode 100644 index 0000000..1c54fe2 --- /dev/null +++ b/tests/data/Ce_18-crown-6.xyz @@ -0,0 +1,45 @@ +43 +name: Cerium 18-crown-6; charge: 1; multiplicity: 2; SMILES: [Ce+].O1CCOCCOCCOCCOCCOCC1; +Ce -0.0001437629 0.0000290711 0.0000314625 +C 1.4048527159 3.0831062543 0.0653267372 +C 2.6113949649 2.1594927572 -0.0628305583 +O 2.3364127712 0.9762045664 0.6796211324 +C 3.3790299350 0.0145917128 0.6249728354 +C 3.2625978458 -0.8828393385 -0.6227434392 +O 2.0076635562 -1.5436768644 -0.6792740943 +C 1.9681497053 -2.7587517478 0.0616929905 +C 0.5643899530 -3.3400190761 -0.0677399790 +O -0.3215291148 -2.5117724159 0.6780336161 +C -1.6756881842 -2.9338484301 0.6253399417 +C -2.3970077854 -2.3838388290 -0.6208087612 +O -2.3418075195 
-0.9666005156 -0.6769634836 +C -3.3731892235 -0.3249356490 0.0656406480 +C -3.1761313683 1.1814423159 -0.0653740392 +O -2.0144629626 1.5360306187 0.6774313376 +C -1.7030340553 2.9197181486 0.6212067738 +C -0.8651454423 3.2658163879 -0.6253532148 +O 0.3348960861 2.5095955647 -0.6784715153 +H 1.6491656373 4.0734787978 -0.3444656203 +H 1.1434843064 3.1779011677 1.1298840469 +H 3.5034900310 2.6532982443 0.3480426379 +H 2.7718002915 1.9328905179 -1.1274838434 +H 4.3592969145 0.5071368800 0.6396448791 +H 3.2688144356 -0.5894180426 1.5328846941 +H 4.0830713955 -1.6110827570 -0.6358339519 +H 3.3131552181 -0.2710708397 -1.5307155666 +H 2.1811391534 -2.5824596549 1.1266423186 +H 2.7029716434 -3.4653050821 -0.3497729030 +H 0.2874989849 -3.3616521521 -1.1323910389 +H 0.5451459869 -4.3608790628 0.3395993030 +H -1.7392152624 -4.0290779383 0.6395520384 +H -2.1420891436 -2.5368647876 1.5342543479 +H -3.4379855058 -2.7301794677 -0.6320427240 +H -1.8942587619 -2.7332851743 -1.5298434253 +H -4.3531507066 -0.6089866202 -0.3435128848 +H -3.3246239002 -0.5965978032 1.1307214780 +H -3.0590751031 1.4313369996 -1.1304691014 +H -4.0500838974 1.7081588386 0.3435929443 +H -2.6198108249 3.5223254795 0.6330113863 +H -1.1267938141 3.1278303619 1.5299879097 +H -0.6448077892 4.3405147059 -0.6396316232 +H -1.4183809242 3.0022034937 -1.5340285941 diff --git a/tests/data/PdCl2MeCN2.xyz b/tests/data/PdCl2MeCN2.xyz new file mode 100644 index 0000000..939d2a6 --- /dev/null +++ b/tests/data/PdCl2MeCN2.xyz @@ -0,0 +1,17 @@ +15 +name: Pd(Cl)₂(MeCN)₂; charge: 0; multiplicity: 1; method: GFN2-xTB; energy: -31.16702100; SMILES: [Pd](Cl)(Cl)(N#CC)N#CC +Cl 3.11660431 7.35093655 1.82544062 +Cl 1.88081396 4.43967614 5.15810466 +N 1.24858144 5.12399843 2.35681373 +N 3.74980170 6.66256866 4.62830826 +C 0.46769563 4.64327825 1.66291711 +C 4.51853094 7.13448775 5.34155127 +Pd 2.50026978 5.89422667 3.49077834 +C 5.47225138 7.72143078 6.22834309 +C -0.50317304 4.04691383 0.80135531 +H -0.95489706 4.81490740 0.17351047 
+H -1.27581887 3.56538215 1.40073375 +H -0.02055659 3.30220086 0.16851441 +H 6.47274541 7.37530267 5.96910563 +H 5.24348464 7.43123132 7.25379836 +H 5.43286341 8.80708472 6.13974326 diff --git a/tests/data/TeH2-NH3.xyz b/tests/data/TeH2-NH3.xyz new file mode 100644 index 0000000..6b1ca61 --- /dev/null +++ b/tests/data/TeH2-NH3.xyz @@ -0,0 +1,9 @@ +7 +name: TeH₂---NH₃; charge: 0; multiplicity: 1; SMILES: [TeH2].N; +Te 1.33365960928807 0.58921109724901 -0.02554811274403 +H 2.06466330424255 -0.88081247072650 0.06360896688761 +H 2.86318170400621 1.25493395382317 -0.02806899278085 +N -1.00573771403775 -1.09148973201686 0.00808526491897 +H -1.82209695233015 -0.68237412269865 -0.42953927921440 +H -1.25836041219935 -1.35112182466288 0.95348441988334 +H -0.77960406278569 -1.94043786659092 -0.49485228946985 diff --git a/tests/data/ethyl_methyl_cobaltacene.xyz b/tests/data/ethyl_methyl_cobaltacene.xyz new file mode 100644 index 0000000..080ce9e --- /dev/null +++ b/tests/data/ethyl_methyl_cobaltacene.xyz @@ -0,0 +1,32 @@ +30 +name: (EtCp)Co(MeCp); charge: 0; multiplicity: 2; method: OMol25's eSEN Conserving Small; energy: -1887.7005270 +Co -0.04056011 -0.00235882 -0.01003153 +C 0.77326728 0.98248343 1.66158418 +C 1.87634504 1.99620819 1.62871680 +C -0.61925749 1.24964068 1.57629346 +H -1.06197515 2.22375974 1.42940856 +C -1.31924796 0.01714168 1.65184760 +H -2.38827128 -0.11136968 1.58313241 +C -0.36112936 -1.01910672 1.80251792 +H -0.57354953 -2.07484457 1.86470149 +C 0.92531170 -0.42314278 1.80939684 +H 1.86697467 -0.94694315 1.87725830 +C 1.24303242 -0.17721320 -1.65864343 +H 2.31789848 -0.21957091 -1.57866079 +C 0.46062216 0.99895788 -1.79086254 +H 0.83489835 2.01170987 -1.81591122 +C -0.90926586 0.62802644 -1.82128386 +C -2.07786352 1.55599503 -1.93490554 +C -0.96498665 -0.78909299 -1.72520222 +H -1.86826877 -1.37986627 -1.70013839 +C 0.35848012 -1.28668821 -1.62446506 +H 0.64220963 -2.32097459 -1.51100528 +C 2.24692443 2.49471436 3.02582394 +H 1.56831434 2.83746244 
1.00312732 +H 2.75205823 1.55587186 1.14546298 +H 1.38624353 2.95795070 3.50992624 +H 2.57324336 1.66574707 3.65538207 +H 3.05193902 3.23003796 2.98661933 +H -2.93524095 1.17429508 -1.37948826 +H -2.38233371 1.68408379 -2.97585245 +H -1.83113387 2.54037689 -1.53638782 diff --git a/tests/data/ferrocene.xyz b/tests/data/ferrocene.xyz new file mode 100644 index 0000000..de1994c --- /dev/null +++ b/tests/data/ferrocene.xyz @@ -0,0 +1,23 @@ +21 +name: Ferrocene; SMILES: [CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]; charge: 0; multiplicity: 1; +Fe -0.00000000 0.00000000 -0.00000000 +C 0.97272879 0.70673138 1.79559290 +H 1.84244370 1.33861351 1.85569942 +C -0.37155151 1.14350891 1.79559302 +H -0.70375276 2.16592526 1.85569859 +C -1.20235395 -0.00000000 1.79559171 +H -2.27738905 -0.00000000 1.85569894 +C -0.37155151 -1.14350891 1.79559302 +H -0.70375276 -2.16592526 1.85569859 +C 0.97272879 -0.70673138 1.79559290 +H 1.84244370 -1.33861351 1.85569942 +C 1.20235395 -0.00000000 -1.79559171 +H 2.27738905 -0.00000000 -1.85569894 +C 0.37155151 1.14350891 -1.79559302 +H 0.70375276 2.16592526 -1.85569859 +C -0.97272879 0.70673138 -1.79559290 +H -1.84244370 1.33861351 -1.85569942 +C -0.97272879 -0.70673138 -1.79559290 +H -1.84244370 -1.33861351 -1.85569942 +C 0.37155151 -1.14350891 -1.79559302 +H 0.70375276 -2.16592526 -1.85569859 diff --git a/tests/data/hypervalent_iodine.xyz b/tests/data/hypervalent_iodine.xyz new file mode 100644 index 0000000..b769712 --- /dev/null +++ b/tests/data/hypervalent_iodine.xyz @@ -0,0 +1,36 @@ +34 +charge: 1; multiplicity: 1; generated_by: Rowan; timestamp: 2026-02-26 15:00:34; +O -1.35636091 -0.35639405 -1.86558604 +S -1.65149856 -0.42764711 -0.38256726 +C -2.65256071 0.94797736 0.08368185 +C -1.88602734 2.22268701 -0.27736726 +C -2.91582155 0.86633754 1.55720210 +C -3.91963410 0.98899871 -0.72478646 +N -0.15129961 -0.39896592 0.44786224 +I 1.34163487 -1.37906301 -0.60433257 +C 3.18814349 -0.63914496 0.07975909 +C 3.62315655 0.62707341 -0.27750763 +F 
2.84581995 1.38239574 -1.07204187 +C 4.83640909 1.03134191 0.21087115 +F 5.29876232 2.27073550 -0.11829823 +C 5.59624004 0.19557604 1.03375852 +F 6.79171038 0.62400419 1.50069022 +C 5.16348171 -1.05070758 1.38282657 +F 5.87818527 -1.88485849 2.17965269 +C 3.93591022 -1.47624075 0.89651906 +F 3.48104858 -2.71909642 1.23155212 +N -2.42136049 -1.76472783 -0.12453038 +C -2.16012239 -2.92552328 -0.96372181 +H -1.61765242 2.11283708 -1.36061037 +H -0.98146045 2.27089286 0.35251367 +H -2.53241825 3.10315299 -0.19401129 +H -1.95600963 0.56137645 2.04766202 +H -3.60293126 0.01157207 1.70989263 +H -3.25388861 1.79888093 2.01220441 +H -3.78749108 0.48952639 -1.71150279 +H -4.79597092 0.57300603 -0.16439404 +H -4.22571325 2.04582620 -0.97580129 +H 0.12452430 0.55576283 0.76691240 +H -3.00450420 -3.63505316 -0.80087477 +H -2.04037428 -2.64095521 -2.01031208 +H -1.19192600 -3.38158321 -0.62945646 diff --git a/tests/data/methyl_radical.xyz b/tests/data/methyl_radical.xyz new file mode 100644 index 0000000..8c44c74 --- /dev/null +++ b/tests/data/methyl_radical.xyz @@ -0,0 +1,6 @@ +4 +name: methyl radical; charge: 1; multiplicity: 1; SMILES: [CH3+]; method: OMol25's eSEN Conserving Small; energy: -39.77199700; +C 0.00000000 -0.00000000 0.00000000 +H 1.07946500 0.00000000 0.00000000 +H -0.53973500 0.93483780 -0.00012656 +H -0.53973500 -0.93483780 0.00012655 diff --git a/tests/data/methylammonium.xyz b/tests/data/methylammonium.xyz new file mode 100644 index 0000000..5127225 --- /dev/null +++ b/tests/data/methylammonium.xyz @@ -0,0 +1,10 @@ +8 +name: methylammonium; charge: 1; multiplicity: 1; SMILES: [NH3+]C; method: AIMNet2 (ωB97M-D3); energy: -96.2744510; +N -0.74192777 -0.00205673 0.02532436 +C 0.76463782 0.00212151 -0.02610027 +H -1.09227109 0.64072447 0.73580258 +H -1.10462023 -0.92958493 0.24633594 +H -1.14601379 0.27957814 -0.86803048 +H 1.08290972 -0.69311227 -0.79228458 +H 1.14102353 -0.30257898 0.94197955 +H 1.09626190 1.00490884 -0.26302707 diff --git 
a/tests/test_steamroll.py b/tests/test_steamroll.py
index c2795a7..8a51292 100644
--- a/tests/test_steamroll.py
+++ b/tests/test_steamroll.py
@@ -1,8 +1,72 @@
 """Tests for the steamroll package."""
 
+from pathlib import Path
+from typing import Any
+
+import pytest
 from rdkit import Chem
 
-from steamroll.steamroll import fragment, to_rdkit
+from steamroll.steamroll import ATOMIC_NUMBERS, fragment, to_rdkit
+
+HERE = Path(__file__).parent
+DATA_DIR = HERE / "data"
+
+
+def parse_comment_line(comment: str) -> dict[str, Any]:
+    """Parse the comment line of an XYZ file.
+
+    The comment is expected to hold ``key: value`` pairs separated by semicolons,
+    e.g. ``charge: 1; multiplicity: 1;``.
+
+    Args:
+        comment: raw comment line, with or without its trailing newline
+
+    Returns:
+        mapping of stripped keys to stripped string values; segments without a
+        colon are ignored
+    """
+    data: dict[str, Any] = {}
+    # Remove the newline first; otherwise the trailing ";" is never stripped.
+    for kv in comment.strip().strip(";").split(";"):
+        # Split on the first colon only: values such as timestamps contain colons.
+        key, sep, value = kv.partition(":")
+        if not sep:
+            continue
+        data[key.strip()] = value.strip()
+
+    return data
+
+
+def read_xyz(file: Path | str) -> tuple[list[int], list[list[float]], int]:
+    """Read an XYZ file.
+
+    Args:
+        file: path to the XYZ file
+
+    Returns:
+        atomic numbers, Cartesian coordinates, and the charge read from the
+        comment line (0 when the comment line does not give one)
+    """
+    atomic_numbers: list[int] = []
+    coordinates: list[list[float]] = []
+    # XYZ comment lines may contain non-ASCII text, so do not rely on the locale encoding.
+    with Path(file).open(encoding="utf-8") as f:
+        next(f)  # skip the atom-count line
+        data = parse_comment_line(next(f))
+        charge = int(data.get("charge", 0))
+
+        for line in f:
+            if not line.strip():
+                # Tolerate blank lines, e.g. a trailing newline at the end of the file.
+                continue
+            atom, x, y, z = line.split()
+            if atom.isdigit():
+                atomic_numbers.append(int(atom))
+            else:
+                atomic_numbers.append(ATOMIC_NUMBERS[atom])
+            coordinates.append([float(x), float(y), float(z)])
+
+    return atomic_numbers, coordinates, charge
 
 
 def test_steamroll() -> None:
@@ -26,7 +90,7 @@ def test_no_remove_hydrogens() -> None:
     assert rdkm.GetNumAtoms() == 3
 
 
-def test_fragement() -> None:
+def test_fragment() -> None:
     """Test to make sure multiple molecules are produced."""
     atomic_numbers = [1, 8, 1, 1, 8, 1]
     coordinates = [[0, 0, 0], [0, 0, 1], [0, 1, 1], [50, 0, 0], [50, 0, 1], [50, 1, 1]]
@@ -36,3 +100,14 @@ def test_fragement() -> None:
     rdkm1, rdkm2 = fragment(rdkm)
     assert rdkm1.GetNumAtoms() == 1
     assert rdkm2.GetNumAtoms() == 1
+
+
+# Sort the glob so collection order is deterministic across runs and workers, and
+# use the file name as the test id so a failure names the offending molecule.
+@pytest.mark.parametrize("file", sorted(DATA_DIR.glob("*.xyz")), ids=lambda p: p.name)
+def test_all_data(file: Path) -> None:
+    """Test to make sure all data files can be processed."""
+    atomic_numbers, coordinates, charge = read_xyz(file)
+    rdkm = to_rdkit(atomic_numbers, coordinates, charge=charge, remove_Hs=False)
+
+    assert rdkm.GetNumAtoms() == len(atomic_numbers)