From 3f627a968d7f5229617f8e08eebb5ec6103d059b Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Mon, 23 Jun 2025 21:27:09 +0200 Subject: [PATCH 1/9] JOSS manuscript draft --- .gitignore | 1 + paper/.gitignore | 6 ++ paper/bibliography.bib | 212 +++++++++++++++++++++++++++++++++++++++++ paper/build.sh | 1 + paper/paper.md | 113 ++++++++++++++++++++++ paper/zndraw_rdkit.svg | 62 ++++++++++++ 6 files changed, 395 insertions(+) create mode 100644 paper/.gitignore create mode 100644 paper/bibliography.bib create mode 100755 paper/build.sh create mode 100644 paper/paper.md create mode 100644 paper/zndraw_rdkit.svg diff --git a/.gitignore b/.gitignore index aa7554b..716587a 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ tmp/ +.DS_Store \ No newline at end of file diff --git a/paper/.gitignore b/paper/.gitignore new file mode 100644 index 0000000..e895f2f --- /dev/null +++ b/paper/.gitignore @@ -0,0 +1,6 @@ +jats/ +*.xyz +*.pdf +*.ipynb +*.png + diff --git a/paper/bibliography.bib b/paper/bibliography.bib new file mode 100644 index 0000000..bb82b66 --- /dev/null +++ b/paper/bibliography.bib @@ -0,0 +1,212 @@ +@misc{landrumRdkitRdkit2023_03_22023, + title = {Rdkit/Rdkit: 2023\_03\_2 ({{Q1}} 2023) {{Release}}}, + shorttitle = {Rdkit/Rdkit}, + author = {Landrum, Greg and Tosco, Paolo and Kelley, Brian and Ric and Cosgrove, David and {sriniker} and {gedeck} and Vianello, Riccardo and NadineSchneider and Kawashima, Eisuke and N, Dan and Jones, Gareth and Dalke, Andrew and Cole, Brian and Swain, Matt and Turk, Samo and AlexanderSavelyev and Vaucher, Alain and W{\'o}jcikowski, Maciej and Take, Ichiru and Probst, Daniel and Ujihara, Kazuya and Scalfani, Vincent F. 
and {godin}, guillaume and Lehtivarjo, Juuso and Pahl, Axel and Walker, Rachel and Berenger, Francois and {jasondbiggs} and {strets123}}, + year = {2023}, + month = jun, + doi = {10.5281/zenodo.8053810}, + urldate = {2023-08-13}, + howpublished = {Zenodo} +} +@article{abrahamGROMACSHighPerformance2015, + title = {{{GROMACS}}: {{High}} Performance Molecular Simulations through Multi-Level Parallelism from Laptops to Supercomputers}, + shorttitle = {{{GROMACS}}}, + author = {Abraham, Mark James and Murtola, Teemu and Schulz, Roland and P{\'a}ll, Szil{\'a}rd and Smith, Jeremy C. and Hess, Berk and Lindahl, Erik}, + year = {2015}, + month = sep, + journal = {SoftwareX}, + volume = {1--2}, + pages = {19--25}, + issn = {2352-7110}, + doi = {10.1016/j.softx.2015.06.001}, + urldate = {2025-02-11}, + keywords = {Free energy,GPU,Molecular dynamics,SIMD} +} + +@article{larsenAtomicSimulationEnvironment2017, + title = {The Atomic Simulation Environment---a {{Python}} Library for Working with Atoms}, + author = {Larsen, Ask Hjorth and Mortensen, Jens J{\o}rgen and Blomqvist, Jakob and Castelli, Ivano E. and Christensen, Rune and Du{\l}ak, Marcin and Friis, Jesper and Groves, Michael N. and Hammer, Bj{\o}rk and Hargus, Cory and Hermes, Eric D. and Jennings, Paul C. and Jensen, Peter Bjerre and Kermode, James and Kitchin, John R. and Kolsbjerg, Esben Leonhard and Kubal, Joseph and Kaasbjerg, Kristen and Lysgaard, Steen and Maronsson, J{\'o}n Bergmann and Maxson, Tristan and Olsen, Thomas and Pastewka, Lars and Peterson, Andrew and Rostgaard, Carsten and Schi{\o}tz, Jakob and Sch{\"u}tt, Ole and Strange, Mikkel and Thygesen, Kristian S. 
and Vegge, Tejs and Vilhelmsen, Lasse and Walter, Michael and Zeng, Zhenhua and Jacobsen, Karsten W.}, + year = {2017}, + month = jun, + journal = {Journal of Physics: Condensed Matter}, + volume = {29}, + number = {27}, + pages = {273002}, + publisher = {IOP Publishing}, + issn = {0953-8984}, + doi = {10.1088/1361-648X/aa680e}, + urldate = {2023-07-10}, + langid = {english} +} +@article{martinezPACKMOLPackageBuilding2009, + title = {{{PACKMOL}}: A Package for Building Initial Configurations for Molecular Dynamics Simulations}, + shorttitle = {{{PACKMOL}}}, + author = {Mart{\'i}nez, L. and Andrade, R. and Birgin, E. G. and Mart{\'i}nez, J. M.}, + year = {2009}, + month = oct, + journal = {Journal of Computational Chemistry}, + volume = {30}, + number = {13}, + pages = {2157--2164}, + issn = {1096-987X}, + doi = {10.1002/jcc.21224}, + langid = {english}, + pmid = {19229944} +} +@article{zillsCollaborationMachineLearnedPotentials2024, + title = {Collaboration on {{Machine-Learned Potentials}} with {{IPSuite}}: {{A Modular Framework}} for {{Learning-on-the-Fly}}}, + shorttitle = {Collaboration on {{Machine-Learned Potentials}} with {{IPSuite}}}, + author = {Zills, Fabian and Sch{\"a}fer, Moritz Ren{\'e} and Segreto, Nico and K{\"a}stner, Johannes and Holm, Christian and Tovey, Samuel}, + year = {2024}, + month = apr, + journal = {The Journal of Physical Chemistry B}, + volume = {128}, + number = {15}, + pages = {3662--3676}, + publisher = {American Chemical Society}, + issn = {1520-6106}, + doi = {10.1021/acs.jpcb.3c07187}, + urldate = {2024-07-17}, + copyright = {All rights reserved} +} +@misc{elijosiusZeroShotMolecular2024, + title = {Zero {{Shot Molecular Generation}} via {{Similarity Kernels}}}, + author = {Elijo{\v s}ius, Rokas and Zills, Fabian and Batatia, Ilyes and Norwood, Sam Walton and Kov{\'a}cs, D{\'a}vid P{\'e}ter and Holm, Christian and Cs{\'a}nyi, G{\'a}bor}, + year = {2024}, + month = feb, + number = {arXiv:2402.08708}, + eprint = {2402.08708}, + 
primaryclass = {physics}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2402.08708}, + urldate = {2024-02-15}, + archiveprefix = {arXiv}, + copyright = {All rights reserved}, + keywords = {Computer Science - Machine Learning,Physics - Chemical Physics} +} +@article{LAMMPS, + title = {{{LAMMPS}} - a Flexible Simulation Tool for Particle-Based Materials Modeling at the Atomic, Meso, and Continuum Scales}, + author = {Thompson, A. P. and Aktulga, H. M. and Berger, R. and Bolintineanu, D. S. and Brown, W. M. and Crozier, P. S. and in 't Veld, P. J. and Kohlmeyer, A. and Moore, S. G. and Nguyen, T. D. and Shan, R. and Stevens, M. J. and Tranchida, J. and Trott, C. and Plimpton, S. J.}, + year = {2022}, + volume = {271}, + pages = {108171}, + doi = {10.1016/j.cpc.2021.108171} +} +@article{weikESPResSo40Extensible2019, + title = {{{ESPResSo}} 4.0 -- an Extensible Software Package for Simulating Soft Matter Systems}, + author = {Weik, Florian and Weeber, Rudolf and Szuttor, Kai and Breitsprecher, Konrad and {de Graaf}, Joost and Kuron, Michael and Landsgesell, Jonas and Menke, Henri and Sean, David and Holm, Christian}, + year = {2019}, + month = mar, + journal = {The European Physical Journal Special Topics}, + volume = {227}, + number = {14}, + pages = {1789--1816}, + issn = {1951-6401}, + doi = {10.1140/epjst/e2019-800186-9}, + urldate = {2025-05-23}, + langid = {english} +} +@article{eastmanOpenMM8Molecular2024, + title = {{{OpenMM}} 8: {{Molecular Dynamics Simulation}} with {{Machine Learning Potentials}}}, + shorttitle = {{{OpenMM}} 8}, + author = {Eastman, Peter and Galvelis, Raimondas and Pel{\'a}ez, Ra{\'u}l P. and Abreu, Charlles R. A. and Farr, Stephen E. and Gallicchio, Emilio and Gorenko, Anton and Henry, Michael M. and Hu, Frank and Huang, Jing and Kr{\"a}mer, Andreas and Michel, Julien and Mitchell, Joshua A. and Pande, Vijay S. and Rodrigues, Jo{\~a}o PGLM and {Rodriguez-Guerra}, Jaime and Simmonett, Andrew C. 
and Singh, Sukrit and Swails, Jason and Turner, Philip and Wang, Yuanqing and Zhang, Ivy and Chodera, John D. and De Fabritiis, Gianni and Markland, Thomas E.}, + year = {2024}, + month = jan, + journal = {The Journal of Physical Chemistry B}, + volume = {128}, + number = {1}, + pages = {109--116}, + publisher = {American Chemical Society}, + issn = {1520-6106}, + doi = {10.1021/acs.jpcb.3c06662}, + urldate = {2025-05-23} +} +@article{phillipsScalableMolecularDynamics2020, + title = {Scalable Molecular Dynamics on {{CPU}} and {{GPU}} Architectures with {{NAMD}}}, + author = {Phillips, James C. and Hardy, David J. and Maia, Julio D. C. and Stone, John E. and Ribeiro, Jo{\~a}o V. and Bernardi, Rafael C. and Buch, Ronak and Fiorin, Giacomo and H{\'e}nin, J{\'e}r{\^o}me and Jiang, Wei and McGreevy, Ryan and Melo, Marcelo C. R. and Radak, Brian K. and Skeel, Robert D. and Singharoy, Abhishek and Wang, Yi and Roux, Beno{\^i}t and Aksimentiev, Aleksei and {Luthey-Schulten}, Zaida and Kal{\'e}, Laxmikant V. 
and Schulten, Klaus and Chipot, Christophe and Tajkhorshid, Emad}, + year = {2020}, + month = jul, + journal = {The Journal of Chemical Physics}, + volume = {153}, + number = {4}, + pages = {044130}, + issn = {0021-9606}, + doi = {10.1063/5.0014475}, + urldate = {2025-05-23} +} +@article{jainCommentaryMaterialsProject2013, + title = {Commentary: {{The Materials Project}}: {{A}} Materials Genome Approach to Accelerating Materials Innovation}, + shorttitle = {Commentary}, + author = {Jain, Anubhav and Ong, Shyue Ping and Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David and Ceder, Gerbrand and Persson, Kristin A.}, + year = {2013}, + month = jul, + journal = {APL Materials}, + volume = {1}, + number = {1}, + pages = {011002}, + issn = {2166-532X}, + doi = {10.1063/1.4812323}, + urldate = {2024-08-27} +} +@article{weiningerSMILESChemicalLanguage1988, + title = {{{SMILES}}, a Chemical Language and Information System. 1. 
{{Introduction}} to Methodology and Encoding Rules}, + author = {Weininger, David}, + year = {1988}, + month = feb, + journal = {Journal of Chemical Information and Computer Sciences}, + volume = {28}, + number = {1}, + pages = {31--36}, + publisher = {American Chemical Society}, + issn = {0095-2338}, + doi = {10.1021/ci00057a005}, + urldate = {2023-07-10} +} +@article{kimUniversalStructureConversion2015, + title = {Universal {{Structure Conversion Method}} for {{Organic Molecules}}: {{From Atomic Connectivity}} to {{Three-Dimensional Geometry}}}, + shorttitle = {Universal {{Structure Conversion Method}} for {{Organic Molecules}}}, + author = {Kim, Yeonjoon and Kim, Woo Youn}, + year = {2015}, + journal = {Bulletin of the Korean Chemical Society}, + volume = {36}, + number = {7}, + pages = {1769--1777}, + issn = {1229-5949}, + doi = {10.1002/bkcs.10334}, + urldate = {2025-05-23}, + copyright = {{\copyright} 2015 Korean Chemical Society, Seoul \& Wiley-VCH Verlag GmbH \& Co. KGaA, Weinheim}, + langid = {english} +} +@inproceedings{gowersMDAnalysisPythonPackage2016, + title = {{{MDAnalysis}}: {{A Python Package}} for the {{Rapid Analysis}} of {{Molecular Dynamics Simulations}}}, + shorttitle = {{{MDAnalysis}}}, + author = {Gowers, Richard J. and Linke, Max and Barnoud, Jonathan and Reddy, Tyler J. E. and Melo, Manuel N. and Seyler, Sean L. and Doma{\'n}ski, Jan and Dotson, David L. and Buchoux, S{\'e}bastien and Kenney, Ian M. and Beckstein, Oliver}, + year = {2016}, + booktitle = {Proceedings of the 15th Python in Science Conference}, + pages = {98--105}, + doi = {10.25080/Majora-629e541a-00e}, + urldate = {2024-02-10} +} +@article{tingleZINC22AFreeMultiBillionScale2023, + title = {{{ZINC-22}}---{{A Free Multi-Billion-Scale Database}} of {{Tangible Compounds}} for {{Ligand Discovery}}}, + author = {Tingle, Benjamin I. and Tang, Khanh G. and Castanon, Mar and Gutierrez, John J. and Khurelbaatar, Munkhzul and Dandarchuluun, Chinzorig and Moroz, Yurii S. 
and Irwin, John J.}, + year = {2023}, + month = feb, + journal = {Journal of Chemical Information and Modeling}, + volume = {63}, + number = {4}, + pages = {1166--1176}, + publisher = {American Chemical Society}, + issn = {1549-9596}, + doi = {10.1021/acs.jcim.2c01253}, + urldate = {2025-05-24} +} +@inproceedings{hagbergExploringNetworkStructure2008, + author = {Aric A. Hagberg and Daniel A. Schult and Pieter J. Swart}, + title = {Exploring Network Structure, Dynamics, and Function using {NetworkX}}, + booktitle = {Proceedings of the 7th Python in Science Conference}, + pages = {11--15}, + address = {Pasadena, CA USA}, + year = {2008}, + editor = {Ga{\"e}l Varoquaux and Travis Vaught and Jarrod Millman} +} diff --git a/paper/build.sh b/paper/build.sh new file mode 100755 index 0000000..0407838 --- /dev/null +++ b/paper/build.sh @@ -0,0 +1 @@ +docker run --rm --volume $PWD:/data --user $(id -u):$(id -g) --platform=linux/amd64 --env JOURNAL=joss openjournals/inara \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..82c9143 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,113 @@ +--- +title: 'rdkit2ase: Molecular Structure Generation and Manipulation for Machine-Learned Interatomic Potentials' +tags: + - Python + - cheminformatics + - MLIPs + - ASE + - RDKit + - PACKMOL + - NetworkX +authors: + - name: Fabian Zills + orcid: 0000-0002-6936-4692 + affiliation: "1" +affiliations: + - name: Institute for Computational Physics, University of Stuttgart, 70569 Stuttgart, Germany + index: 1 +date: 2025-06-23 +bibliography: bibliography.bib +--- +# Summary + +The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. 
+Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM [@eastmanOpenMM8Molecular2024]. +Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. + +Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. +However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as generating different conformations or searching substructures. + +The rdkit2ase package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. +Furthermore, rdkit2ase integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. +Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. + + +# Statement of need +rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. 
+While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. + +The package simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. +This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. + +One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. +rdkit2ase addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. +In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. +This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted towards simulations with a fixed topology. + +# Features and Implementation + +![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) + +The generation of atomic configurations in rdkit2ase is centered around SMILES for defining molecular species. +A typical workflow involves: + +1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. +2. 
Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. +3. Post-processing the simulation data by identifying and selecting structures based on SMARTS. + +```python +from rdkit2ase import pack, smiles2conformers + +water = smiles2conformers("O", numConfs=2) +print(water[0].info['connectivity']) +>>> [(0, 1, 1.0), (0, 2, 1.0)] # (atom_idx1, atom_idx2, bond_order) +ethanol = smiles2conformers("CCO", numConfs=5) +density = 1000 # kg/m^3 +box = pack([water, ethanol], [7, 5], density, packmol="packmol.jl") +print(box) +>>> Atoms(symbols='C10H44O12', pbc=True, cell=[8.4, 8.4, 8.4]) +``` +All ASE Atoms objects generated or processed by rdkit2ase will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. +If available, rdkit2ase uses this bond information for accurate interconversion. +If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. +A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. + +```python +from rdkit2ase import ase2rdkit +from rdkit.Chem import Draw + +mol = ase2rdkit(box) +img = Draw.MolToImage(mol) +``` + +This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations.. + +For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, rdkit2ase can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. +One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. 
+For example, rdkit2ase streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. + +```py +from rdkit2ase import get_substructures + +frames: list[ase.Atoms] = get_substructures( + atoms=box, + smiles="[C]([H])([H])[H]" +) +``` + +# Acknowledgements +F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, “Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning” Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. + +# Related software +The functionality of rdkit2ase relies critically on the following packages: + +- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, SMILES parsing, conformer generation, and substructure searching. +- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating atomic structures, and interfacing with simulation engines. +- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. rdkit2ase can interface with either a PACKMOL executable or the packmol.jl package. +- [NetworkX](https://networkx.org/): For the handling and analysis of molecular graphs. + +The rdkit2ase package is currently a crucial part of the following software packages: + +- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for training MLIPs. +- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. +- [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. 
diff --git a/paper/zndraw_rdkit.svg b/paper/zndraw_rdkit.svg new file mode 100644 index 0000000..6882831 --- /dev/null +++ b/paper/zndraw_rdkit.svg @@ -0,0 +1,62 @@ + + + + From 0d7c97bdb087987cf2bd1eecc8aab85baed717c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Jun 2025 19:27:26 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .gitignore | 2 +- paper/.gitignore | 1 - paper/build.sh | 2 +- paper/paper.md | 183 +++++++++++++++++++++++++++++++---------------- 4 files changed, 122 insertions(+), 66 deletions(-) diff --git a/.gitignore b/.gitignore index 716587a..fba862b 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ tmp/ -.DS_Store \ No newline at end of file +.DS_Store diff --git a/paper/.gitignore b/paper/.gitignore index e895f2f..fc07d23 100644 --- a/paper/.gitignore +++ b/paper/.gitignore @@ -3,4 +3,3 @@ jats/ *.pdf *.ipynb *.png - diff --git a/paper/build.sh b/paper/build.sh index 0407838..b9868b9 100755 --- a/paper/build.sh +++ b/paper/build.sh @@ -1 +1 @@ -docker run --rm --volume $PWD:/data --user $(id -u):$(id -g) --platform=linux/amd64 --env JOURNAL=joss openjournals/inara \ No newline at end of file +docker run --rm --volume $PWD:/data --user $(id -u):$(id -g) --platform=linux/amd64 --env JOURNAL=joss openjournals/inara diff --git a/paper/paper.md b/paper/paper.md index 82c9143..b175f50 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,59 +1,90 @@ ---- -title: 'rdkit2ase: Molecular Structure Generation and Manipulation for Machine-Learned Interatomic Potentials' -tags: - - Python - - cheminformatics - - MLIPs - - ASE - - RDKit - - PACKMOL - - NetworkX -authors: - - name: Fabian Zills - orcid: 0000-0002-6936-4692 - affiliation: "1" -affiliations: - - name: Institute for 
Computational Physics, University of Stuttgart, 70569 Stuttgart, Germany - index: 1 -date: 2025-06-23 -bibliography: bibliography.bib ---- -# Summary +______________________________________________________________________ -The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. -Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM [@eastmanOpenMM8Molecular2024]. -Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. +title: 'rdkit2ase: Molecular Structure Generation and Manipulation for +Machine-Learned Interatomic Potentials' tags: -Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. -However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as generating different conformations or searching substructures. 
+- Python +- cheminformatics +- MLIPs +- ASE +- RDKit +- PACKMOL +- NetworkX authors: +- name: Fabian Zills orcid: 0000-0002-6936-4692 affiliation: "1" affiliations: +- name: Institute for Computational Physics, University of Stuttgart, 70569 + Stuttgart, Germany index: 1 date: 2025-06-23 bibliography: bibliography.bib -The rdkit2ase package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. -Furthermore, rdkit2ase integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. -Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. +______________________________________________________________________ +# Summary -# Statement of need -rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. -While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. +The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has +shifted requirements for setting up atomistic simulations. Unlike classical +force fields, MLIPs primarily require atomic positions and species, thereby +removing the need for predefined topology files used for classical force fields +in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], +LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM +[@eastmanOpenMM8Molecular2024]. 
Consequently, the Atomic Simulation Environment +(ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python +toolkit for handling atomic structures and interfacing with MLIPs, particularly +within the materials science and soft matter communities, because it originates +from _ab initio_ simulations, which share the same setup as MLIP-driven studies. + +Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive +functionality for cheminformatics and manipulating chemical structures. However, +standard RDKit workflows are not designed for MLIP-driven simulation, while +typical ASE-MLIP workflows may lack rich, explicit chemical information such as +bond orders or molecular identities, as well as generating different +conformations or searching substructures. + +The rdkit2ase package bridges this gap, providing an interface between RDKit's +chemical structure generation and cheminformatics capabilities and ASE's +handling of 3D atomic structures. Furthermore, rdkit2ase integrates with PACKMOL +[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, +periodic simulation cells with diverse chemical compositions, all while +preserving crucial chemical connectivity information. Lastly, the combination of +these packages enables selection and manipulation of atomistic structures based +on chemical knowledge rather than manual index handling. -The package simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. -This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. 
+# Statement of need -One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. -rdkit2ase addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. -In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. -This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted towards simulations with a fixed topology. +rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. While its core +function is to interface these tools, it thereby unlocks new capabilities and +significantly reduces the manual coding and data wrangling typically required +for preparing and analyzing molecular simulations. + +The package simplifies workflows that previously involved laborious tasks such +as sourcing individual structure files from various databases (e.g., the +Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database +[@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation +cells. This simplification not only accelerates research but also supports the +setup of more complex and chemically diverse simulation scenarios. + +One challenge in MLIP-driven simulations is the post-simulation identification +and analysis of molecular fragments or chemical changes, as explicit topological +information is often absent. rdkit2ase addresses this by enabling the use of +RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based +substructure searching on ASE structures. In addition, the resulting molecular +graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] +object for further analysis. 
This selection and handling allows for similar +functionality as is provided by the +MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted +towards simulations with a fixed topology. # Features and Implementation -![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) +![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) -The generation of atomic configurations in rdkit2ase is centered around SMILES for defining molecular species. -A typical workflow involves: +The generation of atomic configurations in rdkit2ase is centered around SMILES +for defining molecular species. A typical workflow involves: -1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. -2. Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. -3. Post-processing the simulation data by identifying and selecting structures based on SMARTS. +1. Generating 3D conformers for individual molecular species from their SMILES + using RDKit. +1. Packing these conformers into a simulation box to achieve a target density + using PACKMOL and obtaining an ASE Atoms object representing the simulation + cell, ready for use with MLIPs. +1. Post-processing the simulation data by identifying and selecting structures + based on SMARTS. 
```python from rdkit2ase import pack, smiles2conformers @@ -67,10 +98,14 @@ box = pack([water, ethanol], [7, 5], density, packmol="packmol.jl") print(box) >>> Atoms(symbols='C10H44O12', pbc=True, cell=[8.4, 8.4, 8.4]) ``` -All ASE Atoms objects generated or processed by rdkit2ase will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. -If available, rdkit2ase uses this bond information for accurate interconversion. -If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. -A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. + +All ASE Atoms objects generated or processed by rdkit2ase will store +`connectivity` information (bonds and their orders) within the `ase.Atoms.info` +dictionary. If available, rdkit2ase uses this bond information for accurate +interconversion. If an ASE structure is converted to an RDKit molecule without +pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception +algorithms [@kimUniversalStructureConversion2015] to estimate this information. +A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. ```python from rdkit2ase import ase2rdkit @@ -80,11 +115,16 @@ mol = ase2rdkit(box) img = Draw.MolToImage(mol) ``` -This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations.. +This bidirectional conversion capability allows the use of RDKit's chemical +analysis tools together with ASE for MLIP-based simulations. -For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, rdkit2ase can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. 
-One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. -For example, rdkit2ase streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. +For instance, if during a simulation, atomic positions in an ASE Atoms object +are updated, rdkit2ase can convert this structure back to an RDKit molecule to +analyze chemical changes or identify specific substructures. One common example +is the extraction of substructures based on SMILES or SMARTS to track their +structure and dynamics within a simulation. For example, rdkit2ase streamlines +the extraction of the CH$\_3$ alkyl group from the ethanol molecules inside the +simulation cell, without manual index lookup. ```py from rdkit2ase import get_substructures @@ -96,18 +136,35 @@ frames: list[ase.Atoms] = get_substructures( ``` # Acknowledgements -F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, “Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning” Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. -# Related software -The functionality of rdkit2ase relies critically on the following packages: +F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German +Research Foundation) in the framework of the priority program SPP 2363, +“Utilization and Development of Machine Learning for Molecular Applications – +Molecular Machine Learning” Project No. 497249646. Further funding through the +DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart +Center for Simulation Science (SimTech) was provided. 
-- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, SMILES parsing, conformer generation, and substructure searching. -- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating atomic structures, and interfacing with simulation engines. -- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. rdkit2ase can interface with either a PACKMOL executable or the packmol.jl package. -- [NetworkX](https://networkx.org/): For the handling and analysis of molecular graphs. +# Related software -The rdkit2ase package is currently a crucial part of the following software packages: +The functionality of rdkit2ase relies critically on the following packages: -- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for training MLIPs. -- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. -- [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. +- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, + SMILES parsing, conformer generation, and substructure searching. +- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating + atomic structures, and interfacing with simulation engines. +- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into + simulation boxes. rdkit2ase can interface with either a PACKMOL executable or + the packmol.jl package. +- [NetworkX](https://networkx.org/): For the handling and analysis of molecular + graphs. + +The rdkit2ase package is currently a crucial part of the following software +packages: + +- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for + training MLIPs. 
+- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of + simulation boxes and selection of substructures through a graphical user + interface inside a web-based visualization package. +- [mlipx](https://github.com/basf/mlipx): Creating initial structures for + benchmarking different MLIPs on real-world test scenarios. From 60d57fef1d0815ed69255e19872d686365b456f0 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Mon, 23 Jun 2025 21:29:50 +0200 Subject: [PATCH 3/9] lint --- .gitignore | 2 +- paper/.gitignore | 1 - paper/build.sh | 4 +- paper/paper.md | 183 +++++++++++++++++++++++++++++++---------------- pyproject.toml | 4 ++ 5 files changed, 128 insertions(+), 66 deletions(-) diff --git a/.gitignore b/.gitignore index 716587a..fba862b 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ tmp/ -.DS_Store \ No newline at end of file +.DS_Store diff --git a/paper/.gitignore b/paper/.gitignore index e895f2f..fc07d23 100644 --- a/paper/.gitignore +++ b/paper/.gitignore @@ -3,4 +3,3 @@ jats/ *.pdf *.ipynb *.png - diff --git a/paper/build.sh b/paper/build.sh index 0407838..f2c5c8c 100755 --- a/paper/build.sh +++ b/paper/build.sh @@ -1 +1,3 @@ -docker run --rm --volume $PWD:/data --user $(id -u):$(id -g) --platform=linux/amd64 --env JOURNAL=joss openjournals/inara \ No newline at end of file +#!/bin/bash + +docker run --rm --volume $PWD:/data --user $(id -u):$(id -g) --platform=linux/amd64 --env JOURNAL=joss openjournals/inara diff --git a/paper/paper.md b/paper/paper.md index 82c9143..b175f50 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,59 +1,90 @@ ---- -title: 'rdkit2ase: Molecular Structure Generation and Manipulation for Machine-Learned Interatomic Potentials' -tags: - - Python - - cheminformatics - - MLIPs - - ASE - - RDKit - - PACKMOL - - NetworkX -authors: - - name: Fabian Zills - orcid: 0000-0002-6936-4692 - 
affiliation: "1" -affiliations: - - name: Institute for Computational Physics, University of Stuttgart, 70569 Stuttgart, Germany - index: 1 -date: 2025-06-23 -bibliography: bibliography.bib ---- -# Summary +______________________________________________________________________ -The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. -Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM [@eastmanOpenMM8Molecular2024]. -Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. +title: 'rdkit2ase: Molecular Structure Generation and Manipulation for +Machine-Learned Interatomic Potentials' tags: -Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. -However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as generating different conformations or searching substructures. 
+- Python +- cheminformatics +- MLIPs +- ASE +- RDKit +- PACKMOL +- NetworkX authors: +- name: Fabian Zills orcid: 0000-0002-6936-4692 affiliation: "1" affiliations: +- name: Institute for Computational Physics, University of Stuttgart, 70569 + Stuttgart, Germany index: 1 date: 2025-06-23 bibliography: bibliography.bib -The rdkit2ase package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. -Furthermore, rdkit2ase integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. -Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. +______________________________________________________________________ +# Summary -# Statement of need -rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. -While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. +The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has +shifted requirements for setting up atomistic simulations. Unlike classical +force fields, MLIPs primarily require atomic positions and species, thereby +removing the need for predefined topology files used for classical force fields +in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], +LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM +[@eastmanOpenMM8Molecular2024]. 
Consequently, the Atomic Simulation Environment +(ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python +toolkit for handling atomic structures and interfacing with MLIPs, particularly +within the materials science and soft matter communities, because it originates +from _ab initio_ simulations, which share the same setup as MLIP-driven studies. + +Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive +functionality for cheminformatics and manipulating chemical structures. However, +standard RDKit workflows are not designed for MLIP-driven simulation, while +typical ASE-MLIP workflows may lack rich, explicit chemical information such as +bond orders or molecular identities, as well as generating different +conformations or searching substructures. + +The rdkit2ase package bridges this gap, providing an interface between RDKit's +chemical structure generation and cheminformatics capabilities and ASE's +handling of 3D atomic structures. Furthermore, rdkit2ase integrates with PACKMOL +[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, +periodic simulation cells with diverse chemical compositions, all while +preserving crucial chemical connectivity information. Lastly, the combination of +these packages enables selection and manipulation of atomistic structures based +on chemical knowledge rather than manual index handling. -The package simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. -This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. 
+# Statement of need -One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. -rdkit2ase addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. -In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. -This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted towards simulations with a fixed topology. +rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. While its core +function is to interface these tools, it thereby unlocks new capabilities and +significantly reduces the manual coding and data wrangling typically required +for preparing and analyzing molecular simulations. + +The package simplifies workflows that previously involved laborious tasks such +as sourcing individual structure files from various databases (e.g., the +Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database +[@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation +cells. This simplification not only accelerates research but also supports the +setup of more complex and chemically diverse simulation scenarios. + +One challenge in MLIP-driven simulations is the post-simulation identification +and analysis of molecular fragments or chemical changes, as explicit topological +information is often absent. rdkit2ase addresses this by enabling the use of +RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based +substructure searching on ASE structures. In addition, the resulting molecular +graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] +object for further analysis. 
This selection and handling allows for similar +functionality as is provided by the +MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted +towards simulations with a fixed topology. # Features and Implementation -![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) +![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).abel{fig:zndraw-rdkit}](zndraw_rdkit.svg) -The generation of atomic configurations in rdkit2ase is centered around SMILES for defining molecular species. -A typical workflow involves: +The generation of atomic configurations in rdkit2ase is centered around SMILES +for defining molecular species. A typical workflow involves: -1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. -2. Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. -3. Post-processing the simulation data by identifying and selecting structures based on SMARTS. +1. Generating 3D conformers for individual molecular species from their SMILES + using RDKit. +1. Packing these conformers into a simulation box to achieve a target density + using PACKMOL and obtaining an ASE Atoms object representing the simulation + cell, ready for use with MLIPs. +1. Post-processing the simulation data by identifying and selecting structures + based on SMARTS. 
```python from rdkit2ase import pack, smiles2conformers @@ -67,10 +98,14 @@ box = pack([water, ethanol], [7, 5], density, packmol="packmol.jl") print(box) >>> Atoms(symbols='C10H44O12', pbc=True, cell=[8.4, 8.4, 8.4]) ``` -All ASE Atoms objects generated or processed by rdkit2ase will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. -If available, rdkit2ase uses this bond information for accurate interconversion. -If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. -A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. + +All ASE Atoms objects generated or processed by rdkit2ase will store +`connectivity` information (bonds and their orders) within the `ase.Atoms.info` +dictionary. If available, rdkit2ase uses this bond information for accurate +interconversion. If an ASE structure is converted to an RDKit molecule without +pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception +algorithms [@kimUniversalStructureConversion2015] to estimate this information. +A representation from both packages is shown in \\autoref{fig:zndraw-rdkit}. ```python from rdkit2ase import ase2rdkit @@ -80,11 +115,16 @@ mol = ase2rdkit(box) img = Draw.MolToImage(mol) ``` -This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations.. +This bidirectional conversion capability allows the use of RDKit's chemical +analysis tools together with ASE for MLIP-based simulations.. -For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, rdkit2ase can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. 
-One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. -For example, rdkit2ase streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. +For instance, if during a simulation, atomic positions in an ASE Atoms object +are updated, rdkit2ase can convert this structure back to an RDKit molecule to +analyze chemical changes or identify specific substructures. One common example +is the extraction of substructures based on SMILES or SMARTS to track their +structure and dynamics within a simulation. For example, rdkit2ase streamlines +the extraction of the CH$\_3$ alkyl group from the ethanol molecules inside the +simulation cell, without manual index lookup. ```py from rdkit2ase import get_substructures @@ -96,18 +136,35 @@ frames: list[ase.Atoms] = get_substructures( ``` # Acknowledgements -F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, “Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning” Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. -# Related software -The functionality of rdkit2ase relies critically on the following packages: +F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German +Research Foundation) in the framework of the priority program SPP 2363, +“Utilization and Development of Machine Learning for Molecular Applications – +Molecular Machine Learning” Project No. 497249646. Further funding through the +DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart +Center for Simulation Science (SimTech) was provided. 
-- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, SMILES parsing, conformer generation, and substructure searching. -- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating atomic structures, and interfacing with simulation engines. -- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. rdkit2ase can interface with either a PACKMOL executable or the packmol.jl package. -- [NetworkX](https://networkx.org/): For the handling and analysis of molecular graphs. +# Related software -The rdkit2ase package is currently a crucial part of the following software packages: +The functionality of rdkit2ase relies critically on the following packages: -- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for training MLIPs. -- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. -- [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. +- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, + SMILES parsing, conformer generation, and substructure searching. +- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating + atomic structures, and interfacing with simulation engines. +- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into + simulation boxes. rdkit2ase can interface with either a PACKMOL executable or + the packmol.jl package. +- [NetworkX](https://networkx.org/): For the handling and analysis of molecular + graphs. + +The rdkit2ase package is currently a crucial part of the following software +packages: + +- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for + training MLIPs. 
+- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of + simulation boxes and selection of substructures through a graphical user + interface inside a web-based visualization package. +- [mlipx](https://github.com/basf/mlipx): Creating initial structures for + benchmarking different MLIPs on real-world test scenarios. diff --git a/pyproject.toml b/pyproject.toml index 88d366b..d51aff1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,3 +38,7 @@ extend-ignore = [ [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[tool.codespell] +ignore-words-list = "basf" +skip = "*.svg,paper/bibliography.bib" From e9f4e5a16f84cbea4351d042a32b5fdbd0b48b94 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Mon, 23 Jun 2025 21:32:35 +0200 Subject: [PATCH 4/9] fix `mdformat` removing important parts of the file --- .pre-commit-config.yaml | 10 +-- paper/paper.md | 183 ++++++++++++++-------------------------- 2 files changed, 68 insertions(+), 125 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6bbe24d..bca9e6f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,8 +33,8 @@ repos: args: [ --fix ] # Run the formatter. 
- id: ruff-format - - repo: https://github.com/executablebooks/mdformat - rev: 0.7.22 - hooks: - - id: mdformat - args: ["--wrap=80"] + # - repo: https://github.com/executablebooks/mdformat + # rev: 0.7.22 + # hooks: + # - id: mdformat + # args: ["--wrap=80"] diff --git a/paper/paper.md b/paper/paper.md index b175f50..f991b22 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,90 +1,59 @@ -______________________________________________________________________ - -title: 'rdkit2ase: Molecular Structure Generation and Manipulation for -Machine-Learned Interatomic Potentials' tags: +--- +title: 'rdkit2ase: Molecular Structure Generation and Manipulation for Machine-Learned Interatomic Potentials' +tags: + - Python + - cheminformatics + - MLIPs + - ASE + - RDKit + - PACKMOL + - NetworkX +authors: + - name: Fabian Zills + orcid: 0000-0002-6936-4692 + affiliation: "1" +affiliations: + - name: Institute for Computational Physics, University of Stuttgart, 70569 Stuttgart, Germany + index: 1 +date: 2025-06-23 +bibliography: bibliography.bib +--- +# Summary -- Python -- cheminformatics -- MLIPs -- ASE -- RDKit -- PACKMOL -- NetworkX authors: -- name: Fabian Zills orcid: 0000-0002-6936-4692 affiliation: "1" affiliations: -- name: Institute for Computational Physics, University of Stuttgart, 70569 - Stuttgart, Germany index: 1 date: 2025-06-23 bibliography: bibliography.bib +The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. +Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM [@eastmanOpenMM8Molecular2024]. 
+Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. -______________________________________________________________________ +Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. +However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as generating different conformations or searching substructures. -# Summary +The rdkit2ase package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. +Furthermore, rdkit2ase integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. +Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. -The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has -shifted requirements for setting up atomistic simulations. Unlike classical -force fields, MLIPs primarily require atomic positions and species, thereby -removing the need for predefined topology files used for classical force fields -in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], -LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM -[@eastmanOpenMM8Molecular2024]. 
Consequently, the Atomic Simulation Environment -(ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python -toolkit for handling atomic structures and interfacing with MLIPs, particularly -within the materials science and soft matter communities, because it originates -from _ab initio_ simulations, which share the same setup as MLIP-driven studies. - -Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive -functionality for cheminformatics and manipulating chemical structures. However, -standard RDKit workflows are not designed for MLIP-driven simulation, while -typical ASE-MLIP workflows may lack rich, explicit chemical information such as -bond orders or molecular identities, as well as generating different -conformations or searching substructures. - -The rdkit2ase package bridges this gap, providing an interface between RDKit's -chemical structure generation and cheminformatics capabilities and ASE's -handling of 3D atomic structures. Furthermore, rdkit2ase integrates with PACKMOL -[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, -periodic simulation cells with diverse chemical compositions, all while -preserving crucial chemical connectivity information. Lastly, the combination of -these packages enables selection and manipulation of atomistic structures based -on chemical knowledge rather than manual index handling. # Statement of need +rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. +While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. -rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. 
While its core -function is to interface these tools, it thereby unlocks new capabilities and -significantly reduces the manual coding and data wrangling typically required -for preparing and analyzing molecular simulations. - -The package simplifies workflows that previously involved laborious tasks such -as sourcing individual structure files from various databases (e.g., the -Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database -[@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation -cells. This simplification not only accelerates research but also supports the -setup of more complex and chemically diverse simulation scenarios. - -One challenge in MLIP-driven simulations is the post-simulation identification -and analysis of molecular fragments or chemical changes, as explicit topological -information is often absent. rdkit2ase addresses this by enabling the use of -RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based -substructure searching on ASE structures. In addition, the resulting molecular -graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] -object for further analysis. This selection and handling allows for similar -functionality as is provided by the -MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted -towards simulations with a fixed topology. +The package simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. +This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. 
+ +One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. +rdkit2ase addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. +In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. +This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted towards simulations with a fixed topology. # Features and Implementation -![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).abel{fig:zndraw-rdkit}](zndraw_rdkit.svg) +![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) -The generation of atomic configurations in rdkit2ase is centered around SMILES -for defining molecular species. A typical workflow involves: +The generation of atomic configurations in rdkit2ase is centered around SMILES for defining molecular species. +A typical workflow involves: -1. Generating 3D conformers for individual molecular species from their SMILES - using RDKit. -1. Packing these conformers into a simulation box to achieve a target density - using PACKMOL and obtaining an ASE Atoms object representing the simulation - cell, ready for use with MLIPs. -1. Post-processing the simulation data by identifying and selecting structures - based on SMARTS. +1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. +2. 
Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. +3. Post-processing the simulation data by identifying and selecting structures based on SMARTS. ```python from rdkit2ase import pack, smiles2conformers @@ -98,14 +67,10 @@ box = pack([water, ethanol], [7, 5], density, packmol="packmol.jl") print(box) >>> Atoms(symbols='C10H44O12', pbc=True, cell=[8.4, 8.4, 8.4]) ``` - -All ASE Atoms objects generated or processed by rdkit2ase will store -`connectivity` information (bonds and their orders) within the `ase.Atoms.info` -dictionary. If available, rdkit2ase uses this bond information for accurate -interconversion. If an ASE structure is converted to an RDKit molecule without -pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception -algorithms [@kimUniversalStructureConversion2015] to estimate this information. -A representation from both packages is shown in \\autoref{fig:zndraw-rdkit}. +All ASE Atoms objects generated or processed by rdkit2ase will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. +If available, rdkit2ase uses this bond information for accurate interconversion. +If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. +A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. ```python from rdkit2ase import ase2rdkit @@ -115,16 +80,11 @@ mol = ase2rdkit(box) img = Draw.MolToImage(mol) ``` -This bidirectional conversion capability allows the use of RDKit's chemical -analysis tools together with ASE for MLIP-based simulations.. +This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations.
-For instance, if during a simulation, atomic positions in an ASE Atoms object -are updated, rdkit2ase can convert this structure back to an RDKit molecule to -analyze chemical changes or identify specific substructures. One common example -is the extraction of substructures based on SMILES or SMARTS to track their -structure and dynamics within a simulation. For example, rdkit2ase streamlines -the extraction of the CH$\_3$ alkyl group from the ethanol molecules inside the -simulation cell, without manual index lookup. +For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, rdkit2ase can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. +One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. +For example, rdkit2ase streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. ```py from rdkit2ase import get_substructures @@ -136,35 +96,18 @@ frames: list[ase.Atoms] = get_substructures( ``` # Acknowledgements - -F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German -Research Foundation) in the framework of the priority program SPP 2363, -“Utilization and Development of Machine Learning for Molecular Applications – -Molecular Machine Learning” Project No. 497249646. Further funding through the -DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart -Center for Simulation Science (SimTech) was provided. +F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, “Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning” Project No. 497249646. 
Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. # Related software - The functionality of rdkit2ase relies critically on the following packages: -- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, - SMILES parsing, conformer generation, and substructure searching. -- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating - atomic structures, and interfacing with simulation engines. -- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into - simulation boxes. rdkit2ase can interface with either a PACKMOL executable or - the packmol.jl package. -- [NetworkX](https://networkx.org/): For the handling and analysis of molecular - graphs. - -The rdkit2ase package is currently a crucial part of the following software -packages: - -- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for - training MLIPs. -- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of - simulation boxes and selection of substructures through a graphical user - interface inside a web-based visualization package. -- [mlipx](https://github.com/basf/mlipx): Creating initial structures for - benchmarking different MLIPs on real-world test scenarios. +- [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, SMILES parsing, conformer generation, and substructure searching. +- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating atomic structures, and interfacing with simulation engines. +- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. rdkit2ase can interface with either a PACKMOL executable or the packmol.jl package. +- [NetworkX](https://networkx.org/): For the handling and analysis of molecular graphs. 
+ +The rdkit2ase package is currently a crucial part of the following software packages: + +- [IPSuite](https://github.com/zincware/ipsuite): For generating structures for training MLIPs. +- [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. +- [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. From 170c81c2b56d760743cc02037ac7a64f49889e13 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Sat, 15 Nov 2025 11:48:02 +0100 Subject: [PATCH 5/9] draft --- paper/bibliography.bib | 14 ++++++++++++ paper/paper.md | 51 ++++++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/paper/bibliography.bib b/paper/bibliography.bib index bb82b66..af53362 100644 --- a/paper/bibliography.bib +++ b/paper/bibliography.bib @@ -210,3 +210,17 @@ @inproceedings{hagbergExploringNetworkStructure2008 year = {2008}, editor = {Ga\"el Varoquaux and Travis Vaught and Jarrod Millman} } +@article{oboyleOpenBabelOpen2011, + title = {Open {{Babel}}: {{An}} Open Chemical Toolbox}, + shorttitle = {Open {{Babel}}}, + author = {O'Boyle, Noel M. and Banck, Michael and James, Craig A. 
and Morley, Chris and Vandermeersch, Tim and Hutchison, Geoffrey R.}, + year = 2011, + month = oct, + journal = {Journal of Cheminformatics}, + volume = {3}, + number = {1}, + pages = {33}, + issn = {1758-2946}, + doi = {10.1186/1758-2946-3-33}, + urldate = {2025-11-15}, +} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index f991b22..8e98651 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,5 +1,5 @@ --- -title: 'rdkit2ase: Molecular Structure Generation and Manipulation for Machine-Learned Interatomic Potentials' +title: 'molify: Molecular Structure Interface' tags: - Python - cheminformatics @@ -23,24 +23,27 @@ bibliography: bibliography.bib The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM [@eastmanOpenMM8Molecular2024]. Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. +In contrast to _ab initio_, MLIPs are much faster and can run on much larger systems, making high throughput simulations of more complex systems feasible and increasing the need for efficient initial structure generation. Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. 
However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as generating different conformations or searching substructures. -The rdkit2ase package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. -Furthermore, rdkit2ase integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. +The `molify` package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. +Furthermore, `molify` integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. +In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX [@hagbergExploringNetworkStructure2008], e.g. enabling traversing or comparing molecular graphs. Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. # Statement of need -rdkit2ase serves as a vital link between RDKit, ASE, and PACKMOL. +`molify` serves as a vital link between RDKit, ASE, NetworkX, and PACKMOL. While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. 
+For example, ASE has no tools for handling topological information such as bonds or molecular identities, whilst rdkit does not natively interface with MLIPs. The package simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. -rdkit2ase addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. +`molify` addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted towards simulations with a fixed topology. @@ -48,32 +51,39 @@ This selection and handling allows for similar functionality as is provided by t ![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) -The generation of atomic configurations in rdkit2ase is centered around SMILES for defining molecular species. 
+The generation of atomic configurations in `molify` is centered around SMILES for defining molecular species. A typical workflow involves: 1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. 2. Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. -3. Post-processing the simulation data by identifying and selecting structures based on SMARTS. +4. Running MLIP-based simulations using ASE calculators. +5. Post-processing the simulation data by identifying and selecting structures based on SMARTS. ```python -from rdkit2ase import pack, smiles2conformers +from molify import pack, smiles2conformers +from ase.optimize import LBFGS +from mace.calculators import mace_mp + water = smiles2conformers("O", numConfs=2) print(water[0].info['connectivity']) >>> [(0, 1, 1.0), (0, 2, 1.0)] # (atom_idx1, atom_idx2, bond_order) ethanol = smiles2conformers("CCO", numConfs=5) density = 1000 # kg/m^3 -box = pack([water, ethanol], [7, 5], density, packmol="packmol.jl") +box = pack([water, ethanol], [7, 5], density) print(box) >>> Atoms(symbols='C10H44O12', pbc=True, cell=[8.4, 8.4, 8.4]) +atoms.calc = mace_mp() # MLIP calculator +opt = LBFGS(atoms) +opt.run(fmax=0.01) ``` -All ASE Atoms objects generated or processed by rdkit2ase will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. -If available, rdkit2ase uses this bond information for accurate interconversion. -If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, rdkit2ase leverages RDKit's robust bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. +All ASE Atoms objects generated or processed by `molify` will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. 
+If available, `molify` uses this bond information for accurate interconversion. +If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, `molify` leverages RDKit's bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. ```python -from rdkit2ase import ase2rdkit +from molify import ase2rdkit from rdkit.Chem import Draw mol = ase2rdkit(box) @@ -82,12 +92,12 @@ img = Draw.MolToImage(mol) This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations.. -For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, rdkit2ase can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. +For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, `molify` can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. -For example, rdkit2ase streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. +For example, `molify` streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. ```py -from rdkit2ase import get_substructures +from molify import get_substructures frames: list[ase.Atoms] = get_substructures( atoms=box, @@ -99,15 +109,18 @@ frames: list[ase.Atoms] = get_substructures( F. Z. 
acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, “Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning” Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. # Related software -The functionality of rdkit2ase relies critically on the following packages: +The functionality of `molify` relies critically on the following packages: - [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, SMILES parsing, conformer generation, and substructure searching. - [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating atomic structures, and interfacing with simulation engines. -- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. rdkit2ase can interface with either a PACKMOL executable or the packmol.jl package. +- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. `molify` includes a `packmol` binary when installed from PyPI. - [NetworkX](https://networkx.org/): For the handling and analysis of molecular graphs. -The rdkit2ase package is currently a crucial part of the following software packages: +The `molify` package is currently a crucial part of the following software packages: - [IPSuite](https://github.com/zincware/ipsuite): For generating structures for training MLIPs. - [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. - [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. 
+ +The openBabel[@oboyleOpenBabelOpen2011] package provides similar functionality to RDKit as well as vast file format support. +Currently, there is no support for ase calculators or direct RDKit-ASE interconversion in openBabel. From f09107d0933b5fdde90f107a03eb170a686ff585 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Mon, 17 Nov 2025 08:19:35 +0100 Subject: [PATCH 6/9] update manuscript --- paper/paper.md | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 8e98651..f5fa947 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -21,25 +21,26 @@ bibliography: bibliography.bib # Summary The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. -Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019] or OpenMM [@eastmanOpenMM8Molecular2024]. +Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019], or OpenMM[@eastmanOpenMM8Molecular2024]. Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. 
In contrast to _ab initio_, MLIPs are much faster and can run on much larger systems, making high throughput simulations of more complex systems feasible and increasing the need for efficient initial structure generation. -Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. -However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as generating different conformations or searching substructures. +Concurrently, RDKit[@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. +However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as capabilities for generating different conformations or searching substructures. The `molify` package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. -Furthermore, `molify` integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. -In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX [@hagbergExploringNetworkStructure2008], e.g. enabling traversing or comparing molecular graphs. +Furthermore, `molify` integrates with PACKMOL[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. 
+In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX[@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing molecular graphs. Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. +While designed for MLIP data, the usage of `molify` is not limited and can be expanded, e.g., by utilizing the bond order information in other ASE-based workflows for classical MD simulations or integrating with machine-learning driven bond order predictions. # Statement of need -`molify` serves as a vital link between RDKit, ASE, NetworkX, and PACKMOL. +`molify` serves as a vital link between RDKit, ASE, NetworkX, and PACKMOL, designed to aid working with molecular structures in systems without dedicated topology files and formats. While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. -For example, ASE has no tools for handling topological information such as bonds or molecular identities, whilst rdkit does not natively interface with MLIPs. +For example, ASE has no tools for handling topological information such as bonds or molecular identities, while RDKit does not natively interface with MLIPs. -The package simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. 
+`molify` simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project[@jainCommentaryMaterialsProject2013] or the ZINC database[@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. @@ -49,15 +50,15 @@ This selection and handling allows for similar functionality as is provided by t # Features and Implementation -![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) +![Visualization of a 3D structure from ASE, visualized with ZnDraw[@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) The generation of atomic configurations in `molify` is centered around SMILES for defining molecular species. A typical workflow involves: 1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. 2. Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. -4. Running MLIP-based simulations using ASE calculators. -5. Post-processing the simulation data by identifying and selecting structures based on SMARTS. +3. Running MLIP-based simulations using ASE calculators. +4. Post-processing the simulation data by identifying and selecting structures based on SMARTS. 
```python from molify import pack, smiles2conformers @@ -73,13 +74,13 @@ density = 1000 # kg/m^3 box = pack([water, ethanol], [7, 5], density) print(box) >>> Atoms(symbols='C10H44O12', pbc=True, cell=[8.4, 8.4, 8.4]) -atoms.calc = mace_mp() # MLIP calculator -opt = LBFGS(atoms) +box.calc = mace_mp() # MLIP calculator +opt = LBFGS(box) opt.run(fmax=0.01) ``` -All ASE Atoms objects generated or processed by `molify` will store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. +All ASE Atoms objects generated or processed by `molify` store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. If available, `molify` uses this bond information for accurate interconversion. -If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, `molify` leverages RDKit's bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. +If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, `molify` leverages RDKit's bond perception algorithms[@kimUniversalStructureConversion2015] to estimate this information. A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. ```python @@ -90,13 +91,13 @@ mol = ase2rdkit(box) img = Draw.MolToImage(mol) ``` -This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations.. +This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations. For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, `molify` can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. 
For example, `molify` streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. -```py +```python from molify import get_substructures frames: list[ase.Atoms] = get_substructures( @@ -106,7 +107,7 @@ frames: list[ase.Atoms] = get_substructures( ``` # Acknowledgements -F. Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, “Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning” Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. +F.Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, "Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning" Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. # Related software The functionality of `molify` relies critically on the following packages: @@ -122,5 +123,7 @@ The `molify` package is currently a crucial part of the following software packa - [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. - [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. -The openBabel[@oboyleOpenBabelOpen2011] package provides similar functionality to RDKit as well as vast file format support. -Currently, there is no support for ase calculators or direct RDKit-ASE interconversion in openBabel. 
+The Open Babel[@oboyleOpenBabelOpen2011] package provides similar cheminformatics functionality to RDKit along with extensive file format support. +However, Open Babel is primarily designed as a format conversion tool with a focus on command-line usage and file I/O, while `molify` is designed for Python-native workflows with in-memory object conversions. +Currently, Open Babel does not provide direct support for ASE Atoms objects, ASE calculators (including MLIPs), or seamless RDKit-ASE interconversion within Python. +Furthermore, `molify`'s integration with PACKMOL for system building and NetworkX for graph-based molecular analysis provides capabilities beyond Open Babel's core focus on format conversion. From 53ec2680a5773ddb86616e95593418512a413c45 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Nov 2025 07:20:30 +0000 Subject: [PATCH 7/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- paper/bibliography.bib | 2 +- paper/paper.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/bibliography.bib b/paper/bibliography.bib index af53362..1db153d 100644 --- a/paper/bibliography.bib +++ b/paper/bibliography.bib @@ -223,4 +223,4 @@ @article{oboyleOpenBabelOpen2011 issn = {1758-2946}, doi = {10.1186/1758-2946-3-33}, urldate = {2025-11-15}, -} \ No newline at end of file +} diff --git a/paper/paper.md b/paper/paper.md index f5fa947..f1bf0b6 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,7 +30,7 @@ However, standard RDKit workflows are not designed for MLIP-driven simulation, w The `molify` package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures.
Furthermore, `molify` integrates with PACKMOL[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. -In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX[@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing molecular graphs. +In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX[@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing molecular graphs. Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. While designed for MLIP data, the usage of `molify` is not limited and can be expanded, e.g., by utilizing the bond order information in other ASE-based workflows for classical MD simulations or integrating with machine-learning driven bond order predictions. From 76c4b20cc95fac11be3be704e41fcfdbba834a01 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 25 Nov 2025 21:49:58 +0100 Subject: [PATCH 8/9] update manuscript --- paper/paper.md | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index f1bf0b6..54c03c2 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -15,22 +15,23 @@ authors: affiliations: - name: Institute for Computational Physics, University of Stuttgart, 70569 Stuttgart, Germany index: 1 -date: 2025-06-23 +date: 2025-11-25 bibliography: bibliography.bib --- # Summary The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. 
Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019], or OpenMM[@eastmanOpenMM8Molecular2024]. -Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities, because it originates from _ab initio_ simulations, which share the same setup as MLIP-driven studies. -In contrast to _ab initio_, MLIPs are much faster and can run on much larger systems, making high throughput simulations of more complex systems feasible and increasing the need for efficient initial structure generation. +Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the materials science and soft matter communities. +ASE originates from the electronic structure community, which shares the same setup as MLIP-driven studies. +In contrast to _ab initio_ methods, MLIPs are much faster and can run on much larger systems, making high-throughput simulations of more complex systems feasible and increasing the need for efficient initial structure generation. Concurrently, RDKit[@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. -However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich, explicit chemical information such as bond orders or molecular identities, as well as capabilities for generating different conformations or searching substructures.
+However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich explicit chemical information such as bond orders or molecular identities, as well as capabilities for generating different conformations or searching substructures. The `molify` package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. Furthermore, `molify` integrates with PACKMOL[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. -In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX[@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing molecular graphs. +In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX[@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing them. Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. While designed for MLIP data, the usage of `molify` is not limited and can be expanded, e.g., by utilizing the bond order information in other ASE-based workflows for classical MD simulations or integrating with machine-learning driven bond order predictions. @@ -38,26 +39,26 @@ While designed for MLIP data, the usage of `molify` is not limited and can be ex # Statement of need `molify` serves as a vital link between RDKit, ASE, NetworkX, and PACKMOL, designed to aid working with molecular structures in systems without dedicated topology files and formats. 
While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. -For example, ASE has no tools for handling topological information such as bonds or molecular identities, while RDKit does not natively interface with MLIPs. +For example, ASE has no tools for handling topological information such as bonds or molecular identities, while RDKit cannot natively interface with MLIPs. `molify` simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project[@jainCommentaryMaterialsProject2013] or the ZINC database[@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. -This simplification not only accelerates research but also supports the setup of more complex and chemically diverse simulation scenarios. +With `molify`, more complex and chemically diverse simulation cells are easier to set up and process. -One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is often absent. +One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is not available and changes in connectivity can occur. `molify` addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. 
-This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, targeted towards simulations with a fixed topology. +This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, designed for simulations with a fixed topology. # Features and Implementation ![Visualization of a 3D structure from ASE, visualized with ZnDraw[@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) The generation of atomic configurations in `molify` is centered around SMILES for defining molecular species. -A typical workflow involves: +A typical workflow often follows these steps: 1. Generating 3D conformers for individual molecular species from their SMILES using RDKit. -2. Packing these conformers into a simulation box to achieve a target density using PACKMOL and obtaining an ASE Atoms object representing the simulation cell, ready for use with MLIPs. -3. Running MLIP-based simulations using ASE calculators. +2. Packing these conformers into a simulation box with a target density using PACKMOL and returning the simulation box as an ASE Atoms object. +3. Running MLIP-based simulations using ASE. 4. Post-processing the simulation data by identifying and selecting structures based on SMARTS. ```python @@ -79,9 +80,10 @@ opt = LBFGS(box) opt.run(fmax=0.01) ``` All ASE Atoms objects generated or processed by `molify` store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. -If available, `molify` uses this bond information for accurate interconversion. +If this information is available, `molify` uses it to convert between ASE, NetworkX and RDKit. 
If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, `molify` leverages RDKit's bond perception algorithms[@kimUniversalStructureConversion2015] to estimate this information. -A representation from both packages is shown in \autoref{fig:zndraw-rdkit}. + +A visualisation of the 2D and 3D structure from the simulation is shown in \autoref{fig:zndraw-rdkit}. ```python from molify import ase2rdkit @@ -91,9 +93,17 @@ mol = ase2rdkit(box) img = Draw.MolToImage(mol) ``` +Additionally, `molify` can convert structures to and from NetworkX graphs, enabling graph-based algorithms such as traversal, isomorphism checks, or component analysis: + +```python +from molify import ase2networkx + +graph = ase2networkx(box) # NetworkX graph with atomic numbers and bond orders +``` + This bidirectional conversion capability allows the use of RDKit's chemical analysis tools together with ASE for MLIP-based simulations. -For instance, if during a simulation, atomic positions in an ASE Atoms object are updated, `molify` can convert this structure back to an RDKit molecule to analyze chemical changes or identify specific substructures. +For example, after atomic positions in an ASE Atoms object are updated, `molify` can convert this structure back to an RDKit molecule, allowing for further analysis or visualization. One common example is the extraction of substructures based on SMILES or SMARTS to track their structure and dynamics within a simulation. For example, `molify` streamlines the extraction of the CH$_3$ alkyl group from the ethanol molecules inside the simulation cell, without manual index lookup. @@ -114,7 +124,7 @@ The functionality of `molify` relies critically on the following packages: - [RDKit](https://www.rdkit.org/docs/index.html): For cheminformatics tasks, SMILES parsing, conformer generation, and substructure searching.
- [ASE](https://wiki.fysik.dtu.dk/ase/): For representing and manipulating atomic structures, and interfacing with simulation engines. -- [PACKMOL](https://m3g.github.io/packmol/): For packing molecules into simulation boxes. `molify` includes a `packmol` binary when installed from PyPI. +- [PACKMOL](https://m3g.github.io/packmol/): For optimizing the placement of molecules within a simulation box of specified size. - [NetworkX](https://networkx.org/): For the handling and analysis of molecular graphs. The `molify` package is currently a crucial part of the following software packages: @@ -126,4 +136,4 @@ The `molify` package is currently a crucial part of the following software packa The OpenBabel[@oboyleOpenBabelOpen2011] package provides similar cheminformatics functionality to RDKit along with extensive file format support. However, OpenBabel is primarily designed as a format conversion tool with a focus on command-line usage and file I/O, while `molify` is designed for Python-native workflows with in-memory object conversions. Currently, OpenBabel does not provide direct support for ASE Atoms objects, ASE calculators (including MLIPs), or seamless RDKit-ASE interconversion within Python. -Furthermore, `molify`'s integration with PACKMOL for system building and NetworkX for graph-based molecular analysis provides capabilities beyond OpenBabel's core focus on format conversion. +Furthermore, `molify`'s integration with PACKMOL and NetworkX provides capabilities beyond OpenBabel's core focus on format conversion.
From f7503166b8c753f4b8d4baaf042532fd34ef4dc6 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 9 Dec 2025 11:20:55 +0100 Subject: [PATCH 9/9] editorial comments --- paper/bibliography.bib | 35 +++++++++++++++++------------------ paper/paper.md | 28 ++++++++++++++-------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/paper/bibliography.bib b/paper/bibliography.bib index 1db153d..b886db2 100644 --- a/paper/bibliography.bib +++ b/paper/bibliography.bib @@ -202,25 +202,24 @@ @article{tingleZINC22AFreeMultiBillionScale2023 urldate = {2025-05-24} } @inproceedings{hagbergExploringNetworkStructure2008, - author = {Aric A. Hagberg and Daniel A. Schult and Pieter J. Swart}, - title = {Exploring Network Structure, Dynamics, and Function using NetworkX}, - booktitle = {Proceedings of the 7th Python in Science Conference}, - pages = {11 - 15}, - address = {Pasadena, CA USA}, - year = {2008}, - editor = {Ga\"el Varoquaux and Travis Vaught and Jarrod Millman} + author = {Aric A. Hagberg and Daniel A. Schult and Pieter J. Swart}, + year = {2008}, + month = {06}, + title = {Exploring Network Structure, Dynamics, and Function Using {{NetworkX}}}, + booktitle = {Proceedings of the 7th Python in Science Conference}, + doi = {10.25080/TCWV9851} } @article{oboyleOpenBabelOpen2011, - title = {Open {{Babel}}: {{An}} Open Chemical Toolbox}, + title = {Open {{Babel}}: {{An}} Open Chemical Toolbox}, shorttitle = {Open {{Babel}}}, - author = {O'Boyle, Noel M. and Banck, Michael and James, Craig A. and Morley, Chris and Vandermeersch, Tim and Hutchison, Geoffrey R.}, - year = 2011, - month = oct, - journal = {Journal of Cheminformatics}, - volume = {3}, - number = {1}, - pages = {33}, - issn = {1758-2946}, - doi = {10.1186/1758-2946-3-33}, - urldate = {2025-11-15}, + author = {O'Boyle, Noel M. and Banck, Michael and James, Craig A.
and Morley, Chris and Vandermeersch, Tim and Hutchison, Geoffrey R.}, + year = 2011, + month = oct, + journal = {Journal of Cheminformatics}, + volume = {3}, + number = {1}, + pages = {33}, + issn = {1758-2946}, + doi = {10.1186/1758-2946-3-33}, + urldate = {2025-11-15} } diff --git a/paper/paper.md b/paper/paper.md index 54c03c2..2b365d8 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -21,17 +21,17 @@ bibliography: bibliography.bib # Summary The increasing prevalence of Machine-Learned Interatomic Potentials (MLIPs) has shifted requirements for setting up atomistic simulations. -Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS[@abrahamGROMACSHighPerformance2015], LAMMPS[@LAMMPS], ESPResSo[@weikESPResSo40Extensible2019], or OpenMM[@eastmanOpenMM8Molecular2024]. +Unlike classical force fields, MLIPs primarily require atomic positions and species, thereby removing the need for predefined topology files used for classical force fields in molecular dynamics software like GROMACS [@abrahamGROMACSHighPerformance2015], LAMMPS [@LAMMPS], ESPResSo [@weikESPResSo40Extensible2019], or OpenMM [@eastmanOpenMM8Molecular2024]. Consequently, the Atomic Simulation Environment (ASE) [@larsenAtomicSimulationEnvironment2017] has become a popular Python toolkit for handling atomic structures and interfacing with MLIPs, particularly within the material science and soft matter communities. ASE originates from the electronic structure community, which shares the same setup as MLIP-driven studies. In contrast to _ab initio_, MLIPs are much faster and can run on much larger systems, making high-throughput simulations of more complex systems feasible and increasing the need for efficient initial structure generation. 
-Concurrently, RDKit[@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. +Concurrently, RDKit [@landrumRdkitRdkit2023_03_22023] offers extensive functionality for cheminformatics and manipulating chemical structures. However, standard RDKit workflows are not designed for MLIP-driven simulation, while typical ASE-MLIP workflows may lack rich explicit chemical information such as bond orders or molecular identities, as well as capabilities for generating different conformations or searching substructures. The `molify` package bridges this gap, providing an interface between RDKit's chemical structure generation and cheminformatics capabilities and ASE's handling of 3D atomic structures. -Furthermore, `molify` integrates with PACKMOL[@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. -In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX[@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing them. +Furthermore, `molify` integrates with PACKMOL [@martinezPACKMOLPackageBuilding2009] to facilitate the creation of complex, periodic simulation cells with diverse chemical compositions, all while preserving crucial chemical connectivity information. +In addition, `molify` simplifies the representation of molecular structures as graphs using NetworkX [@hagbergExploringNetworkStructure2008], e.g., enabling traversing or comparing them. Lastly, the combination of these packages enables selection and manipulation of atomistic structures based on chemical knowledge rather than manual index handling. 
While designed for MLIP data, the usage of `molify` is not limited and can be expanded, e.g., by utilizing the bond order information in other ASE-based workflows for classical MD simulations or integrating with machine-learning driven bond order predictions. @@ -41,17 +41,17 @@ While designed for MLIP data, the usage of `molify` is not limited and can be ex While its core function is to interface these tools, it thereby unlocks new capabilities and significantly reduces the manual coding and data wrangling typically required for preparing and analyzing molecular simulations. For example, ASE has no tools for handling topological information such as bonds or molecular identities, while RDKit cannot natively interface with MLIPs. -`molify` simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project[@jainCommentaryMaterialsProject2013] or the ZINC database[@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. +`molify` simplifies workflows that previously involved laborious tasks such as sourcing individual structure files from various databases (e.g., the Materials Project [@jainCommentaryMaterialsProject2013] or the ZINC database [@tingleZINC22AFreeMultiBillionScale2023]) and custom setups of simulation cells. With `molify`, more complex and chemically diverse simulation cells are easier to set up and process. One challenge in MLIP-driven simulations is the post-simulation identification and analysis of molecular fragments or chemical changes, as explicit topological information is not available and changes in connectivity can occur. -`molify` addresses this by enabling the use of RDKit's powerful SMILES[@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. -In addition, the resulting molecular graph can be exported to a NetworkX[@hagbergExploringNetworkStructure2008] object for further analysis. 
-This selection and handling allows for similar functionality as is provided by the MDAnalysis[@gowersMDAnalysisPythonPackage2016] atom selection language, designed for simulations with a fixed topology. +`molify` addresses this by enabling the use of RDKit's powerful SMILES [@weiningerSMILESChemicalLanguage1988]/SMARTS-based substructure searching on ASE structures. +In addition, the resulting molecular graph can be exported to a NetworkX [@hagbergExploringNetworkStructure2008] object for further analysis. +This selection and handling allows for similar functionality as is provided by the MDAnalysis [@gowersMDAnalysisPythonPackage2016] atom selection language, designed for simulations with a fixed topology. # Features and Implementation -![Visualization of a 3D structure from ASE, visualized with ZnDraw[@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) +![Visualization of a 3D structure from ASE, visualized with ZnDraw [@elijosiusZeroShotMolecular2024] (left) and its corresponding RDKit 2D chemical structure representation (right).\label{fig:zndraw-rdkit}](zndraw_rdkit.svg) The generation of atomic configurations in `molify` is centered around SMILES for defining molecular species. A typical workflow often follows these steps: @@ -81,7 +81,7 @@ opt.run(fmax=0.01) ``` All ASE Atoms objects generated or processed by `molify` store `connectivity` information (bonds and their orders) within the `ase.Atoms.info` dictionary. If this information is available, `molify` uses it to convert between ASE, NetworkX and RDKit. -If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, `molify` leverages RDKit's bond perception algorithms[@kimUniversalStructureConversion2015] to estimate this information. 
+If an ASE structure is converted to an RDKit molecule without pre-existing connectivity, `molify` leverages RDKit's bond perception algorithms [@kimUniversalStructureConversion2015] to estimate this information. A visualisation of the 2D and 3D structure from the simulation is shown in \autoref{fig:zndraw-rdkit}. @@ -116,9 +116,6 @@ frames: list[ase.Atoms] = get_substructures( ) ``` -# Acknowledgements -F.Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, "Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning" Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided. - # Related software The functionality of `molify` relies critically on the following packages: @@ -133,7 +130,10 @@ The `molify` package is currently a crucial part of the following software packa - [ZnDraw](https://github.com/zincware/zndraw): Interactive generation of simulation boxes and selection of substructures through a graphical user interface inside a web-based visualization package. - [mlipx](https://github.com/basf/mlipx): Creating initial structures for benchmarking different MLIPs on real-world test scenarios. -The OpenBabel[@oboyleOpenBabelOpen2011] package provides similar cheminformatics functionality to RDKit along with extensive file format support. +The OpenBabel [@oboyleOpenBabelOpen2011] package provides similar cheminformatics functionality to RDKit along with extensive file format support. However, OpenBabel is primarily designed as a format conversion tool with a focus on command-line usage and file I/O, while `molify` is designed for Python-native workflows with in-memory object conversions. 
Currently, OpenBabel does not provide direct support for ASE Atoms objects, ASE calculators (including MLIPs), or seamless RDKit-ASE interconversion within Python. Furthermore, `molify`'s integration with PACKMOL and NetworkX provides capabilities beyond OpenBabel's core focus on format conversion. + +# Acknowledgements +F.Z. acknowledges support by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) in the framework of the priority program SPP 2363, "Utilization and Development of Machine Learning for Molecular Applications – Molecular Machine Learning" Project No. 497249646. Further funding through the DFG under Germany's Excellence Strategy – EXC 2075 – 390740016 and the Stuttgart Center for Simulation Science (SimTech) was provided.