From 66bdb9e51bb9822d7571fc9f8e0d300bf79a4f8f Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Mon, 28 Oct 2024 04:27:58 +0100 Subject: [PATCH 01/42] work in progress: implemented the option for the fragmenter to return saturated or unsaturated fragments, added a test and renamed some tests to prepare for the tests for unsaturated fragments --- .../cdk/fragment/ExhaustiveFragmenter.java | 143 +++++++++++++----- .../fragment/ExhaustiveFragmenterTest.java | 80 ++++++---- 2 files changed, 154 insertions(+), 69 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 1111ff0921c..7e5711ee94a 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -29,6 +29,7 @@ import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.interfaces.IRingSet; +import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.tools.CDKHydrogenAdder; import org.openscience.cdk.tools.ILoggingTool; @@ -42,10 +43,24 @@ /** * Generate fragments exhaustively. - * + *

* This fragmentation scheme simply breaks single non-ring bonds. By default - * fragments smaller than 6 atoms in size are not considered, but this can be - * changed by the user. Side chains are retained. + * fragments smaller than 6 atoms in size are not considered and the returned + * fragments are not saturated, but this can be changed by the user. + * Side chains are retained. + * + *

Example Usage

+ * + *
{@code
+ * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); // by default this returns unsaturated fragments with a minimum size of 6 atoms
+ * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
+ * IAtomContainer mol = smiParser.parseSmiles("c1ccccc1CC(N)C(=O)O");
+ * fragmenter.generateFragments(mol);
+ * // if you want the SMILES representation of the fragments
+ * String[] smilesFragments = fragmenter.getFragments();
+ * // if you want the Atom containers
+ * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers();
+ * }
* * @author Rajarshi Guha * @cdk.module fragment @@ -54,12 +69,28 @@ */ public class ExhaustiveFragmenter implements IFragmenter { + /** + * Defines the saturation of the returned fragments. + */ + public enum Saturation { + /** + * Fragments will get returned saturated. + */ + SATURATED_FRAGMENTS, + + /** + * Fragments will get returned unsaturated. + */ + UNSATURATED_FRAGMENTS + } + private static final int DEFAULT_MIN_FRAG_SIZE = 6; + private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS; final Map fragMap; final SmilesGenerator smilesGenerator; - String[] fragments = null; int minFragSize; + Saturation saturationSetting; private static final ILoggingTool logger = LoggingToolFactory .createLoggingTool(ExhaustiveFragmenter.class); @@ -67,18 +98,44 @@ public class ExhaustiveFragmenter implements IFragmenter { * Instantiate fragmenter with default minimum fragment size. */ public ExhaustiveFragmenter() { - this(DEFAULT_MIN_FRAG_SIZE); + this(DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); } /** - * Instantiate fragmenter with user specified minimum fragment size. + * Instantiate fragmenter with user specified minimum fragment size and default saturation (saturated fragments). * - * @param minFragSize the minimum fragment size desired + * @param minFragSize the minimum fragment size desired. */ public ExhaustiveFragmenter(int minFragSize) { this.minFragSize = minFragSize; + this.saturationSetting = DEFAULT_SATURATION; + fragMap = new HashMap<>(); + smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + } + + /** + * Instantiate fragmenter with default minimum fragment size and user specified saturation setting. + * + * @param saturationSetting setting to specify if the returned fragments should be saturated or not. 
+ */ + public ExhaustiveFragmenter(Saturation saturationSetting) { + this.minFragSize = DEFAULT_MIN_FRAG_SIZE; + this.saturationSetting = saturationSetting; + fragMap = new HashMap<>(); + smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + } + + /** + * Instantiate fragmenter with user specified minimum fragment size. + * + * @param minFragSize the minimum fragment size desired. + * @param saturationSetting setting to specify if the returned fragments should be saturated or not. + */ + public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { + this.minFragSize = minFragSize; + this.saturationSetting = saturationSetting; fragMap = new HashMap<>(); - smilesGenerator = SmilesGenerator.unique().aromatic(); + smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); } /** @@ -90,6 +147,15 @@ public void setMinimumFragmentSize(int minFragSize) { this.minFragSize = minFragSize; } + /** + * Set the saturation setting of the returned fragments. + * + * @param saturationSetting setting to specify if the returned fragments should be saturated or not. + */ + public void setSaturationSetting(Saturation saturationSetting) { + this.saturationSetting = saturationSetting; + } + /** * Generate fragments for the input molecule. 
* @@ -98,21 +164,25 @@ public void setMinimumFragmentSize(int minFragSize) { @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { fragMap.clear(); - run(atomContainer); + if (this.saturationSetting == Saturation.UNSATURATED_FRAGMENTS) { + runUnsaturated(atomContainer); + } else { + runSaturated(atomContainer); + } } - private List run(IAtomContainer atomContainer) throws CDKException { + private void runSaturated(IAtomContainer atomContainer) throws CDKException { - ArrayList fragments = new ArrayList<>(); - - if (atomContainer.getBondCount() < 3) return fragments; + if (atomContainer.getBondCount() < 3) return; List splitableBonds = getSplitableBonds(atomContainer); - if (splitableBonds.size() == 0) return fragments; + if (splitableBonds.size() == 0) return; logger.debug("Got " + splitableBonds.size() + " splittable bonds"); String tmpSmiles; +// int[] saturatedAtomIDs = new int[splitableBonds.size() * 2]; for (IBond bond : splitableBonds) { List parts = FragmentUtils.splitMolecule(atomContainer, bond); + // make sure we don't add the same fragment twice for (IAtomContainer partContainer : parts) { AtomContainerManipulator.clearAtomConfigurations(partContainer); @@ -123,41 +193,40 @@ private List run(IAtomContainer atomContainer) throws CDKExcepti Aromaticity.cdkLegacy().apply(partContainer); tmpSmiles = smilesGenerator.create(partContainer); if (partContainer.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - fragments.add(partContainer); fragMap.put(tmpSmiles, partContainer); + if (partContainer.getAtomCount() > minFragSize) { + runSaturated(partContainer); + } } } } + } - // try and partition the fragments - List tmp = new ArrayList<>(fragments); - for (IAtomContainer fragment : fragments) { - if (fragment.getBondCount() < 3 || fragment.getAtomCount() < minFragSize) continue; - if (getSplitableBonds(fragment).size() == 0) continue; + private void runUnsaturated(IAtomContainer atomContainer) throws 
CDKException { - List frags = run(fragment); - if (frags.size() == 0) continue; + if (atomContainer.getBondCount() < 3) return; + List splitableBonds = getSplitableBonds(atomContainer); + if (splitableBonds.size() == 0) return; + logger.debug("Got " + splitableBonds.size() + " splittable bonds"); - for (IAtomContainer frag : frags) { - if (frag.getBondCount() < 3) continue; - AtomContainerManipulator.clearAtomConfigurations(frag); - for (IAtom atom : frag.atoms()) - atom.setImplicitHydrogenCount(null); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(frag); - CDKHydrogenAdder.getInstance(frag.getBuilder()).addImplicitHydrogens(frag); - Aromaticity.cdkLegacy().apply(frag); - tmpSmiles = smilesGenerator.create(frag); - if (frag.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - tmp.add(frag); - fragMap.put(tmpSmiles, frag); + String tmpSmiles; + for (IBond bond : splitableBonds) { + List parts = FragmentUtils.splitMolecule(atomContainer, bond); + + // make sure we don't add the same fragment twice + for (IAtomContainer partContainer : parts) { + tmpSmiles = smilesGenerator.create(partContainer); + if (partContainer.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { + fragMap.put(tmpSmiles, partContainer); + if (partContainer.getAtomCount() > minFragSize) { + runSaturated(partContainer); + } } } } - fragments = new ArrayList<>(tmp); - return fragments; } - private List getSplitableBonds(IAtomContainer atomContainer) throws CDKException { + private List getSplitableBonds(IAtomContainer atomContainer) { // do ring detection SpanningTree spanningTree = new SpanningTree(atomContainer); IRingSet allRings = spanningTree.getAllRings(); diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index b47e9b42adb..46a62aa6144 100644 --- 
a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -22,6 +22,8 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.openscience.cdk.smiles.SmiFlavor; +import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.test.CDKTestCase; import org.openscience.cdk.DefaultChemObjectBuilder; import org.openscience.cdk.interfaces.IAtomContainer; @@ -40,83 +42,85 @@ */ class ExhaustiveFragmenterTest extends CDKTestCase { - private static ExhaustiveFragmenter fragmenter; + private static ExhaustiveFragmenter fragmenterSaturated; + private static ExhaustiveFragmenter fragmenterUnsaturated; private static SmilesParser smilesParser; @BeforeAll static void setup() { - fragmenter = new ExhaustiveFragmenter(); + fragmenterSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.SATURATED_FRAGMENTS); + fragmenterUnsaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); } @Test - void testEF1() throws Exception { + void testEF1WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertEquals(0, frags.length); } @Test - void testEF2() throws Exception { + void testEF2WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertEquals(0, frags.length); } @Test - void testEF3() throws 
Exception { + void testEF3WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"C1CCCCC1"})); } @Test - void testEF4() throws Exception { + void testEF4WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); } @Test - void testEF5() throws Exception { + void testEF5WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), hasItems("c1ccc(cc1)C", "c1ccccc1")); - Assertions.assertNotNull(fragmenter.getFragmentsAsContainers()); - Assertions.assertEquals(2, fragmenter.getFragmentsAsContainers().length); + Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); + Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); } @Test - void testEF6() throws Exception { + void testEF6WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); 
org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); - Assertions.assertNotNull(fragmenter.getFragmentsAsContainers()); - Assertions.assertEquals(1, fragmenter.getFragmentsAsContainers().length); + Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); + Assertions.assertEquals(1, fragmenterSaturated.getFragmentsAsContainers().length); } @Test - void testEF7() throws Exception { + void testEF7WithSaturation() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - fragmenter.generateFragments(mol); - List frags = Arrays.asList(fragmenter.getFragments()); + fragmenterSaturated.generateFragments(mol); + List frags = Arrays.asList(fragmenterSaturated.getFragments()); Assertions.assertNotNull(frags); Assertions.assertEquals(25, frags.size()); - Assertions.assertNotNull(fragmenter.getFragmentsAsContainers()); - Assertions.assertEquals(25, fragmenter.getFragmentsAsContainers().length); + Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); + Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); org.hamcrest.MatcherAssert.assertThat(frags, hasItems("c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2")); } @@ -124,11 +128,23 @@ void testEF7() throws Exception { @Test void testMinSize() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - fragmenter.setMinimumFragmentSize(6); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + fragmenterSaturated.setMinimumFragmentSize(6); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(1, frags.length); Assertions.assertTrue(frags[0].equals("C1CCCCC1")); } + + @Test + void testEqualityOfSmilesAndContainers() throws Exception { + SmilesGenerator smilesGenerator = new 
SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); + fragmenterSaturated.generateFragments(mol); + List smilesFrags = Arrays.asList(fragmenterSaturated.getFragments()); + IAtomContainer[] containerFrags = fragmenterSaturated.getFragmentsAsContainers(); + for (IAtomContainer frag : containerFrags) { + org.hamcrest.MatcherAssert.assertThat(smilesFrags, hasItems(smilesGenerator.create(frag))); + } + } } From 9ad00c978bd22a99cdf5bd7b07597d08c03e6d37 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Mon, 28 Oct 2024 17:03:39 +0100 Subject: [PATCH 02/42] added tests for unsaturated setting and fixed unsaturated run --- .../cdk/fragment/ExhaustiveFragmenter.java | 8 +- .../fragment/ExhaustiveFragmenterTest.java | 95 +++++++++++++++++-- 2 files changed, 92 insertions(+), 11 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 7e5711ee94a..44e887c0ae0 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -212,14 +212,14 @@ private void runUnsaturated(IAtomContainer atomContainer) throws CDKException { String tmpSmiles; for (IBond bond : splitableBonds) { List parts = FragmentUtils.splitMolecule(atomContainer, bond); - // make sure we don't add the same fragment twice for (IAtomContainer partContainer : parts) { tmpSmiles = smilesGenerator.create(partContainer); - if (partContainer.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { + int fragmentSize = partContainer.getAtomCount(); + if (fragmentSize >= minFragSize && !fragMap.containsKey(tmpSmiles)) { fragMap.put(tmpSmiles, partContainer); - if (partContainer.getAtomCount() > minFragSize) { - runSaturated(partContainer); + if (fragmentSize > 
minFragSize) { + runUnsaturated(partContainer); } } } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 46a62aa6144..23c84501d7e 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -19,6 +19,7 @@ */ package org.openscience.cdk.fragment; +import org.hamcrest.MatcherAssert; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -54,7 +55,15 @@ static void setup() { } @Test - void testEF1WithSaturation() throws Exception { + void testEF1Unsaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCC"); + fragmenterUnsaturated.generateFragments(mol); + String[] frags = fragmenterUnsaturated.getFragments(); + Assertions.assertEquals(0, frags.length); + } + + @Test + void testEF1Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -62,7 +71,15 @@ void testEF1WithSaturation() throws Exception { } @Test - void testEF2WithSaturation() throws Exception { + void testEF2Unsaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); + fragmenterUnsaturated.generateFragments(mol); + String[] frags = fragmenterUnsaturated.getFragments(); + Assertions.assertEquals(0, frags.length); + } + + @Test + void testEF2Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -70,7 +87,15 @@ void testEF2WithSaturation() throws Exception { } @Test - void testEF3WithSaturation() throws Exception { + void testEF3Unsaturated() throws 
Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); + fragmenterUnsaturated.generateFragments(mol); + String[] frags = fragmenterUnsaturated.getFragments(); + MatcherAssert.assertThat(frags, is(new String[]{"[CH]1CCCCC1"})); + } + + @Test + void testEF3Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -78,7 +103,16 @@ void testEF3WithSaturation() throws Exception { } @Test - void testEF4WithSaturation() throws Exception { + void testEF4Unsaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); + fragmenterUnsaturated.generateFragments(mol); + String[] frags = fragmenterUnsaturated.getFragments(); + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(frags, is(new String[]{"[c]1ccccc1"})); + } + + @Test + void testEF4Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -86,8 +120,20 @@ void testEF4WithSaturation() throws Exception { org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); } + + @Test + void testEF5Unsaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); + fragmenterUnsaturated.generateFragments(mol); + String[] frags = fragmenterUnsaturated.getFragments(); + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(Arrays.asList(frags), hasItems("[CH2]c1ccccc1", "[c]1ccccc1")); + Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); + Assertions.assertEquals(2, fragmenterUnsaturated.getFragmentsAsContainers().length); + } + @Test - void testEF5WithSaturation() throws Exception { + void testEF5Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); 
fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -95,11 +141,24 @@ void testEF5WithSaturation() throws Exception { org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), hasItems("c1ccc(cc1)C", "c1ccccc1")); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); + } + + + @Test + void testEF6Unsaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); + fragmenterUnsaturated.generateFragments(mol); + String[] frags = fragmenterUnsaturated.getFragments(); + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(frags, is(new String[]{"[c]1ccccc1"})); + + Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); + Assertions.assertEquals(1, fragmenterUnsaturated.getFragmentsAsContainers().length); } @Test - void testEF6WithSaturation() throws Exception { + void testEF6Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -108,11 +167,33 @@ void testEF6WithSaturation() throws Exception { Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(1, fragmenterSaturated.getFragmentsAsContainers().length); + } + + @Test + void testEF7Unsaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); + fragmenterUnsaturated.generateFragments(mol); + List frags = Arrays.asList(fragmenterUnsaturated.getFragments()); + Assertions.assertNotNull(frags); + // There is one additional fragment in comparison to the saturated version because there are following fragments: + // [C]1CCC([CH2])C1 + // [CH2][C]1C[CH]CC1 + // these fragments only differ in the number of hydrogens bonded to their respective carbon atoms. 
So these + // fragments would show up as one if saturated. + Assertions.assertEquals(26, frags.size()); + + Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); + Assertions.assertEquals(26, fragmenterUnsaturated.getFragmentsAsContainers().length); + + for (String smile : frags) { + System.out.println(smile); + } + MatcherAssert.assertThat(frags, hasItems("[c]1ccccc1", "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[CH2]C1CCC([CH2])(c2ccccc2)C1")); } @Test - void testEF7WithSaturation() throws Exception { + void testEF7Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); fragmenterSaturated.generateFragments(mol); List frags = Arrays.asList(fragmenterSaturated.getFragments()); From db70ad85acfd266824c9989590dd7010a6e97b0c Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 28 Nov 2024 18:12:38 +0100 Subject: [PATCH 03/42] work in progress: first copy approaches to fix saturation problem and algorithm changes to use a power set to reduce computations --- .../cdk/fragment/ExhaustiveFragmenter.java | 178 +++++++++++++++--- .../fragment/ExhaustiveFragmenterTest.java | 7 +- 2 files changed, 150 insertions(+), 35 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 44e887c0ae0..3abf8d3e69b 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -25,10 +25,7 @@ import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.graph.SpanningTree; -import org.openscience.cdk.interfaces.IAtom; -import org.openscience.cdk.interfaces.IAtomContainer; -import org.openscience.cdk.interfaces.IBond; -import org.openscience.cdk.interfaces.IRingSet; +import 
org.openscience.cdk.interfaces.*; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.tools.CDKHydrogenAdder; @@ -36,17 +33,16 @@ import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.math.BigInteger; +import java.util.*; +import java.util.stream.IntStream; /** * Generate fragments exhaustively. *

- * This fragmentation scheme simply breaks single non-ring bonds. By default - * fragments smaller than 6 atoms in size are not considered and the returned - * fragments are not saturated, but this can be changed by the user. + * This fragmentation scheme simply breaks single non-ring bonds. By default, + * fragments smaller than 6 atoms (without implicit hydrogen) in size are not + * considered and the returned fragments are not saturated, but this can be changed by the user. * Side chains are retained. * *

Example Usage

@@ -174,40 +170,66 @@ public void generateFragments(IAtomContainer atomContainer) throws CDKException private void runSaturated(IAtomContainer atomContainer) throws CDKException { if (atomContainer.getBondCount() < 3) return; - List splitableBonds = getSplitableBonds(atomContainer); - if (splitableBonds.size() == 0) return; - logger.debug("Got " + splitableBonds.size() + " splittable bonds"); + IBond[] splittableBonds = getSplitableBonds(atomContainer); + int splittableBondsLength = splittableBonds.length; + if (splittableBondsLength == 0) return; + logger.debug("Got " + splittableBondsLength + " splittable bonds"); - String tmpSmiles; -// int[] saturatedAtomIDs = new int[splitableBonds.size() * 2]; - for (IBond bond : splitableBonds) { - List parts = FragmentUtils.splitMolecule(atomContainer, bond); + // If we want to check all unique combinations of splittings we calculate the power set of the splittable bonds. + // which is 2^n and without considering the empty set we can say it is 2^n - 1. 
+ // example: + // if we have a set of splittable bonds here represented as numbers {1, 2, 3}, we can describe all unique + // subsets as follows: + // {1} + // {2} + // {3} + // {1,2} + // {1,3} + // {2,3} + // {1,2,3} + BigInteger numberOfIterations = BigInteger.ONE.shiftLeft(splittableBondsLength).subtract(BigInteger.ONE); - // make sure we don't add the same fragment twice + List> allSubsets = generateSubsets(IntStream.rangeClosed(0, splittableBondsLength).toArray()); + int[] splittableBondIndices = new int[splittableBondsLength]; + for (int i = 0; i < splittableBondsLength; i++) { + splittableBondIndices[i] = splittableBonds[i].getIndex(); + } + + for (BigInteger i = BigInteger.ZERO; i.compareTo(numberOfIterations) < 0; i = i.add(BigInteger.ONE)){ + int subsetSize = allSubsets.get(i.intValue()).size(); + IBond[] bondsToRemove = new IBond[subsetSize]; + for (int j = 0; j < subsetSize; j++) { + bondsToRemove[j] = atomContainer.getBond(splittableBondIndices[j]); + } +// List parts = FragmentUtils.splitMolecule(molToSplit, bondToSplit); + IAtomContainer[] parts = splitMoleculeWithCopy(atomContainer, bondsToRemove); for (IAtomContainer partContainer : parts) { AtomContainerManipulator.clearAtomConfigurations(partContainer); - for (IAtom atom : partContainer.atoms()) - atom.setImplicitHydrogenCount(null); + for (IAtom atom : partContainer.atoms()) { + atom.setImplicitHydrogenCount(0); + } AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); Aromaticity.cdkLegacy().apply(partContainer); - tmpSmiles = smilesGenerator.create(partContainer); - if (partContainer.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { + String tmpSmiles = smilesGenerator.create(partContainer); + int numberOfAtoms = partContainer.getAtomCount(); + if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { fragMap.put(tmpSmiles, partContainer); - if 
(partContainer.getAtomCount() > minFragSize) { - runSaturated(partContainer); - } + } + if (numberOfAtoms < minFragSize) { + break; } } + } } private void runUnsaturated(IAtomContainer atomContainer) throws CDKException { if (atomContainer.getBondCount() < 3) return; - List splitableBonds = getSplitableBonds(atomContainer); - if (splitableBonds.size() == 0) return; - logger.debug("Got " + splitableBonds.size() + " splittable bonds"); + IBond[] splitableBonds = getSplitableBonds(atomContainer); + if (splitableBonds.length == 0) return; + logger.debug("Got " + splitableBonds.length + " splittable bonds"); String tmpSmiles; for (IBond bond : splitableBonds) { @@ -226,7 +248,7 @@ private void runUnsaturated(IAtomContainer atomContainer) throws CDKException { } } - private List getSplitableBonds(IAtomContainer atomContainer) { + private IBond[] getSplitableBonds(IAtomContainer atomContainer) { // do ring detection SpanningTree spanningTree = new SpanningTree(atomContainer); IRingSet allRings = spanningTree.getAllRings(); @@ -252,7 +274,101 @@ private List getSplitableBonds(IAtomContainer atomContainer) { if (!(isInRing || isTerminal)) splitableBonds.add(bond); } - return splitableBonds; + return splitableBonds.toArray(new IBond[0]); + } + + private static List> generateSubsets(int[] nums) { + int n = nums.length; + // Just in case n > 32, we make an integer of arbitrary size. + BigInteger numOfSubsets = BigInteger.ONE.shiftLeft(n); + List> result = new ArrayList<>(numOfSubsets.intValue()); + + // we can collect all subsets if we iterate from one (to disregard the empty set) to the number + // of possible subsets and check for each number which bits are set to one and replace this + // index by the respective number at the same index from the given nums list. + // Example: + // nums = [1, 2, 3] + // first iteration: + // 0b001 (1) + // -> [1] + // second iteration: + // 0b010 (2) + // -> [2] + // third iteration: + // 0b011 (3) + // -> [1, 2] + // ... 
+ for (BigInteger i = BigInteger.ONE; i.compareTo(numOfSubsets) < 0; i = i.add(BigInteger.ONE)) { + List subset = new ArrayList<>(); + for (int j = 0; j < n; j++) { + if (i.testBit(j)) { + subset.add(nums[j]); + } + } + result.add(subset); + } + return result; + } + + private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) { + IAtom cpyAtom = atomContainer.newAtom(originalAtom.getAtomicNumber(), + originalAtom.getImplicitHydrogenCount()); + cpyAtom.setIsAromatic(originalAtom.isAromatic()); + cpyAtom.setValency(originalAtom.getValency()); + cpyAtom.setAtomTypeName(originalAtom.getAtomTypeName()); + return cpyAtom; + } + + private static IAtomContainer[] splitMoleculeWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { + boolean[] alreadyVisited = new boolean[mol.getAtomCount()]; + // set all values of already visited to false + Arrays.fill(alreadyVisited, false); + int numberOfFragments = bondsToSplit.length + 1; + IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; + for (IBond bond : bondsToSplit) { + mol.removeBond(bond); + } + for (int i = 0; i < numberOfFragments; i++) { + // new container to hold a fragment + IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); + + // a stack to make a DFS through the subgraph + IAtom firstAtom; + Stack atomStack = new Stack<>(); + if (i == 0) { + atomStack.add(bondsToSplit[0].getBegin()); + firstAtom = copyAtom(atomStack.peek(), fragmentContainer); + for (IAtom nbor : firstAtom.neighbors()) { + IAtom cpyNbor = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(firstAtom, cpyNbor, mol.getBond(atomStack.peek(), nbor).getOrder()); + atomStack.add(nbor); + } + } else { + atomStack.add(bondsToSplit[i - 1].getEnd()); + firstAtom = copyAtom(atomStack.peek(), fragmentContainer); + for (IAtom nbor : firstAtom.neighbors()) { + IAtom cpyNbor = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(firstAtom, cpyNbor, 
mol.getBond(atomStack.peek(), nbor).getOrder()); + atomStack.add(nbor); + } + } + while (!atomStack.isEmpty()) { + IAtom lastAtom = atomStack.pop(); + IAtom cpyAtom = copyAtom(lastAtom, fragmentContainer); + alreadyVisited[lastAtom.getIndex()] = true; + //FIXME: Add cycle connections together !!!! + for (IAtom neighbor: lastAtom.neighbors()) { + if (alreadyVisited[neighbor.getIndex()] == false) { + alreadyVisited[neighbor.getIndex()] = true; + IAtom cpyNeighbor = copyAtom(neighbor, fragmentContainer); + fragmentContainer.newBond(cpyAtom, cpyNeighbor, mol.getBond(lastAtom, neighbor).getOrder()); + atomStack.add(neighbor); + } + } + } + fragments[i] = fragmentContainer; + } + return fragments; } /** diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 23c84501d7e..a784288daf8 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -185,10 +185,6 @@ void testEF7Unsaturated() throws Exception { Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(26, fragmenterUnsaturated.getFragmentsAsContainers().length); - for (String smile : frags) { - System.out.println(smile); - } - MatcherAssert.assertThat(frags, hasItems("[c]1ccccc1", "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[CH2]C1CCC([CH2])(c2ccccc2)C1")); } @@ -198,6 +194,9 @@ void testEF7Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); List frags = Arrays.asList(fragmenterSaturated.getFragments()); Assertions.assertNotNull(frags); + for (String f : frags) { + System.out.println(f); + } Assertions.assertEquals(25, frags.size()); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); From 1584523e35f27e5799026c4e0b8e364ee4f2cc0d Mon Sep 17 00:00:00 2001 From: 
ToLeWeiss Date: Thu, 28 Nov 2024 18:23:16 +0100 Subject: [PATCH 04/42] fix subset generation for case where initial capacity might be negative or too small because of truncation of BigInt to int --- .../openscience/cdk/fragment/ExhaustiveFragmenter.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 3abf8d3e69b..826d71333e1 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -280,8 +280,14 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { private static List> generateSubsets(int[] nums) { int n = nums.length; // Just in case n > 32, we make an integer of arbitrary size. + List> result; BigInteger numOfSubsets = BigInteger.ONE.shiftLeft(n); - List> result = new ArrayList<>(numOfSubsets.intValue()); + if (n > 32) { + result = new ArrayList<>(Integer.MAX_VALUE); + } + else { + result = new ArrayList<>(numOfSubsets.intValue()); + } // we can collect all subsets if we iterate from one (to disregard the empty set) to the number // of possible subsets and check for each number which bits are set to one and replace this From 1831ce5234552211cfe087aa3d092956dfc5a06c Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 28 Nov 2024 18:52:19 +0100 Subject: [PATCH 05/42] switching to primitive integers because java Lists are indexed by these and therefore cant have more elements than 2^31 - 1 --- .../cdk/fragment/ExhaustiveFragmenter.java | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 826d71333e1..bf32f2cb384 100644 --- 
a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -187,7 +187,8 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { // {1,3} // {2,3} // {1,2,3} - BigInteger numberOfIterations = BigInteger.ONE.shiftLeft(splittableBondsLength).subtract(BigInteger.ONE); + int numberOfIterations = (1 << splittableBondsLength) - 1; + List> allSubsets = generateSubsets(IntStream.rangeClosed(0, splittableBondsLength).toArray()); int[] splittableBondIndices = new int[splittableBondsLength]; @@ -195,8 +196,8 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { splittableBondIndices[i] = splittableBonds[i].getIndex(); } - for (BigInteger i = BigInteger.ZERO; i.compareTo(numberOfIterations) < 0; i = i.add(BigInteger.ONE)){ - int subsetSize = allSubsets.get(i.intValue()).size(); + for (int i = 0; i < numberOfIterations; i ++){ + int subsetSize = allSubsets.get(i).size(); IBond[] bondsToRemove = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { bondsToRemove[j] = atomContainer.getBond(splittableBondIndices[j]); @@ -277,17 +278,13 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { return splitableBonds.toArray(new IBond[0]); } - private static List> generateSubsets(int[] nums) { + private static List> generateSubsets(int[] nums) throws ArithmeticException { int n = nums.length; - // Just in case n > 32, we make an integer of arbitrary size. 
- List> result; - BigInteger numOfSubsets = BigInteger.ONE.shiftLeft(n); - if (n > 32) { - result = new ArrayList<>(Integer.MAX_VALUE); - } - else { - result = new ArrayList<>(numOfSubsets.intValue()); + if (n > 31) { + throw new ArithmeticException("You attempted to make more subsets than an primitive integer can handle"); } + int numOfSubsets = 1 << n; + List> result = new ArrayList<>(numOfSubsets); // we can collect all subsets if we iterate from one (to disregard the empty set) to the number // of possible subsets and check for each number which bits are set to one and replace this @@ -304,10 +301,10 @@ private static List> generateSubsets(int[] nums) { // 0b011 (3) // -> [1, 2] // ... - for (BigInteger i = BigInteger.ONE; i.compareTo(numOfSubsets) < 0; i = i.add(BigInteger.ONE)) { + for (int i = 1; i < numOfSubsets; i++) { List subset = new ArrayList<>(); - for (int j = 0; j < n; j++) { - if (i.testBit(j)) { + for (int j = 0; j < i; j++) { + if (((i >> j) & 1) == 1) { subset.add(nums[j]); } } From a3f1d35162aee126b875b7c77fd3d775fbf49758 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 28 Nov 2024 19:13:03 +0100 Subject: [PATCH 06/42] work in progress: fixed indexing and fix of check for size constraint --- .../org/openscience/cdk/fragment/ExhaustiveFragmenter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index bf32f2cb384..ac2fe1fb208 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -280,7 +280,7 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { private static List> generateSubsets(int[] nums) throws ArithmeticException { int n = nums.length; - if (n > 31) { + if (n > 30) { throw new ArithmeticException("You 
attempted to make more subsets than an primitive integer can handle"); } int numOfSubsets = 1 << n; @@ -303,7 +303,7 @@ private static List> generateSubsets(int[] nums) throws Arithmetic // ... for (int i = 1; i < numOfSubsets; i++) { List subset = new ArrayList<>(); - for (int j = 0; j < i; j++) { + for (int j = 0; j < n; j++) { if (((i >> j) & 1) == 1) { subset.add(nums[j]); } From 49cfd6a650f7d141d8e0bf3218e679f0bba71d41 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 28 Nov 2024 23:16:16 +0100 Subject: [PATCH 07/42] work in progress: switch to primitive int matrices for subset generation and added more comprehensive comments for subset generation --- .../cdk/fragment/ExhaustiveFragmenter.java | 68 +++++++++++++------ 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index ac2fe1fb208..90afcec113f 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -33,7 +33,6 @@ import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; -import java.math.BigInteger; import java.util.*; import java.util.stream.IntStream; @@ -190,14 +189,14 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { int numberOfIterations = (1 << splittableBondsLength) - 1; - List> allSubsets = generateSubsets(IntStream.rangeClosed(0, splittableBondsLength).toArray()); + int[][] allSubsets = generateSubsets(IntStream.rangeClosed(0, splittableBondsLength).toArray()); int[] splittableBondIndices = new int[splittableBondsLength]; for (int i = 0; i < splittableBondsLength; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); } for (int i = 0; i < numberOfIterations; i ++){ - int subsetSize 
= allSubsets.get(i).size(); + int subsetSize = allSubsets[i].length; IBond[] bondsToRemove = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { bondsToRemove[j] = atomContainer.getBond(splittableBondIndices[j]); @@ -278,37 +277,64 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { return splitableBonds.toArray(new IBond[0]); } - private static List> generateSubsets(int[] nums) throws ArithmeticException { + /** + * Generates all possible subsets (of all possible sample sizes, ranging from 1 to the length of nums) + * of the numbers given in nums, ignoring the order, so [1,2] and [2,1] are regarded as equal and only + * one of them is returned. + * The number of possible subsets is (2^n) - 1 with n = length of nums. + * Example output for nums = [1,2,3] (2^3 - 1 = 7): + * [1] + * [2] + * [3] + * [1,2] + * [1,3] + * [2,3] + * [1,2,3] + * The empty set [] is not part of the output. + * The returned subsets will be ordered differently because they are generated based on bit shifts internally. + * + * @param nums set of integers from which to generate all possible subsets, sets + * containing the same number multiple times do not lead to an exception but maybe do not make much sense. + * @return all possible subsets. + * @throws ArithmeticException if the number of elements in the nums array is greater than 30. Because it is not + * possible to create indexed data structures with more than 2^31 - 1 values. 
+ * @author Tom Weiß + */ + private static int[][] generateSubsets(int[] nums) throws ArithmeticException { + // calculate nr of different subsets (2^n including the empty set) by shifting the 0th bit of an + // integer with value 1 n positions to the left + // for cases where n > 32 an exception is thrown int n = nums.length; if (n > 30) { throw new ArithmeticException("You attempted to make more subsets than an primitive integer can handle"); } int numOfSubsets = 1 << n; - List> result = new ArrayList<>(numOfSubsets); - // we can collect all subsets if we iterate from one (to disregard the empty set) to the number - // of possible subsets and check for each number which bits are set to one and replace this - // index by the respective number at the same index from the given nums list. + // collect all subsets by iterating from one (to disregard the empty set) to the number + // of possible subsets and check for each number which bits are on and replace this + // index by the respective number at the same index from the given nums int array // Example: // nums = [1, 2, 3] - // first iteration: - // 0b001 (1) - // -> [1] - // second iteration: - // 0b010 (2) - // -> [2] - // third iteration: - // 0b011 (3) - // -> [1, 2] - // ... 
+ // i bit value subset + // 1 0b001 [1] + // 2 0b010 [2] + // 3 0b011 [1,2] + // 4 0b100 [3] + // 5 0b101 [1,3] + // 6 0b110 [2,3] + // 7 0b111 [1,2,3] + int[][] result = new int[numOfSubsets - 1][]; for (int i = 1; i < numOfSubsets; i++) { - List subset = new ArrayList<>(); + int[] subset = new int[Integer.bitCount(i)]; + // keep track of the next index to add a number + int resultIndex = 0; for (int j = 0; j < n; j++) { if (((i >> j) & 1) == 1) { - subset.add(nums[j]); + subset[resultIndex] = nums[j]; + resultIndex++; } } - result.add(subset); + result[i - 1] = subset; } return result; } From dc328c8dbd21fceaddbea353fcd0a67620adbace Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 28 Nov 2024 23:40:54 +0100 Subject: [PATCH 08/42] work in progress: fix size limitation of the subset generation --- .../java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 90afcec113f..b45e33f8320 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -305,7 +305,7 @@ private static int[][] generateSubsets(int[] nums) throws ArithmeticException { // integer with value 1 n positions to the left // for cases where n > 32 an exception is thrown int n = nums.length; - if (n > 30) { + if (n > 31) { throw new ArithmeticException("You attempted to make more subsets than an primitive integer can handle"); } int numOfSubsets = 1 << n; From 1677622a3e6e6d6ab6e3a801863eaac456e06adc Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Tue, 10 Dec 2024 23:22:49 +0100 Subject: [PATCH 09/42] work in progress: figuring out why the end atoms are in a different container --- .../cdk/fragment/ExhaustiveFragmenter.java | 70 
+++++++++++++------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index b45e33f8320..fd8dccff1cb 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -349,49 +349,75 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) } private static IAtomContainer[] splitMoleculeWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { - boolean[] alreadyVisited = new boolean[mol.getAtomCount()]; + boolean[] alreadyVisitedtoms = new boolean[mol.getAtomCount()]; // set all values of already visited to false - Arrays.fill(alreadyVisited, false); - int numberOfFragments = bondsToSplit.length + 1; - IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; + Arrays.fill(alreadyVisitedtoms, false); + // map to keep track of the original atoms and the copies thereof + Map origToCpyMap = new HashMap<>(mol.getAtomCount()); + Map atomsToSeperate = new HashMap<>(bondsToSplit.length); for (IBond bond : bondsToSplit) { - mol.removeBond(bond); + atomsToSeperate.put(bond.getBegin(), bond.getEnd()); } + int numberOfFragments = bondsToSplit.length + 1; + IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; for (int i = 0; i < numberOfFragments; i++) { // new container to hold a fragment IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); // a stack to make a DFS through the subgraph - IAtom firstAtom; Stack atomStack = new Stack<>(); if (i == 0) { atomStack.add(bondsToSplit[0].getBegin()); - firstAtom = copyAtom(atomStack.peek(), fragmentContainer); + IAtom firstAtom = atomStack.peek(); + IAtom firstAtomCpy = copyAtom(atomStack.peek(), fragmentContainer); + origToCpyMap.put(firstAtom, 
firstAtomCpy); for (IAtom nbor : firstAtom.neighbors()) { - IAtom cpyNbor = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(firstAtom, cpyNbor, mol.getBond(atomStack.peek(), nbor).getOrder()); - atomStack.add(nbor); + if (nbor != atomsToSeperate.get(firstAtom)) { + IAtom cpyNbor = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(firstAtomCpy, cpyNbor, mol.getBond(firstAtom, nbor).getOrder()); + atomStack.add(nbor); + origToCpyMap.put(nbor, cpyNbor); + } } } else { atomStack.add(bondsToSplit[i - 1].getEnd()); - firstAtom = copyAtom(atomStack.peek(), fragmentContainer); + IAtom firstAtom = atomStack.peek(); + IAtom firstAtomCpy = copyAtom(firstAtom, fragmentContainer); + origToCpyMap.put(firstAtom, firstAtomCpy); for (IAtom nbor : firstAtom.neighbors()) { - IAtom cpyNbor = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(firstAtom, cpyNbor, mol.getBond(atomStack.peek(), nbor).getOrder()); - atomStack.add(nbor); + if (nbor != atomsToSeperate.get(firstAtom)) { + IAtom cpyNbor = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(firstAtomCpy, cpyNbor, mol.getBond(firstAtom, nbor).getOrder()); + atomStack.add(nbor); + origToCpyMap.put(nbor, cpyNbor); + } } } while (!atomStack.isEmpty()) { IAtom lastAtom = atomStack.pop(); IAtom cpyAtom = copyAtom(lastAtom, fragmentContainer); - alreadyVisited[lastAtom.getIndex()] = true; - //FIXME: Add cycle connections together !!!! 
- for (IAtom neighbor: lastAtom.neighbors()) { - if (alreadyVisited[neighbor.getIndex()] == false) { - alreadyVisited[neighbor.getIndex()] = true; - IAtom cpyNeighbor = copyAtom(neighbor, fragmentContainer); - fragmentContainer.newBond(cpyAtom, cpyNeighbor, mol.getBond(lastAtom, neighbor).getOrder()); - atomStack.add(neighbor); + + alreadyVisitedtoms[lastAtom.getIndex()] = true; + for (IAtom nbor: lastAtom.neighbors()) { + if (!alreadyVisitedtoms[nbor.getIndex()]) { + if (nbor == atomsToSeperate.get(lastAtom)) { + continue; + } + alreadyVisitedtoms[nbor.getIndex()] = true; + IAtom cpyNbor = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(cpyAtom, cpyNbor, mol.getBond(lastAtom, nbor).getOrder()); + atomStack.add(nbor); + origToCpyMap.put(nbor, cpyNbor); + } else { + IBond cycleConeectionBond = mol.getBond(lastAtom, nbor); +// System.out.println(cpyAtom.getContainer().toString()); +// System.out.println(cycleConeectionBond.getContainer().toString()); +// System.out.println(cpyAtom.getContainer().equals(origToCpyMap.get(nbor).getContainer())); + fragmentContainer.newBond( + cpyAtom, + origToCpyMap.get(nbor), + cycleConeectionBond.getOrder() + ); } } } From 40dfd896a4e1224841debaf7c6f1af3e7d44658d Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Fri, 13 Dec 2024 09:45:04 +0100 Subject: [PATCH 10/42] work in progress: fixing for end atoms not in container --- .../cdk/fragment/ExhaustiveFragmenter.java | 87 ++++++++----------- 1 file changed, 38 insertions(+), 49 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index fd8dccff1cb..665534a2793 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -202,7 +202,7 @@ private void runSaturated(IAtomContainer atomContainer) throws 
CDKException { bondsToRemove[j] = atomContainer.getBond(splittableBondIndices[j]); } // List parts = FragmentUtils.splitMolecule(molToSplit, bondToSplit); - IAtomContainer[] parts = splitMoleculeWithCopy(atomContainer, bondsToRemove); + IAtomContainer[] parts = splitMoleculeBondsWithCopy(atomContainer, bondsToRemove); for (IAtomContainer partContainer : parts) { AtomContainerManipulator.clearAtomConfigurations(partContainer); for (IAtom atom : partContainer.atoms()) { @@ -348,75 +348,64 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) return cpyAtom; } - private static IAtomContainer[] splitMoleculeWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { - boolean[] alreadyVisitedtoms = new boolean[mol.getAtomCount()]; + private static IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { + boolean[] alreadyVisitedAtoms = new boolean[mol.getAtomCount()]; // set all values of already visited to false - Arrays.fill(alreadyVisitedtoms, false); + Arrays.fill(alreadyVisitedAtoms, false); // map to keep track of the original atoms and the copies thereof Map origToCpyMap = new HashMap<>(mol.getAtomCount()); - Map atomsToSeperate = new HashMap<>(bondsToSplit.length); - for (IBond bond : bondsToSplit) { - atomsToSeperate.put(bond.getBegin(), bond.getEnd()); - } int numberOfFragments = bondsToSplit.length + 1; IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; + Set> pairIdxToSplit = new HashSet<>(bondsToSplit.length); + for (IBond bond : bondsToSplit) { + List pair = new ArrayList<>(2); + pair.add(bond.getBegin().getIndex()); + pair.add(bond.getBegin().getIndex()); + pairIdxToSplit.add(pair); + } for (int i = 0; i < numberOfFragments; i++) { // new container to hold a fragment IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); - // a stack to make a DFS through the subgraph Stack atomStack = new Stack<>(); if (i == 0) { 
atomStack.add(bondsToSplit[0].getBegin()); - IAtom firstAtom = atomStack.peek(); - IAtom firstAtomCpy = copyAtom(atomStack.peek(), fragmentContainer); - origToCpyMap.put(firstAtom, firstAtomCpy); - for (IAtom nbor : firstAtom.neighbors()) { - if (nbor != atomsToSeperate.get(firstAtom)) { - IAtom cpyNbor = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(firstAtomCpy, cpyNbor, mol.getBond(firstAtom, nbor).getOrder()); - atomStack.add(nbor); - origToCpyMap.put(nbor, cpyNbor); - } - } + IAtom atom = atomStack.peek(); + IAtom atomCpy = copyAtom(atomStack.peek(), fragmentContainer); + alreadyVisitedAtoms[atom.getIndex()] = true; + origToCpyMap.put(atom, atomCpy); } else { atomStack.add(bondsToSplit[i - 1].getEnd()); - IAtom firstAtom = atomStack.peek(); - IAtom firstAtomCpy = copyAtom(firstAtom, fragmentContainer); - origToCpyMap.put(firstAtom, firstAtomCpy); - for (IAtom nbor : firstAtom.neighbors()) { - if (nbor != atomsToSeperate.get(firstAtom)) { - IAtom cpyNbor = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(firstAtomCpy, cpyNbor, mol.getBond(firstAtom, nbor).getOrder()); - atomStack.add(nbor); - origToCpyMap.put(nbor, cpyNbor); - } - } + IAtom atom = atomStack.peek(); + IAtom atomCpy = copyAtom(atom, fragmentContainer); + alreadyVisitedAtoms[atom.getIndex()] = true; + origToCpyMap.put(atom, atomCpy); } while (!atomStack.isEmpty()) { - IAtom lastAtom = atomStack.pop(); - IAtom cpyAtom = copyAtom(lastAtom, fragmentContainer); - - alreadyVisitedtoms[lastAtom.getIndex()] = true; - for (IAtom nbor: lastAtom.neighbors()) { - if (!alreadyVisitedtoms[nbor.getIndex()]) { - if (nbor == atomsToSeperate.get(lastAtom)) { - continue; + IAtom atom = atomStack.pop(); + IAtom cpyAtom = copyAtom(atom, fragmentContainer); + alreadyVisitedAtoms[atom.getIndex()] = true; + for (IAtom nbor: atom.neighbors()) { + if (!alreadyVisitedAtoms[nbor.getIndex()]) { + List pair = new ArrayList<>(2); + pair.add(atom.getIndex()); + pair.add(nbor.getIndex()); + if 
(!pairIdxToSplit.contains(pair)) { + IAtom cpyNbor = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(cpyAtom, cpyNbor, mol.getBond(atom, nbor).getOrder()); + atomStack.add(nbor); + origToCpyMap.put(nbor, cpyNbor); + alreadyVisitedAtoms[nbor.getIndex()] = true; } - alreadyVisitedtoms[nbor.getIndex()] = true; - IAtom cpyNbor = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(cpyAtom, cpyNbor, mol.getBond(lastAtom, nbor).getOrder()); - atomStack.add(nbor); - origToCpyMap.put(nbor, cpyNbor); } else { - IBond cycleConeectionBond = mol.getBond(lastAtom, nbor); -// System.out.println(cpyAtom.getContainer().toString()); -// System.out.println(cycleConeectionBond.getContainer().toString()); -// System.out.println(cpyAtom.getContainer().equals(origToCpyMap.get(nbor).getContainer())); + IAtom nborCpy = origToCpyMap.get(nbor); + if (nborCpy == null || cpyAtom.getContainer() != nborCpy.getContainer()) { + continue; + } fragmentContainer.newBond( cpyAtom, origToCpyMap.get(nbor), - cycleConeectionBond.getOrder() + mol.getBond(atom, nbor).getOrder() ); } } From c01722e589cd9e33b1e40f99bc69b00137b4fbc3 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Mon, 3 Feb 2025 05:07:01 +0100 Subject: [PATCH 11/42] work in progress: working reimplementation of the exhaustive fragmenter --- .../cdk/fragment/ExhaustiveFragmenter.java | 293 ++++++++++-------- 1 file changed, 159 insertions(+), 134 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 665534a2793..0f36e0d2b8f 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -34,7 +34,6 @@ import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import java.util.*; -import java.util.stream.IntStream; /** * Generate fragments 
exhaustively. @@ -90,47 +89,41 @@ public enum Saturation { .createLoggingTool(ExhaustiveFragmenter.class); /** - * Instantiate fragmenter with default minimum fragment size. + * Instantiate fragmenter with default minimum fragment size and unsaturated fragments. */ public ExhaustiveFragmenter() { this(DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); } /** - * Instantiate fragmenter with user specified minimum fragment size and default saturation (saturated fragments). + * Instantiate fragmenter with user specified minimum fragment size. * * @param minFragSize the minimum fragment size desired. + * @param saturationSetting setting to specify if the returned fragments should be saturated or not. */ - public ExhaustiveFragmenter(int minFragSize) { + public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { this.minFragSize = minFragSize; - this.saturationSetting = DEFAULT_SATURATION; + this.saturationSetting = saturationSetting; fragMap = new HashMap<>(); - smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + smilesGenerator = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); } /** - * Instantiate fragmenter with default minimum fragment size and user specified saturation setting. + * Instantiate fragmenter with user specified minimum fragment size and default saturation (saturated fragments). * - * @param saturationSetting setting to specify if the returned fragments should be saturated or not. + * @param minFragSize the minimum fragment size desired. */ - public ExhaustiveFragmenter(Saturation saturationSetting) { - this.minFragSize = DEFAULT_MIN_FRAG_SIZE; - this.saturationSetting = saturationSetting; - fragMap = new HashMap<>(); - smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + public ExhaustiveFragmenter(int minFragSize) { + this(minFragSize, DEFAULT_SATURATION); } /** - * Instantiate fragmenter with user specified minimum fragment size. 
+ * Instantiate fragmenter with default minimum fragment size and user specified saturation setting. * - * @param minFragSize the minimum fragment size desired. * @param saturationSetting setting to specify if the returned fragments should be saturated or not. */ - public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { - this.minFragSize = minFragSize; - this.saturationSetting = saturationSetting; - fragMap = new HashMap<>(); - smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + public ExhaustiveFragmenter(Saturation saturationSetting) { + this(DEFAULT_MIN_FRAG_SIZE, saturationSetting); } /** @@ -189,19 +182,17 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { int numberOfIterations = (1 << splittableBondsLength) - 1; - int[][] allSubsets = generateSubsets(IntStream.rangeClosed(0, splittableBondsLength).toArray()); int[] splittableBondIndices = new int[splittableBondsLength]; for (int i = 0; i < splittableBondsLength; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); } - - for (int i = 0; i < numberOfIterations; i ++){ - int subsetSize = allSubsets[i].length; + for (int i = 1; i <= numberOfIterations; i ++){ + int[] subset = generateSubset(i, splittableBondIndices); + int subsetSize = subset.length; IBond[] bondsToRemove = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { - bondsToRemove[j] = atomContainer.getBond(splittableBondIndices[j]); + bondsToRemove[j] = atomContainer.getBond(subset[j]); } -// List parts = FragmentUtils.splitMolecule(molToSplit, bondToSplit); IAtomContainer[] parts = splitMoleculeBondsWithCopy(atomContainer, bondsToRemove); for (IAtomContainer partContainer : parts) { AtomContainerManipulator.clearAtomConfigurations(partContainer); @@ -216,9 +207,6 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { fragMap.put(tmpSmiles, 
partContainer); } - if (numberOfAtoms < minFragSize) { - break; - } } } @@ -227,25 +215,71 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { private void runUnsaturated(IAtomContainer atomContainer) throws CDKException { if (atomContainer.getBondCount() < 3) return; - IBond[] splitableBonds = getSplitableBonds(atomContainer); - if (splitableBonds.length == 0) return; - logger.debug("Got " + splitableBonds.length + " splittable bonds"); - - String tmpSmiles; - for (IBond bond : splitableBonds) { - List parts = FragmentUtils.splitMolecule(atomContainer, bond); - // make sure we don't add the same fragment twice + IBond[] splittableBonds = getSplitableBonds(atomContainer); + int splittableBondsLength = splittableBonds.length; + if (splittableBondsLength == 0) return; + logger.debug("Got " + splittableBondsLength + " splittable bonds"); + + // If we want to check all unique combinations of splittings we calculate the power set of the splittable bonds. + // which is 2^n and without considering the empty set we can say it is 2^n - 1. 
+ // example: + // if we have a set of splittable bonds here represented as numbers {1, 2, 3}, we can describe all unique + // subsets as follows: + // {1} + // {2} + // {3} + // {1,2} + // {1,3} + // {2,3} + // {1,2,3} + int numberOfIterations = (1 << splittableBondsLength) - 1; + + + int[] splittableBondIndices = new int[splittableBondsLength]; + for (int i = 0; i < splittableBondsLength; i++) { + splittableBondIndices[i] = splittableBonds[i].getIndex(); + } + for (int i = 1; i <= numberOfIterations; i ++){ + int[] subset = generateSubset(i, splittableBondIndices); + int subsetSize = subset.length; + IBond[] bondsToRemove = new IBond[subsetSize]; + for (int j = 0; j < subsetSize; j++) { + bondsToRemove[j] = atomContainer.getBond(subset[j]); + } + IAtomContainer[] parts = splitMoleculeBondsWithCopy(atomContainer, bondsToRemove); for (IAtomContainer partContainer : parts) { - tmpSmiles = smilesGenerator.create(partContainer); - int fragmentSize = partContainer.getAtomCount(); - if (fragmentSize >= minFragSize && !fragMap.containsKey(tmpSmiles)) { + AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); + Aromaticity.cdkLegacy().apply(partContainer); + String tmpSmiles = smilesGenerator.create(partContainer); + int numberOfAtoms = partContainer.getAtomCount(); + if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { + System.out.println(tmpSmiles); fragMap.put(tmpSmiles, partContainer); - if (fragmentSize > minFragSize) { - runUnsaturated(partContainer); - } } } + } +// if (atomContainer.getBondCount() < 3) return; +// IBond[] splitableBonds = getSplitableBonds(atomContainer); +// if (splitableBonds.length == 0) return; +// logger.debug("Got " + splitableBonds.length + " splittable bonds"); +// +// String tmpSmiles; +// for (IBond bond : splitableBonds) { +// List parts = FragmentUtils.splitMolecule(atomContainer, bond); +// // make sure we don't add the same fragment twice +// for (IAtomContainer partContainer : parts) { +// 
tmpSmiles = smilesGenerator.create(partContainer); +// int fragmentSize = partContainer.getAtomCount(); +// if (fragmentSize >= minFragSize && !fragMap.containsKey(tmpSmiles)) { +// System.out.println(tmpSmiles); +// fragMap.put(tmpSmiles, partContainer); +// if (fragmentSize > minFragSize) { +// runUnsaturated(partContainer); +// } +// } +// } +// } } private IBond[] getSplitableBonds(IAtomContainer atomContainer) { @@ -278,65 +312,50 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { } /** - * Generates all possible subsets (of all possible sample sizes, ranging from 1 to the length of nums) - * of the numbers given in nums, ignoring the order, so [1,2] and [2,1] are regarded as equal and only - * one of them is returned. - * The number of possible subsets is (2^n) - 1 with n = length of nums. - * Example output for nums = [1,2,3] (2^3 - 1 = 7): - * [1] - * [2] - * [3] - * [1,2] - * [1,3] - * [2,3] - * [1,2,3] + * Generates a subset of the numbers given in `nums`, determined by the binary representation of the provided `index`. + * Each bit in the binary value of the `index` represents whether the corresponding element in the `nums` array is included + * in the subset. The order of the elements does not matter (i.e., [1, 2] and [2, 1] are considered identical), + * and the empty set is excluded. + *

+ * The total number of possible subsets is (2^n) - 1, where n is the length of the `nums` array. This excludes the empty set. + * The subsets are generated based on bit manipulation, and the order of subsets may vary depending on the internal bit shifts. + *

+ * Example output for nums = [1, 2, 3] (2^3 - 1 = 7): + * [1] + * [2] + * [3] + * [1, 2] + * [1, 3] + * [2, 3] + * [1, 2, 3] + *

* The empty set [] is not part of the output. - * The returned subsets will be ordered differently because they are generated based on bit shifts internally. * - * @param nums set of integers from which to generate all possible subsets, sets - * containing the same number multiple times do not lead to an exception but maybe do not make much sense. - * @return all possible subsets. - * @throws ArithmeticException if the number of elements in the nums array is greater than 30. Because it is not - * possible to create indexed data structures with more than 2^31 - 1 values. + * @param index The index, represented as an integer, where each bit corresponds to whether an element in `nums` should + * be included in the subset. A bit value of `1` means the corresponding element is included, and `0` means it is not. + * @param nums An array of integers from which to generate the subset. The presence of duplicate values in `nums` will not + * result in an exception, but may lead to repeated values in the generated subsets. + * @return An array containing the subset corresponding to the binary representation of the provided `index`. 
* @author Tom Weiß */ - private static int[][] generateSubsets(int[] nums) throws ArithmeticException { - // calculate nr of different subsets (2^n including the empty set) by shifting the 0th bit of an - // integer with value 1 n positions to the left - // for cases where n > 32 an exception is thrown - int n = nums.length; - if (n > 31) { - throw new ArithmeticException("You attempted to make more subsets than an primitive integer can handle"); - } - int numOfSubsets = 1 << n; - - // collect all subsets by iterating from one (to disregard the empty set) to the number - // of possible subsets and check for each number which bits are on and replace this - // index by the respective number at the same index from the given nums int array - // Example: - // nums = [1, 2, 3] - // i bit value subset - // 1 0b001 [1] - // 2 0b010 [2] - // 3 0b011 [1,2] - // 4 0b100 [3] - // 5 0b101 [1,3] - // 6 0b110 [2,3] - // 7 0b111 [1,2,3] - int[][] result = new int[numOfSubsets - 1][]; - for (int i = 1; i < numOfSubsets; i++) { - int[] subset = new int[Integer.bitCount(i)]; - // keep track of the next index to add a number - int resultIndex = 0; - for (int j = 0; j < n; j++) { - if (((i >> j) & 1) == 1) { - subset[resultIndex] = nums[j]; - resultIndex++; - } + private static int[] generateSubset(int index, int[] nums) { + + // Create a new array to hold the subset, size based on the number of 1-bits in the index. + int[] subset = new int[Integer.bitCount(index)]; + int subsetIndex = 0; + + // Iterate through each bit in the binary representation of the index. + for (int j = 0; j < 32; j++) { + // Check if the current bit (at position 'j') is set to 1. + if (((index >> j) & 1) == 1) { + // If the bit is set, add the corresponding number from nums to the subset. + subset[subsetIndex] = nums[j]; + subsetIndex++; } - result[i - 1] = subset; } - return result; + + // Return the generated subset. 
+ return subset; } private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) { @@ -348,65 +367,70 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) return cpyAtom; } - private static IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { + private IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) throws CDKException { boolean[] alreadyVisitedAtoms = new boolean[mol.getAtomCount()]; + boolean[] visitedBonds = new boolean[mol.getBondCount()]; // set all values of already visited to false Arrays.fill(alreadyVisitedAtoms, false); + Arrays.fill(visitedBonds, false); // map to keep track of the original atoms and the copies thereof - Map origToCpyMap = new HashMap<>(mol.getAtomCount()); int numberOfFragments = bondsToSplit.length + 1; IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; - Set> pairIdxToSplit = new HashSet<>(bondsToSplit.length); + Map> atomsToSplit = new HashMap<>(bondsToSplit.length * 2); for (IBond bond : bondsToSplit) { - List pair = new ArrayList<>(2); - pair.add(bond.getBegin().getIndex()); - pair.add(bond.getBegin().getIndex()); - pairIdxToSplit.add(pair); + IAtom beg = bond.getBegin(); + IAtom end = bond.getEnd(); + if (atomsToSplit.containsKey(beg)) { + atomsToSplit.get(beg).add(end); + } else { + List endList = new ArrayList<>(); + endList.add(end); + atomsToSplit.put(beg, endList); + } } + Stack startingAtoms = new Stack<>(); + startingAtoms.add(bondsToSplit[0].getBegin()); for (int i = 0; i < numberOfFragments; i++) { + Map origToCpyMap = new HashMap<>(mol.getAtomCount()); // new container to hold a fragment IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); // a stack to make a DFS through the subgraph Stack atomStack = new Stack<>(); - if (i == 0) { - atomStack.add(bondsToSplit[0].getBegin()); - IAtom atom = atomStack.peek(); - IAtom atomCpy = copyAtom(atomStack.peek(), 
fragmentContainer); - alreadyVisitedAtoms[atom.getIndex()] = true; - origToCpyMap.put(atom, atomCpy); - } else { - atomStack.add(bondsToSplit[i - 1].getEnd()); - IAtom atom = atomStack.peek(); - IAtom atomCpy = copyAtom(atom, fragmentContainer); - alreadyVisitedAtoms[atom.getIndex()] = true; - origToCpyMap.put(atom, atomCpy); - } + atomStack.add(startingAtoms.pop()); + IAtom firstAtom = atomStack.peek(); + IAtom atomCpy = copyAtom(firstAtom, fragmentContainer); + origToCpyMap.put(firstAtom, atomCpy); while (!atomStack.isEmpty()) { IAtom atom = atomStack.pop(); - IAtom cpyAtom = copyAtom(atom, fragmentContainer); + atomCpy = origToCpyMap.get(atom); alreadyVisitedAtoms[atom.getIndex()] = true; for (IAtom nbor: atom.neighbors()) { - if (!alreadyVisitedAtoms[nbor.getIndex()]) { - List pair = new ArrayList<>(2); - pair.add(atom.getIndex()); - pair.add(nbor.getIndex()); - if (!pairIdxToSplit.contains(pair)) { - IAtom cpyNbor = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(cpyAtom, cpyNbor, mol.getBond(atom, nbor).getOrder()); + if (visitedBonds[mol.getBond(atom, nbor).getIndex()]) { + continue; + } + if (!atomsToSplit.containsKey(atom) || !atomsToSplit.get(atom).contains(nbor)) { + if (!alreadyVisitedAtoms[nbor.getIndex()]) { + IAtom nborCpy = copyAtom(nbor, fragmentContainer); + fragmentContainer.newBond(atomCpy, nborCpy, mol.getBond(atom, nbor).getOrder()); + visitedBonds[mol.getBond(atom, nbor).getIndex()] = true; atomStack.add(nbor); - origToCpyMap.put(nbor, cpyNbor); + origToCpyMap.put(nbor, nborCpy); alreadyVisitedAtoms[nbor.getIndex()] = true; + } else { + IAtom nborCpy = origToCpyMap.get(nbor); + if (nborCpy == null) { + continue; + } + fragmentContainer.newBond( + atomCpy, + nborCpy, + mol.getBond(atom, nbor).getOrder() + ); + visitedBonds[mol.getBond(atom, nbor).getIndex()] = true; + atomStack.remove(nbor); } } else { - IAtom nborCpy = origToCpyMap.get(nbor); - if (nborCpy == null || cpyAtom.getContainer() != nborCpy.getContainer()) { - 
continue; - } - fragmentContainer.newBond( - cpyAtom, - origToCpyMap.get(nbor), - mol.getBond(atom, nbor).getOrder() - ); + startingAtoms.add(nbor); } } } @@ -415,6 +439,7 @@ private static IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, I return fragments; } + /** * Get the fragments generated as SMILES strings. * From 225a564321886f792b4559dde2ffa1864ebc7fc7 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 5 Feb 2025 17:37:29 +0100 Subject: [PATCH 12/42] work in progress: cleaning up and adding documentation --- .../cdk/fragment/ExhaustiveFragmenter.java | 187 +++++++++++------- .../fragment/ExhaustiveFragmenterTest.java | 3 - 2 files changed, 113 insertions(+), 77 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 0f36e0d2b8f..65c8d96a81d 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -25,7 +25,10 @@ import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.graph.SpanningTree; -import org.openscience.cdk.interfaces.*; +import org.openscience.cdk.interfaces.IAtom; +import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.interfaces.IRingSet; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.tools.CDKHydrogenAdder; @@ -33,7 +36,14 @@ import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import static 
org.openscience.cdk.fragment.FragmentUtils.splitMolecule; /** * Generate fragments exhaustively. @@ -78,15 +88,15 @@ public enum Saturation { UNSATURATED_FRAGMENTS } - private static final int DEFAULT_MIN_FRAG_SIZE = 6; + private static final int DEFAULT_MIN_FRAG_SIZE = 6; private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS; + private static final int maxTreeDepth = 31; final Map fragMap; - final SmilesGenerator smilesGenerator; - int minFragSize; + final SmilesGenerator smilesGenerator; + int minFragSize; Saturation saturationSetting; - private static final ILoggingTool logger = LoggingToolFactory - .createLoggingTool(ExhaustiveFragmenter.class); + private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); /** * Instantiate fragmenter with default minimum fragment size and unsaturated fragments. @@ -153,13 +163,18 @@ public void setSaturationSetting(Saturation saturationSetting) { public void generateFragments(IAtomContainer atomContainer) throws CDKException { fragMap.clear(); if (this.saturationSetting == Saturation.UNSATURATED_FRAGMENTS) { - runUnsaturated(atomContainer); + runUnsaturated(atomContainer, maxTreeDepth); } else { - runSaturated(atomContainer); + runSaturated(atomContainer, maxTreeDepth); } } - private void runSaturated(IAtomContainer atomContainer) throws CDKException { + /** + * Splits the `atomContainer` and adds Hydrogen atoms to the atoms that get splitted. 
+ * @param atomContainer molecule to split + * @throws CDKException + */ + private void runSaturated(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { if (atomContainer.getBondCount() < 3) return; IBond[] splittableBonds = getSplitableBonds(atomContainer); @@ -181,7 +196,6 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { // {1,2,3} int numberOfIterations = (1 << splittableBondsLength) - 1; - int[] splittableBondIndices = new int[splittableBondsLength]; for (int i = 0; i < splittableBondsLength; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); @@ -189,11 +203,14 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { for (int i = 1; i <= numberOfIterations; i ++){ int[] subset = generateSubset(i, splittableBondIndices); int subsetSize = subset.length; - IBond[] bondsToRemove = new IBond[subsetSize]; + if (subsetSize > maxTreeDepth) { + continue; + } + IBond[] bondsToSplit = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { - bondsToRemove[j] = atomContainer.getBond(subset[j]); + bondsToSplit[j] = atomContainer.getBond(subset[j]); } - IAtomContainer[] parts = splitMoleculeBondsWithCopy(atomContainer, bondsToRemove); + IAtomContainer[] parts = splitBondsWithCopy(atomContainer, bondsToSplit); for (IAtomContainer partContainer : parts) { AtomContainerManipulator.clearAtomConfigurations(partContainer); for (IAtom atom : partContainer.atoms()) { @@ -201,6 +218,7 @@ private void runSaturated(IAtomContainer atomContainer) throws CDKException { } AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); + // todo: find the appropriate replacement for the legacy operation Aromaticity.cdkLegacy().apply(partContainer); String tmpSmiles = smilesGenerator.create(partContainer); int numberOfAtoms = partContainer.getAtomCount(); @@ -212,7 +230,7 @@ private void 
runSaturated(IAtomContainer atomContainer) throws CDKException { } } - private void runUnsaturated(IAtomContainer atomContainer) throws CDKException { + private void runUnsaturated(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { if (atomContainer.getBondCount() < 3) return; IBond[] splittableBonds = getSplitableBonds(atomContainer); @@ -234,52 +252,36 @@ private void runUnsaturated(IAtomContainer atomContainer) throws CDKException { // {1,2,3} int numberOfIterations = (1 << splittableBondsLength) - 1; - int[] splittableBondIndices = new int[splittableBondsLength]; for (int i = 0; i < splittableBondsLength; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); } + // we start from one to disregard the empty set from generateSubset for (int i = 1; i <= numberOfIterations; i ++){ int[] subset = generateSubset(i, splittableBondIndices); int subsetSize = subset.length; - IBond[] bondsToRemove = new IBond[subsetSize]; + if (subsetSize > maxTreeDepth) { + continue; + } + IBond[] bondsToSplit = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { - bondsToRemove[j] = atomContainer.getBond(subset[j]); + bondsToSplit[j] = atomContainer.getBond(subset[j]); } - IAtomContainer[] parts = splitMoleculeBondsWithCopy(atomContainer, bondsToRemove); + // TODO: is a copy here really needed what do the other fragmenters use or return ? Otherwise either make a + // function that doesn't copy or find a way to use the existing splitMolecule from FragmentUtils. 
+ IAtomContainer[] parts = splitBondsWithCopy(atomContainer, bondsToSplit); for (IAtomContainer partContainer : parts) { AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); + // todo: find the appropriate replacement for the legacy operation Aromaticity.cdkLegacy().apply(partContainer); String tmpSmiles = smilesGenerator.create(partContainer); int numberOfAtoms = partContainer.getAtomCount(); if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - System.out.println(tmpSmiles); fragMap.put(tmpSmiles, partContainer); } } } -// if (atomContainer.getBondCount() < 3) return; -// IBond[] splitableBonds = getSplitableBonds(atomContainer); -// if (splitableBonds.length == 0) return; -// logger.debug("Got " + splitableBonds.length + " splittable bonds"); -// -// String tmpSmiles; -// for (IBond bond : splitableBonds) { -// List parts = FragmentUtils.splitMolecule(atomContainer, bond); -// // make sure we don't add the same fragment twice -// for (IAtomContainer partContainer : parts) { -// tmpSmiles = smilesGenerator.create(partContainer); -// int fragmentSize = partContainer.getAtomCount(); -// if (fragmentSize >= minFragSize && !fragMap.containsKey(tmpSmiles)) { -// System.out.println(tmpSmiles); -// fragMap.put(tmpSmiles, partContainer); -// if (fragmentSize > minFragSize) { -// runUnsaturated(partContainer); -// } -// } -// } -// } } private IBond[] getSplitableBonds(IAtomContainer atomContainer) { @@ -312,29 +314,39 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { } /** - * Generates a subset of the numbers given in `nums`, determined by the binary representation of the provided `index`. - * Each bit in the binary value of the `index` represents whether the corresponding element in the `nums` array is included - * in the subset. The order of the elements does not matter (i.e., [1, 2] and [2, 1] are considered identical), - * and the empty set is excluded. 
+ * Generates a subset of the numbers given in `nums`, determined by the binary representation of the + * provided `index`.Each bit in the binary value of the `index` represents whether the corresponding element + * in the `nums` array is included in the subset. The order of the elements does not matter + * (i.e., [1, 2] and [2, 1] are considered identical). *

- * The total number of possible subsets is (2^n) - 1, where n is the length of the `nums` array. This excludes the empty set. - * The subsets are generated based on bit manipulation, and the order of subsets may vary depending on the internal bit shifts. + * The total number of possible subsets is (2^n) - 1, where n is the length of the `nums` array. + * The subsets are generated based on bit manipulation, and the order of subsets may vary depending on + * the internal bit shifts. *

* Example output for nums = [1, 2, 3] (2^3 - 1 = 7): - * [1] - * [2] - * [3] - * [1, 2] - * [1, 3] - * [2, 3] - * [1, 2, 3] + * [1] for index = 1 + * [2] for index = 2 + * [1, 2] for index = 3 + * [3] for index = 4 + * [1, 3] for index = 5 + * [2, 3] for index = 6 + * [1, 2, 3] for index = 7 *

- * The empty set [] is not part of the output. + * It works like follows: + * index is here represented as binary and each place where the binary representation has a one results in the + * respective place of the `nums` array being returned. + * index (5): nums: result: + * 1 --> 1 1 + * 0 2 ---> [1, 3] + * 1 --> 3 3 + * * - * @param index The index, represented as an integer, where each bit corresponds to whether an element in `nums` should - * be included in the subset. A bit value of `1` means the corresponding element is included, and `0` means it is not. - * @param nums An array of integers from which to generate the subset. The presence of duplicate values in `nums` will not - * result in an exception, but may lead to repeated values in the generated subsets. + * + * @param index The index, represented as an integer, where each bit corresponds to whether an element in + * `nums` should be included in the subset. A bit value of `1` means the corresponding element + * is included, and `0` means it is not. + * @param nums An array of integers from which to generate the subset. The presence of duplicate values in + * `nums` will not result in an exception, but may lead to repeated values in the generated subsets. * @return An array containing the subset corresponding to the binary representation of the provided `index`. * @author Tom Weiß */ @@ -358,6 +370,12 @@ private static int[] generateSubset(int index, int[] nums) { return subset; } + /** + * Copies an atom into a new atom container. 
+ * @param originalAtom the atom to be copied + * @param atomContainer the destination atom container + * @return the copy of the atom + */ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) { IAtom cpyAtom = atomContainer.newAtom(originalAtom.getAtomicNumber(), originalAtom.getImplicitHydrogenCount()); @@ -367,16 +385,24 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) return cpyAtom; } - private IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) throws CDKException { - boolean[] alreadyVisitedAtoms = new boolean[mol.getAtomCount()]; + /** + * Split a molecule on all specified bonds by making a copy of the fragments. + * @param mol the molecule to split. + * @param bondsToSplit the bonds that should be removed. + * @return the resulting copied fragments. + */ + private IAtomContainer[] splitBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { + // keep track of visited atoms and bonds + boolean[] visitedAtoms = new boolean[mol.getAtomCount()]; boolean[] visitedBonds = new boolean[mol.getBondCount()]; - // set all values of already visited to false - Arrays.fill(alreadyVisitedAtoms, false); + // set all values of already visited atoms and bonds to false + Arrays.fill(visitedAtoms, false); Arrays.fill(visitedBonds, false); - // map to keep track of the original atoms and the copies thereof + // the number of fragments is always the number of splits + 1 + // example: 1 split results in 2 fragments, 2 splits in 3 and so on int numberOfFragments = bondsToSplit.length + 1; IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; - Map> atomsToSplit = new HashMap<>(bondsToSplit.length * 2); + Map> atomsToSplit = new HashMap<>((int) Math.ceil((bondsToSplit.length * 2) / 0.75)); for (IBond bond : bondsToSplit) { IAtom beg = bond.getBegin(); IAtom end = bond.getEnd(); @@ -388,34 +414,48 @@ private IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, 
IBond[] atomsToSplit.put(beg, endList); } } + // stack to keep track of the next starting atom for the next fragment. If a bond to split is + // noticed the atom of the next fragment gets added here Stack startingAtoms = new Stack<>(); - startingAtoms.add(bondsToSplit[0].getBegin()); + // start at an arbitrary starting atom + startingAtoms.add(mol.getAtom(0)); + // for each fragment we iterate through the molecule, except for the bonds that we want to split for (int i = 0; i < numberOfFragments; i++) { - Map origToCpyMap = new HashMap<>(mol.getAtomCount()); - // new container to hold a fragment + // map to keep track of the original atoms and the copies thereof + Map origToCpyMap = new HashMap<>((int) Math.ceil(mol.getAtomCount() / 0.75)); IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); - // a stack to make a DFS through the subgraph + // a stack to make a DFS through the molecule Stack atomStack = new Stack<>(); + + // we start iterating at the most recent starting atom atomStack.add(startingAtoms.pop()); + + // we need to copy the first atom separately IAtom firstAtom = atomStack.peek(); IAtom atomCpy = copyAtom(firstAtom, fragmentContainer); origToCpyMap.put(firstAtom, atomCpy); + while (!atomStack.isEmpty()) { + // get the last atom and its copy IAtom atom = atomStack.pop(); atomCpy = origToCpyMap.get(atom); - alreadyVisitedAtoms[atom.getIndex()] = true; + visitedAtoms[atom.getIndex()] = true; for (IAtom nbor: atom.neighbors()) { + // if the bond to the current neighbour was already added we go to the next neighbour if (visitedBonds[mol.getBond(atom, nbor).getIndex()]) { continue; } + // if the neighbour is not to be split we either copy the neighbour and make a bond from the copy + // of the last atom to its copy of the neighbour or if we already visited the atom we know this is + // a cycle connection, and we make only a bond from the last atom to the already copied neighbour if (!atomsToSplit.containsKey(atom) || 
!atomsToSplit.get(atom).contains(nbor)) { - if (!alreadyVisitedAtoms[nbor.getIndex()]) { + if (!visitedAtoms[nbor.getIndex()]) { IAtom nborCpy = copyAtom(nbor, fragmentContainer); fragmentContainer.newBond(atomCpy, nborCpy, mol.getBond(atom, nbor).getOrder()); visitedBonds[mol.getBond(atom, nbor).getIndex()] = true; atomStack.add(nbor); origToCpyMap.put(nbor, nborCpy); - alreadyVisitedAtoms[nbor.getIndex()] = true; + visitedAtoms[nbor.getIndex()] = true; } else { IAtom nborCpy = origToCpyMap.get(nbor); if (nborCpy == null) { @@ -427,7 +467,6 @@ private IAtomContainer[] splitMoleculeBondsWithCopy(IAtomContainer mol, IBond[] mol.getBond(atom, nbor).getOrder() ); visitedBonds[mol.getBond(atom, nbor).getIndex()] = true; - atomStack.remove(nbor); } } else { startingAtoms.add(nbor); @@ -451,7 +490,7 @@ public String[] getFragments() { } /** - * Get the fragments generated as {@link IAtomContainer} objects.. + * Get the fragments generated as {@link IAtomContainer} objects. * * @return a IAtomContainer[] of the fragments. 
*/ diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index a784288daf8..5e47afa0a0e 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -194,9 +194,6 @@ void testEF7Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); List frags = Arrays.asList(fragmenterSaturated.getFragments()); Assertions.assertNotNull(frags); - for (String f : frags) { - System.out.println(f); - } Assertions.assertEquals(25, frags.size()); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); From 259eb803744fecf418d5f666e02a5b703aa34681 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 5 Feb 2025 20:30:17 +0100 Subject: [PATCH 13/42] work in progress: cleaning up imports --- .../java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 65c8d96a81d..8d965876356 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -43,8 +43,6 @@ import java.util.Map; import java.util.Stack; -import static org.openscience.cdk.fragment.FragmentUtils.splitMolecule; - /** * Generate fragments exhaustively. *

From 6ed08bdf0340f50ace7f44af8bb2d15c555c64d4 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 5 Feb 2025 20:40:11 +0100 Subject: [PATCH 14/42] work in progress: adding a setter for the maximum tree depth --- .../cdk/fragment/ExhaustiveFragmenter.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 8d965876356..eebe0e47ea2 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -88,8 +88,8 @@ public enum Saturation { private static final int DEFAULT_MIN_FRAG_SIZE = 6; private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS; - private static final int maxTreeDepth = 31; + int maxTreeDepth = 31; final Map fragMap; final SmilesGenerator smilesGenerator; int minFragSize; @@ -152,6 +152,15 @@ public void setSaturationSetting(Saturation saturationSetting) { this.saturationSetting = saturationSetting; } + /** + * Set the maximum depth of the fragmentation tree. It has to be in the range of 0 < maxTreeDepth < 32 + * + * @param maxTreeDepth + */ + public void setMaxTreeDepth(int maxTreeDepth) { + this.maxTreeDepth = maxTreeDepth; + } + /** * Generate fragments for the input molecule. 
* From 49536ef46b130f52316b2ffcc6886fb2f5ca759a Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 5 Feb 2025 21:38:40 +0100 Subject: [PATCH 15/42] work in progress: refining comments --- .../cdk/fragment/ExhaustiveFragmenter.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index eebe0e47ea2..3a092bda865 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -46,17 +46,19 @@ /** * Generate fragments exhaustively. *

- * This fragmentation scheme simply breaks single non-ring bonds. By default, - * fragments smaller than 6 atoms (without implicit hydrogen) in size are not + * This fragmentation scheme simply breaks single non-ring, non-terminal bonds, non-terminal meaning that no bond to + * a single heavy atom is split. + * By default, fragments smaller than 6 atoms (without implicit hydrogen) in size are not * considered and the returned fragments are not saturated, but this can be changed by the user. * Side chains are retained. * *

Example Usage

* *
{@code
- * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); // per default this returns unsaturated fragments with a minimum size of 6
+ * // per default this fragmenter returns unsaturated fragments with a minimum size of 6
+ * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter();
  * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
- * IAtomContainer mol = smiParser.parseSmiles(c1ccccc1CC(N)C(=O)O);
+ * IAtomContainer mol = smiParser.parseSmiles(c1ccccc1C);
  * fragmenter.generateFragments(mol);
  * // if you want the SMILES representation of the fragments
  * String[] smilesFragments = fragmenter.getFragments();
@@ -153,7 +155,7 @@ public void setSaturationSetting(Saturation saturationSetting) {
     }
 
     /**
-     * Set the maximum depth of the fragmentation tree. It has to be in the range of 0 < maxTreeDepth < 32
+     * Set the maximum number of simultaneously split bonds. It has to be in the range of 0 < maxTreeDepth < 32.
      *
      * @param maxTreeDepth
      */

From c2fd014421011b1c3ce95b7fd0dcbf6a0c988625 Mon Sep 17 00:00:00 2001
From: ToLeWeiss 
Date: Wed, 5 Feb 2025 22:27:09 +0100
Subject: [PATCH 16/42] work in progress: further improvements to comments and
 more private fields

---
 .../cdk/fragment/ExhaustiveFragmenter.java    | 371 ++++++++++--------
 1 file changed, 205 insertions(+), 166 deletions(-)

diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
index 3a092bda865..e02d1f134ae 100644
--- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
+++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
@@ -44,72 +44,70 @@
 import java.util.Stack;
 
 /**
- * Generate fragments exhaustively.
+ * Performs exhaustive fragmentation of molecules by breaking single non-ring, non-terminal bonds.
  * 

- * This fragmentation scheme simply breaks single non-ring, non-terminal bonds, non-terminal meaning that no bond to - * a single heavy atom is split. - * By default, fragments smaller than 6 atoms (without implicit hydrogen) in size are not - * considered and the returned fragments are not saturated, but this can be changed by the user. - * Side chains are retained. - * - *

Example Usage

- * + * This fragmentation method avoids splitting bonds connected to single heavy atoms (non-terminal bonds). + * By default: + * - Fragments smaller than 6 atoms (excluding implicit hydrogen) are ignored. + * - Fragments are returned unsaturated. + * However, users can modify these settings. + *

+ * Example Usage: *

{@code
- * // per default this fragmenter returns unsaturated fragments with a minimum size of 6
+ * // By default, returns unsaturated fragments with a minimum size of 6 atoms
  * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter();
  * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
- * IAtomContainer mol = smiParser.parseSmiles(c1ccccc1C);
+ * IAtomContainer mol = smiParser.parseSmiles("c1ccccc1C");  // Benzyl molecule
  * fragmenter.generateFragments(mol);
- * // if you want the SMILES representation of the fragments
+ *
+ * // Retrieve SMILES representations of fragments
  * String[] smilesFragments = fragmenter.getFragments();
- * // if you want the Atom containers
+ *
+ * // Retrieve AtomContainer representations of fragments
  * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers();
  * }
* * @author Rajarshi Guha * @cdk.module fragment - * @cdk.githash * @cdk.keyword fragment */ public class ExhaustiveFragmenter implements IFragmenter { /** - * Defines the saturation of the returned fragments. + * Specifies whether generated fragments should be saturated (hydrogens added) or unsaturated. */ public enum Saturation { - /** - * Fragments will get returned saturated. - */ + // Fragments will be returned in their saturated form (implicit hydrogen atoms added). SATURATED_FRAGMENTS, - /** - * Fragments will get returned unsaturated. - */ + // Fragments will be returned in their unsaturated form (no additional hydrogen atoms). UNSATURATED_FRAGMENTS } private static final int DEFAULT_MIN_FRAG_SIZE = 6; private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS; - int maxTreeDepth = 31; - final Map fragMap; - final SmilesGenerator smilesGenerator; - int minFragSize; - Saturation saturationSetting; + private final Map fragMap; + private final SmilesGenerator smilesGenerator; + private int maxTreeDepth = 31; + private int minFragSize; + private Saturation saturationSetting; private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); /** - * Instantiate fragmenter with default minimum fragment size and unsaturated fragments. + * Constructs an ExhaustiveFragmenter with the default settings: + * - Minimum fragment size: 6 atoms + * - Unsaturated fragments */ public ExhaustiveFragmenter() { this(DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); } /** - * Instantiate fragmenter with user specified minimum fragment size. + * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size and saturation setting. * - * @param minFragSize the minimum fragment size desired. - * @param saturationSetting setting to specify if the returned fragments should be saturated or not. + * @param minFragSize Minimum number of atoms in a valid fragment. 
+ * @param saturationSetting Determines whether fragments should be saturated or unsaturated. */ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { this.minFragSize = minFragSize; @@ -119,54 +117,66 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { } /** - * Instantiate fragmenter with user specified minimum fragment size and default saturation (saturated fragments). + * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size. + * Saturation defaults to unsaturated fragments. * - * @param minFragSize the minimum fragment size desired. + * @param minFragSize Minimum number of atoms in a valid fragment. */ public ExhaustiveFragmenter(int minFragSize) { this(minFragSize, DEFAULT_SATURATION); } /** - * Instantiate fragmenter with default minimum fragment size and user specified saturation setting. + * Constructs an ExhaustiveFragmenter with a user-defined saturation setting. + * The minimum fragment size defaults to 6 atoms. * - * @param saturationSetting setting to specify if the returned fragments should be saturated or not. + * @param saturationSetting Determines whether fragments should be saturated or unsaturated. */ public ExhaustiveFragmenter(Saturation saturationSetting) { this(DEFAULT_MIN_FRAG_SIZE, saturationSetting); } /** - * Set the minimum fragment size. + * Sets the minimum allowed fragment size. * - * @param minFragSize the smallest size fragment that will be returned + * @param minFragSize Minimum number of atoms in a valid fragment. */ public void setMinimumFragmentSize(int minFragSize) { this.minFragSize = minFragSize; } /** - * Set the saturation setting of the returned fragments. + * Sets whether fragments should be saturated or unsaturated. * - * @param saturationSetting setting to specify if the returned fragments should be saturated or not. + * @param saturationSetting The saturation mode for generated fragments. 
*/ public void setSaturationSetting(Saturation saturationSetting) { this.saturationSetting = saturationSetting; } /** - * Set the maximum number of simultaneously split bonds. It has to be in the range of 0 < maxTreeDepth < 32. + * Sets the maximum number of bonds that can be simultaneously split. + * Must be within the range `0 < maxTreeDepth < 32`. * - * @param maxTreeDepth + * @param maxTreeDepth Maximum number of bonds that can be split at once. */ public void setMaxTreeDepth(int maxTreeDepth) { this.maxTreeDepth = maxTreeDepth; } /** - * Generate fragments for the input molecule. + * Generates fragments for the given molecule. + *

+ * Based on the saturation setting: + * - **Unsaturated mode**: Fragments are returned without additional hydrogen atoms. + * - **Saturated mode**: Hydrogen atoms are explicitly added to atoms where bonds are broken. + *

+ * The generated fragments are stored internally and can be retrieved via: + * - {@link #getFragments()} (SMILES representation) + * - {@link #getFragmentsAsContainers()} (IAtomContainer representation) * * @param atomContainer The input molecule. + * @throws CDKException If fragmentation encounters an error. */ @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { @@ -179,117 +189,149 @@ public void generateFragments(IAtomContainer atomContainer) throws CDKException } /** - * Splits the `atomContainer` and adds Hydrogen atoms to the atoms that get splitted. - * @param atomContainer molecule to split - * @throws CDKException + * Splits the molecule at all possible combinations of splittable bonds and adds implicit hydrogen atoms + * to atoms that were originally involved in the split. + * + * @param atomContainer The molecule to be split. + * @param maxTreeDepth The maximum number of bond splits allowed per subset. + * @throws CDKException If an error occurs during hydrogen addition or atom type perception. */ private void runSaturated(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { + // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) if (atomContainer.getBondCount() < 3) return; + + // Retrieve bonds that are eligible for splitting IBond[] splittableBonds = getSplitableBonds(atomContainer); int splittableBondsLength = splittableBonds.length; + + // If no splittable bonds are found, return early if (splittableBondsLength == 0) return; logger.debug("Got " + splittableBondsLength + " splittable bonds"); - // If we want to check all unique combinations of splittings we calculate the power set of the splittable bonds. - // which is 2^n and without considering the empty set we can say it is 2^n - 1. 
- // example: - // if we have a set of splittable bonds here represented as numbers {1, 2, 3}, we can describe all unique - // subsets as follows: - // {1} - // {2} - // {3} - // {1,2} - // {1,3} - // {2,3} - // {1,2,3} + // Compute the number of possible bond subsets (excluding the empty set): 2^n - 1 int numberOfIterations = (1 << splittableBondsLength) - 1; + // Store indices of splittable bonds for subset generation int[] splittableBondIndices = new int[splittableBondsLength]; for (int i = 0; i < splittableBondsLength; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); } - for (int i = 1; i <= numberOfIterations; i ++){ + + // Iterate over all non-empty subsets of splittable bonds + for (int i = 1; i <= numberOfIterations; i++) { int[] subset = generateSubset(i, splittableBondIndices); int subsetSize = subset.length; + + // Skip subsets exceeding the allowed depth if (subsetSize > maxTreeDepth) { continue; } + + // Convert subset indices back to bond objects IBond[] bondsToSplit = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { bondsToSplit[j] = atomContainer.getBond(subset[j]); } + + // Split the molecule and retrieve the resulting fragments IAtomContainer[] parts = splitBondsWithCopy(atomContainer, bondsToSplit); + + // Process each fragment for (IAtomContainer partContainer : parts) { AtomContainerManipulator.clearAtomConfigurations(partContainer); + + // Reset implicit hydrogen count before recalculating for (IAtom atom : partContainer.atoms()) { atom.setImplicitHydrogenCount(0); } + + // Configure atom types and add implicit hydrogens AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); - // todo: find the appropriate replacement for the legacy operation + + // Apply aromaticity perception (legacy operation) + // TODO: Investigate for the current method to do this. 
Aromaticity.cdkLegacy().apply(partContainer); + + // Generate a unique SMILES representation of the fragment String tmpSmiles = smilesGenerator.create(partContainer); int numberOfAtoms = partContainer.getAtomCount(); + + // Store the fragment if it meets the size requirement and is unique if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { fragMap.put(tmpSmiles, partContainer); } } - } } + /** + * Splits the molecule at all possible combinations of splittable bonds without adding implicit hydrogens. + * + * @param atomContainer The molecule to be split. + * @param maxTreeDepth The maximum number of bond splits allowed per subset. + * @throws CDKException If an error occurs during atom type perception. + */ private void runUnsaturated(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { + // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) if (atomContainer.getBondCount() < 3) return; + + // Retrieve bonds that are eligible for splitting IBond[] splittableBonds = getSplitableBonds(atomContainer); int splittableBondsLength = splittableBonds.length; + + // If no splittable bonds are found, return early if (splittableBondsLength == 0) return; logger.debug("Got " + splittableBondsLength + " splittable bonds"); - // If we want to check all unique combinations of splittings we calculate the power set of the splittable bonds. - // which is 2^n and without considering the empty set we can say it is 2^n - 1. 
- // example: - // if we have a set of splittable bonds here represented as numbers {1, 2, 3}, we can describe all unique - // subsets as follows: - // {1} - // {2} - // {3} - // {1,2} - // {1,3} - // {2,3} - // {1,2,3} + // Compute the number of possible bond subsets (excluding the empty set): 2^n - 1 int numberOfIterations = (1 << splittableBondsLength) - 1; + // Store indices of splittable bonds for subset generation int[] splittableBondIndices = new int[splittableBondsLength]; for (int i = 0; i < splittableBondsLength; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); } - // we start from one to disregard the empty set from generateSubset - for (int i = 1; i <= numberOfIterations; i ++){ + + // Iterate over all non-empty subsets of splittable bonds + for (int i = 1; i <= numberOfIterations; i++) { int[] subset = generateSubset(i, splittableBondIndices); int subsetSize = subset.length; + + // Skip subsets exceeding the allowed depth if (subsetSize > maxTreeDepth) { continue; } + + // Convert subset indices back to bond objects IBond[] bondsToSplit = new IBond[subsetSize]; for (int j = 0; j < subsetSize; j++) { bondsToSplit[j] = atomContainer.getBond(subset[j]); } - // TODO: is a copy here really needed what do the other fragmenters use or return ? Otherwise either make a - // function that doesn't copy or find a way to use the existing splitMolecule from FragmentUtils. + + // TODO: Investigate whether copying is necessary. Consider using FragmentUtils.splitMolecule instead. IAtomContainer[] parts = splitBondsWithCopy(atomContainer, bondsToSplit); + + // Process each fragment for (IAtomContainer partContainer : parts) { + // Configure atom types (no hydrogen addition in unsaturated mode) AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); - // todo: find the appropriate replacement for the legacy operation + + // Apply aromaticity perception (legacy operation) + // TODO: Investigate for the current method to do this. 
Aromaticity.cdkLegacy().apply(partContainer); + + // Generate a unique SMILES representation of the fragment String tmpSmiles = smilesGenerator.create(partContainer); int numberOfAtoms = partContainer.getAtomCount(); + + // Store the fragment if it meets the size requirement and is unique if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { fragMap.put(tmpSmiles, partContainer); } } - } } @@ -323,165 +365,162 @@ private IBond[] getSplitableBonds(IAtomContainer atomContainer) { } /** - * Generates a subset of the numbers given in `nums`, determined by the binary representation of the - * provided `index`.Each bit in the binary value of the `index` represents whether the corresponding element - * in the `nums` array is included in the subset. The order of the elements does not matter - * (i.e., [1, 2] and [2, 1] are considered identical). - *

- * The total number of possible subsets is (2^n) - 1, where n is the length of the `nums` array. - * The subsets are generated based on bit manipulation, and the order of subsets may vary depending on - * the internal bit shifts. - *

- * Example output for nums = [1, 2, 3] (2^3 - 1 = 7): - * [1] for index = 1 - * [2] for index = 2 - * [1, 2] for index = 3 - * [3] for index = 4 - * [1, 3] for index = 5 - * [2, 3] for index = 6 - * [1, 2, 3] for index = 7 - *

- * It works like follows: - * index is here represented as binary and each place where the binary representation has a one results in the - * respective place of the `nums` array being returned. - * index (5): nums: result: - * 1 --> 1 1 - * 0 2 ---> [1, 3] - * 1 --> 3 3 + * Generates a subset from the given array `nums`, determined by the binary representation of `index`. + * Each bit in `index` indicates whether the corresponding element in `nums` is included in the subset. + * The order of elements does not matter (i.e., `[1, 2]` and `[2, 1]` are equivalent). + * + *

The total number of possible subsets is (2^n) - 1, where `n` is the length of `nums`. + * Subsets are generated using bitwise operations, where each `1` bit in `index` selects + * the corresponding element from `nums`.

* + *

Example output for `nums = [1, 2, 3]`:

+ *
+     *   index = 1  → [1]
+     *   index = 2  → [2]
+     *   index = 3  → [1, 2]
+     *   index = 4  → [3]
+     *   index = 5  → [1, 3]
+     *   index = 6  → [2, 3]
+     *   index = 7  → [1, 2, 3]
+     * 
* + *

Example bitwise selection for `index = 5` (`101` in binary):

+ *
+     * index (binary)   nums    result
+     *      1        →   1   →  [1]
+     *      0        →   2
+     *      1        →   3   →  [1, 3]
+     * 
* - * @param index The index, represented as an integer, where each bit corresponds to whether an element in - * `nums` should be included in the subset. A bit value of `1` means the corresponding element - * is included, and `0` means it is not. - * @param nums An array of integers from which to generate the subset. The presence of duplicate values in - * `nums` will not result in an exception, but may lead to repeated values in the generated subsets. - * @return An array containing the subset corresponding to the binary representation of the provided `index`. - * @author Tom Weiß + * @param index An integer whose binary representation determines the subset elements. + * A `1` bit at position `j` means `nums[j]` is included. + * @param nums The array from which to generate subsets. + * Duplicate values in `nums` may result in duplicate subset entries. + * @return An array containing the subset corresponding to `index`. */ private static int[] generateSubset(int index, int[] nums) { - - // Create a new array to hold the subset, size based on the number of 1-bits in the index. + // Allocate subset array based on the number of 1-bits in index. int[] subset = new int[Integer.bitCount(index)]; int subsetIndex = 0; - // Iterate through each bit in the binary representation of the index. + // Iterate through each bit position (up to 32 bits). for (int j = 0; j < 32; j++) { - // Check if the current bit (at position 'j') is set to 1. + // If the j-th bit in index is set, include nums[j] in the subset. if (((index >> j) & 1) == 1) { - // If the bit is set, add the corresponding number from nums to the subset. - subset[subsetIndex] = nums[j]; - subsetIndex++; + subset[subsetIndex++] = nums[j]; } } - // Return the generated subset. return subset; } /** - * Copies an atom into a new atom container. 
- * @param originalAtom the atom to be copied - * @param atomContainer the destination atom container - * @return the copy of the atom + * Creates a copy of an atom and adds it to the specified atom container. + * + * @param originalAtom The atom to be copied. + * @param atomContainer The destination container where the copied atom will be added. + * @return A new atom with the same properties as `originalAtom`, added to `atomContainer`. */ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) { - IAtom cpyAtom = atomContainer.newAtom(originalAtom.getAtomicNumber(), + IAtom copiedAtom = atomContainer.newAtom(originalAtom.getAtomicNumber(), originalAtom.getImplicitHydrogenCount()); - cpyAtom.setIsAromatic(originalAtom.isAromatic()); - cpyAtom.setValency(originalAtom.getValency()); - cpyAtom.setAtomTypeName(originalAtom.getAtomTypeName()); - return cpyAtom; + copiedAtom.setIsAromatic(originalAtom.isAromatic()); + copiedAtom.setValency(originalAtom.getValency()); + copiedAtom.setAtomTypeName(originalAtom.getAtomTypeName()); + return copiedAtom; } /** - * Split a molecule on all specified bonds by making a copy of the fragments. - * @param mol the molecule to split. - * @param bondsToSplit the bonds that should be removed. - * @return the resulting copied fragments. + * Splits a molecule into multiple fragments by removing the specified bonds and making copies of the resulting fragments. + * + * @param mol The molecule to be split. + * @param bondsToSplit The bonds that should be removed to create separate fragments. + * @return An array of copied molecular fragments resulting from the split. 
*/ private IAtomContainer[] splitBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { - // keep track of visited atoms and bonds + // Track visited atoms and bonds during traversal boolean[] visitedAtoms = new boolean[mol.getAtomCount()]; boolean[] visitedBonds = new boolean[mol.getBondCount()]; - // set all values of already visited atoms and bonds to false + + // Initialize visited markers to false Arrays.fill(visitedAtoms, false); Arrays.fill(visitedBonds, false); - // the number of fragments is always the number of splits + 1 - // example: 1 split results in 2 fragments, 2 splits in 3 and so on + + // The number of fragments is always the number of bonds removed + 1 int numberOfFragments = bondsToSplit.length + 1; IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; + + // Map atoms to the atoms they should be disconnected from Map> atomsToSplit = new HashMap<>((int) Math.ceil((bondsToSplit.length * 2) / 0.75)); for (IBond bond : bondsToSplit) { IAtom beg = bond.getBegin(); IAtom end = bond.getEnd(); - if (atomsToSplit.containsKey(beg)) { - atomsToSplit.get(beg).add(end); - } else { - List endList = new ArrayList<>(); - endList.add(end); - atomsToSplit.put(beg, endList); - } + atomsToSplit.computeIfAbsent(beg, k -> new ArrayList<>()).add(end); } - // stack to keep track of the next starting atom for the next fragment. 
If a bond to split is - // noticed the atom of the next fragment gets added here + + // Stack to track starting atoms for fragment creation Stack startingAtoms = new Stack<>(); - // start at an arbitrary starting atom + + // Start traversal from the first atom of the molecule startingAtoms.add(mol.getAtom(0)); - // for each fragment we iterate through the molecule, except for the bonds that we want to split + + // Iterate to create each fragment while avoiding split bonds for (int i = 0; i < numberOfFragments; i++) { - // map to keep track of the original atoms and the copies thereof + // Map to associate original atoms with their copied versions Map origToCpyMap = new HashMap<>((int) Math.ceil(mol.getAtomCount() / 0.75)); IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); - // a stack to make a DFS through the molecule + + // Stack for depth-first search (DFS) traversal Stack atomStack = new Stack<>(); - // we start iterating at the most recent starting atom + // Start DFS from the next available atom atomStack.add(startingAtoms.pop()); - // we need to copy the first atom separately + // Copy the first atom and store the mapping IAtom firstAtom = atomStack.peek(); IAtom atomCpy = copyAtom(firstAtom, fragmentContainer); origToCpyMap.put(firstAtom, atomCpy); while (!atomStack.isEmpty()) { - // get the last atom and its copy + // Retrieve the current atom and its copy IAtom atom = atomStack.pop(); atomCpy = origToCpyMap.get(atom); visitedAtoms[atom.getIndex()] = true; - for (IAtom nbor: atom.neighbors()) { - // if the bond to the current neighbour was already added we go to the next neighbour - if (visitedBonds[mol.getBond(atom, nbor).getIndex()]) { + + // Iterate over neighboring atoms + for (IAtom nbor : atom.neighbors()) { + IBond bond = mol.getBond(atom, nbor); + int bondIndex = bond.getIndex(); + + // Skip if the bond was already processed + if (visitedBonds[bondIndex]) { continue; } - // if the neighbour is not to be split we 
either copy the neighbour and make a bond from the copy - // of the last atom to its copy of the neighbour or if we already visited the atom we know this is - // a cycle connection, and we make only a bond from the last atom to the already copied neighbour + + // If the neighbor is not part of a split bond, copy it and create a bond if (!atomsToSplit.containsKey(atom) || !atomsToSplit.get(atom).contains(nbor)) { if (!visitedAtoms[nbor.getIndex()]) { IAtom nborCpy = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(atomCpy, nborCpy, mol.getBond(atom, nbor).getOrder()); - visitedBonds[mol.getBond(atom, nbor).getIndex()] = true; + fragmentContainer.newBond(atomCpy, nborCpy, bond.getOrder()); + visitedBonds[bondIndex] = true; atomStack.add(nbor); origToCpyMap.put(nbor, nborCpy); visitedAtoms[nbor.getIndex()] = true; } else { + // If the neighbor was already copied, establish a bond to maintain cyclic structures IAtom nborCpy = origToCpyMap.get(nbor); - if (nborCpy == null) { - continue; + if (nborCpy != null) { + fragmentContainer.newBond(atomCpy, nborCpy, bond.getOrder()); + visitedBonds[bondIndex] = true; } - fragmentContainer.newBond( - atomCpy, - nborCpy, - mol.getBond(atom, nbor).getOrder() - ); - visitedBonds[mol.getBond(atom, nbor).getIndex()] = true; } } else { + // If the neighbor is part of a split bond, mark it as a starting atom for a new fragment startingAtoms.add(nbor); } } } + // Store the created fragment fragments[i] = fragmentContainer; } return fragments; From c1596f0360f622ce817ab06dfdfb15745d0fb57d Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Sun, 6 Apr 2025 21:42:54 +0200 Subject: [PATCH 17/42] work in progress: first tries to add an saturation option with R-Atoms --- .../cdk/fragment/ExhaustiveFragmenter.java | 162 +++++++----------- .../fragment/ExhaustiveFragmenterTest.java | 6 +- 2 files changed, 62 insertions(+), 106 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java 
b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index e02d1f134ae..a41360e1d01 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -28,6 +28,7 @@ import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.interfaces.IPseudoAtom; import org.openscience.cdk.interfaces.IRingSet; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; @@ -44,12 +45,15 @@ import java.util.Stack; /** - * Performs exhaustive fragmentation of molecules by breaking single non-ring, non-terminal bonds. + * Performs exhaustive fragmentation of molecules by breaking single non-ring, non-terminal bonds in all + * combinations. *

- * This fragmentation method avoids splitting bonds connected to single heavy atoms (non-terminal bonds). + * Non-terminal meaning bonds connected to more than one single heavy atom (non-terminal bonds). * By default: - * - Fragments smaller than 6 atoms (excluding implicit hydrogen) are ignored. - * - Fragments are returned unsaturated. + * - Fragments smaller than 6 atoms (excluding implicit hydrogen) don't get returned. + *
+ * - Fragments are returned with open valences, where a bond has been split. + *
* However, users can modify these settings. *

* Example Usage: @@ -57,17 +61,19 @@ * // By default, returns unsaturated fragments with a minimum size of 6 atoms * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - * IAtomContainer mol = smiParser.parseSmiles("c1ccccc1C"); // Benzyl molecule + * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C2=CC=CC=C2"); // Cyclopentylbenzene molecule * fragmenter.generateFragments(mol); * * // Retrieve SMILES representations of fragments * String[] smilesFragments = fragmenter.getFragments(); + * // Results: + * // * * // Retrieve AtomContainer representations of fragments * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers(); * }

* - * @author Rajarshi Guha + * @author Rajarshi Guha, Tom Weiß * @cdk.module fragment * @cdk.keyword fragment */ @@ -78,7 +84,10 @@ public class ExhaustiveFragmenter implements IFragmenter { */ public enum Saturation { // Fragments will be returned in their saturated form (implicit hydrogen atoms added). - SATURATED_FRAGMENTS, + HYDROGEN_SATURATED_FRAGMENTS, + + // Fragments will be saturated with R atoms. + REST_SATURATED_FRAGMENTS, // Fragments will be returned in their unsaturated form (no additional hydrogen atoms). UNSATURATED_FRAGMENTS @@ -126,16 +135,6 @@ public ExhaustiveFragmenter(int minFragSize) { this(minFragSize, DEFAULT_SATURATION); } - /** - * Constructs an ExhaustiveFragmenter with a user-defined saturation setting. - * The minimum fragment size defaults to 6 atoms. - * - * @param saturationSetting Determines whether fragments should be saturated or unsaturated. - */ - public ExhaustiveFragmenter(Saturation saturationSetting) { - this(DEFAULT_MIN_FRAG_SIZE, saturationSetting); - } - /** * Sets the minimum allowed fragment size. * @@ -166,11 +165,6 @@ public void setMaxTreeDepth(int maxTreeDepth) { /** * Generates fragments for the given molecule. - *

- * Based on the saturation setting: - * - **Unsaturated mode**: Fragments are returned without additional hydrogen atoms. - * - **Saturated mode**: Hydrogen atoms are explicitly added to atoms where bonds are broken. - *

* The generated fragments are stored internally and can be retrieved via: * - {@link #getFragments()} (SMILES representation) * - {@link #getFragmentsAsContainers()} (IAtomContainer representation) @@ -181,22 +175,18 @@ public void setMaxTreeDepth(int maxTreeDepth) { @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { fragMap.clear(); - if (this.saturationSetting == Saturation.UNSATURATED_FRAGMENTS) { - runUnsaturated(atomContainer, maxTreeDepth); - } else { - runSaturated(atomContainer, maxTreeDepth); - } + run(atomContainer, this.maxTreeDepth); } /** - * Splits the molecule at all possible combinations of splittable bonds and adds implicit hydrogen atoms - * to atoms that were originally involved in the split. + * Splits the molecule at all possible combinations of splittable bonds and saturates the open valences of the + * resulting fragments if the Saturation setting is turned on. * * @param atomContainer The molecule to be split. * @param maxTreeDepth The maximum number of bond splits allowed per subset. * @throws CDKException If an error occurs during hydrogen addition or atom type perception. 
*/ - private void runSaturated(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { + private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) if (atomContainer.getBondCount() < 3) return; @@ -239,85 +229,19 @@ private void runSaturated(IAtomContainer atomContainer, int maxTreeDepth) throws // Process each fragment for (IAtomContainer partContainer : parts) { - AtomContainerManipulator.clearAtomConfigurations(partContainer); - - // Reset implicit hydrogen count before recalculating - for (IAtom atom : partContainer.atoms()) { - atom.setImplicitHydrogenCount(0); - } // Configure atom types and add implicit hydrogens AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); - CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); - // Apply aromaticity perception (legacy operation) - // TODO: Investigate for the current method to do this. - Aromaticity.cdkLegacy().apply(partContainer); - - // Generate a unique SMILES representation of the fragment - String tmpSmiles = smilesGenerator.create(partContainer); - int numberOfAtoms = partContainer.getAtomCount(); + // Add Hydrogen to fragments if + if (saturationSetting == Saturation.HYDROGEN_SATURATED_FRAGMENTS) { - // Store the fragment if it meets the size requirement and is unique - if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - fragMap.put(tmpSmiles, partContainer); + // Reset implicit hydrogen count before recalculating + for (IAtom atom : partContainer.atoms()) { + atom.setImplicitHydrogenCount(0); + } + CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); } - } - } - } - - /** - * Splits the molecule at all possible combinations of splittable bonds without adding implicit hydrogens. - * - * @param atomContainer The molecule to be split. 
- * @param maxTreeDepth The maximum number of bond splits allowed per subset. - * @throws CDKException If an error occurs during atom type perception. - */ - private void runUnsaturated(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { - - // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) - if (atomContainer.getBondCount() < 3) return; - - // Retrieve bonds that are eligible for splitting - IBond[] splittableBonds = getSplitableBonds(atomContainer); - int splittableBondsLength = splittableBonds.length; - - // If no splittable bonds are found, return early - if (splittableBondsLength == 0) return; - logger.debug("Got " + splittableBondsLength + " splittable bonds"); - - // Compute the number of possible bond subsets (excluding the empty set): 2^n - 1 - int numberOfIterations = (1 << splittableBondsLength) - 1; - - // Store indices of splittable bonds for subset generation - int[] splittableBondIndices = new int[splittableBondsLength]; - for (int i = 0; i < splittableBondsLength; i++) { - splittableBondIndices[i] = splittableBonds[i].getIndex(); - } - - // Iterate over all non-empty subsets of splittable bonds - for (int i = 1; i <= numberOfIterations; i++) { - int[] subset = generateSubset(i, splittableBondIndices); - int subsetSize = subset.length; - - // Skip subsets exceeding the allowed depth - if (subsetSize > maxTreeDepth) { - continue; - } - - // Convert subset indices back to bond objects - IBond[] bondsToSplit = new IBond[subsetSize]; - for (int j = 0; j < subsetSize; j++) { - bondsToSplit[j] = atomContainer.getBond(subset[j]); - } - - // TODO: Investigate whether copying is necessary. Consider using FragmentUtils.splitMolecule instead. 
- IAtomContainer[] parts = splitBondsWithCopy(atomContainer, bondsToSplit); - - // Process each fragment - for (IAtomContainer partContainer : parts) { - // Configure atom types (no hydrogen addition in unsaturated mode) - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); // Apply aromaticity perception (legacy operation) // TODO: Investigate for the current method to do this. @@ -403,7 +327,7 @@ private static int[] generateSubset(int index, int[] nums) { int[] subset = new int[Integer.bitCount(index)]; int subsetIndex = 0; - // Iterate through each bit position (up to 32 bits). + // Iterate through each bit position (up to 31 bits). for (int j = 0; j < 32; j++) { // If the j-th bit in index is set, include nums[j] in the subset. if (((index >> j) & 1) == 1) { @@ -414,6 +338,23 @@ private static int[] generateSubset(int index, int[] nums) { return subset; } + /** + * Add pseudo ("R") atoms to an atom in a molecule. + * + * @param atom the atom to add the pseudo atoms to + * @param rcount the number of pseudo atoms to add + * @param mol the molecule the atom belongs to + */ + private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { + for (int i = 0; i < rcount; i++) { + IPseudoAtom tmpRAtom = atom.getBuilder().newInstance(IPseudoAtom.class, "R"); + tmpRAtom.setAttachPointNum(1); + tmpRAtom.setImplicitHydrogenCount(0); + mol.addAtom(tmpRAtom); + mol.addBond(atom.getBuilder().newInstance(IBond.class, atom, tmpRAtom, IBond.Order.SINGLE)); + } + } + /** * Creates a copy of an atom and adds it to the specified atom container. 
* @@ -427,6 +368,7 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) copiedAtom.setIsAromatic(originalAtom.isAromatic()); copiedAtom.setValency(originalAtom.getValency()); copiedAtom.setAtomTypeName(originalAtom.getAtomTypeName()); + copiedAtom.setFormalCharge(originalAtom.getFormalCharge()); return copiedAtom; } @@ -455,6 +397,18 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer mol, IBond[] bondsToS for (IBond bond : bondsToSplit) { IAtom beg = bond.getBegin(); IAtom end = bond.getEnd(); + switch (this.saturationSetting) { + case HYDROGEN_SATURATED_FRAGMENTS: + beg.setImplicitHydrogenCount(beg.getImplicitHydrogenCount() + 1); + end.setImplicitHydrogenCount(end.getImplicitHydrogenCount() + 1); + break; + case REST_SATURATED_FRAGMENTS: + addRAtoms(beg, 1, mol); + addRAtoms(end, 1, mol); + break; + case UNSATURATED_FRAGMENTS: + break; + } atomsToSplit.computeIfAbsent(beg, k -> new ArrayList<>()).add(end); } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 5e47afa0a0e..4475f431bf3 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -49,8 +49,10 @@ class ExhaustiveFragmenterTest extends CDKTestCase { @BeforeAll static void setup() { - fragmenterSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.SATURATED_FRAGMENTS); - fragmenterUnsaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + fragmenterSaturated = new ExhaustiveFragmenter(); + fragmenterSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + fragmenterUnsaturated = new ExhaustiveFragmenter(); + 
fragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); } From f9faed12a37c0c5f81cfb47b3dbd0f31617faeef Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Fri, 25 Apr 2025 09:07:53 +0200 Subject: [PATCH 18/42] work in progress: added tests for pseudo-R-atom saturation and first tries to implement the pseudo-R-atom saturation --- .../cdk/fragment/ExhaustiveFragmenter.java | 42 ++++++++++--------- .../fragment/ExhaustiveFragmenterTest.java | 42 +++++++++++++++++++ 2 files changed, 65 insertions(+), 19 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index a41360e1d01..cae282bb44d 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -234,13 +234,19 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); // Add Hydrogen to fragments if - if (saturationSetting == Saturation.HYDROGEN_SATURATED_FRAGMENTS) { - - // Reset implicit hydrogen count before recalculating - for (IAtom atom : partContainer.atoms()) { - atom.setImplicitHydrogenCount(0); - } - CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); + switch (this.saturationSetting) { + case HYDROGEN_SATURATED_FRAGMENTS: + for (IAtom atom : partContainer.atoms()) { + atom.setImplicitHydrogenCount(0); + } + CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); + break; + case REST_SATURATED_FRAGMENTS: + // TODO: find a good working solution. 
+ saturateWithRAtoms(partContainer); + break; + case UNSATURATED_FRAGMENTS: + break; } // Apply aromaticity perception (legacy operation) @@ -355,6 +361,16 @@ private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { } } + private void saturateWithRAtoms(IAtomContainer mol) { + for (IAtom atom : mol.atoms()) { + int connected = mol.getConnectedBondsCount(atom); + Integer valency = atom.getValency(); + if (valency == null) continue; + int toAdd = valency - connected; + if (toAdd > 0) addRAtoms(atom, toAdd, mol); + } + } + /** * Creates a copy of an atom and adds it to the specified atom container. * @@ -397,18 +413,6 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer mol, IBond[] bondsToS for (IBond bond : bondsToSplit) { IAtom beg = bond.getBegin(); IAtom end = bond.getEnd(); - switch (this.saturationSetting) { - case HYDROGEN_SATURATED_FRAGMENTS: - beg.setImplicitHydrogenCount(beg.getImplicitHydrogenCount() + 1); - end.setImplicitHydrogenCount(end.getImplicitHydrogenCount() + 1); - break; - case REST_SATURATED_FRAGMENTS: - addRAtoms(beg, 1, mol); - addRAtoms(end, 1, mol); - break; - case UNSATURATED_FRAGMENTS: - break; - } atomsToSplit.computeIfAbsent(beg, k -> new ArrayList<>()).add(end); } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 4475f431bf3..fbb052d3169 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -45,6 +45,7 @@ class ExhaustiveFragmenterTest extends CDKTestCase { private static ExhaustiveFragmenter fragmenterSaturated; private static ExhaustiveFragmenter fragmenterUnsaturated; + private static ExhaustiveFragmenter fragmenterRestSaturated; private static SmilesParser smilesParser; @BeforeAll @@ -53,6 +54,8 @@ static void setup() { 
fragmenterSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); fragmenterUnsaturated = new ExhaustiveFragmenter(); fragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + fragmenterRestSaturated = new ExhaustiveFragmenter(); + fragmenterRestSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.REST_SATURATED_FRAGMENTS); smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); } @@ -226,4 +229,43 @@ void testEqualityOfSmilesAndContainers() throws Exception { org.hamcrest.MatcherAssert.assertThat(smilesFrags, hasItems(smilesGenerator.create(frag))); } } + + @Test + void testEF3RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); + fragmenterRestSaturated.generateFragments(mol); + String[] frags = fragmenterRestSaturated.getFragments(); + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(frags, is(new String[]{"[R]C1CCCCC1"})); + } + + @Test + void testEF5RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); + fragmenterRestSaturated.generateFragments(mol); + String[] frags = fragmenterRestSaturated.getFragments(); + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(Arrays.asList(frags), hasItems("[R]c1ccccc1", "c1ccccc1")); + Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); + } + + @Test + void testEF6RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); + fragmenterRestSaturated.generateFragments(mol); + String[] frags = fragmenterRestSaturated.getFragments(); + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(frags, is(new String[]{"[R]c1ccccc1"})); + Assertions.assertEquals(1, fragmenterRestSaturated.getFragmentsAsContainers().length); + } + + @Test + void testEF7RestSaturated() throws Exception { + IAtomContainer mol = 
smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); + fragmenterRestSaturated.generateFragments(mol); + List frags = Arrays.asList(fragmenterRestSaturated.getFragments()); + Assertions.assertNotNull(frags); + Assertions.assertEquals(25, fragmenterRestSaturated.getFragmentsAsContainers().length); + MatcherAssert.assertThat(frags, hasItems("[R]c1ccccc1", "[R]C1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[R]C1CCC([R])(c2ccccc2)C1")); + } } From 59425dffc530ff0be4026190ff21d35b74642270 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Fri, 23 May 2025 09:07:46 +0200 Subject: [PATCH 19/42] refactored splitting function to improve saturation and improved documentation --- .../cdk/fragment/ExhaustiveFragmenter.java | 291 +++++++++--------- .../fragment/ExhaustiveFragmenterTest.java | 11 +- 2 files changed, 151 insertions(+), 151 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index cae282bb44d..8c3f48ea241 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -32,16 +32,16 @@ import org.openscience.cdk.interfaces.IRingSet; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; -import org.openscience.cdk.tools.CDKHydrogenAdder; import org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.Stack; /** @@ -50,10 +50,10 @@ *

* Non-terminal meaning bonds connected to more than one single heavy atom (non-terminal bonds). * By default: - * - Fragments smaller than 6 atoms (excluding implicit hydrogen) don't get returned. - *
- * - Fragments are returned with open valences, where a bond has been split. - *
+ *

    + *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) don't get returned.
  • + *
  • Fragments are returned with open valences, where a bond has been split.
  • + *
* However, users can modify these settings. *

* Example Usage: @@ -61,19 +61,20 @@ * // By default, returns unsaturated fragments with a minimum size of 6 atoms * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C2=CC=CC=C2"); // Cyclopentylbenzene molecule + * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C1=CC=CC=C1"); // Cyclopentylbenzene molecule * fragmenter.generateFragments(mol); * * // Retrieve SMILES representations of fragments * String[] smilesFragments = fragmenter.getFragments(); - * // Results: + * // Results: ["C1CCCCC1", "C1=CC=CC=C1"] * // * * // Retrieve AtomContainer representations of fragments * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers(); * } * - * @author Rajarshi Guha, Tom Weiß + * @author Rajarshi Guha + * @author Tom Weiß * @cdk.module fragment * @cdk.keyword fragment */ @@ -83,13 +84,19 @@ public class ExhaustiveFragmenter implements IFragmenter { * Specifies whether generated fragments should be saturated (hydrogens added) or unsaturated. */ public enum Saturation { - // Fragments will be returned in their saturated form (implicit hydrogen atoms added). + /** + * Fragments will be returned in their saturated form (implicit hydrogen atoms added). + */ HYDROGEN_SATURATED_FRAGMENTS, - // Fragments will be saturated with R atoms. + /** + * Fragments will be saturated with R atoms. + */ REST_SATURATED_FRAGMENTS, - // Fragments will be returned in their unsaturated form (no additional hydrogen atoms). + /** + * Fragments will be returned in their unsaturated form (no additional hydrogen atoms). 
+ */ UNSATURATED_FRAGMENTS } @@ -98,15 +105,17 @@ public enum Saturation { private final Map fragMap; private final SmilesGenerator smilesGenerator; - private int maxTreeDepth = 31; + private int exclusiveMaxTreeDepth = Integer.SIZE; private int minFragSize; private Saturation saturationSetting; private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); /** * Constructs an ExhaustiveFragmenter with the default settings: - * - Minimum fragment size: 6 atoms - * - Unsaturated fragments + *

    + *
  • Minimum fragment size: 6 atoms
  • + *
  • Unsaturated fragments
  • + *
*/ public ExhaustiveFragmenter() { this(DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); @@ -115,21 +124,21 @@ public ExhaustiveFragmenter() { /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size and saturation setting. * - * @param minFragSize Minimum number of atoms in a valid fragment. - * @param saturationSetting Determines whether fragments should be saturated or unsaturated. + * @param minFragSize minimum number of atoms in a valid fragment. + * @param saturationSetting determines whether fragments should be saturated or unsaturated. */ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { this.minFragSize = minFragSize; this.saturationSetting = saturationSetting; - fragMap = new HashMap<>(); - smilesGenerator = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); + this.fragMap = new HashMap<>(); + this.smilesGenerator = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); } /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size. * Saturation defaults to unsaturated fragments. * - * @param minFragSize Minimum number of atoms in a valid fragment. + * @param minFragSize minimum number of atoms in a valid fragment. */ public ExhaustiveFragmenter(int minFragSize) { this(minFragSize, DEFAULT_SATURATION); @@ -147,20 +156,23 @@ public void setMinimumFragmentSize(int minFragSize) { /** * Sets whether fragments should be saturated or unsaturated. * - * @param saturationSetting The saturation mode for generated fragments. + * @param saturationSetting the saturation mode for generated fragments. */ public void setSaturationSetting(Saturation saturationSetting) { this.saturationSetting = saturationSetting; } /** - * Sets the maximum number of bonds that can be simultaneously split. - * Must be within the range `0 < maxTreeDepth < 32`. + * Sets the maximum number of bonds that can be simultaneously split. 
Must be within the range + * {@code 0 < exclusiveMaxTreeDepth < 32}. This is the limit of the maximal possible bonds to split, + * caused by the combinatorial explosion of fragments when dealing with larger molecules. Because Java + * indexes its common data structures with int32's and this algorithm scales with 2^n, this limit is + * strictly necessary. * - * @param maxTreeDepth Maximum number of bonds that can be split at once. + * @param exclusiveMaxTreeDepth maximum number of bonds that can be split in one atom container. */ - public void setMaxTreeDepth(int maxTreeDepth) { - this.maxTreeDepth = maxTreeDepth; + public void setExclusiveMaxTreeDepth(int exclusiveMaxTreeDepth) { + this.exclusiveMaxTreeDepth = exclusiveMaxTreeDepth; } /** @@ -169,22 +181,23 @@ public void setMaxTreeDepth(int maxTreeDepth) { * - {@link #getFragments()} (SMILES representation) * - {@link #getFragmentsAsContainers()} (IAtomContainer representation) * - * @param atomContainer The input molecule. - * @throws CDKException If fragmentation encounters an error. + * @param atomContainer the input molecule. + * @throws CDKException if fragmentation encounters an error. */ @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { - fragMap.clear(); - run(atomContainer, this.maxTreeDepth); + this.fragMap.clear(); + run(atomContainer, this.exclusiveMaxTreeDepth); } /** * Splits the molecule at all possible combinations of splittable bonds and saturates the open valences of the * resulting fragments if the Saturation setting is turned on. + * Only non-ring and non-terminal single bonds are considered for splitting. * - * @param atomContainer The molecule to be split. - * @param maxTreeDepth The maximum number of bond splits allowed per subset. - * @throws CDKException If an error occurs during hydrogen addition or atom type perception. + * @param atomContainer the molecule to be split. 
+ * @param maxTreeDepth the maximum number of bond splits allowed per subset of bonds. + * @throws CDKException if an error occurs during hydrogen addition or atom type perception. */ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { @@ -193,18 +206,17 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep // Retrieve bonds that are eligible for splitting IBond[] splittableBonds = getSplitableBonds(atomContainer); - int splittableBondsLength = splittableBonds.length; // If no splittable bonds are found, return early - if (splittableBondsLength == 0) return; - logger.debug("Got " + splittableBondsLength + " splittable bonds"); + if (splittableBonds.length == 0) return; + logger.debug("Got " + splittableBonds.length + " splittable bonds"); // Compute the number of possible bond subsets (excluding the empty set): 2^n - 1 - int numberOfIterations = (1 << splittableBondsLength) - 1; + int numberOfIterations = (1 << splittableBonds.length) - 1; // Store indices of splittable bonds for subset generation - int[] splittableBondIndices = new int[splittableBondsLength]; - for (int i = 0; i < splittableBondsLength; i++) { + int[] splittableBondIndices = new int[splittableBonds.length]; + for (int i = 0; i < splittableBonds.length; i++) { splittableBondIndices[i] = splittableBonds[i].getIndex(); } @@ -214,7 +226,7 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep int subsetSize = subset.length; // Skip subsets exceeding the allowed depth - if (subsetSize > maxTreeDepth) { + if (subsetSize > this.exclusiveMaxTreeDepth) { continue; } @@ -233,22 +245,6 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep // Configure atom types and add implicit hydrogens AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); - // Add Hydrogen to fragments if - switch (this.saturationSetting) { - case HYDROGEN_SATURATED_FRAGMENTS: - for (IAtom atom 
: partContainer.atoms()) { - atom.setImplicitHydrogenCount(0); - } - CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); - break; - case REST_SATURATED_FRAGMENTS: - // TODO: find a good working solution. - saturateWithRAtoms(partContainer); - break; - case UNSATURATED_FRAGMENTS: - break; - } - // Apply aromaticity perception (legacy operation) // TODO: Investigate for the current method to do this. Aromaticity.cdkLegacy().apply(partContainer); @@ -265,7 +261,23 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep } } - private IBond[] getSplitableBonds(IAtomContainer atomContainer) { + /** + * Detects and returns the bonds, which will be split by an exhaustive fragmentation. This method is especially useful + * to determine if it is even possible to split a specific molecule exhaustively. The number of fragments is 2^n - 1 with n + * being the number of splittable bonds. Therefore, it is impossible to entirely split a molecule with more than 31 splittable Bonds. + * To mitigate this one cna check this with this function, for example: + *
+     *     {@code
+     *     ExhaustiveFragmenter exhFragmenter = new ExhaustiveFragmenter();
+     *     if (exhFragmenter.getSplitableBonds(mol).length > 31) {
+     *         // handle the case, where it is impossible to entirely split the molecule
+     *     }}
+     * 
+ * + * @param atomContainer the container which contains the molecule in question. + * @return the bonds which would be split by the exhaustive fragmentation. + */ + public IBond[] getSplitableBonds(IAtomContainer atomContainer) { // do ring detection SpanningTree spanningTree = new SpanningTree(atomContainer); IRingSet allRings = spanningTree.getAllRings(); @@ -334,7 +346,7 @@ private static int[] generateSubset(int index, int[] nums) { int subsetIndex = 0; // Iterate through each bit position (up to 31 bits). - for (int j = 0; j < 32; j++) { + for (int j = 0; j < Integer.SIZE; j++) { // If the j-th bit in index is set, include nums[j] in the subset. if (((index >> j) & 1) == 1) { subset[subsetIndex++] = nums[j]; @@ -361,16 +373,6 @@ private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { } } - private void saturateWithRAtoms(IAtomContainer mol) { - for (IAtom atom : mol.atoms()) { - int connected = mol.getConnectedBondsCount(atom); - Integer valency = atom.getValency(); - if (valency == null) continue; - int toAdd = valency - connected; - if (toAdd > 0) addRAtoms(atom, toAdd, mol); - } - } - /** * Creates a copy of an atom and adds it to the specified atom container. * @@ -391,97 +393,94 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) /** * Splits a molecule into multiple fragments by removing the specified bonds and making copies of the resulting fragments. * - * @param mol The molecule to be split. + * @param origMol The molecule to be split. * @param bondsToSplit The bonds that should be removed to create separate fragments. * @return An array of copied molecular fragments resulting from the split. 
*/ - private IAtomContainer[] splitBondsWithCopy(IAtomContainer mol, IBond[] bondsToSplit) { - // Track visited atoms and bonds during traversal - boolean[] visitedAtoms = new boolean[mol.getAtomCount()]; - boolean[] visitedBonds = new boolean[mol.getBondCount()]; - - // Initialize visited markers to false - Arrays.fill(visitedAtoms, false); - Arrays.fill(visitedBonds, false); - - // The number of fragments is always the number of bonds removed + 1 - int numberOfFragments = bondsToSplit.length + 1; - IAtomContainer[] fragments = new IAtomContainer[numberOfFragments]; - - // Map atoms to the atoms they should be disconnected from - Map> atomsToSplit = new HashMap<>((int) Math.ceil((bondsToSplit.length * 2) / 0.75)); + private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bondsToSplit) { + Set> splitBondAtomPairs = new HashSet<>(); for (IBond bond : bondsToSplit) { - IAtom beg = bond.getBegin(); - IAtom end = bond.getEnd(); - atomsToSplit.computeIfAbsent(beg, k -> new ArrayList<>()).add(end); + Set pair = new HashSet<>(2); + pair.add(bond.getAtom(0)); + pair.add(bond.getAtom(1)); + splitBondAtomPairs.add(pair); } - // Stack to track starting atoms for fragment creation - Stack startingAtoms = new Stack<>(); - - // Start traversal from the first atom of the molecule - startingAtoms.add(mol.getAtom(0)); - - // Iterate to create each fragment while avoiding split bonds - for (int i = 0; i < numberOfFragments; i++) { - // Map to associate original atoms with their copied versions - Map origToCpyMap = new HashMap<>((int) Math.ceil(mol.getAtomCount() / 0.75)); - IAtomContainer fragmentContainer = mol.getBuilder().newInstance(IAtomContainer.class); - - // Stack for depth-first search (DFS) traversal - Stack atomStack = new Stack<>(); - - // Start DFS from the next available atom - atomStack.add(startingAtoms.pop()); - - // Copy the first atom and store the mapping - IAtom firstAtom = atomStack.peek(); - IAtom atomCpy = copyAtom(firstAtom, 
fragmentContainer); - origToCpyMap.put(firstAtom, atomCpy); - - while (!atomStack.isEmpty()) { - // Retrieve the current atom and its copy - IAtom atom = atomStack.pop(); - atomCpy = origToCpyMap.get(atom); - visitedAtoms[atom.getIndex()] = true; - - // Iterate over neighboring atoms - for (IAtom nbor : atom.neighbors()) { - IBond bond = mol.getBond(atom, nbor); - int bondIndex = bond.getIndex(); - - // Skip if the bond was already processed - if (visitedBonds[bondIndex]) { - continue; + boolean[] visitedOriginalAtoms = new boolean[origMol.getAtomCount()]; + List fragmentList = new ArrayList<>(bondsToSplit.length + 1); + + for (int i = 0; i < origMol.getAtomCount(); i++) { + IAtom currPotentialStartAtom = origMol.getAtom(i); + if (!visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)]) { + IAtomContainer fragmentContainer = origMol.getBuilder().newInstance(IAtomContainer.class); + Map origToCpyMap = new HashMap<>(); + Stack dfsStack = new Stack<>(); + // Store split counts specific to the atoms in the fragment being built + Map splitCountsCpyAtoms = new HashMap<>(); + + dfsStack.push(currPotentialStartAtom); + visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)] = true; + IAtom cpyStartAtom = copyAtom(currPotentialStartAtom, fragmentContainer); + origToCpyMap.put(currPotentialStartAtom, cpyStartAtom); + + while (!dfsStack.isEmpty()) { + IAtom origCurrAtom = dfsStack.pop(); + IAtom copiedCurrentAtom = origToCpyMap.get(origCurrAtom); + + for (IBond origBond : origMol.getConnectedBondsList(origCurrAtom)) { + IAtom origNbor = origBond.getOther(origCurrAtom); + Set currBondPair = new HashSet<>(2); + currBondPair.add(origCurrAtom); + currBondPair.add(origNbor); + boolean isThisABondToSplit = splitBondAtomPairs.contains(currBondPair); + + if (!isThisABondToSplit) { + if (!origToCpyMap.containsKey(origNbor)) { + visitedOriginalAtoms[origMol.indexOf(origNbor)] = true; + IAtom cpyNbor = copyAtom(origNbor, fragmentContainer); + origToCpyMap.put(origNbor, 
cpyNbor); + fragmentContainer.addBond(copiedCurrentAtom.getIndex(), cpyNbor.getIndex(), + origBond.getOrder(), origBond.getStereo()); + dfsStack.push(origNbor); + } else { + IAtom cpyNbor = origToCpyMap.get(origNbor); + if (fragmentContainer.getBond(copiedCurrentAtom, cpyNbor) == null) { + fragmentContainer.addBond(copiedCurrentAtom.getIndex(), cpyNbor.getIndex(), + origBond.getOrder(), origBond.getStereo()); + // Add bond only if not already present + } + } + } else { + // This bond is being cut. The origCurrAtom is part of the fragment being built. + // Increment the cleavage count for its corresponding copied atom. + splitCountsCpyAtoms.put(copiedCurrentAtom, + splitCountsCpyAtoms.getOrDefault(copiedCurrentAtom, 0) + 1); + } } + } - // If the neighbor is not part of a split bond, copy it and create a bond - if (!atomsToSplit.containsKey(atom) || !atomsToSplit.get(atom).contains(nbor)) { - if (!visitedAtoms[nbor.getIndex()]) { - IAtom nborCpy = copyAtom(nbor, fragmentContainer); - fragmentContainer.newBond(atomCpy, nborCpy, bond.getOrder()); - visitedBonds[bondIndex] = true; - atomStack.add(nbor); - origToCpyMap.put(nbor, nborCpy); - visitedAtoms[nbor.getIndex()] = true; - } else { - // If the neighbor was already copied, establish a bond to maintain cyclic structures - IAtom nborCpy = origToCpyMap.get(nbor); - if (nborCpy != null) { - fragmentContainer.newBond(atomCpy, nborCpy, bond.getOrder()); - visitedBonds[bondIndex] = true; - } + // Apply saturation logic based on the number of splitting counts for this fragment + if (this.saturationSetting != Saturation.UNSATURATED_FRAGMENTS) { + for (Map.Entry entry : splitCountsCpyAtoms.entrySet()) { + IAtom atom = entry.getKey(); + int bondsCutCount = entry.getValue(); + + switch (this.saturationSetting) { + case HYDROGEN_SATURATED_FRAGMENTS: + Integer currImplHCount = atom.getImplicitHydrogenCount(); + int newImplHCount = (currImplHCount == null ? 
0 : currImplHCount) + bondsCutCount; + atom.setImplicitHydrogenCount(newImplHCount); + break; + case REST_SATURATED_FRAGMENTS: + addRAtoms(atom, bondsCutCount, fragmentContainer); + break; } - } else { - // If the neighbor is part of a split bond, mark it as a starting atom for a new fragment - startingAtoms.add(nbor); } } + fragmentList.add(fragmentContainer); } - // Store the created fragment - fragments[i] = fragmentContainer; } - return fragments; + return fragmentList.toArray(new IAtomContainer[0]); } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 832256f00d1..4ee1409dc60 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -235,7 +235,7 @@ void testEF3RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(frags, is(new String[]{"[R]C1CCCCC1"})); + MatcherAssert.assertThat(frags, is(new String[]{"*C1CCCCC1"})); } @Test @@ -244,7 +244,7 @@ void testEF5RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), hasItems("[R]c1ccccc1", "c1ccccc1")); + MatcherAssert.assertThat(Arrays.asList(frags), hasItems("*c1ccccc1", "*Cc1ccccc1")); Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -254,7 +254,7 @@ void testEF6RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(frags, is(new 
String[]{"[R]c1ccccc1"})); + MatcherAssert.assertThat(frags, is(new String[]{"*c1ccccc1"})); Assertions.assertEquals(1, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -264,7 +264,8 @@ void testEF7RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); List frags = Arrays.asList(fragmenterRestSaturated.getFragments()); Assertions.assertNotNull(frags); - Assertions.assertEquals(25, fragmenterRestSaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, hasItems("[R]c1ccccc1", "[R]C1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[R]C1CCC([R])(c2ccccc2)C1")); + // TODO: Check per hand if the fragmentation results in 28 unique fragments + Assertions.assertEquals(28, fragmenterRestSaturated.getFragmentsAsContainers().length); + MatcherAssert.assertThat(frags, hasItems("*c1ccccc1", "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", "*C1CCC(*)(c2ccccc2)C1")); } } From 2ba2fcf65dd677f5f58a096480b69158e881683f Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 4 Jun 2025 13:55:43 +0200 Subject: [PATCH 20/42] improved tests and documentation --- .../cdk/fragment/ExhaustiveFragmenter.java | 119 +++-- .../fragment/ExhaustiveFragmenterTest.java | 497 +++++++++++++++--- 2 files changed, 500 insertions(+), 116 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 8c3f48ea241..474b1941070 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -51,26 +51,57 @@ * Non-terminal meaning bonds connected to more than one single heavy atom (non-terminal bonds). * By default: *
    - *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) don't get returned.
  • - *
  • Fragments are returned with open valences, where a bond has been split.
  • + *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) don't get returned.
  • + *
  • Fragments are returned with open valences, where a bond has been split.
  • *
* However, users can modify these settings. *

+ * Fragment Deduplication: + * The `ExhaustiveFragmenter` uses canonical SMILES strings for internal deduplication of generated fragments. + * This means that after a fragment is generated, its unique SMILES representation is computed + * (using {@link SmilesGenerator} with {@code SmiFlavor.Unique} and {@code SmiFlavor.UseAromaticSymbols}). + * If a fragment with the same canonical SMILES has already been generated and stored, the new fragment + * is considered a duplicate and is not added to the results. + *

+ * This deduplication strategy is particularly important when considering the {@link Saturation} setting: + *

    + *
  • If fragments are {@link Saturation#HYDROGEN_SATURATED_FRAGMENTS} or + * {@link Saturation#REST_SATURATED_FRAGMENTS}, the saturation process might lead to a canonical SMILES + * that is identical to a fragment obtained via a different bond cleavage, or a fragment that appears + * different due to explicit hydrogen representation but becomes identical when canonicalized.
  • + *
  • For example, an unsaturated fragment like `[CH]1CCCCC1` (cyclohexyl radical) might deduplicate + * with a saturated `C1CCCCC1` (cyclohexane) if `HYDROGEN_SATURATED_FRAGMENTS` is enabled and both forms + * canonicalize to the same SMILES depending on the exact SMILES generator and atom properties.
  • + *
  • It is crucial to understand that the uniqueness is based solely on the canonical SMILES string, + * not on the exact atom-by-atom identity or origin within the original molecule.
  • + *
+ * + *

* Example Usage: *

{@code
+ * import org.openscience.cdk.DefaultChemObjectBuilder;
+ * import org.openscience.cdk.interfaces.IAtomContainer;
+ * import org.openscience.cdk.smiles.SmilesParser;
+ *
  * // By default, returns unsaturated fragments with a minimum size of 6 atoms
  * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter();
- * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
- * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C1=CC=CC=C1");  //  Cyclopentylbenzene molecule
+ * SmilesParser smiParser = new SmilesParser(DefaultChemObjectBuilder.getInstance());
+ * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C1=CC=CC=C1");  // Cyclopentylbenzene molecule
  * fragmenter.generateFragments(mol);
  *
  * // Retrieve SMILES representations of fragments
  * String[] smilesFragments = fragmenter.getFragments();
- * // Results: ["C1CCCCC1", "C1=CC=CC=C1"]
- * //
+ * // Example Result (depending on exact fragmentation points and min size):
+ * // ["C1CCCCC1", "c1ccccc1"]
+ * // Note: Actual fragments might vary based on chosen saturation setting and bond definitions.
  *
  * // Retrieve AtomContainer representations of fragments
  * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers();
+ *
+ * // Example: Configuring for hydrogen-saturated fragments with a minimum size of 5
+ * ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(5, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS);
+ * saturatedFragmenter.generateFragments(mol);
+ * String[] saturatedSmilesFragments = saturatedFragmenter.getFragments();
  * }
* * @author Rajarshi Guha @@ -102,6 +133,7 @@ public enum Saturation { private static final int DEFAULT_MIN_FRAG_SIZE = 6; private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS; + private static final SmilesGenerator DEFAULT_SMILES_GENERATOR = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); private final Map fragMap; private final SmilesGenerator smilesGenerator; @@ -113,35 +145,51 @@ public enum Saturation { /** * Constructs an ExhaustiveFragmenter with the default settings: *
    - *
  • Minimum fragment size: 6 atoms
  • - *
  • Unsaturated fragments
  • + *
  • Minimum fragment size: 6 atoms (excluding implicit hydrogen)
  • + *
  • Unsaturated fragments
  • + *
  • Default {@link SmilesGenerator} ({@code SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols})
  • *
*/ public ExhaustiveFragmenter() { - this(DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); + this(DEFAULT_SMILES_GENERATOR, DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); } /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size and saturation setting. + * Uses the default {@link SmilesGenerator}. * - * @param minFragSize minimum number of atoms in a valid fragment. - * @param saturationSetting determines whether fragments should be saturated or unsaturated. + * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). + * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. */ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { - this.minFragSize = minFragSize; - this.saturationSetting = saturationSetting; - this.fragMap = new HashMap<>(); - this.smilesGenerator = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); + this(DEFAULT_SMILES_GENERATOR, minFragSize, saturationSetting); } /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size. - * Saturation defaults to unsaturated fragments. + * Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. + * Uses the default {@link SmilesGenerator}. * - * @param minFragSize minimum number of atoms in a valid fragment. + * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). */ public ExhaustiveFragmenter(int minFragSize) { - this(minFragSize, DEFAULT_SATURATION); + this(DEFAULT_SMILES_GENERATOR, minFragSize, DEFAULT_SATURATION); + } + + /** + * Constructs an ExhaustiveFragmenter with a user-provided {@link SmilesGenerator}, + * user-defined minimum fragment size, and saturation setting. + * + * @param smilesGenerator The {@link SmilesGenerator} instance to use for creating SMILES strings + * for fragment deduplication and retrieval. 
+ * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). + * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. + */ + public ExhaustiveFragmenter(SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting) { + this.minFragSize = minFragSize; + this.saturationSetting = saturationSetting; + this.fragMap = new HashMap<>(); + this.smilesGenerator = smilesGenerator; } /** @@ -163,13 +211,17 @@ public void setSaturationSetting(Saturation saturationSetting) { } /** - * Sets the maximum number of bonds that can be simultaneously split. Must be within the range - * {@code 0 < exclusiveMaxTreeDepth < 32}. This is the limit of the maximal possible bonds to split, - * caused by the combinatorial explosion of fragments when dealing with larger molecules. Because Java - * indexes its common data structures with int32's and this algorithm scales with 2^n, this limit is - * strictly necessary. + * Sets the maximum number of bonds that can be simultaneously split in a single + * fragmentation event. This value is exclusive, meaning if set to `3`, a maximum of `2` bonds + * can be split simultaneously. + *

+ * Must be within the range {@code 0 < exclusiveMaxTreeDepth < 32}. This limit is important + * due to the combinatorial explosion of fragments (which scales with 2^n, where n is the + * number of splittable bonds) and Java's use of 32-bit integers for indexing. + * Setting a lower limit can help manage computational resources for larger molecules. + *

* - * @param exclusiveMaxTreeDepth maximum number of bonds that can be split in one atom container. + * @param exclusiveMaxTreeDepth The exclusive maximum number of bonds that can be split in one atom container. */ public void setExclusiveMaxTreeDepth(int exclusiveMaxTreeDepth) { this.exclusiveMaxTreeDepth = exclusiveMaxTreeDepth; @@ -187,7 +239,7 @@ public void setExclusiveMaxTreeDepth(int exclusiveMaxTreeDepth) { @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { this.fragMap.clear(); - run(atomContainer, this.exclusiveMaxTreeDepth); + run(atomContainer); } /** @@ -196,10 +248,9 @@ public void generateFragments(IAtomContainer atomContainer) throws CDKException * Only non-ring and non-terminal single bonds are considered for splitting. * * @param atomContainer the molecule to be split. - * @param maxTreeDepth the maximum number of bond splits allowed per subset of bonds. * @throws CDKException if an error occurs during hydrogen addition or atom type perception. */ - private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKException { + private void run(IAtomContainer atomContainer) throws CDKException { // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) if (atomContainer.getBondCount() < 3) return; @@ -226,7 +277,7 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep int subsetSize = subset.length; // Skip subsets exceeding the allowed depth - if (subsetSize > this.exclusiveMaxTreeDepth) { + if (subsetSize >= this.exclusiveMaxTreeDepth) { continue; } @@ -246,11 +297,11 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); // Apply aromaticity perception (legacy operation) - // TODO: Investigate for the current method to do this. 
+ // TODO: Not sure how to handle this Aromaticity.cdkLegacy().apply(partContainer); // Generate a unique SMILES representation of the fragment - String tmpSmiles = smilesGenerator.create(partContainer); + String tmpSmiles = this.smilesGenerator.create(partContainer); int numberOfAtoms = partContainer.getAtomCount(); // Store the fragment if it meets the size requirement and is unique @@ -269,7 +320,7 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep *
      *     {@code
      *     ExhaustiveFragmenter exhFragmenter = new ExhaustiveFragmenter();
-     *     if (exhFragmenter.getSplittableBonds(mol) > 31) {
+     *     if (exhFragmenter.getSplitableBonds(mol).length > Integer.SIZE - 1) {
      *         // handle the case, where it is impossible to entirely split the molecule
      *     }}
      * 
@@ -279,6 +330,7 @@ private void run(IAtomContainer atomContainer, int maxTreeDepth) throws CDKExcep */ public IBond[] getSplitableBonds(IAtomContainer atomContainer) { // do ring detection + // TODO: Is this really the proper way to do ring detection here ? SpanningTree spanningTree = new SpanningTree(atomContainer); IRingSet allRings = spanningTree.getAllRings(); @@ -340,7 +392,7 @@ public IBond[] getSplitableBonds(IAtomContainer atomContainer) { * Duplicate values in `nums` may result in duplicate subset entries. * @return An array containing the subset corresponding to `index`. */ - private static int[] generateSubset(int index, int[] nums) { + static int[] generateSubset(int index, int[] nums) { // Allocate subset array based on the number of 1-bits in index. int[] subset = new int[Integer.bitCount(index)]; int subsetIndex = 0; @@ -391,7 +443,8 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) } /** - * Splits a molecule into multiple fragments by removing the specified bonds and making copies of the resulting fragments. + * Splits and saturates (if specified via {@link #saturationSetting}) a molecule into multiple fragments by removing the + * specified bonds and making copies of the resulting fragments. * * @param origMol The molecule to be split. * @param bondsToSplit The bonds that should be removed to create separate fragments. 
diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 4ee1409dc60..4f270ae2995 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -23,12 +23,13 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import org.openscience.cdk.smiles.SmiFlavor; -import org.openscience.cdk.smiles.SmilesGenerator; -import org.openscience.cdk.test.CDKTestCase; import org.openscience.cdk.DefaultChemObjectBuilder; import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.smiles.SmiFlavor; +import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; +import org.openscience.cdk.test.CDKTestCase; import java.util.Arrays; import java.util.List; @@ -38,7 +39,23 @@ /** * Test exhaustive fragmenter. + *

+ * This test class covers various scenarios for the {@link ExhaustiveFragmenter}, + * including different saturation settings (unsaturated, hydrogen-saturated, R-group saturated) + * and minimum fragment size. + *

+ * Note on deduplication: The {@link ExhaustiveFragmenter} uses SMILES strings for + * internal deduplication of generated fragments. This means that if two fragments, + * despite having different atom connectivity indices or implicit hydrogen counts, + * produce the same canonical SMILES string (as determined by {@link SmilesGenerator}), + * they will be considered the same fragment and only one will be stored. + * This is particularly relevant when comparing unsaturated vs. saturated fragments, + * as the saturation process might lead to a canonical SMILES that is identical + * to a fragment obtained via a different bond cleavage, or a fragment that appears + * different due to explicit hydrogen representation but becomes identical when + * canonicalized. * + * @see ExhaustiveFragmenter */ class ExhaustiveFragmenterTest extends CDKTestCase { @@ -58,6 +75,12 @@ static void setup() { smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); } + // --- Unsaturated Fragments Tests --- + + /** + * Tests that a simple linear alkane (propane) with no splittable bonds + * yields no fragments when using the unsaturated setting. + */ @Test void testEF1Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); @@ -66,14 +89,10 @@ void testEF1Unsaturated() throws Exception { Assertions.assertEquals(0, frags.length); } - @Test - void testEF1Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("CCC"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertEquals(0, frags.length); - } - + /** + * Tests that a simple cycloalkane (cyclopentane) with no non-ring, non-terminal bonds + * yields no fragments when using the unsaturated setting. 
+ */ @Test void testEF2Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); @@ -82,14 +101,10 @@ void testEF2Unsaturated() throws Exception { Assertions.assertEquals(0, frags.length); } - @Test - void testEF2Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertEquals(0, frags.length); - } - + /** + * Tests fragmentation of methylcyclohexane with unsaturated fragments. + * Expects "[CH]1CCCCC1" as a fragment, representing the cyclohexyl radical. + */ @Test void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); @@ -98,14 +113,10 @@ void testEF3Unsaturated() throws Exception { MatcherAssert.assertThat(frags, is(new String[]{"[CH]1CCCCC1"})); } - @Test - void testEF3Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"C1CCCCC1"})); - } - + /** + * Tests fragmentation of ethylbenzene with unsaturated fragments. + * Expects "[c]1ccccc1" as a fragment, representing the phenyl radical. 
+ */ @Test void testEF4Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); @@ -115,16 +126,10 @@ void testEF4Unsaturated() throws Exception { MatcherAssert.assertThat(frags, is(new String[]{"[c]1ccccc1"})); } - @Test - void testEF4Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); - } - - + /** + * Tests fragmentation of diphenylmethane with unsaturated fragments. + * Expects "[CH2]c1ccccc1" (benzyl radical) and "[c]1ccccc1" (phenyl radical). + */ @Test void testEF5Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); @@ -136,18 +141,10 @@ void testEF5Unsaturated() throws Exception { Assertions.assertEquals(2, fragmenterUnsaturated.getFragmentsAsContainers().length); } - @Test - void testEF5Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), hasItems("c1ccc(cc1)C", "c1ccccc1")); - Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); - Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); - } - - + /** + * Tests fragmentation of biphenyl with unsaturated fragments. + * Expects only "[c]1ccccc1" as the fragment. 
+ */ @Test void testEF6Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); @@ -161,18 +158,14 @@ void testEF6Unsaturated() throws Exception { } - @Test - void testEF6Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); - - Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); - Assertions.assertEquals(1, fragmenterSaturated.getFragmentsAsContainers().length); - } - + /** + * Tests a complex molecule with unsaturated fragments. + * Expected fragments include phenyl and various complex radical fragments. + * Note: The number of fragments (26) is higher than the saturated version (25) + * because unsaturated fragments explicitly show radical centers, which can lead to + * unique SMILES for fragments that would be canonicalized identically when saturated + * due to differences in hydrogen counts or explicit radical representation. + */ @Test void testEF7Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); @@ -192,43 +185,115 @@ void testEF7Unsaturated() throws Exception { MatcherAssert.assertThat(frags, hasItems("[c]1ccccc1", "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[CH2]C1CCC([CH2])(c2ccccc2)C1")); } + // --- Hydrogen-Saturated Fragments Tests --- + + /** + * Tests that a simple linear alkane (propane) with no splittable bonds + * yields no fragments when using the hydrogen-saturated setting. 
+ */ @Test - void testEF7Saturated() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); + void testEF1Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCC"); fragmenterSaturated.generateFragments(mol); - List frags = Arrays.asList(fragmenterSaturated.getFragments()); + String[] frags = fragmenterSaturated.getFragments(); + Assertions.assertEquals(0, frags.length); + } + + /** + * Tests that a simple cycloalkane (cyclopentane) with no non-ring, non-terminal bonds + * yields no fragments when using the hydrogen-saturated setting. + */ + @Test + void testEF2Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + Assertions.assertEquals(0, frags.length); + } + + /** + * Tests fragmentation of methylcyclohexane with hydrogen-saturated fragments. + * Expects "C1CCCCC1" as a fragment, representing cyclohexane. + */ + @Test + void testEF3Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"C1CCCCC1"})); + } + + /** + * Tests fragmentation of ethylbenzene with hydrogen-saturated fragments. + * Expects "c1ccccc1" as a fragment, representing benzene. + */ + @Test + void testEF4Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertEquals(25, frags.size()); + org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); + } + /** + * Tests fragmentation of diphenylmethane with hydrogen-saturated fragments. 
+ * Expects "c1ccc(cc1)C" (toluene) and "c1ccccc1" (benzene). + * Note: "c1ccc(cc1)C" might also be canonicalized as "Cc1ccccc1". + */ + @Test + void testEF5Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + Assertions.assertNotNull(frags); + org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), hasItems("c1ccc(cc1)C", "c1ccccc1")); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); - Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); - - org.hamcrest.MatcherAssert.assertThat(frags, hasItems("c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2")); + Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); } + /** + * Tests fragmentation of biphenyl with hydrogen-saturated fragments. + * Expects only "c1ccccc1" (benzene) as the fragment. + */ @Test - void testMinSize() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - fragmenterSaturated.setMinimumFragmentSize(6); + void testEF6Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertEquals(1, frags.length); - Assertions.assertTrue(frags[0].equals("C1CCCCC1")); + org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); + + Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); + Assertions.assertEquals(1, fragmenterSaturated.getFragmentsAsContainers().length); } + /** + * Tests a complex molecule with hydrogen-saturated fragments. + * Expected fragments include benzene and various complex saturated fragments. 
+ * Compared to the unsaturated version, some fragments might canonicalize to the same SMILES + * after saturation, resulting in a slightly lower count (25 vs 26). + */ @Test - void testEqualityOfSmilesAndContainers() throws Exception { - SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); - IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); + void testEF7Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); fragmenterSaturated.generateFragments(mol); - List smilesFrags = Arrays.asList(fragmenterSaturated.getFragments()); - IAtomContainer[] containerFrags = fragmenterSaturated.getFragmentsAsContainers(); - for (IAtomContainer frag : containerFrags) { - org.hamcrest.MatcherAssert.assertThat(smilesFrags, hasItems(smilesGenerator.create(frag))); - } + List frags = Arrays.asList(fragmenterSaturated.getFragments()); + Assertions.assertNotNull(frags); + Assertions.assertEquals(25, frags.size()); + + Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); + Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); + + MatcherAssert.assertThat(frags, hasItems("c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2")); } + // --- R-Group Saturated Fragments Tests --- + + /** + * Tests fragmentation of ethylcyclohexane with R-group saturated fragments. + * Expects "*C1CCCCC1" as a fragment, representing the cyclohexyl group with an R-atom. + */ @Test void testEF3RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); @@ -238,6 +303,10 @@ void testEF3RestSaturated() throws Exception { MatcherAssert.assertThat(frags, is(new String[]{"*C1CCCCC1"})); } + /** + * Tests fragmentation of diphenylmethane with R-group saturated fragments. + * Expects "*c1ccccc1" (phenyl with R-atom) and "*Cc1ccccc1" (benzyl with R-atom). 
+ */ @Test void testEF5RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); @@ -248,6 +317,10 @@ void testEF5RestSaturated() throws Exception { Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); } + /** + * Tests fragmentation of biphenyl with R-group saturated fragments. + * Expects only "*c1ccccc1" as the fragment. + */ @Test void testEF6RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); @@ -258,14 +331,272 @@ void testEF6RestSaturated() throws Exception { Assertions.assertEquals(1, fragmenterRestSaturated.getFragmentsAsContainers().length); } + /** + * Tests a complex molecule with R-group saturated fragments. + * The number of fragments can differ from hydrogen-saturated or unsaturated versions + * due to the explicit R-group notation affecting canonical SMILES. + */ @Test void testEF7RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); fragmenterRestSaturated.generateFragments(mol); List frags = Arrays.asList(fragmenterRestSaturated.getFragments()); Assertions.assertNotNull(frags); - // TODO: Check per hand if the fragmentation results in 28 unique fragments + // There two additional fragments in comparison to the hydrogen saturated version because there are following fragments: + // *C1CCC(*)(*)C1 + // *C1C=CC=C1 + // these fragments only differ in size compared to their respective hydrogen saturated version beacuse the R-Group represented by '*' + // is also counted as a valid atom in comparison to implicit hydrogens. So these are valid fragments with size 6. 
Assertions.assertEquals(28, fragmenterRestSaturated.getFragmentsAsContainers().length); MatcherAssert.assertThat(frags, hasItems("*c1ccccc1", "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", "*C1CCC(*)(c2ccccc2)C1")); } -} + + // --- General Fragmenter Tests --- + + /** + * Tests the minimum fragment size setting. + * With a minimum size of 6, only the larger ring (cyclohexane) should be returned + * from a molecule composed of a cyclopentane and a cyclohexane connected by a single bond. + */ + @Test + void testMinSize() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); + fragmenterSaturated.setMinimumFragmentSize(6); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertEquals(1, frags.length); + Assertions.assertEquals("C1CCCCC1", frags[0]); + } + + /** + * Tests that lowering the minimum fragment size allows smaller fragments to be returned. + * For "C1CCCC1C2CCCCC2", setting min size to 5 should yield both rings. + */ + @Test + void testMinSizeLowered() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); + ExhaustiveFragmenter localFragmenter = new ExhaustiveFragmenter(); + localFragmenter.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + localFragmenter.setMinimumFragmentSize(5); + localFragmenter.generateFragments(mol); + String[] frags = localFragmenter.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertEquals(2, frags.length); + MatcherAssert.assertThat(Arrays.asList(frags), hasItems("C1CCCCC1", "C1CCCC1")); + } + + /** + * Verifies that the SMILES representations obtained from fragments match + * the SMILES generated directly from their corresponding {@link IAtomContainer} objects. 
+ */ + @Test + void testEqualityOfSmilesAndContainers() throws Exception { + SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); // Phenylalanine + fragmenterSaturated.generateFragments(mol); + List smilesFrags = Arrays.asList(fragmenterSaturated.getFragments()); + IAtomContainer[] containerFrags = fragmenterSaturated.getFragmentsAsContainers(); + for (IAtomContainer frag : containerFrags) { + MatcherAssert.assertThat(smilesFrags, hasItems(smilesGenerator.create(frag))); + } + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * for a linear alkane (propane), which should have no splittable bonds. + */ + @Test + void testGetSplittableBondsLinearMolecule() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCC"); // Propane + IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + Assertions.assertEquals(0, splittableBonds.length); + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * for a cyclic alkane (cyclopentane), which should have no splittable bonds (all bonds are in a ring). + */ + @Test + void testGetSplittableBondsCyclicMolecule() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); // Cyclopentane + IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + Assertions.assertEquals(0, splittableBonds.length); + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * for ethylbenzene, which should have one splittable bond (the bond between the phenyl and ethyl groups). 
+ */ + @Test + void testGetSplittableBondsBenzeneWithSideChain() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); // Ethylbenzene + IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + Assertions.assertEquals(1, splittableBonds.length); + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * for biphenyl, which should have one splittable bond (the bond connecting the two phenyl rings). + */ + @Test + void testGetSplittableBondsBiphenyl() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); // Biphenyl + IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + Assertions.assertEquals(1, splittableBonds.length); + } + + /** + * Tests the internal helper method `generateSubset` which creates subsets + * based on the bit representation of an index. + * This ensures the combinatorial generation of bond subsets works correctly. + */ + @Test + void testGenerateSubset() { + int[] nums = new int[]{10, 20, 30, 40}; + + // index = 1 (0001) -> {nums[0]} + Assertions.assertArrayEquals(new int[]{10}, ExhaustiveFragmenter.generateSubset(1, nums)); + + // index = 2 (0010) -> {nums[1]} + Assertions.assertArrayEquals(new int[]{20}, ExhaustiveFragmenter.generateSubset(2, nums)); + + // index = 3 (0011) -> {nums[0], nums[1]} + Assertions.assertArrayEquals(new int[]{10, 20}, ExhaustiveFragmenter.generateSubset(3, nums)); + + // index = 4 (0100) -> {nums[2]} + Assertions.assertArrayEquals(new int[]{30}, ExhaustiveFragmenter.generateSubset(4, nums)); + + // index = 5 (0101) -> {nums[0], nums[2]} + Assertions.assertArrayEquals(new int[]{10, 30}, ExhaustiveFragmenter.generateSubset(5, nums)); + + // index = 7 (0111) -> {nums[0], nums[1], nums[2]} + Assertions.assertArrayEquals(new int[]{10, 20, 30}, ExhaustiveFragmenter.generateSubset(7, nums)); + + // index = 15 (1111) -> {nums[0], nums[1], nums[2], nums[3]} + Assertions.assertArrayEquals(new 
int[]{10, 20, 30, 40}, ExhaustiveFragmenter.generateSubset(15, nums)); + } + + /** + * Tests the functionality of providing a custom SmilesGenerator to the ExhaustiveFragmenter. + * This test uses a SmilesGenerator that does NOT use aromatic symbols, expecting kekulized SMILES. + */ + @Test + void testCustomSmilesGenerator() throws Exception { + SmilesGenerator customSmilesGen = new SmilesGenerator(SmiFlavor.Unique); // No SmiFlavor.UseAromaticSymbols + ExhaustiveFragmenter customFragmenter = new ExhaustiveFragmenter( + customSmilesGen, 6, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); // Diphenylmethane + customFragmenter.generateFragments(mol); + String[] frags = customFragmenter.getFragments(); + + Assertions.assertNotNull(frags); + MatcherAssert.assertThat(Arrays.asList(frags), hasItems("C=1C=CC=CC1", "C=1C=CC(=CC1)C")); + Assertions.assertEquals(2, frags.length); + } + + /** + * Tests the setExclusiveMaxTreeDepth method using 1,4-dibutylbenzene. + * This molecule has two splittable bonds (the bonds connecting the butyl chains to the benzene ring). + * By varying `exclusiveMaxTreeDepth`, we can observe how the number of generated fragments changes. + * + *

+     * Molecule: 1,4-dibutylbenzene (CCCCc1ccc(CCCC)cc1)
+     * Splittable bonds: 3 (the two C-C bonds connecting the butyl chains to the ring).
+     * Fragmenter setup: minFragSize = 4 (to include butyl and benzene fragments), hydrogen-saturated fragments.
+     *
+     * Expected fragments for different exclusiveMaxTreeDepth settings:
+     *
+     * 1.  exclusiveMaxTreeDepth = 1 (allows 0 simultaneous cuts):
+     * - No fragments generated from bond cleavages, because there are no splits allowed as max tree depth is exclusive.
+     * - Expected fragments: 0
+     *
+     * 2.  exclusiveMaxTreeDepth = 2 (allows up to 1 simultaneous cut):
+     * - Considers all subsets of splittable bonds of size 1.
+     * - Cleaving one butyl-benzene bond yields:
+     * a) A butyl chain (canonical SMILES: "CCCC")
+     * b) A butylbenzene fragment (canonical SMILES: "CCCCc1ccccc1")
+     * - Expected unique fragments: 4 (
+     * c1ccc(cc1)CCCC
+     * c1cc(ccc1C)CCCC
+     * c1cc(ccc1CC)CCCC
+     * CCCC
+     * )
+     *
+     * 3.  exclusiveMaxTreeDepth = 3 (allows up to 2 simultaneous cuts):
+     * - Considers all subsets of splittable bonds of size 1 and 2.
+     * - Includes fragments from 1-cut operations, plus fragments from 2-cut operations:
+     * - Expected unique fragments: 10 (
+     * c1ccc(cc1)C
+     * c1ccc(cc1)CC
+     * c1ccc(cc1)CCCC
+     * c1cc(ccc1C)C
+     * c1cc(ccc1C)CC
+     * c1cc(ccc1C)CCCC
+     * c1cc(ccc1CC)CC
+     * c1cc(ccc1CC)CCCC
+     * c1ccccc1
+     * CCCC
+     * )
+     *
+     * 4.  exclusiveMaxTreeDepth = 4 (allows up to 3 simultaneous cuts):
+     * - Since there are only 2 splittable bonds, allowing up to 3 cuts (or more) will yield
+     * the same set of fragments as allowing up to 2 cuts.
+     * - Expected unique fragments: 10 (
+     * c1ccc(cc1)C
+     * c1ccc(cc1)CC
+     * c1ccc(cc1)CCCC
+     * c1cc(ccc1C)C
+     * c1cc(ccc1C)CC
+     * c1cc(ccc1C)CCCC
+     * c1cc(ccc1CC)CC
+     * c1cc(ccc1CC)CCCC
+     * c1ccccc1
+     * CCCC
+     * )
+     * 
+ */ + @Test + void testSetExclusiveMaxTreeDepth() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCCCc1ccc(CCCC)cc1"); + + // Define a standard SmilesGenerator for fragmenter instantiation + SmilesGenerator standardSmilesGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); + + ExhaustiveFragmenter localFragmenter; + + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + localFragmenter.setExclusiveMaxTreeDepth(1); + localFragmenter.generateFragments(mol); + String[] fragsDepth1 = localFragmenter.getFragments(); + Assertions.assertEquals(0, fragsDepth1.length, + "Expected 0 fragments when exclusiveMaxTreeDepth is 1 (allows 0 cuts) for 1,4-dibutylbenzene"); + + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + localFragmenter.setExclusiveMaxTreeDepth(2); + localFragmenter.generateFragments(mol); + String[] fragsDepth2 = localFragmenter.getFragments(); + Assertions.assertEquals(4, fragsDepth2.length, + "Expected 4 fragments when exclusiveMaxTreeDepth is 2 (allows up to 1 cut)"); + MatcherAssert.assertThat(Arrays.asList(fragsDepth2), + hasItems("CCCC", "c1ccc(cc1)CCCC")); + + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + localFragmenter.setExclusiveMaxTreeDepth(3); + localFragmenter.generateFragments(mol); + String[] fragsDepth3 = localFragmenter.getFragments(); + Assertions.assertEquals(10, fragsDepth3.length, + "Expected 10 fragments when exclusiveMaxTreeDepth is 3 (allows up to 2 cuts)"); + MatcherAssert.assertThat(Arrays.asList(fragsDepth3), + hasItems("CCCC", "c1ccc(cc1)CCCC", "c1ccccc1")); + + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + localFragmenter.setExclusiveMaxTreeDepth(4); + 
localFragmenter.generateFragments(mol); + String[] fragsDepth4 = localFragmenter.getFragments(); + Assertions.assertEquals(10, fragsDepth4.length, + "Expected 10 fragments when exclusiveMaxTreeDepth is 4 (allows up to 3 cuts), same as max 2 cuts"); + MatcherAssert.assertThat(Arrays.asList(fragsDepth4), + hasItems("CCCC", "c1ccc(cc1)CCCC", "c1ccccc1")); + } +} \ No newline at end of file From 65fef6f02cf6b836bebe959c3e1f217082ef2e99 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 4 Jun 2025 14:16:18 +0200 Subject: [PATCH 21/42] small test documentation fix --- .../openscience/cdk/fragment/ExhaustiveFragmenterTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 4f270ae2995..59a2462b921 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -540,8 +540,8 @@ void testCustomSmilesGenerator() throws Exception { * ) * * 4. exclusiveMaxTreeDepth = 4 (allows up to 3 simultaneous cuts): - * - Since there are only 2 splittable bonds, allowing up to 3 cuts (or more) will yield - * the same set of fragments as allowing up to 2 cuts. + * - Since there are only combinations of 2 splittable bonds that allow a fragment size bigger the 6, allowing up + * to 3 cuts (or more) will yield the same set of fragments as allowing up to 2 cuts. 
* - Expected unique fragments: 10 ( * c1ccc(cc1)C * c1ccc(cc1)CC From 53633e70161f50906112a3a37bcac68ab5d98178 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 4 Jun 2025 14:41:24 +0200 Subject: [PATCH 22/42] fixed docs and corrected calculation for HashSet capacity --- .../openscience/cdk/fragment/ExhaustiveFragmenter.java | 2 +- .../cdk/fragment/ExhaustiveFragmenterTest.java | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 474b1941070..8fb53200a40 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -453,7 +453,7 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bondsToSplit) { Set> splitBondAtomPairs = new HashSet<>(); for (IBond bond : bondsToSplit) { - Set pair = new HashSet<>(2); + Set pair = new HashSet<>((int) Math.ceil(2 / (double) 0.75f)); pair.add(bond.getAtom(0)); pair.add(bond.getAtom(1)); splitBondAtomPairs.add(pair); diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 59a2462b921..9c30c285ce3 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -175,7 +175,7 @@ void testEF7Unsaturated() throws Exception { // There is one additional fragment in comparison to the saturated version because there are following fragments: // [C]1CCC([CH2])C1 // [CH2][C]1C[CH]CC1 - // these fragments only differ in the number of hydrogens bonded to 
their respective carbon atoms. So these + // these fragments only differ in the number of hydrogen's bonded to their respective carbon atoms. So these // fragments would show up as one if saturated. Assertions.assertEquals(26, frags.size()); @@ -334,7 +334,7 @@ void testEF6RestSaturated() throws Exception { /** * Tests a complex molecule with R-group saturated fragments. * The number of fragments can differ from hydrogen-saturated or unsaturated versions - * due to the explicit R-group notation affecting canonical SMILES. + * due to the R-group affecting the size of the fragments. */ @Test void testEF7RestSaturated() throws Exception { @@ -497,12 +497,11 @@ void testCustomSmilesGenerator() throws Exception { /** * Tests the setExclusiveMaxTreeDepth method using 1,4-dibutylbenzene. - * This molecule has two splittable bonds (the bonds connecting the butyl chains to the benzene ring). * By varying `exclusiveMaxTreeDepth`, we can observe how the number of generated fragments changes. * *
      * Molecule: 1,4-dibutylbenzene (CCCCc1ccc(CCCC)cc1)
-     * Splittable bonds: 3 (the two C-C bonds connecting the butyl chains to the ring).
+     * Splittable bonds: 6 (the three C-C bonds for each butyl chain, from the ring up to the second-to-last C-atom).
      * Fragmenter setup: minFragSize = 4 (to include butyl and benzene fragments), hydrogen-saturated fragments.
      *
      * Expected fragments for different exclusiveMaxTreeDepth settings:
@@ -513,9 +512,6 @@ void testCustomSmilesGenerator() throws Exception {
      *
      * 2.  exclusiveMaxTreeDepth = 2 (allows up to 1 simultaneous cut):
      * - Considers all subsets of splittable bonds of size 1.
-     * - Cleaving one butyl-benzene bond yields:
-     * a) A butyl chain (canonical SMILES: "CCCC")
-     * b) A butylbenzene fragment (canonical SMILES: "CCCCc1ccccc1")
      * - Expected unique fragments: 4 (
      * c1ccc(cc1)CCCC
      * c1cc(ccc1C)CCCC

From 0fce4aecffa8d82be47b723f1262ab7feb66dc0b Mon Sep 17 00:00:00 2001
From: ToLeWeiss 
Date: Wed, 9 Jul 2025 18:02:46 +0200
Subject: [PATCH 23/42] WIP fixing issues in documentation and conformities in
 the code

---
 .../cdk/fragment/ExhaustiveFragmenter.java    |  82 +++++---
 .../fragment/ExhaustiveFragmenterTest.java    | 196 +++++++++++++++---
 2 files changed, 215 insertions(+), 63 deletions(-)

diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
index 8fb53200a40..c16a3970415 100644
--- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
+++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
@@ -1,4 +1,5 @@
-/* Copyright (C) 2010  Rajarshi Guha 
+/* Copyright (C) 2025  Rajarshi Guha 
+ *                     Tom Weiß 
  *
  * Contact: cdk-devel@lists.sourceforge.net
  *
@@ -51,22 +52,24 @@
  * Non-terminal meaning bonds connected to more than one single heavy atom (non-terminal bonds).
  * By default:
  * 
    - *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) don't get returned.
  • + *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) are not returned.
  • *
  • Fragments are returned with open valences, where a bond has been split.
  • + *
  • The fragmentation splits at most 31 bonds in one run.
  • + *
  • The SMILES code of the fragments is generated with {@link SmiFlavor#Unique} and {@link SmiFlavor#UseAromaticSymbols}
  • *
- * However, users can modify these settings. + * However, users can modify these settings, with the exception, that the maximum tree depth can not be higher than 31 + * (Java's limitation caused by integer indexing). *

* Fragment Deduplication: * The `ExhaustiveFragmenter` uses canonical SMILES strings for internal deduplication of generated fragments. * This means that after a fragment is generated, its unique SMILES representation is computed - * (using {@link SmilesGenerator} with {@code SmiFlavor.Unique} and {@code SmiFlavor.UseAromaticSymbols}). + * (using the default or user specified {@link SmilesGenerator}). * If a fragment with the same canonical SMILES has already been generated and stored, the new fragment * is considered a duplicate and is not added to the results. *

* This deduplication strategy is particularly important when considering the {@link Saturation} setting: *

    - *
  • If fragments are {@link Saturation#HYDROGEN_SATURATED_FRAGMENTS} or - * {@link Saturation#REST_SATURATED_FRAGMENTS}, the saturation process might lead to a canonical SMILES + *
  • If fragments are {@link Saturation#HYDROGEN_SATURATED_FRAGMENTS}, the saturation process might lead to a canonical SMILES * that is identical to a fragment obtained via a different bond cleavage, or a fragment that appears * different due to explicit hydrogen representation but becomes identical when canonicalized.
  • *
  • For example, an unsaturated fragment like `[CH]1CCCCC1` (cyclohexyl radical) might deduplicate @@ -79,21 +82,16 @@ *

    * Example Usage: *

    {@code
    - * import org.openscience.cdk.DefaultChemObjectBuilder;
    - * import org.openscience.cdk.interfaces.IAtomContainer;
    - * import org.openscience.cdk.smiles.SmilesParser;
    - *
      * // By default, returns unsaturated fragments with a minimum size of 6 atoms
      * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter();
    - * SmilesParser smiParser = new SmilesParser(DefaultChemObjectBuilder.getInstance());
    - * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C1=CC=CC=C1");  // Cyclopentylbenzene molecule
    + * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
    + * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C1=CC=CC=C1");  // Cyclopentylbenzene
      * fragmenter.generateFragments(mol);
      *
      * // Retrieve SMILES representations of fragments
      * String[] smilesFragments = fragmenter.getFragments();
      * // Example Result (depending on exact fragmentation points and min size):
      * // ["C1CCCCC1", "c1ccccc1"]
    - * // Note: Actual fragments might vary based on chosen saturation setting and bond definitions.
      *
      * // Retrieve AtomContainer representations of fragments
      * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers();
    @@ -123,10 +121,11 @@ public enum Saturation {
             /**
              * Fragments will be saturated with R atoms.
              */
    -        REST_SATURATED_FRAGMENTS,
    +        R_SATURATED_FRAGMENTS,
     
             /**
    -         * Fragments will be returned in their unsaturated form (no additional hydrogen atoms).
    +         * Fragments will be returned in their unsaturated form (no additional hydrogen atoms). The unsaturated atoms
    +         * are the atoms of the splitted bonds.
              */
             UNSATURATED_FRAGMENTS
         }
    @@ -134,10 +133,11 @@ public enum Saturation {
         private static final int DEFAULT_MIN_FRAG_SIZE = 6;
         private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS;
         private static final SmilesGenerator DEFAULT_SMILES_GENERATOR = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols);
    +    private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = Integer.SIZE - 1;
     
         private final Map fragMap;
         private final SmilesGenerator smilesGenerator;
    -    private int exclusiveMaxTreeDepth = Integer.SIZE;
    +    private int inclusiveMaxTreeDepth;
         private int minFragSize;
         private Saturation saturationSetting;
         private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class);
    @@ -151,7 +151,7 @@ public enum Saturation {
          * 
*/ public ExhaustiveFragmenter() { - this(DEFAULT_SMILES_GENERATOR, DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION); + this(DEFAULT_SMILES_GENERATOR, DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION, DEFAULT_INCLUSIVE_MAX_TREE_DEPTH); } /** @@ -162,7 +162,7 @@ public ExhaustiveFragmenter() { * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. */ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { - this(DEFAULT_SMILES_GENERATOR, minFragSize, saturationSetting); + this(DEFAULT_SMILES_GENERATOR, minFragSize, saturationSetting, DEFAULT_INCLUSIVE_MAX_TREE_DEPTH); } /** @@ -173,7 +173,7 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). */ public ExhaustiveFragmenter(int minFragSize) { - this(DEFAULT_SMILES_GENERATOR, minFragSize, DEFAULT_SATURATION); + this(DEFAULT_SMILES_GENERATOR, minFragSize, DEFAULT_SATURATION, DEFAULT_INCLUSIVE_MAX_TREE_DEPTH); } /** @@ -185,19 +185,26 @@ public ExhaustiveFragmenter(int minFragSize) { * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. */ - public ExhaustiveFragmenter(SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting) { + public ExhaustiveFragmenter(SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting, int inclusiveMaxTreeDepth) { this.minFragSize = minFragSize; this.saturationSetting = saturationSetting; this.fragMap = new HashMap<>(); this.smilesGenerator = smilesGenerator; + this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; } /** - * Sets the minimum allowed fragment size. + * Sets the minimum allowed fragment size. This has to be greater than zero. 
* * @param minFragSize Minimum number of atoms in a valid fragment. + * @throws CDKException If the fragment size is less than or equal to zero. */ - public void setMinimumFragmentSize(int minFragSize) { + public void setMinimumFragmentSize(int minFragSize) throws CDKException { + if (minFragSize <= 0) { + throw new CDKException( + "Minimum fragment size must be a positive integer (>= 1). Provided: " + minFragSize + ); + } this.minFragSize = minFragSize; } @@ -215,16 +222,23 @@ public void setSaturationSetting(Saturation saturationSetting) { * fragmentation event. This value is exclusive, meaning if set to `3`, a maximum of `2` bonds * can be split simultaneously. *

- * Must be within the range {@code 0 < exclusiveMaxTreeDepth < 32}. This limit is important + * Must be within the range {@code 0 < inclusiveMaxTreeDepth < 32}. This limit is important * due to the combinatorial explosion of fragments (which scales with 2^n, where n is the * number of splittable bonds) and Java's use of 32-bit integers for indexing. * Setting a lower limit can help manage computational resources for larger molecules. *

* - * @param exclusiveMaxTreeDepth The exclusive maximum number of bonds that can be split in one atom container. + * @param inclusiveMaxTreeDepth The exclusive maximum number of bonds that can be split in one atom container. + * @throws CDKException If the given inclusive max tree depth is less or equal then zero or greater than 31 + * caused by Java's integer indexing limit */ - public void setExclusiveMaxTreeDepth(int exclusiveMaxTreeDepth) { - this.exclusiveMaxTreeDepth = exclusiveMaxTreeDepth; + public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) throws CDKException { + if (inclusiveMaxTreeDepth <= 0 || inclusiveMaxTreeDepth >= 32) { + throw new CDKException( + "Inclusive max tree depth must be grater then zero and smaller then 32. Provided: " + inclusiveMaxTreeDepth + ); + } + this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; } /** @@ -277,7 +291,7 @@ private void run(IAtomContainer atomContainer) throws CDKException { int subsetSize = subset.length; // Skip subsets exceeding the allowed depth - if (subsetSize >= this.exclusiveMaxTreeDepth) { + if (subsetSize >= this.inclusiveMaxTreeDepth) { continue; } @@ -478,7 +492,7 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond while (!dfsStack.isEmpty()) { IAtom origCurrAtom = dfsStack.pop(); - IAtom copiedCurrentAtom = origToCpyMap.get(origCurrAtom); + IAtom cpyCurrentAtom = origToCpyMap.get(origCurrAtom); for (IBond origBond : origMol.getConnectedBondsList(origCurrAtom)) { IAtom origNbor = origBond.getOther(origCurrAtom); @@ -492,13 +506,13 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond visitedOriginalAtoms[origMol.indexOf(origNbor)] = true; IAtom cpyNbor = copyAtom(origNbor, fragmentContainer); origToCpyMap.put(origNbor, cpyNbor); - fragmentContainer.addBond(copiedCurrentAtom.getIndex(), cpyNbor.getIndex(), + fragmentContainer.addBond(cpyCurrentAtom.getIndex(), cpyNbor.getIndex(), origBond.getOrder(), origBond.getStereo()); 
dfsStack.push(origNbor); } else { IAtom cpyNbor = origToCpyMap.get(origNbor); - if (fragmentContainer.getBond(copiedCurrentAtom, cpyNbor) == null) { - fragmentContainer.addBond(copiedCurrentAtom.getIndex(), cpyNbor.getIndex(), + if (fragmentContainer.getBond(cpyCurrentAtom, cpyNbor) == null) { + fragmentContainer.addBond(cpyCurrentAtom.getIndex(), cpyNbor.getIndex(), origBond.getOrder(), origBond.getStereo()); // Add bond only if not already present } @@ -506,8 +520,8 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond } else { // This bond is being cut. The origCurrAtom is part of the fragment being built. // Increment the cleavage count for its corresponding copied atom. - splitCountsCpyAtoms.put(copiedCurrentAtom, - splitCountsCpyAtoms.getOrDefault(copiedCurrentAtom, 0) + 1); + splitCountsCpyAtoms.put(cpyCurrentAtom, + splitCountsCpyAtoms.getOrDefault(cpyCurrentAtom, 0) + 1); } } } @@ -524,7 +538,7 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond int newImplHCount = (currImplHCount == null ? 
0 : currImplHCount) + bondsCutCount; atom.setImplicitHydrogenCount(newImplHCount); break; - case REST_SATURATED_FRAGMENTS: + case R_SATURATED_FRAGMENTS: addRAtoms(atom, bondsCutCount, fragmentContainer); break; } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 9c30c285ce3..265e53f9368 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -1,5 +1,5 @@ -/* - * Copyright (C) 2010 Rajarshi Guha +/* Copyright (C) 2025 Rajarshi Guha + * Tom Weiß * * Contact: cdk-devel@lists.sourceforge.net * @@ -26,6 +26,7 @@ import org.openscience.cdk.DefaultChemObjectBuilder; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.silent.SilentChemObjectBuilder; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; @@ -71,7 +72,7 @@ static void setup() { fragmenterUnsaturated = new ExhaustiveFragmenter(); fragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); fragmenterRestSaturated = new ExhaustiveFragmenter(); - fragmenterRestSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.REST_SATURATED_FRAGMENTS); + fragmenterRestSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); } @@ -136,7 +137,12 @@ void testEF5Unsaturated() throws Exception { fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), hasItems("[CH2]c1ccccc1", "[c]1ccccc1")); + 
MatcherAssert.assertThat(Arrays.asList(frags), + hasItems( + "[CH2]c1ccccc1", + "[c]1ccccc1" + ) + ); Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterUnsaturated.getFragmentsAsContainers().length); } @@ -182,7 +188,13 @@ void testEF7Unsaturated() throws Exception { Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(26, fragmenterUnsaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, hasItems("[c]1ccccc1", "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[CH2]C1CCC([CH2])(c2ccccc2)C1")); + MatcherAssert.assertThat(frags, + hasItems( + "[c]1ccccc1", + "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", + "[CH2]C1CCC([CH2])(c2ccccc2)C1" + ) + ); } // --- Hydrogen-Saturated Fragments Tests --- @@ -247,7 +259,12 @@ void testEF5Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), hasItems("c1ccc(cc1)C", "c1ccccc1")); + org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), + hasItems( + "c1ccc(cc1)C", + "c1ccccc1" + ) + ); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); } @@ -285,7 +302,13 @@ void testEF7Saturated() throws Exception { Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, hasItems("c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2")); + MatcherAssert.assertThat(frags, + hasItems( + "c1ccccc1", + "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", + "c1ccc(cc1)C2(C)CCC(C)C2" + ) + ); } // --- R-Group Saturated Fragments Tests --- @@ -313,7 +336,12 @@ void testEF5RestSaturated() throws Exception { 
fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), hasItems("*c1ccccc1", "*Cc1ccccc1")); + MatcherAssert.assertThat(Arrays.asList(frags), + hasItems( + "*c1ccccc1", + "*Cc1ccccc1" + ) + ); Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -348,7 +376,13 @@ void testEF7RestSaturated() throws Exception { // these fragments only differ in size compared to their respective hydrogen saturated version beacuse the R-Group represented by '*' // is also counted as a valid atom in comparison to implicit hydrogens. So these are valid fragments with size 6. Assertions.assertEquals(28, fragmenterRestSaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, hasItems("*c1ccccc1", "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", "*C1CCC(*)(c2ccccc2)C1")); + MatcherAssert.assertThat(frags, + hasItems( + "*c1ccccc1", + "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", + "*C1CCC(*)(c2ccccc2)C1" + ) + ); } // --- General Fragmenter Tests --- @@ -383,7 +417,12 @@ void testMinSizeLowered() throws Exception { String[] frags = localFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(2, frags.length); - MatcherAssert.assertThat(Arrays.asList(frags), hasItems("C1CCCCC1", "C1CCCC1")); + MatcherAssert.assertThat(Arrays.asList(frags), + hasItems( + "C1CCCCC1", + "C1CCCC1" + ) + ); } /** @@ -485,13 +524,18 @@ void testGenerateSubset() { void testCustomSmilesGenerator() throws Exception { SmilesGenerator customSmilesGen = new SmilesGenerator(SmiFlavor.Unique); // No SmiFlavor.UseAromaticSymbols ExhaustiveFragmenter customFragmenter = new ExhaustiveFragmenter( - customSmilesGen, 6, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + customSmilesGen, 6, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); IAtomContainer mol = 
smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); // Diphenylmethane customFragmenter.generateFragments(mol); String[] frags = customFragmenter.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), hasItems("C=1C=CC=CC1", "C=1C=CC(=CC1)C")); + MatcherAssert.assertThat(Arrays.asList(frags), + hasItems( + "C=1C=CC=CC1", + "C=1C=CC(=CC1)C" + ) + ); Assertions.assertEquals(2, frags.length); } @@ -561,38 +605,132 @@ void testSetExclusiveMaxTreeDepth() throws Exception { ExhaustiveFragmenter localFragmenter; - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - localFragmenter.setExclusiveMaxTreeDepth(1); + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter.setInclusiveMaxTreeDepth(1); localFragmenter.generateFragments(mol); String[] fragsDepth1 = localFragmenter.getFragments(); Assertions.assertEquals(0, fragsDepth1.length, - "Expected 0 fragments when exclusiveMaxTreeDepth is 1 (allows 0 cuts) for 1,4-dibutylbenzene"); + "Expected 0 fragments when inclusiveMaxTreeDepth is 0 (allows 0 cuts) for 1,4-dibutylbenzene"); - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - localFragmenter.setExclusiveMaxTreeDepth(2); + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter.setInclusiveMaxTreeDepth(2); localFragmenter.generateFragments(mol); String[] fragsDepth2 = localFragmenter.getFragments(); Assertions.assertEquals(4, fragsDepth2.length, - "Expected 4 fragments when exclusiveMaxTreeDepth is 2 (allows up to 1 cut)"); + "Expected 4 fragments when inclusiveMaxTreeDepth is 1 (allows up to 1 cut)"); MatcherAssert.assertThat(Arrays.asList(fragsDepth2), - 
hasItems("CCCC", "c1ccc(cc1)CCCC")); - - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - localFragmenter.setExclusiveMaxTreeDepth(3); + hasItems( + "CCCC", + "c1ccc(cc1)CCCC" + ) + ); + + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter.setInclusiveMaxTreeDepth(3); localFragmenter.generateFragments(mol); String[] fragsDepth3 = localFragmenter.getFragments(); Assertions.assertEquals(10, fragsDepth3.length, - "Expected 10 fragments when exclusiveMaxTreeDepth is 3 (allows up to 2 cuts)"); + "Expected 10 fragments when inclusiveMaxTreeDepth is 2 (allows up to 2 cuts)"); MatcherAssert.assertThat(Arrays.asList(fragsDepth3), - hasItems("CCCC", "c1ccc(cc1)CCCC", "c1ccccc1")); - - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - localFragmenter.setExclusiveMaxTreeDepth(4); + hasItems( + "CCCC", + "c1ccc(cc1)CCCC", + "c1ccccc1" + ) + ); + + localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter.setInclusiveMaxTreeDepth(4); localFragmenter.generateFragments(mol); String[] fragsDepth4 = localFragmenter.getFragments(); Assertions.assertEquals(10, fragsDepth4.length, - "Expected 10 fragments when exclusiveMaxTreeDepth is 4 (allows up to 3 cuts), same as max 2 cuts"); + "Expected 10 fragments when inclusiveMaxTreeDepth is 3 (allows up to 3 cuts), same as max 2 cuts"); MatcherAssert.assertThat(Arrays.asList(fragsDepth4), - hasItems("CCCC", "c1ccc(cc1)CCCC", "c1ccccc1")); + hasItems( + "CCCC", + "c1ccc(cc1)CCCC", + "c1ccccc1" + ) + ); + } + + // --- Complementary Molecule Tests --- + + /** + * Tests correct functional group identification on an example molecule with + * a disconnected structure. 
+ * This was not allowed in a previous version. + */ + @Test + void testDisconnectedMolecules() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"); //Sodium edetate + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + MatcherAssert.assertThat( + Arrays.asList(frags), + hasItems( + "O=C([O-])CNCCNCC(=O)[O-]", + "O=C([O-])CNCC(=O)[O-]", + "O=C([O-])CN(C)CCN(C)C", + "O=C([O-])CNCCNC", + "O=C([O-])CN(CC(=O)[O-])CC" + ) + ); + } + + /** + * Testing a bigger molecule + * + * @throws Exception if anything goes wrong + */ + @Test + void testIndexBigMolecule() throws Exception { + SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smiPar.parseSmiles("CC1=C(C(=CC=C1)NC2=CC=CC=C2C" + + "(=O)NC(CCS(=O)C)C(=O)NC(C)C3=CC=C(C=C3)F)C"); //PubChem CID 118705975 + + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + Assertions.assertEquals( + Arrays.asList(frags), + hasItems( + "O=CCNC(=O)c1ccccc1", + "O=C(N)CNC(=O)c1ccccc1N", + "O=C(NC)c1ccccc1N", + "O=C(NCCC)c1ccccc1N", + "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C", + "O=C(N)CCCS(=O)C", + "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C" + ) + ); + } + + + /** + * Testing a bigger molecule + * + * @throws Exception if anything goes wrong + */ + @Test + void testIndexBigMolecule2() throws Exception { + SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + + "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + + "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 + System.out.println(fragmenterSaturated.getSplitableBonds(mol).length); + fragmenterSaturated.generateFragments(mol); + String[] frags = 
fragmenterSaturated.getFragments(); + Assertions.assertEquals( + Arrays.asList(frags), + hasItems( + "O=CCNC(=O)c1ccccc1", + "O=C(N)CNC(=O)c1ccccc1N", + "O=C(NC)c1ccccc1N", + "O=C(NCCC)c1ccccc1N", + "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C", + "O=C(N)CCCS(=O)C", + "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C" + ) + ); } } \ No newline at end of file From 2e50f4f750b8fb0230be1dc3509c52832058727e Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 23 Jul 2025 14:43:53 +0200 Subject: [PATCH 24/42] implemented comments and improved ring detection and aromaticity copying --- .../cdk/fragment/ExhaustiveFragmenter.java | 155 ++++++---- .../fragment/ExhaustiveFragmenterTest.java | 265 +++++++++--------- 2 files changed, 229 insertions(+), 191 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index c16a3970415..75df6151d01 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -24,20 +24,27 @@ package org.openscience.cdk.fragment; import org.openscience.cdk.aromaticity.Aromaticity; +import org.openscience.cdk.atomtype.CDKAtomTypeMatcher; +import org.openscience.cdk.config.AtomTypeFactory; +import org.openscience.cdk.config.atomtypes.AtomTypeReader; import org.openscience.cdk.exception.CDKException; -import org.openscience.cdk.graph.SpanningTree; import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IAtomType; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.interfaces.IPseudoAtom; -import org.openscience.cdk.interfaces.IRingSet; +import org.openscience.cdk.ringsearch.RingSearch; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; import 
org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; +import org.openscience.cdk.tools.manipulator.AtomTypeManipulator; +import java.util.ArrayDeque; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -63,7 +70,7 @@ * Fragment Deduplication: * The `ExhaustiveFragmenter` uses canonical SMILES strings for internal deduplication of generated fragments. * This means that after a fragment is generated, its unique SMILES representation is computed - * (using the default or user specified {@link SmilesGenerator}). + * (using the default or user specified {@link SmilesGenerator}). These SMILES do not encode stereochemistry. * If a fragment with the same canonical SMILES has already been generated and stored, the new fragment * is considered a duplicate and is not added to the results. *

@@ -85,13 +92,13 @@ * // By default, returns unsaturated fragments with a minimum size of 6 atoms * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); - * IAtomContainer mol = smiParser.parseSmiles("C1CCC(C1)C1=CC=CC=C1"); // Cyclopentylbenzene + * IAtomContainer mol = smiParser.parseSmiles("C1CCCCC1C1=CC=CC=C1"); // Cyclopentylbenzene * fragmenter.generateFragments(mol); * * // Retrieve SMILES representations of fragments * String[] smilesFragments = fragmenter.getFragments(); * // Example Result (depending on exact fragmentation points and min size): - * // ["C1CCCCC1", "c1ccccc1"] + * // "[CH]1CCCCC1", "[c]1ccccc1" * * // Retrieve AtomContainer representations of fragments * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers(); @@ -125,7 +132,7 @@ public enum Saturation { /** * Fragments will be returned in their unsaturated form (no additional hydrogen atoms). The unsaturated atoms - * are the atoms of the splitted bonds. + * are the atoms of the split bonds. */ UNSATURATED_FRAGMENTS } @@ -186,22 +193,27 @@ public ExhaustiveFragmenter(int minFragSize) { * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. 
*/ public ExhaustiveFragmenter(SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting, int inclusiveMaxTreeDepth) { - this.minFragSize = minFragSize; + if (saturationSetting == null) { + throw new IllegalArgumentException("The given SaturationSetting can not be null"); + } this.saturationSetting = saturationSetting; - this.fragMap = new HashMap<>(); + if (smilesGenerator == null) { + throw new IllegalArgumentException("The given SmilesGenerator can not be null"); + } this.smilesGenerator = smilesGenerator; - this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; + this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); + this.setMinimumFragmentSize(minFragSize); + this.fragMap = new HashMap<>(); } /** * Sets the minimum allowed fragment size. This has to be greater than zero. * * @param minFragSize Minimum number of atoms in a valid fragment. - * @throws CDKException If the fragment size is less than or equal to zero. */ - public void setMinimumFragmentSize(int minFragSize) throws CDKException { + public void setMinimumFragmentSize(int minFragSize) { if (minFragSize <= 0) { - throw new CDKException( + throw new IllegalArgumentException( "Minimum fragment size must be a positive integer (>= 1). Provided: " + minFragSize ); } @@ -214,6 +226,9 @@ public void setMinimumFragmentSize(int minFragSize) throws CDKException { * @param saturationSetting the saturation mode for generated fragments. */ public void setSaturationSetting(Saturation saturationSetting) { + if (saturationSetting == null) { + throw new IllegalArgumentException("The given SaturationSetting can not be null"); + } this.saturationSetting = saturationSetting; } @@ -229,12 +244,10 @@ public void setSaturationSetting(Saturation saturationSetting) { *

* * @param inclusiveMaxTreeDepth The exclusive maximum number of bonds that can be split in one atom container. - * @throws CDKException If the given inclusive max tree depth is less or equal then zero or greater than 31 - * caused by Java's integer indexing limit */ - public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) throws CDKException { + public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { if (inclusiveMaxTreeDepth <= 0 || inclusiveMaxTreeDepth >= 32) { - throw new CDKException( + throw new IllegalArgumentException( "Inclusive max tree depth must be grater then zero and smaller then 32. Provided: " + inclusiveMaxTreeDepth ); } @@ -267,13 +280,22 @@ public void generateFragments(IAtomContainer atomContainer) throws CDKException private void run(IAtomContainer atomContainer) throws CDKException { // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) - if (atomContainer.getBondCount() < 3) return; + if (atomContainer.getBondCount() < 3 || atomContainer.getAtomCount() < this.minFragSize || atomContainer.isEmpty()) { + return; + } // Retrieve bonds that are eligible for splitting - IBond[] splittableBonds = getSplitableBonds(atomContainer); + IBond[] splittableBonds = getSplittableBonds(atomContainer); // If no splittable bonds are found, return early - if (splittableBonds.length == 0) return; + if (splittableBonds.length == 0) { + logger.info("no splittable bonds found"); + return; + } + if (splittableBonds.length > this.inclusiveMaxTreeDepth) { + logger.warn("Got " + splittableBonds.length + " splittable bonds but only " + this.inclusiveMaxTreeDepth + + " tree depth. 
This means only " + this.inclusiveMaxTreeDepth + " bonds can be split"); + } logger.debug("Got " + splittableBonds.length + " splittable bonds"); // Compute the number of possible bond subsets (excluding the empty set): 2^n - 1 @@ -307,20 +329,21 @@ private void run(IAtomContainer atomContainer) throws CDKException { // Process each fragment for (IAtomContainer partContainer : parts) { - // Configure atom types and add implicit hydrogens - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); - - // Apply aromaticity perception (legacy operation) - // TODO: Not sure how to handle this - Aromaticity.cdkLegacy().apply(partContainer); - // Generate a unique SMILES representation of the fragment String tmpSmiles = this.smilesGenerator.create(partContainer); - int numberOfAtoms = partContainer.getAtomCount(); + + int numberOfAtoms = 0; + for (IAtom atom : partContainer.atoms()) { + + if (atom instanceof IPseudoAtom) { + continue; + } + numberOfAtoms++; + } // Store the fragment if it meets the size requirement and is unique - if (numberOfAtoms >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - fragMap.put(tmpSmiles, partContainer); + if (numberOfAtoms >= minFragSize) { + fragMap.putIfAbsent(tmpSmiles, partContainer); } } } @@ -329,8 +352,8 @@ private void run(IAtomContainer atomContainer) throws CDKException { /** * Detects and returns the bonds, which will be split by an exhaustive fragmentation. This method is especially useful * to determine if it is even possible to split a specific molecule exhaustively. The number of fragments is 2^n - 1 with n - * being the number of splittable bonds. Therefore, it is impossible to entirely split a molecule with more than 31 splittable Bonds. - * To mitigate this one cna check this with this function, for example: + * being the number of splittable bonds. Therefore, it is impossible to entirely split a molecule with more than 31 splittable bonds. 
+ * To mitigate this one can check this with this function, for example: *
      *     {@code
      *     ExhaustiveFragmenter exhFragmenter = new Exhaustive Fragmenter;
@@ -342,11 +365,10 @@ private void run(IAtomContainer atomContainer) throws CDKException {
      * @param atomContainer the container which contains the molecule in question.
      * @return the bonds which would be split by the exhaustive fragmentation.
      */
-    public IBond[] getSplitableBonds(IAtomContainer atomContainer) {
+    public static IBond[] getSplittableBonds(IAtomContainer atomContainer) {
         // do ring detection
-        // TODO: Is this really the proper way to do ring detection here ?
-        SpanningTree spanningTree = new SpanningTree(atomContainer);
-        IRingSet allRings = spanningTree.getAllRings();
+        RingSearch ringSearch= new RingSearch(atomContainer);
+        IAtomContainer allRingsContainer = ringSearch.ringFragments();
 
         // find the splitable bonds
         ArrayList splitableBonds = new ArrayList<>();
@@ -356,8 +378,7 @@ public IBond[] getSplitableBonds(IAtomContainer atomContainer) {
             boolean isTerminal = false;
 
             // lets see if it's in a ring
-            IRingSet rings = allRings.getRings(bond);
-            if (rings.getAtomContainerCount() != 0) isInRing = true;
+            if (allRingsContainer.contains(bond)) isInRing = true;
 
             // lets see if it is a terminal bond
             for (IAtom atom : bond.atoms()) {
@@ -406,17 +427,24 @@ public IBond[] getSplitableBonds(IAtomContainer atomContainer) {
      *              Duplicate values in `nums` may result in duplicate subset entries.
      * @return      An array containing the subset corresponding to `index`.
      */
-    static int[] generateSubset(int index, int[] nums) {
+    protected static int[] generateSubset(int index, int[] nums) {
         // Allocate subset array based on the number of 1-bits in index.
         int[] subset = new int[Integer.bitCount(index)];
         int subsetIndex = 0;
 
-        // Iterate through each bit position (up to 31 bits).
-        for (int j = 0; j < Integer.SIZE; j++) {
-            // If the j-th bit in index is set, include nums[j] in the subset.
-            if (((index >> j) & 1) == 1) {
-                subset[subsetIndex++] = nums[j];
+        // Process using bit manipulation - only iterate through set bits
+        while (index != 0) {
+            // Find position of lowest set bit
+            int lowestBitPos = Integer.numberOfTrailingZeros(index);
+
+            // Add the corresponding element from nums if within bounds
+            if (lowestBitPos < nums.length) {
+                subset[subsetIndex] = nums[lowestBitPos];
+                subsetIndex++;
             }
+
+            // Clear the lowest set bit and continue
+            index = index & (index - 1);
         }
 
         return subset;
@@ -465,23 +493,19 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer)
      * @return An array of copied molecular fragments resulting from the split.
      */
     private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bondsToSplit) {
-        Set> splitBondAtomPairs = new HashSet<>();
-        for (IBond bond : bondsToSplit) {
-            Set pair = new HashSet<>((int) Math.ceil(2 / (double) 0.75f));
-            pair.add(bond.getAtom(0));
-            pair.add(bond.getAtom(1));
-            splitBondAtomPairs.add(pair);
-        }
-
+        Set bondsToSplitSet = new HashSet<>(bondsToSplit.length);
+        // for a faster lookup the hashset is used here.
+        bondsToSplitSet.addAll(Arrays.asList(bondsToSplit));
         boolean[] visitedOriginalAtoms = new boolean[origMol.getAtomCount()];
         List fragmentList = new ArrayList<>(bondsToSplit.length + 1);
+        int copiedBonds = 0;
 
         for (int i = 0; i < origMol.getAtomCount(); i++) {
             IAtom currPotentialStartAtom = origMol.getAtom(i);
             if (!visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)]) {
                 IAtomContainer fragmentContainer = origMol.getBuilder().newInstance(IAtomContainer.class);
                 Map origToCpyMap = new HashMap<>();
-                Stack dfsStack = new Stack<>();
+                Deque dfsStack = new ArrayDeque<>();
                 // Store split counts specific to the atoms in the fragment being built
                 Map splitCountsCpyAtoms = new HashMap<>();
 
@@ -496,25 +520,31 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond
 
                     for (IBond origBond : origMol.getConnectedBondsList(origCurrAtom)) {
                         IAtom origNbor = origBond.getOther(origCurrAtom);
-                        Set currBondPair = new HashSet<>(2);
-                        currBondPair.add(origCurrAtom);
-                        currBondPair.add(origNbor);
-                        boolean isThisABondToSplit = splitBondAtomPairs.contains(currBondPair);
+                        boolean isThisABondToSplit = bondsToSplitSet.contains(origBond);
 
                         if (!isThisABondToSplit) {
                             if (!origToCpyMap.containsKey(origNbor)) {
                                 visitedOriginalAtoms[origMol.indexOf(origNbor)] = true;
                                 IAtom cpyNbor = copyAtom(origNbor, fragmentContainer);
                                 origToCpyMap.put(origNbor, cpyNbor);
-                                fragmentContainer.addBond(cpyCurrentAtom.getIndex(), cpyNbor.getIndex(),
-                                        origBond.getOrder(), origBond.getStereo());
+                                IBond cpyBond = fragmentContainer.newBond(cpyCurrentAtom, cpyNbor,
+                                        origBond.getOrder());
+                                cpyBond.setStereo(origBond.getStereo());
+                                cpyBond.setIsAromatic(origBond.isAromatic());
+                                // The in-ring flag can be copied from the original bond here because
+                                // rings are always detected while determining the splittable bonds.
+                                cpyBond.setIsInRing(origBond.isInRing());
+                                // No explicit addBond(cpyBond) call here: newBond(...) presumably adds the bond to the container itself - TODO confirm.
                                 dfsStack.push(origNbor);
                             } else {
                                 IAtom cpyNbor = origToCpyMap.get(origNbor);
+                                // Add bond only if not already present
                                 if (fragmentContainer.getBond(cpyCurrentAtom, cpyNbor) == null) {
-                                    fragmentContainer.addBond(cpyCurrentAtom.getIndex(), cpyNbor.getIndex(),
-                                            origBond.getOrder(), origBond.getStereo());
-                                    // Add bond only if not already present
+                                    IBond cpyBond = fragmentContainer.newBond(cpyCurrentAtom, cpyNbor,
+                                            origBond.getOrder());
+                                    cpyBond.setStereo(origBond.getStereo());
+                                    cpyBond.setIsAromatic(origBond.isAromatic());
+                                    cpyBond.setIsInRing(origBond.isInRing());
                                 }
                             }
                         } else {
@@ -541,6 +571,8 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond
                             case R_SATURATED_FRAGMENTS:
                                 addRAtoms(atom, bondsCutCount, fragmentContainer);
                                 break;
+                            default:
+                                throw new UnsupportedOperationException("no treatment defined yet for this new enum constant");
                         }
                     }
                 }
@@ -550,7 +582,6 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond
         return fragmentList.toArray(new IAtomContainer[0]);
     }
 
-
     /**
      * Get the fragments generated as SMILES strings.
      *
diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
index 265e53f9368..1ba04e3d36d 100644
--- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
+++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
@@ -23,7 +23,6 @@
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
-import org.openscience.cdk.DefaultChemObjectBuilder;
 import org.openscience.cdk.interfaces.IAtomContainer;
 import org.openscience.cdk.interfaces.IBond;
 import org.openscience.cdk.silent.SilentChemObjectBuilder;
@@ -33,28 +32,14 @@
 import org.openscience.cdk.test.CDKTestCase;
 
 import java.util.Arrays;
-import java.util.List;
-
-import static org.hamcrest.CoreMatchers.hasItems;
-import static org.hamcrest.CoreMatchers.is;
+import java.util.HashSet;
+import java.util.Set;
 
 /**
  * Test exhaustive fragmenter.
- * 

* This test class covers various scenarios for the {@link ExhaustiveFragmenter}, * including different saturation settings (unsaturated, hydrogen-saturated, R-group saturated) * and minimum fragment size. - *

- * Note on deduplication: The {@link ExhaustiveFragmenter} uses SMILES strings for - * internal deduplication of generated fragments. This means that if two fragments, - * despite having different atom connectivity indices or implicit hydrogen counts, - * produce the same canonical SMILES string (as determined by {@link SmilesGenerator}), - * they will be considered the same fragment and only one will be stored. - * This is particularly relevant when comparing unsaturated vs. saturated fragments, - * as the saturation process might lead to a canonical SMILES that is identical - * to a fragment obtained via a different bond cleavage, or a fragment that appears - * different due to explicit hydrogen representation but becomes identical when - * canonicalized. * * @see ExhaustiveFragmenter */ @@ -73,7 +58,7 @@ static void setup() { fragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); fragmenterRestSaturated = new ExhaustiveFragmenter(); fragmenterRestSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); - smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); + smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); } // --- Unsaturated Fragments Tests --- @@ -103,7 +88,7 @@ void testEF2Unsaturated() throws Exception { } /** - * Tests fragmentation of methylcyclohexane with unsaturated fragments. + * Tests fragmentation of ethylcyclohexane with unsaturated fragments. * Expects "[CH]1CCCCC1" as a fragment, representing the cyclohexyl radical. 
*/ @Test @@ -111,7 +96,7 @@ void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); - MatcherAssert.assertThat(frags, is(new String[]{"[CH]1CCCCC1"})); + Assertions.assertArrayEquals(frags, new String[]{"[CH]1CCCCC1"}); } /** @@ -124,7 +109,7 @@ void testEF4Unsaturated() throws Exception { fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(frags, is(new String[]{"[c]1ccccc1"})); + Assertions.assertArrayEquals(frags, new String[]{"[c]1ccccc1"}); } /** @@ -137,11 +122,11 @@ void testEF5Unsaturated() throws Exception { fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "[CH2]c1ccccc1", "[c]1ccccc1" - ) + }) ); Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterUnsaturated.getFragmentsAsContainers().length); @@ -157,7 +142,7 @@ void testEF6Unsaturated() throws Exception { fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(frags, is(new String[]{"[c]1ccccc1"})); + Assertions.assertArrayEquals(frags, new String[]{"[c]1ccccc1"}); Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(1, fragmenterUnsaturated.getFragmentsAsContainers().length); @@ -176,24 +161,24 @@ void testEF6Unsaturated() throws Exception { void testEF7Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); fragmenterUnsaturated.generateFragments(mol); - List frags = 
Arrays.asList(fragmenterUnsaturated.getFragments()); + String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); // There is one additional fragment in comparison to the saturated version because there are following fragments: // [C]1CCC([CH2])C1 // [CH2][C]1C[CH]CC1 // these fragments only differ in the number of hydrogen's bonded to their respective carbon atoms. So these // fragments would show up as one if saturated. - Assertions.assertEquals(26, frags.size()); + Assertions.assertEquals(26, frags.length); Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(26, fragmenterUnsaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "[c]1ccccc1", "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[CH2]C1CCC([CH2])(c2ccccc2)C1" - ) + }) ); } @@ -232,7 +217,7 @@ void testEF3Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"C1CCCCC1"})); + Assertions.assertArrayEquals(frags, new String[]{"C1CCCCC1"}); } /** @@ -245,7 +230,7 @@ void testEF4Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); + Assertions.assertArrayEquals(frags, new String[]{"c1ccccc1"}); } /** @@ -259,11 +244,11 @@ void testEF5Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "c1ccc(cc1)C", "c1ccccc1" - ) + }) ); 
Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); @@ -279,7 +264,7 @@ void testEF6Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); + Assertions.assertArrayEquals(frags, new String[]{"c1ccccc1"}); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(1, fragmenterSaturated.getFragmentsAsContainers().length); @@ -295,19 +280,19 @@ void testEF6Saturated() throws Exception { void testEF7Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); fragmenterSaturated.generateFragments(mol); - List frags = Arrays.asList(fragmenterSaturated.getFragments()); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertEquals(25, frags.size()); + Assertions.assertEquals(25, frags.length); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2" - ) + }) ); } @@ -323,11 +308,11 @@ void testEF3RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(frags, is(new String[]{"*C1CCCCC1"})); + Assertions.assertArrayEquals(frags, new String[]{"*C1CCCCC1"}); } /** - * Tests fragmentation of diphenylmethane with R-group saturated fragments. + * Tests fragmentation of toluene with R-group saturated fragments. 
* Expects "*c1ccccc1" (phenyl with R-atom) and "*Cc1ccccc1" (benzyl with R-atom). */ @Test @@ -336,11 +321,11 @@ void testEF5RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "*c1ccccc1", "*Cc1ccccc1" - ) + }) ); Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -355,7 +340,7 @@ void testEF6RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(frags, is(new String[]{"*c1ccccc1"})); + Assertions.assertArrayEquals(frags, new String[]{"*c1ccccc1"}); Assertions.assertEquals(1, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -368,20 +353,16 @@ void testEF6RestSaturated() throws Exception { void testEF7RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); fragmenterRestSaturated.generateFragments(mol); - List frags = Arrays.asList(fragmenterRestSaturated.getFragments()); + String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - // There two additional fragments in comparison to the hydrogen saturated version because there are following fragments: - // *C1CCC(*)(*)C1 - // *C1C=CC=C1 - // these fragments only differ in size compared to their respective hydrogen saturated version beacuse the R-Group represented by '*' - // is also counted as a valid atom in comparison to implicit hydrogens. So these are valid fragments with size 6. 
- Assertions.assertEquals(28, fragmenterRestSaturated.getFragmentsAsContainers().length); - MatcherAssert.assertThat(frags, - hasItems( + // Needs to have the same number of fragments as the unsaturated version. + Assertions.assertEquals(26, fragmenterRestSaturated.getFragmentsAsContainers().length); + Assertions.assertTrue( + hasItems(frags, new String[] { "*c1ccccc1", "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", "*C1CCC(*)(c2ccccc2)C1" - ) + }) ); } @@ -417,11 +398,11 @@ void testMinSizeLowered() throws Exception { String[] frags = localFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(2, frags.length); - MatcherAssert.assertThat(Arrays.asList(frags), - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "C1CCCCC1", "C1CCCC1" - ) + }) ); } @@ -434,54 +415,56 @@ void testEqualityOfSmilesAndContainers() throws Exception { SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); // Phenylalanine fragmenterSaturated.generateFragments(mol); - List smilesFrags = Arrays.asList(fragmenterSaturated.getFragments()); + String[] smilesFrags = fragmenterSaturated.getFragments(); IAtomContainer[] containerFrags = fragmenterSaturated.getFragmentsAsContainers(); for (IAtomContainer frag : containerFrags) { - MatcherAssert.assertThat(smilesFrags, hasItems(smilesGenerator.create(frag))); + Assertions.assertTrue(hasItems(smilesFrags, new String[] { + smilesGenerator.create(frag) + })); } } /** - * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method * for a linear alkane (propane), which should have no splittable bonds. 
*/ @Test void testGetSplittableBondsLinearMolecule() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); // Propane - IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); Assertions.assertEquals(0, splittableBonds.length); } /** - * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method * for a cyclic alkane (cyclopentane), which should have no splittable bonds (all bonds are in a ring). */ @Test void testGetSplittableBondsCyclicMolecule() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); // Cyclopentane - IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); Assertions.assertEquals(0, splittableBonds.length); } /** - * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method * for ethylbenzene, which should have one splittable bond (the bond between the phenyl and ethyl groups). */ @Test void testGetSplittableBondsBenzeneWithSideChain() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); // Ethylbenzene - IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); Assertions.assertEquals(1, splittableBonds.length); } /** - * Tests the {@link ExhaustiveFragmenter#getSplitableBonds(IAtomContainer)} method + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method * for biphenyl, which should have one splittable bond (the bond connecting the two phenyl rings). 
*/ @Test void testGetSplittableBondsBiphenyl() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); // Biphenyl - IBond[] splittableBonds = fragmenterSaturated.getSplitableBonds(mol); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); Assertions.assertEquals(1, splittableBonds.length); } @@ -530,31 +513,30 @@ void testCustomSmilesGenerator() throws Exception { String[] frags = customFragmenter.getFragments(); Assertions.assertNotNull(frags); - MatcherAssert.assertThat(Arrays.asList(frags), - hasItems( + Assertions.assertTrue( + hasItems(frags, new String[] { "C=1C=CC=CC1", "C=1C=CC(=CC1)C" - ) + }) ); Assertions.assertEquals(2, frags.length); } /** - * Tests the setExclusiveMaxTreeDepth method using 1,4-dibutylbenzene. - * By varying `exclusiveMaxTreeDepth`, we can observe how the number of generated fragments changes. + * Tests the setInclusiveMaxTreeDepth method using 1,4-dibutylbenzene. + * By varying `inclusiveMaxTreeDepth`, we can observe how the number of generated fragments changes. * *

      * Molecule: 1,4-dibutylbenzene (CCCCc1ccc(CCCC)cc1)
      * Splittable bonds: 6 (the three C-C bonds for each butyl chain, from the ring until the the second last C-atom).
      * Fragmenter setup: minFragSize = 4 (to include butyl and benzene fragments), hydrogen-saturated fragments.
      *
-     * Expected fragments for different exclusiveMaxTreeDepth settings:
+     * Expected fragments for different inclusiveMaxTreeDepth settings:
      *
-     * 1.  exclusiveMaxTreeDepth = 1 (allows 0 simultaneous cuts):
-     * - No fragments generated from bond cleavages, becuase there are no splits allowed as max tree depth is exclusive.
+     * 1.  inclusiveMaxTreeDepth = 0 (allows 0 simultaneous cuts):
      * - Expected fragments: 0
      *
-     * 2.  exclusiveMaxTreeDepth = 2 (allows up to 1 simultaneous cut):
+     * 2.  inclusiveMaxTreeDepth = 1 (allows up to 1 simultaneous cut):
      * - Considers all subsets of splittable bonds of size 1.
      * - Expected unique fragments: 4 (
      * c1ccc(cc1)CCCC
@@ -563,7 +545,7 @@ void testCustomSmilesGenerator() throws Exception {
      * CCCC"
      * )
      *
-     * 3.  exclusiveMaxTreeDepth = 3 (allows up to 2 simultaneous cuts):
+     * 3.  inclusiveMaxTreeDepth = 2 (allows up to 2 simultaneous cuts):
      * - Considers all subsets of splittable bonds of size 1 and 2.
      * - Includes fragments from 1-cut operations, plus fragments from 2-cut operations:
      * - Expected unique fragments: 10 (
@@ -579,7 +561,7 @@ void testCustomSmilesGenerator() throws Exception {
      * CCCC
      * )
      *
-     * 4.  exclusiveMaxTreeDepth = 4 (allows up to 3 simultaneous cuts):
+     * 4.  inclusiveMaxTreeDepth = 3 (allows up to 3 simultaneous cuts):
      * - Since there are only combinations of 2 splittable bonds that allow a fragment size bigger the 6, allowing up
      *  to 3 cuts (or more) will yield the same set of fragments as allowing up to 2 cuts.
      * - Expected unique fragments: 10 (
@@ -618,11 +600,11 @@ void testSetExclusiveMaxTreeDepth() throws Exception {
         String[] fragsDepth2 = localFragmenter.getFragments();
         Assertions.assertEquals(4, fragsDepth2.length,
                 "Expected 4 fragments when inclusiveMaxTreeDepth is 1 (allows up to 1 cut)");
-        MatcherAssert.assertThat(Arrays.asList(fragsDepth2),
-                hasItems(
+        Assertions.assertTrue(
+                hasItems(fragsDepth2, new String[] {
                         "CCCC",
                         "c1ccc(cc1)CCCC"
-                )
+                })
         );
 
         localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1);
@@ -631,12 +613,12 @@ void testSetExclusiveMaxTreeDepth() throws Exception {
         String[] fragsDepth3 = localFragmenter.getFragments();
         Assertions.assertEquals(10, fragsDepth3.length,
                 "Expected 10 fragments when inclusiveMaxTreeDepth is 2 (allows up to 2 cuts)");
-        MatcherAssert.assertThat(Arrays.asList(fragsDepth3),
-                hasItems(
+        Assertions.assertTrue(
+                hasItems(fragsDepth3, new String[] {
                         "CCCC",
                         "c1ccc(cc1)CCCC",
                         "c1ccccc1"
-                )
+                })
         );
 
         localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1);
@@ -645,12 +627,12 @@ void testSetExclusiveMaxTreeDepth() throws Exception {
         String[] fragsDepth4 = localFragmenter.getFragments();
         Assertions.assertEquals(10, fragsDepth4.length,
                 "Expected 10 fragments when inclusiveMaxTreeDepth is 3 (allows up to 3 cuts), same as max 2 cuts");
-        MatcherAssert.assertThat(Arrays.asList(fragsDepth4),
-                hasItems(
+        Assertions.assertTrue(
+                hasItems(fragsDepth4, new String[]{
                         "CCCC",
                         "c1ccc(cc1)CCCC",
                         "c1ccccc1"
-                )
+                })
         );
     }
 
@@ -666,14 +648,14 @@ void testDisconnectedMolecules() throws Exception {
         IAtomContainer mol = smilesParser.parseSmiles("C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"); //Sodium edetate
         fragmenterSaturated.generateFragments(mol);
         String[] frags = fragmenterSaturated.getFragments();
-        MatcherAssert.assertThat(
-                Arrays.asList(frags),
-                hasItems(
-                        "O=C([O-])CNCCNCC(=O)[O-]",
-                        "O=C([O-])CNCC(=O)[O-]",
-                        "O=C([O-])CN(C)CCN(C)C",
-                        "O=C([O-])CNCCNC",
-                        "O=C([O-])CN(CC(=O)[O-])CC"
+        Assertions.assertTrue(
+                hasItems(frags, new String[]{
+                                "O=C([O-])CNCCNCC(=O)[O-]",
+                                "O=C([O-])CNCC(=O)[O-]",
+                                "O=C([O-])CN(C)CCN(C)C",
+                                "O=C([O-])CNCCNC",
+                                "O=C([O-])CN(CC(=O)[O-])CC"
+                        }
                 )
         );
     }
@@ -684,53 +666,78 @@ void testDisconnectedMolecules() throws Exception {
      * @throws Exception if anything goes wrong
      */
     @Test
-    void testIndexBigMolecule() throws Exception {
+    void testBigMolecule1() throws Exception {
         SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance());
         IAtomContainer mol = smiPar.parseSmiles("CC1=C(C(=CC=C1)NC2=CC=CC=C2C" +
                 "(=O)NC(CCS(=O)C)C(=O)NC(C)C3=CC=C(C=C3)F)C"); //PubChem CID 118705975
 
         fragmenterSaturated.generateFragments(mol);
         String[] frags = fragmenterSaturated.getFragments();
-        Assertions.assertEquals(
-                Arrays.asList(frags),
+        Assertions.assertTrue(
                 hasItems(
-                        "O=CCNC(=O)c1ccccc1",
-                        "O=C(N)CNC(=O)c1ccccc1N",
-                        "O=C(NC)c1ccccc1N",
-                        "O=C(NCCC)c1ccccc1N",
-                        "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C",
-                        "O=C(N)CCCS(=O)C",
-                        "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C"
+                        frags, new String[]{
+                                "O=C(NCC)CCC",
+                                "NC=1C=CC=CC1",
+                                "O=C(N)CCCS(=O)C",
+                                "FC=1C=CC(=CC1)C(N)C"
+                        }
                 )
         );
     }
 
-
     /**
      * Testing a bigger molecule
      *
      * @throws Exception if anything goes wrong
      */
     @Test
-    void testIndexBigMolecule2() throws Exception {
+    void testTestMoleculeUnsaturated() throws Exception {
         SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance());
-        IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" +
-                "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" +
-                "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833
-        System.out.println(fragmenterSaturated.getSplitableBonds(mol).length);
-        fragmenterSaturated.generateFragments(mol);
-        String[] frags = fragmenterSaturated.getFragments();
-        Assertions.assertEquals(
-                Arrays.asList(frags),
-                hasItems(
-                        "O=CCNC(=O)c1ccccc1",
-                        "O=C(N)CNC(=O)c1ccccc1N",
-                        "O=C(NC)c1ccccc1N",
-                        "O=C(NCCC)c1ccccc1N",
-                        "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C",
-                        "O=C(N)CCCS(=O)C",
-                        "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C"
-                )
-        );
+        IAtomContainer mol = smiPar.parseSmiles("C1CCCCC1c1ccccc1"); // Cyclohexylbenzene
+        fragmenterUnsaturated.generateFragments(mol);
+        String[] frags = fragmenterUnsaturated.getFragments();
+        Assertions.assertTrue(hasItems(frags, new String[] {
+                "[CH]1CCCCC1",
+                "[c]1ccccc1"
+        }));
+    }
+
+//    /**
+//     * Testing a molecule with 31 splittable bonds (takes extremely long, maybe days)
+//     *
+//     * @throws Exception if anything goes wrong
+//     */
+//    @Test
+//    void testIndexBigMolecule2() throws Exception {
+//        SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance());
+//        IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" +
+//                "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" +
+//                "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833
+//        System.out.println(fragmenterSaturated.getSplitableBonds(mol).length);
+//        fragmenterSaturated.generateFragments(mol);
+//        String[] frags = fragmenterSaturated.getFragments();
+//        Assertions.assertEquals(
+//                Arrays.asList(frags),
+//                hasItems(
+//                        "O=CCNC(=O)c1ccccc1",
+//                        "O=C(N)CNC(=O)c1ccccc1N",
+//                        "O=C(NC)c1ccccc1N",
+//                        "O=C(NCCC)c1ccccc1N",
+//                        "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C",
+//                        "O=C(N)CCCS(=O)C",
+//                        "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C"
+//                )
+//        );
+//    }
+
+    // --utility --
+    static boolean hasItems(String[] allFragments, String[] requiredFragments) {
+        Set allFragmentsSet = new HashSet<>(Arrays.asList(allFragments));
+        for (String frag : requiredFragments) {
+            if (!allFragmentsSet.contains(frag)) {
+                return false;
+            }
+        }
+        return true;
     }
-}
\ No newline at end of file
+}

From 33ce575cbe02c92dea9e886be4520124c3fd8005 Mon Sep 17 00:00:00 2001
From: ToLeWeiss 
Date: Wed, 23 Jul 2025 15:00:05 +0200
Subject: [PATCH 25/42] added null check for input molecule and elaborated on
 comments

---
 .../org/openscience/cdk/fragment/ExhaustiveFragmenter.java  | 3 +++
 .../openscience/cdk/fragment/ExhaustiveFragmenterTest.java  | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
index 75df6151d01..5f0e83d50fd 100644
--- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
+++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
@@ -278,6 +278,9 @@ public void generateFragments(IAtomContainer atomContainer) throws CDKException
      * @throws CDKException if an error occurs during hydrogen addition or atom type perception.
      */
     private void run(IAtomContainer atomContainer) throws CDKException {
+        if (atomContainer == null) {
+            throw new NullPointerException("No molecule provided");
+        }
 
         // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible)
         if (atomContainer.getBondCount() < 3 || atomContainer.getAtomCount() < this.minFragSize || atomContainer.isEmpty()) {
diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
index 1ba04e3d36d..415410ab473 100644
--- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
+++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
@@ -65,7 +65,8 @@ static void setup() {
 
     /**
      * Tests that a simple linear alkane (propane) with no splittable bonds
-     * yields no fragments when using the unsaturated setting.
+     * yields no fragments when using the unsaturated setting. This can not return any fragment even with
+     * a smaller minimal fragment size this doesn't result in any fragment as all bonds are terminal bonds.
      */
     @Test
     void testEF1Unsaturated() throws Exception {
@@ -77,7 +78,8 @@ void testEF1Unsaturated() throws Exception {
 
     /**
      * Tests that a simple cycloalkane (cyclopentane) with no non-ring, non-terminal bonds
-     * yields no fragments when using the unsaturated setting.
+     * yields no fragments when using the unsaturated setting. Because this is just one big
+     * ring there will be no resulting fragments whatsoever.
      */
     @Test
     void testEF2Unsaturated() throws Exception {

From 1acf2b1b0c0d04cc26f95aeb59e5d047ded2b3d4 Mon Sep 17 00:00:00 2001
From: ToLeWeiss 
Date: Mon, 4 Aug 2025 08:00:12 +0200
Subject: [PATCH 26/42] removed useless comments and imports and added
 documentation if it is not possible to generate any fragments

---
 .../openscience/cdk/fragment/ExhaustiveFragmenter.java | 10 +---------
 .../cdk/fragment/ExhaustiveFragmenterTest.java         |  7 ++-----
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
index 5f0e83d50fd..58c2231482e 100644
--- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
+++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
@@ -23,14 +23,9 @@
  */
 package org.openscience.cdk.fragment;
 
-import org.openscience.cdk.aromaticity.Aromaticity;
-import org.openscience.cdk.atomtype.CDKAtomTypeMatcher;
-import org.openscience.cdk.config.AtomTypeFactory;
-import org.openscience.cdk.config.atomtypes.AtomTypeReader;
 import org.openscience.cdk.exception.CDKException;
 import org.openscience.cdk.interfaces.IAtom;
 import org.openscience.cdk.interfaces.IAtomContainer;
-import org.openscience.cdk.interfaces.IAtomType;
 import org.openscience.cdk.interfaces.IBond;
 import org.openscience.cdk.interfaces.IPseudoAtom;
 import org.openscience.cdk.ringsearch.RingSearch;
@@ -38,8 +33,6 @@
 import org.openscience.cdk.smiles.SmilesGenerator;
 import org.openscience.cdk.tools.ILoggingTool;
 import org.openscience.cdk.tools.LoggingToolFactory;
-import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
-import org.openscience.cdk.tools.manipulator.AtomTypeManipulator;
 
 import java.util.ArrayDeque;
 import java.util.ArrayList;
@@ -50,11 +43,10 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.Stack;
 
 /**
  * Performs exhaustive fragmentation of molecules by breaking single non-ring, non-terminal bonds in all
- * combinations.
+ * combinations. If it is not possible to generate fragments an empty list is returned.
  * 

* Non-terminal meaning bonds connected to more than one single heavy atom (non-terminal bonds). * By default: diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 415410ab473..ce4c3dcecf7 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -19,7 +19,6 @@ */ package org.openscience.cdk.fragment; -import org.hamcrest.MatcherAssert; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -65,8 +64,7 @@ static void setup() { /** * Tests that a simple linear alkane (propane) with no splittable bonds - * yields no fragments when using the unsaturated setting. This can not return any fragment even with - * a smaller minimal fragment size this doesn't result in any fragment as all bonds are terminal bonds. + * yields no fragments when using the unsaturated setting. */ @Test void testEF1Unsaturated() throws Exception { @@ -78,8 +76,7 @@ void testEF1Unsaturated() throws Exception { /** * Tests that a simple cycloalkane (cyclopentane) with no non-ring, non-terminal bonds - * yields no fragments when using the unsaturated setting. Because this is just one big - * ring there will be no resulting fragments whatsoever. + * yields no fragments when using the unsaturated setting. 
*/ @Test void testEF2Unsaturated() throws Exception { From 829fb47f9bde678bceb935cc72358cde1e8ba401 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 14 Aug 2025 07:47:50 +0200 Subject: [PATCH 27/42] improved comments and removed dependencies in the test class --- .../cdk/fragment/ExhaustiveFragmenter.java | 402 ++++++++++++------ .../fragment/ExhaustiveFragmenterTest.java | 278 ++++++------ 2 files changed, 427 insertions(+), 253 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 58c2231482e..b4f4bb94a38 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -45,60 +45,77 @@ import java.util.Set; /** - * Performs exhaustive fragmentation of molecules by breaking single non-ring, non-terminal bonds in all - * combinations. If it is not possible to generate fragments an empty list is returned. + * Performs exhaustive fragmentation of molecules by breaking single non-ring, + * non-terminal bonds in all combinations. If it is not possible to generate + * fragments, an empty list is returned. Non-terminal bonds are those connected + * to heavy atoms that respectively have another bond to a heavy atom. *

- * Non-terminal meaning bonds connected to more than one single heavy atom (non-terminal bonds). * By default: *

    - *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) are not returned.
  • + *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) are not + * returned.
  • *
  • Fragments are returned with open valences, where a bond has been split.
  • - *
  • The Fragmentation splits at maximum 31 bonds in one run.
  • - *
  • The SMILES code of the fragments is generated with {@link SmiFlavor#Unique} and {@link SmiFlavor#UseAromaticSymbols}
  • + *
  • The fragmentation splits at a maximum tree depth of 31, meaning that + * maximum 31 bonds are split in one run.
  • + *
  • The SMILES code of the fragments is generated with {@link SmiFlavor#Unique} + * and {@link SmiFlavor#UseAromaticSymbols}. It does not contain information + * about the stereochemistry.
  • *
- * However, users can modify these settings, with the exception, that the maximum tree depth can not be higher than 31 - * (Java's limitation caused by integer indexing). + * However, users can modify these settings, with the exception, that the + * maximum tree depth can not be higher than 31 (Java's limitation caused by + * integer indexing). *

* Fragment Deduplication: - * The `ExhaustiveFragmenter` uses canonical SMILES strings for internal deduplication of generated fragments. - * This means that after a fragment is generated, its unique SMILES representation is computed - * (using the default or user specified {@link SmilesGenerator}). These SMILES do not encode stereochemistry. - * If a fragment with the same canonical SMILES has already been generated and stored, the new fragment - * is considered a duplicate and is not added to the results. + * The `ExhaustiveFragmenter` uses unique SMILES strings for internal + * deduplication of generated fragments. This means that after a fragment is + * generated, its unique SMILES representation is computed (using the default or + * user specified {@link SmilesGenerator}). These SMILES do not encode + * stereochemistry. If a fragment with the same canonical SMILES has already + * been generated and stored, the new fragment is considered a duplicate and is + * not added to the results. *

- * This deduplication strategy is particularly important when considering the {@link Saturation} setting: + * This deduplication strategy is particularly important when considering the + * {@link Saturation} setting: *

    - *
  • If fragments are {@link Saturation#HYDROGEN_SATURATED_FRAGMENTS}, the saturation process might lead to a canonical SMILES - * that is identical to a fragment obtained via a different bond cleavage, or a fragment that appears - * different due to explicit hydrogen representation but becomes identical when canonicalized.
  • - *
  • For example, an unsaturated fragment like `[CH]1CCCCC1` (cyclohexyl radical) might deduplicate - * with a saturated `C1CCCCC1` (cyclohexane) if `HYDROGEN_SATURATED_FRAGMENTS` is enabled and both forms - * canonicalize to the same SMILES depending on the exact SMILES generator and atom properties.
  • - *
  • It is crucial to understand that the uniqueness is based solely on the canonical SMILES string, - * not on the exact atom-by-atom identity or origin within the original molecule.
  • + *
  • If fragments are {@link Saturation#HYDROGEN_SATURATED_FRAGMENTS}, the + * saturation process might lead to a canonical SMILES that is identical to a + * fragment obtained via a different bond cleavage, or a fragment that appears + * different due to explicit hydrogen representation but becomes identical when + * canonicalized.
  • + *
  • For example, an unsaturated fragment like `[CH]1CCCCC1` (cyclohexyl + * radical) might deduplicate with a saturated `C1CCCCC1` (cyclohexane) if + * `HYDROGEN_SATURATED_FRAGMENTS` is enabled and both forms canonicalize to the + * same SMILES depending on the exact SMILES generator and atom properties.
  • + *
  • It is crucial to understand that the uniqueness is based solely on the + * canonical SMILES string, not on the exact atom-by-atom identity or origin + * within the original molecule.
  • *
- * *

* Example Usage: *

{@code
  * // By default, returns unsaturated fragments with a minimum size of 6 atoms
  * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter();
  * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
- * IAtomContainer mol = smiParser.parseSmiles("C1CCCCC1C1=CC=CC=C1");  // Cyclopentylbenzene
+ * // Cyclopentylbenzene
+ * IAtomContainer mol = smiParser.parseSmiles("C1CCCC1C1=CC=CC=C1");
  * fragmenter.generateFragments(mol);
  *
  * // Retrieve SMILES representations of fragments
  * String[] smilesFragments = fragmenter.getFragments();
  * // Example Result (depending on exact fragmentation points and min size):
- * // "[CH]1CCCCC1", "[c]1ccccc1"
+ * // "[C]1=CC=CC=C1"
  *
  * // Retrieve AtomContainer representations of fragments
  * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers();
  *
  * // Example: Configuring for hydrogen-saturated fragments with a minimum size of 5
- * ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(5, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS);
+ * ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(
+ *      5,
+ *      ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS
+ * );
  * saturatedFragmenter.generateFragments(mol);
  * String[] saturatedSmilesFragments = saturatedFragmenter.getFragments();
+ * // "C1CCCC1", "C1=CC=CC=C1"
  * }
* * @author Rajarshi Guha @@ -109,11 +126,13 @@ public class ExhaustiveFragmenter implements IFragmenter { /** - * Specifies whether generated fragments should be saturated (hydrogens added) or unsaturated. + * Specifies whether generated fragments should be saturated (hydrogens added) + * or unsaturated. */ public enum Saturation { /** - * Fragments will be returned in their saturated form (implicit hydrogen atoms added). + * Fragments will be returned in their saturated form + * (implicit hydrogen atoms added). */ HYDROGEN_SATURATED_FRAGMENTS, @@ -123,15 +142,20 @@ public enum Saturation { R_SATURATED_FRAGMENTS, /** - * Fragments will be returned in their unsaturated form (no additional hydrogen atoms). The unsaturated atoms - * are the atoms of the split bonds. + * Fragments will be returned in their unsaturated form + * (no additional hydrogen atoms). The unsaturated atoms are the atoms + * of the split bonds. */ UNSATURATED_FRAGMENTS } private static final int DEFAULT_MIN_FRAG_SIZE = 6; - private static final Saturation DEFAULT_SATURATION = Saturation.UNSATURATED_FRAGMENTS; - private static final SmilesGenerator DEFAULT_SMILES_GENERATOR = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); + private static final Saturation DEFAULT_SATURATION = + Saturation.UNSATURATED_FRAGMENTS; + private static final SmilesGenerator DEFAULT_SMILES_GENERATOR = + new SmilesGenerator( + SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols + ); private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = Integer.SIZE - 1; private final Map fragMap; @@ -139,58 +163,92 @@ public enum Saturation { private int inclusiveMaxTreeDepth; private int minFragSize; private Saturation saturationSetting; - private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); + private static final ILoggingTool logger = + LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); /** * Constructs an ExhaustiveFragmenter with the default 
settings: *
    *
  • Minimum fragment size: 6 atoms (excluding implicit hydrogen)
  • *
  • Unsaturated fragments
  • - *
  • Default {@link SmilesGenerator} ({@code SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols})
  • + *
  • Default {@link SmilesGenerator} + * ({@code SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols})
  • *
*/ public ExhaustiveFragmenter() { - this(DEFAULT_SMILES_GENERATOR, DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION, DEFAULT_INCLUSIVE_MAX_TREE_DEPTH); + this( + DEFAULT_SMILES_GENERATOR, + DEFAULT_MIN_FRAG_SIZE, + DEFAULT_SATURATION, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH + ); } /** - * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size and saturation setting. - * Uses the default {@link SmilesGenerator}. + * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment + * size and saturation setting. Uses the default {@link SmilesGenerator}. * - * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). - * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. + * @param minFragSize Minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). + * @param saturationSetting Determines whether fragments should be saturated + * (with hydrogens or R-atoms) or unsaturated. */ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { - this(DEFAULT_SMILES_GENERATOR, minFragSize, saturationSetting, DEFAULT_INCLUSIVE_MAX_TREE_DEPTH); + this( + DEFAULT_SMILES_GENERATOR, + minFragSize, + saturationSetting, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH + ); } /** - * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment size. - * Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. + * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment + * size. Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. * Uses the default {@link SmilesGenerator}. * - * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). + * @param minFragSize Minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). 
*/ public ExhaustiveFragmenter(int minFragSize) { - this(DEFAULT_SMILES_GENERATOR, minFragSize, DEFAULT_SATURATION, DEFAULT_INCLUSIVE_MAX_TREE_DEPTH); + this( + DEFAULT_SMILES_GENERATOR, + minFragSize, + DEFAULT_SATURATION, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH + ); } /** * Constructs an ExhaustiveFragmenter with a user-provided {@link SmilesGenerator}, * user-defined minimum fragment size, and saturation setting. * - * @param smilesGenerator The {@link SmilesGenerator} instance to use for creating SMILES strings - * for fragment deduplication and retrieval. - * @param minFragSize Minimum number of atoms in a valid fragment (excluding implicit hydrogen). - * @param saturationSetting Determines whether fragments should be saturated (with hydrogens or R-atoms) or unsaturated. + * @param smilesGenerator The {@link SmilesGenerator} instance to use for + * creating SMILES strings + * for fragment deduplication and retrieval. + * @param minFragSize Minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). + * @param saturationSetting Determines whether fragments should be saturated + * (with hydrogens or R-atoms) or unsaturated. + * @param inclusiveMaxTreeDepth Represents the number of Bonds that will be + * split for a fragmentation. 
*/ - public ExhaustiveFragmenter(SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting, int inclusiveMaxTreeDepth) { + public ExhaustiveFragmenter( + SmilesGenerator smilesGenerator, + int minFragSize, + Saturation saturationSetting, + int inclusiveMaxTreeDepth + ) { if (saturationSetting == null) { - throw new IllegalArgumentException("The given SaturationSetting can not be null"); + throw new IllegalArgumentException( + "The given SaturationSetting can not be null" + ); } this.saturationSetting = saturationSetting; if (smilesGenerator == null) { - throw new IllegalArgumentException("The given SmilesGenerator can not be null"); + throw new IllegalArgumentException( + "The given SmilesGenerator can not be null" + ); } this.smilesGenerator = smilesGenerator; this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); @@ -206,7 +264,8 @@ public ExhaustiveFragmenter(SmilesGenerator smilesGenerator, int minFragSize, Sa public void setMinimumFragmentSize(int minFragSize) { if (minFragSize <= 0) { throw new IllegalArgumentException( - "Minimum fragment size must be a positive integer (>= 1). Provided: " + minFragSize + "Minimum fragment size must be a positive integer (>= 1)" + + " Provided: " + minFragSize ); } this.minFragSize = minFragSize; @@ -219,28 +278,32 @@ public void setMinimumFragmentSize(int minFragSize) { */ public void setSaturationSetting(Saturation saturationSetting) { if (saturationSetting == null) { - throw new IllegalArgumentException("The given SaturationSetting can not be null"); + throw new NullPointerException( + "The given SaturationSetting can not be null" + ); } this.saturationSetting = saturationSetting; } /** - * Sets the maximum number of bonds that can be simultaneously split in a single - * fragmentation event. This value is exclusive, meaning if set to `3`, a maximum of `2` bonds - * can be split simultaneously. + * Sets the maximum number of bonds that can be simultaneously split in a + * single fragmentation event. *

- * Must be within the range {@code 0 < inclusiveMaxTreeDepth < 32}. This limit is important - * due to the combinatorial explosion of fragments (which scales with 2^n, where n is the - * number of splittable bonds) and Java's use of 32-bit integers for indexing. - * Setting a lower limit can help manage computational resources for larger molecules. + * Must be within the range {@code 0 < inclusiveMaxTreeDepth < 32}. This + * limit is important due to the combinatorial explosion of fragments + * (which scales with 2^n, where n is the number of splittable bonds) and + * Java's use of 32-bit integers for indexing. Setting a lower limit can + * help manage computational resources for larger molecules. *

 * - * @param inclusiveMaxTreeDepth The exclusive maximum number of bonds that can be split in one atom container. + * @param inclusiveMaxTreeDepth The inclusive maximum number of bonds that + * can be split in one atom container. */ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { if (inclusiveMaxTreeDepth <= 0 || inclusiveMaxTreeDepth >= 32) { throw new IllegalArgumentException( - "Inclusive max tree depth must be grater then zero and smaller then 32. Provided: " + inclusiveMaxTreeDepth + "Inclusive max tree depth must be grater then zero and " + + "smaller then 32. Provided: " + inclusiveMaxTreeDepth ); } this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; @@ -262,38 +325,49 @@ public void generateFragments(IAtomContainer atomContainer) throws CDKException } /** - * Splits the molecule at all possible combinations of splittable bonds and saturates the open valences of the - * resulting fragments if the Saturation setting is turned on. - * Only non-ring and non-terminal single bonds are considered for splitting. + * Splits the molecule at all possible combinations of splittable bonds and + * saturates the open valences of the resulting fragments according to the + * {@link ExhaustiveFragmenter#saturationSetting}. Only non-ring and + * non-terminal single bonds are considered for splitting. * * @param atomContainer the molecule to be split. - * @throws CDKException if an error occurs during hydrogen addition or atom type perception. + * @throws CDKException if an error occurs during hydrogen addition or atom + * type perception. 
*/ private void run(IAtomContainer atomContainer) throws CDKException { if (atomContainer == null) { throw new NullPointerException("No molecule provided"); } - // Return early if the molecule has fewer than 3 bonds (no meaningful splits possible) - if (atomContainer.getBondCount() < 3 || atomContainer.getAtomCount() < this.minFragSize || atomContainer.isEmpty()) { + // Return early if the molecule has fewer than 3 bonds + // (no meaningful splits possible) + if (atomContainer.getBondCount() < 3 || + atomContainer.getAtomCount() < this.minFragSize || + atomContainer.isEmpty()) { return; } // Retrieve bonds that are eligible for splitting - IBond[] splittableBonds = getSplittableBonds(atomContainer); + IBond[] splittableBonds = + getSplittableBonds(atomContainer).toArray(new IBond[0]); // If no splittable bonds are found, return early if (splittableBonds.length == 0) { - logger.info("no splittable bonds found"); + logger.debug("no splittable bonds found"); return; } if (splittableBonds.length > this.inclusiveMaxTreeDepth) { - logger.warn("Got " + splittableBonds.length + " splittable bonds but only " + this.inclusiveMaxTreeDepth + - " tree depth. This means only " + this.inclusiveMaxTreeDepth + " bonds can be split"); + logger.debug( + "Got " + splittableBonds.length + " splittable bonds" + + " but only " + this.inclusiveMaxTreeDepth + " tree depth. 
" + + "This means only a maximum of " + this.inclusiveMaxTreeDepth + + " bonds can be split at once during a fragmentation step" + ); } logger.debug("Got " + splittableBonds.length + " splittable bonds"); - // Compute the number of possible bond subsets (excluding the empty set): 2^n - 1 + // Compute the number of possible bond subsets (excluding the empty set): + // 2^n - 1 int numberOfIterations = (1 << splittableBonds.length) - 1; // Store indices of splittable bonds for subset generation @@ -319,7 +393,9 @@ private void run(IAtomContainer atomContainer) throws CDKException { } // Split the molecule and retrieve the resulting fragments - IAtomContainer[] parts = splitBondsWithCopy(atomContainer, bondsToSplit); + IAtomContainer[] parts = splitBondsWithCopy( + atomContainer, bondsToSplit + ); // Process each fragment for (IAtomContainer partContainer : parts) { @@ -336,7 +412,8 @@ private void run(IAtomContainer atomContainer) throws CDKException { numberOfAtoms++; } - // Store the fragment if it meets the size requirement and is unique + // Store the fragment if it meets the size requirement and is + // unique if (numberOfAtoms >= minFragSize) { fragMap.putIfAbsent(tmpSmiles, partContainer); } @@ -345,57 +422,79 @@ private void run(IAtomContainer atomContainer) throws CDKException { } /** - * Detects and returns the bonds, which will be split by an exhaustive fragmentation. This method is especially useful - * to determine if it is even possible to split a specific molecule exhaustively. The number of fragments is 2^n - 1 with n - * being the number of splittable bonds. Therefore, it is impossible to entirely split a molecule with more than 31 splittable bonds. + * Detects and returns the bonds, which will be split by an exhaustive + * fragmentation. This method is especially useful to determine if it is + * even possible to split a specific molecule exhaustively. The number of + * fragments is 2^n - 1 with n being the number of splittable bonds. 
+ * It is impossible to generate all possible fragment combinations for a molecule + * with more than 31 splittable bonds, as this would exceed the maximum tree depth + * of 31 due to the combinatorial explosion. For molecules with more than 31 + * splittable bonds, the fragmentation will still occur, but it will be limited + * to a maximum of {@code inclusiveMaxTreeDepth} bonds per fragmentation step. * To mitigate this one can check this with this function, for example: *
      *     {@code
      *     ExhaustiveFragmenter exhFragmenter = new ExhaustiveFragmenter();
      *     if (exhFragmenter.getSplittableBonds(mol).size() > Integer.SIZE - 1) {
-     *         // handle the case, where it is impossible to entirely split the molecule
+     *         // handle the case, where it is impossible to entirely split the
+     *         // molecule
      *     }}
      * 
* * @param atomContainer the container which contains the molecule in question. * @return the bonds which would be split by the exhaustive fragmentation. */ - public static IBond[] getSplittableBonds(IAtomContainer atomContainer) { + public static Set getSplittableBonds(IAtomContainer atomContainer) { + if (atomContainer == null) { + throw new NullPointerException("The atom container must not be null"); + } + if (atomContainer.isEmpty()) { + throw new IllegalArgumentException("The atom container must contain " + + "an actual molecule"); + } + // do ring detection - RingSearch ringSearch= new RingSearch(atomContainer); + RingSearch ringSearch = new RingSearch(atomContainer); IAtomContainer allRingsContainer = ringSearch.ringFragments(); - // find the splitable bonds - ArrayList splitableBonds = new ArrayList<>(); + // find the splittable bonds + Set splittableBondSet = new HashSet<>( + atomContainer.getBondCount() / 3 + ); for (IBond bond : atomContainer.bonds()) { - boolean isInRing = false; - boolean isTerminal = false; - // lets see if it's in a ring - if (allRingsContainer.contains(bond)) isInRing = true; + // only single bonds are candidates for splitting + if (bond.getOrder() == IBond.Order.SINGLE) { + boolean isInRing = false; + boolean isTerminal = false; - // lets see if it is a terminal bond - for (IAtom atom : bond.atoms()) { - if (atomContainer.getConnectedBondsCount(atom) == 1) { - isTerminal = true; - break; + // lets see if it's in a ring + if (allRingsContainer.contains(bond)) isInRing = true; + + // lets see if it is a terminal bond + for (IAtom atom : bond.atoms()) { + if (atomContainer.getConnectedBondsCount(atom) == 1) { + isTerminal = true; + break; + } } - } - if (!(isInRing || isTerminal)) splitableBonds.add(bond); + if (!(isInRing || isTerminal)) splittableBondSet.add(bond); + } } - return splitableBonds.toArray(new IBond[0]); + return splittableBondSet; } /** - * Generates a subset from the given array `nums`, determined by the binary 
representation of `index`. - * Each bit in `index` indicates whether the corresponding element in `nums` is included in the subset. - * The order of elements does not matter (i.e., `[1, 2]` and `[2, 1]` are equivalent). + * Generates a subset from the given array `nums`, determined by the binary + * representation of `index`. Each bit in `index` indicates whether the + * corresponding element in `nums` is included in the subset. The order of + * elements does not matter (i.e., `[1, 2]` and `[2, 1]` are equivalent). * - *

The total number of possible subsets is (2^n) - 1, where `n` is the length of `nums`. - * Subsets are generated using bitwise operations, where each `1` bit in `index` selects - * the corresponding element from `nums`.

+ *

The total number of possible subsets is (2^n) - 1, where `n` is the + * length of `nums`. Subsets are generated using bitwise operations, where + * each `1` bit in `index` selects the corresponding element from `nums`.

* *

Example output for `nums = [1, 2, 3]`:

*
@@ -416,11 +515,12 @@ public static IBond[] getSplittableBonds(IAtomContainer atomContainer) {
      *      1        →   3   →  [1, 3]
      * 
* - * @param index An integer whose binary representation determines the subset elements. - * A `1` bit at position `j` means `nums[j]` is included. - * @param nums The array from which to generate subsets. - * Duplicate values in `nums` may result in duplicate subset entries. - * @return An array containing the subset corresponding to `index`. + * @param index An integer whose binary representation determines the subset + * elements. A `1` bit at position `j` means `nums[j]` is + * included. + * @param nums The array from which to generate subsets. Duplicate values + * in `nums` may result in duplicate subset entries. + * @return An array containing the subset corresponding to `index`. */ protected static int[] generateSubset(int index, int[] nums) { // Allocate subset array based on the number of 1-bits in index. @@ -448,17 +548,23 @@ protected static int[] generateSubset(int index, int[] nums) { /** * Add pseudo ("R") atoms to an atom in a molecule. * - * @param atom the atom to add the pseudo atoms to + * @param atom the atom to add the pseudo atoms to * @param rcount the number of pseudo atoms to add - * @param mol the molecule the atom belongs to + * @param mol the molecule the atom belongs to */ private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { for (int i = 0; i < rcount; i++) { - IPseudoAtom tmpRAtom = atom.getBuilder().newInstance(IPseudoAtom.class, "R"); + IPseudoAtom tmpRAtom = atom.getBuilder().newInstance( + IPseudoAtom.class, "R" + ); tmpRAtom.setAttachPointNum(1); tmpRAtom.setImplicitHydrogenCount(0); mol.addAtom(tmpRAtom); - mol.addBond(atom.getBuilder().newInstance(IBond.class, atom, tmpRAtom, IBond.Order.SINGLE)); + mol.addBond(atom.getBuilder().newInstance( + IBond.class, + atom, tmpRAtom, + IBond.Order.SINGLE + )); } } @@ -466,8 +572,10 @@ private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { * Creates a copy of an atom and adds it to the specified atom container. * * @param originalAtom The atom to be copied. 
- * @param atomContainer The destination container where the copied atom will be added. - * @return A new atom with the same properties as `originalAtom`, added to `atomContainer`. + * @param atomContainer The destination container where the copied atom will + * be added. + * @return A new atom with the same properties as `originalAtom`, added to + * `atomContainer`. */ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) { IAtom copiedAtom = atomContainer.newAtom(originalAtom.getAtomicNumber(), @@ -479,26 +587,50 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) return copiedAtom; } + private static void copyBond( + IAtom cpyCurrentAtom, + IAtom cpyNbor, + IBond origBond, + IAtomContainer atomContainer + ) { + IBond cpyBond = atomContainer.newBond( + cpyCurrentAtom, + cpyNbor, + origBond.getOrder()); + cpyBond.setStereo(origBond.getStereo()); + cpyBond.setIsAromatic(origBond.isAromatic()); + // Setting is in ring is possible here because we always detect rings + // in the process of detecting the splittable bonds. + cpyBond.setIsInRing(origBond.isInRing()); + } + /** - * Splits and saturates (if specified via {@link #saturationSetting}) a molecule into multiple fragments by removing the - * specified bonds and making copies of the resulting fragments. + * Splits and saturates (if specified via {@link #saturationSetting}) a + * molecule into multiple fragments by removing the specified bonds and + * making copies of the resulting fragments. * * @param origMol The molecule to be split. - * @param bondsToSplit The bonds that should be removed to create separate fragments. + * @param bondsToSplit The bonds that should be removed to create + * separate fragments. * @return An array of copied molecular fragments resulting from the split. 
*/ - private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bondsToSplit) { - Set bondsToSplitSet = new HashSet<>(bondsToSplit.length); + private IAtomContainer[] splitBondsWithCopy( + IAtomContainer origMol, + IBond[] bondsToSplit + ) { + Set bondsToSplitSet = new HashSet<>( + (int) Math.ceil(bondsToSplit.length / (double) 0.75f) + ); // for a faster lookup the hashset is used here. bondsToSplitSet.addAll(Arrays.asList(bondsToSplit)); boolean[] visitedOriginalAtoms = new boolean[origMol.getAtomCount()]; List fragmentList = new ArrayList<>(bondsToSplit.length + 1); - int copiedBonds = 0; for (int i = 0; i < origMol.getAtomCount(); i++) { IAtom currPotentialStartAtom = origMol.getAtom(i); if (!visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)]) { - IAtomContainer fragmentContainer = origMol.getBuilder().newInstance(IAtomContainer.class); + IAtomContainer fragmentContainer = + origMol.getBuilder().newInstance(IAtomContainer.class); Map origToCpyMap = new HashMap<>(); Deque dfsStack = new ArrayDeque<>(); // Store split counts specific to the atoms in the fragment being built @@ -522,24 +654,23 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond visitedOriginalAtoms[origMol.indexOf(origNbor)] = true; IAtom cpyNbor = copyAtom(origNbor, fragmentContainer); origToCpyMap.put(origNbor, cpyNbor); - IBond cpyBond = fragmentContainer.newBond(cpyCurrentAtom, cpyNbor, - origBond.getOrder()); - cpyBond.setStereo(origBond.getStereo()); - cpyBond.setIsAromatic(origBond.isAromatic()); - // Setting is in ring is possible here because we always detect rings - // in the process of detecting the splittable bonds. 
- cpyBond.setIsInRing(origBond.isInRing()); - // fragmentContainer.addBond(cpyBond); + copyBond( + cpyCurrentAtom, + cpyNbor, + origBond, + fragmentContainer + ); dfsStack.push(origNbor); } else { IAtom cpyNbor = origToCpyMap.get(origNbor); // Add bond only if not already present if (fragmentContainer.getBond(cpyCurrentAtom, cpyNbor) == null) { - IBond cpyBond = fragmentContainer.newBond(cpyCurrentAtom, cpyNbor, - origBond.getOrder()); - cpyBond.setStereo(origBond.getStereo()); - cpyBond.setIsAromatic(origBond.isAromatic()); - cpyBond.setIsInRing(origBond.isInRing()); + copyBond( + cpyCurrentAtom, + cpyNbor, + origBond, + fragmentContainer + ); } } } else { @@ -560,14 +691,17 @@ private IAtomContainer[] splitBondsWithCopy(IAtomContainer origMol, IBond[] bond switch (this.saturationSetting) { case HYDROGEN_SATURATED_FRAGMENTS: Integer currImplHCount = atom.getImplicitHydrogenCount(); - int newImplHCount = (currImplHCount == null ? 0 : currImplHCount) + bondsCutCount; + int newImplHCount = + (currImplHCount == null ? 
0 : currImplHCount) + bondsCutCount; atom.setImplicitHydrogenCount(newImplHCount); break; case R_SATURATED_FRAGMENTS: addRAtoms(atom, bondsCutCount, fragmentContainer); break; default: - throw new UnsupportedOperationException("no treatment defined yet for this new enum constant"); + throw new UnsupportedOperationException( + "no treatment defined yet for this new enum constant" + ); } } } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index ce4c3dcecf7..21be1051d37 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -33,6 +33,9 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.fail; /** * Test exhaustive fragmenter. 
@@ -47,7 +50,7 @@ class ExhaustiveFragmenterTest extends CDKTestCase { private static ExhaustiveFragmenter fragmenterSaturated; private static ExhaustiveFragmenter fragmenterUnsaturated; private static ExhaustiveFragmenter fragmenterRestSaturated; - private static SmilesParser smilesParser; + private static SmilesParser smilesParser; @BeforeAll static void setup() { @@ -95,7 +98,7 @@ void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); - Assertions.assertArrayEquals(frags, new String[]{"[CH]1CCCCC1"}); + Assertions.assertArrayEquals(new String[]{"[CH]1CCCCC1"}, frags); } /** @@ -108,7 +111,7 @@ void testEF4Unsaturated() throws Exception { fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertArrayEquals(frags, new String[]{"[c]1ccccc1"}); + Assertions.assertArrayEquals(new String[]{"[c]1ccccc1"}, frags); } /** @@ -121,11 +124,11 @@ void testEF5Unsaturated() throws Exception { fragmenterUnsaturated.generateFragments(mol); String[] frags = fragmenterUnsaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ "[CH2]c1ccccc1", "[c]1ccccc1" - }) + }, frags ); Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterUnsaturated.getFragmentsAsContainers().length); @@ -172,12 +175,12 @@ void testEF7Unsaturated() throws Exception { Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); Assertions.assertEquals(26, fragmenterUnsaturated.getFragmentsAsContainers().length); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ "[c]1ccccc1", "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", "[CH2]C1CCC([CH2])(c2ccccc2)C1" - }) + 
}, frags ); } @@ -216,7 +219,7 @@ void testEF3Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertArrayEquals(frags, new String[]{"C1CCCCC1"}); + Assertions.assertArrayEquals(new String[]{"C1CCCCC1"}, frags); } /** @@ -229,7 +232,7 @@ void testEF4Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertArrayEquals(frags, new String[]{"c1ccccc1"}); + Assertions.assertArrayEquals(new String[]{"c1ccccc1"}, frags); } /** @@ -243,11 +246,11 @@ void testEF5Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ "c1ccc(cc1)C", "c1ccccc1" - }) + }, frags ); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); @@ -263,7 +266,7 @@ void testEF6Saturated() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertArrayEquals(frags, new String[]{"c1ccccc1"}); + Assertions.assertArrayEquals(new String[]{"c1ccccc1"}, frags); Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(1, fragmenterSaturated.getFragmentsAsContainers().length); @@ -286,12 +289,12 @@ void testEF7Saturated() throws Exception { Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ 
"c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2" - }) + }, frags ); } @@ -307,7 +310,7 @@ void testEF3RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertArrayEquals(frags, new String[]{"*C1CCCCC1"}); + Assertions.assertArrayEquals(new String[]{"*C1CCCCC1"}, frags); } /** @@ -320,11 +323,11 @@ void testEF5RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ "*c1ccccc1", "*Cc1ccccc1" - }) + }, frags ); Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -339,7 +342,7 @@ void testEF6RestSaturated() throws Exception { fragmenterRestSaturated.generateFragments(mol); String[] frags = fragmenterRestSaturated.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertArrayEquals(frags, new String[]{"*c1ccccc1"}); + Assertions.assertArrayEquals(new String[]{"*c1ccccc1"}, frags); Assertions.assertEquals(1, fragmenterRestSaturated.getFragmentsAsContainers().length); } @@ -356,12 +359,12 @@ void testEF7RestSaturated() throws Exception { Assertions.assertNotNull(frags); // Needs to have the same number of fragments as the unsaturated version. 
Assertions.assertEquals(26, fragmenterRestSaturated.getFragmentsAsContainers().length); - Assertions.assertTrue( - hasItems(frags, new String[] { - "*c1ccccc1", - "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", - "*C1CCC(*)(c2ccccc2)C1" - }) + assertFragsContain( + new String[]{ + "*c1ccccc1", + "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", + "*C1CCC(*)(c2ccccc2)C1" + }, frags ); } @@ -397,11 +400,11 @@ void testMinSizeLowered() throws Exception { String[] frags = localFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(2, frags.length); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ "C1CCCCC1", "C1CCCC1" - }) + }, frags ); } @@ -417,9 +420,11 @@ void testEqualityOfSmilesAndContainers() throws Exception { String[] smilesFrags = fragmenterSaturated.getFragments(); IAtomContainer[] containerFrags = fragmenterSaturated.getFragmentsAsContainers(); for (IAtomContainer frag : containerFrags) { - Assertions.assertTrue(hasItems(smilesFrags, new String[] { - smilesGenerator.create(frag) - })); + assertFragsContain( + new String[]{ + smilesGenerator.create(frag) + }, smilesFrags + ); } } @@ -430,8 +435,8 @@ void testEqualityOfSmilesAndContainers() throws Exception { @Test void testGetSplittableBondsLinearMolecule() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); // Propane - IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertEquals(0, splittableBonds.length); + Set splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertTrue(splittableBonds.isEmpty()); } /** @@ -441,8 +446,8 @@ void testGetSplittableBondsLinearMolecule() throws Exception { @Test void testGetSplittableBondsCyclicMolecule() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); // Cyclopentane - IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertEquals(0, splittableBonds.length); + Set splittableBonds = 
ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertTrue(splittableBonds.isEmpty()); } /** @@ -452,8 +457,8 @@ void testGetSplittableBondsCyclicMolecule() throws Exception { @Test void testGetSplittableBondsBenzeneWithSideChain() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); // Ethylbenzene - IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertEquals(1, splittableBonds.length); + Set splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(1, splittableBonds.size()); } /** @@ -463,8 +468,8 @@ void testGetSplittableBondsBenzeneWithSideChain() throws Exception { @Test void testGetSplittableBondsBiphenyl() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); // Biphenyl - IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertEquals(1, splittableBonds.length); + Set splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(1, splittableBonds.size()); } /** @@ -512,11 +517,11 @@ void testCustomSmilesGenerator() throws Exception { String[] frags = customFragmenter.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertTrue( - hasItems(frags, new String[] { + assertFragsContain( + new String[]{ "C=1C=CC=CC1", "C=1C=CC(=CC1)C" - }) + }, frags ); Assertions.assertEquals(2, frags.length); } @@ -599,11 +604,11 @@ void testSetExclusiveMaxTreeDepth() throws Exception { String[] fragsDepth2 = localFragmenter.getFragments(); Assertions.assertEquals(4, fragsDepth2.length, "Expected 4 fragments when inclusiveMaxTreeDepth is 1 (allows up to 1 cut)"); - Assertions.assertTrue( - hasItems(fragsDepth2, new String[] { + assertFragsContain( + new String[]{ "CCCC", "c1ccc(cc1)CCCC" - }) + }, fragsDepth2 ); localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); @@ -612,12 
+617,12 @@ void testSetExclusiveMaxTreeDepth() throws Exception { String[] fragsDepth3 = localFragmenter.getFragments(); Assertions.assertEquals(10, fragsDepth3.length, "Expected 10 fragments when inclusiveMaxTreeDepth is 2 (allows up to 2 cuts)"); - Assertions.assertTrue( - hasItems(fragsDepth3, new String[] { + assertFragsContain( + new String[]{ "CCCC", "c1ccc(cc1)CCCC", "c1ccccc1" - }) + }, fragsDepth3 ); localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); @@ -626,36 +631,34 @@ void testSetExclusiveMaxTreeDepth() throws Exception { String[] fragsDepth4 = localFragmenter.getFragments(); Assertions.assertEquals(10, fragsDepth4.length, "Expected 10 fragments when inclusiveMaxTreeDepth is 3 (allows up to 3 cuts), same as max 2 cuts"); - Assertions.assertTrue( - hasItems(fragsDepth4, new String[]{ + assertFragsContain( + new String[]{ "CCCC", "c1ccc(cc1)CCCC", "c1ccccc1" - }) + }, fragsDepth4 ); } // --- Complementary Molecule Tests --- /** - * Tests correct functional group identification on an example molecule with - * a disconnected structure. - * This was not allowed in a previous version. + * Tests exhaustive fragmentation on an example molecule with a disconnected + * structure. 
*/ @Test void testDisconnectedMolecules() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"); //Sodium edetate fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertTrue( - hasItems(frags, new String[]{ - "O=C([O-])CNCCNCC(=O)[O-]", - "O=C([O-])CNCC(=O)[O-]", - "O=C([O-])CN(C)CCN(C)C", - "O=C([O-])CNCCNC", - "O=C([O-])CN(CC(=O)[O-])CC" - } - ) + assertFragsContain( + new String[]{ + "O=C([O-])CNCCNCC(=O)[O-]", + "O=C([O-])CNCC(=O)[O-]", + "O=C([O-])CN(C)CCN(C)C", + "O=C([O-])CNCCNC", + "O=C([O-])CN(CC(=O)[O-])CC" + }, frags ); } @@ -672,71 +675,108 @@ void testBigMolecule1() throws Exception { fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); - Assertions.assertTrue( - hasItems( - frags, new String[]{ - "O=C(NCC)CCC", - "NC=1C=CC=CC1", - "O=C(N)CCCS(=O)C", - "FC=1C=CC(=CC1)C(N)C" - } - ) + assertFragsContain( + new String[]{ + "O=C(NCC)CCC", + "NC=1C=CC=CC1", + "O=C(N)CCCS(=O)C", + "FC=1C=CC(=CC1)C(N)C" + }, frags ); } /** - * Testing a bigger molecule + * Testing a molecule with 31 splittable bonds (takes extremely long, maybe days) + * + * @throws Exception if anything goes wrong + */ + // @Test + void testIndexBigMolecule2() throws Exception { + SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + + "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + + "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + assertFragsContain( + new String[]{ + "O=CCNC(=O)c1ccccc1", + "O=C(N)CNC(=O)c1ccccc1N", + "O=C(NC)c1ccccc1N", + "O=C(NCCC)c1ccccc1N", + 
"O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C", + "O=C(N)CCCS(=O)C", + "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C" + }, frags + ); + } + + /** + * Testing the test molecule of the Java doc comment for the + * {@link ExhaustiveFragmenter} * * @throws Exception if anything goes wrong */ @Test void testTestMoleculeUnsaturated() throws Exception { + + // test with default settings SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); - IAtomContainer mol = smiPar.parseSmiles("C1CCCCC1c1ccccc1"); //PubChem CID - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); - Assertions.assertTrue(hasItems(frags, new String[] { - "[CH]1CCCCC1", - "[c]1ccccc1" - })); - } - -// /** -// * Testing a molecule with 31 splittable bonds (takes extremely long, maybe days) -// * -// * @throws Exception if anything goes wrong -// */ -// @Test -// void testIndexBigMolecule2() throws Exception { -// SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); -// IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + -// "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + -// "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 -// System.out.println(fragmenterSaturated.getSplitableBonds(mol).length); -// fragmenterSaturated.generateFragments(mol); -// String[] frags = fragmenterSaturated.getFragments(); -// Assertions.assertEquals( -// Arrays.asList(frags), -// hasItems( -// "O=CCNC(=O)c1ccccc1", -// "O=C(N)CNC(=O)c1ccccc1N", -// "O=C(NC)c1ccccc1N", -// "O=C(NCCC)c1ccccc1N", -// "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C", -// "O=C(N)CCCS(=O)C", -// "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C" -// ) -// ); -// } + IAtomContainer mol = smiPar.parseSmiles("C1CCCC1C1=CC=CC=C1"); + ExhaustiveFragmenter localDefaultFragmenter = new ExhaustiveFragmenter(); + localDefaultFragmenter.generateFragments(mol.clone()); + String[] 
fragsDefault = localDefaultFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "[C]1=CC=CC=C1" + }, fragsDefault + ); + + // test with minimal fragment size of 5 and hydrogen saturation + + ExhaustiveFragmenter localCustomFragmenter = new ExhaustiveFragmenter( + 5, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS + ); + localCustomFragmenter.generateFragments(mol.clone()); + String[] fragsCustom = localCustomFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "C1CCCC1", + "C1=CC=CC=C1", + }, + fragsCustom + ); + } + // --utility -- - static boolean hasItems(String[] allFragments, String[] requiredFragments) { - Set allFragmentsSet = new HashSet<>(Arrays.asList(allFragments)); - for (String frag : requiredFragments) { - if (!allFragmentsSet.contains(frag)) { - return false; + + private static void assertFragsContain( + T[] expected, + T[] actual + ) { + + Set expectedSet = new HashSet<>(Arrays.asList(expected)); + Set actualSet = new HashSet<>(Arrays.asList(actual)); + + Set missing = expectedSet.stream() + .filter(item -> !actualSet.contains(item)) + .collect(Collectors.toSet()); + + Set extra = actualSet.stream() + .filter(item -> !expectedSet.contains(item)) + .collect(Collectors.toSet()); + + if (!missing.isEmpty()) { + StringBuilder failureMessage = new StringBuilder(); + + failureMessage.append("Expected but not found: ").append(missing).append("\n"); + if (!extra.isEmpty()) { + failureMessage.append("Found but not expected: ").append(extra).append("\n"); } + + fail(failureMessage.toString()); } - return true; } } From 0890b0b874fff1e3fd2dec44ce041784450abd5b Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 20 Aug 2025 17:40:41 +0200 Subject: [PATCH 28/42] implemented test for double bonds, cleaned up code and tests --- .../cdk/fragment/ExhaustiveFragmenter.java | 29 ++- .../fragment/ExhaustiveFragmenterTest.java | 230 +++++++++++------- 2 files changed, 160 insertions(+), 99 deletions(-) diff --git 
a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index b4f4bb94a38..ff93e1d1f27 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -158,7 +158,7 @@ public enum Saturation { ); private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = Integer.SIZE - 1; - private final Map fragMap; + private Map fragMap; private final SmilesGenerator smilesGenerator; private int inclusiveMaxTreeDepth; private int minFragSize; @@ -253,7 +253,7 @@ public ExhaustiveFragmenter( this.smilesGenerator = smilesGenerator; this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); this.setMinimumFragmentSize(minFragSize); - this.fragMap = new HashMap<>(); + this.fragMap = null; } /** @@ -320,7 +320,9 @@ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { */ @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { - this.fragMap.clear(); + if (this.fragMap != null) { + this.fragMap.clear(); + } run(atomContainer); } @@ -344,16 +346,17 @@ private void run(IAtomContainer atomContainer) throws CDKException { if (atomContainer.getBondCount() < 3 || atomContainer.getAtomCount() < this.minFragSize || atomContainer.isEmpty()) { + this.fragMap = new HashMap<>(0); return; } // Retrieve bonds that are eligible for splitting - IBond[] splittableBonds = - getSplittableBonds(atomContainer).toArray(new IBond[0]); + IBond[] splittableBonds = getSplittableBonds(atomContainer); // If no splittable bonds are found, return early if (splittableBonds.length == 0) { logger.debug("no splittable bonds found"); + this.fragMap = new HashMap<>(0); return; } if (splittableBonds.length > this.inclusiveMaxTreeDepth) { @@ -376,6 +379,8 @@ private void run(IAtomContainer atomContainer) throws CDKException { 
splittableBondIndices[i] = splittableBonds[i].getIndex(); } + this.fragMap = new HashMap<>(numberOfIterations); + // Iterate over all non-empty subsets of splittable bonds for (int i = 1; i <= numberOfIterations; i++) { int[] subset = generateSubset(i, splittableBondIndices); @@ -444,7 +449,7 @@ private void run(IAtomContainer atomContainer) throws CDKException { * @param atomContainer the container which contains the molecule in question. * @return the bonds which would be split by the exhaustive fragmentation. */ - public static Set getSplittableBonds(IAtomContainer atomContainer) { + public static IBond[] getSplittableBonds(IAtomContainer atomContainer) { if (atomContainer == null) { throw new NullPointerException("The atom container must not be null"); } @@ -458,7 +463,7 @@ public static Set getSplittableBonds(IAtomContainer atomContainer) { IAtomContainer allRingsContainer = ringSearch.ringFragments(); // find the splittable bonds - Set splittableBondSet = new HashSet<>( + ArrayList splittableBondSet = new ArrayList<>( atomContainer.getBondCount() / 3 ); @@ -483,7 +488,7 @@ public static Set getSplittableBonds(IAtomContainer atomContainer) { if (!(isInRing || isTerminal)) splittableBondSet.add(bond); } } - return splittableBondSet; + return splittableBondSet.toArray(new IBond[0]); } /** @@ -718,6 +723,10 @@ private IAtomContainer[] splitBondsWithCopy( */ @Override public String[] getFragments() { + if (fragMap == null) { + throw new NullPointerException("It is mandatory to generate " + + "fragments before getting them"); + } return (new ArrayList<>(fragMap.keySet())).toArray(new String[0]); } @@ -728,6 +737,10 @@ public String[] getFragments() { */ @Override public IAtomContainer[] getFragmentsAsContainers() { + if (fragMap == null) { + throw new NullPointerException("It is mandatory to generate " + + "fragments before getting them"); + } return (new ArrayList<>(fragMap.values())).toArray(new IAtomContainer[0]); } diff --git 
a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 21be1051d37..cf5d4ab3b76 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -22,6 +22,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; import org.openscience.cdk.silent.SilentChemObjectBuilder; @@ -46,23 +47,30 @@ * @see ExhaustiveFragmenter */ class ExhaustiveFragmenterTest extends CDKTestCase { - - private static ExhaustiveFragmenter fragmenterSaturated; - private static ExhaustiveFragmenter fragmenterUnsaturated; - private static ExhaustiveFragmenter fragmenterRestSaturated; private static SmilesParser smilesParser; @BeforeAll static void setup() { - fragmenterSaturated = new ExhaustiveFragmenter(); - fragmenterSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - fragmenterUnsaturated = new ExhaustiveFragmenter(); - fragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); - fragmenterRestSaturated = new ExhaustiveFragmenter(); - fragmenterRestSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); } + private ExhaustiveFragmenter setupSaturatedFragmenter() { + ExhaustiveFragmenter exhaustiveFragmenterSaturated = new ExhaustiveFragmenter(); + exhaustiveFragmenterSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + return exhaustiveFragmenterSaturated; + } + + private ExhaustiveFragmenter 
setupUnsaturatedFragmenter() { + ExhaustiveFragmenter exhaustiveFragmenterUnsaturated = new ExhaustiveFragmenter(); + exhaustiveFragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + return exhaustiveFragmenterUnsaturated; + } + + private ExhaustiveFragmenter setupRSaturatedFragmenter() { + ExhaustiveFragmenter exhaustiveFragmenterRSaturated = new ExhaustiveFragmenter(); + exhaustiveFragmenterRSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + return exhaustiveFragmenterRSaturated; + } // --- Unsaturated Fragments Tests --- /** @@ -72,8 +80,9 @@ static void setup() { @Test void testEF1Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); } @@ -84,8 +93,9 @@ void testEF1Unsaturated() throws Exception { @Test void testEF2Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); } @@ -96,8 +106,9 @@ void testEF2Unsaturated() throws Exception { @Test void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + 
unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertArrayEquals(new String[]{"[CH]1CCCCC1"}, frags); } @@ -108,8 +119,9 @@ void testEF3Unsaturated() throws Exception { @Test void testEF4Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertArrayEquals(new String[]{"[c]1ccccc1"}, frags); } @@ -121,8 +133,9 @@ void testEF4Unsaturated() throws Exception { @Test void testEF5Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); assertFragsContain( new String[]{ @@ -130,8 +143,8 @@ void testEF5Unsaturated() throws Exception { "[c]1ccccc1" }, frags ); - Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); - Assertions.assertEquals(2, fragmenterUnsaturated.getFragmentsAsContainers().length); + Assertions.assertNotNull(unsaturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(2, unsaturatedFragmenter.getFragmentsAsContainers().length); } /** @@ -141,13 +154,14 @@ void testEF5Unsaturated() throws Exception { @Test void testEF6Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + 
ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertArrayEquals(frags, new String[]{"[c]1ccccc1"}); + Assertions.assertArrayEquals(new String[]{"[c]1ccccc1"}, frags); - Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); - Assertions.assertEquals(1, fragmenterUnsaturated.getFragmentsAsContainers().length); + Assertions.assertNotNull(unsaturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(1, unsaturatedFragmenter.getFragmentsAsContainers().length); } @@ -162,8 +176,9 @@ void testEF6Unsaturated() throws Exception { @Test void testEF7Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - fragmenterUnsaturated.generateFragments(mol); - String[] frags = fragmenterUnsaturated.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); // There is one additional fragment in comparison to the saturated version because there are following fragments: // [C]1CCC([CH2])C1 @@ -172,8 +187,8 @@ void testEF7Unsaturated() throws Exception { // fragments would show up as one if saturated. 
Assertions.assertEquals(26, frags.length); - Assertions.assertNotNull(fragmenterUnsaturated.getFragmentsAsContainers()); - Assertions.assertEquals(26, fragmenterUnsaturated.getFragmentsAsContainers().length); + Assertions.assertNotNull(unsaturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(26, unsaturatedFragmenter.getFragmentsAsContainers().length); assertFragsContain( new String[]{ @@ -193,6 +208,7 @@ void testEF7Unsaturated() throws Exception { @Test void testEF1Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); + ExhaustiveFragmenter fragmenterSaturated = setupSaturatedFragmenter(); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertEquals(0, frags.length); @@ -205,8 +221,9 @@ void testEF1Saturated() throws Exception { @Test void testEF2Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); } @@ -217,8 +234,9 @@ void testEF2Saturated() throws Exception { @Test void testEF3Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertArrayEquals(new String[]{"C1CCCCC1"}, frags); } @@ -229,8 +247,9 @@ void testEF3Saturated() throws Exception { @Test void testEF4Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - 
fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertArrayEquals(new String[]{"c1ccccc1"}, frags); } @@ -243,8 +262,9 @@ void testEF4Saturated() throws Exception { @Test void testEF5Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); assertFragsContain( new String[]{ @@ -252,8 +272,8 @@ void testEF5Saturated() throws Exception { "c1ccccc1" }, frags ); - Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); - Assertions.assertEquals(2, fragmenterSaturated.getFragmentsAsContainers().length); + Assertions.assertNotNull(saturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(2, saturatedFragmenter.getFragmentsAsContainers().length); } /** @@ -263,13 +283,14 @@ void testEF5Saturated() throws Exception { @Test void testEF6Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertArrayEquals(new String[]{"c1ccccc1"}, frags); - Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); - Assertions.assertEquals(1, 
fragmenterSaturated.getFragmentsAsContainers().length); + Assertions.assertNotNull(saturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(1, saturatedFragmenter.getFragmentsAsContainers().length); } /** @@ -281,13 +302,14 @@ void testEF6Saturated() throws Exception { @Test void testEF7Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(25, frags.length); - Assertions.assertNotNull(fragmenterSaturated.getFragmentsAsContainers()); - Assertions.assertEquals(25, fragmenterSaturated.getFragmentsAsContainers().length); + Assertions.assertNotNull(saturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(25, saturatedFragmenter.getFragmentsAsContainers().length); assertFragsContain( new String[]{ @@ -307,8 +329,9 @@ void testEF7Saturated() throws Exception { @Test void testEF3RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - fragmenterRestSaturated.generateFragments(mol); - String[] frags = fragmenterRestSaturated.getFragments(); + ExhaustiveFragmenter rSaturatedFragmenter = setupRSaturatedFragmenter(); + rSaturatedFragmenter.generateFragments(mol); + String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertArrayEquals(new String[]{"*C1CCCCC1"}, frags); } @@ -320,8 +343,9 @@ void testEF3RestSaturated() throws Exception { @Test void testEF5RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - fragmenterRestSaturated.generateFragments(mol); - String[] frags = fragmenterRestSaturated.getFragments(); + 
ExhaustiveFragmenter rSaturatedFragmenter = setupRSaturatedFragmenter(); + rSaturatedFragmenter.generateFragments(mol); + String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); assertFragsContain( new String[]{ @@ -329,7 +353,7 @@ void testEF5RestSaturated() throws Exception { "*Cc1ccccc1" }, frags ); - Assertions.assertEquals(2, fragmenterRestSaturated.getFragmentsAsContainers().length); + Assertions.assertEquals(2, rSaturatedFragmenter.getFragmentsAsContainers().length); } /** @@ -339,11 +363,12 @@ void testEF5RestSaturated() throws Exception { @Test void testEF6RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - fragmenterRestSaturated.generateFragments(mol); - String[] frags = fragmenterRestSaturated.getFragments(); + ExhaustiveFragmenter rSaturatedFragmenter = setupRSaturatedFragmenter(); + rSaturatedFragmenter.generateFragments(mol); + String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertArrayEquals(new String[]{"*c1ccccc1"}, frags); - Assertions.assertEquals(1, fragmenterRestSaturated.getFragmentsAsContainers().length); + Assertions.assertEquals(1, rSaturatedFragmenter.getFragmentsAsContainers().length); } /** @@ -354,11 +379,12 @@ void testEF6RestSaturated() throws Exception { @Test void testEF7RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - fragmenterRestSaturated.generateFragments(mol); - String[] frags = fragmenterRestSaturated.getFragments(); + ExhaustiveFragmenter fragmenterRSaturated = setupRSaturatedFragmenter(); + fragmenterRSaturated.generateFragments(mol); + String[] frags = fragmenterRSaturated.getFragments(); Assertions.assertNotNull(frags); // Needs to have the same number of fragments as the unsaturated version. 
- Assertions.assertEquals(26, fragmenterRestSaturated.getFragmentsAsContainers().length); + Assertions.assertEquals(26, fragmenterRSaturated.getFragmentsAsContainers().length); assertFragsContain( new String[]{ "*c1ccccc1", @@ -378,6 +404,7 @@ void testEF7RestSaturated() throws Exception { @Test void testMinSize() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); + ExhaustiveFragmenter fragmenterSaturated = setupSaturatedFragmenter(); fragmenterSaturated.setMinimumFragmentSize(6); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -393,11 +420,10 @@ void testMinSize() throws Exception { @Test void testMinSizeLowered() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - ExhaustiveFragmenter localFragmenter = new ExhaustiveFragmenter(); - localFragmenter.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - localFragmenter.setMinimumFragmentSize(5); - localFragmenter.generateFragments(mol); - String[] frags = localFragmenter.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.setMinimumFragmentSize(5); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(2, frags.length); assertFragsContain( @@ -416,9 +442,10 @@ void testMinSizeLowered() throws Exception { void testEqualityOfSmilesAndContainers() throws Exception { SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); // Phenylalanine - fragmenterSaturated.generateFragments(mol); - String[] smilesFrags = fragmenterSaturated.getFragments(); - IAtomContainer[] containerFrags = fragmenterSaturated.getFragmentsAsContainers(); + ExhaustiveFragmenter saturatedFragmenter = 
setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] smilesFrags = saturatedFragmenter.getFragments(); + IAtomContainer[] containerFrags = saturatedFragmenter.getFragmentsAsContainers(); for (IAtomContainer frag : containerFrags) { assertFragsContain( new String[]{ @@ -435,8 +462,8 @@ void testEqualityOfSmilesAndContainers() throws Exception { @Test void testGetSplittableBondsLinearMolecule() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); // Propane - Set splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertTrue(splittableBonds.isEmpty()); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(0, splittableBonds.length); } /** @@ -446,8 +473,8 @@ void testGetSplittableBondsLinearMolecule() throws Exception { @Test void testGetSplittableBondsCyclicMolecule() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); // Cyclopentane - Set splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertTrue(splittableBonds.isEmpty()); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(0, splittableBonds.length); } /** @@ -457,8 +484,8 @@ void testGetSplittableBondsCyclicMolecule() throws Exception { @Test void testGetSplittableBondsBenzeneWithSideChain() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); // Ethylbenzene - Set splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertEquals(1, splittableBonds.size()); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(1, splittableBonds.length); } /** @@ -468,8 +495,8 @@ void testGetSplittableBondsBenzeneWithSideChain() throws Exception { @Test void testGetSplittableBondsBiphenyl() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); // Biphenyl - Set splittableBonds = 
ExhaustiveFragmenter.getSplittableBonds(mol); - Assertions.assertEquals(1, splittableBonds.size()); + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(1, splittableBonds.length); } /** @@ -640,6 +667,22 @@ void testSetExclusiveMaxTreeDepth() throws Exception { ); } + /** + * Tests that double bonds will not be split. + */ + @Test + void testDoubleBondIssue() throws CDKException { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1=CCC"); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "C=C1CCCCC1", + }, frags + ); + } + // --- Complementary Molecule Tests --- /** @@ -648,9 +691,12 @@ void testSetExclusiveMaxTreeDepth() throws Exception { */ @Test void testDisconnectedMolecules() throws Exception { - IAtomContainer mol = smilesParser.parseSmiles("C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]"); //Sodium edetate - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + IAtomContainer mol = smilesParser.parseSmiles( + "C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]" + ); //Sodium edetate + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( new String[]{ "O=C([O-])CNCCNCC(=O)[O-]", @@ -673,8 +719,9 @@ void testBigMolecule1() throws Exception { IAtomContainer mol = smiPar.parseSmiles("CC1=C(C(=CC=C1)NC2=CC=CC=C2C" + "(=O)NC(CCS(=O)C)C(=O)NC(C)C3=CC=C(C=C3)F)C"); //PubChem CID 118705975 - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] 
frags = saturatedFragmenter.getFragments(); assertFragsContain( new String[]{ "O=C(NCC)CCC", @@ -691,13 +738,14 @@ void testBigMolecule1() throws Exception { * @throws Exception if anything goes wrong */ // @Test - void testIndexBigMolecule2() throws Exception { + void testMaxSplittableBonds() throws Exception { SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 - fragmenterSaturated.generateFragments(mol); - String[] frags = fragmenterSaturated.getFragments(); + ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( new String[]{ "O=CCNC(=O)c1ccccc1", @@ -718,7 +766,7 @@ void testIndexBigMolecule2() throws Exception { * @throws Exception if anything goes wrong */ @Test - void testTestMoleculeUnsaturated() throws Exception { + void testExampleUsage() throws Exception { // test with default settings SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); @@ -753,18 +801,18 @@ void testTestMoleculeUnsaturated() throws Exception { // --utility -- private static void assertFragsContain( - T[] expected, - T[] actual + String[] expected, + String[] actual ) { - Set expectedSet = new HashSet<>(Arrays.asList(expected)); - Set actualSet = new HashSet<>(Arrays.asList(actual)); + Set expectedSet = new HashSet<>(Arrays.asList(expected)); + Set actualSet = new HashSet<>(Arrays.asList(actual)); - Set missing = expectedSet.stream() + Set missing = expectedSet.stream() .filter(item -> !actualSet.contains(item)) .collect(Collectors.toSet()); - Set extra = actualSet.stream() + Set extra = actualSet.stream() .filter(item -> 
!expectedSet.contains(item)) .collect(Collectors.toSet()); From c7084a5e8825dffbb55e35b9fd2b47fc4029ab67 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 21 Aug 2025 18:06:30 +0200 Subject: [PATCH 29/42] implemented stereo chemistry copying, a respective test and added missing documentation --- .../cdk/fragment/ExhaustiveFragmenter.java | 91 ++++++++++++++++--- .../fragment/ExhaustiveFragmenterTest.java | 42 ++++++++- 2 files changed, 117 insertions(+), 16 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index ff93e1d1f27..5d4e4bd49b6 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -27,7 +27,9 @@ import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.interfaces.IChemObject; import org.openscience.cdk.interfaces.IPseudoAtom; +import org.openscience.cdk.interfaces.IStereoElement; import org.openscience.cdk.ringsearch.RingSearch; import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; @@ -173,6 +175,7 @@ public enum Saturation { *
  • Unsaturated fragments
  • *
  • Default {@link SmilesGenerator} * ({@code SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols})
  • + *
  • {@link ExhaustiveFragmenter#inclusiveMaxTreeDepth} of 31
  • * */ public ExhaustiveFragmenter() { @@ -186,7 +189,8 @@ public ExhaustiveFragmenter() { /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment - * size and saturation setting. Uses the default {@link SmilesGenerator}. + * size and saturation setting. Uses the default {@link SmilesGenerator} and + * default {@link ExhaustiveFragmenter#inclusiveMaxTreeDepth} of 31 * * @param minFragSize Minimum number of atoms in a valid fragment * (excluding implicit hydrogen). @@ -205,7 +209,8 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment * size. Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. - * Uses the default {@link SmilesGenerator}. + * Uses the default {@link SmilesGenerator} and the default + * {@link ExhaustiveFragmenter#inclusiveMaxTreeDepth} of 31 * * @param minFragSize Minimum number of atoms in a valid fragment * (excluding implicit hydrogen). @@ -221,7 +226,8 @@ public ExhaustiveFragmenter(int minFragSize) { /** * Constructs an ExhaustiveFragmenter with a user-provided {@link SmilesGenerator}, - * user-defined minimum fragment size, and saturation setting. + * user-defined minimum fragment size, inclusive max tree depth and + * saturation setting. 
* * @param smilesGenerator The {@link SmilesGenerator} instance to use for * creating SMILES strings @@ -240,13 +246,13 @@ public ExhaustiveFragmenter( int inclusiveMaxTreeDepth ) { if (saturationSetting == null) { - throw new IllegalArgumentException( + throw new NullPointerException( "The given SaturationSetting can not be null" ); } this.saturationSetting = saturationSetting; if (smilesGenerator == null) { - throw new IllegalArgumentException( + throw new NullPointerException( "The given SmilesGenerator can not be null" ); } @@ -592,7 +598,19 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) return copiedAtom; } - private static void copyBond( + /** + * Creates a copy of a bond and adds it to the specified atom container. + * + * @param cpyCurrentAtom Atom in the new atom container that is connected by + * the bond to be copied. + * @param cpyNbor The neighbour of `cpyCurrentAtom` that + * is connected by the bond one wants to copy. + * @param origBond The bond in the original molecule. + * @param atomContainer The new atom container to which the bond is to + * be copied. + * @return The bond in the new atom container. + */ + private static IBond copyBond( IAtom cpyCurrentAtom, IAtom cpyNbor, IBond origBond, @@ -607,6 +625,7 @@ private static void copyBond( // Setting is in ring is possible here because we always detect rings // in the process of detecting the splittable bonds. cpyBond.setIsInRing(origBond.isInRing()); + return cpyBond; } /** @@ -624,7 +643,7 @@ private IAtomContainer[] splitBondsWithCopy( IBond[] bondsToSplit ) { Set bondsToSplitSet = new HashSet<>( - (int) Math.ceil(bondsToSplit.length / (double) 0.75f) + (int) Math.ceil(bondsToSplit.length / 0.75) ); // for a faster lookup the hashset is used here. 
bondsToSplitSet.addAll(Arrays.asList(bondsToSplit)); @@ -636,7 +655,12 @@ private IAtomContainer[] splitBondsWithCopy( if (!visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)]) { IAtomContainer fragmentContainer = origMol.getBuilder().newInstance(IAtomContainer.class); - Map origToCpyMap = new HashMap<>(); + Map origToCpyAtomMap = new HashMap<>( + (int) Math.ceil(origMol.getAtomCount() / 0.75) + ); + Map origToCpyBondMap = new HashMap<>( + (int) Math.ceil(origMol.getBondCount() / 0.75) + ); Deque dfsStack = new ArrayDeque<>(); // Store split counts specific to the atoms in the fragment being built Map splitCountsCpyAtoms = new HashMap<>(); @@ -644,38 +668,40 @@ private IAtomContainer[] splitBondsWithCopy( dfsStack.push(currPotentialStartAtom); visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)] = true; IAtom cpyStartAtom = copyAtom(currPotentialStartAtom, fragmentContainer); - origToCpyMap.put(currPotentialStartAtom, cpyStartAtom); + origToCpyAtomMap.put(currPotentialStartAtom, cpyStartAtom); while (!dfsStack.isEmpty()) { IAtom origCurrAtom = dfsStack.pop(); - IAtom cpyCurrentAtom = origToCpyMap.get(origCurrAtom); + IAtom cpyCurrentAtom = origToCpyAtomMap.get(origCurrAtom); for (IBond origBond : origMol.getConnectedBondsList(origCurrAtom)) { IAtom origNbor = origBond.getOther(origCurrAtom); boolean isThisABondToSplit = bondsToSplitSet.contains(origBond); if (!isThisABondToSplit) { - if (!origToCpyMap.containsKey(origNbor)) { + if (!origToCpyAtomMap.containsKey(origNbor)) { visitedOriginalAtoms[origMol.indexOf(origNbor)] = true; IAtom cpyNbor = copyAtom(origNbor, fragmentContainer); - origToCpyMap.put(origNbor, cpyNbor); - copyBond( + origToCpyAtomMap.put(origNbor, cpyNbor); + IBond cpyBond = copyBond( cpyCurrentAtom, cpyNbor, origBond, fragmentContainer ); + origToCpyBondMap.put(origBond, cpyBond); dfsStack.push(origNbor); } else { - IAtom cpyNbor = origToCpyMap.get(origNbor); + IAtom cpyNbor = origToCpyAtomMap.get(origNbor); // Add bond only if 
not already present if (fragmentContainer.getBond(cpyCurrentAtom, cpyNbor) == null) { - copyBond( + IBond cpyBond = copyBond( cpyCurrentAtom, cpyNbor, origBond, fragmentContainer ); + origToCpyBondMap.put(origBond, cpyBond); } } } else { @@ -710,6 +736,41 @@ private IAtomContainer[] splitBondsWithCopy( } } } + // adding stereo information if all elements are present in the + // new fragment + for (IStereoElement elem : origMol.stereoElements()) { + boolean allAtomsPresent = true; + IChemObject focus = elem.getFocus(); + if (focus instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(focus)) { + allAtomsPresent = false; + } + } else if (focus instanceof IBond) { + if (!origToCpyBondMap.containsKey(focus)) { + allAtomsPresent = false; + } + } + + if (allAtomsPresent) { + for (IChemObject iChemObject : elem.getCarriers()) { + if (iChemObject instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(iChemObject)) { + allAtomsPresent = false; + break; + } + } else if (iChemObject instanceof IBond) { + if (!origToCpyBondMap.containsKey(iChemObject)) { + allAtomsPresent = false; + break; + } + } + } + } + + if (allAtomsPresent) { + fragmentContainer.addStereoElement(elem.map(origToCpyAtomMap, origToCpyBondMap)); + } + } fragmentList.add(fragmentContainer); } } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index cf5d4ab3b76..d5efabe7c5d 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -797,10 +797,50 @@ void testExampleUsage() throws Exception { ); } + /** + * Ensures that stereochemical information (chiral centers '@' and double-bond + * E/Z markers '/' or '\') is preserved when generating fragment containers. 
+ */ + @Test + void testStereoChemistryCopied() throws Exception { + SmilesGenerator smilesGenerator = + new SmilesGenerator( + SmiFlavor.UseAromaticSymbols | SmiFlavor.Stereo + ); + IAtomContainer mol = smilesParser.parseSmiles("CC[C@H](F)C/C=C/C"); + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter( + smilesGenerator, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + 31 + ); + fragmenter.generateFragments(mol); + + String[] smilesFrags = fragmenter.getFragments(); + IAtomContainer[] containerFrags = fragmenter.getFragmentsAsContainers(); + + Assertions.assertNotNull(smilesFrags); + Assertions.assertNotNull(containerFrags); + Assertions.assertEquals(smilesFrags.length, containerFrags.length, + "Number of SMILES fragments and container fragments must match"); + + String[] containerSmiles = new String[smilesFrags.length]; + for (int i = 0; i < containerSmiles.length; i++) { + containerSmiles[i] = smilesGenerator.create(containerFrags[i]); + } + + assertFragsContain(new String[]{ + "C(F)C/C=C/C" + }, smilesFrags); + + assertFragsContain(new String[]{ + "C(F)C/C=C/C" + }, containerSmiles); + } // --utility -- - private static void assertFragsContain( + private static void assertFragsContain( String[] expected, String[] actual ) { From 3eeb88617423b5cbea3e7f7700ac2c4d77a9ff0b Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 21 Aug 2025 18:48:38 +0200 Subject: [PATCH 30/42] fixed regression in test and fixed wrong maximum tree depth --- .../cdk/fragment/ExhaustiveFragmenter.java | 2 +- .../fragment/ExhaustiveFragmenterTest.java | 220 +++++++++--------- 2 files changed, 105 insertions(+), 117 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 5d4e4bd49b6..ae7080f324c 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ 
b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -393,7 +393,7 @@ private void run(IAtomContainer atomContainer) throws CDKException { int subsetSize = subset.length; // Skip subsets exceeding the allowed depth - if (subsetSize >= this.inclusiveMaxTreeDepth) { + if (subsetSize > this.inclusiveMaxTreeDepth) { continue; } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index d5efabe7c5d..e6f83a2ef38 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -54,23 +54,12 @@ static void setup() { smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); } - private ExhaustiveFragmenter setupSaturatedFragmenter() { - ExhaustiveFragmenter exhaustiveFragmenterSaturated = new ExhaustiveFragmenter(); - exhaustiveFragmenterSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); - return exhaustiveFragmenterSaturated; + private ExhaustiveFragmenter setupFragmenter(ExhaustiveFragmenter.Saturation saturation) { + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); + fragmenter.setSaturationSetting(saturation); + return fragmenter; } - private ExhaustiveFragmenter setupUnsaturatedFragmenter() { - ExhaustiveFragmenter exhaustiveFragmenterUnsaturated = new ExhaustiveFragmenter(); - exhaustiveFragmenterUnsaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); - return exhaustiveFragmenterUnsaturated; - } - - private ExhaustiveFragmenter setupRSaturatedFragmenter() { - ExhaustiveFragmenter exhaustiveFragmenterRSaturated = new ExhaustiveFragmenter(); - exhaustiveFragmenterRSaturated.setSaturationSetting(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); - return 
exhaustiveFragmenterRSaturated; - } // --- Unsaturated Fragments Tests --- /** @@ -80,7 +69,7 @@ private ExhaustiveFragmenter setupRSaturatedFragmenter() { @Test void testEF1Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); @@ -93,7 +82,7 @@ void testEF1Unsaturated() throws Exception { @Test void testEF2Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); @@ -106,7 +95,7 @@ void testEF2Unsaturated() throws Exception { @Test void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertArrayEquals(new String[]{"[CH]1CCCCC1"}, frags); @@ -119,7 +108,7 @@ void testEF3Unsaturated() throws Exception { @Test void testEF4Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = 
setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -133,7 +122,7 @@ void testEF4Unsaturated() throws Exception { @Test void testEF5Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -154,7 +143,7 @@ void testEF5Unsaturated() throws Exception { @Test void testEF6Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -176,7 +165,7 @@ void testEF6Unsaturated() throws Exception { @Test void testEF7Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupUnsaturatedFragmenter(); + ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -208,7 +197,7 @@ void testEF7Unsaturated() throws Exception { @Test void testEF1Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - ExhaustiveFragmenter fragmenterSaturated = 
setupSaturatedFragmenter(); + ExhaustiveFragmenter fragmenterSaturated = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertEquals(0, frags.length); @@ -221,7 +210,7 @@ void testEF1Saturated() throws Exception { @Test void testEF2Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); @@ -234,7 +223,7 @@ void testEF2Saturated() throws Exception { @Test void testEF3Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertArrayEquals(new String[]{"C1CCCCC1"}, frags); @@ -247,7 +236,7 @@ void testEF3Saturated() throws Exception { @Test void testEF4Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -262,7 +251,7 @@ void testEF4Saturated() throws Exception { @Test void testEF5Saturated() throws Exception { IAtomContainer mol = 
smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -283,7 +272,7 @@ void testEF5Saturated() throws Exception { @Test void testEF6Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -302,7 +291,7 @@ void testEF6Saturated() throws Exception { @Test void testEF7Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -329,7 +318,7 @@ void testEF7Saturated() throws Exception { @Test void testEF3RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - ExhaustiveFragmenter rSaturatedFragmenter = setupRSaturatedFragmenter(); + ExhaustiveFragmenter rSaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); rSaturatedFragmenter.generateFragments(mol); String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -343,7 +332,7 @@ void testEF3RestSaturated() throws Exception { @Test void 
testEF5RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - ExhaustiveFragmenter rSaturatedFragmenter = setupRSaturatedFragmenter(); + ExhaustiveFragmenter rSaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); rSaturatedFragmenter.generateFragments(mol); String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -363,7 +352,7 @@ void testEF5RestSaturated() throws Exception { @Test void testEF6RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - ExhaustiveFragmenter rSaturatedFragmenter = setupRSaturatedFragmenter(); + ExhaustiveFragmenter rSaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); rSaturatedFragmenter.generateFragments(mol); String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -379,7 +368,7 @@ void testEF6RestSaturated() throws Exception { @Test void testEF7RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - ExhaustiveFragmenter fragmenterRSaturated = setupRSaturatedFragmenter(); + ExhaustiveFragmenter fragmenterRSaturated = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); fragmenterRSaturated.generateFragments(mol); String[] frags = fragmenterRSaturated.getFragments(); Assertions.assertNotNull(frags); @@ -404,7 +393,7 @@ void testEF7RestSaturated() throws Exception { @Test void testMinSize() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - ExhaustiveFragmenter fragmenterSaturated = setupSaturatedFragmenter(); + ExhaustiveFragmenter fragmenterSaturated = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); fragmenterSaturated.setMinimumFragmentSize(6); fragmenterSaturated.generateFragments(mol); String[] frags = 
fragmenterSaturated.getFragments(); @@ -420,7 +409,7 @@ void testMinSize() throws Exception { @Test void testMinSizeLowered() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.setMinimumFragmentSize(5); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); @@ -442,7 +431,7 @@ void testMinSizeLowered() throws Exception { void testEqualityOfSmilesAndContainers() throws Exception { SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); // Phenylalanine - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] smilesFrags = saturatedFragmenter.getFragments(); IAtomContainer[] containerFrags = saturatedFragmenter.getFragmentsAsContainers(); @@ -559,111 +548,106 @@ void testCustomSmilesGenerator() throws Exception { * *
          * Molecule: 1,4-dibutylbenzene (CCCCc1ccc(CCCC)cc1)
    -     * Splittable bonds: 6 (the three C-C bonds for each butyl chain, from the ring until the the second last C-atom).
    +     * Splittable bonds: 6 (the three C-C bonds for each butyl chain, from the ring until the second last C-atom).
          * Fragmenter setup: minFragSize = 4 (to include butyl and benzene fragments), hydrogen-saturated fragments.
          *
          * Expected fragments for different inclusiveMaxTreeDepth settings:
          *
    -     * 1.  inclusiveMaxTreeDepth = 0 (allows 0 simultaneous cuts):
    -     * - Expected fragments: 0
    +     * 1.  inclusiveMaxTreeDepth = 1 (allows 1 simultaneous cut):
    +     * - Expected unique fragments: 4
          *
    -     * 2.  inclusiveMaxTreeDepth = 1 (allows up to 1 simultaneous cut):
    -     * - Considers all subsets of splittable bonds of size 1.
    -     * - Expected unique fragments: 4 (
    -     * c1ccc(cc1)CCCC
    -     * c1cc(ccc1C)CCCC
    -     * c1cc(ccc1CC)CCCC
    -     * CCCC"
    -     * )
    +     * 2.  inclusiveMaxTreeDepth = 2 (allows up to 2 simultaneous cuts):
    +     * - Considers all subsets of splittable bonds of size 1 and 2
    +     * - Expected unique fragments: 10
          *
    -     * 3.  inclusiveMaxTreeDepth = 2 (allows up to 2 simultaneous cuts):
    -     * - Considers all subsets of splittable bonds of size 1 and 2.
    -     * - Includes fragments from 1-cut operations, plus fragments from 2-cut operations:
    -     * - Expected unique fragments: 10 (
    -     * c1ccc(cc1)C
    -     * c1ccc(cc1)CC
    -     * c1ccc(cc1)CCCC
    -     * c1cc(ccc1C)C
    -     * c1cc(ccc1C)CC
    -     * c1cc(ccc1C)CCCC
    -     * c1cc(ccc1CC)CC
    -     * c1cc(ccc1CC)CCCC
    -     * c1ccccc1
    -     * CCCC
    -     * )
    -     *
    -     * 4.  inclusiveMaxTreeDepth = 3 (allows up to 3 simultaneous cuts):
    -     * - Since there are only combinations of 2 splittable bonds that allow a fragment size bigger the 6, allowing up
    -     *  to 3 cuts (or more) will yield the same set of fragments as allowing up to 2 cuts.
    -     * - Expected unique fragments: 10 (
    -     * c1ccc(cc1)C
    -     * c1ccc(cc1)CC
    -     * c1ccc(cc1)CCCC
    -     * c1cc(ccc1C)C
    -     * c1cc(ccc1C)CC
    -     * c1cc(ccc1C)CCCC
    -     * c1cc(ccc1CC)CC
    -     * c1cc(ccc1CC)CCCC
    -     * c1ccccc1
    -     * CCCC
    -     * )
    +     * 3.  inclusiveMaxTreeDepth = 3 (allows up to 3 simultaneous cuts):
    +     * - Considers all subsets of splittable bonds of size 1, 2 and 3
    +     * - Includes fragments from 1-cut operations, plus fragments from 2-cut
    +     *   and 3-cut operations:
    +     * - Expected unique fragments: 10
          * 
    */ @Test - void testSetExclusiveMaxTreeDepth() throws Exception { + void testSetInclusiveMaxTreeDepth() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCCCc1ccc(CCCC)cc1"); // Define a standard SmilesGenerator for fragmenter instantiation - SmilesGenerator standardSmilesGen = new SmilesGenerator(SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols); + SmilesGenerator standardSmilesGen = new SmilesGenerator( + SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols + ); ExhaustiveFragmenter localFragmenter; - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter = new ExhaustiveFragmenter( + standardSmilesGen, + 4, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1 + ); localFragmenter.setInclusiveMaxTreeDepth(1); localFragmenter.generateFragments(mol); String[] fragsDepth1 = localFragmenter.getFragments(); - Assertions.assertEquals(0, fragsDepth1.length, - "Expected 0 fragments when inclusiveMaxTreeDepth is 0 (allows 0 cuts) for 1,4-dibutylbenzene"); + Assertions.assertEquals(4, fragsDepth1.length, + "Expected 4 fragments when inclusiveMaxTreeDepth is 1 (allows 1 cuts) for 1,4-dibutylbenzene"); + assertFragsContain( + new String[]{ + "c1cc(ccc1C)CCCC", + "c1ccc(cc1)CCCC", + "c1cc(ccc1CC)CCCC", + "CCCC" + }, fragsDepth1 + ); - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter = new ExhaustiveFragmenter( + standardSmilesGen, + 4, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1 + ); localFragmenter.setInclusiveMaxTreeDepth(2); localFragmenter.generateFragments(mol); String[] fragsDepth2 = localFragmenter.getFragments(); - Assertions.assertEquals(4, fragsDepth2.length, - "Expected 4 fragments when inclusiveMaxTreeDepth is 1 (allows up to 1 cut)"); + 
Assertions.assertEquals(10, fragsDepth2.length, + "Expected 10 fragments when inclusiveMaxTreeDepth is 2 (allows up to 2 cut)"); assertFragsContain( new String[]{ - "CCCC", - "c1ccc(cc1)CCCC" + "c1ccc(cc1)C", + "c1ccc(cc1)CC", + "c1ccc(cc1)CCCC", + "c1cc(ccc1C)C", + "c1cc(ccc1C)CC", + "c1cc(ccc1C)CCCC", + "c1cc(ccc1CC)CC", + "c1cc(ccc1CC)CCCC", + "c1ccccc1", + "CCCC" }, fragsDepth2 ); - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + localFragmenter = new ExhaustiveFragmenter( + standardSmilesGen, + 4, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1 + ); localFragmenter.setInclusiveMaxTreeDepth(3); localFragmenter.generateFragments(mol); String[] fragsDepth3 = localFragmenter.getFragments(); Assertions.assertEquals(10, fragsDepth3.length, - "Expected 10 fragments when inclusiveMaxTreeDepth is 2 (allows up to 2 cuts)"); - assertFragsContain( - new String[]{ - "CCCC", - "c1ccc(cc1)CCCC", - "c1ccccc1" - }, fragsDepth3 - ); - - localFragmenter = new ExhaustiveFragmenter(standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); - localFragmenter.setInclusiveMaxTreeDepth(4); - localFragmenter.generateFragments(mol); - String[] fragsDepth4 = localFragmenter.getFragments(); - Assertions.assertEquals(10, fragsDepth4.length, "Expected 10 fragments when inclusiveMaxTreeDepth is 3 (allows up to 3 cuts), same as max 2 cuts"); assertFragsContain( new String[]{ - "CCCC", + "c1ccc(cc1)C", + "c1ccc(cc1)CC", "c1ccc(cc1)CCCC", - "c1ccccc1" - }, fragsDepth4 + "c1cc(ccc1C)C", + "c1cc(ccc1C)CC", + "c1cc(ccc1C)CCCC", + "c1cc(ccc1CC)CC", + "c1cc(ccc1CC)CCCC", + "c1ccccc1", + "CCCC" + }, fragsDepth3 ); } @@ -673,7 +657,7 @@ void testSetExclusiveMaxTreeDepth() throws Exception { @Test void testDoubleBondIssue() throws CDKException { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1=CCC"); - 
ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( @@ -694,7 +678,7 @@ void testDisconnectedMolecules() throws Exception { IAtomContainer mol = smilesParser.parseSmiles( "C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]" ); //Sodium edetate - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( @@ -719,7 +703,7 @@ void testBigMolecule1() throws Exception { IAtomContainer mol = smiPar.parseSmiles("CC1=C(C(=CC=C1)NC2=CC=CC=C2C" + "(=O)NC(CCS(=O)C)C(=O)NC(C)C3=CC=C(C=C3)F)C"); //PubChem CID 118705975 - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( @@ -743,7 +727,7 @@ void testMaxSplittableBonds() throws Exception { IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 - ExhaustiveFragmenter saturatedFragmenter = setupSaturatedFragmenter(); + ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = 
saturatedFragmenter.getFragments(); assertFragsContain( @@ -829,13 +813,17 @@ void testStereoChemistryCopied() throws Exception { containerSmiles[i] = smilesGenerator.create(containerFrags[i]); } - assertFragsContain(new String[]{ - "C(F)C/C=C/C" - }, smilesFrags); + assertFragsContain( + new String[]{ + "C(F)C/C=C/C" + }, smilesFrags + ); - assertFragsContain(new String[]{ - "C(F)C/C=C/C" - }, containerSmiles); + assertFragsContain( + new String[]{ + "C(F)C/C=C/C" + }, containerSmiles + ); } // --utility -- From 40a6559398fe6c15690f1c8c1ba4ece3731dfdc8 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 3 Sep 2025 13:23:36 +0200 Subject: [PATCH 31/42] fixed documentation links, added constructor for just saturation and added a setting for copying setereo information + tests --- .../cdk/fragment/ExhaustiveFragmenter.java | 240 ++++++++++++++---- .../fragment/ExhaustiveFragmenterTest.java | 151 ++++++++--- 2 files changed, 298 insertions(+), 93 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index ae7080f324c..b1878b1ef34 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -60,8 +60,8 @@ *
  • The fragmentation splits at a maximum tree depth of 31, meaning that * maximum 31 bonds are split in one run.
  • *
  • The SMILES code of the fragments is generated with {@link SmiFlavor#Unique} - * and {@link SmiFlavor#UseAromaticSymbols}. It does not contain information - * about the stereochemistry.
  • + * and {@link SmiFlavor#UseAromaticSymbols}. + *
  • Stereo information is disregarded
  • * * However, users can modify these settings, with the exception, that the * maximum tree depth can not be higher than 31 (Java's limitation caused by @@ -71,10 +71,12 @@ * The `ExhaustiveFragmenter` uses unique SMILES strings for internal * deduplication of generated fragments. This means that after a fragment is * generated, its unique SMILES representation is computed (using the default or - * user specified {@link SmilesGenerator}). These SMILES do not encode - * stereochemistry. If a fragment with the same canonical SMILES has already - * been generated and stored, the new fragment is considered a duplicate and is - * not added to the results. + * user specified {@link SmilesGenerator}). Be aware that stereo information is + * only copied and checked for deduplication if + * {@link ExhaustiveFragmenter#setAttemptCopySteroInfo} is set to true and the + * specified {@link SmilesGenerator} has {@link SmiFlavor#Stereo}. If a fragment + * with the same canonical SMILES has already been generated and stored, the new + * fragment is considered a duplicate and is not added to the results. *

    * This deduplication strategy is particularly important when considering the * {@link Saturation} setting: @@ -159,12 +161,14 @@ public enum Saturation { SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols ); private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = Integer.SIZE - 1; + private static final boolean DEFAULT_COPY_STEREO_INFO = false; private Map fragMap; private final SmilesGenerator smilesGenerator; private int inclusiveMaxTreeDepth; private int minFragSize; private Saturation saturationSetting; + private boolean attemptCopySteroInfo; private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); @@ -172,26 +176,29 @@ public enum Saturation { * Constructs an ExhaustiveFragmenter with the default settings: *

      *
    • Minimum fragment size: 6 atoms (excluding implicit hydrogen)
    • - *
    • Unsaturated fragments
    • + *
    • {@link Saturation#UNSATURATED_FRAGMENTS}
    • *
    • Default {@link SmilesGenerator} - * ({@code SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols})
    • - *
    • {@link ExhaustiveFragmenter#inclusiveMaxTreeDepth} of 31
    • + * ({@link SmiFlavor#Unique} | {@link SmiFlavor#UseAromaticSymbols}) + *
    • inclusive maximum tree depth of 31
    • *
    + * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth */ public ExhaustiveFragmenter() { this( DEFAULT_SMILES_GENERATOR, DEFAULT_MIN_FRAG_SIZE, DEFAULT_SATURATION, - DEFAULT_INCLUSIVE_MAX_TREE_DEPTH + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO ); } /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment * size and saturation setting. Uses the default {@link SmilesGenerator} and - * default {@link ExhaustiveFragmenter#inclusiveMaxTreeDepth} of 31 + * default inclusive maximum tree depth of 31 * + * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth * @param minFragSize Minimum number of atoms in a valid fragment * (excluding implicit hydrogen). * @param saturationSetting Determines whether fragments should be saturated @@ -202,7 +209,8 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { DEFAULT_SMILES_GENERATOR, minFragSize, saturationSetting, - DEFAULT_INCLUSIVE_MAX_TREE_DEPTH + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO ); } @@ -210,8 +218,9 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment * size. Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. * Uses the default {@link SmilesGenerator} and the default - * {@link ExhaustiveFragmenter#inclusiveMaxTreeDepth} of 31 + * inclusive maximum tree depth of 31 * + * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth * @param minFragSize Minimum number of atoms in a valid fragment * (excluding implicit hydrogen). */ @@ -220,7 +229,27 @@ public ExhaustiveFragmenter(int minFragSize) { DEFAULT_SMILES_GENERATOR, minFragSize, DEFAULT_SATURATION, - DEFAULT_INCLUSIVE_MAX_TREE_DEPTH + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO + ); + } + + /** + * Constructs an ExhaustiveFragmenter with a user-defined saturation setting + * size. Fragment size defaults to 6. 
Uses the default {@link SmilesGenerator} + * and the default inclusive maximum tree depth of 31. + * + * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth + * @param saturation how open valences should be treated after the + * fragmentation. + */ + public ExhaustiveFragmenter(Saturation saturation) { + this( + DEFAULT_SMILES_GENERATOR, + DEFAULT_MIN_FRAG_SIZE, + saturation, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO ); } @@ -229,21 +258,41 @@ public ExhaustiveFragmenter(int minFragSize) { * user-defined minimum fragment size, inclusive max tree depth and * saturation setting. * - * @param smilesGenerator The {@link SmilesGenerator} instance to use for - * creating SMILES strings - * for fragment deduplication and retrieval. - * @param minFragSize Minimum number of atoms in a valid fragment - * (excluding implicit hydrogen). + * @param smilesGenerator The {@link SmilesGenerator} instance to use for + * creating SMILES strings + * for fragment deduplication and retrieval. + * @param minFragSize Minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). * @param saturationSetting Determines whether fragments should be saturated * (with hydrogens or R-atoms) or unsaturated. * @param inclusiveMaxTreeDepth Represents the number of Bonds that will be * split for a fragmentation. + * @param attemptCopySteroInfo Signals whether to attempt to copy stereochemical + * information from the original molecule to the generated fragments. + *

    + * Warning: This process is not reliable and can lead + * to incorrect stereochemistry in the resulting fragments. + * When a chiral center is broken during fragmentation, the new fragment may + * be incorrectly assigned as chiral even if it is not. + * This can occur because the algorithm may copy the original chirality + * information without having all the necessary atoms to correctly define + * the stereocenter in the new, smaller fragment. + *

    + * Note on Stereochemistry and SMILES: + * For stereochemical information to be included in the SMILES strings + * returned by {@link #getFragments()}, the `smilesGenerator` used by this + * fragmenter must be configured with the {@link SmiFlavor#Stereo} flag. + * If the flag is not set, the SMILES will not contain stereochemistry, + * even if this setting is enabled and the underlying `IAtomContainer` objects + * have stereo elements. + *

    */ public ExhaustiveFragmenter( SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting, - int inclusiveMaxTreeDepth + int inclusiveMaxTreeDepth, + boolean attemptCopySteroInfo ) { if (saturationSetting == null) { throw new NullPointerException( @@ -257,6 +306,7 @@ public ExhaustiveFragmenter( ); } this.smilesGenerator = smilesGenerator; + this.attemptCopySteroInfo = attemptCopySteroInfo; this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); this.setMinimumFragmentSize(minFragSize); this.fragMap = null; @@ -315,6 +365,36 @@ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; } + /** + * Sets whether stereochemical information from the original molecule should + * be copied to the generated fragments. + * + *

    + * Warning: The copying process is not reliable and can + * result in fragments with incorrect stereochemistry. This method copies + * elements based on the presence of atoms and bonds, but it does not perform + * a chemical validation check on the resulting fragment. + * For example, a chiral center might be copied even if the new fragment does + * not contain the minimum four different substituents required for chirality. + * Use caution and consider a separate validation step. + *

    + *

    + * Note on Stereochemistry and SMILES: + * For stereochemical information to be included in the SMILES strings + * returned by {@link #getFragments()}, the `smilesGenerator` used by this + * fragmenter must be configured with the {@link SmiFlavor#Stereo} flag. + * If the flag is not set, the SMILES will not contain stereochemistry, + * even if this setting is enabled and the underlying `IAtomContainer` objects + * have stereo elements. + *

    + * + * @param attemptCopySteroInfo {@code true} to enable attempting to copy + * stereo information; {@code false} otherwise. + */ + public void setAttemptCopySteroInfo(boolean attemptCopySteroInfo) { + this.attemptCopySteroInfo = attemptCopySteroInfo; + } + /** * Generates fragments for the given molecule. * The generated fragments are stored internally and can be retrieved via: @@ -628,6 +708,79 @@ private static IBond copyBond( return cpyBond; } + /** + * Copies a subset of stereochemical information from a source molecule + * to a new fragment. + * + *

    + * This method iterates through the stereochemical elements of the original + * molecule (e.g., chiral centers, cis/trans bonds) and copies only those + * that are fully contained within the new fragment. A stereochemical element + * is considered fully contained if all of its defining atoms and bonds + * are present in the fragment, based on the provided atom and bond maps. + *

    + * + * @param origMol The original molecule containing the stereochemical + * information. + * @param fragmentContainer The new fragment where the stereochemical + * information will be added. + * @param origToCpyAtomMap A mapping of atoms from the original molecule to + * their corresponding atoms in the new fragment. + * @param origToCpyBondMap A mapping of bonds from the original molecule to + * their corresponding bonds in the new fragment. + * + * Warning: The copied stereochemical information may be + * chemically invalid. This method copies elements based on the presence of + * atoms and bonds, but it does not perform a chemical validation check on + * the resulting fragment. For example, a chiral center might be copied even + * if the new fragment does not contain the minimum four different + * substituents required for chirality. Use caution and consider a separate + * validation step. + */ + void attemptCopyStereoInformation( + IAtomContainer origMol, + IAtomContainer fragmentContainer, + Map origToCpyAtomMap, + Map origToCpyBondMap + ) { + // adding stereo information if all elements are present in the + // new fragment + for (IStereoElement elem : origMol.stereoElements()) { + boolean allAtomsPresent = true; + final IChemObject origFocus = elem.getFocus(); + if (origFocus instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(origFocus)) { + allAtomsPresent = false; + } + } else if (origFocus instanceof IBond) { + if (!origToCpyBondMap.containsKey(origFocus)) { + allAtomsPresent = false; + } + } + + if (allAtomsPresent) { + + for (IChemObject iChemObject : elem.getCarriers()) { + if (iChemObject instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(iChemObject)) { + allAtomsPresent = false; + break; + } + } else if (iChemObject instanceof IBond) { + if (!origToCpyBondMap.containsKey(iChemObject)) { + allAtomsPresent = false; + break; + } + } + } + } + + if (allAtomsPresent) { + fragmentContainer.addStereoElement(elem.map(origToCpyAtomMap, 
origToCpyBondMap)); + } + } + } + /** * Splits and saturates (if specified via {@link #saturationSetting}) a * molecule into multiple fragments by removing the specified bonds and @@ -736,40 +889,13 @@ private IAtomContainer[] splitBondsWithCopy( } } } - // adding stereo information if all elements are present in the - // new fragment - for (IStereoElement elem : origMol.stereoElements()) { - boolean allAtomsPresent = true; - IChemObject focus = elem.getFocus(); - if (focus instanceof IAtom) { - if (!origToCpyAtomMap.containsKey(focus)) { - allAtomsPresent = false; - } - } else if (focus instanceof IBond) { - if (!origToCpyBondMap.containsKey(focus)) { - allAtomsPresent = false; - } - } - - if (allAtomsPresent) { - for (IChemObject iChemObject : elem.getCarriers()) { - if (iChemObject instanceof IAtom) { - if (!origToCpyAtomMap.containsKey(iChemObject)) { - allAtomsPresent = false; - break; - } - } else if (iChemObject instanceof IBond) { - if (!origToCpyBondMap.containsKey(iChemObject)) { - allAtomsPresent = false; - break; - } - } - } - } - - if (allAtomsPresent) { - fragmentContainer.addStereoElement(elem.map(origToCpyAtomMap, origToCpyBondMap)); - } + if (this.attemptCopySteroInfo) { + attemptCopyStereoInformation( + origMol, + fragmentContainer, + origToCpyAtomMap, + origToCpyBondMap + ); } fragmentList.add(fragmentContainer); } @@ -779,6 +905,12 @@ private IAtomContainer[] splitBondsWithCopy( /** * Get the fragments generated as SMILES strings. + *

    + * Note on Stereochemistry: + * Stereochemistry information will only be included in the returned SMILES + * strings if the `SmilesGenerator` used by this fragmenter was configured + * with the {@link SmiFlavor#Stereo} flag. + *

    * * @return a String[] of the fragments. */ diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index e6f83a2ef38..55397a563dd 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -41,7 +41,12 @@ /** * Test exhaustive fragmenter. * This test class covers various scenarios for the {@link ExhaustiveFragmenter}, - * including different saturation settings (unsaturated, hydrogen-saturated, R-group saturated) + * including different saturation settings: + *
      + *
    • {@link org.openscience.cdk.fragment.ExhaustiveFragmenter.Saturation#UNSATURATED_FRAGMENTS}
    • + *
    • {@link org.openscience.cdk.fragment.ExhaustiveFragmenter.Saturation#HYDROGEN_SATURATED_FRAGMENTS}
    • + *
    • {@link org.openscience.cdk.fragment.ExhaustiveFragmenter.Saturation#R_SATURATED_FRAGMENTS}
    • + *
    * and minimum fragment size. * * @see ExhaustiveFragmenter @@ -54,12 +59,6 @@ static void setup() { smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); } - private ExhaustiveFragmenter setupFragmenter(ExhaustiveFragmenter.Saturation saturation) { - ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter(); - fragmenter.setSaturationSetting(saturation); - return fragmenter; - } - // --- Unsaturated Fragments Tests --- /** @@ -69,7 +68,7 @@ private ExhaustiveFragmenter setupFragmenter(ExhaustiveFragmenter.Saturation sat @Test void testEF1Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); @@ -82,7 +81,7 @@ void testEF1Unsaturated() throws Exception { @Test void testEF2Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); @@ -95,7 +94,7 @@ void testEF2Unsaturated() throws Exception { @Test void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new 
ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertArrayEquals(new String[]{"[CH]1CCCCC1"}, frags); @@ -108,7 +107,7 @@ void testEF3Unsaturated() throws Exception { @Test void testEF4Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -122,7 +121,7 @@ void testEF4Unsaturated() throws Exception { @Test void testEF5Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -143,7 +142,7 @@ void testEF5Unsaturated() throws Exception { @Test void testEF6Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -165,7 +164,7 @@ void testEF6Unsaturated() throws Exception { 
@Test void testEF7Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - ExhaustiveFragmenter unsaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); unsaturatedFragmenter.generateFragments(mol); String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -197,7 +196,7 @@ void testEF7Unsaturated() throws Exception { @Test void testEF1Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - ExhaustiveFragmenter fragmenterSaturated = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter fragmenterSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); Assertions.assertEquals(0, frags.length); @@ -210,7 +209,7 @@ void testEF1Saturated() throws Exception { @Test void testEF2Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); @@ -223,7 +222,7 @@ void testEF2Saturated() throws Exception { @Test void testEF3Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new 
ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertArrayEquals(new String[]{"C1CCCCC1"}, frags); @@ -236,7 +235,7 @@ void testEF3Saturated() throws Exception { @Test void testEF4Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -251,7 +250,7 @@ void testEF4Saturated() throws Exception { @Test void testEF5Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -272,7 +271,7 @@ void testEF5Saturated() throws Exception { @Test void testEF6Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -291,7 +290,7 @@ void testEF6Saturated() throws Exception { 
@Test void testEF7Saturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -318,7 +317,7 @@ void testEF7Saturated() throws Exception { @Test void testEF3RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - ExhaustiveFragmenter rSaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + ExhaustiveFragmenter rSaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); rSaturatedFragmenter.generateFragments(mol); String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -332,7 +331,7 @@ void testEF3RestSaturated() throws Exception { @Test void testEF5RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - ExhaustiveFragmenter rSaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + ExhaustiveFragmenter rSaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); rSaturatedFragmenter.generateFragments(mol); String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -352,7 +351,7 @@ void testEF5RestSaturated() throws Exception { @Test void testEF6RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - ExhaustiveFragmenter rSaturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + ExhaustiveFragmenter rSaturatedFragmenter = 
new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); rSaturatedFragmenter.generateFragments(mol); String[] frags = rSaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); @@ -368,7 +367,7 @@ void testEF6RestSaturated() throws Exception { @Test void testEF7RestSaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - ExhaustiveFragmenter fragmenterRSaturated = setupFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + ExhaustiveFragmenter fragmenterRSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); fragmenterRSaturated.generateFragments(mol); String[] frags = fragmenterRSaturated.getFragments(); Assertions.assertNotNull(frags); @@ -393,7 +392,7 @@ void testEF7RestSaturated() throws Exception { @Test void testMinSize() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - ExhaustiveFragmenter fragmenterSaturated = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter fragmenterSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); fragmenterSaturated.setMinimumFragmentSize(6); fragmenterSaturated.generateFragments(mol); String[] frags = fragmenterSaturated.getFragments(); @@ -409,7 +408,7 @@ void testMinSize() throws Exception { @Test void testMinSizeLowered() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.setMinimumFragmentSize(5); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); @@ -431,7 +430,7 @@ void 
testMinSizeLowered() throws Exception { void testEqualityOfSmilesAndContainers() throws Exception { SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); // Phenylalanine - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] smilesFrags = saturatedFragmenter.getFragments(); IAtomContainer[] containerFrags = saturatedFragmenter.getFragmentsAsContainers(); @@ -527,7 +526,12 @@ void testGenerateSubset() { void testCustomSmilesGenerator() throws Exception { SmilesGenerator customSmilesGen = new SmilesGenerator(SmiFlavor.Unique); // No SmiFlavor.UseAromaticSymbols ExhaustiveFragmenter customFragmenter = new ExhaustiveFragmenter( - customSmilesGen, 6, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, Integer.SIZE - 1); + customSmilesGen, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1, + false + ); IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); // Diphenylmethane customFragmenter.generateFragments(mol); String[] frags = customFragmenter.getFragments(); @@ -582,7 +586,8 @@ void testSetInclusiveMaxTreeDepth() throws Exception { standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, - Integer.SIZE - 1 + Integer.SIZE - 1, + false ); localFragmenter.setInclusiveMaxTreeDepth(1); localFragmenter.generateFragments(mol); @@ -602,7 +607,8 @@ void testSetInclusiveMaxTreeDepth() throws Exception { standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, - Integer.SIZE - 1 + Integer.SIZE - 1, + false ); localFragmenter.setInclusiveMaxTreeDepth(2); localFragmenter.generateFragments(mol); 
@@ -628,7 +634,8 @@ void testSetInclusiveMaxTreeDepth() throws Exception { standardSmilesGen, 4, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, - Integer.SIZE - 1 + Integer.SIZE - 1, + false ); localFragmenter.setInclusiveMaxTreeDepth(3); localFragmenter.generateFragments(mol); @@ -657,7 +664,7 @@ void testSetInclusiveMaxTreeDepth() throws Exception { @Test void testDoubleBondIssue() throws CDKException { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1=CCC"); - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( @@ -678,7 +685,7 @@ void testDisconnectedMolecules() throws Exception { IAtomContainer mol = smilesParser.parseSmiles( "C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]" ); //Sodium edetate - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( @@ -703,7 +710,9 @@ void testBigMolecule1() throws Exception { IAtomContainer mol = smiPar.parseSmiles("CC1=C(C(=CC=C1)NC2=CC=CC=C2C" + "(=O)NC(CCS(=O)C)C(=O)NC(C)C3=CC=C(C=C3)F)C"); //PubChem CID 118705975 - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter( + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS + ); saturatedFragmenter.generateFragments(mol); String[] frags = 
saturatedFragmenter.getFragments(); assertFragsContain( @@ -727,7 +736,7 @@ void testMaxSplittableBonds() throws Exception { IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 - ExhaustiveFragmenter saturatedFragmenter = setupFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); saturatedFragmenter.generateFragments(mol); String[] frags = saturatedFragmenter.getFragments(); assertFragsContain( @@ -782,8 +791,8 @@ void testExampleUsage() throws Exception { } /** - * Ensures that stereochemical information (chiral centers '@' and double-bond - * E/Z markers '/' or '\') is preserved when generating fragment containers. + * Ensures that stereochemical information (double-bond E/Z markers '/' or + * '\') is preserved when generating fragments. */ @Test void testStereoChemistryCopied() throws Exception { @@ -796,7 +805,8 @@ void testStereoChemistryCopied() throws Exception { smilesGenerator, 6, ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, - 31 + 31, + true ); fragmenter.generateFragments(mol); @@ -826,6 +836,69 @@ void testStereoChemistryCopied() throws Exception { ); } + /** + * Ensures that stereochemical information (chiral centers '@') is + * copied if the fragmentation yields a fragment with the same chiral center. 
+ */ + @Test + void testTetrahdralStereoChemistryCopied() throws Exception { + SmilesGenerator smilesGenerator = + new SmilesGenerator( + SmiFlavor.UseAromaticSymbols | SmiFlavor.Stereo + ); + IAtomContainer mol = smilesParser.parseSmiles("[C@@H](Cl)(O)CCCCCC"); + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter( + smilesGenerator, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + 31, + true + ); + fragmenter.generateFragments(mol); + + String[] smilesFrags = fragmenter.getFragments(); + assertFragsContain( + new String[]{ + "[C@@H](Cl)(O)CCC", + "CCCCCC", + "[C@@H](Cl)(O)CCCC" + }, smilesFrags + ); + } + + /** + * Tests a known bug where the stereo information for a chiral center is + * incorrectly copied to fragments where the center is no longer chiral. + * This occurs when the fragmentation results in two identical substituents, + * which should, by definition, remove the chirality from the center. + */ + @Test + void testTetrahedralStereoChemistryFalselyCopied() throws Exception { + SmilesGenerator smilesGenerator = + new SmilesGenerator( + SmiFlavor.UseAromaticSymbols | SmiFlavor.Stereo + ); + IAtomContainer mol = smilesParser.parseSmiles("CC[C@@H](Cl)CCCC"); + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter( + smilesGenerator, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + 31, + true + ); + fragmenter.generateFragments(mol); + + String[] smilesFrags = fragmenter.getFragments(); + assertFragsContain( + new String[]{ + "C(Cl)CCCC", + // The chemically correct representation would be CCC(Cl)CC + // instead of: + "CC[C@@H](Cl)CC" + }, smilesFrags + ); + } + // --utility -- private static void assertFragsContain( From d6699367e7633ee705ddd87ed150f0da61629352 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 3 Sep 2025 22:11:42 +0200 Subject: [PATCH 32/42] improved comments and code aesthetic for stereo information copying --- .../cdk/fragment/ExhaustiveFragmenter.java | 149 ++++++++++-------- 1 
file changed, 80 insertions(+), 69 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index b1878b1ef34..f42e4917d19 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -67,13 +67,33 @@ * maximum tree depth can not be higher than 31 (Java's limitation caused by * integer indexing). *

    + * Warning on preservation of stereo information: This process + * is not reliable and can lead to incorrect stereochemistry in the resulting + * fragments. When a chiral center is broken during fragmentation, the new + * fragment may be incorrectly assigned as chiral even if it is not + * anymore because some of its substituents are now equal. + *

    + * Example: the chiral molecule {@code CC[C@@H](Cl)CCCC} will, among others, + * produce the fragment {@code CC[C@@H](Cl)CC} where the stereo configuration is + * preserved but the structure is actually not chiral anymore. + *

    + *

    + * Note on Stereochemistry and SMILES: + * For stereochemical information to be included in the SMILES strings + * returned by {@link #getFragments()}, the {@code smilesGenerator} used by this + * fragmenter must be configured with the {@link SmiFlavor#Stereo} flag. + * If the flag is not set, the SMILES will not contain stereochemistry, + * even if stereo preservation is enabled and the underlying {@code IAtomContainer} + * objects have stereo elements. + *

    + *

    * Fragment Deduplication: * The `ExhaustiveFragmenter` uses unique SMILES strings for internal * deduplication of generated fragments. This means that after a fragment is * generated, its unique SMILES representation is computed (using the default or * user specified {@link SmilesGenerator}). Be aware that stereo information is * only copied and checked for deduplication if - * {@link ExhaustiveFragmenter#setAttemptCopySteroInfo} is set to true and the + * {@link ExhaustiveFragmenter#setPreserveStereo} is set to true and the * specified {@link SmilesGenerator} has {@link SmiFlavor#Stereo}. If a fragment * with the same canonical SMILES has already been generated and stored, the new * fragment is considered a duplicate and is not added to the results. @@ -168,7 +188,7 @@ public enum Saturation { private int inclusiveMaxTreeDepth; private int minFragSize; private Saturation saturationSetting; - private boolean attemptCopySteroInfo; + private boolean preserveStereo; private static final ILoggingTool logger = LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); @@ -179,9 +199,9 @@ public enum Saturation { *

  • {@link Saturation#UNSATURATED_FRAGMENTS}
  • *
  • Default {@link SmilesGenerator} * ({@link SmiFlavor#Unique} | {@link SmiFlavor#UseAromaticSymbols})
  • - *
  • inclusive maximum tree depth of 31
  • + *
  • inclusive maximum tree depth of 31
  • + *
  • stereo information is not preserved
  • * - * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth */ public ExhaustiveFragmenter() { this( @@ -196,9 +216,9 @@ public ExhaustiveFragmenter() { /** * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment * size and saturation setting. Uses the default {@link SmilesGenerator} and - * default inclusive maximum tree depth of 31 + * default inclusive maximum tree depth of 31. Stereo information is not + * preserved. * - * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth * @param minFragSize Minimum number of atoms in a valid fragment * (excluding implicit hydrogen). * @param saturationSetting Determines whether fragments should be saturated @@ -218,9 +238,9 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment * size. Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. * Uses the default {@link SmilesGenerator} and the default - * inclusive maximum tree depth of 31 + * inclusive maximum tree depth of 31. Stereo information is not + * preserved. * - * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth * @param minFragSize Minimum number of atoms in a valid fragment * (excluding implicit hydrogen). */ @@ -235,11 +255,11 @@ public ExhaustiveFragmenter(int minFragSize) { } /** - * Constructs an ExhaustiveFragmenter with a user-defined saturation setting - * size. Fragment size defaults to 6. Uses the default {@link SmilesGenerator} - * and the default inclusive maximum tree depth of 31. + * Constructs an ExhaustiveFragmenter with a user-defined saturation setting. + * Minimum fragment size defaults to 6. Uses the default {@link SmilesGenerator} + * and the default inclusive maximum tree depth of 31. Stereo information is + * not preserved. * - * @see ExhaustiveFragmenter#setInclusiveMaxTreeDepth * @param saturation how open valences should be treated after the * fragmentation. 
*/ @@ -254,9 +274,14 @@ public ExhaustiveFragmenter(Saturation saturation) { } /** - * Constructs an ExhaustiveFragmenter with a user-provided {@link SmilesGenerator}, - * user-defined minimum fragment size, inclusive max tree depth and - * saturation setting. + * Constructs an ExhaustiveFragmenter with a user-provided + * {@link SmilesGenerator} and user defined: + *
      + *
    • minimum fragment size
    • + *
    • inclusive max tree depth
    • + *
    • saturation setting
    • + *
    • preservation of stereochemistry information
    • + *
        * * @param smilesGenerator The {@link SmilesGenerator} instance to use for * creating SMILES strings @@ -265,34 +290,20 @@ public ExhaustiveFragmenter(Saturation saturation) { * (excluding implicit hydrogen). * @param saturationSetting Determines whether fragments should be saturated * (with hydrogens or R-atoms) or unsaturated. - * @param inclusiveMaxTreeDepth Represents the number of Bonds that will be - * split for a fragmentation. - * @param attemptCopySteroInfo Signals whether to attempt to copy stereochemical - * information from the original molecule to the generated fragments. - *

        - * Warning: This process is not reliable and can lead - * to incorrect stereochemistry in the resulting fragments. - * When a chiral center is broken during fragmentation, the new fragment may - * be incorrectly assigned as chiral even if it is not. - * This can occur because the algorithm may copy the original chirality - * information without having all the necessary atoms to correctly define - * the stereocenter in the new, smaller fragment. - *

        - * Note on Stereochemistry and SMILES: - * For stereochemical information to be included in the SMILES strings - * returned by {@link #getFragments()}, the `smilesGenerator` used by this - * fragmenter must be configured with the {@link SmiFlavor#Stereo} flag. - * If the flag is not set, the SMILES will not contain stereochemistry, - * even if this setting is enabled and the underlying `IAtomContainer` objects - * have stereo elements. - *

        + * @param inclusiveMaxTreeDepth Represents the maximum number of bonds that + * will be split for a fragmentation. + * @param preserveStero Signals whether to attempt to copy stereochemical + * information from the original molecule to the + * generated fragments. Warning: This + * process is not reliable and can lead to incorrect + * stereochemistry in the resulting fragments. */ public ExhaustiveFragmenter( SmilesGenerator smilesGenerator, int minFragSize, Saturation saturationSetting, int inclusiveMaxTreeDepth, - boolean attemptCopySteroInfo + boolean preserveStero ) { if (saturationSetting == null) { throw new NullPointerException( @@ -306,7 +317,7 @@ public ExhaustiveFragmenter( ); } this.smilesGenerator = smilesGenerator; - this.attemptCopySteroInfo = attemptCopySteroInfo; + this.preserveStereo = preserveStero; this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); this.setMinimumFragmentSize(minFragSize); this.fragMap = null; @@ -388,11 +399,11 @@ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { * have stereo elements. *

        * - * @param attemptCopySteroInfo {@code true} to enable attempting to copy - * stereo information; {@code false} otherwise. + * @param preserve {@code true} to enable attempting to copy + * stereo information; {@code false} otherwise. */ - public void setAttemptCopySteroInfo(boolean attemptCopySteroInfo) { - this.attemptCopySteroInfo = attemptCopySteroInfo; + public void setPreserveStereo(boolean preserve) { + this.preserveStereo = preserve; } /** @@ -719,6 +730,16 @@ private static IBond copyBond( * is considered fully contained if all of its defining atoms and bonds * are present in the fragment, based on the provided atom and bond maps. *

        + * Warning: This process is not reliable and can lead to + * incorrect stereochemistry in the fragment. When a chiral + * center is broken during fragmentation, the new fragment may be + * incorrectly assigned as chiral even if it is not anymore because some of + * its substituents are now equal. + *

        + * Example: the chiral molecule {@code CC[C@@H](Cl)CCCC} will, among others, + * produce the fragment {@code CC[C@@H](Cl)CC} where the stereo configuration is + * preserved but the structure is actually not chiral anymore. + *

        * * @param origMol The original molecule containing the stereochemical * information. @@ -728,16 +749,8 @@ private static IBond copyBond( * their corresponding atoms in the new fragment. * @param origToCpyBondMap A mapping of bonds from the original molecule to * their corresponding bonds in the new fragment. - * - * Warning: The copied stereochemical information may be - * chemically invalid. This method copies elements based on the presence of - * atoms and bonds, but it does not perform a chemical validation check on - * the resulting fragment. For example, a chiral center might be copied even - * if the new fragment does not contain the minimum four different - * substituents required for chirality. Use caution and consider a separate - * validation step. */ - void attemptCopyStereoInformation( + private void attemptCopyStereoInformation( IAtomContainer origMol, IAtomContainer fragmentContainer, Map origToCpyAtomMap, @@ -746,36 +759,34 @@ void attemptCopyStereoInformation( // adding stereo information if all elements are present in the // new fragment for (IStereoElement elem : origMol.stereoElements()) { - boolean allAtomsPresent = true; + boolean focusIsPresent = true; + boolean carriersArePresent = true; final IChemObject origFocus = elem.getFocus(); if (origFocus instanceof IAtom) { if (!origToCpyAtomMap.containsKey(origFocus)) { - allAtomsPresent = false; + focusIsPresent = false; } } else if (origFocus instanceof IBond) { if (!origToCpyBondMap.containsKey(origFocus)) { - allAtomsPresent = false; + focusIsPresent = false; } } - if (allAtomsPresent) { - - for (IChemObject iChemObject : elem.getCarriers()) { - if (iChemObject instanceof IAtom) { - if (!origToCpyAtomMap.containsKey(iChemObject)) { - allAtomsPresent = false; - break; - } - } else if (iChemObject instanceof IBond) { - if (!origToCpyBondMap.containsKey(iChemObject)) { - allAtomsPresent = false; - break; - } + for (IChemObject iChemObject : elem.getCarriers()) { + if (iChemObject 
instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(iChemObject)) { + carriersArePresent = false; + break; + } + } else if (iChemObject instanceof IBond) { + if (!origToCpyBondMap.containsKey(iChemObject)) { + carriersArePresent = false; + break; } } } - if (allAtomsPresent) { + if (focusIsPresent && carriersArePresent) { fragmentContainer.addStereoElement(elem.map(origToCpyAtomMap, origToCpyBondMap)); } } @@ -889,7 +900,7 @@ private IAtomContainer[] splitBondsWithCopy( } } } - if (this.attemptCopySteroInfo) { + if (this.preserveStereo) { attemptCopyStereoInformation( origMol, fragmentContainer, From 6302f6cfefb06ddf5526c3b95b610c560db59ae4 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 4 Sep 2025 12:02:07 +0200 Subject: [PATCH 33/42] fixed typo --- .../org/openscience/cdk/fragment/ExhaustiveFragmenter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index f42e4917d19..5b81e2c13e6 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -292,7 +292,7 @@ public ExhaustiveFragmenter(Saturation saturation) { * (with hydrogens or R-atoms) or unsaturated. * @param inclusiveMaxTreeDepth Represents the maximum number of bonds that * will be split for a fragmentation. - * @param preserveStero Signals whether to attempt to copy stereochemical + * @param preserveStereo Signals whether to attempt to copy stereochemical * information from the original molecule to the * generated fragments. 
Warning: This * process is not reliable and can lead to incorrect @@ -303,7 +303,7 @@ public ExhaustiveFragmenter( int minFragSize, Saturation saturationSetting, int inclusiveMaxTreeDepth, - boolean preserveStero + boolean preserveStereo ) { if (saturationSetting == null) { throw new NullPointerException( @@ -317,7 +317,7 @@ public ExhaustiveFragmenter( ); } this.smilesGenerator = smilesGenerator; - this.preserveStereo = preserveStero; + this.preserveStereo = preserveStereo; this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); this.setMinimumFragmentSize(minFragSize); this.fragMap = null; From d6f52e83f904411f7e7555cd5ac3f06e60320daa Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 4 Sep 2025 12:55:22 +0200 Subject: [PATCH 34/42] added documentation for custom assertion --- .../cdk/fragment/ExhaustiveFragmenterTest.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 55397a563dd..0e1af6ab8ba 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -901,6 +901,22 @@ void testTetrahedralStereoChemistryFalselyCopied() throws Exception { // --utility -- + /** + * Asserts that an array of strings contains all the expected elements, + * allowing for additional, unexpected elements in the actual array. + * + *

        + * This assertion is useful for verifying that a collection contains a + * specific subset of items. It fails only if an expected element is + * missing from the actual array. The failure message will list both + * missing elements and any extra, unexpected elements found. + *

        + * + * @param expected The {@code String} array containing the elements that are + * expected to be present in the {@code actual} array. + * @param actual The {@code String} array containing the elements to be + * tested against the {@code expected} array. + */ private static void assertFragsContain( String[] expected, String[] actual From 71da5e6e4db9a811fef2ec064cd4d300113dcb4d Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Thu, 4 Sep 2025 23:43:26 +0200 Subject: [PATCH 35/42] made param comments more consistent and beginning with lowercase --- .../cdk/fragment/ExhaustiveFragmenter.java | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index 5b81e2c13e6..bba7ee7c39a 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -219,9 +219,9 @@ public ExhaustiveFragmenter() { * default inclusive maximum tree depth of 31. Stereo information is not * preserved. * - * @param minFragSize Minimum number of atoms in a valid fragment - * (excluding implicit hydrogen). - * @param saturationSetting Determines whether fragments should be saturated + * @param minFragSize minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). + * @param saturationSetting determines whether fragments should be saturated * (with hydrogens or R-atoms) or unsaturated. */ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { @@ -241,7 +241,7 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { * inclusive maximum tree depth of 31. Stereo information is not * preserved. 
* - * @param minFragSize Minimum number of atoms in a valid fragment + * @param minFragSize minimum number of atoms in a valid fragment * (excluding implicit hydrogen). */ public ExhaustiveFragmenter(int minFragSize) { @@ -283,20 +283,20 @@ public ExhaustiveFragmenter(Saturation saturation) { *
      • preservation of stereochemistry information
      • *
          * - * @param smilesGenerator The {@link SmilesGenerator} instance to use for + * @param smilesGenerator the {@link SmilesGenerator} instance to use for * creating SMILES strings * for fragment deduplication and retrieval. - * @param minFragSize Minimum number of atoms in a valid fragment + * @param minFragSize minimum number of atoms in a valid fragment * (excluding implicit hydrogen). - * @param saturationSetting Determines whether fragments should be saturated + * @param saturationSetting determines whether fragments should be saturated * (with hydrogens or R-atoms) or unsaturated. - * @param inclusiveMaxTreeDepth Represents the maximum number of bonds that + * @param inclusiveMaxTreeDepth represents the maximum number of bonds that * will be split for a fragmentation. - * @param preserveStereo Signals whether to attempt to copy stereochemical - * information from the original molecule to the - * generated fragments. Warning: This - * process is not reliable and can lead to incorrect - * stereochemistry in the resulting fragments. + * @param preserveStereo signals whether to attempt to copy stereochemical + * information from the original molecule to the + * generated fragments. Warning: This + * process is not reliable and can lead to incorrect + * stereochemistry in the resulting fragments. */ public ExhaustiveFragmenter( SmilesGenerator smilesGenerator, @@ -326,7 +326,7 @@ public ExhaustiveFragmenter( /** * Sets the minimum allowed fragment size. This has to be greater than zero. * - * @param minFragSize Minimum number of atoms in a valid fragment. + * @param minFragSize minimum number of atoms in a valid fragment. */ public void setMinimumFragmentSize(int minFragSize) { if (minFragSize <= 0) { @@ -363,7 +363,7 @@ public void setSaturationSetting(Saturation saturationSetting) { * help manage computational resources for larger molecules. *

          * - * @param inclusiveMaxTreeDepth The exclusive maximum number of bonds that + * @param inclusiveMaxTreeDepth the exclusive maximum number of bonds that * can be split in one atom container. */ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { @@ -617,11 +617,11 @@ public static IBond[] getSplittableBonds(IAtomContainer atomContainer) { * 1 → 3 → [1, 3] *
    * - * @param index An integer whose binary representation determines the subset + * @param index an integer whose binary representation determines the subset * elements. A `1` bit at position `j` means `nums[j]` is * included. - * @param nums The array from which to generate subsets. Duplicate values - * in `nums` may result in duplicate subset entries. + * @param nums the array from which to generate subsets. Duplicate values + * in `nums` may result in duplicate subset entries. * @return An array containing the subset corresponding to `index`. */ protected static int[] generateSubset(int index, int[] nums) { @@ -673,8 +673,8 @@ private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { /** * Creates a copy of an atom and adds it to the specified atom container. * - * @param originalAtom The atom to be copied. - * @param atomContainer The destination container where the copied atom will + * @param originalAtom the atom to be copied. + * @param atomContainer the destination container where the copied atom will * be added. * @return A new atom with the same properties as `originalAtom`, added to * `atomContainer`. @@ -692,13 +692,13 @@ private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) /** * Creates a copy of a bond and adds it to the specified atom container. * - * @param cpyCurrentAtom Atom in the new atom container that is connected by + * @param cpyCurrentAtom atom in the new atom container that is connected by * the bond to be copied. - * @param cpyNbor The neighbour of `cpyCurrentAtom` that - * is connected by the bond one wants to copy. - * @param origBond The bond in the original molecule. - * @param atomContainer The new atom container to which the bond is to - * be copied. + * @param cpyNbor the neighbour of `cpyCurrentAtom` that is connected by the + * bond one wants to copy. + * @param origBond the bond in the original molecule. 
+ * @param atomContainer the new atom container to which the bond is to + * be copied. * @return The bond in the new atom container. */ private static IBond copyBond( @@ -741,13 +741,13 @@ private static IBond copyBond( * preserved but the structure is actually not chiral anymore. *

    * - * @param origMol The original molecule containing the stereochemical + * @param origMol the original molecule containing the stereochemical * information. - * @param fragmentContainer The new fragment where the stereochemical + * @param fragmentContainer the new fragment where the stereochemical * information will be added. - * @param origToCpyAtomMap A mapping of atoms from the original molecule to + * @param origToCpyAtomMap a mapping of atoms from the original molecule to * their corresponding atoms in the new fragment. - * @param origToCpyBondMap A mapping of bonds from the original molecule to + * @param origToCpyBondMap a mapping of bonds from the original molecule to * their corresponding bonds in the new fragment. */ private void attemptCopyStereoInformation( @@ -797,8 +797,8 @@ private void attemptCopyStereoInformation( * molecule into multiple fragments by removing the specified bonds and * making copies of the resulting fragments. * - * @param origMol The molecule to be split. - * @param bondsToSplit The bonds that should be removed to create + * @param origMol the molecule to be split. + * @param bondsToSplit the bonds that should be removed to create * separate fragments. * @return An array of copied molecular fragments resulting from the split. 
*/ From 521afe77fc15be045fe16f8726ce72d6279e9c50 Mon Sep 17 00:00:00 2001 From: ToLeWeiss Date: Wed, 17 Sep 2025 18:08:20 +0200 Subject: [PATCH 36/42] switching to big integers to lift the upper limit of 31 splittable Bonds per fragmentation --- .../cdk/fragment/ExhaustiveFragmenter.java | 84 +++++++++---------- .../fragment/ExhaustiveFragmenterTest.java | 50 +++++++++-- 2 files changed, 85 insertions(+), 49 deletions(-) diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index bba7ee7c39a..27aa3d21898 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -36,6 +36,7 @@ import org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; +import java.math.BigInteger; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; @@ -63,9 +64,7 @@ * and {@link SmiFlavor#UseAromaticSymbols}. *
  • Stereo information is disregarded
  • * - * However, users can modify these settings, with the exception, that the - * maximum tree depth can not be higher than 31 (Java's limitation caused by - * integer indexing). + * However, users can modify these settings. *

    * Warning on preservation of stereo information: This process * is not reliable and can lead to incorrect stereochemistry in the resulting @@ -180,7 +179,7 @@ public enum Saturation { new SmilesGenerator( SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols ); - private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = Integer.SIZE - 1; + private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = 31; private static final boolean DEFAULT_COPY_STEREO_INFO = false; private Map fragMap; @@ -356,21 +355,18 @@ public void setSaturationSetting(Saturation saturationSetting) { * Sets the maximum number of bonds that can be simultaneously split in a * single fragmentation event. *

    - * Must be within the range {@code 0 < inclusiveMaxTreeDepth < 32}. This - * limit is important due to the combinatorial explosion of fragments - * (which scales with 2^n, where n is the number of splittable bonds) and - * Java's use of 32-bit integers for indexing. Setting a lower limit can - * help manage computational resources for larger molecules. + * This is a practical limit to prevent combinatorial explosion and + * out-of-memory errors for very large molecules. The value must be + * a positive integer. *

    * * @param inclusiveMaxTreeDepth the exclusive maximum number of bonds that * can be split in one atom container. */ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { - if (inclusiveMaxTreeDepth <= 0 || inclusiveMaxTreeDepth >= 32) { + if (inclusiveMaxTreeDepth <= 0) { throw new IllegalArgumentException( - "Inclusive max tree depth must be grater then zero and " + - "smaller then 32. Provided: " + inclusiveMaxTreeDepth + "Inclusive max tree depth must be greater than zero" ); } this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; @@ -468,7 +464,9 @@ private void run(IAtomContainer atomContainer) throws CDKException { // Compute the number of possible bond subsets (excluding the empty set): // 2^n - 1 - int numberOfIterations = (1 << splittableBonds.length) - 1; + BigInteger numberOfIterations = BigInteger.ONE.shiftLeft( + splittableBonds.length + ).subtract(BigInteger.ONE); // Store indices of splittable bonds for subset generation int[] splittableBondIndices = new int[splittableBonds.length]; @@ -476,10 +474,17 @@ private void run(IAtomContainer atomContainer) throws CDKException { splittableBondIndices[i] = splittableBonds[i].getIndex(); } - this.fragMap = new HashMap<>(numberOfIterations); + if (numberOfIterations.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) >= 0) { + this.fragMap = new HashMap<>(Integer.MAX_VALUE); + } else { + this.fragMap = new HashMap<>(numberOfIterations.intValue()); + } // Iterate over all non-empty subsets of splittable bonds - for (int i = 1; i <= numberOfIterations; i++) { + for (BigInteger i = BigInteger.ONE; + i.compareTo(numberOfIterations) <= 0; + i = i.add(BigInteger.ONE) + ) { int[] subset = generateSubset(i, splittableBondIndices); int subsetSize = subset.length; @@ -528,20 +533,14 @@ private void run(IAtomContainer atomContainer) throws CDKException { * fragmentation. This method is especially useful to determine if it is * even possible to split a specific molecule exhaustively. 
The number of * fragments is 2^n - 1 with n being the number of splittable bonds. - * It is impossible to generate all possible fragment combinations for a molecule - * with more than 31 splittable bonds, as this would exceed the maximum tree depth - * of 31 due to the combinatorial explosion. For molecules with more than 31 - * splittable bonds, the fragmentation will still occur, but it will be limited - * to a maximum of {@code inclusiveMaxTreeDepth} bonds per fragmentation step. - * To mitigate this one can check this with this function, for example: - *
    -     *     {@code
    -     *     ExhaustiveFragmenter exhFragmenter = new Exhaustive Fragmenter;
    -     *     if (exhFragmenter.getSplittableBonds(mol) > Integer.SIZE - 1) {
    -     *         // handle the case, where it is impossible to entirely split the
    -     *         // molecule
    -     *     }}
    -     * 
    + * + *

    + * Due to the combinatorial explosion, the number of fragments can grow + * extremely large, potentially leading to out-of-memory errors or + * excessively long processing times. Consider using the + * {@link #setInclusiveMaxTreeDepth(int)} method to limit the number of + * simultaneous bond splits. + *

    * * @param atomContainer the container which contains the molecule in question. * @return the bonds which would be split by the exhaustive fragmentation. @@ -595,8 +594,9 @@ public static IBond[] getSplittableBonds(IAtomContainer atomContainer) { * elements does not matter (i.e., `[1, 2]` and `[2, 1]` are equivalent). * *

    The total number of possible subsets is (2^n) - 1, where `n` is the - * length of `nums`. Subsets are generated using bitwise operations, where - * each `1` bit in `index` selects the corresponding element from `nums`.

    + * length of `nums`. Subsets are generated using bitwise operations on the + * provided {@link BigInteger}, where each `1` bit in `index` selects the + * corresponding element from `nums`.

    * *

    Example output for `nums = [1, 2, 3]`:

    *
    @@ -624,24 +624,24 @@ public static IBond[] getSplittableBonds(IAtomContainer atomContainer) {
          *             in `nums` may result in duplicate subset entries.
          * @return An array containing the subset corresponding to `index`.
          */
    -    protected static int[] generateSubset(int index, int[] nums) {
    -        // Allocate subset array based on the number of 1-bits in index.
    -        int[] subset = new int[Integer.bitCount(index)];
    +    protected static int[] generateSubset(BigInteger index, int[] nums) {
    +        // allocate subset array based on the number of 1-bits in index.
    +        int[] subset = new int[index.bitCount()];
             int subsetIndex = 0;
     
    -        // Process using bit manipulation - only iterate through set bits
    -        while (index != 0) {
    -            // Find position of lowest set bit
    -            int lowestBitPos = Integer.numberOfTrailingZeros(index);
    +        // only iterate through set bits
    +        while (index.compareTo(BigInteger.ZERO) != 0) {
    +            // find position of lowest set bit
    +            int lowestBitPos = index.getLowestSetBit();
     
    -            // Add the corresponding element from nums if within bounds
    +            // add the corresponding element from nums if within bounds
                 if (lowestBitPos < nums.length) {
                     subset[subsetIndex] = nums[lowestBitPos];
                     subsetIndex++;
                 }
     
    -            // Clear the lowest set bit and continue
    -            index = index & (index - 1);
    +            // clear the lowest set bit and continue
    +            index = index.clearBit(lowestBitPos);
             }
     
             return subset;
    @@ -711,7 +711,7 @@ private static IBond copyBond(
                     cpyCurrentAtom,
                     cpyNbor,
                     origBond.getOrder());
    -        cpyBond.setStereo(origBond.getStereo());
    +        cpyBond.setDisplay(origBond.getDisplay());
             cpyBond.setIsAromatic(origBond.isAromatic());
             // Setting is in ring is possible here because we always detect rings
             // in the process of detecting the splittable bonds.
    diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
    index 0e1af6ab8ba..750cebc4842 100644
    --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
    +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
    @@ -31,6 +31,7 @@
     import org.openscience.cdk.smiles.SmilesParser;
     import org.openscience.cdk.test.CDKTestCase;
     
    +import java.math.BigInteger;
     import java.util.Arrays;
     import java.util.HashSet;
     import java.util.Set;
    @@ -497,25 +498,60 @@ void testGenerateSubset() {
             int[] nums = new int[]{10, 20, 30, 40};
     
             // index = 1 (0001) -> {nums[0]}
    -        Assertions.assertArrayEquals(new int[]{10}, ExhaustiveFragmenter.generateSubset(1, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{10},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(1), nums)
    +        );
     
             // index = 2 (0010) -> {nums[1]}
    -        Assertions.assertArrayEquals(new int[]{20}, ExhaustiveFragmenter.generateSubset(2, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{20},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(2), nums)
    +        );
     
             // index = 3 (0011) -> {nums[0], nums[1]}
    -        Assertions.assertArrayEquals(new int[]{10, 20}, ExhaustiveFragmenter.generateSubset(3, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{10, 20},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(3), nums)
    +        );
     
             // index = 4 (0100) -> {nums[2]}
    -        Assertions.assertArrayEquals(new int[]{30}, ExhaustiveFragmenter.generateSubset(4, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{30},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(4), nums)
    +        );
     
             // index = 5 (0101) -> {nums[0], nums[2]}
    -        Assertions.assertArrayEquals(new int[]{10, 30}, ExhaustiveFragmenter.generateSubset(5, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{10, 30},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(5), nums)
    +        );
     
             // index = 7 (0111) -> {nums[0], nums[1], nums[2]}
    -        Assertions.assertArrayEquals(new int[]{10, 20, 30}, ExhaustiveFragmenter.generateSubset(7, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{10, 20, 30},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(7), nums)
    +        );
     
             // index = 15 (1111) -> {nums[0], nums[1], nums[2], nums[3]}
    -        Assertions.assertArrayEquals(new int[]{10, 20, 30, 40}, ExhaustiveFragmenter.generateSubset(15, nums));
    +        Assertions.assertArrayEquals(
    +                new int[]{10, 20, 30, 40},
    +                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(15), nums)
    +        );
    +
    +        int[] longRange = new int[100];
    +        for (int i = 0; i < longRange.length; i++) {
    +            longRange[i] = i;
    +        }
    +
    +        // index = 70 (1000... and 67 more zeros) -> {nums[70]}
    +        Assertions.assertArrayEquals(
    +                new int[] {70},
    +                ExhaustiveFragmenter.generateSubset(
    +                        BigInteger.ONE.shiftLeft(70),
    +                        longRange
    +                )
    +        );
         }
     
         /**
    
    From e7c54893474a48e25d8b81509adb14829e948d1d Mon Sep 17 00:00:00 2001
    From: ToLeWeiss 
    Date: Thu, 4 Sep 2025 23:43:26 +0200
    Subject: [PATCH 37/42] cherry picked commit: 71da5e6e4d to revert to integers
     instead of BigInt
    
    
    From b186c0e72e5074a32ddccb53ca7c02bb5fcba366 Mon Sep 17 00:00:00 2001
    From: ToLeWeiss 
    Date: Thu, 4 Sep 2025 12:55:22 +0200
    Subject: [PATCH 38/42] cherry picked hopefully the right commit commit:d6f52e8
     to revert to integers instead of BigInt
    
    
    From 6a0330593ae36f7bda0c4bc2e9217c32203e5823 Mon Sep 17 00:00:00 2001
    From: ToLeWeiss 
    Date: Wed, 22 Oct 2025 19:54:30 +0200
    Subject: [PATCH 39/42] I finally read the documentation of git cherry-pick and
     therefore applied the changes from BigInt back to int manually
    
    ---
     .../cdk/fragment/ExhaustiveFragmenter.java    | 81 ++++++++++---------
     .../fragment/ExhaustiveFragmenterTest.java    | 29 ++-----
     2 files changed, 49 insertions(+), 61 deletions(-)
    
    diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    index 27aa3d21898..b6cc0878da8 100644
    --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    @@ -36,7 +36,6 @@
     import org.openscience.cdk.tools.ILoggingTool;
     import org.openscience.cdk.tools.LoggingToolFactory;
     
    -import java.math.BigInteger;
     import java.util.ArrayDeque;
     import java.util.ArrayList;
     import java.util.Arrays;
    @@ -59,12 +58,14 @@
      *     returned.
      * 
  • Fragments are returned with open valences, where a bond has been split.
  • *
  • The fragmentation splits at a maximum tree depth of 31, meaning that - * maximum 31 bonds are split in one run.
  • + * maximum 27 bonds are split in one run. *
  • The SMILES code of the fragments is generated with {@link SmiFlavor#Unique} * and {@link SmiFlavor#UseAromaticSymbols}.
  • *
  • Stereo information is disregarded
  • * - * However, users can modify these settings. + * However, users can modify these settings, with the exception, that the + * maximum tree depth can not be higher than 31 (Java's limitation caused by + * integer indexing). *

    * Warning on preservation of stereo information: This process * is not reliable and can lead to incorrect stereochemistry in the resulting @@ -179,7 +180,10 @@ public enum Saturation { new SmilesGenerator( SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols ); - private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = 31; + // assuming each fragment is unique (as if there was no deduplication) + // 27 would be the maximum tree depth to hold all fragments in the + // hashmap. + private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = 27; private static final boolean DEFAULT_COPY_STEREO_INFO = false; private Map fragMap; @@ -355,18 +359,21 @@ public void setSaturationSetting(Saturation saturationSetting) { * Sets the maximum number of bonds that can be simultaneously split in a * single fragmentation event. *

    - * This is a practical limit to prevent combinatorial explosion and - * out-of-memory errors for very large molecules. The value must be - * a positive integer. + * Must be within the range {@code 0 < inclusiveMaxTreeDepth < 32}. This + * limit is important due to the combinatorial explosion of fragments + * (which scales with 2^n, where n is the number of splittable bonds) and + * Java's use of 32-bit integers for indexing. Setting a lower limit can + * help manage computational resources for larger molecules. *

    * * @param inclusiveMaxTreeDepth the exclusive maximum number of bonds that * can be split in one atom container. */ public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { - if (inclusiveMaxTreeDepth <= 0) { + if (inclusiveMaxTreeDepth <= 0 || inclusiveMaxTreeDepth >= 32) { throw new IllegalArgumentException( - "Inclusive max tree depth must be greater than zero" + "Inclusive max tree depth must be grater then zero and " + + "smaller then 32. Provided: " + inclusiveMaxTreeDepth ); } this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; @@ -464,9 +471,7 @@ private void run(IAtomContainer atomContainer) throws CDKException { // Compute the number of possible bond subsets (excluding the empty set): // 2^n - 1 - BigInteger numberOfIterations = BigInteger.ONE.shiftLeft( - splittableBonds.length - ).subtract(BigInteger.ONE); + int numberOfIterations = (1 << splittableBonds.length) - 1; // Store indices of splittable bonds for subset generation int[] splittableBondIndices = new int[splittableBonds.length]; @@ -474,17 +479,10 @@ private void run(IAtomContainer atomContainer) throws CDKException { splittableBondIndices[i] = splittableBonds[i].getIndex(); } - if (numberOfIterations.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) >= 0) { - this.fragMap = new HashMap<>(Integer.MAX_VALUE); - } else { - this.fragMap = new HashMap<>(numberOfIterations.intValue()); - } + this.fragMap = new HashMap<>(numberOfIterations); // Iterate over all non-empty subsets of splittable bonds - for (BigInteger i = BigInteger.ONE; - i.compareTo(numberOfIterations) <= 0; - i = i.add(BigInteger.ONE) - ) { + for (int i = 1; i <= numberOfIterations; i++) { int[] subset = generateSubset(i, splittableBondIndices); int subsetSize = subset.length; @@ -533,14 +531,20 @@ private void run(IAtomContainer atomContainer) throws CDKException { * fragmentation. This method is especially useful to determine if it is * even possible to split a specific molecule exhaustively. 
The number of * fragments is 2^n - 1 with n being the number of splittable bonds. - * - *

    - * Due to the combinatorial explosion, the number of fragments can grow - * extremely large, potentially leading to out-of-memory errors or - * excessively long processing times. Consider using the - * {@link #setInclusiveMaxTreeDepth(int)} method to limit the number of - * simultaneous bond splits. - *

    + * It is impossible to generate all possible fragment combinations for a molecule + * with more than 31 splittable bonds, as this would exceed the maximum tree depth + * of 31 due to the combinatorial explosion. For molecules with more than 31 + * splittable bonds, the fragmentation will still occur, but it will be limited + * to a maximum of {@code inclusiveMaxTreeDepth} bonds per fragmentation step. + * To mitigate this one can check this with this function, for example: + *
    +     *     {@code
    +     *     ExhaustiveFragmenter exhFragmenter = new ExhaustiveFragmenter();
    +     *     if (ExhaustiveFragmenter.getSplittableBonds(mol).length > Integer.SIZE - 1) {
    +     *         // handle the case, where it is impossible to entirely split the
    +     *         // molecule
    +     *     }}
    +     * 
    * * @param atomContainer the container which contains the molecule in question. * @return the bonds which would be split by the exhaustive fragmentation. @@ -594,9 +598,8 @@ public static IBond[] getSplittableBonds(IAtomContainer atomContainer) { * elements does not matter (i.e., `[1, 2]` and `[2, 1]` are equivalent). * *

    The total number of possible subsets is (2^n) - 1, where `n` is the - * length of `nums`. Subsets are generated using bitwise operations on the - * provided {@link BigInteger}, where each `1` bit in `index` selects the - * corresponding element from `nums`.

    + * length of `nums`. Subsets are generated using bitwise operations, where + * each `1` bit in `index` selects the corresponding element from `nums`.

    * *

    Example output for `nums = [1, 2, 3]`:

    *
    @@ -624,15 +627,15 @@ public static IBond[] getSplittableBonds(IAtomContainer atomContainer) {
          *             in `nums` may result in duplicate subset entries.
          * @return An array containing the subset corresponding to `index`.
          */
    -    protected static int[] generateSubset(BigInteger index, int[] nums) {
    +    protected static int[] generateSubset(int index, int[] nums) {
             // allocate subset array based on the number of 1-bits in index.
    -        int[] subset = new int[index.bitCount()];
    +        int[] subset = new int[Integer.bitCount(index)];
             int subsetIndex = 0;
     
    -        // only iterate through set bits
    -        while (index.compareTo(BigInteger.ZERO) != 0) {
    +        // process using bit manipulation - only iterate through set bits
    +        while (index != 0) {
                 // find position of lowest set bit
    -            int lowestBitPos = index.getLowestSetBit();
    +            int lowestBitPos = Integer.numberOfTrailingZeros(index);
     
                 // add the corresponding element from nums if within bounds
                 if (lowestBitPos < nums.length) {
    @@ -640,8 +643,8 @@ protected static int[] generateSubset(BigInteger index, int[] nums) {
                     subsetIndex++;
                 }
     
    -            // clear the lowest set bit and continue
    -            index = index.clearBit(lowestBitPos);
    +            // Clear the lowest set bit and continue
    +            index = index & (index - 1);
             }
     
             return subset;
    diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
    index 750cebc4842..d58cbc4e967 100644
    --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
    +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java
    @@ -31,7 +31,6 @@
     import org.openscience.cdk.smiles.SmilesParser;
     import org.openscience.cdk.test.CDKTestCase;
     
    -import java.math.BigInteger;
     import java.util.Arrays;
     import java.util.HashSet;
     import java.util.Set;
    @@ -500,57 +499,43 @@ void testGenerateSubset() {
             // index = 1 (0001) -> {nums[0]}
             Assertions.assertArrayEquals(
                     new int[]{10},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(1), nums)
    +                ExhaustiveFragmenter.generateSubset(1, nums)
             );
     
             // index = 2 (0010) -> {nums[1]}
             Assertions.assertArrayEquals(
                     new int[]{20},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(2), nums)
    +                ExhaustiveFragmenter.generateSubset(2, nums)
             );
     
             // index = 3 (0011) -> {nums[0], nums[1]}
             Assertions.assertArrayEquals(
                     new int[]{10, 20},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(3), nums)
    +                ExhaustiveFragmenter.generateSubset(3, nums)
             );
     
             // index = 4 (0100) -> {nums[2]}
             Assertions.assertArrayEquals(
                     new int[]{30},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(4), nums)
    +                ExhaustiveFragmenter.generateSubset(4, nums)
             );
     
             // index = 5 (0101) -> {nums[0], nums[2]}
             Assertions.assertArrayEquals(
                     new int[]{10, 30},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(5), nums)
    +                ExhaustiveFragmenter.generateSubset(5, nums)
             );
     
             // index = 7 (0111) -> {nums[0], nums[1], nums[2]}
             Assertions.assertArrayEquals(
                     new int[]{10, 20, 30},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(7), nums)
    +                ExhaustiveFragmenter.generateSubset(7, nums)
             );
     
             // index = 15 (1111) -> {nums[0], nums[1], nums[2], nums[3]}
             Assertions.assertArrayEquals(
                     new int[]{10, 20, 30, 40},
    -                ExhaustiveFragmenter.generateSubset(BigInteger.valueOf(15), nums)
    -        );
    -
    -        int[] longRange = new int[100];
    -        for (int i = 0; i < longRange.length; i++) {
    -            longRange[i] = i;
    -        }
    -
    -        // index = 70 (1000... and 67 more zeros) -> {nums[70]}
    -        Assertions.assertArrayEquals(
    -                new int[] {70},
    -                ExhaustiveFragmenter.generateSubset(
    -                        BigInteger.ONE.shiftLeft(70),
    -                        longRange
    -                )
    +                ExhaustiveFragmenter.generateSubset(15, nums)
             );
         }
     
    
    From 7abb1f40333123b21d3521482153d7e110ca234f Mon Sep 17 00:00:00 2001
    From: ToLeWeiss 
    Date: Wed, 22 Oct 2025 20:26:04 +0200
    Subject: [PATCH 40/42] adjusted maximum tree depth in the comments to 27
    
    ---
     .../openscience/cdk/fragment/ExhaustiveFragmenter.java | 10 +++++-----
     1 file changed, 5 insertions(+), 5 deletions(-)
    
    diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    index b6cc0878da8..decbfaa9904 100644
    --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    @@ -57,7 +57,7 @@
      * <li>Fragments smaller than 6 atoms (excluding implicit hydrogen) are not
      *     returned.</li>
      * <li>Fragments are returned with open valences, where a bond has been split.</li>
    - * <li>The fragmentation splits at a maximum tree depth of 31, meaning that
    + * <li>The fragmentation splits at a maximum tree depth of 27, meaning that
      *     maximum 27 bonds are split in one run.</li>
      * <li>The SMILES code of the fragments is generated with {@link SmiFlavor#Unique}
      *     and {@link SmiFlavor#UseAromaticSymbols}.</li>
    @@ -202,7 +202,7 @@ public enum Saturation {
          *     <li>{@link Saturation#UNSATURATED_FRAGMENTS}</li>
          *     <li>Default {@link SmilesGenerator}
          *     ({@link SmiFlavor#Unique} | {@link SmiFlavor#UseAromaticSymbols})</li>
    -     *     <li>inclusive maximum tree depth of 31</li>
    +     *     <li>inclusive maximum tree depth of 27</li>
          *     <li>stereo information is not preserved</li>
          * </ul>
          */
    @@ -219,7 +219,7 @@ public ExhaustiveFragmenter() {
         /**
          * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment
          * size and saturation setting. Uses the default {@link SmilesGenerator} and
    -     * default inclusive maximum tree depth of 31. Stereo information is not
    +     * default inclusive maximum tree depth of 27. Stereo information is not
          * preserved.
          *
          * @param minFragSize minimum number of atoms in a valid fragment
    @@ -241,7 +241,7 @@ public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) {
          * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment
          * size. Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}.
          * Uses the default {@link SmilesGenerator} and the default
    -     * inclusive maximum tree depth of 31. Stereo information is not
    +     * inclusive maximum tree depth of 27. Stereo information is not
          * preserved.
          *
          * @param minFragSize minimum number of atoms in a valid fragment
    @@ -260,7 +260,7 @@ public ExhaustiveFragmenter(int minFragSize) {
         /**
          * Constructs an ExhaustiveFragmenter with a user-defined saturation setting.
          * Minimum fragment size defaults to 6. Uses the default {@link SmilesGenerator}
    -     * and the default inclusive maximum tree depth of 31. Stereo information is
    +     * and the default inclusive maximum tree depth of 27. Stereo information is
          * not preserved.
          *
          * @param saturation how open valences should be treated after the
    
    From 9ca3947fd18be15475366c0546ec95d016a351ec Mon Sep 17 00:00:00 2001
    From: ToLeWeiss 
    Date: Thu, 23 Oct 2025 12:43:32 +0200
    Subject: [PATCH 41/42] added exception to mitigate the indexing of the fragmentation result map to overflow
    
    ---
     .../org/openscience/cdk/fragment/ExhaustiveFragmenter.java | 7 +++++++
     1 file changed, 7 insertions(+)
    
    diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    index decbfaa9904..0c5d2b06b09 100644
    --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    @@ -520,6 +520,13 @@ private void run(IAtomContainer atomContainer) throws CDKException {
                     // Store the fragment if it meets the size requirement and is
                     // unique
                     if (numberOfAtoms >= minFragSize) {
    +                    if (fragMap.size() == HashMap.) {
    +                        throw new ArithmeticException(
    +                                "The maximum size to store the current amount of " +
    +                                "molecules would exceed the maximum value for an " +
    +                                "integer and overflow"
    +                        );
    +                    }
                         fragMap.putIfAbsent(tmpSmiles, partContainer);
                     }
                 }
    
    From 59e232c8125ee131dced0412f77e805bb81f658b Mon Sep 17 00:00:00 2001
    From: ToLeWeiss 
    Date: Thu, 23 Oct 2025 12:47:04 +0200
    Subject: [PATCH 42/42] fixed missing value
    
    ---
     .../java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    index 0c5d2b06b09..65bffc4acde 100644
    --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java
    @@ -520,7 +520,7 @@ private void run(IAtomContainer atomContainer) throws CDKException {
                     // Store the fragment if it meets the size requirement and is
                     // unique
                     if (numberOfAtoms >= minFragSize) {
    -                    if (fragMap.size() == HashMap.) {
    +                    if (fragMap.size() == Integer.MAX_VALUE) {
                             throw new ArithmeticException(
                                     "The maximum size to store the current amount of " +
                                     "molecules would exceed the maximum value for an " +