diff --git a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java index af248b8e9f..65bffc4acd 100644 --- a/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java +++ b/tool/fragment/src/main/java/org/openscience/cdk/fragment/ExhaustiveFragmenter.java @@ -1,4 +1,5 @@ -/* Copyright (C) 2010 Rajarshi Guha +/* Copyright (C) 2025 Rajarshi Guha + * Tom Weiß * * Contact: cdk-devel@lists.sourceforge.net * @@ -22,185 +23,938 @@ */ package org.openscience.cdk.fragment; -import org.openscience.cdk.aromaticity.Aromaticity; import org.openscience.cdk.exception.CDKException; -import org.openscience.cdk.graph.SpanningTree; import org.openscience.cdk.interfaces.IAtom; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IBond; -import org.openscience.cdk.interfaces.IRingSet; +import org.openscience.cdk.interfaces.IChemObject; +import org.openscience.cdk.interfaces.IPseudoAtom; +import org.openscience.cdk.interfaces.IStereoElement; +import org.openscience.cdk.ringsearch.RingSearch; +import org.openscience.cdk.smiles.SmiFlavor; import org.openscience.cdk.smiles.SmilesGenerator; -import org.openscience.cdk.tools.CDKHydrogenAdder; import org.openscience.cdk.tools.ILoggingTool; import org.openscience.cdk.tools.LoggingToolFactory; -import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; +import java.util.ArrayDeque; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; /** - * Generate fragments exhaustively. - * - * This fragmentation scheme simply breaks single non-ring bonds. By default - * fragments smaller than 6 atoms in size are not considered, but this can be - * changed by the user. Side chains are retained. 
+ * Performs exhaustive fragmentation of molecules by breaking single non-ring, + * non-terminal bonds in all combinations. If it is not possible to generate + * fragments, an empty list is returned. Non-terminal bonds are those connected + * to heavy atoms that respectively have another bond to a heavy atom. + *

+ * By default: + *

    + *
  • Fragments smaller than 6 atoms (excluding implicit hydrogen) are not + * returned.
  • + *
  • Fragments are returned with open valences, where a bond has been split.
  • + *
  • The fragmentation splits at a maximum tree depth of 27, meaning that + * at most 27 bonds are split in one run.
  • + *
  • The SMILES code of the fragments is generated with {@link SmiFlavor#Unique} + * and {@link SmiFlavor#UseAromaticSymbols}.
  • + *
  • Stereo information is disregarded
  • + *
+ * However, users can modify these settings, with the exception that the + * maximum tree depth cannot be higher than 31 (a limitation caused by Java's + * 32-bit integer indexing). + *

+ * Warning on preservation of stereo information: This process + * is not reliable and can lead to incorrect stereochemistry in the resulting + * fragments. When a chiral center is broken during fragmentation, the new + * fragment may be incorrectly assigned as chiral even if it is not + * anymore because some of its substituents are now equal. + *

+ * Example: the chiral molecule {@code CC[C@@H](Cl)CCCC} will, among others, + * produce the fragment {@code CC[C@@H](Cl)CC} where the stereo configuration is + * preserved but the structure is actually not chiral anymore. + *

+ *

+ * Note on Stereochemistry and SMILES: + * For stereochemical information to be included in the SMILES strings + * returned by {@link #getFragments()}, the `smilesGenerator` used by this + * fragmenter must be configured with the {@link SmiFlavor#Stereo} flag. + * If the flag is not set, the SMILES will not contain stereochemistry, + * even if this setting is enabled and the underlying {@code IAtomContainer} + * objects have stereo elements. + *

+ *

+ * Fragment Deduplication: + * The `ExhaustiveFragmenter` uses unique SMILES strings for internal + * deduplication of generated fragments. This means that after a fragment is + * generated, its unique SMILES representation is computed (using the default or + * user specified {@link SmilesGenerator}). Be aware that stereo information is + * only copied and checked for deduplication if + * {@link ExhaustiveFragmenter#setPreserveStereo} is set to true and the + * specified {@link SmilesGenerator} has {@link SmiFlavor#Stereo}. If a fragment + * with the same canonical SMILES has already been generated and stored, the new + * fragment is considered a duplicate and is not added to the results. + *

+ * This deduplication strategy is particularly important when considering the + * {@link Saturation} setting: + *

    + *
  • If fragments are {@link Saturation#HYDROGEN_SATURATED_FRAGMENTS}, the + * saturation process might lead to a canonical SMILES that is identical to a + * fragment obtained via a different bond cleavage, or a fragment that appears + * different due to explicit hydrogen representation but becomes identical when + * canonicalized.
  • + *
  • For example, an unsaturated fragment like `[CH]1CCCCC1` (cyclohexyl + * radical) might deduplicate with a saturated `C1CCCCC1` (cyclohexane) if + * `HYDROGEN_SATURATED_FRAGMENTS` is enabled and both forms canonicalize to the + * same SMILES depending on the exact SMILES generator and atom properties.
  • + *
  • It is crucial to understand that the uniqueness is based solely on the + * canonical SMILES string, not on the exact atom-by-atom identity or origin + * within the original molecule.
  • + *
+ *

+ * Example Usage: + *

{@code
+ * // By default, returns unsaturated fragments with a minimum size of 6 atoms
+ * ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter();
+ * SmilesParser smiParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
+ * // Cyclopentylbenzene
+ * IAtomContainer mol = smiParser.parseSmiles("C1CCCC1C1=CC=CC=C1");
+ * fragmenter.generateFragments(mol);
+ *
+ * // Retrieve SMILES representations of fragments
+ * String[] smilesFragments = fragmenter.getFragments();
+ * // Example Result (depending on exact fragmentation points and min size):
+ * // "[C]1=CC=CC=C1"
+ *
+ * // Retrieve AtomContainer representations of fragments
+ * IAtomContainer[] atomContainerFragments = fragmenter.getFragmentsAsContainers();
+ *
+ * // Example: Configuring for hydrogen-saturated fragments with a minimum size of 5
+ * ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(
+ *      5,
+ *      ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS
+ * );
+ * saturatedFragmenter.generateFragments(mol);
+ * String[] saturatedSmilesFragments = saturatedFragmenter.getFragments();
+ * // "C1CCCC1", "C1=CC=CC=C1"
+ * }
* * @author Rajarshi Guha + * @author Tom Weiß + * @cdk.module fragment * @cdk.keyword fragment */ public class ExhaustiveFragmenter implements IFragmenter { - private static final int DEFAULT_MIN_FRAG_SIZE = 6; + /** + * Specifies whether generated fragments should be saturated (hydrogens added) + * or unsaturated. + */ + public enum Saturation { + /** + * Fragments will be returned in their saturated form + * (implicit hydrogen atoms added). + */ + HYDROGEN_SATURATED_FRAGMENTS, + + /** + * Fragments will be saturated with R atoms. + */ + R_SATURATED_FRAGMENTS, + + /** + * Fragments will be returned in their unsaturated form + * (no additional hydrogen atoms). The unsaturated atoms are the atoms + * of the split bonds. + */ + UNSATURATED_FRAGMENTS + } + + private static final int DEFAULT_MIN_FRAG_SIZE = 6; + private static final Saturation DEFAULT_SATURATION = + Saturation.UNSATURATED_FRAGMENTS; + private static final SmilesGenerator DEFAULT_SMILES_GENERATOR = + new SmilesGenerator( + SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols + ); + // assuming each fragment is unique (as if there was no deduplication) + // 27 would be the maximum tree depth to hold all fragments in the + // hashmap. + private static final int DEFAULT_INCLUSIVE_MAX_TREE_DEPTH = 27; + private static final boolean DEFAULT_COPY_STEREO_INFO = false; - final Map fragMap; - final SmilesGenerator smilesGenerator; - String[] fragments = null; - int minFragSize; - private static final ILoggingTool logger = LoggingToolFactory - .createLoggingTool(ExhaustiveFragmenter.class); + private Map fragMap; + private final SmilesGenerator smilesGenerator; + private int inclusiveMaxTreeDepth; + private int minFragSize; + private Saturation saturationSetting; + private boolean preserveStereo; + private static final ILoggingTool logger = + LoggingToolFactory.createLoggingTool(ExhaustiveFragmenter.class); /** - * Instantiate fragmenter with default minimum fragment size. 
+ * Constructs an ExhaustiveFragmenter with the default settings: + *
    + *
  • Minimum fragment size: 6 atoms (excluding implicit hydrogen)
  • + *
  • {@link Saturation#UNSATURATED_FRAGMENTS}
  • + *
  • Default {@link SmilesGenerator} + * ({@link SmiFlavor#Unique} | {@link SmiFlavor#UseAromaticSymbols})
  • + *
  • inclusive maximum tree depth of 27
  • + *
  • stereo information is not preserved
  • + *
*/ public ExhaustiveFragmenter() { - this(DEFAULT_MIN_FRAG_SIZE); + this( + DEFAULT_SMILES_GENERATOR, + DEFAULT_MIN_FRAG_SIZE, + DEFAULT_SATURATION, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO + ); } /** - * Instantiate fragmenter with user specified minimum fragment size. + * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment + * size and saturation setting. Uses the default {@link SmilesGenerator} and + * default inclusive maximum tree depth of 27. Stereo information is not + * preserved. * - * @param minFragSize the minimum fragment size desired + * @param minFragSize minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). + * @param saturationSetting determines whether fragments should be saturated + * (with hydrogens or R-atoms) or unsaturated. + */ + public ExhaustiveFragmenter(int minFragSize, Saturation saturationSetting) { + this( + DEFAULT_SMILES_GENERATOR, + minFragSize, + saturationSetting, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO + ); + } + + /** + * Constructs an ExhaustiveFragmenter with a user-defined minimum fragment + * size. Saturation defaults to {@link Saturation#UNSATURATED_FRAGMENTS}. + * Uses the default {@link SmilesGenerator} and the default + * inclusive maximum tree depth of 27. Stereo information is not + * preserved. + * + * @param minFragSize minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). */ public ExhaustiveFragmenter(int minFragSize) { - this.minFragSize = minFragSize; - fragMap = new HashMap<>(); - smilesGenerator = SmilesGenerator.unique().aromatic(); + this( + DEFAULT_SMILES_GENERATOR, + minFragSize, + DEFAULT_SATURATION, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO + ); + } + + /** + * Constructs an ExhaustiveFragmenter with a user-defined saturation setting. + * Minimum fragment size defaults to 6. Uses the default {@link SmilesGenerator} + * and the default inclusive maximum tree depth of 27. 
Stereo information is + * not preserved. + * + * @param saturation how open valences should be treated after the + * fragmentation. + */ + public ExhaustiveFragmenter(Saturation saturation) { + this( + DEFAULT_SMILES_GENERATOR, + DEFAULT_MIN_FRAG_SIZE, + saturation, + DEFAULT_INCLUSIVE_MAX_TREE_DEPTH, + DEFAULT_COPY_STEREO_INFO + ); + } + + /** + * Constructs an ExhaustiveFragmenter with a user-provided + * {@link SmilesGenerator} and user defined: + *
    + *
  • minimum fragment size
  • + *
  • inclusive max tree depth
  • + *
  • saturation setting
  • + *
  • preservation of stereochemistry information
  • + *
      + * + * @param smilesGenerator the {@link SmilesGenerator} instance to use for + * creating SMILES strings + * for fragment deduplication and retrieval. + * @param minFragSize minimum number of atoms in a valid fragment + * (excluding implicit hydrogen). + * @param saturationSetting determines whether fragments should be saturated + * (with hydrogens or R-atoms) or unsaturated. + * @param inclusiveMaxTreeDepth represents the maximum number of bonds that + * will be split for a fragmentation. + * @param preserveStereo signals whether to attempt to copy stereochemical + * information from the original molecule to the + * generated fragments. Warning: This + * process is not reliable and can lead to incorrect + * stereochemistry in the resulting fragments. + */ + public ExhaustiveFragmenter( + SmilesGenerator smilesGenerator, + int minFragSize, + Saturation saturationSetting, + int inclusiveMaxTreeDepth, + boolean preserveStereo + ) { + if (saturationSetting == null) { + throw new NullPointerException( + "The given SaturationSetting can not be null" + ); + } + this.saturationSetting = saturationSetting; + if (smilesGenerator == null) { + throw new NullPointerException( + "The given SmilesGenerator can not be null" + ); + } + this.smilesGenerator = smilesGenerator; + this.preserveStereo = preserveStereo; + this.setInclusiveMaxTreeDepth(inclusiveMaxTreeDepth); + this.setMinimumFragmentSize(minFragSize); + this.fragMap = null; } /** - * Set the minimum fragment size. + * Sets the minimum allowed fragment size. This has to be greater than zero. * - * @param minFragSize the smallest size fragment that will be returned + * @param minFragSize minimum number of atoms in a valid fragment. 
*/ public void setMinimumFragmentSize(int minFragSize) { + if (minFragSize <= 0) { + throw new IllegalArgumentException( + "Minimum fragment size must be a positive integer (>= 1)" + + " Provided: " + minFragSize + ); + } this.minFragSize = minFragSize; } /** - * Generate fragments for the input molecule. + * Sets whether fragments should be saturated or unsaturated. + * + * @param saturationSetting the saturation mode for generated fragments. + */ + public void setSaturationSetting(Saturation saturationSetting) { + if (saturationSetting == null) { + throw new NullPointerException( + "The given SaturationSetting can not be null" + ); + } + this.saturationSetting = saturationSetting; + } + + /** + * Sets the maximum number of bonds that can be simultaneously split in a + * single fragmentation event. + *

      + * Must be within the range {@code 0 < inclusiveMaxTreeDepth < 32}. This + * limit is important due to the combinatorial explosion of fragments + * (which scales with 2^n, where n is the number of splittable bonds) and + * Java's use of 32-bit integers for indexing. Setting a lower limit can + * help manage computational resources for larger molecules. + *

      + * + * @param inclusiveMaxTreeDepth the exclusive maximum number of bonds that + * can be split in one atom container. + */ + public void setInclusiveMaxTreeDepth(int inclusiveMaxTreeDepth) { + if (inclusiveMaxTreeDepth <= 0 || inclusiveMaxTreeDepth >= 32) { + throw new IllegalArgumentException( + "Inclusive max tree depth must be grater then zero and " + + "smaller then 32. Provided: " + inclusiveMaxTreeDepth + ); + } + this.inclusiveMaxTreeDepth = inclusiveMaxTreeDepth; + } + + /** + * Sets whether stereochemical information from the original molecule should + * be copied to the generated fragments. + * + *

      + * Warning: The copying process is not reliable and can + * result in fragments with incorrect stereochemistry. This method copies + * elements based on the presence of atoms and bonds, but it does not perform + * a chemical validation check on the resulting fragment. + * For example, a chiral center might be copied even if the new fragment does + * not contain the minimum four different substituents required for chirality. + * Use caution and consider a separate validation step. + *

      + *

      + * Note on Stereochemistry and SMILES: + * For stereochemical information to be included in the SMILES strings + * returned by {@link #getFragments()}, the `smilesGenerator` used by this + * fragmenter must be configured with the {@link SmiFlavor#Stereo} flag. + * If the flag is not set, the SMILES will not contain stereochemistry, + * even if this setting is enabled and the underlying `IAtomContainer` objects + * have stereo elements. + *

      + * + * @param preserve {@code true} to enable attempting to copy + * stereo information; {@code false} otherwise. + */ + public void setPreserveStereo(boolean preserve) { + this.preserveStereo = preserve; + } + + /** + * Generates fragments for the given molecule. + * The generated fragments are stored internally and can be retrieved via: + * - {@link #getFragments()} (SMILES representation) + * - {@link #getFragmentsAsContainers()} (IAtomContainer representation) * - * @param atomContainer The input molecule. + * @param atomContainer the input molecule. + * @throws CDKException if fragmentation encounters an error. */ @Override public void generateFragments(IAtomContainer atomContainer) throws CDKException { - fragMap.clear(); + if (this.fragMap != null) { + this.fragMap.clear(); + } run(atomContainer); } - private List run(IAtomContainer atomContainer) throws CDKException { + /** + * Splits the molecule at all possible combinations of splittable bonds and + * saturates the open valences of the resulting fragments according to the + * {@link ExhaustiveFragmenter#saturationSetting}. Only non-ring and + * non-terminal single bonds are considered for splitting. + * + * @param atomContainer the molecule to be split. + * @throws CDKException if an error occurs during hydrogen addition or atom + * type perception. 
+ */ + private void run(IAtomContainer atomContainer) throws CDKException { + if (atomContainer == null) { + throw new NullPointerException("No molecule provided"); + } + + // Return early if the molecule has fewer than 3 bonds + // (no meaningful splits possible) + if (atomContainer.getBondCount() < 3 || + atomContainer.getAtomCount() < this.minFragSize || + atomContainer.isEmpty()) { + this.fragMap = new HashMap<>(0); + return; + } + + // Retrieve bonds that are eligible for splitting + IBond[] splittableBonds = getSplittableBonds(atomContainer); - ArrayList fragments = new ArrayList<>(); + // If no splittable bonds are found, return early + if (splittableBonds.length == 0) { + logger.debug("no splittable bonds found"); + this.fragMap = new HashMap<>(0); + return; + } + if (splittableBonds.length > this.inclusiveMaxTreeDepth) { + logger.debug( + "Got " + splittableBonds.length + " splittable bonds" + + " but only " + this.inclusiveMaxTreeDepth + " tree depth. " + + "This means only a maximum of " + this.inclusiveMaxTreeDepth + + " bonds can be split at once during a fragmentation step" + ); + } + logger.debug("Got " + splittableBonds.length + " splittable bonds"); + + // Compute the number of possible bond subsets (excluding the empty set): + // 2^n - 1 + int numberOfIterations = (1 << splittableBonds.length) - 1; + + // Store indices of splittable bonds for subset generation + int[] splittableBondIndices = new int[splittableBonds.length]; + for (int i = 0; i < splittableBonds.length; i++) { + splittableBondIndices[i] = splittableBonds[i].getIndex(); + } + + this.fragMap = new HashMap<>(numberOfIterations); + + // Iterate over all non-empty subsets of splittable bonds + for (int i = 1; i <= numberOfIterations; i++) { + int[] subset = generateSubset(i, splittableBondIndices); + int subsetSize = subset.length; + + // Skip subsets exceeding the allowed depth + if (subsetSize > this.inclusiveMaxTreeDepth) { + continue; + } - if (atomContainer.getBondCount() < 3) 
return fragments; - List splitableBonds = getSplitableBonds(atomContainer); - if (splitableBonds.size() == 0) return fragments; - logger.debug("Got " + splitableBonds.size() + " splittable bonds"); + // Convert subset indices back to bond objects + IBond[] bondsToSplit = new IBond[subsetSize]; + for (int j = 0; j < subsetSize; j++) { + bondsToSplit[j] = atomContainer.getBond(subset[j]); + } + + // Split the molecule and retrieve the resulting fragments + IAtomContainer[] parts = splitBondsWithCopy( + atomContainer, bondsToSplit + ); - String tmpSmiles; - for (IBond bond : splitableBonds) { - List parts = FragmentUtils.splitMolecule(atomContainer, bond); - // make sure we don't add the same fragment twice + // Process each fragment for (IAtomContainer partContainer : parts) { - AtomContainerManipulator.clearAtomConfigurations(partContainer); - for (IAtom atom : partContainer.atoms()) - atom.setImplicitHydrogenCount(null); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(partContainer); - CDKHydrogenAdder.getInstance(partContainer.getBuilder()).addImplicitHydrogens(partContainer); - Aromaticity.cdkLegacy().apply(partContainer); - tmpSmiles = smilesGenerator.create(partContainer); - if (partContainer.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - fragments.add(partContainer); - fragMap.put(tmpSmiles, partContainer); + + // Generate a unique SMILES representation of the fragment + String tmpSmiles = this.smilesGenerator.create(partContainer); + + int numberOfAtoms = 0; + for (IAtom atom : partContainer.atoms()) { + + if (atom instanceof IPseudoAtom) { + continue; + } + numberOfAtoms++; } - } - } - // try and partition the fragments - List tmp = new ArrayList<>(fragments); - for (IAtomContainer fragment : fragments) { - if (fragment.getBondCount() < 3 || fragment.getAtomCount() < minFragSize) continue; - if (getSplitableBonds(fragment).size() == 0) continue; - - List frags = run(fragment); - if (frags.size() == 0) continue; - - for 
(IAtomContainer frag : frags) { - if (frag.getBondCount() < 3) continue; - AtomContainerManipulator.clearAtomConfigurations(frag); - for (IAtom atom : frag.atoms()) - atom.setImplicitHydrogenCount(null); - AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(frag); - CDKHydrogenAdder.getInstance(frag.getBuilder()).addImplicitHydrogens(frag); - Aromaticity.cdkLegacy().apply(frag); - tmpSmiles = smilesGenerator.create(frag); - if (frag.getAtomCount() >= minFragSize && !fragMap.containsKey(tmpSmiles)) { - tmp.add(frag); - fragMap.put(tmpSmiles, frag); + // Store the fragment if it meets the size requirement and is + // unique + if (numberOfAtoms >= minFragSize) { + if (fragMap.size() == Integer.MAX_VALUE) { + throw new ArithmeticException( + "The maximum size to store the current amount of " + + "molecules would exceed the maximum value for an " + + "integer and overflow" + ); + } + fragMap.putIfAbsent(tmpSmiles, partContainer); } } } - fragments = new ArrayList<>(tmp); - return fragments; } - private List getSplitableBonds(IAtomContainer atomContainer) throws CDKException { + /** + * Detects and returns the bonds, which will be split by an exhaustive + * fragmentation. This method is especially useful to determine if it is + * even possible to split a specific molecule exhaustively. The number of + * fragments is 2^n - 1 with n being the number of splittable bonds. + * It is impossible to generate all possible fragment combinations for a molecule + * with more than 31 splittable bonds, as this would exceed the maximum tree depth + * of 31 due to the combinatorial explosion. For molecules with more than 31 + * splittable bonds, the fragmentation will still occur, but it will be limited + * to a maximum of {@code inclusiveMaxTreeDepth} bonds per fragmentation step. + * To mitigate this one can check this with this function, for example: + *
      +     *     {@code
      +     *     ExhaustiveFragmenter exhFragmenter = new ExhaustiveFragmenter();
      +     *     if (exhFragmenter.getSplittableBonds(mol).length > Integer.SIZE - 1) {
      +     *         // handle the case, where it is impossible to entirely split the
      +     *         // molecule
      +     *     }}
      +     * 
      + * + * @param atomContainer the container which contains the molecule in question. + * @return the bonds which would be split by the exhaustive fragmentation. + */ + public static IBond[] getSplittableBonds(IAtomContainer atomContainer) { + if (atomContainer == null) { + throw new NullPointerException("The atom container must not be null"); + } + if (atomContainer.isEmpty()) { + throw new IllegalArgumentException("The atom container must contain " + + "an actual molecule"); + } + // do ring detection - SpanningTree spanningTree = new SpanningTree(atomContainer); - IRingSet allRings = spanningTree.getAllRings(); + RingSearch ringSearch = new RingSearch(atomContainer); + IAtomContainer allRingsContainer = ringSearch.ringFragments(); - // find the splitable bonds - ArrayList splitableBonds = new ArrayList<>(); + // find the splittable bonds + ArrayList splittableBondSet = new ArrayList<>( + atomContainer.getBondCount() / 3 + ); for (IBond bond : atomContainer.bonds()) { - boolean isInRing = false; - boolean isTerminal = false; - - // lets see if it's in a ring - IRingSet rings = allRings.getRings(bond); - if (rings.getAtomContainerCount() != 0) isInRing = true; - - // lets see if it is a terminal bond - for (IAtom atom : bond.atoms()) { - if (atomContainer.getConnectedBondsCount(atom) == 1) { - isTerminal = true; - break; + + // only single bonds are candidates for splitting + if (bond.getOrder() == IBond.Order.SINGLE) { + boolean isInRing = false; + boolean isTerminal = false; + + // lets see if it's in a ring + if (allRingsContainer.contains(bond)) isInRing = true; + + // lets see if it is a terminal bond + for (IAtom atom : bond.atoms()) { + if (atomContainer.getConnectedBondsCount(atom) == 1) { + isTerminal = true; + break; + } } + + if (!(isInRing || isTerminal)) splittableBondSet.add(bond); + } + } + return splittableBondSet.toArray(new IBond[0]); + } + + /** + * Generates a subset from the given array `nums`, determined by the binary + * representation 
of `index`. Each bit in `index` indicates whether the + * corresponding element in `nums` is included in the subset. The order of + * elements does not matter (i.e., `[1, 2]` and `[2, 1]` are equivalent). + * + *

      The total number of possible subsets is (2^n) - 1, where `n` is the + * length of `nums`. Subsets are generated using bitwise operations, where + * each `1` bit in `index` selects the corresponding element from `nums`.

      + * + *

      Example output for `nums = [1, 2, 3]`:

      + *
      +     *   index = 1  → [1]
      +     *   index = 2  → [2]
      +     *   index = 3  → [1, 2]
      +     *   index = 4  → [3]
      +     *   index = 5  → [1, 3]
      +     *   index = 6  → [2, 3]
      +     *   index = 7  → [1, 2, 3]
      +     * 
      + * + *

      Example bitwise selection for `index = 5` (`101` in binary):

      + *
      +     * index (binary)   nums    result
      +     *      1        →   1   →  [1]
      +     *      0        →   2
      +     *      1        →   3   →  [1, 3]
      +     * 
      + * + * @param index an integer whose binary representation determines the subset + * elements. A `1` bit at position `j` means `nums[j]` is + * included. + * @param nums the array from which to generate subsets. Duplicate values + * in `nums` may result in duplicate subset entries. + * @return An array containing the subset corresponding to `index`. + */ + protected static int[] generateSubset(int index, int[] nums) { + // allocate subset array based on the number of 1-bits in index. + int[] subset = new int[Integer.bitCount(index)]; + int subsetIndex = 0; + + // process using bit manipulation - only iterate through set bits + while (index != 0) { + // find position of lowest set bit + int lowestBitPos = Integer.numberOfTrailingZeros(index); + + // add the corresponding element from nums if within bounds + if (lowestBitPos < nums.length) { + subset[subsetIndex] = nums[lowestBitPos]; + subsetIndex++; } - if (!(isInRing || isTerminal)) splitableBonds.add(bond); + // Clear the lowest set bit and continue + index = index & (index - 1); + } + + return subset; + } + + /** + * Add pseudo ("R") atoms to an atom in a molecule. + * + * @param atom the atom to add the pseudo atoms to + * @param rcount the number of pseudo atoms to add + * @param mol the molecule the atom belongs to + */ + private void addRAtoms(IAtom atom, int rcount, IAtomContainer mol) { + for (int i = 0; i < rcount; i++) { + IPseudoAtom tmpRAtom = atom.getBuilder().newInstance( + IPseudoAtom.class, "R" + ); + tmpRAtom.setAttachPointNum(1); + tmpRAtom.setImplicitHydrogenCount(0); + mol.addAtom(tmpRAtom); + mol.addBond(atom.getBuilder().newInstance( + IBond.class, + atom, tmpRAtom, + IBond.Order.SINGLE + )); } - return splitableBonds; + } + + /** + * Creates a copy of an atom and adds it to the specified atom container. + * + * @param originalAtom the atom to be copied. + * @param atomContainer the destination container where the copied atom will + * be added. 
+ * @return A new atom with the same properties as `originalAtom`, added to + * `atomContainer`. + */ + private static IAtom copyAtom(IAtom originalAtom, IAtomContainer atomContainer) { + IAtom copiedAtom = atomContainer.newAtom(originalAtom.getAtomicNumber(), + originalAtom.getImplicitHydrogenCount()); + copiedAtom.setIsAromatic(originalAtom.isAromatic()); + copiedAtom.setValency(originalAtom.getValency()); + copiedAtom.setAtomTypeName(originalAtom.getAtomTypeName()); + copiedAtom.setFormalCharge(originalAtom.getFormalCharge()); + return copiedAtom; + } + + /** + * Creates a copy of a bond and adds it to the specified atom container. + * + * @param cpyCurrentAtom atom in the new atom container that is connected by + * the bond to be copied. + * @param cpyNbor the neighbour of `cpyCurrentAtom` that is connected by the + * bond one wants to copy. + * @param origBond the bond in the original molecule. + * @param atomContainer the new atom container to which the bond is to + * be copied. + * @return The bond in the new atom container. + */ + private static IBond copyBond( + IAtom cpyCurrentAtom, + IAtom cpyNbor, + IBond origBond, + IAtomContainer atomContainer + ) { + IBond cpyBond = atomContainer.newBond( + cpyCurrentAtom, + cpyNbor, + origBond.getOrder()); + cpyBond.setDisplay(origBond.getDisplay()); + cpyBond.setIsAromatic(origBond.isAromatic()); + // Setting is in ring is possible here because we always detect rings + // in the process of detecting the splittable bonds. + cpyBond.setIsInRing(origBond.isInRing()); + return cpyBond; + } + + /** + * Copies a subset of stereochemical information from a source molecule + * to a new fragment. + * + *

      + * This method iterates through the stereochemical elements of the original + * molecule (e.g., chiral centers, cis/trans bonds) and copies only those + * that are fully contained within the new fragment. A stereochemical element + * is considered fully contained if all of its defining atoms and bonds + * are present in the fragment, based on the provided atom and bond maps. + *

      + * Warning: This process is not reliable and can lead to + * incorrect stereochemistry in the fragment. When a chiral + * center is broken during fragmentation, the new fragment may be + * incorrectly assigned as chiral even if it is not anymore because some of + * its substituents are now equal. + *

      + * Example: the chiral molecule {@code CC[C@@H](Cl)CCCC} will, among others, + * produce the fragment {@code CC[C@@H](Cl)CC} where the stereo configuration is + * preserved but the structure is actually not chiral anymore. + *

      + * + * @param origMol the original molecule containing the stereochemical + * information. + * @param fragmentContainer the new fragment where the stereochemical + * information will be added. + * @param origToCpyAtomMap a mapping of atoms from the original molecule to + * their corresponding atoms in the new fragment. + * @param origToCpyBondMap a mapping of bonds from the original molecule to + * their corresponding bonds in the new fragment. + */ + private void attemptCopyStereoInformation( + IAtomContainer origMol, + IAtomContainer fragmentContainer, + Map origToCpyAtomMap, + Map origToCpyBondMap + ) { + // adding stereo information if all elements are present in the + // new fragment + for (IStereoElement elem : origMol.stereoElements()) { + boolean focusIsPresent = true; + boolean carriersArePresent = true; + final IChemObject origFocus = elem.getFocus(); + if (origFocus instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(origFocus)) { + focusIsPresent = false; + } + } else if (origFocus instanceof IBond) { + if (!origToCpyBondMap.containsKey(origFocus)) { + focusIsPresent = false; + } + } + + for (IChemObject iChemObject : elem.getCarriers()) { + if (iChemObject instanceof IAtom) { + if (!origToCpyAtomMap.containsKey(iChemObject)) { + carriersArePresent = false; + break; + } + } else if (iChemObject instanceof IBond) { + if (!origToCpyBondMap.containsKey(iChemObject)) { + carriersArePresent = false; + break; + } + } + } + + if (focusIsPresent && carriersArePresent) { + fragmentContainer.addStereoElement(elem.map(origToCpyAtomMap, origToCpyBondMap)); + } + } + } + + /** + * Splits and saturates (if specified via {@link #saturationSetting}) a + * molecule into multiple fragments by removing the specified bonds and + * making copies of the resulting fragments. + * + * @param origMol the molecule to be split. + * @param bondsToSplit the bonds that should be removed to create + * separate fragments. 
+ * @return An array of copied molecular fragments resulting from the split. + */ + private IAtomContainer[] splitBondsWithCopy( + IAtomContainer origMol, + IBond[] bondsToSplit + ) { + Set bondsToSplitSet = new HashSet<>( + (int) Math.ceil(bondsToSplit.length / 0.75) + ); + // for a faster lookup the hashset is used here. + bondsToSplitSet.addAll(Arrays.asList(bondsToSplit)); + boolean[] visitedOriginalAtoms = new boolean[origMol.getAtomCount()]; + List fragmentList = new ArrayList<>(bondsToSplit.length + 1); + + for (int i = 0; i < origMol.getAtomCount(); i++) { + IAtom currPotentialStartAtom = origMol.getAtom(i); + if (!visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)]) { + IAtomContainer fragmentContainer = + origMol.getBuilder().newInstance(IAtomContainer.class); + Map origToCpyAtomMap = new HashMap<>( + (int) Math.ceil(origMol.getAtomCount() / 0.75) + ); + Map origToCpyBondMap = new HashMap<>( + (int) Math.ceil(origMol.getBondCount() / 0.75) + ); + Deque dfsStack = new ArrayDeque<>(); + // Store split counts specific to the atoms in the fragment being built + Map splitCountsCpyAtoms = new HashMap<>(); + + dfsStack.push(currPotentialStartAtom); + visitedOriginalAtoms[origMol.indexOf(currPotentialStartAtom)] = true; + IAtom cpyStartAtom = copyAtom(currPotentialStartAtom, fragmentContainer); + origToCpyAtomMap.put(currPotentialStartAtom, cpyStartAtom); + + while (!dfsStack.isEmpty()) { + IAtom origCurrAtom = dfsStack.pop(); + IAtom cpyCurrentAtom = origToCpyAtomMap.get(origCurrAtom); + + for (IBond origBond : origMol.getConnectedBondsList(origCurrAtom)) { + IAtom origNbor = origBond.getOther(origCurrAtom); + boolean isThisABondToSplit = bondsToSplitSet.contains(origBond); + + if (!isThisABondToSplit) { + if (!origToCpyAtomMap.containsKey(origNbor)) { + visitedOriginalAtoms[origMol.indexOf(origNbor)] = true; + IAtom cpyNbor = copyAtom(origNbor, fragmentContainer); + origToCpyAtomMap.put(origNbor, cpyNbor); + IBond cpyBond = copyBond( + cpyCurrentAtom, 
+ cpyNbor, + origBond, + fragmentContainer + ); + origToCpyBondMap.put(origBond, cpyBond); + dfsStack.push(origNbor); + } else { + IAtom cpyNbor = origToCpyAtomMap.get(origNbor); + // Add bond only if not already present + if (fragmentContainer.getBond(cpyCurrentAtom, cpyNbor) == null) { + IBond cpyBond = copyBond( + cpyCurrentAtom, + cpyNbor, + origBond, + fragmentContainer + ); + origToCpyBondMap.put(origBond, cpyBond); + } + } + } else { + // This bond is being cut. The origCurrAtom is part of the fragment being built. + // Increment the cleavage count for its corresponding copied atom. + splitCountsCpyAtoms.put(cpyCurrentAtom, + splitCountsCpyAtoms.getOrDefault(cpyCurrentAtom, 0) + 1); + } + } + } + + // Apply saturation logic based on the number of splitting counts for this fragment + if (this.saturationSetting != Saturation.UNSATURATED_FRAGMENTS) { + for (Map.Entry entry : splitCountsCpyAtoms.entrySet()) { + IAtom atom = entry.getKey(); + int bondsCutCount = entry.getValue(); + + switch (this.saturationSetting) { + case HYDROGEN_SATURATED_FRAGMENTS: + Integer currImplHCount = atom.getImplicitHydrogenCount(); + int newImplHCount = + (currImplHCount == null ? 0 : currImplHCount) + bondsCutCount; + atom.setImplicitHydrogenCount(newImplHCount); + break; + case R_SATURATED_FRAGMENTS: + addRAtoms(atom, bondsCutCount, fragmentContainer); + break; + default: + throw new UnsupportedOperationException( + "no treatment defined yet for this new enum constant" + ); + } + } + } + if (this.preserveStereo) { + attemptCopyStereoInformation( + origMol, + fragmentContainer, + origToCpyAtomMap, + origToCpyBondMap + ); + } + fragmentList.add(fragmentContainer); + } + } + return fragmentList.toArray(new IAtomContainer[0]); } /** * Get the fragments generated as SMILES strings. + *

      + * Note on Stereochemistry: + * Stereochemistry information will only be included in the returned SMILES + * strings if the {@code SmilesGenerator} used by this fragmenter was configured + * with the {@link SmiFlavor#Stereo} flag. + *

      * * @return a String[] of the fragments. */ @Override public String[] getFragments() { + if (fragMap == null) { + throw new NullPointerException("It is mandatory to generate " + + "fragments before getting them"); + } return (new ArrayList<>(fragMap.keySet())).toArray(new String[0]); } /** - * Get the fragments generated as {@link IAtomContainer} objects.. + * Get the fragments generated as {@link IAtomContainer} objects. * * @return a IAtomContainer[] of the fragments. */ @Override public IAtomContainer[] getFragmentsAsContainers() { + if (fragMap == null) { + throw new NullPointerException("It is mandatory to generate " + + "fragments before getting them"); + } return (new ArrayList<>(fragMap.values())).toArray(new IAtomContainer[0]); } diff --git a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java index 960cbd2f7a..d58cbc4e96 100644 --- a/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java +++ b/tool/fragment/src/test/java/org/openscience/cdk/fragment/ExhaustiveFragmenterTest.java @@ -1,5 +1,5 @@ -/* - * Copyright (C) 2010 Rajarshi Guha +/* Copyright (C) 2025 Rajarshi Guha + * Tom Weiß * * Contact: cdk-devel@lists.sourceforge.net * @@ -22,112 +22,947 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import org.openscience.cdk.test.CDKTestCase; -import org.openscience.cdk.DefaultChemObjectBuilder; +import org.openscience.cdk.exception.CDKException; import org.openscience.cdk.interfaces.IAtomContainer; +import org.openscience.cdk.interfaces.IBond; +import org.openscience.cdk.silent.SilentChemObjectBuilder; +import org.openscience.cdk.smiles.SmiFlavor; +import org.openscience.cdk.smiles.SmilesGenerator; import org.openscience.cdk.smiles.SmilesParser; +import org.openscience.cdk.test.CDKTestCase; import java.util.Arrays; -import 
java.util.List; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; -import static org.hamcrest.CoreMatchers.hasItems; -import static org.hamcrest.CoreMatchers.is; +import static org.junit.jupiter.api.Assertions.fail; /** * Test exhaustive fragmenter. + * This test class covers various scenarios for the {@link ExhaustiveFragmenter}, + * including different saturation settings: + *
      + * <ul>
      + * <li>{@link org.openscience.cdk.fragment.ExhaustiveFragmenter.Saturation#UNSATURATED_FRAGMENTS}</li>
      + * <li>{@link org.openscience.cdk.fragment.ExhaustiveFragmenter.Saturation#HYDROGEN_SATURATED_FRAGMENTS}</li>
      + * <li>{@link org.openscience.cdk.fragment.ExhaustiveFragmenter.Saturation#R_SATURATED_FRAGMENTS}</li>
      + * </ul>
      + * and minimum fragment size. * + * @see ExhaustiveFragmenter */ class ExhaustiveFragmenterTest extends CDKTestCase { - - private static ExhaustiveFragmenter fragmenter; - private static SmilesParser smilesParser; + private static SmilesParser smilesParser; @BeforeAll static void setup() { - fragmenter = new ExhaustiveFragmenter(); - smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance()); + smilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance()); } + // --- Unsaturated Fragments Tests --- + + /** + * Tests that a simple linear alkane (propane) with no splittable bonds + * yields no fragments when using the unsaturated setting. + */ @Test - void testEF1() throws Exception { + void testEF1Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("CCC"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); } + /** + * Tests that a simple cycloalkane (cyclopentane) with no non-ring, non-terminal bonds + * yields no fragments when using the unsaturated setting. + */ @Test - void testEF2() throws Exception { + void testEF2Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertEquals(0, frags.length); } + /** + * Tests fragmentation of ethylcyclohexane with unsaturated fragments. 
+ * Expects "[CH]1CCCCC1" as a fragment, representing the cyclohexyl radical. + */ @Test - void testEF3() throws Exception { + void testEF3Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"C1CCCCC1"})); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); + Assertions.assertArrayEquals(new String[]{"[CH]1CCCCC1"}, frags); } + /** + * Tests fragmentation of ethylbenzene with unsaturated fragments. + * Expects "[c]1ccccc1" as a fragment, representing the phenyl radical. + */ @Test - void testEF4() throws Exception { + void testEF4Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); + Assertions.assertArrayEquals(new String[]{"[c]1ccccc1"}, frags); } + /** + * Tests fragmentation of diphenylmethane with unsaturated fragments. + * Expects "[CH2]c1ccccc1" (benzyl radical) and "[c]1ccccc1" (phenyl radical). 
+ */ @Test - void testEF5() throws Exception { + void testEF5Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(Arrays.asList(frags), hasItems("c1ccc(cc1)C", "c1ccccc1")); - Assertions.assertNotNull(fragmenter.getFragmentsAsContainers()); - Assertions.assertEquals(2, fragmenter.getFragmentsAsContainers().length); - + assertFragsContain( + new String[]{ + "[CH2]c1ccccc1", + "[c]1ccccc1" + }, frags + ); + Assertions.assertNotNull(unsaturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(2, unsaturatedFragmenter.getFragmentsAsContainers().length); } + /** + * Tests fragmentation of biphenyl with unsaturated fragments. + * Expects only "[c]1ccccc1" as the fragment. 
+ */ @Test - void testEF6() throws Exception { + void testEF6Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); - org.hamcrest.MatcherAssert.assertThat(frags, is(new String[]{"c1ccccc1"})); + Assertions.assertArrayEquals(new String[]{"[c]1ccccc1"}, frags); - Assertions.assertNotNull(fragmenter.getFragmentsAsContainers()); - Assertions.assertEquals(1, fragmenter.getFragmentsAsContainers().length); + Assertions.assertNotNull(unsaturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(1, unsaturatedFragmenter.getFragmentsAsContainers().length); } + /** + * Tests a complex molecule with unsaturated fragments. + * Expected fragments include phenyl and various complex radical fragments. + * Note: The number of fragments (26) is higher than the saturated version (25) + * because unsaturated fragments explicitly show radical centers, which can lead to + * unique SMILES for fragments that would be canonicalized identically when saturated + * due to differences in hydrogen counts or explicit radical representation. 
+ */ @Test - void testEF7() throws Exception { + void testEF7Unsaturated() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); - fragmenter.generateFragments(mol); - List frags = Arrays.asList(fragmenter.getFragments()); + ExhaustiveFragmenter unsaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.UNSATURATED_FRAGMENTS); + unsaturatedFragmenter.generateFragments(mol); + String[] frags = unsaturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + // There is one additional fragment in comparison to the saturated version because there are following fragments: + // [C]1CCC([CH2])C1 + // [CH2][C]1C[CH]CC1 + // these fragments only differ in the number of hydrogen's bonded to their respective carbon atoms. So these + // fragments would show up as one if saturated. + Assertions.assertEquals(26, frags.length); + + Assertions.assertNotNull(unsaturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(26, unsaturatedFragmenter.getFragmentsAsContainers().length); + + assertFragsContain( + new String[]{ + "[c]1ccccc1", + "[CH2]CC1CCC(c2ccccc2)(CC3C=CC=C3)C1", + "[CH2]C1CCC([CH2])(c2ccccc2)C1" + }, frags + ); + } + + // --- Hydrogen-Saturated Fragments Tests --- + + /** + * Tests that a simple linear alkane (propane) with no splittable bonds + * yields no fragments when using the hydrogen-saturated setting. + */ + @Test + void testEF1Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCC"); + ExhaustiveFragmenter fragmenterSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); + Assertions.assertEquals(0, frags.length); + } + + /** + * Tests that a simple cycloalkane (cyclopentane) with no non-ring, non-terminal bonds + * yields no fragments when using the hydrogen-saturated setting. 
+ */ + @Test + void testEF2Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + Assertions.assertEquals(0, frags.length); + } + + /** + * Tests fragmentation of methylcyclohexane with hydrogen-saturated fragments. + * Expects "C1CCCCC1" as a fragment, representing cyclohexane. + */ + @Test + void testEF3Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + Assertions.assertArrayEquals(new String[]{"C1CCCCC1"}, frags); + } + + /** + * Tests fragmentation of ethylbenzene with hydrogen-saturated fragments. + * Expects "c1ccccc1" as a fragment, representing benzene. + */ + @Test + void testEF4Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertArrayEquals(new String[]{"c1ccccc1"}, frags); + } + + /** + * Tests fragmentation of diphenylmethane with hydrogen-saturated fragments. + * Expects "c1ccc(cc1)C" (toluene) and "c1ccccc1" (benzene). + * Note: "c1ccc(cc1)C" might also be canonicalized as "Cc1ccccc1". 
+ */ + @Test + void testEF5Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + assertFragsContain( + new String[]{ + "c1ccc(cc1)C", + "c1ccccc1" + }, frags + ); + Assertions.assertNotNull(saturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(2, saturatedFragmenter.getFragmentsAsContainers().length); + } + + /** + * Tests fragmentation of biphenyl with hydrogen-saturated fragments. + * Expects only "c1ccccc1" (benzene) as the fragment. + */ + @Test + void testEF6Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); Assertions.assertNotNull(frags); - Assertions.assertEquals(25, frags.size()); + Assertions.assertArrayEquals(new String[]{"c1ccccc1"}, frags); - Assertions.assertNotNull(fragmenter.getFragmentsAsContainers()); - Assertions.assertEquals(25, fragmenter.getFragmentsAsContainers().length); + Assertions.assertNotNull(saturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(1, saturatedFragmenter.getFragmentsAsContainers().length); + } - org.hamcrest.MatcherAssert.assertThat(frags, hasItems("c1ccccc1", "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", "c1ccc(cc1)C2(C)CCC(C)C2")); + /** + * Tests a complex molecule with hydrogen-saturated fragments. + * Expected fragments include benzene and various complex saturated fragments. 
+ * Compared to the unsaturated version, some fragments might canonicalize to the same SMILES + * after saturation, resulting in a slightly lower count (25 vs 26). + */ + @Test + void testEF7Saturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertEquals(25, frags.length); + + Assertions.assertNotNull(saturatedFragmenter.getFragmentsAsContainers()); + Assertions.assertEquals(25, saturatedFragmenter.getFragmentsAsContainers().length); + + assertFragsContain( + new String[]{ + "c1ccccc1", + "c1ccc(cc1)C2(CCC(CC)C2)CC3C=CC=C3", + "c1ccc(cc1)C2(C)CCC(C)C2" + }, frags + ); } + // --- R-Group Saturated Fragments Tests --- + + /** + * Tests fragmentation of ethylcyclohexane with R-group saturated fragments. + * Expects "*C1CCCCC1" as a fragment, representing the cyclohexyl group with an R-atom. + */ + @Test + void testEF3RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1CC"); + ExhaustiveFragmenter rSaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + rSaturatedFragmenter.generateFragments(mol); + String[] frags = rSaturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertArrayEquals(new String[]{"*C1CCCCC1"}, frags); + } + + /** + * Tests fragmentation of toluene with R-group saturated fragments. + * Expects "*c1ccccc1" (phenyl with R-atom) and "*Cc1ccccc1" (benzyl with R-atom). 
+ */ + @Test + void testEF5RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); + ExhaustiveFragmenter rSaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + rSaturatedFragmenter.generateFragments(mol); + String[] frags = rSaturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + assertFragsContain( + new String[]{ + "*c1ccccc1", + "*Cc1ccccc1" + }, frags + ); + Assertions.assertEquals(2, rSaturatedFragmenter.getFragmentsAsContainers().length); + } + + /** + * Tests fragmentation of biphenyl with R-group saturated fragments. + * Expects only "*c1ccccc1" as the fragment. + */ + @Test + void testEF6RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); + ExhaustiveFragmenter rSaturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + rSaturatedFragmenter.generateFragments(mol); + String[] frags = rSaturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertArrayEquals(new String[]{"*c1ccccc1"}, frags); + Assertions.assertEquals(1, rSaturatedFragmenter.getFragmentsAsContainers().length); + } + + /** + * Tests a complex molecule with R-group saturated fragments. + * The number of fragments can differ from hydrogen-saturated or unsaturated versions + * due to the R-group affecting the size of the fragments. + */ + @Test + void testEF7RestSaturated() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1(c2ccccc2)(CC(CC1)CCc1ccccc1)CC1C=CC=C1"); + ExhaustiveFragmenter fragmenterRSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.R_SATURATED_FRAGMENTS); + fragmenterRSaturated.generateFragments(mol); + String[] frags = fragmenterRSaturated.getFragments(); + Assertions.assertNotNull(frags); + // Needs to have the same number of fragments as the unsaturated version. 
+ Assertions.assertEquals(26, fragmenterRSaturated.getFragmentsAsContainers().length); + assertFragsContain( + new String[]{ + "*c1ccccc1", + "*C1CCC(c2ccccc2)(CC3C=CC=C3)C1", + "*C1CCC(*)(c2ccccc2)C1" + }, frags + ); + } + + // --- General Fragmenter Tests --- + + /** + * Tests the minimum fragment size setting. + * With a minimum size of 6, only the larger ring (cyclohexane) should be returned + * from a molecule composed of a cyclopentane and a cyclohexane connected by a single bond. + */ @Test void testMinSize() throws Exception { IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); - fragmenter.setMinimumFragmentSize(6); - fragmenter.generateFragments(mol); - String[] frags = fragmenter.getFragments(); + ExhaustiveFragmenter fragmenterSaturated = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + fragmenterSaturated.setMinimumFragmentSize(6); + fragmenterSaturated.generateFragments(mol); + String[] frags = fragmenterSaturated.getFragments(); Assertions.assertNotNull(frags); Assertions.assertEquals(1, frags.length); - Assertions.assertTrue(frags[0].equals("C1CCCCC1")); + Assertions.assertEquals("C1CCCCC1", frags[0]); + } + + /** + * Tests that lowering the minimum fragment size allows smaller fragments to be returned. + * For "C1CCCC1C2CCCCC2", setting min size to 5 should yield both rings. 
+ */ + @Test + void testMinSizeLowered() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1C2CCCCC2"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.setMinimumFragmentSize(5); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + Assertions.assertNotNull(frags); + Assertions.assertEquals(2, frags.length); + assertFragsContain( + new String[]{ + "C1CCCCC1", + "C1CCCC1" + }, frags + ); + } + + /** + * Verifies that the SMILES representations obtained from fragments match + * the SMILES generated directly from their corresponding {@link IAtomContainer} objects. + */ + @Test + void testEqualityOfSmilesAndContainers() throws Exception { + SmilesGenerator smilesGenerator = new SmilesGenerator(SmiFlavor.UseAromaticSymbols | SmiFlavor.Unique); + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC(N)C(=O)O"); // Phenylalanine + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] smilesFrags = saturatedFragmenter.getFragments(); + IAtomContainer[] containerFrags = saturatedFragmenter.getFragmentsAsContainers(); + for (IAtomContainer frag : containerFrags) { + assertFragsContain( + new String[]{ + smilesGenerator.create(frag) + }, smilesFrags + ); + } + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method + * for a linear alkane (propane), which should have no splittable bonds. 
+ */ + @Test + void testGetSplittableBondsLinearMolecule() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCC"); // Propane + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(0, splittableBonds.length); + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method + * for a cyclic alkane (cyclopentane), which should have no splittable bonds (all bonds are in a ring). + */ + @Test + void testGetSplittableBondsCyclicMolecule() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCC1"); // Cyclopentane + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(0, splittableBonds.length); + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method + * for ethylbenzene, which should have one splittable bond (the bond between the phenyl and ethyl groups). + */ + @Test + void testGetSplittableBondsBenzeneWithSideChain() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1CC"); // Ethylbenzene + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(1, splittableBonds.length); + } + + /** + * Tests the {@link ExhaustiveFragmenter#getSplittableBonds(IAtomContainer)} method + * for biphenyl, which should have one splittable bond (the bond connecting the two phenyl rings). + */ + @Test + void testGetSplittableBondsBiphenyl() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1c1ccccc1"); // Biphenyl + IBond[] splittableBonds = ExhaustiveFragmenter.getSplittableBonds(mol); + Assertions.assertEquals(1, splittableBonds.length); + } + + /** + * Tests the internal helper method `generateSubset` which creates subsets + * based on the bit representation of an index. + * This ensures the combinatorial generation of bond subsets works correctly. 
+ */ + @Test + void testGenerateSubset() { + int[] nums = new int[]{10, 20, 30, 40}; + + // index = 1 (0001) -> {nums[0]} + Assertions.assertArrayEquals( + new int[]{10}, + ExhaustiveFragmenter.generateSubset(1, nums) + ); + + // index = 2 (0010) -> {nums[1]} + Assertions.assertArrayEquals( + new int[]{20}, + ExhaustiveFragmenter.generateSubset(2, nums) + ); + + // index = 3 (0011) -> {nums[0], nums[1]} + Assertions.assertArrayEquals( + new int[]{10, 20}, + ExhaustiveFragmenter.generateSubset(3, nums) + ); + + // index = 4 (0100) -> {nums[2]} + Assertions.assertArrayEquals( + new int[]{30}, + ExhaustiveFragmenter.generateSubset(4, nums) + ); + + // index = 5 (0101) -> {nums[0], nums[2]} + Assertions.assertArrayEquals( + new int[]{10, 30}, + ExhaustiveFragmenter.generateSubset(5, nums) + ); + + // index = 7 (0111) -> {nums[0], nums[1], nums[2]} + Assertions.assertArrayEquals( + new int[]{10, 20, 30}, + ExhaustiveFragmenter.generateSubset(7, nums) + ); + + // index = 15 (1111) -> {nums[0], nums[1], nums[2], nums[3]} + Assertions.assertArrayEquals( + new int[]{10, 20, 30, 40}, + ExhaustiveFragmenter.generateSubset(15, nums) + ); + } + + /** + * Tests the functionality of providing a custom SmilesGenerator to the ExhaustiveFragmenter. + * This test uses a SmilesGenerator that does NOT use aromatic symbols, expecting kekulized SMILES. 
+ */ + @Test + void testCustomSmilesGenerator() throws Exception { + SmilesGenerator customSmilesGen = new SmilesGenerator(SmiFlavor.Unique); // No SmiFlavor.UseAromaticSymbols + ExhaustiveFragmenter customFragmenter = new ExhaustiveFragmenter( + customSmilesGen, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1, + false + ); + IAtomContainer mol = smilesParser.parseSmiles("c1ccccc1Cc1ccccc1"); // Diphenylmethane + customFragmenter.generateFragments(mol); + String[] frags = customFragmenter.getFragments(); + + Assertions.assertNotNull(frags); + assertFragsContain( + new String[]{ + "C=1C=CC=CC1", + "C=1C=CC(=CC1)C" + }, frags + ); + Assertions.assertEquals(2, frags.length); + } + + /** + * Tests the setInclusiveMaxTreeDepth method using 1,4-dibutylbenzene. + * By varying `inclusiveMaxTreeDepth`, we can observe how the number of generated fragments changes. + * + *
      +     * Molecule: 1,4-dibutylbenzene (CCCCc1ccc(CCCC)cc1)
      +     * Splittable bonds: 6 (three per butyl chain: every C-C bond from the ring outwards except the terminal one to the last carbon atom).
      +     * Fragmenter setup: minFragSize = 4 (to include butyl and benzene fragments), hydrogen-saturated fragments.
      +     *
      +     * Expected fragments for different inclusiveMaxTreeDepth settings:
      +     *
      +     * 1.  inclusiveMaxTreeDepth = 1 (allows 1 cut at a time):
      +     * - Expected unique fragments: 4
      +     *
      +     * 2.  inclusiveMaxTreeDepth = 2 (allows up to 2 simultaneous cuts):
      +     * - Considers all subsets of splittable bonds of size 1 and 2
      +     * - Expected unique fragments: 10
      +     *
      +     * 3.  inclusiveMaxTreeDepth = 3 (allows up to 3 simultaneous cuts):
      +     * - Considers all subsets of splittable bonds of size 1, 2 and 3
      +     * - Includes fragments from 1-cut operations, plus fragments from 2-cut
      +     *   and 3-cut operations.
      +     * - Expected unique fragments: 10 (the third cut yields no new unique fragments)
      +     * 
      + */ + @Test + void testSetInclusiveMaxTreeDepth() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles("CCCCc1ccc(CCCC)cc1"); + + // Define a standard SmilesGenerator for fragmenter instantiation + SmilesGenerator standardSmilesGen = new SmilesGenerator( + SmiFlavor.Unique | SmiFlavor.UseAromaticSymbols + ); + + ExhaustiveFragmenter localFragmenter; + + localFragmenter = new ExhaustiveFragmenter( + standardSmilesGen, + 4, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1, + false + ); + localFragmenter.setInclusiveMaxTreeDepth(1); + localFragmenter.generateFragments(mol); + String[] fragsDepth1 = localFragmenter.getFragments(); + Assertions.assertEquals(4, fragsDepth1.length, + "Expected 4 fragments when inclusiveMaxTreeDepth is 1 (allows 1 cuts) for 1,4-dibutylbenzene"); + assertFragsContain( + new String[]{ + "c1cc(ccc1C)CCCC", + "c1ccc(cc1)CCCC", + "c1cc(ccc1CC)CCCC", + "CCCC" + }, fragsDepth1 + ); + + localFragmenter = new ExhaustiveFragmenter( + standardSmilesGen, + 4, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1, + false + ); + localFragmenter.setInclusiveMaxTreeDepth(2); + localFragmenter.generateFragments(mol); + String[] fragsDepth2 = localFragmenter.getFragments(); + Assertions.assertEquals(10, fragsDepth2.length, + "Expected 10 fragments when inclusiveMaxTreeDepth is 2 (allows up to 2 cut)"); + assertFragsContain( + new String[]{ + "c1ccc(cc1)C", + "c1ccc(cc1)CC", + "c1ccc(cc1)CCCC", + "c1cc(ccc1C)C", + "c1cc(ccc1C)CC", + "c1cc(ccc1C)CCCC", + "c1cc(ccc1CC)CC", + "c1cc(ccc1CC)CCCC", + "c1ccccc1", + "CCCC" + }, fragsDepth2 + ); + + localFragmenter = new ExhaustiveFragmenter( + standardSmilesGen, + 4, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + Integer.SIZE - 1, + false + ); + localFragmenter.setInclusiveMaxTreeDepth(3); + localFragmenter.generateFragments(mol); + String[] fragsDepth3 = localFragmenter.getFragments(); + 
Assertions.assertEquals(10, fragsDepth3.length, + "Expected 10 fragments when inclusiveMaxTreeDepth is 3 (allows up to 3 cuts), same as max 2 cuts"); + assertFragsContain( + new String[]{ + "c1ccc(cc1)C", + "c1ccc(cc1)CC", + "c1ccc(cc1)CCCC", + "c1cc(ccc1C)C", + "c1cc(ccc1C)CC", + "c1cc(ccc1C)CCCC", + "c1cc(ccc1CC)CC", + "c1cc(ccc1CC)CCCC", + "c1ccccc1", + "CCCC" + }, fragsDepth3 + ); + } + + /** + * Tests that double bonds will not be split. + */ + @Test + void testDoubleBondIssue() throws CDKException { + IAtomContainer mol = smilesParser.parseSmiles("C1CCCCC1=CCC"); + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "C=C1CCCCC1", + }, frags + ); + } + + // --- Complementary Molecule Tests --- + + /** + * Tests exhaustive fragmentation on an example molecule with a disconnected + * structure. 
+ */ + @Test + void testDisconnectedMolecules() throws Exception { + IAtomContainer mol = smilesParser.parseSmiles( + "C(CN(CC(=O)[O-])CC(=O)[O-])N(CC(=O)[O-])CC(=O)[O-].[Na+].[Na+].[Na+].[Na+]" + ); //Sodium edetate + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "O=C([O-])CNCCNCC(=O)[O-]", + "O=C([O-])CNCC(=O)[O-]", + "O=C([O-])CN(C)CCN(C)C", + "O=C([O-])CNCCNC", + "O=C([O-])CN(CC(=O)[O-])CC" + }, frags + ); + } + + /** + * Testing a bigger molecule + * + * @throws Exception if anything goes wrong + */ + @Test + void testBigMolecule1() throws Exception { + SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smiPar.parseSmiles("CC1=C(C(=CC=C1)NC2=CC=CC=C2C" + + "(=O)NC(CCS(=O)C)C(=O)NC(C)C3=CC=C(C=C3)F)C"); //PubChem CID 118705975 + + ExhaustiveFragmenter saturatedFragmenter = new ExhaustiveFragmenter( + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS + ); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "O=C(NCC)CCC", + "NC=1C=CC=CC1", + "O=C(N)CCCS(=O)C", + "FC=1C=CC(=CC1)C(N)C" + }, frags + ); + } + + /** + * Testing a molecule with 31 splittable bonds (takes extremely long, maybe days) + * + * @throws Exception if anything goes wrong + */ + // @Test + void testMaxSplittableBonds() throws Exception { + SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smiPar.parseSmiles("C[C@]12CC[C@](CC1C3=CC(=O)C4[C@]5(CCC(C(C5CC[C@]4([C@@]3(CC2)C)C)(C)C)" + + "OC6C(C(C(C(O6)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)OC7C(C(C(C(O7)C(=O)N[C@H](CCC(=O)OC)C(=O)OC)O)O)O)C)(C)C" + + "(=O)N[C@H](CCC(=O)OC)C(=O)OC"); // Pubchem CID 16396833 + ExhaustiveFragmenter 
saturatedFragmenter = new ExhaustiveFragmenter(ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS); + saturatedFragmenter.generateFragments(mol); + String[] frags = saturatedFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "O=CCNC(=O)c1ccccc1", + "O=C(N)CNC(=O)c1ccccc1N", + "O=C(NC)c1ccccc1N", + "O=C(NCCC)c1ccccc1N", + "O=CCNC(=O)c1ccccc1Nc2cccc(c2C)C", + "O=C(N)CCCS(=O)C", + "O=C(N)C(NC(=O)c1ccccc1)CCS(=O)C" + }, frags + ); + } + + /** + * Testing the test molecule of the Java doc comment for the + * {@link ExhaustiveFragmenter} + * + * @throws Exception if anything goes wrong + */ + @Test + void testExampleUsage() throws Exception { + + // test with default settings + SmilesParser smiPar = new SmilesParser(SilentChemObjectBuilder.getInstance()); + IAtomContainer mol = smiPar.parseSmiles("C1CCCC1C1=CC=CC=C1"); + ExhaustiveFragmenter localDefaultFragmenter = new ExhaustiveFragmenter(); + localDefaultFragmenter.generateFragments(mol.clone()); + String[] fragsDefault = localDefaultFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "[C]1=CC=CC=C1" + }, fragsDefault + ); + + // test with minimal fragment size of 5 and hydrogen saturation + + ExhaustiveFragmenter localCustomFragmenter = new ExhaustiveFragmenter( + 5, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS + ); + localCustomFragmenter.generateFragments(mol.clone()); + String[] fragsCustom = localCustomFragmenter.getFragments(); + assertFragsContain( + new String[]{ + "C1CCCC1", + "C1=CC=CC=C1", + }, + fragsCustom + ); + } + + /** + * Ensures that stereochemical information (double-bond E/Z markers '/' or + * '\') is preserved when generating fragments. 
+ */ + @Test + void testStereoChemistryCopied() throws Exception { + SmilesGenerator smilesGenerator = + new SmilesGenerator( + SmiFlavor.UseAromaticSymbols | SmiFlavor.Stereo + ); + IAtomContainer mol = smilesParser.parseSmiles("CC[C@H](F)C/C=C/C"); + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter( + smilesGenerator, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + 31, + true + ); + fragmenter.generateFragments(mol); + + String[] smilesFrags = fragmenter.getFragments(); + IAtomContainer[] containerFrags = fragmenter.getFragmentsAsContainers(); + + Assertions.assertNotNull(smilesFrags); + Assertions.assertNotNull(containerFrags); + Assertions.assertEquals(smilesFrags.length, containerFrags.length, + "Number of SMILES fragments and container fragments must match"); + + String[] containerSmiles = new String[smilesFrags.length]; + for (int i = 0; i < containerSmiles.length; i++) { + containerSmiles[i] = smilesGenerator.create(containerFrags[i]); + } + + assertFragsContain( + new String[]{ + "C(F)C/C=C/C" + }, smilesFrags + ); + + assertFragsContain( + new String[]{ + "C(F)C/C=C/C" + }, containerSmiles + ); + } + + /** + * Ensures that stereochemical information (chiral centers '@') is + * copied if the fragmentation yields a fragment with the same chiral center. 
+ */ + @Test + void testTetrahdralStereoChemistryCopied() throws Exception { + SmilesGenerator smilesGenerator = + new SmilesGenerator( + SmiFlavor.UseAromaticSymbols | SmiFlavor.Stereo + ); + IAtomContainer mol = smilesParser.parseSmiles("[C@@H](Cl)(O)CCCCCC"); + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter( + smilesGenerator, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + 31, + true + ); + fragmenter.generateFragments(mol); + + String[] smilesFrags = fragmenter.getFragments(); + assertFragsContain( + new String[]{ + "[C@@H](Cl)(O)CCC", + "CCCCCC", + "[C@@H](Cl)(O)CCCC" + }, smilesFrags + ); + } + + /** + * Tests a known bug where the stereo information for a chiral center is + * incorrectly copied to fragments where the center is no longer chiral. + * This occurs when the fragmentation results in two identical substituents, + * which should, by definition, remove the chirality from the center. + */ + @Test + void testTetrahedralStereoChemistryFalselyCopied() throws Exception { + SmilesGenerator smilesGenerator = + new SmilesGenerator( + SmiFlavor.UseAromaticSymbols | SmiFlavor.Stereo + ); + IAtomContainer mol = smilesParser.parseSmiles("CC[C@@H](Cl)CCCC"); + ExhaustiveFragmenter fragmenter = new ExhaustiveFragmenter( + smilesGenerator, + 6, + ExhaustiveFragmenter.Saturation.HYDROGEN_SATURATED_FRAGMENTS, + 31, + true + ); + fragmenter.generateFragments(mol); + + String[] smilesFrags = fragmenter.getFragments(); + assertFragsContain( + new String[]{ + "C(Cl)CCCC", + // The chemically correct representation would be CCC(Cl)CC + // instead of: + "CC[C@@H](Cl)CC" + }, smilesFrags + ); + } + + // --utility -- + + /** + * Asserts that an array of strings contains all the expected elements, + * allowing for additional, unexpected elements in the actual array. + * + *

      + * This assertion is useful for verifying that a collection contains a + * specific subset of items. It fails only if an expected element is + * missing from the actual array. The failure message will list both + * missing elements and any extra, unexpected elements found. + *

      + * + * @param expected The {@code String} array containing the elements that are + * expected to be present in the {@code actual} array. + * @param actual The {@code String} array containing the elements to be + * tested against the {@code expected} array. + */ + private static void assertFragsContain( + String[] expected, + String[] actual + ) { + + Set expectedSet = new HashSet<>(Arrays.asList(expected)); + Set actualSet = new HashSet<>(Arrays.asList(actual)); + + Set missing = expectedSet.stream() + .filter(item -> !actualSet.contains(item)) + .collect(Collectors.toSet()); + + Set extra = actualSet.stream() + .filter(item -> !expectedSet.contains(item)) + .collect(Collectors.toSet()); + + if (!missing.isEmpty()) { + StringBuilder failureMessage = new StringBuilder(); + + failureMessage.append("Expected but not found: ").append(missing).append("\n"); + if (!extra.isEmpty()) { + failureMessage.append("Found but not expected: ").append(extra).append("\n"); + } + + fail(failureMessage.toString()); + } } }