Source code for openff.qcsubmit.workflow_components.state_enumeration

"""
Components to expand stereochemistry and tautomeric states of molecules.
"""

from typing import List, Optional, Tuple

from openff.toolkit.topology import Molecule
from openff.toolkit.utils import ToolkitRegistry
from qcelemental.util import which_import
from typing_extensions import Literal

from openff.qcsubmit._pydantic import Field
from openff.qcsubmit.common_structures import ComponentProperties
from openff.qcsubmit.utils import (
    check_missing_stereo,
    get_symmetry_classes,
    get_symmetry_group,
)
from openff.qcsubmit.workflow_components.base_component import (
    CustomWorkflowComponent,
    ToolkitValidator,
)
from openff.qcsubmit.workflow_components.utils import (
    ComponentResult,
    ImproperScan,
    Scan1D,
    Scan2D,
    TorsionIndexer,
)


[docs]class EnumerateTautomers(ToolkitValidator, CustomWorkflowComponent):
    """
    Enumerate the tautomers of a molecule using the backend toolkits through the OFFTK.
    """

    type: Literal["EnumerateTautomers"] = "EnumerateTautomers"
    # custom settings for the class
    max_tautomers: int = Field(
        20, description="The maximum number of tautomers that should be generated."
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Enumerate the tautomers of a molecule if possible, returning the input plus any new molecules."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecule tautomers could not be enumerated."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=True)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Enumerate tautomers of the input molecule.

        The input molecules tautomers are enumerated using the desired backend toolkit and are returned along with the input molecule.

         Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            try:
                tautomers = molecule.enumerate_tautomers(
                    max_states=self.max_tautomers, toolkit_registry=toolkit_registry
                )
                for tautomer in tautomers:
                    result.add_molecule(tautomer)
                # add the input molecule
                result.add_molecule(molecule=molecule)

            except Exception:
                result.filter_molecule(molecule)

        return result


[docs]class EnumerateStereoisomers(ToolkitValidator, CustomWorkflowComponent):
    """
    Enumerate the stereo centers and bonds of a molecule using the backend toolkits through the OFFTK, only well defined
    molecules are returned by this component, this is check via a OFFTK round trip.
    """

    type: Literal["EnumerateStereoisomers"] = "EnumerateStereoisomers"
    undefined_only: bool = Field(
        False,
        description="If we should only enumerate parts of the molecule with undefined stereochemistry or all stereochemistry.",
    )
    max_isomers: int = Field(
        20, description="The maximum number of stereoisomers to be generated."
    )
    rationalise: bool = Field(
        True,
        description="If we should check that the resulting molecules are physically possible by attempting to generate conformers for them.",
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Enumerate the stereo centers and bonds of the molecule, returing the input molecule if valid and any new molecules."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecules stereo centers or bonds could not be enumerated."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=True)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Enumerate stereo centers and bonds of the input molecule

        Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the available toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            try:
                isomers = molecule.enumerate_stereoisomers(
                    undefined_only=self.undefined_only,
                    max_isomers=self.max_isomers,
                    rationalise=self.rationalise,
                    toolkit_registry=toolkit_registry,
                )

                # now check that each molecule is well defined
                for isomer in isomers:
                    if not check_missing_stereo(isomer):
                        result.add_molecule(isomer)

                # now check the input
                # rationalise if needed
                if self.rationalise:
                    molecule.generate_conformers(n_conformers=1)
                if not check_missing_stereo(molecule):
                    result.add_molecule(molecule)

            except Exception:
                result.filter_molecule(molecule)

        return result


[docs]class EnumerateProtomers(ToolkitValidator, CustomWorkflowComponent):
    """
    Enumerate the formal charges of the input molecule using the backend toolkits through the OFFTK.

    Important:
        Only Openeye is supported so far.
    """

    type: Literal["EnumerateProtomers"] = "EnumerateProtomers"
    max_states: int = Field(
        10, description="The maximum number of states that should be generated."
    )

[docs]    @classmethod
    def is_available(cls) -> bool:
        tk = super().is_available()
        oe = which_import(
            ".oechem",
            return_bool=True,
            raise_error=True,
            package="openeye",
            raise_msg="Please install via `conda install openeye-toolkits -c openeye`",
        )
        return tk and oe

[docs]    @classmethod
    def description(cls) -> str:
        return "Enumerate the protomers of the molecule, returning the input molecule and any new molecules."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecules formal charges could not be enumerated possibly due to a missing toolkit."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=True)

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Enumerate the protonation states of the molecule.

        Parameters:
            molecules: The list of molecules the component should be applied on.
            toolkit_registry: The openff.toolkit.utils.ToolkitRegistry which declares the avilable toolkits.

        Returns:
            A [ComponentResult][qcsubmit.datasets.ComponentResult] instance containing information about the molecules
            that passed and were filtered by the component and details about the component which generated the result.
            Note that the input molecule is guaranteed to be included in this output, which in some cases may cause the
            number of molecules in the result to be one greater than max_states, or may cause a molecule in the results
            to have multiple conformers.

        Important:
            This is only possible using Openeye so far, if openeye is not available this step will fail.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            try:
                protomers = molecule.enumerate_protomers(max_states=self.max_states)

                for protomer in protomers:
                    result.add_molecule(protomer)
                result.add_molecule(molecule)

            except Exception:
                # if openeye is not available this will fail
                result.filter_molecule(molecule)

        return result


[docs]class ScanEnumerator(ToolkitValidator, CustomWorkflowComponent):
    """
    This module will tag any matching substructures for scanning, useful for torsiondrive datasets.
    """

    type: Literal["ScanEnumerator"] = "ScanEnumerator"

    torsion_scans: List[Scan1D] = Field(
        [],
        description="A list of scan objects which describes the scan range and scan increment"
        "that should be used with the associated smarts pattern.",
    )
    double_torsion_scans: List[Scan2D] = Field(
        [],
        description="A list of double scan objects which describes the scan ranges and scan increments,"
        "that should be used with each of the smarts patterns.",
    )
    improper_scans: List[ImproperScan] = Field(
        [],
        description="A list of improper scan objects which describes the scan range and scan increment"
        "that should be used with the smarts pattern.",
    )

[docs]    @classmethod
    def description(cls) -> str:
        return "Tag any matched substructures for scanning."

[docs]    @classmethod
    def fail_reason(cls) -> str:
        return "The molecule contained no substructure matches."

[docs]    @classmethod
    def properties(cls) -> ComponentProperties:
        return ComponentProperties(process_parallel=True, produces_duplicates=False)

[docs]    def add_torsion_scan(
        self,
        smarts: str,
        scan_rage: Optional[Tuple[int, int]] = None,
        scan_increment: int = 15,
    ) -> None:
        """
        Add a targeted 1D torsion scan to the scan enumerator.

        Args:
            smarts:
                The numerically tagged SMARTs pattern that should be used to identify the torsion atoms.
            scan_rage:
                The angle in degrees the torsion should be scanned between, from low to high
            scan_increment:
                The value in degrees between each grid point in the scan.
        """
        self.torsion_scans.append(
            Scan1D(
                smarts1=smarts, scan_range1=scan_rage, scan_increment=[scan_increment]
            )
        )

[docs]    def add_double_torsion(
        self,
        smarts1: str,
        smarts2: str,
        scan_range1: Optional[Tuple[int, int]] = None,
        scan_range2: Optional[Tuple[int, int]] = None,
        scan_increments: List[int] = (15, 15),
    ) -> None:
        """
        Add a targeted 2D torsion scan to the scan enumerator.

        Args:
            smarts1:
                The numerically tagged SMARTs pattern that should be used to identify the first torsion atoms.
            smarts2:
                The numerically tagged SMARTs pattern that should be used to identify the second torsion atoms.
            scan_range1:
                The angle in degrees the first torsion should be scanned between, from low to high
            scan_range2:
                The angle in degrees the second torsion should be scanned between, from low to high
            scan_increments:
                A list of the values in degrees between each grid point in the scans.
        """
        self.double_torsion_scans.append(
            Scan2D(
                smarts1=smarts1,
                smarts2=smarts2,
                scan_range1=scan_range1,
                scan_range2=scan_range2,
                scan_increment=scan_increments,
            )
        )

[docs]    def add_improper_torsion(
        self,
        smarts: str,
        central_smarts: str,
        scan_range: Optional[Tuple[int, int]] = None,
        scan_increment: int = 15,
    ) -> None:
        """
        Add a targeted Improper torsion to the scan enumerator.

        Args:
            smarts:
                The numerically tagged SMARTs pattern which describes the entire improper.
            central_smarts:
                The numerically tagged SMARTs pattern which identifies the central atom in the improper.
            scan_range:
                The angles in degrees the improper should be scanned between, from low to high.
            scan_increment:
                The value in degrees between each grid point in the scan.
        """
        self.improper_scans.append(
            ImproperScan(
                smarts=smarts,
                central_smarts=central_smarts,
                scan_range=scan_range,
                scan_increment=[scan_increment],
            )
        )

    def _get_unique_torsions(
        self, matches: List[Tuple[int, int, int, int]], symmetry_classes: List[int]
    ) -> List[Tuple[int, int, int, int]]:
        """
        Use the symmetry classes to condense the matches into unique torsions in the molecule by symmetry.
        """
        torsions_by_symmetry = {
            tuple(sorted(symmetry_classes[idx] for idx in match[1:3])): match
            for match in matches
        }
        unique_torsions = [*torsions_by_symmetry.values()]
        return unique_torsions

    def _apply(
        self, molecules: List[Molecule], toolkit_registry: ToolkitRegistry
    ) -> ComponentResult:
        """
        Tag any dihedrals which match the defined set of targets in the enumerator.
        """

        result = self._create_result(toolkit_registry=toolkit_registry)

        for molecule in molecules:
            symmetry_classes = get_symmetry_classes(molecule)
            molecule.properties["dihedrals"] = TorsionIndexer()
            self._tag_torsions(
                molecule, symmetry_classes, toolkit_registry=toolkit_registry
            )
            self._tag_double_torsions(
                molecule, symmetry_classes, toolkit_registry=toolkit_registry
            )
            self._tag_improper_torsions(
                molecule, symmetry_classes, toolkit_registry=toolkit_registry
            )

            indexer = molecule.properties["dihedrals"]
            if len(indexer.get_dihedrals) == 0:
                result.filter_molecule(molecule)

            result.add_molecule(molecule)

        return result

    def _tag_torsions(
        self,
        molecule: Molecule,
        symmetry_classes: List[int],
        toolkit_registry: ToolkitRegistry,
    ) -> None:
        """
        For each of the torsions in the torsion list try and tag only one in the molecule.
        """

        indexer: TorsionIndexer = molecule.properties["dihedrals"]

        for torsion in self.torsion_scans:
            matches = molecule.chemical_environment_matches(
                query=torsion.smarts1, toolkit_registry=toolkit_registry
            )
            unique_torsions = self._get_unique_torsions(
                matches=matches, symmetry_classes=symmetry_classes
            )
            for tagged_torsion in unique_torsions:
                indexer.add_torsion(
                    torsion=tagged_torsion,
                    scan_range=torsion.scan_range1,
                    scan_increment=torsion.scan_increment,
                    symmetry_group=get_symmetry_group(
                        atom_group=tagged_torsion[1:3],
                        symmetry_classes=symmetry_classes,
                    ),
                )

    def _tag_double_torsions(
        self,
        molecule: Molecule,
        symmetry_classes: List[int],
        toolkit_registry: ToolkitRegistry,
    ) -> None:
        """
        For each double torsion in the list try and tag the combination in the molecule.
        """

        indexer: TorsionIndexer = molecule.properties["dihedrals"]

        for double_torsion in self.double_torsion_scans:
            matches1 = molecule.chemical_environment_matches(
                query=double_torsion.smarts1, toolkit_registry=toolkit_registry
            )
            matches2 = molecule.chemical_environment_matches(
                query=double_torsion.smarts2, toolkit_registry=toolkit_registry
            )
            unique_torsions1 = self._get_unique_torsions(
                matches=matches1, symmetry_classes=symmetry_classes
            )
            unique_torsions2 = self._get_unique_torsions(
                matches=matches2, symmetry_classes=symmetry_classes
            )
            for tagged_torsion1 in unique_torsions1:
                symmetry_group1 = get_symmetry_group(
                    atom_group=tagged_torsion1[1:3], symmetry_classes=symmetry_classes
                )
                for tagged_torsion2 in unique_torsions2:
                    symmetry_group2 = get_symmetry_group(
                        atom_group=tagged_torsion2[1:3],
                        symmetry_classes=symmetry_classes,
                    )
                    indexer.add_double_torsion(
                        torsion1=tagged_torsion1,
                        torsion2=tagged_torsion2,
                        symmetry_group1=symmetry_group1,
                        symmetry_group2=symmetry_group2,
                        scan_range1=double_torsion.scan_range1,
                        scan_range2=double_torsion.scan_range2,
                        scan_increment=double_torsion.scan_increment,
                    )

    def _tag_improper_torsions(
        self,
        molecule: Molecule,
        symmetry_classes: List[int],
        toolkit_registry: ToolkitRegistry,
    ) -> None:
        """
        For each improper torsion in the list try and tag the combination in the molecule.
        """

        indexer: TorsionIndexer = molecule.properties["dihedrals"]

        for improper in self.improper_scans:
            matches = molecule.chemical_environment_matches(
                query=improper.smarts, toolkit_registry=toolkit_registry
            )
            unique_torsions = self._get_unique_torsions(
                matches=matches, symmetry_classes=symmetry_classes
            )
            central_atoms = molecule.chemical_environment_matches(
                improper.central_smarts
            )
            for tagged_torsion in unique_torsions:
                symmetry_group = get_symmetry_group(
                    atom_group=tagged_torsion, symmetry_classes=symmetry_classes
                )
                for atom in central_atoms:
                    if atom[0] in tagged_torsion:
                        indexer.add_improper(
                            central_atom=atom[0],
                            improper=tagged_torsion,
                            symmetry_group=symmetry_group,
                            scan_range=improper.scan_range,
                            scan_increment=improper.scan_increment,
                        )
                        break