Source code for nanoCAT.recipes.bulk

"""
nanoCAT.recipes.bulk
====================

A short recipe for accessing the ligand-bulkiness workflow.

Index
-----
.. currentmodule:: nanoCAT.recipes
.. autosummary::
    bulk_workflow
    fast_bulk_workflow

API
---
.. autofunction:: bulk_workflow
.. autofunction:: fast_bulk_workflow

"""

from __future__ import annotations

import sys
import warnings
import functools
from itertools import chain
from typing import (
    Any,
    Iterable,
    List,
    Iterator,
    Callable,
    Generator,
    Tuple,
    Union,
    Sequence,
    SupportsFloat,
    TYPE_CHECKING,
    TypeVar,
)


import numpy as np

from scm.plams import Molecule, Atom, Bond
from rdkit import Chem
from CAT.data_handling.mol_import import read_mol
from CAT.data_handling.validate_mol import validate_mol
from CAT.data_handling.anchor_parsing import parse_anchors
from CAT.attachment.ligand_anchoring import _smiles_to_rdmol, find_substructure
from CAT.attachment.ligand_opt import optimize_ligand, allign_axis

from nanoCAT.mol_bulk import get_V, get_lig_radius
from nanoCAT.bulk import yield_distances, GraphConstructor

if TYPE_CHECKING:
    import numpy.typing as npt
    from typing_extensions import SupportsIndex

if sys.version_info > (3, 8):
    _WeightReturn = Union[str, bytes, SupportsFloat, "SupportsIndex"]
else:
    _WeightReturn = Union[str, bytes, SupportsFloat]

_WT = TypeVar("_WT", bound=_WeightReturn)
_WeightFunc1 = Callable[[np.float64], _WT]
_WeightFunc2 = Callable[[np.float64, np.float64], _WT]

__all__ = ['bulk_workflow', 'fast_bulk_workflow']


[docs]def bulk_workflow( smiles_list: Iterable[str], anchor: str = 'O(C=O)[H]', *, anchor_condition: None | Callable[[int], bool] = None, diameter: None | float = 4.5, height_lim: None | float = 10.0, optimize: bool = True, ) -> Tuple[List[Molecule], npt.NDArray[np.float64]]: """Start the CAT ligand bulkiness workflow with an iterable of smiles strings. Examples -------- .. code:: python >>> from CAT.recipes import bulk_workflow >>> smiles_list = [...] >>> mol_list, bulk_array = bulk_workflow(smiles_list, optimize=True) Parameters ---------- smiles_list : :class:`Iterable[str] <collections.abc.Iterable>` An iterable of SMILES strings. anchor : :class:`str` A SMILES string representation of an anchor group such as ``"O(C=O)[H]"``. The first atom will be marked as anchor atom while the last will be dissociated. Used for filtering molecules in **smiles_list**. anchor_condition : :class:`Callable[[int], bool] <collections.abc.Callable>`, optional If not :data:`None`, filter ligands based on the number of identified functional groups. For example, ``anchor_condition = lambda n: n == 1`` will only accept ligands with a single **anchor** group, ``anchor_condition = lambda n: n >= 3`` requires three or more anchors and ``anchor_condition = lambda n: n < 2`` requires fewer than two anchors. diameter : :class:`float`, optional The lattice spacing, *i.e.* the average nearest-neighbor distance between the anchor atoms of all ligads. Set to :data:`None` to ignore the lattice spacing. Units should be in Angstrom. height_lim : :class:`float`, optional A cutoff above which all atoms are ignored. Set to :data:`None` to ignore the height cutoff. Units should be in Angstrom. optimize : :class:`bool` Enable or disable the ligand geometry optimization. Returns ------- :class:`list[plams.Molecule] <list>` and :class:`np.ndarray[np.float64] <numpy.ndarray>` A list of plams Molecules and a matching array of :math:`V_{bulk}` values. """ _mol_list = read_smiles(smiles_list) # smiles to molecule # filter based on functional groups mol_list = list(_filter_mol(_mol_list, anchor=anchor, condition=anchor_condition)) opt_and_allign(mol_list, opt=optimize) # optimize and allign V_bulk = bulkiness(mol_list, diameter=diameter, height_lim=height_lim) # calculate bulkiness return mol_list, V_bulk
def _weight( r: np.float64, h: np.float64, w_func: _WeightFunc1[_WT], h_max: float = 10, r_min: float = 4.5, ) -> _WT | int: if r < r_min or h > h_max: return 0 else: return w_func(r - (r_min / 2))
[docs]def fast_bulk_workflow( smiles_list: Iterable[str], anchor: str = 'O(C=O)[H]', *, anchor_condition: None | Callable[[int], bool] = None, diameter: None | float = 4.5, height_lim: None | float = 10.0, func: None | _WeightFunc1 = np.exp, ) -> Tuple[List[Molecule], npt.NDArray[np.float64]]: """Start the ligand fast-bulkiness workflow with an iterable of smiles strings. Examples -------- .. code:: python >>> from CAT.recipes import fast_bulk_workflow >>> smiles_list = [...] >>> mol_list, bulk_array = fast_bulk_workflow(smiles_list, optimize=True) Parameters ---------- smiles_list : :class:`Iterable[str] <collections.abc.Iterable>` An iterable of SMILES strings. anchor : :class:`str` A SMILES string representation of an anchor group such as ``"O(C=O)[H]"``. The first atom will be marked as anchor atom while the last will be dissociated. Used for filtering molecules in **smiles_list**. anchor_condition : :class:`Callable[[int], bool] <collections.abc.Callable>`, optional If not :data:`None`, filter ligands based on the number of identified functional groups. For example, ``anchor_condition = lambda n: n == 1`` will only accept ligands with a single **anchor** group, ``anchor_condition = lambda n: n >= 3`` requires three or more anchors and ``anchor_condition = lambda n: n < 2`` requires fewer than two anchors. diameter : :class:`float`, optional The lattice spacing, *i.e.* the average nearest-neighbor distance between the anchor atoms of all ligads. Set to :data:`None` to ignore the lattice spacing. Units should be in Angstrom. height_lim : :class:`float`, optional A cutoff above which all atoms are ignored. Set to :data:`None` to ignore the height cutoff. Units should be in Angstrom. func : :class:`Callable[[np.float64], Any] <collections.abc.Callable>` A function for weighting each radial distance. Defaults to :func:`np.exp <numpy.exp>`. Returns ------- :class:`list[plams.Molecule] <list>` and :class:`np.ndarray[np.float64] <numpy.ndarray>` A list of plams Molecules and a matching array of :math:`V_{bulk}` values. Raises ------ RuntimeWarning Issued if an exception is encountered when constructing or traversing one of the molecular graphs. The corresponding bulkiness value will be set to ``nan`` in such case. """ mol_list = list(read_smiles2(smiles_list, anchor=anchor, condition=anchor_condition)) if height_lim is None: height_lim = np.inf if diameter is None: diameter = -np.inf if func is None: w_func = lambda x, y: x else: w_func = functools.partial(_weight, h_max=height_lim, r_min=diameter, w_func=func) arg_iter = ((m, m.properties.dummies) for m in mol_list) iterator = _fast_bulkiness_iter(arg_iter, w_func) V_bulk = np.fromiter(iterator, dtype=np.float64, count=len(mol_list)) return mol_list, V_bulk
def _fast_bulkiness_iter( iterator: Iterator[Tuple[Molecule, Atom]], func: _WeightFunc2[_WT], ) -> Generator[_WT | float, None, None]: for mol, atom in iterator: try: graph = GraphConstructor(mol) dct = graph(atom) except Exception as ex1: warn_ = RuntimeWarning("Failed to construct the molecular graph " f"of {mol.properties.smiles!r}") warn_.__cause__ = ex1 warnings.warn(warn_, stacklevel=2) yield np.nan continue try: ret = sum(i for i, *_ in yield_distances(dct, func=func)) except Exception as ex2: warn_ = RuntimeWarning("Failed to traverse the molecular graph " f"of {mol.properties.smiles!r}") warn_.__cause__ = ex2 warnings.warn(warn_, stacklevel=2) yield np.nan else: yield ret def read_smiles(smiles_list: Iterable[str]) -> List[Molecule]: """Convert smiles strings into CAT-compatible plams molecules.""" input_mol = list(smiles_list) validate_mol(input_mol, 'input_ligands') return read_mol(input_mol) def _iter_mol_block( mol_block: str, size: int, ) -> Generator[Tuple[str, Tuple[str, str, str]], None, None]: """Yield the symbols and coordinates embedded within the passed ``MolBlock``.""" slc = np.s_[4:4 + size] lst = mol_block.split("\n")[slc] for i in lst: x, y, z, symbol, *_ = i.split() yield symbol, (x, y, z) def _molecule_from_rdmol( rdmol: Chem.Mol, smiles: str, matches: Iterable[Sequence[int]], split: bool = True, ) -> Generator[Molecule, None, None]: """Construct a PLAMS molecule from the passed rdkit mol's ``MolBlock``.""" for tup in matches: try: i, *_, j = tup # type: int, Any, None | int except ValueError: i = tup[0] j = None # Split the capping atom (j) from the main molecule if j is not None and split: if i > j: i -= 1 rdmol_edit = Chem.EditableMol(rdmol) rdmol_edit.RemoveAtom(j) rdmol_new = rdmol_edit.GetMol() anchor = rdmol_new.GetAtoms()[i] anchor.SetFormalCharge(anchor.GetFormalCharge() - 1) else: rdmol_new = rdmol # Parse the .mol block and convert it into a PLAMS molecule mol_block = Chem.MolToMolBlock(rdmol) iterator = _iter_mol_block(mol_block, size=len(rdmol.GetAtoms())) mol = Molecule() mol.atoms = [Atom(symbol=symbol, coords=xyz, mol=mol) for symbol, xyz in iterator] for bond in rdmol.GetBonds(): at1 = mol.atoms[bond.GetBeginAtomIdx()] at2 = mol.atoms[bond.GetEndAtomIdx()] mol.add_bond(Bond(at1, at2, order=bond.GetBondTypeAsDouble())) # Set properties and yield mol.properties.smiles = smiles mol.properties.dummies = mol.atoms[i] mol.properties.anchor = f"{mol.properties.dummies.symbol}{i + 1}" yield mol def read_smiles2( smiles_list: Iterable[str], anchor: str = "O(C=O)[H]", condition: None | Callable[[int], bool] = None, ) -> Generator[Molecule, None, None]: """Convert smiles strings into CAT-compatible plams molecules.""" anchor_rdmol = _smiles_to_rdmol(anchor) for smiles in smiles_list: # Read the SMILES strings try: canon_smiles = Chem.CanonSmiles(smiles) rdmol = Chem.AddHs(Chem.MolFromSmiles(smiles)) except Exception as ex1: warn_ = RuntimeWarning(f"Failed to parse {smiles!r}") warn_.__cause__ = ex1 warnings.warn(warn_, stacklevel=2) continue # Perform a substructure match try: matches = rdmol.GetSubstructMatches(anchor_rdmol, useChirality=True) n_matches = len(matches) assertable = bool(n_matches) if condition is not None: assertable &= condition(n_matches) assert assertable except AssertionError: continue except Exception as ex2: warn_ = RuntimeWarning(f"Failed to match {smiles!r} and {anchor!r}") warn_.__cause__ = ex2 warnings.warn(warn_, stacklevel=2) continue # Construct a new PLAMS molecule from the rdkit molecule try: yield from _molecule_from_rdmol(rdmol, canon_smiles, matches) except Exception as ex3: warn_ = RuntimeWarning(f"Failed to construct a PLAMS Molecule from {smiles!r}") warn_.__cause__ = ex3 warnings.warn(warn_, stacklevel=2) continue def _filter_mol( mol_list: Iterable[Molecule], anchor: str = 'O(C=O)[H]', condition: None | Callable[[int], bool] = None, ) -> Iterator[Molecule]: """Filter all input molecules based on the presence of a functional group (the "anchor").""" anchor_rdmols = parse_anchors(anchor) return chain.from_iterable( find_substructure(mol, anchor_rdmols, condition=condition) for mol in mol_list ) def opt_and_allign(mol_list: Iterable[Molecule], opt: bool = True) -> None: """Optimize all molecules and allign them along the x-axis; set :code:`opt=False` to disable the optimization.""" # noqa process_mol: Callable[[Molecule], None] = optimize_ligand if opt else allign_axis for mol in mol_list: process_mol(mol) def bulkiness( mol_list: Iterable[Molecule], diameter: None | float = 4.5, height_lim: None | float = 10.0, ) -> npt.NDArray[np.float64]: r"""Calculate the ligand bulkiness descriptor :math:`V_{bulk}`. .. math:: V(d, h_{lim}) = \sum_{i=1}^{n} e^{r_{i}} (\frac{2 r_{i}}{d} - 1)^{+} (1 - \frac{h_{i}}{h_{lim}})^{+} """ def _iterate(mol_list: Iterable[Molecule]) -> Generator[float, None, None]: for mol in mol_list: radius, height = get_lig_radius(mol) # From cartesian to cylindrical yield get_V(radius, height, diameter, None, h_lim=height_lim) try: count = len(mol_list) # type: ignore except TypeError: count = -1 return np.fromiter(_iterate(mol_list), count=count, dtype=np.float64)