Source code for nanoCAT.recipes.dissociation

"""
nanoCAT.recipes.surface_dissociation
====================================

A recipe for dissociation specific sets of surface atoms.

Index
-----
.. currentmodule:: nanoCAT.recipes
.. autosummary::
    dissociate_surface
    dissociate_bulk
    row_accumulator

API
---
.. autofunction:: dissociate_surface
.. autofunction:: dissociate_bulk
.. autofunction:: row_accumulator

"""

from __future__ import annotations

import sys
from typing import (
    Iterable,
    Generator,
    Any,
    TypeVar,
    cast,
    Tuple,
    TYPE_CHECKING,
)

import numpy as np
from scm.plams import Molecule, MoleculeError
from CAT.utils import get_nearest_neighbors
from CAT.mol_utils import to_atnum, to_symbol
from CAT.distribution import distribute_idx
from CAT.attachment.distribution_brute import brute_uniform_idx
from nanoCAT.bde.dissociate_xyn import dissociate_ligand
from nanoCAT.bde.identify_surface import identify_surface_ch

if TYPE_CHECKING:
    import numpy.typing as npt
    if sys.version_info >= (3, 8):
        from typing import Literal
    else:
        from typing_extensions import Literal

    _IT1 = TypeVar("_IT1", bound=np.integer[Any])
    _IT2 = TypeVar("_IT2", bound=np.integer[Any])
    _SCT = TypeVar("_SCT", bound=np.generic)
    _NDArray = np.ndarray[Any, np.dtype[_SCT]]
    _ModeKind = Literal['uniform', 'random', 'cluster']

__all__ = ['dissociate_surface', 'row_accumulator', 'dissociate_bulk']


def _parse_idx(idx: npt.ArrayLike, ndim: int, **kwargs: Any) -> _NDArray[np.int64]:
    _idx_ar = np.array(idx, ndmin=ndim, **kwargs)
    if _idx_ar.ndim != ndim:
        raise ValueError
    elif _idx_ar.dtype.kind in 'USO':
        return _idx_ar.astype(np.int64)
    else:
        return _idx_ar.astype(np.int64, casting='same_kind')


[docs]def dissociate_surface(mol: Molecule,
                       idx: npt.ArrayLike,
                       symbol: str = 'Cl',
                       lig_count: int = 1,
                       k: int = 4,
                       displacement_factor: float = 0.5,
                       **kwargs: Any) -> Generator[Molecule, None, None]:
    r"""A workflow for dissociating :math:`(XY_{n})_{\le m}` compounds from the surface of **mol**.

    The workflow consists of four distinct steps:

    1. Identify which atoms :math:`Y`, as specified by **symbol**,
       are located on the surface of **mol**.
    2. Identify which surface atoms are neighbors of :math:`X`, the latter being defined by **idx**.
    3. Identify which pairs of :math:`n*m` neighboring surface atoms are furthest removed from
       each other.
       :math:`n` is defined by **lig_count** and :math:`m`, if applicable, by the index along axis 1
       of **idx**.
    4. Yield :math:`(XY_{n})_{\le m}` molecules constructed from **mol**.

    Note
    ----
    The indices supplied in **idx** will, when applicable, be sorted along its last axis.

    Examples
    --------
    .. code:: python

        >>> from pathlib import Path

        >>> import numpy as np

        >>> from scm.plams import Molecule
        >>> from CAT.recipes import dissociate_surface, row_accumulator

        >>> base_path = Path(...)
        >>> mol = Molecule(base_path / 'mol.xyz')

        # The indices of, e.g., Cs-pairs
        >>> idx = np.array([
        ...     [1, 3],
        ...     [4, 5],
        ...     [6, 10],
        ...     [15, 12],
        ...     [99, 105],
        ...     [20, 4]
        ... ])

        # Convert 1- to 0-based indices by substracting 1 from idx
        >>> mol_generator = dissociate_surface(mol, idx-1, symbol='Cl', lig_count=1)

        # Note: The indices in idx are (always) be sorted along axis 1
        >>> iterator = zip(row_accumulator(np.sort(idx, axis=1)), mol_generator)
        >>> for i, mol in iterator:
        ...     mol.write(base_path / f'output{i}.xyz')


    Parameters
    ----------
    mol : :class:`Molecule<scm.plams.mol.molecule.Molecule>`
        The input molecule.

    idx : array-like, dimensions: :math:`\le 2`
        An array of indices denoting to-be dissociated atoms (*i.e.* :math:`X`);
        its elements will, if applicable, be sorted along the last axis.
        If a 2D array is provided then all elements along axis 1 will be dissociated
        in a cumulative manner.
        :math:`m` is herein defined as the index along axis 1.

    symbol : :class:`str` or :class:`int`
        An atomic symbol or number defining the super-set of the atoms to-be dissociated in
        combination with **idx** (*i.e.* :math:`Y`).

    lig_count : :class:`int`
        The number of atoms specified in **symbol** to-be dissociated in combination
        with a single atom from **idx** (*i.e.* :math:`n`).

    k : :class:`int`
        The number of atoms specified in **symbol** which are surrounding a single atom in **idx**.
        Must obey the following condition: :math:`k \ge 1`.

    displacement_factor : :class:`float`
        The smoothing factor :math:`n` for constructing a convex hull;
        should obey :math:`0 <= n <= 1`.
        Represents the degree of displacement of all atoms with respect to a spherical surface;
        :math:`n = 1` is a complete projection while :math:`n = 0` means no displacement at all.

        A non-zero value is generally recomended here,
        as the herein utilized :class:`ConvexHull<scipy.spatial.ConvexHull>` class
        requires an adequate degree of surface-convexness,
        lest it fails to properly identify all valid surface points.

    \**kwargs : :data:`Any<typing.Any>`
        Further keyword arguments for
        :func:`brute_uniform_idx()<CAT.attachment.distribution_brute.brute_uniform_idx>`.

    Yields
    ------
    :class:`Molecule<scm.plams.mol.molecule.Molecule>`
        Yields new :math:`(XY_{n})_{m}`-dissociated molecules.

    See Also
    --------
    :func:`brute_uniform_idx()<CAT.attachment.distribution_brute.brute_uniform_idx>`
        Brute force approach to creating uniform or clustered distributions.

    :func:`identify_surface()<nanoCAT.bde.identify_surface.identify_surface>`
        Take a molecule and identify which atoms are located on the surface,
        rather than in the bulk.

    :func:`identify_surface_ch()<nanoCAT.bde.identify_surface.identify_surface_ch>`
        Identify the surface of a molecule using a convex hull-based approach.

    :func:`dissociate_ligand()<nanoCAT.bde.dissociate_xyn.dissociate_ligand>`
        Remove :math:`XY_{n}` from **mol** with the help of the
        :class:`MolDissociater<nanoCAT.bde.dissociate_xyn.MolDissociater>` class.

    """
    idx_ar = _parse_idx(idx, ndim=2, copy=False)
    if idx_ar.ndim > 2:
        raise ValueError("'idx' expected a 2D array-like object; "
                         f"observed number of dimensions: {idx_ar.ndim}")
    idx_ar.sort(axis=1)
    idx_ar = idx_ar[:, ::-1]

    # Identify all atoms in **idx** located on the surface
    idx_surface_superset = _get_surface(
        mol, symbol=symbol, displacement_factor=displacement_factor
    )

    # Construct an array with the indices of opposing surface-atoms
    n = lig_count * idx_ar.shape[1]
    idx_surface = _get_opposite_neighbor(
        np.asarray(mol), idx_ar, idx_surface_superset, n=n, k=k, **kwargs
    )

    # Dissociate and yield new molecules
    idx_ar += 1
    idx_surface += 1
    for idx_pair, idx_pair_surface in zip(idx_ar, idx_surface):
        mol_tmp = mol.copy()
        _mark_atoms(mol_tmp, idx_pair_surface)

        for i in idx_pair:
            mol_tmp = next(dissociate_ligand(mol_tmp, lig_count=lig_count,
                                             core_index=i, lig_core_pairs=1,
                                             **kwargs))
            yield mol_tmp


[docs]def row_accumulator(iterable: Iterable[Iterable[object]]) -> Generator[str, None, None]:
    """Return a generator which accumulates elements along the nested elements of **iterable**.

    Examples
    --------
    .. code:: python

        >>> iterable = [[1, 3],
        ...             [4, 5],
        ...             [6, 10]]

        >>> for i in row_accumulator(iterable):
        ...     print(repr(i))
        '_1'
        '_1_3'
        '_4'
        '_4_5'
        '_6'
        '_6_10'

    Parameters
    ----------
    iterable : :class:`Iterable<collections.abc.Iterable>` [:class:`Iterable<collections.abc.Iterable>` [:data:`Any<typing.Any>`]]
        A nested iterable.

    Yields
    ------
    :class:`str`
        The accumulated nested elements of **iterable** as strings.

    """  # noqa
    for i in iterable:
        ret = ''
        for j in i:
            ret += f'_{j}'
            yield ret


[docs]def dissociate_bulk(
    mol: Molecule,
    symbol_x: str,
    symbol_y: None | str = None,
    count_x: int = 1,
    count_y: int = 1,
    n_pairs: int = 1,
    k: None | int = 4,
    r_max: None | float = None,
    mode: _ModeKind = 'uniform',
    **kwargs: Any,
) -> Molecule:
    r"""A workflow for removing :math:`XY`-based compounds from the bulk of **mol**.

    Examples
    --------
    .. code-block:: python

        >>> from scm.plams import Molecule
        >>> from CAT.recipes import dissociate_bulk

        >>> mol: Molecule = ...

        # Remove two PbBr2 pairs in a system where
        # each lead atom is surrounded by 6 bromides
        >>> mol_out1 = dissociate_bulk(
        ...     mol, symbol_x="Pb", symbol_y="Br", count_y=2, n_pairs=2, k=6
        ... )

        # The same as before, expect all potential bromides are
        # identified based on a radius, rather than a fixed number
        >>> mol_out2 = dissociate_bulk(
        ...     mol, symbol_x="Pb", symbol_y="Br", count_y=2, n_pairs=2, r_max=5.0
        ... )

        # Convert a fraction to a number of pairs
        >>> f = 0.5
        >>> count_x = 2
        >>> symbol_x = "Pb"
        >>> n_pairs = int(f * sum(at.symbol == symbol_x for at in mol) / count_x)

        >>> mol_out3 = dissociate_bulk(
        ...     mol, symbol_x="Pb", symbol_y="Br", count_y=2, n_pairs=n_pairs, k=6
        ... )

    Parameters
    ----------
    mol : :class:`~scm.plams.mol.molecule.Molecule`
        The input molecule.
    symbol_x : :class:`str` or :class:`int`
        The atomic symbol or number of the central (to-be dissociated) atom(s) :math:`X`.
    symbol_y : :class:`str` or :class:`int`, optional
        The atomic symbol or number of the surrounding (to-be dissociated) atom(s) :math:`Y`.
        If :data:`None`, do not dissociate any atoms :math:`Y`.
    count_x : :class:`int`
        The number of central atoms :math:`X` per individual to-be dissociated cluster.
    count_y : :class:`int`
        The number of surrounding atoms :math:`Y` per individual to-be dissociated cluster.
    n_pairs : :class:`int`
        The number of to-be removed :math:`XY` fragments.
    k : :class:`int`, optional
        The total number of :math:`Y` candidates surrounding each atom :math:`X`.
        This value should be smaller than or equal to **count_y**.
        See the **r_max** parameter for a radius-based approach;
        note that both parameters are not mutually exclusive.
    r_max : :class:`int`, optional
        The radius used for searching for :math:`Y` candidates surrounding each atom :math:`X`.
        See **k** parameter to use a fixed number of nearest neighbors;
        note that both parameters are not mutually exclusive.
    mode : :class:`str`
        How the subset of to-be removed atoms :math:`X` should be generated.
        Accepts one of the following values:

        * ``"random"``: A random distribution.
        * ``"uniform"``: A uniform distribution; the distance between each successive atom and
          all previous points is maximized.
        * ``"cluster"``: A clustered distribution; the distance between each successive atom and
          all previous points is minmized.

    Keyword Arguments
    -----------------
    \**kwargs : :data:`~typing.Any`
        Further keyword arguments for :func:`CAT.distribution.distribute_idx`.

    Returns
    -------
    :class:`~scm.plams.mol.molecule.Molecule`
        The molecule with :math:`n_{\text{pair}} * XY` fragments removed.

    """
    if n_pairs < 0:
        raise ValueError("`n_pairs` must be larger than or equal to 0; "
                         f"observed value: {n_pairs!r}")
    elif n_pairs == 0:
        return mol.copy()

    # Validate the input args
    if count_x <= 0:
        raise ValueError("`count_x` expected a an integer larger than 0; "
                         f"observed value: {count_x!r}")
    elif count_y < 0:
        raise ValueError("`count_y` expected a an integer larger than 0; "
                         f"observed value: {count_y!r}")
    elif count_y == 0:
        symbol_y = None

    # Parse `symbol_x`
    atnum_x = to_atnum(symbol_x)
    idx_x = np.fromiter([i for i, at in enumerate(mol) if at.atnum == atnum_x], dtype=np.intp)
    if len(idx_x) == 0:
        raise MoleculeError(f"No atoms {to_symbol(atnum_x)!r} in {mol.get_formula()}")

    # Parse `symbol_y`
    if symbol_y is not None:
        atnum_y = to_atnum(symbol_y)
        idx_y = np.fromiter([i for i, at in enumerate(mol) if at.atnum == atnum_y], dtype=np.intp)
        if len(idx_y) == 0:
            raise MoleculeError(f"No atoms {to_symbol(atnum_y)!r} in {mol.get_formula()}")

    # Parse `k` and `r_max`
    if k is None and r_max is None:
        raise TypeError("`k` and `r_max` cannot be both set to None")
    if k is None:
        k = cast(int, len(idx_y))
    if r_max is None:
        r_max = cast(float, np.inf)

    # Validate `n_pairs`
    if (n_pairs * count_x) > len(idx_x):
        x = n_pairs * count_x
        raise ValueError(
            f"Insufficient atoms {to_symbol(atnum_x)!r} in {mol.get_formula()}; "
            f"atleast {x} required with `n_pairs={n_pairs!r}` and `count_x={count_x!r}"
        )
    elif symbol_y is not None and (n_pairs * count_y) > len(idx_y):
        y = n_pairs * count_y
        raise ValueError(
            f"Insufficient atoms {to_symbol(atnum_y)!r} in {mol.get_formula()}; "
            f"atleast {y} required with `n_pairs={n_pairs!r}` and `count_y={count_y!r}"
        )

    # Identify a subset of atoms `x`
    if count_x != 1:
        if "cluster_size" in kwargs:
            raise TypeError("Specifying both `cluster_size` and `count_x` is not supported")
        kwargs["cluster_size"] = count_x
    idx_x_sorted = distribute_idx(mol, idx_x, f=1, **kwargs)
    idx_x_sorted.shape = -1, count_x

    if symbol_y is not None:
        # Identify a subset of matching atoms `y`
        idx_x_subset, idx_y_subset, = _get_opposite_neighbor2(
            np.asarray(mol), idx_x_sorted, idx_y, n_pairs, n=count_y, k=k, r_max=r_max
        )

        # Concatenate the subsets
        idx_xy_subset = np.concatenate([idx_x_subset.ravel(), idx_y_subset.ravel()])
    else:
        idx_xy_subset = idx_x[:(n_pairs * count_x)]
    idx_xy_subset.sort()
    idx_xy_subset += 1

    # Create and return the new molecule
    ret = mol.copy()
    atom_set = {ret[i] for i in idx_xy_subset}
    for at in atom_set:
        ret.delete_atom(at)
    return ret


def _get_opposite_neighbor(
    mol: _NDArray[np.number],
    idx_center: _NDArray[np.integer[Any]],
    idx_neighbor: _NDArray[_IT1],
    k: int = 4,
    n: int = 2,
    **kwargs: Any,
) -> _NDArray[_IT1]:
    """Identify the **k** nearest neighbors of **idx_center** and return those furthest removed from each other."""  # noqa
    # Indices of the **k** nearest neighbors in **neighbor** with respect to **center**
    xyz1 = mol[idx_neighbor]
    xyz2 = mol[idx_center.ravel()]
    try:
        idx_nn = idx_neighbor[get_nearest_neighbors(xyz2, xyz1, k=k)]
    except IndexError:
        raise ValueError("'k' should be smaller than the total number of surface atoms "
                         f"(len(idx_neighbor)); observed value: {k!r}") from None
    idx_nn.shape = -1, idx_nn.shape[1] * idx_center.shape[1]

    # Find the **n** atoms in **idx_nn** furthest removed from each other
    return brute_uniform_idx(mol, idx_nn, n=n, **kwargs)


def _get_opposite_neighbor2(
    mol: _NDArray[np.number],
    idx_center: _NDArray[_IT1],
    idx_neighbor: _NDArray[_IT2],
    n_pairs: int,
    k: int = 4,
    n: int = 2,
    r_max: float = np.inf,
    **kwargs: Any,
) -> Tuple[_NDArray[_IT1], _NDArray[_IT2]]:
    """Identify the **k** nearest neighbors of **idx_center** and return those furthest removed from each other."""  # noqa
    # Indices of the **k** nearest neighbors in **neighbor** with respect to **center**
    xyz1 = mol[idx_neighbor]
    xyz2 = mol[idx_center.ravel()]

    idx = get_nearest_neighbors(xyz2, xyz1, k=k, distance_upper_bound=r_max)
    if len(idx_neighbor) in idx:
        is_overflow = (idx != len(idx_neighbor)).sum(dtype=np.int64, axis=1) < n
        if np.count_nonzero(~is_overflow) < n_pairs:
            raise ValueError("Insufficient number of `XY` pairs with "
                             f"`k={k!r}` and `r_max={r_max!r}`")
        i_ar = np.arange(len(idx), dtype=np.intp)[~is_overflow][:n_pairs]
        i: np.intp = idx[i_ar].argmax(axis=1).min()
        idx = idx[i_ar][:, :i]
        idx_center_subset = idx_center[i_ar]
    else:
        idx = idx[:n_pairs]
        idx_center_subset = idx_center[:n_pairs]
    idx_nn = idx_neighbor[idx]

    # Find the **n** atoms in **idx_nn** furthest removed from each other
    idx_neighbor_subset = brute_uniform_idx(mol, idx_nn, n=n, **kwargs)
    return idx_center_subset, idx_neighbor_subset


def _get_surface(
    mol: Molecule,
    symbol: str,
    displacement_factor: float = 0.5,
) -> _NDArray[np.int64]:
    """Return the indices of all atoms, whose atomic symbol is equal to **atom_symbol**, located on the surface."""  # noqa
    # Identify all atom with atomic symbol **atom_symbol**
    atnum = to_atnum(symbol)
    idx = np.array([i for i, atom in enumerate(mol) if atom.atnum == atnum], dtype=np.int64)
    xyz = np.asarray(mol)

    # Identify all atoms on the surface
    try:
        return idx[identify_surface_ch(xyz[idx], n=displacement_factor)]
    except ValueError as ex:
        raise MoleculeError(f"No atoms with atomic symbol/number {repr(symbol)} available in "
                            f"{mol.get_formula()!r}") from ex


def _mark_atoms(mol: Molecule, idx: Iterable[int]) -> None:
    """Mark all atoms in **mol** whose index is in **idx**; indices should be 1-based."""
    for i in idx:
        mol[i].properties.anchor = True