Source code for nanoCAT.recipes.dissociation

"""
nanoCAT.recipes.surface_dissociation
====================================

A recipe for dissociation specific sets of surface atoms.

Index
-----
.. currentmodule:: nanoCAT.recipes
.. autosummary::
    dissociate_surface
    dissociate_bulk
    row_accumulator

API
---
.. autofunction:: dissociate_surface
.. autofunction:: dissociate_bulk
.. autofunction:: row_accumulator

"""

from __future__ import annotations

import sys
from typing import (
    Iterable,
    Generator,
    Any,
    TypeVar,
    cast,
    Tuple,
    TYPE_CHECKING,
)

import numpy as np
from scm.plams import Molecule, MoleculeError
from CAT.utils import get_nearest_neighbors
from CAT.mol_utils import to_atnum, to_symbol
from CAT.distribution import distribute_idx
from CAT.attachment.distribution_brute import brute_uniform_idx
from nanoCAT.bde.dissociate_xyn import dissociate_ligand
from nanoCAT.bde.identify_surface import identify_surface_ch

if TYPE_CHECKING:
    import numpy.typing as npt
    if sys.version_info >= (3, 8):
        from typing import Literal
    else:
        from typing_extensions import Literal

    _IT1 = TypeVar("_IT1", bound=np.integer[Any])
    _IT2 = TypeVar("_IT2", bound=np.integer[Any])
    _SCT = TypeVar("_SCT", bound=np.generic)
    _NDArray = np.ndarray[Any, np.dtype[_SCT]]
    _ModeKind = Literal['uniform', 'random', 'cluster']

__all__ = ['dissociate_surface', 'row_accumulator', 'dissociate_bulk']


def _parse_idx(idx: npt.ArrayLike, ndim: int, **kwargs: Any) -> _NDArray[np.int64]:
    _idx_ar = np.array(idx, ndmin=ndim, **kwargs)
    if _idx_ar.ndim != ndim:
        raise ValueError
    elif _idx_ar.dtype.kind in 'USO':
        return _idx_ar.astype(np.int64)
    else:
        return _idx_ar.astype(np.int64, casting='same_kind')


[docs]def dissociate_surface(mol: Molecule, idx: npt.ArrayLike, symbol: str = 'Cl', lig_count: int = 1, k: int = 4, displacement_factor: float = 0.5, **kwargs: Any) -> Generator[Molecule, None, None]: r"""A workflow for dissociating :math:`(XY_{n})_{\le m}` compounds from the surface of **mol**. The workflow consists of four distinct steps: 1. Identify which atoms :math:`Y`, as specified by **symbol**, are located on the surface of **mol**. 2. Identify which surface atoms are neighbors of :math:`X`, the latter being defined by **idx**. 3. Identify which pairs of :math:`n*m` neighboring surface atoms are furthest removed from each other. :math:`n` is defined by **lig_count** and :math:`m`, if applicable, by the index along axis 1 of **idx**. 4. Yield :math:`(XY_{n})_{\le m}` molecules constructed from **mol**. Note ---- The indices supplied in **idx** will, when applicable, be sorted along its last axis. Examples -------- .. code:: python >>> from pathlib import Path >>> import numpy as np >>> from scm.plams import Molecule >>> from CAT.recipes import dissociate_surface, row_accumulator >>> base_path = Path(...) >>> mol = Molecule(base_path / 'mol.xyz') # The indices of, e.g., Cs-pairs >>> idx = np.array([ ... [1, 3], ... [4, 5], ... [6, 10], ... [15, 12], ... [99, 105], ... [20, 4] ... ]) # Convert 1- to 0-based indices by substracting 1 from idx >>> mol_generator = dissociate_surface(mol, idx-1, symbol='Cl', lig_count=1) # Note: The indices in idx are (always) be sorted along axis 1 >>> iterator = zip(row_accumulator(np.sort(idx, axis=1)), mol_generator) >>> for i, mol in iterator: ... mol.write(base_path / f'output{i}.xyz') Parameters ---------- mol : :class:`Molecule<scm.plams.mol.molecule.Molecule>` The input molecule. idx : array-like, dimensions: :math:`\le 2` An array of indices denoting to-be dissociated atoms (*i.e.* :math:`X`); its elements will, if applicable, be sorted along the last axis. If a 2D array is provided then all elements along axis 1 will be dissociated in a cumulative manner. :math:`m` is herein defined as the index along axis 1. symbol : :class:`str` or :class:`int` An atomic symbol or number defining the super-set of the atoms to-be dissociated in combination with **idx** (*i.e.* :math:`Y`). lig_count : :class:`int` The number of atoms specified in **symbol** to-be dissociated in combination with a single atom from **idx** (*i.e.* :math:`n`). k : :class:`int` The number of atoms specified in **symbol** which are surrounding a single atom in **idx**. Must obey the following condition: :math:`k \ge 1`. displacement_factor : :class:`float` The smoothing factor :math:`n` for constructing a convex hull; should obey :math:`0 <= n <= 1`. Represents the degree of displacement of all atoms with respect to a spherical surface; :math:`n = 1` is a complete projection while :math:`n = 0` means no displacement at all. A non-zero value is generally recomended here, as the herein utilized :class:`ConvexHull<scipy.spatial.ConvexHull>` class requires an adequate degree of surface-convexness, lest it fails to properly identify all valid surface points. \**kwargs : :data:`Any<typing.Any>` Further keyword arguments for :func:`brute_uniform_idx()<CAT.attachment.distribution_brute.brute_uniform_idx>`. Yields ------ :class:`Molecule<scm.plams.mol.molecule.Molecule>` Yields new :math:`(XY_{n})_{m}`-dissociated molecules. See Also -------- :func:`brute_uniform_idx()<CAT.attachment.distribution_brute.brute_uniform_idx>` Brute force approach to creating uniform or clustered distributions. :func:`identify_surface()<nanoCAT.bde.identify_surface.identify_surface>` Take a molecule and identify which atoms are located on the surface, rather than in the bulk. :func:`identify_surface_ch()<nanoCAT.bde.identify_surface.identify_surface_ch>` Identify the surface of a molecule using a convex hull-based approach. :func:`dissociate_ligand()<nanoCAT.bde.dissociate_xyn.dissociate_ligand>` Remove :math:`XY_{n}` from **mol** with the help of the :class:`MolDissociater<nanoCAT.bde.dissociate_xyn.MolDissociater>` class. """ idx_ar = _parse_idx(idx, ndim=2, copy=False) if idx_ar.ndim > 2: raise ValueError("'idx' expected a 2D array-like object; " f"observed number of dimensions: {idx_ar.ndim}") idx_ar.sort(axis=1) idx_ar = idx_ar[:, ::-1] # Identify all atoms in **idx** located on the surface idx_surface_superset = _get_surface( mol, symbol=symbol, displacement_factor=displacement_factor ) # Construct an array with the indices of opposing surface-atoms n = lig_count * idx_ar.shape[1] idx_surface = _get_opposite_neighbor( np.asarray(mol), idx_ar, idx_surface_superset, n=n, k=k, **kwargs ) # Dissociate and yield new molecules idx_ar += 1 idx_surface += 1 for idx_pair, idx_pair_surface in zip(idx_ar, idx_surface): mol_tmp = mol.copy() _mark_atoms(mol_tmp, idx_pair_surface) for i in idx_pair: mol_tmp = next(dissociate_ligand(mol_tmp, lig_count=lig_count, core_index=i, lig_core_pairs=1, **kwargs)) yield mol_tmp
[docs]def row_accumulator(iterable: Iterable[Iterable[object]]) -> Generator[str, None, None]: """Return a generator which accumulates elements along the nested elements of **iterable**. Examples -------- .. code:: python >>> iterable = [[1, 3], ... [4, 5], ... [6, 10]] >>> for i in row_accumulator(iterable): ... print(repr(i)) '_1' '_1_3' '_4' '_4_5' '_6' '_6_10' Parameters ---------- iterable : :class:`Iterable<collections.abc.Iterable>` [:class:`Iterable<collections.abc.Iterable>` [:data:`Any<typing.Any>`]] A nested iterable. Yields ------ :class:`str` The accumulated nested elements of **iterable** as strings. """ # noqa for i in iterable: ret = '' for j in i: ret += f'_{j}' yield ret
[docs]def dissociate_bulk( mol: Molecule, symbol_x: str, symbol_y: None | str = None, count_x: int = 1, count_y: int = 1, n_pairs: int = 1, k: None | int = 4, r_max: None | float = None, mode: _ModeKind = 'uniform', **kwargs: Any, ) -> Molecule: r"""A workflow for removing :math:`XY`-based compounds from the bulk of **mol**. Examples -------- .. code-block:: python >>> from scm.plams import Molecule >>> from CAT.recipes import dissociate_bulk >>> mol: Molecule = ... # Remove two PbBr2 pairs in a system where # each lead atom is surrounded by 6 bromides >>> mol_out1 = dissociate_bulk( ... mol, symbol_x="Pb", symbol_y="Br", count_y=2, n_pairs=2, k=6 ... ) # The same as before, expect all potential bromides are # identified based on a radius, rather than a fixed number >>> mol_out2 = dissociate_bulk( ... mol, symbol_x="Pb", symbol_y="Br", count_y=2, n_pairs=2, r_max=5.0 ... ) # Convert a fraction to a number of pairs >>> f = 0.5 >>> count_x = 2 >>> symbol_x = "Pb" >>> n_pairs = int(f * sum(at.symbol == symbol_x for at in mol) / count_x) >>> mol_out3 = dissociate_bulk( ... mol, symbol_x="Pb", symbol_y="Br", count_y=2, n_pairs=n_pairs, k=6 ... ) Parameters ---------- mol : :class:`~scm.plams.mol.molecule.Molecule` The input molecule. symbol_x : :class:`str` or :class:`int` The atomic symbol or number of the central (to-be dissociated) atom(s) :math:`X`. symbol_y : :class:`str` or :class:`int`, optional The atomic symbol or number of the surrounding (to-be dissociated) atom(s) :math:`Y`. If :data:`None`, do not dissociate any atoms :math:`Y`. count_x : :class:`int` The number of central atoms :math:`X` per individual to-be dissociated cluster. count_y : :class:`int` The number of surrounding atoms :math:`Y` per individual to-be dissociated cluster. n_pairs : :class:`int` The number of to-be removed :math:`XY` fragments. k : :class:`int`, optional The total number of :math:`Y` candidates surrounding each atom :math:`X`. This value should be smaller than or equal to **count_y**. See the **r_max** parameter for a radius-based approach; note that both parameters are not mutually exclusive. r_max : :class:`int`, optional The radius used for searching for :math:`Y` candidates surrounding each atom :math:`X`. See **k** parameter to use a fixed number of nearest neighbors; note that both parameters are not mutually exclusive. mode : :class:`str` How the subset of to-be removed atoms :math:`X` should be generated. Accepts one of the following values: * ``"random"``: A random distribution. * ``"uniform"``: A uniform distribution; the distance between each successive atom and all previous points is maximized. * ``"cluster"``: A clustered distribution; the distance between each successive atom and all previous points is minmized. Keyword Arguments ----------------- \**kwargs : :data:`~typing.Any` Further keyword arguments for :func:`CAT.distribution.distribute_idx`. Returns ------- :class:`~scm.plams.mol.molecule.Molecule` The molecule with :math:`n_{\text{pair}} * XY` fragments removed. """ if n_pairs < 0: raise ValueError("`n_pairs` must be larger than or equal to 0; " f"observed value: {n_pairs!r}") elif n_pairs == 0: return mol.copy() # Validate the input args if count_x <= 0: raise ValueError("`count_x` expected a an integer larger than 0; " f"observed value: {count_x!r}") elif count_y < 0: raise ValueError("`count_y` expected a an integer larger than 0; " f"observed value: {count_y!r}") elif count_y == 0: symbol_y = None # Parse `symbol_x` atnum_x = to_atnum(symbol_x) idx_x = np.fromiter([i for i, at in enumerate(mol) if at.atnum == atnum_x], dtype=np.intp) if len(idx_x) == 0: raise MoleculeError(f"No atoms {to_symbol(atnum_x)!r} in {mol.get_formula()}") # Parse `symbol_y` if symbol_y is not None: atnum_y = to_atnum(symbol_y) idx_y = np.fromiter([i for i, at in enumerate(mol) if at.atnum == atnum_y], dtype=np.intp) if len(idx_y) == 0: raise MoleculeError(f"No atoms {to_symbol(atnum_y)!r} in {mol.get_formula()}") # Parse `k` and `r_max` if k is None and r_max is None: raise TypeError("`k` and `r_max` cannot be both set to None") if k is None: k = cast(int, len(idx_y)) if r_max is None: r_max = cast(float, np.inf) # Validate `n_pairs` if (n_pairs * count_x) > len(idx_x): x = n_pairs * count_x raise ValueError( f"Insufficient atoms {to_symbol(atnum_x)!r} in {mol.get_formula()}; " f"atleast {x} required with `n_pairs={n_pairs!r}` and `count_x={count_x!r}" ) elif symbol_y is not None and (n_pairs * count_y) > len(idx_y): y = n_pairs * count_y raise ValueError( f"Insufficient atoms {to_symbol(atnum_y)!r} in {mol.get_formula()}; " f"atleast {y} required with `n_pairs={n_pairs!r}` and `count_y={count_y!r}" ) # Identify a subset of atoms `x` if count_x != 1: if "cluster_size" in kwargs: raise TypeError("Specifying both `cluster_size` and `count_x` is not supported") kwargs["cluster_size"] = count_x idx_x_sorted = distribute_idx(mol, idx_x, f=1, **kwargs) idx_x_sorted.shape = -1, count_x if symbol_y is not None: # Identify a subset of matching atoms `y` idx_x_subset, idx_y_subset, = _get_opposite_neighbor2( np.asarray(mol), idx_x_sorted, idx_y, n_pairs, n=count_y, k=k, r_max=r_max ) # Concatenate the subsets idx_xy_subset = np.concatenate([idx_x_subset.ravel(), idx_y_subset.ravel()]) else: idx_xy_subset = idx_x[:(n_pairs * count_x)] idx_xy_subset.sort() idx_xy_subset += 1 # Create and return the new molecule ret = mol.copy() atom_set = {ret[i] for i in idx_xy_subset} for at in atom_set: ret.delete_atom(at) return ret
def _get_opposite_neighbor( mol: _NDArray[np.number], idx_center: _NDArray[np.integer[Any]], idx_neighbor: _NDArray[_IT1], k: int = 4, n: int = 2, **kwargs: Any, ) -> _NDArray[_IT1]: """Identify the **k** nearest neighbors of **idx_center** and return those furthest removed from each other.""" # noqa # Indices of the **k** nearest neighbors in **neighbor** with respect to **center** xyz1 = mol[idx_neighbor] xyz2 = mol[idx_center.ravel()] try: idx_nn = idx_neighbor[get_nearest_neighbors(xyz2, xyz1, k=k)] except IndexError: raise ValueError("'k' should be smaller than the total number of surface atoms " f"(len(idx_neighbor)); observed value: {k!r}") from None idx_nn.shape = -1, idx_nn.shape[1] * idx_center.shape[1] # Find the **n** atoms in **idx_nn** furthest removed from each other return brute_uniform_idx(mol, idx_nn, n=n, **kwargs) def _get_opposite_neighbor2( mol: _NDArray[np.number], idx_center: _NDArray[_IT1], idx_neighbor: _NDArray[_IT2], n_pairs: int, k: int = 4, n: int = 2, r_max: float = np.inf, **kwargs: Any, ) -> Tuple[_NDArray[_IT1], _NDArray[_IT2]]: """Identify the **k** nearest neighbors of **idx_center** and return those furthest removed from each other.""" # noqa # Indices of the **k** nearest neighbors in **neighbor** with respect to **center** xyz1 = mol[idx_neighbor] xyz2 = mol[idx_center.ravel()] idx = get_nearest_neighbors(xyz2, xyz1, k=k, distance_upper_bound=r_max) if len(idx_neighbor) in idx: is_overflow = (idx != len(idx_neighbor)).sum(dtype=np.int64, axis=1) < n if np.count_nonzero(~is_overflow) < n_pairs: raise ValueError("Insufficient number of `XY` pairs with " f"`k={k!r}` and `r_max={r_max!r}`") i_ar = np.arange(len(idx), dtype=np.intp)[~is_overflow][:n_pairs] i: np.intp = idx[i_ar].argmax(axis=1).min() idx = idx[i_ar][:, :i] idx_center_subset = idx_center[i_ar] else: idx = idx[:n_pairs] idx_center_subset = idx_center[:n_pairs] idx_nn = idx_neighbor[idx] # Find the **n** atoms in **idx_nn** furthest removed from each other idx_neighbor_subset = brute_uniform_idx(mol, idx_nn, n=n, **kwargs) return idx_center_subset, idx_neighbor_subset def _get_surface( mol: Molecule, symbol: str, displacement_factor: float = 0.5, ) -> _NDArray[np.int64]: """Return the indices of all atoms, whose atomic symbol is equal to **atom_symbol**, located on the surface.""" # noqa # Identify all atom with atomic symbol **atom_symbol** atnum = to_atnum(symbol) idx = np.array([i for i, atom in enumerate(mol) if atom.atnum == atnum], dtype=np.int64) xyz = np.asarray(mol) # Identify all atoms on the surface try: return idx[identify_surface_ch(xyz[idx], n=displacement_factor)] except ValueError as ex: raise MoleculeError(f"No atoms with atomic symbol/number {repr(symbol)} available in " f"{mol.get_formula()!r}") from ex def _mark_atoms(mol: Molecule, idx: Iterable[int]) -> None: """Mark all atoms in **mol** whose index is in **idx**; indices should be 1-based.""" for i in idx: mol[i].properties.anchor = True