Source code for dataCAT.pdb_array

"""A module for constructing array-representations of .pdb files.

Index
-----
.. currentmodule:: dataCAT
.. autosummary::
    PDBContainer
    PDBContainer.atoms
    PDBContainer.bonds
    PDBContainer.atom_count
    PDBContainer.bond_count
    PDBContainer.scale

.. autosummary::
    PDBContainer.__init__
    PDBContainer.__getitem__
    PDBContainer.__len__
    PDBContainer.keys
    PDBContainer.values
    PDBContainer.items
    PDBContainer.concatenate

.. autosummary::
    PDBContainer.from_molecules
    PDBContainer.to_molecules
    PDBContainer.to_rdkit
    PDBContainer.create_hdf5_group
    PDBContainer.validate_hdf5
    PDBContainer.from_hdf5
    PDBContainer.to_hdf5

.. autosummary::
    PDBContainer.intersection
    PDBContainer.difference
    PDBContainer.symmetric_difference
    PDBContainer.union


API
---
.. autoclass:: PDBContainer
    :members: atoms, bonds, atom_count, bond_count, scale, __init__

API: Miscellaneous Methods
--------------------------
.. automethod:: PDBContainer.__getitem__
.. automethod:: PDBContainer.__len__
.. automethod:: PDBContainer.keys
.. automethod:: PDBContainer.values
.. automethod:: PDBContainer.items
.. automethod:: PDBContainer.concatenate

API: Object Interconversion
---------------------------
.. automethod:: PDBContainer.from_molecules
.. automethod:: PDBContainer.to_molecules
.. automethod:: PDBContainer.to_rdkit
.. automethod:: PDBContainer.create_hdf5_group
.. automethod:: PDBContainer.validate_hdf5
.. automethod:: PDBContainer.from_hdf5
.. automethod:: PDBContainer.to_hdf5

API: Set Operations
-------------------
.. automethod:: PDBContainer.intersection
.. automethod:: PDBContainer.difference
.. automethod:: PDBContainer.symmetric_difference
.. automethod:: PDBContainer.union

"""  # noqa: E501

from __future__ import annotations

import textwrap
from types import MappingProxyType
from itertools import repeat, chain
from typing import (
    List, Iterable, Union, Type, TypeVar, Optional, Dict, Any,
    overload, Sequence, Mapping, Tuple, Generator, ClassVar, TYPE_CHECKING
)

import h5py
import numpy as np
from scm.plams import Molecule, Atom, Bond
from rdkit import Chem, Geometry
from nanoutils import SupportsIndex, TypedDict, Literal
from assertionlib import assertion

from .dtype import ATOMS_DTYPE, BONDS_DTYPE, ATOM_COUNT_DTYPE, BOND_COUNT_DTYPE, BACKUP_IDX_DTYPE
from .functions import int_to_slice, if_exception

if TYPE_CHECKING:
    from numpy.typing import ArrayLike, DTypeLike

__all__ = ['PDBContainer']

ST = TypeVar('ST', bound='PDBContainer')

_AtomTuple = Tuple[
    bool,  # IsHeteroAtom
    int,  # SerialNumber
    str,  # Name
    str,  # ResidueName
    str,  # ChainId
    int,  # ResidueNumber
    float,  # x
    float,  # y
    float,  # z
    float,  # Occupancy
    float,  # TempFactor
    str,  # symbol
    int,  # charge
    float  # charge_float
]

_BondTuple = Tuple[
    int,  # atom1
    int,  # atom2
    int,  # order
]

_ReduceTuple = Tuple[
    np.recarray,  # atoms
    np.recarray,  # bonds
    np.ndarray,  # atom_count
    np.ndarray,  # bond_count
    np.recarray,  # scale
    Literal[False]  # validate
]

_Coords = Tuple[float, float, float]

_ResInfo = Tuple[
    str,  # Name,
    int,  # SerialNumber,
    str,  # AltLoc
    str,  # ResidueName
    int,  # ResidueNumber
    str,  # ChainId
    str,  # InsertionCode
    float,  # Occupancy
    float,  # TempFactor
    bool,  # IsHeteroAtom
]
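
# NOTE: the field order of ``_ResInfo`` is expected to mirror the positional
# arguments of the ``Chem.AtomPDBResidueInfo`` constructor, which consumes
# these tuples via ``Chem.AtomPDBResidueInfo(*res_info)`` in ``_rec_to_rdkit``.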


class _PDBInfo(TypedDict):
    IsHeteroAtom: bool
    SerialNumber: int
    Name: str
    ResidueName: str
    ChainId: str
    ResidueNumber: int
    Occupancy: float
    TempFactor: float


class _Properties(TypedDict):
    charge: int
    charge_float: float
    pdb_info: _PDBInfo
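
# A sketch of the nested structure described by the two TypedDicts above,
# as it appears in ``Atom.properties`` (all values are illustrative):
#
#   {'charge': 0,
#    'charge_float': 0.0,
#    'pdb_info': {'IsHeteroAtom': False, 'SerialNumber': 1, 'Name': 'C',
#                 'ResidueName': 'LIG', 'ChainId': 'A', 'ResidueNumber': 1,
#                 'Occupancy': 1.0, 'TempFactor': 0.0}}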


def _get_atom_info(at: Atom, i: int) -> _AtomTuple:
    """Helper function for :meth:`PDBContainer.from_molecules`: create a tuple representing a single :attr:`PDBContainer.atoms` row."""  # noqa: E501
    prop = at.properties
    symbol = at.symbol
    charge = prop.get('charge', 0)

    pdb = prop.get('pdb_info', {})
    return (
        pdb.get('IsHeteroAtom', False),  # type: ignore
        pdb.get('SerialNumber', i),
        pdb.get('Name', symbol),
        pdb.get('ResidueName', 'LIG'),
        pdb.get('ChainId', 'A'),
        pdb.get('ResidueNumber', 1),
        *at.coords,
        pdb.get('Occupancy', 1.0),
        pdb.get('TempFactor', 0.0),
        symbol,
        charge,
        prop.get('charge_float', charge)
    )
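
# A minimal sketch of the fallback behaviour of ``_get_atom_info``: for an
# atom without any ``pdb_info``, every PDB field is filled with the defaults
# shown above (hypothetical session):
#
#   >>> at = Atom(symbol='C', coords=(0.0, 0.0, 0.0))
#   >>> _get_atom_info(at, 1)[:6]
#   (False, 1, 'C', 'LIG', 'A', 1)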


def _get_bond_info(mol: Molecule) -> List[_BondTuple]:
    """Helper function for :meth:`PDBContainer.from_molecules`: create a tuple representing a single :attr:`PDBContainer.bonds` row.

    Note that the atomic indices are 1-based.
    """  # noqa: E501
    mol.set_atoms_id(start=1)
    ret = [(b.atom1.id, b.atom2.id, b.order) for b in mol.bonds]
    mol.unset_atoms_id()
    return ret
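
# Note the 1-based convention of ``_get_bond_info``: a two-atom molecule with
# a single bond yields ``[(1, 2, 1)]`` rather than the 0-based ``[(0, 1, 1)]``,
# which keeps the zero-filled padding rows of :attr:`PDBContainer.bonds`
# unambiguous.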


def _iter_rec_plams(
    atom_array: np.recarray,
) -> Generator[Tuple[_Properties, _Coords, str], None, None]:
    """Helper function for :func:`_rec_to_mol`: create an iterator yielding atom properties and attributes."""  # noqa: E501
    for ar in atom_array:
        IsHeteroAtom, SerialNumber, Name, ResidueName, ChainId, ResidueNumber, x, y, z, Occupancy, TempFactor, symbol, charge, charge_float = ar.item()  # noqa: E501
        _pdb_info = {
            'IsHeteroAtom': IsHeteroAtom,
            'SerialNumber': SerialNumber,
            'Name': Name.decode(),
            'ResidueName': ResidueName.decode(),
            'ChainId': ChainId.decode(),
            'ResidueNumber': ResidueNumber,
            'Occupancy': Occupancy,
            'TempFactor': TempFactor
        }

        properties = {
            'charge': charge,
            'charge_float': charge_float,
            'pdb_info': _pdb_info
        }
        yield properties, (x, y, z), symbol.decode()  # type: ignore


def _rec_to_plams(
    atom_array: np.recarray,
    bond_array: np.recarray,
    atom_len: None | int = None,
    bond_len: None | int = None,
    mol: None | Molecule = None,
) -> Molecule:
    """Helper function for :meth:`PDBContainer.to_molecules`: update/create a single plams molecule from the passed **atom_array** and **bond_array**."""  # noqa: E501
    if mol is None:
        ret = Molecule()
        for _ in range(len(atom_array[:atom_len])):
            ret.add_atom(Atom(mol=ret))
    else:
        ret = mol

    iterator = _iter_rec_plams(atom_array[:atom_len])
    for atom, (properties, coords, symbol) in zip(ret, iterator):
        atom.coords = coords
        atom.symbol = symbol
        atom.properties.update(properties)

    if ret.bonds:
        ret.delete_all_bonds()
    for i, j, order in bond_array[:bond_len]:
        bond = Bond(atom1=ret[i], atom2=ret[j], order=order, mol=ret)
        ret.add_bond(bond)
    return ret


def _iter_rec_rdkit(
    atom_array: np.recarray,
) -> Generator[Tuple[str, int, _ResInfo], None, None]:
    """Helper function for :func:`_rec_to_mol`: create an iterator yielding atom properties and attributes."""  # noqa: E501
    for ar in atom_array:
        IsHeteroAtom, SerialNumber, Name, ResidueName, ChainId, ResidueNumber, x, y, z, Occupancy, TempFactor, symbol, charge, charge_float = ar.item()  # noqa: E501
        res_info = (
            Name,
            SerialNumber,
            "",
            ResidueName,
            ResidueNumber,
            ChainId,
            "",
            Occupancy,
            TempFactor,
            IsHeteroAtom,
        )
        yield symbol, charge, res_info


def _rec_to_rdkit(
    atom_array: np.recarray,
    bond_array: np.recarray,
    atom_len: None | int = None,
    bond_len: None | int = None,
    sanitize: bool = True,
) -> Chem.Mol:
    """Helper function for :meth:`PDBContainer.to_rdkit`: create a single rdkit molecule from the passed **atom_array** and **bond_array**."""  # noqa: E501
    edit_mol = Chem.EditableMol(Chem.Mol())

    iterator1 = _iter_rec_rdkit(atom_array[:atom_len])
    for symbol, charge, res_info in iterator1:
        atom = Chem.Atom(symbol)
        atom.SetFormalCharge(charge)
        atom.SetMonomerInfo(Chem.AtomPDBResidueInfo(*res_info))
        edit_mol.AddAtom(atom)

    for void in bond_array[:bond_len]:
        i, j, order = void.item()
        edit_mol.AddBond(i - 1, j - 1, Chem.BondType(order))

    mol = edit_mol.GetMol()
    if sanitize:
        Chem.SanitizeMol(mol)

    conf = Chem.Conformer()
    iterator2 = (void.item()[6:9] for void in atom_array[:atom_len])
    for i, xyz in enumerate(iterator2):
        conf.SetAtomPosition(i, Geometry.Point3D(*xyz))
    mol.AddConformer(conf)
    return mol
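
# Note that ``_rec_to_rdkit`` converts the 1-based atom indices stored in the
# bond array back to the 0-based indices expected by RDKit (hence the
# ``i - 1`` and ``j - 1`` passed to ``AddBond`` above).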


IndexLike = Union[None, SupportsIndex, Sequence[int], slice, np.ndarray]
Hdf5Mode = Literal['append', 'update']
_AttrName = Literal['atoms', 'bonds', 'atom_count', 'bond_count', 'scale']
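
# ``IndexLike`` covers every form of index accepted by the slicing-based
# methods below; illustrative examples: ``None`` (no slicing), ``5``,
# ``[0, 2, 4]``, ``slice(None, 10)`` or ``np.array([0, 2, 4])``.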

#: The docstring to-be assigned by :meth:`PDBContainer.create_hdf5_group`.
HDF5_DOCSTRING = """A Group of datasets representing :class:`{cls_name}`."""


class PDBContainer:
    """An (immutable) class for holding array-like representations of a set of .pdb files.

    The :class:`PDBContainer` class serves as an (intermediate) container for
    storing .pdb files in the hdf5 format, thus facilitating the storage and
    interconversion between PLAMS molecules and the :mod:`h5py` interface.

    The methods implemented in this class can roughly be divided into three categories:

    * Molecule-interconversion: :meth:`~PDBContainer.to_molecules`,
      :meth:`~PDBContainer.from_molecules` & :meth:`~PDBContainer.to_rdkit`.
    * hdf5-interconversion: :meth:`~PDBContainer.create_hdf5_group`,
      :meth:`~PDBContainer.validate_hdf5`, :meth:`~PDBContainer.to_hdf5` &
      :meth:`~PDBContainer.from_hdf5`.
    * Miscellaneous: :meth:`~PDBContainer.keys`, :meth:`~PDBContainer.values`,
      :meth:`~PDBContainer.items`, :meth:`~PDBContainer.__getitem__` &
      :meth:`~PDBContainer.__len__`.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from dataCAT.testing_utils import (
        ...     MOL_TUPLE as mol_list,
        ...     PDB as pdb,
        ...     HDF5_TMP as hdf5_file
        ... )

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    .. code:: python

        >>> import h5py
        >>> from scm.plams import readpdb
        >>> from dataCAT import PDBContainer

        >>> mol_list = [readpdb(...), ...]  # doctest: +SKIP

        >>> pdb = PDBContainer.from_molecules(mol_list)
        >>> print(pdb)
        PDBContainer(
            atoms      = numpy.recarray(..., shape=(23, 76), dtype=...),
            bonds      = numpy.recarray(..., shape=(23, 75), dtype=...),
            atom_count = numpy.ndarray(..., shape=(23,), dtype=int32),
            bond_count = numpy.ndarray(..., shape=(23,), dtype=int32),
            scale      = numpy.recarray(..., shape=(23,), dtype=...)
        )

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'a') as f:
        ...     group = pdb.create_hdf5_group(f, name='ligand')
        ...     pdb.to_hdf5(group, None)
        ...
        ...     print('group', '=', group)
        ...     for name, dset in group.items():
        ...         print(f'group[{name!r}]', '=', dset)
        group = <HDF5 group "/ligand" (5 members)>
        group['atoms'] = <HDF5 dataset "atoms": shape (23, 76), type "|V46">
        group['bonds'] = <HDF5 dataset "bonds": shape (23, 75), type "|V9">
        group['atom_count'] = <HDF5 dataset "atom_count": shape (23,), type "<i4">
        group['bond_count'] = <HDF5 dataset "bond_count": shape (23,), type "<i4">
        group['index'] = <HDF5 dataset "index": shape (23,), type "<i4">

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    """

    __slots__ = (
        '__weakref__', '_hash', '_atoms', '_bonds',
        '_atom_count', '_bond_count', '_scale'
    )

    #: A mapping holding the dimensionality of each array embedded within this class.
    NDIM: ClassVar[Mapping[_AttrName, int]] = MappingProxyType({
        'atoms': 2,
        'bonds': 2,
        'atom_count': 1,
        'bond_count': 1,
        'scale': 1
    })

    #: A mapping holding the dtype of each array embedded within this class.
    DTYPE: ClassVar[Mapping[_AttrName, np.dtype]] = MappingProxyType({
        'atoms': ATOMS_DTYPE,
        'bonds': BONDS_DTYPE,
        'atom_count': ATOM_COUNT_DTYPE,
        'bond_count': BOND_COUNT_DTYPE,
        'scale': BACKUP_IDX_DTYPE
    })

    #: The name of the h5py dimensional scale.
    SCALE_NAME: ClassVar[str] = 'index'

    @property
    def atoms(self) -> np.recarray:
        """:class:`numpy.recarray`, shape :math:`(n, m)`: Get a read-only padded recarray for keeping track of all atom-related information.

        See :data:`dataCAT.dtype.ATOMS_DTYPE` for a comprehensive overview of
        all field names and dtypes.

        """  # noqa: E501
        return self._atoms

    @property
    def bonds(self) -> np.recarray:
        """:class:`numpy.recarray`, shape :math:`(n, k)`: Get a read-only padded recarray for keeping track of all bond-related information.

        Note that all atomic indices are 1-based.

        See :data:`dataCAT.dtype.BONDS_DTYPE` for a comprehensive overview of
        all field names and dtypes.

        """  # noqa: E501
        return self._bonds

    @property
    def atom_count(self) -> np.ndarray:
        """:class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)`: Get a read-only ndarray for keeping track of the number of atoms in each molecule in :attr:`~PDBContainer.atoms`."""  # noqa: E501
        return self._atom_count

    @property
    def bond_count(self) -> np.ndarray:
        """:class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)`: Get a read-only ndarray for keeping track of the number of bonds in each molecule in :attr:`~PDBContainer.bonds`."""  # noqa: E501
        return self._bond_count

    @property
    def scale(self) -> np.recarray:
        """:class:`numpy.recarray`, shape :math:`(n,)`: Get a recarray representing an index.

        Used as dimensional scale in the h5py Group.

        """  # noqa: E501
        return self._scale

    @overload
    def __init__(self, atoms: np.recarray, bonds: np.recarray,
                 atom_count: np.ndarray, bond_count: np.ndarray,
                 scale: np.recarray, validate: Literal[False]) -> None:
        ...
    @overload  # noqa: E301
    def __init__(self, atoms: ArrayLike, bonds: ArrayLike,
                 atom_count: ArrayLike, bond_count: ArrayLike,
                 scale: Optional[ArrayLike] = None,
                 validate: Literal[True] = ..., copy: bool = ...) -> None:
        ...
    def __init__(self, atoms, bonds, atom_count, bond_count,
                 scale=None, validate=True, copy=True):  # noqa: E301
        """Initialize an instance.

        Parameters
        ----------
        atoms : :class:`numpy.recarray`, shape :math:`(n, m)`
            A padded recarray for keeping track of all atom-related information.
            See :attr:`PDBContainer.atoms`.
        bonds : :class:`numpy.recarray`, shape :math:`(n, k)`
            A padded recarray for keeping track of all bond-related information.
            See :attr:`PDBContainer.bonds`.
        atom_count : :class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)`
            An ndarray for keeping track of the number of atoms in each molecule in **atoms**.
            See :attr:`PDBContainer.atom_count`.
        bond_count : :class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)`
            An ndarray for keeping track of the number of bonds in each molecule in **bonds**.
            See :attr:`PDBContainer.bond_count`.
        scale : :class:`numpy.recarray`, shape :math:`(n,)`, optional
            A recarray representing an index.
            If :data:`None`, use a simple numerical index (*i.e.* :func:`numpy.arange`).
            See :attr:`PDBContainer.scale`.

        Keyword Arguments
        -----------------
        validate : :class:`bool`
            If :data:`True`, perform a more thorough validation of the input arrays.
            Note that this also allows the parameters to be passed as array-like
            objects in addition to the aforementioned :class:`~numpy.ndarray` or
            :class:`~numpy.recarray` instances.
        copy : :class:`bool`
            If :data:`True`, store copies of the passed arrays.
            Only relevant if :data:`validate = True<True>`.

        :rtype: :data:`None`

        """
        if validate:
            cls = type(self)
            rec_set = {'atoms', 'bonds'}
            items = [
                ('atoms', atoms),
                ('bonds', bonds),
                ('atom_count', atom_count),
                ('bond_count', bond_count)
            ]
            for name, _array in items:
                ndmin = cls.NDIM[name]
                dtype = cls.DTYPE[name]
                array = np.array(_array, dtype=dtype, ndmin=ndmin, copy=copy)
                if name in rec_set:
                    array = array.view(np.recarray)
                setattr(self, f'_{name}', array)

            if scale is not None:
                self._scale: np.recarray = np.array(scale, ndmin=1, copy=copy).view(np.recarray)
            else:
                dtype = cls.DTYPE['scale']
                self._scale = np.arange(len(array), dtype=dtype).view(np.recarray)

            len_set = {len(ar) for ar in self.values()}
            if len(len_set) != 1:
                raise ValueError("All passed arrays should be of the same length")

        # Assume the input does not have to be parsed
        else:
            self._atoms: np.recarray = atoms
            self._bonds: np.recarray = bonds
            self._atom_count: np.ndarray = atom_count
            self._bond_count: np.ndarray = bond_count
            self._scale: np.recarray = scale

        for ar in self.values():
            ar.setflags(write=False)
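
    # A minimal construction sketch (hypothetical arrays): the ``validate=True``
    # path accepts arbitrary array-likes and casts them to the dtypes listed in
    # :attr:`PDBContainer.DTYPE`, whereas ``validate=False`` stores the
    # (presumably already-parsed) arrays as-is:
    #
    #   >>> count = np.arange(1, 4)
    #   >>> atoms = np.zeros((3, 3), dtype=PDBContainer.DTYPE['atoms'])
    #   >>> bonds = np.zeros((3, 2), dtype=PDBContainer.DTYPE['bonds'])
    #   >>> pdb = PDBContainer(atoms, bonds, count, count)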

    def __repr__(self) -> str:
        """Implement :class:`str(self)<str>` and :func:`repr(self)<repr>`."""
        width = max(len(k) for k in self.keys())

        def _str(k, v):
            if isinstance(v, np.recarray):
                dtype = '...'
            else:
                dtype = str(v.dtype)
            return (f'{k:{width}} = {v.__class__.__module__}.{v.__class__.__name__}'
                    f'(..., shape={v.shape}, dtype={dtype})')

        ret = ',\n'.join(_str(k, v) for k, v in self.items())
        indent = 4 * ' '
        return f'{self.__class__.__name__}(\n{textwrap.indent(ret, indent)}\n)'

    def __reduce__(self: ST) -> Tuple[Type[ST], _ReduceTuple]:
        """Helper for :mod:`pickle`."""
        cls = type(self)
        return cls, (*self.values(), False)  # type: ignore

    def __copy__(self: ST) -> ST:
        """Implement :func:`copy.copy(self)<copy.copy>`."""
        return self  # self is immutable

    def __deepcopy__(self: ST, memo: Optional[Dict[int, Any]] = None) -> ST:
        """Implement :func:`copy.deepcopy(self, memo=memo)<copy.deepcopy>`."""
        return self  # self is immutable

    def __len__(self) -> int:
        """Implement :func:`len(self)<len>`.

        Returns
        -------
        :class:`int`
            Returns the length of the arrays embedded within this instance
            (which are all of the same length).

        """
        return len(self.atom_count)

    def __eq__(self, value: object) -> bool:
        """Implement :meth:`self == value<object.__eq__>`."""
        if type(self) is not type(value):
            return False
        elif hash(self) != hash(value):
            return False

        iterator = ((v, getattr(value, k)) for k, v in self.items())
        return all(np.all(ar1 == ar2) for ar1, ar2 in iterator)

    def __hash__(self) -> int:
        """Implement :func:`hash(self)<hash>`."""
        try:
            return self._hash
        except AttributeError:
            args = []

            # The hash of each individual array consists of its shape appended
            # with the array's first and last element along axis 0
            for ar in self.values():
                if not len(ar):
                    first_and_last: Tuple[Any, ...] = ()
                else:
                    first = ar[0] if ar.ndim == 1 else ar[0, 0]
                    last = ar[-1] if ar.ndim == 1 else ar[-1, 0]
                    first_and_last = (first, last)
                args.append(ar.shape + first_and_last)

            cls = type(self)
            self._hash: int = hash((cls, tuple(args)))
            return self._hash
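
    # NOTE: the hash above is deliberately cheap: rather than hashing the full
    # array contents, it combines each array's shape with its first and last
    # element along axis 0. Containers differing only in their interior
    # elements can thus collide, which is why ``__eq__`` still performs a full
    # element-wise comparison after its initial hash-based check.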

    def __getitem__(self: ST, index: IndexLike) -> ST:
        """Implement :meth:`self[index]<object.__getitem__>`.

        Constructs a new :class:`PDBContainer` instance by slicing all arrays with **index**.
        Follows the standard NumPy broadcasting rules:
        if an integer or slice is passed then a shallow copy is returned;
        otherwise a deep copy will be created.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> print(pdb)
            PDBContainer(
                atoms      = numpy.recarray(..., shape=(23, 76), dtype=...),
                bonds      = numpy.recarray(..., shape=(23, 75), dtype=...),
                atom_count = numpy.ndarray(..., shape=(23,), dtype=int32),
                bond_count = numpy.ndarray(..., shape=(23,), dtype=int32),
                scale      = numpy.recarray(..., shape=(23,), dtype=...)
            )

            >>> pdb[0]
            PDBContainer(
                atoms      = numpy.recarray(..., shape=(1, 76), dtype=...),
                bonds      = numpy.recarray(..., shape=(1, 75), dtype=...),
                atom_count = numpy.ndarray(..., shape=(1,), dtype=int32),
                bond_count = numpy.ndarray(..., shape=(1,), dtype=int32),
                scale      = numpy.recarray(..., shape=(1,), dtype=...)
            )

            >>> pdb[:10]
            PDBContainer(
                atoms      = numpy.recarray(..., shape=(10, 76), dtype=...),
                bonds      = numpy.recarray(..., shape=(10, 75), dtype=...),
                atom_count = numpy.ndarray(..., shape=(10,), dtype=int32),
                bond_count = numpy.ndarray(..., shape=(10,), dtype=int32),
                scale      = numpy.recarray(..., shape=(10,), dtype=...)
            )

            >>> pdb[[0, 5, 7, 9, 10]]
            PDBContainer(
                atoms      = numpy.recarray(..., shape=(5, 76), dtype=...),
                bonds      = numpy.recarray(..., shape=(5, 75), dtype=...),
                atom_count = numpy.ndarray(..., shape=(5,), dtype=int32),
                bond_count = numpy.ndarray(..., shape=(5,), dtype=int32),
                scale      = numpy.recarray(..., shape=(5,), dtype=...)
            )

        Parameters
        ----------
        index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`
            An object for slicing arrays along :code:`axis=0`.

        Returns
        -------
        :class:`dataCAT.PDBContainer`
            A shallow or deep copy of a slice of this instance.

        """
        cls = type(self)
        if index is None:
            idx: Union[slice, np.ndarray] = slice(None)
        else:
            try:
                idx = int_to_slice(index, len(self))  # type: ignore
            except (AttributeError, TypeError):
                idx = np.asarray(index) if not isinstance(index, slice) else index
                assert getattr(idx, 'ndim', 1) == 1

        iterator = (ar[idx] for ar in self.values())
        ret: ST = cls(*iterator, validate=False)  # type: ignore
        ret._scale = ret.scale.view(np.recarray)
        return ret

    @classmethod
    def keys(cls) -> Generator[_AttrName, None, None]:
        """Yield the (public) attribute names in this class.

        Examples
        --------
        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> for name in PDBContainer.keys():
            ...     print(name)
            atoms
            bonds
            atom_count
            bond_count
            scale

        Yields
        ------
        :class:`str`
            The names of all attributes in this class.

        """
        return (name.strip('_') for name in cls.__slots__[2:])  # type: ignore

    def values(self) -> Generator[Union[np.ndarray, np.recarray], None, None]:
        """Yield the (public) attributes in this instance.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> for value in pdb.values():
            ...     print(object.__repr__(value))  # doctest: +ELLIPSIS
            <numpy.recarray object at ...>
            <numpy.recarray object at ...>
            <numpy.ndarray object at ...>
            <numpy.ndarray object at ...>
            <numpy.recarray object at ...>

        Yields
        ------
        :class:`numpy.ndarray` or :class:`numpy.recarray`
            The values of all attributes in this instance.

        """
        cls = type(self)
        return (getattr(self, name) for name in cls.keys())

    def items(self) -> Generator[Tuple[_AttrName, Union[np.ndarray, np.recarray]], None, None]:
        """Yield the (public) attribute name/value pairs in this instance.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> for name, value in pdb.items():
            ...     print(name, '=', object.__repr__(value))  # doctest: +ELLIPSIS
            atoms = <numpy.recarray object at ...>
            bonds = <numpy.recarray object at ...>
            atom_count = <numpy.ndarray object at ...>
            bond_count = <numpy.ndarray object at ...>
            scale = <numpy.recarray object at ...>

        Yields
        ------
        :class:`str` and :class:`numpy.ndarray` / :class:`numpy.recarray`
            The names and values of all attributes in this instance.

        """
        return ((n, getattr(self, n)) for n in self.keys())

    def concatenate(self: ST, *args: ST) -> ST:
        r"""Concatenate :math:`n` PDBContainers into a single new instance.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb1

            >>> pdb2 = pdb3 = pdb1

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb1 = PDBContainer(...)  # doctest: +SKIP
            >>> pdb2 = PDBContainer(...)  # doctest: +SKIP
            >>> pdb3 = PDBContainer(...)  # doctest: +SKIP
            >>> print(len(pdb1), len(pdb2), len(pdb3))
            23 23 23

            >>> pdb_new = pdb1.concatenate(pdb2, pdb3)
            >>> print(pdb_new)
            PDBContainer(
                atoms      = numpy.recarray(..., shape=(69, 76), dtype=...),
                bonds      = numpy.recarray(..., shape=(69, 75), dtype=...),
                atom_count = numpy.ndarray(..., shape=(69,), dtype=int32),
                bond_count = numpy.ndarray(..., shape=(69,), dtype=int32),
                scale      = numpy.recarray(..., shape=(69,), dtype=...)
            )

        Parameters
        ----------
        \*args : :class:`PDBContainer`
            One or more PDBContainers.

        Returns
        -------
        :class:`PDBContainer`
            A new PDBContainer constructed by concatenating **self** and **args**.

        """
        if not args:
            return self

        try:
            attr_list = [(k, v, [getattr(a, k) for a in args]) for k, v in self.items()]
        except AttributeError as ex:
            raise TypeError("'*args' expected one or more PDBContainer instances") from ex

        cls = type(self)
        ret_list = []
        for k, ar_self, ar_list in attr_list:
            # 'scale': a 1D recarray
            if k == 'scale':
                dtype = ar_self.dtype
                ar_list = [ar.astype(dtype, copy=False) for ar in ar_list]
                ar_new = np.concatenate((ar_self, *ar_list)).view(np.recarray)

            # 'atom_count' and 'bond_count': normal 1D arrays
            elif ar_self.ndim == 1:
                ar_new = np.concatenate((ar_self, *ar_list))

            # 'atoms' and 'bonds': padded 2D recarrays
            else:
                ax0 = len(ar_self) + sum(len(ar) for ar in ar_list)
                ax1 = max(ar_self.shape[1], *(ar.shape[1] for ar in ar_list))

                ar_new = np.zeros((ax0, ax1), dtype=cls.DTYPE[k]).view(np.recarray)
                i = 0
                for ar in chain([ar_self], ar_list):
                    j = i + len(ar)
                    ar_new[i:j, :ar.shape[1]] = ar
                    i = j

            ret_list.append(ar_new)
        return cls(*ret_list, validate=False)  # type: ignore
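
    # Concatenation sketch (hypothetical sizes): two containers whose ``atoms``
    # arrays have shapes ``(5, 10)`` and ``(3, 20)`` yield a single ``(8, 20)``
    # array; the narrower input is right-padded with zero-initialized entries
    # and ``atom_count`` records how many entries in each row are real.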

    @classmethod
    def from_molecules(cls: Type[ST], mol_list: Iterable[Molecule],
                       min_atom: int = 0, min_bond: int = 0,
                       scale: Optional[ArrayLike] = None) -> ST:
        """Convert an iterable or sequence of molecules into a new :class:`PDBContainer` instance.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import (
            ...     PDB as pdb,
            ...     MOL_TUPLE as mol_list
            ... )

        .. code:: python

            >>> from typing import List
            >>> from dataCAT import PDBContainer
            >>> from scm.plams import readpdb, Molecule

            >>> mol_list: List[Molecule] = [readpdb(...), ...]  # doctest: +SKIP

            >>> PDBContainer.from_molecules(mol_list)
            PDBContainer(
                atoms      = numpy.recarray(..., shape=(23, 76), dtype=...),
                bonds      = numpy.recarray(..., shape=(23, 75), dtype=...),
                atom_count = numpy.ndarray(..., shape=(23,), dtype=int32),
                bond_count = numpy.ndarray(..., shape=(23,), dtype=int32),
                scale      = numpy.recarray(..., shape=(23,), dtype=...)
            )

        Parameters
        ----------
        mol_list : :class:`Iterable[Molecule]<typing.Iterable>`
            An iterable consisting of PLAMS molecules.
        min_atom : :class:`int`
            The minimum number of atoms which :attr:`PDBContainer.atoms` should accommodate.
        min_bond : :class:`int`
            The minimum number of bonds which :attr:`PDBContainer.bonds` should accommodate.
        scale : array-like, optional
            An array-like object representing a user-specified index.
            Defaults to a simple range index if :data:`None` (see :func:`numpy.arange`).

        Returns
        -------
        :class:`dataCAT.PDBContainer`
            A pdb container.

        """
        try:
            mol_count = len(mol_list)  # type: ignore
        except TypeError:  # **mol_list** is an iterator
            mol_list = list(mol_list)
            mol_count = len(mol_list)

        # Parse the index
        if scale is None:
            idx = np.arange(mol_count, dtype=cls.DTYPE['scale']).view(np.recarray)
        else:
            idx = np.asarray(scale).view(np.recarray)
            if len(idx) != mol_count:
                raise ValueError("'mol_list' and 'scale' must be of equal length")

        # Gather the shape of the to-be created atom (pdb-file) array
        _atom_count = max((len(mol.atoms) for mol in mol_list), default=0)
        atom_count = max(_atom_count, min_atom)
        atom_shape = mol_count, atom_count

        # Gather the shape of the to-be created bond array
        _bond_count = max((len(mol.bonds) for mol in mol_list), default=0)
        bond_count = max(_bond_count, min_bond)
        bond_shape = mol_count, bond_count

        # Construct the to-be returned (padded) arrays
        DTYPE = cls.DTYPE
        atom_array = np.zeros(atom_shape, dtype=DTYPE['atoms']).view(np.recarray)
        bond_array = np.zeros(bond_shape, dtype=DTYPE['bonds']).view(np.recarray)
        atom_counter = np.empty(mol_count, dtype=DTYPE['atom_count'])
        bond_counter = np.empty(mol_count, dtype=DTYPE['bond_count'])

        # Fill the to-be returned arrays
        for i, mol in enumerate(mol_list):
            j_atom = len(mol.atoms)
            j_bond = len(mol.bonds)

            atom_array[i, :j_atom] = [_get_atom_info(at, k) for k, at in enumerate(mol, 1)]
            bond_array[i, :j_bond] = _get_bond_info(mol)
            atom_counter[i] = j_atom
            bond_counter[i] = j_bond

        return cls(
            atoms=atom_array, bonds=bond_array,
            atom_count=atom_counter, bond_count=bond_counter,
            scale=idx, validate=False
        )

    @overload
    def to_molecules(self, index: Union[None, Sequence[int], slice, np.ndarray] = ...,
                     mol: Optional[Iterable[Optional[Molecule]]] = ...) -> List[Molecule]:
        ...
    @overload  # noqa: E301
    def to_molecules(self, index: SupportsIndex = ..., mol: Optional[Molecule] = ...) -> Molecule:
        ...
    def to_molecules(self, index=None, mol=None):  # noqa: E301
        """Create a molecule or list of molecules from this instance.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import (
            ...     PDB as pdb,
            ...     MOL_TUPLE as mol_list,
            ...     MOL as mol
            ... )

        An example where one or more new molecules are created.

        .. code:: python

            >>> from dataCAT import PDBContainer
            >>> from scm.plams import Molecule

            >>> pdb = PDBContainer(...)  # doctest: +SKIP

            # Create a single new molecule from `pdb`
            >>> pdb.to_molecules(index=0)  # doctest: +ELLIPSIS
            <scm.plams.mol.molecule.Molecule object at ...>

            # Create two new molecules from `pdb`
            >>> pdb.to_molecules(index=[0, 1])  # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE
            [<scm.plams.mol.molecule.Molecule object at ...>,
             <scm.plams.mol.molecule.Molecule object at ...>]

        An example where one or more existing molecules are updated in-place.

        .. code:: python

            # Update `mol` with the info from `pdb`
            >>> mol = Molecule(...)  # doctest: +SKIP

            >>> mol_new = pdb.to_molecules(index=2, mol=mol)
            >>> mol is mol_new
            True

            # Update all molecules in `mol_list` with info from `pdb`
            >>> mol_list = [Molecule(...), Molecule(...), Molecule(...)]  # doctest: +SKIP

            >>> mol_list_new = pdb.to_molecules(index=range(3), mol=mol_list)
            >>> for m, m_new in zip(mol_list, mol_list_new):
            ...     print(m is m_new)
            True
            True
            True

        Parameters
        ----------
        index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`, optional
            An object for slicing the arrays embedded within this instance.
            Follows the standard numpy broadcasting rules (*e.g.* :code:`self.atoms[index]`).
            If a scalar is provided (*e.g.* an integer) then a single molecule will be returned.
            If a sequence, range, slice, *etc.* is provided then
            a list of molecules will be returned.
        mol : :class:`~scm.plams.mol.molecule.Molecule` or :class:`Iterable[Molecule]<typing.Iterable>`, optional
            A molecule or list of molecules.
            If one or more molecules are provided here then they will be updated in-place.

        Returns
        -------
        :class:`~scm.plams.mol.molecule.Molecule` or :class:`List[Molecule]<typing.List>`
            A molecule or list of molecules,
            depending on whether or not **index** is a scalar or sequence / slice.
            Note that if :data:`mol is not None<None>`,
            then the to-be returned molecules won't be copies.

        """  # noqa: E501
        if index is None:
            i = slice(None)
            is_seq = True
        else:
            try:
                i = index.__index__()
                is_seq = False
            except (AttributeError, TypeError):
                i = index
                is_seq = True

        atoms = self.atoms[i]
        bonds = self.bonds[i]
        atom_count = self.atom_count[i]
        bond_count = self.bond_count[i]

        if not is_seq:
            return _rec_to_plams(atoms, bonds, atom_count, bond_count, mol)

        if mol is None:
            mol_list = repeat(None)
        elif isinstance(mol, Molecule):
            raise TypeError("'mol' cannot be a single Molecule "
                            "when 'index' points to multiple molecules")
        else:
            mol_list = mol

        iterator = zip(atoms, bonds, atom_count, bond_count, mol_list)
        return [_rec_to_plams(*args) for args in iterator]

    @overload
    def to_rdkit(
        self,
        index: None | Sequence[int] | slice | np.ndarray = ...,
        sanitize: bool = ...,
    ) -> List[Chem.Mol]:
        ...
    @overload  # noqa: E301
    def to_rdkit(
        self,
        index: SupportsIndex,
        sanitize: bool = ...,
    ) -> Chem.Mol:
        ...
    def to_rdkit(self, index=None, sanitize=True):  # noqa: E301
        """Create an rdkit molecule or list of rdkit molecules from this instance.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb

        An example where one or more new molecules are created.

        .. code:: python

            >>> from dataCAT import PDBContainer
            >>> from rdkit.Chem import Mol

            >>> pdb = PDBContainer(...)  # doctest: +SKIP

            # Create a single new molecule from `pdb`
            >>> pdb.to_rdkit(index=0)  # doctest: +ELLIPSIS
            <rdkit.Chem.rdchem.Mol object at ...>

            # Create two new molecules from `pdb`
            >>> pdb.to_rdkit(index=[0, 1])  # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE
            [<rdkit.Chem.rdchem.Mol object at ...>,
             <rdkit.Chem.rdchem.Mol object at ...>]

        Parameters
        ----------
        index : :class:`int`, :class:`Sequence[int]<collections.abc.Sequence>` or :class:`slice`, optional
            An object for slicing the arrays embedded within this instance.
            Follows the standard numpy broadcasting rules (*e.g.* :code:`self.atoms[index]`).
            If a scalar is provided (*e.g.* an integer) then a single molecule will be returned.
            If a sequence, range, slice, *etc.* is provided then
            a list of molecules will be returned.
        sanitize : :class:`bool`
            Whether or not to sanitize the molecule before returning.

        Returns
        -------
        :class:`~rdkit.Chem.rdchem.Mol` or :class:`list[Mol]<list>`
            A molecule or list of molecules,
            depending on whether or not **index** is a scalar or sequence / slice.

        """  # noqa: E501
        if index is None:
            i = slice(None)
            is_seq = True
        else:
            try:
                i = index.__index__()
                is_seq = False
            except (AttributeError, TypeError):
                i = index
                is_seq = True

        atoms = self.atoms[i]
        bonds = self.bonds[i]
        atom_count = self.atom_count[i]
        bond_count = self.bond_count[i]

        if not is_seq:
            return _rec_to_rdkit(atoms, bonds, atom_count, bond_count, sanitize)

        iterator = zip(atoms, bonds, atom_count, bond_count, repeat(sanitize))
        return [_rec_to_rdkit(*args) for args in iterator]

    @overload
    @classmethod
    def create_hdf5_group(cls, file: h5py.Group, name: str, *,
                          scale: h5py.Dataset, **kwargs: Any) -> h5py.Group:
        ...
    @overload  # noqa: E301
    @classmethod
    def create_hdf5_group(cls, file: h5py.Group, name: str, *,
                          scale_dtype: DTypeLike = ..., **kwargs: Any) -> h5py.Group:
        ...
    @classmethod  # noqa: E301
    def create_hdf5_group(cls, file, name, *, scale=None, scale_dtype=None, **kwargs):
        r"""Create a h5py Group for storing :class:`dataCAT.PDBContainer` instances.

        Notes
        -----
        The **scale** and **scale_dtype** parameters are mutually exclusive.

        Parameters
        ----------
        file : :class:`h5py.File` or :class:`h5py.Group`
            The h5py File or Group where the new Group will be created.
        name : :class:`str`
            The name of the to-be created Group.

        Keyword Arguments
        -----------------
        scale : :class:`h5py.Dataset`, keyword-only
            A pre-existing dataset serving as dimensional scale.
            See **scale_dtype** to create a new one instead.
        scale_dtype : dtype-like, keyword-only
            The datatype of the to-be created dimensional scale.
            See **scale** to use a pre-existing dataset for this purpose.
        \**kwargs : :data:`~typing.Any`
            Further keyword arguments for the creation of each dataset.
            Arguments already specified by default are:
            ``name``, ``shape``, ``maxshape`` and ``dtype``.

        Returns
        -------
        :class:`h5py.Group`
            The newly created Group.

        """
        if scale is not None and scale_dtype is not None:
            raise TypeError("'scale' and 'scale_dtype' cannot be both specified")
        cls_name = cls.__name__

        # Create the group
        grp = file.create_group(name, track_order=True)
        grp.attrs['__doc__'] = np.string_(HDF5_DOCSTRING.format(cls_name=cls_name))

        # Create the datasets
        NDIM = cls.NDIM
        DTYPE = cls.DTYPE
        key_iter = (k for k in cls.keys() if k != 'scale')
        for key in key_iter:
            maxshape = NDIM[key] * (None,)
            shape = NDIM[key] * (0,)
            dtype = DTYPE[key]

            dset = grp.create_dataset(key, shape=shape, maxshape=maxshape, dtype=dtype, **kwargs)
            dset.attrs['__doc__'] = np.string_(f"A dataset representing `{cls_name}.{key}`.")

        # Set the index
        scale_name = cls.SCALE_NAME
        if scale is not None:
            scale_dset = scale
        else:
            _dtype = scale_dtype if scale_dtype is not None else cls.DTYPE['scale']
            scale_dset = grp.create_dataset(scale_name, shape=(0,), maxshape=(None,), dtype=_dtype)
        scale_dset.make_scale(scale_name)

        # Use the index as a scale
        dset_iter = (grp[k] for k in cls.keys() if k != 'scale')
        for dset in dset_iter:
            dset.dims[0].label = scale_name
            dset.dims[0].attach_scale(scale_dset)
        grp['atoms'].dims[1].label = 'atoms'
        grp['bonds'].dims[1].label = 'bonds'
        return grp
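
    # Usage sketch (assuming ``f`` is an open, writable h5py.File): passing
    # ``scale_dtype`` creates a fresh dimensional scale of that dtype.
    #
    #   >>> group = PDBContainer.create_hdf5_group(f, 'ligand', scale_dtype='i8')
    #   >>> group['index'].dtype
    #   dtype('int64')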

    @classmethod
    def validate_hdf5(cls, group: h5py.Group) -> None:
        """Validate the passed hdf5 **group**, ensuring it is compatible with :class:`PDBContainer` instances.

        An :exc:`AssertionError` will be raised if **group** does not validate.

        This method is called automatically when an exception is raised by
        :meth:`~PDBContainer.to_hdf5` or :meth:`~PDBContainer.from_hdf5`.

        Parameters
        ----------
        group : :class:`h5py.Group`
            The to-be validated hdf5 Group.

        Raises
        ------
        :exc:`AssertionError`
            Raised if the validation process fails.

        """  # noqa: E501
        if not isinstance(group, h5py.Group):
            raise TypeError("'group' expected a h5py.Group; "
                            f"observed type: {group.__class__.__name__}")

        # Check if **group** has all required keys
        keys: List[str] = list(cls.keys())
        keys[-1] = cls.SCALE_NAME
        difference = set(keys) - group.keys()
        if difference:
            missing_keys = ', '.join(repr(i) for i in difference)
            raise AssertionError(f"Missing keys in {group!r}: {missing_keys}")

        # Check the dimensionality and dtype of all datasets
        len_dict = {}
        for key in cls.keys():
            dset = group[key] if key != 'scale' else group[cls.SCALE_NAME]
            len_dict[key] = len(dset)

            assertion.eq(dset.ndim, cls.NDIM[key], message=f"{key!r} ndim mismatch")
            if key != 'scale':
                assertion.eq(dset.dtype, cls.DTYPE[key], message=f"{key!r} dtype mismatch")

        # Check that all datasets are of the same length
        if len(set(len_dict.values())) != 1:
            raise AssertionError(
                f"All datasets in {group!r} should be of the same length.\n"
                f"Observed lengths: {len_dict!r}"
            )

    @if_exception(validate_hdf5.__func__)  # type: ignore
    def to_hdf5(self, group: h5py.Group, index: IndexLike, update_scale: bool = True) -> None:
        """Update all datasets in **group** positioned at **index** with their counterparts from this instance.

        Follows the standard broadcasting rules as employed by h5py.

        Important
        ---------
        If **index** is passed as a sequence of integers then,
        contrary to NumPy, they *will* have to be sorted.

        Parameters
        ----------
        group : :class:`h5py.Group`
            The to-be updated h5py group.
        index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`
            An object for slicing all datasets in **group**.
            Note that, contrary to numpy, if a sequence of integers is provided
            then they'll have to be ordered.
        update_scale : :class:`bool`
            If :data:`True`, also export :attr:`PDBContainer.scale` to the
            dimensional scale in the passed **group**.

        """  # noqa: E501
        # Parse **index**
        if index is None:
            idx: slice | np.ndarray = slice(None)
            idx_max: int | np.integer = len(self)
        else:
            try:
                idx = int_to_slice(index, len(self))  # type: ignore
                idx_max = idx.stop or len(self)
            except (AttributeError, TypeError):
                if not isinstance(index, slice):
                    idx = np.asarray(index)
                    idx_max = 1 + idx.max()  # the minimum dataset length required by **index**
                    assert idx.ndim == 1
                    assert issubclass(idx.dtype.type, np.integer)
                else:
                    idx = index
                    idx_max = idx.stop or len(self)

        # Update the length of all groups
        scale_name = self.SCALE_NAME
        scale = group['atoms'].dims[0][scale_name]
        if len(scale) < idx_max:
            scale.resize(idx_max, axis=0)
            self._update_hdf5_shape(group)

        # Update the datasets
        items = ((name, ar) for name, ar in self.items() if name != 'scale')
        for name, ar in items:
            dataset = group[name]
            if ar.ndim == 1:
                dataset[idx] = ar
            else:
                _, j = ar.shape
                dataset[idx, :j] = ar

        # Update the index
        if update_scale:
            scale[idx] = self.scale
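
    # Append sketch (assuming ``group`` was created via ``create_hdf5_group``
    # and ``pdb`` is a PDBContainer): writing past the current end of the
    # datasets simply grows them, as ``to_hdf5`` first resizes the dimensional
    # scale and, through ``_update_hdf5_shape``, every other dataset.
    #
    #   >>> n = len(group['index'])
    #   >>> pdb.to_hdf5(group, slice(n, n + len(pdb)))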

    def _update_hdf5_shape(self, group: h5py.Group) -> None:
        """Update the shape of all datasets in **group** such that they can accommodate this instance.

        The length of all datasets will be set equal to the
        ``index`` dimensional scale.
        This method is automatically called by :meth:`PDBContainer.to_hdf5` when required.

        Parameters
        ----------
        group : :class:`h5py.Group`
            The h5py Group with the to-be reshaped datasets.

        :rtype: :data:`None`

        """  # noqa: E501
        # Get the length of the dimensional scale
        scale_name = self.SCALE_NAME
        idx = group['atoms'].dims[0][scale_name]
        idx_len = len(idx)

        items = ((name, ar) for name, ar in self.items() if name != 'scale')
        for name, ar in items:
            dataset = group[name]

            # Identify the new shape of all datasets
            shape = np.fromiter(dataset.shape, dtype=int)
            shape[0] = idx_len
            if ar.ndim == 2:
                shape[1] = max(shape[1], ar.shape[1])

            # Set the new shape
            dataset.shape = shape

    @classmethod
    @if_exception(validate_hdf5.__func__)  # type: ignore
    def from_hdf5(cls: Type[ST], group: h5py.Group, index: IndexLike = None) -> ST:
        """Construct a new PDBContainer from the passed hdf5 **group**.

        Parameters
        ----------
        group : :class:`h5py.Group`
            The to-be read h5py group.
        index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`, optional
            An object for slicing all datasets in **group**.

        Returns
        -------
        :class:`dataCAT.PDBContainer`
            A new PDBContainer constructed from **group**.

        """
        if index is None:
            idx: Union[slice, np.ndarray] = slice(None)
        else:
            try:
                idx = int_to_slice(index, len(group['atom_count']))  # type: ignore
            except (AttributeError, TypeError):
                idx = np.asarray(index) if not isinstance(index, slice) else index
                assert getattr(idx, 'ndim', 1) == 1

        scale_name = cls.SCALE_NAME
        return cls(
            atoms=group['atoms'][idx].view(np.recarray),
            bonds=group['bonds'][idx].view(np.recarray),
            atom_count=group['atom_count'][idx],
            bond_count=group['bond_count'][idx],
            scale=group[scale_name][idx].view(np.recarray),
            validate=False
        )
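
    # Round-trip sketch: ``from_hdf5`` is the inverse of ``to_hdf5``, so a
    # freshly exported container can be read back in full (assuming the
    # datasets have not since been padded beyond this container's own width):
    #
    #   >>> pdb_new = PDBContainer.from_hdf5(group)
    #   >>> pdb_new == pdb
    #   True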

    def intersection(self: ST, value: Union[ST, ArrayLike]) -> ST:
        """Construct a new PDBContainer by the intersection of **self** and **value**.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> print(pdb.scale)
            [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]

            >>> pdb_new = pdb.intersection(range(4))
            >>> print(pdb_new.scale)
            [0 1 2 3]

        Parameters
        ----------
        value : :class:`PDBContainer` or array-like
            Another PDBContainer or an array-like object representing :attr:`PDBContainer.scale`.
            Note that both **value** and **self.scale** should consist of unique elements.

        Returns
        -------
        :class:`PDBContainer`
            A new instance by intersecting :attr:`self.scale<PDBContainer.scale>` and **value**.

        See Also
        --------
        :meth:`set.intersection<frozenset.intersection>`
            Return the intersection of two sets as a new set.

        """
        idx1, idx2 = self._get_index(value)
        _, i, _ = np.intersect1d(idx1, idx2, assume_unique=True, return_indices=True)
        return self[i]

    def difference(self: ST, value: Union[ST, ArrayLike]) -> ST:
        """Construct a new PDBContainer by the difference of **self** and **value**.

        Examples
        --------
        .. testsetup:: python

            >>> from dataCAT.testing_utils import PDB as pdb

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> print(pdb.scale)
            [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]

            >>> pdb_new = pdb.difference(range(10, 30))
            >>> print(pdb_new.scale)
            [0 1 2 3 4 5 6 7 8 9]

        Parameters
        ----------
        value : :class:`PDBContainer` or array-like
            Another PDBContainer or an array-like object representing :attr:`PDBContainer.scale`.
            Note that both **value** and **self.scale** should consist of unique elements.

        Returns
        -------
        :class:`PDBContainer`
            A new instance as the difference of :attr:`self.scale<PDBContainer.scale>` and **value**.

        See Also
        --------
        :meth:`set.difference<frozenset.difference>`
            Return the difference of two or more sets as a new set.

        """
        idx1, idx2 = self._get_index(value)
        i = np.in1d(idx1, idx2, assume_unique=True, invert=True)
        return self[i]

    def symmetric_difference(self: ST, value: ST) -> ST:
        """Construct a new PDBContainer by the symmetric difference of **self** and **value**.

        Examples
        --------
        .. testsetup:: python

            >>> import numpy as np
            >>> from dataCAT.testing_utils import PDB as pdb
            >>> from dataCAT import PDBContainer

            >>> a = np.arange(10, 30)
            >>> b = a.reshape(len(a), 1)
            >>> pdb2 = PDBContainer(b, b, a, a, a, validate=False)

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> pdb2 = PDBContainer(..., scale=range(10, 30))  # doctest: +SKIP

            >>> print(pdb.scale)
            [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]

            >>> pdb_new = pdb.symmetric_difference(pdb2)
            >>> print(pdb_new.scale)
            [ 0  1  2  3  4  5  6  7  8  9 23 24 25 26 27 28 29]

        Parameters
        ----------
        value : :class:`PDBContainer`
            Another PDBContainer.
            Note that both **value.scale** and **self.scale** should consist of unique elements.

        Returns
        -------
        :class:`PDBContainer`
            A new instance as the symmetric difference of
            :attr:`self.scale<PDBContainer.scale>` and **value**.

        See Also
        --------
        :meth:`set.symmetric_difference<frozenset.symmetric_difference>`
            Return the symmetric difference of two sets as a new set.

        """
        pdb1 = self.difference(value)
        pdb2 = value.difference(self)
        return pdb1.concatenate(pdb2)

    def union(self: ST, value: ST) -> ST:
        """Construct a new PDBContainer by the union of **self** and **value**.

        Examples
        --------
        .. testsetup:: python

            >>> import numpy as np
            >>> from dataCAT.testing_utils import PDB as pdb
            >>> from dataCAT import PDBContainer

            >>> a = np.arange(10, 30)
            >>> b = a.reshape(len(a), 1)
            >>> pdb2 = PDBContainer(b, b, a, a, a, validate=False)

        .. code:: python

            >>> from dataCAT import PDBContainer

            >>> pdb = PDBContainer(...)  # doctest: +SKIP
            >>> pdb2 = PDBContainer(..., scale=range(10, 30))  # doctest: +SKIP

            >>> print(pdb.scale)
            [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]

            >>> pdb_new = pdb.union(pdb2)
            >>> print(pdb_new.scale)
            [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
             25 26 27 28 29]

        Parameters
        ----------
        value : :class:`PDBContainer`
            Another PDBContainer.
            Note that both **value** and **self.scale** should consist of unique elements.

        Returns
        -------
        :class:`PDBContainer`
            A new instance as the union of
            :attr:`self.scale<PDBContainer.scale>` and **value**.

        See Also
        --------
        :meth:`set.union<frozenset.union>`
            Return the union of sets as a new set.

        """
        cls = type(self)
        if not isinstance(value, cls):
            raise TypeError(f"'value' expected a {cls.__name__} instance; "
                            f"observed type: {value.__class__.__name__}")

        idx1 = self.scale
        idx2 = value.scale.astype(idx1.dtype, copy=False)
        i = np.in1d(idx2, idx1, assume_unique=True, invert=True)

        ret = value[i]
        return self.concatenate(ret)

    def _get_index(self: ST, value: Union[ST, ArrayLike]) -> Tuple[np.recarray, np.ndarray]:
        """Parse and return the :attr:`~PDBContainer.scale` of **self** and **value**."""
        cls = type(self)
        scale1 = self.scale

        _scale2 = value.scale if isinstance(value, cls) else value
        scale2 = np.asarray(_scale2, dtype=scale1.dtype)
        return scale1, scale2