"""A module for constructing array-representations of .pdb files.
Index
-----
.. currentmodule:: dataCAT
.. autosummary::
PDBContainer
PDBContainer.atoms
PDBContainer.bonds
PDBContainer.atom_count
PDBContainer.bond_count
PDBContainer.scale
.. autosummary::
PDBContainer.__init__
PDBContainer.__getitem__
PDBContainer.__len__
PDBContainer.keys
PDBContainer.values
PDBContainer.items
PDBContainer.concatenate
.. autosummary::
PDBContainer.from_molecules
PDBContainer.to_molecules
PDBContainer.to_rdkit
PDBContainer.create_hdf5_group
PDBContainer.validate_hdf5
PDBContainer.from_hdf5
PDBContainer.to_hdf5
.. autosummary::
PDBContainer.intersection
PDBContainer.difference
PDBContainer.symmetric_difference
PDBContainer.union
API
---
.. autoclass:: PDBContainer
:members: atoms, bonds, atom_count, bond_count, scale, __init__
API: Miscellaneous Methods
--------------------------
.. automethod:: PDBContainer.__getitem__
.. automethod:: PDBContainer.__len__
.. automethod:: PDBContainer.keys
.. automethod:: PDBContainer.values
.. automethod:: PDBContainer.items
.. automethod:: PDBContainer.concatenate
API: Object Interconversion
---------------------------
.. automethod:: PDBContainer.from_molecules
.. automethod:: PDBContainer.to_molecules
.. automethod:: PDBContainer.to_rdkit
.. automethod:: PDBContainer.create_hdf5_group
.. automethod:: PDBContainer.validate_hdf5
.. automethod:: PDBContainer.from_hdf5
.. automethod:: PDBContainer.to_hdf5
API: Set Operations
-------------------
.. automethod:: PDBContainer.intersection
.. automethod:: PDBContainer.difference
.. automethod:: PDBContainer.symmetric_difference
.. automethod:: PDBContainer.union
""" # noqa: E501
from __future__ import annotations
import textwrap
from types import MappingProxyType
from itertools import repeat, chain
from typing import (
List, Iterable, Union, Type, TypeVar, Optional, Dict, Any,
overload, Sequence, Mapping, Tuple, Generator, ClassVar, TYPE_CHECKING
)
import h5py
import numpy as np
from scm.plams import Molecule, Atom, Bond
from rdkit import Chem, Geometry
from nanoutils import SupportsIndex, TypedDict, Literal
from assertionlib import assertion
from .dtype import ATOMS_DTYPE, BONDS_DTYPE, ATOM_COUNT_DTYPE, BOND_COUNT_DTYPE, BACKUP_IDX_DTYPE
from .functions import int_to_slice, if_exception
if TYPE_CHECKING:
from numpy.typing import ArrayLike, DTypeLike
__all__ = ['PDBContainer']
ST = TypeVar('ST', bound='PDBContainer')
_AtomTuple = Tuple[
bool, # IsHeteroAtom
int, # SerialNumber
str, # Name
str, # ResidueName
str, # ChainId
int, # ResidueNumber
float, # x
float, # y
float, # z
float, # Occupancy
float, # TempFactor
str, # symbol
int, # charge
float # charge_float
]
_BondTuple = Tuple[
int, # atom1
int, # atom2
int, # order
]
_ReduceTuple = Tuple[
np.recarray, # atoms
np.recarray, # bonds
np.ndarray, # atom_count
np.ndarray, # bond_count
np.recarray, # scale
Literal[False] # validate
]
_Coords = Tuple[float, float, float]
_ResInfo = Tuple[
str, # Name,
int, # SerialNumber,
str, # AltLoc
str, # ResidueName
int, # ResidueNumber
int, # ChainId
str, # InsertionCode
float, # Occupancy
float, # TempFactor
bool, # IsHeteroAtom
]
class _PDBInfo(TypedDict):
IsHeteroAtom: bool
SerialNumber: int
Name: str
ResidueName: str
ChainId: str
ResidueNumber: int
Occupancy: float
TempFactor: float
class _Properties(TypedDict):
charge: int
charge_float: float
pdb_info: _PDBInfo
def _get_atom_info(at: Atom, i: int) -> _AtomTuple:
"""Helper function for :meth:`PDBContainer.from_molecules`: create a tuple representing a single :attr:`PDBContainer.atoms` row.""" # noqa: E501
prop = at.properties
symbol = at.symbol
charge = prop.get('charge', 0)
pdb = prop.get('pdb_info', {})
return (
pdb.get('IsHeteroAtom', False), # type: ignore
pdb.get('SerialNumber', i),
pdb.get('Name', symbol),
pdb.get('ResidueName', 'LIG'),
pdb.get('ChainId', 'A'),
pdb.get('ResidueNumber', 1),
*at.coords,
pdb.get('Occupancy', 1.0),
pdb.get('TempFactor', 0.0),
symbol,
charge,
prop.get('charge_float', charge)
)
def _get_bond_info(mol: Molecule) -> List[_BondTuple]:
"""Helper function for :meth:`PDBContainer.from_molecules`: create a tuple representing a single :attr:`PDBContainer.bonds` row.
Note that the atomic indices are 1-based.
""" # noqa: E501
mol.set_atoms_id(start=1)
ret = [(b.atom1.id, b.atom2.id, b.order) for b in mol.bonds]
mol.unset_atoms_id()
return ret
def _iter_rec_plams(
atom_array: np.recarray,
) -> Generator[Tuple[_Properties, _Coords, str], None, None]:
"""Helper function for :func:`_rec_to_mol`: create an iterator yielding atom properties and attributes.""" # noqa: E501
for ar in atom_array:
IsHeteroAtom, SerialNumber, Name, ResidueName, ChainId, ResidueNumber, x, y, z, Occupancy, TempFactor, symbol, charge, charge_float = ar.item() # noqa: E501
_pdb_info = {
'IsHeteroAtom': IsHeteroAtom,
'SerialNumber': SerialNumber,
'Name': Name.decode(),
'ResidueName': ResidueName.decode(),
'ChainId': ChainId.decode(),
'ResidueNumber': ResidueNumber,
'Occupancy': Occupancy,
'TempFactor': TempFactor
}
properties = {
'charge': charge,
'charge_float': charge_float,
'pdb_info': _pdb_info
}
yield properties, (x, y, z), symbol.decode() # type: ignore
def _rec_to_plams(
atom_array: np.recarray,
bond_array: np.recarray,
atom_len: None | int = None,
bond_len: None | int = None,
mol: None | Molecule = None,
) -> Molecule:
"""Helper function for :meth:`PDBContainer.to_molecules`: update/create a single plams molecule from the passed **atom_array** and **bond_array**.""" # noqa: E501
if mol is None:
ret = Molecule()
for _ in range(len(atom_array[:atom_len])):
ret.add_atom(Atom(mol=ret))
else:
ret = mol
iterator = _iter_rec_plams(atom_array[:atom_len])
for atom, (properties, coords, symbol) in zip(ret, iterator):
atom.coords = coords
atom.symbol = symbol
atom.properties.update(properties)
if ret.bonds:
ret.delete_all_bonds()
for i, j, order in bond_array[:bond_len]:
bond = Bond(atom1=ret[i], atom2=ret[j], order=order, mol=ret)
ret.add_bond(bond)
return ret
def _iter_rec_rdkit(
atom_array: np.recarray,
) -> Generator[Tuple[str, int, _ResInfo], None, None]:
"""Helper function for :func:`_rec_to_mol`: create an iterator yielding atom properties and attributes.""" # noqa: E501
for ar in atom_array:
IsHeteroAtom, SerialNumber, Name, ResidueName, ChainId, ResidueNumber, x, y, z, Occupancy, TempFactor, symbol, charge, charge_float = ar.item() # noqa: E501
res_info = (
Name,
SerialNumber,
"",
ResidueName,
ResidueNumber,
ChainId,
"",
Occupancy,
TempFactor,
IsHeteroAtom,
)
yield symbol, charge, res_info
def _rec_to_rdkit(
atom_array: np.recarray,
bond_array: np.recarray,
atom_len: None | int = None,
bond_len: None | int = None,
sanitize: bool = True,
) -> Chem.Mol:
"""Helper function for :meth:`PDBContainer.to_rdkit`: create a single rdkit molecule from the passed **atom_array** and **bond_array**.""" # noqa: E501
edit_mol = Chem.EditableMol(Chem.Mol())
iterator1 = _iter_rec_rdkit(atom_array[:atom_len])
for symbol, charge, res_info in iterator1:
atom = Chem.Atom(symbol)
atom.SetFormalCharge(charge)
atom.SetMonomerInfo(Chem.AtomPDBResidueInfo(*res_info))
edit_mol.AddAtom(atom)
for void in bond_array[:bond_len]:
i, j, order = void.item()
edit_mol.AddBond(i - 1, j - 1, Chem.BondType(order))
mol = edit_mol.GetMol()
if sanitize:
Chem.SanitizeMol(mol)
conf = Chem.Conformer()
iterator2 = (void.item()[6:9] for void in atom_array[:atom_len])
for i, xyz in enumerate(iterator2):
conf.SetAtomPosition(i, Geometry.Point3D(*xyz))
mol.AddConformer(conf)
return mol
IndexLike = Union[None, SupportsIndex, Sequence[int], slice, np.ndarray]
Hdf5Mode = Literal['append', 'update']
_AttrName = Literal['atoms', 'bonds', 'atom_count', 'bond_count', 'scale']
#: The docstring to-be assigned by :meth:`PDBContainer.create_hdf5_group`.
HDF5_DOCSTRING = """A Group of datasets representing :class:`{cls_name}`."""
[docs]class PDBContainer:
"""An (immutable) class for holding array-like representions of a set of .pdb files.
The :class:`PDBContainer` class serves as an (intermediate) container
for storing .pdb files in the hdf5 format,
thus facilitating the storage and interconversion
between PLAMS molecules and the :mod:`h5py` interface.
The methods implemented in this class can roughly be divided into three categories:
* Molecule-interconversion: :meth:`~PDBContainer.to_molecules`,
:meth:`~PDBContainer.from_molecules` & :meth:`~PDBContainer.to_rdkit`.
* hdf5-interconversion: :meth:`~PDBContainer.create_hdf5_group`,
:meth:`~PDBContainer.validate_hdf5`,
:meth:`~PDBContainer.to_hdf5` & :meth:`~PDBContainer.from_hdf5`.
* Miscellaneous: :meth:`~PDBContainer.keys`, :meth:`~PDBContainer.values`,
:meth:`~PDBContainer.items`, :meth:`~PDBContainer.__getitem__` &
:meth:`~PDBContainer.__len__`.
Examples
--------
.. testsetup:: python
>>> import os
>>> from dataCAT.testing_utils import (
... MOL_TUPLE as mol_list,
... PDB as pdb,
... HDF5_TMP as hdf5_file
... )
>>> if os.path.isfile(hdf5_file):
... os.remove(hdf5_file)
.. code:: python
>>> import h5py
>>> from scm.plams import readpdb
>>> from dataCAT import PDBContainer
>>> mol_list [readpdb(...), ...] # doctest: +SKIP
>>> pdb = PDBContainer.from_molecules(mol_list)
>>> print(pdb)
PDBContainer(
atoms = numpy.recarray(..., shape=(23, 76), dtype=...),
bonds = numpy.recarray(..., shape=(23, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(23,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(23,), dtype=int32),
scale = numpy.recarray(..., shape=(23,), dtype=...)
)
>>> hdf5_file = str(...) # doctest: +SKIP
>>> with h5py.File(hdf5_file, 'a') as f:
... group = pdb.create_hdf5_group(f, name='ligand')
... pdb.to_hdf5(group, None)
...
... print('group', '=', group)
... for name, dset in group.items():
... print(f'group[{name!r}]', '=', dset)
group = <HDF5 group "/ligand" (5 members)>
group['atoms'] = <HDF5 dataset "atoms": shape (23, 76), type "|V46">
group['bonds'] = <HDF5 dataset "bonds": shape (23, 75), type "|V9">
group['atom_count'] = <HDF5 dataset "atom_count": shape (23,), type "<i4">
group['bond_count'] = <HDF5 dataset "bond_count": shape (23,), type "<i4">
group['index'] = <HDF5 dataset "index": shape (23,), type "<i4">
.. testcleanup:: python
>>> if os.path.isfile(hdf5_file):
... os.remove(hdf5_file)
"""
__slots__ = (
'__weakref__', '_hash', '_atoms', '_bonds', '_atom_count', '_bond_count', '_scale'
)
#: A mapping holding the dimensionality of each array embedded within this class.
NDIM: ClassVar[Mapping[_AttrName, int]] = MappingProxyType({
'atoms': 2,
'bonds': 2,
'atom_count': 1,
'bond_count': 1,
'scale': 1
})
#: A mapping holding the dtype of each array embedded within this class.
DTYPE: ClassVar[Mapping[_AttrName, np.dtype]] = MappingProxyType({
'atoms': ATOMS_DTYPE,
'bonds': BONDS_DTYPE,
'atom_count': ATOM_COUNT_DTYPE,
'bond_count': BOND_COUNT_DTYPE,
'scale': BACKUP_IDX_DTYPE
})
#: The name of the h5py dimensional scale.
SCALE_NAME: ClassVar[str] = 'index'
@property
def atoms(self) -> np.recarray:
""":class:`numpy.recarray`, shape :math:`(n, m)`: Get a read-only padded recarray for keeping track of all atom-related information.
See :data:`dataCAT.dtype.ATOMS_DTYPE` for a comprehensive overview of
all field names and dtypes.
""" # noqa: E501
return self._atoms
@property
def bonds(self) -> np.recarray:
""":class:`numpy.recarray`, shape :math:`(n, k)` : Get a read-only padded recarray for keeping track of all bond-related information.
Note that all atomic indices are 1-based.
See :data:`dataCAT.dtype.BONDS_DTYPE` for a comprehensive overview of
all field names and dtypes.
""" # noqa: E501
return self._bonds
@property
def atom_count(self) -> np.ndarray:
""":class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)` : Get a read-only ndarray for keeping track of the number of atoms in each molecule in :attr:`~PDBContainer.atoms`.""" # noqa: E501
return self._atom_count
@property
def bond_count(self) -> np.ndarray:
""":class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)` : Get a read-only ndarray for keeping track of the number of atoms in each molecule in :attr:`~PDBContainer.bonds`.""" # noqa: E501
return self._bond_count
@property
def scale(self) -> np.recarray:
""":class:`numpy.recarray`, shape :math:`(n,)`: Get a recarray representing an index.
Used as dimensional scale in the h5py Group.
""" # noqa: E501
return self._scale
@overload
def __init__(self, atoms: np.recarray, bonds: np.recarray,
atom_count: np.ndarray, bond_count: np.ndarray,
scale: np.recarray,
validate: Literal[False]) -> None:
...
@overload # noqa: E301
def __init__(self, atoms: ArrayLike, bonds: ArrayLike,
atom_count: ArrayLike, bond_count: ArrayLike,
scale: Optional[ArrayLike] = None,
validate: Literal[True] = ..., copy: bool = ...,) -> None:
...
[docs] def __init__(self, atoms, bonds, atom_count, bond_count, scale=None, validate=True, copy=True, index_dtype=None): # noqa: E501,E301
"""Initialize an instance.
Parameters
----------
atoms : :class:`numpy.recarray`, shape :math:`(n, m)`
A padded recarray for keeping track of all atom-related information.
See :attr:`PDBContainer.atoms`.
bonds : :class:`numpy.recarray`, shape :math:`(n, k)`
A padded recarray for keeping track of all bond-related information.
See :attr:`PDBContainer.bonds`.
atom_count : :class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)`
An ndarray for keeping track of the number of atoms in each molecule in **atoms**.
See :attr:`PDBContainer.atom_count`.
bond_count : :class:`numpy.ndarray[int32]<numpy.ndarray>`, shape :math:`(n,)`
An ndarray for keeping track of the number of bonds in each molecule in **bonds**.
See :attr:`PDBContainer.bond_count`.
scale : :class:`numpy.recarray`, shape :math:`(n,)`, optional
A recarray representing an index.
If :data:`None`, use a simple numerical index (*i.e.* :func:`numpy.arange`).
See :attr:`PDBContainer.scale`.
Keyword Arguments
-----------------
validate : :class:`bool`
If :data:`True` perform more thorough validation of the input arrays.
Note that this also allows the parameters to-be passed as array-like objects
in addition to aforementioned :class:`~numpy.ndarray` or
:class:`~numpy.recarray` instances.
copy : :class:`bool`
If :data:`True`, set the passed arrays as copies.
Only relevant if :data:`validate = True<True>`.
:rtype: :data:`None`
"""
if validate:
cls = type(self)
rec_set = {'atoms', 'bonds'}
items = [
('atoms', atoms),
('bonds', bonds),
('atom_count', atom_count),
('bond_count', bond_count)
]
for name, _array in items:
ndmin = cls.NDIM[name]
dtype = cls.DTYPE[name]
array = np.array(_array, dtype=dtype, ndmin=ndmin, copy=copy)
if name in rec_set:
array = array.view(np.recarray)
setattr(self, f'_{name}', array)
if scale is None:
self._scale: np.recarray = np.array(scale, ndmin=1, copy=copy).view(np.recarray)
else:
dtype = cls.DTYPE['scale']
self._scale = np.arange(len(array), dtype=dtype).view(np.recarray)
len_set = {len(ar) for ar in self.values()}
if len(len_set) != 1:
raise ValueError("All passed arrays should be of the same length")
# Assume the input does not have to be parsed
else:
self._atoms: np.recarray = atoms
self._bonds: np.recarray = bonds
self._atom_count: np.ndarray = atom_count
self._bond_count: np.ndarray = bond_count
self._scale: np.recarray = scale
for ar in self.values():
ar.setflags(write=False)
def __repr__(self) -> str:
"""Implement :class:`str(self)<str>` and :func:`repr(self)<repr>`."""
wdith = max(len(k) for k in self.keys())
def _str(k, v):
if isinstance(v, np.recarray):
dtype = '...'
else:
dtype = str(v.dtype)
return (f'{k:{wdith}} = {v.__class__.__module__}.{v.__class__.__name__}'
f'(..., shape={v.shape}, dtype={dtype})')
ret = ',\n'.join(_str(k, v) for k, v in self.items())
indent = 4 * ' '
return f'{self.__class__.__name__}(\n{textwrap.indent(ret, indent)}\n)'
def __reduce__(self: ST) -> Tuple[Type[ST], _ReduceTuple]:
"""Helper for :mod:`pickle`."""
cls = type(self)
return cls, (*self.values(), False) # type: ignore
def __copy__(self: ST) -> ST:
"""Implement :func:`copy.copy(self)<copy.copy>`."""
return self # self is immutable
def __deepcopy__(self: ST, memo: Optional[Dict[int, Any]] = None) -> ST:
"""Implement :func:`copy.deepcopy(self, memo=memo)<copy.deepcopy>`."""
return self # self is immutable
[docs] def __len__(self) -> int:
"""Implement :func:`len(self)<len>`.
Returns
-------
:class:`int`
Returns the length of the arrays embedded within this instance
(which are all of the same length).
"""
return len(self.atom_count)
def __eq__(self, value: object) -> bool:
"""Implement :meth:`self == value<object.__eq__>`."""
if type(self) is not type(value):
return False
elif hash(self) != hash(value):
return False
iterator = ((v, getattr(value, k)) for k, v in self.items())
return all(np.all(ar1 == ar2) for ar1, ar2 in iterator)
def __hash__(self) -> int:
"""Implement :func:`hash(self)<hash>`."""
try:
return self._hash
except AttributeError:
args = []
# The hash of each individual array consists of its shape appended
# with the array's first and last element along axis 0
for ar in self.values():
if not len(ar):
first_and_last: Tuple[Any, ...] = ()
else:
first = ar[0] if ar.ndim == 1 else ar[0, 0]
last = ar[-1] if ar.ndim == 1 else ar[-1, 0]
first_and_last = (first, last)
args.append(ar.shape + first_and_last)
cls = type(self)
self._hash: int = hash((cls, tuple(args)))
return self._hash
[docs] def __getitem__(self: ST, index: IndexLike) -> ST:
"""Implement :meth:`self[index]<object.__getitem__>`.
Constructs a new :class:`PDBContainer` instance by slicing all arrays with **index**.
Follows the standard NumPy broadcasting rules:
if an integer or slice is passed then a shallow copy is returned;
otherwise a deep copy will be created.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> print(pdb)
PDBContainer(
atoms = numpy.recarray(..., shape=(23, 76), dtype=...),
bonds = numpy.recarray(..., shape=(23, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(23,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(23,), dtype=int32),
scale = numpy.recarray(..., shape=(23,), dtype=...)
)
>>> pdb[0]
PDBContainer(
atoms = numpy.recarray(..., shape=(1, 76), dtype=...),
bonds = numpy.recarray(..., shape=(1, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(1,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(1,), dtype=int32),
scale = numpy.recarray(..., shape=(1,), dtype=...)
)
>>> pdb[:10]
PDBContainer(
atoms = numpy.recarray(..., shape=(10, 76), dtype=...),
bonds = numpy.recarray(..., shape=(10, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(10,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(10,), dtype=int32),
scale = numpy.recarray(..., shape=(10,), dtype=...)
)
>>> pdb[[0, 5, 7, 9, 10]]
PDBContainer(
atoms = numpy.recarray(..., shape=(5, 76), dtype=...),
bonds = numpy.recarray(..., shape=(5, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(5,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(5,), dtype=int32),
scale = numpy.recarray(..., shape=(5,), dtype=...)
)
Parameters
----------
index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`
An object for slicing arrays along :code:`axis=0`.
Returns
-------
:class:`dataCAT.PDBContainer`
A shallow or deep copy of a slice of this instance.
"""
cls = type(self)
if index is None:
idx: Union[slice, np.ndarray] = slice(None)
else:
try:
idx = int_to_slice(index, len(self)) # type: ignore
except (AttributeError, TypeError):
idx = np.asarray(index) if not isinstance(index, slice) else index
assert getattr(index, 'ndim', 1) == 1
iterator = (ar[idx] for ar in self.values())
ret: ST = cls(*iterator, validate=False) # type: ignore
ret._scale = ret.scale.view(np.recarray)
return ret
[docs] @classmethod
def keys(cls) -> Generator[_AttrName, None, None]:
"""Yield the (public) attribute names in this class.
Examples
--------
.. code:: python
>>> from dataCAT import PDBContainer
>>> for name in PDBContainer.keys():
... print(name)
atoms
bonds
atom_count
bond_count
scale
Yields
------
:class:`str`
The names of all attributes in this class.
"""
return (name.strip('_') for name in cls.__slots__[2:]) # type: ignore
[docs] def values(self) -> Generator[Union[np.ndarray, np.recarray], None, None]:
"""Yield the (public) attributes in this instance.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> for value in pdb.values():
... print(object.__repr__(value)) # doctest: +ELLIPSIS
<numpy.recarray object at ...>
<numpy.recarray object at ...>
<numpy.ndarray object at ...>
<numpy.ndarray object at ...>
<numpy.recarray object at ...>
Yields
------
:class:`str`
The values of all attributes in this instance.
"""
cls = type(self)
return (getattr(self, name) for name in cls.keys())
[docs] def items(self) -> Generator[Tuple[_AttrName, Union[np.ndarray, np.recarray]], None, None]:
"""Yield the (public) attribute name/value pairs in this instance.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> for name, value in pdb.items():
... print(name, '=', object.__repr__(value)) # doctest: +ELLIPSIS
atoms = <numpy.recarray object at ...>
bonds = <numpy.recarray object at ...>
atom_count = <numpy.ndarray object at ...>
bond_count = <numpy.ndarray object at ...>
scale = <numpy.recarray object at ...>
Yields
------
:class:`str` and :class:`numpy.ndarray` / :class:`numpy.recarray`
The names and values of all attributes in this instance.
"""
return ((n, getattr(self, n)) for n in self.keys())
[docs] def concatenate(self: ST, *args: ST) -> ST:
r"""Concatenate :math:`n` PDBContainers into a single new instance.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb1
>>> pdb2 = pdb3 = pdb1
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb1 = PDBContainer(...) # doctest: +SKIP
>>> pdb2 = PDBContainer(...) # doctest: +SKIP
>>> pdb3 = PDBContainer(...) # doctest: +SKIP
>>> print(len(pdb1), len(pdb2), len(pdb3))
23 23 23
>>> pdb_new = pdb1.concatenate(pdb2, pdb3)
>>> print(pdb_new)
PDBContainer(
atoms = numpy.recarray(..., shape=(69, 76), dtype=...),
bonds = numpy.recarray(..., shape=(69, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(69,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(69,), dtype=int32),
scale = numpy.recarray(..., shape=(69,), dtype=...)
)
Parameters
----------
\*args : :class:`PDBContainer`
One or more PDBContainers.
Returns
-------
:class:`PDBContainer`
A new PDBContainer cosntructed by concatenating **self** and **args**.
"""
if not args:
return self
try:
attr_list = [(k, v, [getattr(a, k) for a in args]) for k, v in self.items()]
except AttributeError as ex:
raise TypeError("'*args' expected one or more PDBContainer instances") from ex
cls = type(self)
ret_list = []
for k, ar_self, ar_list in attr_list:
# 'scale': a 1D recarray
if k == 'scale':
dtype = ar_self.dtype
ar_list = [ar.astype(dtype, copy=False) for ar in ar_list]
ar_new = np.concatenate((ar_self, *ar_list)).view(np.recarray)
# 'atom_count' and 'bond_count': two normal 1d arrays
elif ar_self.ndim == 1:
ar_new = np.concatenate((ar_self, *ar_list))
# 'atoms' and 'bonds': two padded 2D recarrays
else:
ax0 = len(ar_self) + sum(len(ar) for ar in ar_list)
ax1 = max(ar_self.shape[1], *(ar.shape[1] for ar in ar_list))
ar_new = np.zeros((ax0, ax1), dtype=cls.DTYPE[k]).view(np.recarray)
i = 0
for ar in chain([ar_self], ar_list):
j = i + len(ar)
ar_new[i:j, :ar.shape[1]] = ar
i = j
ret_list.append(ar_new)
return cls(*ret_list, validate=False) # type: ignore
[docs] @classmethod
def from_molecules(cls: Type[ST], mol_list: Iterable[Molecule],
min_atom: int = 0,
min_bond: int = 0,
scale: Optional[ArrayLike] = None) -> ST:
"""Convert an iterable or sequence of molecules into a new :class:`PDBContainer` instance.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import (
... PDB as pdb,
... MOL_TUPLE as mol_list
... )
.. code:: python
>>> from typing import List
>>> from dataCAT import PDBContainer
>>> from scm.plams import readpdb, Molecule
>>> mol_list: List[Molecule] = [readpdb(...), ...] # doctest: +SKIP
>>> PDBContainer.from_molecules(mol_list)
PDBContainer(
atoms = numpy.recarray(..., shape=(23, 76), dtype=...),
bonds = numpy.recarray(..., shape=(23, 75), dtype=...),
atom_count = numpy.ndarray(..., shape=(23,), dtype=int32),
bond_count = numpy.ndarray(..., shape=(23,), dtype=int32),
scale = numpy.recarray(..., shape=(23,), dtype=...)
)
Parameters
----------
mol_list : :class:`Iterable[Molecule]<typing.Iterable>`
An iterable consisting of PLAMS molecules.
min_atom : :class:`int`
The minimum number of atoms which :attr:`PDBContainer.atoms` should accomodate.
min_bond : :class:`int`
The minimum number of bonds which :attr:`PDBContainer.bonds` should accomodate.
scale : array-like, optional
An array-like object representing an user-specified index.
Defaults to a simple range index if :data:`None` (see :func:`numpy.arange`).
Returns
-------
:class:`dataCAT.PDBContainer`
A pdb container.
"""
try:
mol_count = len(mol_list) # type: ignore
except TypeError:
mol_list = list(mol_list)
mol_count = len(mol_list)
# Parse the index
if scale is None:
idx = np.arange(mol_count, dtype=cls.DTYPE['scale']).view(np.recarray)
else:
idx = np.asarray(scale).view(np.recarray)
if len(idx) != mol_count:
raise ValueError("'mol_list' and 'idx' must be of equal length")
# Gather the shape of the to-be created atom (pdb-file) array
_atom_count = max((len(mol.atoms) for mol in mol_list), default=0)
atom_count = max(_atom_count, min_atom)
atom_shape = mol_count, atom_count
# Gather the shape of the to-be created bond array
_bond_count = max((len(mol.bonds) for mol in mol_list), default=0)
bond_count = max(_bond_count, min_bond)
bond_shape = mol_count, bond_count
# Construct the to-be returned (padded) arrays
DTYPE = cls.DTYPE
atom_array = np.zeros(atom_shape, dtype=DTYPE['atoms']).view(np.recarray)
bond_array = np.zeros(bond_shape, dtype=DTYPE['bonds']).view(np.recarray)
atom_counter = np.empty(mol_count, dtype=DTYPE['atom_count'])
bond_counter = np.empty(mol_count, dtype=DTYPE['bond_count'])
# Fill the to-be returned arrays
for i, mol in enumerate(mol_list):
j_atom = len(mol.atoms)
j_bond = len(mol.bonds)
atom_array[i, :j_atom] = [_get_atom_info(at, k) for k, at in enumerate(mol, 1)]
bond_array[i, :j_bond] = _get_bond_info(mol)
atom_counter[i] = j_atom
bond_counter[i] = j_bond
return cls(
atoms=atom_array, bonds=bond_array,
atom_count=atom_counter, bond_count=bond_counter,
scale=idx, validate=False
)
@overload
def to_molecules(self, index: Union[None, Sequence[int], slice, np.ndarray] = ...,
mol: Optional[Iterable[Optional[Molecule]]] = ...) -> List[Molecule]:
...
@overload # noqa: E301
def to_molecules(self, index: SupportsIndex = ..., mol: Optional[Molecule] = ...) -> Molecule:
...
[docs] def to_molecules(self, index=None, mol=None): # noqa: E301
"""Create a molecule or list of molecules from this instance.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import (
... PDB as pdb,
... MOL_TUPLE as mol_list,
... MOL as mol
... )
An example where one or more new molecules are created.
.. code:: python
>>> from dataCAT import PDBContainer
>>> from scm.plams import Molecule
>>> pdb = PDBContainer(...) # doctest: +SKIP
# Create a single new molecule from `pdb`
>>> pdb.to_molecules(index=0) # doctest: +ELLIPSIS
<scm.plams.mol.molecule.Molecule object at ...>
# Create three new molecules from `pdb`
>>> pdb.to_molecules(index=[0, 1]) # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE
[<scm.plams.mol.molecule.Molecule object at ...>,
<scm.plams.mol.molecule.Molecule object at ...>]
An example where one or more existing molecules are updated in-place.
.. code:: python
# Update `mol` with the info from `pdb`
>>> mol = Molecule(...) # doctest: +SKIP
>>> mol_new = pdb.to_molecules(index=2, mol=mol)
>>> mol is mol_new
True
# Update all molecules in `mol_list` with info from `pdb`
>>> mol_list = [Molecule(...), Molecule(...), Molecule(...)] # doctest: +SKIP
>>> mol_list_new = pdb.to_molecules(index=range(3), mol=mol_list)
>>> for m, m_new in zip(mol_list, mol_list_new):
... print(m is m_new)
True
True
True
Parameters
----------
index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`, optional
An object for slicing the arrays embedded within this instance.
Follows the standard numpy broadcasting rules (*e.g.* :code:`self.atoms[index]`).
If a scalar is provided (*e.g.* an integer) then a single molecule will be returned.
If a sequence, range, slice, *etc.* is provided then
a list of molecules will be returned.
mol : :class:`~scm.plams.mol.molecule.Molecule` or :class:`Iterable[Molecule]<typing.Iterable>`, optional
A molecule or list of molecules.
If one or molecules are provided here then they will be updated in-place.
Returns
-------
:class:`~scm.plams.mol.molecule.Molecule` or :class:`List[Molecule]<typing.List>`
A molecule or list of molecules,
depending on whether or not **index** is a scalar or sequence / slice.
Note that if :data:`mol is not None<None>`, then the-be returned molecules won't be copies.
""" # noqa: E501
if index is None:
i = slice(None)
is_seq = True
else:
try:
i = index.__index__()
is_seq = False
except (AttributeError, TypeError):
i = index
is_seq = True
atoms = self.atoms[i]
bonds = self.bonds[i]
atom_count = self.atom_count[i]
bond_count = self.bond_count[i]
if not is_seq:
return _rec_to_plams(atoms, bonds, atom_count, bond_count, mol)
if mol is None:
mol_list = repeat(None)
elif isinstance(mol, Molecule):
raise TypeError
else:
mol_list = mol
iterator = zip(atoms, bonds, atom_count, bond_count, mol_list)
return [_rec_to_plams(*args) for args in iterator]
@overload
def to_rdkit(
self,
index: None | Sequence[int] | slice | np.ndarray = ...,
sanitize: bool = ...,
) -> List[Chem.Mol]:
...
@overload # noqa: E301
def to_rdkit(
self,
index: SupportsIndex,
sanitize: bool = ...,
) -> Chem.Mol:
...
[docs] def to_rdkit(self, index=None, sanitize=True): # noqa: E301
"""Create an rdkit molecule or list of rdkit molecules from this instance.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb
An example where one or more new molecules are created.
.. code:: python
>>> from dataCAT import PDBContainer
>>> from rdkit.Chem import Mol
>>> pdb = PDBContainer(...) # doctest: +SKIP
# Create a single new molecule from `pdb`
>>> pdb.to_rdkit(index=0) # doctest: +ELLIPSIS
<rdkit.Chem.rdchem.Mol object at ...>
# Create three new molecules from `pdb`
>>> pdb.to_rdkit(index=[0, 1]) # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE
[<rdkit.Chem.rdchem.Mol object at ...>,
<rdkit.Chem.rdchem.Mol object at ...>]
Parameters
----------
index : :class:`int`, :class:`Sequence[int]<Collections.abc.Sequence>` or :class:`slice`, optional
An object for slicing the arrays embedded within this instance.
Follows the standard numpy broadcasting rules (*e.g.* :code:`self.atoms[index]`).
If a scalar is provided (*e.g.* an integer) then a single molecule will be returned.
If a sequence, range, slice, *etc.* is provided then
a list of molecules will be returned.
sanitize : bool
Whether to sanitize the molecule before returning or not.
Returns
-------
:class:`~rdkit.Chem.rdchem.Mol` or :class:`list[Mol]<list>`
A molecule or list of molecules,
depending on whether or not **index** is a scalar or sequence / slice.
""" # noqa: E501
if index is None:
i = slice(None)
is_seq = True
else:
try:
i = index.__index__()
is_seq = False
except (AttributeError, TypeError):
i = index
is_seq = True
atoms = self.atoms[i]
bonds = self.bonds[i]
atom_count = self.atom_count[i]
bond_count = self.bond_count[i]
if not is_seq:
return _rec_to_rdkit(atoms, bonds, atom_count, bond_count, sanitize)
iterator = zip(atoms, bonds, atom_count, bond_count, repeat(sanitize))
return [_rec_to_rdkit(*args) for args in iterator]
@overload
@classmethod
def create_hdf5_group(cls, file: h5py.Group, name: str, *,
scale: h5py.Dataset, **kwargs: Any) -> h5py.Group:
...
@overload # noqa: E301
@classmethod
def create_hdf5_group(cls, file: h5py.Group, name: str, *,
scale_dtype: DTypeLike = ..., **kwargs: Any) -> h5py.Group:
...
[docs] @classmethod # noqa: E301
def create_hdf5_group(cls, file, name, *, scale=None, scale_dtype=None, **kwargs):
r"""Create a h5py Group for storing :class:`dataCAT.PDBContainer` instances.
Notes
-----
The **scale** and **scale_dtype** parameters are mutually exclusive.
Parameters
----------
file : :class:`h5py.File` or :class:`h5py.Group`
The h5py File or Group where the new Group will be created.
name : :class:`str`
The name of the to-be created Group.
Keyword Arguments
-----------------
scale : :class:`h5py.Dataset`, keyword-only
A pre-existing dataset serving as dimensional scale.
See **scale_dtype** to create a new instead instead.
scale_dtype : dtype-like, keyword-only
The datatype of the to-be created dimensional scale.
See **scale** to use a pre-existing dataset for this purpose.
\**kwargs : :data:`~typing.Any`
Further keyword arguments for the creation of each dataset.
Arguments already specified by default are:
``name``, ``shape``, ``maxshape`` and ``dtype``.
Returns
-------
:class:`h5py.Group`
The newly created Group.
"""
if scale is not None and scale_dtype is not None:
raise TypeError("'scale' and 'scale_dtype' cannot be both specified")
cls_name = cls.__name__
# Create the group
grp = file.create_group(name, track_order=True)
grp.attrs['__doc__'] = np.string_(HDF5_DOCSTRING.format(cls_name=cls_name))
# Create the datasets
NDIM = cls.NDIM
DTYPE = cls.DTYPE
key_iter = (k for k in cls.keys() if k != 'scale')
for key in key_iter:
maxshape = NDIM[key] * (None,)
shape = NDIM[key] * (0,)
dtype = DTYPE[key]
dset = grp.create_dataset(key, shape=shape, maxshape=maxshape, dtype=dtype, **kwargs)
dset.attrs['__doc__'] = np.string_(f"A dataset representing `{cls_name}.atoms`.")
# Set the index
scale_name = cls.SCALE_NAME
if scale is not None:
scale_dset = scale
else:
_dtype = scale_dtype if scale_dtype is not None else cls.DTYPE['scale']
scale_dset = grp.create_dataset(scale_name, shape=(0,), maxshape=(None,), dtype=_dtype)
scale_dset.make_scale(scale_name)
# Use the index as a scale
dset_iter = (grp[k] for k in cls.keys() if k != 'scale')
for dset in dset_iter:
dset.dims[0].label = scale_name
dset.dims[0].attach_scale(scale_dset)
grp['atoms'].dims[1].label = 'atoms'
grp['bonds'].dims[1].label = 'bonds'
return grp
[docs] @classmethod
def validate_hdf5(cls, group: h5py.Group) -> None:
"""Validate the passed hdf5 **group**, ensuring it is compatible with :class:`PDBContainer` instances.
An :exc:`AssertionError` will be raise if **group** does not validate.
This method is called automatically when an exception is raised by
:meth:`~PDBContainer.to_hdf5` or :meth:`~PDBContainer.from_hdf5`.
Parameters
----------
group : :class:`h5py.Group`
The to-be validated hdf5 Group.
Raises
------
:exc:`AssertionError`
Raised if the validation process fails.
""" # noqa: E501
if not isinstance(group, h5py.Group):
raise TypeError("'group' expected a h5py.Group; "
f"observed type: {group.__class__.__name__}")
# Check if **group** has all required keys
keys: List[str] = list(cls.keys())
keys[-1] = cls.SCALE_NAME
difference = set(keys) - group.keys()
if difference:
missing_keys = ', '.join(repr(i) for i in difference)
raise AssertionError(f"Missing keys in {group!r}: {missing_keys}")
# Check the dimensionality and dtype of all datasets
len_dict = {}
for key in cls.keys():
dset = group[key] if key != 'scale' else group[cls.SCALE_NAME]
len_dict[key] = len(dset)
assertion.eq(dset.ndim, cls.NDIM[key], message=f"{key!r} ndim mismatch")
if key != 'scale':
assertion.eq(dset.dtype, cls.DTYPE[key], message=f"{key!r} dtype mismatch")
# Check that all datasets are of the same length
if len(set(len_dict.values())) != 1:
raise AssertionError(
f"All datasets in {group!r} should be of the same length.\n"
f"Observed lengths: {len_dict!r}"
)
[docs] @if_exception(validate_hdf5.__func__) # type: ignore
def to_hdf5(self, group: h5py.Group, index: IndexLike, update_scale: bool = True) -> None:
"""Update all datasets in **group** positioned at **index** with its counterpart from **pdb**.
Follows the standard broadcasting rules as employed by h5py.
Important
---------
If **index** is passed as a sequence of integers then, contrary to NumPy,
they *will* have to be sorted.
Parameters
----------
group : :class:`h5py.Group`
The to-be updated h5py group.
index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`
An object for slicing all datasets in **group**.
Note that, contrary to numpy, if a sequence of integers is provided
then they'll have to ordered.
update_scale : :class:`bool`
If :data:`True`, also export :attr:`PDBContainer.scale` to the dimensional
scale in the passed **group**.
"""
# Parse **idx**
if index is None:
idx: slice | np.ndarray = slice(None)
idx_max: int | np.integer = len(self)
else:
try:
idx = int_to_slice(index, len(self)) # type: ignore
idx_max = idx.stop or len(self)
except (AttributeError, TypeError):
if not isinstance(index, slice):
idx = np.asarray(index)
idx_max = idx.max()
assert idx.ndim == 1
assert issubclass(idx.dtype.type, np.integer)
else:
idx = index
idx_max = idx.stop or len(self)
# Update the length of all groups
scale_name = self.SCALE_NAME
scale = group['atoms'].dims[0][scale_name]
if len(scale) < idx_max:
scale.resize(idx_max, axis=0)
self._update_hdf5_shape(group)
# Update the datasets
items = ((name, ar) for name, ar in self.items() if name != 'scale')
for name, ar in items:
dataset = group[name]
if ar.ndim == 1:
dataset[idx] = ar
else:
_, j = ar.shape
dataset[idx, :j] = ar
# Update the index
if update_scale:
scale[idx] = self.scale
def _update_hdf5_shape(self, group: h5py.Group) -> None:
"""Update the shape of all datasets in **group** such that it can accommodate **pdb**.
The length of all datasets will be set equal to the ``index`` dimensional cale.
This method is automatically called by :meth:`PDBContainer.update_hdf5` when required.
Parameters
----------
group : :class:`h5py.Group`
The h5py Group with the to-be reshaped datasets.
pdb : :class:`dataCAT.PDBContainer`
The pdb container for updating **group**.
:rtype: :data:`None`
""" # noqa: E501
# Get the length of the dimensional scale
scale_name = self.SCALE_NAME
idx = group['atoms'].dims[0][scale_name]
idx_len = len(idx)
items = ((name, ar) for name, ar in self.items() if name != 'scale')
for name, ar in items:
dataset = group[name]
# Identify the new shape of all datasets
shape = np.fromiter(dataset.shape, dtype=int)
shape[0] = idx_len
if ar.ndim == 2:
shape[1] = max(shape[1], ar.shape[1])
# Set the new shape
dataset.shape = shape
[docs] @classmethod
@if_exception(validate_hdf5.__func__) # type: ignore
def from_hdf5(cls: Type[ST], group: h5py.Group, index: IndexLike = None) -> ST:
"""Construct a new PDBContainer from the passed hdf5 **group**.
Parameters
----------
group : :class:`h5py.Group`
The to-be read h5py group.
index : :class:`int`, :class:`Sequence[int]<typing.Sequence>` or :class:`slice`, optional
An object for slicing all datasets in **group**.
Returns
-------
:class:`dataCAT.PDBContainer`
A new PDBContainer constructed from **group**.
"""
if index is None:
idx: Union[slice, np.ndarray] = slice(None)
else:
try:
idx = int_to_slice(index, len(group['atom_count'])) # type: ignore
except (AttributeError, TypeError):
idx = np.asarray(index) if not isinstance(index, slice) else index
assert getattr(idx, 'ndim', 1) == 1
scale_name = cls.SCALE_NAME
return cls(
atoms=group['atoms'][idx].view(np.recarray),
bonds=group['bonds'][idx].view(np.recarray),
atom_count=group['atom_count'][idx],
bond_count=group['bond_count'][idx],
scale=group[scale_name][idx].view(np.recarray),
validate=False
)
[docs] def intersection(self: ST, value: Union[ST, ArrayLike]) -> ST:
"""Construct a new PDBContainer by the intersection of **self** and **value**.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb
An example where one or more new molecules are created.
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> print(pdb.scale)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22]
>>> pdb_new = pdb.intersection(range(4))
>>> print(pdb_new.scale)
[0 1 2 3]
Parameters
----------
value : :class:`PDBContainer` or array-like
Another PDBContainer or an array-like object representing :attr:`PDBContainer.scale`.
Note that both **value** and **self.scale** should consist of unique elements.
Returns
-------
:class:`PDBContainer`
A new instance by intersecting :attr:`self.scale<PDBContainer.scale>` and **value**.
See Also
--------
:meth:`set.intersection<frozenset.intersection>`
Return the intersection of two sets as a new set.
"""
idx1, idx2 = self._get_index(value)
_, i, _ = np.intersect1d(idx1, idx2, assume_unique=True, return_indices=True)
return self[i]
[docs] def difference(self: ST, value: Union[ST, ArrayLike]) -> ST:
"""Construct a new PDBContainer by the difference of **self** and **value**.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import PDB as pdb
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> print(pdb.scale)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22]
>>> pdb_new = pdb.difference(range(10, 30))
>>> print(pdb_new.scale)
[0 1 2 3 4 5 6 7 8 9]
Parameters
----------
value : :class:`PDBContainer` or array-like
Another PDBContainer or an array-like object representing :attr:`PDBContainer.scale`.
Note that both **value** and **self.scale** should consist of unique elements.
Returns
-------
:class:`PDBContainer`
A new instance as the difference of :attr:`self.scale<PDBContainer.scale>`
and **value**.
See Also
--------
:meth:`set.difference<frozenset.difference>`
Return the difference of two or more sets as a new set.
"""
idx1, idx2 = self._get_index(value)
i = np.in1d(idx1, idx2, assume_unique=True, invert=True)
return self[i]
[docs] def symmetric_difference(self: ST, value: ST) -> ST:
"""Construct a new PDBContainer by the symmetric difference of **self** and **value**.
Examples
--------
.. testsetup:: python
>>> import numpy as np
>>> from dataCAT.testing_utils import PDB as pdb
>>> from dataCAT import PDBContainer
>>> a = np.arange(10, 30)
>>> b = a.reshape(len(a), 1)
>>> pdb2 = PDBContainer(b, b, a, a, a, validate=False)
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> pdb2 = PDBContainer(..., scale=range(10, 30)) # doctest: +SKIP
>>> print(pdb.scale)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22]
>>> pdb_new = pdb.symmetric_difference(pdb2)
>>> print(pdb_new.scale)
[ 0 1 2 3 4 5 6 7 8 9 23 24 25 26 27 28 29]
Parameters
----------
value : :class:`PDBContainer`
Another PDBContainer.
Note that both **value.scale** and **self.scale** should consist of unique elements.
Returns
-------
:class:`PDBContainer`
A new instance as the symmetric difference of :attr:`self.scale<PDBContainer.scale>`
and **value**.
See Also
--------
:meth:`set.symmetric_difference<frozenset.symmetric_difference>`
Return the symmetric difference of two sets as a new set.
"""
pdb1 = self.difference(value)
pdb2 = value.difference(self)
return pdb1.concatenate(pdb2)
[docs] def union(self: ST, value: ST) -> ST:
"""Construct a new PDBContainer by the union of **self** and **value**.
Examples
--------
.. testsetup:: python
>>> import numpy as np
>>> from dataCAT.testing_utils import PDB as pdb
>>> from dataCAT import PDBContainer
>>> a = np.arange(10, 30)
>>> b = a.reshape(len(a), 1)
>>> pdb2 = PDBContainer(b, b, a, a, a, validate=False)
.. code:: python
>>> from dataCAT import PDBContainer
>>> pdb = PDBContainer(...) # doctest: +SKIP
>>> pdb2 = PDBContainer(..., scale=range(10, 30)) # doctest: +SKIP
>>> print(pdb.scale)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22]
>>> pdb_new = pdb.union(pdb2)
>>> print(pdb_new.scale)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29]
Parameters
----------
value : :class:`PDBContainer`
Another PDBContainer.
Note that both **value** and **self.scale** should consist of unique elements.
Returns
-------
:class:`PDBContainer`
A new instance as the union of :attr:`self.index<PDBContainer.index>` and **value**.
See Also
--------
:meth:`set.union<frozenset.union>`
Return the union of sets as a new set.
"""
cls = type(self)
if not isinstance(value, cls):
raise TypeError(f"'value' expected a {cls.__name__} instance; "
f"observed type: {value.__class__.__name__}")
idx1 = self.scale
idx2 = value.scale.astype(idx1.dtype, copy=False)
i = np.in1d(idx2, idx1, assume_unique=True, invert=True)
ret = value[i]
return self.concatenate(ret)
def _get_index(self: ST, value: Union[ST, ArrayLike]) -> Tuple[np.recarray, np.ndarray]:
"""Parse and return the :attr:`~PDBContainer.scale` of **self** and **value**."""
cls = type(self)
scale1 = self.scale
_scale2 = value.scale if isinstance(value, cls) else value
scale2 = np.asarray(_scale2, dtype=scale1.dtype)
return scale1, scale2