Source code for dataCAT.hdf5_log

"""A module related to logging and hdf5.

Index
-----
.. currentmodule:: dataCAT
.. autosummary::
    create_hdf5_log
    update_hdf5_log
    reset_hdf5_log
    log_to_dataframe

API
---
.. autofunction:: create_hdf5_log
.. autofunction:: update_hdf5_log
.. autofunction:: reset_hdf5_log
.. autofunction:: log_to_dataframe

"""

from __future__ import annotations

from typing import Sequence, Tuple, Optional, Any, TYPE_CHECKING
from datetime import datetime

import h5py
import numpy as np
import pandas as pd

from . import CAT_VERSION, NANOCAT_VERSION, DATACAT_VERSION
from .dtype import DT_DTYPE, VERSION_DTYPE, MSG_DTYPE, INDEX_DTYPE

if TYPE_CHECKING:
    from numpy.typing import ArrayLike

__all__ = [
    'create_hdf5_log', 'update_hdf5_log', 'reset_hdf5_log', 'log_to_dataframe'
]

_VERSION = np.array([CAT_VERSION, NANOCAT_VERSION, DATACAT_VERSION], dtype=VERSION_DTYPE)  # type: ignore  # noqa: E501
_VERSION.setflags(write=False)

_VERSION_NAMES = np.array(['CAT', 'Nano-CAT', 'Data-CAT'], dtype=np.string_)
_VERSION_NAMES.setflags(write=False)
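
# NOTE: the two arrays above serve as the default ``version_values`` and
# ``version_names`` arguments of the functions below; both are flagged as
# read-only so the shared defaults cannot be mutated in place.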


LOG_DOC = """A h5py Group for logging database modifications.

Attributes
----------
date : dataset
    A dataset denoting the dates and times at which the database was modified.
    Used as dimensional scale for :code:`group['index'].dims[0]`,
    :code:`group['message'].dims[0]` and :code:`group['version'].dims[0]`.
version : dataset
    A dataset keeping track of (user-specified) package versions.
version_names : dataset
    A dataset with the names of the packages whose versions are displayed in **version**.
    Used as dimensional scale for :code:`group['version'].dims[1]`.
message : dataset
    A dataset holding user-specified modification messages.
index : dataset
    A dataset with the indices of which elements in the database were modified.

n : attribute
    An attribute with the index of the next dataset element to be set.
n_step : attribute
    An attribute with the increment by which the length of each dataset is
    increased in the case of :code:`n >= len(dataset)`.
    Only relevant when :code:`clear_when_full = False`.
clear_when_full : attribute
    An attribute specifying whether or not to delete and recreate the datasets
    once they are full.
    Otherwise their length will be increased by **n_step**.
date_created : attribute
    An attribute with the date and time from when this logger was created.
version_created : attribute
    An attribute with the versions of a set of user-specified packages from when
    this logger was created.

"""


def _get_now() -> np.recarray:
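    """Return the current date and time as a recarray with dtype ``DT_DTYPE``."""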
    now = datetime.now()
    tup = tuple(getattr(now, k) for k in DT_DTYPE.fields.keys())  # type: ignore[union-attr]
    return np.rec.array(tup, dtype=DT_DTYPE)


def create_hdf5_log(
    file: h5py.Group,
    n_entries: int = 100,
    clear_when_full: bool = False,
    version_names: Sequence[str] | Sequence[bytes] | np.ndarray = _VERSION_NAMES,
    version_values: Sequence[Tuple[int, int, int]] | np.ndarray = _VERSION,
    **kwargs: Any,
) -> h5py.Group:
    r"""Create a hdf5 group for logging database modifications.

    The logger Group consists of five main datasets:

    * ``"date"``: Denotes dates and times for when the database is modified.
    * ``"version"``: Denotes user-specified package versions for when the database is modified.
    * ``"version_names"``: See the **version_names** parameter.
    * ``"message"``: Holds user-specified modification messages.
    * ``"index"``: Denotes indices of which elements in the database were modified.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from dataCAT.testing_utils import HDF5_TMP as hdf5_file

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    .. code:: python

        >>> import h5py
        >>> from dataCAT import create_hdf5_log

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'a') as f:
        ...     group = create_hdf5_log(f)
        ...
        ...     print('group', '=', group)
        ...     for name, dset in group.items():
        ...         print(f'group[{name!r}]', '=', dset)
        group = <HDF5 group "/logger" (5 members)>
        group['date'] = <HDF5 dataset "date": shape (100,), type "|V11">
        group['version'] = <HDF5 dataset "version": shape (100, 3), type "|V3">
        group['version_names'] = <HDF5 dataset "version_names": shape (3,), type "|S8">
        group['message'] = <HDF5 dataset "message": shape (100,), type "|O">
        group['index'] = <HDF5 dataset "index": shape (100,), type "|O">

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    Parameters
    ----------
    file : :class:`h5py.File` or :class:`h5py.Group`
        The File or Group where the logger should be created.
    n_entries : :class:`int`
        The initial number of entries in each to-be created dataset.
        In addition, every time the datasets run out of available slots their
        length will be increased by this number
        (assuming :data:`clear_when_full = False<False>`).
    clear_when_full : :class:`bool`
        If :data:`True`, delete the logger and create a new one whenever it is full.
        Increase the size of each dataset by **n_entries** otherwise.
    version_names : :class:`Sequence[str or bytes]<typing.Sequence>`
        A sequence consisting of strings and/or bytes representing the
        names of the to-be stored package versions.
        Should be of the same length as **version_values**.
    version_values : :class:`Sequence[Tuple[int, int, int]]<typing.Sequence>`
        A sequence with 3-tuples, each tuple representing a package version
        associated with its respective counterpart in **version_names**.
    \**kwargs : :data:`~Any`
        Further keyword arguments for the h5py :meth:`~h5py.Group.create_dataset` function.

    Returns
    -------
    :class:`h5py.Group`
        The newly created ``"logger"`` group.

    """
    m = len(version_values)
    if n_entries < 1:
        raise ValueError(f"'n_entries' must be larger than or equal to 1; "
                         f"observed value: {n_entries!r}")
    elif m < 1:
        raise ValueError(f"'version_values' must contain at least one version; "
                         f"observed value: {version_values!r}")

    # Set attributes
    grp = file.create_group('logger', track_order=True)
    grp.attrs['__doc__'] = np.string_(LOG_DOC)
    grp.attrs['n'] = 0
    grp.attrs['n_step'] = n_entries
    grp.attrs['clear_when_full'] = clear_when_full
    grp.attrs['date_created'] = _get_now()
    grp.attrs['version_created'] = np.asarray(version_values, dtype=VERSION_DTYPE)

    # Set the datasets
    shape1 = (n_entries,)
    shape2 = (n_entries, m)
    data = np.asarray(version_names, dtype=np.string_)
    scale1 = grp.create_dataset('date', shape=shape1, maxshape=(None,), dtype=DT_DTYPE, chunks=shape1, **kwargs)  # noqa: E501
    grp.create_dataset('version', shape=shape2, maxshape=(None, m), dtype=VERSION_DTYPE, chunks=shape2, **kwargs)  # noqa: E501
    scale2 = grp.create_dataset('version_names', data=data, shape=(m,), dtype=data.dtype, **kwargs)
    grp.create_dataset('message', shape=shape1, maxshape=(None,), dtype=MSG_DTYPE, chunks=shape1, **kwargs)  # noqa: E501
    grp.create_dataset('index', shape=shape1, maxshape=(None,), dtype=INDEX_DTYPE, chunks=shape1, **kwargs)  # noqa: E501

    # Set dataset scales
    scale1.make_scale('date')
    grp['version'].dims[0].label = 'date'
    grp['version'].dims[0].attach_scale(scale1)
    grp['index'].dims[0].label = 'date'
    grp['index'].dims[0].attach_scale(scale1)
    grp['message'].dims[0].label = 'date'
    grp['message'].dims[0].attach_scale(scale1)

    scale2.make_scale('version_names')
    grp['version'].dims[1].label = 'version_names'
    grp['version'].dims[1].attach_scale(scale2)
    return grp


def update_hdf5_log(
    group: h5py.Group,
    index: ArrayLike,
    message: Optional[str] = None,
    version_values: Sequence[Tuple[int, int, int]] | np.ndarray = _VERSION,
) -> None:
    r"""Add a new entry to the hdf5 logger in **group**.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from shutil import copyfile
        >>> from dataCAT.testing_utils import HDF5_READ, HDF5_TMP as hdf5_file

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)
        >>> _ = copyfile(HDF5_READ, hdf5_file)

    .. code:: python

        >>> from datetime import datetime

        >>> import h5py
        >>> from dataCAT import update_hdf5_log

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'r+') as f:
        ...     group = f['ligand/logger']
        ...
        ...     n = group.attrs['n']
        ...     date_before = group['date'][n]
        ...     index_before = group['index'][n]
        ...
        ...     update_hdf5_log(group, index=[0, 1, 2, 3], message='append')
        ...     date_after = group['date'][n]
        ...     index_after = group['index'][n]

        >>> print(index_before, index_after, sep='\n')
        []
        [0 1 2 3]

        >>> print(date_before, date_after, sep='\n')  # doctest: +SKIP
        (0, 0, 0, 0, 0, 0, 0)
        (2020, 6, 24, 16, 33, 7, 959888)

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    Parameters
    ----------
    group : :class:`h5py.Group`
        The ``logger`` Group.
    index : array-like
        An integer or boolean array with the indices of the
        (to-be logged) updated elements.
    message : :class:`str`, optional
        An optional user-specified modification message.
    version_values : :class:`Sequence[Tuple[int, int, int]]<typing.Sequence>`
        A sequence with 3-tuples representing to-be updated package versions.


    :rtype: :data:`None`

    """
    n = group.attrs['n']
    n_max = len(group['date'])

    # The datasets are full; either clear them or grow them by *n_step*
    if n >= n_max:
        if group.attrs['clear_when_full']:
            group = reset_hdf5_log(group, version_values)
            n = 0
        else:
            n_max += group.attrs['n_step']
            group['date'].resize(n_max, axis=0)
            group['version'].resize(n_max, axis=0)
            group['index'].resize(n_max, axis=0)
            group['message'].resize(n_max, axis=0)

    # Parse the passed **index**
    idx = np.array(index, ndmin=1, copy=False)
    generic = idx.dtype.type
    if idx.ndim > 1:
        raise ValueError("The dimensionality of 'index' should be <= 1; "
                         f"observed dimensionality: {idx.ndim!r}")
    elif not idx.size:
        # Ensure empty arrays have the proper dtype before storage
        idx = idx.astype(INDEX_DTYPE)

    if issubclass(generic, np.bool_):
        # Convert boolean masks into integer index arrays
        idx, *_ = idx.nonzero()
    elif not issubclass(generic, np.integer):
        raise TypeError("'index' expected an integer or boolean array; "
                        f"observed dtype: {idx.dtype!r}")

    # Update the datasets
    group['date'][n] = _get_now()
    group['version'][n] = version_values
    group['index'][n] = idx
    if message is not None:
        group['message'][n] = message
    group.attrs['n'] += 1
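
# Boolean masks are also accepted by ``update_hdf5_log``; a minimal,
# hypothetical sketch (``group`` is assumed to be an open logger Group):
#
#     mask = np.array([True, False, True])
#     update_hdf5_log(group, index=mask, message='masked update')
#     # equivalent to ``update_hdf5_log(group, index=[0, 2], ...)``,
#     # as boolean arrays are converted via ``mask.nonzero()``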


def reset_hdf5_log(
    group: h5py.Group,
    version_values: Sequence[Tuple[int, int, int]] | np.ndarray = _VERSION,
) -> h5py.Group:
    r"""Clear and reset the passed ``logger`` Group.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from shutil import copyfile
        >>> from dataCAT.testing_utils import HDF5_READ, HDF5_TMP as hdf5_file

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)
        >>> _ = copyfile(HDF5_READ, hdf5_file)

    .. code:: python

        >>> import h5py
        >>> from dataCAT import reset_hdf5_log

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'r+') as f:
        ...     group = f['ligand/logger']
        ...     print('before:')
        ...     print(group.attrs['n'])
        ...
        ...     group = reset_hdf5_log(group)
        ...     print('\nafter:')
        ...     print(group.attrs['n'])
        before:
        2
        <BLANKLINE>
        after:
        0

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    Parameters
    ----------
    group : :class:`h5py.Group`
        The ``logger`` Group.
    version_values : :class:`Sequence[Tuple[int, int, int]]<typing.Sequence>`
        A sequence with 3-tuples representing to-be updated package versions.

    Returns
    -------
    :class:`h5py.Group`
        The newly (re-)created ``"logger"`` group.

    """
    # Extract the old settings before deleting the group
    version_names = group['version_names'][:]
    n_entries = group.attrs['n_step']
    clear_when_full = group.attrs['clear_when_full']

    # Delete the old logger and create a fresh one in its place
    parent = group.parent
    file = group.file
    del file[group.name]
    return create_hdf5_log(parent, n_entries, clear_when_full,
                           version_names, version_values)


def log_to_dataframe(group: h5py.Group) -> pd.DataFrame:
    """Export the log embedded within **group** to a Pandas DataFrame.

    Examples
    --------
    .. testsetup:: python

        >>> from dataCAT.testing_utils import HDF5_READ as hdf5_file

    .. code:: python

        >>> import h5py
        >>> from dataCAT import log_to_dataframe

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'r') as f:
        ...     group = f['ligand/logger']
        ...     df = log_to_dataframe(group)
        ...     print(df)  # doctest: +NORMALIZE_WHITESPACE
                                     CAT             ... Data-CAT message              index
                                   major minor micro ...    micro
        date                                         ...
        2020-06-24 15:28:09.861074     0     9     6 ...        1  update                [0]
        2020-06-24 15:56:18.971201     0     9     6 ...        1  append [1, 2, 3, 4, 5, 6]
        <BLANKLINE>
        [2 rows x 11 columns]

    Parameters
    ----------
    group : :class:`h5py.Group`
        The ``logger`` Group.

    Returns
    -------
    :class:`pandas.DataFrame`
        A DataFrame containing the content of **group**.

    """  # noqa: E501
    n = group.attrs['n']

    # Prepare the columns
    _columns = group['version_names'][:].astype(str)
    columns = pd.MultiIndex.from_product([_columns, group['version'].dtype.names])

    # In case the datasets are empty
    if not n:
        index = pd.Index([], dtype='datetime64[ns]', name='date')
        df = pd.DataFrame(columns=columns, index=index, dtype='int8')
        df[('message', '')] = np.array([], dtype=str)
        df[('index', '')] = np.array([], dtype=object)
        return df

    # Prepare the index
    date = group['date'][:n]
    _index = np.fromiter(
        (datetime(*i) for i in date), count=len(date), dtype='datetime64[us]'
    )
    index = pd.Index(_index, dtype='datetime64[ns]', name='date')

    # Construct and return the DataFrame
    data = group['version'][:n].view('int8')
    df = pd.DataFrame(data, index=index, columns=columns)
    df[('message', '')] = group['message'][:n].astype(str)
    df[('index', '')] = group['index'][:n]
    return df
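
# A minimal, hypothetical usage sketch: the DataFrame returned by
# ``log_to_dataframe`` is indexed by modification date, so entries can be
# selected with ordinary pandas slicing (the date below is a placeholder):
#
#     df = log_to_dataframe(group)
#     recent = df.loc['2020-06-24':]   # all entries from a given date onwards
#     messages = df[('message', '')]   # the per-entry modification messages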