"""A module related to logging and hdf5.
Index
-----
.. currentmodule:: dataCAT
.. autosummary::
create_hdf5_log
update_hdf5_log
reset_hdf5_log
log_to_dataframe
API
---
.. autofunction:: create_hdf5_log
.. autofunction:: update_hdf5_log
.. autofunction:: reset_hdf5_log
.. autofunction:: log_to_dataframe
"""
from __future__ import annotations
from typing import Sequence, Tuple, Optional, Any, TYPE_CHECKING
from datetime import datetime
import h5py
import numpy as np
import pandas as pd
from . import CAT_VERSION, NANOCAT_VERSION, DATACAT_VERSION
from .dtype import DT_DTYPE, VERSION_DTYPE, MSG_DTYPE, INDEX_DTYPE
if TYPE_CHECKING:
from numpy.typing import ArrayLike
# Public API of this module.
__all__ = [
    'create_hdf5_log', 'update_hdf5_log', 'reset_hdf5_log', 'log_to_dataframe'
]

# Default package versions logged alongside each database modification.
# Marked read-only so the shared module-level default cannot be mutated by accident.
_VERSION = np.array([CAT_VERSION, NANOCAT_VERSION, DATACAT_VERSION], dtype=VERSION_DTYPE)  # type: ignore # noqa: E501
_VERSION.setflags(write=False)

# Names of the packages in ``_VERSION``, as fixed-width byte strings.
# NOTE: ``np.string_`` was removed in NumPy 2.0; ``np.bytes_`` is the
# equivalent alias available in both NumPy 1.x and 2.x.
_VERSION_NAMES = np.array(['CAT', 'Nano-CAT', 'Data-CAT'], dtype=np.bytes_)
_VERSION_NAMES.setflags(write=False)
LOG_DOC = """A h5py Group for logging database modifications.
Attributes
----------
date : dataset
A dataset for denoting dates and times when the database was modified.
Used as dimensional scale for :code:`group['index'].dims[0]` and
:code:`group['version'].dims[0]`.
version : dataset
A dataset keeping track of (user-specified) package versions.
version_names : dataset
A dataset with the names of the packages whose versions are displayed in **version**.
Used as dimensional scale for :code:`group['version'].dims[1]`.
message : dataset
A dataset holding user-specified modification messages.
index : dataset
A dataset with the indices of which elements in the database were modified.
n : attribute
An attribute with the index of the next to-be set dataset element.
n_step : attribute
An attribute with the increment in which the length of each dataset should be
increased in the case of :code:`n >= len(dataset)`.
Only relevant when :code:`clear_when_full = False`.
clear_when_full : :class:`bool`
Whether or not to delete and recreate the dataset when it's full.
Otherwise its length be increased by **n_step**.
date_created : attribute
An attribute with the date and time from when this logger was created.
version_created : attribute
An attribute with the versions of a set of user-specified packages from when
this logger was created.
"""
def _get_now() -> np.recarray:
    """Return the current date and time as a record array of dtype ``DT_DTYPE``."""
    timestamp = datetime.now()
    # Pull the datetime attributes in the exact field order declared by DT_DTYPE.
    fields = tuple(getattr(timestamp, name) for name in DT_DTYPE.fields)  # type: ignore[union-attr]
    return np.rec.array(fields, dtype=DT_DTYPE)
def create_hdf5_log(file: h5py.Group,
                    n_entries: int = 100,
                    clear_when_full: bool = False,
                    version_names: Sequence[str] | Sequence[bytes] | np.ndarray = _VERSION_NAMES,
                    version_values: Sequence[Tuple[int, int, int]] | np.ndarray = _VERSION,
                    **kwargs: Any) -> h5py.Group:
    r"""Create a hdf5 group for logging database modifications.

    The logger Group consists of five main datasets:

    * ``"date"``: Denotes dates and times for when the database is modified.
    * ``"version"``: Denotes user-specified package versions for when the database is modified.
    * ``"version_names"`` : See the **version_names** parameter.
    * ``"message"``: Holds user-specified modification messages.
    * ``"index"``: Denotes indices of which elements in the database were modified.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from dataCAT.testing_utils import HDF5_TMP as hdf5_file

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    .. code:: python

        >>> import h5py
        >>> from dataCAT import create_hdf5_log

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'a') as f:
        ...     group = create_hdf5_log(f)
        ...
        ...     print('group', '=', group)
        ...     for name, dset in group.items():
        ...         print(f'group[{name!r}]', '=', dset)
        group = <HDF5 group "/logger" (5 members)>
        group['date'] = <HDF5 dataset "date": shape (100,), type "|V11">
        group['version'] = <HDF5 dataset "version": shape (100, 3), type "|V3">
        group['version_names'] = <HDF5 dataset "version_names": shape (3,), type "|S8">
        group['message'] = <HDF5 dataset "message": shape (100,), type "|O">
        group['index'] = <HDF5 dataset "index": shape (100,), type "|O">

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    Parameters
    ----------
    file : :class:`h5py.File` or :class:`h5py.Group`
        The File or Group where the logger should be created.
    n_entries : :class:`int`
        The initial number of entries in each to-be created dataset.
        In addition, everytime the datasets run out of available slots their length
        will be increased by this number (assuming :data:`clear_when_full = False<False>`).
    clear_when_full : :class:`bool`
        If :data:`True`, delete the logger and create a new one whenever it is full.
        Increase the size of each dataset by **n_entries** otherwise.
    version_names : :class:`Sequence[str or bytes]<typing.Sequence>`
        A sequence consisting of strings and/or bytes representing the
        names of the to-be stored package versions.
        Should be of the same length as **version_values**.
    version_values : :class:`Sequence[Tuple[int, int, int]]<typing.Sequence>`
        A sequence with 3-tuples, each tuple representing a package version associated with
        its respective counterpart in **version_names**.
    \**kwargs : :data:`~Any`
        Further keyword arguments for the h5py :meth:`~h5py.Group.create_dataset` function.

    Returns
    -------
    :class:`h5py.Group`
        The newly created ``"logger"`` group.

    Raises
    ------
    :exc:`ValueError`
        If **n_entries** is smaller than 1 or **version_values** is empty.

    """
    m = len(version_values)
    if n_entries < 1:
        # The check is ``< 1``: a value of exactly 1 is valid.
        raise ValueError(f"'n_entries' must be larger than or equal to 1; observed value: {n_entries!r}")  # noqa: E501
    elif m < 1:
        raise ValueError(f"'version_values' must contain at least 1 element; observed value: {version_values!r}")  # noqa: E501

    # Set attributes
    grp = file.create_group('logger', track_order=True)
    grp.attrs['__doc__'] = np.bytes_(LOG_DOC)  # ``np.string_`` was removed in NumPy 2.0
    grp.attrs['n'] = 0  # Index of the next to-be written log entry
    grp.attrs['n_step'] = n_entries
    grp.attrs['clear_when_full'] = clear_when_full
    grp.attrs['date_created'] = _get_now()
    grp.attrs['version_created'] = np.asarray(version_values, dtype=VERSION_DTYPE)

    # Set the datasets; all length-``n_entries`` datasets are resizable along axis 0
    shape1 = (n_entries,)
    shape2 = (n_entries, m)
    data = np.asarray(version_names, dtype=np.bytes_)
    scale1 = grp.create_dataset('date', shape=shape1, maxshape=(None,), dtype=DT_DTYPE, chunks=shape1, **kwargs)  # noqa: E501
    grp.create_dataset('version', shape=shape2, maxshape=(None, m), dtype=VERSION_DTYPE, chunks=shape2, **kwargs)  # noqa: E501
    scale2 = grp.create_dataset('version_names', data=data, shape=(m,), dtype=data.dtype, **kwargs)
    grp.create_dataset('message', shape=shape1, maxshape=(None,), dtype=MSG_DTYPE, chunks=shape1, **kwargs)  # noqa: E501
    grp.create_dataset('index', shape=shape1, maxshape=(None,), dtype=INDEX_DTYPE, chunks=shape1, **kwargs)  # noqa: E501

    # Attach 'date' as the dimensional scale of every per-entry dataset
    scale1.make_scale('date')
    for name in ('version', 'index', 'message'):
        grp[name].dims[0].label = 'date'
        grp[name].dims[0].attach_scale(scale1)

    # Attach 'version_names' as the dimensional scale of the per-package axis
    scale2.make_scale('version_names')
    grp['version'].dims[1].label = 'version_names'
    grp['version'].dims[1].attach_scale(scale2)
    return grp
def update_hdf5_log(
    group: h5py.Group,
    index: ArrayLike,
    message: Optional[str] = None,
    version_values: Sequence[Tuple[int, int, int]] | np.ndarray = _VERSION,
) -> None:
    r"""Add a new entry to the hdf5 logger in **group**.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from shutil import copyfile
        >>> from dataCAT.testing_utils import HDF5_READ, HDF5_TMP as hdf5_file

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)
        >>> _ = copyfile(HDF5_READ, hdf5_file)

    .. code:: python

        >>> from datetime import datetime
        >>> import h5py
        >>> from dataCAT import update_hdf5_log

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'r+') as f:
        ...     group = f['ligand/logger']
        ...
        ...     n = group.attrs['n']
        ...     date_before = group['date'][n]
        ...     index_before = group['index'][n]
        ...
        ...     update_hdf5_log(group, index=[0, 1, 2, 3], message='append')
        ...     date_after = group['date'][n]
        ...     index_after = group['index'][n]

        >>> print(index_before, index_after, sep='\n')
        []
        [0 1 2 3]

        >>> print(date_before, date_after, sep='\n')  # doctest: +SKIP
        (0, 0, 0, 0, 0, 0, 0)
        (2020, 6, 24, 16, 33, 7, 959888)

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    Parameters
    ----------
    group : :class:`h5py.Group`
        The ``logger`` Group.
    index : :class:`numpy.ndarray`
        An integer or boolean array-like with the indices of
        (to-be logged) updated elements.
    message : :class:`str`, optional
        An optional user-specified modification message.
    version_values : :class:`Sequence[Tuple[int, int, int]]<typing.Sequence>`
        A sequence with 3-tuples representing to-be updated package versions.

    :rtype: :data:`None`

    """
    n = group.attrs['n']
    n_max = len(group['date'])

    # Make room for the new entry whenever the datasets are full:
    # either recreate the logger from scratch or grow each dataset by *n_step*
    if n >= n_max:
        if group.attrs['clear_when_full']:
            group = reset_hdf5_log(group, version_values)
            n = 0
        else:
            n_max += group.attrs['n_step']
            group['date'].resize(n_max, axis=0)
            group['version'].resize(n_max, axis=0)
            group['index'].resize(n_max, axis=0)
            group['message'].resize(n_max, axis=0)

    # Parse the passed **index**.
    # NOTE: ``np.array(..., copy=False)`` raises in NumPy >= 2.0 whenever a copy
    # is unavoidable; ``np.atleast_1d(np.asarray(...))`` keeps the old
    # copy-only-if-needed semantics on both NumPy 1.x and 2.x, and guarantees
    # ``idx.ndim >= 1`` (making a separate 0-dimensional branch unnecessary).
    idx = np.atleast_1d(np.asarray(index))
    generic = idx.dtype.type
    if idx.ndim > 1:
        raise ValueError("The dimensionality of 'index' should be <= 1; "
                         f"observed dimensionality: {idx.ndim!r}")

    if issubclass(generic, np.bool_):
        idx, *_ = idx.nonzero()  # Convert a boolean mask into integer indices
    elif not issubclass(generic, np.integer):
        raise TypeError("'idx' expected an integer or boolean array; "
                        f"observed dtype: {idx.dtype!r}")

    # Update the datasets
    group['date'][n] = _get_now()
    group['version'][n] = version_values
    group['index'][n] = idx
    if message is not None:
        group['message'][n] = message
    group.attrs['n'] += 1
def reset_hdf5_log(
    group: h5py.Group,
    version_values: Sequence[Tuple[int, int, int]] | np.ndarray = _VERSION,
) -> h5py.Group:
    r"""Clear and reset the passed ``logger`` Group.

    Examples
    --------
    .. testsetup:: python

        >>> import os
        >>> from shutil import copyfile
        >>> from dataCAT.testing_utils import HDF5_READ, HDF5_TMP as hdf5_file

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)
        >>> _ = copyfile(HDF5_READ, hdf5_file)

    .. code:: python

        >>> import h5py
        >>> from dataCAT import reset_hdf5_log

        >>> hdf5_file = str(...)  # doctest: +SKIP

        >>> with h5py.File(hdf5_file, 'r+') as f:
        ...     group = f['ligand/logger']
        ...     print('before:')
        ...     print(group.attrs['n'])
        ...
        ...     group = reset_hdf5_log(group)
        ...     print('\nafter:')
        ...     print(group.attrs['n'])
        before:
        2
        <BLANKLINE>
        after:
        0

    .. testcleanup:: python

        >>> if os.path.isfile(hdf5_file):
        ...     os.remove(hdf5_file)

    Parameters
    ----------
    group : :class:`h5py.File` or :class:`h5py.Group`
        The ``logger`` Group.
    version_values : :class:`Sequence[Tuple[int, int, int]]<typing.Sequence>`
        A sequence with 3-tuples representing to-be updated package versions.

    Returns
    -------
    :class:`h5py.Group`
        The newly (re-)created ``"logger"`` group.

    """
    # Remember the old logger's configuration before it is destroyed
    names = group['version_names'][:]
    step = group.attrs['n_step']
    recycle = group.attrs['clear_when_full']
    parent = group.parent

    # Delete the exhausted group and rebuild an empty one with identical settings
    del group.file[group.name]
    return create_hdf5_log(parent, step, recycle, names, version_values)
[docs]def log_to_dataframe(group: h5py.Group) -> pd.DataFrame:
"""Export the log embedded within **file** to a Pandas DataFrame.
Examples
--------
.. testsetup:: python
>>> from dataCAT.testing_utils import HDF5_READ as hdf5_file
.. code:: python
>>> import h5py
>>> from dataCAT import log_to_dataframe
>>> hdf5_file = str(...) # doctest: +SKIP
>>> with h5py.File(hdf5_file, 'r') as f:
... group = f['ligand/logger']
... df = log_to_dataframe(group)
... print(df) # doctest: +NORMALIZE_WHITESPACE
CAT ... Data-CAT message index
major minor micro ... micro
date ...
2020-06-24 15:28:09.861074 0 9 6 ... 1 update [0]
2020-06-24 15:56:18.971201 0 9 6 ... 1 append [1, 2, 3, 4, 5, 6]
<BLANKLINE>
[2 rows x 11 columns]
Parameters
----------
group : :class:`h5py.Group`
The ``logger`` Group.
Returns
-------
:class:`pandas.DataFrame`
A DataFrame containing the content of :code:`file["logger"]`.
""" # noqa: E501
n = group.attrs['n']
# Prepare the columns
_columns = group['version_names'][:].astype(str)
columns = pd.MultiIndex.from_product([_columns, group['version'].dtype.names])
# In case the datasets are empty
if not n:
index = pd.Index([], dtype='datetime64[ns]', name='date')
df = pd.DataFrame(columns=columns, index=index, dtype='int8')
df[('message', '')] = np.array([], dtype=str)
df[('index', '')] = np.array([], dtype=object)
return df
# Prepare the index
date = group['date'][:n]
_index = np.fromiter((datetime(*i) for i in date), count=len(date), dtype='datetime64[us]')
index = pd.Index(_index, dtype='datetime64[ns]', name='date')
# Construct and return the DataFrame
data = group['version'][:n].view('int8')
df = pd.DataFrame(data, index=index, columns=columns)
df[('message', '')] = group['message'][:n].astype(str)
df[('index', '')] = group['index'][:n]
return df