Source code for CAT.dye.addlig

"""A module for multiple compound attachment and export of the .xyz files.

Index
-----
.. currentmodule:: CAT.dye.addlig
.. autosummary::
    add_ligands
    export_dyes
    sa_scores

API
---
.. autofunction:: add_ligands
.. autofunction:: export_dyes
.. autofunction:: sa_scores

"""

import os
import gzip
import math
import pickle
from typing import Iterator, Iterable, Collection, Optional, Dict
from os.path import join, exists
from itertools import chain

import numpy as np
from scm.plams import read_molecules, Molecule
from scm.plams.interfaces.molecule.rdkit import to_rdmol
from rdkit.Chem import rdMolDescriptors, FindMolChiralCenters, Mol

from nanoutils import PathType
from CAT.logger import logger
from CAT.attachment.dye import label_lig, label_core, substitution
from CAT.attachment.substitution_symmetry import del_equiv_structures

__all__ = ['add_ligands', 'export_dyes', 'sa_scores']

# flake8: noqa: N806


[docs]def add_ligands(core_dir: str,
                ligand_dir: str,
                min_dist: float = 1.2,
                n: int = 1,
                symmetry: Collection[str] = ()) -> Iterator[Molecule]:
    """Add ligand(s) to one core.

    Parameters
    ----------
    core_dir: str
        Name of directory where core coordinates are located
    ligand_dir: str
        Name of directory where ligands coordinates are located
    min_dist: float
        Criterion for the minimal interatomic distances
    n: int
        Number of substitutions
    symmetry: tuple[str]
        Keywords for substitution symmetry for deleting equivalent molecules

    Returns
    -------
    Iterator[Molecule]
        New structures that are containg core and lingad fragments

    """
    # Validate the symmetry
    valid_sym = {'linear', 'triangle', 'D2h'}
    if not valid_sym.issuperset(symmetry):
        raise ValueError(f"Invalid 'symmetry' value: {symmetry!r}")

    input_ligands = list(read_molecules(ligand_dir).values())
    input_cores = list(read_molecules(core_dir).values())

    # Reading the comment section and labeling substitution atom and numbering the ligands
    label_lig(input_ligands)

    # Reading the comment section and labeling substitution atoms
    for core in input_cores:
        core.guess_bonds()
        label_core(core)

    # Generate structures by combining ligands and cores
    mono = substitution(input_ligands, input_cores, min_dist)
    new_mols = []
    new_mols.append(mono)
    if n > 1:
        for i in range(1, n):
            poli = substitution(input_ligands, mono, min_dist)
            new_mols.append(poli)
            mono = poli

    if not symmetry:
        pass
    elif 'linear' in symmetry:
        new_mols[1] = del_equiv_structures(new_mols[1], 'linear')
    elif 'triangle' in symmetry:
        new_mols[2] = del_equiv_structures(new_mols[2], 'triangle')
    elif 'D2h' in symmetry:
        new_mols[3] = del_equiv_structures(new_mols[3], 'D2h')

    # Combine and flatten all new molecules into a generator
    return chain.from_iterable(new_mols)


[docs]def export_dyes(mol_list: Iterable[Molecule],
                new_dir: str = 'new_molecules',
                err_dir: str = 'err_molecules',
                min_dist: float = 1.2) -> None:
    """Exports molecular coordinates to .xyz files.

    Parameters
    ----------
    mol_list: Iterable[Molecule]
        List of molecules
    new_dir: str
        Name of the durectory to place new structures
    err_dir: str
        Name of the directory for molecules that do not fullfill min_dist criterion
    min_dist: float
        Criterion for the minimal interatomic distances

    """
    if not exists(new_dir):
        os.makedirs(new_dir)
    if not exists(err_dir):
        os.makedirs(err_dir)

    # Export molecules to folders depending on the minimum core/ligand distance
    for mol in mol_list:
        name = mol.properties.name
        mol_distance = mol.properties.min_distance

        if mol_distance > min_dist:
            mol.write(join(new_dir, f'{name}.xyz'))
        else:
            mol.write(join(err_dir, f'err_{name}.xyz'))
            logger.warning(
                f"{name}: \n Minimal distance {mol_distance} A is smaller than {min_dist} A"
            )

# The functions '_compute_SAS' and 'SA_scores' as well as data set 'SA_score.pkl.gz'
# are copied and adapted from:
###########################################################################
#    Title: MolGAN: An implicit generative model for small molecular graphs
#    Authors: De Cao, Nicola and Kipf, Thomas
#    Date: 2018
#    Availability: https://github.com/nicola-decao/MolGAN
###########################################################################


def _compute_sas(mol: Mol, sa_model: Dict[int, float]) -> float:
    fp = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    # for bitId, v in fps.items():
    for bitId, v in fps.items():
        nf += v
        sfp = bitId
        score1 += sa_model.get(sfp, -4) * v
    score1 /= nf

    # features score
    nAtoms = mol.GetNumAtoms()
    nChiralCenters = len(FindMolChiralCenters(
        mol, includeUnassigned=True))
    ri = mol.GetRingInfo()
    nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    nBridgeheads = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    nMacrocycles = 0
    for x in ri.AtomRings():
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms ** 1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.

    # ---------------------------------------
    # This differs from the paper, which defines:
    # macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthetise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    min = -4.0
    max = 2.5
    sascore = 11. - (sascore - min + 1) / (max - min) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore


def _load_sa_model(filename: PathType) -> Dict[int, float]:
    sa_score = pickle.load(gzip.open(filename))
    return {j: i0 for i0, *i in sa_score for j in i}


[docs]def sa_scores(mols: Iterable[Molecule], filename: Optional[PathType] = None) -> np.ndarray:
    """Calculate the synthetic accessibility score for all molecules in **mols**."""
    sa_model = _load_sa_model(filename) if filename is not None else {}
    rdmols = (to_rdmol(mol) for mol in mols)

    try:
        count = len(mols)  # type: ignore
    except TypeError:
        count = -1
    iterator = (_compute_sas(mol, sa_model) for mol in rdmols)
    return np.fromiter(iterator, dtype=float, count=count)