Source code for CAT.dye.addlig

"""A module for multiple compound attachment and export of the .xyz files.

Index
-----
.. currentmodule:: CAT.dye.addlig
.. autosummary::
    add_ligands
    export_dyes
    sa_scores

API
---
.. autofunction:: add_ligands
.. autofunction:: export_dyes
.. autofunction:: sa_scores

"""

import os
import gzip
import math
import pickle
from typing import Iterator, Iterable, Collection, Optional, Dict
from os.path import join, exists
from itertools import chain

import numpy as np
from scm.plams import read_molecules, Molecule
from scm.plams.interfaces.molecule.rdkit import to_rdmol
from rdkit.Chem import rdMolDescriptors, FindMolChiralCenters, Mol

from nanoutils import PathType
from CAT.logger import logger
from CAT.attachment.dye import label_lig, label_core, substitution
from CAT.attachment.substitution_symmetry import del_equiv_structures

__all__ = ['add_ligands', 'export_dyes', 'sa_scores']

# flake8: noqa: N806


[docs]def add_ligands(core_dir: str, ligand_dir: str, min_dist: float = 1.2, n: int = 1, symmetry: Collection[str] = ()) -> Iterator[Molecule]: """Add ligand(s) to one core. Parameters ---------- core_dir: str Name of directory where core coordinates are located ligand_dir: str Name of directory where ligands coordinates are located min_dist: float Criterion for the minimal interatomic distances n: int Number of substitutions symmetry: tuple[str] Keywords for substitution symmetry for deleting equivalent molecules Returns ------- Iterator[Molecule] New structures that are containg core and lingad fragments """ # Validate the symmetry valid_sym = {'linear', 'triangle', 'D2h'} if not valid_sym.issuperset(symmetry): raise ValueError(f"Invalid 'symmetry' value: {symmetry!r}") input_ligands = list(read_molecules(ligand_dir).values()) input_cores = list(read_molecules(core_dir).values()) # Reading the comment section and labeling substitution atom and numbering the ligands label_lig(input_ligands) # Reading the comment section and labeling substitution atoms for core in input_cores: core.guess_bonds() label_core(core) # Generate structures by combining ligands and cores mono = substitution(input_ligands, input_cores, min_dist) new_mols = [] new_mols.append(mono) if n > 1: for i in range(1, n): poli = substitution(input_ligands, mono, min_dist) new_mols.append(poli) mono = poli if not symmetry: pass elif 'linear' in symmetry: new_mols[1] = del_equiv_structures(new_mols[1], 'linear') elif 'triangle' in symmetry: new_mols[2] = del_equiv_structures(new_mols[2], 'triangle') elif 'D2h' in symmetry: new_mols[3] = del_equiv_structures(new_mols[3], 'D2h') # Combine and flatten all new molecules into a generator return chain.from_iterable(new_mols)
[docs]def export_dyes(mol_list: Iterable[Molecule], new_dir: str = 'new_molecules', err_dir: str = 'err_molecules', min_dist: float = 1.2) -> None: """Exports molecular coordinates to .xyz files. Parameters ---------- mol_list: Iterable[Molecule] List of molecules new_dir: str Name of the durectory to place new structures err_dir: str Name of the directory for molecules that do not fullfill min_dist criterion min_dist: float Criterion for the minimal interatomic distances """ if not exists(new_dir): os.makedirs(new_dir) if not exists(err_dir): os.makedirs(err_dir) # Export molecules to folders depending on the minimum core/ligand distance for mol in mol_list: name = mol.properties.name mol_distance = mol.properties.min_distance if mol_distance > min_dist: mol.write(join(new_dir, f'{name}.xyz')) else: mol.write(join(err_dir, f'err_{name}.xyz')) logger.warning( f"{name}: \n Minimal distance {mol_distance} A is smaller than {min_dist} A" )
# The functions '_compute_SAS' and 'SA_scores' as well as data set 'SA_score.pkl.gz' # are copied and adapted from: ########################################################################### # Title: MolGAN: An implicit generative model for small molecular graphs # Authors: De Cao, Nicola and Kipf, Thomas # Date: 2018 # Availability: https://github.com/nicola-decao/MolGAN ########################################################################### def _compute_sas(mol: Mol, sa_model: Dict[int, float]) -> float: fp = rdMolDescriptors.GetMorganFingerprint(mol, 2) fps = fp.GetNonzeroElements() score1 = 0. nf = 0 # for bitId, v in fps.items(): for bitId, v in fps.items(): nf += v sfp = bitId score1 += sa_model.get(sfp, -4) * v score1 /= nf # features score nAtoms = mol.GetNumAtoms() nChiralCenters = len(FindMolChiralCenters( mol, includeUnassigned=True)) ri = mol.GetRingInfo() nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol) nBridgeheads = rdMolDescriptors.CalcNumBridgeheadAtoms(mol) nMacrocycles = 0 for x in ri.AtomRings(): if len(x) > 8: nMacrocycles += 1 sizePenalty = nAtoms ** 1.005 - nAtoms stereoPenalty = math.log10(nChiralCenters + 1) spiroPenalty = math.log10(nSpiro + 1) bridgePenalty = math.log10(nBridgeheads + 1) macrocyclePenalty = 0. # --------------------------------------- # This differs from the paper, which defines: # macrocyclePenalty = math.log10(nMacrocycles+1) # This form generates better results when 2 or more macrocycles are present if nMacrocycles > 0: macrocyclePenalty = math.log10(2) score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty # correction for the fingerprint density # not in the original publication, added in version 1.1 # to make highly symmetrical molecules easier to synthetise score3 = 0. if nAtoms > len(fps): score3 = math.log(float(nAtoms) / len(fps)) * .5 sascore = score1 + score2 + score3 # need to transform "raw" value into scale between 1 and 10 min = -4.0 max = 2.5 sascore = 11. - (sascore - min + 1) / (max - min) * 9. # smooth the 10-end if sascore > 8.: sascore = 8. + math.log(sascore + 1. - 9.) if sascore > 10.: sascore = 10.0 elif sascore < 1.: sascore = 1.0 return sascore def _load_sa_model(filename: PathType) -> Dict[int, float]: sa_score = pickle.load(gzip.open(filename)) return {j: i0 for i0, *i in sa_score for j in i}
[docs]def sa_scores(mols: Iterable[Molecule], filename: Optional[PathType] = None) -> np.ndarray: """Calculate the synthetic accessibility score for all molecules in **mols**.""" sa_model = _load_sa_model(filename) if filename is not None else {} rdmols = (to_rdmol(mol) for mol in mols) try: count = len(mols) # type: ignore except TypeError: count = -1 iterator = (_compute_sas(mol, sa_model) for mol in rdmols) return np.fromiter(iterator, dtype=float, count=count)