# Source code for npfc.filter

"""
Module filter
==============
This module contains the class Filter, which is used to filter molecules using
molecular descriptors.
"""

# data handling
import logging
import re
# chemoinformatics
from rdkit.Chem import Mol
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
# docs
from typing import List


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


def count_violations_lipinski(molecular_weight, slogp, num_hbd, num_hba):
    """Count how many of the Lipinski-style criteria are violated.

    Reference: Lipinski, J Pharmacol Toxicol Methods. 2000 Jul-Aug;44(1):235-49.

    One violation is counted for each of the following:

    - molecular_weight below 150 or above 500
    - slogp above 5
    - num_hbd above 5
    - num_hba above 10

    :param molecular_weight: the molecular weight
    :param slogp: the logP value
    :param num_hbd: the number of hydrogen bond donors
    :param num_hba: the number of hydrogen bond acceptors
    :return: the number of violations (0 to 4)
    """
    checks = (
        molecular_weight < 150 or molecular_weight > 500,
        slogp > 5,
        num_hbd > 5,
        num_hba > 10,
    )
    return sum(1 for violated in checks if violated)
def count_violations_veber(num_rotatable_bonds, tpsa):
    """Count how many of the Veber criteria are violated.

    Reference: Veber DF, Johnson SR, Cheng HY, Smith BR, Ward KW, Kopple KD
    (June 2002). "Molecular properties that influence the oral bioavailability
    of drug candidates". J. Med. Chem. 45 (12): 2615–23.

    One violation is counted for each of the following:

    - num_rotatable_bonds above 10
    - tpsa above 140

    :param num_rotatable_bonds: the number of rotatable bonds
    :param tpsa: the topological polar surface area
    :return: the number of violations (0 to 2)
    """
    checks = (
        num_rotatable_bonds > 10,
        tpsa > 140,
    )
    return sum(1 for violated in checks if violated)
def count_violations_lead_like(molecular_weight, slogp, num_rotatable_bonds):
    """Count how many of the lead-like criteria are violated.

    References: http://zinc.docking.org/browse/subsets/ and
    Teague, Davis, Leeson, Oprea, Angew Chem Int Ed Engl. 1999 Dec 16;38(24):3743-3748.

    One violation is counted for each of the following:

    - molecular_weight below 250 or above 350
    - slogp above 3.5
    - num_rotatable_bonds above 7

    :param molecular_weight: the molecular weight
    :param slogp: the logP value
    :param num_rotatable_bonds: the number of rotatable bonds
    :return: the number of violations (0 to 3)
    """
    checks = (
        molecular_weight < 250 or molecular_weight > 350,
        slogp > 3.5,
        num_rotatable_bonds > 7,
    )
    return sum(1 for violated in checks if violated)
def count_violations_ppi_like(molecular_weight, slogp, num_hba, num_rings):
    """Count how many of the PPI-like ("Rule of Four") criteria are violated.

    Reference: Hamon, V., Bourgeas, R., Ducrot, P., Theret, I., Xuereb, L.,
    Basse, M.J., Brunel, J.M., Combes, S., Morelli, X., Roche, P., 2013.
    2P2IHUNTER: a tool for filtering orthosteric protein–protein interaction
    modulators via a dedicated support vector machine.
    Journal of The Royal Society Interface 11. doi:10.1098/rsif.2013.0860

    PPI-like compounds are expected to be large and lipophilic, so every
    criterion is a lower bound. One violation is counted for each of:

    - molecular_weight below 400
    - slogp below 4
    - num_hba below 4
    - num_rings below 4

    :param molecular_weight: the molecular weight
    :param slogp: the logP value
    :param num_hba: the number of hydrogen bond acceptors
    :param num_rings: the number of rings
    :return: the number of violations (0 to 4)
    """
    n = 0
    # the molecular weight is a lower bound like the three other criteria:
    # a compound that is too small is the one violating the rule
    if molecular_weight < 400:
        n += 1
    if slogp < 4:
        n += 1
    if num_hba < 4:
        n += 1
    if num_rings < 4:
        n += 1
    return n
def count_violations_fragment_like(molecular_weight, slogp, num_hba, num_hbd):
    """Count how many of the fragment-like ("Rule of Three") criteria are violated.

    Reference: Congreve, M., Carr, R., Murray, C., Jhoti, H., 2003.
    A “Rule of Three” for fragment-based lead discovery?
    Drug Discovery Today 8, 876–877. doi:10.1016/S1359-6446(03)02831-9

    One violation is counted for each of the following:

    - molecular_weight equal to or above 300
    - slogp above 3
    - num_hba above 3
    - num_hbd above 3

    :param molecular_weight: the molecular weight
    :param slogp: the logP value
    :param num_hba: the number of hydrogen bond acceptors
    :param num_hbd: the number of hydrogen bond donors
    :return: the number of violations (0 to 4)
    """
    checks = (
        molecular_weight >= 300,
        slogp > 3,
        num_hba > 3,
        num_hbd > 3,
    )
    return sum(1 for violated in checks if violated)
def count_violations_fragment_like_ext(num_fragment_like_violations, tpsa, num_rotatable_bonds):
    """Count the violations of the extended fragment-like ("Rule of Three") criteria.

    Reference: Congreve, M., Carr, R., Murray, C., Jhoti, H., 2003.
    A “Rule of Three” for fragment-based lead discovery?
    Drug Discovery Today 8, 876–877. doi:10.1016/S1359-6446(03)02831-9

    The count starts from the number of violations of the basic fragment-like
    criteria and one more violation is added for each of the following:

    - tpsa above 60
    - num_rotatable_bonds above 3

    :param num_fragment_like_violations: the number of violations of the basic fragment-like criteria
    :param tpsa: the topological polar surface area
    :param num_rotatable_bonds: the number of rotatable bonds
    :return: the total number of violations
    """
    n = num_fragment_like_violations
    if tpsa > 60:
        n += 1
    # up to 3 rotatable bonds are allowed; testing the bare count for
    # truthiness would flag any molecule with at least one rotatable bond
    if num_rotatable_bonds > 3:
        n += 1
    return n
def get_min_max_ring_sizes(mol):
    """Return a tuple with the (minimum, maximum) ring sizes of the input molecule.

    Ring sizes are the lengths of the atom rings reported by the ring info of
    the molecule. In case the molecule has no ring, (0, 0) is returned.

    :param mol: the input molecule
    :return: a tuple (minimum ring size, maximum ring size)
    """
    sizes = [len(ring) for ring in mol.GetRingInfo().AtomRings()]
    # no ring at all: min/max would fail on an empty list
    if not sizes:
        return (0, 0)
    return (min(sizes), max(sizes))
# Mapping of descriptor names to callables taking a molecule and returning the
# descriptor value. The callables are only evaluated when a descriptor is requested.
DESCRIPTORS = {
    # classical molecular descriptors
    # NOTE(review): GetNumAtoms is called with its default arguments; presumably
    # hydrogens are implicit so this equals the heavy atom count -- confirm
    'num_heavy_atoms': lambda x: x.GetNumAtoms(),
    'molecular_weight': lambda x: round(Descriptors.ExactMolWt(x), 4),
    'num_rings': lambda x: rdMolDescriptors.CalcNumRings(x),
    'num_rings_arom': lambda x: rdMolDescriptors.CalcNumAromaticRings(x),
    'elements': lambda x: {a.GetSymbol() for a in x.GetAtoms()},
    'molecular_formula': lambda x: rdMolDescriptors.CalcMolFormula(x),
    'num_hbd': lambda x: rdMolDescriptors.CalcNumLipinskiHBD(x),
    'num_hba': lambda x: rdMolDescriptors.CalcNumLipinskiHBA(x),
    'slogp': lambda x: round(Crippen.MolLogP(x), 4),
    'tpsa': lambda x: round(rdMolDescriptors.CalcTPSA(x), 4),
    'num_rotatable_bonds': lambda x: rdMolDescriptors.CalcNumRotatableBonds(x),
    'num_atoms_oxygen': lambda x: sum(1 for a in x.GetAtoms() if a.GetAtomicNum() == 8),
    'num_atoms_nitrogen': lambda x: sum(1 for a in x.GetAtoms() if a.GetAtomicNum() == 7),
    # custom molecular descriptors
    # ring_sizes:
    # it would have been faster to access only once RingInfo for both min and max,
    # but this is tricky because I would have to start making exceptions in the way
    # the functions are accessed or more complicated downstream process.
    # Indeed, I do not think it is possible to set two dict keys at once
    # within a dict comprehension and it is not that bad for
    # performance to call it twice anyway.
    # default=0 so that molecules without any ring yield 0 instead of raising
    # a ValueError on an empty sequence
    'ring_size_min': lambda x: min((len(y) for y in x.GetRingInfo().AtomRings()), default=0),
    'ring_size_max': lambda x: max((len(y) for y in x.GetRingInfo().AtomRings()), default=0),
}


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CLASSES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
class Filter:
    """A class for filtering molecules based on molecular descriptors.

    A Filter stores a dictionary mapping descriptor names to callables. Each
    callable takes a molecule and returns the value of the descriptor.
    """

    # comparison operators supported in numeric expressions
    _COMPARATORS = {
        "<=": lambda left, right: left <= right,
        "<": lambda left, right: left < right,
        "==": lambda left, right: left == right,
        "!=": lambda left, right: left != right,
        ">=": lambda left, right: left >= right,
        ">": lambda left, right: left > right,
    }

    def __init__(self, descriptors: dict = DESCRIPTORS):
        """Create a Filter object.

        :param descriptors: a dictionary of descriptor names (keys) and callables
            computing them from a molecule (values). The dictionary is stored
            as is, not copied.
        """
        self.descriptors = descriptors

    def compute_descriptors(self, mol: Mol, descriptors: List = None) -> dict:
        """Compute descriptors.

        A subset of descriptors can be computed if a list of descriptor names is
        provided. To get an idea of what descriptors can be computed, the method
        get_possible_descriptors can be used.

        :param mol: the input molecule
        :param descriptors: the list of descriptors to compute. If none is provided, all possible descriptors are computed.
        :return: a dictionary with all descriptors
        :raises ValueError: if an empty list of descriptors is provided
        :raises KeyError: if a descriptor name is not known to this Filter
        """
        # if no descriptor is specified, compute them all
        if descriptors is None:
            descriptors = list(self.descriptors.keys())
        if len(descriptors) == 0:
            raise ValueError('Error! No descriptor is specified for computation!')
        return {name: self.descriptors[name](mol) for name in descriptors}

    def get_possible_descriptors(self) -> List:
        """Return a list of all descriptors that can be computed using this module.

        :return: the sorted list of descriptors that can be computed
        """
        return sorted(self.descriptors)

    def filter_mol(self, mol: Mol, expr: str) -> bool:
        """Filter a molecule based on an expression.

        Two types of expressions are currently supported:

        - inclusion/exclusion
            - 'elements not in C, N, O'
            - 'elements in C, N, O'
        - numeric
            - 'num_heavy_atoms > 3'
            - '100.0 < molecular_weight <= 1000.0'
            - 'num_rings != 0'
            - 'num_rings == 0'

        :param mol: the input molecule
        :param expr: the filter to apply
        :return: True if the molecule passes the filter, False otherwise
        """
        tokens = expr.lower().split()
        # filters of type: 'elements in C, N, O' ('in' or 'not in').
        # The expression is passed unchanged because the values on the right
        # side (i.e. element symbols) are case-sensitive.
        if 'in' in tokens:
            return self._eval_set_expr(mol, expr)
        # filters of type: 'num_heavy_atoms > 3'
        return self._eval_numeric_expr(mol, expr.lower())

    def _eval_numeric_expr(self, mol, expr):
        """Evaluate if all comparisons stored in a numeric expression are True.

        The expression is split into values and operators, for instance
        ['molecular_weight', '<=', '1000'] or
        ['0', '<=', 'molecular_weight', '<=', '1000']. No check is performed on
        the number of elements, so comparisons can be chained. Descriptor names
        are replaced by their value computed on the molecule and every value
        is converted to float before being compared.

        :param mol: the input molecule
        :param expr: the numeric expression to evaluate
        :return: True if all comparisons are True, False otherwise
        :raises ValueError: if the expression contains no comparison operator or if a value cannot be converted to float
        """
        mol = Mol(mol)
        # something like 'molecular_weight', '<=', '1000'
        tokens = self._split_expr(expr.replace(" ", ""))
        # replace descriptor names by their values, now it is 250.0, '<=', '1000'
        tokens = [self.descriptors[t](mol) if t in self.descriptors else t for t in tokens]
        logging.debug("Applying numeric filter: %s", ' '.join(str(v) for v in tokens))
        # operators are always at odd positions, whereas values are at even positions
        # and there is always a value on the left and on the right of an operator
        values = [float(v) for v in tokens[0::2]]
        operators = tokens[1::2]
        for left, op, right in zip(values, operators, values[1:]):
            if not self._COMPARATORS[op](left, right):
                return False
        return True

    def _eval_set_expr(self, mol, expr):
        """Helper function for filter_mol.

        Look for keywords ' not in ' and ' in ' in the expression and check the
        condition by defining left as the descriptor and right as the values,
        i.e.: descriptor in values ('elements in H, C, N, O').

        With ' in ', the result is True when the descriptor is a subset of the
        values. With ' not in ', the result is True when it is not a subset.

        :param mol: the input molecule
        :param expr: the inclusion/exclusion expression to evaluate
        :return: True if the condition is satisfied, False otherwise
        :raises ValueError: if neither ' not in ' nor ' in ' is found in the expression
        """
        # ' not in ' has to be tested first as it also contains ' in '
        for op in (' not in ', ' in '):
            if op in expr:
                break  # leave asap with op still set to the correct operator
        else:
            raise ValueError(f"expected ' not in ' or ' in ' in expr ({expr})")

        expr_split = [e.replace(" ", "") for e in expr.split(op)]
        descriptor = self.descriptors[expr_split[0]](mol)  # left
        values = set(expr_split[1].split(","))  # right
        logging.debug("Applying inclusion/exclusion filter: %s", ''.join(str(v) for v in [descriptor, op, values]))
        is_subset = descriptor.issubset(values)
        return is_subset if op == ' in ' else not is_subset

    def _split_expr(self, expr):
        """Helper function for _eval_numeric_expr.

        From a string containing an expression (i.e. 'molecular_weight < 1000'),
        return a list of values and operators (['molecular_weight', '<', '1000']).

        :param expr: the expression to split
        :return: the list of values (even positions) and operators (odd positions)
        :raises ValueError: if no comparison operator is found in the expression
        """
        opidx_eq = self._find_opidx("==", expr)
        opidx_diff = self._find_opidx("!=", expr)
        opidx_supeq = self._find_opidx(">=", expr)
        opidx_infeq = self._find_opidx("<=", expr)
        # filter sup and inf with supeq and infeq
        opidx_sup = self._filter_wrong_matches(opidx_supeq, self._find_opidx(">", expr))
        opidx_inf = self._filter_wrong_matches(opidx_infeq, self._find_opidx("<", expr))

        # sorted operators so we can iterate over the expr from left to right
        opidx_all = sorted(opidx_eq + opidx_diff + opidx_supeq + opidx_infeq + opidx_sup + opidx_inf,
                           key=lambda x: x[0])
        if not opidx_all:
            raise ValueError(f"expected at least one comparison operator in expr ({expr})")

        # alternate values and operators: each operator is preceded by the
        # text found between the end of the previous operator and its own start
        split_expr = []
        cursor = 0
        for start, end in opidx_all:
            split_expr.append(expr[cursor:start])
            split_expr.append(expr[start:end])
            cursor = end
        # whatever remains after the last operator is the last value
        split_expr.append(expr[cursor:])
        return split_expr

    def _find_opidx(self, op, expr):
        """Helper function for _split_expr.

        Return the (start, end) indices of all occurrences of a comparison
        operator (op) within an expr. The operator is used as a regular
        expression pattern.

        :param op: the comparison operator to look for
        :param expr: the expression to search
        :return: a list of tuples (start, end), one per occurrence
        """
        return [(m.start(0), m.end(0)) for m in re.finditer(op, expr)]

    def _filter_wrong_matches(self, opidx_larger, opidx_smaller):
        """Helper function for _split_expr.

        Filter out false positives of comparison operators. For instance, '<'
        beginning at the same position as '<=' should be discarded.

        :param opidx_larger: the (start, end) indices of the longer operator (i.e. '<=')
        :param opidx_smaller: the (start, end) indices of the shorter operator (i.e. '<')
        :return: the indices of the shorter operator that do not start where a longer one starts
        """
        larger_starts = {larger[0] for larger in opidx_larger}
        return [smaller for smaller in opidx_smaller if smaller[0] not in larger_starts]