xtbf

  1from collections import defaultdict
  2from contextlib import redirect_stderr, redirect_stdout
  3import copy
  4import io
  5import multiprocessing
  6import uuid
  7from joblib import Parallel, delayed, Memory
  8import tqdm
  9import random
 10import re
 11import numpy as np
 12import pandas as pd
 13import sqlite3
 14import shutil
 15import os
 16from pathlib import Path
 17
 18
 19# TODO: copy all functionality to remove dependency to smal package
 20# from smal.io import obj_load, obj_store
 21# from smal.mol_edit import remove_atom_at_index, mol_to_xyzstring
 22from rdkit import Chem
 23from rdkit.Chem import AllChem, rdDistGeom
 24
 25import tempfile
 26
 27# Path to the xtb binary
 28
 29BIN_XTB = os.getenv("XTBF__BIN_XTB") or "xtb"
 30
 31import subprocess
 32import time
 33import psutil
 34
 35class Silencer:
 36    """
 37    A useful tool for silencing stdout and stderr.
 38    Usage:
 39    >>> with Silencer() as s:
 40    ...         print("kasldjf")
 41
 42    >>> print("I catched:",s.out.getvalue())
 43    I catched: kasldjf
 44    <BLANKLINE>
 45
 46    Note that nothing was printed and that we can later
 47    access the stdout via the out field. Similarly,
 48    stderr will be redirected to the err field.
 49    """
 50
 51    def __init__(self):
 52        self.out = io.StringIO()
 53        self.err = io.StringIO()
 54
 55    def __enter__(self):
 56        self.rs = redirect_stdout(self.out)
 57        self.re = redirect_stderr(self.err)
 58        self.rs.__enter__()
 59        self.re.__enter__()
 60        return self
 61
 62    def __exit__(self, exctype, excinst, exctb):
 63        self.rs.__exit__(exctype, excinst, exctb)
 64        self.re.__exit__(exctype, excinst, exctb)
 65
 66
 67
 68try:
 69    with Silencer() as s:
 70        xtb_version = subprocess.check_output(
 71            [f"{BIN_XTB}", "--version"],
 72            stderr=subprocess.DEVNULL,
 73        )
 74    if not isinstance(xtb_version, str):
 75        xtb_version = xtb_version.decode("utf-8")
 76    xtb_version = re.findall(r"(version \d+.\d+.\d+)", xtb_version)
 77except:
 78    print("could not determine xtb version.")
 79    print(
 80        "most likely no xtb binary is installed. See: https://xtb-docs.readthedocs.io/en/latest/setup.html"
 81    )
 82    raise
 83
 84if xtb_version:
 85    xtb_version = xtb_version[0]
 86    if xtb_version >= "6.5.1":
 87        print("xtb version:", xtb_version)
 88    else:
 89        assert (
 90            f"detected outdated xtb version: '{xtb_version}'. Please install version >= 6.5.1."
 91            "see https://xtb-docs.readthedocs.io/en/latest/setup.html"
 92        )
 93else:
 94    print("could not determine xtb version.")
 95    print(
 96        "most likely no xtb binary is installed. See: https://xtb-docs.readthedocs.io/en/latest/setup.html"
 97    )
 98    exit(1)
 99
100
101# Try to prioritize memory mapped file system
102# to improve speed and reduce strain,
103# fallback to potentially memory mapped, or
104# non-mem mapped file system otherwise...
105TMP_ROOT = Path("/dev/shm/")
106if not TMP_ROOT.exists():
107    print("Warning: could not find /dev/shm/ mem-mapped io not possible")
108    TMP_ROOT = Path("/tmp")
109if not TMP_ROOT.exists():
110    TMP_ROOT = tempfile.gettempdir()
111
112XTB_TMP_DIR = TMP_ROOT / Path("xtbf")
113
114XTB_OUTER_JOBS = min(32,multiprocessing.cpu_count()-1)
115XTB_INNER_JOBS = 1
116XTB_TIMEOUT_SECONDS = 1200 # 20 minutes
117
118
119
120def temp_dir():
121    """
122    Creates a path to a temporary directory
123    """
124    if Path("/tmp").exists():
125        tmp_dir = Path("/tmp") / "smal"
126    else:
127        tmp_dir = Path(tempfile.gettempdir()) / "smal"
128    tmp_dir.mkdir(
129        exist_ok=True,
130    )
131    return tmp_dir
132
133
134def random_fle(
135    suf: str,
136):
137    """
138    Creates a random file with the given suffix suf
139    """
140    fle_name = f"{uuid.uuid4().hex}.{suf}"
141    return temp_dir() / fle_name
142
143
144def mol_to_xyzstring(
145    mol: "Chem.Mol",
146    conf_id=0,
147) -> tuple[Chem.Mol, str]:
148    """
149    Embeds the given molecule in 3D and returns the
150    resulting molecule with 3D coordinates and the
151    corresponding xyz string.
152
153    >>> from smal.all import *
154    >>> mol = embed_molecule(from_smi('CCCO'))
155    >>> mol, xyz = mol_to_xyzstring(mol)
156    >>> print(xyz) # doctest: +SKIP
157    12
158    <BLANKLINE>
159    C     -1.285327   -0.056776    0.434662
160    C     -0.175447    0.695786   -0.299881
161    C      0.918409   -0.342619   -0.555572
162    O      1.309356   -0.801512    0.717050
163    H     -1.923389    0.626223    0.994971
164    H     -0.850209   -0.851665    1.084577
165    H     -1.831209   -0.618285   -0.370858
166    H     -0.509218    1.121896   -1.245390
167    H      0.216108    1.472826    0.409811
168    H      0.439516   -1.203877   -1.086785
169    H      1.744456    0.056257   -1.140213
170    H      1.946955   -0.098256    1.057627
171    <BLANKLINE>
172    """
173    rand_fle = random_fle("xyz")
174    Chem.MolToXYZFile(mol, str(rand_fle), confId=conf_id)
175    if rand_fle.exists():
176        xyz_string = rand_fle.read_text()
177        rand_fle.unlink()
178        return mol, xyz_string
179    else:
180        return None, None
181        
182
183
184def run_parallel(
185    n_jobs:int,
186    fun:"callable", elts:list,
187    do_tqdm:bool,
188) -> None:
189    if n_jobs > 1:
190        rslt = Parallel(n_jobs=n_jobs)(
191            delayed(fun)(elt)
192            for elt in (tqdm.tqdm(elts) if do_tqdm else elts)
193        )
194    else:
195        rslt = [fun(elt) for elt in (tqdm.tqdm(elts) if do_tqdm else elts)] 
196    return rslt
197
198def embed_molecule(mol: "Chem.Mol", enforceChirality:int, maxAttempts=100) -> "Chem.Mol":
199    mol = Chem.AddHs(mol)
200    ps = rdDistGeom.ETKDGv3()
201    ps.enforceChirality = bool(enforceChirality)
202    AllChem.EmbedMolecule(
203        mol,
204        enforceChirality=enforceChirality,
205        maxAttempts=maxAttempts,
206    )
207    return mol
208
209def embed_multi(
210    mol: "Chem.Mol",
211    n_confs: int,
212) -> "Chem.Mol":
213    mol = Chem.AddHs(mol)
214    ps = rdDistGeom.ETKDGv3()
215    cids = rdDistGeom.EmbedMultipleConfs(mol, n_confs, ps)
216
217    return cids
218
219
220def run_xtb(xtb_command:str, mol:Chem.Mol, multiplicity:int, conf_id:int=0,):
221    """
222
223    >>> from smal.all import *
224    >>> mol = from_smi("CCCOCCC")
225    >>> mol = embed_molecule(mol)
226    >>> success,rslt = run_xtb("--opt", mol, 1)
227    >>> success
228    True
229
230
231    """
232    mol, xyz_string = mol_to_xyzstring(mol,conf_id=conf_id)
233    return run_xtb_xyz(xtb_command=xtb_command,xyz_string=xyz_string, multiplicity=multiplicity, )
234
235def run_xtb_xyz(xtb_command:str, xyz_string:str, multiplicity:int,):
236    """
237    Runs the given xtb job as identified by
238    the following components:
239     - The command <xtb>, 
240     - the xyz string of the molecule <xyz_string>
241
242    """
243    assert XTB_TMP_DIR.parent.exists()
244    XTB_TMP_DIR.mkdir(exist_ok=True)
245    assert XTB_TMP_DIR.exists()
246    job_dir = XTB_TMP_DIR / f"job_{random.randint(1,10**9)}"
247    assert not job_dir.exists(), "internal logic error"
248    cwd_before = os.getcwd()
249
250    if multiplicity != 1:
251        uhf_arg = f"--uhf {multiplicity} "
252    else:
253        uhf_arg = ""
254    try:
255        os.mkdir(job_dir)
256        os.chdir(job_dir)
257        (job_dir / "input.xyz").write_text(xyz_string)
258        output_fle = job_dir / "output.out"
259        assert not output_fle.exists()
260        with Silencer() as s:
261            # alpb_str = ""
262            cmd = f"{BIN_XTB} input.xyz --parallel {XTB_INNER_JOBS} {xtb_command} {uhf_arg}> output.out 2> err.out"
263
264            # normal approach:
265            # subprocess.check_output(cmd,shell=True,timeout=XTB_TIMEOUT_SECONDS)
266            # ^
267            # \__ sadly, this approach does not work because the timeout isnt applied
268            #
269            # As suggested in https://stackoverflow.com/questions/48763362/python-subprocess-kill-with-timeout
270            # we apply the following workaround:
271            parent = subprocess.Popen(cmd, shell=True)
272            for _ in range(XTB_TIMEOUT_SECONDS):
273                if parent.poll() is not None:  # process just ended
274                    break
275                time.sleep(1)
276            else:
277                # the for loop ended without break: timeout
278                parent = psutil.Process(parent.pid)
279                for child in parent.children(
280                    recursive=True
281                ):  # or parent.children() for recursive=False
282                    child.kill()
283                parent.kill()
284
285        if output_fle.exists():
286            rslt = (True, output_fle.read_text())
287            return rslt
288        else:
289            return False, ""
290    finally:
291        os.chdir(cwd_before)
292        # Garbage Collection
293        if job_dir.exists():
294            shutil.rmtree(job_dir)
BIN_XTB = 'xtb'
class Silencer:
36class Silencer:
37    """
38    A useful tool for silencing stdout and stderr.
39    Usage:
40    >>> with Silencer() as s:
41    ...         print("kasldjf")
42
43    >>> print("I catched:",s.out.getvalue())
44    I catched: kasldjf
45    <BLANKLINE>
46
47    Note that nothing was printed and that we can later
48    access the stdout via the out field. Similarly,
49    stderr will be redirected to the err field.
50    """
51
52    def __init__(self):
53        self.out = io.StringIO()
54        self.err = io.StringIO()
55
56    def __enter__(self):
57        self.rs = redirect_stdout(self.out)
58        self.re = redirect_stderr(self.err)
59        self.rs.__enter__()
60        self.re.__enter__()
61        return self
62
63    def __exit__(self, exctype, excinst, exctb):
64        self.rs.__exit__(exctype, excinst, exctb)
65        self.re.__exit__(exctype, excinst, exctb)

A useful tool for silencing stdout and stderr. Usage:

>>> with Silencer() as s:
...         print("kasldjf")
>>> print("I catched:",s.out.getvalue())
I catched: kasldjf
<BLANKLINE>

Note that nothing was printed and that we can later access the stdout via the out field. Similarly, stderr will be redirected to the err field.

out
err
TMP_ROOT = PosixPath('/tmp')
XTB_TMP_DIR = PosixPath('/tmp/xtbf')
XTB_OUTER_JOBS = 7
XTB_INNER_JOBS = 1
XTB_TIMEOUT_SECONDS = 1200
def temp_dir():
121def temp_dir():
122    """
123    Creates a path to a temporary directory
124    """
125    if Path("/tmp").exists():
126        tmp_dir = Path("/tmp") / "smal"
127    else:
128        tmp_dir = Path(tempfile.gettempdir()) / "smal"
129    tmp_dir.mkdir(
130        exist_ok=True,
131    )
132    return tmp_dir

Creates a path to a temporary directory

def random_fle(suf: str):
135def random_fle(
136    suf: str,
137):
138    """
139    Creates a random file with the given suffix suf
140    """
141    fle_name = f"{uuid.uuid4().hex}.{suf}"
142    return temp_dir() / fle_name

Creates a random file with the given suffix suf

def mol_to_xyzstring( mol: rdkit.Chem.rdchem.Mol, conf_id=0) -> tuple[rdkit.Chem.rdchem.Mol, str]:
145def mol_to_xyzstring(
146    mol: "Chem.Mol",
147    conf_id=0,
148) -> tuple[Chem.Mol, str]:
149    """
150    Embeds the given molecule in 3D and returns the
151    resulting molecule with 3D coordinates and the
152    corresponding xyz string.
153
154    >>> from smal.all import *
155    >>> mol = embed_molecule(from_smi('CCCO'))
156    >>> mol, xyz = mol_to_xyzstring(mol)
157    >>> print(xyz) # doctest: +SKIP
158    12
159    <BLANKLINE>
160    C     -1.285327   -0.056776    0.434662
161    C     -0.175447    0.695786   -0.299881
162    C      0.918409   -0.342619   -0.555572
163    O      1.309356   -0.801512    0.717050
164    H     -1.923389    0.626223    0.994971
165    H     -0.850209   -0.851665    1.084577
166    H     -1.831209   -0.618285   -0.370858
167    H     -0.509218    1.121896   -1.245390
168    H      0.216108    1.472826    0.409811
169    H      0.439516   -1.203877   -1.086785
170    H      1.744456    0.056257   -1.140213
171    H      1.946955   -0.098256    1.057627
172    <BLANKLINE>
173    """
174    rand_fle = random_fle("xyz")
175    Chem.MolToXYZFile(mol, str(rand_fle), confId=conf_id)
176    if rand_fle.exists():
177        xyz_string = rand_fle.read_text()
178        rand_fle.unlink()
179        return mol, xyz_string
180    else:
181        return None, None

Embeds the given molecule in 3D and returns the resulting molecule with 3D coordinates and the corresponding xyz string.

>>> from smal.all import *
>>> mol = embed_molecule(from_smi('CCCO'))
>>> mol, xyz = mol_to_xyzstring(mol)
>>> print(xyz) # doctest: +SKIP
12
<BLANKLINE>
C     -1.285327   -0.056776    0.434662
C     -0.175447    0.695786   -0.299881
C      0.918409   -0.342619   -0.555572
O      1.309356   -0.801512    0.717050
H     -1.923389    0.626223    0.994971
H     -0.850209   -0.851665    1.084577
H     -1.831209   -0.618285   -0.370858
H     -0.509218    1.121896   -1.245390
H      0.216108    1.472826    0.409811
H      0.439516   -1.203877   -1.086785
H      1.744456    0.056257   -1.140213
H      1.946955   -0.098256    1.057627
<BLANKLINE>
def run_parallel( n_jobs: int, fun: <built-in function callable>, elts: list, do_tqdm: bool) -> None:
185def run_parallel(
186    n_jobs:int,
187    fun:"callable", elts:list,
188    do_tqdm:bool,
189) -> None:
190    if n_jobs > 1:
191        rslt = Parallel(n_jobs=n_jobs)(
192            delayed(fun)(elt)
193            for elt in (tqdm.tqdm(elts) if do_tqdm else elts)
194        )
195    else:
196        rslt = [fun(elt) for elt in (tqdm.tqdm(elts) if do_tqdm else elts)] 
197    return rslt
def embed_molecule( mol: rdkit.Chem.rdchem.Mol, enforceChirality: int, maxAttempts=100) -> rdkit.Chem.rdchem.Mol:
199def embed_molecule(mol: "Chem.Mol", enforceChirality:int, maxAttempts=100) -> "Chem.Mol":
200    mol = Chem.AddHs(mol)
201    ps = rdDistGeom.ETKDGv3()
202    ps.enforceChirality = bool(enforceChirality)
203    AllChem.EmbedMolecule(
204        mol,
205        enforceChirality=enforceChirality,
206        maxAttempts=maxAttempts,
207    )
208    return mol
def embed_multi(mol: rdkit.Chem.rdchem.Mol, n_confs: int) -> rdkit.Chem.rdchem.Mol:
210def embed_multi(
211    mol: "Chem.Mol",
212    n_confs: int,
213) -> "Chem.Mol":
214    mol = Chem.AddHs(mol)
215    ps = rdDistGeom.ETKDGv3()
216    cids = rdDistGeom.EmbedMultipleConfs(mol, n_confs, ps)
217
218    return cids
def run_xtb( xtb_command: str, mol: rdkit.Chem.rdchem.Mol, multiplicity: int, conf_id: int = 0):
221def run_xtb(xtb_command:str, mol:Chem.Mol, multiplicity:int, conf_id:int=0,):
222    """
223
224    >>> from smal.all import *
225    >>> mol = from_smi("CCCOCCC")
226    >>> mol = embed_molecule(mol)
227    >>> success,rslt = run_xtb("--opt", mol, 1)
228    >>> success
229    True
230
231
232    """
233    mol, xyz_string = mol_to_xyzstring(mol,conf_id=conf_id)
234    return run_xtb_xyz(xtb_command=xtb_command,xyz_string=xyz_string, multiplicity=multiplicity, )
>>> from smal.all import *
>>> mol = from_smi("CCCOCCC")
>>> mol = embed_molecule(mol)
>>> success,rslt = run_xtb("--opt", mol, 1)
>>> success
True
def run_xtb_xyz(xtb_command: str, xyz_string: str, multiplicity: int):
236def run_xtb_xyz(xtb_command:str, xyz_string:str, multiplicity:int,):
237    """
238    Runs the given xtb job as identified by
239    the following components:
240     - The command <xtb>, 
241     - the xyz string of the molecule <xyz_string>
242
243    """
244    assert XTB_TMP_DIR.parent.exists()
245    XTB_TMP_DIR.mkdir(exist_ok=True)
246    assert XTB_TMP_DIR.exists()
247    job_dir = XTB_TMP_DIR / f"job_{random.randint(1,10**9)}"
248    assert not job_dir.exists(), "internal logic error"
249    cwd_before = os.getcwd()
250
251    if multiplicity != 1:
252        uhf_arg = f"--uhf {multiplicity} "
253    else:
254        uhf_arg = ""
255    try:
256        os.mkdir(job_dir)
257        os.chdir(job_dir)
258        (job_dir / "input.xyz").write_text(xyz_string)
259        output_fle = job_dir / "output.out"
260        assert not output_fle.exists()
261        with Silencer() as s:
262            # alpb_str = ""
263            cmd = f"{BIN_XTB} input.xyz --parallel {XTB_INNER_JOBS} {xtb_command} {uhf_arg}> output.out 2> err.out"
264
265            # normal approach:
266            # subprocess.check_output(cmd,shell=True,timeout=XTB_TIMEOUT_SECONDS)
267            # ^
268            # \__ sadly, this approach does not work because the timeout isnt applied
269            #
270            # As suggested in https://stackoverflow.com/questions/48763362/python-subprocess-kill-with-timeout
271            # we apply the following workaround:
272            parent = subprocess.Popen(cmd, shell=True)
273            for _ in range(XTB_TIMEOUT_SECONDS):
274                if parent.poll() is not None:  # process just ended
275                    break
276                time.sleep(1)
277            else:
278                # the for loop ended without break: timeout
279                parent = psutil.Process(parent.pid)
280                for child in parent.children(
281                    recursive=True
282                ):  # or parent.children() for recursive=False
283                    child.kill()
284                parent.kill()
285
286        if output_fle.exists():
287            rslt = (True, output_fle.read_text())
288            return rslt
289        else:
290            return False, ""
291    finally:
292        os.chdir(cwd_before)
293        # Garbage Collection
294        if job_dir.exists():
295            shutil.rmtree(job_dir)

Runs the given xtb job as identified by the following components:

  • The command ,
  • the xyz string of the molecule
s = <xtbf.Silencer object>