Source code for npfc.save

"""
Module save
================

A module containing functions and classes used for storing DataFrames with
molecules on disk.
"""

# standard
import logging
import gzip
import shutil
import os
import numpy as np
import time
from pathlib import Path
from random import random
# data science
import pandas as pd
from pandas import DataFrame
from pandas import HDFStore
# chemoinformatics
from rdkit import Chem
from rdkit.Chem import SDWriter
# docs
from typing import List
# dev
from npfc import utils


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #


def file(df: pd.DataFrame, output_file: str, shuffle: bool = False, random_seed: int = None, chunk_size: int = None, encode: bool = True, col_mol: str = 'mol', col_id: str = 'idm', csv_sep: str = '|'):
    """A method for saving DataFrames with molecules to different file types.

    This is a handy way of using the Saver class without having to keep a Saver object.

    :param df: the input DataFrame
    :param output_file: the output file
    :param shuffle: randomize records
    :param random_seed: a number for reproducing the shuffling
    :param chunk_size: the maximum number of records per chunk. If this value is unset,
        no chunking is performed, otherwise each chunk filename gets appended with a
        suffix: file_XXX.ext.
    :param encode: encode RDKit Mol objects and other objects in predefined columns as
        base64 strings.
    :param col_mol: if molecules need to be encoded, then the encoding is performed on
        this column.
    :param col_id: the column with the molecule identifiers (used as molecule title for
        SDF outputs)
    :param csv_sep: separator to use in case of csv output
    :return: the list of output files with their number of records
    """
    # check some arguments
    utils.check_arg_output_file(output_file)
    utils.check_arg_bool(shuffle)
    utils.check_arg_bool(encode)
    logging.debug("Excerpt of the data as provided to file function:\n\n%s\n", df.head(5))
    # init
    path_output_file = Path(output_file)
    ext_output_file = path_output_file.suffixes
    output_dir = path_output_file.resolve().parent
    output_files = []
    # stem returns 'file.csv' for 'file.csv.gz', so split once more to get the bare name
    key = path_output_file.stem.split('.')[0]
    # for sdf, molecules cannot be encoded
    if ext_output_file[0] == '.sdf' and encode:
        logging.warning("Format is SDF, so column '%s' is not encoded.", col_mol)
    # work on a copy to avoid pandas SettingWithCopy warnings on the assignments below
    df = df.copy()
    # shuffle
    if shuffle:
        logging.debug('Shuffling rows before saving file')
        df = df.sample(frac=1, random_state=random_seed)
    # encode predefined data
    if len(df.index) == 0:
        # nothing to encode in an empty DataFrame
        logging.warning("DataFrame is empty, skip encoding.")
    elif encode:
        # for SDF files, the RDKit Mol objects used for MolBlocks must stay unencoded
        if ext_output_file[0] != '.sdf' and col_mol in df.columns:
            df[col_mol] = df[col_mol].map(utils.encode_mol)
        # other RDKit Mol columns can be encoded though
        for col in ("mol", "mol_frag", "mol_frag_1", "mol_frag_2", "mol_rdkit"):
            if col in df.columns and col != col_mol:
                df[col] = df[col].map(utils.encode_mol)
        # other encodable objects are labelled with a leading '_'
        for col in df.columns:
            if col.startswith('_') and col != '_Name':
                df[col] = df[col].map(utils.encode_object)
    logging.debug("Excerpt of the data to save before chunking:\n\n%s\n", df.head(3))
    # chunking
    if chunk_size is None:
        # single output
        _save(df=df, output_file=output_file, col_mol=col_mol, col_id=col_id,
              suffixes=ext_output_file, key=key, csv_sep=csv_sep)
        output_files.append([output_file, len(df.index)])
    else:
        # one output file per chunk, suffixed _001, _002, ...
        for j, start in enumerate(range(0, len(df.index), chunk_size), start=1):
            end = start + chunk_size
            output_chunk = str(output_dir) + "/" + key + "_" + str(j).zfill(3) + ''.join(ext_output_file)
            _save(df=df.iloc[start:end], output_file=output_chunk, col_mol=col_mol, col_id=col_id,
                  suffixes=ext_output_file, key=key, csv_sep=csv_sep)
            output_files.append([output_chunk, len(df.iloc[start:end].index)])
        logging.debug("%s chunks were created", len(output_files))
    return output_files
def _save(df: DataFrame, output_file: str, col_mol: str, col_id: str, suffixes: List[str], key: str, csv_sep: str): """Helper function for the save method. Does the actual export to the output file and picks a format based on provided infos. :param df: the input DataFrame :param suffixes: the suffixes of the output file :param key: the key for a HDF file :param csv_sep: the separator for a CSV file """ # infer from pandas.to_csv does not work as expected (no compression!) # so I need to specify the compression type manually. utils.check_arg_output_file(output_file) out_format, out_compression = utils.get_file_format(output_file) if out_format == 'CSV': if out_compression == 'gzip': df.to_csv(output_file, sep=csv_sep, compression=out_compression, index=False) else: df.to_csv(output_file, sep=csv_sep, index=False) elif out_format == 'HDF': df.to_hdf(output_file, key=key) elif out_format == 'SDF': # write the uncompressed file if out_compression == 'gzip': # init output_file_base = '.'.join(output_file.split('.')[:-1]) logging.debug("Output_file_base: %s", output_file_base) # write the file uncompressed write_sdf(df, output_file_base, molColName=col_mol, idName=col_id, properties=list(df.columns)) # compress the file with open(output_file_base, 'rb') as OUTPUT: with gzip.open(output_file, 'wb') as ARCHIVE: shutil.copyfileobj(OUTPUT, ARCHIVE) # delete the uncompressed file as it is only a byproduct Path(output_file_base).unlink() else: write_sdf(df, output_file, molColName=col_mol, idName=col_id, properties=list(df.columns)) elif out_format == 'FEATHER': df.to_feather(output_file) else: raise ValueError(f"Error! Cannot save DataFrame to unexpected format '{suffixes[0]}'.") logging.debug("Saved %s records at '%s'.", len(df.index), output_file)
[docs]def write_sdf(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False): """ Redefinition of PandasTools.WriteSDF because RDKit 2019.03.1 is incompatible with Pandas 25.1. Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export all columns. The "allNumeric" flag allows to automatically include all numeric columns in the output. User has to make sure that correct data type is assigned to column. "idName" can be used to select a column to serve as molecule title. It can be set to "RowID" to use the dataframe row key as title. """ close = None if isinstance(out, str): if out.lower()[-3:] == ".gz": out = gzip.open(out, "wt") close = out.close writer = SDWriter(out) if properties is None: properties = [] else: properties = list(properties) if allNumeric: properties.extend([ dt for dt in df.dtypes.keys() if (np.issubdtype(df.dtypes[dt], np.floating) or np.issubdtype(df.dtypes[dt], np.integer)) ]) if molColName in properties: properties.remove(molColName) if idName in properties: properties.remove(idName) writer.SetProps(properties) for row in df.iterrows(): # make a local copy I can modify mol = Chem.Mol(row[1][molColName]) if idName is not None: if idName == 'RowID': mol.SetProp('_Name', str(row[0])) else: mol.SetProp('_Name', str(row[1][idName])) for p in properties: cell_value = row[1][p] # Make sure float does not get formatted in E notation if np.issubdtype(type(cell_value), np.floating): s = '{:f}'.format(cell_value).rstrip("0") # "f" will show 7.0 as 7.00000 if s[-1] == ".": s += "0" # put the "0" back on if it's something like "7." mol.SetProp(p, s) else: mol.SetProp(p, str(cell_value)) writer.write(mol) writer.close() if close is not None: close()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CLASSES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
[docs]class SafeHDF5Store(HDFStore): """Implement safe HDFStore by obtaining file lock. Multiple writes will queue if lock is not obtained. Edited after: https://stackoverflow.com/questions/41231678/obtaining-a-exclusive-lock-when-writing-to-an-hdf5-file """ def __init__(self, *args, **kwargs): """Initialize and obtain file lock.""" interval = kwargs.pop('probe_interval', random()) self._lock = f"{args[0]}.lock" while True: try: self._flock = os.open(self._lock, os.O_CREAT | os.O_EXCL | os.O_WRONLY) break except (IOError, OSError): time.sleep(interval) HDFStore.__init__(self, *args, **kwargs) def __exit__(self, *args, **kwargs): """Exit and remove file lock.""" HDFStore.__exit__(self, *args, **kwargs) os.close(self._flock) os.remove(self._lock)