Module vflow.helpers

User-facing helper functions that are included when importing vflow

Expand source code
"""User-facing helper functions included at import vflow
"""
from functools import partial
from itertools import product
from typing import Union

import mlflow
import numpy as np

from vflow.utils import dict_to_df, dict_keys, dict_data
from vflow.vfunc import Vfunc
from vflow.vset import Vset, Subkey, PREV_KEY, FILTER_PREV_KEY


def init_args(args_tuple: Union[tuple, list], names=None):
    """Wraps each argument in its own dict so that it can be passed to a Vset

    Parameters
    ----------
    args_tuple: tuple or list
        the arguments to wrap; one output dict is produced per entry
    names: list-like (optional), default None
        given names for each of the arguments in the tuple; when omitted,
        every argument is named 'start'

    Returns
    -------
    output_dicts : list
        one dict per argument, mapping the key `(Subkey(name, 'init'),)` to the
        argument and `PREV_KEY` to `('init',)`
    """
    if names is not None:
        assert len(names) == len(args_tuple), 'names should be same length as args_tuple'
    else:
        names = ['start'] * len(args_tuple)
    return [
        {
            (Subkey(names[idx], 'init'),): arg,
            PREV_KEY: ('init',),
        }
        for idx, arg in enumerate(args_tuple)
    ]


def build_vset(name: str, obj, *args, param_dict=None, reps: int = 1,
               is_async: bool = False, output_matching: bool = False,
               lazy: bool = False, cache_dir: str = None, verbose: bool = True,
               tracking_dir: str = None, **kwargs) -> Vset:
    """Creates a Vset holding one Vfunc per combination of the values in param_dict
    (times reps), each wrapping obj with that combination applied as kwargs.

    Parameters
    ----------
    name: str
        a name for the output Vset
    obj: callable
        a callable to use as the base for Vfuncs in the output Vset; a class
        is instantiated, any other callable is wrapped with functools.partial
    param_dict: dict[str, list]
        keys are obj kwarg names and values in the dict are lists of params to try
    *args
        additional fixed arguments to pass to obj
    reps: int (optional)
        the number of times to repeat the obj in the output Vset's modules for
        each combination of params in param_dict
    is_async: bool (optional)
        if True, modules are computed asynchronously
    output_matching: bool (optional)
        if True, then output keys from Vset will be matched when used
        in other Vsets
    lazy: bool (optional)
        forwarded unchanged to the Vset constructor
    cache_dir: str (optional)
        if provided, do caching and use cache_dir as the data store for
        joblib.Memory
    verbose : bool (optional)
        if True, modules are named with param_dict items as tuples of str("param_name=param_val")
    tracking_dir: str (optional)
        if provided, use the mlflow.tracking api to log outputs as metrics
        with params determined by input keys
    **kwargs
        additional fixed keyword arguments to pass to obj; these take
        precedence over same-named entries of param_dict

    Returns
    -------
    new_vset : Vset
    """
    grid = {} if param_dict is None else param_dict
    assert callable(obj), 'obj must be callable'

    param_names = list(grid.keys())
    is_class = isinstance(obj, type)
    modules = []
    module_keys = []

    for combo in product(*list(grid.values())):
        # one 'param_name=param_val' label per varied parameter
        labels = tuple(f'{p_name}={p_val}' for p_name, p_val in zip(param_names, combo))
        # fixed kwargs are merged last so they override the grid values
        call_kwargs = {**dict(zip(param_names, combo)), **kwargs}
        vfunc_name = str(labels)
        for rep in range(reps):
            # the repetition index only enters the key when there are several reps
            module_keys.append((f'rep={rep}', ) + labels if reps > 1 else labels)
            if is_class:
                # a fresh instance is built for every repetition
                module = obj(*args, **call_kwargs)
            else:
                module = partial(obj, *args, **call_kwargs)
            modules.append(Vfunc(module=module, name=vfunc_name))

    # explicit keys are only passed on when requested and when there is
    # something to tell the modules apart by
    keep_keys = verbose and (len(grid) != 0 or reps != 1)
    return Vset(name, modules, is_async=is_async,
                module_keys=module_keys if keep_keys else None,
                output_matching=output_matching, lazy=lazy,
                cache_dir=cache_dir, tracking_dir=tracking_dir)


def filter_vset_by_metric(metric_dict: dict, vset: Vset, *vsets: Vset, n_keep: int = 1,
                          bigger_is_better: bool = True, filter_on=None,
                          group: bool = False) -> Union[Vset, list]:
    """Returns new Vsets by filtering `vset.modules` based on values in metric_dict.

    Parameters
    ----------
    metric_dict: dict
        output from a Vset, typically with metrics or other numeric values to use when
        filtering `vset.modules`
    vset: Vset
        a Vset
    *vsets: Vset
        zero or more additional Vsets
    n_keep: int (optional)
        number of entries to keep from `vset.modules`
    bigger_is_better: bool (optional)
        if True, then the top `n_keep` largest values are retained
    filter_on: list[str] (optional)
        if there are multiple metrics in `metric_dict`, you can specify a subset
        to consider
    group: bool (optional)
        if True, average metrics after grouping values in `metric_dict` by the
        input Vset names

    Returns
    -------
    new_vset : Vset or list of Vset
        Copies of the input Vsets but with Vfuncs filtered based on metrics;
        a single Vset when only `vset` was given, otherwise a list with one
        filtered Vset per input, in input order

    Raises
    ------
    ValueError
        if the name of an input Vset is not a column of `dict_to_df(metric_dict)`
    """
    if filter_on is None:
        filter_on = []
    df = dict_to_df(metric_dict)
    vsets = [vset, *vsets]
    vset_names = []
    for vset_i in vsets:
        if vset_i.name not in df.columns:
            raise ValueError((f'{vset_i.name} should be one '
                              'of the columns of dict_to_df(metric_dict)'))
        vset_names.append(vset_i.name)
    if len(filter_on) > 0:
        # the column to filter on is named after the origin of the last subkey
        filter_col = list(metric_dict.keys())[0][-1].origin
        df = df[df[filter_col].isin(filter_on)]
    if group:
        # Average only the metric column. Calling .mean() on the whole frame
        # raises a TypeError on pandas >= 2.0 as soon as a non-grouping column
        # is non-numeric, and only the grouping columns and 'out' are used below.
        df = df.groupby(by=vset_names, as_index=False)['out'].mean()
    # best rows first, then keep the top n_keep
    df = df.sort_values(by='out', ascending=not bigger_is_better)
    df = df.iloc[0:n_keep]
    for i, vset_i in enumerate(vsets):
        # names retained for this Vset, as strings for comparison with Vfunc names
        kept_names = {str(name) for name in df[vset_i.name].to_numpy()}
        new_vfuncs = {k: v for k, v in vset_i.modules.items() if str(v.name) in kept_names}
        tracking_dir = None if vset_i._mlflow is None else mlflow.get_tracking_uri()
        new_vset = Vset('filtered_' + vset_i.name, new_vfuncs, is_async=vset_i._async,
                        output_matching=vset_i._output_matching, lazy=vset_i._lazy,
                        cache_dir=vset_i._cache_dir, tracking_dir=tracking_dir)
        # record both the metrics' predecessor and the unfiltered Vset on the new Vset
        setattr(new_vset, FILTER_PREV_KEY, (metric_dict[PREV_KEY], vset_i,))
        setattr(new_vset, PREV_KEY, getattr(new_vset, FILTER_PREV_KEY))
        vsets[i] = new_vset
    if len(vsets) == 1:
        return vsets[0]
    return vsets


def cum_acc_by_uncertainty(mean_preds, std_preds, true_labels):
    """Computes the cumulative accuracy of grouped class predictions, taken in
    increasing order of uncertainty

    Parameters
    ----------
    mean_preds: dict
        mean predictions, output from Vset.predict_with_uncertainties
    std_preds: dict
        std predictions, output from Vset.predict_with_uncertainties; must
        have the same keys as mean_preds
    true_labels: dict or list-like
        the true labels; a dict must hold exactly one entry, which is used
        as the label vector

    Returns
    -------
    uncertainty : np.ndarray
        std predictions sorted in increasing order along axis 1
    cum_acc : np.ndarray
        running fraction of correct (rounded) predictions along axis 1, in
        the same order as uncertainty
    sorted_idx : np.ndarray
        the indices that sort the std predictions along axis 1

    TODO: generalize to multi-class classification
    """
    pred_keys = dict_keys(mean_preds)
    assert pred_keys == dict_keys(std_preds), \
        "mean_preds and std_preds must share the same keys"
    # pair mean/std entries key by key, then regroup them into one stack each
    matched = [(mean_preds[key], std_preds[key]) for key in pred_keys]
    # only index 1 of the last axis is kept (presumably the positive-class
    # column of a binary prediction -- TODO confirm)
    mean_arr, std_arr = (np.array(stack)[:, :, 1] for stack in zip(*matched))
    if isinstance(true_labels, dict):
        label_entries = dict_data(true_labels)
        assert len(label_entries) == 1, 'true_labels should have a single 1D vector entry'
        true_labels = label_entries[0]
    n_obs = len(mean_arr[0])
    assert len(true_labels) == n_obs, \
        f'true_labels has {len(true_labels)} obs. but should have same as predictions ({n_obs})'
    # a prediction counts as correct when its rounded mean equals the label
    is_correct = np.around(mean_arr) - true_labels == 0
    order = np.argsort(std_arr, axis=1)
    uncertainty = np.take_along_axis(std_arr, order, 1)
    hits_by_certainty = np.take_along_axis(is_correct, order, 1)
    cum_acc = np.cumsum(hits_by_certainty, axis=1) / range(1, n_obs+1)
    return uncertainty, cum_acc, order

Functions

def build_vset(name: str, obj, *args, param_dict=None, reps: int = 1, is_async: bool = False, output_matching: bool = False, lazy: bool = False, cache_dir: str = None, verbose: bool = True, tracking_dir: str = None, **kwargs) ‑> Vset

Builds a Vset by currying callable obj with all combinations of parameters in param_dict.

Parameters

name : str
a name for the output Vset
obj : callable
a callable to use as the base for Vfuncs in the output Vset
param_dict : dict[str, list]
keys are obj kwarg names and values in the dict are lists of params to try
*args
additional fixed arguments to pass to obj
reps : int (optional)
the number of times to repeat the obj in the output Vset's modules for each combination of params in param_dict
is_async : bool (optional)
if True, modules are computed asynchronously
output_matching : bool (optional)
if True, then output keys from Vset will be matched when used in other Vsets
cache_dir : str (optional)
if provided, do caching and use cache_dir as the data store for joblib.Memory
verbose : bool (optional)
if True, modules are named with param_dict items as tuples of str("param_name=param_val")
tracking_dir : str (optional)
if provided, use the mlflow.tracking api to log outputs as metrics with params determined by input keys
**kwargs
additional fixed keyword arguments to pass to obj

Returns

new_vset : Vset
 
Expand source code
def build_vset(name: str, obj, *args, param_dict=None, reps: int = 1,
               is_async: bool = False, output_matching: bool = False,
               lazy: bool = False, cache_dir: str = None, verbose: bool = True,
               tracking_dir: str = None, **kwargs) -> Vset:
    """Creates a Vset holding one Vfunc per combination of the values in param_dict
    (times reps), each wrapping obj with that combination applied as kwargs.

    Parameters
    ----------
    name: str
        a name for the output Vset
    obj: callable
        a callable to use as the base for Vfuncs in the output Vset; a class
        is instantiated, any other callable is wrapped with functools.partial
    param_dict: dict[str, list]
        keys are obj kwarg names and values in the dict are lists of params to try
    *args
        additional fixed arguments to pass to obj
    reps: int (optional)
        the number of times to repeat the obj in the output Vset's modules for
        each combination of params in param_dict
    is_async: bool (optional)
        if True, modules are computed asynchronously
    output_matching: bool (optional)
        if True, then output keys from Vset will be matched when used
        in other Vsets
    lazy: bool (optional)
        forwarded unchanged to the Vset constructor
    cache_dir: str (optional)
        if provided, do caching and use cache_dir as the data store for
        joblib.Memory
    verbose : bool (optional)
        if True, modules are named with param_dict items as tuples of str("param_name=param_val")
    tracking_dir: str (optional)
        if provided, use the mlflow.tracking api to log outputs as metrics
        with params determined by input keys
    **kwargs
        additional fixed keyword arguments to pass to obj; these take
        precedence over same-named entries of param_dict

    Returns
    -------
    new_vset : Vset
    """
    grid = {} if param_dict is None else param_dict
    assert callable(obj), 'obj must be callable'

    param_names = list(grid.keys())
    is_class = isinstance(obj, type)
    modules = []
    module_keys = []

    for combo in product(*list(grid.values())):
        # one 'param_name=param_val' label per varied parameter
        labels = tuple(f'{p_name}={p_val}' for p_name, p_val in zip(param_names, combo))
        # fixed kwargs are merged last so they override the grid values
        call_kwargs = {**dict(zip(param_names, combo)), **kwargs}
        vfunc_name = str(labels)
        for rep in range(reps):
            # the repetition index only enters the key when there are several reps
            module_keys.append((f'rep={rep}', ) + labels if reps > 1 else labels)
            if is_class:
                # a fresh instance is built for every repetition
                module = obj(*args, **call_kwargs)
            else:
                module = partial(obj, *args, **call_kwargs)
            modules.append(Vfunc(module=module, name=vfunc_name))

    # explicit keys are only passed on when requested and when there is
    # something to tell the modules apart by
    keep_keys = verbose and (len(grid) != 0 or reps != 1)
    return Vset(name, modules, is_async=is_async,
                module_keys=module_keys if keep_keys else None,
                output_matching=output_matching, lazy=lazy,
                cache_dir=cache_dir, tracking_dir=tracking_dir)
def cum_acc_by_uncertainty(mean_preds, std_preds, true_labels)

Returns uncertainty and cumulative accuracy for grouped class predictions, sorted in increasing order of uncertainty

Params

mean_preds : dict
mean predictions, output from Vset.predict_with_uncertainties
std_preds : dict
std predictions, output from Vset.predict_with_uncertainties
true_labels : dict or list-like

TODO: generalize to multi-class classification

Expand source code
def cum_acc_by_uncertainty(mean_preds, std_preds, true_labels):
    """Computes the cumulative accuracy of grouped class predictions, taken in
    increasing order of uncertainty

    Parameters
    ----------
    mean_preds: dict
        mean predictions, output from Vset.predict_with_uncertainties
    std_preds: dict
        std predictions, output from Vset.predict_with_uncertainties; must
        have the same keys as mean_preds
    true_labels: dict or list-like
        the true labels; a dict must hold exactly one entry, which is used
        as the label vector

    Returns
    -------
    uncertainty : np.ndarray
        std predictions sorted in increasing order along axis 1
    cum_acc : np.ndarray
        running fraction of correct (rounded) predictions along axis 1, in
        the same order as uncertainty
    sorted_idx : np.ndarray
        the indices that sort the std predictions along axis 1

    TODO: generalize to multi-class classification
    """
    pred_keys = dict_keys(mean_preds)
    assert pred_keys == dict_keys(std_preds), \
        "mean_preds and std_preds must share the same keys"
    # pair mean/std entries key by key, then regroup them into one stack each
    matched = [(mean_preds[key], std_preds[key]) for key in pred_keys]
    # only index 1 of the last axis is kept (presumably the positive-class
    # column of a binary prediction -- TODO confirm)
    mean_arr, std_arr = (np.array(stack)[:, :, 1] for stack in zip(*matched))
    if isinstance(true_labels, dict):
        label_entries = dict_data(true_labels)
        assert len(label_entries) == 1, 'true_labels should have a single 1D vector entry'
        true_labels = label_entries[0]
    n_obs = len(mean_arr[0])
    assert len(true_labels) == n_obs, \
        f'true_labels has {len(true_labels)} obs. but should have same as predictions ({n_obs})'
    # a prediction counts as correct when its rounded mean equals the label
    is_correct = np.around(mean_arr) - true_labels == 0
    order = np.argsort(std_arr, axis=1)
    uncertainty = np.take_along_axis(std_arr, order, 1)
    hits_by_certainty = np.take_along_axis(is_correct, order, 1)
    cum_acc = np.cumsum(hits_by_certainty, axis=1) / range(1, n_obs+1)
    return uncertainty, cum_acc, order
def filter_vset_by_metric(metric_dict: dict, vset: Vset, *vsets: Vset, n_keep: int = 1, bigger_is_better: bool = True, filter_on=None, group: bool = False) ‑> Union[Vset, list]

Returns a new Vset by filtering vset.modules based on values in metric_dict.

Parameters

metric_dict : dict
output from a Vset, typically with metrics or other numeric values to use when filtering vset.modules
vset : Vset
a Vset
*vsets : Vset
zero or more additional Vsets
n_keep : int (optional)
number of entries to keep from vset.modules
bigger_is_better : bool (optional)
if True, then the top n_keep largest values are retained
filter_on : list[str] (optional)
if there are multiple metrics in metric_dict, you can specify a subset to consider
group : bool (optional)
if True, average metrics after grouping values in metric_dict by the input Vset names

Returns

*new_vset : Vset
Copies of the input Vsets but with Vfuncs filtered based on metrics
Expand source code
def filter_vset_by_metric(metric_dict: dict, vset: Vset, *vsets: Vset, n_keep: int = 1,
                          bigger_is_better: bool = True, filter_on=None,
                          group: bool = False) -> Union[Vset, list]:
    """Returns new Vsets by filtering `vset.modules` based on values in metric_dict.

    Parameters
    ----------
    metric_dict: dict
        output from a Vset, typically with metrics or other numeric values to use when
        filtering `vset.modules`
    vset: Vset
        a Vset
    *vsets: Vset
        zero or more additional Vsets
    n_keep: int (optional)
        number of entries to keep from `vset.modules`
    bigger_is_better: bool (optional)
        if True, then the top `n_keep` largest values are retained
    filter_on: list[str] (optional)
        if there are multiple metrics in `metric_dict`, you can specify a subset
        to consider
    group: bool (optional)
        if True, average metrics after grouping values in `metric_dict` by the
        input Vset names

    Returns
    -------
    new_vset : Vset or list of Vset
        Copies of the input Vsets but with Vfuncs filtered based on metrics;
        a single Vset when only `vset` was given, otherwise a list with one
        filtered Vset per input, in input order

    Raises
    ------
    ValueError
        if the name of an input Vset is not a column of `dict_to_df(metric_dict)`
    """
    if filter_on is None:
        filter_on = []
    df = dict_to_df(metric_dict)
    vsets = [vset, *vsets]
    vset_names = []
    for vset_i in vsets:
        if vset_i.name not in df.columns:
            raise ValueError((f'{vset_i.name} should be one '
                              'of the columns of dict_to_df(metric_dict)'))
        vset_names.append(vset_i.name)
    if len(filter_on) > 0:
        # the column to filter on is named after the origin of the last subkey
        filter_col = list(metric_dict.keys())[0][-1].origin
        df = df[df[filter_col].isin(filter_on)]
    if group:
        # Average only the metric column. Calling .mean() on the whole frame
        # raises a TypeError on pandas >= 2.0 as soon as a non-grouping column
        # is non-numeric, and only the grouping columns and 'out' are used below.
        df = df.groupby(by=vset_names, as_index=False)['out'].mean()
    # best rows first, then keep the top n_keep
    df = df.sort_values(by='out', ascending=not bigger_is_better)
    df = df.iloc[0:n_keep]
    for i, vset_i in enumerate(vsets):
        # names retained for this Vset, as strings for comparison with Vfunc names
        kept_names = {str(name) for name in df[vset_i.name].to_numpy()}
        new_vfuncs = {k: v for k, v in vset_i.modules.items() if str(v.name) in kept_names}
        tracking_dir = None if vset_i._mlflow is None else mlflow.get_tracking_uri()
        new_vset = Vset('filtered_' + vset_i.name, new_vfuncs, is_async=vset_i._async,
                        output_matching=vset_i._output_matching, lazy=vset_i._lazy,
                        cache_dir=vset_i._cache_dir, tracking_dir=tracking_dir)
        # record both the metrics' predecessor and the unfiltered Vset on the new Vset
        setattr(new_vset, FILTER_PREV_KEY, (metric_dict[PREV_KEY], vset_i,))
        setattr(new_vset, PREV_KEY, getattr(new_vset, FILTER_PREV_KEY))
        vsets[i] = new_vset
    if len(vsets) == 1:
        return vsets[0]
    return vsets
def init_args(args_tuple: Union[tuple, list], names=None)

Converts tuple of arguments to a list of dicts

Parameters

names : list-like (optional), default None
given names for each of the arguments in the tuple
Expand source code
def init_args(args_tuple: Union[tuple, list], names=None):
    """Wraps each argument in its own dict so that it can be passed to a Vset

    Parameters
    ----------
    args_tuple: tuple or list
        the arguments to wrap; one output dict is produced per entry
    names: list-like (optional), default None
        given names for each of the arguments in the tuple; when omitted,
        every argument is named 'start'

    Returns
    -------
    output_dicts : list
        one dict per argument, mapping the key `(Subkey(name, 'init'),)` to the
        argument and `PREV_KEY` to `('init',)`
    """
    if names is not None:
        assert len(names) == len(args_tuple), 'names should be same length as args_tuple'
    else:
        names = ['start'] * len(args_tuple)
    return [
        {
            (Subkey(names[idx], 'init'),): arg,
            PREV_KEY: ('init',),
        }
        for idx, arg in enumerate(args_tuple)
    ]