Source code for atml.exp

"""
The :mod:`atml.exp` module holds a set of functions to perform machine learning experiments and gather the
corresponding performance metrics.
"""
# Author: Hao Song (nuacesh@gmail.com)
# License: BSD-3

import numpy
import pandas
import openml


def get_random_split_measurement(model_instance, x, y, measure, sparse=False, cap_size=10000, test_size=0.5):
    """
    Perform a random split validation experiment for a given combination of model, dataset, and evaluation
    measure.

    Parameters
    ----------
    model_instance: sklearn.predictor
        A model instance following the sklearn predictor template. It should have a fit() method for model
        training and a predict_proba() method to predict probability vectors on test data.
    x: numpy.ndarray
        The data matrix with shape (n samples, d dimensions).
    y: numpy.ndarray
        The label vector with shape (n samples, 1).
    measure: atml.Measure
        An evaluation measure selected from the atml.measure module.
    sparse: boolean
        Indicates whether to only use a subset of the dataset to perform the experiments.
    cap_size: integer
        In the case sparse=True, cap_size specifies the maximum size of the dataset used to run the
        experiments.
    test_size: float
        The proportion of the dataset that is used as the testing set (validation set).

    Returns
    -------
    measurement: float
        The performance measurement on the testing set (validation set).
    """
    # x : shape (n, m)
    # y : shape (n, 1)
    # y_vector : shape (n, k)
    n = numpy.shape(x)[0]
    _, y = numpy.unique(y, return_inverse=True)
    k = len(numpy.unique(y))

    # Shuffle the samples before splitting.
    shuffle_idx = numpy.random.permutation(numpy.arange(0, n))
    x = x[shuffle_idx, :]
    y = y[shuffle_idx]

    # One-hot encode the labels.
    y_vector = numpy.zeros((n, k))
    for i in range(0, k):
        y_vector[:, i] = (y == i)

    # Optionally cap the dataset size with a class-stratified subsample.
    if sparse and (n > cap_size):
        class_count = numpy.ceil(numpy.mean(y_vector, axis=0) * cap_size).astype('int')
        tmp_x = []
        tmp_y = []
        tmp_y_vector = []
        for i in range(0, k):
            idx_list = numpy.argwhere(y == i).ravel()
            tmp_idx = numpy.random.choice(idx_list, class_count[i], replace=False)
            tmp_x.append(x[tmp_idx, :])
            tmp_y.append(y[tmp_idx])
            tmp_y_vector.append(y_vector[tmp_idx, :])
        x = numpy.vstack(tmp_x)
        y = numpy.hstack(tmp_y)
        y_vector = numpy.vstack(tmp_y_vector)
        # Update n to the capped size so the train/test split below is consistent.
        n = numpy.shape(x)[0]

    # Draw a class-stratified training set; the remaining samples form the test set.
    n_train = numpy.ceil(n * (1 - test_size)).astype('int')
    selected_idx = numpy.zeros(n)
    class_count = numpy.ceil(numpy.mean(y_vector, axis=0) * n_train).astype('int')
    x_train = []
    y_train = []
    for j in range(0, k):
        idx_list = numpy.argwhere(y == j).ravel()
        tmp_idx = numpy.random.choice(idx_list, class_count[j], replace=False)
        x_train.append(x[tmp_idx, :])
        y_train.append(y[tmp_idx])
        selected_idx[tmp_idx] = 1.0
    x_train = numpy.vstack(x_train)
    y_train = numpy.hstack(y_train)
    x_test = x[selected_idx == 0, :]
    y_test = y[selected_idx == 0]

    # One-hot encode the train/test labels (fresh loop variable so k is not shadowed).
    y_train_vector = numpy.zeros((len(y_train), k))
    y_test_vector = numpy.zeros((len(y_test), k))
    for i in range(0, k):
        y_train_vector[:, i] = (y_train == i)
        y_test_vector[:, i] = (y_test == i)

    # Fit the model and score the test set.
    mdl = model_instance
    mdl.fit(x_train, y_train)
    s_test = mdl.predict_proba(x_test)

    # Replace any non-finite probability rows with the training class frequencies.
    bad_rows = ~numpy.isfinite(numpy.sum(s_test, axis=1))
    s_test[bad_rows, :] = numpy.repeat(numpy.mean(y_train_vector, axis=0).reshape(1, -1),
                                       numpy.sum(bad_rows), axis=0)

    measurement = measure.get_measure(s_test, y_test_vector)
    return measurement
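# Usage sketch (not part of the original module): a minimal example of calling
# get_random_split_measurement with a scikit-learn classifier. _AccuracyMeasure
# is a hypothetical stand-in, not an atml class: any object with a
# get_measure(scores, label_matrix) method (plus a .name attribute, used by
# get_exhaustive_testing below) fits the interface expected here.
class _AccuracyMeasure:
    """Hypothetical stand-in measure: top-1 accuracy from probability vectors."""
    name = 'accuracy'

    def get_measure(self, s, y_vector):
        # Compare the predicted class (argmax of scores) against the one-hot labels.
        return numpy.mean(numpy.argmax(s, axis=1) == numpy.argmax(y_vector, axis=1))


def _example_random_split():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    x, y = load_iris(return_X_y=True)
    measurement = get_random_split_measurement(LogisticRegression(max_iter=1000), x, y,
                                               _AccuracyMeasure(), test_size=0.5)
    print(measurement)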
def get_openml_testing(openml_dict, flow_dict, measure, max_n_exp=10):
    """
    Gather machine learning experiment results from OpenML.

    Parameters
    ----------
    openml_dict: dict
        A dictionary that maps a user-defined dataset index to (1) the name of the dataset, (2) the OpenML
        dataset ID, and (3) the OpenML task ID.
        Example: {0: ('adult', 1590, 7592)}
    flow_dict: dict
        A dictionary that maps an OpenML flow (model) ID to a user-defined flow (model) index.
        Example: {1172: 0}
    measure: str
        The selected evaluation measure as defined by OpenML. Example: 'predictive_accuracy'
        See: https://www.openml.org/search?type=measure
    max_n_exp: int
        The maximum number of results collected for each dataset and task combination.

    Returns
    -------
    res: pandas.DataFrame
        A table that contains the collected experiment results.
    """
    data = []
    for ii in range(0, len(openml_dict)):
        for jj in range(0, len(flow_dict)):
            did = openml_dict[ii][1]
            tid = openml_dict[ii][2]
            flow_id = list(flow_dict.keys())[jj]
            # Query OpenML for existing evaluations of this (task, flow) pair.
            run_list = list(openml.evaluations.list_evaluations(function=measure, tasks=[tid],
                                                                flows=[flow_id]).items())
            print([tid, did, flow_id, measure, len(run_list)])
            for j in range(0, min(max_n_exp, len(run_list))):
                try:
                    tmp_run = openml.runs.get_run(run_id=run_list[j][0])
                    data.append([ii, openml_dict[ii][0], flow_dict[tmp_run.flow_id], tmp_run.flow_name,
                                 run_list[j][1].value])
                except ValueError:
                    pass
    res = pandas.DataFrame(data, columns=['data_idx', 'data_ref', 'model_idx', 'model_ref', measure])
    return res
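# Usage sketch (not part of the original module): collecting evaluation results
# from OpenML. The IDs reuse the docstring examples (dataset 1590 / task 7592
# for 'adult', flow 1172); any valid OpenML IDs work. Requires network access
# and the openml package.
def _example_openml_testing():
    openml_dict = {0: ('adult', 1590, 7592)}
    flow_dict = {1172: 0}
    res = get_openml_testing(openml_dict, flow_dict, 'predictive_accuracy', max_n_exp=5)
    print(res)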
def get_exhaustive_testing(data_dict, get_data, model_dict, get_model, measure, sparse=False, cap_size=10000,
                           test_size=0.5):
    """
    Perform testing experiments on all the possible combinations between different models and datasets.

    Parameters
    ----------
    data_dict: dict
        A dictionary that defines the index and the reference name of all the datasets.
        Example: data_dict = {0: 'iris', 1: 'digits', 2: 'wine'}
    get_data: Callable
        A function that takes the dataset reference name (as stored in data_dict) and returns the features (x)
        and target (y) for the specified dataset.
    model_dict: dict
        A dictionary that defines the index and the reference name of all the models.
        Example: model_dict = {0: 'logistic regression', 1: 'random forest', 2: 'naive bayes'}
    get_model: Callable
        A function that takes the model reference name (as stored in model_dict) and returns an instance of a
        model class with a sklearn template. The model should have a fit(x, y) method for training and
        predict_proba(x) for testing.
    measure: atml.Measure
        An evaluation measure selected from the atml.measure module.
    sparse: boolean
        Indicates whether to only use a subset of the dataset to perform the experiments.
    cap_size: integer
        In the case sparse=True, cap_size specifies the maximum size of the dataset used to run the
        experiments.
    test_size: float
        The proportion of the dataset that is used as the testing set (validation set).

    Returns
    -------
    res: pandas.DataFrame
        A table that contains the collected experiment results.
    """
    n_data = len(data_dict)
    n_model = len(model_dict)
    res = pandas.DataFrame(columns=['data_idx', 'data_ref', 'model_idx', 'model_ref', measure.name])
    idx = 0
    # Run every model on every dataset and record one measurement per pair.
    for i in range(0, n_data):
        for j in range(0, n_model):
            mdl = get_model(model_dict[j])
            x, y = get_data(data_dict[i])
            tmp_m = get_random_split_measurement(mdl, x, y, measure, sparse=sparse, cap_size=cap_size,
                                                 test_size=test_size)
            res.loc[idx] = [i, data_dict[i], j, model_dict[j], tmp_m]
            idx = idx + 1
    return res
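# Usage sketch (not part of the original module): running the full model x
# dataset grid with scikit-learn loaders and classifiers. Note that get_data
# and get_model receive the reference names stored in the dictionaries, not
# the integer indices. _AccuracyMeasure is the hypothetical stand-in defined
# in the sketch after get_random_split_measurement.
def _example_exhaustive_testing():
    from sklearn.datasets import load_iris, load_wine
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    data_dict = {0: 'iris', 1: 'wine'}
    model_dict = {0: 'logistic regression', 1: 'naive bayes'}

    def get_data(name):
        # Map a dataset reference name to (x, y) arrays.
        return {'iris': load_iris, 'wine': load_wine}[name](return_X_y=True)

    def get_model(name):
        # Map a model reference name to a fresh sklearn estimator instance.
        return {'logistic regression': LogisticRegression(max_iter=1000),
                'naive bayes': GaussianNB()}[name]

    res = get_exhaustive_testing(data_dict, get_data, model_dict, get_model, _AccuracyMeasure())
    print(res)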
def get_single_testing(data_idx, mdl, data_dict, get_data, measure, sparse=False, cap_size=10000,
                       test_size=0.5):
    """
    Perform a single testing experiment on a specified dataset with the given model.

    Parameters
    ----------
    data_idx: int
        The index of the selected dataset, as defined in data_dict.
    mdl: sklearn.predictor
        An instance of a sklearn predictor. The model should have a fit(x, y) method for training and
        predict_proba(x) for testing.
    data_dict: dict
        A dictionary that defines the index and the reference name of all the datasets.
        Example: data_dict = {0: 'iris', 1: 'digits', 2: 'wine'}
    get_data: Callable
        A function that takes the dataset reference name (as stored in data_dict) and returns the features (x)
        and target (y) for the specified dataset.
    measure: atml.Measure
        An evaluation measure selected from the atml.measure module.
    sparse: boolean
        Indicates whether to only use a subset of the dataset to perform the experiments.
    cap_size: int
        In the case sparse=True, cap_size specifies the maximum size of the dataset used to run the
        experiments.
    test_size: float
        The proportion of the dataset that is used as the testing set (validation set).

    Returns
    -------
    tmp_m: float
        The performance measurement on the testing set (validation set).
    """
    x, y = get_data(data_dict[data_idx])
    tmp_m = get_random_split_measurement(mdl, x, y, measure, sparse=sparse, cap_size=cap_size,
                                         test_size=test_size)
    return tmp_m
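# Usage sketch (not part of the original module): a single dataset/model run.
# get_data receives the reference name data_dict[data_idx], so the dictionary
# lookup happens inside the call. _AccuracyMeasure is the hypothetical
# stand-in defined after get_random_split_measurement above.
def _example_single_testing():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    data_dict = {0: 'iris'}

    def get_data(name):
        return {'iris': load_iris}[name](return_X_y=True)

    m = get_single_testing(0, LogisticRegression(max_iter=1000), data_dict, get_data, _AccuracyMeasure())
    print(m)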