Source code for atml.exp

"""
The :mod:`atml.exp` module holds a set of functions to perform machine learning experiments and gather the
corresponding performance metrics.
"""
# Author: Hao Song (nuacesh@gmail.com)
# License: BSD-3

import numpy
import pandas
import openml


def get_random_split_measurement(model_instance, x, y, measure, sparse=False, cap_size=10000, test_size=0.5):
    """
    Perform a random split validation experiment for a given combination of model, dataset, and evaluation
    measure.

    Parameters
    ----------
    model_instance: sklearn.predictor
        A model instance following the sklearn predictor template. It should have a fit() method for model
        training and a predict_proba() method to predict probability vectors on test data.
    x: numpy.ndarray
        The data matrix with shape (n samples, d dimensions).
    y: numpy.ndarray
        The label vector with shape (n samples, 1).
    measure: atml.Measure
        An evaluation measure selected from the atml.measure module.
    sparse: boolean
        Indicates whether to only use a subset of the dataset to perform the experiments.
    cap_size: integer
        In the case sparse=True, cap_size specifies the maximum size of the dataset used to run the
        experiments.
    test_size: float
        The proportion of the dataset that is used as the testing set (validation set).

    Returns
    -------
    measurement: float
        The performance measurement on the testing set (validation set).
    """
    # x : shape (n, m)
    # y : shape (n, 1)
    # y_vector : shape (n, k)
    n = numpy.shape(x)[0]
    _, y = numpy.unique(y, return_inverse=True)
    k = len(numpy.unique(y))

    # Shuffle the samples before splitting.
    shuffle_idx = numpy.random.permutation(numpy.arange(0, n))
    x = x[shuffle_idx, :]
    y = y[shuffle_idx]

    # One-hot encode the labels.
    y_vector = numpy.zeros((n, k))
    for i in range(0, k):
        y_vector[:, i] = (y == i)

    # Optionally cap the dataset size with a class-stratified subsample.
    if sparse and (n > cap_size):
        class_count = numpy.ceil(numpy.mean(y_vector, axis=0) * cap_size).astype('int')
        tmp_x = []
        tmp_y = []
        tmp_y_vector = []
        for i in range(0, k):
            idx_list = numpy.argwhere(y == i).ravel()
            tmp_idx = numpy.random.choice(idx_list, class_count[i], replace=False)
            tmp_x.append(x[tmp_idx, :])
            tmp_y.append(y[tmp_idx])
            tmp_y_vector.append(y_vector[tmp_idx, :])
        x = numpy.vstack(tmp_x)
        y = numpy.hstack(tmp_y)
        y_vector = numpy.vstack(tmp_y_vector)
        # Update n to the capped size so the train/test split below is consistent.
        n = numpy.shape(x)[0]

    # Draw a class-stratified training set; the remaining samples form the test set.
    n_train = numpy.ceil(n * (1 - test_size)).astype('int')
    selected_idx = numpy.zeros(n)
    class_count = numpy.ceil(numpy.mean(y_vector, axis=0) * n_train).astype('int')
    x_train = []
    y_train = []
    for j in range(0, k):
        idx_list = numpy.argwhere(y == j).ravel()
        tmp_idx = numpy.random.choice(idx_list, class_count[j], replace=False)
        x_train.append(x[tmp_idx, :])
        y_train.append(y[tmp_idx])
        selected_idx[tmp_idx] = 1.0
    x_train = numpy.vstack(x_train)
    y_train = numpy.hstack(y_train)
    x_test = x[selected_idx == 0, :]
    y_test = y[selected_idx == 0]

    # One-hot encode the train/test labels (fresh loop variable so k is not shadowed).
    y_train_vector = numpy.zeros((len(y_train), k))
    y_test_vector = numpy.zeros((len(y_test), k))
    for i in range(0, k):
        y_train_vector[:, i] = (y_train == i)
        y_test_vector[:, i] = (y_test == i)

    # Fit the model and score the test set.
    mdl = model_instance
    mdl.fit(x_train, y_train)
    s_test = mdl.predict_proba(x_test)

    # Replace any non-finite probability rows with the training class frequencies.
    bad_rows = ~numpy.isfinite(numpy.sum(s_test, axis=1))
    s_test[bad_rows, :] = numpy.repeat(numpy.mean(y_train_vector, axis=0).reshape(1, -1),
                                       numpy.sum(bad_rows), axis=0)

    measurement = measure.get_measure(s_test, y_test_vector)
    return measurement
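# Usage sketch (not part of the original module): a minimal example of calling
# get_random_split_measurement with a scikit-learn classifier. _AccuracyMeasure
# is a hypothetical stand-in, not an atml class: any object with a
# get_measure(scores, label_matrix) method (plus a .name attribute, used by
# get_exhaustive_testing below) fits the interface expected here.
class _AccuracyMeasure:
    """Hypothetical stand-in measure: top-1 accuracy from probability vectors."""
    name = 'accuracy'

    def get_measure(self, s, y_vector):
        # Compare the predicted class (argmax of scores) against the one-hot labels.
        return numpy.mean(numpy.argmax(s, axis=1) == numpy.argmax(y_vector, axis=1))


def _example_random_split():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    x, y = load_iris(return_X_y=True)
    measurement = get_random_split_measurement(LogisticRegression(max_iter=1000), x, y,
                                               _AccuracyMeasure(), test_size=0.5)
    print(measurement)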
def get_openml_testing(openml_dict, flow_dict, measure, max_n_exp=10):
    """
    Gather machine learning experiment results from OpenML.

    Parameters
    ----------
    openml_dict: dict
        A dictionary that maps a user-defined dataset index to (1) the name of the dataset, (2) the OpenML
        dataset ID, and (3) the OpenML task ID.
        Example: {0: ('adult', 1590, 7592)}
    flow_dict: dict
        A dictionary that maps an OpenML flow (model) ID to a user-defined flow (model) index.
        Example: {1172: 0}
    measure: str
        The selected evaluation measure as defined by OpenML. Example: 'predictive_accuracy'
        See: https://www.openml.org/search?type=measure
    max_n_exp: int
        The maximum number of results collected for each dataset and task combination.

    Returns
    -------
    res: pandas.DataFrame
        A table that contains the collected experiment results.
    """
    data = []
    for ii in range(0, len(openml_dict)):
        for jj in range(0, len(flow_dict)):
            did = openml_dict[ii][1]
            tid = openml_dict[ii][2]
            flow_id = list(flow_dict.keys())[jj]
            # Query OpenML for existing evaluations of this (task, flow) pair.
            run_list = list(openml.evaluations.list_evaluations(function=measure, tasks=[tid],
                                                                flows=[flow_id]).items())
            print([tid, did, flow_id, measure, len(run_list)])
            for j in range(0, min(max_n_exp, len(run_list))):
                try:
                    tmp_run = openml.runs.get_run(run_id=run_list[j][0])
                    data.append([ii, openml_dict[ii][0], flow_dict[tmp_run.flow_id], tmp_run.flow_name,
                                 run_list[j][1].value])
                except ValueError:
                    pass
    res = pandas.DataFrame(data, columns=['data_idx', 'data_ref', 'model_idx', 'model_ref', measure])
    return res
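# Usage sketch (not part of the original module): collecting evaluation results
# from OpenML. The IDs reuse the docstring examples (dataset 1590 / task 7592
# for 'adult', flow 1172); any valid OpenML IDs work. Requires network access
# and the openml package.
def _example_openml_testing():
    openml_dict = {0: ('adult', 1590, 7592)}
    flow_dict = {1172: 0}
    res = get_openml_testing(openml_dict, flow_dict, 'predictive_accuracy', max_n_exp=5)
    print(res)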
def get_exhaustive_testing(data_dict, get_data, model_dict, get_model, measure, sparse=False, cap_size=10000,
                           test_size=0.5):
    """
    Perform testing experiments on all the possible combinations between different models and datasets.

    Parameters
    ----------
    data_dict: dict
        A dictionary that defines the index and the reference name of all the datasets.
        Example: data_dict = {0: 'iris', 1: 'digits', 2: 'wine'}
    get_data: Callable
        A function that takes the dataset reference name (as stored in data_dict) and returns the features (x)
        and target (y) for the specified dataset.
    model_dict: dict
        A dictionary that defines the index and the reference name of all the models.
        Example: model_dict = {0: 'logistic regression', 1: 'random forest', 2: 'naive bayes'}
    get_model: Callable
        A function that takes the model reference name (as stored in model_dict) and returns an instance of a
        model class with a sklearn template. The model should have a fit(x, y) method for training and
        predict_proba(x) for testing.
    measure: atml.Measure
        An evaluation measure selected from the atml.measure module.
    sparse: boolean
        Indicates whether to only use a subset of the dataset to perform the experiments.
    cap_size: integer
        In the case sparse=True, cap_size specifies the maximum size of the dataset used to run the
        experiments.
    test_size: float
        The proportion of the dataset that is used as the testing set (validation set).

    Returns
    -------
    res: pandas.DataFrame
        A table that contains the collected experiment results.
    """
    n_data = len(data_dict)
    n_model = len(model_dict)
    res = pandas.DataFrame(columns=['data_idx', 'data_ref', 'model_idx', 'model_ref', measure.name])
    idx = 0
    # Run every model on every dataset and record one measurement per pair.
    for i in range(0, n_data):
        for j in range(0, n_model):
            mdl = get_model(model_dict[j])
            x, y = get_data(data_dict[i])
            tmp_m = get_random_split_measurement(mdl, x, y, measure, sparse=sparse, cap_size=cap_size,
                                                 test_size=test_size)
            res.loc[idx] = [i, data_dict[i], j, model_dict[j], tmp_m]
            idx = idx + 1
    return res
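# Usage sketch (not part of the original module): running the full model x
# dataset grid with scikit-learn loaders and classifiers. Note that get_data
# and get_model receive the reference names stored in the dictionaries, not
# the integer indices. _AccuracyMeasure is the hypothetical stand-in defined
# in the sketch after get_random_split_measurement.
def _example_exhaustive_testing():
    from sklearn.datasets import load_iris, load_wine
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    data_dict = {0: 'iris', 1: 'wine'}
    model_dict = {0: 'logistic regression', 1: 'naive bayes'}

    def get_data(name):
        # Map a dataset reference name to (x, y) arrays.
        return {'iris': load_iris, 'wine': load_wine}[name](return_X_y=True)

    def get_model(name):
        # Map a model reference name to a fresh sklearn estimator instance.
        return {'logistic regression': LogisticRegression(max_iter=1000),
                'naive bayes': GaussianNB()}[name]

    res = get_exhaustive_testing(data_dict, get_data, model_dict, get_model, _AccuracyMeasure())
    print(res)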
def get_single_testing(data_idx, mdl, data_dict, get_data, measure, sparse=False, cap_size=10000,
                       test_size=0.5):
    """
    Perform a single testing experiment on a specified dataset with the given model.

    Parameters
    ----------
    data_idx: int
        The index of the selected dataset, as defined in data_dict.
    mdl: sklearn.predictor
        An instance of a sklearn predictor. The model should have a fit(x, y) method for training and
        predict_proba(x) for testing.
    data_dict: dict
        A dictionary that defines the index and the reference name of all the datasets.
        Example: data_dict = {0: 'iris', 1: 'digits', 2: 'wine'}
    get_data: Callable
        A function that takes the dataset reference name (as stored in data_dict) and returns the features (x)
        and target (y) for the specified dataset.
    measure: atml.Measure
        An evaluation measure selected from the atml.measure module.
    sparse: boolean
        Indicates whether to only use a subset of the dataset to perform the experiments.
    cap_size: int
        In the case sparse=True, cap_size specifies the maximum size of the dataset used to run the
        experiments.
    test_size: float
        The proportion of the dataset that is used as the testing set (validation set).

    Returns
    -------
    tmp_m: float
        The performance measurement on the testing set (validation set).
    """
    x, y = get_data(data_dict[data_idx])
    tmp_m = get_random_split_measurement(mdl, x, y, measure, sparse=sparse, cap_size=cap_size,
                                         test_size=test_size)
    return tmp_m
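# Usage sketch (not part of the original module): a single dataset/model run.
# get_data receives the reference name data_dict[data_idx], so the dictionary
# lookup happens inside the call. _AccuracyMeasure is the hypothetical
# stand-in defined after get_random_split_measurement above.
def _example_single_testing():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    data_dict = {0: 'iris'}

    def get_data(name):
        return {'iris': load_iris}[name](return_X_y=True)

    m = get_single_testing(0, LogisticRegression(max_iter=1000), data_dict, get_data, _AccuracyMeasure())
    print(m)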