Source code for autoflow.metrics

import copy
from abc import ABCMeta, abstractmethod
from functools import partial
from typing import List

import numpy as np
import pandas as pd
import scipy.stats
import sklearn.metrics
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import type_of_target

from autoflow.metrics import classification_metrics
from autoflow.utils.array_ import sanitize_array
from autoflow.utils.ml_task import MLTask


class Scorer(object, metaclass=ABCMeta):
    def __init__(self, name, score_func, optimum, sign, kwargs):
        self.name = name
        self._kwargs = kwargs
        self._score_func = score_func
        self._optimum = optimum
        self._sign = sign
        self.score = None

    @abstractmethod
    def __call__(self, y_true, y_pred, sample_weight=None):
        pass

    def __repr__(self):
        return self.name

class _PredictScorer(Scorer):
    def __call__(self, y_true, y_pred, sample_weight=None):
        """Evaluate predicted target values for X relative to y_true.

        Parameters
        ----------
        y_true : array-like
            Gold standard target values for X.

        y_pred : array-like, [n_samples x n_classes]
            Model predictions.

        sample_weight : array-like, optional (default=None)
            Sample weights.

        Returns
        -------
        score : float
            Score function applied to prediction of component on X.
        """
        type_true = type_of_target(y_true)
        if len(y_pred.shape) == 1 or y_pred.shape[1] == 1 or \
                type_true == 'continuous':
            # must be regression, all other ml_task types would return at
            # least two probabilities
            pass
        elif type_true in ['binary', 'multiclass']:
            y_pred = np.argmax(y_pred, axis=1)
        elif type_true == 'multilabel-indicator':
            y_pred[y_pred > 0.5] = 1.0
            y_pred[y_pred <= 0.5] = 0.0
        else:
            raise ValueError(type_true)

        if sample_weight is not None:
            self.score = self._score_func(y_true, y_pred,
                                          sample_weight=sample_weight,
                                          **self._kwargs)
        else:
            self.score = self._score_func(y_true, y_pred, **self._kwargs)
        return self._sign * self.score


class _ProbaScorer(Scorer):
    def __call__(self, y_true, y_pred, sample_weight=None):
        """Evaluate predicted probabilities for X relative to y_true.

        Parameters
        ----------
        y_true : array-like
            Gold standard target values for X. These must be class labels,
            not probabilities.

        y_pred : array-like, [n_samples x n_classes]
            Model predictions.

        sample_weight : array-like, optional (default=None)
            Sample weights.

        Returns
        -------
        score : float
            Score function applied to prediction of component on X.
        """
        if sample_weight is not None:
            self.score = self._score_func(y_true, y_pred,
                                          sample_weight=sample_weight,
                                          **self._kwargs)
        else:
            self.score = self._score_func(y_true, y_pred, **self._kwargs)
        return self._sign * self.score


class _ThresholdScorer(Scorer):
    def __call__(self, y_true, y_pred, sample_weight=None):
        """Evaluate decision function output for X relative to y_true.

        Parameters
        ----------
        y_true : array-like
            Gold standard target values for X. These must be class labels,
            not probabilities.

        y_pred : array-like, [n_samples x n_classes]
            Model predictions.

        sample_weight : array-like, optional (default=None)
            Sample weights.

        Returns
        -------
        score : float
            Score function applied to prediction of component on X.
        """
        y_type = type_of_target(y_true)
        if y_type not in ("binary", "multilabel-indicator"):
            raise ValueError("{0} format is not supported".format(y_type))

        if y_type == "binary":
            y_pred = y_pred[:, 1]
        elif isinstance(y_pred, list):
            y_pred = np.vstack([p[:, -1] for p in y_pred]).T

        if sample_weight is not None:
            self.score = self._score_func(y_true, y_pred,
                                          sample_weight=sample_weight,
                                          **self._kwargs)
        else:
            self.score = self._score_func(y_true, y_pred, **self._kwargs)
        return self._sign * self.score

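# Illustrative sketch (not part of the original module): a _PredictScorer
# reduces an (n_samples, n_classes) probability matrix to hard labels via
# argmax before calling its metric. _example_predict_scorer is a
# hypothetical name used only for this example.
def _example_predict_scorer():
    y_true = np.array([0, 1, 1, 2])
    y_proba = np.array([[0.8, 0.1, 0.1],
                        [0.2, 0.7, 0.1],
                        [0.1, 0.6, 0.3],
                        [0.3, 0.3, 0.4]])
    acc = _PredictScorer('accuracy', sklearn.metrics.accuracy_score,
                         optimum=1, sign=1, kwargs={})
    # argmax -> [0, 1, 1, 2], so accuracy is 1.0
    return acc(y_true, y_proba)
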
def make_scorer(name, score_func, optimum=1, greater_is_better=True,
                needs_proba=False, needs_threshold=False, **kwargs):
    """Make a scorer from a performance metric or loss function.

    Factory inspired by scikit-learn which wraps scikit-learn scoring
    functions for use in autoflow.

    Parameters
    ----------
    name : str
        Name of the scorer.

    score_func : callable
        Score function (or loss function) with signature
        ``score_func(y, y_pred, **kwargs)``.

    optimum : int or float, default=1
        The best score achievable by the score function, i.e. the maximum
        for a score function and the minimum for a loss function.

    greater_is_better : boolean, default=True
        Whether score_func is a score function (default), meaning high is
        good, or a loss function, meaning low is good. In the latter case,
        the scorer object will sign-flip the outcome of score_func.

    needs_proba : boolean, default=False
        Whether score_func requires predict_proba to get probability
        estimates out of a classifier.

    needs_threshold : boolean, default=False
        Whether score_func takes a continuous decision certainty.
        This only works for binary classification.

    **kwargs : additional arguments
        Additional parameters to be passed to score_func.

    Returns
    -------
    scorer : callable
        Callable object that returns a scalar score; greater is better.
    """
    sign = 1 if greater_is_better else -1
    if needs_proba:
        cls = _ProbaScorer
    elif needs_threshold:
        cls = _ThresholdScorer
    else:
        cls = _PredictScorer
    return cls(name, score_func, optimum, sign, kwargs)

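# Usage sketch (illustrative, not part of the original module): wrapping a
# loss with make_scorer. The "rmse" scorer and _example_make_scorer are
# hypothetical names; greater_is_better=False sign-flips the result so that
# larger returned values are always better.
def _example_make_scorer():
    rmse = make_scorer(
        'rmse',
        lambda y, y_pred: np.sqrt(
            sklearn.metrics.mean_squared_error(y, y_pred)),
        optimum=0,
        greater_is_better=False)
    y_true = np.array([3.0, -0.5, 2.0, 7.0])
    y_pred = np.array([2.5, 0.0, 2.0, 8.0])
    # Returns the negated RMSE (about -0.612 here).
    return rmse(y_true, y_pred)
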
# Standard regression scores
r2 = make_scorer('r2', sklearn.metrics.r2_score)
pearsonr = make_scorer(
    'pearsonr',
    lambda x, y: scipy.stats.pearsonr(x, y)[0]
)
spearmanr = make_scorer(
    'spearmanr',
    lambda x, y: scipy.stats.spearmanr(x, y)[0]
)
kendalltau = make_scorer(
    'kendalltau',
    lambda x, y: scipy.stats.kendalltau(x, y)[0]
)
mean_squared_error = make_scorer('mean_squared_error',
                                 sklearn.metrics.mean_squared_error,
                                 optimum=0,
                                 greater_is_better=False)
mean_absolute_error = make_scorer('mean_absolute_error',
                                  sklearn.metrics.mean_absolute_error,
                                  optimum=0,
                                  greater_is_better=False)
median_absolute_error = make_scorer('median_absolute_error',
                                    sklearn.metrics.median_absolute_error,
                                    optimum=0,
                                    greater_is_better=False)

# Standard classification scores
accuracy = make_scorer('accuracy', sklearn.metrics.accuracy_score)
mcc = make_scorer('mcc', sklearn.metrics.matthews_corrcoef)
sensitivity = make_scorer("sensitivity", classification_metrics.sensitivity)
specificity = make_scorer("specificity", classification_metrics.specificity)
balanced_accuracy = make_scorer('balanced_accuracy',
                                classification_metrics.balanced_accuracy)
f1 = make_scorer('f1', sklearn.metrics.f1_score)

# Score functions that need decision values
average_precision = make_scorer('average_precision',
                                sklearn.metrics.average_precision_score,
                                needs_threshold=True)
precision = make_scorer('precision', sklearn.metrics.precision_score)
recall = make_scorer('recall', sklearn.metrics.recall_score)

# Score functions for probabilistic classification
log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, optimum=0,
                       greater_is_better=False, needs_proba=True)
pac_score = make_scorer('pac_score', classification_metrics.pac_score,
                        greater_is_better=True, needs_proba=True)
# TODO cohen_kappa

REGRESSION_METRICS = dict()
for scorer in [
    r2, mean_squared_error, mean_absolute_error, median_absolute_error,
    pearsonr, spearmanr, kendalltau
]:
    REGRESSION_METRICS[scorer.name] = scorer

CLASSIFICATION_METRICS = dict()
for scorer in [
    accuracy, average_precision, log_loss, balanced_accuracy, pac_score,
    mcc, sensitivity, specificity
]:
    CLASSIFICATION_METRICS[scorer.name] = scorer

for multi_class in ["ovo", "ovr"]:
    for average in ['macro', 'micro', 'samples', 'weighted']:
        qualified_name = '{0}_{1}_{2}'.format("roc_auc", multi_class, average)
        globals()[qualified_name] = make_scorer(
            qualified_name,
            partial(sklearn.metrics.roc_auc_score,
                    multi_class=multi_class, average=average),
            needs_proba=True)
        CLASSIFICATION_METRICS[qualified_name] = globals()[qualified_name]

for name, metric in [
    ('precision', sklearn.metrics.precision_score),
    ('recall', sklearn.metrics.recall_score),
    ('f1', sklearn.metrics.f1_score),
    ('roc_auc', sklearn.metrics.roc_auc_score)
]:
    globals()[name] = make_scorer(name, metric)
    CLASSIFICATION_METRICS[name] = globals()[name]
    for average in ['macro', 'micro', 'samples', 'weighted']:
        qualified_name = '{0}_{1}'.format(name, average)
        if name == "roc_auc":
            globals()[qualified_name] = make_scorer(
                qualified_name,
                partial(metric, average=average),
                needs_threshold=True)
        else:
            globals()[qualified_name] = make_scorer(
                qualified_name,
                partial(metric, average=average))
        CLASSIFICATION_METRICS[qualified_name] = globals()[qualified_name]

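# Usage sketch (illustrative): REGRESSION_METRICS and CLASSIFICATION_METRICS
# map scorer names to Scorer objects, so a metric can be picked by name at
# runtime. _example_lookup_metric is a hypothetical name.
def _example_lookup_metric():
    scorer = CLASSIFICATION_METRICS["accuracy"]
    y_true = np.array([0, 1, 1, 0])
    y_proba = np.array([[0.9, 0.1],
                        [0.2, 0.8],
                        [0.6, 0.4],
                        [0.7, 0.3]])
    # argmax -> [0, 1, 0, 0], so accuracy is 0.75
    return scorer(y_true, y_proba)
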
def calculate_score(solution, prediction, ml_task: MLTask, metric,
                    should_calc_all_metric=False):
    if isinstance(solution, (pd.Series, pd.DataFrame)):
        solution = solution.values
    if should_calc_all_metric:
        score = dict()
        true_score = {}
        if ml_task.mainTask == "regression":
            # TODO put this into the regression metric itself
            cprediction = sanitize_array(prediction)
            metric_dict = copy.copy(REGRESSION_METRICS)
            metric_dict[metric.name] = metric
            for metric_ in metric_dict:
                func: Scorer = metric_dict[metric_]
                score[func.name] = func(solution, cprediction)
                true_score[func.name] = func.score
        else:
            metric_dict = copy.copy(CLASSIFICATION_METRICS)
            metric_dict[metric.name] = metric
            for metric_ in metric_dict:
                func = metric_dict[metric_]
                # TODO maybe annotate metrics to define which cases they can
                # handle?
                try:
                    score[func.name] = float(func(solution, prediction))
                    true_score[func.name] = func.score
                except ValueError as e:
                    if e.args[0] == 'multiclass format is not supported':
                        continue
                    elif e.args[0] == "Samplewise metrics are not available " \
                                      "outside of multilabel classification.":
                        continue
                    elif e.args[0] == "Target is multiclass but " \
                                      "average='binary'. Please choose another average " \
                                      "setting, one of [None, 'micro', 'macro', 'weighted'].":
                        continue
                    # else:
                    #     raise e
    else:
        if ml_task.mainTask == "regression":
            # TODO put this into the regression metric itself
            cprediction = sanitize_array(prediction)
            score = metric(solution, cprediction)
        else:
            score = metric(solution, prediction)
        true_score = metric.score
    return score, true_score

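# Usage sketch (illustrative): calculate_score only reads ml_task.mainTask,
# so a minimal stand-in object is used here instead of constructing a real
# MLTask (its constructor is not shown in this module). All names below are
# hypothetical.
class _ExampleMLTask:
    mainTask = "classification"


def _example_calculate_score():
    y_true = np.array([0, 1, 1, 0])
    y_proba = np.array([[0.9, 0.1],
                        [0.2, 0.8],
                        [0.4, 0.6],
                        [0.7, 0.3]])
    score, true_score = calculate_score(
        y_true, y_proba, _ExampleMLTask(),
        CLASSIFICATION_METRICS["accuracy"],
        should_calc_all_metric=False)
    # Both are 1.0 here: the argmax'd predictions match y_true exactly.
    return score, true_score
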
def calculate_confusion_matrix(y_true, y_pred) -> List[List[int]]:
    # return a 2d list
    if len(y_pred.shape) > 1 and y_pred.shape[1] > 1:
        y_pred = np.argmax(y_pred, axis=1)
    return confusion_matrix(y_true, y_pred).tolist()

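# Usage sketch (illustrative): probability matrices are argmax'd to hard
# labels before the confusion matrix is computed, and the result is a plain
# nested list. _example_confusion_matrix is a hypothetical name.
def _example_confusion_matrix():
    y_true = np.array([0, 1, 1, 0, 1])
    y_proba = np.array([[0.8, 0.2],
                        [0.3, 0.7],
                        [0.6, 0.4],
                        [0.9, 0.1],
                        [0.2, 0.8]])
    # Expected result: [[2, 0], [1, 2]]
    return calculate_confusion_matrix(y_true, y_proba)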