"""
@author: David Diaz Vico
@license: MIT
"""
from __future__ import annotations
import itertools as it
from dataclasses import dataclass
from functools import reduce
from typing import (
    Any,
    Callable,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Tuple,
)
import numpy as np
import pandas as pd
from scipy.stats import (
    friedmanchisquare,
    kruskal,
    mannwhitneyu,
    rankdata,
    ttest_ind_from_stats,
    ttest_rel,
    wilcoxon,
)
from statsmodels.stats.multitest import multipletests
CorrectionLike = Literal[
    None,
    'bonferroni',
    'sidak',
    'holm-sidak',
    'holm',
    'simes-hochberg',
    'hommel',
    'fdr_bh',
    'fdr_by',
    'fdr_tsbh',
    'fdr_tsbky',
]
MultitestLike = Literal['kruskal', 'friedmanchisquare']
TestLike = Literal['mannwhitneyu', 'wilcoxon']
@dataclass
class SummaryRow:
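    """Values of an additional summary row and whether greater values are better."""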
    values: np.typing.NDArray[Any]
    greater_is_better: bool | None = None
@dataclass
class ScoreCell:
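    """Mean, standard deviation, rank and significance of one table cell."""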
    mean: float
    std: float | None
    rank: int
    significant: bool
def average_rank(
    ranks: np.typing.NDArray[np.integer[Any]],
    **kwargs: Any,
) -> SummaryRow:
    """Compute rank averages."""
    return SummaryRow(
        values=np.mean(ranks, axis=0),
        greater_is_better=False,
    )
def average_mean_score(
    means: np.typing.NDArray[np.floating[Any]],
    greater_is_better: bool,
    **kwargs: Any,
) -> SummaryRow:
    """Compute score mean averages."""
    return SummaryRow(
        values=np.mean(means, axis=0),
        greater_is_better=greater_is_better,
    )
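# Any callable accepting these keyword arguments (``scores``, ``means``,
# ``stds``, ``ranks``, ``greater_is_better``) and returning a ``SummaryRow``
# can be used in the ``summary_rows`` parameter of ``scores_table``. A
# minimal sketch of a custom summary row (hypothetical, not part of the
# module API):
#
#     def median_score(
#         means: np.typing.NDArray[np.floating[Any]],
#         greater_is_better: bool,
#         **kwargs: Any,
#     ) -> SummaryRow:
#         """Compute score medians."""
#         return SummaryRow(
#             values=np.median(means, axis=0),
#             greater_is_better=greater_is_better,
#         )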
def _is_significant(
    scores1: np.typing.NDArray[np.floating[Any]],
    scores2: np.typing.NDArray[np.floating[Any]],
    mean1: np.typing.NDArray[np.floating[Any]],
    mean2: np.typing.NDArray[np.floating[Any]],
    std1: np.typing.NDArray[np.floating[Any]],
    std2: np.typing.NDArray[np.floating[Any]],
    *,
    nobs: int | None = None,
    two_sided: bool = True,
    paired_test: bool = False,
    significancy_level: float = 0.05,
) -> bool:
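    """Test whether ``scores1`` differs significantly from ``scores2``, using
    a paired t-test on the raw scores or a Welch's t-test on the summary
    statistics."""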
    alternative = "two-sided" if two_sided else "greater"
    if paired_test:
        assert scores1.ndim == 1
        assert scores2.ndim == 1
        _, pvalue = ttest_rel(
            scores1,
            scores2,
            axis=-1,
            alternative=alternative,
        )
    else:
        assert nobs
        _, pvalue = ttest_ind_from_stats(
            mean1=mean1,
            std1=std1,
            nobs1=nobs,
            mean2=mean2,
            std2=std2,
            nobs2=nobs,
            equal_var=False,
            alternative=alternative,
        )
    return pvalue < significancy_level
def _all_significants(
    scores: np.typing.NDArray[np.floating[Any]],
    means: np.typing.NDArray[np.floating[Any]],
    stds: np.typing.NDArray[np.floating[Any]] | None,
    ranks: np.typing.NDArray[np.integer[Any]],
    *,
    nobs: int | None = None,
    two_sided: bool = True,
    paired_test: bool = False,
    significancy_level: float = 0,
) -> np.typing.NDArray[np.bool_]:
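    """Mark the cells that are significantly better than every cell ranked
    immediately below them in the same row."""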
    significant_matrix = np.zeros_like(ranks, dtype=np.bool_)
    if stds is None or significancy_level <= 0:
        return significant_matrix
    for row, (scores_row, mean_row, std_row, rank_row) in enumerate(
        zip(scores, means, stds, ranks),
    ):
        for column, (scores1, mean1, std1, rank1) in enumerate(
            zip(scores_row, mean_row, std_row, rank_row),
        ):
            # Compare each cell with all the cells ranked immediately below
            # it; it must be significantly better than every one of them.
            index2 = np.flatnonzero(rank_row == (rank1 + 1))
            is_significant = len(index2) > 0 and all(
                _is_significant(
                    scores1,
                    scores_row[idx],
                    mean1,
                    mean_row[idx],
                    std1,
                    std_row[idx],
                    nobs=nobs,
                    two_sided=two_sided,
                    paired_test=paired_test,
                    significancy_level=significancy_level,
                )
                for idx in index2
            )
            if is_significant:
                significant_matrix[row, column] = True
    return significant_matrix
def _set_style_classes(
    table: pd.DataFrame,
    *,
    all_ranks: np.typing.NDArray[np.integer[Any]],
    significants: np.typing.NDArray[np.bool_],
    n_summary_rows: int,
) -> pd.io.formats.style.Styler:
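    """Attach to each cell the CSS classes ("rankN", "summary" and
    "significant") that the default HTML and LaTeX styles rely on."""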
    rank_class_names = np.char.add(
        "rank",
        all_ranks.astype(str),
    )
    is_summary_row = np.zeros_like(all_ranks, dtype=np.bool_)
    is_summary_row[-n_summary_rows:, :] = True
    summary_rows_class_name = np.char.multiply(
        "summary",
        is_summary_row.astype(int),
    )
    significant_class_name = np.char.multiply(
        "significant",
        np.insert(
            significants,
            (len(significants),) * n_summary_rows,
            0,
            axis=0,
        ).astype(int),
    )
    styler = table.style.set_td_classes(
        pd.DataFrame(
            reduce(
                np.char.add,
                (
                    rank_class_names,
                    " ",
                    summary_rows_class_name,
                    " ",
                    significant_class_name,
                ),
            ),
            index=table.index,
            columns=table.columns,
        ),
    )
    return styler
def _set_style_formatter(
    styler: pd.io.formats.style.Styler,
    *,
    precision: int,
) -> pd.io.formats.style.Styler:
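    """Render strings, ints, floats and ``ScoreCell`` values, using the
    requested precision for floating point numbers."""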
    def _formatter(
        data: object,
    ) -> str:
        if isinstance(data, str):
            return data
        elif isinstance(data, int):
            return str(data)
        elif isinstance(data, float):
            return f"{data:.{precision}f}"
        elif isinstance(data, ScoreCell):
            str_repr = f'{data.mean:.{precision}f}'
            if data.std is not None:
                str_repr += f' ± {data.std:.{precision}f}'
            str_repr += f' ({data.rank:.0f})'
            return str_repr
        else:
            return ""
    return styler.format(
        _formatter,
    )
def _set_default_style_html(
    styler: pd.io.formats.style.Styler,
    *,
    n_summary_rows: int,
) -> pd.io.formats.style.Styler:
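    """Set the default HTML style: bold first rank, underlined second rank,
    an asterisk after significant results and italic summary rows."""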
    last_rows_mask = np.zeros(len(styler.data), dtype=int)
    last_rows_mask[-n_summary_rows:] = 1
    styler = styler.set_table_styles(
        [
            {
                "selector": ".summary",
                "props": [("font-style", "italic")],
            },
            {
                "selector": ".rank1",
                "props": [("font-weight", "bold")],
            },
            {
                "selector": ".rank2",
                "props": [("text-decoration", "underline")],
            },
            {
                "selector": ".significant::after",
                "props": [
                    ("content", "\"*\""),
                    ("width", "0px"),
                    ("display", "inline-block"),
                ],
            },
            {
                "selector": ".col_heading",
                "props": [("font-weight", "bold")],
            },
        ],
    )
    styler = styler.apply_index(
        lambda _: np.char.multiply(
            "font-style: italic; font-weight: bold",
            last_rows_mask,
        ),
        axis=0,
    )
    styler = styler.apply_index(
        lambda idx: ["font-weight: bold"] * len(idx),
        axis=1,
    )
    return styler
def _set_style_from_class(
    styler: pd.io.formats.style.Styler,
    class_name: str,
    style: str,
) -> pd.io.formats.style.Styler:
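    """Apply a style to every cell tagged with the given CSS class."""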
    style_matrix = np.full(styler.data.shape, style)
    for row in range(style_matrix.shape[0]):
        for column in range(style_matrix.shape[1]):
            classes = styler.cell_context.get(
                (row, column),
                "",
            ).split()
            if class_name not in classes:
                style_matrix[row, column] = ""
    return styler.apply(lambda x: style_matrix, axis=None)
def _set_default_style_latex(
    styler: pd.io.formats.style.Styler,
    *,
    n_summary_rows: int,
) -> pd.io.formats.style.Styler:
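    r"""Set the default LaTeX style, mirroring the HTML one through the
    \summary, \significant and \rank commands."""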
    last_rows_mask = np.zeros(len(styler.data), dtype=int)
    last_rows_mask[-n_summary_rows:] = 1
    styler.set_table_styles(
        [
            {
                'selector': r'newcommand{\summary}',
                'props': r':[1]{\textit{#1}};',
            },
            {
                'selector': r'newcommand{\significant}',
                'props': r':[1]{#1*};',
            },
            {
                'selector': r'newcommand{\rank}',
                'props': (
                    # Keep the cell content visible for ranks greater than 2.
                    r':[2]{\ifnum#1=1 \textbf{#2} \else '
                    r'\ifnum#1=2 \underline{#2} \else #2 \fi\fi};'
                ),
            },
        ],
        overwrite=False,
    )
    # Ranks are 1-based, up to the number of estimators (columns).
    for rank in range(1, styler.data.shape[1] + 1):
        styler = _set_style_from_class(
            styler,
            f"rank{rank}",
            f"rank{{{rank}}}:--rwrap; ",
        )
    for class_name in ("summary", "significant"):
        styler = _set_style_from_class(
            styler,
            class_name,
            f"{class_name}:--rwrap; ",
        )
    styler = styler.apply_index(
        lambda _: np.char.multiply(
            "textbf:--rwrap;summary:--rwrap;",
            last_rows_mask,
        ),
        axis=0,
    )
    styler = styler.apply_index(
        lambda idx: ["textbf:--rwrap"] * len(idx),
        axis=1,
    )
    return styler
def _set_default_style(
    styler: pd.io.formats.style.Styler,
    *,
    n_summary_rows: int,
    default_style: Literal["html", "latex", None],
) -> pd.io.formats.style.Styler:
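    """Apply the default HTML or LaTeX style, or none at all."""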
    if default_style == "html":
        styler = _set_default_style_html(
            styler,
            n_summary_rows=n_summary_rows,
        )
    elif default_style == "latex":
        styler = _set_default_style_latex(
            styler,
            n_summary_rows=n_summary_rows,
        )
    return styler
def scores_table(
    scores: np.typing.ArrayLike,
    stds: np.typing.ArrayLike | None = None,
    *,
    datasets: Sequence[str],
    estimators: Sequence[str],
    nobs: int | None = None,
    greater_is_better: bool = True,
    method: Literal['average', 'min', 'max', 'dense', 'ordinal'] = 'min',
    significancy_level: float = 0,
    paired_test: bool = False,
    two_sided: bool = True,
    default_style: Literal["html", "latex", None] = "html",
    precision: int = 2,
    summary_rows: Sequence[Tuple[str, Callable[..., SummaryRow]]] = (
        ("Average rank", average_rank),
    ),
) -> pd.io.formats.style.Styler:
    """
    Scores table.
    Prints a table where each row represents a dataset and each column
    represents an estimator.
    Parameters
    ----------
    scores: array-like
        Matrix of scores where each column represents a model.
        Either the full matrix with all experiment results or the
        matrix with the mean scores can be passed.
    stds: array-like, default=None
        Matrix of standard deviations where each column represents a
        model. If ``scores`` is the full matrix with all results
        this is automatically computed from it and should not be passed.
    datasets: sequence of :external:class:`str`
        List of dataset names.
    estimators: sequence of :external:class:`str`
        List of estimator names.
    nobs: :external:class:`int`
        Number of repetitions of the experiments. Used only for computing
        significances when ``scores`` is not the full matrix.
    greater_is_better: boolean, default=True
        Whether a greater score is better (score) or worse
        (loss).
    method: {'average', 'min', 'max', 'dense', 'ordinal'}, default='average'
        Method used to solve ties.
    significancy_level: :external:class:`float`, default=0
        Significancy level for considerin a result significant. If nonzero,
        significancy is calculated using a t-test. In that case, if
        ``paired_test`` is ``True``, ``scores`` should be the full matrix
        and a paired test is performed. Otherwise, the t-test assumes
        independence, and either ``scores`` should be the full matrix
        or ``nobs`` should be passed.
    paired_test: :external:class:`bool`, default=False
        Whether to perform a paired test or a test assuming independence.
        If ``True``, ``scores`` should be the full matrix.
        Otherwise, either ``scores`` should be the full matrix
        or ``nobs`` should be passed.
    two_sided: :external:class:`bool`, default=True
        Whether to perform a two sided t-test or a one sided t-test.
    default_style: {'html', 'latex', None}, default='html'
        Default style for the table. Use ``None`` for no style. Note that
        the CSS classes and textual formatting are always set.
    precision: :external:class:`int`
        Number of decimals used for floating point numbers.
    summary_rows: sequence
        List of (name, callable) tuples for additional summary rows.
        By default, the rank average is computed.
    Returns
    -------
    table: array-like
        Table of mean and standard deviation of each estimator-dataset
        pair. A ranking of estimators is also generated.
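
    Examples
    --------
    A minimal, illustrative example over synthetic mean scores (the dataset
    and estimator names are made up):

    >>> table = scores_table(
    ...     [[0.7, 0.9], [0.8, 0.6]],
    ...     datasets=['dataset1', 'dataset2'],
    ...     estimators=['model1', 'model2'],
    ... )
    >>> type(table).__name__
    'Styler'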
    """
    scores = np.asanyarray(scores)
    stds = None if stds is None else np.asanyarray(stds)
    assert scores.ndim in {2, 3}
    means = scores if scores.ndim == 2 else np.mean(scores, axis=-1)
    if scores.ndim == 3:
        assert stds is None
        assert nobs is None
        stds = np.std(scores, axis=-1)
        nobs = scores.shape[-1]
    ranks = np.asarray([
        rankdata(-m, method=method)
        if greater_is_better
        else rankdata(m, method=method)
        for m in means
    ])
    significants = _all_significants(
        scores,
        means,
        stds,
        ranks,
        nobs=nobs,
        two_sided=two_sided,
        paired_test=paired_test,
        significancy_level=significancy_level,
    )
    # Object dtype, since the cells are replaced with ScoreCell instances.
    table = pd.DataFrame(
        data=means,
        index=datasets,
        columns=estimators,
        dtype=object,
    )
    for i, d in enumerate(datasets):
        for j, e in enumerate(estimators):
            table.loc[d, e] = ScoreCell(
                mean=means[i, j],
                std=None if stds is None else stds[i, j],
                rank=ranks[i, j],
                significant=significants[i, j],
            )
    # Create additional summary rows
    additional_ranks = []
    for name, summary_fun in summary_rows:
        row = summary_fun(
            scores=scores,
            means=means,
            stds=stds,
            ranks=ranks,
            greater_is_better=greater_is_better,
        )
        table.loc[name] = row.values
        if row.greater_is_better is None:
            additional_ranks.append(np.full(len(row.values), -1))
        else:
            additional_ranks.append(
                rankdata(-row.values, method=method)
                if row.greater_is_better
                else rankdata(row.values, method=method),
            )
    styler = _set_style_classes(
        table,
        all_ranks=np.vstack([ranks] + additional_ranks),
        significants=significants,
        n_summary_rows=len(summary_rows),
    )
    styler = _set_style_formatter(
        styler,
        precision=precision,
    )
    return _set_default_style(
        styler,
        n_summary_rows=len(summary_rows),
        default_style=default_style,
    )
def hypotheses_table(
    samples: np.typing.ArrayLike,
    models: Sequence[str],
    *,
    alpha: float = 0.05,
    multitest: Optional[MultitestLike] = None,
    test: TestLike = 'wilcoxon',
    correction: CorrectionLike = None,
    multitest_args: Optional[Mapping[str, Any]] = None,
    test_args: Optional[Mapping[str, Any]] = None,
) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Hypotheses table.
    Prints a hypothesis table with a selected test and correction.
    Parameters
    ----------
    samples: array-like
        Matrix of samples where each column represent a model.
    models: array-like
        Model names.
    alpha: float in [0, 1], default=0.05
        Significance level.
    multitest: {'kruskal', 'friedmanchisquare'}, default=None
        Ranking multitest used.
    test: {'mannwhitneyu', 'wilcoxon'}, default='wilcoxon'
        Ranking test used.
    correction: {'bonferroni', 'sidak', 'holm-sidak', 'holm', \
                 'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by', 'fdr_tsbh', \
                 'fdr_tsbky'}, default=None
        Method used to adjust the p-values.
    multitest_args: dict
        Optional ranking test arguments.
    test_args: dict
        Optional ranking test arguments.
    Returns
    -------
    multitest_table: array-like
        Table of p-value and rejection/non-rejection for the
        multitest hypothesis.
    test_table: array-like
        Table of p-values and rejection/non-rejection for each test
        hypothesis.
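
    Examples
    --------
    A minimal, illustrative example over synthetic samples (the model names
    are made up):

    >>> multitest_table, test_table = hypotheses_table(
    ...     [[0.7, 0.8], [0.6, 0.5], [0.9, 1.0], [0.4, 0.3]],
    ...     ['model1', 'model2'],
    ... )
    >>> test_table.columns.tolist()
    ['p-value', 'Hypothesis']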
    """
    if multitest_args is None:
        multitest_args = {}
    if test_args is None:
        test_args = {}
    samples = np.asanyarray(samples)
    versus = list(it.combinations(range(len(models)), 2))
    comparisons = [
        f"{models[first]} vs {models[second]}"
        for first, second in versus
    ]
    multitests = {
        'kruskal': kruskal,
        'friedmanchisquare': friedmanchisquare,
    }
    tests = {
        'mannwhitneyu': mannwhitneyu,
        'wilcoxon': wilcoxon,
    }
    multitest_table = None
    if multitest is not None:
        multitest_table = pd.DataFrame(
            index=[multitest],
            columns=['p-value', 'Hypothesis'],
        )
        _, pvalue = multitests[multitest](
            *samples.T,
            **multitest_args,
        )
        reject_str = 'Rejected' if pvalue <= alpha else 'Not rejected'
        multitest_table.loc[multitest] = [f'{pvalue:.2f}', reject_str]
        # If the multitest does not detect a significant difference,
        # the individual pairwise tests are not meaningful, so skip them.
        if pvalue > alpha:
            return multitest_table, None
    pvalues = [
        tests[test](
            samples[:, first],
            samples[:, second],
            **test_args,
        )[1] for first, second in versus
    ]
    if correction is not None:
        reject_bool, pvalues, _, _ = multipletests(
            pvalues,
            alpha,
            method=correction,
        )
        reject = [
            'Rejected'
            if r
            else 'Not rejected'
            for r in reject_bool
        ]
    else:
        reject = [
            'Rejected'
            if pvalue <= alpha
            else 'Not rejected'
            for pvalue in pvalues
        ]
    data = [
        (f'{p:.2f}', r)
        for p, r in zip(pvalues, reject)
    ]
    test_table = pd.DataFrame(
        data,
        index=comparisons,
        columns=['p-value', 'Hypothesis'],
    )
    return multitest_table, test_table