"""
@author: David Diaz Vico
@license: MIT
"""
from __future__ import annotations
import itertools as it
from dataclasses import dataclass
from functools import reduce
from typing import (
    Any,
    Callable,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Tuple,
)
import numpy as np
import pandas as pd
from scipy.stats import (
    friedmanchisquare,
    kruskal,
    mannwhitneyu,
    rankdata,
    ttest_ind_from_stats,
    ttest_rel,
    wilcoxon,
)
from statsmodels.sandbox.stats.multicomp import multipletests
CorrectionLike = Literal[
None,
'bonferroni',
'sidak',
'holm-sidak',
'holm',
'simes-hochberg',
'hommel',
'fdr_bh',
'fdr_by',
'fdr_tsbh',
'fdr_tsbky',
]
MultitestLike = Literal['kruskal', 'friedmanchisquare']
TestLike = Literal['mannwhitneyu', 'wilcoxon']
@dataclass
class SummaryRow:
values: np.typing.NDArray[Any]
greater_is_better: bool | None = None
@dataclass
class ScoreCell:
mean: float
std: float | None
rank: int
significant: bool
def average_rank(
ranks: np.typing.NDArray[np.integer[Any]],
**kwargs: Any,
) -> SummaryRow:
"""Compute rank averages."""
return SummaryRow(
values=np.mean(ranks, axis=0),
greater_is_better=False,
)
def average_mean_score(
means: np.typing.NDArray[np.floating[Any]],
greater_is_better: bool,
**kwargs: Any,
) -> SummaryRow:
"""Compute score mean averages."""
return SummaryRow(
values=np.mean(means, axis=0),
greater_is_better=greater_is_better,
)
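# A custom summary row only needs to accept, as keyword arguments, the
# values that ``scores_table`` passes (``scores``, ``means``, ``stds``,
# ``ranks`` and ``greater_is_better``) and return a ``SummaryRow``.
# A minimal sketch (``median_mean_score`` is illustrative, not part of
# the public API):
def median_mean_score(
    means: np.typing.NDArray[np.floating[Any]],
    greater_is_better: bool,
    **kwargs: Any,
) -> SummaryRow:
    """Compute score mean medians (illustrative example)."""
    return SummaryRow(
        values=np.median(means, axis=0),
        greater_is_better=greater_is_better,
    )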
def _is_significant(
scores1: np.typing.NDArray[np.floating[Any]],
scores2: np.typing.NDArray[np.floating[Any]],
mean1: np.typing.NDArray[np.floating[Any]],
mean2: np.typing.NDArray[np.floating[Any]],
std1: np.typing.NDArray[np.floating[Any]],
std2: np.typing.NDArray[np.floating[Any]],
*,
nobs: int | None = None,
two_sided: bool = True,
paired_test: bool = False,
significancy_level: float = 0.05,
) -> bool:
alternative = "two-sided" if two_sided else "greater"
if paired_test:
assert scores1.ndim == 1
assert scores2.ndim == 1
_, pvalue = ttest_rel(
scores1,
scores2,
axis=-1,
alternative=alternative,
)
else:
assert nobs
_, pvalue = ttest_ind_from_stats(
mean1=mean1,
std1=std1,
nobs1=nobs,
mean2=mean2,
std2=std2,
nobs2=nobs,
equal_var=False,
alternative=alternative,
)
return pvalue < significancy_level
def _all_significants(
scores: np.typing.NDArray[np.floating[Any]],
means: np.typing.NDArray[np.floating[Any]],
stds: np.typing.NDArray[np.floating[Any]] | None,
ranks: np.typing.NDArray[np.integer[Any]],
*,
nobs: int | None = None,
two_sided: bool = True,
paired_test: bool = False,
significancy_level: float = 0,
) -> np.typing.NDArray[np.bool_]:
significant_matrix = np.zeros_like(ranks, dtype=np.bool_)
if stds is None or significancy_level <= 0:
return significant_matrix
for row, (scores_row, mean_row, std_row, rank_row) in enumerate(
zip(scores, means, stds, ranks),
):
for column, (scores1, mean1, std1, rank1) in enumerate(
zip(scores_row, mean_row, std_row, rank_row),
):
            # Compare each element against every element ranked immediately
            # below it: it must be significantly better than all of them.
index2 = np.flatnonzero(rank_row == (rank1 + 1))
is_significant = len(index2) > 0 and all(
_is_significant(
scores1,
scores_row[idx],
mean1,
mean_row[idx],
std1,
std_row[idx],
nobs=nobs,
two_sided=two_sided,
paired_test=paired_test,
significancy_level=significancy_level,
)
for idx in index2
)
if is_significant:
significant_matrix[row, column] = True
return significant_matrix
def _set_style_classes(
table: pd.DataFrame,
*,
all_ranks: np.typing.NDArray[np.integer[Any]],
significants: np.typing.NDArray[np.bool_],
n_summary_rows: int,
) -> pd.io.formats.style.Styler:
rank_class_names = np.char.add(
"rank",
all_ranks.astype(str),
)
is_summary_row = np.zeros_like(all_ranks, dtype=np.bool_)
is_summary_row[-n_summary_rows:, :] = True
summary_rows_class_name = np.char.multiply(
"summary",
is_summary_row.astype(int),
)
significant_class_name = np.char.multiply(
"significant",
np.insert(
significants,
(len(significants),) * n_summary_rows,
0,
axis=0,
).astype(int),
)
styler = table.style.set_td_classes(
pd.DataFrame(
reduce(
np.char.add,
(
rank_class_names,
" ",
summary_rows_class_name,
" ",
significant_class_name,
),
),
index=table.index,
columns=table.columns,
),
)
return styler
def _set_style_formatter(
styler: pd.io.formats.style.Styler,
*,
precision: int,
) -> pd.io.formats.style.Styler:
def _formatter(
data: object,
) -> str:
if isinstance(data, str):
return data
        elif isinstance(data, int):
            return str(data)
elif isinstance(data, float):
return f"{data:.{precision}f}"
elif isinstance(data, ScoreCell):
str_repr = f'{data.mean:.{precision}f}'
if data.std is not None:
str_repr += f' ± {data.std:.{precision}f}'
str_repr += f' ({data.rank:.0f})'
return str_repr
else:
return ""
return styler.format(
_formatter,
)
def _set_default_style_html(
styler: pd.io.formats.style.Styler,
*,
n_summary_rows: int,
) -> pd.io.formats.style.Styler:
last_rows_mask = np.zeros(len(styler.data), dtype=int)
last_rows_mask[-n_summary_rows:] = 1
styler = styler.set_table_styles(
[
{
"selector": ".summary",
"props": [("font-style", "italic")],
},
{
"selector": ".rank1",
"props": [("font-weight", "bold")],
},
{
"selector": ".rank2",
"props": [("text-decoration", "underline")],
},
{
"selector": ".significant::after",
"props": [
("content", "\"*\""),
("width", "0px"),
("display", "inline-block"),
],
},
{
"selector": ".col_heading",
"props": [("font-weight", "bold")],
},
],
)
styler = styler.apply_index(
lambda _: np.char.multiply(
"font-style: italic; font-weight: bold",
last_rows_mask,
),
axis=0,
)
styler = styler.apply_index(
lambda idx: ["font-weight: bold"] * len(idx),
axis=1,
)
return styler
def _set_style_from_class(
styler: pd.io.formats.style.Styler,
class_name: str,
style: str,
) -> pd.io.formats.style.Styler:
style_matrix = np.full(styler.data.shape, style)
for row in range(style_matrix.shape[0]):
for column in range(style_matrix.shape[1]):
classes = styler.cell_context.get(
(row, column),
"",
).split()
if class_name not in classes:
style_matrix[row, column] = ""
return styler.apply(lambda x: style_matrix, axis=None)
def _set_default_style_latex(
styler: pd.io.formats.style.Styler,
*,
n_summary_rows: int,
) -> pd.io.formats.style.Styler:
last_rows_mask = np.zeros(len(styler.data), dtype=int)
last_rows_mask[-n_summary_rows:] = 1
styler.set_table_styles(
[
{
'selector': r'newcommand{\summary}',
'props': r':[1]{\textit{#1}};',
},
{
'selector': r'newcommand{\significant}',
'props': r':[1]{#1*};',
},
{
'selector': r'newcommand{\rank}',
'props': (
r':[2]{\ifnum#1=1 \textbf{#2} \else '
r'\ifnum#1=2 \underline{#2} \fi\fi};'
),
},
],
overwrite=False,
)
    # Ranks are 1-based, so cover every possible rank up to the column count.
    for rank in range(1, styler.data.shape[1] + 1):
styler = _set_style_from_class(
styler,
f"rank{rank}",
f"rank{{{rank}}}:--rwrap; ",
)
for class_name in ("summary", "significant"):
styler = _set_style_from_class(
styler,
class_name,
f"{class_name}:--rwrap; ",
)
styler = styler.apply_index(
lambda _: np.char.multiply(
"textbf:--rwrap;summary:--rwrap;",
last_rows_mask,
),
axis=0,
)
styler = styler.apply_index(
lambda idx: ["textbf:--rwrap"] * len(idx),
axis=1,
)
return styler
def _set_default_style(
styler: pd.io.formats.style.Styler,
*,
n_summary_rows: int,
default_style: Literal["html", "latex", None],
) -> pd.io.formats.style.Styler:
if default_style == "html":
styler = _set_default_style_html(
styler,
n_summary_rows=n_summary_rows,
)
elif default_style == "latex":
styler = _set_default_style_latex(
styler,
n_summary_rows=n_summary_rows,
)
return styler
def scores_table(
scores: np.typing.ArrayLike,
stds: np.typing.ArrayLike | None = None,
*,
datasets: Sequence[str],
estimators: Sequence[str],
nobs: int | None = None,
greater_is_better: bool = True,
method: Literal['average', 'min', 'max', 'dense', 'ordinal'] = 'min',
significancy_level: float = 0,
paired_test: bool = False,
two_sided: bool = True,
default_style: Literal["html", "latex", None] = "html",
precision: int = 2,
summary_rows: Sequence[Tuple[str, Callable[..., SummaryRow]]] = (
("Average rank", average_rank),
),
) -> pd.io.formats.style.Styler:
"""
Scores table.
Prints a table where each row represents a dataset and each column
represents an estimator.
Parameters
----------
    scores: array-like
        Matrix of scores where each row represents a dataset and each
        column represents an estimator. Either the full 3d array with
        all experiment repetitions or the 2d matrix of mean scores can
        be passed.
stds: array-like, default=None
Matrix of standard deviations where each column represents a
model. If ``scores`` is the full matrix with all results
this is automatically computed from it and should not be passed.
datasets: sequence of :external:class:`str`
List of dataset names.
estimators: sequence of :external:class:`str`
List of estimator names.
    nobs: :external:class:`int`
        Number of repetitions of the experiments. Used only for computing
        significance when ``scores`` is not the full matrix.
greater_is_better: boolean, default=True
Whether a greater score is better (score) or worse
(loss).
    method: {'average', 'min', 'max', 'dense', 'ordinal'}, default='min'
Method used to solve ties.
significancy_level: :external:class:`float`, default=0
        Significance level for considering a result significant. If nonzero,
        significance is computed using a t-test. In that case, if
``paired_test`` is ``True``, ``scores`` should be the full matrix
and a paired test is performed. Otherwise, the t-test assumes
independence, and either ``scores`` should be the full matrix
or ``nobs`` should be passed.
paired_test: :external:class:`bool`, default=False
Whether to perform a paired test or a test assuming independence.
If ``True``, ``scores`` should be the full matrix.
Otherwise, either ``scores`` should be the full matrix
or ``nobs`` should be passed.
    two_sided: :external:class:`bool`, default=True
        Whether to perform a two-sided or a one-sided t-test.
default_style: {'html', 'latex', None}, default='html'
Default style for the table. Use ``None`` for no style. Note that
the CSS classes and textual formatting are always set.
precision: :external:class:`int`
Number of decimals used for floating point numbers.
summary_rows: sequence
List of (name, callable) tuples for additional summary rows.
By default, the rank average is computed.

    Returns
    -------
    table: :external:class:`pandas.io.formats.style.Styler`
        Table of the mean and standard deviation of each estimator-dataset
        pair. A ranking of estimators is also generated.
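
    Examples
    --------
    A minimal usage sketch with illustrative values (two datasets, two
    estimators and three repetitions each):

    >>> import numpy as np
    >>> scores = np.asarray([
    ...     [[0.7, 0.8, 0.9], [0.5, 0.6, 0.7]],
    ...     [[0.4, 0.5, 0.6], [0.6, 0.7, 0.8]],
    ... ])
    >>> styler = scores_table(
    ...     scores,
    ...     datasets=["dataset1", "dataset2"],
    ...     estimators=["estimator1", "estimator2"],
    ... )
    >>> styler.data.shape
    (3, 2)

    The extra row is the "Average rank" summary row added by default.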
"""
scores = np.asanyarray(scores)
stds = None if stds is None else np.asanyarray(stds)
assert scores.ndim in {2, 3}
means = scores if scores.ndim == 2 else np.mean(scores, axis=-1)
if scores.ndim == 3:
assert stds is None
assert nobs is None
stds = np.std(scores, axis=-1)
nobs = scores.shape[-1]
    ranks = np.asarray([
        rankdata(-m, method=method)
        if greater_is_better
        else rankdata(m, method=method)
        for m in means
    ]).astype(int)  # rankdata returns floats; integer ranks are needed below
significants = _all_significants(
scores,
means,
stds,
ranks,
nobs=nobs,
two_sided=two_sided,
paired_test=paired_test,
significancy_level=significancy_level,
)
table = pd.DataFrame(data=means, index=datasets, columns=estimators)
for i, d in enumerate(datasets):
for j, e in enumerate(estimators):
table.loc[d, e] = ScoreCell(
mean=means[i, j],
std=None if stds is None else stds[i, j],
rank=ranks[i, j],
significant=significants[i, j],
)
# Create additional summary rows
additional_ranks = []
for name, summary_fun in summary_rows:
row = summary_fun(
scores=scores,
means=means,
stds=stds,
ranks=ranks,
greater_is_better=greater_is_better,
)
table.loc[name] = row.values
if row.greater_is_better is None:
additional_ranks.append(np.full(len(row.values), -1))
else:
            additional_ranks.append(
                (
                    rankdata(-row.values, method=method)
                    if row.greater_is_better
                    else rankdata(row.values, method=method)
                ).astype(int),
            )
styler = _set_style_classes(
table,
all_ranks=np.vstack([ranks] + additional_ranks),
significants=significants,
n_summary_rows=len(summary_rows),
)
styler = _set_style_formatter(
styler,
precision=precision,
)
return _set_default_style(
styler,
n_summary_rows=len(summary_rows),
default_style=default_style,
)
def hypotheses_table(
samples: np.typing.ArrayLike,
models: Sequence[str],
*,
alpha: float = 0.05,
multitest: Optional[MultitestLike] = None,
test: TestLike = 'wilcoxon',
correction: CorrectionLike = None,
multitest_args: Optional[Mapping[str, Any]] = None,
test_args: Optional[Mapping[str, Any]] = None,
) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]:
"""
Hypotheses table.
Prints a hypothesis table with a selected test and correction.
Parameters
----------
samples: array-like
        Matrix of samples where each column represents a model.
models: array-like
Model names.
alpha: float in [0, 1], default=0.05
Significance level.
    multitest: {'kruskal', 'friedmanchisquare'}, default=None
        Multiple-group test applied before the pairwise tests.
    test: {'mannwhitneyu', 'wilcoxon'}, default='wilcoxon'
        Pairwise test used to compare each pair of models.
correction: {'bonferroni', 'sidak', 'holm-sidak', 'holm', \
'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by', 'fdr_tsbh', \
'fdr_tsbky'}, default=None
Method used to adjust the p-values.
    multitest_args: dict
        Optional multiple-group test arguments.
    test_args: dict
        Optional pairwise test arguments.

    Returns
    -------
    multitest_table: :external:class:`pandas.DataFrame` or ``None``
        Table of the p-value and rejection/non-rejection of the
        multiple-group hypothesis, or ``None`` if no multitest was
        requested.
    test_table: :external:class:`pandas.DataFrame` or ``None``
        Table of p-values and rejection/non-rejection for each pairwise
        test hypothesis, or ``None`` if the multitest did not reject.
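
    Examples
    --------
    A minimal usage sketch with illustrative values (five paired samples
    of two models, compared with the default Wilcoxon test):

    >>> import numpy as np
    >>> samples = np.asarray([
    ...     [0.88, 0.80],
    ...     [0.74, 0.69],
    ...     [0.57, 0.42],
    ...     [0.62, 0.73],
    ...     [0.81, 0.90],
    ... ])
    >>> multitest_table, test_table = hypotheses_table(
    ...     samples,
    ...     ["model1", "model2"],
    ... )
    >>> list(test_table.index)
    ['model1 vs model2']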
"""
if multitest_args is None:
multitest_args = {}
if test_args is None:
test_args = {}
samples = np.asanyarray(samples)
versus = list(it.combinations(range(len(models)), 2))
comparisons = [
f"{models[first]} vs {models[second]}"
for first, second in versus
]
multitests = {
'kruskal': kruskal,
'friedmanchisquare': friedmanchisquare,
}
tests = {
'mannwhitneyu': mannwhitneyu,
'wilcoxon': wilcoxon,
}
multitest_table = None
if multitest is not None:
multitest_table = pd.DataFrame(
index=[multitest],
columns=['p-value', 'Hypothesis'],
)
_, pvalue = multitests[multitest](
*samples.T,
**multitest_args,
)
reject_str = 'Rejected' if pvalue <= alpha else 'Not rejected'
multitest_table.loc[multitest] = ['{0:.2f}'.format(pvalue), reject_str]
        # If the multitest does not detect a significant difference,
        # the individual tests are not meaningful, so skip them.
if pvalue > alpha:
return multitest_table, None
pvalues = [
tests[test](
samples[:, first],
samples[:, second],
**test_args,
)[1] for first, second in versus
]
if correction is not None:
reject_bool, pvalues, _, _ = multipletests(
pvalues,
alpha,
method=correction,
)
reject = [
'Rejected'
if r
else 'Not rejected'
for r in reject_bool
]
else:
reject = [
'Rejected'
if pvalue <= alpha
else 'Not rejected'
for pvalue in pvalues
]
data = [
('{0:.2f}'.format(p), r)
for p, r in zip(pvalues, reject)
]
test_table = pd.DataFrame(
data,
index=comparisons,
columns=['p-value', 'Hypothesis'],
)
return multitest_table, test_table
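# A hedged end-to-end sketch exercising both tables on synthetic data.
# The shapes, names and printed output are illustrative assumptions, and
# ``median_mean_score`` is the example summary row defined above.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # 4 datasets x 3 estimators x 10 repetitions of synthetic scores.
    demo_scores = rng.uniform(size=(4, 3, 10))
    demo_styler = scores_table(
        demo_scores,
        datasets=[f"dataset{i}" for i in range(4)],
        estimators=[f"estimator{j}" for j in range(3)],
        summary_rows=(
            ("Average rank", average_rank),
            ("Median score", median_mean_score),
        ),
    )
    print(demo_styler.to_html())
    demo_multitest, demo_tests = hypotheses_table(
        demo_scores.mean(axis=-1),
        [f"estimator{j}" for j in range(3)],
        multitest="friedmanchisquare",
        correction="holm",
    )
    print(demo_multitest)
    print(demo_tests)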