# Source code for sdt.stats

# SPDX-FileCopyrightText: 2023 Lukas Schrangl <lukas.schrangl@boku.ac.at>
#
# SPDX-License-Identifier: BSD-3-Clause

"""Statistical functions
=====================

Hypothesis testing
------------------

.. autofunction:: permutation_test
.. autofunction:: grouped_permutation_test


Probability density function estimation
---------------------------------------

.. autofunction:: avg_shifted_hist


References
----------
.. [Schn2022] Schneider, M. C. & Schütz, G. J.: "Don’t Be Fooled by Randomness: Valid
    p-Values for Single Molecule Microscopy", Frontiers in Bioinformatics, 2022, 2,
    811053
"""

import inspect
import math
from typing import Callable, Iterable, Literal, Optional, Tuple, TypeVar, Union

import numpy as np
import pandas as pd
import scipy.stats
from numpy.typing import ArrayLike

# The object returned by `scipy.stats.permutation_test` has a private class,
# so a TypeVar stands in for it in the public signatures below instead of
# importing a private name from scipy.
PermutationTestResult = TypeVar("PermutationTestResult")
"""Placeholder for the type returned by `scipy.stats.permutation_test`,
which is private
"""


def permutation_test(
    data1: Union[pd.DataFrame, np.ndarray],
    data2: Union[pd.DataFrame, np.ndarray],
    statistic: Callable[[np.ndarray], float] = np.mean,
    data_column: Optional[str] = None,
    **kwargs,
) -> "PermutationTestResult":
    """Permutation test for two independent samples

    Test the null hypothesis that two samples are not distinguishable by
    `statistic`. The null distribution is created from the difference of
    `statistic` applied to (permuted) datasets.

    Parameters
    ----------
    data1, data2
        Datasets
    statistic
        Which statistic (e.g., mean, median, …) to compare for `data1` and
        `data2`
    data_column
        Use this column if `data1` or `data2` are
        :py:class:`pandas.DataFrame`
    **kwargs
        Passed to :py:func:`scipy.stats.permutation_test`

    Returns
    -------
    :
        Test result containing observed difference of `statistic` (i.e.,
        ``statistic(data1) - statistic(data2)``), p-value, and null
        distribution generated from permuting datasets.
    """
    vec = kwargs.pop("vectorized", None)
    if vec is None:
        # Auto-detect vectorization: a statistic taking an `axis` keyword can
        # be applied to whole batches of permutations at once, which is much
        # faster. Some C-implemented callables have no introspectable
        # signature; fall back to the non-vectorized path for those.
        try:
            vec = "axis" in inspect.signature(statistic).parameters
        except ValueError:
            vec = False

    if isinstance(data1, pd.DataFrame):
        data1 = data1[data_column]
    if isinstance(data2, pd.DataFrame):
        data2 = data2[data_column]

    if vec:

        def statfunc(d1, d2, axis):
            return statistic(d1, axis=axis) - statistic(d2, axis=axis)

    else:

        def statfunc(d1, d2):
            return statistic(d1) - statistic(d2)

    return scipy.stats.permutation_test(
        [data1, data2], statfunc, vectorized=vec, **kwargs
    )
def grouped_permutation_test(
    data1: Union[pd.DataFrame, Iterable[np.ndarray], pd.core.groupby.SeriesGroupBy],
    data2: Union[pd.DataFrame, Iterable[np.ndarray], pd.core.groupby.SeriesGroupBy],
    statistic: Callable[[np.ndarray], float] = np.mean,
    data_column: Optional[str] = None,
    group_column: Optional[str] = None,
    **kwargs,
) -> "PermutationTestResult":
    """Grouped permutation test for two partly correlated samples

    Test the null hypothesis that two samples are not distinguishable by
    `statistic`. The null distribution is created from the difference of
    `statistic` applied to (permuted) datasets. Groups of datapoints are left
    in order. Thus datapoints within groups may be correlated (such as
    single-molecule trajectories); see [Schn2022]_.

    Parameters
    ----------
    data1, data2
        Grouped datasets. Either a column of a pandas DataFrame GroupBy, e.g.
        ``pandas.DataFrame(…).groupby("particle")["some_value"]`` or an
        iterable of arrays where each array represents a correlated block of
        data.
    statistic
        Which statistic (e.g., mean, median, …) to compare for `data1` and
        `data2`
    data_column
        Use this column if `data1` or `data2` are
        :py:class:`pandas.DataFrame`
    group_column
        Use this column to determine groups if `data1` or `data2` are
        :py:class:`pandas.DataFrame`
    **kwargs
        Passed to :py:func:`scipy.stats.permutation_test`. Note that this
        cannot be vectorized.

    Returns
    -------
    :
        Test result containing observed difference of `statistic` (i.e.,
        ``statistic(data1) - statistic(data2)``), p-value, and null
        distribution generated from permuting datasets.
    """
    # Normalize DataFrame input to a SeriesGroupBy first …
    if isinstance(data1, pd.DataFrame):
        data1 = data1.groupby(group_column)[data_column]
    if isinstance(data2, pd.DataFrame):
        data2 = data2.groupby(group_column)[data_column]
    # … then turn any SeriesGroupBy into an iterable of per-group values.
    # Iterating a GroupBy yields (key, group) pairs; only the group is needed.
    if isinstance(data1, pd.core.groupby.SeriesGroupBy):
        data1 = (d[1] for d in data1)
    if isinstance(data2, pd.core.groupby.SeriesGroupBy):
        data2 = (d[1] for d in data2)
    # `fromiter` with object dtype keeps each group as one array element even
    # if all groups happen to have equal length (plain `np.array` would build
    # a 2-D array in that case).
    data1 = np.fromiter(data1, dtype=object)
    data2 = np.fromiter(data2, dtype=object)

    # With scipy 1.17, passing data1 and data2 to `permutation_test` fails.
    # As they have `object` dtype, some array type inference stuff fails.
    # Therefore, pass indices to `permutation_test` and access data elements
    # accordingly in `statfunc` defined below
    data_all = np.concatenate([data1, data2])
    n1 = len(data1)
    n2 = len(data2)
    idx1 = np.arange(n1)
    idx2 = np.arange(n1, n1 + n2)

    def statfunc(i1, i2):
        # Permutations reshuffle whole groups (indices), never datapoints
        # within a group, preserving intra-group correlation.
        return statistic(np.concatenate(data_all[i1])) - statistic(
            np.concatenate(data_all[i2])
        )

    return scipy.stats.permutation_test([idx1, idx2], statfunc, **kwargs)
[docs] def avg_shifted_hist( x: ArrayLike, nbins: int | Literal["terrell-scott", "rice", "sqrt", "sturges", "scott"], nshifts: int, limits: Tuple[float, float] | None = None, density: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: """Average shifted histogram Parameters ---------- x values for which to generate average shifted histogram nbins either literal number of histogram bins or name of the method to determine the number (see https://en.wikipedia.org/wiki/Histogram) nshifts number of shifts limits histogram x axis range. If `None`, use min and max. density y axis scale. If `True`, generate probability density, if `False` use the number of events Returns ------- y axis value for each bin and bin edges """ x = np.asarray(x) if limits is None: limits = (x.min(), x.max()) # according to https://en.wikipedia.org/wiki/Histogram if nbins == "terrell-scott": nbins = math.ceil((2 * len(x)) ** (1 / 3)) elif nbins == "rice": nbins = math.ceil(2 * len(x) ** (1 / 3)) elif nbins == "sqrt": nbins = math.ceil(math.sqrt(len(x))) elif nbins == "sturges": nbins = math.ceil(math.log2(len(x))) + 1 elif nbins == "scott": nbins = math.ceil( (limits[1] - limits[0]) * len(x) ** (1 / 3) / (3.49 * np.std(x, ddof=1)) ) elif not isinstance(nbins, int): raise ValueError(f"unsupported value for nbins: {nbins}") nbins_shifted = nbins * nshifts bins = np.linspace(*limits, nbins_shifted + 1) hist = np.histogram(x, bins=bins, range=limits, density=density)[0] # DOI: 10.1002/wics.54 # weights w = np.empty(2 * nshifts - 1) ar = np.arange(1, nshifts + 1) w[:nshifts] = ar w[nshifts:] = ar[-2::-1] w /= nshifts * nshifts f = np.convolve(hist, w, mode="same") return f, bins