# SPDX-FileCopyrightText: 2023 Lukas Schrangl <lukas.schrangl@boku.ac.at>
#
# SPDX-License-Identifier: BSD-3-Clause
"""Statistical functions
=====================
Hypothesis testing
------------------
.. autofunction:: permutation_test
.. autofunction:: grouped_permutation_test
Probability density function estimation
---------------------------------------
.. autofunction:: avg_shifted_hist
References
----------
.. [Schn2022] Schneider, M. C. & Schütz, G. J.: "Don’t Be Fooled by Randomness: Valid
p-Values for Single Molecule Microscopy", Frontiers in Bioinformatics, 2022, 2,
811053
"""
import inspect
import math
from typing import Callable, Iterable, Literal, Optional, Tuple, TypeVar, Union
import numpy as np
import pandas as pd
import scipy.stats
from numpy.typing import ArrayLike

PermutationTestResult = TypeVar("PermutationTestResult")
"""Placeholder for the type returned by :py:func:`scipy.stats.permutation_test`,
which is private
"""

def permutation_test(
    data1: Union[pd.DataFrame, np.ndarray],
    data2: Union[pd.DataFrame, np.ndarray],
    statistic: Callable[[np.ndarray], float] = np.mean,
    data_column: Optional[str] = None,
    **kwargs,
) -> PermutationTestResult:
    """Permutation test for two independent samples

    Test the null hypothesis that two samples are not distinguishable by
    `statistic`. The null distribution is created from the differences of
    `statistic` applied to permuted datasets.

    Parameters
    ----------
    data1, data2
        Datasets
    statistic
        Which statistic (e.g., mean, median, …) to compare for `data1` and `data2`
    data_column
        Use this column if `data1` or `data2` are :py:class:`pandas.DataFrame`
    **kwargs
        Passed to :py:func:`scipy.stats.permutation_test`

    Returns
    -------
    :
        Test result containing the observed difference of `statistic` (i.e.,
        ``statistic(data1) - statistic(data2)``), the p-value, and the null
        distribution generated from permuting the datasets.
    """
    vec = kwargs.pop("vectorized", None)
    if vec is None:
        # Assume `statistic` supports vectorized evaluation if it accepts an
        # `axis` argument (as, e.g., `numpy.mean` does)
        vec = "axis" in inspect.signature(statistic).parameters
    if isinstance(data1, pd.DataFrame):
        data1 = data1[data_column]
    if isinstance(data2, pd.DataFrame):
        data2 = data2[data_column]

    if vec:

        def statfunc(d1, d2, axis):
            return statistic(d1, axis=axis) - statistic(d2, axis=axis)

    else:

        def statfunc(d1, d2):
            return statistic(d1) - statistic(d2)

    return scipy.stats.permutation_test(
        [data1, data2], statfunc, vectorized=vec, **kwargs
    )
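
# A minimal usage sketch (hypothetical data; `rng`, `a`, and `b` are made up
# for illustration): compare the means of two independent samples.
#
#     rng = np.random.default_rng(0)
#     a = rng.normal(0.0, 1.0, size=100)
#     b = rng.normal(0.5, 1.0, size=100)
#     res = permutation_test(a, b, statistic=np.mean, n_resamples=9999)
#     # res.statistic is np.mean(a) - np.mean(b); res.pvalue is the p-value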

def grouped_permutation_test(
    data1: Union[pd.DataFrame, Iterable[np.ndarray], pd.core.groupby.SeriesGroupBy],
    data2: Union[pd.DataFrame, Iterable[np.ndarray], pd.core.groupby.SeriesGroupBy],
    statistic: Callable[[np.ndarray], float] = np.mean,
    data_column: Optional[str] = None,
    group_column: Optional[str] = None,
    **kwargs,
) -> PermutationTestResult:
    """Grouped permutation test for two partly correlated samples

    Test the null hypothesis that two samples are not distinguishable by
    `statistic`. The null distribution is created from the differences of
    `statistic` applied to permuted datasets. Whole groups of datapoints are
    permuted, i.e., each group is kept intact. Datapoints within a group may
    therefore be correlated (such as positions within a single-molecule
    trajectory); see [Schn2022]_.

    Parameters
    ----------
    data1, data2
        Grouped datasets. Either a column of a pandas DataFrame GroupBy, e.g.,
        ``pandas.DataFrame(…).groupby("particle")["some_value"]``, or an
        iterable of arrays where each array represents a correlated block of
        data.
    statistic
        Which statistic (e.g., mean, median, …) to compare for `data1` and `data2`
    data_column
        Use this column if `data1` or `data2` are :py:class:`pandas.DataFrame`
    group_column
        Use this column to determine groups if `data1` or `data2` are
        :py:class:`pandas.DataFrame`
    **kwargs
        Passed to :py:func:`scipy.stats.permutation_test`. Note that this test
        cannot be vectorized.

    Returns
    -------
    :
        Test result containing the observed difference of `statistic` (i.e.,
        ``statistic(data1) - statistic(data2)``), the p-value, and the null
        distribution generated from permuting the datasets.
    """
    if isinstance(data1, pd.DataFrame):
        data1 = data1.groupby(group_column)[data_column]
    if isinstance(data2, pd.DataFrame):
        data2 = data2.groupby(group_column)[data_column]
    if isinstance(data1, pd.api.typing.SeriesGroupBy):
        data1 = (d[1] for d in data1)
    if isinstance(data2, pd.api.typing.SeriesGroupBy):
        data2 = (d[1] for d in data2)
    # Object arrays of per-group arrays; permutations shuffle whole groups
    data1 = np.fromiter(data1, dtype=object)
    data2 = np.fromiter(data2, dtype=object)

    # With scipy 1.17, passing data1 and data2 to `permutation_test` fails:
    # since they have `object` dtype, array type inference fails. Therefore,
    # pass indices to `permutation_test` and access the data elements
    # accordingly in `statfunc` defined below.
    data_all = np.concatenate([data1, data2])
    n1 = len(data1)
    n2 = len(data2)
    idx1 = np.arange(n1)
    idx2 = np.arange(n1, n1 + n2)

    def statfunc(i1, i2):
        return statistic(np.concatenate(data_all[i1])) - statistic(
            np.concatenate(data_all[i2])
        )

    return scipy.stats.permutation_test([idx1, idx2], statfunc, **kwargs)
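
# A minimal usage sketch (hypothetical DataFrames `df1` and `df2` with
# "particle" and "mass" columns are assumed for illustration):
#
#     res = grouped_permutation_test(
#         df1, df2, statistic=np.median,
#         data_column="mass", group_column="particle",
#     )
#     # Whole trajectories ("particle" groups) are permuted between samples,
#     # so correlations within a trajectory do not invalidate the p-value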

def avg_shifted_hist(
    x: ArrayLike,
    nbins: int | Literal["terrell-scott", "rice", "sqrt", "sturges", "scott"],
    nshifts: int,
    limits: Tuple[float, float] | None = None,
    density: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Average shifted histogram

    Parameters
    ----------
    x
        Values for which to generate the average shifted histogram
    nbins
        Either the literal number of histogram bins or the name of a method to
        determine that number (see https://en.wikipedia.org/wiki/Histogram)
    nshifts
        Number of shifts
    limits
        Histogram x axis range. If `None`, use min and max of `x`.
    density
        y axis scale. If `True`, generate a probability density; if `False`,
        use the number of events.

    Returns
    -------
    :
        y axis value for each bin and the bin edges
    """
    x = np.asarray(x)
    if limits is None:
        limits = (x.min(), x.max())

    # Bin-number rules according to https://en.wikipedia.org/wiki/Histogram
    if nbins == "terrell-scott":
        nbins = math.ceil((2 * len(x)) ** (1 / 3))
    elif nbins == "rice":
        nbins = math.ceil(2 * len(x) ** (1 / 3))
    elif nbins == "sqrt":
        nbins = math.ceil(math.sqrt(len(x)))
    elif nbins == "sturges":
        nbins = math.ceil(math.log2(len(x))) + 1
    elif nbins == "scott":
        # Scott's rule gives the bin width 3.49 * std * n^(-1/3); convert to a
        # bin count over the requested range
        nbins = math.ceil(
            (limits[1] - limits[0]) * len(x) ** (1 / 3) / (3.49 * np.std(x, ddof=1))
        )
    elif not isinstance(nbins, int):
        raise ValueError(f"unsupported value for nbins: {nbins}")

    # Compute a single fine histogram with `nshifts` sub-bins per bin; the
    # average of the shifted histograms is then a convolution with a
    # triangular kernel (DOI: 10.1002/wics.54)
    nbins_shifted = nbins * nshifts
    bins = np.linspace(*limits, nbins_shifted + 1)
    hist = np.histogram(x, bins=bins, range=limits, density=density)[0]
    # Triangular weights 1, 2, …, nshifts, …, 2, 1; they sum to nshifts**2,
    # so dividing by nshifts**2 normalizes the kernel to 1
    w = np.empty(2 * nshifts - 1)
    ar = np.arange(1, nshifts + 1)
    w[:nshifts] = ar
    w[nshifts:] = ar[-2::-1]
    w /= nshifts * nshifts
    f = np.convolve(hist, w, mode="same")
    return f, bins
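
# A minimal usage sketch (hypothetical data; `rng` is made up for
# illustration): estimate a probability density with 8 shifts per bin.
#
#     rng = np.random.default_rng(0)
#     x = rng.normal(size=1000)
#     f, edges = avg_shifted_hist(x, nbins="scott", nshifts=8, density=True)
#     # `f` has len(edges) - 1 entries, one per fine (shifted) bin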