"""Tools for comparing simulated smFRET data to experimental data"""
# Copyright 2017-2018 Lukas Schrangl
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import multiprocessing

import numpy as np
import scipy.stats

from . import time_trace
class BatchTester:
    """Test experimental smFRET data against simulated datasets"""

    def __init__(self, truth_generator, sim_func):
        """Parameters
        ----------
        truth_generator : type
            Class to be initialized with lifetimes and FRET efficiencies.
            Instances should be callable with one parameter, the minimum
            time trace length, and return a ground truth time trace.
            :py:class:`TwoStateExpTruth` would be an example.
        sim_func : callable
            Function to simulate a dataset. See :py:func:`simulate_dataset`
            for an example and the call signature.
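
        Examples
        --------
        A sketch reproducing the module-level :py:data:`numpy_tester`
        instance defined below:

        >>> tester = BatchTester(time_trace.TwoStateExpTruth,
        ...                      time_trace.simulate_dataset)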
"""
self.truth_generator = truth_generator
self.sim_func = sim_func
    def batch_test(self, test_times, efficiencies, exposure_time,
                   data_points, photons, experiment_data, donor_brightness,
                   acceptor_brightness, nprocs=1, nchunks=None):
        """Test experiment data against many simulated datasets

        For each set of test lifetimes, simulate smFRET events and compare
        them to the experiment data using a two-sample Kolmogorov-Smirnov
        (KS) test. The similarity is expressed as the p-value of the test.

        Parameters
        ----------
        test_times : array_like
            Lifetimes to test experiment data against. Simulations are run
            for each set of test lifetimes. ``test_times[i]`` has to be an
            array of test times for the i-th state.
        efficiencies : array_like
            Array of FRET efficiencies
        exposure_time : float
            Exposure time as used in the experiment
        data_points : int
            Number of smFRET data points to simulate
        photons : int
            Mean number of photons emitted per exposure
        donor_brightness, acceptor_brightness : callable
            Each takes one argument, an array of mean brightness values.
            For each entry `m` it returns a random value drawn from the
            brightness distribution with mean `m`.
        experiment_data : array_like
            smFRET efficiency values from the experiment
        nprocs : int, optional
            Number of processes to use for parallel simulation. Defaults to
            1, i.e. no multiprocessing, since multiprocessing requires
            additional measures, such as a proper
            ``if __name__ == "__main__"`` block.
        nchunks : int or None, optional
            Number of chunks into which to split the data for parallel
            simulation. If `None` (default), use ``nprocs * 2``.

        Returns
        -------
        numpy.ndarray
            p-values returned by the KS tests. The array has the same shape
            as `test_times` with the first dimension removed.
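
        Examples
        --------
        A minimal sketch, assuming a two-state system and a hypothetical
        shot-noise-limited (Poisson) brightness model; ``exp_data`` stands
        in for measured efficiencies and all numbers are illustrative:

        >>> import numpy as np
        >>> t1, t2 = np.meshgrid(np.logspace(-1, 1, 5),
        ...                      np.logspace(-1, 1, 5))
        >>> def brightness(m):
        ...     return np.random.poisson(m)
        >>> p = numpy_tester.batch_test([t1, t2], [0.8, 0.2], 0.01, 1000,
        ...                             500, exp_data, brightness,
        ...                             brightness)
        >>> p.shape  # `test_times` shape with the first dimension removed
        (5, 5)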
"""
# Transposing is necessary for broadcasting to work in case
# `efficiencies` is just a 1D array. Additionally, the flattened result
# can be passed directly to `batch_test_worker`, which can iterate
# over it.
lt, ef = np.broadcast_arrays(np.transpose(test_times),
np.transpose(efficiencies))
shape = lt.shape[:-1]
num_states = lt.shape[-1]
# "Flatten" s.t. each line gives a set of life times (or efficiencies,
# respectively). This is necessary for the `batch_test_worker` call.
lt = np.reshape(lt, (-1, num_states), "C")
ef = np.reshape(ef, (-1, num_states), "C")
if nprocs <= 1:
# If no multiprocessing is wanted, just execute the worker function
ret = self._batch_test_worker(lt, ef, exposure_time, data_points,
photons, experiment_data,
donor_brightness,
acceptor_brightness)
else:
# Use multiprocessing
if not nchunks:
# This seems to be a smart choice according to conducted
# benchmarks
nchunks = nprocs * 2
# Use the "spawn" method to create processes. At any rate, don't
# use "fork", since children inherit the parent's RNG state and
# every child process will generate the same random numbers.
with multiprocessing.get_context("spawn").Pool(nprocs) as pool:
# Split data into `nchunks` chunks.
lt_s = np.array_split(lt, nchunks)
ef_s = np.array_split(ef, nchunks)
ares = []
for l, e in zip(lt_s, ef_s):
# Async call to the worker function in the pool's processes
args = (l, e, exposure_time, data_points, photons,
experiment_data, donor_brightness,
acceptor_brightness)
r = pool.apply_async(self._batch_test_worker, args)
# Keep the async call results
ares.append(r)
# Wait for the results and concatenate them
ret = np.concatenate([r.get() for r in ares])
# Reshape to original shape and transpose again to undo intial
# transpose
return np.reshape(ret, shape).T
    def frame_time(self, lifetimes):
        """Calculate a frame time from lifetimes to avoid autocorrelation

        Separating subsequent data points by at least twice the longest
        state lifetime ensures that their correlation has largely decayed.

        Parameters
        ----------
        lifetimes : list of float
            State lifetimes

        Returns
        -------
        float
            A frame time that avoids autocorrelation, twice the longest
            lifetime
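
        Examples
        --------
        >>> numpy_tester.frame_time([2., 5.])
        10.0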
"""
return 2 * max(lifetimes)
    def _batch_test_worker(self, test_times, efficiencies, exposure_time,
                           data_points, photons, experiment_data,
                           donor_brightness, acceptor_brightness):
        """Test experiment data against many simulated datasets (worker)

        For each set of test lifetimes, simulate smFRET events and compare
        them to the experiment data using a two-sample KS test. The
        similarity is expressed as the p-value of the test.

        Parameters
        ----------
        test_times : array_like, shape(n, k)
            Lifetimes to test experiment data against. Simulations are run
            for each set of test lifetimes. ``test_times[i]`` is one set of
            test lifetimes (one per each of the k states). This is
            different (transposed) from the `test_times` argument of
            :py:meth:`batch_test`.
        efficiencies : array_like, shape(n, k)
            Array of FRET efficiencies. Has to be of the same shape as
            `test_times`.
        exposure_time : float
            Exposure time as used in the experiment
        data_points : int
            Number of smFRET data points to simulate
        photons : int
            Mean number of photons emitted per exposure
        experiment_data : array_like
            smFRET efficiency values from the experiment
        donor_brightness, acceptor_brightness : callable
            Each takes one argument, an array of mean brightness values.
            For each entry `m` it returns a random value drawn from the
            brightness distribution with mean `m`.

        Returns
        -------
        list
            p-values returned by the KS tests
        """
        ret = []
        for lt, ef in zip(test_times, efficiencies):
            truth = self.truth_generator(lt, ef)
            d = self.sim_func(truth, exposure_time, data_points, photons,
                              donor_brightness, acceptor_brightness,
                              frame_time=self.frame_time(lt))
            ks, p = scipy.stats.ks_2samp(d.exp_eff, experiment_data)
            ret.append(p)
        return ret


# Default tester using the numpy-based implementations from `time_trace`
numpy_tester = BatchTester(time_trace.TwoStateExpTruth,
                           time_trace.simulate_dataset)


def batch_test(*args, **kwargs):
    """Call `batch_test` with `TwoStateExpTruth` and `simulate_dataset`

    This calls the :py:meth:`BatchTester.batch_test` method of a
    :py:class:`BatchTester` instance which uses
    :py:class:`time_trace.TwoStateExpTruth` for state change trajectory
    simulation and :py:func:`time_trace.simulate_dataset` for FRET
    simulation.

    For details about parameters, return values, etc. see
    :py:meth:`BatchTester.batch_test`.
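
    Examples
    --------
    A minimal sketch, assuming hypothetical arrays ``test_times``, ``effs``,
    ``exp_data`` and brightness callables as described in
    :py:meth:`BatchTester.batch_test`. With ``nprocs > 1``, the call has to
    be protected by an ``if __name__ == "__main__"`` block:

    >>> if __name__ == "__main__":
    ...     p = batch_test(test_times, effs, 0.01, 1000, 500, exp_data,
    ...                    don_bright, acc_bright, nprocs=4)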
"""
return numpy_tester.batch_test(*args, **kwargs)


def combine_test_results(results):
    r"""Given multiple test results, calculate p-values for global consistency

    Multiple tests at e.g. different exposure times will lead to multiple
    areas of consistency. Since lifetimes should not depend on exposure
    times, one would only consider those lifetime combinations consistent
    which yield high p-values for all exposure times. This is done by
    calculating

    .. math:: \hat{p}(\tau_1, \tau_2) = 1 - (1 -
        p_\text{min}(\tau_1, \tau_2))^{|T_\text{ex}|},

    where :math:`p_\text{min}(\tau_1, \tau_2)` is the smallest p-value for
    the lifetime combination :math:`(\tau_1, \tau_2)` across all tests and
    :math:`T_\text{ex}` is the set of test results.

    Parameters
    ----------
    results : list of array_like, shape(m, n)
        Test results

    Returns
    -------
    numpy.ndarray, shape(m, n)
        Array of :math:`\hat{p}` values
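
    Examples
    --------
    >>> import numpy as np
    >>> r1 = np.array([[0.9, 0.1]])  # hypothetical p-values, exposure time 1
    >>> r2 = np.array([[0.8, 0.0]])  # hypothetical p-values, exposure time 2
    >>> np.round(combine_test_results([r1, r2]), 2)
    array([[0.96, 0.  ]])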
"""
n = len(results)
return 1 - (1 - np.min(results, axis=0))**n