Source code for sdt.io.sm

# SPDX-FileCopyrightText: 2020 Lukas Schrangl <lukas.schrangl@tuwien.ac.at>
#
# SPDX-License-Identifier: BSD-3-Clause

"""Load single molecule data from files

This module allows for reading data files produced by various MATLAB and
Python tools. So far, it can read data from

- particle_tracking_2D
- prepare_peakposition (and anything else that produces pk files)
- check_fit (i.e. pks files)
- msdplot.
"""
import collections
import logging
from pathlib import Path
import re

import scipy.io as sp_io
import pandas as pd
import numpy as np


adjust_index = ["x", "y", "frame", "particle"]
"""Since MATLAB's indices start at 1 and python's start at 0, some data
(such as feature coordinates and frame numbers) may be off by one.
This list contains the names of columns to be corrected.
"""

_logger = logging.getLogger(__name__)


def load(filename, typ="auto", fmt="auto", color="red"):
    r"""Load localization or tracking data from file

    Use the :func:`load_\*` function appropriate for the file type in order
    to load the data. The file type is determined by the file's extension or
    the `fmt` parameter.

    Supported file types:

    - HDF5 files (\*.h5)
    - ThunderSTORM CSV files (\*.csv)
    - particle_tracking_2D positions (\*_positions.mat)
    - particle_tracking_2D tracks (\*_tracks.mat)
    - pkc files (\*.pkc)
    - pks files (\*.pks)
    - trc files (\*.trc)

    Arguments
    ---------
    filename : str or pathlib.Path
        Name of the file
    typ : str, optional
        If the file is HDF5, load this key (usually either "features" or
        "tracks"), unless it is "auto". In that case try to read "tracks"
        and if that fails, try to read "features". If the file is in
        particle_tracker format, this can be either "auto", "features" or
        "tracks". Defaults to "auto".
    fmt : {"auto", "hdf5", "particle_tracker", "pkc", "pks", "trc", "csv"}, optional
        Input format. If "auto", infer the format from `filename`. Otherwise,
        read the given format. Defaults to "auto".
    color : {"red", "green"}, optional
        For pkc files, specify whether to load the red (default) or green
        channel.

    Returns
    -------
    pandas.DataFrame
        Loaded data
    """
    p = Path(filename)
    if fmt == "auto":
        if p.suffix == ".h5":
            fmt = "hdf5"
        elif str(p).endswith("_positions.mat"):
            fmt = "particle_tracker"
            if typ == "auto":
                typ = "features"
        elif str(p).endswith("_tracks.mat"):
            fmt = "particle_tracker"
            if typ == "auto":
                typ = "tracks"
        elif p.suffix == ".pkc":
            fmt = "pkc"
        elif p.suffix == ".pks":
            fmt = "pks"
        elif p.suffix == ".trc":
            fmt = "trc"
        elif p.suffix == ".csv":
            fmt = "csv"
        else:
            raise ValueError("Could not determine format from file name " +
                             str(filename) + ".")

    if fmt == "hdf5":
        if typ == "auto":
            try:
                return pd.read_hdf(filename, "tracks")
            except Exception:
                typ = "features"
        return pd.read_hdf(p, typ)
    if fmt == "particle_tracker":
        return load_pt2d(p, typ=typ)
    if fmt == "pkc":
        return load_pkmatrix(p, (color == "green"))
    if fmt == "pks":
        return load_pks(p)
    if fmt == "trc":
        return load_trc(p)
    if fmt == "csv":
        return load_csv(p)
    raise ValueError('Unknown format "{}"'.format(fmt))
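

# Usage sketch for `load` (the file names below are hypothetical examples;
# the format is inferred from the extension unless `fmt` is given):
#
#     feats = load("cell1_positions.mat")        # particle_tracking_2D features
#     tracks = load("cell1_tracks.mat")          # particle_tracking_2D tracks
#     locs = load("cell1.csv")                   # ThunderSTORM CSV
#     data = load("cell1.h5")                    # tries "tracks", then "features"
#     green = load("cell1.pkc", color="green")   # green channel of a pkc file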


_pt2d_name_trans = collections.OrderedDict((
    ("x-Position", "x"),
    ("y-Position", "y"),
    ("Integrated Intensity", "mass"),
    ("Radius of Gyration", "size"),
    ("Excentricity", "ecc"),
    ("Maximal Pixel Intensity", "signal"),
    ("Background per Pixel", "bg"),
    ("Standard Deviation of Background", "bg_dev"),
    ("Full Integrated Intensity - Background", "mass_corr"),
    ("Background per Feature", "feat_bg"),
    ("Frame Number", "frame"),
    ("Time in Frames", "time"),
    ("Trace ID", "particle")))


def load_pt2d(filename, typ, load_protocol=True):
    """Load a _positions.mat or _tracks.mat file from particle_tracking_2D

    Use :py:func:`scipy.io.loadmat` to load the file and convert data to a
    :py:class:`pandas.DataFrame`.

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to load
    typ : {"features", "tracks"}
        Specify whether to load feature data (positions.mat) or tracking
        data (tracks.mat)
    load_protocol : bool, optional
        Look for a _protocol.mat file (i.e. replace the "_positions.mat"
        part of `filename` with "_protocol.mat") in order to load the
        column names. This may be buggy for some older versions of
        particle_tracking_2D. If reading the protocol fails, this behaves
        as if ``load_protocol=False``. Defaults to True.

    Returns
    -------
    pandas.DataFrame
        Loaded data.
    """
    filename = str(filename)

    if typ == "features":
        mat_component = "MT"
        filename_component = "positions.mat"
        protocol_component = "positions_output"
    elif typ == "tracks":
        mat_component = "tracks"
        filename_component = "tracks.mat"
        protocol_component = "tracking_output"
    else:
        raise ValueError("Unknown type: " + typ)

    mt = sp_io.loadmat(filename)[mat_component]

    # column names for the DataFrame; overridden below if the protocol file
    # can be read
    cols = list(_pt2d_name_trans.values())

    if load_protocol:
        try:
            proto_path = (filename[:filename.rfind(filename_component)] +
                          "protocol.mat")
            proto = sp_io.loadmat(proto_path, struct_as_record=False,
                                  squeeze_me=True)
            name_str = getattr(proto["X"], protocol_component)
            names = name_str.split(", ")
            cols = [_pt2d_name_trans.get(n, n) for n in names]
        except Exception:
            _logger.info("Failed to read protocol for " + filename +
                         ". Falling back to using hard-coded names.")

    # generate names for any columns beyond those listed above
    for i in range(len(cols), mt.shape[1]):
        cols.append("column{}".format(i))

    # the column name list cannot be longer than the number of columns in
    # the file
    cols = cols[:mt.shape[1]]

    df = pd.DataFrame(mt, columns=cols)
    for c in adjust_index:
        if c in df.columns:
            df[c] -= 1

    if "size" in df.columns:
        # particle_tracker returns the squared radius of gyration
        df["size"] = np.sqrt(df["size"])

    return df
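

# Usage sketch for `load_pt2d` (hypothetical file names):
#
#     feats = load_pt2d("cell1_positions.mat", typ="features")
#     tracks = load_pt2d("cell1_tracks.mat", typ="tracks")
#     # skip the _protocol.mat lookup and use the hard-coded column names
#     feats = load_pt2d("cell1_positions.mat", typ="features",
#                       load_protocol=False)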


_pk_column_names = ["frame", "x", "y", "size", "mass", "bg", "column6",
                    "column7", "bg_dev", "column9", "column10"]
_pk_ret_column_names = ["x", "y", "size", "mass", "bg", "bg_dev", "frame"]


def load_pkmatrix(filename, green=False):
    """Load a pkmatrix from a .mat file

    Use :py:func:`scipy.io.loadmat` to load the file and convert data to a
    :py:class:`pandas.DataFrame`.

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to load
    green : bool
        If True, load `pkmatrix_green`, which is the right half of the
        image when using ``prepare_peakposition`` in 2 color mode.
        Otherwise, load `pkmatrix`. Defaults to False.

    Returns
    -------
    pandas.DataFrame
        Loaded data.
    """
    mat = sp_io.loadmat(str(filename), struct_as_record=False,
                        squeeze_me=True)

    if not green:
        d = mat["par"].pkmatrix
    else:
        d = mat["par"].pkmatrix_green

    # if no localizations were found, an empty array is returned. However,
    # the DataFrame constructor expects None in this case.
    d = None if len(d) == 0 else d

    df = pd.DataFrame(data=d, columns=_pk_column_names)
    for c in adjust_index:
        if c in df.columns:
            df[c] -= 1

    if "size" in df.columns:
        # the size in a pkmatrix is FWHM; convert to sigma of the gaussian
        # sigma = FWHM / sqrt(8 * log(2))
        df["size"] /= 2.3548200450309493

    return df[_pk_ret_column_names]
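

# Usage sketch for `load_pkmatrix` (hypothetical file name):
#
#     red = load_pkmatrix("cell1.pkc")                # loads `pkmatrix`
#     green = load_pkmatrix("cell1.pkc", green=True)  # loads `pkmatrix_green`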


_pks_column_names = ["frame", "x", "y", "size", "mass", "bg", "bg_dev",
                     "ep"]


def load_pks(filename):
    """Load a pks matrix from a MATLAB file

    Use :py:func:`scipy.io.loadmat` to load the file and convert data to a
    :py:class:`pandas.DataFrame`.

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to load

    Returns
    -------
    pandas.DataFrame
        Loaded data.
    """
    mat = sp_io.loadmat(str(filename), struct_as_record=False,
                        squeeze_me=True)

    d = mat["pks"]
    # if no localizations were found, an empty array is returned. However,
    # the DataFrame constructor expects None in this case.
    d = None if len(d) == 0 else d

    df = pd.DataFrame(data=d, columns=_pks_column_names)
    for c in adjust_index:
        if c in df.columns:
            df[c] -= 1

    if "size" in df.columns:
        # the size in a pks matrix is FWHM; convert to sigma of the gaussian
        # sigma = FWHM / sqrt(8 * log(2))
        df["size"] /= 2.3548200450309493

    # TODO: trackpy ep is in pixels (?), pks in nm
    return df


_trc_col_names = ["particle", "frame", "x", "y", "mass", "idx"]
_trc_ret_col_names = ["x", "y", "mass", "frame", "particle"]


def load_trc(filename):
    """Load tracking data from a .trc file

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to load

    Returns
    -------
    pandas.DataFrame
        Loaded data.
    """
    df = pd.read_table(str(filename), sep=r"\s+", names=_trc_col_names)
    for c in adjust_index:
        if c in df.columns:
            df[c] -= 1
    return df[_trc_ret_col_names]
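

# Usage sketch for `load_trc` (hypothetical file name):
#
#     trc = load_trc("cell1.trc")
#     # returned columns: x, y, mass, frame, particle ("frame" and "particle"
#     # are shifted to 0-based indexing)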


_thunderstorm_name_map = {
    "sigma": "size",
    "intensity": "mass",
    "offset": "bg",
    "bkgstd": "bg_dev"}


def load_csv(filename):
    """Load localization data from a CSV file created by ThunderSTORM

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to load

    Returns
    -------
    pandas.DataFrame
        Single molecule data
    """
    df = pd.read_csv(filename)

    cols = []
    # ThunderSTORM column names are "<name> [<unit>]".
    # Get the name and translate it to the "standard" name if possible.
    unit_re = re.compile(r"^(\w+) \[(\w+)\]$")
    for c in df.columns:
        m = unit_re.search(c)
        if m:
            key = m.group(1)
            cols.append(_thunderstorm_name_map.get(key, key))
        else:
            cols.append(c)
    df.columns = cols

    if "frame" in df.columns:
        df["frame"] -= 1

    return df
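

# Usage sketch for `load_csv` (hypothetical file name):
#
#     locs = load_csv("thunderstorm_export.csv")
#     # columns of the form "<name> [<unit>]" are renamed via
#     # `_thunderstorm_name_map`, e.g. "sigma [nm]" -> "size"; the "frame"
#     # column is shifted to 0-based indexing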


_msd_column_names = ["lagt", "msd", "stderr", "qianerr"]


def load_msdplot(filename):
    """Load msdplot data from a .mat file

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to load

    Returns
    -------
    dict
        Dictionary with keys "d" (diffusion coefficient in μm²/s), "stderr"
        (its standard error), "qianerr" (its Qian error), "pa" (positional
        accuracy in nm) and "emsd" (a :py:class:`pandas.DataFrame` containing
        the MSD-vs.-lag-time data).
    """
    mat = sp_io.loadmat(str(filename), struct_as_record=False,
                        squeeze_me=True)

    data = mat["msd1"]
    data[:, 0] /= 1000.

    return dict(d=mat["d1"],
                stderr=mat["dstd1"],
                qianerr=mat["dstd_qian1"],
                pa=mat["pos1"],
                emsd=pd.DataFrame(data, columns=_msd_column_names))
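

# Usage sketch for `load_msdplot` (hypothetical file name):
#
#     msd = load_msdplot("cell1_msd.mat")
#     d = msd["d"]        # diffusion coefficient in μm²/s
#     emsd = msd["emsd"]  # DataFrame with lagt, msd, stderr, qianerr columns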


def save(filename, data, typ="auto", fmt="auto"):
    """Save feature/tracking data

    This supports HDF5, trc, and particle_tracker formats.

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to write to
    data : pandas.DataFrame
        Data to save
    typ : {"auto", "features", "tracks"}
        Specify whether to save feature data or tracking data. If "auto",
        consider `data` tracking data if a "particle" column is present,
        otherwise treat it as feature data.
    fmt : {"auto", "hdf5", "particle_tracker", "trc"}
        Output format. If "auto", infer the format from `filename`.
        Otherwise, write the given format.
    """
    p = Path(filename)

    if typ not in ("tracks", "features", "auto"):
        raise ValueError("Unknown type: " + typ)

    if fmt == "auto":
        if p.suffix == ".h5":
            fmt = "hdf5"
        elif (str(p).endswith("_positions.mat") or
              str(p).endswith("_tracks.mat")):
            fmt = "particle_tracker"
        elif p.suffix == ".trc":
            fmt = "trc"
        else:
            raise ValueError("Could not determine format from file name " +
                             str(filename))

    if typ == "auto":
        if "particle" in data.columns:
            typ = "tracks"
        else:
            typ = "features"

    if fmt == "hdf5":
        data.to_hdf(p, typ)
        return
    if fmt == "particle_tracker":
        save_pt2d(p, data, typ)
        return
    if fmt == "trc":
        save_trc(p, data)
        return
    raise ValueError('Unknown format "{}"'.format(fmt))
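

# Usage sketch for `save` (hypothetical file names; `tracks` is assumed to be
# a DataFrame with a "particle" column, so typ="auto" treats it as tracking
# data):
#
#     save("cell1.h5", tracks)           # HDF5, key inferred as "tracks"
#     save("cell1_tracks.mat", tracks)   # particle_tracker format
#     save("cell1.trc", tracks)          # trc format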


def save_pt2d(filename, data, typ="tracks"):
    """Save feature/tracking data in particle_tracker format

    Parameters
    ----------
    filename : str or pathlib.Path
        Name of the file to write to
    data : pandas.DataFrame
        Data to save
    typ : {"features", "tracks"}
        Specify whether to save feature data or tracking data.
    """
    filename = str(filename)

    data_cols = []
    num_features = len(data)

    for v in _pt2d_name_trans.values():
        if (v == "particle") and (typ != "tracks"):
            continue
        if v in data.columns:
            cur_col = data[v]
        else:
            cur_col = np.zeros(num_features)

        if v in adjust_index:
            data_cols.append(cur_col + 1)
        elif v == "size":
            data_cols.append(cur_col**2)
        else:
            data_cols.append(cur_col)

    key_name = ("MT" if typ == "features" else "tracks")
    sp_io.savemat(filename, {key_name: np.column_stack(data_cols)})


def save_trc(filename, data):
    """Save tracking data in trc format

    Parameters
    ----------
    filename : str
        Name of the file to write to
    data : pandas.DataFrame
        Data to save
    """
    df = data.copy()
    idx = np.arange(len(df))
    if "particle" not in df.columns:
        df["particle"] = idx

    for c in adjust_index:
        if c in df.columns:
            df[c] += 1

    df["__trc_idx__"] = idx
    df.to_csv(filename, sep=" ", header=False, index=False,
              columns=["particle", "frame", "x", "y", "mass", "__trc_idx__"])
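

# Usage sketch for `save_trc` (hypothetical names; `tracks` as above):
#
#     save_trc("cell1.trc", tracks)
#     # writes particle, frame, x, y, mass plus a running index column,
#     # converting "frame" and "particle" back to 1-based indexing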