Source code for sdt.spatial

# SPDX-FileCopyrightText: 2020 Lukas Schrangl <lukas.schrangl@tuwien.ac.at>
#
# SPDX-License-Identifier: BSD-3-Clause

"""Analyze spatial aspects of data
===============================

The :py:mod:`sdt.spatial` module provides methods for analyzing spatial
aspects of single molecule data:

- Check whether features have near neighbors using the
  :py:func:`has_near_neighbor` function
- In tracking data, interpolate features that have been missed by the
  localization algorithm with help of :py:func:`interpolate_coords`
- Calculate the area and center of mass of a polygon using
  :py:func:`polygon_area` and :py:func:`polygon_center`
- Find the smallest enclosing circle of a set of points via
  :py:func:`smallest_enclosing_circle`.


Examples
--------

To find out whether single molecule features have other features nearby,
use the :py:func:`has_near_neighbor` function:

>>> loc = pandas.DataFrame([[10, 10], [10, 11], [20, 20]], columns=["x", "y"])
>>> loc
    x   y
0  10  10
1  10  11
2  20  20
>>> has_near_neighbor(loc, r=2.)
>>> loc
    x   y  has_neighbor
0  10  10             1
1  10  11             1
2  20  20             0

Missing localizations in single molecule tracking data can be interpolated
by :py:func:`interpolate_coords`:

>>> trc = pandas.DataFrame([[10, 10, 0, 0], [10, 10, 2, 0]],
...                        columns=["x", "y", "frame", "particle"])
>>> trc
    x   y  frame  particle
0  10  10      0         0
1  10  10      2         0
>>> trc_i = interpolate_coords(trc)
>>> trc_i
    x   y  frame  particle  interp
0  10  10      0         0       0
1  10  10      1         0       1
2  10  10      2         0       0

:py:func:`polygon_area` can be used to calculate the area of a polygon:

>>> vertices = [[0, 0], [10, 0], [10, 10], [0, 10]]
>>> polygon_area(vertices)
100.0


Programming reference
---------------------

.. autofunction:: has_near_neighbor
.. autofunction:: interpolate_coords
.. autofunction:: polygon_area
.. autofunction:: polygon_center
.. autofunction:: smallest_enclosing_circle


Smallest enclosing circle algorithm
-----------------------------------

The smallest enclosing circle :math:`C_n` for points :math:`p_1, p_2, …,
p_n` in randomized order is found iteratively. Let's assume that
:math:`C_{i-1}` has already been found. If :math:`p_i` lies within
:math:`C_{i-1}`, then :math:`C_i = C_{i-1}`. Else, :math:`C_i` needs to be the
smallest enclosing circle for :math:`p_1, p_2, …, p_i`; :math:`p_i` has to lie
on the circle.

This new problem is again solved iteratively. Assume :math:`C'_{j-1}` is the
smallest enclosing circle for :math:`p_1, p_2, …, p_{j-1}`, :math:`j < i` with
:math:`p_i` on the circle. Then :math:`C'_j = C'_{j-1}` if :math:`p_j` lies
within :math:`C'_{j-1}`. Else the smallest enclosing circle for :math:`p_1,
p_2, …, p_j` with :math:`p_j` and :math:`p_i` on the circle needs to be
found.

If all :math:`p_1, p_2, …, p_j` lie within the circle whose diameter is
the line :math:`l` connecting :math:`p_j` and :math:`p_i`, this circle is
:math:`C'_j`. Otherwise, two possible candidates for :math:`C'_j` are given by
those circles through :math:`p_k` on either side of the line :math:`l` such
that the circle centers are farthest away from :math:`l`. Of those two,
the circle with the smaller radius is chosen.

.. image:: /enclosing_circles.svg
    :width: 300
    :align: center
    :alt: How to find smallest enclosing circle with two given points

Source: `Project Nayuki
<https://www.nayuki.io/res/smallest-enclosing-circle/smallestenclosingcircle.py>`_,
in particular `this presentation
<https://www.nayuki.io/res/smallest-enclosing-circle/computational-geometry-lecture-6.pdf>`_.
"""
import math
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

from . import config
from .helper import numba


def _has_near_neighbor_impl(data, r):
    """Implementation of finding near neighbors using KD trees

    Parameters
    ----------
    data : array-like, shape(n, m)
        n data points of dimension m
    r : float
        Maximum distance for data points to be considered near neighbors

    Returns
    -------
    numpy.ndarray, shape(n)
        For each data point this is 1 if it has neighbors closer than `r` and
        0 if it has not.
    """
    # Find data points with near neighbors
    t = cKDTree(data)
    nn = np.unique(t.query_pairs(r, output_type="ndarray"))
    # Record those data points
    hn = np.zeros(len(data), dtype=int)
    hn[nn] = 1
    return hn


[docs]@config.set_columns def has_near_neighbor(data, r, columns={}): """Check whether localized features have near neighbors Given a :py:class:`pandas.DataFrame` `data` with localization data, each data point is checked whether other points (in the same frame) are closer than `r`. The results will be written in a "has_neighbor" column of the `data` DataFrame. Parameters ---------- data : pandas.DataFrame Localization data. The "has_neighbor" column will be appended/overwritten with the results. r : float Maximum distance for data points to be considered near neighbors. Other parameters ---------------- columns : dict, optional Override default column names as defined in :py:attr:`config.columns`. Relevant names are `coords` and `time`. This means, if your DataFrame has coordinate columns "x" and "z" and the time column "alt_frame", set ``columns={"coords": ["x", "z"], "time": "alt_frame"}``. """ if not len(data): data["has_neighbor"] = [] return if columns["time"] in data.columns: data_arr = data[columns["coords"] + [columns["time"]]].values # Sort so that `diff` works below sort_idx = np.argsort(data_arr[:, -1]) data_arr = data_arr[sort_idx] # Split data according to frame number frame_bounds = np.nonzero(np.diff(data_arr[:, -1]))[0] + 1 data_split = np.split(data_arr[:, :-1], frame_bounds) # List of array of indices of data points with near neighbors has_neighbor = np.concatenate([_has_near_neighbor_impl(s, r) for s in data_split]) # Get the reverse of sort_idx s. t. all(x[sort_idx][rev_sort_idx] == x) ran = np.arange(len(data_arr), dtype=int) rev_sort_idx = np.empty_like(ran) rev_sort_idx[sort_idx] = ran # Undo sorting has_neighbor = has_neighbor[rev_sort_idx] else: has_neighbor = _has_near_neighbor_impl(data[columns["coords"]], r) # Append column to data frame data["has_neighbor"] = has_neighbor
[docs]@config.set_columns def interpolate_coords(tracks, columns={}): """Interpolate coordinates for missing localizations For each particle in `tracks`, interpolate coordinates for frames where no localization was detected. Parameters ---------- tracks : pandas.DataFrame Tracking data Returns ------- pandas.DataFrame Tracking data with missing frames interpolated. An "interp" column is added. If False, the localization was detected previously. If True, it was added via interpolation by this method. Other parameters ---------------- columns : dict, optional Override default column names as defined in :py:attr:`config.columns`. Relevant names are `coords`, `time`, and `particle`. This means, if your DataFrame has coordinate columns "x" and "z" and the time column "alt_frame", set ``columns={"coords": ["x", "z"], "time": "alt_frame"}``. """ tracks = tracks.copy() arr = tracks[columns["coords"] + [columns["particle"], columns["time"]]].values particles = np.unique(arr[:, -2]) missing_coords = [] missing_fno = [] missing_pno = [] for p in particles: a = arr[arr[:, -2] == p] # get particle p a = a[np.argsort(a[:, -1])] # sort according to frame number frames = a[:, -1].astype(int) # frame numbers # get missing frame numbers miss = list(set(range(frames[0], frames[-1]+1)) - set(frames)) miss = np.array(miss, dtype=int) coords = [] for c in a[:, :-2].T: # for missing frames interpolate each coordinate x = np.interp(miss, frames, c) coords.append(x) missing_coords.append(np.column_stack(coords)) missing_pno.append(np.full(len(miss), p, dtype=int)) missing_fno.append(miss) if not missing_coords: tracks["interp"] = 0 ret = tracks.sort_values([columns["particle"], columns["time"]]) return tracks.reset_index(drop=True) missing_coords = np.concatenate(missing_coords) missing_fno = np.concatenate(missing_fno) missing_pno = np.concatenate(missing_pno) missing_df = pd.DataFrame(missing_coords, columns=columns["coords"]) missing_df[columns["particle"]] = missing_pno missing_df[columns["time"]] = missing_fno # Don't use bool below. Otherwise, the `values` attribute of the DataFrame # will have "object" dtype. missing_df["interp"] = 1 tracks["interp"] = 0 ret = pd.merge(tracks, missing_df, "outer") ret.sort_values([columns["particle"], columns["time"]], inplace=True) return ret.reset_index(drop=True)
[docs]@numba.extending.register_jitable def polygon_area(vertices: Sequence[Sequence[float]]) -> float: """Calculate the (signed) area of a simple polygon The polygon may not self-intersect. This is based on JavaScript code from http://www.mathopenref.com/coordpolygonarea2.html. .. code-block:: javascript function polygonArea(X, Y, numPoints) { area = 0; // Accumulates area in the loop j = numPoints - 1; // The last vertex is the 'previous' one to the // first for (i=0; i<numPoints; i++) { area = area + (X[j]+X[i]) * (Y[j]-Y[i]); j = i; // j is previous vertex to i } return area/2; } For triangles, a faster, specialized code path based on the cross product is used. Parameters ---------- vertices Coordinates of the poligon vertices. Returns ------- Signed area of the polygon. Area is > 0 if vertices are given counterclockwise. """ if len(vertices) == 3: # Specialized, fast implementation for triangles, since this is heavily # used in `smallest_enclosing_circle` dx1 = vertices[1][0] - vertices[0][0] dy1 = vertices[1][1] - vertices[0][1] dx2 = vertices[2][0] - vertices[1][0] dy2 = vertices[2][1] - vertices[1][1] return (dx1 * dy2 - dy1 * dx2) / 2 x, y = np.vstack((vertices[-1], vertices)).T return np.sum((x[1:] + x[:-1]) * (y[1:] - y[:-1])) / 2
[docs]def polygon_center(vertices: Sequence[Sequence[float]], area: Optional[float] = None) -> Tuple[float, float]: r"""Compute center of mass of a polygon according to the formula .. math:: C_\mathrm{x} = \frac{1}{6A} \sum_{i=0}^{n-1} (x_{i} + x_{i+1}) (x_{i}y_{i+1} - x_{i+1}y_{i}) C_\mathrm{y} = \frac{1}{6A} \sum_{i=0}^{n-1} (y_{i} + y_{i+1}) (x_{i}y_{i+1} - x_{i+1}y_{i}) where :math:`A` is the signed polygon area as computed by :py:func:`polygon_area`. Note that the formula is valid for a closed polygon. This function also works for open polygons. Parameters ---------- vertices Sequence of :math:`(x, y)` coordinate pairs. area If already computed, pass area of the polygon for efficiency Returns ------- Coordinates of the center of mass """ if area is None: area = polygon_area(vertices) x, y = np.vstack((vertices[-1], vertices)).T fact = x[:-1] * y[1:] - x[1:] * y[:-1] x_c = np.sum((x[:-1] + x[1:]) * fact) / (6 * area) y_c = np.sum((y[:-1] + y[1:]) * fact) / (6 * area) return x_c, y_c
[docs]def smallest_enclosing_circle(coords: Sequence[Sequence[float]], shuffle: Union[bool, np.random.RandomState] = True ) -> Tuple[Tuple[float, float], float]: """Find the smallest circle enclosing a set of points Parameters ---------- coords 2D coordinates of points shuffle If `True`, shuffle coordinate list before calculating circle. If a `RandomState` instance is passed, use it for shuffling. Note that coordinates should be in a random order for performance reasons. Returns ------- Center coordinates and radius Note ---- If you want to calculate the smallest enclosing circle in a `numba.jit`-ed function, have a look at :py:func:`smallest_enclosing_circle_impl`. """ if numba.numba_available: coords = np.array(coords, copy=True) else: # Best performance with a list of tuples/lists coords = [(float(x), float(y)) for x, y in coords] if shuffle: if callable(getattr(shuffle, "shuffle", None)): # `shuffle` behaves like a np.random.RandomState instance shuffle.shuffle(coords) else: np.random.shuffle(coords) return smallest_enclosing_circle_impl(coords)
@numba.try_njit(cache=True) def smallest_enclosing_circle_impl(coords: Union[np.ndarray, List]): """Find the smallest circle enclosing a set of points (implementation) If numba is available, this function is jitted and `coords` needs to be provided as a :py:class:`numpy.ndarray`. If numba is unavailable, use a list of tuples or lists for maximum performance. :py:func:`smallest_enclosing_circle` provides a convenient wrapper around this function which ensures the right format of `coords` and allows for shuffling. Parameters ---------- coords 2D coordinates of points in random order for O(N) expected runtime. Returns ------- Center coordinates and radius """ center = (np.NaN, np.NaN) radius = np.NaN for i, p in enumerate(coords): if math.isnan(radius) or not _in_circle(center, radius, p): center, radius = _enclosing_circle_1(coords[:i+1], p) return center, radius @numba.try_njit(cache=True) def _in_circle(center: Sequence[float], radius: float, point: Sequence[float], eps: float = 1e-14) -> bool: """Check whether a point lies within a circle Parameters ---------- center Coordinates of circle center radius Circle radius point Coordinates of the point eps For numerical reasons, increase radius by ``radius * eps`` Returns ------- `True` if the point lies within the circle, `False` otherwise. """ return (math.hypot(center[0] - point[0], center[1] - point[1]) <= radius * (1 + eps)) @numba.try_njit(cache=True) def _enclosing_circle_1(coords: Sequence[Sequence[float]], point: Sequence[float] ) -> Tuple[Tuple[float, float], float]: """Find the smallest enclosing circle with one point on boundary given Calculate center and radius of the smallest circle enclosing all points specified by `coords` such that `point` lies on the circle. Parameters ---------- coords Sequence of x and y coordinates of points that are enclosed by the circle point x and y coordinates of the point on the boundary Returns ------- Center coordinates and radius. """ # Ensure this is a tuple, otherwise assigning a tuple to `center` in the # loop will fail with numba center = (point[0], point[1]) radius = 0.0 for i, p in enumerate(coords): if _in_circle(center, radius, p): continue if radius == 0.0: center, radius = _circumscribe_2(point, p) else: center, radius = _enclosing_circle_2(coords[:i+1], point, p) return center, radius @numba.try_njit(cache=True) def _enclosing_circle_2(coords: Sequence[Sequence[float]], point1: Sequence[float], point2: Sequence[float] ) -> Tuple[Tuple[float, float], float]: """Find the smallest enclosing circle with two points on boundary given Calculate center and radius of the smallest circle enclosing all points specified by `coords` such that `point1` and `point2` lie on the circle. Parameters ---------- coords Sequence of x and y coordinates of points that are enclosed by the circle point1, point2 x and y coordinates of the two points on the boundary Returns ------- Center coordinates and radius. """ center_2, radius_2 = _circumscribe_2(point1, point2) area_left = -np.inf area_right = np.inf center_left = center_right = (np.NaN, np.NaN) radius_left = radius_right = np.inf for p in coords: if _in_circle(center_2, radius_2, p): continue area_p = polygon_area((point1, point2, p)) c, r = _circumscribe_3(point1, point2, p) if math.isnan(r): continue area_c = polygon_area((point1, point2, c)) if area_p > 0 and area_c > area_left: # Larger area means that the center is farther left from the line # connecting point1 and point2 than the previous farthest point center_left = c radius_left = r area_left = area_c elif area_p < 0 and area_c < area_right: # Larger negative area means that the center is farther right from # the line # connecting point1 and point2 than the previous # farthest point center_right = c radius_right = r area_right = area_c if not math.isfinite(radius_left) and not math.isfinite(radius_right): # Both have not been set return center_2, radius_2 if radius_left < radius_right: return center_left, radius_left return center_right, radius_right @numba.try_njit(cache=True) def _circumscribe_2(point1: Sequence[float], point2: Sequence[float] ) -> Tuple[Tuple[float, float], float]: """Find circumscribed circle for three points The circle's center is the midpoint between the points. The radius is half the distance between the points. Parameters ---------- point1, point2 x and y coordinates of the two points Returns ------- Center coordinates and radius. """ xc = (point1[0] + point2[0]) / 2 yc = (point1[1] + point2[1]) / 2 # For numerical stability, compute the radius as follows and not by # calculating the half length of point1 - point2; see # https://stackoverflow.com/a/41776277 r1 = math.hypot(point1[0] - xc, point1[1] - yc) r2 = math.hypot(point2[0] - xc, point2[1] - yc) radius = max(r1, r2) return (xc, yc), radius @numba.try_njit(cache=True) def _circumscribe_3(point1: Sequence[float], point2: Sequence[float], point3: Sequence[float] ) -> Tuple[Tuple[float, float], float]: """Find circumscribed circle for three points Implements the algorithm found at `Wikipedia <https://en.wikipedia.org/wiki/Circumscribed_circle#Cartesian_coordinates_2>`_ Parameters ---------- point1, point2, point3 x and y coordinates of the three points Returns ------- Center coordinates and radius. In case the points lie on a line, all values are NaN. """ # Improve numerical stability in case coordinates of points are large, but # differ verly little by translating to center origin # # Using numpy functions and/or loops is much slower here x0 = (min(point1[0], point2[0], point3[0]) + max(point1[0], point2[0], point3[0])) / 2 y0 = (min(point1[1], point2[1], point3[1]) + max(point1[1], point2[1], point3[1])) / 2 x1 = point1[0] - x0 y1 = point1[1] - y0 x2 = point2[0] - x0 y2 = point2[1] - y0 x3 = point3[0] - x0 y3 = point3[1] - y0 # Circumscribed circle formula, see # https://en.wikipedia.org/wiki/Circumscribed_circle#Cartesian_coordinates_2 dx1 = x2 - x3 dy1 = y2 - y3 dx2 = x3 - x1 dy2 = y3 - y1 dx3 = x1 - x2 dy3 = y1 - y2 c = 2 * (x1 * dy1 + x2 * dy2 + x3 * dy3) if math.isclose(c, 0): # This happens e.g. when points lie on a line return (np.NaN, np.NaN), np.NaN n1 = x1 * x1 + y1 * y1 n2 = x2 * x2 + y2 * y2 n3 = x3 * x3 + y3 * y3 xc = x0 + (n1 * dy1 + n2 * dy2 + n3 * dy3) / c yc = y0 - (n1 * dx1 + n2 * dx2 + n3 * dx3) / c # For numerical stability, compute the radius as follows and not by # using a forumla such as abc / (4A); see # https://stackoverflow.com/a/41776277 r1 = math.hypot(point1[0] - xc, point1[1] - yc) r2 = math.hypot(point2[0] - xc, point2[1] - yc) r3 = math.hypot(point3[0] - xc, point3[1] - yc) radius = max(r1, r2, r3) return (xc, yc), radius