from collections import namedtuple
import numpy as np
from numpy import linalg
PopStats = namedtuple('PopStats', ['mean', 'stddev', 'min_max_ratio', 'disorder'])
PolyProps = namedtuple('PolyProps', ['area', 'peri', 'max_dist'])
TriProps = namedtuple('TriProps', ['sides', 'area'])
DensityProps = namedtuple('DensityProps', ['neighbors_in_distance',
'distance_for_neighbors'])
Props = namedtuple('Props', ['voronoi', 'delaunay', 'mst_branches', 'density'])
[docs]
def compute_global_cell_graph_features(
centroids,
neighbor_distances=None,
neighbor_counts=(3, 5, 7),
):
r"""Compute global (i.e., not per-nucleus) features of the nuclei with
the given centroids based on the partitioning of the space into
Voronoi cells and on the induced graph structure.
Parameters
----------
centroids : array_like
Nx2 numpy array of nuclear centroids
neighbor_distances : array_like
Radii to count neighbors in
neighbor_counts : sequence
Sequence of numbers of neighbors, each of which is used to
compute statistics relating to the distance required to reach
that many neighbors.
Returns
-------
props : pandas.DataFrame
A single-row DataFrame with the following columns:
- voronoi\_...: Voronoi diagram features
- area\_...: Polygon area features
- peri\_...: Polygon perimeter features
- max_dist\_...: Maximum distance in polygon features
- delaunay\_...: Delaunay triangulation features
- sides\_...: Triangle side length features
- area\_...: Triangle area features
- mst_branches\_...: Minimum spanning tree branch features
- density\_...: Density features
- neighbors_in_distance\_...
- 0, 1, ..., len(neighbor_distances) - 1: Neighbor count
within given radius features.
- distance_for_neighbors\_...
- 0, 1, ..., len(neighbor_counts) - 1: Minimum distance to
enclose count neighbors features
The "..."s are meant to signify that what precedes is the
start of a column name. At the end of each column name is one
of 'mean', 'stddev', 'min_max_ratio', and 'disorder'.
'min_max_ratio' is the minimum-to-maximum ratio, and disorder
is stddev / (mean + stddev).
Note
----
The indices for the density features are with respect to the
*sorted* values of the corresponding argument sequence.
References
----------
.. [#] Doyle, S., Agner, S., Madabhushi, A., Feldman, M., & Tomaszewski, J.
(2008, May). Automated grading of breast cancer histopathology using
spectral clustering with textural and architectural image features.
In Biomedical Imaging: From Nano to Macro, 2008. ISBI 2008.
5th IEEE International Symposium on (pp. 496-499). IEEE.
"""
if neighbor_distances is None:
neighbor_distances = 10. * np.arange(1, 6)
return _flatten_to_dataframe(_compute_global_cell_graph_features(
centroids,
neighbor_distances,
neighbor_counts,
))
def _compute_global_cell_graph_features(
centroids,
neighbor_distances,
neighbor_counts,
):
"""Internal support for compute_global_cell_graph_features that
returns its result in a nested nametuple structure instead of a
pandas DataFrame.
"""
from scipy import sparse
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial import Voronoi
from scipy.spatial import cKDTree as KDTree
from scipy.spatial.distance import pdist
vor = Voronoi(centroids)
centroids = vor.points
vertices = vor.vertices
regions = [r for r in vor.regions if r and -1 not in r]
areas = np.array([_poly_area(vertices[r]) for r in regions])
peris = np.array([_poly_peri(vertices[r]) for r in regions])
max_dists = np.array([pdist(vertices[r]).max() for r in regions])
poly_props = PolyProps._make(map(_pop_stats, (areas, peris, max_dists)))
# Assume that each Voronoi vertex is on exactly three ridges.
ridge_points = vor.ridge_points
# This isn't exactly the collection of sides, since if they should
# be counted per-triangle then we weight border ridges wrong
# relative to ridges that are part of two triangles.
ridge_lengths = _dist(*np.swapaxes(centroids[ridge_points], 0, 1))
sides = ridge_lengths
# Point indices of each triangle
tris = [[] for _ in vertices]
for p, ri in enumerate(vor.point_region):
for vi in vor.regions[ri]:
if vi != -1:
tris[vi].append(p)
# This will only fail in particular symmetrical cases where a
# Voronoi vertex is associated with more than three centroids.
# Since this should be unlikely in practice, we don't handle it
# beyond throwing an AssertionError
assert all(len(t) == 3 for t in tris)
tris = np.asarray(tris)
areas = np.array([_poly_area(centroids[t]) for t in tris])
tri_props = TriProps._make(map(_pop_stats, (sides, areas)))
graph = sparse.coo_matrix((ridge_lengths, np.sort(ridge_points).T),
(len(centroids), len(centroids)))
mst = minimum_spanning_tree(graph)
# Without looking into exactly how minimum_spanning_tree
# constructs its output, elimate any explicit zeros to be on the
# safe side.
mst_branches = _pop_stats(mst.data[mst.data != 0])
tree = KDTree(centroids)
neighbors_in_distance = {
# Yes, we just throw away the actual points
r: _pop_stats(np.stack(np.array(list(map(len, tree.query_ball_tree(tree, r))))) - 1)
for r in neighbor_distances
}
distance_for_neighbors = dict(zip(
neighbor_counts,
map(_pop_stats, tree.query(centroids, [c + 1 for c in neighbor_counts])[0].T),
))
density_props = DensityProps(neighbors_in_distance, distance_for_neighbors)
return Props(poly_props, tri_props, mst_branches, density_props)
def _poly_area(vertices):
return abs(_poly_signed_area(vertices))
def _poly_signed_area(vertices):
return .5 * linalg.det(
np.stack((vertices, np.roll(vertices, -1, axis=-2)), -1),
).sum(-1)
def _poly_peri(vertices):
return _dist(vertices, np.roll(vertices, -1, axis=-2)).sum(-1)
def _dist(x, y):
"""Compute the distance between two sets of points. Has signature
(i),(i)->().
"""
return (np.subtract(x, y) ** 2).sum(-1) ** .5
def _pop_stats(pop):
# Filter out outliers (here defined as points more than three
# standard deviations away from the mean)
while True:
mean = pop.mean()
stddev = pop.std()
mask = abs(pop - mean) <= 3 * stddev
if mask.all():
break
pop = pop[mask]
if pop.max():
minmaxr = pop.min() / pop.max()
else:
minmaxr = 0.0
if mean + stddev:
disorder = stddev / (mean + stddev)
else:
disorder = 0.0
return PopStats(mean, stddev, minmaxr, disorder)
def _flatten_to_dataframe(nt):
"""Flatten the result of _compute_global_cell_graph_features to the
DataFrame returned by compute_global_cell_graph_features.
"""
from pandas import DataFrame
return DataFrame(_flatten_to_dict(nt), index=[0])
def _flatten_to_dict(nt, prefix=''):
result = {}
assert isinstance(nt, (tuple, dict))
if isinstance(nt, tuple):
d = nt._asdict()
else: # nt is a dict
# We only have numeric keys, and they may be non-nice floats,
# so just number them by sort order instead of doing something
# else for names
d = {str(i): kv[1] for i, kv in enumerate(sorted(nt.items()))}
for k, v in d.items():
if not isinstance(v, (tuple, dict)):
# Terminate
result[prefix + k] = v
else:
result.update(_flatten_to_dict(v, prefix + k + '_'))
return result