Source code for histomicstk.annotations_and_masks.masks_to_annotations_handler

"""
Created on Mon Aug 12 18:33:48 2019.

@author: tageldim

"""

import cv2
import numpy as np
from shapely.geometry.polygon import Polygon

from histomicstk.utils.general_utils import Print_and_log


[docs] def get_contours_from_bin_mask(bin_mask): """Given a binary mask, get opencv contours. Parameters ---------- bin_mask : nd array ground truth mask (m,n) - int32 with [0, 1] values. Returns ------- dict a dictionary with the following keys: - contour group: the actual contour x,y coordinates. - hierarchy: contour hierarchy. This contains information about how contours relate to each other, in the form: [Next, Previous, First_Child, Parent, index_relative_to_contour_group] The last column is added for convenience and is not part of the original opencv output. - outer_contours: index of contours that do not have a parent, and are therefore the outermost most contours. These may have children (holes), however. See docs.opencv.org/3.1.0/d9/d8b/tutorial_py_contours_hierarchy.html for more information. """ # Get contours using openCV. See this for modes: # https://docs.opencv.org/3.1.0/d9/d8b/tutorial_py_contours_hierarchy.html # we use the flag RETR_CCOMP so that we get boundary as well as holes # hierearchy output is: [Next, Previous, First_Child, Parent] ROI_cvuint8 = cv2.convertScaleAbs(bin_mask) cvout = cv2.findContours( ROI_cvuint8, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) if len(cvout) < 3: contour_group, hierarchy = cvout[0], cvout[1] else: contour_group, hierarchy = cvout[1], cvout[2] hierarchy = hierarchy[0, ...] # outermost contours are those that don't have a parent # We'll append an index column to the rightmost end # to keep track of things better relative to contour_group, now it is: # [Next, Previous, First_Child, Parent, index_relative_to_contour_group] hierarchy = np.concatenate((hierarchy, np.arange( hierarchy.shape[0])[..., None]), axis=1) outer_contours = hierarchy[hierarchy[:, 3] == -1, :] conts = { 'contour_group': contour_group, 'hierarchy': hierarchy, 'outer_contours': outer_contours, } return conts
def _add_contour_to_df( contours_df, mask_shape, conts, cidx, nest_info, pad_margin=0, MIN_SIZE=30, MAX_SIZE=None, monitorPrefix=''): """Add single contour to dataframe (Internal).""" # get coordinates for this contour. These are in x,y format. outer_cidx = conts['outer_contours'][cidx, 4] cont_outer = conts['contour_group'][outer_cidx][:, 0, :] if cont_outer.shape[0] <= 3: raise Exception('%s: TOO SIMPLE (%d coordinates) -- IGNORED' % ( monitorPrefix, cont_outer.shape[0])) # Get index of first child (hole) inner_cidx = conts['outer_contours'][cidx, 2] has_holes = 0 + (inner_cidx > -1) # get nest location and size xmin, ymin = np.min(cont_outer, axis=0) nest_width, nest_height = np.max( cont_outer, 0) - np.min(cont_outer, 0) ymax = ymin + nest_height xmax = xmin + nest_width # ignore nests that are too small if (nest_height < MIN_SIZE) or (nest_width < MIN_SIZE): raise Exception('%s: TOO SMALL (%d x %d pixels) -- IGNORED' % ( monitorPrefix, nest_height, nest_width)) # ignore extremely large nests -- THESE MAY CAUSE SEGMENTATION FAULTS if MAX_SIZE is not None: if (nest_height > MAX_SIZE) or (nest_width > MAX_SIZE): raise Exception( '%s: EXTREMELY LARGE NEST (%d x %d pixels) -- IGNORED' % (monitorPrefix, nest_height, nest_width)) # assign bounding box location ridx = contours_df.shape[0] contours_df.loc[ridx, 'group'] = nest_info['group'] contours_df.loc[ridx, 'color'] = nest_info['color'] contours_df.loc[ridx, 'ymin'] = ymin - pad_margin contours_df.loc[ridx, 'ymax'] = ymax - pad_margin contours_df.loc[ridx, 'xmin'] = xmin - pad_margin contours_df.loc[ridx, 'xmax'] = xmax - pad_margin # add other properties -- maybe useful later contours_df.loc[ridx, 'has_holes'] = has_holes contours_df.loc[ridx, 'touches_edge-top'] = 0 + ( ymin - pad_margin - 2 < 0) contours_df.loc[ridx, 'touches_edge-left'] = 0 + ( xmin - pad_margin - 2 < 0) contours_df.loc[ridx, 'touches_edge-bottom'] = 0 + ( ymax + pad_margin + 2 > mask_shape[0]) contours_df.loc[ridx, 'touches_edge-right'] = 0 + ( xmax + pad_margin + 2 > mask_shape[1]) # get x and y coordinates in HTK friendly format (string) cont_outer = conts['contour_group'][outer_cidx][:, 0, :].copy() contours_df.loc[ridx, 'coords_x'] = \ ','.join([str(j - pad_margin) for j in list(cont_outer[:, 0])]) contours_df.loc[ridx, 'coords_y'] = \ ','.join([str(j - pad_margin) for j in list(cont_outer[:, 1])]) return contours_df def _get_contours_df( MASK, GTCodes_df, groups_to_get=None, MIN_SIZE=30, MAX_SIZE=None, verbose=False, monitorPrefix=''): """Parse ground truth mask and gets contours (Internal).""" from pandas import DataFrame cpr = Print_and_log(verbose=verbose) _print = cpr._print # pad with zeros to be able to detect edge contours later pad_margin = 50 pad_value = 0 while (GTCodes_df.GT_code == pad_value).any(): pad_value += 1 MASK = np.pad(MASK, pad_margin, 'constant', constant_values=pad_value) # Go through unique groups one by one -- each group (i.e. GTCode) # is extracted separately by binarizing the multi-class mask if groups_to_get is None: groups_to_get = list(GTCodes_df.index) else: groups_to_get = [ GTCodes_df[GTCodes_df.group == group].head(1).index[0] if (GTCodes_df.group == group).any() else group for group in groups_to_get] contours_df = DataFrame() for nestgroup in groups_to_get: bin_mask = 0 + (MASK == GTCodes_df.loc[nestgroup, 'GT_code']) if bin_mask.sum() < MIN_SIZE * MIN_SIZE: _print('%s: %s: NO OBJECTS!!' % (monitorPrefix, nestgroup)) continue _print('%s: %s: getting contours' % (monitorPrefix, nestgroup)) conts = get_contours_from_bin_mask(bin_mask=bin_mask) n_tumor_nests = conts['outer_contours'].shape[0] # add nest contours _print('%s: %s: adding contours' % (monitorPrefix, nestgroup)) for cidx in range(n_tumor_nests): try: nestcountStr = '%s: nest %s of %s' % ( monitorPrefix, cidx, n_tumor_nests) if cidx % 25 == 100: _print(nestcountStr) contours_df = _add_contour_to_df( contours_df, mask_shape=bin_mask.shape, conts=conts, cidx=cidx, nest_info=dict(GTCodes_df.loc[nestgroup, :]), pad_margin=pad_margin, MIN_SIZE=MIN_SIZE, MAX_SIZE=MAX_SIZE, monitorPrefix=nestcountStr) except Exception as e: _print(e) continue return contours_df def _parse_annot_coords(annot, x_offset=0, y_offset=0): """Get x-, y- coordinates in a list format (Internal).""" coords_x = [int(j) + x_offset for j in annot['coords_x'].split(',')] coords_y = [int(j) + y_offset for j in annot['coords_y'].split(',')] coords = [(coords_x[i], coords_y[i]) for i in range(len(coords_x))] return coords def _discard_nonenclosed_background_group( contours_df, background_group='mostly_stroma', verbose=False, monitorPrefix=''): """If a background group contour is NOT fully enclosed, discard it. This is a purely aesthetic method, makes sure that the background group contours (eg stroma) are discarded by default to avoid cluttering the field when posted to DSA for viewing online. The only exception is if they are enclosed within something else (eg tumor), in which case they are kept since they represent holes. This is related to https://github.com/DigitalSlideArchive/HistomicsTK/issues/675 (Internal). """ cpr = Print_and_log(verbose=verbose) _print = cpr._print # isolate background contours and non-background contours with holes background = contours_df.loc[ contours_df.loc[:, 'group'] == background_group, :] contours_with_holes = contours_df.loc[ contours_df.loc[:, 'group'] != background_group, :] contours_with_holes = contours_with_holes.loc[ contours_with_holes.loc[:, 'has_holes'] == 1, :] def _append_polygon_if_valid(contDict, cid, polygon_list): try: polygon = Polygon(_parse_annot_coords(contDict)) if polygon.is_valid: polygon_list.append(polygon) except Exception as e: _print('%s: contour %d: Shapely Error (below) -- IGNORED!' % ( monitorPrefix, cid)) _print(e) return polygon_list # to avoid redoing things, keep all non-background with holes in a list contour_polygons = [] for cid, cont in contours_with_holes.iterrows(): contour_polygons = _append_polygon_if_valid( dict(cont), cid=cid, polygon_list=contour_polygons) # iterate through stromal polygons and find if enclosed within something discard_cids = [] for cid, cont in background.iterrows(): bck_list = _append_polygon_if_valid( dict(cont), cid=cid, polygon_list=[]) # only keep if enclosed with another contour discard = True if len(bck_list) > 0: for contour_polygon in contour_polygons: if contour_polygon.contains(bck_list[0]): discard = False if discard: discard_cids.append(cid) # now drop unnecessary contours _print('%s: discarded %d contours' % (monitorPrefix, len(discard_cids))) contours_df.drop(discard_cids, axis=0, inplace=True) return contours_df
[docs] def get_contours_from_mask( MASK, GTCodes_df, groups_to_get=None, MIN_SIZE=30, MAX_SIZE=None, get_roi_contour=True, roi_group='roi', discard_nonenclosed_background=False, background_group='mostly_stroma', verbose=False, monitorPrefix=''): """Parse ground truth mask and gets contours for annotations. Parameters ---------- MASK : nd array ground truth mask (m,n) where pixel values encode group membership. GTCodes_df : pandas Dataframe the ground truth codes and information dataframe. This is a dataframe that is indexed by the annotation group name and has the following columns. group: str group name of annotation, eg. mostly_tumor. GT_code: int desired ground truth code (in the mask). Pixels of this value belong to corresponding group (class). color: str rgb format. eg. rgb(255,0,0). groups_to_get : None if None (default) then all groups (ground truth labels) will be extracted. Otherwise pass a list of strings like ['mostly_tumor',]. MIN_SIZE : int minimum bounding box size of contour MAX_SIZE : None if not None, int. Maximum bounding box size of contour. Sometimes very large contours cause segmentation faults that originate from opencv and are not caught by python, causing the python process to unexpectedly hault. If you would like to set a maximum size to defend against this, a suggested maximum would be 15000. get_roi_contour : bool whether to get contour for boundary of region of interest (ROI). This is most relevant when dealing with multiple ROIs per slide and with rotated rectangular or polygonal ROIs. roi_group : str name of roi group in the GT_Codes dataframe (eg roi) discard_nonenclosed_background : bool If a background group contour is NOT fully enclosed, discard it. This is a purely aesthetic method, makes sure that the background group contours (eg stroma) are discarded by default to avoid cluttering the field when posted to DSA for viewing online. The only exception is if they are enclosed within something else (eg tumor), in which case they are kept since they represent holes. This is related to https://github.com/DigitalSlideArchive/HistomicsTK/issues/675 WARNING - This is a bit slower since the contours will have to be converted to shapely polygons. It is not noticeable for hundreds of contours, but you will notice the speed difference if you are parsing thousands of contours. Default, for this reason, is False. background_group : str name of background group in the GT_codes dataframe (eg mostly_stroma) verbose : bool Print progress to screen? monitorPrefix : str text to prepend to printed statements Returns ------- pandas DataFrame contours extracted from input mask. The following columns are output. group : str annotation group (ground truth label). color : str annotation color if it were to be posted to DSA. is_roi : bool whether this annotation is a region of interest boundary ymin : int minimum y coordinate ymax : int maximum y coordinate xmin : int minimum x coordinate xmax : int maximum x coordinate has_holes : bool whether this contour has holes touches_edge-top : bool whether this contour touches top mask edge touches_edge-bottom : bool whether this contour touches bottom mask edge touches_edge-left : bool whether this contour touches left mask edge touches_edge-right : bool whether this contour touches right mask edge coords_x : str vertex x coordinates comma-separated values coords_y vertex y coordinated comma-separated values """ from pandas import concat if MASK.sum() < 3: msg = 'Mask is empty!!' raise Exception(msg) cpr = Print_and_log(verbose=verbose) _print = cpr._print if groups_to_get is not None: _print("""WARNING!! Only specify groups_to_get is you do NOT mind having NO holes in polygons with holes that are occupied by a non-specified group. For example, let's say you specified that you only want to extract contours for tumor and stroma. If there is a large tumor polygon with two holes for stroma and blood vessel, the stroma hole will be accounted for, but not the blood vessel hole when you post these contours to DSA for viewing then pull them to be parse back into mask form. It's a subtle issue related to https://github.com/DigitalSlideArchive/HistomicsTK/issues/675 and will eventually be accounted for once HistomicsTK has an official format to encode polygons with holes.""") cont_kwargs = { 'GTCodes_df': GTCodes_df, 'MIN_SIZE': MIN_SIZE, 'MAX_SIZE': MAX_SIZE, 'verbose': verbose, } # get contours df for non-roi contours contours_df = _get_contours_df( MASK=MASK, groups_to_get=groups_to_get, monitorPrefix='%s: %s' % (monitorPrefix, 'non-roi'), **cont_kwargs) # discard non-enclosed background (eg stroma) if needed if discard_nonenclosed_background: contours_df = _discard_nonenclosed_background_group( contours_df, background_group=background_group, verbose=verbose, monitorPrefix='%s: %s' % (monitorPrefix, 'discarding backgrnd')) # get contours df for roi boundary and concat if get_roi_contour: MASK_BIN = np.zeros(MASK.shape, dtype=np.uint8) MASK_BIN[MASK > 0] = GTCodes_df.loc[roi_group, 'GT_code'] contours_df_roi = _get_contours_df( MASK=MASK_BIN, groups_to_get=[roi_group], monitorPrefix='%s: %s' % (monitorPrefix, roi_group), **cont_kwargs) contours_df = concat( (contours_df_roi, contours_df), axis=0, ignore_index=True) return contours_df
[docs] def get_single_annotation_document_from_contours( contours_df_slice, docname='default', F=1.0, X_OFFSET=0, Y_OFFSET=0, opacity=0.3, lineWidth=4.0, verbose=True, monitorPrefix=''): """Given dataframe of contours, get annotation document. This uses the large_image annotation schema to create an annotation document that maybe posted to DSA for viewing using something like: resp = gc.post("/annotation?itemId=" + slide_id, json=annotation_doc) The annotation schema can be found at: github.com/girder/large_image/blob/master/docs/annotations.md . Parameters ---------- contours_df_slice : pandas DataFrame The following columns are of relevance and must be contained. group : str annotation group (ground truth label). color : str annotation color if it were to be posted to DSA. coords_x : str vertex x coordinates comma-separated values coords_y vertex y coordinated comma-separated values docname : str annotation document name F : float how much smaller is the mask where the contours come from is relative to the slide scan magnification. For example, if the mask is at 10x whereas the slide scan magnification is 20x, then F would be 2.0. X_OFFSET : int x offset to add to contours at BASE (SCAN) magnification Y_OFFSET : int y offset to add to contours at BASE (SCAN) magnification opacity : float opacity of annotation elements (in the range [0, 1]) lineWidth : float width of boarders of annotation elements verbose : bool Print progress to screen? monitorPrefix : str text to prepend to printed statements Returns ------- dict DSA-style annotation document ready to be post for viewing. """ cpr = Print_and_log(verbose=verbose) _print = cpr._print def _get_fillColor(lineColor): fillColor = lineColor.replace('rgb', 'rgba') return fillColor[:fillColor.rfind(')')] + ',%.1f)' % opacity # Init annotation document in DSA style annotation_doc = {'name': docname, 'description': '', 'elements': []} # go through nests nno = 0 nnests = contours_df_slice.shape[0] for _, nest in contours_df_slice.iterrows(): nno += 1 nestStr = '%s: contour %d of %s' % (monitorPrefix, nno, nnests) _print(nestStr) # Parse coordinates try: x_coords = F * np.int32( [int(j) for j in nest['coords_x'].split(',')]) + X_OFFSET y_coords = F * np.int32( [int(j) for j in nest['coords_y'].split(',')]) + Y_OFFSET zeros = np.zeros(x_coords.shape, dtype=np.int32) coords = np.concatenate( (x_coords[:, None], y_coords[:, None], zeros[:, None]), axis=1) coords = coords.tolist() coords.append(coords[0]) except Exception as e: _print('%s: ERROR (below) - moving on!!!' % nestStr) _print(e) continue # assign to annotation style. See: # github.com/girder/large_image/blob/master/docs/annotations.md annotation_style = { 'group': nest['group'], 'type': 'polyline', 'lineColor': nest['color'], 'lineWidth': lineWidth, 'closed': True, 'points': coords, 'label': {'value': nest['label']}, } if opacity > 0: annotation_style['fillColor'] = _get_fillColor(nest['color']) # append to document annotation_doc['elements'].append(annotation_style) return annotation_doc
[docs] def get_annotation_documents_from_contours( contours_df, separate_docs_by_group=True, annots_per_doc=200, annprops=None, docnamePrefix='', verbose=True, monitorPrefix=''): """Given dataframe of contours, get list of annotation documents. This method parses a dataframe of contours to a list of dictionaries, each of which represents and large_image style annotation. This is a wrapper that extends the functionality of the method get_single_annotation_document_from_contours(), whose docstring should be referenced for implementation details and further explanation. Parameters ---------- contours_df : pandas DataFrame WARNING - This is modified inside the function, so pass a copy. This dataframe includes data on contours extracted from input mask using get_contours_from_mask(). If you have contours using some other method, just make sure the dataframe follows the same schema as the output from get_contours_from_mask(). You may find a sample dataframe in the repo at ./tests/test_files/annotations_and_masks/sample_contours_df.tsv. The following columns are relevant for this method. group : str annotation group (ground truth label). color : str annotation color if it were to be posted to DSA. coords_x : str vertex x coordinates comma-separated values coords_y vertex y coordinated comma-separated values separate_docs_by_group : bool if set to True, you get one or more annotation documents (dicts) for each group (eg tumor) independently. annots_per_doc : int maximum number of annotation elements (polygons) per dict. The smaller this number, the more numerous the annotation documents, but the more seamless it is to post this data to the DSA server or to view using the HistomicsTK interface since you will be loading smaller chunks of data at a time. annprops : dict properties of annotation elements. Contains the following keys F, X_OFFSET, Y_OFFSET, opacity, lineWidth. Refer to get_single_annotation_document_from_contours() for details. docnamePrefix : str test to prepend to annotation document name verbose : bool Print progress to screen? monitorPrefix : str text to prepend to printed statements Returns ------- list of dicts DSA-style annotation document. """ if annprops is None: annprops = { 'F': 1.0, 'X_OFFSET': 0, 'Y_OFFSET': 0, 'opacity': 0, 'lineWidth': 4.0, } if separate_docs_by_group: contours_df.loc[:, 'doc_group'] = contours_df.loc[:, 'group'] else: contours_df.loc[:, 'doc_group'] = 'default' if 'label' not in contours_df.columns: contours_df.loc[:, 'label'] = contours_df.loc[:, 'group'] # Each style goes to separate document(s) if sepate_docs_by_group annotation_docs = [] for doc_group in set(contours_df.loc[:, 'doc_group']): # separate annotations with this group contours_df_slice = contours_df.loc[ contours_df.loc[:, 'doc_group'] == doc_group, :] # Add every N annotations to a separate document if contours_df_slice.shape[0] > annots_per_doc: docbounds = list(range( 0, contours_df_slice.shape[0], annots_per_doc)) docbounds[-1] = contours_df_slice.shape[0] else: docbounds = [0, contours_df_slice.shape[0]] for docidx in range(len(docbounds) - 1): docStr = '%s: %s: doc %d of %d' % ( monitorPrefix, doc_group, docidx + 1, len(docbounds) - 1) start = docbounds[docidx] end = docbounds[docidx + 1] annotation_doc = get_single_annotation_document_from_contours( contours_df_slice.iloc[start:end, :], docname='%s_%s-%d' % (docnamePrefix, doc_group, docidx), verbose=verbose, monitorPrefix=docStr, **annprops) if len(annotation_doc['elements']) > 0: annotation_docs.append(annotation_doc) return annotation_docs