Source code for histomicstk.annotations_and_masks.annotation_database_parser

"""
Created on Thu Dec 12 13:19:18 2019

@author: tageldim
"""
import copy
import json
import os

from sqlalchemy import create_engine
from sqlalchemy.types import Boolean, Integer, String

from histomicstk.annotations_and_masks.annotation_and_mask_utils import \
    parse_slide_annotations_into_tables
from histomicstk.utils.girder_convenience_utils import \
    get_absolute_girder_folderpath
from histomicstk.workflows.workflow_runner import (Slide_iterator,
                                                   Workflow_runner)

# Helper functions


def _add_item_to_sqlite(dbcon, item):
    """Append one girder item record to the 'items' table in sqlite.

    Parameters
    ----------
    dbcon : sqlalchemy connection
        open connection to the sqlite database
    item : dict
        girder response with item information

    """
    from pandas import DataFrame

    # deep-copy so the caller's dict is never mutated
    record = copy.deepcopy(item)
    # largeImage is a nested dict; flatten to its string repr for storage
    record['largeImage'] = str(record['largeImage'])

    # column -> sql type schema for the items table
    schema = {
        '_id': String(),
        '_modelType': String(),
        'baseParentId': String(),
        'baseParentType': String(),
        'copyOfItem': String(),
        'created': String(),
        'creatorId': String(),
        'description': String(),
        'folderId': String(),
        'largeImage': String(),
        'name': String(),
        'size': Integer(),
        'updated': String(),
    }

    # silently drop any keys that are not part of the schema
    record = {key: value for key, value in record.items() if key in schema}

    # single-row dataframe, appended to the items table
    row = DataFrame.from_dict(record, orient='index').T
    row.to_sql(
        name='items', con=dbcon, if_exists='append',
        dtype=schema, index=False)


def _add_folder_to_sqlite(dbcon, folder_info):
    """Append one girder folder record to the 'folders' table in sqlite.

    Parameters
    ----------
    dbcon : sqlalchemy connection
        open connection to the sqlite database
    folder_info : dict
        girder response with folder information

    """
    from pandas import DataFrame

    # column -> sql type schema for the folders table
    schema = {
        '_accessLevel': Integer(),
        '_id': String(),
        '_modelType': String(),
        'baseParentId': String(),
        'baseParentType': String(),
        'created': String(),
        'creatorId': String(),
        'description': String(),
        'name': String(),
        'parentCollection': String(),
        'parentId': String(),
        'public': Boolean(),
        'size': Integer(),
        'updated': String(),
        'folder_path': String(),
    }

    # silently drop any keys that are not part of the schema
    folder_info = {
        key: value for key, value in folder_info.items() if key in schema}

    # single-row dataframe, appended to the folders table
    row = DataFrame.from_dict(folder_info, orient='index').T
    row.to_sql(
        name='folders', con=dbcon, if_exists='append',
        dtype=schema, index=False)


def _add_annotation_docs_to_sqlite(dbcon, annotation_docs, item):
    """Append annotation documents for one slide to sqlite.

    Parameters
    ----------
    dbcon : sqlalchemy connection
        open connection to the sqlite database
    annotation_docs : pandas.DataFrame
        tabular annotation documents, one row per annotation document
    item : dict
        girder response with item information; its name is attached to
        every row for convenience

    """
    # record the slide (item) name on every document row
    annotation_docs.loc[:, 'item_name'] = item['name']

    # column -> sql type schema for the annotation_docs table
    schema = {
        'annotation_girder_id': String(),
        '_modelType': String(),
        '_version': Integer(),
        'itemId': String(),
        'item_name': String(),
        'created': String(),
        'creatorId': String(),
        'public': Boolean(),
        'updated': String(),
        'updatedId': String(),
        'groups': String(),
        'element_count': Integer(),
        'element_details': Integer(),
    }

    annotation_docs.to_sql(
        name='annotation_docs', con=dbcon, if_exists='append',
        dtype=schema, index=False)


def _add_annotation_elements_to_sqlite(dbcon, annotation_elements):
    """Append annotation elements (polygons etc.) for one slide to sqlite.

    Parameters
    ----------
    dbcon : sqlalchemy connection
        open connection to the sqlite database
    annotation_elements : pandas.DataFrame
        tabular annotation elements; NOTE: modified in place (the
        'annidx' and 'elementidx' columns are dropped)

    """
    # The positional indices are arbitrary -- they would differ if the
    # same girder client fetched the annotations twice. The girder ID
    # string is the stable identifier, so drop the positional columns.
    # (in-place drop is intentional and visible to the caller)
    annotation_elements.drop(
        labels=['annidx', 'elementidx'], axis=1, inplace=True)

    # column -> sql type schema for the annotation_elements table
    schema = {
        'annotation_girder_id': String(),
        'element_girder_id': String(),
        'type': String(),
        'group': String(),
        'label': String(),
        'color': String(),
        'xmin': Integer(),
        'xmax': Integer(),
        'ymin': Integer(),
        'ymax': Integer(),
        'bbox_area': Integer(),
        'coords_x': String(),
        'coords_y': String(),
    }

    annotation_elements.to_sql(
        name='annotation_elements', con=dbcon, if_exists='append',
        dtype=schema, index=False)


def parse_annotations_to_local_tables(
        item, annotations, local, monitorPrefix='',
        save_csv=True, save_sqlite=False, dbcon=None):
    """Parse loaded annotations for slide into tables.

    Parameters
    ----------
    item : dict
        girder response with item information
    annotations : dict
        loaded annotations
    local : str
        local directory
    monitorPrefix : str
        text to prepend to printed statements
    save_csv : bool
        whether to use
        histomicstk.annotations_and_masks.annotation_and_mask_utils.
        parse_slide_annotations_into_tables() to get a tabular
        representation (including some simple calculations like bounding
        box) and save the output as two csv files, one representing the
        annotation documents and the other representing the actual
        annotation elements (polygons).
    save_sqlite : bool
        whether to save the backup into an sqlite database
    dbcon : sqlalchemy.create_engine.connect() object
        IGNORE THIS PARAMETER!! This is used internally.

    """
    print('%s: parse to tables' % monitorPrefix)

    savepath_base = os.path.join(local, item['name'])
    annotation_docs, annotation_elements = \
        parse_slide_annotations_into_tables(annotations)

    # tabular representation as csv files next to the slide
    if save_csv:
        annotation_docs.to_csv(savepath_base + '_docs.csv')
        annotation_elements.to_csv(savepath_base + '_elements.csv')

    # same tables appended to the sqlite backup
    if save_sqlite:
        assert dbcon is not None, 'You must connect to database first!'
        _add_annotation_docs_to_sqlite(dbcon, annotation_docs, item)
        _add_annotation_elements_to_sqlite(dbcon, annotation_elements)
# Workflow at a single slide level
def dump_annotations_workflow(
        gc, slide_id, local, monitorPrefix='',
        save_json=True, save_sqlite=False, dbcon=None,
        callback=None, callback_kwargs=None):
    """Dump annotations for single slide into the local folder.

    Parameters
    ----------
    gc : girder_client.GirderClient
        authenticated girder client instance
    slide_id : str
        girder id of item (slide)
    local : str
        local path to dump annotations
    monitorPrefix : str
        prefix to monitor string
    save_json : bool
        whether to dump annotations as json file
    save_sqlite : bool
        whether to save the backup into an sqlite database
    dbcon : sqlalchemy.create_engine.connect() object
        IGNORE THIS PARAMETER!! This is used internally.
    callback : function
        function to call that takes in AT LEAST the following params
        - item: girder response with item information
        - annotations: loaded annotations
        - local: local directory
        - monitorPrefix: string
    callback_kwargs : dict
        kwargs to pass along to callback

    """
    callback_kwargs = callback_kwargs or {}
    # best-effort per slide: a failure on one slide is printed and the
    # workflow moves on to the next one
    try:
        item = gc.get('/item/%s' % slide_id)
        savepath_base = os.path.join(local, item['name'])

        # dump item information json
        if save_json:
            print('%s: save item info' % monitorPrefix)
            with open(savepath_base + '.json', 'w') as fout:
                json.dump(item, fout)

        # save item info to sqlite
        if save_sqlite:
            _add_item_to_sqlite(dbcon, item)

        # pull annotation
        print('%s: load annotations' % monitorPrefix)
        annotations = gc.get('/annotation/item/' + item['_id'])

        if annotations is not None:

            # dump annotations to JSON in local folder
            if save_json:
                print('%s: save annotations' % monitorPrefix)
                with open(savepath_base + '_annotations.json', 'w') as fout:
                    json.dump(annotations, fout)

            # run callback
            if callback is not None:
                print('%s: run callback' % monitorPrefix)
                callback(
                    item=item, annotations=annotations, local=local,
                    dbcon=dbcon, monitorPrefix=monitorPrefix,
                    **callback_kwargs)

    except Exception as e:
        print(str(e))
# Main method
def dump_annotations_locally(
        gc, folderid, local, save_json=True,
        save_sqlite=False, dbcon=None,
        callback=None, callback_kwargs=None):
    """Dump annotations of folder and subfolders locally recursively.

    This reproduces this tiered structure locally and (possibly) dumps
    annotations there. Adapted from Lee A.D. Cooper.

    Parameters
    ----------
    gc : girder_client.GirderClient
        authenticated girder client instance
    folderid : str
        girder id of source (base) folder
    local : str
        local path to dump annotations
    save_json : bool
        whether to dump annotations as json file
    save_sqlite : bool
        whether to save the backup into an sqlite database
    dbcon : sqlalchemy.create_engine.connect() object
        IGNORE THIS PARAMETER!! This is used internally.
    callback : function
        function to call that CAN accept AT LEAST the following params
        - item: girder response with item information
        - annotations: loaded annotations
        - local: local directory
        - monitorPrefix: string
        - dbcon: sqlalchemy.create_engine.connect() object
        You can just add kwargs at the end of your callback definition
        for simplicity.
    callback_kwargs : dict
        kwargs to pass along to callback. DO NOT pass any of the
        parameters item, annotations, local, monitorPrefix, or dbcon as
        these will be internally passed. Just include any specific
        parameters for the callback. See
        parse_annotations_to_local_tables() above for an example of a
        callback and the unit test of this function.

    """
    callback_kwargs = callback_kwargs or {}
    assert save_json or save_sqlite, 'must save results somehow!'
    monitor = os.path.basename(local)

    # get folder info
    folder_info = gc.get('folder/%s' % folderid)
    folder_info['folder_path'] = get_absolute_girder_folderpath(
        gc=gc, folder_info=folder_info)

    # connect to sqlite database -- only first stack does this
    if save_sqlite and (dbcon is None):
        db_path = os.path.join(local, folder_info['name'] + '.sqlite')
        sql_engine = create_engine('sqlite:///' + db_path, echo=False)
        dbcon = sql_engine.connect()

    # save folder information json
    if save_json:
        print('%s: save folder info' % monitor)
        savepath = os.path.join(local, folder_info['name'] + '.json')
        with open(savepath, 'w') as fout:
            json.dump(folder_info, fout)

    # save folder info to sqlite
    if save_sqlite:
        _add_folder_to_sqlite(dbcon, folder_info)

    # pull annotations for each slide in folder
    workflow_runner = Workflow_runner(
        slide_iterator=Slide_iterator(
            gc, source_folder_id=folderid, keep_slides=None,
        ),
        workflow=dump_annotations_workflow,
        workflow_kwargs={
            'gc': gc,
            'local': local,
            'save_json': save_json,
            'save_sqlite': save_sqlite,
            'dbcon': dbcon,
            'callback': callback,
            'callback_kwargs': callback_kwargs,
        },
        monitorPrefix=monitor)
    workflow_runner.run()

    # for each subfolder, create a new folder locally and call self
    for folder in gc.listFolder(parentId=folderid):

        # create folder in local
        new_folder = os.path.join(local, folder['name'])
        os.mkdir(new_folder)

        # call self with same parameters
        dump_annotations_locally(
            gc=gc, folderid=folder['_id'], local=new_folder,
            save_json=save_json, save_sqlite=save_sqlite, dbcon=dbcon,
            callback=callback, callback_kwargs=callback_kwargs)