From 87353ce6b0fbe3e8e2df6d7074b4d8e9064c85f1 Mon Sep 17 00:00:00 2001
From: David Manthey
Date: Wed, 21 Aug 2024 17:12:11 -0400
Subject: [PATCH] Improve scanning data files for plottable data

This also establishes a correspondence between bounding boxes and
annotation elements.  Reading data via pandas and computing that
correspondence is slower than desired.
---
 CHANGELOG.md          |   6 +
 .../utils/__init__.py | 200 ++++++++++++++++--
 2 files changed, 194 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a4c5065ba..50ec772da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Change Log
 
+## 1.29.7
+
+### Improvements
+
+- Speed up getting plottable data from adjacent items; plot more data ([#1613](../../pull/1613), [#1614](../../pull/1614))
+
 ## 1.29.6
 
 ### Bug Fixes
diff --git a/girder_annotation/girder_large_image_annotation/utils/__init__.py b/girder_annotation/girder_large_image_annotation/utils/__init__.py
index 82f484a29..539eeb860 100644
--- a/girder_annotation/girder_large_image_annotation/utils/__init__.py
+++ b/girder_annotation/girder_large_image_annotation/utils/__init__.py
@@ -1,5 +1,7 @@
+import functools
 import json
 import math
+import os
 import re
 import threading
 
@@ -7,7 +9,48 @@
 
 from girder import logger
 from girder.constants import AccessType, SortDir
+from girder.models.file import File
 from girder.models.folder import Folder
+from girder.models.item import Item
+
+dataFileExtReaders = {
+    '.csv': 'read_csv',
+    'text/csv': 'read_csv',
+    '.xls': 'read_excel',
+    '.xlsx': 'read_excel',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'read_excel',
+    'application/vnd.ms-excel': 'read_excel',
+    'application/msexcel': 'read_excel',
+    'application/x-msexcel': 'read_excel',
+    'application/x-ms-excel': 'read_excel',
+    'application/x-excel': 'read_excel',
+    'application/x-dos_ms_excel': 'read_excel',
+    'application/xls': 'read_excel',
+    'application/x-xls': 'read_excel',
+}
+scanDatafileRecords = 50
+
+
+@functools.lru_cache(maxsize=100)
+def _dfFromFile(fileid, full=False):
+    import pandas as pd
+
+    file = File().load(fileid, force=True)
+    ext = os.path.splitext(file['name'])[1]
+    reader = dataFileExtReaders.get(
+        ext, dataFileExtReaders.get(file.get('mimeType'), None))
+    if reader == 'read_excel':
+        df = getattr(pd, reader)(File().open(file), sheet_name=None)
+    else:
+        df = {'entry': getattr(pd, reader)(File().open(file))}
+    df = {
+        k: sheet.iloc[:None if full else scanDatafileRecords].to_dict('records')
+        for k, sheet in df.items()}
+    logger.info(f'Read {len(df)} x {len(next(iter(df.values())))} values from '
+                f'{file["name"]} {file["size"]}')
+    if len(df) == 1:
+        df = next(iter(df.values()))
+    return df
 
 
 class AnnotationGeoJSON:
@@ -378,6 +421,7 @@ def __init__(self, user, item, annotations=None, adjacentItems=False, sources=No
         self._fullScan = adjacentItems == '__all__'
         self._findItems(item, adjacentItems)
         self._findAnnotations(annotations)
+        self._findDataFiles()
         self._dataLock = threading.RLock()
 
     def _findItems(self, item, adjacentItems=False):
@@ -452,6 +496,53 @@ def _findAnnotations(self, annotations):
                     annotList[names[annot['annotation']['name']]] = annot
             self.annotations.append(annotList)
 
+    def _findDataFiles(self):  # noqa
+        """
+        Find data files inside the current item.  For adjacent items, the data
+        file must have the same name or, if the found file is prefixed with
+        the item name excluding the extension, then the adjacent file should
+        be similarly prefixed.  Data files must have a known suffix or a known
+        mimetype that can be read by pandas (and pandas must be installed).
+        """
+        self._itemfilelist = [[]] * len(self.items)
+        try:
+            import pandas as pd  # noqa
+        except Exception:
+            return
+        if self._sources and 'datafile' not in self._sources:
+            return
+        names0 = {}
+        for iidx, item in enumerate(self.items):
+            if iidx:
+                self._itemfilelist[iidx] = [None] * len(self._itemfilelist[0])
+            names = {}
+            for file in Item().childFiles(item):
+                try:
+                    if (file['_id'] == self.item['largeImage']['fileId'] or
+                            file['_id'] == self.item['largeImage'].get('originalId')):
+                        continue
+                except Exception:
+                    continue
+                ext = os.path.splitext(file['name'])[1]
+                if (ext not in dataFileExtReaders and
+                        file.get('mimeType') not in dataFileExtReaders):
+                    continue
+                if file['name'].startswith(item['name'].rsplit('.')[0]):
+                    base, name = True, file['name'][len(item['name'].rsplit('.')[0]):]
+                else:
+                    base, name = False, file['name']
+                if (base, name) in names:
+                    continue
+                if iidx and (base, name) not in names0:
+                    continue
+                names[(base, name)] = len(names)
+                if not iidx:
+                    self._itemfilelist[0].append(file)
+                else:
+                    self._itemfilelist[iidx][names0[(base, name)]] = file
+            if not iidx:
+                names0 = names
+
     # Common column keys and titles
     commonColumns = {
         'item.id': 'Item ID',
@@ -506,6 +597,23 @@ def itemIDSelector(record, data, row):
 
         return itemNameSelector if isName else itemIDSelector
 
+    def datafileAnnotationElementSelector(self, key, cols):
+
+        def annotationElementSelector(record, data, row):
+            bbox = [col[1](record, data, row) for col in cols]
+            if key in self._datacolumns:
+                for row in self._datacolumns[key]:
+                    if self._datacolumns[key][row] is not None:
+                        for bidx, bkey in enumerate(['bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']):
+                            val = self._datacolumns[bkey].get(row)
+                            if val is None or abs(val - bbox[bidx]) > 2:
+                                break
+                        else:
+                            return self._datacolumns[key][row]
+            return None
+
+        return annotationElementSelector
+
     @staticmethod
     def keySelector(mode, key, key2=None):
         """
@@ -585,6 +693,13 @@ def annotationelementGetData(record):
 
             return annotationelementGetData
 
+        if doctype == 'datafile':
+
+            def datafileGetData(record):
+                return record
+
+            return datafileGetData
+
         def getData(record):
             return record.get('meta', {})
 
@@ -610,11 +725,12 @@ def _keysToColumns(self, columns, parts, doctype, getData, selector, length):
         title = ' '.join(str(v) for v in parts[1:] if v != '0')
         keymap = {
             r'(?i)(item|image)_(id|name)$': 'item.name',
-            r'(?i)(low|min)(_|)x': 'bbox.x0',
-            r'(?i)(low|min)(_|)y': 'bbox.y0',
-            r'(?i)(high|max)(_|)x': 'bbox.x1',
-            r'(?i)(high|max)(_|)y': 'bbox.y1',
+            r'(?i)((low|min)(_|)x|^x1$)': 'bbox.x0',
+            r'(?i)((low|min)(_|)y|^y1$)': 'bbox.y0',
+            r'(?i)((high|max)(_|)x|^x2$)': 'bbox.x1',
+            r'(?i)((high|max)(_|)y|^y2$)': 'bbox.y1',
         }
+        match = False
         for k, v in keymap.items():
             if re.match(k, lastpart):
                 if lastpart != parts[1]:
@@ -630,8 +746,24 @@ def _keysToColumns(self, columns, parts, doctype, getData, selector, length):
                         doctype, getData, self.itemNameIDSelector(False, selector), length)
                     return
+                match = True
                 break
-        self._ensureColumn(
+        added = self._ensureColumn(
             columns, key, title, doctype, getData, selector, length)
+        if match and added and key.startswith('bbox'):
+            cols = [columns[bkey]['where'][doctype] for bkey in [
+                'bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']
+                if bkey in columns and doctype in columns[bkey]['where']]
+            if len(cols) == 4:
+                # If we load all of these from annotation elements, use all
+                # three keys:
+                # for akey in {'annotation.id', 'annotation.name', 'annotationelement.id'}:
+                for akey in {'annotationelement.id'}:
+                    if self._datacolumns and akey in self._datacolumns:
+                        self._requiredColumns.add(akey)
+                        self._ensureColumn(
+                            columns, akey, self.commonColumns[akey], doctype,
+                            getData, self.datafileAnnotationElementSelector(akey, cols),
+                            length)
 
     def _ensureColumn(self, columns, keyname, title, doctype, getData, selector, length):
         """
@@ -649,6 +781,7 @@ def _ensureColumn(self, columns, keyname, title, doctype, getData, selector, len
         dictionary, and row, returns a value.
         :param length: None or a function that, given the document record
             and data dictionary, returns the number of rows.
+        :returns: True if the column's where record was added.
         """
         if keyname not in columns:
             columns[keyname] = {
@@ -663,6 +796,8 @@ def _ensureColumn(self, columns, keyname, title, doctype, getData, selector, len
         }
         if doctype not in columns[keyname]['where']:
             columns[keyname]['where'][doctype] = (getData, selector, length)
+            return True
+        return False
 
     def _columnsFromData(self, columns, doctype, getData, record):  # noqa
         """
@@ -780,7 +915,7 @@ def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
         Collect statistics and possibly row values from a list of records.
 
         :param columns: the column dictionary to possibly modify.
-        :param recordList: a list of records to use.
+        :param recordlist: a list of records to use.
         :param doctype: the base document type.
         :param iid: an optional item id to use for determining distinct rows.
         :param aid: an optional annotation id to use for determining distinct
@@ -796,7 +931,7 @@ def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
             if self._datacolumns and colkey not in self._datacolumns:
                 continue
             for where, (getData, selector, length) in col['where'].items():
-                if doctype != where.split('.', 1)[0]:
+                if doctype != where and not where.startswith(doctype + '.'):
                     continue
                 for recidx, record in enumerate(recordlist):
                     if doctype == 'item':
@@ -820,7 +955,7 @@ def _collectColumns(self, columns, recordlist, doctype, first=True, iid='', aid=
         Collect the columns available for a set of records.
 
         :param columns: the column dictionary to possibly modify.
-        :param recordList: a list of records to use.
+        :param recordlist: a list of records to use.
         :param doctype: the base document type.
         :param first: False if this is not the first page of a multi-page
             list of records,
@@ -832,7 +967,7 @@ def _collectColumns(self, columns, recordlist, doctype, first=True, iid='', aid=
             If no required fields were specified, this will be the count of
             all added data entries.
         """
-        getData = self.recordSelector(doctype)
+        getData = self.recordSelector(doctype.split('.', 1)[0])
         if doctype == 'item':
             self._commonColumn(columns, 'item.id', doctype, getData,
                                lambda record, data, row: str(record['_id']))
@@ -881,7 +1016,7 @@ def _getColumnsFromAnnotations(self, columns):
             iid = str(self.items[iidx]['_id'])
             for anidx, annot in enumerate(annotList):
                 # If the first item's annotation didn't contribute any required
-                # data to the data set, skip subsequent item's annotations;
+                # data to the data set, skip subsequent items' annotations;
                 # they are likely to be discarded.
                 if iidx and not countsPerAnnotation.get(anidx, 0) and not self._fullScan:
                     continue
@@ -900,6 +1035,38 @@ def _getColumnsFromAnnotations(self, columns):
                 countsPerAnnotation[anidx] = count - startcount
         return count
 
+    def _getColumnsFromDataFiles(self, columns):
+        """
+        Collect columns and data from data files in items.
+ """ + if not len(self._itemfilelist) or not len(self._itemfilelist[0]): + return 0 + count = 0 + countsPerDataFile = {} + for iidx, dfList in enumerate(self._itemfilelist or []): + iid = str(self.items[iidx]['_id']) + for dfidx, file in enumerate(dfList): + # If the first item's data file didn't contribute any required + # data to the data set, skip subsequent items' data files; + # they are likely to be discarded. + if iidx and not countsPerDataFile.get(dfidx, 0) and not self._fullScan: + continue + startcount = count + if file is None: + continue + if not self._sources or 'datafile' in self._sources: + try: + df = _dfFromFile(file['_id'], bool(self._datacolumns or self._fullScan)) + count += self._collectColumns( + columns, [df] if isinstance(df, dict) else df, + f'datafile.{dfidx}', iid=iid) + except Exception: + logger.info( + f'Cannot process file {file["_id"]}: {file["name"]} as a dataframe') + raise + if not iidx: + countsPerDataFile[dfidx] = count - startcount + return count + def _getColumns(self): """ Get a sorted list of plottable columns with some metadata for each. @@ -917,7 +1085,7 @@ def _getColumns(self): self.folder, offset=len(self.items), **self._moreItems): count += self._collectColumns(columns, [item], 'item', first=False) count += self._getColumnsFromAnnotations(columns) - # TODO: Add csv + count += self._getColumnsFromDataFiles(columns) for result in columns.values(): if len(result['distinct']) <= self.maxDistinct: result['distinct'] = sorted(result['distinct']) @@ -1015,7 +1183,7 @@ def data(self, columns, requiredColumns=None): if not isinstance(requiredColumns, list): requiredColumns = requiredColumns.split(',') if requiredColumns is not None else [] requiredColumns = set(requiredColumns) - self._requiredColumns = requiredColumns + self._requiredColumns = set(requiredColumns) with self._dataLock: self._datacolumns = {c: {} for c in columns} rows = set() @@ -1038,6 +1206,16 @@ def data(self, columns, requiredColumns=None): if len(data) < numrows: logger.info(f'Reduced row count from {numrows} to {len(data)} ' f'because of None values in column {colkey}') + subdata = data + for cidx, col in enumerate(colsout): + colkey = col['key'] + numrows = len(data) + if colkey in self._requiredColumns and colkey not in requiredColumns: + subdata = [row for row in subdata if row[cidx] is not None] + if len(subdata) and len(subdata) < len(data): + logger.info(f'Reduced row count from {len(data)} to {len(subdata)} ' + f'because of None values in implied columns') + data = subdata # Refresh our count, distinct, distinctcount, min, max for each column for cidx, col in enumerate(colsout): col['count'] = len([row[cidx] for row in data if row[cidx] is not None])