Improve scanning data file for plottable data #1614

Merged: 1 commit, Aug 22, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Change Log

## 1.29.7

### Improvements

- Speed up getting plottable data from adjacent items; plot more data ([#1613](../../pull/1613), [#1614](../../pull/1614))

## 1.29.6

### Bug Fixes
202 changes: 190 additions & 12 deletions girder_annotation/girder_large_image_annotation/utils/__init__.py
@@ -1,13 +1,56 @@
import functools
import json
import math
import os
import re
import threading

from bson.objectid import ObjectId

from girder import logger
from girder.constants import AccessType, SortDir
from girder.models.file import File
from girder.models.folder import Folder
from girder.models.item import Item

dataFileExtReaders = {
'.csv': 'read_csv',
'text/csv': 'read_csv',
'.xls': 'read_excel',
'.xlsx': 'read_excel',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'read_excel',
'application/vnd.ms-excel': 'read_excel',
'application/msexcel': 'read_excel',
'application/x-msexcel': 'read_excel',
'application/x-ms-excel': 'read_excel',
'application/x-excel': 'read_excel',
'application/x-dos_ms_excel': 'read_excel',
'application/xls': 'read_excel',
'application/x-xls': 'read_excel',
}
scanDatafileRecords = 50


@functools.lru_cache(maxsize=100)
def _dfFromFile(fileid, full=False):
import pandas as pd

file = File().load(fileid, force=True)
ext = os.path.splitext(file['name'])[1]
reader = dataFileExtReaders.get(
ext, dataFileExtReaders.get(file.get('mimeType'), None))
if reader == 'read_excel':
df = getattr(pd, reader)(File().open(file), sheet_name=None)
else:
df = {'entry': getattr(pd, reader)(File().open(file))}
df = {
k: sheet.iloc[:None if full else scanDatafileRecords].to_dict('records')
for k, sheet in df.items()}
logger.info(f'Read {len(df)} x {len(next(iter(df.values())))} values from '
f'{file["name"]} {file["size"]}')
if len(df) == 1:
df = next(iter(df.values()))
return df
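# A minimal sketch of the reader lookup above (hypothetical file records and
# a hypothetical _readerFor helper; the extension is checked first, then the
# mimetype):
#
#   def _readerFor(file):
#       ext = os.path.splitext(file['name'])[1]
#       return dataFileExtReaders.get(
#           ext, dataFileExtReaders.get(file.get('mimeType'), None))
#
#   _readerFor({'name': 'cells.csv'})                              # 'read_csv'
#   _readerFor({'name': 'counts', 'mimeType': 'application/xls'})  # 'read_excel'
#
# Multi-sheet Excel files are read with sheet_name=None, yielding a dict of
# sheet name -> rows; a single sheet collapses to the bare list of row dicts.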


class AnnotationGeoJSON:
@@ -378,6 +421,7 @@ def __init__(self, user, item, annotations=None, adjacentItems=False, sources=No
self._fullScan = adjacentItems == '__all__'
self._findItems(item, adjacentItems)
self._findAnnotations(annotations)
self._findDataFiles()
self._dataLock = threading.RLock()

def _findItems(self, item, adjacentItems=False):
@@ -452,6 +496,53 @@ def _findAnnotations(self, annotations):
annotList[names[annot['annotation']['name']]] = annot
self.annotations.append(annotList)

def _findDataFiles(self): # noqa
"""
Find data files inside the current item. For adjacent items, the data
file must have the same name or, if the file found on the primary item
is prefixed with that item's name (excluding the extension), the
adjacent item's file must be similarly prefixed. Data files must have
a known suffix or a known mimetype that pandas can read (and pandas
must be installed).
"""
self._itemfilelist = [[]] * len(self.items)
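# Note: the inner list created by [[]] * len(...) is shared across all
# indexes, which is benign here because every index other than 0 is
# reassigned to a fresh list below before it is used.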
try:
import pandas as pd # noqa
except Exception:
return
if self._sources and 'filedata' not in self._sources:
return
names0 = {}
for iidx, item in enumerate(self.items):
if iidx:
self._itemfilelist[iidx] = [None] * len(self._itemfilelist[0])
names = {}
for file in Item().childFiles(item):
try:
if (file['_id'] == self.item['largeImage']['fileId'] or
file['_id'] == self.item['largeImage'].get('originalId')):
continue
except Exception:
continue
ext = os.path.splitext(file['name'])[1]
if (ext not in dataFileExtReaders and
file.get('mimeType') not in dataFileExtReaders):
continue
if file['name'].startswith(item['name'].rsplit('.')[0]):
base, name = True, file['name'][len(item['name'].rsplit('.')[0]):]
else:
base, name = False, file['name']
if (base, name) in names:
continue
if iidx and (base, name) not in names0:
continue
names[(base, name)] = len(names)
if not iidx:
self._itemfilelist[0].append(file)
else:
self._itemfilelist[iidx][names0[(base, name)]] = file
if not iidx:
names0 = names
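# A sketch of the (base, name) keys computed above, with hypothetical
# names and a primary item 'slide1.svs':
#
#   prefix = 'slide1.svs'.rsplit('.')[0]          # 'slide1'
#   'slide1_cells.csv' -> (True, '_cells.csv')    # prefix-relative match
#   'counts.xlsx'      -> (False, 'counts.xlsx')  # exact-name match
#
# An adjacent item 'slide2.svs' pairs with the first file via
# 'slide2_cells.csv' and with the second only via an identically named
# 'counts.xlsx'.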

# Common column keys and titles
commonColumns = {
'item.id': 'Item ID',
@@ -506,6 +597,23 @@ def itemIDSelector(record, data, row):

return itemNameSelector if isName else itemIDSelector

def datafileAnnotationElementSelector(self, key, cols):

def annotationElementSelector(record, data, row):
bbox = [col[1](record, data, row) for col in cols]
if key in self._datacolumns:
for row in self._datacolumns[key]:
if self._datacolumns[key][row] is not None:
for bidx, bkey in enumerate(['bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']):
val = self._datacolumns[bkey].get(row)
if val is None or abs(val - bbox[bidx]) > 2:
break
else:
return self._datacolumns[key][row]
return None

return annotationElementSelector
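# A sketch of the match rule above (hypothetical coordinates): a data-file
# row with bbox (100, 200, 150, 250) links to an annotation element only
# when each of its stored bbox.x0/y0/x1/y1 values differs by at most 2,
# so (101.5, 199, 150, 251.2) matches while (103, 200, 150, 250) does not.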

@staticmethod
def keySelector(mode, key, key2=None):
"""
@@ -585,6 +693,13 @@ def annotationelementGetData(record):

return annotationelementGetData

if doctype == 'datafile':

def datafileGetData(record):
return record

return datafileGetData

def getData(record):
return record.get('meta', {})

@@ -610,11 +725,12 @@ def _keysToColumns(self, columns, parts, doctype, getData, selector, length):
title = ' '.join(str(v) for v in parts[1:] if v != '0')
keymap = {
r'(?i)(item|image)_(id|name)$': 'item.name',
r'(?i)(low|min)(_|)x': 'bbox.x0',
r'(?i)(low|min)(_|)y': 'bbox.y0',
r'(?i)(high|max)(_|)x': 'bbox.x1',
r'(?i)(high|max)(_|)y': 'bbox.y1',
r'(?i)((low|min)(_|)x|^x1$)': 'bbox.x0',
r'(?i)((low|min)(_|)y|^y1$)': 'bbox.y0',
r'(?i)((high|max)(_|)x|^x2$)': 'bbox.x1',
r'(?i)((high|max)(_|)y|^y2$)': 'bbox.y1',
}
match = False
for k, v in keymap.items():
if re.match(k, lastpart):
if lastpart != parts[1]:
@@ -630,9 +746,25 @@ def _keysToColumns(self, columns, parts, doctype, getData, selector, length):
doctype, getData,
self.itemNameIDSelector(False, selector), length)
return
match = True
break
self._ensureColumn(
added = self._ensureColumn(
columns, key, title, doctype, getData, selector, length)
if match and added and key.startswith('bbox'):
cols = [columns[bkey]['where'][doctype] for bkey in [
'bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']
if bkey in columns and doctype in columns[bkey]['where']]
if len(cols) == 4:
# If we load all of these from annotation elements, use all
# three keys:
# for akey in {'annotation.id', 'annotation.name', 'annotationelement.id'}:
for akey in {'annotationelement.id'}:
if self._datacolumns and akey in self._datacolumns:
self._requiredColumns.add(akey)
self._ensureColumn(
columns, akey, self.commonColumns[akey], doctype,
getData, self.datafileAnnotationElementSelector(akey, cols),
length)
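# A sketch of the header normalization above (hypothetical CSV headers):
#
#   'Min_X'      -> 'bbox.x0'      'x1' -> 'bbox.x0'
#   'maxy'       -> 'bbox.y1'      'y2' -> 'bbox.y1'
#   'Image_Name' -> 'item.name'
#
# Note the deliberate offset from a common CSV convention: x1/y1 map to
# the low corner ('bbox.x0'/'bbox.y0') and x2/y2 to the high corner.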

def _ensureColumn(self, columns, keyname, title, doctype, getData, selector, length):
"""
@@ -649,6 +781,7 @@ def _ensureColumn(columns, keyname, title, doctype, getData, selector, len
dictionary, and row, returns a value.
:param length: None or a function that, given the document record and
data dictionary, returns the number of rows.
:returns: True if a `where` record was added for this doctype.
"""
if keyname not in columns:
columns[keyname] = {
@@ -663,6 +796,8 @@ def _ensureColumn(columns, keyname, title, doctype, getData, selector, len
}
if doctype not in columns[keyname]['where']:
columns[keyname]['where'][doctype] = (getData, selector, length)
return True
return False

def _columnsFromData(self, columns, doctype, getData, record): # noqa
"""
@@ -780,7 +915,7 @@ def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
Collect statistics and possibly row values from a list of records.

:param columns: the column dictionary to possibly modify.
:param recordList: a list of records to use.
:param recordlist: a list of records to use.
:param doctype: the base document type.
:param iid: an optional item id to use for determining distinct rows.
:param aid: an optional annotation id to use for determining distinct
@@ -796,7 +931,7 @@ def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
if self._datacolumns and colkey not in self._datacolumns:
continue
for where, (getData, selector, length) in col['where'].items():
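# Match a where key that equals this doctype or is a dotted
# child of it (e.g. doctype 'datafile' matches 'datafile.0').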
if doctype != where.split('.', 1)[0]:
if doctype != where and not where.startswith(doctype + '.'):
continue
for recidx, record in enumerate(recordlist):
if doctype == 'item':
@@ -820,7 +955,7 @@ def _collectColumns(self, columns, recordlist, doctype, first=True, iid='', aid=
Collect the columns available for a set of records.

:param columns: the column dictionary to possibly modify.
:param recordList: a list of records to use.
:param recordlist: a list of records to use.
:param doctype: the base document type.
:param first: False if this is not the first page of a multi-page list
of records,
@@ -832,7 +967,7 @@ def _collectColumns(self, columns, recordlist, doctype, first=True, iid='', aid=
If no required fields were specified, this will be the count of all
added data entries.
"""
getData = self.recordSelector(doctype)
getData = self.recordSelector(doctype.split('.', 1)[0])
if doctype == 'item':
self._commonColumn(columns, 'item.id', doctype, getData,
lambda record, data, row: str(record['_id']))
@@ -881,7 +1016,7 @@ def _getColumnsFromAnnotations(self, columns):
iid = str(self.items[iidx]['_id'])
for anidx, annot in enumerate(annotList):
# If the first item's annotation didn't contribute any required
# data to the data set, skip subsequent item's annotations;
# data to the data set, skip subsequent items' annotations;
# they are likely to be discarded.
if iidx and not countsPerAnnotation.get(anidx, 0) and not self._fullScan:
continue
@@ -900,6 +1035,39 @@ def _getColumnsFromAnnotations(self, columns):
countsPerAnnotation[anidx] = count - startcount
return count

def _getColumnsFromDataFiles(self, columns):
"""
Collect columns and data from data files in items.
"""
if not len(self._itemfilelist) or not len(self._itemfilelist[0]):
return 0
count = 0
countsPerDataFile = {}
for iidx, dfList in enumerate(self._itemfilelist or []):
iid = str(self.items[iidx]['_id'])
for dfidx, file in enumerate(dfList):
# If the first item's data file didn't contribute any required
# data to the data set, skip subsequent items' data files;
# they are likely to be discarded.
if iidx and not countsPerDataFile.get(dfidx, 0) and not self._fullScan:
continue
startcount = count
if file is None:
continue
if not self._sources or 'datafile' in self._sources:
try:
df = _dfFromFile(file['_id'], bool(self._datacolumns or self._fullScan))
count += self._collectColumns(
columns, [df] if isinstance(df, dict) else df,
f'datafile.{dfidx}', iid=iid)
except Exception:
logger.info(
f'Cannot process file {file["_id"]}: {file["name"]} as a dataframe')
raise
if not iidx:
countsPerDataFile[dfidx] = count - startcount
return count
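# A sketch of the doctype keys produced above: the first data file on an
# item is collected under 'datafile.0', the second under 'datafile.1',
# and so on; the dotted keys are matched by _collectRecords, and the same
# first-item heuristic used for annotations skips an adjacent item's data
# file when its counterpart on the first item contributed no required
# data.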

def _getColumns(self):
"""
Get a sorted list of plottable columns with some metadata for each.
@@ -917,7 +1085,7 @@ def _getColumns(self):
self.folder, offset=len(self.items), **self._moreItems):
count += self._collectColumns(columns, [item], 'item', first=False)
count += self._getColumnsFromAnnotations(columns)
# TODO: Add csv
count += self._getColumnsFromDataFiles(columns)
for result in columns.values():
if len(result['distinct']) <= self.maxDistinct:
result['distinct'] = sorted(result['distinct'])
@@ -1015,7 +1183,7 @@ def data(self, columns, requiredColumns=None):
if not isinstance(requiredColumns, list):
requiredColumns = requiredColumns.split(',') if requiredColumns is not None else []
requiredColumns = set(requiredColumns)
self._requiredColumns = requiredColumns
self._requiredColumns = set(requiredColumns)
with self._dataLock:
self._datacolumns = {c: {} for c in columns}
rows = set()
@@ -1038,6 +1206,16 @@ def data(self, columns, requiredColumns=None):
if len(data) < numrows:
logger.info(f'Reduced row count from {numrows} to {len(data)} '
f'because of None values in column {colkey}')
subdata = data
for cidx, col in enumerate(colsout):
colkey = col['key']
numrows = len(data)
if colkey in self._requiredColumns and colkey not in requiredColumns:
subdata = [row for row in subdata if row[cidx] is not None]
if len(subdata) and len(subdata) < len(data):
logger.info(f'Reduced row count from {len(data)} to {len(subdata)} '
f'because of None values in implied columns')
data = subdata
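# A sketch of the implied-column filtering above (hypothetical columns):
# when a caller requires only the bbox columns from a data file, the bbox
# handling in _keysToColumns adds 'annotationelement.id' as an implied
# requirement, and rows whose element id resolved to None are dropped
# here before the per-column statistics are refreshed.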
# Refresh our count, distinct, distinctcount, min, max for each column
for cidx, col in enumerate(colsout):
col['count'] = len([row[cidx] for row in data if row[cidx] is not None])