Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up correlating data files with annotations. #1642

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

- Only list computable plot columns if there are other numeric columns ([#1634](../../pull/1634))
- List official yaml mime type for the multi source ([#1636](../../pull/1636))
- Speed up correlating data files with annotations ([#1642](../../pull/1642))


### Bug Fixes

Expand Down
83 changes: 67 additions & 16 deletions girder_annotation/girder_large_image_annotation/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,24 @@ def _dfFromFile(fileid, full=False):
reader = dataFileExtReaders.get(
ext, dataFileExtReaders.get(file.get('mimeType'), None))
if reader == 'read_excel':
df = getattr(pd, reader)(File().open(file), sheet_name=None)
params = {
'sheet_name': None,
'usecols': lambda x: 'Unnamed: ' not in str(x),
}
try:
import python_calamine # noqa

params['engine'] = 'calamine'
except Exception:
pass
try:
df = getattr(pd, reader)(File().open(file), **params)
except Exception:
if 'engine' in params:
params.pop('engine')
df = getattr(pd, reader)(File().open(file), **params)
else:
raise
else:
df = {'entry': getattr(pd, reader)(File().open(file))}
df = {
Expand Down Expand Up @@ -620,19 +637,45 @@ def itemIDSelector(record, data, row):

return itemNameSelector if isName else itemIDSelector

def _bboxLookupTable(self):
    """
    Build a two-level lookup of data file rows keyed by bounding box corner.

    Populates ``self._bboxLookup`` as ``{int(x0): {int(y0): {row, ...}}}``
    using the ``bbox.x0`` and ``bbox.y0`` entries of ``self._datacolumns``.
    Rows that lack either coordinate (missing or ``None``) are left out of
    the table.
    """
    self._bboxLookup = {}
    y0column = self._datacolumns['bbox.y0']
    for srow, x0val in self._datacolumns['bbox.x0'].items():
        y0val = y0column.get(srow)
        if x0val is None or y0val is None:
            continue
        # Both coordinates are truncated to integers.  The table is probed
        # with integer values produced by range(), so a fractional key
        # would never be found.
        x0key = int(x0val)
        y0key = int(y0val)
        self._bboxLookup.setdefault(x0key, {}).setdefault(y0key, set()).add(srow)

def datafileAnnotationElementSelector(self, key, cols):
# Return a selector function that finds the data file value of column
# ``key`` for the row whose bounding box matches the record's bounding box.
#
# :param key: a column name looked up in self._datacolumns.
# :param cols: a sequence of entries whose item at index 1 is called as
#     f(record, data, row); the results are compared in order against the
#     bbox.x0, bbox.y0, bbox.x1, bbox.y1 data columns.
# :returns: a function (record, data, row) -> value or None.  None is
#     returned when the bbox.x0 / bbox.y0 columns are absent or when no row
#     has all four coordinates within epsilon of the record's values.
#
# NOTE(review): this text looks like a flattened diff.  The loop over every
# row of self._datacolumns[key] (comparing against the literal 2) and the
# later loop over self._bboxLookup (comparing against epsilon) perform the
# same four-coordinate check; presumably the first is the superseded
# implementation -- confirm against the repository before relying on it.
# Max pixel difference for bounding box
epsilon = 2

def annotationElementSelector(record, data, row):
# Evaluate each column getter to obtain the record's bounding box values.
bbox = [col[1](record, data, row) for col in cols]
if 'bbox.x0' not in self._datacolumns or 'bbox.y0' not in self._datacolumns:
return None
# Build the (x0, y0) lookup table the first time it is needed.
if not hasattr(self, '_bboxLookup'):
self._bboxLookupTable()
if key in self._datacolumns:
for srow in self._datacolumns[key]:
if self._datacolumns[key][srow] is not None:
for bidx, bkey in enumerate(['bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']):
val = self._datacolumns[bkey].get(srow)
if val is None or abs(val - bbox[bidx]) > 2:
break
else:
return self._datacolumns[key][srow]
# Only visit integer x0 / y0 buckets that could lie within epsilon of the
# record's corner, then verify all four coordinates for each candidate row.
for x0val in range(int(math.floor(bbox[0] - epsilon)),
int(math.ceil(bbox[0] + epsilon)) + 1):
if x0val in self._bboxLookup:
for y0val in range(int(math.floor(bbox[1] - epsilon)),
int(math.ceil(bbox[1] + epsilon)) + 1):
if y0val in self._bboxLookup[x0val]:
for srow in self._bboxLookup[x0val][y0val]:
if self._datacolumns[key][srow] is not None:
for bidx, bkey in enumerate([
'bbox.x0', 'bbox.y0', 'bbox.x1', 'bbox.y1']):
val = self._datacolumns[bkey].get(srow)
if val is None or abs(val - bbox[bidx]) > epsilon:
break
else:
# The inner loop finished without a break: every coordinate matched.
return self._datacolumns[key][srow]
return None

return annotationElementSelector
Expand Down Expand Up @@ -779,8 +822,8 @@ def _keysToColumns(self, columns, parts, doctype, getData, selector, length):
if bkey in columns and doctype in columns[bkey]['where']]
if len(cols) == 4:
# If we load all of these from annotation elements, use all
# three keys:
for akey in {'annotation.id', 'annotation.name', 'annotationelement.id'}:
# available keys:
for akey in [col for col in self.commonColumns if col.startswith('annotation')]:
if self._datacolumns and akey in self._datacolumns:
self._requiredColumns.add(akey)
self._ensureColumn(
Expand Down Expand Up @@ -947,7 +990,7 @@ def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
If no required fields were specified, this will be the count of all
added data entries.
"""
count = 0
count = None
eid = ''
for colkey, col in columns.items():
if self._datacolumns and colkey not in self._datacolumns:
Expand All @@ -967,10 +1010,15 @@ def _collectRecords(self, columns, recordlist, doctype, iid='', aid=''):
rows = 1 if length is None else length(record, data)
except Exception:
continue
count += self._collectRecordRows(
subcount = self._collectRecordRows(
record, data, selector, length, colkey, col, recidx,
rows, iid, aid, eid)
return count
if self._datacolumns:
if colkey in self._requiredColumns:
count = min(count, subcount) if count is not None else subcount
else:
count = (count or 0) + subcount
return count if count is not None else 0

def _collectColumns(self, columns, recordlist, doctype, first=True, iid='', aid=''):
"""
Expand Down Expand Up @@ -1044,7 +1092,7 @@ def _getColumnsFromAnnotations(self, columns):
# This had been checking if the first item's annotation didn't
# contribute any required data to the data set, skip subsequent
# items' annotations; they are likely to be discarded. This
# is untrue ui datafiles or folder level data augments the
# is untrue if datafiles or folder level data augments the
# element records
# if iidx and not countsPerAnnotation.get(anidx, 0) and not self._fullScan:
# continue
Expand Down Expand Up @@ -1206,7 +1254,10 @@ def _getColumns(self):
'item': 0, 'annotation': 1, 'annotationelement': 2, 'data': 3,
'bbox': 4, 'compute': 5}
columns = sorted(columns.values(), key=lambda x: (
prefixOrder.get(x['key'].split('.', 1)[0], len(prefixOrder)), x['key']))
prefixOrder.get(x['key'].split('.', 1)[0], len(prefixOrder)),
x['count'] <= 1,
x['title'].lower(),
x['key']))
return columns

@property
Expand Down
4 changes: 3 additions & 1 deletion girder_annotation/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ def prerelease_local_scheme(version):
extras_require={
'compute': [
'openpyxl',
'pandas',
'pandas ; python_version < "3.9"',
'pandas>=2.2 ; python_version >= "3.9"',
'python-calamine ; python_version >= "3.9"',
'umap-learn',
],
'tasks': [
Expand Down