Skip to content

Commit

Permalink
improving startup and loading performance of benv2 lmdb reader by >10x (
Browse files Browse the repository at this point in the history
…#115)

* improving startup and loading performance of benv2 lmdb reader by >10x
increasing version number

* updating documentation
  • Loading branch information
lhackel-tub authored Jun 27, 2024
1 parent 689fbda commit 52310e8
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 10 deletions.
12 changes: 9 additions & 3 deletions configilm/extra/BENv2_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,9 +541,15 @@ def __init__(
metadata_snow_cloud = pd.read_parquet(metadata_snow_cloud_file)
self.metadata = pd.concat([self.metadata, metadata_snow_cloud])
self._print_info("Merged metadata with snow/cloud metadata")
self.lbls = {row["patch_id"]: row["labels"] for idx, row in self.metadata.iterrows()}

# self.lbls = {row["patch_id"]: row["labels"] for idx, row in self.metadata.iterrows()}
self.lbls = {p: l for p, l in zip(self.metadata["patch_id"], self.metadata["labels"])}
self._print_info(f"Loaded {len(self.lbls)} labels")
self.lbl_key_set = set(self.lbls.keys())
self.mapping = {row["patch_id"]: row["s1_name"] for idx, row in self.metadata.iterrows()}
self._print_info(f"Loaded {len(self.lbl_key_set)} keys")
# self.mapping = {row["patch_id"]: row["s1_name"] for idx, row in self.metadata.iterrows()}
self.mapping = {p: s for p, s in zip(self.metadata["patch_id"], self.metadata["s1_name"])}
self._print_info("Loaded mapping created")

# set mean and std based on bands selected
self.mean = None
Expand Down Expand Up @@ -571,7 +577,7 @@ def open_env(self):
readonly=True,
lock=False,
meminit=False,
readahead=False,
readahead=True,
map_size=8 * 1024**3, # 8GB blocked for caching
max_spare_txns=16, # expected number of concurrent transactions (e.g. threads/workers)
)
Expand Down
11 changes: 6 additions & 5 deletions configilm/extra/DataModules/BENv2_DataModule.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,12 @@ def __init__(
and test sets. The datamodule provides dataloaders for each of these sets.
:param data_dirs: A mapping from file key to file path. Required keys are
"images_lmdb", "train_data", "val_data" and "test_data". The "images_lmdb"
key is used to identify the lmdb file that contains the images. The "_data"
keys are used to identify paths to the respective split csv files.
Note, that the lmdb file is encoded using the BigEarthNet Encoder and contains
images and labels.
"images_lmdb", "metadata_parquet" and "metadata_snow_cloud_parquet". The "images_lmdb"
key is used to identify the lmdb file that contains the images. The "metadata_" keys
are used to identify the parquet files that contain the metadata. The metadata files
contain information about the images, such as the labels, split and cloud and snow info.
Note that the lmdb file is encoded using the RICO-HDL Encoder and contains
images in the form of safe files.
:param batch_size: The batch size to use for the dataloaders.
Expand Down
2 changes: 1 addition & 1 deletion configilm/extra/DataSets/BENv2_DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def __init__(
:param data_dirs: A mapping from file key to file path. The file key is
used to identify the function of the file. The required keys are:
"images_lmdb", "labels_csv", "s1_mapping_csv", "split_csv".
"images_lmdb", "metadata_parquet", "metadata_snow_cloud_parquet".
:param split: The name of the split to use. Can be either "train", "val" or
"test". If None is provided, all splits are used.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "configilm"
version = "0.6.6"
version = "0.6.7"
description = "A state-of-the-art tool for Python developers seeking to rapidly and iteratively develop vision and language models within the [`pytorch`](https://pytorch.org/) framework"
authors = ["Leonard Hackel <l.hackel@tu-berlin.de>"]
readme = "README.md"
Expand Down

0 comments on commit 52310e8

Please sign in to comment.