Commit
adding code to read parquet files instead of csv files (#114)
* adding code to read parquet files instead of csv files
  adding new mock data files
  removing dead files and adjusting tests
  pinning numpy to v1.x (pandas does not yet work with newer versions)

* updating version to v0.6.6
lhackel-tub authored Jun 27, 2024
1 parent 462601b commit 689fbda
Showing 15 changed files with 376 additions and 575 deletions.
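Note: the numpy pin mentioned in the commit message would, in a requirements file, look roughly like the line below; the exact specifier used by the project is not shown in this diff.

    numpy>=1.21,<2.0  # keep pandas on NumPy 1.x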
45 changes: 9 additions & 36 deletions configilm/extra/BENv2_utils.py
@@ -367,18 +367,6 @@
 },
 }
 
-
-def _numpy_level_aggreagate(df, key_col, val_col):
-    # optimized version of df.groupby(key_col)[val_col].apply(list).reset_index(name=val_col)
-    # credits to B. M. @
-    # https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
-    keys, values = df.sort_values(key_col).values.T
-    ukeys, index = np.unique(keys, True)
-    arrays = np.split(values, index[1:])
-    df2 = pd.DataFrame({key_col: ukeys, val_col: [list(a) for a in arrays]})
-    return df2
-
-
 NEW_LABELS_ORIGINAL_ORDER = (
     "Urban fabric",
     "Industrial or commercial units",
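For context, the helper deleted above grouped all label rows of a patch into one list without pandas' groupby. A minimal, self-contained sketch of the same trick on toy data (the column names patch_id/label mirror the call site removed further down):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"patch_id": ["b", "a", "b"], "label": ["x", "y", "z"]})

    # Sort by key, find the first row of each unique key, and split the value
    # column at those boundaries -- one list of labels per patch.
    keys, values = df.sort_values("patch_id").values.T
    ukeys, index = np.unique(keys, return_index=True)
    arrays = np.split(values, index[1:])
    grouped = pd.DataFrame({"patch_id": ukeys, "label": [list(a) for a in arrays]})
    # Equivalent one-liner:
    # df.groupby("patch_id")["label"].apply(list).reset_index(name="label")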
@@ -532,8 +520,8 @@ class BENv2LDMBReader:
     def __init__(
         self,
         image_lmdb_file: Union[str, Path],
-        label_file: Union[str, Path],
-        s1_mapping_file: Optional[Union[str, Path]] = None,
+        metadata_file: Union[str, Path],
+        metadata_snow_cloud_file: Optional[Union[str, Path]] = None,
         bands: Optional[Union[Iterable, str, int]] = None,
         process_bands_fn: Optional[Callable[[Dict[str, np.ndarray], List[str]], Any]] = None,
         process_labels_fn: Optional[Callable[[List[str]], Any]] = None,
@@ -548,29 +536,14 @@ def __init__(
         self.uses_s1 = any([x in _s1_bandnames for x in self.bands])
         self.uses_s2 = any([x in _s2_bandnames for x in self.bands])
 
-        if s1_mapping_file is None:
-            assert not self.uses_s1, "If you want to use S1 bands, please provide a s2s1_mapping_file"
-            self.mapping = None
-        else:
-            # read and create mapping S2v2 name -> S1 name
-            self._print_info("Reading mapping ...")
-            mapping = pd.read_csv(str(s1_mapping_file))
-            self._print_info("Creating mapping dict ...")
-            self.mapping = dict(zip(mapping.patch_id, mapping.s1_name))  # naming of the columns is hardcoded
-            del mapping
-
-        # read labels and create mapping S2v2 name -> List[label]
-        self._print_info("Reading labels ...")
-        lbls = pd.read_csv(str(label_file))
-
-        self._print_info("Aggregating label list ...")
-        lbls = _numpy_level_aggreagate(lbls, "patch_id", "label")
-        # lbls = lbls.groupby('patch')['lbl_19'].apply(list).reset_index(name='lbl_19')
-
-        self._print_info("Creating label dict ...")
-        self.lbls = dict(zip(lbls.patch_id, lbls.label))  # naming of the columns is hardcoded
+        self.metadata = pd.read_parquet(metadata_file)
+        if metadata_snow_cloud_file is not None:
+            metadata_snow_cloud = pd.read_parquet(metadata_snow_cloud_file)
+            self.metadata = pd.concat([self.metadata, metadata_snow_cloud])
+            self._print_info("Merged metadata with snow/cloud metadata")
+        self.lbls = {row["patch_id"]: row["labels"] for idx, row in self.metadata.iterrows()}
         self.lbl_key_set = set(self.lbls.keys())
-        del lbls
+        self.mapping = {row["patch_id"]: row["s1_name"] for idx, row in self.metadata.iterrows()}
 
         # set mean and std based on bands selected
         self.mean = None
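With these changes, the reader is constructed from the LMDB file plus one (or two) parquet metadata files. A minimal usage sketch; the paths and band names here are placeholders, not values from this commit:

    from configilm.extra.BENv2_utils import BENv2LDMBReader

    reader = BENv2LDMBReader(
        image_lmdb_file="/path/to/BENv2.lmdb",          # placeholder path
        metadata_file="/path/to/metadata.parquet",      # placeholder path
        metadata_snow_cloud_file=None,                  # optional second parquet file
        bands=["B04", "B03", "B02"],                    # assumed Sentinel-2 band names
    )
    # reader.lbls then maps patch_id -> labels and reader.mapping maps
    # patch_id -> s1_name, both built from the parquet metadata.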
32 changes: 21 additions & 11 deletions configilm/extra/DataSets/BENv2_DataSet.py
@@ -6,14 +6,14 @@
 https://bigearth.net/
 """
-import csv
 from functools import partial
 from pathlib import Path
 from typing import Callable
 from typing import Mapping
 from typing import Optional
 from typing import Union
 
+import pandas as pd
 from torch.utils.data import Dataset
 
 from configilm.extra.BENv2_utils import ben_19_labels_to_multi_hot
@@ -75,6 +75,8 @@ def __init__(
         img_size: tuple = (3, 120, 120),
         return_extras: bool = False,
         patch_prefilter: Optional[Callable[[str], bool]] = None,
+        include_cloudy: bool = False,
+        include_snowy: bool = False,
     ):
         """
         Dataset for BigEarthNet v2 dataset. Files can be requested by contacting
@@ -134,13 +136,21 @@ def __init__(
             raise AssertionError(f"{img_size[0]} is not a valid channel configuration.")
 
         print(f"Loading BEN data for {split}...")
-        # read split csv file
-        split_csv = Path(data_dirs["split_csv"])
-        with open(split_csv) as f:
-            reader = csv.reader(f)
-            split_data = list(reader)
-        split_data = split_data[1:]  # remove header
-        self.patches = [x[0] for x in split_data if split is None or x[1] == split]
+        # read metadata
+        metadata = pd.read_parquet(data_dirs["metadata_parquet"])
+        if include_cloudy or include_snowy:
+            metadata_snow_cloud = pd.read_parquet(data_dirs["metadata_snow_cloud_parquet"])
+            metadata = pd.concat([metadata, metadata_snow_cloud])
+        if not include_cloudy:
+            # remove all rows with contains_cloud_or_shadow
+            metadata = metadata[~metadata["contains_cloud_or_shadow"]]
+        if not include_snowy:
+            # remove all rows with contains_seasonal_snow
+            metadata = metadata[~metadata["contains_seasonal_snow"]]
+        if split is not None:
+            metadata = metadata[metadata["split"] == split]
+        self.patches = metadata["patch_id"].tolist()
 
         print(f" {len(self.patches)} patches indexed")
 
         # if a prefilter is provided, filter patches based on function
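The boolean masks above are plain pandas filtering; a toy example of the pattern with the same column names (rows are hypothetical):

    import pandas as pd

    metadata = pd.DataFrame({
        "patch_id": ["p1", "p2", "p3"],
        "contains_cloud_or_shadow": [False, True, False],
        "contains_seasonal_snow": [False, False, True],
    })
    # ~mask keeps rows where the flag is False, i.e. clean patches only
    clean = metadata[~metadata["contains_cloud_or_shadow"]]
    clean = clean[~clean["contains_seasonal_snow"]]
    print(clean["patch_id"].tolist())  # ['p1']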
@@ -157,8 +167,8 @@ def __init__(
         self.channel_order = self.channel_configurations[c]
         self.BENv2Loader = BENv2LDMBReader(
             image_lmdb_file=self.lmdb_dir,
-            label_file=data_dirs["labels_csv"],
-            s1_mapping_file=data_dirs["s1_mapping_csv"],
+            metadata_file=data_dirs["metadata_parquet"],
+            metadata_snow_cloud_file=data_dirs["metadata_snow_cloud_parquet"],
             bands=self.channel_order,
             process_bands_fn=partial(stack_and_interpolate, img_size=h, upsample_mode="nearest"),
             process_labels_fn=ben_19_labels_to_multi_hot,
@@ -168,7 +178,7 @@ def get_patchname_from_index(self, idx: int) -> Optional[str]:
         """
         Gives the patch name of the image at the specified index. May return invalid
         names (names that are not actually loadable because they are not part of the
-        lmdb file) if the name is included in the csv file.
+        lmdb file) if the name is included in the metadata file(s).
 
         :param idx: index of an image
         :return: patch name of the image or None, if the index is invalid
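Taken together, dataset construction now needs only the parquet-based data_dirs keys and, optionally, the two new flags. A usage sketch; the class name BENv2DataSet and all paths are assumptions for illustration:

    from configilm.extra.DataSets.BENv2_DataSet import BENv2DataSet  # assumed class name

    ds = BENv2DataSet(
        data_dirs={
            "images_lmdb": "/path/to/BigEarthNet-V2-LMDB",    # placeholder
            "metadata_parquet": "/path/to/metadata.parquet",  # placeholder
            "metadata_snow_cloud_parquet": "/path/to/metadata_for_patches_with_snow_cloud_or_shadow.parquet",
        },
        split="train",
        img_size=(3, 120, 120),
        include_cloudy=False,  # new flag; cloud/shadow patches excluded by default
        include_snowy=False,   # new flag; seasonal-snow patches excluded by default
    )
    sample = ds[0]  # item format is unchanged by this commit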
31 changes: 20 additions & 11 deletions configilm/extra/data_dir.py
@@ -46,20 +46,28 @@
     ],
     "benv2": [
         # MARS
-        {"images_lmdb": "INVALID_PATH"},
+        {
+            "images_lmdb": Path("/data") / "kaiclasen" / "BENv2.lmdb",
+            "metadata_parquet": Path("/data") / "kaiclasen" / "metadata.parquet",
+            "metadata_snow_cloud_parquet": Path("/data")
+            / "kaiclasen"
+            / "metadata_for_patches_with_snow_cloud_or_shadow.parquet",
+        },
         # ERDE
         {
             "images_lmdb": Path("/faststorage") / "BigEarthNet-V2" / "BigEarthNet-V2-LMDB",
-            "split_csv": Path("/faststorage") / "BigEarthNet-V2" / "patch_id_split_mapping.csv",
-            "s1_mapping_csv": Path("/faststorage") / "BigEarthNet-V2" / "patch_id_s1_mapping.csv",
-            "labels_csv": Path("/faststorage") / "BigEarthNet-V2" / "patch_id_label_mapping.csv",
+            "metadata_parquet": Path("/faststorage") / "BigEarthNet-V2" / "metadata.parquet",
+            "metadata_snow_cloud_parquet": Path("/faststorage")
+            / "BigEarthNet-V2"
+            / "metadata_for_patches_with_snow_cloud_or_shadow.parquet",
         },
         # PLUTO
         {
-            "images_lmdb": pluto_local / "BigEarthNet-V2" / "BigEarthNet-V2-LMDB",
-            "split_csv": pluto_local / "BigEarthNet-V2" / "patch_id_split_mapping.csv",
-            "s1_mapping_csv": pluto_local / "BigEarthNet-V2" / "patch_id_s1_mapping.csv",
-            "labels_csv": pluto_local / "BigEarthNet-V2" / "patch_id_label_mapping.csv",
+            "images_lmdb": pluto_local / "BigEarthNet-V2" / "BENv2.lmdb",
+            "metadata_parquet": pluto_local / "BigEarthNet-V2" / "metadata.parquet",
+            "metadata_snow_cloud_parquet": pluto_local
+            / "BigEarthNet-V2"
+            / "metadata_for_patches_with_snow_cloud_or_shadow.parquet",
         },
     ],
     "cocoqa": [],
@@ -148,9 +156,10 @@
     },
     "benv2": {
         "images_lmdb": mock_data_dir / "BENv2" / "BigEarthNet-V2-LMDB",
-        "split_csv": mock_data_dir / "BENv2" / "patch_id_split_mapping.csv",
-        "s1_mapping_csv": mock_data_dir / "BENv2" / "patch_id_s1_mapping.csv",
-        "labels_csv": mock_data_dir / "BENv2" / "patch_id_label_mapping.csv",
+        "metadata_parquet": mock_data_dir / "BENv2" / "metadata.parquet",
+        "metadata_snow_cloud_parquet": mock_data_dir
+        / "BENv2"
+        / "metadata_for_patches_with_snow_cloud_or_shadow.parquet",
     },
     "cocoqa": {
         "images": mock_data_dir / "COCO-QA" / "images",
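An entry for a new machine would follow the same shape as the entries above; a sketch with a hypothetical storage root:

    from pathlib import Path

    my_root = Path("/my/storage")  # hypothetical location
    benv2_entry = {
        "images_lmdb": my_root / "BigEarthNet-V2" / "BENv2.lmdb",
        "metadata_parquet": my_root / "BigEarthNet-V2" / "metadata.parquet",
        "metadata_snow_cloud_parquet": my_root
        / "BigEarthNet-V2"
        / "metadata_for_patches_with_snow_cloud_or_shadow.parquet",
    }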
Binary file modified configilm/extra/mock_data/BENv2/BigEarthNet-V2-LMDB/data.mdb
Binary file modified configilm/extra/mock_data/BENv2/BigEarthNet-V2-LMDB/lock.mdb
Binary file added configilm/extra/mock_data/BENv2/metadata.parquet
Binary file added configilm/extra/mock_data/BENv2/metadata_for_patches_with_snow_cloud_or_shadow.parquet
28 changes: 0 additions & 28 deletions configilm/extra/mock_data/BENv2/patch_id_country_mapping.csv
This file was deleted.

134 changes: 0 additions & 134 deletions configilm/extra/mock_data/BENv2/patch_id_label_mapping.csv
This file was deleted.