Mergeback 1.9.1 to develop #1623

Merged: 5 commits, Sep 27, 2024
13 changes: 12 additions & 1 deletion CHANGELOG.md
@@ -21,7 +21,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)

### Bug fixes

## \[Q3 2024 Release 1.9.0\]
## Q4 2024 Release 1.9.1
### Enhancements
- Support multiple labels for kaggle format
(<https://github.com/openvinotoolkit/datumaro/pull/1607>)
- Use DataFrame.map instead of DataFrame.applymap
(<https://github.com/openvinotoolkit/datumaro/pull/1613>)

### Bug fixes
- Fix StreamDataset merging when importing in eager mode
(<https://github.com/openvinotoolkit/datumaro/pull/1609>)

## Q3 2024 Release 1.9.0
### New features
- Add a new CLI command: datum format
(<https://github.com/openvinotoolkit/datumaro/pull/1570>)
12 changes: 12 additions & 0 deletions docs/source/docs/release_notes.rst
@@ -4,6 +4,18 @@ Release Notes
.. toctree::
:maxdepth: 1

v1.9.1 (2024 Q3)
----------------

Enhancements
^^^^^^^^^^^^
- Support multiple labels for kaggle format
- Use DataFrame.map instead of DataFrame.applymap

Bug fixes
^^^^^^^^^
- Fix StreamDataset merging when importing in eager mode

v1.9.0 (2024 Q3)
----------------

11 changes: 8 additions & 3 deletions src/datumaro/components/dataset.py
@@ -1023,17 +1023,22 @@
def __init__(self, *sources: IDataset):
from datumaro.components.hl_ops import HLOps

self.merged = HLOps.merge(*sources, merge_policy=merge_policy)
self._merged = HLOps.merge(*sources, merge_policy=merge_policy)
self._data = self._merged._data
self._env = env
self._format = DEFAULT_FORMAT
self._source_path = None
self._options = {}

def __iter__(self):
yield from self.merged
yield from self._merged

Codecov (codecov/patch) warning on src/datumaro/components/dataset.py#L1034: added line was not covered by tests.

@property
def is_stream(self):
return True

def subsets(self) -> Dict[str, DatasetSubset]:
return self.merged.subsets()
return self._merged.subsets()

return _MergedStreamDataset(*sources)

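The hunk above renames the attribute `merged` to the private `_merged` and additionally sets `self._data = self._merged._data`, so the merged stream wrapper carries the same internal data handle that other `Dataset` code paths expect; per the changelog entry, this is what fixes `StreamDataset` merging when importing in eager mode. A minimal sketch of the resulting delegation pattern, using stand-in types rather than datumaro's real classes:

from typing import Dict, Iterator

class _MergedStreamSketch:
    """Illustrative stand-in for the fixed _MergedStreamDataset wrapper."""

    def __init__(self, merged, data=None):
        self._merged = merged  # keep the merged source private
        self._data = data  # re-expose its data handle for eager-mode callers

    def __iter__(self) -> Iterator:
        yield from self._merged  # iteration delegates to the merged source

    @property
    def is_stream(self) -> bool:
        return True

    def subsets(self) -> Dict[str, list]:
        return self._merged.subsets()  # subset lookup delegates as well

class _FakeSource(list):
    """Tiny iterable source offering the subsets() call the wrapper needs."""
    def subsets(self):
        return {"default": self}

wrapper = _MergedStreamSketch(_FakeSource([1, 2, 3]))
print(list(wrapper), wrapper.is_stream)  # [1, 2, 3] True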
75 changes: 59 additions & 16 deletions src/datumaro/plugins/data_formats/kaggle/base.py
@@ -77,21 +77,43 @@
# expected to output [x1, y1, x2, y2]
return [float(coord.strip()) for coord in coords]

def _load_annotations(self, datas: list, indices: Dict[str, int], bbox_flag: bool):
def _load_annotations(
self, datas: list, indices: Dict[str, Union[int, Dict[str, int]]], bbox_flag: bool
):
if "label" in indices:
label_name = str(datas[indices["label"]])
label, cat = self._label_cat.find(label_name)
if not cat:
self._label_cat.add(label_name)
label, _ = self._label_cat.find(label_name)
label_indices = indices["label"]
if isinstance(label_indices, dict):
labels = []
list_values = datas[1:]
index_to_label = {v: k for k, v in label_indices.items()}
present_labels = [
index_to_label[i + 1] for i, value in enumerate(list_values) if value == "1"
]

for label_name in present_labels:
label, cat = self._label_cat.find(label_name)
if not cat:
self._label_cat.add(label_name)
label, _ = self._label_cat.find(label_name)
labels.append(Label(label=label))
else:
label_name = str(datas[indices["label"]])
label, cat = self._label_cat.find(label_name)
if not cat:
self._label_cat.add(label_name)
label, _ = self._label_cat.find(label_name)
else:
_, cat = self._label_cat.find("object")
if not cat:
self._label_cat.add("object")
label = 0

if "label" in indices and not bbox_flag:
label_indices = indices["label"]
if isinstance(label_indices, dict):
return labels
return Label(label=label)

if bbox_flag:
if "bbox" in indices:
coords = self._parse_bbox_coords(datas[indices["bbox"]])
@@ -125,7 +147,14 @@

indices = {"media": df_fields.index(columns["media"])}
if "label" in columns:
indices.update({"label": df_fields.index(columns["label"])})
label_columns = columns["label"]
if isinstance(label_columns, list):
indices_label = {}
for label in label_columns:
indices_label[label] = df_fields.index(label)
indices.update({"label": indices_label})
else:
indices.update({"label": df_fields.index(label_columns)})

bbox_flag = False
bbox_index = columns.get("bbox")
@@ -165,16 +194,30 @@
continue

ann = self._load_annotations(data_info, indices, bbox_flag)
self._ann_types.add(ann.type)
if item_id in items:
items[item_id].annotations.append(ann)
if isinstance(ann, list):
for label in ann:
self._ann_types.add(label.type)
if item_id in items:
for label in ann:
items[item_id].annotations.append(label)

Codecov (codecov/patch) warning on src/datumaro/plugins/data_formats/kaggle/base.py#L202: added line was not covered by tests.
else:
items[item_id] = DatasetItem(
id=item_id,
subset=self._subset,
media=Image.from_file(path=media_path),
annotations=ann,
)
else:
items[item_id] = DatasetItem(
id=item_id,
subset=self._subset,
media=Image.from_file(path=media_path),
annotations=[ann],
)
self._ann_types.add(ann.type)
if item_id in items:
items[item_id].annotations.append(ann)
else:
items[item_id] = DatasetItem(
id=item_id,
subset=self._subset,
media=Image.from_file(path=media_path),
annotations=[ann],
)
return items.values()

def categories(self):
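The multi-label path added above treats the configured label columns as a one-hot encoding: `indices["label"]` becomes a dict from column name to column index, that dict is inverted into `index_to_label`, and one `Label` annotation is emitted for every column whose value is the string "1". A standalone sketch of the decoding step, simplified from `_load_annotations` (the helper name and the full-row scan are illustrative choices, not the importer's exact code):

from typing import Dict, List

def decode_one_hot_labels(row: List[str], label_indices: Dict[str, int]) -> List[str]:
    """Return the label names whose one-hot column in `row` equals "1".

    `row` is a full CSV record (media name first); `label_indices` maps
    each label column name to its position, mirroring indices["label"].
    """
    index_to_label = {idx: name for name, idx in label_indices.items()}
    return [
        index_to_label[idx]
        for idx, value in enumerate(row)
        if idx in index_to_label and value.strip() == "1"
    ]

# Row 4 of the new ann.csv asset below (header: image_name,dog,cat,person):
print(decode_one_hot_labels(["4.jpg", "1", "1", "0"], {"dog": 1, "cat": 2, "person": 3}))
# ['dog', 'cat']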
2 changes: 1 addition & 1 deletion src/datumaro/plugins/transforms.py
@@ -1974,7 +1974,7 @@ def refine_tabular_media(self, item):
or item.media.table.dtype(col) is int
]

df[str_cols] = df[str_cols].applymap(lambda x: self.remove_unnecessary_char(x))
df[str_cols] = df[str_cols].map(lambda x: self.remove_unnecessary_char(x))

if not (self._outlier_value):
self.check_outlier(media.table.data[float_cols + int_cols], float_cols + int_cols)
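This one-line change tracks a pandas rename: `DataFrame.applymap` was deprecated in pandas 2.1 in favor of the equivalent elementwise `DataFrame.map`, so the old spelling emits a `FutureWarning` on recent pandas. A minimal before/after illustration (the cleanup function is a stand-in for `remove_unnecessary_char`, whose body is not shown in this diff):

import pandas as pd

def cleanup(x: str) -> str:
    return x.strip()  # stand-in for the transform's character cleanup

df = pd.DataFrame({"a": [" cat ", "dog "], "b": [" x", "y "]})

# df = df.applymap(cleanup)  # deprecated since pandas 2.1, warns
df = df.map(cleanup)         # equivalent elementwise call on pandas >= 2.1
print(df)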
2 changes: 1 addition & 1 deletion src/datumaro/version.py
@@ -1 +1 @@
__version__ = "1.9.0"
__version__ = "1.10.0.dev0"
7 changes: 7 additions & 0 deletions tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv
@@ -0,0 +1,7 @@
image_name,dog,cat,person
1.jpg,1,0,0
2.jpg,0,1,0
3.jpg,0,0,1
4.jpg,1,1,0
5.jpg,1,0,1
6.jpg,0,1,1
7 changes: 7 additions & 0 deletions tests/assets/kaggle_dataset/image_csv_multi_label/ann_wo_ext.csv
@@ -0,0 +1,7 @@
image_name,dog,cat,person
1,1,0,0
2,0,1,0
3,0,0,1
4,1,1,0
5,1,0,1
6,0,1,1
(Six added binary files, the test images referenced by the CSV assets above, cannot be displayed in the diff view.)
70 changes: 70 additions & 0 deletions tests/unit/data_formats/test_kaggle.py
@@ -20,6 +20,9 @@
from tests.utils.test_utils import compare_datasets

DUMMY_DATASET_IMAGE_CSV_DIR = get_test_asset_path("kaggle_dataset", "image_csv")
DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR = get_test_asset_path(
"kaggle_dataset", "image_csv_multi_label"
)
DUMMY_DATASET_IMAGE_CSV_DET_DIR = get_test_asset_path("kaggle_dataset", "image_csv_det")
DUMMY_DATASET_IMAGE_TXT_DIR = get_test_asset_path("kaggle_dataset", "image_txt")
DUMMY_DATASET_IMAGE_TXT_DET_DIR = get_test_asset_path("kaggle_dataset", "image_txt_det")
@@ -72,6 +75,51 @@ def fxt_img_dataset() -> Dataset:
)


@pytest.fixture
def fxt_img_multi_label_dataset() -> Dataset:
return Dataset.from_iterable(
[
DatasetItem(
id="1",
subset="default",
media=Image.from_numpy(data=np.ones((5, 10, 3))),
annotations=[Label(label=0)],
),
DatasetItem(
id="2",
subset="default",
media=Image.from_numpy(data=np.ones((5, 10, 3))),
annotations=[Label(label=1)],
),
DatasetItem(
id="3",
subset="default",
media=Image.from_numpy(data=np.ones((5, 10, 3))),
annotations=[Label(label=2)],
),
DatasetItem(
id="4",
subset="default",
media=Image.from_numpy(data=np.ones((5, 10, 3))),
annotations=[Label(label=0), Label(label=1)],
),
DatasetItem(
id="5",
subset="default",
media=Image.from_numpy(data=np.ones((5, 10, 3))),
annotations=[Label(label=0), Label(label=2)],
),
DatasetItem(
id="6",
subset="default",
media=Image.from_numpy(data=np.ones((5, 10, 3))),
annotations=[Label(label=1), Label(label=2)],
),
],
categories=["dog", "cat", "person"],
)


@pytest.fixture
def fxt_img_det_dataset() -> Dataset:
return Dataset.from_iterable(
@@ -321,6 +369,8 @@ def fxt_coco_dataset() -> Dataset:
IDS = [
"IMAGE_CSV",
"IMAGE_CSV_WO_EXT",
"IMAGE_CSV_MULTI_LB",
"IMAGE_CSV_MULTI_LB_WO_EXT",
"IMAGE_CSV_DET",
"IMAGE_CSV_DET2",
"IMAGE_CSV_DET3",
@@ -372,6 +422,26 @@ def test_can_detect(self, fxt_dataset_dir: str):
"columns": {"media": "image_name", "label": "label_name"},
},
),
(
DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR,
"images",
"fxt_img_multi_label_dataset",
KaggleImageCsvBase,
{
"ann_file": osp.join(DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, "ann.csv"),
"columns": {"media": "image_name", "label": ["dog", "cat", "person"]},
},
),
(
DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR,
"images",
"fxt_img_multi_label_dataset",
KaggleImageCsvBase,
{
"ann_file": osp.join(DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, "ann_wo_ext.csv"),
"columns": {"media": "image_name", "label": ["dog", "cat", "person"]},
},
),
(
DUMMY_DATASET_IMAGE_CSV_DET_DIR,
"images",
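The two new parametrizations register the multi-label assets with the same `KaggleImageCsvBase` importer; the only difference from the single-label case is that `columns["label"]` is a list of one-hot column names rather than a single column name. A usage sketch assembled from these test parameters, assuming the options are forwarded as keyword arguments to `import_from` and that the format name is "kaggle_image_csv" (neither is confirmed by this diff):

from datumaro.components.dataset import Dataset

dataset = Dataset.import_from(
    "tests/assets/kaggle_dataset/image_csv_multi_label",  # path as in the test assets
    "kaggle_image_csv",  # assumed format name for KaggleImageCsvBase
    ann_file="tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv",
    columns={"media": "image_name", "label": ["dog", "cat", "person"]},
)
for item in dataset:
    print(item.id, [ann.label for ann in item.annotations])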
9 changes: 7 additions & 2 deletions tests/unit/test_imagenet_format.py
@@ -7,6 +7,7 @@
import pytest

from datumaro.components.annotation import AnnotationType, Label, LabelCategories
from datumaro.components.contexts.importer import ImportErrorPolicy
from datumaro.components.dataset import Dataset, StreamDataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.environment import Environment
@@ -214,7 +215,9 @@ def _create_expected_dataset(self):
@pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)])
def test_can_import(self, dataset_cls, is_stream, helper_tc):
expected_dataset = self._create_expected_dataset()
dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME)
dataset = dataset_cls.import_from(
self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy()
)
assert dataset.is_stream == is_stream

compare_datasets(helper_tc, expected_dataset, dataset, require_media=True)
@@ -240,7 +243,9 @@ class ImagenetWithSubsetDirsImporterTest(ImagenetImporterTest):
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)])
def test_can_import(self, dataset_cls, is_stream, helper_tc):
dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME)
dataset = dataset_cls.import_from(
self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy()
)
assert dataset.is_stream == is_stream

for subset_name, subset in dataset.subsets().items():