From b9c8a58cce515cb32721ba4b2c1d9275c795ae92 Mon Sep 17 00:00:00 2001 From: Sooah Lee Date: Fri, 20 Sep 2024 11:25:56 +0900 Subject: [PATCH 1/4] Support multiple labels for kaggle format (#1607) - Ticket no.152153 - Support multi labels for kaggle dataset - Add unit tests --- CHANGELOG.md | 11 ++- .../plugins/data_formats/kaggle/base.py | 75 ++++++++++++++---- .../image_csv_multi_label/ann.csv | 7 ++ .../image_csv_multi_label/ann_wo_ext.csv | 7 ++ .../image_csv_multi_label/images/1.jpg | Bin 0 -> 631 bytes .../image_csv_multi_label/images/2.jpg | Bin 0 -> 631 bytes .../image_csv_multi_label/images/3.jpg | Bin 0 -> 631 bytes .../image_csv_multi_label/images/4.jpg | Bin 0 -> 631 bytes .../image_csv_multi_label/images/5.jpg | Bin 0 -> 631 bytes .../image_csv_multi_label/images/6.jpg | Bin 0 -> 631 bytes tests/unit/data_formats/test_kaggle.py | 70 ++++++++++++++++ 11 files changed, 153 insertions(+), 17 deletions(-) create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/ann_wo_ext.csv create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/images/1.jpg create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/images/2.jpg create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/images/3.jpg create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/images/4.jpg create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/images/5.jpg create mode 100644 tests/assets/kaggle_dataset/image_csv_multi_label/images/6.jpg diff --git a/CHANGELOG.md b/CHANGELOG.md index 161b2d54e9..5721c979be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## \[Q3 2024 Release 1.9.0\] +## \[Q4 2024 Release 1.9.1\] +### New features + +### Enhancements +- Support multiple labels for kaggle format + () + +### Bug fixes + +## Q3 2024 Release 1.9.0 ### New features - Add a new CLI command: datum format () diff --git a/src/datumaro/plugins/data_formats/kaggle/base.py b/src/datumaro/plugins/data_formats/kaggle/base.py index d21b1434c1..06d2ef9a15 100644 --- a/src/datumaro/plugins/data_formats/kaggle/base.py +++ b/src/datumaro/plugins/data_formats/kaggle/base.py @@ -77,13 +77,31 @@ def _parse_bbox_coords(self, bbox_str): # expected to output [x1, y1, x2, y2] return [float(coord.strip()) for coord in coords] - def _load_annotations(self, datas: list, indices: Dict[str, int], bbox_flag: bool): + def _load_annotations( + self, datas: list, indices: Dict[str, Union[int, Dict[str, int]]], bbox_flag: bool + ): if "label" in indices: - label_name = str(datas[indices["label"]]) - label, cat = self._label_cat.find(label_name) - if not cat: - self._label_cat.add(label_name) - label, _ = self._label_cat.find(label_name) + label_indices = indices["label"] + if isinstance(label_indices, dict): + labels = [] + list_values = datas[1:] + index_to_label = {v: k for k, v in label_indices.items()} + present_labels = [ + index_to_label[i + 1] for i, value in enumerate(list_values) if value == "1" + ] + + for label_name in present_labels: + label, cat = self._label_cat.find(label_name) + if not cat: + self._label_cat.add(label_name) + label, _ = self._label_cat.find(label_name) + labels.append(Label(label=label)) + else: + label_name = str(datas[indices["label"]]) + label, cat = self._label_cat.find(label_name) + if not cat: + self._label_cat.add(label_name) + label, _ = self._label_cat.find(label_name) else: _, cat = self._label_cat.find("object") if not cat: @@ -91,7 +109,11 @@ def _load_annotations(self, datas: list, indices: Dict[str, int], bbox_flag: boo label = 0 if "label" in indices and not bbox_flag: + label_indices = indices["label"] + if isinstance(label_indices, dict): + return labels return Label(label=label) + if bbox_flag: if "bbox" in indices: coords = self._parse_bbox_coords(datas[indices["bbox"]]) @@ -125,7 +147,14 @@ def _load_items(self, ann_file: str, columns: Dict[str, Union[str, list]]): indices = {"media": df_fields.index(columns["media"])} if "label" in columns: - indices.update({"label": df_fields.index(columns["label"])}) + label_columns = columns["label"] + if isinstance(label_columns, list): + indices_label = {} + for label in label_columns: + indices_label[label] = df_fields.index(label) + indices.update({"label": indices_label}) + else: + indices.update({"label": df_fields.index(label_columns)}) bbox_flag = False bbox_index = columns.get("bbox") @@ -165,16 +194,30 @@ def _load_items(self, ann_file: str, columns: Dict[str, Union[str, list]]): continue ann = self._load_annotations(data_info, indices, bbox_flag) - self._ann_types.add(ann.type) - if item_id in items: - items[item_id].annotations.append(ann) + if isinstance(ann, list): + for label in ann: + self._ann_types.add(label.type) + if item_id in items: + for label in ann: + items[item_id].annotations.append(label) + else: + items[item_id] = DatasetItem( + id=item_id, + subset=self._subset, + media=Image.from_file(path=media_path), + annotations=ann, + ) else: - items[item_id] = DatasetItem( - id=item_id, - subset=self._subset, - media=Image.from_file(path=media_path), - annotations=[ann], - ) + self._ann_types.add(ann.type) + if item_id in items: + items[item_id].annotations.append(ann) + else: + items[item_id] = DatasetItem( + id=item_id, + subset=self._subset, + media=Image.from_file(path=media_path), + annotations=[ann], + ) return items.values() def categories(self): diff --git a/tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv b/tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv new file mode 100644 index 0000000000..57b6540a15 --- /dev/null +++ b/tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv @@ -0,0 +1,7 @@ +image_name,dog,cat,person +1.jpg,1,0,0 +2.jpg,0,1,0 +3.jpg,0,0,1 +4.jpg,1,1,0 +5.jpg,1,0,1 +6.jpg,0,1,1 diff --git a/tests/assets/kaggle_dataset/image_csv_multi_label/ann_wo_ext.csv b/tests/assets/kaggle_dataset/image_csv_multi_label/ann_wo_ext.csv new file mode 100644 index 0000000000..dd01be80e0 --- /dev/null +++ b/tests/assets/kaggle_dataset/image_csv_multi_label/ann_wo_ext.csv @@ -0,0 +1,7 @@ +image_name,dog,cat,person +1,1,0,0 +2,0,1,0 +3,0,0,1 +4,1,1,0 +5,1,0,1 +6,0,1,1 diff --git a/tests/assets/kaggle_dataset/image_csv_multi_label/images/1.jpg b/tests/assets/kaggle_dataset/image_csv_multi_label/images/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8689b956311969f2efc9e3334f375c0ad65e24f1 GIT binary patch literal 631 zcmex=^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf} Dataset: ) +@pytest.fixture +def fxt_img_multi_label_dataset() -> Dataset: + return Dataset.from_iterable( + [ + DatasetItem( + id="1", + subset="default", + media=Image.from_numpy(data=np.ones((5, 10, 3))), + annotations=[Label(label=0)], + ), + DatasetItem( + id="2", + subset="default", + media=Image.from_numpy(data=np.ones((5, 10, 3))), + annotations=[Label(label=1)], + ), + DatasetItem( + id="3", + subset="default", + media=Image.from_numpy(data=np.ones((5, 10, 3))), + annotations=[Label(label=2)], + ), + DatasetItem( + id="4", + subset="default", + media=Image.from_numpy(data=np.ones((5, 10, 3))), + annotations=[Label(label=0), Label(label=1)], + ), + DatasetItem( + id="5", + subset="default", + media=Image.from_numpy(data=np.ones((5, 10, 3))), + annotations=[Label(label=0), Label(label=2)], + ), + DatasetItem( + id="6", + subset="default", + media=Image.from_numpy(data=np.ones((5, 10, 3))), + annotations=[Label(label=1), Label(label=2)], + ), + ], + categories=["dog", "cat", "person"], + ) + + @pytest.fixture def fxt_img_det_dataset() -> Dataset: return Dataset.from_iterable( @@ -321,6 +369,8 @@ def fxt_coco_dataset() -> Dataset: IDS = [ "IMAGE_CSV", "IMAGE_CSV_WO_EXT", + "IMAGE_CSV_MULTI_LB", + "IMAGE_CSV_MULTI_LB_WO_EXT", "IMAGE_CSV_DET", "IMAGE_CSV_DET2", "IMAGE_CSV_DET3", @@ -372,6 +422,26 @@ def test_can_detect(self, fxt_dataset_dir: str): "columns": {"media": "image_name", "label": "label_name"}, }, ), + ( + DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, + "images", + "fxt_img_multi_label_dataset", + KaggleImageCsvBase, + { + "ann_file": osp.join(DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, "ann.csv"), + "columns": {"media": "image_name", "label": ["dog", "cat", "person"]}, + }, + ), + ( + DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, + "images", + "fxt_img_multi_label_dataset", + KaggleImageCsvBase, + { + "ann_file": osp.join(DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, "ann_wo_ext.csv"), + "columns": {"media": "image_name", "label": ["dog", "cat", "person"]}, + }, + ), ( DUMMY_DATASET_IMAGE_CSV_DET_DIR, "images", From 7d7b3279532558259c4f8fb5544bee02aacdc1db Mon Sep 17 00:00:00 2001 From: Yunchu Lee Date: Mon, 23 Sep 2024 15:30:05 +0900 Subject: [PATCH 2/4] Update version to 1.9.1rc0 (#1611) --- src/datumaro/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datumaro/version.py b/src/datumaro/version.py index 0a0a43a57e..58e7e52d78 100644 --- a/src/datumaro/version.py +++ b/src/datumaro/version.py @@ -1 +1 @@ -__version__ = "1.9.0" +__version__ = "1.9.1rc0" From ad84aa7493b88566be51df0c1c27fb8b5c9d5a34 Mon Sep 17 00:00:00 2001 From: Ilya Trushkin Date: Tue, 24 Sep 2024 11:04:20 +0300 Subject: [PATCH 3/4] Fix merging of stream datasets (#1609) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary When importing a stream dataset with multiple sources in eager mode by specifying `error_policy` or `progress_reporting`, an error occurs: ``` '_MergedStreamDataset' object has no attribute '_data' ``` ### How to test ### Checklist - [x] I have added unit tests to cover my changes.​ - [x] I have added integration tests to cover my changes.​ - [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).​ - [x] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly ### License - [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [x] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2024 Intel Corporation # # SPDX-License-Identifier: MIT ``` --------- Signed-off-by: dependabot[bot] Signed-off-by: Ilya Trushkin Co-authored-by: williamcorsel <31770711+williamcorsel@users.noreply.github.com> Co-authored-by: Sooah Lee Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Yunchu Lee Co-authored-by: Wonju Lee --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/publish_to_pypi.yml | 4 ++-- .github/workflows/scorecard.yml | 2 +- CHANGELOG.md | 2 ++ src/datumaro/components/dataset.py | 11 ++++++++--- tests/unit/test_imagenet_format.py | 9 +++++++-- 6 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 556686e397..e88835e2b4 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -52,7 +52,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@f0f3afee809481da311ca3a6ff1ff51d81dbeb24 # v3.26.4 + uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -73,7 +73,7 @@ jobs: python -m build - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f0f3afee809481da311ca3a6ff1ff51d81dbeb24 # v3.26.4 + uses: github/codeql-action/analyze@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 with: category: "/language:${{matrix.language}}" - name: Generate Security Report diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml index 061b8f9f25..75341b384d 100644 --- a/.github/workflows/publish_to_pypi.yml +++ b/.github/workflows/publish_to_pypi.yml @@ -80,12 +80,12 @@ jobs: file_glob: true - name: Publish package distributions to PyPI if: ${{ steps.check-tag.outputs.match != '' }} - uses: pypa/gh-action-pypi-publish@v1.9.0 + uses: pypa/gh-action-pypi-publish@v1.10.1 with: password: ${{ secrets.PYPI_API_TOKEN }} - name: Publish package distributions to TestPyPI if: ${{ steps.check-tag.outputs.match == '' }} - uses: pypa/gh-action-pypi-publish@v1.9.0 + uses: pypa/gh-action-pypi-publish@v1.10.1 with: password: ${{ secrets.TESTPYPI_API_TOKEN }} repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index d55a24feb2..d85b9b7482 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -67,6 +67,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f0f3afee809481da311ca3a6ff1ff51d81dbeb24 # v3.26.4 + uses: github/codeql-action/upload-sarif@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6 with: sarif_file: results.sarif diff --git a/CHANGELOG.md b/CHANGELOG.md index 5721c979be..b9488e950a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 () ### Bug fixes +- Fix StreamDataset merging when importing in eager mode + () ## Q3 2024 Release 1.9.0 ### New features diff --git a/src/datumaro/components/dataset.py b/src/datumaro/components/dataset.py index 2652c99a7d..574ec7cc33 100644 --- a/src/datumaro/components/dataset.py +++ b/src/datumaro/components/dataset.py @@ -1023,17 +1023,22 @@ class _MergedStreamDataset(cls): def __init__(self, *sources: IDataset): from datumaro.components.hl_ops import HLOps - self.merged = HLOps.merge(*sources, merge_policy=merge_policy) + self._merged = HLOps.merge(*sources, merge_policy=merge_policy) + self._data = self._merged._data + self._env = env + self._format = DEFAULT_FORMAT + self._source_path = None + self._options = {} def __iter__(self): - yield from self.merged + yield from self._merged @property def is_stream(self): return True def subsets(self) -> Dict[str, DatasetSubset]: - return self.merged.subsets() + return self._merged.subsets() return _MergedStreamDataset(*sources) diff --git a/tests/unit/test_imagenet_format.py b/tests/unit/test_imagenet_format.py index 6e3ca2abec..e84b9406ea 100644 --- a/tests/unit/test_imagenet_format.py +++ b/tests/unit/test_imagenet_format.py @@ -7,6 +7,7 @@ import pytest from datumaro.components.annotation import AnnotationType, Label, LabelCategories +from datumaro.components.contexts.importer import ImportErrorPolicy from datumaro.components.dataset import Dataset, StreamDataset from datumaro.components.dataset_base import DatasetItem from datumaro.components.environment import Environment @@ -214,7 +215,9 @@ def _create_expected_dataset(self): @pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)]) def test_can_import(self, dataset_cls, is_stream, helper_tc): expected_dataset = self._create_expected_dataset() - dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME) + dataset = dataset_cls.import_from( + self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy() + ) assert dataset.is_stream == is_stream compare_datasets(helper_tc, expected_dataset, dataset, require_media=True) @@ -240,7 +243,9 @@ class ImagenetWithSubsetDirsImporterTest(ImagenetImporterTest): @mark_requirement(Requirements.DATUM_GENERAL_REQ) @pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)]) def test_can_import(self, dataset_cls, is_stream, helper_tc): - dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME) + dataset = dataset_cls.import_from( + self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy() + ) assert dataset.is_stream == is_stream for subset_name, subset in dataset.subsets().items(): From ec9f3baabb2a248248d7ddd40618065cf9991859 Mon Sep 17 00:00:00 2001 From: Sooah Lee Date: Wed, 25 Sep 2024 16:03:28 +0900 Subject: [PATCH 4/4] Use DataFrame.map instead of DataFrame.applymap (#1613) - Use `DataFrame.map` instead of `DataFrame.applymap` - `DataFrame.applymap` has been deprecated. --- CHANGELOG.md | 2 ++ src/datumaro/plugins/transforms.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9488e950a..0748fcb0d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements - Support multiple labels for kaggle format () +- Use DataFrame.map instead of DataFrame.applymap + () ### Bug fixes - Fix StreamDataset merging when importing in eager mode diff --git a/src/datumaro/plugins/transforms.py b/src/datumaro/plugins/transforms.py index 6060b0ad3b..2aa6811194 100644 --- a/src/datumaro/plugins/transforms.py +++ b/src/datumaro/plugins/transforms.py @@ -1957,7 +1957,7 @@ def refine_tabular_media(self, item): or item.media.table.dtype(col) is int ] - df[str_cols] = df[str_cols].applymap(lambda x: self.remove_unnecessary_char(x)) + df[str_cols] = df[str_cols].map(lambda x: self.remove_unnecessary_char(x)) if not (self._outlier_value): self.check_outlier(media.table.data[float_cols + int_cols], float_cols + int_cols)