Get target information for tabular dataset (#1471)
- Set `target` as a dictionary specifying the input and output columns, e.g. `{"input": "question", "output": ["rating", "sentiment"]}`
- If `target` is `None`, include all columns
- Expose the input target columns as media and the output target columns as annotations
- Use `CategoricalDtype` for columns whose dtype is object but which can serve as labels. For this, we should define a uniqueness threshold.
- Fix targets in the unit tests and CLI tests
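
For illustration, a minimal sketch of the new API; the dataset path is a placeholder and the column names are the ones from the example above:

```python
import datumaro as dm

# Hypothetical tabular dataset with a "question" input column and
# "rating"/"sentiment" output columns (placeholders, not a bundled dataset).
dataset = dm.Dataset.import_from(
    "<path/to/tabular/dataset>",
    "tabular",
    target={"input": "question", "output": ["rating", "sentiment"]},
)

for item in dataset:
    inputs = item.media.data()  # input target columns, exposed as media
    outputs = item.annotations  # output target columns, as Tabular annotations
```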
sooahleex authored Apr 25, 2024
1 parent c707924 commit f9a25f5
Showing 6 changed files with 160 additions and 49 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Enhancements
- Fix ambiguous COCO format detector
(<https://github.com/openvinotoolkit/datumaro/pull/1442>)
- Get target information for tabular dataset
(<https://github.com/openvinotoolkit/datumaro/pull/1471>)

### Bug fixes

8 changes: 4 additions & 4 deletions docs/source/docs/data-formats/formats/tabular.md
@@ -66,14 +66,14 @@ which can be specified by the user when importing the dataset as shown below.

```bash
datum project create
datum project import --format tabular <path/to/buddy/dataset> -- --target breed_category,pet_category
datum project import --format tabular <path/to/electricity/dataset> -- --target class
datum project import --format tabular <path/to/buddy/dataset> -- --target input:length(m),output:breed_category,pet_category
datum project import --format tabular <path/to/electricity/dataset>
```

```python
import datumaro as dm
dataset = dm.Dataset.import_from('<path/to/buddy/dataset>', 'tabular', target=["breed_category", "pet_category"])
dataset = dm.Dataset.import_from('<path/to/electricity/dataset>', 'tabular', target="class")
dataset = dm.Dataset.import_from('<path/to/buddy/dataset>', 'tabular', target={"input":"length(m)", "output":["breed_category", "pet_category"]})
dataset = dm.Dataset.import_from('<path/to/electricity/dataset>', 'tabular')
```

As shown, the target is a dictionary with `input` and `output` keys (written as `input:`/`output:` pairs on the command line); each key takes a single column name or a list of columns, and when the target is omitted, every column is imported as an input.
7 changes: 7 additions & 0 deletions src/datumaro/components/media.py
@@ -1234,6 +1234,9 @@ def columns(self) -> List[str]:
def dtype(self, column: str) -> Optional[Type[TableDtype]]:
    """Returns native python type for a given column"""
    numpy_type = self.data.dtypes[column]
    if numpy_type == object and self.data[column].nunique() / self.shape[0] < 0.1:
        # TODO: make this uniqueness threshold configurable.
        # Convert to CategoricalDtype for efficient storage and categorical analysis
        return pd.api.types.CategoricalDtype()
    if numpy_type == object:
        return str
    else:
@@ -1299,6 +1302,10 @@ def data(self) -> Optional[pd.DataFrame]:
"""Table data in pandas DataFrame format"""
return self.__data

def select(self, columns: List[str]):
    """Restrict the table to the given columns and update the cached shape."""
    self.__data = self.__data[columns]
    self._shape = self.__data.shape


class TableFromDataFrame(FromDataMixin, Table):
def __init__(
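For illustration, a standalone pandas sketch of the heuristic added to `dtype` above: an `object` column whose unique-value ratio is below the hard-coded 0.1 threshold is treated as categorical (the data here is made up):

```python
import pandas as pd

# 1000 rows but only 2 distinct values -> unique ratio 0.002.
df = pd.DataFrame({"class": ["UP", "DOWN", "UP", "DOWN"] * 250})

numpy_type = df.dtypes["class"]
unique_ratio = df["class"].nunique() / df.shape[0]

# Mirrors the check in Table.dtype: object dtype + low unique ratio -> categorical.
if numpy_type == object and unique_ratio < 0.1:
    dtype = pd.api.types.CategoricalDtype()
else:
    dtype = str

print(dtype)  # category -> the column can be used as a label
```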
86 changes: 61 additions & 25 deletions src/datumaro/plugins/data_formats/tabular.py
@@ -7,6 +7,8 @@
import os.path as osp
from typing import Dict, List, Optional, Tuple, Type, Union

import pandas as pd

from datumaro.components.annotation import AnnotationType, Categories, Tabular, TabularCategories
from datumaro.components.dataset_base import DatasetBase, DatasetItem
from datumaro.components.errors import MediaTypeError
@@ -62,16 +64,16 @@ def __init__(
def _parse(
self,
paths: List[str],
target: Optional[Union[str, List[str]]] = None,
target: Optional[Dict[str, List[str]]] = None,
dtype: Optional[Dict[str, Type[TableDtype]]] = None,
) -> Tuple[List[DatasetItem], Dict[AnnotationType, Categories]]:
"""
parse tabular files. Each file is regarded as a subset.
Args:
paths (list(str)) : A list of paths to tabular data files (csv files).
target (optional, str or list) : Target column or list of target columns.
If this is not specified (None), the last column is regarded as a target column.
target (optional, dict(str, str or list)) : Dictionary with "input" and "output" keys, each mapping to a target column or a list of target columns.
If this is not specified (None), all columns are regarded as target columns.
In case of a dataset with no output targets, give an empty list as the "output" value.
dtype (optional, dict(str,str)) : Dictionary of column name -> type str ('str', 'int', or 'float').
This can be used when automatic type inference fails.
@@ -84,45 +86,55 @@ def _parse(
items: List[DatasetItem] = []
categories: TabularCategories = TabularCategories()

if target is not None:
if "input" not in target or "output" not in target:
raise TypeError('Target should have both "input" and "output"')

for path in paths:
table = Table.from_csv(path, dtype=dtype)

targets: List[str] = []
targets_ann: List[str] = []
if target is None:
targets.append(table.columns[-1]) # last column
elif isinstance(target, str):
if target in table.columns: # add valid column name only
targets.append(target)
elif isinstance(target, list): # add valid column names only
for t in target:
if t in table.columns:
targets.append(t)
targets.extend(table.columns) # add all columns
else:
# add only valid input and output column names
if isinstance(target.get("input"), str) and target["input"] in table.columns:
targets.append(target["input"])
elif isinstance(target.get("input"), list):
targets.extend(col for col in target["input"] if col in table.columns)
if isinstance(target.get("output"), str) and target["output"] in table.columns:
targets_ann.append(target["output"])
elif isinstance(target.get("output"), list):
targets_ann.extend(col for col in target["output"] if col in table.columns)
targets = targets + targets_ann

# set categories
for target in targets:
_, category = categories.find(target)
target_dtype = table.dtype(target)
if target_dtype == str:
labels = set(table.features(target, unique=True))
for target_ in targets_ann:
_, category = categories.find(target_)
target_dtype = table.dtype(target_)
if target_dtype in [int, float, pd.api.types.CategoricalDtype()]:
# 'int' can be categorical, but we don't know this unless the user provides that information.
labels = set(table.features(target_, unique=True))
if category is None:
categories.add(target, target_dtype, labels)
categories.add(target_, target_dtype, labels)
else: # update labels if they are different.
category.labels.union(labels)
elif target_dtype in [int, float]:
# 'int' can be categorical, but we don't know this unless user gives information.
elif target_dtype is str:
if category is None:
categories.add(target, target_dtype)
categories.add(target_, target_dtype)
else:
raise TypeError(
f"Unsupported type '{target_dtype}' for target column '{target}'."
f"Unsupported type '{target_dtype}' for target column '{target_}'."
)

# load annotations
subset = osp.splitext(osp.basename(path))[0]
row: TableRow
table.select(targets)
for row in table: # type: TableRow
id = f"{row.index}@{subset}"
ann = [Tabular(values=row.data(targets))] if targets else None
ann = [Tabular(values=row.data(targets_ann))] if targets_ann else None
item = DatasetItem(
id=id,
subset=subset,
@@ -140,6 +152,29 @@ def __iter__(self):
yield from self._items


def string_to_dict(input_string):
    """Parse a CLI string like "input:col1,col2,output:col3" into
    {"input": ["col1", "col2"], "output": ["col3"]}."""
    pairs = input_string.split(",")
    result = {}
    key = None

    for pair in pairs:
        split_pair = pair.split(":")
        if len(split_pair) == 2:
            key, value = split_pair
            # Only "input" and "output" are recognized keys; others are ignored.
            if key in ("input", "output"):
                result.setdefault(key, []).append(value)
        elif key in result:
            # A bare value with no "key:" prefix belongs to the most recent key.
            result[key].extend(split_pair)

    return result
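
A couple of round trips through this parser, mirroring the unit tests added below:

```python
string_to_dict("input:date,output:class")
# -> {"input": ["date"], "output": ["class"]}

# A bare value with no "key:" prefix attaches to the most recently seen key:
string_to_dict("input:age,color,output:size")
# -> {"input": ["age", "color"], "output": ["size"]}
```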


class TabularDataImporter(Importer):
"""
Import a tabular dataset.
@@ -153,9 +188,10 @@ def build_cmdline_parser(cls, **kwargs):
parser = super().build_cmdline_parser(**kwargs)
parser.add_argument(
"--target",
type=lambda x: x.split(","),
help="Target column or list of target columns. (ex. 'class', 'class,breed') (default:None) "
"If this is not specified (None), the last column is regarded as a target column."
type=string_to_dict,
help="Target columns for input and output, given as 'input:'/'output:' pairs. "
"(ex. 'input:date,output:class', 'input:data,output:class,breed') (default: None) "
"If this is not specified (None), all columns are regarded as target columns. "
"In case of a dataset with no output targets, give an empty list as the 'output' value.",
)
parser.add_argument(
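Putting the importer options together, here is a hypothetical call that combines the new `target` dictionary with an explicit `dtype` override. That `import_from` forwards both keyword arguments to the tabular importer is an assumption based on the `_parse` signature above, and the electricity column names follow the docs examples:

```python
import datumaro as dm

# Assumption: "date" and "class" are columns of the electricity CSV, and dtype
# is forwarded to Table.from_csv for when automatic type inference fails.
dataset = dm.Dataset.import_from(
    "<path/to/electricity/dataset>",
    "tabular",
    target={"input": "date", "output": ["class"]},
    dtype={"class": str},
)
```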
5 changes: 3 additions & 2 deletions tests/integration/cli/test_tabular_format.py
@@ -38,7 +38,7 @@ def fxt_buddy_path(fxt_tabular_root):

@pytest.fixture()
def fxt_buddy_target():
yield ("breed_category", "pet_category")
yield {"input": "length(m)", "output": ["breed_category", "pet_category"]}


@pytest.fixture()
@@ -76,12 +76,13 @@ def test_can_import_and_export_tabular_dataset(
dataset = request.getfixturevalue(fxt_dataset)
path = request.getfixturevalue(fxt_path)
target = request.getfixturevalue(fxt_target) if isinstance(fxt_target, str) else None
string_target = "input:length(m),output:breed_category,pet_category"

with TestDir() as test_dir:
run(helper_tc, "project", "create", "-o", test_dir)
args = ["project", "import", "-p", test_dir, "-f", "tabular", path]
if target:
args.extend(["--", "--target", ",".join(target)])
args.extend(["--", "--target", string_target])
run(helper_tc, *args)

export_dir = osp.join(test_dir, "export_dir")
101 changes: 83 additions & 18 deletions tests/unit/test_tabular_format.py
@@ -22,14 +22,14 @@ def fxt_tabular_root():


@pytest.fixture()
def txf_electricity(fxt_tabular_root):
def fxt_electricity(fxt_tabular_root):
path = osp.join(fxt_tabular_root, "electricity.csv")
yield Dataset.import_from(path, "tabular")


@pytest.fixture()
def fxt_buddy_target():
yield ["breed_category", "pet_category"]
yield {"input": "length(m)", "output": ["breed_category", "pet_category"]}


@pytest.fixture()
@@ -41,13 +41,9 @@ def fxt_buddy(fxt_tabular_root, fxt_buddy_target):
@pytest.mark.new
class TabularImporterTest:
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_import_tabular_file(self, txf_electricity) -> None:
dataset: Type[Dataset] = txf_electricity
expected_categories = {
AnnotationType.tabular: TabularCategories.from_iterable(
[("class", str, {"UP", "DOWN"})]
)
}
def test_can_import_tabular_file(self, fxt_electricity) -> None:
dataset: Type[Dataset] = fxt_electricity
expected_categories = {AnnotationType.tabular: TabularCategories.from_iterable([])}
expected_subset = "electricity"

assert dataset.categories() == expected_categories
@@ -56,19 +52,16 @@ def test_can_import_tabular_file(self, txf_electricity) -> None:

for idx, item in enumerate(dataset):
assert idx == item.media.index
assert len(item.annotations) == 1
assert item.media.data()["class"] == item.annotations[0].values["class"]
assert len(item.annotations) == 0

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_import_tabular_folder(self, fxt_buddy) -> None:
dataset: Type[Dataset] = fxt_buddy
expected_categories = {
AnnotationType.tabular: TabularCategories.from_iterable(
[("breed_category", float), ("pet_category", int)]
)
}
expected_categories_keys = [("breed_category", float), ("pet_category", int)]

assert dataset.categories() == expected_categories
assert [
(cat.name, cat.dtype) for cat in dataset.categories()[AnnotationType.tabular].items
] == expected_categories_keys
assert len(dataset) == 200
assert set(dataset.subsets()) == {"train", "test"}

@@ -98,7 +91,7 @@ def test_can_detect_tabular(self, fxt_tabular_root: str) -> None:

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"fxt,target", [("txf_electricity", None), ("fxt_buddy", "fxt_buddy_target")]
"fxt,target", [("fxt_electricity", None), ("fxt_buddy", "fxt_buddy_target")]
)
def test_can_export_tabular(self, fxt: str, target, request) -> None:
dataset: Type[Dataset] = request.getfixturevalue(fxt)
@@ -109,3 +102,75 @@ def test_can_export_tabular(self, fxt: str, target, request) -> None:
dataset.export(test_dir, "tabular")
back_dataset = Dataset.import_from(test_dir, "tabular", target=target)
compare_datasets(TestCase(), dataset, back_dataset)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"target, expected_media_data_keys, expected_categories_keys",
[
(
{"input": "length(m)", "output": "breed_category"},
["length(m)", "breed_category"],
[("breed_category", float)],
),
(
{"input": "length", "output": "breed_category"},
["breed_category"],
[("breed_category", float)],
),
({"input": "length(m)", "output": "breed"}, ["length(m)"], []),
(
{"input": ["length(m)", "height(cm)"], "output": "breed_category"},
["length(m)", "height(cm)", "breed_category"],
[("breed_category", float)],
),
],
)
def test_target_check_in_table(
self, fxt_tabular_root, target, expected_media_data_keys, expected_categories_keys
) -> None:
path = osp.join(fxt_tabular_root, "adopt-a-buddy")
dataset = Dataset.import_from(path, "tabular", target=target)

assert (
list(next(iter(dataset.get_subset("train"))).media.data().keys())
== expected_media_data_keys
)
assert [
(cat.name, cat.dtype) for cat in dataset.categories()[AnnotationType.tabular].items
] == expected_categories_keys

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"target,expected_included_labels",
[
({"input": "length(m)", "output": "breed_category"}, [True]),
({"input": "length(m)", "output": ["color_type", "breed_category"]}, [False, True]),
],
)
def test_target_dtype(self, fxt_tabular_root, target, expected_included_labels) -> None:
path = osp.join(fxt_tabular_root, "adopt-a-buddy")
dataset = Dataset.import_from(path, "tabular", target=target)

included_labels_result = [
    len(cat.labels) > 0
    for cat in dataset.categories()[AnnotationType.tabular].items
]

assert included_labels_result == expected_included_labels

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"input_string,expected_result",
[
("input:date,output:class", {"input": ["date"], "output": ["class"]}),
(
"input:length(m),output:breed_category,pet_category",
{"input": ["length(m)"], "output": ["breed_category", "pet_category"]},
),
("input:age,color,output:size", {"input": ["age", "color"], "output": ["size"]}),
("input:height", {"input": ["height"]}),
("output:breed_category", {"output": ["breed_category"]}),
],
)
def test_string_to_dict(self, input_string, expected_result):
assert string_to_dict(input_string) == expected_result
