Get target information for tabular dataset (#1471)
- Set `target` as a dictionary specifying the input and output columns, e.g. `{"input": "question", "output": ["rating", "sentiment"]}`
- If `target` is `None`, include all columns
- Expose the input target columns as media and the output target columns as annotations
- Use `CategoricalDtype` for columns whose dtype is object but which can serve as labels. For this, we should define a uniqueness threshold.
- Fix targets in the unit tests and CLI tests
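
For illustration, a minimal sketch of the new API; the dataset path is a placeholder and the column names are the ones from the example above:

```python
import datumaro as dm

# Hypothetical tabular dataset with a "question" input column and
# "rating"/"sentiment" output columns (placeholders, not a bundled dataset).
dataset = dm.Dataset.import_from(
    "<path/to/tabular/dataset>",
    "tabular",
    target={"input": "question", "output": ["rating", "sentiment"]},
)

for item in dataset:
    inputs = item.media.data()  # input target columns, exposed as media
    outputs = item.annotations  # output target columns, as Tabular annotations
```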
sooahleex authored Apr 25, 2024
1 parent c707924 commit f9a25f5
Showing 6 changed files with 160 additions and 49 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Enhancements
- Fix ambiguous COCO format detector
(<https://github.com/openvinotoolkit/datumaro/pull/1442>)
- Get target information for tabular dataset
(<https://github.com/openvinotoolkit/datumaro/pull/1471>)

### Bug fixes

8 changes: 4 additions & 4 deletions docs/source/docs/data-formats/formats/tabular.md
@@ -66,14 +66,14 @@ which can be specified by the user when importing the dataset as shown below.

```bash
datum project create
datum project import --format tabular <path/to/buddy/dataset> -- --target breed_category,pet_category
datum project import --format tabular <path/to/electricity/dataset> -- --target class
datum project import --format tabular <path/to/buddy/dataset> -- --target input:length(m),output:breed_category,pet_category
datum project import --format tabular <path/to/electricity/dataset>
```

```python
import datumaro as dm
dataset = dm.Dataset.import_from('<path/to/buddy/dataset>', 'tabular', target=["breed_category", "pet_category"])
dataset = dm.Dataset.import_from('<path/to/electricity/dataset>', 'tabular', target="class")
dataset = dm.Dataset.import_from('<path/to/buddy/dataset>', 'tabular', target={"input":"length(m)", "output":["breed_category", "pet_category"]})
dataset = dm.Dataset.import_from('<path/to/electricity/dataset>', 'tabular')
```

As shown, the target is a dictionary with `input` and `output` keys (written as `input:`/`output:` pairs on the command line); each key takes a single column name or a list of columns, and when the target is omitted, every column is imported as an input.
7 changes: 7 additions & 0 deletions src/datumaro/components/media.py
@@ -1234,6 +1234,9 @@ def columns(self) -> List[str]:
def dtype(self, column: str) -> Optional[Type[TableDtype]]:
    """Returns native python type for a given column"""
    numpy_type = self.data.dtypes[column]
    if numpy_type == object and self.data[column].nunique() / self.shape[0] < 0.1:
        # TODO: make this uniqueness threshold configurable.
        # Convert to CategoricalDtype for efficient storage and categorical analysis
        return pd.api.types.CategoricalDtype()
    if numpy_type == object:
        return str
    else:
@@ -1299,6 +1302,10 @@ def data(self) -> Optional[pd.DataFrame]:
"""Table data in pandas DataFrame format"""
return self.__data

def select(self, columns: List[str]):
    """Restrict the table to the given columns and update the cached shape."""
    self.__data = self.__data[columns]
    self._shape = self.__data.shape


class TableFromDataFrame(FromDataMixin, Table):
def __init__(
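For illustration, a standalone pandas sketch of the heuristic added to `dtype` above: an `object` column whose unique-value ratio is below the hard-coded 0.1 threshold is treated as categorical (the data here is made up):

```python
import pandas as pd

# 1000 rows but only 2 distinct values -> unique ratio 0.002.
df = pd.DataFrame({"class": ["UP", "DOWN", "UP", "DOWN"] * 250})

numpy_type = df.dtypes["class"]
unique_ratio = df["class"].nunique() / df.shape[0]

# Mirrors the check in Table.dtype: object dtype + low unique ratio -> categorical.
if numpy_type == object and unique_ratio < 0.1:
    dtype = pd.api.types.CategoricalDtype()
else:
    dtype = str

print(dtype)  # category -> the column can be used as a label
```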
86 changes: 61 additions & 25 deletions src/datumaro/plugins/data_formats/tabular.py
@@ -7,6 +7,8 @@
import os.path as osp
from typing import Dict, List, Optional, Tuple, Type, Union

import pandas as pd

from datumaro.components.annotation import AnnotationType, Categories, Tabular, TabularCategories
from datumaro.components.dataset_base import DatasetBase, DatasetItem
from datumaro.components.errors import MediaTypeError
@@ -62,16 +64,16 @@ def __init__(
def _parse(
self,
paths: List[str],
target: Optional[Union[str, List[str]]] = None,
target: Optional[Dict[str, List[str]]] = None,
dtype: Optional[Dict[str, Type[TableDtype]]] = None,
) -> Tuple[List[DatasetItem], Dict[AnnotationType, Categories]]:
"""
parse tabular files. Each file is regarded as a subset.
Args:
paths (list(str)) : A list of paths to tabular data files (csv files).
target (optional, str or list) : Target column or list of target columns.
If this is not specified (None), the last column is regarded as a target column.
target (optional, dict(str, str or list)) : Dictionary with "input" and "output" keys, each mapping to a target column or a list of target columns.
If this is not specified (None), all columns are regarded as target columns.
In case of a dataset with no output targets, give an empty list as the "output" value.
dtype (optional, dict(str,str)) : Dictionary of column name -> type str ('str', 'int', or 'float').
This can be used when automatic type inference fails.
@@ -84,45 +86,55 @@ def _parse(
items: List[DatasetItem] = []
categories: TabularCategories = TabularCategories()

if target is not None:
if "input" not in target or "output" not in target:
raise TypeError('Target should have both "input" and "output"')

for path in paths:
table = Table.from_csv(path, dtype=dtype)

targets: List[str] = []
targets_ann: List[str] = []
if target is None:
targets.append(table.columns[-1]) # last column
elif isinstance(target, str):
if target in table.columns: # add valid column name only
targets.append(target)
elif isinstance(target, list): # add valid column names only
for t in target:
if t in table.columns:
targets.append(t)
targets.extend(table.columns) # add all columns
else:
# add only valid input and output column names
if isinstance(target.get("input"), str) and target["input"] in table.columns:
targets.append(target["input"])
elif isinstance(target.get("input"), list):
targets.extend(col for col in target["input"] if col in table.columns)
if isinstance(target.get("output"), str) and target["output"] in table.columns:
targets_ann.append(target["output"])
elif isinstance(target.get("output"), list):
targets_ann.extend(col for col in target["output"] if col in table.columns)
targets = targets + targets_ann

# set categories
for target in targets:
_, category = categories.find(target)
target_dtype = table.dtype(target)
if target_dtype == str:
labels = set(table.features(target, unique=True))
for target_ in targets_ann:
_, category = categories.find(target_)
target_dtype = table.dtype(target_)
if target_dtype in [int, float, pd.api.types.CategoricalDtype()]:
# 'int' can be categorical, but we don't know this unless the user provides that information.
labels = set(table.features(target_, unique=True))
if category is None:
categories.add(target, target_dtype, labels)
categories.add(target_, target_dtype, labels)
else: # update labels if they are different.
category.labels.union(labels)
elif target_dtype in [int, float]:
# 'int' can be categorical, but we don't know this unless user gives information.
elif target_dtype is str:
if category is None:
categories.add(target, target_dtype)
categories.add(target_, target_dtype)
else:
raise TypeError(
f"Unsupported type '{target_dtype}' for target column '{target}'."
f"Unsupported type '{target_dtype}' for target column '{target_}'."
)

# load annotations
subset = osp.splitext(osp.basename(path))[0]
row: TableRow
table.select(targets)
for row in table: # type: TableRow
id = f"{row.index}@{subset}"
ann = [Tabular(values=row.data(targets))] if targets else None
ann = [Tabular(values=row.data(targets_ann))] if targets_ann else None
item = DatasetItem(
id=id,
subset=subset,
@@ -140,6 +152,29 @@ def __iter__(self):
yield from self._items


def string_to_dict(input_string):
    """Parse a CLI string like "input:col1,col2,output:col3" into
    {"input": ["col1", "col2"], "output": ["col3"]}."""
    pairs = input_string.split(",")
    result = {}
    key = None

    for pair in pairs:
        split_pair = pair.split(":")
        if len(split_pair) == 2:
            key, value = split_pair
            # Only "input" and "output" are recognized keys; others are ignored.
            if key in ("input", "output"):
                result.setdefault(key, []).append(value)
        elif key in result:
            # A bare value with no "key:" prefix belongs to the most recent key.
            result[key].extend(split_pair)

    return result
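
A couple of round trips through this parser, mirroring the unit tests added below:

```python
string_to_dict("input:date,output:class")
# -> {"input": ["date"], "output": ["class"]}

# A bare value with no "key:" prefix attaches to the most recently seen key:
string_to_dict("input:age,color,output:size")
# -> {"input": ["age", "color"], "output": ["size"]}
```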


class TabularDataImporter(Importer):
"""
Import a tabular dataset.
@@ -153,9 +188,10 @@ def build_cmdline_parser(cls, **kwargs):
parser = super().build_cmdline_parser(**kwargs)
parser.add_argument(
"--target",
type=lambda x: x.split(","),
help="Target column or list of target columns. (ex. 'class', 'class,breed') (default:None) "
"If this is not specified (None), the last column is regarded as a target column."
type=string_to_dict,
help="Target columns for input and output, given as 'input:'/'output:' pairs. "
"(ex. 'input:date,output:class', 'input:data,output:class,breed') (default: None) "
"If this is not specified (None), all columns are regarded as target columns. "
"In case of a dataset with no output targets, give an empty list as the 'output' value.",
)
parser.add_argument(
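Putting the importer options together, here is a hypothetical call that combines the new `target` dictionary with an explicit `dtype` override. That `import_from` forwards both keyword arguments to the tabular importer is an assumption based on the `_parse` signature above, and the electricity column names follow the docs examples:

```python
import datumaro as dm

# Assumption: "date" and "class" are columns of the electricity CSV, and dtype
# is forwarded to Table.from_csv for when automatic type inference fails.
dataset = dm.Dataset.import_from(
    "<path/to/electricity/dataset>",
    "tabular",
    target={"input": "date", "output": ["class"]},
    dtype={"class": str},
)
```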
5 changes: 3 additions & 2 deletions tests/integration/cli/test_tabular_format.py
@@ -38,7 +38,7 @@ def fxt_buddy_path(fxt_tabular_root):

@pytest.fixture()
def fxt_buddy_target():
yield ("breed_category", "pet_category")
yield {"input": "length(m)", "output": ["breed_category", "pet_category"]}


@pytest.fixture()
@@ -76,12 +76,13 @@ def test_can_import_and_export_tabular_dataset(
dataset = request.getfixturevalue(fxt_dataset)
path = request.getfixturevalue(fxt_path)
target = request.getfixturevalue(fxt_target) if isinstance(fxt_target, str) else None
string_target = "input:length(m),output:breed_category,pet_category"

with TestDir() as test_dir:
run(helper_tc, "project", "create", "-o", test_dir)
args = ["project", "import", "-p", test_dir, "-f", "tabular", path]
if target:
args.extend(["--", "--target", ",".join(target)])
args.extend(["--", "--target", string_target])
run(helper_tc, *args)

export_dir = osp.join(test_dir, "export_dir")
101 changes: 83 additions & 18 deletions tests/unit/test_tabular_format.py
@@ -22,14 +22,14 @@ def fxt_tabular_root():


@pytest.fixture()
def txf_electricity(fxt_tabular_root):
def fxt_electricity(fxt_tabular_root):
path = osp.join(fxt_tabular_root, "electricity.csv")
yield Dataset.import_from(path, "tabular")


@pytest.fixture()
def fxt_buddy_target():
yield ["breed_category", "pet_category"]
yield {"input": "length(m)", "output": ["breed_category", "pet_category"]}


@pytest.fixture()
@@ -41,13 +41,9 @@ def fxt_buddy(fxt_tabular_root, fxt_buddy_target):
@pytest.mark.new
class TabularImporterTest:
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_import_tabular_file(self, txf_electricity) -> None:
dataset: Type[Dataset] = txf_electricity
expected_categories = {
AnnotationType.tabular: TabularCategories.from_iterable(
[("class", str, {"UP", "DOWN"})]
)
}
def test_can_import_tabular_file(self, fxt_electricity) -> None:
dataset: Type[Dataset] = fxt_electricity
expected_categories = {AnnotationType.tabular: TabularCategories.from_iterable([])}
expected_subset = "electricity"

assert dataset.categories() == expected_categories
@@ -56,19 +52,16 @@ def test_can_import_tabular_file(self, txf_electricity) -> None:

for idx, item in enumerate(dataset):
assert idx == item.media.index
assert len(item.annotations) == 1
assert item.media.data()["class"] == item.annotations[0].values["class"]
assert len(item.annotations) == 0

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_import_tabular_folder(self, fxt_buddy) -> None:
dataset: Type[Dataset] = fxt_buddy
expected_categories = {
AnnotationType.tabular: TabularCategories.from_iterable(
[("breed_category", float), ("pet_category", int)]
)
}
expected_categories_keys = [("breed_category", float), ("pet_category", int)]

assert dataset.categories() == expected_categories
assert [
(cat.name, cat.dtype) for cat in dataset.categories()[AnnotationType.tabular].items
] == expected_categories_keys
assert len(dataset) == 200
assert set(dataset.subsets()) == {"train", "test"}

@@ -98,7 +91,7 @@ def test_can_detect_tabular(self, fxt_tabular_root: str) -> None:

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"fxt,target", [("txf_electricity", None), ("fxt_buddy", "fxt_buddy_target")]
"fxt,target", [("fxt_electricity", None), ("fxt_buddy", "fxt_buddy_target")]
)
def test_can_export_tabular(self, fxt: str, target, request) -> None:
dataset: Type[Dataset] = request.getfixturevalue(fxt)
@@ -109,3 +102,75 @@ def test_can_export_tabular(self, fxt: str, target, request) -> None:
dataset.export(test_dir, "tabular")
back_dataset = Dataset.import_from(test_dir, "tabular", target=target)
compare_datasets(TestCase(), dataset, back_dataset)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"target, expected_media_data_keys, expected_categories_keys",
[
(
{"input": "length(m)", "output": "breed_category"},
["length(m)", "breed_category"],
[("breed_category", float)],
),
(
{"input": "length", "output": "breed_category"},
["breed_category"],
[("breed_category", float)],
),
({"input": "length(m)", "output": "breed"}, ["length(m)"], []),
(
{"input": ["length(m)", "height(cm)"], "output": "breed_category"},
["length(m)", "height(cm)", "breed_category"],
[("breed_category", float)],
),
],
)
def test_target_check_in_table(
self, fxt_tabular_root, target, expected_media_data_keys, expected_categories_keys
) -> None:
path = osp.join(fxt_tabular_root, "adopt-a-buddy")
dataset = Dataset.import_from(path, "tabular", target=target)

assert (
list(next(iter(dataset.get_subset("train"))).media.data().keys())
== expected_media_data_keys
)
assert [
(cat.name, cat.dtype) for cat in dataset.categories()[AnnotationType.tabular].items
] == expected_categories_keys

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"target,expected_included_labels",
[
({"input": "length(m)", "output": "breed_category"}, [True]),
({"input": "length(m)", "output": ["color_type", "breed_category"]}, [False, True]),
],
)
def test_target_dtype(self, fxt_tabular_root, target, expected_included_labels) -> None:
path = osp.join(fxt_tabular_root, "adopt-a-buddy")
dataset = Dataset.import_from(path, "tabular", target=target)

included_labels_result = [
    len(cat.labels) > 0
    for cat in dataset.categories()[AnnotationType.tabular].items
]

assert included_labels_result == expected_included_labels

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
@pytest.mark.parametrize(
"input_string,expected_result",
[
("input:date,output:class", {"input": ["date"], "output": ["class"]}),
(
"input:length(m),output:breed_category,pet_category",
{"input": ["length(m)"], "output": ["breed_category", "pet_category"]},
),
("input:age,color,output:size", {"input": ["age", "color"], "output": ["size"]}),
("input:height", {"input": ["height"]}),
("output:breed_category", {"output": ["breed_category"]}),
],
)
def test_string_to_dict(self, input_string, expected_result):
assert string_to_dict(input_string) == expected_result
