
Commit

fix targets_ for multilabel
dev-rinchin committed Dec 13, 2024
1 parent baf5be4 commit 67e2e76
Showing 4 changed files with 50 additions and 19 deletions.
4 changes: 3 additions & 1 deletion lightautoml/automl/base.py
@@ -195,7 +195,9 @@ def fit_predict(
self.classes_ = roles["target"]
elif self.reader.task.name == "reg":
self.classes_ = [roles["target"]]
else:
elif self.reader.task.name == "multilabel":
self.classes_ = roles["target"]
else: # multiclass
self.classes_ = (
sorted(self.reader.class_mapping, key=self.reader.class_mapping.get, reverse=False)
if self.reader.class_mapping
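The base.py hunk above adds an explicit multilabel branch so that classes_ keeps the list of target column names instead of falling through to the multiclass path. Below is a minimal illustrative sketch of the resulting branching, not the library code itself: the binary-style branch and the fallback when class_mapping is empty are elided in the diff and assumed here.

# Illustrative sketch of per-task classes_ resolution after this commit.
# Assumption: `target` is a list of column names for multilabel and a single
# column name otherwise; the un-mapped multiclass fallback is hypothetical.
def resolve_classes(task_name, target, class_mapping=None):
    if task_name == "reg":
        return [target]                        # single target column
    if task_name == "multilabel":
        return list(target)                    # one entry per target column
    if task_name == "multiclass":
        if class_mapping:
            # order class labels by their encoded index, as in the diff
            return sorted(class_mapping, key=class_mapping.get)
        return None                            # fallback elided in the diff
    return target                              # e.g. binary-style target role


print(resolve_classes("multilabel", ["target_0", "target_1"]))    # ['target_0', 'target_1']
print(resolve_classes("multiclass", None, {"cat": 1, "dog": 0}))  # ['dog', 'cat']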
14 changes: 7 additions & 7 deletions lightautoml/automl/blend.py
@@ -42,7 +42,7 @@ def outp_dim(self) -> int: # noqa: D102
return self._outp_dim

def fit_predict(
self, predictions: Sequence[LAMLDataset], pipes: Sequence[MLPipeline], class_mapping: dict
self, predictions: Sequence[LAMLDataset], pipes: Sequence[MLPipeline], targets_mapping: dict
) -> Tuple[LAMLDataset, Sequence[MLPipeline]]:
"""Wraps custom ``._fit_predict`` methods of blenders.
@@ -63,7 +63,7 @@ def fit_predict(
self._bypass = True
return predictions[0], pipes

return self._fit_predict(predictions, pipes, class_mapping)
return self._fit_predict(predictions, pipes, targets_mapping)

def _fit_predict(
self, predictions: Sequence[LAMLDataset], pipes: Sequence[MLPipeline]
@@ -134,7 +134,7 @@ def split_models(self, predictions: Sequence[LAMLDataset]) -> Tuple[Sequence[LAM

return splitted_preds, model_idx, pipe_idx

def _set_metadata(self, predictions: Sequence[LAMLDataset], pipes: Sequence[MLPipeline], class_mapping: dict):
def _set_metadata(self, predictions: Sequence[LAMLDataset], pipes: Sequence[MLPipeline], targets_mapping: dict):

pred0 = predictions[0]
pipe0 = pipes[0]
@@ -143,7 +143,7 @@ def _set_metadata(self, predictions: Sequence[LAMLDataset], pipes: Sequence[MLPi
self._outp_prob = pred0.task.name in ["binary", "multiclass"]
self._score = predictions[0].task.get_dataset_metric()

self._class_mapping = class_mapping
self._targets_mapping = targets_mapping

def score(self, dataset: LAMLDataset) -> float:
"""Score metric for blender.
@@ -323,7 +323,7 @@ def _get_weighted_pred(self, splitted_preds: Sequence[NumpyDataset], wts: Option
outp = splitted_preds[0].empty()
outp.set_data(
weighted_pred,
self._class_mapping if self._class_mapping else list(range(weighted_pred.shape[1])),
self._targets_mapping if self._targets_mapping else list(range(weighted_pred.shape[1])),
NumericRole(np.float32, prob=self._outp_prob),
)

@@ -438,7 +438,7 @@ def _prune_pipe(
return new_pipes, wts

def _fit_predict(
self, predictions: Sequence[NumpyDataset], pipes: Sequence[MLPipeline], class_mapping: dict
self, predictions: Sequence[NumpyDataset], pipes: Sequence[MLPipeline], targets_mapping: dict
) -> Tuple[NumpyDataset, Sequence[MLPipeline]]:
"""Perform coordinate descent.
@@ -453,7 +453,7 @@
Dataset and MLPipeline.
"""
self._set_metadata(predictions, pipes, class_mapping)
self._set_metadata(predictions, pipes, targets_mapping)
splitted_preds, _, pipe_idx = cast(List[NumpyDataset], self.split_models(predictions))

wts = self._optimize(splitted_preds)
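The blend.py hunks rename the blender's class_mapping parameter and attribute to targets_mapping, because for multilabel tasks the blended output columns are target names rather than class labels. A simplified stand-in (not the actual WeightedBlender) for how the weighted blend names its output columns under this scheme:

import numpy as np


# Simplified stand-in for the weighted blend: combine predictions and take the
# output column names from targets_mapping, falling back to positional indices
# (mirrors `self._targets_mapping if self._targets_mapping else list(range(...))`).
def weighted_blend(preds, weights, targets_mapping=None):
    blended = sum(w * p for w, p in zip(weights, preds))
    features = list(targets_mapping) if targets_mapping else list(range(blended.shape[1]))
    return blended, features


p1 = np.array([[0.2, 0.8], [0.6, 0.4]])
p2 = np.array([[0.4, 0.6], [0.5, 0.5]])
blended, cols = weighted_blend([p1, p2], [0.7, 0.3], targets_mapping=["target_0", "target_1"])
print(cols)     # ['target_0', 'target_1']
print(blended)  # weighted average of the two prediction matrices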
5 changes: 4 additions & 1 deletion scripts/experiments/run.py
@@ -19,6 +19,7 @@ def main( # noqa D103
dataset_project: str = None,
dataset_partial_name: str = None,
n_datasets: int = -1,
save_model: bool = False,
):
if dataset_name is not None:
dataset_list = [dataset_name]
@@ -66,7 +67,7 @@ def main( # noqa D103
tags = f"--tags {' '.join(tags)}" if len(tags) else ""

os.system(
f'clearml-task --project {project} --name {curr_task_name} --script scripts/experiments/run_tabular.py --queue {queue} {tags} --docker {image} --docker_args "--cpus={cpu_limit} --memory={memory_limit}g" --args dataset={dataset_name}'
f'clearml-task --project {project} --name {curr_task_name} --script scripts/experiments/run_tabular.py --queue {queue} {tags} --docker {image} --docker_args "--cpus={cpu_limit} --memory={memory_limit}g" --args dataset={dataset_name} save_model={save_model}'
)


@@ -84,6 +85,7 @@ def main( # noqa D103
parser.add_argument("--image", type=str, help="docker image", default="for_clearml:latest")
parser.add_argument("--n_datasets", type=int, help="number of datasets", default=-1)
parser.add_argument("--min_num_obs", type=int, help="min number of samples", default=None)
parser.add_argument("--save_model", action="store_true")
args = parser.parse_args()

main(
@@ -99,4 +101,5 @@ def main( # noqa D103
image=args.image,
n_datasets=args.n_datasets,
min_num_obs=args.min_num_obs,
save_model=args.save_model,
)
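run.py now simply forwards the new --save_model flag into the --args string of the clearml-task call. A rough sketch of how that command string is assembled, with placeholder project/queue/dataset values; the real script also passes tags, a docker image, and CPU/memory limits:

# Rough sketch of the clearml-task command assembly including save_model.
# The argument values below are placeholders for illustration only.
def build_task_command(project, task_name, queue, dataset_name, save_model):
    return (
        f"clearml-task --project {project} --name {task_name} "
        f"--script scripts/experiments/run_tabular.py --queue {queue} "
        f"--args dataset={dataset_name} save_model={save_model}"
    )


print(build_task_command("junk", "test_adult", "cpu_queue", "adult", save_model=True))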
46 changes: 36 additions & 10 deletions scripts/experiments/run_tabular.py
@@ -22,13 +22,13 @@
RANDOM_STATE = 1234


def map_to_corect_order_of_classes(values, classes_): # noqa D103
class_mapping = {n: x for (x, n) in enumerate(classes_)}
def map_to_corect_order_of_classes(values, targets_order): # noqa D103
class_mapping = {n: x for (x, n) in enumerate(targets_order)}
mapped = list(map(class_mapping.get, values))
return mapped


def main(dataset_name: str, cpu_limit: int, memory_limit: int): # noqa D103
def main(dataset_name: str, cpu_limit: int, memory_limit: int, save_model: bool): # noqa D103
cml_task = clearml.Task.get_task(clearml.config.get_remote_task_id())
logger = cml_task.get_logger()

@@ -48,7 +48,7 @@ def main(dataset_name: str, cpu_limit: int, memory_limit: int): # noqa D103
task=task,
cpu_limit=cpu_limit,
memory_limit=memory_limit,
timeout=10 * 60 * 60,
timeout=15 * 60,
general_params={
# "use_algos": [["mlp"]]
}, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
@@ -64,10 +64,25 @@ def main(dataset_name: str, cpu_limit: int, memory_limit: int): # noqa D103

cml_task.connect(automl)

target_name = test.columns[-1]
if task_type == "multilabel":
target_name = [x for x in test.columns if x.startswith("target")]
else:
target_name = test.columns[-1]

kwargs = {}
if save_model:
kwargs["path_to_save"] = "model"

with Timer() as timer_training:
oof_predictions = automl.fit_predict(train, roles={"target": target_name}, verbose=10)
oof_predictions = automl.fit_predict(train, roles={"target": target_name}, verbose=10, **kwargs)

# add and upload local file artifact
cml_task.upload_artifact(
name="model.joblib",
artifact_object=os.path.join(
"model.joblib",
),
)

with Timer() as timer_predict:
test_predictions = automl.predict(test)
@@ -84,17 +99,25 @@ def main(dataset_name: str, cpu_limit: int, memory_limit: int): # noqa D103
except:
# Some datasets can have dtype=float of target,
# so we must map this target for correct log_loss calculation (if we didn't calculate it in the try block)
# and this mapping must be in the correct order so we extract automl.classes_ and map values
y_true = map_to_corect_order_of_classes(values=train[target_name].values[not_nan], classes_=automl.classes_)
# and this mapping must be in the correct order so we extract automl.targets_ and map values
y_true = map_to_corect_order_of_classes(
values=train[target_name].values[not_nan], targets_order=oof_predictions.features
)
metric_oof = log_loss(y_true, oof_predictions.data[not_nan, :])

y_true = map_to_corect_order_of_classes(values=test[target_name], classes_=automl.classes_)
y_true = map_to_corect_order_of_classes(values=test[target_name], targets_order=automl.targets_)
metric_ho = log_loss(y_true, test_predictions.data)

elif task_type == "reg":
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data[:, 0])
metric_ho = task.metric_func(test[target_name].values, test_predictions.data[:, 0])

elif task_type == "multilabel":
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data)
metric_ho = task.metric_func(test[target_name].values, test_predictions.data)
else:
raise ValueError("Bad task type.")

print(f"Score for out-of-fold predictions: {metric_oof}")
print(f"Score for hold-out: {metric_ho}")
print(f"Train duration: {timer_training.duration}")
@@ -114,6 +137,9 @@ def main(dataset_name: str, cpu_limit: int, memory_limit: int): # noqa D103
parser.add_argument("--dataset", type=str, help="dataset name or id", default="sampled_app_train")
parser.add_argument("--cpu_limit", type=int, help="", default=8)
parser.add_argument("--memory_limit", type=int, help="", default=16)
parser.add_argument("--save_model", action="store_true")
args = parser.parse_args()

main(dataset_name=args.dataset, cpu_limit=args.cpu_limit, memory_limit=args.memory_limit)
main(
dataset_name=args.dataset, cpu_limit=args.cpu_limit, memory_limit=args.memory_limit, save_model=args.save_model
)
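The main behavioural changes in run_tabular.py: target columns for multilabel datasets are detected by a "target" column-name prefix, the fitted model can be saved and uploaded as a ClearML artifact, and multiclass targets are re-indexed against the prediction column order (oof_predictions.features / automl.targets_) rather than automl.classes_ before log_loss is computed. A self-contained sketch of that re-indexing step with made-up data:

from sklearn.metrics import log_loss


# Simplified version of map_to_corect_order_of_classes: re-index raw labels so
# that label i matches prediction column i (data below is made up for illustration).
def map_to_order(values, targets_order):
    mapping = {label: idx for idx, label in enumerate(targets_order)}
    return [mapping[v] for v in values]


targets_order = [0.0, 1.0, 2.0]  # e.g. float-typed labels, in prediction-column order
y_raw = [2.0, 0.0, 1.0, 2.0]
proba = [
    [0.1, 0.2, 0.7],
    [0.8, 0.1, 0.1],
    [0.2, 0.6, 0.2],
    [0.1, 0.3, 0.6],
]
print(log_loss(map_to_order(y_raw, targets_order), proba))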
