Merge branch 'experiments/clearml_241203' into 'master'
Add exp dir

See merge request ai-lab-pmo/mltools/automl/LightAutoML!31
dev-rinchin committed Dec 4, 2024
2 parents ed5d7ba + e198c9e commit 6209322
Showing 12 changed files with 246 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .gitlab/.gitlab-ci.yml
@@ -25,7 +25,7 @@ stages:
- default
- all_pythons
- docs


default:
stage: default
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
- id: check-yaml

- repo: https://github.com/PyCQA/flake8
rev: 3.8.4
rev: 6.1.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
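The flake8 hook is bumped from 3.8.4 to 6.1.0. To check that the repository still passes the newer hook locally, a typical invocation (assuming pre-commit is installed) would be:

pre-commit run flake8 --all-files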
2 changes: 1 addition & 1 deletion lightautoml/addons/hypex/algorithms/faiss_matcher.py
@@ -122,7 +122,7 @@ def __init__(

if self.info_col is not None:
self.columns_del = self.columns_del + [x for x in self.info_col if x in self.df.columns]
self.outcomes = outcomes if type(outcomes) == list else [outcomes]
self.outcomes = outcomes if isinstance(outcomes, list) else [outcomes]
self.treatment = treatment

if features is None:
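The same cleanup recurs in matcher.py, dataset/utils.py, optuna.py, and reader/base.py below: exact-type comparisons (type(x) == T) are replaced with isinstance(x, T). A minimal sketch of the difference, using a hypothetical list subclass purely for illustration; isinstance also accepts subclasses, which is what these checks intend, and it avoids flake8's E721 warning about type comparisons:

class OutcomeList(list):  # hypothetical subclass, for illustration only
    pass

outcomes = OutcomeList(["target"])
print(type(outcomes) == list)      # False: exact-type comparison rejects subclasses
print(isinstance(outcomes, list))  # True: isinstance accepts subclasses as well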
2 changes: 1 addition & 1 deletion lightautoml/addons/hypex/matcher.py
@@ -170,7 +170,7 @@ def __init__(
self.input_data = input_data
if outcome is None:
outcome = list()
self.outcomes = outcome if type(outcome) == list else [outcome]
self.outcomes = outcome if isinstance(outcome, list) else [outcome]
self.treatment = treatment
self.group_col = group_col
self.info_col = info_col
2 changes: 1 addition & 1 deletion lightautoml/dataset/utils.py
@@ -116,7 +116,7 @@ def numpy_or_pandas_and_seq_concat(
assert len(datasets) == 2, "should be 1 sequential and 1 plain dataset"
# get 1 numpy / pandas dataset
for n, dataset in enumerate(datasets):
if type(dataset) == SeqNumpyPandasDataset:
if isinstance(dataset, SeqNumpyPandasDataset):
seq_dataset = dataset
else:
plain_dataset = dataset
4 changes: 2 additions & 2 deletions lightautoml/ml_algo/tuning/optuna.py
@@ -153,7 +153,7 @@ def fit(
ml_algo = deepcopy(ml_algo)

flg_new_iterator = False
if self._fit_on_holdout and type(train_valid_iterator) != HoldoutIterator:
if self._fit_on_holdout and not isinstance(train_valid_iterator, HoldoutIterator):
train_valid_iterator = train_valid_iterator.convert_to_holdout_iterator()
flg_new_iterator = True

@@ -381,7 +381,7 @@ def fit(
ml_algo = deepcopy(ml_algo)

flg_new_iterator = False
if self._fit_on_holdout and type(train_valid_iterator) != HoldoutIterator:
if self._fit_on_holdout and not isinstance(train_valid_iterator, HoldoutIterator):
train_valid_iterator = train_valid_iterator.convert_to_holdout_iterator()
flg_new_iterator = True

10 changes: 5 additions & 5 deletions lightautoml/reader/base.py
@@ -288,7 +288,7 @@ def fit_read(

for feat in parsed_roles:
r = parsed_roles[feat]
if type(r) == str:
if isinstance(r, str):
# get default role params if defined
r = self._get_default_role_from_str(r)

@@ -317,7 +317,7 @@ def fit_read(
assert "target" in kwargs, "Target should be defined"
if self.task.name in ["multi:reg", "multilabel"]:
kwargs["target"] = train_data.loc[:, kwargs["target"]]
self.target = kwargs["target"].name if type(kwargs["target"]) == pd.Series else kwargs["target"].columns
self.target = kwargs["target"].name if isinstance(kwargs["target"], pd.Series) else kwargs["target"].columns
kwargs["target"] = self._create_target(kwargs["target"])

# TODO: Check target and task
@@ -767,7 +767,7 @@ def parse_seq(self, seq_dataset, plain_data, dataset_name, parsed_roles, roles):

if feat in parsed_roles:
r = parsed_roles[feat]
if type(r) == str:
if isinstance(r, str):
# get default role params if defined
r = self._get_default_role_from_str(r)
# handle datetimes
@@ -895,7 +895,7 @@ def fit_read(

for feat in parsed_roles:
r = parsed_roles[feat]
if type(r) == str:
if isinstance(r, str):
# get default role params if defined
r = self._get_default_role_from_str(r)

@@ -932,7 +932,7 @@ def fit_read(
if isinstance(kwargs["target"], list):
kwargs["target"] = plain_data.loc[:, kwargs["target"]]

self.target = kwargs["target"].name if type(kwargs["target"]) == pd.Series else kwargs["target"].columns
self.target = kwargs["target"].name if isinstance(kwargs["target"], pd.Series) else kwargs["target"].columns
kwargs["target"] = self._create_target(kwargs["target"])

# TODO: Check target and task
8 changes: 4 additions & 4 deletions lightautoml/utils/logging.py
@@ -120,7 +120,7 @@ def verbosity_to_loglevel(verbosity: int, extended=True):

def get_stdout_level():
for handler in _logger.handlers:
if type(handler) == logging.StreamHandler:
if isinstance(handler, logging.StreamHandler):
return handler.level
return _logger.getEffectiveLevel()

@@ -131,7 +131,7 @@ def set_stdout_level(level):
has_console_handler = False

for handler in _logger.handlers:
if type(handler) == logging.StreamHandler:
if isinstance(handler, logging.StreamHandler):
if handler.level == level:
has_console_handler = True
else:
@@ -150,7 +150,7 @@ def add_filehandler(filename: str, level=logging.DEBUG):
has_file_handler = False

for handler in _logger.handlers:
if type(handler) == logging.FileHandler:
if isinstance(handler, logging.FileHandler):
if handler.baseFilename == filename or handler.baseFilename == os.path.join(os.getcwd(), filename):
has_file_handler = True
else:
@@ -172,7 +172,7 @@ def add_filehandler(filename: str, level=logging.DEBUG):
_logger.addHandler(file_handler)
else:
for handler in _logger.handlers:
if type(handler) == logging.FileHandler:
if isinstance(handler, logging.FileHandler):
_logger.handlers.remove(handler)


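One side note on the logging.py hunks: logging.FileHandler is a subclass of logging.StreamHandler, so the new isinstance checks in get_stdout_level and set_stdout_level will also match file handlers, which the old exact-type comparison did not. A small illustrative sketch:

import logging

fh = logging.FileHandler("example.log")
print(type(fh) == logging.StreamHandler)      # False: the old exact-type check ignored FileHandler
print(isinstance(fh, logging.StreamHandler))  # True: FileHandler derives from StreamHandler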
66 changes: 66 additions & 0 deletions scripts/experiments/run.py
@@ -0,0 +1,66 @@
"""Run tabular automl using ClearML logging."""
import argparse
import os
import clearml


def main( # noqa D103
dataset_name: str,
queue: str,
project: str,
cpu_limit: int,
memory_limit: int,
dataset_project: str = None,
dataset_partial_name: str = None,
tags=None,
):

if (dataset_project is not None) or (dataset_partial_name is not None) or (tags is not None):
tags = tags if isinstance(tags, list) else [tags]

dataset_list = clearml.Dataset.list_datasets(
dataset_project=dataset_project,
partial_name=dataset_partial_name,
tags=tags,
ids=None,
only_completed=True,
recursive_project_search=True,
include_archived=False,
)
print(dataset_list[0])
dataset_list = list(set([x["name"] for x in dataset_list]))

else:
dataset_list = [clearml.Dataset.get(dataset_id=None, dataset_name=dataset_name)]

print(f"Running {len(dataset_list)} datasets...")
print(dataset_list)

for dataset in dataset_list:
os.system(
f'clearml-task --project {project} --name {dataset} --script scripts/experiments/run_tabular.py --queue {queue} --docker for_clearml:latest --docker_args "--cpus={cpu_limit} --memory={memory_limit}g" --args dataset={dataset}'
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="")
parser.add_argument("--dataset", type=str, help="dataset name or id", default="sampled_app_train")
parser.add_argument("--dataset_project", type=str, help="dataset_project", default=None)
parser.add_argument("--dataset_partial_name", type=str, help="dataset_partial_name", default=None)
parser.add_argument("--tags", type=str, help="tags", default=None)
parser.add_argument("--cpu_limit", type=int, help="", default=8)
parser.add_argument("--memory_limit", type=int, help="", default=16)
parser.add_argument("--queue", type=str, help="", default="cpu_queue")
parser.add_argument("--project", type=str, help="", default="junk")
args = parser.parse_args()

main(
dataset_name=args.dataset,
cpu_limit=args.cpu_limit,
memory_limit=args.memory_limit,
dataset_partial_name=args.dataset_partial_name,
dataset_project=args.dataset_project,
tags=args.tags,
queue=args.queue,
project=args.project,
)
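With the argparse defaults above, a local launch of this script would look roughly like the following (queue and project are simply the defaults and would usually be overridden):

python scripts/experiments/run.py --dataset sampled_app_train --queue cpu_queue --project junk --cpu_limit 8 --memory_limit 16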
95 changes: 95 additions & 0 deletions scripts/experiments/run_tabular.py
@@ -0,0 +1,95 @@
"""Run tabular automl using ClearML logging."""

from utils import Timer
from utils import install_lightautoml


install_lightautoml()

import argparse
import os

import clearml
import numpy as np
import pandas as pd

from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task


def main(dataset_name: str, cpu_limit: int, memory_limit: int): # noqa D103
cml_task = clearml.Task.get_task(clearml.config.get_remote_task_id())
logger = cml_task.get_logger()

dataset = clearml.Dataset.get(dataset_id=None, dataset_name=dataset_name)
dataset_local_path = dataset.get_local_copy()

with open(os.path.join(dataset_local_path, "task_type.txt"), "r") as f:
task_type = f.readline()
train = pd.read_csv(os.path.join(dataset_local_path, "train.csv"))
test = pd.read_csv(os.path.join(dataset_local_path, "test.csv"))

task = Task(task_type)

# =================================== automl config:
automl = TabularAutoML(
task=task,
cpu_limit=cpu_limit,
memory_limit=memory_limit,
timeout=10 * 60 * 60,
# general_params={
# "use_algos": [["mlp"]]
# }, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
# nn_params={"n_epochs": 10, "bs": 512, "num_workers": 0, "path_to_save": None, "freeze_defaults": True},
# nn_pipeline_params={"use_qnt": True, "use_te": False},
)
# ===================================

cml_task.connect(automl)

target_name = test.columns[-1]

with Timer() as timer_training:
oof_predictions = automl.fit_predict(train, roles={"target": target_name}, verbose=10)

with Timer() as timer_predict:
test_predictions = automl.predict(test)

if task_type == "binary":
metric_oof = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0])
metric_ho = roc_auc_score(test[target_name].values, test_predictions.data[:, 0])

elif task_type == "multiclass":
not_nan = np.any(~np.isnan(oof_predictions.data), axis=1)
metric_oof = log_loss(train[target_name].values[not_nan], oof_predictions.data[not_nan, :])
metric_ho = log_loss(test[target_name], test_predictions.data)

elif task_type == "reg":
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data[:, 0])
metric_ho = task.metric_func(test[target_name].values, test_predictions.data[:, 0])

print(f"Score for out-of-fold predictions: {metric_oof}")
print(f"Score for hold-out: {metric_ho}")
print(f"Train duration: {timer_training.duration}")
print(f"Predict duration: {timer_predict.duration}")

logger.report_single_value("Metric OOF", metric_oof)
logger.report_single_value("Metric HO", metric_ho)

logger.report_single_value("Train duration", timer_training.duration)
logger.report_single_value("Predict duration", timer_predict.duration)

logger.flush()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="")
parser.add_argument("--dataset", type=str, help="dataset name or id", default="sampled_app_train")
parser.add_argument("--cpu_limit", type=int, help="", default=8)
parser.add_argument("--memory_limit", type=int, help="", default=16)
args = parser.parse_args()

main(dataset_name=args.dataset, cpu_limit=args.cpu_limit, memory_limit=args.memory_limit)
60 changes: 60 additions & 0 deletions scripts/experiments/utils.py
@@ -0,0 +1,60 @@
"""Utils for running experiments."""

import os
import time


class Timer: # noqa: D101
@staticmethod
def _zero():
return 0

def __init__(self, clock=time.time, enabled=True):
self.start = 0
self.stop = 0
self._time = clock if enabled else Timer._zero
self._tick = 0

def __enter__(self):
self.start = self._tick = self._time()
return self

def __exit__(self, *args):
self.stop = self._tick = self._time()

@property
def tick(self):
"""Make one tick."""
if self.stop > 0:
return -1
now = self._time()
tick = now - self._tick
self._tick = now
return tick

@property
def duration(self):
"""Get dureation in seconds."""
if self.stop > 0:
return self.stop - self.start
return self._time() - self.start


def install_lightautoml():
"""Install lightautoml using pip."""
# os.system("curl -sSL https://install.python-poetry.org | ../../bin/python -vvv -")
# os.system("/root/.local/bin/poetry build")
# os.system("ls -la ./dist/")
os.system("pip install packaging==22.0")
os.system("python scripts/poetry_fix.py -f")
os.system("../../bin/pip install .") # ./dist/*.whl


# .pip install --upgrade pip
# poetry config virtualenvs.create false --local
# poetry run python ./scripts/poetry_fix.py -c
# ls -la
# poetry run pip install pillow==9.2.0
# poetry install
# poetry run pip freeze
# poetry run python -c "import sys; print(sys.path)"
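For reference, a minimal usage sketch of the Timer helper above (time.sleep stands in for the measured workload; the import mirrors how run_tabular.py pulls it in):

from utils import Timer
import time

with Timer() as timer:
    time.sleep(0.5)        # stand-in for the measured workload
print(timer.duration)      # elapsed seconds; fixed at stop - start once the block exits
print(timer.tick)          # -1 after the timer has stopped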
11 changes: 9 additions & 2 deletions tox.ini
@@ -35,7 +35,7 @@ package = wheel
deps =
.[all]
pytest >= 6.2.5
commands = pytest {posargs} -v --basetemp="{envtmpdir}"
commands = pytest {posargs} -v --basetemp="{envtmpdir}" --log-level=DEBUG

[testenv:lint]
deps =
@@ -75,9 +75,16 @@ commands =
poetry run python scripts/poetry_fix.py -f
poetry build


# example: tox -e exp -- --dataset_project=Datasets_with_metadata --tags=binary
[testenv:codespell]
deps =
codespell == 2.3.0
commands =
codespell

[testenv:exp]
deps =
clearml

commands =
python scripts/experiments/run.py {posargs}
