kyryl-opens-ml · truskovskiyk · Jul 17, 2024 · Jul 15, 2024 · Jul 15, 2024 · Jul 15, 2024
diff --git a/module-3/README.md b/module-3/README.md
@@ -52,6 +52,19 @@ https://huggingface.co/models?sort=downloads
 - https://github.com/microsoft/Phi-3CookBook
 - https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard
 
+## LLM API
+
+Run API LLMs
+
+```bash
+python generative-api/pipeline_api.py ./data/test.json
+```
+
+Run open LLMs
+
+```bash
+python generative-api/pipeline_phi3.py ./data/test.json
+```
 
 ## LLM API testing
 

diff --git a/module-3/classic-example/Makefile b/module-3/classic-example/Makefile
@@ -29,12 +29,12 @@ test_all:
 	pytest --cov=classic_example tests/
 
 train_example:
-	python classic_example/cli.py load-cola-data ./data
+	python classic_example/cli.py load-sst2-data ./data
 	python classic_example/cli.py train ./conf/example.json
 	python classic_example/cli.py upload-to-registry example_model /tmp/results
 
 train_fast_ci:
-	python classic_example/cli.py load-cola-data ./data
+	python classic_example/cli.py load-sst2-data ./data
 	python classic_example/cli.py train ./conf/fast.json
 	python classic_example/cli.py upload-to-registry fast-model /tmp/results
 
diff --git a/module-3/classic-example/README.md b/module-3/classic-example/README.md
@@ -24,6 +24,13 @@ make test
 
 reference: https://madewithml.com/courses/mlops/testing/
 
+## Run training job
+
+```bash
+modal deploy run_training_job.py
+python run_training_job.py
+```
+
 ## Reports
 
 ```bash

diff --git a/module-3/classic-example/classic_example/cli.py b/module-3/classic-example/classic_example/cli.py
@@ -1,14 +1,14 @@
 import typer
 
-from classic_example.data import load_cola_data, load_cola_data_file_input
+from classic_example.data import load_sst2_data, load_sst2_data_file_input
+from classic_example.predictor import run_inference_on_dataframe
 from classic_example.train import train
 from classic_example.utils import load_from_registry, upload_to_registry
-from classic_example.predictor import run_inference_on_dataframe
 
 app = typer.Typer()
 app.command()(train)
-app.command()(load_cola_data)
-app.command()(load_cola_data_file_input)
+app.command()(load_sst2_data)
+app.command()(load_sst2_data_file_input)
 app.command()(upload_to_registry)
 app.command()(load_from_registry)
 app.command()(run_inference_on_dataframe)

diff --git a/module-3/classic-example/classic_example/data.py b/module-3/classic-example/classic_example/data.py
@@ -5,7 +5,7 @@
 from sklearn.model_selection import train_test_split
 
 
-def _get_cola_data(random_state: int = 42):
+def _get_sst2_data(random_state: int = 42):
     dataset = load_dataset("glue", "sst2")
     df_all = ArrowReader.read_table(
         dataset.cache_files["train"][0]["filename"]
@@ -19,24 +19,24 @@ def _get_cola_data(random_state: int = 42):
     return df_train, df_val, df_test
 
 
-def load_cola_data(path_to_save: Path):
+def load_sst2_data(path_to_save: Path):
     path_to_save.mkdir(parents=True, exist_ok=True)
 
-    df_train, df_val, df_test = _get_cola_data()
+    df_train, df_val, df_test = _get_sst2_data()
 
     df_train.to_csv(path_to_save / "train.csv", index=False)
     df_val.to_csv(path_to_save / "val.csv", index=False)
     df_test.to_csv(path_to_save / "test.csv", index=False)
 
 
-def load_cola_data_file_input(
+def load_sst2_data_file_input(
     path_to_train: Path, path_to_val: Path, path_to_test: Path
 ):
     path_to_train.parent.mkdir(parents=True, exist_ok=True)
     path_to_val.parent.mkdir(parents=True, exist_ok=True)
     path_to_test.parent.mkdir(parents=True, exist_ok=True)
 
-    df_train, df_val, df_test = _get_cola_data()
+    df_train, df_val, df_test = _get_sst2_data()
 
     df_train.to_csv(path_to_train, index=False)
     df_val.to_csv(path_to_val, index=False)

diff --git a/module-3/classic-example/classic_example/predictor.py b/module-3/classic-example/classic_example/predictor.py
@@ -1,11 +1,11 @@
 import logging
+from pathlib import Path
 
+import pandas as pd
 import torch
 from torch.nn.functional import softmax
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from pathlib import Path
-import pandas as pd
 from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
 logger = logging.getLogger()
 

diff --git a/module-3/classic-example/classic_example/train.py b/module-3/classic-example/classic_example/train.py
@@ -18,7 +18,11 @@
 )
 
 from classic_example.config import DataTrainingArguments, ModelArguments
-from classic_example.utils import compute_metrics, preprocess_function_examples, setup_logger
+from classic_example.utils import (
+    compute_metrics,
+    preprocess_function_examples,
+    setup_logger,
+)
 
 logger = logging.getLogger(__name__)
 

diff --git a/module-3/classic-example/conf/example-modal.json b/module-3/classic-example/conf/example-modal.json
@@ -0,0 +1,45 @@
+{
+    "model_name_or_path": "google/mobilebert-uncased",
+
+    "train_file": "/tmp/data/train.csv",
+    "validation_file": "/tmp/data/val.csv",
+    "output_dir": "/tmp/results",    
+    "max_seq_length": 128,
+
+
+
+    "use_fast_tokenizer": true,
+
+
+    "eval_strategy": "steps",
+    "per_device_train_batch_size": 32,
+    "per_device_eval_batch_size": 32,
+    "gradient_accumulation_steps": 1,
+    "eval_delay": 0,
+    "learning_rate": 5e-05,
+    "weight_decay": 0,
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "adam_epsilon": 1e-08,
+    "max_grad_norm": 1,
+    "num_train_epochs": 5,
+    "max_steps": -1,
+    "lr_scheduler_type": "linear",
+    "logging_strategy": "steps",
+    "logging_first_step": true,
+    "logging_steps": 250,
+    "save_strategy": "steps",
+    "save_steps": 250,
+    "save_total_limit": 5,
+    "no_cuda": false,
+    "seed": 42,
+    "eval_steps": 250,
+    "run_name": "results",
+    "disable_tqdm": false,
+    "remove_unused_columns": true,
+    "label_names": null,
+    "load_best_model_at_end": true,
+    "metric_for_best_model": "eval_f1",
+    "greater_is_better": true,
+    "report_to": ["wandb"]
+  }
diff --git a/module-3/classic-example/conf/example.json b/module-3/classic-example/conf/example.json
@@ -1,16 +1,10 @@
 {
     "model_name_or_path": "google/mobilebert-uncased",
-
     "train_file": "./data/train.csv",
     "validation_file": "./data/val.csv",
     "output_dir": "results",    
     "max_seq_length": 128,
-
-
-
     "use_fast_tokenizer": true,
-
-
     "eval_strategy": "steps",
     "per_device_train_batch_size": 32,
     "per_device_eval_batch_size": 32,

diff --git a/module-3/classic-example/run_training_job.py b/module-3/classic-example/run_training_job.py
@@ -0,0 +1,43 @@
+import os
+
+import modal
+from modal import Image
+
+app = modal.App("ml-in-production-practice")
+
+# Forward the Weights & Biases settings from the launching shell into the
+# container image. Only variables that are actually set are forwarded:
+# `os.getenv` yields None for a missing variable, and `Image.env` expects
+# string values.
+# NOTE(review): this places WANDB_API_KEY in the image definition; a
+# `modal.Secret` is presumably the safer channel for it -- confirm.
+env = {
+    name: os.environ[name]
+    for name in ("WANDB_PROJECT", "WANDB_API_KEY")
+    if name in os.environ
+}
+custom_image = Image.from_registry(
+    "ghcr.io/kyryl-opens-ml/classic-example:pr-11"
+).env(env)
+
+
+@app.function(image=custom_image, gpu="a10g", timeout=15 * 60)
+def run_classic_example():
+    """Run the classic example end to end inside the Modal container.
+
+    Writes the SST-2 splits to /tmp/data, trains from the Modal config,
+    uploads the training output directory to the registry, loads the model
+    back from the registry, and runs inference on the test split.
+    """
+    # Imports are local to the function body, so importing this module
+    # (e.g. to run `main`) does not require `classic_example`.
+    from pathlib import Path
+
+    from classic_example.data import load_sst2_data
+    from classic_example.predictor import run_inference_on_dataframe
+    from classic_example.train import train
+    from classic_example.utils import load_from_registry, upload_to_registry
+
+    load_sst2_data(path_to_save=Path("/tmp/data/"))
+    # example-modal.json reads /tmp/data/{train,val}.csv -- the directory
+    # written just above -- and saves to /tmp/results. example.json instead
+    # uses ./data and results, which depend on the working directory.
+    # NOTE(review): assumes the image ships conf/example-modal.json under
+    # /app/conf -- confirm the image tag includes it.
+    train(config_path=Path("/app/conf/example-modal.json"))
+    # Must match "output_dir" in example-modal.json.
+    upload_to_registry(
+        model_name="modal-classic-example", model_path=Path("/tmp/results")
+    )
+    load_from_registry(
+        model_name="modal-classic-example:latest", model_path=Path("loaded-model")
+    )
+    run_inference_on_dataframe(
+        df_path=Path("/tmp/data/test.csv"),
+        model_load_path=Path("loaded-model"),
+        result_path=Path("/tmp/data/inference.csv"),
+    )
+
+
+def main():
+    """Look up the deployed `run_classic_example` function and spawn a run."""
+    app_name, function_name = "ml-in-production-practice", "run_classic_example"
+    training_fn = modal.Function.lookup(app_name, function_name)
+    # `spawn` starts the call and returns without waiting for the result.
+    call = training_fn.spawn()
+    print(f"Run training object: {call}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/module-3/classic-example/tests/conftest.py b/module-3/classic-example/tests/conftest.py
@@ -4,17 +4,16 @@
 import great_expectations as ge
 import pandas as pd
 import pytest
+from classic_example.data import load_sst2_data
 from great_expectations.dataset.pandas_dataset import PandasDataset
 
-from classic_example.data import load_cola_data
-
 
 @pytest.fixture(scope="session")
 def data_path() -> Path:
     _data_path = Path("/tmp/data")
     _data_path.mkdir(exist_ok=True, parents=True)
 
-    load_cola_data(path_to_save=_data_path)
+    load_sst2_data(path_to_save=_data_path)
 
     return _data_path
 

diff --git a/module-3/classic-example/tests/test_cli.py b/module-3/classic-example/tests/test_cli.py
@@ -1,12 +1,13 @@
-from typer.testing import CliRunner
 from pathlib import Path
+
 from classic_example.cli import app
+from typer.testing import CliRunner
 
 runner = CliRunner()
 
 
 def test_app():
-    result = runner.invoke(app, ["load-cola-data", "/tmp/data"])
+    result = runner.invoke(app, ["load-sst2-data", "/tmp/data"])
     assert result.exit_code == 0, result.exception
     assert Path("/tmp/data/train.csv").exists()
     assert Path("/tmp/data/val.csv").exists()
@@ -18,4 +19,3 @@ def test_app():
 
     result = runner.invoke(app, ["upload-to-registry", "cli-test", "/tmp/results"])
     assert result.exit_code == 0, result.exception
-
diff --git a/module-3/classic-example/tests/test_code.py b/module-3/classic-example/tests/test_code.py
@@ -1,8 +1,7 @@
 import numpy as np
 import pytest
-from transformers import EvalPrediction
-
 from classic_example.utils import compute_metrics
+from transformers import EvalPrediction
 
 
 @pytest.fixture()

diff --git a/module-3/classic-example/tests/test_model.py b/module-3/classic-example/tests/test_model.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
 import pytest
-from transformers import Trainer, TrainingArguments
-
 from classic_example.config import DataTrainingArguments, ModelArguments
 from classic_example.train import (
     get_models,
@@ -11,6 +9,7 @@
     read_dataset,
     train,
 )
+from transformers import Trainer, TrainingArguments
 
 
 @pytest.fixture()

diff --git a/...xample/generative_example/pipeline_api.py → module-3/generative-api/pipeline_api.py b/...xample/generative_example/pipeline_api.py → module-3/generative-api/pipeline_api.py
@@ -1,20 +1,16 @@
-from openai import OpenAI
-from random import randrange
-import torch
-from datasets import load_dataset
-from joblib import Memory
-from tqdm import tqdm
 import json
-from datasets import Dataset
-
-import numpy as np
 
+import evaluate
+from datasets import Dataset
+from joblib import Memory
+from openai import OpenAI
+from tqdm import tqdm
+import typer
 cache_directory = ".cache"
 memory = Memory(cache_directory)
 persistent_cache = memory.cache
 
 
-
 @persistent_cache
 def get_sql(query: str, context: str) -> str:
     client = OpenAI()
@@ -37,26 +33,25 @@ def get_sql(query: str, context: str) -> str:
         model="gpt-4o",
         response_format={"type": "json_object"},
     )
-
-    return json.loads(chat_completion.choices[0].message.content)['sql']
 
+    return json.loads(chat_completion.choices[0].message.content)["sql"]
 
-def pipeline(test_json: str):
-    dataset = Dataset.from_json(test_json)
 
-    generate_sql = []
+def run_pipeline(test_json: str):
+    dataset = Dataset.from_json(test_json)
+    generated_sql = []
     gt_sql = []
     for row in tqdm(dataset):
-        _generate_sql = get_sql(query=query, context=context)
-        _gt_sql = row['answer']
+        _generate_sql = get_sql(query=row["question"], context=row["context"])
+        _gt_sql = row["answer"]
 
-        generate_sql.append(_generate_sql)
+        generated_sql.append(_generate_sql)
         gt_sql.append(_gt_sql)
 
-    rouge = evaluate.load('rouge')
+    rouge = evaluate.load("rouge")
     results = rouge.compute(predictions=generated_sql, references=gt_sql)
     print(f"results = {results}")
 
 
 if __name__ == "__main__":
-    pipeline()
+    typer.run(run_pipeline)