Module 2 Demo (#6)
* Module 2 Demo

* Initialize DVC

* Configure remote storage

* Add doc
truskovskiyk authored Jul 2, 2024
1 parent 1b956ae commit a7b18c2
Showing 4 changed files with 11 additions and 4 deletions.
11 changes: 8 additions & 3 deletions module-2/README.md
@@ -103,8 +103,9 @@ Results.
  | Name of Inference | Time (seconds) |
  |----------------------|---------------------|
  | Inference 1 worker | 12.90 |
- | Inference 16 workers | 3.88 |
- | Inference with Ray | 3.16 |
+ | Inference 16 workers (ThreadPoolExecutor) | 0.86 |
+ | Inference 16 workers (ProcessPoolExecutor) | 3.88 |
+ | Inference with Ray | 2.15 |
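Numbers like those in the table above come from timing the same workload under different executors. A minimal, self-contained sketch of such a harness — `dummy_predict` and its sleep-based latency are stand-ins for the repo's actual model and data, not the benchmark the table was produced with:

```python
import time
from concurrent.futures import ThreadPoolExecutor


def dummy_predict(batch):
    # Stand-in for model inference: simulate I/O-bound latency per batch.
    time.sleep(0.01)
    return [0 for _ in batch]


def run_serial(batches):
    return [dummy_predict(b) for b in batches]


def run_threaded(batches, max_workers=16):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(dummy_predict, batches))


if __name__ == "__main__":
    batches = [[i] * 8 for i in range(32)]
    t0 = time.perf_counter()
    run_serial(batches)
    t1 = time.perf_counter()
    run_threaded(batches)
    t2 = time.perf_counter()
    print(f"serial: {t1 - t0:.2f}s, 16 threads: {t2 - t1:.2f}s")
```

Threads win in this sketch because the workload releases the GIL while sleeping; a CPU-bound pure-Python model would instead favor a process pool, which is consistent with the table showing different timings for the two executor types.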


# Streaming dataset
@@ -221,7 +222,7 @@ dvc push
docker run -it --rm --name argilla -p 6900:6900 argilla/argilla-quickstart:v2.0.0rc1
```

- User/Password you can find [here])https://github.com/argilla-io/argilla/blob/v2.0.0rc1/argilla-server/docker/quickstart/Dockerfile#L60-L62).
+ You can find the user/password [here](https://github.com/argilla-io/argilla/blob/v2.0.0rc1/argilla-server/docker/quickstart/Dockerfile#L60-L62).

Alternative deployments: [K8S](https://github.com/argilla-io/argilla/tree/develop/examples/deployments/k8s) or [Railway](https://railway.app/template/KNxfha?referralCode=_Q3XIe)

@@ -236,3 +237,7 @@ Create synthetic dataset:
```bash
python ./labeling/create_dataset_synthetic.py
```

+ ## Updated design doc
+
+ [Google doc](https://docs.google.com/document/d/1dEzWd3pPozmU3AhMXjW3xcONUeNJee53djilN1A-wR8/edit)
1 change: 1 addition & 0 deletions module-2/labeling/create_dataset_synthetic.py
@@ -61,6 +61,7 @@ def generate_synthetic_example(db_schema: str) -> Dict[str, str]:

def create_text2sql_dataset_synthetic(num_samples: int = 10):
db_schema = get_sqllite_schema("examples/chinook.db")

samples = []
for _ in tqdm(range(num_samples)):
sample = generate_synthetic_example(db_schema=db_schema)
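The generation loop in this hunk can be sketched end-to-end with a stubbed generator. In the repo, `generate_synthetic_example` is backed by a model call and the schema comes from `get_sqllite_schema("examples/chinook.db")`; the stub and the inline schema string below are placeholders:

```python
import random
from typing import Dict, List


def generate_synthetic_example(db_schema: str) -> Dict[str, str]:
    # Stub for the model-backed generator in the real script.
    table = random.choice(["artists", "albums", "tracks"])
    return {
        "question": f"How many rows are in {table}?",
        "sql": f"SELECT COUNT(*) FROM {table};",
    }


def create_text2sql_dataset_synthetic(db_schema: str, num_samples: int = 10) -> List[Dict[str, str]]:
    samples = []
    for _ in range(num_samples):
        samples.append(generate_synthetic_example(db_schema=db_schema))
    return samples


if __name__ == "__main__":
    samples = create_text2sql_dataset_synthetic("CREATE TABLE artists (...);", num_samples=5)
    print(samples[0])
```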
1 change: 1 addition & 0 deletions module-2/processing/inference_example.py
@@ -48,6 +48,7 @@ def run_inference(
def run_inference_process_pool(
model: DummyClassifier, x_test: np.ndarray, max_workers: int = 16
) -> np.ndarray:
+     # with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
chunk_size = len(x_test) // max_workers

2 changes: 1 addition & 1 deletion module-2/streaming-dataset/mock_data.py
@@ -8,7 +8,7 @@
from pathlib import Path


- def create_data(path_to_save: Path = Path("mds-dataset"), size: int = 100_00):
+ def create_data(path_to_save: Path = Path("mds-dataset"), size: int = 100_000):
columns = {"image": "jpeg", "class": "int"}
compression = "zstd"

