diff --git a/.github/workflows/module-2.yaml b/.github/workflows/module-2.yaml
index 8bccf1a..1c4be7a 100644
--- a/.github/workflows/module-2.yaml
+++ b/.github/workflows/module-2.yaml
@@ -2,12 +2,13 @@ name: Module 2
 
 on:
-  workflow_dispatch:
-  # push:
+  pull_request:
     branches:
       - main
-
+  push:
+    branches:
+      - main
 
 jobs:
diff --git a/module-2/.dvc/.gitignore b/module-2/.dvc/.gitignore
deleted file mode 100644
index 5ecbd4c..0000000
--- a/module-2/.dvc/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-/config.local
-/tmp
diff --git a/module-2/.dvc/config b/module-2/.dvc/config
deleted file mode 100644
index 561d392..0000000
--- a/module-2/.dvc/config
+++ /dev/null
@@ -1,5 +0,0 @@
-[core]
-    remote = minio
-['remote "minio"']
-    url = s3://ml-data
-    endpointurl = http://127.0.0.1:9000
diff --git a/module-2/.dvcignore b/module-2/.dvcignore
deleted file mode 100644
index 5197305..0000000
--- a/module-2/.dvcignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# Add patterns of files dvc should ignore, which could improve
-# the performance. Learn more at
-# https://dvc.org/doc/user-guide/dvcignore
diff --git a/module-2/README.md b/module-2/README.md
index 4dd63e8..8ae237c 100644
--- a/module-2/README.md
+++ b/module-2/README.md
@@ -102,10 +102,10 @@ Results.
 
 | Name of Inference | Time (seconds) |
 |----------------------|---------------------|
-| Inference 1 worker | 12.90 |
-| Inference 16 workers (ThreadPoolExecutor) | 0.86 |
-| Inference 16 workers (ProcessPoolExecutor) | 3.88 |
-| Inference with Ray | 2.15 |
+| Inference 1 worker | 12.64 |
+| Inference 16 workers (ThreadPoolExecutor) | 0.85 |
+| Inference 16 workers (ProcessPoolExecutor) | 4.03 |
+| Inference with Ray | 2.19 |
 
 # Streaming dataset
 
@@ -240,4 +240,4 @@ python ./labeling/create_dataset_synthetic.py
 
 ## Updated design doc
 
-[Google doc](https://docs.google.com/document/d/1dEzWd3pPozmU3AhMXjW3xcONUeNJee53djilN1A-wR8/edit)
\ No newline at end of file
+[Google doc](https://docs.google.com/document/d/1dEzWd3pPozmU3AhMXjW3xcONUeNJee53djilN1A-wR8/edit)
diff --git a/module-2/data/.gitignore b/module-2/data/.gitignore
deleted file mode 100644
index a769338..0000000
--- a/module-2/data/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/big-data.csv
diff --git a/module-2/data/big-data.csv.dvc b/module-2/data/big-data.csv.dvc
deleted file mode 100644
index 3532e76..0000000
--- a/module-2/data/big-data.csv.dvc
+++ /dev/null
@@ -1,5 +0,0 @@
-outs:
-- md5: d41d8cd98f00b204e9800998ecf8427e
-  size: 0
-  hash: md5
-  path: big-data.csv
diff --git a/module-2/processing/inference_example.py b/module-2/processing/inference_example.py
index ef2ee85..4016487 100644
--- a/module-2/processing/inference_example.py
+++ b/module-2/processing/inference_example.py
@@ -36,7 +36,6 @@ def run_inference(
     model: DummyClassifier, x_test: np.ndarray, batch_size: int = 2048
 ) -> np.ndarray:
     y_pred = []
-    y_batch = predict(model, x_test)
 
     for i in tqdm(range(0, x_test.shape[0], batch_size)):
         x_batch = x_test[i : i + batch_size]
@@ -156,7 +155,7 @@ def run_pool(inference_size: int = 100_000_000, max_workers: int = 16):
 
 
 def run_ray(inference_size: int = 100_000_000, max_workers: int = 16):
-    ray.init()
+    ray.init(include_dashboard=True, dashboard_host='127.0.0.1', dashboard_port=5000)
 
     x_train, y_train, x_test = get_data(inference_size=inference_size)
     model = train_model(x_train, y_train)
@@ -165,7 +164,6 @@ def run_ray(inference_size: int = 100_000_000, max_workers: int = 16):
 
     res = run_inference_ray_main(model=model, x_test=x_test, max_workers=max_workers)
     print(f"Inference with Ray {time.monotonic() - s} result: {res.shape}")
 
-
 def run_dask(inference_size: int = 100_000_000, max_workers: int = 16):
     client = Client()