feat(machine-learning): add scikit-learn and cuML (#22720)

hongbo-miao · Jan 13, 2025 · c10f203 · c10f203
1 parent 268aebf
commit c10f203
Show file tree

Hide file tree

Showing 15 changed files with 1,603 additions and 0 deletions.
diff --git a/.github/workflows/.static-type-check.yml b/.github/workflows/.static-type-check.yml
@@ -165,6 +165,7 @@ jobs:
           uv run poe static-type-check-python --package=machine-learning.dali
           uv run poe static-type-check-python --package=machine-learning.feature-store
           uv run poe static-type-check-python --package=machine-learning.graph-neural-network
+          uv run poe static-type-check-python --package=machine-learning.hm-cuml
           uv run poe static-type-check-python --package=machine-learning.hm-docling
           uv run poe static-type-check-python --package=machine-learning.hm-faster-whisper
           uv run poe static-type-check-python --package=machine-learning.hm-gradio.applications.classify-image
@@ -175,6 +176,7 @@ jobs:
           uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.classify-mnist
           uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.predict-diabetes
           uv run poe static-type-check-python --package=machine-learning.hm-rasa
+          uv run poe static-type-check-python --package=machine-learning.hm-scikit-learn
           uv run poe static-type-check-python --package=machine-learning.hm-sglang
           uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.live-line-chart
           uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.map

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -49,6 +49,7 @@ jobs:
       hm-autogluon: ${{ steps.filter.outputs.hm-autogluon }}
       hm-aws-parallelcluster: ${{ steps.filter.outputs.hm-aws-parallelcluster }}
       hm-cudf: ${{ steps.filter.outputs.hm-cudf }}
+      hm-cuml: ${{ steps.filter.outputs.hm-cuml }}
       hm-cupy: ${{ steps.filter.outputs.hm-cupy }}
       hm-docling: ${{ steps.filter.outputs.hm-docling }}
       hm-duckdb-query-duckdb: ${{ steps.filter.outputs.hm-duckdb-query-duckdb }}
@@ -86,6 +87,7 @@ jobs:
       hm-rasa: ${{ steps.filter.outputs.hm-rasa }}
       hm-ray-calculate: ${{ steps.filter.outputs.hm-ray-calculate }}
       hm-ray-process-flight-data: ${{ steps.filter.outputs.hm-ray-process-flight-data }}
+      hm-scikit-learn: ${{ steps.filter.outputs.hm-scikit-learn }}
       hm-serial: ${{ steps.filter.outputs.hm-serial }}
       hm-sglang: ${{ steps.filter.outputs.hm-sglang }}
       hm-skypilot: ${{ steps.filter.outputs.hm-skypilot }}
@@ -221,6 +223,9 @@ jobs:
             hm-cudf:
               - '.github/workflows/test.yml'
               - 'data-analytics/hm-cudf/**'
+            hm-cuml:
+              - '.github/workflows/test.yml'
+              - 'machine-learning/hm-cuml/**'
             hm-cupy:
               - '.github/workflows/test.yml'
               - 'data-analytics/hm-cupy/**'
@@ -359,6 +364,9 @@ jobs:
             hm-ray-process-flight-data:
               - '.github/workflows/test.yml'
               - 'cloud-computing/hm-ray/applications/process-flight-data/**'
+            hm-scikit-learn:
+              - '.github/workflows/test.yml'
+              - 'machine-learning/hm-scikit-learn/**'
             hm-serial:
               - '.github/workflows/test.yml'
               - 'embedded-systems/hm-serial/**'
@@ -1850,6 +1858,72 @@ jobs:
         with:
           directory: data-analytics/hm-networkx
 
+  scikit-learn-test:  # Runs the hm-scikit-learn tests with coverage, then uploads the report
+    name: scikit-learn | Test
+    needs: detect-changes  # Supplies the hm-scikit-learn path-filter output checked below
+    if: ${{ needs.detect-changes.outputs.hm-scikit-learn == 'true' }}
+    runs-on: ubuntu-24.04
+    environment: test
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5.1.0
+        with:
+          version: 0.5.18
+          enable-cache: true
+          cache-dependency-glob: machine-learning/hm-scikit-learn/uv.lock  # Cache key tracks this project's lock file
+      - name: Set up Python
+        uses: actions/setup-python@v5.3.0
+        with:
+          python-version-file: machine-learning/hm-scikit-learn/pyproject.toml  # Python version is read from the project file
+      - name: Install dependencies
+        working-directory: machine-learning/hm-scikit-learn
+        run: |
+          uv sync --dev
+      - name: Test
+        working-directory: machine-learning/hm-scikit-learn
+        run: |
+          uv run poe test-coverage
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5.1.2
+        with:
+          directory: machine-learning/hm-scikit-learn  # Folder Codecov searches for the coverage report
+
+  cuml-test:  # Runs the hm-cuml tests with coverage, then uploads the report
+    name: cuML | Test
+    needs: detect-changes  # Supplies the hm-cuml path-filter output checked below
+    if: ${{ needs.detect-changes.outputs.hm-cuml == 'true' }}
+    runs-on: ubuntu-24.04  # NOTE(review): presumably no GPU here; the dummy test in this commit does not import cuml - confirm
+    environment: test
+    timeout-minutes: 10
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5.1.0
+        with:
+          version: 0.5.18
+          enable-cache: true
+          cache-dependency-glob: machine-learning/hm-cuml/uv.lock  # Cache key tracks this project's lock file
+      - name: Set up Python
+        uses: actions/setup-python@v5.3.0
+        with:
+          python-version-file: machine-learning/hm-cuml/pyproject.toml  # Reads requires-python ("~=3.12.0")
+      - name: Install dependencies
+        working-directory: machine-learning/hm-cuml
+        run: |
+          uv sync --dev
+      - name: Test
+        working-directory: machine-learning/hm-cuml
+        run: |
+          uv run poe test-coverage
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5.1.2
+        with:
+          directory: machine-learning/hm-cuml  # Folder Codecov searches for the coverage report
+
   hugging-face-analyze-sentiment-test:
     name: Hugging Face (analyze-sentiment) | Test
     needs: detect-changes

diff --git a/.mergify.yml b/.mergify.yml
@@ -302,6 +302,12 @@ pull_request_rules:
       - or:
           - check-success=NetworkX | Test
           - check-skipped=NetworkX | Test
+      - or:
+          - check-success=scikit-learn | Test
+          - check-skipped=scikit-learn | Test
+      - or:
+          - check-success=cuML | Test
+          - check-skipped=cuML | Test
       - or:
           - check-success=Hugging Face (analyze-sentiment) | Test
           - check-skipped=Hugging Face (analyze-sentiment) | Test

diff --git a/Makefile b/Makefile
@@ -379,6 +379,7 @@ static-type-check-python:
 	uv run poe static-type-check-python --package=machine-learning.dali
 	uv run poe static-type-check-python --package=machine-learning.feature-store
 	uv run poe static-type-check-python --package=machine-learning.graph-neural-network
+	uv run poe static-type-check-python --package=machine-learning.hm-cuml
 	uv run poe static-type-check-python --package=machine-learning.hm-docling
 	uv run poe static-type-check-python --package=machine-learning.hm-faster-whisper
 	uv run poe static-type-check-python --package=machine-learning.hm-gradio.applications.classify-image
@@ -389,6 +390,7 @@ static-type-check-python:
 	uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.classify-mnist
 	uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.predict-diabetes
 	uv run poe static-type-check-python --package=machine-learning.hm-rasa
+	uv run poe static-type-check-python --package=machine-learning.hm-scikit-learn
 	uv run poe static-type-check-python --package=machine-learning.hm-sglang
 	uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.live-line-chart
 	uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.map

diff --git a/README.md b/README.md
@@ -410,6 +410,8 @@ The diagram illustrates the repository's architecture, which is considered overl
 
 ### Machine Learning (ML)
 
+- **scikit-learn** - Machine learning library
+  - **cuML** - GPU-accelerated machine learning library
 - **PyTorch** - Machine learning
   - **PyTorch Geometric** - PyTorch geometric deep learning extension
   - **TorchServe** - PyTorch models serving

diff --git a/machine-learning/hm-cuml/Makefile b/machine-learning/hm-cuml/Makefile
@@ -0,0 +1,13 @@
+uv-install-python::  # NOTE(review): double-colon rule, unlike the single-colon targets below - confirm intent
+	uv python install
+uv-update-lock-file:  # Regenerates uv.lock
+	uv lock
+uv-install-dependencies:  # Same install command the cuML CI job runs
+	uv sync --dev
+
+uv-run-dev:  # poe "dev" task, defined in pyproject.toml as "python src/main.py"
+	uv run poe dev
+uv-run-test:  # poe "test" task (pytest, verbose)
+	uv run poe test
+uv-run-test-coverage:  # poe "test-coverage" task; also the command of the CI job's Test step
+	uv run poe test-coverage
diff --git a/machine-learning/hm-cuml/pyproject.toml b/machine-learning/hm-cuml/pyproject.toml
@@ -0,0 +1,24 @@
+[project]
+name = "hm-cuml"
+version = "1.0.0"
+requires-python = "~=3.12.0"  # Compatible-release pin: any 3.12.x
+dependencies = [
+  "cudf-cu12==24.12.0",  # main.py builds cuDF DataFrame/Series inputs with it
+  "cuml-cu12==24.12.0",  # main.py uses its StandardScaler and RandomForestClassifier
+  "scikit-learn==1.6.1",  # main.py uses it for the dataset, the split, and the metrics
+]
+
+[dependency-groups]
+dev = [
+  "poethepoet==0.32.1",
+  "pytest==8.3.4",
+  "pytest-cov==6.0.0",
+]
+
+[tool.uv]
+package = false  # Per uv docs: the project itself is not built or installed, only its dependencies
+
+[tool.poe.tasks]
+dev = "python src/main.py"
+test = "pytest --verbose --verbose"  # --verbose is given twice, i.e. pytest verbosity level 2 (-vv)
+test-coverage = "pytest --cov=. --cov-report=xml"  # Writes an XML coverage report; CI uploads coverage to Codecov afterwards
diff --git a/machine-learning/hm-cuml/src/dummy_test.py b/machine-learning/hm-cuml/src/dummy_test.py
@@ -0,0 +1,3 @@
+class TestDummy:  # Placeholder test class holding a single constant check
+    def test_dummy(self) -> None:
+        assert 1 + 1 == 2  # Constant arithmetic; touches nothing from main.py
diff --git a/machine-learning/hm-cuml/src/main.py b/machine-learning/hm-cuml/src/main.py
@@ -0,0 +1,61 @@
+import logging
+
+import cudf
+from cuml.ensemble import RandomForestClassifier
+from cuml.preprocessing import StandardScaler
+from sklearn.datasets import load_iris
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.model_selection import train_test_split
+
+logger = logging.getLogger(__name__)  # Module-level logger; main() reports its results through it
+
+
+def main() -> None:
+    """Train a cuML random forest on the iris dataset and log its test metrics."""
+    # Load the iris dataset
+    iris = load_iris()
+    x = iris.data
+    y = iris.target
+
+    # Split the data
+    x_train, x_test, y_train, y_test = train_test_split(
+        x,
+        y,
+        test_size=0.2,
+        random_state=42,
+    )
+
+    # Convert the model inputs to cuDF; y_test stays on the host because only
+    # the scikit-learn metrics below read it
+    x_train_cudf = cudf.DataFrame(x_train)
+    x_test_cudf = cudf.DataFrame(x_test)
+    y_train_cudf = cudf.Series(y_train)
+
+    # Scale the features
+    scaler = StandardScaler()
+    x_train_scaled = scaler.fit_transform(x_train_cudf)
+    x_test_scaled = scaler.transform(x_test_cudf)
+
+    # Create and train the model
+    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
+    rf_classifier.fit(x_train_scaled, y_train_cudf)
+
+    # Make predictions
+    y_pred_cudf = rf_classifier.predict(x_test_scaled)
+
+    # Convert predictions back to CPU for evaluation
+    y_pred = y_pred_cudf.values_host
+
+    # Log results; %-style arguments defer formatting to the logging call
+    logger.info("cuML Results:")
+    logger.info("Accuracy: %s", accuracy_score(y_test, y_pred))
+    logger.info("Classification Report:")
+    logger.info(classification_report(y_test, y_pred, target_names=iris.target_names))
+
+
+if __name__ == "__main__":  # Script entry point, e.g. the poe "dev" task
+    logging.basicConfig(
+        level=logging.INFO,  # INFO so the logger.info calls in main() are emitted
+        format="%(asctime)s - %(levelname)s - %(message)s",
+    )
+    main()