Skip to content

Commit

Permalink
feat(machine-learning): add scikit-learn and cuML (#22720)
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Jan 13, 2025
1 parent 268aebf commit c10f203
Show file tree
Hide file tree
Showing 15 changed files with 1,603 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/.static-type-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ jobs:
uv run poe static-type-check-python --package=machine-learning.dali
uv run poe static-type-check-python --package=machine-learning.feature-store
uv run poe static-type-check-python --package=machine-learning.graph-neural-network
uv run poe static-type-check-python --package=machine-learning.hm-cuml
uv run poe static-type-check-python --package=machine-learning.hm-docling
uv run poe static-type-check-python --package=machine-learning.hm-faster-whisper
uv run poe static-type-check-python --package=machine-learning.hm-gradio.applications.classify-image
Expand All @@ -175,6 +176,7 @@ jobs:
uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.classify-mnist
uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.predict-diabetes
uv run poe static-type-check-python --package=machine-learning.hm-rasa
uv run poe static-type-check-python --package=machine-learning.hm-scikit-learn
uv run poe static-type-check-python --package=machine-learning.hm-sglang
uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.live-line-chart
uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.map
Expand Down
74 changes: 74 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ jobs:
hm-autogluon: ${{ steps.filter.outputs.hm-autogluon }}
hm-aws-parallelcluster: ${{ steps.filter.outputs.hm-aws-parallelcluster }}
hm-cudf: ${{ steps.filter.outputs.hm-cudf }}
hm-cuml: ${{ steps.filter.outputs.hm-cuml }}
hm-cupy: ${{ steps.filter.outputs.hm-cupy }}
hm-docling: ${{ steps.filter.outputs.hm-docling }}
hm-duckdb-query-duckdb: ${{ steps.filter.outputs.hm-duckdb-query-duckdb }}
Expand Down Expand Up @@ -86,6 +87,7 @@ jobs:
hm-rasa: ${{ steps.filter.outputs.hm-rasa }}
hm-ray-calculate: ${{ steps.filter.outputs.hm-ray-calculate }}
hm-ray-process-flight-data: ${{ steps.filter.outputs.hm-ray-process-flight-data }}
hm-scikit-learn: ${{ steps.filter.outputs.hm-scikit-learn }}
hm-serial: ${{ steps.filter.outputs.hm-serial }}
hm-sglang: ${{ steps.filter.outputs.hm-sglang }}
hm-skypilot: ${{ steps.filter.outputs.hm-skypilot }}
Expand Down Expand Up @@ -221,6 +223,9 @@ jobs:
hm-cudf:
- '.github/workflows/test.yml'
- 'data-analytics/hm-cudf/**'
hm-cuml:
- '.github/workflows/test.yml'
- 'machine-learning/hm-cuml/**'
hm-cupy:
- '.github/workflows/test.yml'
- 'data-analytics/hm-cupy/**'
Expand Down Expand Up @@ -359,6 +364,9 @@ jobs:
hm-ray-process-flight-data:
- '.github/workflows/test.yml'
- 'cloud-computing/hm-ray/applications/process-flight-data/**'
hm-scikit-learn:
- '.github/workflows/test.yml'
- 'machine-learning/hm-scikit-learn/**'
hm-serial:
- '.github/workflows/test.yml'
- 'embedded-systems/hm-serial/**'
Expand Down Expand Up @@ -1850,6 +1858,72 @@ jobs:
with:
directory: data-analytics/hm-networkx

# Test job for machine-learning/hm-scikit-learn.
# Gated on the `hm-scikit-learn` output of the `detect-changes` job.
scikit-learn-test:
  name: scikit-learn | Test
  needs: detect-changes
  if: ${{ needs.detect-changes.outputs.hm-scikit-learn == 'true' }}
  runs-on: ubuntu-24.04
  environment: test
  timeout-minutes: 10
  steps:
    - name: Checkout
      uses: actions/checkout@v4.2.2
    - name: Install uv
      uses: astral-sh/setup-uv@v5.1.0
      with:
        # Quoted so the pinned uv version is always read as a string
        version: '0.5.18'
        enable-cache: true
        # The uv cache is keyed on this project's lock file
        cache-dependency-glob: machine-learning/hm-scikit-learn/uv.lock
    - name: Set up Python
      uses: actions/setup-python@v5.3.0
      with:
        # The Python version is taken from the project's pyproject.toml
        python-version-file: machine-learning/hm-scikit-learn/pyproject.toml
    - name: Install dependencies
      working-directory: machine-learning/hm-scikit-learn
      run: |
        uv sync --dev
    - name: Test
      working-directory: machine-learning/hm-scikit-learn
      run: |
        uv run poe test-coverage
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v5.1.2
      with:
        directory: machine-learning/hm-scikit-learn

# Test job for machine-learning/hm-cuml.
# Gated on the `hm-cuml` output of the `detect-changes` job.
cuml-test:
  name: cuML | Test
  needs: detect-changes
  if: ${{ needs.detect-changes.outputs.hm-cuml == 'true' }}
  runs-on: ubuntu-24.04
  environment: test
  timeout-minutes: 10
  steps:
    - name: Checkout
      uses: actions/checkout@v4.2.2
    - name: Install uv
      uses: astral-sh/setup-uv@v5.1.0
      with:
        # Quoted so the pinned uv version is always read as a string
        version: '0.5.18'
        enable-cache: true
        # The uv cache is keyed on this project's lock file
        cache-dependency-glob: machine-learning/hm-cuml/uv.lock
    - name: Set up Python
      uses: actions/setup-python@v5.3.0
      with:
        # The Python version is taken from the project's pyproject.toml
        python-version-file: machine-learning/hm-cuml/pyproject.toml
    - name: Install dependencies
      working-directory: machine-learning/hm-cuml
      run: |
        uv sync --dev
    - name: Test
      working-directory: machine-learning/hm-cuml
      run: |
        uv run poe test-coverage
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v5.1.2
      with:
        directory: machine-learning/hm-cuml

hugging-face-analyze-sentiment-test:
name: Hugging Face (analyze-sentiment) | Test
needs: detect-changes
Expand Down
6 changes: 6 additions & 0 deletions .mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,12 @@ pull_request_rules:
- or:
- check-success=NetworkX | Test
- check-skipped=NetworkX | Test
- or:
- check-success=scikit-learn | Test
- check-skipped=scikit-learn | Test
- or:
- check-success=cuML | Test
- check-skipped=cuML | Test
- or:
- check-success=Hugging Face (analyze-sentiment) | Test
- check-skipped=Hugging Face (analyze-sentiment) | Test
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ static-type-check-python:
uv run poe static-type-check-python --package=machine-learning.dali
uv run poe static-type-check-python --package=machine-learning.feature-store
uv run poe static-type-check-python --package=machine-learning.graph-neural-network
uv run poe static-type-check-python --package=machine-learning.hm-cuml
uv run poe static-type-check-python --package=machine-learning.hm-docling
uv run poe static-type-check-python --package=machine-learning.hm-faster-whisper
uv run poe static-type-check-python --package=machine-learning.hm-gradio.applications.classify-image
Expand All @@ -389,6 +390,7 @@ static-type-check-python:
uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.classify-mnist
uv run poe static-type-check-python --package=machine-learning.hm-mlflow.experiments.predict-diabetes
uv run poe static-type-check-python --package=machine-learning.hm-rasa
uv run poe static-type-check-python --package=machine-learning.hm-scikit-learn
uv run poe static-type-check-python --package=machine-learning.hm-sglang
uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.live-line-chart
uv run poe static-type-check-python --package=machine-learning.hm-streamlit.applications.map
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,8 @@ The diagram illustrates the repository's architecture, which is considered overl

### Machine Learning (ML)

- **scikit-learn** - Machine learning library
- **cuML** - GPU-accelerated machine learning library
- **PyTorch** - Machine learning
- **PyTorch Geometric** - PyTorch geometric deep learning extension
- **TorchServe** - PyTorch models serving
Expand Down
13 changes: 13 additions & 0 deletions machine-learning/hm-cuml/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Convenience targets for the hm-cuml project; each one wraps a single uv command.
# None of these targets produce a file, so they are declared phony: they always
# run, even if a file with the same name exists in this directory.
.PHONY: uv-install-python uv-update-lock-file uv-install-dependencies
.PHONY: uv-run-dev uv-run-test uv-run-test-coverage

# Python toolchain and dependency management
uv-install-python:
	uv python install
uv-update-lock-file:
	uv lock
uv-install-dependencies:
	uv sync --dev

# poe tasks defined in pyproject.toml
uv-run-dev:
	uv run poe dev
uv-run-test:
	uv run poe test
uv-run-test-coverage:
	uv run poe test-coverage
24 changes: 24 additions & 0 deletions machine-learning/hm-cuml/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Project metadata and exactly pinned runtime dependencies.
[project]
name = "hm-cuml"
version = "1.0.0"
requires-python = "~=3.12.0"
dependencies = [
  "cudf-cu12==24.12.0",
  "cuml-cu12==24.12.0",
  "scikit-learn==1.6.1",
]

# Development-only tooling, installed by `uv sync --dev`.
[dependency-groups]
dev = [
  "poethepoet==0.32.1",
  "pytest==8.3.4",
  "pytest-cov==6.0.0",
]

# NOTE(review): `package = false` presumably tells uv not to build/install this
# project as a package (it is run as a script) — confirm against the uv docs.
[tool.uv]
package = false

# Tasks invoked through `uv run poe <task>`.
[tool.poe.tasks]
dev = "python src/main.py"
# `--verbose` is passed twice on purpose to raise pytest's verbosity level.
test = "pytest --verbose --verbose"
test-coverage = "pytest --cov=. --cov-report=xml"
3 changes: 3 additions & 0 deletions machine-learning/hm-cuml/src/dummy_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class TestDummy:
    """Placeholder test case that only checks trivial arithmetic."""

    def test_dummy(self) -> None:
        """Assert that 1 + 1 equals 2."""
        total = 1 + 1
        assert total == 2
61 changes: 61 additions & 0 deletions machine-learning/hm-cuml/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import logging

import cudf
from cuml.ensemble import RandomForestClassifier
from cuml.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

logger = logging.getLogger(__name__)


def main() -> None:
    """Train and evaluate a cuML random forest classifier on the iris dataset.

    The data is loaded and split with scikit-learn, the model inputs are moved
    into cuDF objects, the features are standardized with cuML's
    ``StandardScaler``, and a cuML ``RandomForestClassifier`` is fitted on the
    training split. The accuracy and a per-class classification report for the
    held-out split are written to the module logger.
    """
    # Load the iris dataset
    iris = load_iris()
    x = iris.data
    y = iris.target

    # Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Convert the model inputs to cuDF objects
    x_train_cudf = cudf.DataFrame(x_train)
    x_test_cudf = cudf.DataFrame(x_test)
    y_train_cudf = cudf.Series(y_train)

    # Scale the features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train_cudf)
    x_test_scaled = scaler.transform(x_test_cudf)

    # Create and train the model
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(x_train_scaled, y_train_cudf)

    # Make predictions
    y_pred_cudf = rf_classifier.predict(x_test_scaled)

    # Convert predictions back to CPU for evaluation. The test labels are used
    # exactly as train_test_split returned them, so they need no cuDF round trip.
    y_pred = y_pred_cudf.values_host

    # Log results; the accuracy is passed as an argument so logging formats it lazily
    logger.info("cuML Results:")
    logger.info("Accuracy: %s", accuracy_score(y_test, y_pred))
    logger.info("Classification Report:")
    logger.info(classification_report(y_test, y_pred, target_names=iris.target_names))


if __name__ == "__main__":
    # Configure logging at INFO level first so the results logged by main() are emitted.
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    main()
Loading

0 comments on commit c10f203

Please sign in to comment.