diff --git a/.github/workflows/nox.yml b/.github/workflows/nox.yml
new file mode 100644
index 0000000..3916a24
--- /dev/null
+++ b/.github/workflows/nox.yml
@@ -0,0 +1,45 @@
+name: Run nox tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [
+          ubuntu-22.04,
+          ubuntu-24.04,
+          windows-2019,
+          windows-2022,
+          windows-latest,
+          macos-11,
+          macos-12,
+          macos-13,
+          macos-14,
+        ]
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install nox
+
+      - name: Run nox
+        run: nox --non-interactive --error-on-missing-interpreter --python ${{ matrix.python-version }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 07e1df4..0072a13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,9 @@
 .DS_Store
-/dist/*
-/src/flameai/__pycache__/*
\ No newline at end of file
+.idea/
+.vscode/
+.pytest_cache/
+__pycache__/
+/src/flameai/__pycache__/
+/tests/__pycache__/
+dist/
+.nox/
\ No newline at end of file
diff --git a/noxfile.py b/noxfile.py
new file mode 100644
index 0000000..6a7d56c
--- /dev/null
+++ b/noxfile.py
@@ -0,0 +1,14 @@
+import nox
+
+
+@nox.session(python=['3.8', '3.9', '3.10', '3.11', '3.12'])
+def tests(session):
+    session.install('pytest')
+    session.install('-e', '.')
+    session.run('pytest')
+
+
+@nox.session
+def lint(session):
+    session.install('flake8')
+    session.run('flake8')
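Note on the workflow step above: each CI job installs exactly one interpreter, so `nox` is told which sessions to run via `--python`; without that filter, `--error-on-missing-interpreter` would fail every job on the four interpreters that are not installed. The noxfile below duplicates the interpreter list from the CI matrix; a small hypothetical variant (not part of this diff) that names the list once and forwards extra pytest arguments through nox's `session.posargs`:

```python
# Hypothetical variant of noxfile.py, not in this diff: one source of
# truth for the interpreter list, plus pass-through pytest arguments.
import nox

PYTHON_VERSIONS = ['3.8', '3.9', '3.10', '3.11', '3.12']


@nox.session(python=PYTHON_VERSIONS)
def tests(session):
    session.install('pytest')
    session.install('-e', '.')
    # e.g. `nox -s tests-3.12 -- -k value_counts` forwards `-k value_counts`
    session.run('pytest', *session.posargs)
```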
readme = "README.md" keywords = [ @@ -16,7 +16,7 @@ authors = [ { name = "luochang" }, { email = "luochang212@gmail.com" }, ] -requires-python = ">=3.10" +requires-python = ">=3.8" dependencies = [ "numpy>=1.26.4", "pandas>=2.2.0", @@ -26,7 +26,6 @@ dependencies = [ "seaborn>=0.13.2", "optuna>=3.6.1", "click>=8.1.7", - "torch>=2.2.2", ] classifiers = [ "License :: OSI Approved :: Apache Software License", diff --git a/src/flameai/__init__.py b/src/flameai/__init__.py index 229c7d3..8ab3db9 100644 --- a/src/flameai/__init__.py +++ b/src/flameai/__init__.py @@ -12,4 +12,4 @@ 'metrics', 'mining', 'plot', -] \ No newline at end of file +] diff --git a/src/flameai/__main__.py b/src/flameai/__main__.py index 0a47e8d..7ff3c1d 100644 --- a/src/flameai/__main__.py +++ b/src/flameai/__main__.py @@ -1,13 +1,14 @@ -# Usage: python -m flameai -from ._env import check_hive_env, check_python_env, num_gpus +# Usage: python -m flameai +from ._env import check_hive_env, check_python_env, num_gpus, HAS_TORCH def check_env(): text = lambda e: 'YES' if e == 0 else 'NO' print(f'Python: {text(check_python_env())}') print(f'Hive: {text(check_hive_env())}') - print(f'GPU: {"YES" if num_gpus() >= 1 else "NO"}') + if HAS_TORCH: + print(f'GPU: {"YES" if num_gpus() >= 1 else "NO"}') if __name__ == "__main__": - check_env() \ No newline at end of file + check_env() diff --git a/src/flameai/_env.py b/src/flameai/_env.py index 49aeb29..29c3c31 100644 --- a/src/flameai/_env.py +++ b/src/flameai/_env.py @@ -1,5 +1,19 @@ import subprocess -import torch + +from .util import set_logger + + +logger = set_logger() + + +HAS_TORCH = None +try: + import torch + HAS_TORCH = True +except ImportError: + HAS_TORCH = False + logger.warning("PyTorch not found. Please install it using 'pip install torch'") + logger.warning("or 'pip install torch -i https://mirrors.aliyun.com/pypi/simple/'") def check_python_env() -> int: @@ -49,4 +63,5 @@ def try_gpu(i: int = 0): if __name__ == '__main__': print('check_python_env:', check_python_env()) print('check_hive_env:', check_hive_env()) - print('try_gpu:', try_gpu()) \ No newline at end of file + if HAS_TORCH: + print('try_gpu:', try_gpu()) diff --git a/src/flameai/cmd.py b/src/flameai/cmd.py index 4b8fd58..be8b7e4 100644 --- a/src/flameai/cmd.py +++ b/src/flameai/cmd.py @@ -31,7 +31,7 @@ def hive_cli(file_name: str) -> None: try: res = subprocess.run(command, shell=True, text=True) if res.returncode != 0: - logger.warning(f'Failed to execute query.') + logger.warning('Failed to execute query.') logger.error(f'Error: {res.stderr}') logger.error(f'returncode: {res.returncode}') except Exception as e: @@ -39,4 +39,4 @@ def hive_cli(file_name: str) -> None: if __name__ == "__main__": - hive_cli() \ No newline at end of file + hive_cli() diff --git a/src/flameai/metrics.py b/src/flameai/metrics.py index 7491297..55df251 100644 --- a/src/flameai/metrics.py +++ b/src/flameai/metrics.py @@ -1,5 +1,3 @@ -import numpy as np -import pandas as pd import sklearn.metrics from enum import Enum @@ -21,9 +19,9 @@ def lgb_feature_importance(gbm) -> None: :param gbm: The trained LightGBM model. 
""" items = [(k, v) for k, v in zip(gbm.feature_name(), gbm.feature_importance())] - sorted_items = sorted(items, key = lambda e: e[1], reverse = True) + sorted_items = sorted(items, key=lambda e: e[1], reverse=True) for i, (k, v) in enumerate(sorted_items): - print(f'[rank {i+1}] {k}: {v}') + print(f'[rank {i + 1}] {k}: {v}') def eval_continuous(y_true, y_pred) -> None: @@ -43,13 +41,14 @@ def eval_continuous(y_true, y_pred) -> None: print(f'r2_score: {r2_score:.5f}') -def eval_binary(y_true, - y_pred, - threshold: Optional[float] = None, - metric: Metric = Metric.F1_SCORE, - n_trials: int = 200, - ret: bool = False - ) -> Optional[Tuple[Any, float]]: +def eval_binary( + y_true, + y_pred, + threshold: Optional[float] = None, + metric: Metric = Metric.F1_SCORE, + n_trials: int = 200, + ret: bool = False +) -> Optional[Tuple[Any, float]]: """ Evaluate a binary classification task. @@ -66,8 +65,8 @@ def eval_binary(y_true, """ # Metrics that can be directly calculated using y_pred - auc = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_pred) - log_loss = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_pred) + auc = sklearn.metrics.roc_auc_score(y_true=y_true, y_score=y_pred) + log_loss = sklearn.metrics.log_loss(y_true=y_true, y_pred=y_pred) # If the threshold does not exist, obtain it if threshold is None: @@ -76,11 +75,11 @@ def eval_binary(y_true, y_label = [1 if e > threshold else 0 for e in y_pred] # Metrics that require the predicted labels (y_label) - acc = sklearn.metrics.accuracy_score(y_true = y_true, y_pred = y_label) - precision = sklearn.metrics.precision_score(y_true = y_true, y_pred = y_label) - recall = sklearn.metrics.recall_score(y_true = y_true, y_pred = y_label) - f1 = sklearn.metrics.f1_score(y_true = y_true, y_pred = y_label) - cm = sklearn.metrics.confusion_matrix(y_true = y_true, y_pred = y_label) + acc = sklearn.metrics.accuracy_score(y_true=y_true, y_pred=y_label) + precision = sklearn.metrics.precision_score(y_true=y_true, y_pred=y_label) + recall = sklearn.metrics.recall_score(y_true=y_true, y_pred=y_label) + f1 = sklearn.metrics.f1_score(y_true=y_true, y_pred=y_label) + cm = sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=y_label) tn, fp, fn, tp = cm.ravel() print(f'threshold: {threshold:.5f}') @@ -97,4 +96,4 @@ def eval_binary(y_true, print(f'confusion matrix:\n{cm}') if ret: - return y_label, threshold \ No newline at end of file + return y_label, threshold diff --git a/src/flameai/mining.py b/src/flameai/mining.py index 005aa89..8bb776f 100644 --- a/src/flameai/mining.py +++ b/src/flameai/mining.py @@ -12,4 +12,4 @@ def value_counts(df: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame({ 'col_name': df.columns, 'val_cnt': val_cnt_list - }) \ No newline at end of file + }) diff --git a/src/flameai/plot.py b/src/flameai/plot.py index d589b2b..0a1834e 100644 --- a/src/flameai/plot.py +++ b/src/flameai/plot.py @@ -11,18 +11,18 @@ def roc_curve(y_true, y_score) -> None: :param y_true: An array of true binary labels. :param y_score: An array of predicted probabilities. 
""" - fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true = y_true, y_score = y_score) - auc = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score) + fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true=y_true, y_score=y_score) + auc = sklearn.metrics.roc_auc_score(y_true=y_true, y_score=y_score) print(f'AUC: {auc:.5f}') - plt.figure(figsize = (8, 6)) + plt.figure(figsize=(8, 6)) plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc)) plt.title('Receiver Operating Characteristic (ROC) Curve') plt.xlabel('False Positive Rate (FPR)') plt.ylabel('True Positive Rate (TPR)') plt.legend(loc="lower right") - plt.grid(True, linestyle = 'dashed', alpha = 0.5) + plt.grid(True, linestyle='dashed', alpha=0.5) plt.show() @@ -34,8 +34,8 @@ def confusion_matrix(y_true, y_label) -> None: :param y_true: An array of true binary labels. :param y_label: An array of labels predicted by the model. """ - cm = sklearn.metrics.confusion_matrix(y_true = y_true, y_pred = y_label) - cm_matrix = pd.DataFrame(data = cm, - columns = ['Predict Negative:0', 'Predict Positive:1'], - index = ['Actual Negative:0', 'Actual Positive:1']) - sns.heatmap(cm_matrix, annot = True, fmt = 'd', cmap = 'YlGnBu') \ No newline at end of file + cm = sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=y_label) + cm_matrix = pd.DataFrame(data=cm, + columns=['Predict Negative:0', 'Predict Positive:1'], + index=['Actual Negative:0', 'Actual Positive:1']) + sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu') diff --git a/src/flameai/preprocessing.py b/src/flameai/preprocessing.py index b0ab688..df007b1 100644 --- a/src/flameai/preprocessing.py +++ b/src/flameai/preprocessing.py @@ -1,8 +1,6 @@ -import math import numpy as np import pandas as pd import sklearn.preprocessing -import torch def label_encoder(df: pd.DataFrame) -> pd.DataFrame: @@ -27,7 +25,7 @@ def gen_scale_pos_weight(y_train) -> float: class DataLoader: - def __init__(self, lst: list = []): + def __init__(self, lst: list): self.i = 0 self._data = lst @@ -49,16 +47,3 @@ def __next__(self): return self._data[self.i - 1] else: raise StopIteration - - -def data_iter(data: list, batch_size: int) -> DataLoader: - """Split the original input data list into batches.""" - lst = [] - batch_num = math.floor(len(data) / batch_size) - for i in range(batch_num): - start, end = batch_size * i, batch_size * (i + 1) - X = torch.tensor([e[0] for e in data[start:end]]) - y = torch.tensor([e[1] for e in data[start:end]]) - lst.append((X, y)) - - return DataLoader(lst) \ No newline at end of file diff --git a/src/flameai/train.py b/src/flameai/train.py index 25dae9b..9d58f0a 100644 --- a/src/flameai/train.py +++ b/src/flameai/train.py @@ -1,16 +1,18 @@ import numpy as np import scipy -import sklearn.metrics import optuna +from typing import Optional + class AdaptiveLearningRate: """Customized learning rate decay""" + def __init__(self, - learning_rate: float = 0.3, - decay_rate: float = 0.9, - patience: int = 10 - ) -> None: + learning_rate: float = 0.3, + decay_rate: float = 0.9, + patience: int = 10 + ) -> None: self.learning_rate = learning_rate self.decay_rate = decay_rate self.patience = patience @@ -52,10 +54,10 @@ def gen_threshold(y_true, y_pred, metric, n_trials: int) -> float: def objective(trial): t = trial.suggest_float('threshold', 0.0, 1.0) y_label = [1 if e > t else 0 for e in y_pred] - return metric(y_true = y_true, y_pred = y_label) + return metric(y_true=y_true, y_pred=y_label) - study = optuna.create_study(direction = 'maximize') - 
diff --git a/src/flameai/train.py b/src/flameai/train.py
index 25dae9b..9d58f0a 100644
--- a/src/flameai/train.py
+++ b/src/flameai/train.py
@@ -1,16 +1,18 @@
 import numpy as np
 import scipy
-import sklearn.metrics
 import optuna
 
+from typing import Optional
+
 
 class AdaptiveLearningRate:
     """Customized learning rate decay"""
+
     def __init__(self,
-            learning_rate: float = 0.3,
-            decay_rate: float = 0.9,
-            patience: int = 10
-            ) -> None:
+                 learning_rate: float = 0.3,
+                 decay_rate: float = 0.9,
+                 patience: int = 10
+                 ) -> None:
         self.learning_rate = learning_rate
         self.decay_rate = decay_rate
         self.patience = patience
@@ -52,10 +54,10 @@ def gen_threshold(y_true, y_pred, metric, n_trials: int) -> float:
     def objective(trial):
         t = trial.suggest_float('threshold', 0.0, 1.0)
         y_label = [1 if e > t else 0 for e in y_pred]
-        return metric(y_true = y_true, y_pred = y_label)
+        return metric(y_true=y_true, y_pred=y_label)
 
-    study = optuna.create_study(direction = 'maximize')
-    study.optimize(objective, n_trials = n_trials)
+    study = optuna.create_study(direction='maximize')
+    study.optimize(objective, n_trials=n_trials)
     best_params = study.best_params
 
     # Restore the original logging level
@@ -64,7 +66,7 @@ def objective(trial):
     return best_params['threshold']
 
 
-def gen_threshold_cdf(y_pred, rate: float, interval: int = 100) -> float:
+def gen_threshold_cdf(y_pred, rate: float, interval: int = 100) -> Optional[float]:
     """
     Finds the optimal threshold based on
     the desired proportion of negative samples (label 0)
@@ -81,8 +83,7 @@
     px = 0
     for x, y in zip(xx, cdf):
         if y > rate:
-            xa = (px + x) / 2
-            break
+            return (px + x) / 2
         px = x
 
-    return xa
\ No newline at end of file
+    return None
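Returning `Optional[float]` here is more than a type annotation: the old code set `xa` only on `break`, so `return xa` raised `UnboundLocalError` whenever the empirical CDF never exceeded `rate`. The rewrite pushes that no-solution case onto the caller explicitly. A caller-side sketch:

```python
# Sketch: handle the None case that gen_threshold_cdf can now return.
from flameai.train import gen_threshold_cdf

y_pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

threshold = gen_threshold_cdf(y_pred, rate=0.4)
if threshold is None:
    raise ValueError('empirical CDF never exceeded the requested rate')
```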
""" @@ -46,12 +47,12 @@ def set_logger(name: str = 'flameai', level: int = logging.WARNING): # Create formatter formatter = logging.Formatter( - fmt = '%(asctime)s %(levelname)s [%(name)s]: (%(module)s:%(funcName)s(%(lineno)d)) - %(message)s', - datefmt = '%Y-%m-%d %H:%M:%S' + fmt='%(asctime)s %(levelname)s [%(name)s]: (%(module)s:%(funcName)s(%(lineno)d)) - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' ) stream_handler.setFormatter(formatter) # add formatter to stream_handler # add stream_handler to logger logger.addHandler(stream_handler) - return logger \ No newline at end of file + return logger diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 71cb0ac..e8d3ca8 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,29 +1,26 @@ from flameai.metrics import eval_binary, Metric -y_true = [0, 0, 1, 0, 0, 1, 0, 1, 1, 0] +y_true = [0, 0, 0, 1, 0, 1, 0, 1, 1, 0] y_pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] def test_eval_binary_with_threshold(): """set threshold by hand.""" - eval_binary(y_true, y_pred, threshold = 0.5) + y_label, _ = eval_binary(y_true, y_pred, threshold=0.5, ret=True) + assert y_label == [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + def test_eval_binary_maximize_precision(): """Selecting the optimal threshold to maximize precision.""" - eval_binary(y_true, y_pred, metric = Metric.PRECISION) + eval_binary(y_true, y_pred, metric=Metric.PRECISION) + def test_eval_binary_maximize_recall(): """Selecting the optimal threshold to maximize recall.""" - eval_binary(y_true, y_pred, metric = Metric.RECALL) + eval_binary(y_true, y_pred, metric=Metric.RECALL) + def test_eval_binary_maximize_f1_score(): """Selecting the optimal threshold to maximize f1_score.""" - eval_binary(y_true, y_pred, metric = Metric.F1_SCORE) - - -if __name__ == '__main__': - # test_eval_binary_with_threshold() - test_eval_binary_maximize_precision() - # test_eval_binary_maximize_recall() - # test_eval_binary_maximize_f1_score() \ No newline at end of file + eval_binary(y_true, y_pred, metric=Metric.F1_SCORE) diff --git a/tests/test_mining.py b/tests/test_mining.py new file mode 100644 index 0000000..6e9375d --- /dev/null +++ b/tests/test_mining.py @@ -0,0 +1,11 @@ +import pandas as pd +from flameai.mining import value_counts + + +def test_value_counts(): + df = pd.DataFrame({'a': [5, 1, 3, 4, 3, 6, 6, 8, 8, 10], + 'b': [1, 0, 3, 7, 5, 6, 7, 11, 5, 10], + }) + result = value_counts(df) + assert result.equals(pd.DataFrame({'col_name': ['a', 'b'], + 'val_cnt': [7, 8]})) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py new file mode 100644 index 0000000..bc31a15 --- /dev/null +++ b/tests/test_preprocessing.py @@ -0,0 +1,28 @@ +import pandas as pd +from flameai.preprocessing import label_encoder, gen_scale_pos_weight, DataLoader + + +def test_label_encoder(): + df = pd.DataFrame({'a': ['a', 'b', 'c', 'a', 'b', 'c'], + 'b': [1, 2, 3, 2, 1, 0]}) + result = label_encoder(df) + assert result.equals(pd.DataFrame({'a': [0, 1, 2, 0, 1, 2], + 'b': [1, 2, 3, 2, 1, 0]})) + + +def test_gen_scale_pos_weight(): + y_train = pd.Series([0, 1, 0, 0, 1, 0, 0]) + result = gen_scale_pos_weight(y_train) + assert result == 2.5 + + +def test_data_loader(): + lst1 = [1, 2, 3, 4, 5, 6] + dt = DataLoader(lst1) + assert [e for e in dt] == lst1 + assert [e for e in dt] == lst1 + + lst2 = [1, 2, 3] + dt.data = lst2 + assert [e for e in dt] == lst2 + assert dt.data == lst2 diff --git a/tests/test_train.py b/tests/test_train.py new file mode 100644 index 0000000..8fdebe4 --- /dev/null +++ 
diff --git a/tests/test_train.py b/tests/test_train.py
new file mode 100644
index 0000000..8fdebe4
--- /dev/null
+++ b/tests/test_train.py
@@ -0,0 +1,30 @@
+from flameai.metrics import Metric
+from flameai.train import gen_threshold, gen_threshold_cdf
+
+
+y_true = [0, 0, 0, 1, 0, 1, 0, 1, 1, 0]
+y_pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+
+
+def test_gen_threshold():
+    gen_threshold(y_true=y_true,
+                  y_pred=y_pred,
+                  metric=Metric.PRECISION,
+                  n_trials=200)
+    gen_threshold(y_true=y_true,
+                  y_pred=y_pred,
+                  metric=Metric.RECALL,
+                  n_trials=200)
+    gen_threshold(y_true=y_true,
+                  y_pred=y_pred,
+                  metric=Metric.ACCURACY,
+                  n_trials=200)
+    gen_threshold(y_true=y_true,
+                  y_pred=y_pred,
+                  metric=Metric.F1_SCORE,
+                  n_trials=200)
+
+
+def test_gen_threshold_cdf():
+    result = gen_threshold_cdf(y_pred=y_pred, rate=0.4)
+    assert result == 0.55
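One hardening note on the new test: `result == 0.55` compares floats exactly, which holds only while the midpoint arithmetic lands precisely on 0.55; `pytest.approx` is the usual guard if the grid resolution or the data ever changes. A hedged variant (same data as above, added function name is hypothetical):

```python
# Sketch: float-tolerant variant of the CDF threshold test.
import pytest

from flameai.train import gen_threshold_cdf

y_pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


def test_gen_threshold_cdf_approx():
    assert gen_threshold_cdf(y_pred=y_pred, rate=0.4) == pytest.approx(0.55)
```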