From 6b33f598df2b835066b4f289b961b32f1f285e8d Mon Sep 17 00:00:00 2001
From: jojocys
Date: Sun, 27 Nov 2022 03:50:03 +0800
Subject: [PATCH 1/2] feat: prediction poc

---
 .gitignore                                   |  4 +-
 .../prediction/classification/__init__.py    |  0
 .../classification/classification.py         | 26 +++++
 services/prediction/main.py                  | 96 +++++++++++++++++++
 services/prediction/regression/__init__.py   |  0
 services/prediction/regression/regression.py | 35 +++++++
 services/prediction/transform.py             | 47 +++++++++
 7 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 services/prediction/classification/__init__.py
 create mode 100644 services/prediction/classification/classification.py
 create mode 100644 services/prediction/main.py
 create mode 100644 services/prediction/regression/__init__.py
 create mode 100644 services/prediction/regression/regression.py
 create mode 100644 services/prediction/transform.py

diff --git a/.gitignore b/.gitignore
index 8ec6f604..77bd7eda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,4 +50,6 @@ __pycache__/
 connect.db
 not_sqlalchemy/
 run_flask_app.bat
-venv
\ No newline at end of file
+venv
+
+dataset-with-metas.json
\ No newline at end of file
diff --git a/services/prediction/classification/__init__.py b/services/prediction/classification/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/services/prediction/classification/classification.py b/services/prediction/classification/classification.py
new file mode 100644
index 00000000..5322fb82
--- /dev/null
+++ b/services/prediction/classification/classification.py
@@ -0,0 +1,26 @@
+from sklearn import tree
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
+from sklearn.metrics import accuracy_score
+
+def classification (X_train, X_test, y_train, y_test, headers, algorithm):
+    clf = None
+    if algorithm == 'decisionTree':
+        clf = tree.DecisionTreeClassifier()
+    elif algorithm == 'gradientBoosting':
+        clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0)
+    elif algorithm == 'adaBoost':
+        clf = AdaBoostClassifier(n_estimators=100)
+    else:
+        print('using random forest')
+        clf = RandomForestClassifier(max_depth=3, random_state=0)
+
+    clf = clf.fit(X_train, y_train)
+    predict_res = clf.predict(X_test)
+    score = accuracy_score(y_test, predict_res)
+    diffs = []
+    for i in range(len(y_test)):
+        if y_test[i] != predict_res[i]:
+            diffs.append(0)
+        else:
+            diffs.append(1)
+    return score, diffs
\ No newline at end of file
diff --git a/services/prediction/main.py b/services/prediction/main.py
new file mode 100644
index 00000000..13717345
--- /dev/null
+++ b/services/prediction/main.py
@@ -0,0 +1,96 @@
+from flask import Flask, request
+from flask_cors import CORS
+import json
+import numpy as np
+import random
+from classification.classification import classification
+from regression.regression import regression
+from transform import makeTrainingData
+
+app = Flask(__name__)
+CORS(app)
+
+app = Flask(__name__)
+cors = CORS(app, resources={r"/api/*": {"origins": "*"}})
+
+def controlSplitTrainTest (X, y, split_states: 'list[int]'):
+    train_indices = []
+    test_indices = []
+    for i in range(len(split_states)):
+        if split_states[i] == 1:
+            train_indices.append(i)
+        if split_states[i] == 0:
+            test_indices.append(i)
+    train_indices = np.array(train_indices)
+    test_indices = np.array(test_indices)
+    X_train = X.take(train_indices, axis=0)
+    X_test = X.take(test_indices, axis=0)
+    y_train = y.take(train_indices, axis=0)
+    y_test = y.take(test_indices, axis=0)
+    return X_train, X_test, y_train, y_test
+
+def mockSplitIndices (size: int, ratio: float):
+    indices = []
+    for i in range(size):
+        if random.random() > ratio:
+            indices.append(1)
+        else:
+            indices.append(0)
+    return indices
+
+@app.get('/api/ping')
+def ping():
+    return {
+        "success": True
+    }
+
+@app.post("/api/train_test")
+def runClassificationModel():
+    try:
+        dataset = json.loads(request.data)
+        data = dataset['dataSource']
+        fields = dataset['fields']
+        model = json.loads(request.data)['model']
+        features = model['features']
+        targets = model['targets']
+        algorithm = model['algorithm']
+        mode = dataset['mode']
+        trainTestSplitIndices = []
+        if 'trainTestSplitIndices' in dataset:
+            trainTestSplitIndices = dataset['trainTestSplitIndices']
+        else:
+            trainTestSplitIndices = mockSplitIndices(len(data), 0.2)
+        testset_indices = []
+        for i in range(len(trainTestSplitIndices)):
+            if trainTestSplitIndices[i] == 0:
+                testset_indices.append(i)
+        X, y, headers = makeTrainingData(data=data, fields=fields, features=features, target=targets[0])
+        # print(X.shape, y.shape, len(headers))
+        X_train, X_test, y_train, y_test = controlSplitTrainTest(X, y, trainTestSplitIndices)
+        # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
+        score = 0
+        diffs = []
+        if mode == 'classification':
+            score, diffs = classification(X_train, X_test, y_train, y_test, headers, algorithm)
+        elif mode == 'regression':
+            score, diffs = regression(X_train, X_test, y_train, y_test, headers, algorithm)
+        if len(diffs) != len(testset_indices):
+            print('[warning] diffs and testset_indices have different lengths')
+        result = []
+        for i in range(len(diffs)):
+            result.append([testset_indices[i], diffs[i]])
+        return {
+            "success": True,
+            "data": {
+                "accuracy": score,
+                "result": result
+            }
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "message": str(e)
+        }
+
+if __name__ == '__main__':
+    app.run(host= '0.0.0.0',port=5533,debug=True)
diff --git a/services/prediction/regression/__init__.py b/services/prediction/regression/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/services/prediction/regression/regression.py b/services/prediction/regression/regression.py
new file mode 100644
index 00000000..a008a303
--- /dev/null
+++ b/services/prediction/regression/regression.py
@@ -0,0 +1,35 @@
+from sklearn import linear_model
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn import tree, ensemble
+import numpy as np
+
+def regression (X_train, X_test, y_train, y_test, headers, algorithm):
+    regr = None
+    if algorithm == 'linearRegression':
+        regr = linear_model.LinearRegression()
+    elif algorithm == 'lasso':
+        regr = linear_model.Lasso(alpha=0.1)
+    elif algorithm == 'ridge':
+        regr = linear_model.Ridge(alpha=0.5)
+    elif algorithm == 'decisionTree':
+        regr = tree.DecisionTreeRegressor()
+    elif algorithm == 'randomForest':
+        regr = ensemble.RandomForestRegressor(max_depth=3, random_state=50, oob_score=True)
+    else:
+        regr = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7)
+
+    regr.fit(X_train, y_train)
+    predict_res = regr.predict(X_test)
+    score = regr.score(X_test, y_test)
+    # score = r2_score(y_test, predict_res)
+    diffs = []
+    std = np.std(y_test)
+    for i in range(len(y_test)):
+        z_score = (y_test[i] - predict_res[i]) / std
+        if z_score > 2 or z_score < -2:
+            diffs.append(0)
+        else:
+            diffs.append(1)
+    return score, diffs
+
+    
\ No newline at end of file
diff --git a/services/prediction/transform.py b/services/prediction/transform.py
new file mode 100644
index 00000000..c1e83d7f
--- /dev/null
+++ b/services/prediction/transform.py
@@ -0,0 +1,47 @@
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+import numpy as np
+import pandas as pd
+def makeTrainingData(data, fields, features, target):
+    headers = []
+    featureFields = list(filter(lambda x: x['fid'] in features, fields))
+    targetField = list(filter(lambda x: x['fid'] == target, fields))[0]
+    X = np.zeros(shape=(len(data), 1))
+    y = np.zeros(shape=(len(data), 1))
+    target_encoder = OrdinalEncoder()
+    target_values = np.array([row[targetField['fid']] for row in data])
+    y = target_encoder.fit_transform(target_values.reshape(-1, 1))
+    for field in featureFields:
+        if field['semanticType'] == 'nominal':
+            values = np.array([row[field['fid']] for row in data])
+            values = values.reshape(-1, 1)
+            if field['features']['unique'] > 2:
+                encoder = OneHotEncoder()
+                res = encoder.fit_transform(values)
+                X = np.concatenate((X, res.toarray()), axis=1)
+                for v in encoder.categories_[0]:
+                    headers.append(field['name'] + '_' + v)
+                continue
+            else:
+                encoder = OrdinalEncoder()
+                res = encoder.fit_transform(values)
+                X = np.concatenate((X, res), axis=1)
+        elif field['semanticType'] == 'ordinal':
+            values = np.array([row[field['fid']] for row in data])
+            values = values.reshape(-1, 1)
+            encoder = OrdinalEncoder()
+            res = encoder.fit_transform(values)
+            X = np.concatenate((X, res), axis=1)
+        elif field['semanticType'] == 'quantitative':
+            values = np.array([row[field['fid']] for row in data])
+            values = values.reshape(-1, 1)
+            X = np.concatenate((X, values), axis=1)
+        elif field['semanticType'] == 'temporal':
+            timestamps = []
+            for row in data:
+                ts = pd.Timestamp(row[field['fid']]).timestamp()
+                timestamps.append(ts)
+            values = np.array(timestamps)
+            values = values.reshape(-1, 1)
+            X = np.concatenate((X, values), axis=1)
+        headers.append(field['name'])
+    return X[:,1:], y, headers
\ No newline at end of file
From 17dd1116eb5f44c650afb09fa3c5cecfbc556f5f Mon Sep 17 00:00:00 2001
From: jojocys
Date: Sun, 27 Nov 2022 06:29:43 +0800
Subject: [PATCH 2/2] fix: routes

---
 services/prediction/main.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/services/prediction/main.py b/services/prediction/main.py
index 13717345..21e6ca3d 100644
--- a/services/prediction/main.py
+++ b/services/prediction/main.py
@@ -7,9 +7,6 @@
 from regression.regression import regression
 from transform import makeTrainingData
 
-app = Flask(__name__)
-CORS(app)
-
 app = Flask(__name__)
 cors = CORS(app, resources={r"/api/*": {"origins": "*"}})
 
@@ -38,13 +35,13 @@ def mockSplitIndices (size: int, ratio: float):
             indices.append(0)
     return indices
 
-@app.get('/api/ping')
+@app.route('/api/ping', methods=['GET'])
 def ping():
     return {
         "success": True
     }
 
-@app.post("/api/train_test")
+@app.route("/api/train_test", methods=['POST'])
 def runClassificationModel():
     try:
         dataset = json.loads(request.data)