Support KNN model

miaohancheng · Oct 31, 2024 · 78129bd · 78129bd
1 parent 86092cc
commit 78129bd
Show file tree

Hide file tree

Showing 17 changed files with 322 additions and 246 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@
 .idea*
 
 loan_full.csv
+/catboost_info/
diff --git a/Example.py b/Example.py
@@ -1,4 +1,4 @@
-
+print(1)
 import warnings
 warnings.filterwarnings('ignore')
 from pysmatch.Matcher import Matcher
@@ -23,8 +23,9 @@
 # for reproducibility
 np.random.seed(20240919)
 
-m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='linear')
-# m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='tree')
+# m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='knn')
+m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='tree')
+# m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='linear')
 
 
 m.predict_scores()

diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@
 
 - **Bug Fixes**: Addresses known issues from the original `pymatch` project.
 - **Parallel Computing**: Speeds up computation by utilizing multiple CPU cores.
-- **Model Selection**: Supports both linear (logistic regression) and tree-based models for propensity score estimation.
+- **Model Selection**: Supports linear models (logistic regression), tree-based models (e.g., CatBoost), and K-Nearest Neighbors (KNN) for propensity score estimation.
 
 ## Installation
 
@@ -192,14 +192,13 @@ We also specify nmodels=100 to train 100 models on different random samples of t
 
 
 
-With pysmatch, you can choose between linear models (logistic regression) and tree-based models (e.g., decision trees) for propensity score estimation. You can also leverage parallel computing to speed up model fitting by specifying the number of jobs (n_jobs).
-
+With pysmatch, you can choose between linear models (logistic regression), tree-based models (e.g., CatBoost), and K-Nearest Neighbors (KNN) for propensity score estimation. You can also leverage parallel computing to speed up model fitting by specifying the number of jobs (n_jobs).
 ```python
 # Set random seed for reproducibility
 np.random.seed(42)
 
 # Fit propensity score models
-m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')
+m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')  # or model_type='tree' / model_type='knn'
 ```
 
 Output:

diff --git a/README_CHINESE.md b/README_CHINESE.md
@@ -15,7 +15,7 @@
 
 - **错误修复**：解决了原 `pymatch` 项目中的已知问题。
 - **并行计算**：利用多核 CPU 加速计算过程。
-- **模型选择**：支持线性模型（逻辑回归）和基于树的模型（如决策树）进行倾向得分估计。
+- **模型选择**：支持线性模型（逻辑回归）、树模型（如 CatBoost），以及 K 近邻（KNN）模型进行倾向评分估计。
 
 ## 安装
 
@@ -207,14 +207,13 @@ n minority: 1219
 
 ## **模型选择和并行计算**
 
-使用 pysmatch，您可以在倾向得分估计中选择线性模型（逻辑回归）或基于树的模型（如决策树）。您还可以通过指定作业数（n_jobs）利用并行计算来加速模型拟合。
-
+使用 pysmatch，您可以在倾向评分估计中选择线性模型（逻辑回归）、树模型（例如 CatBoost）和 K 近邻（KNN）模型。您还可以通过指定作业数量（n_jobs）利用并行计算来加速模型拟合。
 ```python
 # Set random seed for reproducibility
 np.random.seed(42)
 
 # Fit propensity score models
-m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')
+m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')  # or model_type='tree' / model_type='knn'
 ```
 
 输出:

diff --git a/build/lib/pysmatch/Matcher.py b/build/lib/pysmatch/Matcher.py
@@ -1,4 +1,8 @@
 from __future__ import print_function
+import matplotlib.pyplot as plt
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import StandardScaler
+
 from pysmatch import *
 import pysmatch.functions as uf
 from catboost import CatBoostClassifier
@@ -52,6 +56,7 @@ def __init__(self, test, control, yvar, formula=None, exclude=None):
         self.errors = 0
         self.data[yvar] = self.data[yvar].astype(int)  # should be binary 0, 1
         self.xvars = [i for i in self.data.columns if i not in self.exclude]
+        self.original_xvars = self.xvars.copy()
         self.data = self.data.dropna(subset=self.xvars)
         self.matched_data = []
         self.xvars_escaped = [f"Q('{x}')" for x in self.xvars]
@@ -71,7 +76,25 @@ def __init__(self, test, control, yvar, formula=None, exclude=None):
         print('Formula:\n{} ~ {}'.format(yvar, '+'.join(self.xvars)))
         print('n majority:', len(self.data[self.data[yvar] == self.majority]))
         print('n minority:', len(self.data[self.data[yvar] == self.minority]))
+    def preprocess_data(self, X, fit_scaler=False, index=None):  # dummy-encode X, align its columns, then standardize with the scaler stored under `index`
+        X_encoded = pd.get_dummies(X)  # `pd` is not imported in the visible lines; presumably pandas via `from pysmatch import *` -- confirm
+
+        if not hasattr(self, 'X_columns'):  # first call on this instance: no column layout remembered yet
+            self.X_columns = X_encoded.columns  # NOTE(review): fixed by whichever call runs first and never reset afterwards
+        else:
+            X_encoded = X_encoded.reindex(columns=self.X_columns, fill_value=0)  # conform to the remembered layout; columns missing here are filled with 0
+
+        if fit_scaler:  # fitting path: learn a fresh scaler from this data
+            scaler = StandardScaler()
+            X_scaled = scaler.fit_transform(X_encoded)
+            if not hasattr(self, 'scalers'):  # create the scaler registry lazily on first use
+                self.scalers = {}
+            self.scalers[index] = scaler  # keyed by `index` so the non-fitting path can look it up again
+        else:
+            scaler = self.scalers[index]  # fails (AttributeError/KeyError) if no scaler was stored for `index` on this instance
+            X_scaled = scaler.transform(X_encoded)  # reuse the stored scaler without refitting
 
+        return X_scaled  # the scaler's output for the encoded, column-aligned X
     def fit_model(self, index, X, y, model_type, balance):
         X_train, _, y_train, _ = train_test_split(X, y, train_size=0.7, random_state=index)
 
@@ -81,10 +104,15 @@ def fit_model(self, index, X, y, model_type, balance):
         else:
             X_resampled, y_resampled = X_train, y_train
 
+        if model_type in ['linear', 'knn']:
+            X_processed = self.preprocess_data(X_resampled, fit_scaler=True, index=index)
+        else:
+            X_processed = X_resampled
+
         if model_type == 'linear':
             model = LogisticRegression(max_iter=100)
-            model.fit(X_resampled, y_resampled.iloc[:, 0])
-            accuracy = model.score(X_resampled, y_resampled)
+            model.fit(X_processed, y_resampled.iloc[:, 0])
+            accuracy = model.score(X_processed, y_resampled)
         elif model_type == 'tree':
             cat_features_indices = np.where(X_resampled.dtypes == 'object')[0]
             model = CatBoostClassifier(iterations=100, depth=6,
@@ -94,6 +122,12 @@ def fit_model(self, index, X, y, model_type, balance):
                                        logging_level='Silent')
             model.fit(X_resampled, y_resampled.iloc[:, 0], plot=False)
             accuracy = model.score(X_resampled, y_resampled)
+        elif model_type == 'knn':
+            model = KNeighborsClassifier(n_neighbors=5)
+            model.fit(X_processed, y_resampled.iloc[:, 0])
+            accuracy = model.score(X_processed, y_resampled)
+        else:
+            raise ValueError("Invalid model_type. Choose from 'linear', 'tree', or 'knn'.")
         print(f"Model {index + 1}/{self.nmodels} trained. Accuracy: {accuracy:.2%}")
         return {'model': model, 'accuracy': accuracy}
 
@@ -134,11 +168,20 @@ def predict_scores(self):
         """
         model_preds = []
 
-        for m in self.models:
+        for idx, m in enumerate(self.models):
+            if self.model_type in ['linear', 'knn']:
+                X_processed = self.preprocess_data(self.X, fit_scaler=False, index=idx)
+            else:
+                X_processed = self.X
+
             if self.model_type == 'linear':
-                preds = m.predict_proba(self.X)[:, 1]
+                preds = m.predict_proba(X_processed)[:, 1]
             elif self.model_type == 'tree':
                 preds = m.predict(self.X, prediction_type='Probability')[:, 1]
+            elif self.model_type == 'knn':
+                preds = m.predict_proba(X_processed)[:, 1]
+            else:
+                raise ValueError("Invalid model_type. Choose from 'linear', 'tree', or 'knn'.")
             model_preds.append(preds)
 
         model_preds = np.array(model_preds)
@@ -202,19 +245,6 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10):
         self.matched_data['match_id'] = match_ids
         self.matched_data['record_id'] = self.matched_data.index
 
-    def select_from_design(self, cols):
-        d = pd.DataFrame()
-        for c in cols:
-            d = pd.concat([d, self.X.select(lambda x: x.startswith(c), axis=1)], axis=1, sort=True)
-        return d
-
-    def balanced_sample(self, data=None):
-        if not data:
-            data = self.data
-        minor, major = data[data[self.yvar] == self.minority], \
-            data[data[self.yvar] == self.majority]
-        return pd.concat([major.sample(len(minor)), minor], ignore_index=True).dropna()
-
     def plot_scores(self):
         """
         Plots the distribution of propensity scores before matching between
@@ -229,6 +259,7 @@ def plot_scores(self):
         plt.title("Propensity Scores Before Matching")
         plt.ylabel("Percentage (%)")
         plt.xlabel("Scores")
+        plt.show()
 
     def prop_test(self, col):
         """
@@ -336,6 +367,7 @@ def compare_continuous(self, save=False, return_table=False,plot_result = True):
                                                    std_diff_med_after, std_diff_mean_after))
                     ax2.legend(loc="lower right")
                     plt.xlim((0, np.percentile(xta.x, 99)))
+                    plt.show()
 
                 test_results.append({
                     "var": col,
@@ -494,6 +526,7 @@ def tune_threshold(self, method, nmatches=1, rng=np.arange(0, .001, .0001)):
         plt.ylabel("Proportion Retained")
         plt.xlabel("Threshold")
         plt.xticks(rng)
+        plt.show()
 
     def record_frequency(self):
         """