Skip to content

Commit

Permalink
Support KNN model
Browse files Browse the repository at this point in the history
Support KNN model
  • Loading branch information
miaohancheng committed Oct 31, 2024
1 parent 86092cc commit 78129bd
Show file tree
Hide file tree
Showing 17 changed files with 322 additions and 246 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
.idea*

loan_full.csv
/catboost_info/
7 changes: 4 additions & 3 deletions Example.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

print(1)
import warnings
warnings.filterwarnings('ignore')
from pysmatch.Matcher import Matcher
Expand All @@ -23,8 +23,9 @@
# for reproducibility
np.random.seed(20240919)

m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='linear')
# m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='tree')
# m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='knn')
m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='tree')
# m.fit_scores(balance=True, nmodels=10,n_jobs=3,model_type='linear')


m.predict_scores()
Expand Down
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

- **Bug Fixes**: Addresses known issues from the original `pymatch` project.
- **Parallel Computing**: Speeds up computation by utilizing multiple CPU cores.
- **Model Selection**: Supports both linear (logistic regression) and tree-based models for propensity score estimation.
- **Model Selection**: Supports linear models (logistic regression), tree-based models (e.g., CatBoost), and K-Nearest Neighbors (KNN) for propensity score estimation.

## Installation

Expand Down Expand Up @@ -192,14 +192,13 @@ We also specify nmodels=100 to train 100 models on different random samples of t



With pysmatch, you can choose between linear models (logistic regression) and tree-based models (e.g., decision trees) for propensity score estimation. You can also leverage parallel computing to speed up model fitting by specifying the number of jobs (n_jobs).

With pysmatch, you can choose among linear models (logistic regression), tree-based models (e.g., CatBoost), and K-Nearest Neighbors (KNN) for propensity score estimation. You can also speed up model fitting with parallel computing by specifying the number of jobs (n_jobs).
```python
# Set random seed for reproducibility
np.random.seed(42)

# Fit propensity score models
m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')
m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')  # model_type may also be 'tree' or 'knn'
```

Output:
Expand Down
7 changes: 3 additions & 4 deletions README_CHINESE.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

- **错误修复**:解决了原 `pymatch` 项目中的已知问题。
- **并行计算**:利用多核 CPU 加速计算过程。
- **模型选择**:支持线性模型(逻辑回归)和基于树的模型(如决策树)进行倾向得分估计
- **模型选择**:支持线性模型(逻辑回归)、树模型(如 CatBoost),以及 K 近邻(KNN)模型进行倾向评分估计

## 安装

Expand Down Expand Up @@ -207,14 +207,13 @@ n minority: 1219

## **模型选择和并行计算**

使用 pysmatch,您可以在倾向得分估计中选择线性模型(逻辑回归)或基于树的模型(如决策树)。您还可以通过指定作业数(n_jobs)利用并行计算来加速模型拟合。

使用 pysmatch,您可以在倾向评分估计中选择线性模型(逻辑回归)、树模型(例如 CatBoost)和 K 近邻(KNN)模型。您还可以通过指定作业数量(n_jobs)利用并行计算来加速模型拟合。
```python
# Set random seed for reproducibility
np.random.seed(42)

# Fit propensity score models
m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')
m.fit_scores(balance=True, nmodels=100, n_jobs=5, model_type='linear')  # model_type may also be 'tree' or 'knn'
```

输出:
Expand Down
67 changes: 50 additions & 17 deletions build/lib/pysmatch/Matcher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from __future__ import print_function
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from pysmatch import *
import pysmatch.functions as uf
from catboost import CatBoostClassifier
Expand Down Expand Up @@ -52,6 +56,7 @@ def __init__(self, test, control, yvar, formula=None, exclude=None):
self.errors = 0
self.data[yvar] = self.data[yvar].astype(int) # should be binary 0, 1
self.xvars = [i for i in self.data.columns if i not in self.exclude]
self.original_xvars = self.xvars.copy()
self.data = self.data.dropna(subset=self.xvars)
self.matched_data = []
self.xvars_escaped = [f"Q('{x}')" for x in self.xvars]
Expand All @@ -71,7 +76,25 @@ def __init__(self, test, control, yvar, formula=None, exclude=None):
print('Formula:\n{} ~ {}'.format(yvar, '+'.join(self.xvars)))
print('n majority:', len(self.data[self.data[yvar] == self.majority]))
print('n minority:', len(self.data[self.data[yvar] == self.minority]))
def preprocess_data(self, X, fit_scaler=False, index=None):
    """
    Dummy-encode ``X`` and scale it with the scaler stored under ``index``.

    The dummy-encoded column layout of the first call is remembered in
    ``self.X_columns``; later calls are reindexed to that layout, with
    columns missing from the new data filled with 0.

    Parameters
    ----------
    X : data accepted by ``pd.get_dummies``
        Covariates to encode and scale.
    fit_scaler : bool
        When True, fit a new ``StandardScaler`` on the encoded data and store
        it in ``self.scalers[index]``. When False, reuse the scaler already
        stored under ``index``.
    index : hashable
        Key identifying which stored scaler to fit or reuse.

    Returns
    -------
    The scaled data as returned by the scaler's ``fit_transform`` /
    ``transform``.
    """
    encoded = pd.get_dummies(X)

    if hasattr(self, 'X_columns'):
        # Align to the column layout captured on the first call.
        encoded = encoded.reindex(columns=self.X_columns, fill_value=0)
    else:
        self.X_columns = encoded.columns

    if not fit_scaler:
        # Reuse the scaler fitted earlier for this index.
        return self.scalers[index].transform(encoded)

    fitted = StandardScaler()
    scaled = fitted.fit_transform(encoded)
    if not hasattr(self, 'scalers'):
        self.scalers = {}
    self.scalers[index] = fitted
    return scaled
def fit_model(self, index, X, y, model_type, balance):
X_train, _, y_train, _ = train_test_split(X, y, train_size=0.7, random_state=index)

Expand All @@ -81,10 +104,15 @@ def fit_model(self, index, X, y, model_type, balance):
else:
X_resampled, y_resampled = X_train, y_train

if model_type in ['linear', 'knn']:
X_processed = self.preprocess_data(X_resampled, fit_scaler=True, index=index)
else:
X_processed = X_resampled

if model_type == 'linear':
model = LogisticRegression(max_iter=100)
model.fit(X_resampled, y_resampled.iloc[:, 0])
accuracy = model.score(X_resampled, y_resampled)
model.fit(X_processed, y_resampled.iloc[:, 0])
accuracy = model.score(X_processed, y_resampled)
elif model_type == 'tree':
cat_features_indices = np.where(X_resampled.dtypes == 'object')[0]
model = CatBoostClassifier(iterations=100, depth=6,
Expand All @@ -94,6 +122,12 @@ def fit_model(self, index, X, y, model_type, balance):
logging_level='Silent')
model.fit(X_resampled, y_resampled.iloc[:, 0], plot=False)
accuracy = model.score(X_resampled, y_resampled)
elif model_type == 'knn':
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_processed, y_resampled.iloc[:, 0])
accuracy = model.score(X_processed, y_resampled)
else:
raise ValueError("Invalid model_type. Choose from 'linear', 'tree', or 'knn'.")
print(f"Model {index + 1}/{self.nmodels} trained. Accuracy: {accuracy:.2%}")
return {'model': model, 'accuracy': accuracy}

Expand Down Expand Up @@ -134,11 +168,20 @@ def predict_scores(self):
"""
model_preds = []

for m in self.models:
for idx, m in enumerate(self.models):
if self.model_type in ['linear', 'knn']:
X_processed = self.preprocess_data(self.X, fit_scaler=False, index=idx)
else:
X_processed = self.X

if self.model_type == 'linear':
preds = m.predict_proba(self.X)[:, 1]
preds = m.predict_proba(X_processed)[:, 1]
elif self.model_type == 'tree':
preds = m.predict(self.X, prediction_type='Probability')[:, 1]
elif self.model_type == 'knn':
preds = m.predict_proba(X_processed)[:, 1]
else:
raise ValueError("Invalid model_type. Choose from 'linear', 'tree', or 'knn'.")
model_preds.append(preds)

model_preds = np.array(model_preds)
Expand Down Expand Up @@ -202,19 +245,6 @@ def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10):
self.matched_data['match_id'] = match_ids
self.matched_data['record_id'] = self.matched_data.index

def select_from_design(self, cols):
    """
    Return the columns of ``self.X`` whose names start with a prefix in ``cols``.

    Parameters
    ----------
    cols : iterable of str
        Column-name prefixes to look up in ``self.X``.

    Returns
    -------
    pd.DataFrame
        The matching columns, concatenated side by side in the order the
        prefixes were given. An empty DataFrame when ``cols`` is empty.
    """
    frames = []
    for prefix in cols:
        # DataFrame.select() was removed in pandas 1.0; a boolean column
        # mask performs the same prefix filter on current pandas.
        mask = [str(name).startswith(prefix) for name in self.X.columns]
        frames.append(self.X.loc[:, mask])

    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=1, sort=True)

def balanced_sample(self, data=None):
    """
    Draw a class-balanced sample by downsampling the majority class.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Data to sample from; defaults to ``self.data`` when omitted.

    Returns
    -------
    pd.DataFrame
        ``len(minority)`` randomly sampled majority rows followed by all
        minority rows, re-indexed from 0, with rows containing NaN dropped.
    """
    # `not data` raises ValueError for a DataFrame (ambiguous truth value),
    # so test explicitly for the "argument omitted" sentinel.
    if data is None:
        data = self.data
    minor = data[data[self.yvar] == self.minority]
    major = data[data[self.yvar] == self.majority]
    return pd.concat([major.sample(len(minor)), minor], ignore_index=True).dropna()

def plot_scores(self):
"""
Plots the distribution of propensity scores before matching between
Expand All @@ -229,6 +259,7 @@ def plot_scores(self):
plt.title("Propensity Scores Before Matching")
plt.ylabel("Percentage (%)")
plt.xlabel("Scores")
plt.show()

def prop_test(self, col):
"""
Expand Down Expand Up @@ -336,6 +367,7 @@ def compare_continuous(self, save=False, return_table=False,plot_result = True):
std_diff_med_after, std_diff_mean_after))
ax2.legend(loc="lower right")
plt.xlim((0, np.percentile(xta.x, 99)))
plt.show()

test_results.append({
"var": col,
Expand Down Expand Up @@ -494,6 +526,7 @@ def tune_threshold(self, method, nmatches=1, rng=np.arange(0, .001, .0001)):
plt.ylabel("Proportion Retained")
plt.xlabel("Threshold")
plt.xticks(rng)
plt.show()

def record_frequency(self):
"""
Expand Down
Loading

0 comments on commit 78129bd

Please sign in to comment.