# bankruptcy (15).py

# -*- coding: utf-8 -*-
"""bankruptcy.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1WAf9nRlRHjU5zhYlm-IF-CfppRjedjKK

# Bankruptcy prediction

## Overview
This notebook explores bankruptcy prediction with machine learning. It trains several classifiers (logistic regression, random forest, k-nearest neighbors, support vector machine, CatBoost and XGBoost) to predict bankruptcy from financial data.

## Dataset

The dataset used in this analysis is sourced from Kaggle: [Company Bankruptcy Prediction](https://www.kaggle.com/datasets/fedesoriano/company-bankruptcy-prediction). It comprises various financial attributes of companies along with a binary target variable indicating bankruptcy.
"""

# Imports

import numpy as np
import pandas as pd

"""## Dataset"""

# Download the CSV with the standard library instead of the IPython-only
# `!wget` shell magic, so this file is valid Python outside a notebook too.
from urllib.request import urlretrieve

DATA_URL = 'https://drive.google.com/uc?export=download&id=1-qkqMYZ5O-ZkpEkwbY1Fwexiz6Y4KYc0'
urlretrieve(DATA_URL, 'data.csv')

dataset = pd.read_csv('data.csv')
dataset.head()

from scipy.stats import zscore

# Absolute z-score above which a single feature value is treated as an outlier.
threshold = 3

# Every column except the first is treated as a feature.
features_only = dataset.iloc[:, 1:]
z_scores = np.abs(zscore(features_only))

# A row is flagged as soon as any one of its features exceeds the threshold.
outlier_mask = (z_scores > threshold).any(axis=1)
keep_mask = ~outlier_mask

clean_features = features_only[keep_mask]
# Apply the same mask to the first column so target and features stay aligned.
clean_target = dataset.iloc[:, 0][keep_mask]
clean_dataset = pd.concat([clean_target, clean_features], axis=1)

print(f"Original dataset shape: {dataset.shape}")
print(f"Cleaned dataset shape: {clean_dataset.shape}")

# The filtered frame is only reported here; uncomment the next line to use it
# in place of the full dataset.
#dataset = clean_dataset

# First column -> label vector, remaining columns -> feature matrix
# (both extracted as NumPy arrays).
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:].values

# Report how many rows fall into each class.
y_series = pd.Series(y)
print("Class Balance:", y_series.value_counts(), sep="\n")

"""## Correlation HeatMap"""

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the feature columns only (the first column
# of the dataset is left out).
feature_columns = dataset.iloc[:, 1:]
correlation_matrix = feature_columns.corr()

# Draw the annotated heatmap on a large canvas so each cell's value fits.
plt.figure(figsize=(60, 60))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

"""## Univariate Selection
Univariate selection methods evaluate each feature individually against the target variable. One such method, used here, scores every feature with the ANOVA F-test and keeps the k highest-scoring features.
"""

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

column_names = dataset.columns[1:]

# Hold out the test rows BEFORE scoring features. Fitting the selector on the
# full dataset would let the test labels influence which features are kept
# (data leakage) and make the test metrics optimistic.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

selector = SelectKBest(f_classif, k=60)  # Adjust 'k'
selector.fit(X_train_all, y_train)

# Boolean mask over the feature columns: True where the feature was selected.
mask = selector.get_support()
new_features = [name for name, keep in zip(column_names, mask) if keep]

print(new_features)

"""## Train / Test Split"""

X_df = pd.DataFrame(X, columns=column_names)

selected_X_df = X_df[new_features]  # Filter attributes

selected_X = selected_X_df.values

# Keep only the selected columns in both splits, using the selector that was
# fitted on the training rows.
X_train = selector.transform(X_train_all)
X_test = selector.transform(X_test_all)

print(X_train)

print(X_test)

print(y_train)

print(y_test)

"""## Feature Scaling"""

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

print(X_train)

print(X_test)

"""# Logistic Regression

### Imbalanced prediction
"""

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Baseline: logistic regression on the training split as-is (no class
# weighting, no resampling).
LRclassifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Print dataset balance
y_series = pd.Series(y_train)
total_instances = len(y_train)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

# Evaluate on the held-out test split.
y_pred = LRclassifier.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)
print("\nAccuracy Score:", accuracy)

"""### Class weights"""

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print dataset balance
y_series = pd.Series(y_train)
total_instances = len(y_train)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

# Same model as the baseline, but class 0 is given a lower weight (0.4) than
# class 1 (1) when fitting.
lr_class_weight = {0: 0.4, 1: 1}
LRclassifier = LogisticRegression(max_iter=1000, class_weight=lr_class_weight)
LRclassifier.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = LRclassifier.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)
print("\nAccuracy Score:", accuracy)

"""### Synthetic Minority Over-sampling Technique (SMOTE)

"""

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Oversample the training split with SMOTE's default sampling strategy.
# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Print dataset balance (after resampling)
y_series = pd.Series(y_train_smote)
total_instances = len(y_train_smote)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

LRclassifier = LogisticRegression(max_iter=1000).fit(X_train_smote, y_train_smote)

# Evaluate on the held-out test split.
y_pred = LRclassifier.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)
print("\nAccuracy Score:", accuracy)

"""### Synthetic Minority Over-sampling Technique (SMOTE), minority/majority ratio 0.3

"""

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Oversample the training split with SMOTE using sampling_strategy=0.3
# instead of the default. The test split is left untouched.
smote = SMOTE(sampling_strategy=0.3, random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Print dataset balance (after resampling)
y_series = pd.Series(y_train_smote)
total_instances = len(y_train_smote)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

LRclassifier = LogisticRegression(max_iter=1000).fit(X_train_smote, y_train_smote)

# Evaluate on the held-out test split.
y_pred = LRclassifier.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)
print("\nAccuracy Score:", accuracy)

"""### Combination of oversampling and undersampling"""

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Two-stage rebalancing of the training split: SMOTE oversampling first
# (sampling_strategy=0.3), then random undersampling (sampling_strategy=0.4).
smote = SMOTE(sampling_strategy=0.3, random_state=1)
rus = RandomUnderSampler(sampling_strategy=0.4, random_state=1)

X_oversampled, y_oversampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = rus.fit_resample(X_oversampled, y_oversampled)

# Print dataset balance (after both resampling stages)
y_series = pd.Series(y_train_resampled)
total_instances = len(y_train_resampled)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

# Fit logistic regression on the rebalanced training data.
LRclassifier = LogisticRegression(max_iter=1000).fit(X_train_resampled, y_train_resampled)

# Evaluate on the untouched test split.
y_pred = LRclassifier.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)
print("\nAccuracy Score:", accuracy)

"""### Cross validation"""

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_validate, StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Resampling has to happen inside each fold. Cross-validating data that was
# already resampled puts synthetic SMOTE points derived from validation rows
# into the training folds and scores the model on a rebalanced (unrealistic)
# class distribution, which inflates the metrics. The imblearn Pipeline
# applies the samplers only when fitting, so each validation fold keeps the
# original class balance.
lr_cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=1, sampling_strategy=0.3)),
    ("rus", RandomUnderSampler(random_state=1, sampling_strategy=0.4)),
    ("clf", LRclassifier),  # cloned per fold; only its hyper-parameters are used
])

# One pass computes both metrics, instead of fitting every fold twice.
lr_cv_results = cross_validate(
    lr_cv_pipeline, X_train, y_train, cv=kf, scoring=['accuracy', 'f1']
)
scores = lr_cv_results['test_accuracy']
scores_f1 = lr_cv_results['test_f1']

print("Cross-validated scores:", scores)
print("Average score:", scores.mean())

print("Cross-validated F1 scores:", scores_f1)
print("Average F1 score:", scores_f1.mean())

"""# Random Forest Classifier

### Using class weights
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

# Random forest fitted on the training split without resampling; the class
# imbalance is addressed through class weights only.
rf_params = {
    "n_estimators": 100,
    "random_state": 1,
    "class_weight": {0: 0.6, 1: 1.0},
}
RFclassifier = RandomForestClassifier(**rf_params)
RFclassifier.fit(X_train, y_train)
y_pred_RF = RFclassifier.predict(X_test)

# Print dataset balance
y_series = pd.Series(y_train)
total_instances = len(y_train)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

# Print Random Forest results
rf_matrix = confusion_matrix(y_test, y_pred_RF)
rf_report = classification_report(y_test, y_pred_RF)
rf_accuracy = accuracy_score(y_test, y_pred_RF)

print("\nRandom Forest Results:")
print("Confusion Matrix:\n", rf_matrix)
print("\nClassification Report:\n", rf_report)
print("\nAccuracy Score:", rf_accuracy)

"""### Using SMOTE and RandomUnderSampler


"""

from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Two-stage rebalancing of the training split: SMOTE oversampling first
# (sampling_strategy=0.3), then random undersampling (sampling_strategy=0.4).
smote = SMOTE(sampling_strategy=0.3, random_state=1)
rus = RandomUnderSampler(sampling_strategy=0.4, random_state=1)

X_oversampled, y_oversampled = smote.fit_resample(X_train, y_train)
X_train_forest, y_train_forest = rus.fit_resample(X_oversampled, y_oversampled)

# Print dataset balance (after both resampling stages)
y_series = pd.Series(y_train_forest)
total_instances = len(y_train_forest)
class_counts = y_series.value_counts()
class_percentages = (class_counts / total_instances) * 100
print("Class Counts:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")

# Class weights are applied on top of the resampled data.
rf_params = {
    "n_estimators": 100,
    "random_state": 1,
    "class_weight": {0: 0.6, 1: 1.0},
}
RFclassifier = RandomForestClassifier(**rf_params)
RFclassifier.fit(X_train_forest, y_train_forest)
y_pred_RF = RFclassifier.predict(X_test)

rf_matrix = confusion_matrix(y_test, y_pred_RF)
rf_report = classification_report(y_test, y_pred_RF)
rf_accuracy = accuracy_score(y_test, y_pred_RF)

print("\nRandom Forest Results:")
print("Confusion Matrix:\n", rf_matrix)
print("\nClassification Report:\n", rf_report)
print("\nAccuracy Score:", rf_accuracy)

"""## Cross validation"""

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_validate, StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Resample inside each fold rather than cross-validating pre-resampled data:
# otherwise synthetic SMOTE points derived from validation rows end up in the
# training folds and the scores are inflated. The imblearn Pipeline applies
# the samplers only when fitting, so validation folds keep the original
# class balance.
rf_cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=1, sampling_strategy=0.3)),
    ("rus", RandomUnderSampler(random_state=1, sampling_strategy=0.4)),
    ("clf", RFclassifier),  # cloned per fold; only its hyper-parameters are used
])

# One pass computes both metrics, instead of fitting every fold twice.
rf_cv_results = cross_validate(
    rf_cv_pipeline, X_train, y_train, cv=kf, scoring=['accuracy', 'f1']
)
scores = rf_cv_results['test_accuracy']
scores_f1 = rf_cv_results['test_f1']

print("Cross-validated scores:", scores)
print("Average score:", scores.mean())

print("Cross-validated F1 scores:", scores_f1)
print("Average F1 score:", scores_f1.mean())

"""## k-Nearest Neighbors (k-NN)


"""

from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Two-stage rebalancing of the training split: SMOTE oversampling first
# (sampling_strategy=0.3), then random undersampling (sampling_strategy=0.4).
smote = SMOTE(sampling_strategy=0.3, random_state=1)
rus = RandomUnderSampler(sampling_strategy=0.4, random_state=1)

X_oversampled, y_oversampled = smote.fit_resample(X_train, y_train)
X_train_knn, y_train_knn = rus.fit_resample(X_oversampled, y_oversampled)

# 5-nearest-neighbours classifier fitted on the rebalanced training data.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train_knn)
y_pred_knn = knn.predict(X_test)

knn_matrix = confusion_matrix(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
knn_accuracy = accuracy_score(y_test, y_pred_knn)

print("\nk-NN Results:")
print("Confusion Matrix:\n", knn_matrix)
print("\nClassification Report:\n", knn_report)
print("\nAccuracy Score:", knn_accuracy)

"""## Cross-Validation k-NN"""

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

knn = KNeighborsClassifier(n_neighbors=5)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Resample inside each fold rather than cross-validating pre-resampled data.
# With k-NN the leak is especially strong: a synthetic SMOTE point in the
# training fold can be a near-copy of a validation row. The imblearn Pipeline
# applies the samplers only when fitting, so validation folds keep the
# original class balance.
knn_cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=1, sampling_strategy=0.3)),
    ("rus", RandomUnderSampler(random_state=1, sampling_strategy=0.4)),
    ("clf", knn),
])

# One pass computes both metrics, instead of fitting every fold twice.
knn_cv_results = cross_validate(
    knn_cv_pipeline, X_train, y_train, cv=kf, scoring=['accuracy', 'f1']
)
knn_scores = knn_cv_results['test_accuracy']
knn_f1_scores = knn_cv_results['test_f1']

print("k-NN Cross-Validated Scores:", knn_scores)
print("k-NN Average Score:", knn_scores.mean())
print("k-NN Cross-Validated F1 Scores:", knn_f1_scores)
print("k-NN Average F1 Score:", knn_f1_scores.mean())

"""## Support Vector Machine (SVM)

"""

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Build the samplers in this cell (same settings as the other model sections)
# instead of silently reusing `smote` / `rus` objects left over from an
# earlier cell; this keeps the cell runnable on its own.
smote = SMOTE(random_state=1, sampling_strategy=0.3)
rus = RandomUnderSampler(random_state=1, sampling_strategy=0.4)

# Combine oversampling and undersampling (training split only)
X_train_svc, y_train_svc = smote.fit_resample(X_train, y_train)
X_train_svc, y_train_svc = rus.fit_resample(X_train_svc, y_train_svc)


# SVM with linear kernel
svm_classifier = SVC(kernel='linear', C=1.0, random_state=1)
svm_classifier.fit(X_train_svc, y_train_svc)

# Evaluate on the untouched test split.
y_pred_svm = svm_classifier.predict(X_test)

print("\nSVM Results:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_svm))

"""## Cross-Validation SVM"""

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, StratifiedKFold

svm_classifier = SVC(kernel='linear', C=1.0, random_state=1)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Resample inside each fold rather than cross-validating pre-resampled data:
# otherwise synthetic SMOTE points derived from validation rows end up in the
# training folds and the scores are inflated. The imblearn Pipeline applies
# the samplers only when fitting, so validation folds keep the original
# class balance.
svm_cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=1, sampling_strategy=0.3)),
    ("rus", RandomUnderSampler(random_state=1, sampling_strategy=0.4)),
    ("clf", svm_classifier),
])

# One pass computes both metrics, instead of fitting every fold twice.
svm_cv_results = cross_validate(
    svm_cv_pipeline, X_train, y_train, cv=kf, scoring=['accuracy', 'f1']
)
svm_scores = svm_cv_results['test_accuracy']
svm_f1_scores = svm_cv_results['test_f1']

print("SVM Cross-Validated Scores:", svm_scores)
print("SVM Average Score:", svm_scores.mean())
print("SVM Cross-Validated F1 Scores:", svm_f1_scores)
print("SVM Average F1 Score:", svm_f1_scores.mean())

"""### CatBoost"""

# Install CatBoost
# Uses pip through the running interpreter instead of the IPython-only
# `!pip` shell magic, so this file is valid Python outside a notebook too.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"])

from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Build the samplers in this cell (same settings as the other model sections)
# instead of silently reusing `smote` / `rus` objects left over from an
# earlier cell; this keeps the cell runnable on its own.
smote = SMOTE(random_state=1, sampling_strategy=0.3)
rus = RandomUnderSampler(random_state=1, sampling_strategy=0.4)

# Combine oversampling and undersampling (training split only)
X_train_cat, y_train_cat = smote.fit_resample(X_train, y_train)
X_train_cat, y_train_cat = rus.fit_resample(X_train_cat, y_train_cat)

catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_seed=1, verbose=False)

catboost_model.fit(X_train_cat, y_train_cat)
# Evaluate on the untouched test split.
y_pred_catboost = catboost_model.predict(X_test)

print("\nCatBoost Results:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))
print("\nClassification Report:\n", classification_report(y_test, y_pred_catboost))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_catboost))

"""## Cross-Validation CatBoost"""

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_validate, StratifiedKFold
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_seed=1, verbose=False)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Resample inside each fold rather than cross-validating pre-resampled data:
# otherwise synthetic SMOTE points derived from validation rows end up in the
# training folds and the scores are inflated. The imblearn Pipeline applies
# the samplers only when fitting, so validation folds keep the original
# class balance.
catboost_cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=1, sampling_strategy=0.3)),
    ("rus", RandomUnderSampler(random_state=1, sampling_strategy=0.4)),
    ("clf", catboost_model),
])

# Perform cross-validation for CatBoost. One pass computes both metrics,
# which halves the number of 1000-iteration fits compared with scoring
# accuracy and F1 in two separate runs.
catboost_cv_results = cross_validate(
    catboost_cv_pipeline, X_train, y_train, cv=kf, scoring=['accuracy', 'f1']
)
catboost_scores = catboost_cv_results['test_accuracy']
catboost_f1_scores = catboost_cv_results['test_f1']

print("CatBoost Cross-Validated Scores:", catboost_scores)
print("CatBoost Average Score:", catboost_scores.mean())
print("CatBoost Cross-Validated F1 Scores:", catboost_f1_scores)
print("CatBoost Average F1 Score:", catboost_f1_scores.mean())

"""### XGBoost"""

# Install XGBoost
# Uses pip through the running interpreter instead of the IPython-only
# `!pip` shell magic, so this file is valid Python outside a notebook too.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Build the samplers in this cell (same settings as the other model sections)
# instead of silently reusing `smote` / `rus` objects left over from an
# earlier cell; this keeps the cell runnable on its own.
smote = SMOTE(random_state=1, sampling_strategy=0.3)
rus = RandomUnderSampler(random_state=1, sampling_strategy=0.4)

# Combine oversampling and undersampling (training split only)
X_train_xboost, y_train_xboost = smote.fit_resample(X_train, y_train)
X_train_xboost, y_train_xboost = rus.fit_resample(X_train_xboost, y_train_xboost)

# NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost
# releases - confirm against the installed version before removing it.
xgboost_model = XGBClassifier(n_estimators=1000, learning_rate=0.1, max_depth=6, random_state=1, use_label_encoder=False)

xgboost_model.fit(X_train_xboost, y_train_xboost)
# Evaluate on the untouched test split.
y_pred_xgboost = xgboost_model.predict(X_test)

print("\nXGBoost Results:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgboost))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgboost))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_xgboost))

"""## Cross-Validation XGBoost"""

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_validate, StratifiedKFold
from xgboost import XGBClassifier

xgboost_model = XGBClassifier(n_estimators=1000, learning_rate=0.1, max_depth=6, random_state=1, use_label_encoder=False)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Resample inside each fold rather than cross-validating pre-resampled data:
# otherwise synthetic SMOTE points derived from validation rows end up in the
# training folds and the scores are inflated. The imblearn Pipeline applies
# the samplers only when fitting, so validation folds keep the original
# class balance.
xgboost_cv_pipeline = Pipeline([
    ("smote", SMOTE(random_state=1, sampling_strategy=0.3)),
    ("rus", RandomUnderSampler(random_state=1, sampling_strategy=0.4)),
    ("clf", xgboost_model),
])

# Perform cross-validation for XGBoost. One pass computes both metrics,
# which halves the number of 1000-tree fits compared with scoring accuracy
# and F1 in two separate runs.
xgboost_cv_results = cross_validate(
    xgboost_cv_pipeline, X_train, y_train, cv=kf, scoring=['accuracy', 'f1']
)
xgboost_scores = xgboost_cv_results['test_accuracy']
xgboost_f1_scores = xgboost_cv_results['test_f1']

print("XGBoost Cross-Validated Scores:", xgboost_scores)
print("XGBoost Average Score:", xgboost_scores.mean())
print("XGBoost Cross-Validated F1 Scores:", xgboost_f1_scores)
print("XGBoost Average F1 Score:", xgboost_f1_scores.mean())