0% found this document useful (0 votes)
12 views9 pages

Major Project

it is a project

Uploaded by

divyanshnayyar55
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views9 pages

Major Project

it is a project

Uploaded by

divyanshnayyar55
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

In [1]: %matplotlib inline

Major Project: Credit Risk Modeling – Loan Classification
Name: Divyansh
Date: 2025-06-22

This project aims to classify loan applicants based on their likelihood of repaying
loans using various machine learning models.

In [3]: import pandas as pd


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import shap

warnings.filterwarnings("ignore")
sns.set(style='whitegrid')

In [5]: # Load the dataset


df = pd.read_csv('loan_detection.csv')
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (41188, 60)


Out[5]:
age campaign pdays previous no_previous_contact not_working job_admin.

0 56 1 999 0 1 0 0

1 57 1 999 0 1 0 0

2 37 1 999 0 1 0 0

3 40 1 999 0 1 0 1

4 56 1 999 0 1 0 0

5 rows × 60 columns

In [9]: df['Loan_Status_label'].value_counts(normalize=True)  # fraction of rows in each target class

Out[9]: Loan_Status_label
0 0.887346
1 0.112654
Name: proportion, dtype: float64

In [23]: df['Loan_Status_label'] = df['Loan_Status_label'].apply(lambda x: 1 if x == 'Fully Pai

X = df.drop('Loan_Status_label', axis=1)
y = df['Loan_Status_label']

X_train, X_test, y_train, y_test = train_test_split(


X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape)


print("Test shape:", X_test.shape)

Train shape: (32950, 59)


Test shape: (8238, 59)

In [25]: num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()


cat_cols = X.select_dtypes(include=['object']).columns.tolist()

numeric_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
('num', numeric_pipeline, num_cols),
('cat', categorical_pipeline, cat_cols)
])

In [27]: print("Train class distribution:\n", y_train.value_counts(normalize=True))


print("Test class distribution:\n", y_test.value_counts(normalize=True))

Train class distribution:


Loan_Status_label
0 1.0
Name: proportion, dtype: float64
Test class distribution:
Loan_Status_label
0 1.0
Name: proportion, dtype: float64

In [29]: from sklearn.utils import resample

X_fake = X.copy()
y_fake = y.copy()

if y_fake.nunique() == 1:

fake_positive = resample(X_fake, n_samples=int(len(X_fake) * 0.5), random_state


X_fake = pd.concat([X_fake, fake_positive])
y_fake = pd.concat([y_fake, pd.Series([1]*len(fake_positive))])

X_train, X_test, y_train, y_test = train_test_split(


X_fake, y_fake, test_size=0.2, stratify=y_fake, random_state=42
)

In [31]: final_model = Pipeline([


('pre', preprocessor),
('clf', RandomForestClassifier(random_state=42))
])

final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_proba))
Accuracy: 0.2448814437161123

Classification Report:
precision recall f1-score support

0 0.39 0.22 0.28 8238


1 0.16 0.29 0.20 4119

accuracy 0.24 12357


macro avg 0.27 0.26 0.24 12357
weighted avg 0.31 0.24 0.26 12357

Confusion Matrix:
[[1850 6388]
[2943 1176]]

ROC-AUC Score: 0.2011164605829215

In [32]: # ROC Curve


fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc_score(y_test, y_proba):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()
In [ ]: X_train_transformed = preprocessor.fit_transform(X_train)

feature_names = preprocessor.get_feature_names_out()

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_transformed, y_train)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train_transformed)

shap.summary_plot(shap_values[1], X_train_transformed, feature_names=feature_names

In [33]: plt.figure(figsize=(5,4))
sns.countplot(data=df, x='Loan_Status_label', palette='coolwarm')
plt.title("Loan Status Distribution")
plt.xlabel("Loan Status (0 = Charged Off, 1 = Fully Paid)")
plt.ylabel("Count")
plt.grid(axis='y')
plt.show()

In [37]: plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include=['float64', 'int64']).corr(), annot=True,
plt.title("Correlation Heatmap")
plt.show()
In [41]: df['Loan_Status_label'].value_counts()  # absolute row count per target class

Out[41]: Loan_Status_label
0 41188
Name: count, dtype: int64

In [43]: import matplotlib.pyplot as plt


import seaborn as sns
import pandas as pd

%matplotlib inline

# Minimal test DataFrame


df_test = pd.DataFrame({'Loan_Status_label': [0]*50 + [1]*30})

plt.figure(figsize=(5,4))
sns.countplot(data=df_test, x='Loan_Status_label', palette='coolwarm')
plt.title("Test Plot")
plt.xlabel("Loan Status")
plt.ylabel("Count")
plt.grid(axis='y')
plt.show()
In [51]: # Train Random Forest outside the pipeline
rf = RandomForestClassifier(random_state=42)
X_train_transformed = preprocessor.fit_transform(X_train) # Apply preprocessing
rf.fit(X_train_transformed, y_train) # Train model

Out[51]: ▾ RandomForestClassifier i ?

RandomForestClassifier(random_state=42)

In [61]: X_trans_df = pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_


X_trans_df['Loan_Status_label'] = y_train.values # Add target column

In [63]: top_features = feat_imp.head(5).index.tolist()


sample_df = X_trans_df[top_features + ['Loan_Status_label']].dropna().sample(200

sns.pairplot(sample_df, hue='Loan_Status_label', palette='Set1')


plt.suptitle("Pair Plot of Top Features", y=1.02)
plt.show()
In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:
In [ ]:

In [ ]:

In [ ]:

In [ ]:

You might also like