0% found this document useful (0 votes)

12 views9 pages

Major Project

it is a project

Uploaded by

divyanshnayyar55

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

12 views9 pages

Major Project

it is a project

Uploaded by

divyanshnayyar55

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 9

In [1]: %matplotlib inline

Major Project: Credit Risk Modeling –

Loan Classification
Name: Divyansh Date: 2025-06-22

This project aims to classify loan applicants based on their likelihood of repaying
loans using various machine learning models.

In [3]: import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")
sns.set(style='whitegrid')

In [5]: # Load the dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
# The printed shape and the head() preview are a quick check of what was read.

df = pd.read_csv('loan_detection.csv')
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (41188, 60)

Out[5]:
age campaign pdays previous no_previous_contact not_working job_admin.

0 56 1 999 0 1 0 0

1 57 1 999 0 1 0 0

2 37 1 999 0 1 0 0

3 40 1 999 0 1 0 1

4 56 1 999 0 1 0 0

5 rows × 60 columns

In [9]: df['Loan_Status_label'].value_counts(normalize=True)

Out[9]: Loan_Status_label
0 0.887346
1 0.112654
Name: proportion, dtype: float64

In [23]: df['Loan_Status_label'] = df['Loan_Status_label'].apply(lambda x: 1 if x == 'Fully Pai

X = df.drop('Loan_Status_label', axis=1)
y = df['Loan_Status_label']

X_train, X_test, y_train, y_test = train_test_split(

X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape)

print("Test shape:", X_test.shape)

Train shape: (32950, 59)

Test shape: (8238, 59)

In [25]: num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

cat_cols = X.select_dtypes(include=['object']).columns.tolist()

numeric_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
('num', numeric_pipeline, num_cols),
('cat', categorical_pipeline, cat_cols)
])

In [27]: print("Train class distribution:\n", y_train.value_counts(normalize=True))

print("Test class distribution:\n", y_test.value_counts(normalize=True))

Train class distribution:

Loan_Status_label
0 1.0
Name: proportion, dtype: float64
Test class distribution:
Loan_Status_label
0 1.0
Name: proportion, dtype: float64

In [29]: from sklearn.utils import resample

X_fake = X.copy()
y_fake = y.copy()

if y_fake.nunique() == 1:

fake_positive = resample(X_fake, n_samples=int(len(X_fake) * 0.5), random_state

X_fake = pd.concat([X_fake, fake_positive])
y_fake = pd.concat([y_fake, pd.Series([1]*len(fake_positive))])

X_train, X_test, y_train, y_test = train_test_split(

X_fake, y_fake, test_size=0.2, stratify=y_fake, random_state=42
)

In [31]: final_model = Pipeline([

('pre', preprocessor),
('clf', RandomForestClassifier(random_state=42))
])

final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_proba))
Accuracy: 0.2448814437161123

Classification Report:
precision recall f1-score support

0 0.39 0.22 0.28 8238

1 0.16 0.29 0.20 4119

accuracy 0.24 12357

macro avg 0.27 0.26 0.24 12357
weighted avg 0.31 0.24 0.26 12357

Confusion Matrix:
[[1850 6388]
[2943 1176]]

ROC-AUC Score: 0.2011164605829215

In [32]: # ROC Curve

fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc_score(y_test, y_proba):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()
In [ ]: X_train_transformed = preprocessor.fit_transform(X_train)

feature_names = preprocessor.get_feature_names_out()

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_transformed, y_train)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train_transformed)

shap.summary_plot(shap_values[1], X_train_transformed, feature_names=feature_names

In [33]: plt.figure(figsize=(5,4))
sns.countplot(data=df, x='Loan_Status_label', palette='coolwarm')
plt.title("Loan Status Distribution")
plt.xlabel("Loan Status (0 = Charged Off, 1 = Fully Paid)")
plt.ylabel("Count")
plt.grid(axis='y')
plt.show()

In [37]: plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include=['float64', 'int64']).corr(), annot=True,
plt.title("Correlation Heatmap")
plt.show()
In [41]: df['Loan_Status_label'].value_counts()

Out[41]: Loan_Status_label
0 41188
Name: count, dtype: int64

In [43]: import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd

%matplotlib inline

# Minimal test DataFrame

df_test = pd.DataFrame({'Loan_Status_label': [0]*50 + [1]*30})

plt.figure(figsize=(5,4))
sns.countplot(data=df_test, x='Loan_Status_label', palette='coolwarm')
plt.title("Test Plot")
plt.xlabel("Loan Status")
plt.ylabel("Count")
plt.grid(axis='y')
plt.show()
In [51]: # Train Random Forest outside the pipeline
rf = RandomForestClassifier(random_state=42)
X_train_transformed = preprocessor.fit_transform(X_train) # Apply preprocessing
rf.fit(X_train_transformed, y_train) # Train model

Out[51]: ▾ RandomForestClassifier i ?

RandomForestClassifier(random_state=42)

In [61]: X_trans_df = pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_

X_trans_df['Loan_Status_label'] = y_train.values # Add target column

In [63]: top_features = feat_imp.head(5).index.tolist()

sample_df = X_trans_df[top_features + ['Loan_Status_label']].dropna().sample(200

sns.pairplot(sample_df, hue='Loan_Status_label', palette='Set1')

plt.suptitle("Pair Plot of Top Features", y=1.02)
plt.show()
In [ ]:

In [ ]:

In [ ]:
In [ ]:

In [ ]:

Telecom Churn Proj
No ratings yet
Telecom Churn Proj
4 pages
Random Forest
No ratings yet
Random Forest
8 pages
Loan Default Prediction System 1753830667
No ratings yet
Loan Default Prediction System 1753830667
11 pages
05 E RandomForest LoanData
No ratings yet
05 E RandomForest LoanData
8 pages
Loan Approval Prediction Python
No ratings yet
Loan Approval Prediction Python
6 pages
Assignment
No ratings yet
Assignment
5 pages
Random Forest Classifier on Banking Dataset
No ratings yet
Random Forest Classifier on Banking Dataset
7 pages
Hands-On Activity 3.3 Random Forest Mantaring - Ipynb - Mantaring
No ratings yet
Hands-On Activity 3.3 Random Forest Mantaring - Ipynb - Mantaring
13 pages
Introduction of Phase 4
No ratings yet
Introduction of Phase 4
14 pages
Step by Step Data Processing For ML Project
No ratings yet
Step by Step Data Processing For ML Project
16 pages
4-1 Fine-Tuning Your Model
No ratings yet
4-1 Fine-Tuning Your Model
60 pages
Random Forest
100% (1)
Random Forest
11 pages
Case Study Stock Market Prediciton
No ratings yet
Case Study Stock Market Prediciton
10 pages
Loan Default Logistics Regression
No ratings yet
Loan Default Logistics Regression
6 pages
Online Payment Fraud Detection Using Machine Learning
No ratings yet
Online Payment Fraud Detection Using Machine Learning
2 pages
Detect Fake Profiles in Online Social Networks Using Support Vector Machine
No ratings yet
Detect Fake Profiles in Online Social Networks Using Support Vector Machine
8 pages
MLfull
No ratings yet
MLfull
29 pages
Final-12-Lab Programs
No ratings yet
Final-12-Lab Programs
30 pages
Binary Classifier Evaluation Guide
No ratings yet
Binary Classifier Evaluation Guide
12 pages
Loan Approval
No ratings yet
Loan Approval
12 pages
Bacdeaf 23032025 115708 Split 1
No ratings yet
Bacdeaf 23032025 115708 Split 1
37 pages
Import As Import As From Import From Import From Import From Import
No ratings yet
Import As Import As From Import From Import From Import From Import
4 pages
Credit Risk Modeling in Python Chapter2
100% (1)
Credit Risk Modeling in Python Chapter2
36 pages
Classification
No ratings yet
Classification
3 pages
Credit Card Loan Approval Demo
No ratings yet
Credit Card Loan Approval Demo
3 pages
Ensembles Models and Decision Tree
No ratings yet
Ensembles Models and Decision Tree
21 pages
Lab4 - Jupyter Notebook
No ratings yet
Lab4 - Jupyter Notebook
7 pages
23BCE7092 ML Lab Assignment
No ratings yet
23BCE7092 ML Lab Assignment
14 pages
Import As Import As From Import From Import From Import From Import
No ratings yet
Import As Import As From Import From Import From Import From Import
6 pages
Decision Tree & Random Forest Guide
No ratings yet
Decision Tree & Random Forest Guide
7 pages
DA PRA WEEK 13 (Random Forest) - 054551
No ratings yet
DA PRA WEEK 13 (Random Forest) - 054551
12 pages
Project
No ratings yet
Project
31 pages
Assign 4 8057
No ratings yet
Assign 4 8057
7 pages
Code ExerciseModelSelection
100% (1)
Code ExerciseModelSelection
19 pages
Da Lab Mannual
No ratings yet
Da Lab Mannual
25 pages
Logistic Regression
No ratings yet
Logistic Regression
8 pages
Experiment 10
No ratings yet
Experiment 10
2 pages
Assgn 06 ML - Ipynb - Colab
No ratings yet
Assgn 06 ML - Ipynb - Colab
5 pages
Fraud Detection with Random Forest
No ratings yet
Fraud Detection with Random Forest
2 pages
NF Assighment4
No ratings yet
NF Assighment4
5 pages
Python Code For Loan Default Prediction
No ratings yet
Python Code For Loan Default Prediction
4 pages
AUC and The ROC Curve in Machine Learning - DataCamp
No ratings yet
AUC and The ROC Curve in Machine Learning - DataCamp
12 pages
Mllla
No ratings yet
Mllla
2 pages
ROC and AUC Practical Implementation PDF
No ratings yet
ROC and AUC Practical Implementation PDF
6 pages
Decision Tree
No ratings yet
Decision Tree
5 pages
AAM 6th Prac
No ratings yet
AAM 6th Prac
3 pages
Heart Disease Prediction Guide
100% (1)
Heart Disease Prediction Guide
73 pages
DWDM Lab 3
No ratings yet
DWDM Lab 3
10 pages
Machine Learning
No ratings yet
Machine Learning
16 pages
Random Forest
No ratings yet
Random Forest
3 pages
CODE
No ratings yet
CODE
4 pages
Ai Lab PRGM
No ratings yet
Ai Lab PRGM
10 pages
Setup: This Notebook Contains All The Sample Code and Solutions To The Exercises in Chapter 7
No ratings yet
Setup: This Notebook Contains All The Sample Code and Solutions To The Exercises in Chapter 7
23 pages
TP - Ipynb - Colab
No ratings yet
TP - Ipynb - Colab
6 pages
Last Day
No ratings yet
Last Day
35 pages
PA v0.25
No ratings yet
PA v0.25
18 pages
Decision Tree - Jupyter Notebook
No ratings yet
Decision Tree - Jupyter Notebook
4 pages
Top 10 Erp
No ratings yet
Top 10 Erp
86 pages
CV - Ilaha Asadova
No ratings yet
CV - Ilaha Asadova
1 page
MCQ TQM
No ratings yet
MCQ TQM
4 pages
Keratograph 5m en PDF
No ratings yet
Keratograph 5m en PDF
16 pages
The Lements Orthodontic Philosophy Courses: Registration Information
No ratings yet
The Lements Orthodontic Philosophy Courses: Registration Information
8 pages
Financial Accounting and Reporting - QUIZ 5
100% (1)
Financial Accounting and Reporting - QUIZ 5
4 pages
CSS Module 1-Grade 12
No ratings yet
CSS Module 1-Grade 12
41 pages
Westside Story
No ratings yet
Westside Story
6 pages
WEF The Net Zero Challenge
100% (1)
WEF The Net Zero Challenge
41 pages
SU-Field and 76 Solutions
No ratings yet
SU-Field and 76 Solutions
90 pages
Contoh Business Plan Indonesia
No ratings yet
Contoh Business Plan Indonesia
9 pages
Carretero and Lagaly 2007
No ratings yet
Carretero and Lagaly 2007
3 pages
Employer's Annual Federal Unemployment (FUTA) Tax Return
No ratings yet
Employer's Annual Federal Unemployment (FUTA) Tax Return
4 pages
Novak Djokovic's Biography
No ratings yet
Novak Djokovic's Biography
13 pages
Love & Pair Co.: Lovebird Breeding & Sales
No ratings yet
Love & Pair Co.: Lovebird Breeding & Sales
88 pages
Final Examination in Eng121
No ratings yet
Final Examination in Eng121
24 pages
Tle10 Eim10 Q1 M5
No ratings yet
Tle10 Eim10 Q1 M5
12 pages
AW109SP QRH - Issue 2 - Rev.2
100% (1)
AW109SP QRH - Issue 2 - Rev.2
380 pages
Enrich Marital Satisfaction Scale
100% (1)
Enrich Marital Satisfaction Scale
22 pages
Unit 4 SportsReporting
No ratings yet
Unit 4 SportsReporting
22 pages
Mini M-70 Series Industrial Spray Nozzle: Specifications
No ratings yet
Mini M-70 Series Industrial Spray Nozzle: Specifications
2 pages
Gasket Brochure
100% (1)
Gasket Brochure
8 pages
Global Economic Impact of COVID-19
100% (1)
Global Economic Impact of COVID-19
15 pages
Pizza Hut & Domino's Branding Insights
No ratings yet
Pizza Hut & Domino's Branding Insights
24 pages
Laboratory #4: Control Charts For Variable Data (X-Bar and R) Purpose: Materials
No ratings yet
Laboratory #4: Control Charts For Variable Data (X-Bar and R) Purpose: Materials
7 pages
Genetics and Genomics Chapter 4 Questions & Answers Multiple Choice Questions
No ratings yet
Genetics and Genomics Chapter 4 Questions & Answers Multiple Choice Questions
23 pages
Traditional Witches Silence Hex
No ratings yet
Traditional Witches Silence Hex
2 pages
Environmental Movements in India
No ratings yet
Environmental Movements in India
2 pages
Math 1: Learning Philippine Money
No ratings yet
Math 1: Learning Philippine Money
7 pages
CFA Level 1 FRA
No ratings yet
CFA Level 1 FRA
17 pages

Major Project

Uploaded by

Major Project

Uploaded by

In [1]: %matplotlib inline

Major Project: Credit Risk Modeling –

In [3]: import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

In [5]: # Load the dataset

Dataset shape: (41188, 60)

In [23]: df['Loan_Status_label'] = df['Loan_Status_label'].apply(lambda x: 1 if x == 'Fully Pai

X_train, X_test, y_train, y_test = train_test_split(

print("Train shape:", X_train.shape)

Train shape: (32950, 59)

In [25]: num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [27]: print("Train class distribution:\n", y_train.value_counts(normalize=True))

Train class distribution:

In [29]: from sklearn.utils import resample

fake_positive = resample(X_fake, n_samples=int(len(X_fake) * 0.5), random_state

X_train, X_test, y_train, y_test = train_test_split(

In [31]: final_model = Pipeline([

0 0.39 0.22 0.28 8238

accuracy 0.24 12357

ROC-AUC Score: 0.2011164605829215

In [32]: # ROC Curve

shap.summary_plot(shap_values[1], X_train_transformed, feature_names=feature_names

In [43]: import matplotlib.pyplot as plt

# Minimal test DataFrame

In [61]: X_trans_df = pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_

In [63]: top_features = feat_imp.head(5).index.tolist()

sns.pairplot(sample_df, hue='Loan_Status_label', palette='Set1')

You might also like