In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Exploratory Data Analysis
In [2]: df = pd.read_csv('banking_data.csv')
# Data exploration
df.info() # Analyze the datatypes of every column
df.describe() # Analyze the statistics of numerical columns
# Remove redundant column
df.drop(columns='marital_status', axis = 1, inplace = True)
# Drop the missing data
df.dropna(inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45216 entries, 0 to 45215
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 45216 non-null int64
1 job 45216 non-null object
2 marital 45213 non-null object
3 marital_status 45213 non-null object
4 education 45213 non-null object
5 default 45216 non-null object
6 balance 45216 non-null int64
7 housing 45216 non-null object
8 loan 45216 non-null object
9 contact 45216 non-null object
10 day 45216 non-null int64
11 month 45216 non-null object
12 day_month 45216 non-null object
13 duration 45216 non-null int64
14 campaign 45216 non-null int64
15 pdays 45216 non-null int64
16 previous 45216 non-null int64
17 poutcome 45216 non-null object
18 y 45216 non-null object
dtypes: int64(7), object(12)
memory usage: 6.6+ MB
Plot the Distributions
In [3]: # Distribution of Age among clients
plt.hist(df['age'])
plt.xlabel('Age of clients')
plt.ylabel('Number of clients')
plt.title('Age distribution of clients')
plt.show()
# Bar plot for job type
plt.figure(figsize=(6,6))
df['job'].value_counts().sort_values(ascending=False).plot(kind = 'barh')
plt.title('Distribution of Job Types')
plt.xlabel('Number of clients')
plt.ylabel('Job Type')
plt.show()
# Pie chart for marital status distribution
plt.pie(df['marital'].value_counts(), labels = df['marital'].unique(), autopct='%1.1f%%', startangle=90, shadow = True)
plt.title('Distribution of marital status')
plt.show()
# Bar plot for level of education among clients
plt.figure(figsize=(6,6))
df['education'].value_counts().plot(kind = 'bar')
plt.title('Distribution of Education')
plt.xlabel('Level of education')
plt.ylabel('Number of clients')
plt.xticks(rotation = 0)
plt.show()
# Pie chart for clients with default credit
plt.pie(df['default'].value_counts(), labels = df['default'].unique(), autopct='%1.1f%%', startangle=90, shadow = True)
plt.title('Proportion of clients with default credit')
plt.show()
# Pie chart for clients with housing loans
plt.pie(df['housing'].value_counts(), labels = df['housing'].unique(), autopct='%1.1f%%', startangle=90, shadow = True)
plt.title('Proportion of clients with housing loans')
plt.show()
# Pie chart for clients with personal loans
plt.pie(df['loan'].value_counts(), labels = df['loan'].unique(), autopct='%1.1f%%', startangle=90, shadow = True)
plt.title('Proportion of clients with personal loans')
plt.show()
# Bar plot for types of communication employed among clients
plt.figure(figsize=(6,6))
df['contact'].value_counts().plot(kind = 'bar')
plt.title('Distribution of Communication means')
plt.xlabel('Communication type')
plt.ylabel('Number of clients')
plt.xticks(rotation = 0)
plt.show()
# Histogram for distribution of last contact day
plt.figure(figsize=(12,12))
plt.hist(df['day'])
plt.xlabel('Contact Day')
plt.ylabel('Number of clients')
plt.title('Last contact day with the client')
plt.show()
# Bar plot for distribution of contact month
plt.figure(figsize=(6,6))
df['month'].value_counts().plot(kind = 'bar')
plt.title('Distribution of Last contact month')
plt.xlabel('Contact month')
plt.ylabel('Number of clients')
plt.xticks(rotation = 0)
plt.show()
# Histogram for distribution of last contact duration
df['duration_in_min'] = df['duration'] / 60 # Convert duration in seconds to minutes
plt.hist(df['duration_in_min'])
plt.xlabel('Duration with the client (in min)')
plt.ylabel('Number of clients')
plt.xlim([0, 40])
plt.title('Distribution of time spoken with clients')
plt.show()
# Histogram for number of contacts performed during the campaign
plt.figure(figsize=(10, 6)) # More suitable aspect ratio
# Using hist with specific bins
plt.hist(df['campaign'], bins=range(min(df['campaign']), max(df['campaign']) + 2, 1),
edgecolor='black', alpha=0.7)
# Add grid for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xlabel('Number of contacts during campaign')
plt.ylabel('Number of clients')
plt.title('Distribution of Contact Attempts per Client')
plt.tight_layout()
plt.show()
# Distribution of days passed since last contact
# Create two plots side by side
plt.subplot(1, 2, 1)
# Count how many were never contacted (-1) vs contacted before
never_contacted = (df['pdays'] == -1).sum()
contacted_before = (df['pdays'] != -1).sum()
plt.bar(['No contact', 'Previous contact'],
[never_contacted, contacted_before])
plt.title('Contact History Distribution')
plt.ylabel('Number of clients')
# For the second plot, show distribution of days for those who were contacted
plt.subplot(1, 2, 2)
# Only include pdays > 0 (excluding -1)
contacted_days = df[df['pdays'] != -1]['pdays']
plt.hist(contacted_days, bins=30, edgecolor='black')
plt.title('Distribution of Days Since Previous Contact\n(For Previously Contacted Clients)')
plt.xlabel('Number of days since previous contact')
plt.ylabel('Number of clients')
plt.tight_layout()
plt.show()
# Histogram for number of contacts performed in the previous campaign
plt.figure(figsize=(10, 6)) # More suitable aspect ratio
# Using hist with specific bins
plt.hist(df['previous'], bins=range(min(df['previous']), max(df['previous']) + 2, 1),
edgecolor='black', alpha=0.7)
# Add grid for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xlabel('Number of contacts in the previous campaign')
plt.ylabel('Number of clients')
plt.title('Distribution of Contact Attempts per Client in previous campaign')
plt.tight_layout()
plt.show()
# Bar plot for campaign outcomes
plt.figure(figsize=(10, 6))
# Using value_counts for exact frequencies
contact_counts = df['poutcome'].value_counts().sort_index()
plt.bar(contact_counts.index, contact_counts.values,
edgecolor='black', alpha=0.7)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xlabel('Outcomes during the campaign')
plt.ylabel('Count')
plt.title('Distribution of Outcomes of the previous campaign')
plt.tight_layout()
plt.show()
# Pie chart for distriubtion of clients with term deposit
plt.pie(df['y'].value_counts(), labels = df['y'].unique(), autopct='%1.1f%%', startangle=90, shadow = True)
plt.title('Proportion of clients with term deposit')
plt.show()
Correlation and Statistical Analysis
In [4]: # Analyze correlation between subscribing for term deposit and different attributes
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# First, analyze numerical variables
numerical_vars = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
# Create a figure with subplots
plt.figure(figsize=(15, 10))
# Convert y to numeric (1 for 'yes', 0 for 'no')
df['y_numeric'] = (df['y'] == 'yes').astype(int)
# Calculate correlations for numerical variables
correlations = df[numerical_vars + ['y_numeric']].corr()['y_numeric'].sort_values(ascending=False)
print("Correlations with term deposit subscription:")
print(correlations)
# Create correlation heatmap
plt.subplot(2,1,1)
sns.heatmap(df[numerical_vars + ['y_numeric']].corr(),
annot=True,
cmap='coolwarm',
center=0)
plt.title('Correlation Heatmap - Numerical Variables')
# For categorical variables, calculate subscription rate for each category
categorical_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
# Create a subplot for categorical variables
plt.subplot(2,1,2)
# Example with one categorical variable (e.g., 'job')
subscription_rates = df.groupby('job')['y_numeric'].mean().sort_values(ascending=True)
subscription_rates.plot(kind='barh')
plt.title('Subscription Rate by Job Type')
plt.xlabel('Subscription Rate')
plt.tight_layout()
plt.show()
# Create binary numeric y variable
df['y_numeric'] = (df['y'] == 'yes').astype(int)
# Print subscription rates for all categorical variables
print("\nSubscription rates by categories:")
for var in categorical_vars:
print(f"\n{var.upper()} category:")
rates = df.groupby(var)['y_numeric'].agg(['count', 'mean']).round(3)
rates.columns = ['Count', 'Subscription Rate']
print(rates)
# Set up the plots
plt.figure(figsize=(15, 12))
# 1. Box plots for numerical variables vs subscription
numerical_vars = ['age', 'balance', 'duration', 'campaign']
plt.subplot(2, 2, 1)
sns.boxplot(x='y', y='balance', data=df)
plt.title('Balance Distribution by Subscription')
plt.yticks(np.arange(-20000, 100000, 10000)) # Creates ticks every 10000 units from -20000 to 100000
# 2. Bar chart for categorical variables
plt.subplot(2, 2, 2)
subscription_by_education = df.groupby('education')['y_numeric'].mean().sort_values()
subscription_by_education.plot(kind='barh')
plt.title('Subscription Rate by Education')
plt.xlabel('Subscription Rate')
# 3. Pie chart for binary outcome (y)
plt.subplot(2, 2, 3)
df['y'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Subscription Decisions')
# 4. Histogram with subscription overlay
plt.subplot(2, 2, 4)
sns.histplot(data=df, x='age', hue='y', multiple="layer", alpha=.5)
plt.title('Age Distribution by Subscription')
plt.tight_layout()
plt.show()
# Statistical Analysis
print("\nStatistical Analysis:")
# 1. For numerical variables - Mann-Whitney U test
from scipy.stats import mannwhitneyu
print("\nNumerical Variables Analysis:")
for var in numerical_vars:
stat, p = mannwhitneyu(df[df['y']=='yes'][var], df[df['y']=='no'][var])
print(f"\n{var.upper()}:")
print(f"p-value: {p:.10f}")
print("Median for subscribers:", df[df['y']=='yes'][var].median())
print("Median for non-subscribers:", df[df['y']=='no'][var].median())
# 2. For categorical variables - Chi-square test
from scipy.stats import chi2_contingency
categorical_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan']
print("\nCategorical Variables Analysis:")
for var in categorical_vars:
contingency = pd.crosstab(df[var], df['y'])
chi2, p, dof, expected = chi2_contingency(contingency)
print(f"\n{var.upper()}:")
print(f"Chi-square p-value: {p:.10f}")
success_rate = df[df['y']=='yes'][var].value_counts() / df[var].value_counts() * 100
print("Success rates by category:")
print(success_rate.round(2))
Correlations with term deposit subscription:
y_numeric 1.000000
duration 0.394472
pdays 0.103813
previous 0.093628
balance 0.052844
age 0.025718
day -0.028264
campaign -0.073277
Name: y_numeric, dtype: float64
Subscription rates by categories:
JOB category:
Count Subscription Rate
job
admin. 5171 0.122
blue-collar 9731 0.073
entrepreneur 1487 0.083
housemaid 1240 0.088
management 9458 0.138
retired 2266 0.229
self-employed 1579 0.118
services 4154 0.089
student 936 0.287
technician 7597 0.111
unemployed 1303 0.155
unknown 288 0.118
MARITAL category:
Count Subscription Rate
marital
divorced 5207 0.120
married 27216 0.101
single 12787 0.150
EDUCATION category:
Count Subscription Rate
education
primary 6851 0.086
secondary 23201 0.106
tertiary 13301 0.150
unknown 1857 0.136
DEFAULT category:
Count Subscription Rate
default
no 44395 0.118
yes 815 0.064
HOUSING category:
Count Subscription Rate
housing
no 20080 0.167
yes 25130 0.077
LOAN category:
Count Subscription Rate
loan
no 37966 0.127
yes 7244 0.067
CONTACT category:
Count Subscription Rate
contact
cellular 29288 0.149
telephone 2902 0.134
unknown 13020 0.041
MONTH category:
Count Subscription Rate
month
apr 2932 0.197
aug 6247 0.110
dec 214 0.467
feb 2649 0.166
jan 1403 0.101
jul 6895 0.091
jun 5341 0.102
mar 477 0.520
may 13766 0.067
nov 3972 0.102
oct 735 0.439
sep 579 0.465
POUTCOME category:
Count Subscription Rate
poutcome
failure 4900 0.126
other 1838 0.167
success 1513 0.648
unknown 36959 0.092
Statistical Analysis:
Numerical Variables Analysis:
AGE:
p-value: 0.0748132131
Median for subscribers: 38.0
Median for non-subscribers: 39.0
BALANCE:
p-value: 0.0000000000
Median for subscribers: 733.0
Median for non-subscribers: 417.0
DURATION:
p-value: 0.0000000000
Median for subscribers: 426.0
Median for non-subscribers: 164.0
CAMPAIGN:
p-value: 0.0000000000
Median for subscribers: 2.0
Median for non-subscribers: 2.0
Categorical Variables Analysis:
JOB:
Chi-square p-value: 0.0000000000
Success rates by category:
job
admin. 12.20
blue-collar 7.28
entrepreneur 8.27
housemaid 8.79
management 13.77
retired 22.90
self-employed 11.84
services 8.88
student 28.74
technician 11.06
unemployed 15.50
unknown 11.81
Name: count, dtype: float64
MARITAL:
Chi-square p-value: 0.0000000000
Success rates by category:
marital
married 10.13
single 14.95
divorced 11.96
Name: count, dtype: float64
EDUCATION:
Chi-square p-value: 0.0000000000
Success rates by category:
education
secondary 10.57
tertiary 15.01
primary 8.64
unknown 13.57
Name: count, dtype: float64
DEFAULT:
Chi-square p-value: 0.0000023759
Success rates by category:
default
no 11.81
yes 6.38
Name: count, dtype: float64
HOUSING:
Chi-square p-value: 0.0000000000
Success rates by category:
housing
no 16.72
yes 7.70
Name: count, dtype: float64
LOAN:
Chi-square p-value: 0.0000000000
Success rates by category:
loan
no 12.67
yes 6.68
Name: count, dtype: float64
Plots for term deposit subscription v/s categorical variables
In [8]: import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Create figure with subplots using gridspec
fig = plt.figure(figsize=(15, 10))
gs = fig.add_gridspec(2, 2)
# 1. Loan and Default Impact (Top Left)
ax1 = fig.add_subplot(gs[0, 0])
loan_data = {
'Housing Loan': [16.7, 7.7],
'Personal Loan': [12.7, 6.7],
'Default Status': [11.8, 6.4]
}
x = np.arange(3)
width = 0.35
ax1.bar(x - width/2, [16.7, 12.7, 11.8], width, label='No', color='lightblue')
ax1.bar(x + width/2, [7.7, 6.7, 6.4], width, label='Yes', color='lightcoral')
ax1.set_xticks(x)
ax1.set_xticklabels(['Housing', 'Personal', 'Default'])
ax1.set_ylabel('Success Rate (%)')
ax1.set_title('Impact of Loans and Default Status')
ax1.legend()
# 2. Education Level Impact (Top Right)
ax2 = fig.add_subplot(gs[0, 1])
education_data = {
'tertiary': 15.0,
'secondary': 10.6,
'primary': 8.6,
'unknown': 13.6
}
ax2.bar(education_data.keys(), education_data.values())
ax2.set_ylabel('Success Rate (%)')
ax2.set_title('Success Rate by Education Level')
# 3. Marital Status Impact (Bottom, spanning both columns)
ax3 = fig.add_subplot(gs[1, :])
marital_data = {
'single': 15.0,
'divorced': 12.0,
'married': 10.1
}
ax3.bar(marital_data.keys(), marital_data.values())
ax3.set_ylabel('Success Rate (%)')
ax3.set_title('Success Rate by Marital Status')
plt.tight_layout()
plt.show()