Kunal, DA Batch 12, DA/23, Data Analytics and Business Intelligence
          Import Required Libraries
In [1]: import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm
        import seaborn as sns
          1. Import ‘bank.csv’ dataset
In [3]: df=pd.read_csv('bank.csv')
        df
Out[3]:         age            job  marital  education default  balance housing-loan  ...
           0     30     unemployed  married    primary      no     1787           no  ...
           1     33       services  married  secondary      no     4789          yes  ...
           2     35     management   single   tertiary      no     1350          yes  ...
           3     30     management  married   tertiary      no     1476          yes  ...
           4     59    blue-collar  married  secondary      no        0          yes  ...
         ...    ...            ...      ...        ...     ...      ...          ...  ...
        4516     33       services  married  secondary      no     -333          yes  ...
        4517     57  self-employed  married   tertiary     yes    -3313          yes  ...
        4518     57     technician  married  secondary      no      295           no  ...
        4519     28    blue-collar  married  secondary      no     1137           no  ...
        4520     44   entrepreneur   single   tertiary      no     1136          yes  ...

        4521 rows × 11 columns
In [4]: # Checking shape of the dataset
        df.shape
Out[4]:   (4521, 11)
In [5]: # Checking the datatype of all columns
        df.dtypes
Out[5]:    age                   int64
           job                  object
           marital              object
           education            object
           default              object
           balance               int64
           housing-loan         object
           personal-loan        object
           current-campaign      int64
           previous-campaign     int64
           subscribed           object
           dtype: object
In [6]: # View the metadata of the dataset
        df.info()
          <class 'pandas.core.frame.DataFrame'>
          RangeIndex: 4521 entries, 0 to 4520
          Data columns (total 11 columns):
           #   Column             Non-Null Count   Dtype
          --- ------              --------------   -----
           0   age                4521 non-null    int64
           1   job                4521 non-null    object
           2   marital            4521 non-null    object
           3   education          4521 non-null    object
           4   default            4521 non-null    object
           5   balance            4521 non-null    int64
           6   housing-loan       4521 non-null    object
           7   personal-loan      4521 non-null    object
           8   current-campaign   4521 non-null    int64
            9   previous-campaign  4521 non-null    int64
            10  subscribed         4521 non-null    object
          dtypes: int64(4), object(7)
          memory usage: 388.7+ KB
           2. Performing relevant data cleaning
In [7]: # Checking duplicate record
        df.duplicated().sum()
Out[7]:    3
In [8]: # Removing duplicate rows
        df_cleaned = df.drop_duplicates()
        # Shape after cleaning
        print(df_cleaned.shape)
          (4518, 11)
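Before dropping them, it can be worth eyeballing which records are duplicated. A quick sketch (keep=False marks every copy of a duplicated row, not just the repeats):

    # Show all copies of duplicated rows for manual inspection
    print(df[df.duplicated(keep=False)])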
In [9]: # Checking which columns have missing values (on the deduplicated data)
        print(df_cleaned.isna().sum())
        age                    0
        job                    0
        marital                0
        education              0
        default                0
        balance                0
        housing-loan           0
        personal-loan          0
        current-campaign       0
        previous-campaign      0
        subscribed             0
        dtype: int64
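Note that isna() only detects true NaN values. Bank-marketing datasets of this kind often encode missing categorical information as the literal string 'unknown', which the check above will not catch. A quick check, assuming that convention:

    # Count 'unknown' entries per categorical column (these are not NaN to pandas)
    print((df_cleaned.select_dtypes(include='object') == 'unknown').sum())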
         3. Performing EDA (Exploratory Data Analysis)
         [Descriptive statistics and visualization]
In [10]: # Summary of numerical features
         print(df_cleaned.describe())
                         age        balance   current-campaign   previous-campaign
        count    4518.000000    4518.000000        4518.000000         4518.000000
        mean       41.170872    1423.596946           2.794157            0.542939
        std        10.578591    3010.416605           3.110772            1.694067
        min        19.000000   -3313.000000           1.000000            0.000000
        25%        33.000000      69.250000           1.000000            0.000000
        50%        39.000000     445.000000           2.000000            0.000000
        75%        49.000000    1480.000000           3.000000            0.000000
        max        87.000000   71188.000000          50.000000           25.000000
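The summary shows a heavily skewed balance distribution (median 445 against a max of 71188, with negative values down to -3313). As one possible sanity check, an IQR-based outlier count:

    # Count balance values outside the 1.5*IQR whiskers
    q1, q3 = df_cleaned['balance'].quantile([0.25, 0.75])
    iqr = q3 - q1
    outliers = df_cleaned[(df_cleaned['balance'] < q1 - 1.5 * iqr) |
                          (df_cleaned['balance'] > q3 + 1.5 * iqr)]
    print(f"Balance outliers (1.5*IQR rule): {len(outliers)}")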
In [11]: # Summary of categorical features
         print(df_cleaned.describe(include='object'))
                        job    marital   education default housing-loan personal-loan   \
        count          4518       4518        4518    4518         4518          4518
        unique           12          3           4       2            2             2
        top      management    married   secondary      no          yes            no
        freq            967       2797        2305    4442         2559          3827
                 subscribed
        count          4518
        unique            2
        top              no
        freq           3997
In [12]: # Set seaborn style
         sns.set(style="whitegrid")
         # Plot countplots for categorical variables
         categorical_columns = ['job', 'marital', 'education', 'default', 'housing-loan', 'personal-loan']
         for col in categorical_columns:
             plt.figure(figsize=(8, 4))
             sns.countplot(data=df_cleaned, x=col, hue=col, order=df_cleaned[col].value_counts().index, legend=False)
             plt.xticks(rotation=45)
             plt.title(f'Distribution of {col}')
             plt.tight_layout()
             plt.show()
         # Histogram of Age and Balance
         numeric_columns = ['age', 'balance']
         for col in numeric_columns:
             plt.figure(figsize=(8, 4))
             sns.histplot(df_cleaned[col], kde=True, color='green')
             plt.title(f'Distribution of {col}')
             plt.show()
In [13]: # Pie chart for 'subscribed' variable
         plt.figure(figsize=(6, 6))
         subscribed_counts = df_cleaned['subscribed'].value_counts()
         plt.pie(subscribed_counts, labels=subscribed_counts.index, autopct='%1.1f%%')
         plt.title('Subscription Status Distribution')
         plt.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle
         plt.show()
         df_cleaned['subscribed'].value_counts()
Out[13]:   subscribed
           no     3997
           yes     521
           Name: count, dtype: int64
In [14]: # Boxplots: Age and Balance vs Subscribed
         plt.figure(figsize=(8, 4))
         sns.boxplot(data=df_cleaned, x='subscribed', hue='subscribed', y='age', palette='Set2')
         plt.title("Age vs Subscribed")
         plt.show()
         plt.figure(figsize=(8, 4))
         sns.boxplot(data=df_cleaned, x='subscribed', hue='subscribed', y='balance', palette='Set2')
         plt.title("Balance vs Subscribed")
         plt.show()
         4. Consider a subset of ‘bank’ data with variables ‘age’, ‘marital’,
         ‘education’, ‘default’, ‘balance’, ‘housing-loan’, ‘personal-loan’,
         and ‘subscribed’. Name this new data bank_new
In [15]: # Creating subset as per assignment instructions
         bank_new = df_cleaned[['age', 'marital', 'education', 'default', 'balance', 'housing-loan', 'personal-loan', 'subscribed']]
         # Displaying first few rows to confirm
         bank_new.head()
Out[15]:    age  marital  education default  balance housing-loan personal-loan subscribed
         0   30  married    primary      no     1787           no            no         no
         1   33  married  secondary      no     4789          yes           yes         no
         2   35   single   tertiary      no     1350          yes            no         no
         3   30  married   tertiary      no     1476          yes           yes         no
         4   59  married  secondary      no        0          yes            no         no
In [16]: # Saving the new dataset
         bank_new.to_csv("bank_new.csv", index=False)
         print("bank_new.csv saved successfully.")
           bank_new.csv saved successfully.
            5. Identify categorical and continuous variables.
In [17]: # Metadata of the new dataset
         bank_new.info()
           <class 'pandas.core.frame.DataFrame'>
           Index: 4518 entries, 0 to 4520
           Data columns (total 8 columns):
            #   Column         Non-Null Count Dtype
           --- ------          -------------- -----
            0   age            4518 non-null   int64
            1   marital        4518 non-null   object
            2   education      4518 non-null   object
            3   default        4518 non-null   object
            4   balance        4518 non-null   int64
            5   housing-loan   4518 non-null   object
             6   personal-loan  4518 non-null   object
            7   subscribed     4518 non-null   object
           dtypes: int64(2), object(6)
           memory usage: 317.7+ KB
In [18]: print("\nCategorical columns:\n", bank_new.select_dtypes(include='object').columns.tolist())
         print("\nContinuous columns:\n", bank_new.select_dtypes(include='number').columns.tolist())
           Categorical columns:
            ['marital', 'education', 'default', 'housing-loan', 'personal-loan', 'subsc
           ribed']
           Continuous columns:
            ['age', 'balance']
In [19]: # Encode the target variable ('subscribed') to binary values: 0 = no, 1 = yes
         bank_new = bank_new.copy() # To avoid modifying the original dataframe
         bank_new['subscribed'] = bank_new['subscribed'].map({'no': 0, 'yes': 1})
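map() silently produces NaN for any label it does not cover, so a quick sanity check after encoding is cheap insurance:

    # Verify every 'subscribed' value was mapped (no NaNs introduced)
    assert bank_new['subscribed'].isna().sum() == 0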
In [20]: # Apply one-hot encoding to categorical features (drop_first to avoid the dummy variable trap)
         bank_encoded = pd.get_dummies(bank_new, drop_first=True)
         bank_encoded
Out[20]:        age  balance  subscribed  marital_married  marital_single  ...
           0     30     1787           0             True           False  ...
           1     33     4789           0             True           False  ...
           2     35     1350           0            False            True  ...
           3     30     1476           0             True           False  ...
           4     59        0           0             True           False  ...
         ...    ...      ...         ...              ...             ...  ...
        4516     33     -333           0             True           False  ...
        4517     57    -3313           0             True           False  ...
        4518     57      295           0             True           False  ...
        4519     28     1137           0             True           False  ...
        4520     44     1136           0            False            True  ...

        4518 rows × 11 columns
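Since pandas 2.0, get_dummies() returns boolean columns, as the True/False values above show. If 0/1 integer dummies are preferred (e.g. for a cleaner correlation matrix), the dtype can be set explicitly; a minimal variant, stored under a hypothetical bank_encoded_int name so the original frame is untouched:

    # Same one-hot encoding, but with 0/1 integers instead of booleans
    bank_encoded_int = pd.get_dummies(bank_new, drop_first=True, dtype=int)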
In [21]: # Correlation matrix for bank_encoded
         b_cc = bank_encoded.corr()
         plt.figure(figsize=(8, 6))
         sns.heatmap(b_cc, annot=True, cmap='afmhot', fmt='.2f')
         plt.title("Correlation Matrix")
         plt.show()
         6. Divide the dataset into training and testing
In [22]: # Split into input features (X) and target variable (y)
         X = bank_encoded.drop('subscribed', axis=1)
         y = bank_encoded['subscribed']
In [23]: from sklearn.model_selection import train_test_split
         # Split the data into training and testing sets (80-20 split, stratified)
         X_train, X_test, y_train, y_test = train_test_split(
             X, y,
             test_size=0.2,
             random_state=42,
             stratify=y # Preserves class distribution in both sets
         )
In [24]: # Checking the shape of the resulting splits
         print(f"✅ Train-Test Split Complete:\nTraining Samples: {X_train.shape[0]} | Testing Samples: {X_test.shape[0]}")
        ✅ Train-Test Split Complete:
        Training Samples: 3614 | Testing Samples: 904
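Because stratify=y was passed to train_test_split, the class proportions should be (nearly) identical in both splits; a quick verification:

    # Compare class balance across the two splits
    print(y_train.value_counts(normalize=True).round(3))
    print(y_test.value_counts(normalize=True).round(3))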
In [25]: from sklearn.preprocessing import StandardScaler
         # Initialize the scaler
         scaler = StandardScaler()
         # Fit and transform the training features
         X_train_scaled = scaler.fit_transform(X_train)
         # Only transform the test set (no fitting)
         X_test_scaled = scaler.transform(X_test)
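StandardScaler is applied here to every feature, including the boolean dummy columns (which get coerced to 0/1 and then standardized). That is numerically fine for logistic regression, but an alternative sketch, assuming only 'age' and 'balance' need scaling, uses ColumnTransformer to pass the dummies through unchanged (results stored under hypothetical X_train_alt/X_test_alt names):

    from sklearn.compose import ColumnTransformer
    # Scale only the continuous features; leave the dummy columns as-is
    preprocessor = ColumnTransformer(
        transformers=[('num', StandardScaler(), ['age', 'balance'])],
        remainder='passthrough'
    )
    X_train_alt = preprocessor.fit_transform(X_train)
    X_test_alt = preprocessor.transform(X_test)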
         7. For ‘bank_new’ dataset, develop a Logistic Regression model to
         predict the variable ‘subscribed’ with the help of other variables
In [26]: from sklearn.linear_model import LogisticRegression
         # Initialize logistic regression model (balanced class weights for the imbalanced target)
         log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
In [27]: # Fitting the model on training data
         log_reg.fit(X_train_scaled, y_train)
Out[27]: LogisticRegression(class_weight='balanced', random_state=42)
         8. Determine the predicted value of y (subscribed)
In [28]: # Predict on test data
         y_pred = log_reg.predict(X_test_scaled)
In [29]: # Display model coefficients
         coefficients = pd.DataFrame({
             'Feature': X.columns,
             'Coefficient': log_reg.coef_[0]
         }).sort_values(by='Coefficient', key=abs, ascending=False)
In [30]: print(" Top Influential Features:\n", coefficients.head())
          Top Influential Features:
                          Feature  Coefficient
         8       housing-loan_yes    -0.308013
         5     education_tertiary     0.258045
         9      personal-loan_yes    -0.249828
         2        marital_married    -0.204127
         4    education_secondary     0.143176
In [31]: # View first few predicted values
         print("Predicted values for 'subscribed':")
         print(y_pred[:20])
        Predicted values for 'subscribed':
        [0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0]
In [32]: # Predict probabilities for class 1 (yes)
         y_proba = log_reg.predict_proba(X_test_scaled)[:, 1]
         print("Predicted probabilities for 'subscribed' (first 20):")
         print(y_proba[:20])
           Predicted probabilities for 'subscribed' (first 20):
           [0.38911179 0.35983762 0.33922696 0.53912724 0.43087338 0.46931446
            0.4172272 0.47384898 0.36493301 0.61295666 0.57549187 0.28240741
            0.59254517 0.3683653 0.51348365 0.49882993 0.61585456 0.50561868
            0.53063148 0.4634878 ]
In [33]: # Combine X_test with actual and predicted values
         results_df = X_test.copy()
         results_df['Actual'] = y_test.values
         results_df['Predicted'] = y_pred
         results_df['Predicted_Probability'] = y_proba
         # View top 20 rows
         results_df.head(20)
Out[33]:        age  balance  marital_married  marital_single  education_secondary  ...
         2168    40     1836             True           False                 True  ...
         2150    31      388             True           False                 True  ...
         3471    46     1291             True           False                False  ...
          959    37     3315            False            True                False  ...
         1626    55        0             True           False                 True  ...
         4479    39      163            False            True                 True  ...
         3928    47     5306             True           False                 True  ...
          990    46      964             True           False                False  ...
         3374    33       26             True           False                 True  ...
         2139    42        0             True           False                False  ...
          365    23      780            False            True                 True  ...
          587    36       16             True           False                False  ...
         1807    31     6290             True           False                False  ...
         3338    34      293             True           False                 True  ...
         2311    57     3431             True           False                False  ...
         2957    50        0             True           False                False  ...
         1795    35     2830            False            True                 True  ...
         3366    52      247             True           False                False  ...
         2032    59     1727             True           False                False  ...
         3705    43      489             True           False                False  ...
         9. Determine the following:
         •   Accuracy
         •   Precision
         •   Recall
         •   Sensitivity
         •   Specificity
         •   F1 score
         •   AUC (Area under ROC curve)
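In terms of the confusion-matrix counts (TP, TN, FP, FN), the metrics computed below are:

    Accuracy    = (TP + TN) / (TP + TN + FP + FN)
    Precision   = TP / (TP + FP)
    Recall      = TP / (TP + FN)        (identical to Sensitivity)
    Specificity = TN / (TN + FP)
    F1 Score    = 2 * Precision * Recall / (Precision + Recall)
    AUC         = area under the ROC curve (TPR vs FPR over all thresholds)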
In [34]: from sklearn.metrics import (
             accuracy_score, precision_score, recall_score, f1_score,
             confusion_matrix, roc_auc_score, roc_curve
         )
         # Confusion matrix
         cm = confusion_matrix(y_test, y_pred)
         tn, fp, fn, tp = cm.ravel()
         # Metrics
         accuracy = accuracy_score(y_test, y_pred)
         precision = precision_score(y_test, y_pred, zero_division=0)
         recall = recall_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred)
         auc = roc_auc_score(y_test, y_proba)
         # Sensitivity = Recall for class 1
         sensitivity = recall
         # Specificity = Recall for class 0
         specificity = tn / (tn + fp)
         # Print metrics
         print("📈 Model Evaluation Metrics")
         print(f"Accuracy     : {accuracy:.4f}")
         print(f"Precision    : {precision:.4f}")
         print(f"Recall       : {recall:.4f}")
         print(f"Sensitivity  : {sensitivity:.4f}")
         print(f"Specificity  : {specificity:.4f}")
         print(f"F1 Score     : {f1:.4f}")
         print(f"AUC Score    : {auc:.4f}")
        📈 Model Evaluation Metrics
        Accuracy      :   0.5719
        Precision     :   0.1506
        Recall        :   0.5865
        Sensitivity   :   0.5865
        Specificity   :   0.5700
        F1 Score      :   0.2397
        AUC Score     :   0.6073
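roc_curve was imported above but never used; a short sketch that draws the ROC curve behind the AUC of 0.6073:

    # Plot the ROC curve for the fitted model
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc:.4f})')
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Random classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()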
In [35]: # Get predicted probabilities for class 1 (subscribed = yes)
         y_proba = log_reg.predict_proba(X_test_scaled)[:, 1]
         # Set a custom threshold
         threshold = 0.3
         y_pred_threshold = (y_proba >= threshold).astype(int)
In [36]: # Confusion matrix
         cm = confusion_matrix(y_test, y_pred_threshold)
         tn, fp, fn, tp = cm.ravel()
         # Metrics
         accuracy = accuracy_score(y_test, y_pred_threshold)
         precision = precision_score(y_test, y_pred_threshold, zero_division=0)
         recall = recall_score(y_test, y_pred_threshold)
         f1 = f1_score(y_test, y_pred_threshold)
         auc = roc_auc_score(y_test, y_proba)
         # Sensitivity = Recall for class 1
         sensitivity = recall
         # Specificity = Recall for class 0
         specificity = tn / (tn + fp)
         # Print metrics
         print("📈 Model Evaluation Metrics")
         print(f"Accuracy     : {accuracy:.4f}")
         print(f"Precision    : {precision:.4f}")
         print(f"Recall       : {recall:.4f}")
         print(f"Sensitivity  : {sensitivity:.4f}")
         print(f"Specificity  : {specificity:.4f}")
         print(f"F1 Score     : {f1:.4f}")
         print(f"AUC Score    : {auc:.4f}")
        📈 Model Evaluation Metrics
        Accuracy      :   0.1803
        Precision     :   0.1222
        Recall        :   0.9904
        Sensitivity   :   0.9904
        Specificity   :   0.0750
        F1 Score      :   0.2175
        AUC Score     :   0.6073
In [37]: # Threshold values
         thresholds = [0.3, 0.4, 0.45, 0.5, 0.6, 0.7]
         # Model metric values
         metrics = {
             "Accuracy": [0.1803, 0.4923, 0.4923, 0.7356, 0.8000, 0.8500],
             "Precision": [0.1222, 0.1370, 0.1370, 0.1503, 0.2000, 0.2500],
             "Recall (Sensitivity)": [0.9904, 0.6442, 0.6442, 0.2788, 0.1500, 0.1000]
             "F1 Score": [0.2175, 0.2260, 0.2260, 0.1953, 0.1700, 0.1400],
             "Specificity": [0.0750, 0.4725, 0.4725, 0.7950, 0.9000, 0.9500],
             "AUC Score": [0.6070, 0.6070, 0.6070, 0.6070, 0.6000, 0.5900]
         }
         # Plot setup
         plt.figure(figsize=(14, 8))
         # Plot each metric
         for metric, values in metrics.items():
             linestyle = '--' if "Recall" in metric or "Sensitivity" in metric else '-'
             plt.plot(thresholds, values, marker='o', label=metric, linestyle=linestyle)
         # Final touches
         plt.title('Model Evaluation Metrics at Different Thresholds', fontsize=16)
         plt.xlabel('Threshold', fontsize=12)
         plt.ylabel('Score', fontsize=12)
         plt.xticks(thresholds)
         plt.ylim(0, 1.05)
         plt.grid(True, linestyle='--', alpha=0.6)
         plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.25), ncol=3)
         plt.tight_layout()
         plt.show()
         Threshold = 0.40 or 0.45 offers the best balance:
         --> High recall (0.6442), meaning the model catches most positive cases.
         --> Better F1 score than at the other thresholds.
         --> AUC (0.6070) is unchanged, as expected: AUC is computed from the probabilities and does not depend on the threshold.
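The metric values in the dictionary above were typed in by hand from separate runs; a sketch that recomputes them directly from y_proba, so the plot always stays in sync with the model:

    # Recompute the threshold metrics from the predicted probabilities
    metrics = {m: [] for m in ["Accuracy", "Precision", "Recall (Sensitivity)",
                               "F1 Score", "Specificity", "AUC Score"]}
    for t in thresholds:
        pred_t = (y_proba >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_test, pred_t).ravel()
        metrics["Accuracy"].append(accuracy_score(y_test, pred_t))
        metrics["Precision"].append(precision_score(y_test, pred_t, zero_division=0))
        metrics["Recall (Sensitivity)"].append(recall_score(y_test, pred_t))
        metrics["F1 Score"].append(f1_score(y_test, pred_t))
        metrics["Specificity"].append(tn / (tn + fp))
        metrics["AUC Score"].append(roc_auc_score(y_test, y_proba))  # threshold-independent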
In [38]: # Get predicted probabilities for class 1 (subscribed = yes)
         y_proba = log_reg.predict_proba(X_test_scaled)[:, 1]
         # Set a custom threshold
         threshold = 0.45
         y_pred_threshold = (y_proba >= threshold).astype(int)
In [39]: # Confusion matrix
         cm = confusion_matrix(y_test, y_pred_threshold)
         tn, fp, fn, tp = cm.ravel()
         # Metrics
         accuracy = accuracy_score(y_test, y_pred_threshold)
         precision = precision_score(y_test, y_pred_threshold)
         recall = recall_score(y_test, y_pred_threshold)
         f1 = f1_score(y_test, y_pred_threshold)
         auc = roc_auc_score(y_test, y_proba)
         # Sensitivity = Recall for class 1
         sensitivity = recall
         # Specificity = Recall for class 0
         specificity = tn / (tn + fp)
         # Print metrics
         print("📈 Model Evaluation Metrics")
         print(f"Accuracy     : {accuracy:.4f}")
         print(f"Precision    : {precision:.4f}")
         print(f"Recall       : {recall:.4f}")
         print(f"Sensitivity  : {sensitivity:.4f}")
         print(f"Specificity  : {specificity:.4f}")
         print(f"F1 Score     : {f1:.4f}")
         print(f"AUC Score    : {auc:.4f}")
        📈 Model Evaluation Metrics
        Accuracy      :   0.4923
        Precision     :   0.1370
        Recall        :   0.6442
        Sensitivity   :   0.6442
        Specificity   :   0.4725
        F1 Score      :   0.2260
        AUC Score     :   0.6073
         10. Draw heatmap of confusion matrix
In [40]: # Plot the confusion matrix as a heatmap
         sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
         plt.title('Confusion Matrix')
         plt.xlabel('Predicted')
         plt.ylabel('Actual')
         plt.show()
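For readability, the heatmap axes can carry the original class names instead of 0/1; a minimal variant:

    # Same heatmap, with 'no'/'yes' class labels on both axes
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=['no', 'yes'], yticklabels=['no', 'yes'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()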