eda
August 1, 2024
[1]: import   pandas as pd
     import   numpy as np
     import   matplotlib.pyplot as plt
     import   seaborn as sns
[2]: tips=pd.read_csv("tips.csv")
[3]: # Display the first few rows of the dataset
     print(tips.head())
        total_bill    tip      sex smoker    day     time    size
    0        16.99   1.01   Female     No    Sun   Dinner       2
    1        10.34   1.66     Male     No    Sun   Dinner       3
    2        21.01   3.50     Male     No    Sun   Dinner       3
    3        23.68   3.31     Male     No    Sun   Dinner       2
    4        24.59   3.61   Female     No    Sun   Dinner       4
[4]: # Get basic information about the dataset
     print(tips.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 244 entries, 0 to 243
    Data columns (total 7 columns):
     #   Column      Non-Null Count Dtype
    --- ------       -------------- -----
     0   total_bill 244 non-null     float64
     1   tip         244 non-null    float64
     2   sex         244 non-null    object
     3   smoker      244 non-null    object
     4   day         244 non-null    object
     5   time        244 non-null    object
     6   size        244 non-null    int64
    dtypes: float64(2), int64(1), object(4)
    memory usage: 13.5+ KB
    None
[5]: # Check for missing values
     print(tips.isnull().sum())
                                                   1
     total_bill      0
     tip             0
     sex             0
     smoker          0
     day             0
     time            0
     size            0
     dtype: int64
[6]: # The dataset contains 244 rows and 7 columns.
     # Columns include 'total_bill', 'tip', 'sex', 'smoker', 'day', 'time', and␣
      ↪'size'.
     # There are no missing values in the dataset.
[7]: # Summary statistics for numerical columns
     print(tips.describe())
              total_bill          tip         size
     count    244.000000   244.000000   244.000000
     mean      19.785943     2.998279     2.569672
     std        8.902412     1.383638     0.951100
     min        3.070000     1.000000     1.000000
     25%       13.347500     2.000000     2.000000
     50%       17.795000     2.900000     2.000000
     75%       24.127500     3.562500     3.000000
     max       50.810000    10.000000     6.000000
[8]: # The average total bill is approximately $19.79 with a standard deviation of␣
      ↪$8.90.
     # The average tip is approximately $3.00 with a standard deviation of $1.38.
     # The dataset includes bills ranging from $3.07 to $50.81, and tips ranging␣
      ↪from $1.00 to $10.00.
[9]: # Summary statistics for categorical columns
     print(tips.describe(include=['O']))
                sex smoker   day     time
     count      244    244   244      244
     unique       2      2     4        2
     top       Male     No   Sat   Dinner
     freq       157    151    87      176
[10]: #   There are more male customers (157) compared to female customers (87).
      #   There are more non-smokers (151) compared to smokers (93).
      #   The data covers four days (Thursday, Friday, Saturday, and Sunday); Saturday has the most records (87).
      #   Most meals are dinner (176) compared to lunch (68).
                                                 2
[11]: sns.set_style("whitegrid")
      # Distribution of total bill
      plt.figure(figsize=(8, 6))
      sns.histplot(tips['total_bill'], kde=True)
      plt.title('Distribution of Total Bill')
      plt.xlabel('Total Bill')
      plt.ylabel('Frequency')
      plt.show()
[12]: # The distribution of total bills is right-skewed with most bills falling␣
       ↪between $10 and $20.
      # There are some outliers with very high total bills.
[13]: # Distribution of tips
      plt.figure(figsize=(8, 6))
      sns.histplot(tips['tip'], kde=True)
      plt.title('Distribution of Tips')
      plt.xlabel('Tip')
      plt.ylabel('Frequency')
                                             3
      plt.show()
[14]: # The distribution of tips is also right-skewed with most tips falling between␣
       ↪$2 and $4.
      # There are a few outliers with very high tips.
[15]: # Scatter plot of total bill vs. tip
      plt.figure(figsize=(8, 6))
      sns.scatterplot(x='total_bill', y='tip', data=tips)
      plt.title('Total Bill vs Tip')
      plt.xlabel('Total Bill')
      plt.ylabel('Tip')
      plt.show()
                                             4
[16]: # There is a positive correlation between the total bill and the tip amount.
      # Higher total bills tend to have higher tips.
[17]: # Pairplot of the dataset
      sns.pairplot(tips)
      plt.show()
                                             5
[18]: # The pairplot shows scatter plots for every pair of numerical variables and␣
       ↪histograms for individual variables.
      # It confirms the positive correlation between total bill and tip.
[19]: # Count plot for categorical features
      plt.figure(figsize=(8, 6))
      sns.countplot(x='sex', data=tips)
      plt.title('Count of Gender')
      plt.show()
                                              6
[20]: # There are more male customers than female customers in the dataset.
[21]: plt.figure(figsize=(8, 6))
      sns.countplot(x='smoker', data=tips)
      plt.title('Count of Smokers')
      plt.show()
                                             7
[22]: # There are more non-smokers than smokers in the dataset.
[23]: plt.figure(figsize=(8, 6))
      sns.countplot(x='day', data=tips)
      plt.title('Count of Days')
      plt.show()
                                             8
[24]: # The dataset has more entries for Saturday and Sunday compared to other days.
[25]: plt.figure(figsize=(8, 6))
      sns.countplot(x='time', data=tips)
      plt.title('Count of Time')
      plt.show()
                                             9
[26]: # There are more records for dinner time compared to lunch time.
[27]: # Box plot of tips by gender
      plt.figure(figsize=(8, 6))
      sns.boxplot(x='sex', y='tip', data=tips)
      plt.title('Tips by Gender')
      plt.show()
                                             10
[28]: # The median tip amount is slightly higher for males compared to females.
[29]: # Box plot of tips by day
      plt.figure(figsize=(8, 6))
      sns.boxplot(x='day', y='tip', data=tips)
      plt.title('Tips by Day')
      plt.show()
                                             11
[30]: # Tips tend to be higher on the weekends (Saturday and Sunday) compared to␣
       ↪weekdays (Thursday and Friday).
[31]: # Box plot of tips by time
      plt.figure(figsize=(8, 6))
      sns.boxplot(x='time', y='tip', data=tips)
      plt.title('Tips by Time')
      plt.show()
                                             12
[32]: # Tips are generally higher during dinner time compared to lunch time.
[33]: # Box plot of tips by smoking status
      plt.figure(figsize=(8, 6))
      sns.boxplot(x='smoker', y='tip', data=tips)
      plt.title('Tips by Smoking Status')
      plt.show()
                                             13
[34]: # Correlation matrix
      corr = tips.select_dtypes(include=['float64', 'int64']).corr()
      # Heatmap of the correlation matrix
      plt.figure(figsize=(10, 8))
      sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
      plt.title('Correlation Heatmap')
      plt.show()
                                             14
[36]: # The heatmap shows a strong positive correlation (0.68) between total bill and␣
       ↪tip.
      # The remaining pairs (both involving size) are more weakly correlated.
     Customer Demographics:
     -There are more male customers than female customers.
     -The majority of customers are non-smokers.
     Restaurant Activity:
     -The restaurant sees more activity on the weekends, especially on Saturdays.
     -Dinner time is more popular than lunch time.
     Spending and Tipping Behavior:
     -Higher total bills are associated with higher tips, but the relationship is not perfectly linear. This
     suggests that while people generally tip more when they spend more, other factors also influence
     tipping behavior.
                                                       15
-Tips are higher on weekends, potentially due to larger dining groups or more generous tipping
behavior.
-The variability in tips is higher among male customers and smokers.
Service Insights:
-The restaurant might consider focusing on improving customer experiences during lunch times and
weekdays to potentially increase tips during these periods.
-Understanding the reasons behind high variability in tips among different groups (e.g., males,
smokers) could help in devising targeted strategies to encourage more consistent tipping.
                                               16