pandas Day 4
November 8, 2023
[ ]: import pandas as pd
     import seaborn as sns
[ ]: df = sns.load_dataset('titanic')  # load the 'titanic' dataset through seaborn into df
     df.head()  # preview the first rows of the frame
[ ]:       survived   pclass      sex    age   sibsp   parch      fare embarked   class   \
       0          0        3     male   22.0       1       0    7.2500        S   Third
       1          1        1   female   38.0       1       0   71.2833        C   First
       2          1        3   female   26.0       0       0    7.9250        S   Third
       3          1        1   female   35.0       1       0   53.1000        S   First
       4          0        3     male   35.0       0       0    8.0500        S   Third
             who   adult_male deck   embark_town alive    alone
       0     man         True NaN    Southampton    no    False
       1   woman        False    C     Cherbourg   yes    False
       2   woman        False NaN    Southampton   yes     True
       3   woman        False    C   Southampton   yes    False
       4     man         True NaN    Southampton    no     True
[ ]: df.shape  # (number of rows, number of columns)
[ ]: (891, 15)
[ ]: df.info()  # per-column non-null counts and dtypes, plus memory usage
       <class 'pandas.core.frame.DataFrame'>
       RangeIndex: 891 entries, 0 to 890
       Data columns (total 15 columns):
        #   Column       Non-Null Count  Dtype
       ---  ------       --------------  -----
        0   survived     891 non-null    int64
        1   pclass       891 non-null    int64
        2   sex          891 non-null    object
        3   age          714 non-null    float64
        4   sibsp        891 non-null    int64
        5   parch        891 non-null    int64
        6   fare         891 non-null    float64
        7   embarked     889 non-null    object
        8   class        891 non-null    category
        9   who          891 non-null    object
        10  adult_male   891 non-null    bool
        11  deck         203 non-null    category
        12  embark_town  889 non-null    object
        13  alive        891 non-null    object
        14  alone        891 non-null    bool
       dtypes: bool(2), category(2), float64(2), int64(4), object(5)
       memory usage: 80.7+ KB
[ ]: df.isnull().sum()  # number of missing values in each column
[ ]: survived           0
     pclass             0
     sex                0
     age              177
     sibsp              0
     parch              0
     fare               0
     embarked           2
     class              0
     who                0
     adult_male         0
     deck             688
     embark_town        2
     alive              0
     alone              0
     dtype: int64
[ ]: df.isnull().sum() / df.shape[0]
[ ]: survived         0.000000
     pclass           0.000000
     sex              0.000000
     age              0.198653
     sibsp            0.000000
     parch            0.000000
     fare             0.000000
     embarked         0.002245
     class            0.000000
     who              0.000000
     adult_male       0.000000
     deck             0.772166
     embark_town      0.002245
     alive            0.000000
     alone            0.000000
     dtype: float64
                                            2
[ ]: df.shape[0]  # number of rows
[ ]: 891
[ ]: df.shape[0]  # evaluated but not echoed: only the last expression of a cell is displayed
     df.shape[1]  # number of columns (the value shown as the cell output)
[ ]: 15
[ ]: sns.heatmap(df.isnull(), cbar=False)  # plot the missing-value mask as a heatmap, without a colour bar
[ ]: <Axes: >
[ ]: sns.heatmap(df.isnull(), cbar=True)  # same heatmap, this time with the colour bar shown
[ ]: <Axes: >
                                            3
[ ]: df[['sex', 'age', 'class']]  # select a subset of columns by passing a list of names
[ ]:            sex   age   class
       0       male 22.0    Third
       1     female 38.0    First
       2     female 26.0    Third
       3     female 35.0    First
       4       male 35.0    Third
       ..       …   …     …
       886     male 27.0 Second
       887   female 19.0    First
       888   female   NaN   Third
       889     male 26.0    First
       890     male 32.0    Third
       [891 rows x 3 columns]
                                    4
[ ]: print(df['sex'].unique())
     df.sex.nunique()
    ['male' 'female']
[ ]: 2
[ ]: df.nunique()  # count of distinct non-null values in each column
[ ]: survived         2
     pclass           3
     sex              2
     age             88
     sibsp            7
     parch            7
     fare           248
     embarked         3
     class            3
     who              3
     adult_male       2
     deck             7
     embark_town      3
     alive            2
     alone            2
     dtype: int64
[ ]: df.columns  # column labels of the frame
[ ]: Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
            'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
            'alive', 'alone'],
           dtype='object')
[ ]: df['embark_town'].unique()  # distinct values of the column, NaN included
[ ]: array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)
[ ]: df['embark_town'].value_counts()  # rows per town; missing values are not counted
[ ]: embark_town
     Southampton    644
     Cherbourg      168
     Queenstown      77
     Name: count, dtype: int64
[ ]: df.groupby('sex')['fare'].mean()  # average fare for each sex
[ ]: sex
     female    44.479818
     male      25.523893
     Name: fare, dtype: float64
[ ]: df.groupby(['class', 'sex'])['fare'].mean()
    C:\Users\Lala Abu\AppData\Local\Temp\ipykernel_9808\299023266.py:1:
    FutureWarning: The default of observed=False is deprecated and will be changed
    to True in a future version of pandas. Pass observed=False to retain current
    behavior or observed=True to adopt the future default and silence this warning.
      df.groupby(['class', 'sex'])['fare'].mean()
[ ]: class   sex
     First   female    106.125798
             male       67.226127
     Second female      21.970121
             male       19.741782
     Third   female     16.118810
             male       12.661633
     Name: fare, dtype: float64
[ ]: df.groupby(['survived', 'sex', 'who'])['fare'].mean()  # average fare for each (survived, sex, who) combination
[ ]: survived   sex    who
     0          female child      32.076113
                       woman      20.967174
               male    child      33.073905
                       man        21.490736
     1         female child       30.887800
                       woman      54.813801
               male    child      35.562700
                       man        42.076422
     Name: fare, dtype: float64
[ ]: df.groupby(['survived', 'who']).size()  # number of rows in each (survived, who) group
[ ]: survived  who
     0         child      34
               man       449
               woman      66
     1         child      49
               man        88
               woman     205
     dtype: int64
[ ]: df.drop('deck', axis=1, inplace=True)
                                              6
[ ]: print (df.age.median())
     print (df.age.mean())
       28.0
       29.69911764705882
[ ]: df['age'].fillna(df['age'].mean(), inplace=True)
[ ]: print (df.age.median())
     print (df.age.mean())
       29.69911764705882
       29.69911764705882
[ ]: