9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
Decision Tree and EDA with functions
        In [51]: import numpy as np
                 import pandas as pd
        In [52]: datafile = "D:\komal\SIMPLILEARN\MY COURSES\IN PROGRESS\MACHINE LEARNING RECOR
                 DINGS\Jul 28 Sat - Aug 25 Sat\Drive downloads\Machine Learning _ Jul 28 - Aug
                  25 _ Sayan\Decision Trees/titanicdata.htm"
        In [53]: #BeautifulSoup is the library used for web scrapping
                    from bs4 import BeautifulSoup
                    with open(datafile,"r",encoding="Latin-1") as f:
                        soup = BeautifulSoup(f,"html.parser")
        In [54]: table = soup.find('table')
        In [55]: import pandas as pd
                 data = data = pd.read_html(str(table).encode('ascii', errors='replace'), flavo
                 r='bs4')[0]
        In [56]: data.head()
        Out[56]:
                                                                                                         Boat Unnamed:
                              Name Age Class/Dept             Ticket          Joined             Job
                                                                                                       [Body]        7
                      AB??-AL-
                      MUN??, Mr             3rd Class      2699?18
                    0                 27                               Cherbourg           ?           15?   NaN
                      N??s??f               Passenger      15s 9d
                      Q??sim
                        ABBING, Mr          3rd Class      5547?7                          Blacksmith
                    1              42                                  Southampton                    ??     NaN
                        Anthony             Passenger      11s                             ?
                      ABBOTT,
                                            3rd Class      CA2673?
                    2 Mrs Rhoda       39                           Southampton ?                       A?    NaN
                                            Passenger      20 5s
                      Mary 'Rosa'
                      ABBOTT, Mr
                                            3rd Class      CA2673?
                    3 Rossmore   16                                Southampton Jeweller ? ?[190]             NaN
                                            Passenger      20 5s
                      Edward
                      ABBOTT, Mr
                                            3rd Class      CA2673?
                    4 Eugene     13                                Southampton Scholar ?               ??    NaN
                                            Passenger      20 5s
                      Joseph
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 1/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [57]: def cleanup(value):
                     return value.replace("?"," ")
        In [58]: data['Name'] = data['Name'].apply(cleanup)
                 data['Boat [Body]'] = data['Boat [Body]'].apply(cleanup)
                    data['Age'] = data['Age'].apply(pd.to_numeric,errors='coerce')
                    data.head()
        Out[58]:
                                                                                                          Boat Unnamed:
                              Name Age Class/Dept             Ticket          Joined             Job
                                                                                                        [Body]        7
                      AB -AL-
                                             3rd Class     2699?18
                    0 MUN , Mr N      27.0                             Cherbourg           ?            15    NaN
                                             Passenger     15s 9d
                      s f Q sim
                        ABBING, Mr      3rd Class          5547?7                          Blacksmith
                    1              42.0                                Southampton                            NaN
                        Anthony         Passenger          11s                             ?
                      ABBOTT,
                                             3rd Class     CA2673?
                    2 Mrs Rhoda       39.0                         Southampton ?                        A     NaN
                                             Passenger     20 5s
                      Mary 'Rosa'
                      ABBOTT, Mr
                                      3rd Class            CA2673?
                    3 Rossmore   16.0                              Southampton Jeweller ? [190]               NaN
                                      Passenger            20 5s
                      Edward
                      ABBOTT, Mr
                                      3rd Class            CA2673?
                    4 Eugene     13.0                              Southampton Scholar ?                      NaN
                                      Passenger            20 5s
                      Joseph
        In [59]: data = data[["Name","Age","Class/Dept","Boat [Body]"]]
                 data.head()
        Out[59]:
                                                    Name Age                 Class/Dept Boat [Body]
                    0 AB -AL-MUN , Mr N s f Q sim           27.0 3rd Class Passenger 15
                    1 ABBING, Mr Anthony                    42.0 3rd Class Passenger
                    2 ABBOTT, Mrs Rhoda Mary 'Rosa' 39.0 3rd Class Passenger A
                    3 ABBOTT, Mr Rossmore Edward            16.0 3rd Class Passenger [190]
                    4 ABBOTT, Mr Eugene Joseph              13.0 3rd Class Passenger
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 2/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [60]: def checkPass(class_type):
                     if "Passenger" in class_type:
                         return "Passenger"
                     else:
                         return "Crew"
                    data["Crew/Pass"]=data["Class/Dept"].apply(checkPass)
                    data.head()
        Out[60]:
                                                    Name Age                 Class/Dept Boat [Body] Crew/Pass
                    0 AB -AL-MUN , Mr N s f Q sim           27.0 3rd Class Passenger 15                   Passenger
                    1 ABBING, Mr Anthony                    42.0 3rd Class Passenger                      Passenger
                    2 ABBOTT, Mrs Rhoda Mary 'Rosa' 39.0 3rd Class Passenger A                            Passenger
                    3 ABBOTT, Mr Rossmore Edward            16.0 3rd Class Passenger [190]                Passenger
                    4 ABBOTT, Mr Eugene Joseph              13.0 3rd Class Passenger                      Passenger
        In [61]: def class_person(class_type):
                     if "Passenger" in class_type:
                         return class_type.split(" ")[0]
                     else:
                         return 'crew'
                    data['Class'] = data['Class/Dept'].apply(class_person)
                    data.head()
        Out[61]:
                                                                                                  Boat
                                                 Name Age                Class/Dept                    Crew/Pass Class
                                                                                                [Body]
                                                                3rd Class
                    0 AB -AL-MUN , Mr N s f Q sim        27.0                              15         Passenger 3rd
                                                                Passenger
                                                                3rd Class
                    1 ABBING, Mr Anthony                 42.0                                         Passenger 3rd
                                                                Passenger
                        ABBOTT, Mrs Rhoda Mary                  3rd Class
                    2                                    39.0                              A          Passenger 3rd
                        'Rosa'                                  Passenger
                        ABBOTT, Mr Rossmore                     3rd Class
                    3                                    16.0                              [190]      Passenger 3rd
                        Edward                                  Passenger
                                                                3rd Class
                    4 ABBOTT, Mr Eugene Joseph           13.0                                         Passenger 3rd
                                                                Passenger
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 3/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [62]: def child_class(value):
                     if value>=18:
                         return 'adult'
                     else:
                         return 'child'
                    data['Adult/Child'] = data['Age'].apply(child_class)
                    data.head()
        Out[62]:
                                                                                  Boat
                                          Name Age          Class/Dept                 Crew/Pass Class Adult/Child
                                                                                [Body]
                        AB -AL-MUN , Mr N s f           3rd Class
                    0                            27.0                      15              Passenger 3rd   adult
                        Q sim                           Passenger
                                                        3rd Class
                    1 ABBING, Mr Anthony         42.0                                      Passenger 3rd   adult
                                                        Passenger
                        ABBOTT, Mrs Rhoda               3rd Class
                    2                            39.0                      A               Passenger 3rd   adult
                        Mary 'Rosa'                     Passenger
                        ABBOTT, Mr Rossmore      3rd Class
                    3                       16.0                           [190]           Passenger 3rd   child
                        Edward                   Passenger
                        ABBOTT, Mr Eugene               3rd Class
                    4                            13.0                                      Passenger 3rd   child
                        Joseph                          Passenger
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 4/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [63]: def gender_determiner(name):
                     firstname = name[name.index(",")+ 2:]
                     salutation = firstname.split(" ")[0]
                     if salutation in ['Mr','Master']:
                         return 'Male'
                     else:
                         return 'Female'
                    data['Gender'] = data['Name'].apply(gender_determiner)
                    data.head()
        Out[63]:
                                                                     Boat
                                   Name Age        Class/Dept             Crew/Pass Class Adult/Child Gender
                                                                   [Body]
                        AB -AL-MUN , Mr      3rd Class
                    0                   27.0                      15         Passenger 3rd       adult         Male
                        N s f Q sim          Passenger
                        ABBING, Mr                3rd Class
                    1                      42.0                              Passenger 3rd       adult         Male
                        Anthony                   Passenger
                      ABBOTT, Mrs
                                                  3rd Class
                    2 Rhoda Mary           39.0                   A          Passenger 3rd       adult         Female
                                                  Passenger
                      'Rosa'
                      ABBOTT, Mr
                                                  3rd Class
                    3 Rossmore             16.0                   [190]      Passenger 3rd       child         Male
                                                  Passenger
                      Edward
                        ABBOTT, Mr                3rd Class
                    4                      13.0                              Passenger 3rd       child         Male
                        Eugene Joseph             Passenger
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 5/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [65]: def checkSurvival(value):
                     if value.strip() == " " or "[" in value:
                         return 0
                     else:
                         return 1
                    data["Survival"]=data["Boat [Body]"].apply(checkSurvival)
                    data.head()
        Out[65]:
                                                          Boat
                           Name Age Class/Dept                 Crew/Pass Class Adult/Child Gender Survival
                                                        [Body]
                      AB -AL-
                      MUN , Mr      3rd Class
                    0          27.0                    15        Passenger 3rd             adult    Male      1
                      NsfQ          Passenger
                      sim
                      ABBING,
                                          3rd Class
                    1 Mr           42.0                          Passenger 3rd             adult    Male      1
                                          Passenger
                      Anthony
                      ABBOTT,
                      Mrs
                                          3rd Class
                    2 Rhoda        39.0                A         Passenger 3rd             adult    Female 1
                                          Passenger
                      Mary
                      'Rosa'
                      ABBOTT,
                      Mr            3rd Class
                    3          16.0                    [190]     Passenger 3rd             child    Male      0
                      Rossmore      Passenger
                      Edward
                      ABBOTT,
                      Mr                  3rd Class
                    4              13.0                          Passenger 3rd             child    Male      1
                      Eugene              Passenger
                      Joseph
        In [67]: data.groupby(['Crew/Pass'])['Survival'].sum()*100/data.groupby(['Crew/Pass'])[
                 'Survival'].count()
        Out[67]: Crew/Pass
                 Crew         90.217391
                 Passenger    90.310651
                 Name: Survival, dtype: float64
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 6/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [69]: def compare(group,data):
                     return data.groupby([group])['Survival'].sum()*100/data.groupby([group])[
                 'Survival'].count()
                    compare("Class",data)
        Out[69]: Class
                 1st     89.714286
                 2nd     88.395904
                 3rd     91.396333
                 crew    90.217391
                 Name: Survival, dtype: float64
        In [70]: compare("Gender",data)
        Out[70]: Gender
                 Female    95.840555
                 Male      88.557743
                 Name: Survival, dtype: float64
        In [71]: compare("Adult/Child",data)
        Out[71]: Adult/Child
                 adult    89.699955
                 child    95.964126
                 Name: Survival, dtype: float64
        In [72]: trainingData=data[["Age","Crew/Pass","Class","Adult/Child","Gender","Survival"
                 ]]
                 trainingData.head()
        Out[72]:
                       Age Crew/Pass Class Adult/Child Gender Survival
                    0 27.0 Passenger 3rd           adult         Male       1
                    1 42.0 Passenger 3rd           adult         Male       1
                    2 39.0 Passenger 3rd           adult         Female 1
                    3 16.0 Passenger 3rd           child         Male       0
                    4 13.0 Passenger 3rd           child         Male       1
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 7/9
9/20/2018                                              komal_DT1_EDAWithFunctions_Titanic
        In [73]: def catToNum(series):
                     series = series.astype('category')
                     return series.cat.codes
                    catData=trainingData[["Crew/Pass","Class","Adult/Child","Gender"]].apply(catTo
                    Num)
                    trainingData[["Crew/Pass","Class","Adult/Child","Gender"]]=catData
                    trainingData.head()
                   C:\Users\hariz\Anaconda3\lib\site-packages\pandas\core\frame.py:3137: Setting
                   WithCopyWarning:
                   A value is trying to be set on a copy of a slice from a DataFrame.
                   Try using .loc[row_indexer,col_indexer] = value instead
                   See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/st
                   able/indexing.html#indexing-view-versus-copy
                     self[k1] = value[k2]
        Out[73]:
                       Age Crew/Pass Class Adult/Child Gender Survival
                    0 27.0 1               2       0              1          1
                    1 42.0 1               2       0              1          1
                    2 39.0 1               2       0              0          1
                    3 16.0 1               2       1              1          0
                    4 13.0 1               2       1              1          1
        In [74]: len(trainingData)
        Out[74]: 2456
        In [75]: trainingData = trainingData.dropna()
                 len(trainingData)
        Out[75]: 2426
        In [76]: from sklearn.model_selection import train_test_split
                 train, test = train_test_split(trainingData, test_size = 0.2)
        In [77]: len(train)
        Out[77]: 1940
        In [78]: len(test)
        Out[78]: 486
        In [79]: from sklearn.tree import DecisionTreeClassifier
                 clf=DecisionTreeClassifier(max_leaf_nodes=25)
                 clf=clf.fit(train[["Age","Crew/Pass","Class","Adult/Child","Gender"]],train["S
                 urvival"])
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 8/9
9/20/2018                                             komal_DT1_EDAWithFunctions_Titanic
        In [81]: clf
        Out[81]: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                             max_features=None, max_leaf_nodes=25,
                             min_impurity_decrease=0.0, min_impurity_split=None,
                             min_samples_leaf=1, min_samples_split=2,
                             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
                             splitter='best')
        In [82]: clf.feature_importances_
        Out[82]: array([0.72325166, 0.03119177, 0.15634522, 0.                             , 0.08921135])
        In [83]: predictions = clf.predict(test[["Age","Crew/Pass","Class","Adult/Child","Gende
                 r"]])
        In [89]: from sklearn.metrics import accuracy_score
                 accuracy_score(test["Survival"], predictions)
        Out[89]: 0.8847736625514403
file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 9/9