In [72]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
In [74]: df = pd.read_csv('titanic.csv')  # raw passenger table; the path is relative to the working directory
df  # bare last expression so the notebook renders the frame
Out[74]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q
891 rows × 12 columns
In [76]: df.isna()  # element-wise missing-value mask (isna and isnull are the same method)
Out[76]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 False False False False False False False False False False True False
1 False False False False False False False False False False False False
2 False False False False False False False False False False True False
3 False False False False False False False False False False False False
4 False False False False False False False False False False True False
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False True False
887 False False False False False False False False False False False False
888 False False False False False True False False False False True False
889 False False False False False False False False False False False False
890 False False False False False False False False False False True False
891 rows × 12 columns
In [78]: df.isna().sum()  # number of missing values in each column of the raw frame
Out[78]: PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
In [80]: df1 = df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis='columns')
df1
Out[80]: Survived Pclass Sex Age Fare
0 0 3 male 22.0 7.2500
1 1 1 female 38.0 71.2833
2 1 3 female 26.0 7.9250
3 1 1 female 35.0 53.1000
4 0 3 male 35.0 8.0500
... ... ... ... ... ...
886 0 2 male 27.0 13.0000
887 1 1 female 19.0 30.0000
888 0 3 female NaN 23.4500
889 1 1 male 26.0 30.0000
890 0 3 male 32.0 7.7500
891 rows × 5 columns
In [82]: df1.isna().sum()  # missing values left after the column drop
Out[82]: Survived 0
Pclass 0
Sex 0
Age 177
Fare 0
dtype: int64
In [84]: df['Age'].mode()  # most frequent age; mode() returns a Series, here with a single entry
Out[84]: 0 24.0
Name: Age, dtype: float64
In [86]: df1.loc[:, 'Age'] = df1['Age'].fillna(df1['Age'].mode()[0])
In [88]: df1.isna().sum()  # confirm that no missing values remain
Out[88]: Survived 0
Pclass 0
Sex 0
Age 0
Fare 0
dtype: int64
In [96]: df1.drop(columns=['Survived'], inplace=True)
In [100… target = df['Survived']  # labels are taken from the original frame; df1 no longer carries this column
target
Out[100… 0 0
1 1
2 1
3 1
4 0
..
886 0
887 1
888 0
889 1
890 0
Name: Survived, Length: 891, dtype: int64
In [114… from sklearn.preprocessing import LabelEncoder
le_Sex = LabelEncoder()  # encoder used in the next cell to turn the Sex strings into integer codes
In [126… df1['Sex_n'] = le_Sex.fit_transform(df1['Sex'])  # integer-coded Sex; the displayed rows show female -> 0, male -> 1
In [128… df1  # NOTE(review): the recorded output includes an 'age_n' column that no visible cell creates
Out[128… Pclass Sex Age Fare age_n Sex_n
0 3 male 22.0 7.2500 28 1
1 1 female 38.0 71.2833 51 0
2 3 female 26.0 7.9250 34 0
3 1 female 35.0 53.1000 47 0
4 3 male 35.0 8.0500 47 1
... ... ... ... ... ... ...
886 2 male 27.0 13.0000 35 1
887 1 female 19.0 30.0000 24 0
888 3 female 24.0 23.4500 31 0
889 1 male 26.0 30.0000 34 1
890 3 male 32.0 7.7500 42 1
891 rows × 6 columns
In [132… df2 = df1.drop(['Sex','age_n'], axis='columns')
df2
Out[132… Pclass Age Fare Sex_n
0 3 22.0 7.2500 1
1 1 38.0 71.2833 0
2 3 26.0 7.9250 0
3 1 35.0 53.1000 0
4 3 35.0 8.0500 1
... ... ... ... ...
886 2 27.0 13.0000 1
887 1 19.0 30.0000 0
888 3 24.0 23.4500 0
889 1 26.0 30.0000 1
890 3 32.0 7.7500 1
891 rows × 4 columns
In [102… from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
In [134… X_train, X_test, y_train, y_test = train_test_split(df2, target, test_size=0.3)
In [136… X_train  # training features returned by the split
Out[136… Pclass Age Fare Sex_n
728 2 25.0 26.0000 1
229 3 24.0 25.4667 0
589 3 24.0 8.0500 1
37 3 21.0 8.0500 1
333 3 16.0 18.0000 1
... ... ... ... ...
359 3 24.0 7.8792 0
3 1 35.0 53.1000 0
180 3 24.0 69.5500 0
426 2 28.0 26.0000 0
756 3 28.0 7.7958 1
623 rows × 4 columns
In [138… y_train  # training labels; the displayed index matches the rows of X_train
Out[138… 728 0
229 0
589 0
37 0
333 0
..
359 1
3 1
180 0
426 1
756 0
Name: Survived, Length: 623, dtype: int64
In [140… classifier = DecisionTreeClassifier()
In [143… classifier.fit(X_train,y_train)
Out[143… ▾ DecisionTreeClassifier i ?
DecisionTreeClassifier()
In [145… y_pred = classifier.predict(X_test)  # predicted Survived label for each held-out row
In [147… y_pred  # raw 0/1 predictions, one per row of X_test
Out[147… array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 1], dtype=int64)
In [149… classifier.score(df2,target)  # NOTE(review): df2/target also contain the rows passed to fit(), so this is not a held-out accuracy
Out[149… 0.9315375982042648
In [151… classifier.score(X_test,y_pred)
Out[151… 1.0
In [155… classifier.predict([[3,25,28,0]])# pclass(1,2,3) ,age , fare , sex -- 0 for female & 1 -- for male
C:\Users\MyPc\anaconda3\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
warnings.warn(
Out[155… array([0], dtype=int64)
In [169… classifier.predict([[1,85,1200,1]])# pclass ,age , fare = money , sex -- 0 for female & 1 -- for male
C:\Users\MyPc\anaconda3\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
warnings.warn(
Out[169… array([1], dtype=int64)
In [179… classifier.predict([[2,38,71000,0]])# pclass ,age , fare , sex -- 0 for female & 1 -- for male
C:\Users\MyPc\anaconda3\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
warnings.warn(
Out[179… array([1], dtype=int64)
In [ ]: