Meaningful Predictive Modeling Week-4 Assignment
    CANCER DISEASE PREDICTION
    In [1]:
    #importing the libraries
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    In [16]:
    #importing our cancer dataset
    dataset = pd.read_csv('cancer.csv')
    X = dataset.iloc[:, 2:31].values   #columns 2..30 as features (this slice stops one column short of the last)
    Y = dataset.iloc[:, 1].values      #target: the diagnosis column (e.g. 'M' for malignant)
    In [17]:
    dataset.head()
    Out[17]:
              842302   M   17.99   10.38   122.8    1001     0.1184    0.2776    0.3001   0.1471    ...   25.38
     0        842517   M   20.57   17.77   132.90   1326.0   0.08474   0.07864   0.0869   0.07017   ...   24.99
     1      84300903   M   19.69   21.25   130.00   1203.0   0.10960   0.15990   0.1974   0.12790   ...   23.57
     2      84348301   M   11.42   20.38    77.58    386.1   0.14250   0.28390   0.2414   0.10520   ...   14.91
     3      84358402   M   20.29   14.34   135.10   1297.0   0.10030   0.13280   0.1980   0.10430   ...   22.54
     4        843786   M   12.45   15.70    82.57    477.1   0.12780   0.17000   0.1578   0.08089   ...   15.47
    5 rows × 32 columns
    In [18]:
    print("Cancer data set dimensions : {}".format(dataset.shape))
    Cancer data set dimensions : (568, 32)
    In [19]:
    #checking for missing values (isna is an alias of isnull; only the last result is displayed)
    dataset.isnull().sum()
    dataset.isna().sum()
    Out[19]:
    842302       0
    M            0
    17.99        0
    10.38        0
    122.8        0
    1001         0
    0.1184       0
    0.2776       0
    0.3001       0
    0.1471       0
    0.2419       0
    0.07871      0
    1.095        0
    0.9053       0
    8.589        0
    153.4        0
    0.006399     0
    0.04904      0
    0.05373      0
    0.01587      0
    0.03003      0
    0.006193     0
    25.38        0
    17.33        0
    184.6        0
    2019         0
    0.1622       0
    0.6656       0
    0.7119       0
    0.2654       0
    0.4601       0
    0.1189       0
    dtype: int64
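    The "column" names above are raw measurement values plus a diagnosis letter, which suggests cancer.csv has no header row and pandas has silently promoted the first patient record to the header (it also explains the numeric column names in dataset.head()). A minimal sketch of re-reading the file under that assumption, using header=None; the 2: slice also keeps the last feature column, which the 2:31 slice above drops:

    #hypothetical re-read, assuming cancer.csv ships without a header row
    dataset = pd.read_csv('cancer.csv', header=None)
    X = dataset.iloc[:, 2:].values   #all remaining columns as features
    Y = dataset.iloc[:, 1].values    #diagnosis column as the target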
    In [20]:
    #Encoding categorical data values
    from sklearn.preprocessing import LabelEncoder
    labelencoder_Y = LabelEncoder()
    Y = labelencoder_Y.fit_transform(Y)
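    As a quick sanity check (a minimal sketch, assuming the diagnosis labels are the usual 'B'/'M' strings), the fitted encoder exposes its mapping; LabelEncoder assigns codes in sorted order, so 'B' would become 0 and 'M' would become 1:

    #inspect the label mapping learned by the encoder
    print(labelencoder_Y.classes_)                              #e.g. ['B' 'M']
    print(labelencoder_Y.transform(labelencoder_Y.classes_))    #e.g. [0 1]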
    In [21]:
    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
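    Since the two diagnosis classes are typically not balanced, an optional variant (not what this notebook uses) is to stratify the split so both sets keep the same class ratio:

    #optional: stratified split preserving the class proportions
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size = 0.25, random_state = 0, stratify = Y)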
    In [22]:
    #Feature Scaling
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
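    Note that the scaler is fitted on the training set only and merely applied to the test set, which avoids leaking test-set statistics into training. A small illustrative check using only the variables above:

    #after scaling, the training features have roughly zero mean and unit variance;
    #the test set is transformed with the training statistics, so it is only approximately standardized
    print(X_train.mean(axis = 0).round(2))
    print(X_train.std(axis = 0).round(2))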
    In [24]:
    #Using Logistic Regression Algorithm to the Training Set
    from sklearn.linear_model import LogisticRegression
    classifier1 = LogisticRegression(random_state = 0)
    classifier1.fit(X_train, Y_train)
    #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm
    from sklearn.neighbors import KNeighborsClassifier
    classifier2 = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier2.fit(X_train, Y_train)
    #Using SVC method of svm class to use Support Vector Machine Algorithm
    from sklearn.svm import SVC
    classifier3 = SVC(kernel = 'linear', random_state = 0)
    classifier3.fit(X_train, Y_train)
    #Using SVC method of svm class to use Kernel SVM Algorithm
    from sklearn.svm import SVC
    classifier4 = SVC(kernel = 'rbf', random_state = 0)
    classifier4.fit(X_train, Y_train)
    #Using GaussianNB method of naive_bayes class to use Naive Bayes Algorithm
    from sklearn.naive_bayes import GaussianNB
    classifier5 = GaussianNB()
    classifier5.fit(X_train, Y_train)
    #Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm
    from sklearn.tree import DecisionTreeClassifier
    classifier6 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier6.fit(X_train, Y_train)
    #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
    from sklearn.ensemble import RandomForestClassifier
    classifier7 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier7.fit(X_train, Y_train)
    C:\Users\ROHINI\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432:
    FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver
    to silence this warning.
      FutureWarning)
    Out[24]:
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                           max_depth=None, max_features='auto', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_jobs=None, oob_score=False, random_state=0, verbose=0,
                           warm_start=False)
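    The FutureWarning above comes from relying on LogisticRegression's default solver. As the message says, it can be silenced by naming a solver explicitly (a minimal sketch using 'lbfgs', the future default mentioned in the warning):

    #specify the solver explicitly to silence the FutureWarning
    classifier1 = LogisticRegression(solver = 'lbfgs', random_state = 0)
    classifier1.fit(X_train, Y_train)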
    In [29]:
    Y_pred1 = classifier1.predict(X_test)
    Y_pred2 = classifier2.predict(X_test)
    Y_pred3 = classifier3.predict(X_test)
    Y_pred4 = classifier4.predict(X_test)
    Y_pred5 = classifier5.predict(X_test)
    Y_pred6 = classifier6.predict(X_test)
    Y_pred7 = classifier7.predict(X_test)
    In [30]:
    from sklearn.metrics import confusion_matrix
    cm1 = confusion_matrix(Y_test, Y_pred1)
    cm2 = confusion_matrix(Y_test, Y_pred2)
    cm3 = confusion_matrix(Y_test, Y_pred3)
    cm4 = confusion_matrix(Y_test, Y_pred4)
    cm5 = confusion_matrix(Y_test, Y_pred5)
    cm6 = confusion_matrix(Y_test, Y_pred6)
    cm7 = confusion_matrix(Y_test, Y_pred7)
    print(cm1)
    print(cm2)
    print(cm3)
    print(cm4)
    print(cm5)
    print(cm6)
    print(cm7)
    [[91  1]
     [ 2 48]]
    [[91  1]
     [ 6 44]]
    [[90  2]
     [ 4 46]]
    [[92  0]
     [ 6 44]]
    [[89  3]
     [ 6 44]]
    [[84  8]
     [ 6 44]]
    [[89  3]
     [ 6 44]]
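    Accuracy alone hides the cost of false negatives, which matter most in a cancer screening setting. A minimal sketch that derives sensitivity (recall on the malignant class) and precision from each confusion matrix, assuming label 1 is the malignant class as produced by the LabelEncoder above:

    #sklearn's confusion_matrix lays out a 2x2 matrix as [[tn, fp], [fn, tp]]
    def summarize(cm):
        tn, fp, fn, tp = cm.ravel()
        return tp / (tp + fn), tp / (tp + fp)   #sensitivity, precision

    for name, cm in [('LogR', cm1), ('KNN', cm2), ('SVM', cm3), ('K-SVM', cm4),
                     ('NB', cm5), ('DT', cm6), ('RF', cm7)]:
        sens, prec = summarize(cm)
        print(name, "sensitivity:", round(sens, 3), "precision:", round(prec, 3))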
    In [34]:
    from sklearn.metrics import accuracy_score
    acc1 = accuracy_score(Y_test, Y_pred1)*100
    acc2 = accuracy_score(Y_test, Y_pred2)*100
    acc3 = accuracy_score(Y_test, Y_pred3)*100
    acc4 = accuracy_score(Y_test, Y_pred4)*100
    acc5 = accuracy_score(Y_test, Y_pred5)*100
    acc6 = accuracy_score(Y_test, Y_pred6)*100
    acc7 = accuracy_score(Y_test, Y_pred7)*100
    print("LogR",acc1)
    print("KNN",acc2)
    print("SVM",acc3)
    print("K-SVM",acc4)
    print("NB",acc5)
    print("DT",acc6)
    print("RF",acc7)
    LogR 97.88732394366197
    KNN 95.07042253521126
    SVM 95.77464788732394
    K-SVM 95.77464788732394
    NB 93.66197183098592
    DT 90.14084507042254
    RF 93.66197183098592
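    These figures come from a single 75/25 hold-out split, so they are somewhat sensitive to the choice of random_state. An optional cross-validated comparison (a sketch, wrapping the scaler and one classifier in a pipeline so scaling is re-fitted inside each fold):

    #illustrative 10-fold cross-validation for one model
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import cross_val_score
    pipe = make_pipeline(StandardScaler(), LogisticRegression(solver = 'lbfgs', random_state = 0))
    scores = cross_val_score(pipe, X, Y, cv = 10)
    print("LogR CV accuracy:", round(scores.mean()*100, 2), "+/-", round(scores.std()*100, 2))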
    In [38]:
    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    import matplotlib.pyplot as plt
    data = [acc1, acc2, acc3, acc4,acc5,acc6,acc7]
    labels = ['LogR', 'KNN', 'SVM', 'KSVM', 'NB','DT','RF']
    plt.xticks(range(len(data)), labels)
    plt.xlabel('Algorithms')
    plt.ylabel('Accuracy(%)')
    plt.title('Comparison of Algorithms')
    plt.bar(range(len(data)), data, color=['pink', 'red', 'green', 'blue', 'cyan', 'yellow', 'purple'])
    plt.show()