FACULTY INITIATIVE – SLOT-3
BARATH P
                                                            II YEAR CSE
                      CANCER DISEASE CLASSIFICATION
Program and Output:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv('/content/data.csv')

# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# astype(int) makes the label dtype explicit instead of relying on replace()
# to infer it, and fails loudly here if a missing/unexpected label remains.
df['diagnosis'] = df['diagnosis'].replace({'M': 1, 'B': 0}).astype(int)

# Quick inspection. The bare expressions only render when evaluated as the
# last statement of a notebook cell; they have no effect in a plain script.
df.head()
print(f"Cancer data set dimensions : {df.shape}")
df.isnull().sum()
df.describe()

# Select features and target by column NAME rather than by position.
# Positional slices (iloc[:, 1:31] / iloc[:, 31]) silently depend on the
# column order of the CSV: if 'diagnosis' is the second column, the label
# leaks into X and a feature column is used as Y.
# Columns that are entirely empty are excluded because they carry no
# information and the estimators reject NaN input.
# NOTE(review): assumes every remaining column is a numeric feature - confirm
# against the actual CSV header.
feature_columns = (
    df.drop(columns=['id', 'diagnosis'])
    .dropna(axis=1, how='all')
    .columns
)
X = df[feature_columns].values
Y = df['diagnosis'].values

# Splitting the dataset into the Training set and Test set
# (75% / 25%, seeded so the split is reproducible).
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)
# Bar chart of how many samples fall into each diagnosis class.
sns.countplot(x='diagnosis', data=df)
plt.show()

# Count each class by its label (0 = benign, 1 = malignant, as encoded when
# the data was loaded). Unpacking value_counts() positionally would depend on
# which class happens to be more frequent and would raise if only one class
# were present.
B = (df['diagnosis'] == 0).sum()
M = (df['diagnosis'] == 1).sum()
print('Number of Benign: ',B)
print('Number of Malignant : ',M)

# Pairwise correlation of all columns; computed once and reused for the plot.
correlation = df.corr()
correlation  # notebook display only

# Visualize the correlation
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation, annot=True, fmt='.2f', ax=ax)
# Without an explicit show() the heatmap never appears when run as a script.
plt.show()
#for data scaling
from sklearn.preprocessing import StandardScaler
#for splitting dataset
from sklearn.model_selection import train_test_split
#for fitting SVM model
from sklearn.svm import SVC
#for displaying evaluation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Scale the data (Feature Scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training split
# only ...
X_train = sc.fit_transform(X_train)
# ... and apply those SAME statistics to the test split. Calling
# fit_transform here would re-fit the scaler on the test data, so the two
# splits would be standardized differently and test information would leak
# into preprocessing.
X_test = sc.transform(X_test)
#Create a function for models
def models(X_train, Y_train, random_state=0):
    """Fit three classifiers on the training data and report training accuracy.

    Args:
        X_train: Training feature matrix, passed unchanged to each
            estimator's ``fit``.
        Y_train: Training labels aligned with ``X_train``.
        random_state: Seed forwarded to every estimator so that repeated
            runs produce the same fitted models and the same printed
            scores. Defaults to 0.

    Returns:
        A tuple ``(log, tree, forest)`` holding the fitted logistic
        regression, decision tree and random forest, in that order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    # Logistic Regression
    log = LogisticRegression(random_state=random_state)
    log.fit(X_train, Y_train)

    # Decision Tree
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, Y_train)

    # Random Forest
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X_train, Y_train)

    # Print the model accuracy on the training data. These scores are
    # measured on the same rows the models were fitted on, so they are not an
    # estimate of performance on unseen data.
    print('[0]Logistic Regression Training Accuracy : ',log.score(X_train, Y_train))
    print('[1]Decision Tree Training Accuracy : ',tree.score(X_train, Y_train))
    print('[2]Random Forest Training Accuracy : ',forest.score(X_train, Y_train))
    return log, tree, forest
# Getting all the models: a tuple of fitted classifiers.
model = models(X_train, Y_train)

# Test model accuracy on test data using confusion matrix
from sklearn.metrics import confusion_matrix
for clf in model:
    print('Model :', clf)
    # labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns =
    # predicted class) even if one class is absent from this split.
    cm = confusion_matrix(Y_test, clf.predict(X_test), labels=[0, 1])
    # With class 1 (malignant) as the positive class, the flattened matrix
    # reads: true negatives, false positives, false negatives, true positives.
    TN, FP, FN, TP = cm.ravel()
    print(cm)
    print('Testing Accuracy =', (TP + TN) / (TP + FP + FN + TN))
    print()
# Run the fitted logistic-regression model (model[0]) over every row of the
# data file. This is the same file the training split was drawn from, so the
# predictions below are not an independent test of the model.
test_df = pd.read_csv('/content/data.csv')

# Keep only feature columns: drop the identifier and the label, plus any
# column that is entirely empty (it carries no information and the model
# rejects NaN input).
test = test_df.drop(['id', 'diagnosis'], axis=1).dropna(axis=1, how='all')
test.head()  # notebook display only

# The models were fitted on standardized arrays, so the raw features must go
# through the already-fitted scaler (transform, not fit_transform) before
# predict; feeding unscaled values gives meaningless predictions.
# NOTE(review): assumes these columns match, in number and order, the
# features the scaler was fitted on - confirm against the training code.
y_pred = pd.DataFrame(
    model[0].predict(sc.transform(test.values)),
    columns=['diagnosis'],
)
final_df = pd.DataFrame({'Id': test_df['id'], 'diagnosis': y_pred['diagnosis']})
final_df  # notebook display only