In [49]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_graphviz
from IPython.display import Image
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("/home/iselab-17/Desktop/ML DATA SETS/Breast Cancer Dataset.csv")
pd.set_option('display.max_columns', None)
data.head()
Out[49]: id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se ...
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 ...
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 ...
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 ...
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 ...
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 ...
[5 rows × 32 columns]
In [50]: data.shape
Out[50]: (569, 32)
In [51]: data['diagnosis'].unique()
Out[51]: array(['M', 'B'], dtype=object)
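Before modeling, it helps to see how the two diagnosis classes are balanced; a minimal check that was not run above could be:
In [ ]: # class distribution; WDBC is moderately imbalanced (357 benign vs. 212 malignant)
data['diagnosis'].value_counts()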
In [52]: df = data.drop(['id'], axis=1)
In [53]: df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
In [54]: X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
In [55]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test
Out[55]: radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se ...
204 12.47 18.60 81.09 481.9 0.09965 0.10580 0.08005 0.03821 0.1925 0.06373 0.3961 1.0440 2.497 30.29 0.006953 0.01911 0.02701 ...
70 18.94 21.31 123.60 1130.0 0.09009 0.10290 0.10800 0.07951 0.1582 0.05461 0.7888 0.7975 5.486 96.05 0.004444 0.01652 0.02269 ...
131 15.46 19.48 101.70 748.9 0.10920 0.12230 0.14660 0.08087 0.1931 0.05796 0.4743 0.7859 3.094 48.31 0.006240 0.01484 0.02813 ...
431 12.40 17.68 81.47 467.8 0.10540 0.13160 0.07741 0.02799 0.1811 0.07102 0.1767 1.4600 2.204 15.43 0.010000 0.03295 0.04861 ...
540 11.54 14.44 74.65 402.9 0.09984 0.11200 0.06737 0.02594 0.1818 0.06782 0.2784 1.7680 1.628 20.86 0.012150 0.04112 0.05553 ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
486 14.64 16.85 94.21 666.0 0.08641 0.06698 0.05192 0.02791 0.1409 0.05355 0.2204 1.0060 1.471 19.98 0.003535 0.01393 0.01800 ...
75 16.07 19.65 104.10 817.7 0.09168 0.08424 0.09769 0.06638 0.1798 0.05391 0.7474 1.0160 5.029 79.25 0.010820 0.02203 0.03500 ...
249 11.52 14.93 73.87 406.3 0.10130 0.07808 0.04328 0.02929 0.1883 0.06168 0.2562 1.0380 1.686 18.62 0.006662 0.01228 0.02105 ...
238 14.22 27.85 92.55 623.9 0.08223 0.10390 0.11030 0.04408 0.1342 0.06129 0.3354 2.3240 2.105 29.96 0.006307 0.02845 0.03850 ...
265 20.73 31.12 135.70 1419.0 0.09469 0.11430 0.13670 0.08646 0.1769 0.05674 1.1720 1.6170 7.749 199.70 0.004551 0.01478 0.02143 ...
114 rows × 30 columns
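Because the classes are imbalanced, a stratified split keeps the benign/malignant ratio the same in both partitions; a sketch of that variant of the split above:
In [ ]: # stratify=y preserves the class proportions in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)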
In [56]: model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train, y_train)
model
Out[56]: DecisionTreeClassifier(criterion='entropy')
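An unconstrained tree grows until every leaf is pure and can overfit the training set; a sketch of taming it with a depth cap and 5-fold cross-validation (max_depth=4 is an illustrative value, not tuned here):
In [ ]: from sklearn.model_selection import cross_val_score
# limit depth so the tree cannot memorize the training data
pruned = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)
scores = cross_val_score(pruned, X_train, y_train, cv=5)
print(scores.mean(), scores.std())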
In [57]: import math

def entropy(column):
    # H(X) = -sum(p * log2(p)) over the value frequencies of the column
    counts = column.value_counts()
    probabilities = counts / len(column)
    return sum(-p * math.log2(p) for p in probabilities)

def conditional_entropy(data, X, target):
    # weighted average of the target's entropy within each value of the feature
    feature_values = data[X].unique()
    weighted_entropy = 0
    for value in feature_values:
        subset = data[data[X] == value]
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])
    return weighted_entropy

def information_gain(data, X, target):
    # IG(target; feature) = H(target) - H(target | feature)
    total_entropy = entropy(data[target])
    feature_conditional_entropy = conditional_entropy(data, X, target)
    return total_entropy - feature_conditional_entropy

# Note: this treats each continuous measurement as categorical. Most values
# occur only once, so the conditional entropy collapses toward zero and the
# gains below crowd the H(diagnosis) ceiling; sklearn's tree instead searches
# for binary thresholds on continuous features.
for feature in X:  # iterating a DataFrame yields its column names
    ig = information_gain(df, feature, 'diagnosis')
    print(f"Information Gain[{feature}] : {ig}")
Information Gain[radius_mean] : 0.8607815854835991
Information Gain[texture_mean] : 0.8357118798482908
Information Gain[perimeter_mean] : 0.9267038614138748
Information Gain[area_mean] : 0.9280305529818247
Information Gain[smoothness_mean] : 0.7761788341876101
Information Gain[compactness_mean] : 0.9091291689709926
Information Gain[concavity_mean] : 0.9350604299589776
Information Gain[concave_points_mean] : 0.9420903069361305
Information Gain[symmetry_mean] : 0.735036638169654
Information Gain[fractal_dimension_mean] : 0.8361770160635639
Information Gain[radius_se] : 0.9337337383910278
Information Gain[texture_se] : 0.8642965239721755
Information Gain[perimeter_se] : 0.9315454914704012
Information Gain[area_se] : 0.925377169845925
Information Gain[smoothness_se] : 0.9350604299589776
Information Gain[compactness_se] : 0.9231889229252984
Information Gain[concavity_se] : 0.9280305529818247
Information Gain[concave_points_se] : 0.8585933385629725
Information Gain[symmetry_se] : 0.8181371874054084
Information Gain[fractal_dimension_se] : 0.9174857375160954
Information Gain[radius_worst] : 0.9003074642106167
Information Gain[texture_worst] : 0.8634349686194988
Information Gain[perimeter_worst] : 0.8985843535052632
Information Gain[area_worst] : 0.9350604299589776
Information Gain[smoothness_worst] : 0.7197189097252679
Information Gain[compactness_worst] : 0.9183472928687721
Information Gain[concavity_worst] : 0.9302187999024514
Information Gain[concave_points_worst] : 0.9148323543801957
Information Gain[symmetry_worst] : 0.8453951399613433
Information Gain[fractal_dimension_worst] : 0.8915544765281104
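For context, the gains above are bounded by the entropy of the target itself; with continuous features treated as categorical, most gains sit just under that ceiling. A quick check of the bound:
In [ ]: # upper bound on information gain: H(diagnosis) ≈ 0.953 bits for the 357/212 split
entropy(df['diagnosis'])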
In [58]: # plot_tree and plt were already imported in In [49]
plt.figure(figsize=(20, 8))
plot_tree(model, feature_names=X.columns, filled=True, rounded=True, fontsize=8)
plt.show()
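export_graphviz and Image are imported at the top but never used; for a higher-resolution rendering, the Graphviz route would look roughly like this (it assumes the Graphviz `dot` binary is installed on the machine):
In [ ]: export_graphviz(model, out_file='tree.dot', feature_names=X.columns,
                class_names=['Benign', 'Malignant'], filled=True, rounded=True)
!dot -Tpng tree.dot -o tree.png   # shell call; requires Graphviz on the PATH
Image('tree.png')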
In [59]: y_pred = model.predict(X_test)
print(y_pred)
[0 1 1 0 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1
0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1
0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1
0 0 1]
In [60]: accuracy = accuracy_score(y_test, y_pred) * 100
classification_rep = classification_report(y_test, y_pred)
print("Accuracy:\n", accuracy)
print("Classification Report:\n", classification_rep)
Accuracy:
94.73684210526315
Classification Report:
precision recall f1-score support
0 0.95 0.97 0.96 71
1 0.95 0.91 0.93 43
accuracy 0.95 114
macro avg 0.95 0.94 0.94 114
weighted avg 0.95 0.95 0.95 114
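confusion_matrix and seaborn are also imported at the top but never used; a sketch that closes that loop with a heatmap of the test-set errors:
In [ ]: cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()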
In [61]: # wrap the sample in a DataFrame with the training columns so the
# feature order is explicit (and sklearn's feature-name check is satisfied)
new = pd.DataFrame([[12.5, 19.2, 80.0, 500.0, 0.035, 0.1, 0.05, 0.02, 0.17, 0.06, 0.4, 1.0, 2.5, 40.0, 0.006, 0.02, 0.03, 0.01, 0.02, 0.003,
                     16.0, 25.0, 105.0, 900.0, 0.13, 0.25, 0.28, 0.12, 0.29, 0.08]], columns=X.columns)
y_pred = model.predict(new)
if y_pred[0] == 0:
    print("Prediction: Benign")
else:
    print("Prediction: Malignant")
Prediction: Benign
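For a measure of confidence alongside the label, predict_proba reports the class fractions in the leaf the sample lands in (for a fully grown tree these are often hard 0/1 values):
In [ ]: # column 0 = P(benign), column 1 = P(malignant)
model.predict_proba(new)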