Task 1:
import pandas
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
# Task 1: integer-encode every categorical column of agaricus-lepiota.csv,
# fit a decision tree on the encoded features and plot the fitted tree.
df = pandas.read_csv("F:\\4th sem\\Data\\agaricus-lepiota.csv")

# Integer code for each category letter, keyed by column name.
# The insertion order of the feature columns is the column order handed to
# the classifier; 'classes' (the target) is deliberately kept last.
# Series.map leaves NaN for any letter that is missing from its table.
encodings = {
    'cap-shape': {'b': 0, 'c': 1, 'x': 2, 'f': 3, 'k': 4, 's': 5},
    'cap-surface': {'f': 0, 'g': 1, 'y': 2, 's': 3},
    'cap-color': {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'r': 4, 'p': 5, 'u': 6,
                  'e': 7, 'w': 8, 'y': 9},
    'bruises': {'t': 0, 'f': 1},
    'odor': {'a': 0, 'l': 1, 'c': 2, 'y': 3, 'f': 4, 'm': 5, 'n': 6, 'p': 7,
             's': 8},
    'gill-attachment': {'a': 0, 'd': 1, 'f': 2, 'n': 3},
    'gill-spacing': {'c': 0, 'w': 1, 'd': 2},
    'gill-size': {'b': 0, 'n': 1},
    'gill-color': {'k': 0, 'n': 1, 'b': 2, 'h': 3, 'g': 4, 'r': 5, 'o': 6,
                   'p': 7, 'u': 8, 'e': 9, 'w': 10, 'y': 11},
    'stalk-shape': {'e': 0, 't': 1},
    'stalk-root': {'b': 0, 'c': 1, 'u': 2, 'e': 3, 'z': 4, 'r': 5, '?': 6},
    'stalk-surface-above-ring': {'f': 0, 'y': 1, 'k': 2, 's': 3},
    'stalk-surface-below-ring': {'f': 0, 'y': 1, 'k': 2, 's': 3},
    'stalk-color-above-ring': {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'o': 4,
                               'p': 5, 'e': 6, 'w': 7, 'y': 8},
    'stalk-color-below-ring': {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'o': 4,
                               'p': 5, 'e': 6, 'w': 7, 'y': 8},
    'veil-type': {'p': 0, 'u': 1},
    'veil-color': {'n': 0, 'o': 1, 'w': 2, 'y': 3},
    'ring-number': {'n': 0, 'o': 1, 't': 2},
    'ring-type': {'c': 0, 'e': 1, 'f': 2, 'l': 3, 'n': 4, 'p': 5, 's': 6,
                  'z': 7},
    'spore-print-color': {'k': 0, 'n': 1, 'b': 2, 'h': 3, 'r': 4, 'o': 5,
                          'u': 6, 'w': 7, 'y': 8},
    'population': {'a': 0, 'c': 1, 'n': 2, 's': 3, 'v': 4, 'y': 5},
    'habitat': {'g': 0, 'l': 1, 'm': 2, 'p': 3, 'u': 4, 'w': 5, 'd': 6},
    'classes': {'e': 0, 'p': 1},
}
for column, codes in encodings.items():
    df[column] = df[column].map(codes)

# Every encoded column except the target is a model input.
features = [column for column in encodings if column != 'classes']
X = df[features]
y = df['classes']

clf = DecisionTreeClassifier()
clf = clf.fit(X, y)

# Draw the fitted tree on a 10x5 inch figure and display it.
fig, ax = plt.subplots(figsize=(10, 5))
tree.plot_tree(clf, ax=ax)
plt.show()
Output:
Task 2:
import pandas
import numpy as np
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in *target_col*.

    The entropy is ``-sum(p * log2(p))`` where ``p`` runs over the relative
    frequencies of the distinct values found by ``np.unique``.

    Args:
        target_col: Array-like of labels (anything ``np.unique`` accepts).

    Returns:
        The entropy as a non-negative NumPy float. A column with a single
        distinct value, or an empty column, yields ``0.0``.
    """
    _, counts = np.unique(target_col, return_counts=True)
    probabilities = counts / counts.sum()
    # Negating a zero sum gives -0.0; adding 0.0 normalises that to +0.0 so
    # a pure column does not print as "-0.0".
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0
def info_gain(data, split_attribute_name, target_name):
    """Return the information gain of splitting *data* on one attribute.

    Computed as the entropy of the target column minus the weighted average
    of the target entropies within each distinct value of the split
    attribute, each weighted by that value's relative frequency.

    Args:
        data: pandas DataFrame holding both columns.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the target (label) column.

    Returns:
        The information gain in bits, as a NumPy float.
    """
    total_entropy = entropy(data[target_name])

    split_col = data[split_attribute_name]
    vals, counts = np.unique(split_col, return_counts=True)
    weights = counts / counts.sum()

    # Select only the target values of the matching rows. The previous
    # ``data.where(...).dropna()`` also discarded matching rows that had a
    # NaN in any unrelated column, which distorted the per-value entropies.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[split_col == val, target_name])
        for val, weight in zip(vals, weights)
    ])

    return total_entropy - weighted_entropy
# Task 2: integer-encode every categorical column of agaricus-lepiota.csv
# and report the information gain of each feature with respect to 'classes'.
df = pandas.read_csv("F:\\4th sem\\Data\\agaricus-lepiota.csv")

# Integer code for each category letter, keyed by column name.
# Series.map leaves NaN for any letter that is missing from its table.
encodings = {
    'cap-shape': {'b': 0, 'c': 1, 'x': 2, 'f': 3, 'k': 4, 's': 5},
    'cap-surface': {'f': 0, 'g': 1, 'y': 2, 's': 3},
    'cap-color': {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'r': 4, 'p': 5, 'u': 6,
                  'e': 7, 'w': 8, 'y': 9},
    'bruises': {'t': 0, 'f': 1},
    'odor': {'a': 0, 'l': 1, 'c': 2, 'y': 3, 'f': 4, 'm': 5, 'n': 6, 'p': 7,
             's': 8},
    'gill-attachment': {'a': 0, 'd': 1, 'f': 2, 'n': 3},
    'gill-spacing': {'c': 0, 'w': 1, 'd': 2},
    'gill-size': {'b': 0, 'n': 1},
    'gill-color': {'k': 0, 'n': 1, 'b': 2, 'h': 3, 'g': 4, 'r': 5, 'o': 6,
                   'p': 7, 'u': 8, 'e': 9, 'w': 10, 'y': 11},
    'stalk-shape': {'e': 0, 't': 1},
    'stalk-root': {'b': 0, 'c': 1, 'u': 2, 'e': 3, 'z': 4, 'r': 5, '?': 6},
    'stalk-surface-above-ring': {'f': 0, 'y': 1, 'k': 2, 's': 3},
    'stalk-surface-below-ring': {'f': 0, 'y': 1, 'k': 2, 's': 3},
    'stalk-color-above-ring': {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'o': 4,
                               'p': 5, 'e': 6, 'w': 7, 'y': 8},
    'stalk-color-below-ring': {'n': 0, 'b': 1, 'c': 2, 'g': 3, 'o': 4,
                               'p': 5, 'e': 6, 'w': 7, 'y': 8},
    'veil-type': {'p': 0, 'u': 1},
    'veil-color': {'n': 0, 'o': 1, 'w': 2, 'y': 3},
    'ring-number': {'n': 0, 'o': 1, 't': 2},
    'ring-type': {'c': 0, 'e': 1, 'f': 2, 'l': 3, 'n': 4, 'p': 5, 's': 6,
                  'z': 7},
    'spore-print-color': {'k': 0, 'n': 1, 'b': 2, 'h': 3, 'r': 4, 'o': 5,
                          'u': 6, 'w': 7, 'y': 8},
    'population': {'a': 0, 'c': 1, 'n': 2, 's': 3, 'v': 4, 'y': 5},
    'habitat': {'g': 0, 'l': 1, 'm': 2, 'p': 3, 'u': 4, 'w': 5, 'd': 6},
    'classes': {'e': 0, 'p': 1},
}
for column, codes in encodings.items():
    df[column] = df[column].map(codes)

# Names of the encoded feature columns (every encoded column but the target).
features = [column for column in encodings if column != 'classes']

# The loop below walks the frame's own columns, so it covers every column
# in the CSV other than the target, in file order.
X = df.drop('classes', axis=1)
y = df['classes']

info_gain_list = []
# Loop through each feature and calculate the information gain
for feature in X.columns:
    info_gain_val = info_gain(df, feature, 'classes')
    info_gain_list.append(info_gain_val)
    print("Information Gain for", feature, "=", info_gain_val)
print("Information Gain List:", info_gain_list)
Output:
Information Gain for cap-shape = 0.048796701935373
Information Gain for cap-surface = 0.028590232773772706
Information Gain for cap-color = 0.0360492829762038
Information Gain for bruises = 0.19237948576121955
Information Gain for odor = 0.9060749773839998
Information Gain for gill-attachment = 0.014165027250616191
Information Gain for gill-spacing = 0.10088318399657026
Information Gain for gill-size = 0.23015437514804604
Information Gain for gill-color = 0.41697752341613137
Information Gain for stalk-shape = 0.00751677256966421
Information Gain for stalk-root = 0.1348176376272756
Information Gain for stalk-surface-above-ring = 0.2847255992184844
Information Gain for stalk-surface-below-ring = 0.2718944733927463
Information Gain for stalk-color-above-ring = 0.2538451734622398
Information Gain for stalk-color-below-ring = 0.24141556652756646
Information Gain for veil-type = 0.0
Information Gain for veil-color = 0.02381701612091669
Information Gain for ring-number = 0.03845266924309043
Information Gain for ring-type = 0.3180215107935376
Information Gain for spore-print-color = 0.4807049176849153
Information Gain for population = 0.2019580190668523
Information Gain for habitat = 0.156833604605092
Information Gain List: [0.048796701935373, 0.028590232773772706,
0.0360492829762038, 0.19237948576121955, 0.9060749773839998,
0.014165027250616191, 0.10088318399657026, 0.23015437514804604,
0.41697752341613137, 0.00751677256966421, 0.1348176376272756,
0.2847255992184844, 0.2718944733927463, 0.2538451734622398,
0.24141556652756646, 0.0, 0.02381701612091669, 0.03845266924309043,
0.3180215107935376, 0.4807049176849153, 0.2019580190668523,
0.156833604605092]