import pandas as pd
import math
df = pd.read_csv('1d3.csv')
print("\n Input Data Set is:\n", df)
t = df.keys()[-1]
print('Target Attribute is: ', t)
attribute_names = list(df.keys())
attribute_names.remove(t)
print('Predicting Attributes:', attribute_names)
def entropy(probs):
    return sum([-prob * math.log(prob, 2) for prob in probs])
def entropy_of_list(ls, value):
    from collections import Counter
    cnt = Counter(x for x in ls)
    print('Target attribute class count(Yes/No)', dict(cnt))
    total_instances = len(ls)
    print("Total no of instances/records associated with {0} is: {1}".format(value, total_instances))
    probs = [x / total_instances for x in cnt.values()]
    return entropy(probs)
def information_gain(df, split_attribute, target_attribute, battr):
    print("\n\n-----Information Gain Calculation of ", split_attribute, "--------")
    df_split = df.groupby(split_attribute)  # group records by the values of the split attribute
    glist = []
    for gname, group in df_split:
        print('Grouped Attribute Values \n', group)
        glist.append(gname)
    glist.reverse()
    nobs = len(df.index)  # total number of records in this subset
    df_agg1 = df_split.agg({target_attribute: lambda x: entropy_of_list(x, glist.pop())})
    df_agg1.columns = ['Entropy']
    df_agg2 = df_split.agg({target_attribute: lambda x: len(x) / nobs})
    df_agg2.columns = ['Proportion']
    new_entropy = sum(df_agg1['Entropy'] * df_agg2['Proportion'])  # weighted entropy after the split
    if battr != 'S':
        old_entropy = entropy_of_list(df[target_attribute], 'S-' + df.iloc[0, df.columns.get_loc(battr)])
    else:
        old_entropy = entropy_of_list(df[target_attribute], battr)
    return old_entropy - new_entropy
def id3(df, target_attribute, attribute_names, default_class=None, default_attr='S'):
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute])  # class counts (e.g. Yes/No)
    if len(cnt) == 1:  # all records share one class
        return next(iter(cnt))
    elif df.empty or (not attribute_names):  # nothing left to split on
        return default_class
    else:
        default_class = max(cnt, key=cnt.get)  # majority class of this subset
        gainz = []
        for attr in attribute_names:
            ig = information_gain(df, attr, target_attribute, default_attr)
            gainz.append(ig)
            print("Information gain of ", attr, " is: ", ig)
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        print("\nAttribute with the maximum gain is: ", best_attr)
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset, target_attribute, remaining_attribute_names,
                          default_class, best_attr)
            tree[best_attr][attr_val] = subtree
        return tree
from pprint import pprint
tree = id3(df, t, attribute_names)
print("\nThe Resultant Decision Tree is:")
pprint(tree)
The code you've shared implements a decision tree classifier using the ID3 (Iterative Dichotomiser 3)
algorithm. Let's walk through the process and explain what the code is doing:
1. Reading the Dataset:
The dataset is read using pandas.read_csv():
df = pd.read_csv('1d3.csv')
print("\n Input Data Set is:\n", df)
This loads the data from the CSV file 1d3.csv into a DataFrame, and prints the dataset.
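The contents of 1d3.csv are not shown here. As a minimal sketch, assuming a PlayTennis-style dataset (the column names and rows below are illustrative, not the actual file), you could generate a compatible CSV like this:
import pandas as pd
# Hypothetical stand-in for 1d3.csv, modelled on the classic PlayTennis example
sample = pd.DataFrame({
    'Outlook':     ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain'],
    'Temperature': ['Hot',   'Hot',   'Hot',      'Mild', 'Cool'],
    'Humidity':    ['High',  'High',  'High',     'High', 'Normal'],
    'Wind':        ['Weak',  'Strong', 'Weak',    'Weak', 'Weak'],
    'PlayTennis':  ['No',    'No',    'Yes',      'Yes',  'Yes'],
})
sample.to_csv('1d3.csv', index=False)  # writes a file the script above can read
The last column is treated as the target, which matches how the script picks t in the next step.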
2. Extracting Target and Predicting Attributes:
The target attribute (i.e., the attribute to be predicted) is the last column of the dataset:
t = df.keys()[-1]
print('Target Attribute is: ', t)
The remaining columns are the attributes that are used to predict the target:
attribute_names = list(df.keys())
attribute_names.remove(t)
print('Predicting Attributes:', attribute_names)
3. Entropy Calculation:
The function entropy(probs) calculates the entropy of a set of class probabilities. Entropy measures
the uncertainty (impurity) of a set of class labels: the lower the entropy, the purer the set.
def entropy(probs):
    return sum([-prob * math.log(prob, 2) for prob in probs])
The function entropy_of_list(ls, value) calculates the entropy of a list of target values for a
particular attribute value.
def entropy_of_list(ls, value):
    from collections import Counter
    cnt = Counter(x for x in ls)
    print('Target attribute class count(Yes/No)', dict(cnt))
    total_instances = len(ls)
    print("Total no of instances/records associated with {0} is: {1}".format(value, total_instances))
    probs = [x / total_instances for x in cnt.values()]
    return entropy(probs)
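For instance, with 9 "Yes" and 5 "No" records (counts chosen purely for illustration), the entropy comes out to roughly 0.940 bits, and both helpers agree:
# Illustrative only: a 9-positive / 5-negative class split
print(entropy([9/14, 5/14]))                           # about 0.940
print(entropy_of_list(['Yes'] * 9 + ['No'] * 5, 'S'))  # same value via the list helper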
4. Information Gain:
The Information Gain (IG) is used to determine which attribute to split on at each step of the tree.
It measures the reduction in entropy when splitting the data based on a particular attribute. The
function information_gain() calculates the information gain of splitting the dataset on a specific
attribute:
def information_gain(df, split_attribute, target_attribute, battr):
    print("\n\n-----Information Gain Calculation of ", split_attribute, "--------")
    df_split = df.groupby(split_attribute)  # group records by the values of the split attribute
    glist = []
    for gname, group in df_split:
        print('Grouped Attribute Values \n', group)
        glist.append(gname)
    glist.reverse()
    nobs = len(df.index)  # total number of records in this subset
    df_agg1 = df_split.agg({target_attribute: lambda x: entropy_of_list(x, glist.pop())})
    df_agg1.columns = ['Entropy']
    df_agg2 = df_split.agg({target_attribute: lambda x: len(x) / nobs})
    df_agg2.columns = ['Proportion']
    new_entropy = sum(df_agg1['Entropy'] * df_agg2['Proportion'])  # weighted entropy after the split
    if battr != 'S':
        old_entropy = entropy_of_list(df[target_attribute], 'S-' + df.iloc[0, df.columns.get_loc(battr)])
    else:
        old_entropy = entropy_of_list(df[target_attribute], battr)
    return old_entropy - new_entropy
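As a quick usage sketch, the gain of a single attribute over the whole dataset can be inspected directly; passing 'S' as battr makes the function compute the parent entropy over the full table. The attribute name 'Outlook' here is an assumption carried over from the sample data above:
# Hypothetical root-level check; 'Outlook' is an assumed column name
gain_outlook = information_gain(df, 'Outlook', t, 'S')
print('Gain(S, Outlook) =', gain_outlook)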
5. ID3 Algorithm:
The main function, id3(), implements the ID3 algorithm for building a decision tree. This function
works recursively to split the data based on the attribute that maximizes the information gain.
def id3(df, target_attribute, attribute_names, default_class=None, default_attr='S'):
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute])  # class counts (e.g. Yes/No)
    if len(cnt) == 1:  # all records share one class
        return next(iter(cnt))
    elif df.empty or (not attribute_names):  # nothing left to split on
        return default_class
    else:
        default_class = max(cnt, key=cnt.get)  # majority class of this subset
        gainz = []
        for attr in attribute_names:
            ig = information_gain(df, attr, target_attribute, default_attr)
            gainz.append(ig)
            print("Information gain of ", attr, " is: ", ig)
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        print("\nAttribute with the maximum gain is: ", best_attr)
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset, target_attribute, remaining_attribute_names,
                          default_class, best_attr)
            tree[best_attr][attr_val] = subtree
        return tree
Steps in the id3() Function:
1. Base Case: If all the instances in the dataset belong to the same class (e.g., all "YES" or all
"NO"), return that class.
2. Base Case: If the dataset is empty or there are no attributes left to split on, return the
default class (the majority class carried down from the parent call).
3. Choose the Best Attribute: Calculate the information gain for each attribute, and choose
the attribute with the highest information gain to split on.
4. Recursive Case: Split the dataset based on the chosen attribute and recursively build
subtrees for each subset of data.
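To make the recursion concrete, the returned tree is a nested dictionary: the outer key is the chosen attribute, each inner key is one of its values, and each inner value is either a class label (a leaf) or another subtree. For the classic 14-row PlayTennis table the result conventionally looks roughly like the sketch below; your CSV may of course produce a different tree:
# Illustrative shape only (depends entirely on the data in 1d3.csv)
example_tree = {'Outlook': {'Overcast': 'Yes',
                            'Rain':  {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
                            'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}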
6. Printing the Decision Tree:
After running the id3() function, the decision tree is printed using the pprint() function for a better
visual representation:
from pprint import pprint
tree = id3(df, t, attribute_names)
print("\nThe Resultant Decision Tree is:")
pprint(tree)
Conclusion:
The code will generate a decision tree based on the ID3 algorithm. The tree will be constructed
step-by-step by evaluating which attribute (from the available ones) provides the highest
information gain, and then recursively applying this process to each subset of the data.
The result is a hierarchical decision tree structure that can be used to make predictions based on
the values of the attributes.
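The script stops at printing the tree. If you also want to use it for prediction, a small helper along the following lines (not part of the original code; the attribute names and values in the example query are assumptions) can walk the nested dictionary for a new record:
def classify(instance, tree, default=None):
    # instance is a dict mapping attribute name -> value, e.g. {'Outlook': 'Sunny', ...}
    if not isinstance(tree, dict):    # the tree is already a bare class label
        return tree
    attribute = next(iter(tree))      # attribute this node splits on
    value = instance.get(attribute)
    if value not in tree[attribute]:  # attribute value never seen during training
        return default
    return classify(instance, tree[attribute][value], default)

# Hypothetical query using the assumed PlayTennis-style attribute names
print(classify({'Outlook': 'Sunny', 'Humidity': 'High', 'Wind': 'Strong'}, tree))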