Jaypee University of Information Technology
Department of Computer Science and Engineering
Course Code: 18B1WCI674
Course Name: MACHINE LEARNING LAB
Submitted by: Submitted to:
Kshitiz Tayal Mr. Praveen Modi
211173
Batch: CS - 63
S.no Experiments Date Remarks
EXPERIMENT-1
AIM:
Data Preprocessing, Data cleaning and Dimensionality Reduction
using PCA on Titanic dataset.
CODE:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
# Load the Titanic dataset
url = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/"
    "titanic.csv"
)
titanic_df = pd.read_csv(url)
print(titanic_df.head().to_string(index=False))
# DataFrame.info() prints its report itself and returns None, so this also
# prints a trailing "None" (kept so the recorded output still matches).
print(titanic_df.info())

# Check for missing values
print(titanic_df.isnull().sum().to_string())

# Data Preprocessing and Cleaning
# Drop the free-text columns that are not used as numeric features.
titanic_df = titanic_df.drop(columns=['Cabin', 'Ticket', 'Name'])

# Fill missing values in 'Age' with the column mean.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# mutates a temporary under pandas Copy-on-Write and leaves the frame unchanged.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())

# Fill missing values in 'Embarked' with mode
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(
    titanic_df['Embarked'].mode()[0]
)

# Encode categorical variables: Sex and Embarked
# The same encoder object is re-fitted for each column, so after these two
# lines it only remembers the 'Embarked' classes.
label_encoder = LabelEncoder()
titanic_df['Sex'] = label_encoder.fit_transform(titanic_df['Sex'])
titanic_df['Embarked'] = label_encoder.fit_transform(titanic_df['Embarked'])

# Scale features: Age, Fare
scaler = StandardScaler()
titanic_df[['Age', 'Fare']] = scaler.fit_transform(titanic_df[['Age', 'Fare']])
print(titanic_df.var().to_string())

# PCA
# Separate features (X) and target (Survived)
# NOTE(review): PassengerId stays in X unscaled. Its variance (66231 in the
# recorded output) dwarfs every other column, which is why the first principal
# component explains ~99.99% of the variance. Consider dropping or scaling it.
X = titanic_df.drop(columns=['Survived'])
y = titanic_df['Survived']

# Apply PCA with 2 components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Check explained variance ratio
print(pca.explained_variance_ratio_)

# Dimension before PCA
print("Dimension before PCA:", X.shape)

# Dimension after PCA
print("Dimension after PCA:", X_pca.shape)

# Scatter plot of the reduced dataset after PCA, coloured by survival
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.5)
plt.title('Scatter plot of Titanic dataset after PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Survived')
plt.grid(True)
plt.show()
OUTPUT:
PassengerId Survived Pclass
Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 0 3 Braund, Mr. Owen
Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs
Thayer) female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss.
Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May
Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William
Henry male 35.0 0 0 373450 8.0500 NaN S
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
PassengerId 66231.000000
Survived 0.236772
Pclass 0.699015
Sex 0.228475
Age 1.001124
SibSp 1.216043
Parch 0.649728
Fare 1.001124
Embarked 0.626477
[9.99918243e-01 2.43786002e-05]
Dimension before PCA: (891, 8)
Dimension after PCA: (891, 2)
EXPERIMENT-2
AIM:
Implementation of a Decision Tree using the ID3 Algorithm.
CODE:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the Titanic dataset
# The URL is written as two adjacent string literals that Python joins into
# one, which keeps the line short without changing the address.
url = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/"
    "titanic.csv"
)
titanic_df = pd.read_csv(url)
# Calculate the entropy of the whole dataset
def calc_total_entropy(train_data, label, class_list):
    """Return the base-2 entropy of the ``label`` column of ``train_data``.

    Args:
        train_data: DataFrame holding the label column.
        label: Name of the label column.
        class_list: Iterable of class values to account for.

    Returns:
        The sum over ``class_list`` of ``-p * log2(p)``, where ``p`` is the
        fraction of rows whose label equals that class. Returns ``0`` when no
        listed class occurs (including an empty frame).
    """
    total_row = train_data.shape[0]
    total_entr = 0
    for c in class_list:  # for each class in the label
        total_class_count = train_data[train_data[label] == c].shape[0]
        if total_class_count == 0:
            # A class absent from this subset contributes nothing. Skipping it
            # avoids 0 * log2(0) = NaN, and a division by zero on an empty
            # frame, both of which would poison every information-gain
            # comparison built on this value.
            continue
        class_probability = total_class_count / total_row
        total_entr += -class_probability * np.log2(class_probability)
    return total_entr
# Calculate the entropy of a filtered dataset
def calc_entropy(feature_value_data, label, class_list):
    """Return the base-2 entropy of ``label`` within ``feature_value_data``.

    Args:
        feature_value_data: DataFrame of the rows sharing one feature value.
        label: Name of the label column.
        class_list: Iterable of class values to account for.

    Returns:
        The accumulated ``-p * log2(p)`` over the classes that occur in the
        frame; ``0`` when none of them occur.
    """
    n_rows = feature_value_data.shape[0]
    entropy = 0
    for cls in class_list:
        n_cls = feature_value_data[feature_value_data[label] == cls].shape[0]
        if n_cls == 0:
            # Classes missing from this subset add nothing to the entropy.
            continue
        p_cls = n_cls / n_rows
        entropy += -p_cls * np.log2(p_cls)
    return entropy
# Calculate information gain for a feature
def calc_info_gain(feature_name, train_data, label, class_list):
    """Return the information gain from splitting on ``feature_name``.

    The gain is the entropy of the whole frame minus the size-weighted entropy
    of the subsets produced by each distinct value of the feature.

    Args:
        feature_name: Column to evaluate as a split.
        train_data: DataFrame holding the feature and label columns.
        label: Name of the label column.
        class_list: Iterable of class values.

    Returns:
        ``calc_total_entropy(train_data) - weighted subset entropy``.
    """
    n_rows = train_data.shape[0]
    weighted_entropy = 0.0
    for value in train_data[feature_name].unique():
        # Rows are selected with ``==``, so a NaN feature value matches no
        # rows and its (empty) subset carries zero weight.
        subset = train_data[train_data[feature_name] == value]
        weight = subset.shape[0] / n_rows
        weighted_entropy += weight * calc_entropy(subset, label, class_list)
    dataset_entropy = calc_total_entropy(train_data, label, class_list)
    return dataset_entropy - weighted_entropy
# Find the most informative feature
def find_most_informative_feature(train_data, label, class_list):
    """Return the name of the feature column with the highest information gain.

    Every column except ``label`` is a candidate. On ties the earliest column
    wins, because a later candidate must be strictly better to replace it.

    Args:
        train_data: DataFrame holding the feature and label columns.
        label: Name of the label column.
        class_list: Iterable of class values.

    Returns:
        The winning column name, or ``None`` when no candidate has a gain
        greater than -1 (for example when there are no feature columns).
    """
    best_feature = None
    best_gain = -1
    for candidate in train_data.columns.drop(label):
        gain = calc_info_gain(candidate, train_data, label, class_list)
        if gain > best_gain:
            best_gain, best_feature = gain, candidate
    return best_feature
# Generate a subtree
def generate_sub_tree(feature_name, train_data, label, class_list):
    """Build one level of the tree for ``feature_name``.

    Each distinct value of the feature becomes a key of the returned dict.
    A value whose rows all share one class maps to that class and its rows are
    removed from the data; any other value maps to the placeholder ``"?"``.

    Args:
        feature_name: Column being split on.
        train_data: DataFrame holding the feature and label columns.
        label: Name of the label column.
        class_list: Iterable of class values.

    Returns:
        A ``(tree, train_data)`` tuple: the value-to-class-or-``"?"`` dict and
        the frame with the rows of all pure values filtered out.
    """
    tree = {}
    occurrences = train_data[feature_name].value_counts(sort=False)
    for feature_value, count in occurrences.items():
        subset = train_data[train_data[feature_name] == feature_value]
        # Classes that account for every row carrying this feature value.
        pure_classes = [
            c for c in class_list
            if subset[subset[label] == c].shape[0] == count
        ]
        if pure_classes:
            # The last matching class wins, as with an unbroken scan.
            tree[feature_value] = pure_classes[-1]
            train_data = train_data[train_data[feature_name] != feature_value]
        else:
            tree[feature_value] = "?"
    return tree, train_data
# Recursively create the decision tree
def make_tree(root, prev_feature_value, train_data, label, class_list):
    """Grow the tree in place below ``root`` from ``train_data``.

    The most informative feature is chosen, one level is built with
    ``generate_sub_tree``, and every branch still marked ``"?"`` is expanded
    recursively on the rows carrying that feature value.

    Args:
        root: Dict to attach the new subtree to; mutated in place.
        prev_feature_value: Key in ``root`` that this subtree hangs under, or
            ``None`` when building the top level.
        train_data: DataFrame of the rows that reach this node.
        label: Name of the label column.
        class_list: Iterable of class values.

    Returns:
        None. The result is written into ``root``.
    """
    total_rows = train_data.shape[0]
    if total_rows == 0:
        return

    max_info_feature = find_most_informative_feature(train_data, label,
                                                     class_list)
    if max_info_feature is None:
        # No feature column is usable as a split; settle for the most common
        # class where there is a parent branch to attach it to.
        if prev_feature_value is not None:
            root[prev_feature_value] = train_data[label].value_counts().idxmax()
        return

    tree, remaining = generate_sub_tree(max_info_feature, train_data, label,
                                        class_list)
    if prev_feature_value is not None:
        root[prev_feature_value] = {max_info_feature: tree}
    else:
        root[max_info_feature] = tree
    next_root = tree

    for node, branch in list(next_root.items()):
        if branch == "?":
            feature_value_data = remaining[remaining[max_info_feature] == node]
            if feature_value_data.shape[0] == total_rows:
                # The split kept every row, so the best available feature has
                # a single value here and no other feature scored higher.
                # Recursing would repeat this exact call forever; fall back to
                # a majority-class leaf instead.
                next_root[node] = (
                    feature_value_data[label].value_counts().idxmax()
                )
            else:
                make_tree(next_root, node, feature_value_data, label,
                          class_list)
# ID3 Algorithm
def id3(train_data_m, label):
    """Build and return an ID3 decision tree as nested dicts.

    Args:
        train_data_m: DataFrame holding the feature and label columns. It is
            copied first, so the tree is built from the copy.
        label: Name of the label column.

    Returns:
        The dict populated by ``make_tree``.
    """
    working_copy = train_data_m.copy()
    class_list = working_copy[label].unique()
    decision_tree = {}
    make_tree(decision_tree, None, working_copy, label, class_list)
    return decision_tree
# Build the decision tree
# NOTE: the hand-written id3() above is not invoked below; the tree that is
# fitted, exported and scored is scikit-learn's DecisionTreeClassifier.
# These names are used below but are not imported at the top of this
# experiment, so they are brought into scope here.
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz

# Build the target and a numeric feature table from the loaded frame.
# X_encoded and y were previously undefined in this experiment.
y = titanic_df['Survived']
X_encoded = titanic_df.drop(
    columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin']
)
X_encoded['Sex'] = X_encoded['Sex'].map({'male': 0, 'female': 1})
# dtype=int keeps the indicator columns numeric on every pandas version.
X_encoded = pd.get_dummies(X_encoded, columns=['Embarked'], dtype=int)

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X_encoded),
                         columns=X_encoded.columns)

# Now fit the decision tree classifier on the full data for visualisation
tree_clf = DecisionTreeClassifier(max_depth=3)  # Adjust max_depth as needed
tree_clf.fit(X_imputed, y)

# Export and visualize the decision tree
export_graphviz(
    tree_clf,
    out_file="titanic_tree.dot",
    feature_names=X_imputed.columns,
    class_names=['Not Survived', 'Survived'],
    rounded=True,
    filled=True
)
with open("titanic_tree.dot") as f:
    dot_graph = f.read()
# In a notebook the Source object renders as the cell result; in a plain
# script this expression has no visible effect.
graphviz.Source(dot_graph)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42
)

# Fit the decision tree classifier on the training data
tree_clf = DecisionTreeClassifier(max_depth=3)  # Adjust max_depth as needed
tree_clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = tree_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
OUTPUT:
Accuracy: 0.7988826815642458
EXPERIMENT-3
AIM:
Implementation of the Random Forest Algorithm (an ensemble of decision trees).
CODE:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Load the Titanic dataset
url = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/"
    "titanic.csv"
)
titanic_df = pd.read_csv(url)

# Data preprocessing
titanic_df.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1,
                inplace=True)
# Assign the filled columns back rather than calling fillna(inplace=True) on
# a selected column: that chained in-place form mutates a temporary under
# pandas Copy-on-Write and leaves the frame unchanged.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].median())
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(
    titanic_df['Embarked'].mode()[0]
)
titanic_df['Sex'] = titanic_df['Sex'].map({'male': 0, 'female': 1})
# dtype=int keeps the indicator columns numeric. Newer pandas releases default
# to bool columns, which would turn the mixed frame's .values into an
# object-dtype array.
titanic_df = pd.get_dummies(titanic_df, columns=['Embarked'], dtype=int)

# Split data into features and target variable
X = titanic_df.drop('Survived', axis=1).values
y = titanic_df['Survived'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
class DecisionTree:
    """Binary decision tree grown by entropy-based information gain.

    Each internal node considers a random subset of ``int(sqrt(n_features))``
    feature indices and splits on ``feature <= threshold``. The fitted tree is
    stored in ``self.tree`` as nested dicts: leaves are ``{'prediction': c}``
    and internal nodes carry 'feature', 'threshold', 'left' and 'right'.
    """

    def __init__(self, max_depth=None):
        # With max_depth=None the depth test in _grow_tree never fires.
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from a 2-D array ``X`` and a label array ``y``."""
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Return the subtree for the samples in ``X``/``y`` at ``depth``."""
        num_samples, num_features = X.shape
        is_pure = len(np.unique(y)) == 1

        # Stopping criteria: depth limit reached, one class left, or fewer
        # than two samples.
        if depth == self.max_depth or is_pure or num_samples < 2:
            return {'prediction': np.argmax(np.bincount(y))}

        # Select random subset of features
        candidate_features = random.sample(range(num_features),
                                           int(np.sqrt(num_features)))
        best_feature, best_threshold = self._best_criteria(
            X, y, candidate_features)

        # Handle case where no suitable split is found
        if best_feature is None or best_threshold is None:
            return {'prediction': np.argmax(np.bincount(y))}

        # Split data
        column = X[:, best_feature]
        left_rows = np.flatnonzero(column <= best_threshold)
        right_rows = np.flatnonzero(column > best_threshold)

        # Dict values are evaluated in order, so the left subtree is grown
        # before the right one.
        return {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': self._grow_tree(X[left_rows], y[left_rows], depth + 1),
            'right': self._grow_tree(X[right_rows], y[right_rows], depth + 1),
        }

    def _best_criteria(self, X, y, feature_indices):
        """Return the ``(feature, threshold)`` pair with the highest gain.

        Every distinct value of each candidate feature is tried as a
        threshold; splits that leave one side empty are ignored. Returns
        ``(None, None)`` when no valid split exists.
        """
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature_index in feature_indices:
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_rows = np.flatnonzero(column <= threshold)
                right_rows = np.flatnonzero(column > threshold)
                if not (len(left_rows) and len(right_rows)):
                    continue
                gain = self._information_gain(y, y[left_rows], y[right_rows])
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_index
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, parent, left_child, right_child):
        """Return parent entropy minus the size-weighted child entropy."""
        left_weight = len(left_child) / len(parent)
        children_entropy = (left_weight * self._entropy(left_child)
                            + (1 - left_weight) * self._entropy(right_child))
        return self._entropy(parent) - children_entropy

    def _entropy(self, y):
        """Return the base-2 entropy of the labels in ``y``."""
        counts = np.unique(y, return_counts=True)[1]
        probabilities = counts / len(y)
        # 1e-10 is added inside the log before it is taken.
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._predict_tree(row, self.tree) for row in X])

    def _predict_tree(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return its label."""
        node = tree
        while 'prediction' not in node:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['prediction']
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` models combined by majority vote.

    Args:
        n_estimators: Number of trees to grow in ``fit``.
        max_depth: Depth limit passed to every tree.
    """

    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_estimators`` trees, each on a bootstrap sample of the data.

        Args:
            X: 2-D NumPy array of features.
            y: 1-D NumPy array of labels aligned with ``X``.
        """
        # Start from an empty forest so that calling fit() again replaces the
        # previous trees instead of piling new ones on top of them.
        self.trees = []
        n_rows = len(X)
        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth)
            # Bootstrap: draw n_rows row indices with replacement.
            indices = np.random.choice(n_rows, n_rows, replace=True)
            tree.fit(X[indices], y[indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Labels must be non-negative integers. Ties go to the smallest label.

        Raises:
            ValueError: If the forest holds no trees (``fit`` not called).
        """
        if not self.trees:
            raise ValueError("RandomForest.predict called before fit")
        # Shape (n_trees, n_samples): one row of votes per tree.
        votes = np.asarray([tree.predict(X) for tree in self.trees]).astype(int)
        if votes.shape[1] == 0:
            return np.array([], dtype=int)
        # Count the votes per sample and keep the most frequent label.
        # Truncating the mean vote with astype(int) would only yield 1 when
        # every single tree voted 1.
        return np.apply_along_axis(
            lambda sample_votes: np.bincount(sample_votes).argmax(), 0, votes)
# Instantiate and train the Random Forest model
rf = RandomForest(max_depth=5, n_estimators=100)
rf.fit(X_train, y_train)

# Make predictions and evaluate the model on the held-out split
predictions = rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Encode categorical variables
# Rebuild a labelled frame from the raw feature array, then label-encode
# whichever columns are still of object dtype.
label_encoder = LabelEncoder()
feature_columns = titanic_df.drop('Survived', axis=1).columns
X_encoded = pd.DataFrame(X, columns=feature_columns)
for col_idx in range(len(X_encoded.columns)):
    column = X_encoded.iloc[:, col_idx]
    if column.dtype == 'object':
        X_encoded.iloc[:, col_idx] = label_encoder.fit_transform(column)

# Now fit the decision tree classifier
tree_clf = DecisionTreeClassifier(max_depth=3)  # Adjust max_depth as needed
tree_clf.fit(X_encoded, y)

# Export and visualize the decision tree
export_graphviz(
    tree_clf,
    out_file="titanic_tree.dot",
    feature_names=X_encoded.columns,
    class_names=['Not Survived', 'Survived'],
    rounded=True,
    filled=True,
)

# Read the DOT file and visualize the decision tree
with open("titanic_tree.dot") as f:
    dot_graph = f.read()
graph = graphviz.Source(dot_graph)
# Save tree as PNG
graph.render("titanic_decision_tree", format='png', cleanup=True)
OUTPUT:
Accuracy: 0.5921787709497207
titanic_decision_tree.png