I.K.G Punjab Technical University, Kapurthala
Department of Computer Science Engineering
MACHINE LEARNING LAB
BTCS 619-18
Bachelor of Technology
Submitted To: Ms. Kavita Bains
Submitted By: Sahil Sharma
Roll No: 2224542
Group: D
INDEX
Sr No. Task Signature
1 Implement data pre-processing
2 Simple Linear Regression
3 Simulate Multiple Linear Regression
4 Implement Decision Tree
5 Random forest classification
6 Naive Bayes algorithm
7 Implement K-Nearest Neighbors (K-NN)-Mean
8 Support Vector Machine
Task 1: Implement data pre-processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Load data
data = pd.read_csv("social_media_engagement.csv")
data.info()
data.isnull().sum()
# Drop unnecessary columns
data = data.drop(columns=['post_id', 'post_time', 'caption', 'hashtags'])
data
# One-hot encode categorical columns (converts them to numerical indicators)
categorical_columns = ['platform', 'post_type', 'post_day', 'sentiment_score']
data = pd.get_dummies(data, columns=categorical_columns)
# Feature scaling
scaler = StandardScaler()
numerical_features = ['comments', 'shares', 'caption_length', 'num_hashtags', 'post_hour']
data[numerical_features] = scaler.fit_transform(data[numerical_features])
data
# Split data into train and test
target_column = 'likes'
X = data.drop(target_column, axis=1)
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
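Note: SimpleImputer is imported above but never applied. A minimal sketch of the imputation step it could perform just before the StandardScaler, in case the isnull() check reports missing values (the mean strategy is an assumption):

imputer = SimpleImputer(strategy='mean')  # replace NaNs with the column mean
data[numerical_features] = imputer.fit_transform(data[numerical_features])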
Output:
Task 2: Simple Linear Regression
import numpy as np
import matplotlib.pyplot as plt
# Define dataset (Hours studied -> Exam Score)
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # Independent variable
y = np.array([50, 55, 65, 70, 75, 78, 85, 90, 95, 98]) # Dependent variable
# Number of data points
n = len(x)
# Calculate the slope (m)
m = (n * np.sum(x * y) - np.sum(x) * np.sum(y)) / (n * np.sum(x**2) - (np.sum(x))**2)
# Calculate the intercept (b)
b = (np.sum(y) - m * np.sum(x)) / n
# Print the regression equation
print(f"Linear Regression Equation: y = {m:.2f}x + {b:.2f}")
# Make predictions
predictions = m * x + b
print(f"Predicted Exam Scores: {predictions}")
# Plot the original data points
plt.scatter(x, y, color='red', label='Actual Data')
# Plot the regression line
plt.plot(x, predictions, color='blue', label='Regression Line')
# Labels and title
plt.xlabel("Hours Studied")
plt.ylabel("Exam Score")
plt.title("Simple Linear Regression: Hours Studied vs Exam Score")
plt.legend()
# Show the plot
plt.show()
new_x = 7.5
predicted_score = m * new_x + b
print(f"Predicted Exam Score for 7.5 hours of study: {predicted_score:.2f}")
Output:
Task 3: Simulate Multiple Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Step 1: Generate random data
np.random.seed(42) # For reproducibility
n_samples = 100
# Generate independent variables (X1, X2)
X1 = np.random.rand(n_samples) * 10 # X1 between 0 and 10
X2 = np.random.rand(n_samples) * 20 # X2 between 0 and 20
# Step 2: Define true coefficients for the model
beta_0 = 5 # Intercept
beta_1 = 3 # Coefficient for X1
beta_2 = -2 # Coefficient for X2
# Step 3: Generate random error term (epsilon)
epsilon = np.random.randn(n_samples) * 5 # Random noise
# Step 4: Calculate dependent variable Y using the formula
Y = beta_0 + beta_1 * X1 + beta_2 * X2 + epsilon
# Step 5: Create a DataFrame to organize the data
data = pd.DataFrame({'X1': X1, 'X2': X2, 'Y': Y})
# Step 6: Split data into training and testing sets
X = data[['X1', 'X2']]
y = data['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 7: Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)
# Step 8: Make Predictions
y_pred = model.predict(X_test)
# Step 9: Evaluate Model Performance
r2_score = model.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
# Print model coefficients and performance metrics
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("R² Score:", r2_score)
print("RMSE:", rmse)
print("MAE:", mae)
# Visualize the relationships
plt.figure(figsize=(12, 5))
# Scatter plot for X1 vs Y
plt.subplot(1, 2, 1)
plt.scatter(X_train['X1'], y_train, label="Train Data", alpha=0.7)
plt.scatter(X_test['X1'], y_test, label="Test Data", alpha=0.7, color='red')
plt.xlabel("X1")
plt.ylabel("Y")
plt.title("X1 vs Y")
plt.legend()
# Scatter plot for X2 vs Y
plt.subplot(1, 2, 2)
plt.scatter(X_train['X2'], y_train, label="Train Data", alpha=0.7)
plt.scatter(X_test['X2'], y_test, label="Test Data", alpha=0.7, color='red')
plt.xlabel("X2")
plt.ylabel("Y")
plt.title("X2 vs Y")
plt.legend()
plt.tight_layout()
plt.show()
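Because the data are simulated, the fitted parameters can also be checked against the true coefficients used to generate Y (a quick sketch):

# Compare estimated parameters with the true generating values (beta_0=5, beta_1=3, beta_2=-2)
true_params = {'Intercept': beta_0, 'X1': beta_1, 'X2': beta_2}
est_params = {'Intercept': model.intercept_, 'X1': model.coef_[0], 'X2': model.coef_[1]}
for name in true_params:
    print(f"{name}: true = {true_params[name]:.2f}, estimated = {est_params[name]:.2f}")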
Output:
Task 4: Implement Decision Tree
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Calculate Gini Index
def gini_index(y):
    _, counts = np.unique(y, return_counts=True)
    prob = counts / len(y)
    return 1 - np.sum(prob**2)

# Calculate the impurity reduction (gain) of a split, using the Gini index
def information_gain(y, left_y, right_y):
    impurity_before = gini_index(y)
    left_weight = len(left_y) / len(y)
    right_weight = len(right_y) / len(y)
    impurity_after = left_weight * gini_index(left_y) + right_weight * gini_index(right_y)
    return impurity_before - impurity_after

# Find the best split (feature, threshold) by maximizing the gain
def best_split(X, y):
    best_ig = 0
    best_split_point = None
    best_left_indices = None
    best_right_indices = None
    for feature_index in range(X.shape[1]):
        feature_values = np.unique(X[:, feature_index])
        for value in feature_values:
            left_mask = X[:, feature_index] <= value
            right_mask = ~left_mask
            left_y = y[left_mask]
            right_y = y[right_mask]
            if len(left_y) == 0 or len(right_y) == 0:
                continue
            ig = information_gain(y, left_y, right_y)
            if ig > best_ig:
                best_ig = ig
                best_split_point = (feature_index, value)
                best_left_indices = left_mask
                best_right_indices = right_mask
    return best_split_point, best_left_indices, best_right_indices

# Build the decision tree recursively
def build_tree(X, y, depth=0, max_depth=5):
    if len(np.unique(y)) == 1 or depth == max_depth:
        return np.bincount(y).argmax()
    best_split_point, left_mask, right_mask = best_split(X, y)
    if best_split_point is None:
        return np.bincount(y).argmax()
    feature_index, value = best_split_point
    left_tree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_tree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth)
    return (feature_index, value, left_tree, right_tree)

# Prediction function for a single sample
def predict(tree, x):
    if isinstance(tree, (int, np.integer)):  # Leaf node holding a class label
        return tree
    feature_index, value, left_tree, right_tree = tree
    if x[feature_index] <= value:
        return predict(left_tree, x)
    else:
        return predict(right_tree, x)
# Testing the decision tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train decision tree
tree = build_tree(X_train, y_train, max_depth=3)
# Predict on test set
y_pred = np.array([predict(tree, x) for x in X_test])
# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Decision Tree Accuracy: {accuracy:.2f}")
# Print some calculated values
print("Sample Predictions:")
for i in range(5):
    print(f"Actual: {y_test[i]}, Predicted: {y_pred[i]}")
# Plot accuracy comparison
plt.figure(figsize=(6, 4))
sns.heatmap([[accuracy, 1 - accuracy]], annot=True, cmap="coolwarm",
            xticklabels=["Correct", "Incorrect"], yticklabels=["Accuracy"], cbar=False)
plt.title("Decision Tree Accuracy Visualization")
plt.show()
# Plot feature importance (dummy values for visualization)
feature_importance = np.random.rand(X.shape[1])
plt.figure(figsize=(8, 5))
plt.bar(iris.feature_names, feature_importance, color='teal')
plt.xlabel("Features")
plt.ylabel("Importance Score")
plt.title("Feature Importance Visualization")
plt.show()
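For reference, the hand-built tree can be cross-checked against scikit-learn's DecisionTreeClassifier, which also provides real (rather than dummy) feature importances; a sketch using the same train/test split and max_depth:

from sklearn.tree import DecisionTreeClassifier

sk_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sk_tree.fit(X_train, y_train)
print(f"sklearn Decision Tree Accuracy: {sk_tree.score(X_test, y_test):.2f}")
print("sklearn feature importances:", sk_tree.feature_importances_)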
Output:
Task 5: Random forest classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
# Load dataset (Iris dataset)
data = load_iris()
X = data.data
y = data.target
# Split data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Hyperparameter tuning using GridSearchCV
param_grid = {
'n_estimators': [50, 100, 150],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Best model
best_clf = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
# Explicit refit of the best model (GridSearchCV has already refit it on the training data, so this is redundant but harmless)
best_clf.fit(X_train, y_train)
# Make predictions
y_pred = best_clf.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)
# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Feature importance analysis
feature_importance = best_clf.feature_importances_
for i, importance in enumerate(feature_importance):
    print(f"Feature {data.feature_names[i]} Importance: {importance:.4f}")
Output:
Task 6: Naive Bayes algorithm
import numpy as np
# Dataset
data = [
{'Has Attachment': 'Yes', 'Contains Links': 'Yes', 'Label': 'Spam'},
{'Has Attachment': 'Yes', 'Contains Links': 'No', 'Label': 'Not Spam'},
{'Has Attachment': 'No', 'Contains Links': 'Yes', 'Label': 'Spam'},
{'Has Attachment': 'No', 'Contains Links': 'No', 'Label': 'Not Spam'}
]
# Estimate class priors and per-feature conditional probabilities (with Laplace smoothing)
def preprocess_data(data, alpha=1):  # alpha is the Laplace smoothing factor
    features = [key for key in data[0] if key != 'Label']
    labels = list(set(item['Label'] for item in data))
    label_probs = {label: 0 for label in labels}
    feature_probs = {label: {feature: {} for feature in features} for label in labels}
    for label in labels:
        label_data = [item for item in data if item['Label'] == label]
        label_probs[label] = len(label_data) / len(data)
        for feature in features:
            feature_vals = [item[feature] for item in label_data]
            unique_vals = set(item[feature] for item in data)  # All possible values of this feature
            for val in unique_vals:
                # Apply Laplace smoothing: (count + alpha) / (total + alpha * num_categories)
                feature_probs[label][feature][val] = (feature_vals.count(val) + alpha) / (len(feature_vals) + alpha * len(unique_vals))
    return label_probs, feature_probs
# Naïve Bayes Prediction
def predict(data_point, label_probs, feature_probs):
    features = list(data_point.keys())
    scores = {}
    for label, label_prob in label_probs.items():
        score = np.log(label_prob)  # Use log probabilities for numerical stability
        for feature in features:
            feature_value = data_point[feature]
            if feature_value in feature_probs[label][feature]:
                score += np.log(feature_probs[label][feature][feature_value])
            else:
                score += np.log(1e-6)  # Handle unseen feature values
        scores[label] = score
    return max(scores, key=scores.get)
# Preprocess data
label_probs, feature_probs = preprocess_data(data)
# Test cases
test_emails = [
{'Has Attachment': 'Yes', 'Contains Links': 'No'},
{'Has Attachment': 'No', 'Contains Links': 'Yes'},
{'Has Attachment': 'Yes', 'Contains Links': 'Yes'}
]
# Predict labels for test cases
for email in test_emails:
    predicted_label = predict(email, label_probs, feature_probs)
    print(f"Email {email} -> Predicted Label: {predicted_label}")
Output:
Task 7: Implement K-Nearest Neighbors (K-NN)-Mean
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the example "tips" dataset from seaborn
df = sns.load_dataset("tips")
df.head()
# Split features and target
X = df['sex']  # Categorical text column used as input (independent variable)
y = df['day']  # Day of the week (target variable)
# Convert Text to Numerical Features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Keep at most the 5000 most frequent terms
X_tfidf = vectorizer.fit_transform(X)
# Split Data into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)
# Train SVM Model
model = SVC(kernel='linear') # Linear kernel works well for text data
model.fit(X_train, y_train)
# Make Predictions
y_pred = model.predict(X_test)
# Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')
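The task heading names K-Nearest Neighbors, while the listing above trains an SVC. For comparison, a minimal K-NN classifier on the same TF-IDF features could look like this (a sketch; n_neighbors=5 is an assumption):

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print(f'K-NN Accuracy: {accuracy_score(y_test, knn_pred)*100:.2f}%')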
Output:
Task 8: Support Vector Machine
import numpy as np
import matplotlib.pyplot as plt
def k_means(X, k, max_iters=100):
    # Randomly initialize centroids by selecting k points from the dataset
    centroids = X[np.random.choice(X.shape[0], k, replace=False)]
    for _ in range(max_iters):
        # Step 1: Assign each point to the closest centroid
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)
        # Step 2: Recompute centroids as the mean of the points in each cluster
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
        # Stop once the centroids no longer move
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return centroids, labels
# Generate random 2D data points
np.random.seed(42)
X = np.vstack((
np.random.randn(50, 2) + np.array([2, 2]),
np.random.randn(50, 2) + np.array([-2, -2]),
np.random.randn(50, 2) + np.array([2, -2])
))
# Number of clusters
k = 3
# Apply K-Means
centroids, labels = k_means(X, k)
# Plot the results
plt.figure(figsize=(8, 6))
for i in range(k):
    plt.scatter(X[labels == i, 0], X[labels == i, 1], label=f'Cluster {i+1}')
plt.scatter(centroids[:, 0], centroids[:, 1], c='black', marker='X', s=200, label='Centroids')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("K-Means Clustering")
plt.legend()
plt.show()
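The hand-written clustering can be compared with scikit-learn's KMeans on the same data (a sketch; the n_init and random_state values are assumptions):

from sklearn.cluster import KMeans

sk_km = KMeans(n_clusters=k, n_init=10, random_state=42)
sk_labels = sk_km.fit_predict(X)
print("sklearn centroids:\n", sk_km.cluster_centers_)
print("Inertia (sum of squared distances to closest centroids):", sk_km.inertia_)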
Output: