AIM:
ALGORITHM:
PROGRAM:
NB_from_scratch.py
import csv
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
from itertools import cycle
from scipy import interp
import warnings
import random
import math
warnings.filterwarnings("ignore")
# Example of Naive Bayes implemented from Scratch in Python
for z in range(5):
    print("\n\n\nTest Train Split no. ", z + 1, "\n\n\n")
    trainsize = int(len(dataset) * 0.75)
    trainset = []
    testset = list(dataset)
    for i in range(trainsize):
        index = random.randrange(len(testset))
        trainset.append(testset.pop(index))
    y_pred.append(resultant_class)
    # Getting Accuracy
    count = 0
    for i in range(len(testset)):
        if testset[i][-1] == y_pred[i]:
            count += 1
    accuracy = (count / float(len(testset))) * 100.0
    print("\n\n Accuracy: ", accuracy, "%")
    print("\n\n\n\nF1 Score")
    f_score = f1_score(y1, y_pred1, average='weighted')
    print(f_score)
    for i in range(len(y_pred1)):
        y3[i][int(y_pred1[i])] = 1
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)
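The listing above elides the Gaussian class-conditional statistics and the per-row prediction that produce resultant_class. A minimal sketch of that core, assuming numeric features with the class label in the last column (the helper names fit_naive_bayes, gaussian_probability and predict are illustrative, not from the original listing):

def fit_naive_bayes(rows):
    # group numeric rows by their class label (last column) and keep, per class,
    # the prior probability and a (mean, stdev) pair for every feature
    separated = {}
    for row in rows:
        separated.setdefault(row[-1], []).append([float(v) for v in row[:-1]])
    summaries, priors = {}, {}
    for label, class_rows in separated.items():
        priors[label] = len(class_rows) / float(len(rows))
        stats = []
        for column in zip(*class_rows):
            mean = sum(column) / len(column)
            var = sum((x - mean) ** 2 for x in column) / max(len(column) - 1, 1)
            stats.append((mean, math.sqrt(var)))
        summaries[label] = stats
    return summaries, priors

def gaussian_probability(x, mean, stdev):
    # Gaussian density of x under N(mean, stdev^2); guard against zero spread
    stdev = stdev or 1e-9
    return math.exp(-((x - mean) ** 2) / (2 * stdev ** 2)) / (math.sqrt(2 * math.pi) * stdev)

def predict(summaries, priors, features):
    # resultant class = argmax over labels of prior * product of feature likelihoods
    best_label, best_score = None, -1.0
    for label, stats in summaries.items():
        score = priors[label]
        for (mean, stdev), value in zip(stats, features):
            score *= gaussian_probability(float(value), mean, stdev)
        if best_label is None or score > best_score:
            best_label, best_score = label, score
    return best_label

With these helpers, the elided portion between the split and the accuracy computation reduces to:

    summaries, priors = fit_naive_bayes(trainset)
    y_pred = []
    for row in testset:
        resultant_class = predict(summaries, priors, row[:-1])
        y_pred.append(resultant_class)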
NB_from_Gaussian_Sklearn.py
import csv
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
from itertools import cycle
from scipy import interp
for z in range(5):
    print("\n\n\nTest Train Split no. ", z + 1, "\n\n\n")
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=None)
    # Gaussian function of sklearn
    gnb = GaussianNB()
    gnb.fit(x_train, y_train.ravel())
    y_pred = gnb.predict(x_test)
print("\n\n\n\nConfusion Matrix")
cf_matrix = confusion_matrix(y1, y_pred1)
print(cf_matrix)
print("\n\n\n\nF1 Score")
f_score = f1_score(y1, y_pred1, average='weighted')
print(f_score)
for i in range(len(y_pred1)):
y3[i][int(y_pred1[i])] = 1
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y2[:, i], y3[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
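    # (assumed step, not in the original listing) build the macro-average inputs:
    # pool every class's false-positive-rate grid and average the interpolated TPRs
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes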
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
AIM:
ALGORITHM:
PROGRAM:
import bayespy as bp
import numpy as np
import csv
from colorama import init
from colorama import Fore, Back, Style
init()
ageEnum = {'SuperSeniorCitizen':0, 'SeniorCitizen':1, 'MiddleAged':2, 'Youth':3, 'Teen':4}
genderEnum = {'Male':0, 'Female':1}
familyHistoryEnum = {'Yes':0, 'No':1}
dietEnum = {'High':0, 'Medium':1, 'Low':2}
lifeStyleEnum = {'Athlete':0, 'Active':1, 'Moderate':2, 'Sedetary':3}
cholesterolEnum = {'High':0, 'BorderLine':1, 'Normal':2}
heartDiseaseEnum = {'Yes':0, 'No':1}
with open('heart_disease_data.csv') as csvfile:
    lines = csv.reader(csvfile)
    dataset = list(lines)
data = []
for x in dataset:
    data.append([ageEnum[x[0]], genderEnum[x[1]], familyHistoryEnum[x[2]], dietEnum[x[3]],
                 lifeStyleEnum[x[4]], cholesterolEnum[x[5]], heartDiseaseEnum[x[6]]])
data = np.array(data)
N = len(data)
p_age = bp.nodes.Dirichlet(1.0*np.ones(5))
age = bp.nodes.Categorical(p_age, plates=(N,))
age.observe(data[:,0])
p_gender = bp.nodes.Dirichlet(1.0*np.ones(2))
gender = bp.nodes.Categorical(p_gender, plates=(N,))
gender.observe(data[:,1])
p_familyhistory = bp.nodes.Dirichlet(1.0*np.ones(2))
familyhistory = bp.nodes.Categorical(p_familyhistory, plates=(N,))
familyhistory.observe(data[:,2])
p_diet = bp.nodes.Dirichlet(1.0*np.ones(3))
diet = bp.nodes.Categorical(p_diet, plates=(N,))
diet.observe(data[:,3])
p_lifestyle = bp.nodes.Dirichlet(1.0*np.ones(4))
lifestyle = bp.nodes.Categorical(p_lifestyle, plates=(N,))
lifestyle.observe(data[:,4])
p_cholesterol = bp.nodes.Dirichlet(1.0*np.ones(3))
cholesterol = bp.nodes.Categorical(p_cholesterol, plates=(N,))
cholesterol.observe(data[:,5])
p_heartdisease = bp.nodes.Dirichlet(np.ones(2), plates=(5, 2, 2, 3, 4, 3))
heartdisease = bp.nodes.MultiMixture([age, gender, familyhistory, diet, lifestyle, cholesterol],
                                     bp.nodes.Categorical, p_heartdisease)
heartdisease.observe(data[:,6])
p_heartdisease.update()
m = 0
while m == 0:
    print("\n")
    res = bp.nodes.MultiMixture([int(input('Enter Age: ' + str(ageEnum))),
                                 int(input('Enter Gender: ' + str(genderEnum))),
                                 int(input('Enter FamilyHistory: ' + str(familyHistoryEnum))),
                                 int(input('Enter Diet: ' + str(dietEnum))),
                                 int(input('Enter LifeStyle: ' + str(lifeStyleEnum))),
                                 int(input('Enter Cholesterol: ' + str(cholesterolEnum)))],
                                bp.nodes.Categorical,
                                p_heartdisease).get_moments()[0][heartDiseaseEnum['Yes']]
    print("Probability(HeartDisease) = " + str(res))
    m = int(input("Enter 0 to continue, 1 to exit: "))
OUTPUT:
RESULT:
EX.NO : 5 BUILD REGRESSION MODELS
DATE:
AIM:
ALGORITHM:
Step 1: Read the given data sample into X and the curve (linear or non-linear) into Y.
Step 2: Set the value of the smoothening (free) parameter τ.
Step 3: Set the bias / point of interest x0, which is a subset of X.
Step 4: Determine the weight matrix using the Gaussian kernel:
        w(x, x0) = exp( -(x - x0)^2 / (2τ^2) )
Step 5: Determine the model parameter β using:
        β(x0) = (X^T W X)^-1 X^T W Y
Step 6: Prediction = x0 · β
PROGRAM:
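A minimal sketch of locally weighted (Gaussian-kernel) regression following the steps above, assuming NumPy/Matplotlib and a synthetic noisy sine curve as the data sample (the function name local_weighted_fit and the value τ = 0.5 are illustrative):

import numpy as np
import matplotlib.pyplot as plt

def local_weighted_fit(x0, X, Y, tau):
    # Step 4: Gaussian kernel weights centred on the point of interest x0
    w = np.exp(-(X[:, 1] - x0[1]) ** 2 / (2 * tau ** 2))
    W = np.diag(w)
    # Step 5: weighted least squares, beta = (X^T W X)^-1 X^T W Y
    beta = np.linalg.pinv(X.T @ W @ X) @ X.T @ W @ Y
    # Step 6: prediction at x0
    return x0 @ beta

# Step 1: synthetic data sample (noisy sine curve) with a bias column added to X
x = np.linspace(0, 2 * np.pi, 200)
y = np.sin(x) + 0.1 * np.random.randn(200)
X = np.column_stack([np.ones_like(x), x])
tau = 0.5  # Step 2: smoothening parameter
y_hat = np.array([local_weighted_fit(np.array([1.0, x0]), X, y, tau) for x0 in x])

plt.scatter(x, y, s=8, label='data')
plt.plot(x, y_hat, color='red', label='LWR fit')
plt.legend()
plt.show()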
OUTPUT:
Result:
EX.NO : 6 BUILD DECISION TREES AND RANDOM FORESTS
DATE:
AIM:
ALGORITHM:
PROGRAM:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('/Users/ganesh/PycharmProjects/DecisionTree/Social_Network_Ads.csv')
data.head()
feature_cols = ['Age', 'EstimatedSalary']
x = data.iloc[:, [2, 3]].values
y = data.iloc[:, 4].values
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
from sklearn import metrics
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
from matplotlib.colors import ListedColormap
x_set, y_set = x_test, y_test
x1, x2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(x1, x2, classifier.predict(np.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape),
             alpha=0.75, cmap=ListedColormap(("red", "green")))
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
                c=ListedColormap(("red", "green"))(i), label=j)
plt.title("Decision Tree(Test set)")
plt.xlabel("Age")
plt.ylabel("Estimated Salary")
plt.legend()
plt.show()
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data, filled=True, rounded=True,
special_characters=True, feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.write_png('decisiontree.png'))
classifier = DecisionTreeClassifier(criterion="gini", max_depth=3)
classifier = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data, filled=True, rounded=True,
special_characters=True, feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.write_png('opt_decisiontree_gini.png'))
RESULT:
EX.NO: 7 BUILD SVM MODELS
DATE:
AIM:
ALGORITHM:
Step 1: Import all the necessary libraries.
Step 2: Read the given csv file which contains the emails which are both
spam and ham.
Step 3: Gather all the words given in that dataset and Identify the stop words
with a mean distribution.
Step 4: Create an ML model using the Support Vector Classifier after
splitting the dataset into training and test set.
Step 5: Display the accuracy and f1 score and print the confusion matrix
for the classification of spam and ham.
PROGRAM:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn import model_selection
from sklearn import svm
from nltk import word_tokenize
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from sklearn.metrics import plot_confusion_matrix
class data_read_write(object):
    def __init__(self, file_link=None):
        # read the CSV on construction when a path is supplied
        if file_link is not None:
            self.data_frame = pd.read_csv(file_link)
    def read_csv_file(self, file_link):
        self.data_frame = pd.read_csv(file_link)
        return self.data_frame
    def write_to_csvfile(self, file_link):
        self.data_frame.to_csv(file_link, encoding='utf-8', index=False, header=True)
        return
class generate_word_cloud(data_read_write):
    def __init__(self):
        pass
    def variance_column(self, data):
        return np.var(data)
    def word_cloud(self, data_frame_column, output_image_file):
        text = " ".join(review for review in data_frame_column)
        stopwords = set(STOPWORDS)
        stopwords.update(["subject"])
        wordcloud = WordCloud(width=1200, height=800, stopwords=stopwords,
                              max_font_size=50, margin=0,
                              background_color="white").generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig("Distribution.png")
        plt.show()
        wordcloud.to_file(output_image_file)
        return
class data_cleaning(data_read_write):
    def __init__(self):
        pass
    def message_cleaning(self, message):
        Test_punc_removed = [char for char in message if char not in string.punctuation]
        Test_punc_removed_join = ''.join(Test_punc_removed)
        Test_punc_removed_join_clean = [word for word in Test_punc_removed_join.split()
                                        if word.lower() not in stopwords.words('english')]
        final_join = ' '.join(Test_punc_removed_join_clean)
        return final_join
    def apply_to_column(self, data_column_text):
        data_processed = data_column_text.apply(self.message_cleaning)
        return data_processed
class apply_embeddding_and_model(data_read_write):
    def __init__(self):
        pass
    def apply_count_vector(self, v_data_column):
        vectorizer = CountVectorizer(min_df=2, analyzer="word", tokenizer=None,
                                     preprocessor=None, stop_words=None)
        return vectorizer.fit_transform(v_data_column)
    def apply_svm(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
        svm_cv = svm.SVC(C=params['C'], kernel=params['kernel'], gamma=params['gamma'],
                         probability=True)
        svm_cv.fit(X_train, y_train)
        y_predict_test = svm_cv.predict(X_test)
        cm = confusion_matrix(y_test, y_predict_test)
        sns.heatmap(cm, annot=True)
        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " + str(metrics.precision_score(y_test, y_predict_test)))
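# (assumed) driver setup elided in the original listing: load the raw email CSV and
# add a cleaned-text column; "emails.csv" and the "text" column name are hypothetical
data_obj = data_read_write("emails.csv")
data_frame = data_obj.data_frame
cleaner = data_cleaning()
data_frame['clean_text'] = cleaner.apply_to_column(data_frame['text'])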
data_frame.head()
data_obj.data_frame.head()
data_obj.write_to_csvfile("processed_file.csv")
cv_object = apply_embeddding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])
X = spamham_countvectorizer
label = data_frame['spam'].values
y = label
cv_object.apply_svm(X,y)
OUTPUT:
RESULT:
EX.NO: 8 IMPLEMENT ENSEMBLING TECHNIQUES
DATE:
AIM:
ALGORITHM:
1. Split the training dataset into train, test and validation dataset.
2. Fit all the base models using train dataset.
3. Make predictions on validation and test dataset.
4. These predictions are used as features to build a second level model
5. This model is used to make predictions on test and meta-features.
PROGRAM:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
df = pd.read_csv("train_data.csv")
target = df["target"]
train = df.drop("target", axis=1)
train_ratio = 0.70
validation_ratio = 0.20
test_ratio = 0.10
x_train, x_test, y_train, y_test = train_test_split(
    train, target, test_size=1 - train_ratio)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=test_ratio / (test_ratio + validation_ratio))
model_1 = LinearRegression()
model_2 = xgb.XGBRegressor()
model_3 = RandomForestRegressor()
model_1.fit(x_train, y_train)
val_pred_1 = model_1.predict(x_val)
test_pred_1 = model_1.predict(x_test)
val_pred_1 = pd.DataFrame(val_pred_1)
test_pred_1 = pd.DataFrame(test_pred_1)
model_2.fit(x_train, y_train)
val_pred_2 = model_2.predict(x_val)
test_pred_2 = model_2.predict(x_test)
val_pred_2 = pd.DataFrame(val_pred_2)
test_pred_2 = pd.DataFrame(test_pred_2)
model_3.fit(x_train, y_train)
val_pred_3 = model_3.predict(x_val)
test_pred_3 = model_3.predict(x_test)
val_pred_3 = pd.DataFrame(val_pred_3)
test_pred_3 = pd.DataFrame(test_pred_3)
df_val = pd.concat([x_val.reset_index(drop=True), val_pred_1, val_pred_2, val_pred_3], axis=1)
df_test = pd.concat([x_test.reset_index(drop=True), test_pred_1, test_pred_2, test_pred_3], axis=1)
final_model = LinearRegression()
final_model.fit(df_val, y_val)
final_pred = final_model.predict(df_test)
print(mean_squared_error(y_test, final_pred))
OUTPUT:
RESULT:
EX.NO: 9 IMPLEMENT CLUSTERING ALGORITHMS
DATE:
AIM:
Algorithm:
PROGRAM:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.metrics import classification_report
iris = datasets.load_iris()
iris_data = iris.data
iris_labels = iris.target
# (the model fitting that produces y_test and y_pred is omitted in this listing)
print("accuracy is")
print(classification_report(y_test, y_pred))
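A minimal self-contained sketch of clustering the same iris measurements with k-means, comparing the cluster assignments against the true species labels (the KMeans settings below are illustrative, not taken from the manual):

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

iris = datasets.load_iris()

# group the four measurements into three clusters
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
clusters = kmeans.fit_predict(iris.data)

# cluster ids are arbitrary, so read the matrix up to a permutation of its rows
print(confusion_matrix(iris.target, clusters))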
OUTPUT:
Result:
EX.NO: 10 IMPLEMENT EM FOR BAYESIAN NETWORKS
DATE:
AIM:
ALGORITHM:
PROGRAM:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = load_iris()
# print(dataset)
X = pd.DataFrame(dataset.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(dataset.target)
y.columns = ['Targets']
# print(X)
plt.figure(figsize=(14,7))
colormap=np.array(['red','lime','black'])
# REAL PLOT
plt.subplot(1, 3, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real')
# K-PLOT
plt.subplot(1,3,2)
model=KMeans(n_clusters=3)
model.fit(X)
predY=np.choose(model.labels_,[0,1,2]).astype(np.int64)
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[predY],s=40)
plt.title('KMeans')
# GMM PLOT
scaler=preprocessing.StandardScaler()
scaler.fit(X)
xsa=scaler.transform(X)
xs=pd.DataFrame(xsa,columns=X.columns)
gmm=GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm=gmm.predict(xs)
plt.subplot(1, 3, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')
plt.show()
OUTPUT:
RESULT:
EX.NO: 11 BUILD SIMPLE NN MODELS.
DATE:
AIM:
ALGORITHM:
1. Image Acquisition: The first step is to acquire images of paper documents with the
help of optical scanners. This way, an original image can be captured and stored.
2. Pre-processing: The noise level on an image should be optimized and areas outside
the text removed. Pre-processing is especially vital for recognizing handwritten
documents that are more sensitive to noise.
3. Segmentation: The process of segmentation is aimed at grouping characters into
meaningful chunks. There can be predefined classes for characters. So, images can
be scanned for patterns that match the classes.
4. Feature Extraction: This step means splitting the input data into a set of features,
that is, to find essential characteristics that make one or another pattern
recognizable.
5. Training an MLP neural network using the following steps:
1. Starting with the input layer, propagate data forward to the output layer.
This step is the forward propagation.
2. Based on the output, calculate the error (the difference between the
predicted and known outcome). The error needs to be minimized.
3. Backpropagate the error. Find its derivative with respect to each weight
in the network, and update the model.
6. Post processing: This stage is the process of refinement as an OCR model can
require some corrections. However, it isn’t possible to achieve 100% recognition
accuracy. The identification of characters heavily depends on the context.
PROGRAM:
model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE, epochs=NB_EPOCH,
                    verbose=VERBOSE,
                    validation_split=VALIDATION_SPLIT)
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("\nTest score:", score[0])
print('Test accuracy:', score[1])
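The fragment above assumes a model, data arrays, and hyperparameter constants defined earlier. A minimal sketch of those missing pieces, assuming Keras with the MNIST digit data (the layer sizes and constant values are illustrative):

from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# illustrative values standing in for the undefined constants used above
NB_EPOCH, BATCH_SIZE, VERBOSE, VALIDATION_SPLIT = 10, 128, 1, 0.2
OPTIMIZER = 'sgd'

# load and flatten the digit images, scale pixels to [0, 1], one-hot encode labels
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 784).astype('float32') / 255.0
X_test = X_test.reshape(-1, 784).astype('float32') / 255.0
Y_train = to_categorical(y_train, 10)
Y_test = to_categorical(y_test, 10)

# a small multilayer perceptron: one hidden ReLU layer, softmax output
model = Sequential([
    Dense(128, activation='relu', input_shape=(784,)),
    Dense(10, activation='softmax'),
])

With these definitions placed before it, the compile / fit / evaluate fragment above runs as written.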
OUTPUT:
RESULT:
EX.NO: 12 BUILD DEEP LEARNING NN MODELS
DATE:
AIM:
ALGORITHM:
Steps in CNN Algorithm:
Step-1: Choose the Dataset.
Step-2: Prepare the Dataset for training.
Step-3: Create training Data.
Step-4: Shuffle the Dataset.
Step-5: Assigning Labels and Features.
Step-6: Normalising X and converting labels to categorical data.
Step-7: Split X and Y for use in CNN.
Step-8: Define, compile and train the CNN Model.
Step-9: Accuracy and Score of the model (a minimal sketch of these steps is shown below).
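A minimal Keras sketch of the steps above, using the built-in CIFAR-10 images as a stand-in dataset (all layer sizes, epochs, and batch size are illustrative choices, not values from the manual):

from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical

# Steps 1-4: CIFAR-10 stands in for "choose and prepare the dataset"; it arrives
# already split into training and test portions
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Steps 5-7: normalise the pixel values and convert the labels to categorical form
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Step 8: define, compile and train a small CNN
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Step 9: accuracy and score of the model
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0], "Test accuracy:", score[1])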
PROGRAM:
import cv2 as cv
import math
import time
from google.colab.patches import cv2_imshow
def getFaceBox(net, frame, conf_threshold=0.7):
    frameOpencvDnn = frame.copy()
    frameHeight = frameOpencvDnn.shape[0]
    frameWidth = frameOpencvDnn.shape[1]
    blob = cv.dnn.blobFromImage(frameOpencvDnn, 1.0, (300, 300), [104, 117, 123], True, False)
    net.setInput(blob)
    detections = net.forward()
    bboxes = []
    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        if confidence > conf_threshold:
            x1 = int(detections[0, 0, i, 3] * frameWidth)
            y1 = int(detections[0, 0, i, 4] * frameHeight)
            x2 = int(detections[0, 0, i, 5] * frameWidth)
            y2 = int(detections[0, 0, i, 6] * frameHeight)
            bboxes.append([x1, y1, x2, y2])
            cv.rectangle(frameOpencvDnn, (x1, y1), (x2, y2), (0, 255, 0),
                         int(round(frameHeight / 150)), 8)
    return frameOpencvDnn, bboxes
faceProto = "/content/opencv_face_detector.pbtxt"
faceModel = "/content/opencv_face_detector_uint8.pb"
ageProto = "/content/age_deploy.prototxt"
ageModel = "/content/age_net.caffemodel"
genderProto = "/content/gender_deploy.prototxt"
genderModel = "/content/gender_net.caffemodel"
MODEL_MEAN_VALUES = (78.4263377603, 87.7689143744, 114.895847746)
ageList = ['(0-2)', '(4-6)', '(8-12)', '(15-20)', '(25-32)', '(38-43)', '(48-53)', '(60-100)']
genderList = ['Male', 'Female']
ageNet = cv.dnn.readNet(ageModel, ageProto)
genderNet = cv.dnn.readNet(genderModel, genderProto)
faceNet = cv.dnn.readNet(faceModel, faceProto)
padding = 20  # (assumed) margin in pixels around each detected face box; not defined in the original
def age_gender_detector(frame):
    # Read frame
    t = time.time()
    frameFace, bboxes = getFaceBox(faceNet, frame)
    for bbox in bboxes:
        # print(bbox)
        face = frame[max(0, bbox[1]-padding):min(bbox[3]+padding, frame.shape[0]-1),
                     max(0, bbox[0]-padding):min(bbox[2]+padding, frame.shape[1]-1)]
        blob = cv.dnn.blobFromImage(face, 1.0, (227, 227), MODEL_MEAN_VALUES, swapRB=False)
        genderNet.setInput(blob)
        genderPreds = genderNet.forward()
        gender = genderList[genderPreds[0].argmax()]
        # print("Gender Output : {}".format(genderPreds))
        print("Gender : {}, conf = {:.3f}".format(gender, genderPreds[0].max()))
        ageNet.setInput(blob)
        agePreds = ageNet.forward()
        age = ageList[agePreds[0].argmax()]
        print("Age Output : {}".format(agePreds))
        print("Age : {}, conf = {:.3f}".format(age, agePreds[0].max()))
        label = "{},{}".format(gender, age)
        cv.putText(frameFace, label, (bbox[0], bbox[1]-10), cv.FONT_HERSHEY_SIMPLEX, 0.8,
                   (0, 255, 255), 2, cv.LINE_AA)
    return frameFace
from google.colab import files
uploaded = files.upload()
image = cv.imread("2.jpg")
output = age_gender_detector(image)
cv2_imshow(output)
OUTPUT:
RESULT: