0% found this document useful (0 votes)
8 views11 pages

ML Lab1

Uploaded by

ssmmiitt12
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views11 pages

ML Lab1

Uploaded by

ssmmiitt12
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 11

Name: Smit Ahire,

Roll. No: 05,


PRN: 12311496

Linear regression
# Data handling and plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# scikit-learn: splitting, models, preprocessing, and metrics.
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

# Load the fuel-consumption dataset into a DataFrame and preview its first rows.
# NOTE(review): the filename contains a space before ".csv" — confirm it matches
# the uploaded file's actual name.
fc = pd.read_csv('/content/FuelConsumption .csv')
fc.head()

Year MAKE MODEL VEHICLE ENGINE SIZE CYLINDERS TRANSMISSION FUEL FUEL COEMISSIONS
CLASS CONSUMPTION
0 2000 ACURA 1.6EL COMPACT 1.6 4 A4 X 10.5 216

1 2000 ACURA 1.6EL COMPACT 1.6 4 M5 X 9.8 205

2 2000 ACURA 3.2TL MID-SIZE 3.2 6 AS5 Z 13.7 265

3 2000 ACURA 3.5RL MID-SIZE 3.5 6 A4 Z 15.0 301

4 2000 ACURA INTEGRA SUBCOMPACT 1.8 4 A4 X 11.4 230

# Show the DataFrame dimensions as (rows, columns).
fc.shape

(639, 10)

# Print each column's name, non-null count, and dtype.
fc.info()

<class
'pandas.core.frame.DataFrame'>
RangeIndex: 639 entries, 0 to 638
Data columns (total 10 columns):
# Column Non-Null Dtyp
Count e
0 Year 639 non-null int6
4
1 MAKE 639 non-null objec
t
2 MODEL 639 non-null objec
t
3 VEHICLE CLASS 639 non-null objec
t
4 ENGINE SIZE 639 non-null float6
4
5 CYLINDERS 639 non-null int6
4
6 TRANSMISSION 639 non-null objec
t
7 FUEL 639 non-null objec
t
8 FUEL CONSUMPTION 639 non-null float6
4
9 COEMISSIONS 639 non-null int6
4
dtypes: float64(2), int64(3),
object(5) memory usage: 50.1+ KB

# Drop any rows that contain missing values, modifying `fc` in place.
fc.dropna(inplace=True)

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
fc.describe()
Year ENGINE SIZE CYLINDERS FUEL COEMISSIONS
CONSUMPTION
count 639.0 639.000000 639.00000 639.000000 639.000000
0
mean 2000.0 3.265728 5.805947 14.713615 296.809077

std 0.0 1.231012 1.625588 3.307044 65.504178

min 2000.0 1.000000 3.000000 4.900000 104.000000

25% 2000.0 2.200000 4.000000 12.500000 253.000000

50% 2000.0 3.000000 6.000000 14.400000 288.000000

75% 2000.0 4.300000 6.000000 16.600000 343.000000

max 2000.0 8.000000 12.000000 30.200000 582.000000

# List the available column labels.
fc.columns
# Keep only the numeric columns used for the regression experiments.
# The target label 'COEMISSIONS ' is spelled with a trailing space here.
# NOTE(review): presumably this matches the CSV header exactly — confirm.
cfc = fc[['ENGINE SIZE','CYLINDERS','FUEL CONSUMPTION','COEMISSIONS ']]
cfc.head()

ENGINE SIZE CYLINDERS FUEL COEMISSIONS


CONSUMPTION
0 1.6 4 10.5 216

1 1.6 4 9.8 205

2 3.2 6 13.7 265

3 3.5 6 15.0 301

4 1.8 4 11.4 230

# Scatter-plot each selected column against CO emissions, one figure per
# column, to see which features relate roughly linearly to the target.
# The last entry is the target itself, so its plot is the trivial y = x line.
for i in ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS ']:
    plt.scatter(cfc[i], cfc['COEMISSIONS '], color='green')
    plt.xlabel(i)
    plt.ylabel("EMISSION")
    # NOTE(review): show() is placed inside the loop (one figure per feature);
    # the original indentation was lost, so confirm this was the intent.
    plt.show()
Train test split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
train, test = train_test_split(cfc, test_size=0.20, random_state=42)

Apply Linear Regression on Training data.


calculating coefficient and intercept

# Fit one simple (single-feature) linear regression per predictor and record
# the fitted slope and intercept of each.
coefficient = []
intercept = []
regress_model = {}  # feature name -> fitted LinearRegression, reused for evaluation
for i in ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION']:
    reg = linear_model.LinearRegression()
    # Double brackets keep both arrays 2-D (n_samples, 1), which is why the
    # printed coefficient is a nested [[...]] array.
    train_x = np.asanyarray(train[[i]])
    train_y = np.asanyarray(train[['COEMISSIONS ']])
    reg.fit(train_x, train_y)
    regress_model[i] = reg
    print("Relation between {} & {}".format(i, "'COEMISSION'"))
    print("Coefficient :", reg.coef_)
    print("Intercept :", reg.intercept_)
    coefficient.append(reg.coef_)
    intercept.append(reg.intercept_)
    print('\n')

Relation between ENGINE SIZE & 'COEMISSION'


Coefficient : [[45.30016431]]
Intercept : [147.95137136]

Relation between CYLINDERS &


'COEMISSION' Coefficient :
[[32.86827926]]
Intercept : [105.00863971]

Relation between FUEL CONSUMPTION & 'COEMISSION'


Coefficient : [[19.572552]]
Intercept : [8.77548259]

Evaluate model performance on test data.

Calculating mean absolute error, mean squared error (MSE), and R2 score

# Evaluate each single-feature model on the held-out test set.
for i in ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION']:
    test_x = np.asanyarray(test[[i]])
    test_y = np.asanyarray(test[['COEMISSIONS ']])
    # Despite its name, test_x_ holds the model's predicted emissions.
    test_x_ = regress_model[i].predict(test_x)
    print("Fitting Error between {} & {}".format(i, "'CO2EMISSIONS'"))
    print("Mean absolute error: %.2f" % np.mean(np.absolute(test_x_ - test_y)))
    # This is the mean of the squared residuals (MSE), not their sum.
    print("Residual sum of squares (MSE): %.2f" % np.mean((test_x_ - test_y) ** 2))
    # r2_score expects (y_true, y_pred) and is not symmetric; the original call
    # passed the predictions first, so scores recorded from it will differ.
    print("R2-score: %.2f" % r2_score(test_y, test_x_))
    print('\n')

Fitting Error between ENGINE SIZE & 'CO2EMISSIONS'


Mean absolute error: 28.00
Residual sum of squares (MSE): 1720.37
R2-score: 0.53
Fitting Error between CYLINDERS & 'CO2EMISSIONS'
Mean absolute error: 33.65
Residual sum of squares (MSE): 1825.88
R2-score: 0.56

Fitting Error between FUEL CONSUMPTION & 'CO2EMISSIONS'


Mean absolute error: 7.77
Residual sum of squares (MSE): 98.84
R2-score: 0.98
LOGISTIC REGRESSION ON UNLABELLED DATA WITH MULTICLASS

CLASSIFICATION
# Data handling and plotting.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn: splitting, encoders, model, and classification metrics.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# The CSV is read with header=None, so the column labels are supplied here;
# the last one ('class') is the prediction target.
column_names = [
    'buying_cost',
    'maintainance_cost',
    'doors',
    'person_capacity',
    'lug_boot',
    'safety',
    'class',
]

# Read the raw file, treating the first line as data rather than a header.
df = pd.read_csv(
    'car_evaluation.csv',
    names=column_names,
    header=None,
)
df.head()

buying_cost maintainance_cost doors person_capacity lug_boot safety class

0 vhigh 2 2 small low unacc


vhigh
1 vhigh vhigh 2 2 small med unacc

2 vhigh vhigh 2 2 small high unacc

3 vhigh vhigh 2 2 med low unacc

4 vhigh vhigh 2 2 med med unacc

Next steps: Generate code with New interactive


df  View recommended plots
sheet

# Print each column's name, non-null count, and dtype under a heading.
print("\nDataset Information:")
df.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
# Column Non-Null Dtyp
Count e
0 buying_cost 1728 non- objec
null t
1 maintainance_cost 1728 non- objec
null t
2 doors 1728 non- objec
null t
3 person_capacity 1728 non- objec
null t
4 lug_boot 1728 non- objec
null t
5 safety 1728 non- objec
null t
6 class 1728 non- objec
null t
dtypes: object(7)
memory usage: 94.6+ KB

# Report the dataset dimensions as (rows, columns).
print("\nShape of the dataset (rows, columns):", df.shape)

Shape of the dataset (rows, columns): (1728, 7)

# Count the null entries in every column.
print("\nChecking for missing values:")
print(df.isnull().sum()) # No missing values expected for this dataset

Checking for missing


values: buying_cost 0
maintainance_cost 0
doors 0
person_capacity 0
lug_boot 0
safety 0
class 0
dtype: int64

# List the distinct values of every column (features and target); these are
# the values the explicit category orderings must cover.
print("\nUnique values in each categorical column:")
for col in column_names:
    print(f"{col}: {df[col].unique()}")

 What can I help you build? 


Unique values in each categorical
column: buying_cost: ['vhigh' 'high'
'med' 'low']
maintainance_cost: ['vhigh' 'high' 'med' 'low']
doors: ['2' '3' '4' '5more']
person_capacity: ['2' '4' 'more']
lug_boot: ['small' 'med' 'big']
safety: ['low' 'med' 'high']
class: ['unacc' 'acc' 'vgood' 'good']

# Define feature columns (all columns except the last one, which is the target)
categorical_features = column_names[:-1]
target_column = 'class'

# Explicit value order for each feature, listed in the same order as
# `categorical_features`. The position of a value in its list is the integer
# the OrdinalEncoder assigns to it (e.g. 'low' -> 0, 'vhigh' -> 3).
feature_categories = [
['low', 'med', 'high', 'vhigh'], # buying_cost
['low', 'med', 'high', 'vhigh'], # maintainance_cost
['2', '3', '4', '5more'], # doors
['2', '4', 'more'], # person_capacity
['small', 'med', 'big'], # lug_boot
['low', 'med', 'high'] # safety
]

# Initialize OrdinalEncoder for features
# `handle_unknown='use_encoded_value', unknown_value=-1` can be used for unseen
# categories, but for this fixed dataset, it's not strictly necessary.
encoder = OrdinalEncoder(categories=feature_categories)

# Apply encoding to the feature columns, replacing the string values in `df`
# with their ordinal codes.
df[categorical_features] = encoder.fit_transform(df[categorical_features])
print("\nFeatures after Ordinal Encoding:")
print(df[categorical_features].head())

Features after Ordinal Encoding:


buying_cost maintainance_cost doors person_capacity lug_boot safety
0 3.0 3.0 0.0 0.0 0.0 0.0
1 3.0 3.0 0.0 0.0 0.0 1.0
2 3.0 3.0 0.0 0.0 0.0 2.0
3 3.0 3.0 0.0 0.0 1.0 0.0
4 3.0 3.0 0.0 0.0 1.0 1.0

# Initialize LabelEncoder for the target variable
label_encoder = LabelEncoder()

# Apply encoding to the target column, overwriting the string labels in `df`
# with integer codes. The codes follow the order of `label_encoder.classes_`.
df[target_column] = label_encoder.fit_transform(df[target_column])
print("\nTarget variable after Label Encoding:")
print(df[target_column].head())

Target variable after Label Encoding:


0 2
1 2
2 2
3 2
4 2
Name: class, dtype: int64

# Display the mapping of original labels to encoded numbers for the target variable.
# The index of a label in `classes_` is the integer code assigned to it.
print("\nMapping of target labels:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{label}: {i}")

Mapping of target labels:


acc: 0
good: 1
unacc: 2
vgood: 3

# Define X (features) and y (target)
X = df[categorical_features]
y = df[target_column]

# Split the data into training and testing sets
# random_state for reproducibility
# stratify=y ensures that the proportion of target classes is maintained in splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (1382, 6)


Testing set shape: (346, 6)

# Initialize the Logistic Regression model
# max_iter is increased to ensure convergence for some datasets
# solver='liblinear' is good for small datasets and handles L1/L2 regularization
# NOTE(review): newer scikit-learn releases appear to deprecate 'liblinear' for
# multiclass targets — verify against the installed version before upgrading.
model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Train the model using the training data only; the test split is not used
# until prediction and evaluation.
print("\nTraining Logistic Regression model...")
model.fit(X_train, y_train)
print("Model training complete.")

Training Logistic Regression model...


Model training complete.

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model: overall accuracy, confusion matrix, and a per-class
# precision/recall/F1 report labelled with the original class names.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)
print(f"\nModel Accuracy: {accuracy:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

Model Accuracy: 0.8092

Confusion Matrix:

[[ 36 2 39 0]
[ 5 5 4 0]
[ 5 0 237 0]
[ 11 0 0 2]]
# Draw the confusion matrix as an annotated heatmap; both axes are labelled
# with the original class names taken from the label encoder.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Car Evaluation Classification')
plt.show()

# Print the per-class report stored in `class_report`.
print("\nClassification Report:")
print(class_report)

Classification
Report: recall f1-score support
precision
acc 0.63 0.47 0.54 77
good 0.71 0.36 0.48 14
unacc 0.85 0.98 0.91 242
vgood 1.00 0.15 0.27 13

accuracy 0.81 346


macro avg 0.80 0.49 0.55 346
weighted avg 0.80 0.81 0.78 346

You might also like