Name: Smit Ahire,
Roll. No: 05,
PRN: 12311496
Linear regression
# Imports for the linear-regression part of the notebook (one statement per line).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
# Load the fuel-consumption dataset (note: the file name contains a space before ".csv").
fc = pd.read_csv('/content/FuelConsumption .csv')
# Preview the first rows of the loaded frame.
fc.head()
Year MAKE MODEL VEHICLE CLASS ENGINE SIZE CYLINDERS TRANSMISSION FUEL FUEL CONSUMPTION COEMISSIONS
0 2000 ACURA 1.6EL COMPACT 1.6 4 A4 X 10.5 216
1 2000 ACURA 1.6EL COMPACT 1.6 4 M5 X 9.8 205
2 2000 ACURA 3.2TL MID-SIZE 3.2 6 AS5 Z 13.7 265
3 2000 ACURA 3.5RL MID-SIZE 3.5 6 A4 Z 15.0 301
4 2000 ACURA INTEGRA SUBCOMPACT 1.8 4 A4 X 11.4 230
# (rows, columns) of the loaded frame.
fc.shape
(639, 10)
# Per-column non-null counts and dtypes.
fc.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639 entries, 0 to 638
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
 0   Year              639 non-null    int64
 1   MAKE              639 non-null    object
 2   MODEL             639 non-null    object
 3   VEHICLE CLASS     639 non-null    object
 4   ENGINE SIZE       639 non-null    float64
 5   CYLINDERS         639 non-null    int64
 6   TRANSMISSION      639 non-null    object
 7   FUEL              639 non-null    object
 8   FUEL CONSUMPTION  639 non-null    float64
 9   COEMISSIONS       639 non-null    int64
dtypes: float64(2), int64(3), object(5)
memory usage: 50.1+ KB
# Drop rows that contain missing values, modifying fc in place.
fc.dropna(inplace=True)
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
fc.describe()
Year ENGINE SIZE CYLINDERS FUEL CONSUMPTION COEMISSIONS
count 639.0 639.000000 639.000000 639.000000 639.000000
mean 2000.0 3.265728 5.805947 14.713615 296.809077
std 0.0 1.231012 1.625588 3.307044 65.504178
min 2000.0 1.000000 3.000000 4.900000 104.000000
25% 2000.0 2.200000 4.000000 12.500000 253.000000
50% 2000.0 3.000000 6.000000 14.400000 288.000000
75% 2000.0 4.300000 6.000000 16.600000 343.000000
max 2000.0 8.000000 12.000000 30.200000 582.000000
fc.columns
# Keep only the numeric columns used for the regressions.
# The 'COEMISSIONS ' label is written with a trailing space, exactly as used below.
cfc = fc[
    [
        'ENGINE SIZE',
        'CYLINDERS',
        'FUEL CONSUMPTION',
        'COEMISSIONS ',
    ]
]
cfc.head()
ENGINE SIZE CYLINDERS FUEL COEMISSIONS
CONSUMPTION
0 1.6 4 10.5 216
1 1.6 4 9.8 205
2 3.2 6 13.7 265
3 3.5 6 15.0 301
4 1.8 4 11.4 230
# Scatter each selected column against the emissions column, one figure per column.
# Iterating the label list directly yields the same labels as iterating the
# column-subset DataFrame, without building an intermediate frame.
for i in ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS ']:
    plt.scatter(cfc[i], cfc['COEMISSIONS '], color='green')
    plt.xlabel(i)
    plt.ylabel("EMISSION")
    plt.show()
Train test split
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
train, test = train_test_split(cfc, test_size=0.20, random_state=42)
Apply Linear Regression on Training data.
calculating coefficient and intercept
# Fit one single-feature linear regression per predictor and keep each fitted
# model (keyed by feature name) together with its coefficient and intercept.
coefficient = []
intercept = []
regress_model = {}

# The target does not depend on the feature, so build it once outside the loop.
# Double brackets keep it 2-D, which is why coef_/intercept_ print as arrays.
train_y = np.asanyarray(train[['COEMISSIONS ']])

for i in ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION']:
    reg = linear_model.LinearRegression()
    train_x = np.asanyarray(train[[i]])
    reg.fit(train_x, train_y)
    regress_model[i] = reg

    print("Relation between {} & {}".format(i, "'COEMISSION'"))
    print("Coefficient :", reg.coef_)
    print("Intercept :", reg.intercept_)
    coefficient.append(reg.coef_)
    intercept.append(reg.intercept_)
    print('\n')
Relation between ENGINE SIZE & 'COEMISSION'
Coefficient : [[45.30016431]]
Intercept : [147.95137136]
Relation between CYLINDERS & 'COEMISSION'
Coefficient : [[32.86827926]]
Intercept : [105.00863971]
Relation between FUEL CONSUMPTION & 'COEMISSION'
Coefficient : [[19.572552]]
Intercept : [8.77548259]
Evaluate model performance on test data.
calculating Mean absolute error,Residual sum of squares,R2-score
# Evaluate each single-feature model on the held-out test rows.
# The true target is the same for every feature, so build it once.
test_y = np.asanyarray(test[['COEMISSIONS ']])

for i in ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION']:
    test_x = np.asanyarray(test[[i]])
    pred_y = regress_model[i].predict(test_x)

    print("Fitting Error between {} & {}".format(i, "'CO2EMISSIONS'"))
    print("Mean absolute error: %.2f" % np.mean(np.absolute(pred_y - test_y)))
    # The value printed here is the mean of the squared residuals.
    print("Residual sum of squares (MSE): %.2f" % np.mean((pred_y - test_y) ** 2))
    # r2_score takes (y_true, y_pred) and is not symmetric. The arguments were
    # previously swapped, so R2 values recorded from earlier runs may differ.
    print("R2-score: %.2f" % r2_score(test_y, pred_y))
    print('\n')
Fitting Error between ENGINE SIZE & 'CO2EMISSIONS'
Mean absolute error: 28.00
Residual sum of squares (MSE): 1720.37
R2-score: 0.53
Fitting Error between CYLINDERS & 'CO2EMISSIONS'
Mean absolute error: 33.65
Residual sum of squares (MSE): 1825.88
R2-score: 0.56
Fitting Error between FUEL CONSUMPTION & 'CO2EMISSIONS'
Mean absolute error: 7.77
Residual sum of squares (MSE): 98.84
R2-score: 0.98
LOGISTIC REGRESSION ON UNLABELLED DATA WITH MULTICLASS
CLASSIFICATION
# Imports for the logistic-regression part of the notebook (one statement per line).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Column labels are supplied by hand because the CSV is read with header=None.
column_names = [
    'buying_cost',
    'maintainance_cost',
    'doors',
    'person_capacity',
    'lug_boot',
    'safety',
    'class',
]
df = pd.read_csv('car_evaluation.csv', names=column_names, header=None)
df.head()
buying_cost maintainance_cost doors person_capacity lug_boot safety class
0 vhigh vhigh 2 2 small low unacc
1 vhigh vhigh 2 2 small med unacc
2 vhigh vhigh 2 2 small high unacc
3 vhigh vhigh 2 2 med low unacc
4 vhigh vhigh 2 2 med med unacc
Next steps: Generate code with New interactive
df View recommended plots
sheet
# Print a heading, then the per-column non-null counts and dtypes.
print("\nDataset Information:")
df.info()
Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype
 0   buying_cost        1728 non-null   object
 1   maintainance_cost  1728 non-null   object
 2   doors              1728 non-null   object
 3   person_capacity    1728 non-null   object
 4   lug_boot           1728 non-null   object
 5   safety             1728 non-null   object
 6   class              1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB
# Report the frame's dimensions as a (rows, columns) tuple.
print("\nShape of the dataset (rows, columns):", df.shape)
Shape of the dataset (rows, columns): (1728, 7)
# Count null entries per column.
print("\nChecking for missing values:")
print(df.isnull().sum()) # No missing values expected for this dataset
Checking for missing values:
buying_cost 0
maintainance_cost 0
doors 0
person_capacity 0
lug_boot 0
safety 0
class 0
dtype: int64
# List the distinct values found in every column, features and target alike.
print("\nUnique values in each categorical column:")
for col in column_names:
    print(f"{col}: {df[col].unique()}")
What can I help you build?
Unique values in each categorical column:
buying_cost: ['vhigh' 'high' 'med' 'low']
maintainance_cost: ['vhigh' 'high' 'med' 'low']
doors: ['2' '3' '4' '5more']
person_capacity: ['2' '4' 'more']
lug_boot: ['small' 'med' 'big']
safety: ['low' 'med' 'high']
class: ['unacc' 'acc' 'vgood' 'good']
# Define feature columns (all columns except the last one, which is the target)
categorical_features = column_names[:-1]
target_column = 'class'

# One category list per feature, in the same order as categorical_features.
# Each list runs from lowest to highest level, so the encoded number reflects rank.
feature_categories = [
    ['low', 'med', 'high', 'vhigh'],  # buying_cost
    ['low', 'med', 'high', 'vhigh'],  # maintainance_cost
    ['2', '3', '4', '5more'],         # doors
    ['2', '4', 'more'],               # person_capacity
    ['small', 'med', 'big'],          # lug_boot
    ['low', 'med', 'high'],           # safety
]

# Initialize OrdinalEncoder for features
# `handle_unknown='use_encoded_value', unknown_value=-1` can be used for unseen categories
# but for this fixed dataset, it's not strictly necessary.
encoder = OrdinalEncoder(categories=feature_categories)

# Apply encoding to the feature columns
df[categorical_features] = encoder.fit_transform(df[categorical_features])

print("\nFeatures after Ordinal Encoding:")
print(df[categorical_features].head())
Features after Ordinal Encoding:
buying_cost maintainance_cost doors person_capacity lug_boot safety
0 3.0 3.0 0.0 0.0 0.0 0.0
1 3.0 3.0 0.0 0.0 0.0 1.0
2 3.0 3.0 0.0 0.0 0.0 2.0
3 3.0 3.0 0.0 0.0 1.0 0.0
4 3.0 3.0 0.0 0.0 1.0 1.0
# Initialize LabelEncoder for the target variable
label_encoder = LabelEncoder()
# Apply encoding to the target column: the string class labels are replaced
# in place by integer codes.
df[target_column] = label_encoder.fit_transform(df[target_column])
print("\nTarget variable after Label Encoding:")
print(df[target_column].head())
Target variable after Label Encoding:
0 2
1 2
2 2
3 2
4 2
Name: class, dtype: int64
# Display the mapping of original labels to encoded numbers for the target variable.
# The position of a label in classes_ is the integer code assigned to it.
print("\nMapping of target labels:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{label}: {i}")
Mapping of target labels:
acc: 0
good: 1
unacc: 2
vgood: 3
# Define X (features) and y (target)
X = df[categorical_features]
y = df[target_column]

# Split the data into training and testing sets
# random_state for reproducibility
# stratify=y ensures that the proportion of target classes is maintained in splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
Training set shape: (1382, 6)
Testing set shape: (346, 6)
# Initialize the Logistic Regression model
# max_iter is increased to ensure convergence for some datasets
# solver='liblinear' is good for small datasets and handles L1/L2 regularization
# NOTE(review): liblinear handles multiclass targets one-vs-rest; newer
# scikit-learn releases reportedly deprecate this - confirm against the installed version.
model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

# Train the model using the training data
print("\nTraining Logistic Regression model...")
model.fit(X_train, y_train)
print("Model training complete.")
Training Logistic Regression model...
Model training complete.
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# target_names swaps the integer codes in the report for the original class labels.
class_report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
Model Accuracy: 0.8092
Confusion Matrix:
[[ 36 2 39 0]
[ 5 5 4 0]
[ 5 0 237 0]
[ 11 0 0 2]]
# Draw the confusion matrix as an annotated heatmap with class names on both axes.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # integer counts in each cell
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Car Evaluation Classification')
plt.show()

# Per-class precision / recall / F1 computed in the evaluation step.
print("\nClassification Report:")
print(class_report)
Classification Report:
precision recall f1-score support
acc 0.63 0.47 0.54 77
good 0.71 0.36 0.48 14
unacc 0.85 0.98 0.91 242
vgood 1.00 0.15 0.27 13
accuracy 0.81 346
macro avg 0.80 0.49 0.55 346
weighted avg 0.80 0.81 0.78 346