In [1]: import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
In [2]: data=pd.read_csv("cars .csv")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 404 non-null int64
1 symboling 404 non-null int64
2 CarName 404 non-null object
3 fueltype 403 non-null object
4 aspiration 403 non-null object
5 doornumber 403 non-null object
6 carbody 403 non-null object
7 drivewheel 404 non-null object
8 enginelocation 404 non-null object
9 wheelbase 404 non-null float64
10 carlength 404 non-null float64
11 carwidth 404 non-null float64
12 carheight 404 non-null float64
13 curbweight 404 non-null int64
14 enginetype 404 non-null object
15 cylindernumber 404 non-null object
16 enginesize 404 non-null int64
17 fuelsystem 404 non-null object
18 boreratio 404 non-null float64
19 stroke 404 non-null float64
20 compressionratio 404 non-null float64
21 horsepower 404 non-null int64
22 peakrpm 404 non-null int64
23 citympg 404 non-null int64
24 highwaympg 404 non-null int64
25 price 404 non-null float64
dtypes: float64(8), int64(8), object(10)
memory usage: 82.2+ KB
In [3]: data=data.dropna()
In [4]: data.describe(include=['object'])
Out[4]: CarName fueltype aspiration doornumber carbody drivewheel enginelocation enginetype cylindernumber fuelsyste
count 400 400 400 400 400 400 400 400 400 4
unique 319 2 2 2 5 3 2 7 7
toyota
top gas std four sedan fwd front ohc four m
corolla
freq 10 363 353 215 173 228 397 305 315 1
In [5]: data=data.drop(columns=["CarName"],axis=1)
In [6]: object=data.select_dtypes(include=['object']).columns.tolist()
dummeis=pd.get_dummies(data,columns=object)
In [7]: corr_matrix = dummeis.corr()
plt.figure(figsize=(50, 40))
sb.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f',
xticklabels=corr_matrix.columns,
yticklabels=corr_matrix.columns)
plt.title('Correlation Heatmap of DataFrame')
plt.show()
In [8]: dummeis.columns  # display the column labels of the one-hot encoded frame
Out[8]: Index(['car_ID', 'symboling', 'wheelbase', 'carlength', 'carwidth',
'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke',
'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
'price', 'fueltype_diesel', 'fueltype_gas', 'aspiration_std',
'aspiration_turbo', 'doornumber_four', 'doornumber_two',
'carbody_convertible', 'carbody_hardtop', 'carbody_hatchback',
'carbody_sedan', 'carbody_wagon', 'drivewheel_4wd', 'drivewheel_fwd',
'drivewheel_rwd', 'enginelocation_front', 'enginelocation_rear',
'enginetype_dohc', 'enginetype_dohcv', 'enginetype_l', 'enginetype_ohc',
'enginetype_ohcf', 'enginetype_ohcv', 'enginetype_rotor',
'cylindernumber_eight', 'cylindernumber_five', 'cylindernumber_four',
'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve',
'cylindernumber_two', 'fuelsystem_1bbl', 'fuelsystem_2bbl',
'fuelsystem_4bbl', 'fuelsystem_idi', 'fuelsystem_mfi',
'fuelsystem_mpfi', 'fuelsystem_spdi', 'fuelsystem_spfi'],
dtype='object')
In [9]: remove_col=['car_ID', 'symboling','carheight','stroke','peakrpm','fueltype','aspiration','doornumber',
In [10]: data=data.drop(columns=remove_col,axis=1)
In [11]: object=data.select_dtypes(include=['object']).columns.tolist()
dum=pd.get_dummies(data,columns=object)
In [12]: x=dum.drop("price",axis=1)
y=dum["price"]
In [13]: x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
In [14]: model1=RandomForestRegressor()
model1.fit(x_train,y_train)
model1.score(x_test,y_test)
Out[14]: 0.9554863631762747