In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
In [2]:
df=pd.read_csv('diamonds.csv')
In [3]:
df.head()
Out[3]:
Unnamed: 0 carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [4]:
nulls=df.isnull().sum()
print(nulls)
Unnamed: 0 0
carat 0
cut 0
color 0
clarity 0
depth 0
table 0
price 0
x 0
y 0
z 0
dtype: int64
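No missing values here, so nothing needs imputing. If any count had been non-zero, a minimal follow-up (a sketch, not run in this notebook; the imputed column is just an example) might be:
df = df.dropna()  # drop rows with any missing value
# or impute a single column instead, e.g.:
# df['depth'] = df['depth'].fillna(df['depth'].median())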
In [5]:
to_drop=['Unnamed: 0']
df.drop(columns=to_drop,inplace=True)
In [6]:
df.head()
Out[6]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [7]:
df.shape
Out[7]:
(53940, 10)
In [8]:
df.dtypes
Out[8]:
carat float64
cut object
color object
clarity object
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object
In [9]:
df.describe()
Out[9]:
carat depth table price x y
count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000
mean 0.797940 61.749405 57.457184 3932.799722 5.731157 5.734526
std 0.474011 1.432621 2.234491 3989.439738 1.121761 1.142135
min 0.200000 43.000000 43.000000 326.000000 0.000000 0.000000
25% 0.400000 61.000000 56.000000 950.000000 4.710000 4.720000
50% 0.700000 61.800000 57.000000 2401.000000 5.700000 5.710000
75% 1.040000 62.500000 59.000000 5324.250000 6.540000 6.540000
max 5.010000 79.000000 95.000000 18823.000000 10.740000 58.900000
In [10]:
cols=['cut','color','clarity']
df_new=df.drop(columns=cols,inplace=False)
#method 2: df_new = df[['carat','depth', ...all numerical columns]] (see the sketch below)
#dropping categorical columns so z-scores can be computed; z-scores are useful for finding outliers
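The "method 2" in the comment can also be written without listing every column, using pandas' dtype-based selection (a sketch; equivalent here because the only non-numeric columns are the three categorical ones):
df_new = df.select_dtypes(include='number')  # keeps carat, depth, table, price, x, y, z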
In [11]:
df_new.dtypes
Out[11]:
carat float64
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object
In [12]:
z_score= (df_new-df_new.mean())/df_new.std()
z_score # the z-score rescales each feature to mean 0 and standard deviation 1
Out[12]:
carat depth table price x y z
0 -1.198157 -0.174090 -1.099662 -0.904087 -1.587823 -1.536181 -1.571115
1 -1.240350 -1.360726 1.585514 -0.904087 -1.641310 -1.658759 -1.741159
2 -1.198157 -3.384987 3.375631 -0.903836 -1.498677 -1.457382 -1.741159
3 -1.071577 0.454129 0.242926 -0.902081 -1.364959 -1.317293 -1.287708
4 -1.029384 1.082348 0.242926 -0.901831 -1.240155 -1.212227 -1.117663
... ... ... ... ... ... ... ...
53935 -0.164426 -0.662705 -0.204603 -0.294728 0.016798 0.022304 -0.054887
53936 -0.164426 0.942744 -1.099662 -0.294728 -0.036690 0.013548 0.100987
53937 -0.206619 0.733338 1.137985 -0.294728 -0.063434 -0.047740 0.030135
53938 0.130926 -0.523100 0.242926 -0.294728 0.373380 0.337503 0.285201
53939 -0.101136 0.314525 -1.099662 -0.294728 0.088114 0.118615 0.143498
53940 rows × 7 columns
In [13]:
outliers=df_new[(z_score>3).any(axis=1)]
outliers
#after z-scoring, every feature has standard deviation 1 and mean 0; any row with a z-score above 3 in any column is flagged as an outlier (a symmetric variant is sketched after the output below)
Out[13]:
carat depth table price x y z
2 0.23 56.9 65.0 327 4.05 4.07 2.31
91 0.86 55.1 69.0 2757 6.45 6.33 3.52
97 0.96 66.3 62.0 2759 6.27 5.95 4.07
204 0.98 67.9 60.0 2777 6.05 5.97 4.08
227 0.84 55.1 67.0 2782 6.39 6.20 3.47
... ... ... ... ... ... ... ...
53697 0.70 64.5 65.0 2717 5.52 5.45 3.54
53727 0.78 66.9 57.0 2721 5.70 5.66 3.60
53785 0.89 64.3 65.0 2728 6.00 5.95 3.84
53800 0.90 68.7 62.0 2732 5.83 5.79 3.99
53863 1.00 66.8 56.0 2743 6.22 6.12 4.13
2077 rows × 7 columns
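Since z_score > 3 only flags unusually high values, a common symmetric variant (a sketch, not what was run above; it would flag more than 2077 rows) uses the absolute z-score so unusually low values are caught too:
outliers_sym = df_new[(z_score.abs() > 3).any(axis=1)]  # any column with |z| > 3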
In [14]:
z_score.std()
Out[14]:
carat 1.0
depth 1.0
table 1.0
price 1.0
x 1.0
y 1.0
z 1.0
dtype: float64
In [15]:
df.drop(index = outliers.index,inplace=True)
In [16]:
num_cols=df.columns[(df.dtypes)=='float64'].tolist()
cat_cols=df.columns[(df.dtypes)=='object'].tolist()
#separate features into numerical and categorical types
In [17]:
std_scaler=StandardScaler() #create a StandardScaler object
df[num_cols]=std_scaler.fit_transform(df[num_cols])
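Note the scaler is fit on the full dataset before the train/test split below. A common refinement (a sketch of standard practice, not what this notebook does) is to fit on the training split only, so test-set statistics do not leak into the transform:
# after splitting (see In [22] below):
# scaler = StandardScaler()
# x_train[num_cols] = scaler.fit_transform(x_train[num_cols])  # learn mean/std from train only
# x_test[num_cols] = scaler.transform(x_test[num_cols])        # reuse the train statistics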
In [18]:
df.head()
Out[18]:
carat cut color clarity depth table price x y z
0 -1.248109 Ideal E SI2 -0.171754 -1.126114 326 -1.613627 -1.599720 -1.623059
1 -1.295150 Premium E SI1 -1.455756 1.705217 326 -1.670243 -1.732761 -1.806163
3 -1.106987 Premium I VS2 0.508013 0.289551 334 -1.377728 -1.362149 -1.317887
4 -1.059946 Good J SI2 1.187779 0.289551 335 -1.245624 -1.248114 -1.134783
5 -1.224589 Very Good J VVS2 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546766
In [19]:
df[['cut','color','clarity']]=df[['cut','color','clarity']].apply(LabelEncoder().fit_transform)
#assigns an integer code to each categorical value
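LabelEncoder is intended for target labels; for feature columns scikit-learn also provides OrdinalEncoder, which encodes a whole 2-D frame in one call. An equivalent sketch (same integer codes here, since both sort the unique string values):
from sklearn.preprocessing import OrdinalEncoder
# df[['cut','color','clarity']] = OrdinalEncoder().fit_transform(df[['cut','color','clarity']])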
In [20]:
df.head()
Out[20]:
carat cut color clarity depth table price x y z
0 -1.248109 2 1 3 -0.171754 -1.126114 326 -1.613627 -1.599720 -1.623059
1 -1.295150 3 1 2 -1.455756 1.705217 326 -1.670243 -1.732761 -1.806163
3 -1.106987 3 5 5 0.508013 0.289551 334 -1.377728 -1.362149 -1.317887
4 -1.059946 1 6 3 1.187779 0.289551 335 -1.245624 -1.248114 -1.134783
5 -1.224589 4 6 7 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546766
In [21]:
from sklearn.model_selection import train_test_split
In [22]:
#separating the data into input (x) and output (y) columns, then splitting into train and test sets
y=df['price']
x=df.drop('price',axis=1)
x_train,x_test,y_train,y_test= train_test_split(x,y, test_size=0.2, random_state=1)
In [23]:
print(len(x_train))
41490
In [24]:
from sklearn import datasets,linear_model, metrics
#create a linear regression object
reg=linear_model.LinearRegression()
#train the model using fit
reg.fit(x_train,y_train)
print('Coefficients ', reg.coef_) # coef_ stores the fitted coefficient for each feature, i.e. theta_1, theta_2, ...
Coefficients [ 4847.84477929 39.73811533 -242.60007725 258.69401187
-185.36160706 -178.87300891 -2317.7757166 675.34322907
175.75354744]
In [25]:
# y-intercept
print(reg.intercept_)
3109.709922835895
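Together, the intercept and coefficients determine every prediction: y_hat = intercept + x · coef. A quick sanity check on the first test row (a sketch using the variables above):
row = x_test.iloc[0].values
manual = reg.intercept_ + row @ reg.coef_
print(manual, reg.predict(x_test.iloc[[0]])[0])  # the two values should match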
In [26]:
y_pred=reg.predict(x_test)
In [27]:
#coefficient of determination
r_squared=reg.score(x_test,y_test)
print(r_squared)
0.8903048480241464
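score() returns R^2 = 1 - SS_res/SS_tot; computing it by hand gives the same number (a sketch):
ss_res = np.sum((y_test - y_pred) ** 2)         # residual sum of squares
ss_tot = np.sum((y_test - y_test.mean()) ** 2)  # total sum of squares
print(1 - ss_res / ss_tot)                      # matches reg.score(x_test, y_test)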
In [28]:
x_train= x_train.values[:,0].reshape(-1,1)
x_test= x_test.values[:,0].reshape(-1,1)
plt.scatter(x_train,y_train,color='red')
plt.scatter(x_test,y_test,color='green')
plt.plot(x_test,y_pred,color='blue',linewidth=3)
Out[28]:
[<matplotlib.lines.Line2D at 0x262092e0f40>]
In [29]:
print('mean absolute error:' , metrics.mean_absolute_error(y_test,y_pred))
print('mean squared error:' , metrics.mean_squared_error(y_test,y_pred))
print('root mean squared error:' , np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
#r_squared=reg.score(x_test,y_test)
#print(r_squared)
mean absolute error: 745.7533805818331
mean squared error: 1286516.9599848085
root mean squared error: 1134.2473098865205
Support Vector Regression
In [30]:
from sklearn.svm import SVR
In [*]:
#three SVR models: RBF, linear, and polynomial (degree-2) kernels
svr_rbf = SVR(kernel='rbf',C=1e3,gamma=0.1)
svr_lin=SVR(kernel='linear',C=1e3)
svr_poly=SVR(kernel='poly',C=1e3,degree=2)
#fit each model on the training data, then predict on the test data
y_rbf=svr_rbf.fit(x_train,y_train).predict(x_test)
y_lin=svr_lin.fit(x_train,y_train).predict(x_test)
y_poly=svr_poly.fit(x_train,y_train).predict(x_test)
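The In [*] markers mean these cells were still running when the notebook was exported: kernel SVR training scales roughly quadratically-to-cubically with the number of samples, so ~41,000 rows with C=1e3 can take a very long time. One common workaround (a sketch; the subsample size of 2000 is arbitrary) is to train on a random subset:
rng = np.random.RandomState(1)
idx = rng.choice(len(x_train), size=2000, replace=False)  # random subsample of training rows
x_sub, y_sub = x_train[idx], y_train.iloc[idx]            # x_train is already a NumPy array here
y_rbf = svr_rbf.fit(x_sub, y_sub).predict(x_test)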
In [*]:
lw=2
plt.scatter(x_train,y_train,color='darkorange',label='data')
plt.plot(x_test,y_rbf, color='navy',lw=lw,label='RBF model')
plt.xlabel('data')
plt.ylabel('target')
plt.title('support vector regression')
plt.legend()
plt.show()