Project 5 – Machine Learning
Problem Statement
This project requires you to understand which mode of transport employees prefer for their commute to the office. The attached data 'Cars.csv' contains employee information about their mode of transport, along with personal and professional details such as age, salary, and work experience. We need to predict whether or not an employee will use a car as their mode of transport, and identify which variables are significant predictors of this decision.
You are expected to do the following:
1. EDA
Perform an EDA on the data
Illustrate the insights based on EDA
Check for Multicollinearity - Plot the graph based on Multicollinearity & treat it
2. Data Preparation
Prepare the data for analysis (SMOTE)
3. Modeling
Create multiple models and explore how each model performs using appropriate model
performance metrics
o KNN
o Naive Bayes (is it applicable here? Comment; if it is not applicable, how can you build an NB model in this case?)
o Logistic Regression
Apply both bagging and boosting modeling procedures to create two models and compare their
accuracy with the best model from the above step.
4. Actionable Insights & Recommendations
Summarize your findings from the exercise in a concise yet actionable note
Data Importing –
setwd("C:\\Users\\Bhumika\\Documents\\Analytics\\Project - 5")
library(readr)
Cars=read_csv("Cars_edited.csv")
Parsed with column specification:
cols(
Age = col_double(),
Gender = col_character(),
Engineer = col_double(),
MBA = col_double(),
`Work Exp` = col_double(),
Salary = col_double(),
Distance = col_double(),
license = col_double(),
Transport = col_character()
)
View(Cars)
Exploratory Data Analysis
str(Cars)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 444 obs. of 9 variables:
$ Age : num 28 23 29 28 27 26 28 26 22 27 ...
$ Gender : chr "Male" "Female" "Male" "Female" ...
$ Engineer : num 0 1 1 1 1 1 1 1 1 1 ...
$ MBA : num 0 0 0 1 0 0 0 0 0 0 ...
$ Work Exp : num 4 4 7 5 4 4 5 3 1 4 ...
$ Salary : num 14.3 8.3 13.4 13.4 13.4 12.3 14.4 10.5 7.5 13.5 ...
$ Distance : num 3.2 3.3 4.1 4.5 4.6 4.8 5.1 5.1 5.1 5.2 ...
$ license : num 0 0 0 0 0 1 0 0 0 0 ...
$ Transport: chr "Public Transport" "Public Transport" "Public Transport" "Public Transport" ...
- attr(*, "spec")=
.. cols(
.. Age = col_double(),
.. Gender = col_character(),
.. Engineer = col_double(),
.. MBA = col_double(),
.. `Work Exp` = col_double(),
.. Salary = col_double(),
.. Distance = col_double(),
.. license = col_double(),
.. Transport = col_character()
.. )
Convert to categorical variables –
Cars$Engineer = as.factor(Cars$Engineer)
Cars$MBA = as.factor(Cars$MBA)
Cars$license = as.factor(Cars$license)
summary(Cars)
Rename Work Exp column -
colnames(Cars)[5] <- "Work_Exp"
Check for missing values –
sum(is.na(Cars))
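A per-column count (a small additional check, not part of the original output) shows which variables contain the missing values –
# missing values per column
colSums(is.na(Cars))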
Remove Missing Values –
Cars<- na.omit(Cars)
Cars <- as.data.frame(Cars)
Convert Gender to binary –
Cars[Cars=="Female"]<- 1
Cars[Cars=="Male"]<- 0
Cars$Gender <- as.numeric(Cars$Gender)
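An equivalent one-step recoding (a sketch of an alternative to the three lines above, giving the same result) –
# recode Gender directly: Female -> 1, Male -> 0
Cars$Gender <- ifelse(Cars$Gender == "Female", 1, 0)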
Plotting the data –
library(ggplot2)
hist(Cars$Age)
hist(as.numeric(Cars$Engineer))
hist(as.numeric(Cars$MBA))
hist(Cars$Work_Exp)
hist(Cars$Salary)
hist(Cars$Distance)
hist(as.numeric(Cars$license))
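ggplot2 is loaded above but the histograms use base R; an equivalent ggplot2 version for one variable (a sketch, bin count chosen arbitrarily) –
# same distribution view with ggplot2, e.g. for Salary
ggplot(Cars, aes(x = Salary)) +
  geom_histogram(bins = 30, fill = "cyan", colour = "black") +
  labs(title = "Distribution of Salary")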
Bivariate Analysis –
boxplot(Cars$Age ~Cars$Engineer, main = "Age vs Eng.")
boxplot(Cars$Age ~Cars$MBA, main ="Age Vs MBA")
Employees of all qualifications and levels of work experience are present in the firm.
boxplot(Cars$Salary ~Cars$Engineer, main = "Salary vs Eng.")
boxplot(Cars$Salary ~Cars$MBA, main = "Salary vs MBA.")
There is not much difference in salary between engineers and non-engineers, or between MBAs and non-MBAs.
The mean salary for both groups is around 16.
hist(Cars$Work_Exp, col = "cyan", main = "Distribution of work exp")
The distribution is right-skewed; as expected, there are more juniors than seniors in the firm.
table(Cars$license,Cars$Transport)
2Wheeler Car Public Transport
0 60 13 266
1 23 48 33
boxplot(Cars$Work_Exp ~ Cars$Gender)
There is not much difference in mean work experience between the two genders; the experience profile is similar for males and females.
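The same bivariate view can be taken against the target itself (a sketch; Transport is still the three-level character variable at this point) –
# distance and salary broken out by mode of transport
boxplot(Cars$Distance ~ as.factor(Cars$Transport), main = "Distance vs Transport")
boxplot(Cars$Salary ~ as.factor(Cars$Transport), main = "Salary vs Transport")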
Convert Transport to Binary –
Cars[Cars=="2Wheeler"]<- 0
Cars[Cars=="Public Transport"]<- 0
Cars[Cars=="Car"]<- 1
Cars$Transport <- as.numeric(Cars$Transport)
To check for Multicollinearity –
plot(Cars)
Age is highly correlated with Work_Exp and Distance; Work_Exp is highly correlated with Distance and is also correlated with Salary.
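The scatterplot matrix can be backed up with a numeric correlation matrix (a sketch restricted to the columns that are numeric at this point) –
# pairwise correlations of the numeric variables
round(cor(Cars[, sapply(Cars, is.numeric)]), 2)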
Cars$Engineer <- as.numeric(Cars$Engineer)
Cars$MBA <- as.numeric(Cars$MBA)
Cars$license <- as.numeric(Cars$license)
library(usdm)
library(VIF)
vifcor(Cars[-9])
1 variables from the 8 input variables have collinearity problem:
Work_Exp
After excluding the collinear variables, the linear correlation coefficients ranges between:
min correlation ( Salary ~ MBA ): -0.007592236
max correlation ( Salary ~ Age ): 0.8607652
---------- VIFs of the remained variables --------
Variables VIF
1 Age 3.896910
2 Gender 1.070855
3 Engineer 1.014883
4 MBA 1.019907
5 Salary 4.457554
6 Distance 1.260307
7 license 1.430460
Remove Work_Exp-
Cars <- Cars[-5]
names(Cars)
[1] "Age" "Gender" "Engineer" "MBA" "Salary" "Distance" "license" "Transport"
Check for Outliers -
boxplot(Cars$Age)
boxplot(Cars$Salary)
boxplot(Cars$Distance)
Treating Outliers – cap values above the 95th percentile –
quantile(Cars$Age, c(0.95))
Cars$Age[which(Cars$Age>38)]<- 38
quantile(Cars$Age, c(0.95))
95%
38
quantile(Cars$Salary,c(0.95))
Cars$Salary[which(Cars$Salary>43)] <- 43
quantile(Cars$Salary,c(0.95))
95%
43
quantile(Cars$Distance,c(0.95))
Cars$Distance[which(Cars$Distance> 17.89)] <- 17.89
quantile(Cars$Distance,c(0.95))
95%
17.89
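The same capping logic is repeated for each variable; a small helper (a sketch, with the hypothetical name cap95) expresses it once –
# cap a numeric vector at its own 95th percentile
cap95 <- function(x) pmin(x, quantile(x, 0.95))
# e.g. Cars$Age <- cap95(Cars$Age); Cars$Salary <- cap95(Cars$Salary)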
Check the target distribution –
table(Cars$Transport)
2Wheeler Car Public Transport
83 61 299
Change Target Variable to factor -
Cars$Transport <- as.factor(Cars$Transport)
SMOTE -
# SMOTE
library(DMwR)
library(caret)
set.seed(42)
Carsdata = SMOTE(Transport~., Cars)
summary(Carsdata$Transport)
index=createDataPartition(y=Carsdata$Transport,p=0.7,list=FALSE)
traindata=Carsdata[index,]
table(traindata$Transport)
0 1
171 129
testdata=Carsdata[-index,]
table(testdata$Transport)
0 1
73 54
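Note that SMOTE is applied here to the full data set before the train/test split, so synthetic observations can appear in the test partition; an alternative (a sketch, not what was run above, with illustrative object names) is to split the original data first and oversample only the training portion –
# split first, then oversample only the training data
index2 <- createDataPartition(y = Cars$Transport, p = 0.7, list = FALSE)
train_raw <- Cars[index2, ]
test_raw <- Cars[-index2, ]
train_sm <- SMOTE(Transport ~ ., train_raw)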
Logistic Regression -
lgmodel <- glm(formula= Transport ~.,traindata, family=binomial)
lgmodel
Call: glm(formula = Transport ~ ., family = binomial, data = traindata)
Coefficients:
(Intercept) Age Gender Engineer MBA Salary Distance license
-92.10708 2.54987 7.07060 1.66439 -6.15736 -0.08231 1.00486 2.82411
Degrees of Freedom: 299 Total (i.e. Null); 292 Residual
Null Deviance: 410
Residual Deviance: 38.12 AIC: 54.12
lg_predictions <- predict(lgmodel,testdata,type="response")
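For comparability with the other models, the predicted probabilities can be thresholded and passed to a confusion matrix (a sketch; the 0.5 cut-off is an assumption, not a tuned threshold) –
# convert probabilities to class labels and evaluate like the other models
lg_class <- factor(ifelse(lg_predictions > 0.5, 1, 0), levels = c(0, 1))
confusionMatrix(lg_class, testdata$Transport)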
Naive Bayes -
library(e1071)
NBmodel <- naiveBayes(Transport ~., data=traindata)
NBmodel
Naive Bayes Classifier for Discrete Predictors
Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)
A-priori probabilities:
Y
0 1
0.57 0.43
Conditional probabilities:
Age
Y [,1] [,2]
0 26.56725 2.904334
1 35.79332 3.221259
Gender
Y [,1] [,2]
0 0.2339181 0.4245640
1 0.2103642 0.4060913
Engineer
Y [,1] [,2]
0 1.789474 0.4088798
1 1.872710 0.3313629
MBA
Y [,1] [,2]
0 1.304094 0.4613735
1 1.211885 0.4053939
Salary
Y [,1] [,2]
0 13.47310 5.488242
1 36.90872 12.250825
Distance
Y [,1] [,2]
0 10.61111 3.047689
1 15.71253 3.301019
license
Y [,1] [,2]
0 1.134503 0.3421939
1 1.775857 0.4124676
NB_predictions <- predict(NBmodel,testdata)
table(NB_predictions,testdata$Transport)
NB_predictions 0 1
0 70 6
1 3 48
Confusion Matrix
confusionMatrix(NB_predictions,testdata$Transport)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 70 6
1 3 48
Accuracy : 0.9291
95% CI : (0.8697, 0.9671)
No Information Rate : 0.5748
P-Value [Acc > NIR] : <2e-16
Kappa : 0.854
Mcnemar's Test P-Value : 0.505
Sensitivity : 0.9589
Specificity : 0.8889
Pos Pred Value : 0.9211
Neg Pred Value : 0.9412
Prevalence : 0.5748
Detection Rate : 0.5512
Detection Prevalence : 0.5984
Balanced Accuracy : 0.9239
'Positive' Class : 0
KNN -
library(class)
trControl <- trainControl(method = "cv", number = 10)
KNNmod <- caret::train(Transport ~ .,
method = "knn",
tuneGrid = expand.grid(k = 2:20),
trControl = trControl,
metric = "Accuracy",
preProcess = c("center","scale"),
data = traindata)
KNNmod
k-Nearest Neighbors
300 samples
7 predictor
2 classes: '0', '1'
Pre-processing: centered (7), scaled (7)
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 270, 270, 271, 269, 270, 270, ...
Resampling results across tuning parameters:
k Accuracy Kappa
2 0.9501001 0.8979187
3 0.9502076 0.8985918
4 0.9469818 0.8922827
5 0.9504227 0.8993616
6 0.9501001 0.8979187
7 0.9434334 0.8841002
8 0.9401001 0.8774934
9 0.9365369 0.8690393
10 0.9433185 0.8835794
11 0.9399852 0.8768563
12 0.9399852 0.8768618
13 0.9265369 0.8491158
14 0.9266518 0.8499538
15 0.9232036 0.8422598
16 0.9134186 0.8206479
17 0.9067519 0.8063086
18 0.9098628 0.8135544
19 0.8933037 0.7789983
20 0.8865295 0.7643311
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 5.
KNN_predictions <- predict(KNNmod,testdata)
confusionMatrix(KNN_predictions, testdata$Transport)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 69 3
1 4 51
Accuracy : 0.9449
95% CI : (0.8897, 0.9776)
No Information Rate : 0.5748
P-Value [Acc > NIR] : <2e-16
Kappa : 0.8875
Mcnemar's Test P-Value : 1
Sensitivity : 0.9452
Specificity : 0.9444
Pos Pred Value : 0.9583
Neg Pred Value : 0.9273
Prevalence : 0.5748
Detection Rate : 0.5433
Detection Prevalence : 0.5669
Balanced Accuracy : 0.9448
'Positive' Class : 0
summary(testdata$Transport)
0 1
73 54
Bagging -
library(gbm)
library(xgboost)
library(caret)
library(ipred)
library(plyr)
library(rpart)
mod.bagging <- bagging(Transport ~.,
data=traindata,
control=rpart.control(maxdepth=5, minsplit=4))
bag.pred <- predict(mod.bagging, testdata)
confusionMatrix(bag.pred,testdata$Transport)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 68 2
1 5 52
Accuracy : 0.9449
95% CI : (0.8897, 0.9776)
No Information Rate : 0.5748
P-Value [Acc > NIR] : <2e-16
Kappa : 0.888
Mcnemar's Test P-Value : 0.4497
Sensitivity : 0.9315
Specificity : 0.9630
Pos Pred Value : 0.9714
Neg Pred Value : 0.9123
Prevalence : 0.5748
Detection Rate : 0.5354
Detection Prevalence : 0.5512
Balanced Accuracy : 0.9472
'Positive' Class : 0
Boosting -
mod.boost <- gbm(Transport ~ ., data = traindata, distribution = "bernoulli",
                 n.trees = 5000, interaction.depth = 4, shrinkage = 0.01)
summary(mod.boost)
var rel.inf
Age Age 84.1483764
Salary Salary 8.9887923
Distance Distance 5.3097642
MBA MBA 0.9294766
license license 0.4844714
Gender Gender 0.1241797
Engineer Engineer 0.0149394
boost.pred <- predict(mod.boost, testdata,n.trees =5000, type="response")
y_pred_num <- ifelse(boost.pred > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
table(y_pred,testdata$Transport)
y_pred 0 1
0 72 2
1 1 52
confusionMatrix(y_pred,testdata$Transport)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 72 2
1 1 52
Accuracy : 0.9764
95% CI : (0.9325, 0.9951)
No Information Rate : 0.5748
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9516
Mcnemar's Test P-Value : 1
Sensitivity : 0.9863
Specificity : 0.9630
Pos Pred Value : 0.9730
Neg Pred Value : 0.9811
Prevalence : 0.5748
Detection Rate : 0.5669
Detection Prevalence : 0.5827
Balanced Accuracy : 0.9746
'Positive' Class : 0
Model Performance –
library(ROCR)
pred.lg <- prediction(lg_predictions, testdata$Transport)
perf.lg <- performance(pred.lg, "tpr", "fpr")
plot(perf.lg)
#Kolmogorov Smirnov -
KS <- max(attr(perf.lg, 'y.values')[[1]]-attr(perf.lg, 'x.values')[[1]])
KS
[1] 0.9170472
# Area Under Curve -
auc <- performance(pred.lg,"auc");
auc <- as.numeric(auc@y.values)
auc
[1] 0.992136
# Gini Coefficient -
library(ineq)
gini = ineq(lg_predictions, type="Gini")
gini
[1] 0.5804087
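Note that ineq() computes a Lorenz-curve Gini over the predicted probabilities themselves; the classification Gini usually reported alongside AUC can be derived directly from the AUC computed above –
# classification Gini derived from AUC (Gini = 2*AUC - 1)
gini_auc <- 2 * auc - 1
gini_auc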
Model Comparison –
Boosting has the highest sensitivity compared with KNN, Naive Bayes, and Bagging, and it also has the highest accuracy, making it the best-performing model on this test set.
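A compact side-by-side view can make this comparison explicit (a sketch; it assumes the confusionMatrix() results printed above are saved in objects named cm_nb, cm_knn, cm_bag and cm_boost, which the code above does not do) –
# side-by-side accuracy and sensitivity for the four classifiers
comparison <- data.frame(
  Model       = c("Naive Bayes", "KNN", "Bagging", "Boosting"),
  Accuracy    = c(cm_nb$overall["Accuracy"], cm_knn$overall["Accuracy"],
                  cm_bag$overall["Accuracy"], cm_boost$overall["Accuracy"]),
  Sensitivity = c(cm_nb$byClass["Sensitivity"], cm_knn$byClass["Sensitivity"],
                  cm_bag$byClass["Sensitivity"], cm_boost$byClass["Sensitivity"])
)
comparison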