library(caret)
library(kernlab)
library(ROCR)
# Load the cell segmentation example data shipped with caret
data(segmentationData)
# Fixed typo: was help(sementationData), which fails to find the help page
help(segmentationData)
head(segmentationData)
# Number of rows and columns
dim(segmentationData)
str(segmentationData)
# Distribution of target variable: counts, then class proportions
table(segmentationData$Class)
prop.table(table(segmentationData$Class))
# Split data into training (70%) and validation (30%) sets,
# stratified on the class label
Index <- createDataPartition(segmentationData$Class, p = .7, list = FALSE)
svm.train <- segmentationData[Index, ]
names(svm.train)
svm.validate <- segmentationData[-Index, ]
# Drop the Cell id and Case columns; keep Class (col 3) plus the features
svm.train1 <- svm.train[, 3:61]
names(svm.train1)
library(e1071)
# Linear-kernel SVM with Class as the response
model <- svm(Class ~ . , kernel = "linear", data = svm.train1)
# Hold-out accuracy on the validation set
predict1 <- predict(model, svm.validate)
mean(predict1 == svm.validate$Class)
# Setup for cross validation: 2-fold CV with class probabilities so
# ROC can be used as the model-selection metric
set.seed(123)
ctrl <- trainControl(method = "cv",
                     number = 2,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)
# Grid search to fine tune SVM: candidate (sigma, C) pairs
grid <- expand.grid(sigma = c(.01, .015, 0.2),
                    C = c(0.75, 0.9, 1, 1.1, 1.25))
# Train radial-kernel SVM over the grid.
# Bug fix: svm.train1 still contains the target column Class (it spans
# cols 3:61), so passing it whole as x leaked the label into the
# predictors and mismatched the validation predictors (cols 4:61).
# Drop Class (first column of svm.train1) from x.
svm.tune <- train(x = svm.train1[, -1],
                  y = svm.train$Class,
                  method = "svmRadial",
                  metric = "ROC",
                  tuneGrid = grid,
                  trControl = ctrl)
svm.tune
# Score the validation set and evaluate with ROCR
# Keep only the second probability column (probability of the
# second class level) for the ROC analysis
valX <- svm.validate[, 4:61]
pred <- predict(svm.tune, valX, type = "prob")[2]
head(pred)
# Build the ROCR prediction object from probabilities and true labels
pred_val <- prediction(pred, svm.validate$Class)
# Area under the ROC curve
perf_val <- performance(pred_val, "auc")
auc <- as.numeric(perf_val@y.values)
auc
# True-positive rate vs false-positive rate
perf_val <- performance(pred_val, "tpr", "fpr")
# ROC curve with a diagonal reference line (random classifier)
plot(perf_val, col = "green", lwd = 1.5)
abline(a = 0, b = 1, lty = 2)
# # KS statistic: maximum separation between TPR and FPR
# ks <- max(attr(perf_val, "y.values")[[1]] - (attr(perf_val, "x.values")[[1]]))
# ks
############################################################################
# Simple regression example: load X/Y pairs from a csv file
# dataDirectory <- "D:/" # put your own folder here
# data <- read.csv(paste(dataDirectory, 'regression.csv', sep=""), header = TRUE)
data <- read.csv('E:\\DataScience\\ML\\regression.csv', sep=",", header = TRUE)
head(data)
# Scatter plot of the raw data
plot(data, pch = 16)
# Ordinary least-squares fit of Y on X
model <- lm(Y ~ X, data = data)
# Overlay the fitted regression line
abline(model)
# Predict Y for every observed X
LpredictedY <- predict(model, newdata = data)
# Re-plot the data with the linear-model predictions marked in blue
plot(data, pch = 16)
points(data$X, LpredictedY, col = "blue", pch = 4)
# Root mean squared error of a vector of residuals
rmse <- function(error) sqrt(mean(error ^ 2))
# RMSE of the linear model (residuals == data$Y - LpredictedY)
# Comment fix: the lm predictions are stored in LpredictedY, not predictedY
error <- model$residuals # same as data$Y - LpredictedY
predictionRMSE <- rmse(error)
mean(data$Y)
# Support Vector Regression
library(e1071) # already attached above; repeated load is harmless
# Linear-kernel SVR; fitted values overlaid in red
model <- svm(Y ~ X , kernel = "linear", data)
plot(data, pch = 16)
# Idiom fix: was `predictedY = predict(...)`; use `<-` for assignment
predictedY <- predict(model, data)
points(data$X, predictedY, col = "red", pch = 4)
# Default (radial-kernel) SVR; replaces the linear-kernel fit above
model <- svm(Y ~ X , data)
predictedY <- predict(model, data)
plot(data, pch = 16)
points(data$X, predictedY, col = "red", pch = 4)
# In-sample RMSE of the radial SVR
error <- data$Y - predictedY
svrPredictionRMSE <- rmse(error)
# Coarse grid search over SVR hyper-parameters (epsilon and cost)
tuneResult <- tune(
  svm, Y ~ X, data = data,
  ranges = list(epsilon = seq(0, 1, 0.1), cost = 2^(2:9))
)
print(tuneResult)
# Visualise the tuning surface
plot(tuneResult)
# Refine the search on a narrower epsilon range
tuneResult <- tune(
  svm, Y ~ X, data = data,
  ranges = list(epsilon = seq(0, 0.2, 0.01), cost = 2^(2:9))
)
print(tuneResult)
plot(tuneResult)
# Extract the best model and score it on the training data.
# The RMSE can differ between runs because the tune method
# randomly shuffles the data.
tunedModel <- tuneResult$best.model
tunedModelY <- predict(tunedModel, data)
error <- data$Y - tunedModelY
tunedModelRMSE <- rmse(error)
# Overlay all three fits on one chart:
# green = linear model, red = default SVR, blue = tuned SVR
plot(data, pch = 16)
fits <- list(
  list(y = LpredictedY, col = "green"),
  list(y = predictedY,  col = "red"),
  list(y = tunedModelY, col = "blue")
)
for (fit in fits) {
  points(data$X, fit$y, col = fit$col, pch = 4)
  lines(data$X, fit$y, col = fit$col, pch = 4)
}