0% found this document useful (0 votes)

22 views35 pages

Heart Disease Prediction Model

Epidemiología 2

Uploaded by

Arquimedes Gavino

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

22 views35 pages

Heart Disease Prediction Model

Epidemiología 2

Uploaded by

Arquimedes Gavino

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 35

HEART DISEASE PREDICTION MODEL

Enock Bereka

2024-12-14
#Load the required library and the dataset

heart <- read_csv("C:/Users/PC/OneDrive/Desktop/Data

Science/Datasets/Project/heart.csv")

view(heart)

#Data Exploration
head(heart)

## # A tibble: 6 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
<dbl>
## 1 52 1 0 125 212 0 1 168 0 1
2
## 2 53 1 0 140 203 1 0 155 1 3.1
0
## 3 70 1 0 145 174 0 1 125 1 2.6
0
## 4 61 1 0 148 203 0 1 161 0 0
2
## 5 62 0 0 138 294 1 1 106 0 1.9
1
## 6 58 0 0 100 248 0 0 122 0 1
1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, target <dbl>

glimpse(heart)

## Rows: 1,025
## Columns: 14
## $ age <dbl> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51,
52, 3…
## $ sex <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
1, 1…
## $ cp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1,
2, 2…
## $ trestbps <dbl> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112,
132, 1…
## $ chol <dbl> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149,
341, 2…
## $ fbs <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
1, 0…
## $ restecg <dbl> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
0, 0…
## $ thalach <dbl> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125,
136, 1…
## $ exang <dbl> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
0, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6,
3.0, 0…
## $ slope <dbl> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2,
2, 1…
## $ ca <dbl> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0,
0, 0…
## $ thal <dbl> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2,
2, 2…
## $ target <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
1, 0…

anyNA(heart)

## [1] FALSE

duplicated(heart) %>% table()

## .
## FALSE TRUE
## 302 723

heart <- heart %>% distinct(.)

#Check the relationship between variables

round(cor(heart), 2)

## age sex cp trestbps chol fbs restecg thalach exang

oldpeak
## age 1.00 -0.09 -0.06 0.28 0.21 0.12 -0.11 -0.40 0.09
0.21
## sex -0.09 1.00 -0.05 -0.06 -0.20 0.05 -0.06 -0.05 0.14
0.10
## cp -0.06 -0.05 1.00 0.05 -0.07 0.10 0.04 0.29 -0.39 -
0.15
## trestbps 0.28 -0.06 0.05 1.00 0.13 0.18 -0.12 -0.05 0.07
0.19
## chol 0.21 -0.20 -0.07 0.13 1.00 0.01 -0.15 -0.01 0.06
0.05
## fbs 0.12 0.05 0.10 0.18 0.01 1.00 -0.08 -0.01 0.02
0.00
## restecg -0.11 -0.06 0.04 -0.12 -0.15 -0.08 1.00 0.04 -0.07 -
0.06
## thalach -0.40 -0.05 0.29 -0.05 -0.01 -0.01 0.04 1.00 -0.38 -
0.34
## exang 0.09 0.14 -0.39 0.07 0.06 0.02 -0.07 -0.38 1.00
0.29
## oldpeak 0.21 0.10 -0.15 0.19 0.05 0.00 -0.06 -0.34 0.29
1.00
## slope -0.16 -0.03 0.12 -0.12 0.00 -0.06 0.09 0.38 -0.26 -
0.58
## ca 0.30 0.11 -0.20 0.10 0.09 0.14 -0.08 -0.23 0.13
0.24
## thal 0.07 0.21 -0.16 0.06 0.10 -0.03 -0.01 -0.09 0.21
0.21
## target -0.22 -0.28 0.43 -0.15 -0.08 -0.03 0.13 0.42 -0.44 -
0.43
## slope ca thal target
## age -0.16 0.30 0.07 -0.22
## sex -0.03 0.11 0.21 -0.28
## cp 0.12 -0.20 -0.16 0.43
## trestbps -0.12 0.10 0.06 -0.15
## chol 0.00 0.09 0.10 -0.08
## fbs -0.06 0.14 -0.03 -0.03
## restecg 0.09 -0.08 -0.01 0.13
## thalach 0.38 -0.23 -0.09 0.42
## exang -0.26 0.13 0.21 -0.44
## oldpeak -0.58 0.24 0.21 -0.43
## slope 1.00 -0.09 -0.10 0.34
## ca -0.09 1.00 0.16 -0.41
## thal -0.10 0.16 1.00 -0.34
## target 0.34 -0.41 -0.34 1.00

cr <- round(cor(heart), 2)

#Visualize your correlations

library(ggcorrplot)
ggcorrplot(cr,title = "correlogram",lab_col = "black",
lab = TRUE, legend.title = "Pearson Correlation",
lab_size = 3, ggtheme = theme_classic(),
outline.color = "black",
colors = c("orange", "green", "blue"))
#Setting up factors
heart$sex <- as.factor(heart$sex)
heart$cp <- as.factor(heart$cp)
heart$fbs <- as.factor(heart$fbs)
heart$restecg <- as.factor(heart$restecg)
heart$exang <- as.factor(heart$exang)
heart$slope <- as.factor(heart$slope)
heart$ca <- as.factor(heart$ca)
heart$thal <- as.factor(heart$thal)
heart$target <- as.factor(heart$target)

#Splitting the dataset

library(caTools)
set.seed(2)
split <- sample.split(heart$target, SplitRatio = 0.8)
training <- subset(heart, split == TRUE)
testing <- subset(heart, split == FALSE)

#Build the model

model <- glm(target~., training, family = binomial)
summary(model)

##
## Call:
## glm(formula = target ~ ., family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.295200 4.353818 0.068 0.945943
## age 0.016480 0.030258 0.545 0.585989
## sex1 -2.624782 0.747849 -3.510 0.000448 ***
## cp1 1.306173 0.714180 1.829 0.067413 .
## cp2 2.596896 0.671185 3.869 0.000109 ***
## cp3 2.920285 0.861528 3.390 0.000700 ***
## trestbps -0.035263 0.013776 -2.560 0.010474 *
## chol -0.006496 0.005081 -1.278 0.201115
## fbs1 1.341979 0.831161 1.615 0.106401
## restecg1 0.074422 0.478785 0.155 0.876474
## restecg2 -0.243861 3.353627 -0.073 0.942032
## thalach 0.033606 0.017009 1.976 0.048174 *
## exang1 -0.619649 0.545508 -1.136 0.255993
## oldpeak -0.409794 0.267555 -1.532 0.125615
## slope1 -0.893167 1.000572 -0.893 0.372041
## slope2 0.861472 1.072122 0.804 0.421674
## ca1 -2.077715 0.649907 -3.197 0.001389 **
## ca2 -3.701530 0.989076 -3.742 0.000182 ***
## ca3 -1.845235 1.123495 -1.642 0.100506
## ca4 -1.084200 3.504617 -0.309 0.757045
## thal1 3.856076 2.968733 1.299 0.193980
## thal2 2.734698 2.856244 0.957 0.338342
## thal3 1.295089 2.852677 0.454 0.649835
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 126.82 on 218 degrees of freedom
## AIC: 172.82
##
## Number of Fisher Scoring iterations: 7

#Model optimization
model <- glm(target~.-age, training, family = binomial)
summary(model)

##
## Call:
## glm(formula = target ~ . - age, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.325665 4.011251 0.330 0.741032
## sex1 -2.645056 0.744208 -3.554 0.000379 ***
## cp1 1.314534 0.712747 1.844 0.065136 .
## cp2 2.614185 0.672591 3.887 0.000102 ***
## cp3 2.879652 0.851004 3.384 0.000715 ***
## trestbps -0.033395 0.013231 -2.524 0.011602 *
## chol -0.006145 0.004981 -1.234 0.217310
## fbs1 1.315863 0.823644 1.598 0.110129
## restecg1 0.072474 0.479066 0.151 0.879754
## restecg2 -0.203098 3.107969 -0.065 0.947897
## thalach 0.030176 0.015683 1.924 0.054334 .
## exang1 -0.628631 0.543178 -1.157 0.247142
## oldpeak -0.423506 0.266105 -1.591 0.111497
## slope1 -0.879646 1.000413 -0.879 0.379248
## slope2 0.874920 1.071971 0.816 0.414398
## ca1 -2.029203 0.643416 -3.154 0.001612 **
## ca2 -3.555466 0.944992 -3.762 0.000168 ***
## ca3 -1.760305 1.124240 -1.566 0.117402
## ca4 -1.173104 3.359677 -0.349 0.726960
## thal1 3.911792 3.091294 1.265 0.205720
## thal2 2.791222 2.982312 0.936 0.349311
## thal3 1.356410 2.977998 0.455 0.648766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 127.12 on 219 degrees of freedom
## AIC: 171.12
##
## Number of Fisher Scoring iterations: 7

#Age should be removed since it reduces aic after its removal

model <- glm(target~.-chol, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - chol, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.44404 4.24888 -0.105 0.916766
## age 0.01272 0.02959 0.430 0.667287
## sex1 -2.41489 0.71785 -3.364 0.000768 ***
## cp1 1.22038 0.70464 1.732 0.083288 .
## cp2 2.58580 0.66586 3.883 0.000103 ***
## cp3 2.87907 0.85219 3.378 0.000729 ***
## trestbps -0.03508 0.01383 -2.537 0.011182 *
## fbs1 1.31623 0.81371 1.618 0.105757
## restecg1 0.19375 0.46605 0.416 0.677608
## restecg2 -0.06578 3.01838 -0.022 0.982614
## thalach 0.02989 0.01624 1.840 0.065696 .
## exang1 -0.63343 0.53583 -1.182 0.237144
## oldpeak -0.42392 0.26841 -1.579 0.114249
## slope1 -0.97229 1.00300 -0.969 0.332354
## slope2 0.78275 1.07296 0.730 0.465681
## ca1 -2.09718 0.65763 -3.189 0.001428 **
## ca2 -3.62163 0.98329 -3.683 0.000230 ***
## ca3 -1.83598 1.10052 -1.668 0.095261 .
## ca4 -1.08181 3.47259 -0.312 0.755400
## thal1 3.75233 2.93266 1.279 0.200723
## thal2 2.56257 2.81508 0.910 0.362663
## thal3 1.02733 2.80375 0.366 0.714057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 128.37 on 219 degrees of freedom
## AIC: 172.37
##
## Number of Fisher Scoring iterations: 7

#Chol should be removed since it reduces aic after removal.

model <- glm(target~.-fbs, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - fbs, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.707565 4.920003 0.144 0.885647
## age 0.014721 0.030413 0.484 0.628355
## sex1 -2.572096 0.747487 -3.441 0.000580 ***
## cp1 1.209278 0.694549 1.741 0.081666 .
## cp2 2.629193 0.654316 4.018 5.86e-05 ***
## cp3 3.003895 0.864801 3.474 0.000514 ***
## trestbps -0.032134 0.013221 -2.431 0.015075 *
## chol -0.006386 0.004972 -1.284 0.199035
## restecg1 0.125134 0.474661 0.264 0.792066
## restecg2 -0.301995 3.265169 -0.092 0.926309
## thalach 0.033734 0.016897 1.996 0.045886 *
## exang1 -0.651458 0.544731 -1.196 0.231725
## oldpeak -0.436764 0.266089 -1.641 0.100710
## slope1 -0.968080 1.006197 -0.962 0.335990
## slope2 0.679868 1.077629 0.631 0.528111
## ca1 -1.906226 0.618536 -3.082 0.002057 **
## ca2 -3.514953 0.973282 -3.611 0.000304 ***
## ca3 -1.524615 1.046876 -1.456 0.145297
## ca4 -0.522147 4.651809 -0.112 0.910628
## thal1 3.303207 3.755174 0.880 0.379054
## thal2 2.051047 3.663348 0.560 0.575559
## thal3 0.705558 3.670620 0.192 0.847572
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 129.54 on 219 degrees of freedom
## AIC: 173.54
##
## Number of Fisher Scoring iterations: 7

#fbs should not be removed since it increases aic after removal

model <- glm(target~.-restecg, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - restecg, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.363025 4.353294 0.083 0.933541
## age 0.016401 0.030232 0.543 0.587459
## sex1 -2.635390 0.740914 -3.557 0.000375 ***
## cp1 1.315717 0.712078 1.848 0.064644 .
## cp2 2.597341 0.671275 3.869 0.000109 ***
## cp3 2.924994 0.859950 3.401 0.000671 ***
## trestbps -0.035573 0.013639 -2.608 0.009104 **
## chol -0.006644 0.004961 -1.339 0.180501
## fbs1 1.346928 0.827585 1.628 0.103622
## thalach 0.033933 0.016907 2.007 0.044748 *
## exang1 -0.621461 0.545736 -1.139 0.254804
## oldpeak -0.410861 0.265914 -1.545 0.122325
## slope1 -0.893861 0.996916 -0.897 0.369919
## slope2 0.870259 1.066995 0.816 0.414719
## ca1 -2.085554 0.647452 -3.221 0.001277 **
## ca2 -3.706346 0.986647 -3.757 0.000172 ***
## ca3 -1.848735 1.127038 -1.640 0.100933
## ca4 -1.088224 3.450364 -0.315 0.752463
## thal1 3.878334 3.008006 1.289 0.197281
## thal2 2.736131 2.900255 0.943 0.345471
## thal3 1.308902 2.894590 0.452 0.651133
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 126.85 on 220 degrees of freedom
## AIC: 168.85
##
## Number of Fisher Scoring iterations: 7

#restecg should be removed since it reduces aic after removal

model <- glm(target~.-exang, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - exang, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.451644 4.102964 -0.110 0.912348
## age 0.017589 0.029896 0.588 0.556301
## sex1 -2.566693 0.733236 -3.501 0.000464 ***
## cp1 1.525123 0.688089 2.216 0.026660 *
## cp2 2.853317 0.637938 4.473 7.72e-06 ***
## cp3 3.060928 0.857519 3.570 0.000358 ***
## trestbps -0.035481 0.013831 -2.565 0.010308 *
## chol -0.006545 0.004967 -1.318 0.187579
## fbs1 1.348230 0.813749 1.657 0.097557 .
## restecg1 0.083141 0.476601 0.174 0.861516
## restecg2 -0.215391 3.169351 -0.068 0.945817
## thalach 0.035485 0.016760 2.117 0.034239 *
## oldpeak -0.445472 0.267789 -1.664 0.096208 .
## slope1 -0.892251 0.989585 -0.902 0.367248
## slope2 0.892124 1.061350 0.841 0.400597
## ca1 -2.087751 0.649074 -3.217 0.001298 **
## ca2 -3.598974 0.974396 -3.694 0.000221 ***
## ca3 -1.855011 1.117170 -1.660 0.096823 .
## ca4 -1.238432 3.346158 -0.370 0.711304
## thal1 3.987878 2.734565 1.458 0.144752
## thal2 2.843159 2.615743 1.087 0.277063
## thal3 1.344503 2.609336 0.515 0.606367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 128.10 on 219 degrees of freedom
## AIC: 172.1
##
## Number of Fisher Scoring iterations: 7

#exang should be removed since it reduces aic after removal

model <- glm(target~.-oldpeak, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - oldpeak, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.381717 4.232311 -0.090 0.928135
## age 0.021295 0.030209 0.705 0.480861
## sex1 -2.711512 0.727086 -3.729 0.000192 ***
## cp1 1.406083 0.718866 1.956 0.050468 .
## cp2 2.398675 0.638389 3.757 0.000172 ***
## cp3 2.734715 0.858382 3.186 0.001443 **
## trestbps -0.036830 0.013751 -2.678 0.007401 **
## chol -0.006730 0.005041 -1.335 0.181857
## fbs1 1.367347 0.790969 1.729 0.083863 .
## restecg1 0.082123 0.473702 0.173 0.862365
## restecg2 -0.600034 2.904674 -0.207 0.836342
## thalach 0.034541 0.016853 2.049 0.040415 *
## exang1 -0.708292 0.538294 -1.316 0.188239
## slope1 -0.507411 0.901779 -0.563 0.573654
## slope2 1.501021 0.926555 1.620 0.105232
## ca1 -2.010400 0.639365 -3.144 0.001664 **
## ca2 -3.909845 0.959312 -4.076 4.59e-05 ***
## ca3 -1.944451 1.082769 -1.796 0.072524 .
## ca4 -0.666123 3.643758 -0.183 0.854945
## thal1 3.552278 2.822071 1.259 0.208121
## thal2 2.589238 2.710921 0.955 0.339520
## thal3 1.103418 2.708624 0.407 0.683735
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 129.31 on 219 degrees of freedom
## AIC: 173.31
##
## Number of Fisher Scoring iterations: 7
#oldpeak should not be removed since it increases aic after removal

model <- glm(target~.-slope, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - slope, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.742790 3.584117 -0.207 0.835818
## age 0.015372 0.030434 0.505 0.613505
## sex1 -2.211146 0.676249 -3.270 0.001077 **
## cp1 1.347198 0.686410 1.963 0.049684 *
## cp2 2.397706 0.625744 3.832 0.000127 ***
## cp3 2.655163 0.806184 3.293 0.000990 ***
## trestbps -0.033253 0.013236 -2.512 0.011994 *
## chol -0.006654 0.004919 -1.353 0.176135
## fbs1 1.030259 0.764565 1.348 0.177816
## restecg1 0.186206 0.464385 0.401 0.688440
## restecg2 -0.357638 3.282078 -0.109 0.913229
## thalach 0.040990 0.015769 2.599 0.009337 **
## exang1 -0.695550 0.532766 -1.306 0.191708
## oldpeak -0.636878 0.253941 -2.508 0.012143 *
## ca1 -1.617110 0.584633 -2.766 0.005674 **
## ca2 -2.934552 0.873648 -3.359 0.000782 ***
## ca3 -1.492465 0.994232 -1.501 0.133324
## ca4 -1.135837 2.763365 -0.411 0.681048
## thal1 3.130701 2.100989 1.490 0.136196
## thal2 2.371865 1.949207 1.217 0.223667
## thal3 0.809129 1.946088 0.416 0.677577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 136.30 on 220 degrees of freedom
## AIC: 178.3
##
## Number of Fisher Scoring iterations: 6

#slope should not be removed since it reduces aic after removal.

model <- glm(target~.-thal, training, family = binomial)

summary(model)

##
## Call:
## glm(formula = target ~ . - thal, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.609153 3.473813 1.327 0.184566
## age 0.012875 0.029076 0.443 0.657895
## sex1 -2.873978 0.637989 -4.505 6.65e-06 ***
## cp1 1.608956 0.698737 2.303 0.021298 *
## cp2 2.507789 0.615335 4.075 4.59e-05 ***
## cp3 2.777243 0.775936 3.579 0.000345 ***
## trestbps -0.032936 0.013123 -2.510 0.012077 *
## chol -0.009421 0.004888 -1.928 0.053912 .
## fbs1 1.075935 0.724441 1.485 0.137492
## restecg1 -0.054437 0.450792 -0.121 0.903883
## restecg2 -0.729176 2.797157 -0.261 0.794336
## thalach 0.026316 0.014890 1.767 0.077164 .
## exang1 -0.868600 0.527297 -1.647 0.099503 .
## oldpeak -0.392701 0.244089 -1.609 0.107651
## slope1 -0.978026 0.932975 -1.048 0.294506
## slope2 0.769392 1.031594 0.746 0.455771
## ca1 -1.942310 0.589225 -3.296 0.000979 ***
## ca2 -3.318062 0.894370 -3.710 0.000207 ***
## ca3 -2.177570 1.060378 -2.054 0.040016 *
## ca4 -0.998000 2.652173 -0.376 0.706698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 139.70 on 221 degrees of freedom
## AIC: 179.7
##
## Number of Fisher Scoring iterations: 6

#Thal should not be removed since it increases aic after removal.

#Final model after optimization

model <- glm(target~.-age-chol-restecg-exang, training, family = binomial)
summary(model)

##
## Call:
## glm(formula = target ~ . - age - chol - restecg - exang, family =
binomial,
## data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.17499 3.65481 -0.048 0.961812
## sex1 -2.39086 0.69210 -3.454 0.000551 ***
## cp1 1.46593 0.67782 2.163 0.030562 *
## cp2 2.85714 0.63490 4.500 6.79e-06 ***
## cp3 2.98425 0.83328 3.581 0.000342 ***
## trestbps -0.03439 0.01311 -2.623 0.008729 **
## fbs1 1.30984 0.78043 1.678 0.093277 .
## thalach 0.02941 0.01465 2.008 0.044618 *
## oldpeak -0.47145 0.26303 -1.792 0.073080 .
## slope1 -0.96292 0.97900 -0.984 0.325326
## slope2 0.83988 1.05150 0.799 0.424442
## ca1 -2.08207 0.64692 -3.218 0.001289 **
## ca2 -3.38838 0.91279 -3.712 0.000206 ***
## ca3 -1.80585 1.09189 -1.654 0.098153 .
## ca4 -1.31700 3.06447 -0.430 0.667367
## thal1 3.97623 2.87657 1.382 0.166886
## thal2 2.73084 2.76286 0.988 0.322950
## thal3 1.14514 2.74812 0.417 0.676900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 130.19 on 223 degrees of freedom
## AIC: 166.19
##
## Number of Fisher Scoring iterations: 6

#Check model assumptions

library(car)

library(performance)

#Test of goodness of fit

check_predictions(model)
#The models predicted data fits the observed data hence the
#assumption of goodness of fit is not violated.

#Check outliers
check_outliers(model)

## OK: No outliers detected.

## - Based on the following method and threshold: cook (0.9).
## - For variable: (Whole model)

check_outliers(model) %>% plot()

plot(model, which = 4)
#Check for linearity
check_autocorrelation(model)

## OK: Residuals appear to be independent and not autocorrelated (p = 0.876).

check_residuals(model) %>% plot()

#Check autocorrelation
check_autocorrelation(model)

## OK: Residuals appear to be independent and not autocorrelated (p = 0.940).

#Check for multicollinearity

check_collinearity(model)

## # Check for Multicollinearity

##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## sex 1.80 [1.55, 2.16] 1.34 0.56 [0.46, 0.65]
## cp 1.85 [1.59, 2.23] 1.36 0.54 [0.45, 0.63]
## trestbps 1.19 [1.08, 1.44] 1.09 0.84 [0.70, 0.93]
## fbs 1.23 [1.11, 1.48] 1.11 0.81 [0.68, 0.90]
## thalach 1.43 [1.26, 1.71] 1.19 0.70 [0.59, 0.79]
## oldpeak 1.45 [1.28, 1.74] 1.21 0.69 [0.58, 0.78]
## slope 1.90 [1.63, 2.29] 1.38 0.53 [0.44, 0.61]
## ca 2.11 [1.79, 2.55] 1.45 0.47 [0.39, 0.56]
## thal 1.75 [1.51, 2.11] 1.32 0.57 [0.47, 0.66]

#Check for independence of residuals

durbinWatsonTest(model)

## lag Autocorrelation D-W Statistic p-value

## 1 -0.009517289 2.018653 0.964
## Alternative hypothesis: rho != 0

#Interpret model predictions

library(ggeffects)
ggeffect(model)

##
## $sex
## # Predicted probabilities of target
##
## sex | Predicted | 95% CI
## ----------------------------
## 0 | 0.86 | 0.68, 0.95
## 1 | 0.37 | 0.24, 0.52
##
##
## $cp
## # Predicted probabilities of target
##
## cp | Predicted | 95% CI
## ---------------------------
## 0 | 0.25 | 0.15, 0.40
## 1 | 0.60 | 0.31, 0.83
## 2 | 0.86 | 0.69, 0.94
## 3 | 0.87 | 0.62, 0.96
##
##
## $trestbps
## # Predicted probabilities of target
##
## trestbps | Predicted | 95% CI
## ---------------------------------
## 94 | 0.82 | 0.62, 0.93
## 107 | 0.75 | 0.58, 0.87
## 121 | 0.65 | 0.52, 0.76
## 134 | 0.54 | 0.42, 0.65
## 147 | 0.43 | 0.29, 0.58
## 160 | 0.33 | 0.17, 0.54
## 173 | 0.24 | 0.09, 0.50
## 200 | 0.11 | 0.02, 0.43

##
## $fbs
## # Predicted probabilities of target
##
## fbs | Predicted | 95% CI
## ----------------------------
## 0 | 0.51 | 0.39, 0.63
## 1 | 0.80 | 0.48, 0.94

##
## $thalach
## # Predicted probabilities of target
##
## thalach | Predicted | 95% CI
## --------------------------------
## 70 | 0.11 | 0.01, 0.57
## 85 | 0.16 | 0.03, 0.57
## 105 | 0.25 | 0.08, 0.58
## 120 | 0.35 | 0.16, 0.59
## 135 | 0.45 | 0.30, 0.61
## 155 | 0.60 | 0.48, 0.70
## 170 | 0.70 | 0.53, 0.82
## 205 | 0.87 | 0.56, 0.97

## $oldpeak
## # Predicted probabilities of target
##
## oldpeak | Predicted | 95% CI
## --------------------------------
## 0.00 | 0.67 | 0.51, 0.80
## 0.50 | 0.62 | 0.49, 0.73
## 1.50 | 0.50 | 0.37, 0.64
## 2.50 | 0.39 | 0.20, 0.62
## 3.00 | 0.33 | 0.14, 0.62
## 3.50 | 0.28 | 0.09, 0.62
## 4.50 | 0.20 | 0.04, 0.62
## 6.00 | 0.11 | 0.01, 0.63

##
## $slope
## # Predicted probabilities of target
##
## slope | Predicted | 95% CI
## ------------------------------
## 0 | 0.57 | 0.17, 0.89
## 1 | 0.33 | 0.19, 0.51
## 2 | 0.75 | 0.58, 0.87
##
##
## $ca
## # Predicted probabilities of target
##
## ca | Predicted | 95% CI
## ---------------------------
## 0 | 0.78 | 0.65, 0.87
## 1 | 0.30 | 0.13, 0.55
## 2 | 0.11 | 0.02, 0.37
## 3 | 0.36 | 0.07, 0.81
## 4 | 0.48 | 0.00, 1.00
##
##
## $thal
## # Predicted probabilities of target
##
## thal | Predicted | 95% CI
## -----------------------------
## 0 | 0.13 | 0.00, 0.97
## 1 | 0.89 | 0.59, 0.98
## 2 | 0.69 | 0.54, 0.81
## 3 | 0.31 | 0.18, 0.50
##
## ## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "model"

ggpredict(model, terms = "sex")

## # Predicted probabilities of target

##
## sex | Predicted | 95% CI
## ----------------------------
## 0 | 0.31 | 0.00, 0.99
## 1 | 0.04 | 0.00, 0.93
##
## Adjusted for:
## * age = 54.52
## * cp = 0
## * trestbps = 131.91
## * chol = 244.69
## * fbs = 0
## * restecg = 0
## * thalach = 149.77
## * exang = 0
## * oldpeak = 1.03
## * slope = 0
## * ca = 0
## * thal = 0

#Visualize model predictions

theme_set(theme_bw()+theme(title = element_text
(face = "bold",
colour = "steelblue")))
ggplot(heart, aes(target, fill = target))+
geom_bar(width = 0.7)+
labs(title = "Incidence of Heart Disease",
y = "Frequency",
x = "Disease Outcome")

ggplot(training, aes(sex, fill = target))+

geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Sex",
y = "Frequency",
x = "Sex",
caption = "Africa CDC(2024)")
ggplot(training, aes(cp, fill = target))+
geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Chest pain type",
y = "Frequency",
x = "Chest Pain Type",
caption = "Africa CDC(2024)")
ggplot(training, aes(trestbps, fill = target))+
geom_histogram(binwidth = 9, position = "dodge")+
labs(title = "Incidence of Heart Disease based on Resting Blood Pressure",
y = "Frequency",
x = "Resting Blood Pressure",
caption = "Africa CDC(2024)")
ggplot(training, aes(fbs, fill = target))+
geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Fasting blood sugar",
y = "Frequency",
x = "Fasting blood sugar",
caption = "Africa CDC(2024)")
table(heart$target, heart$fbs)

##
## 0 1
## 0 116 22
## 1 141 23

ggplot(training, aes(thalach, fill = target))+

geom_histogram(binwidth = 10, position = "dodge")+
labs(title = "Incidence of Heart Disease based on Heart rate",
y = "Frequency",
x = "Maximum heart rate",
caption = "Africa CDC(2024)")
ggplot(training, aes(oldpeak, fill = target))+
geom_histogram(binwidth = 1, position = "dodge")+
labs(title = "Incidence of Heart Disease based on ST Depression",
y = "Frequency",
x = "ST Depression induced dy Excercise",
caption = "Africa CDC(2024)")
ggplot(training, aes(slope, fill = target))+
geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Slope of Peak exercise",
y = "Frequency",
x = "Slope of the peak exercise",
caption = "Africa CDC(2024)")
table(heart$target, heart$slope)

##
## 0 1 2
## 0 12 91 35
## 1 9 49 106

ggplot(training, aes(ca, fill = target))+

geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Vessels colored by
floursopy",
y = "Frequency",
x = "Number of major vessels",
caption = "Africa CDC(2024)")
table(heart$target, heart$ca)

##
## 0 1 2 3 4
## 0 45 44 31 17 1
## 1 130 21 7 3 3

ggplot(training, aes(thal, fill = target))+

geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on thal",
y = "Frequency",
caption = "Africa CDC(2024)")
table(heart$target, heart$thal)

##
## 0 1 2 3
## 0 1 12 36 89
## 1 1 6 129 28

#Checking variable importance

library(vip)

car::Anova(model)

## Analysis of Deviance Table (Type II tests)

##
## Response: target
## LR Chisq Df Pr(>Chisq)
## sex 14.541 1 0.0001372 ***
## cp 32.465 3 4.176e-07 ***
## trestbps 7.608 1 0.0058124 **
## fbs 2.931 1 0.0868757 .
## thalach 4.597 1 0.0320293 *
## oldpeak 3.401 1 0.0651615 .
## slope 10.335 2 0.0056982 **
## ca 25.131 4 4.736e-05 ***
## thal 16.569 3 0.0008666 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vip(model)

#Produce model equation

library(equatiomatic)

## Warning: package 'equatiomatic' was built under R version 4.4.2

extract_eq(model)

𝑃(target = 1)
log [ ]
1 − 𝑃(target = 1)
= 𝛼 + 𝛽1 (sex1 ) + 𝛽2 (cp1 ) + 𝛽3 (cp2 ) + 𝛽4 (cp3 ) + 𝛽5 (trestbps) + 𝛽6 (fbs1 ) + 𝛽7 (thalach)
+ 𝛽8 (oldpeak) + 𝛽9 (slope1 ) + 𝛽10 (slope2 ) + 𝛽11 (ca1 ) + 𝛽12 (ca2 ) + 𝛽13 (ca3 ) + 𝛽14 (ca4 )
+ 𝛽15 (thal1 ) + 𝛽16 (thal2 ) + 𝛽17 (thal3 )
#How well our model fits the data
library(performance)
performance(model)

## # Indices of model performance

##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss |
Score_log
## --------------------------------------------------------------------------
----
## 166.193 | 169.274 | 228.920 | 0.665 | 0.287 | 1.000 | 0.270 |
-Inf
##
## AIC | Score_spherical | PCP
## ---------------------------------
## 166.193 | 0.025 | 0.834

library(effectsize)
interpret_r2(0.665)

## [1] "substantial"
## (Rules: cohen1988)

#Making Predictions

predictions <- predict(model, testing, type = "response")

results <- data.frame(Actual = testing$target,
Predicted = predictions)
tail(results)

## Actual Predicted
## 56 0 0.05760362
## 57 1 0.07685093
## 58 1 0.98546924
## 59 0 0.75525302
## 60 0 0.05270169
## 61 1 0.95253725

#Model Evaluation
table(Actual = testing$target, Predicted = predictions > 0.3)

## Predicted
## Actual FALSE TRUE
## 0 21 7
## 1 4 29

#Accuracy
((21+29)/(21+7+4+29))*100

## [1] 81.96721

#Recall
((29)/(7+29))*100

## [1] 80.55556

#Precision
((29)/(29+4))*100

## [1] 87.87879

#F1-Score
((80.56*87.88)/(80.56+87.88))*2

## [1] 84.06095
#MCC
((29*21)-(4*7))/911.9

## [1] 0.6371313

library(pROC)

roc(target~fitted.values(model), data = training,

plot = TRUE, legacy.axes = TRUE,
print.auc = TRUE, ci = TRUE)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

##
## Call:
## roc.formula(formula = target ~ fitted.values(model), data = training,
plot = TRUE, legacy.axes = TRUE, print.auc = TRUE, ci = TRUE)
##
## Data: fitted.values(model) in 110 controls (target 0) < 131 cases (target
1).
## Area under the curve: 0.9555
## 95% CI: 0.9318-0.9792 (DeLong)

#AUC
#0.956
#Generate model probabilities
probs <- predict(model, testing, type = "response")

#Data frame with actual target and predicted probabilities

scores <- data.frame(
Actual = testing$target,
Predicted_Probability = probs
)

# Classify the scores into risk categories

# Define risk categories based on probability thresholds
scores <- scores %>%
mutate(
Risk_Category = case_when(
Predicted_Probability < 0.3 ~ "Low Risk",
Predicted_Probability >= 0.3 & Predicted_Probability < 0.6 ~ "Medium
Risk",
Predicted_Probability >= 0.6 ~ "High Risk"
)
)

# Visualize the distribution of risk categories

library(ggplot2)
ggplot(scores, aes(x = Predicted_Probability, fill = Risk_Category)) +
geom_histogram(binwidth = 0.05, color = "black") +
labs(title = "Distribution of Predicted Probability Scores",
x = "Predicted Probability",
y = "Frequency")
# Review the first few rows of the score data
head(scores, 30)

## Actual Predicted_Probability Risk_Category

## 1 0 0.1936875725 Low Risk
## 2 0 0.0934782022 Low Risk
## 3 0 0.2997032177 Low Risk
## 4 0 0.9294019274 High Risk
## 5 0 0.8761094468 High Risk
## 6 1 0.0028066440 Low Risk
## 7 0 0.1435069905 Low Risk
## 8 1 0.8063566989 High Risk
## 9 1 0.9472678160 High Risk
## 10 0 0.0036480026 Low Risk
## 11 1 0.5850682700 Medium Risk
## 12 0 0.1976715032 Low Risk
## 13 0 0.0132086376 Low Risk
## 14 1 0.9759977332 High Risk
## 15 0 0.0016043285 Low Risk
## 16 1 0.9659359017 High Risk
## 17 0 0.0667607931 Low Risk
## 18 1 0.9706073626 High Risk
## 19 0 0.0002886308 Low Risk
## 20 1 0.9930553909 High Risk
## 21 0 0.5857953139 Medium Risk
## 22 0 0.0153843213 Low Risk
## 23 1 0.9572845471 High Risk
## 24 1 0.3382015221 Medium Risk
## 25 0 0.0004665334 Low Risk
## 26 1 0.7879620087 High Risk
## 27 0 0.0228474896 Low Risk
## 28 1 0.9777087138 High Risk
## 29 1 0.5422930068 Medium Risk
## 30 0 0.0021259245 Low Risk

Ex 10 - Decision Tree With Rpart and Fancy Plot and Cardio Data
No ratings yet
Ex 10 - Decision Tree With Rpart and Fancy Plot and Cardio Data
4 pages
Exame Do Dia 13 12 2019
No ratings yet
Exame Do Dia 13 12 2019
8 pages
Manova
No ratings yet
Manova
8 pages
A028 GLM-SC3
No ratings yet
A028 GLM-SC3
137 pages
HW 2
No ratings yet
HW 2
9 pages
20mia1006 FDA LAB REGRESSION TYPES
No ratings yet
20mia1006 FDA LAB REGRESSION TYPES
11 pages
Multicollinearity and Oaxaca - Tutorial
No ratings yet
Multicollinearity and Oaxaca - Tutorial
35 pages
212011497-4SE5-Kautsar Hilmi Izzuddin Pertemuan 5
No ratings yet
212011497-4SE5-Kautsar Hilmi Izzuddin Pertemuan 5
13 pages
WEEK
No ratings yet
WEEK
17 pages
Model Linear
No ratings yet
Model Linear
33 pages
Chapter 6. Product Limit Stimator Peter Smith: Required Libraries
No ratings yet
Chapter 6. Product Limit Stimator Peter Smith: Required Libraries
9 pages
Report
No ratings yet
Report
10 pages
Exame Do Dia 31 01 2020
No ratings yet
Exame Do Dia 31 01 2020
7 pages
Assingment 8: Chitresh Kumar
No ratings yet
Assingment 8: Chitresh Kumar
20 pages
PH6205 RTutorial 2
No ratings yet
PH6205 RTutorial 2
15 pages
Kautsar Hilmi Izzuddin - Tugas SAE P5
No ratings yet
Kautsar Hilmi Izzuddin - Tugas SAE P5
13 pages
Stata 2
No ratings yet
Stata 2
11 pages
Results 1
No ratings yet
Results 1
4 pages
BAN5
No ratings yet
BAN5
2 pages
Amta - Final - Notes.r: ### Step Wise AIC Regression
No ratings yet
Amta - Final - Notes.r: ### Step Wise AIC Regression
6 pages
Assignment# 06
No ratings yet
Assignment# 06
16 pages
ASSIGNMENT NO - 2, FDAS - SUMANYAKUMARI - Bfia
No ratings yet
ASSIGNMENT NO - 2, FDAS - SUMANYAKUMARI - Bfia
6 pages
Maths Lab
No ratings yet
Maths Lab
17 pages
R Code Default Data PDF
No ratings yet
R Code Default Data PDF
10 pages
Sae P5 Kautsar
No ratings yet
Sae P5 Kautsar
13 pages
Fruit Veg Price Analysis Results1
No ratings yet
Fruit Veg Price Analysis Results1
46 pages
Old Faithful Geyser Data Analysis
No ratings yet
Old Faithful Geyser Data Analysis
12 pages
Heart Disease Prediction! ?
No ratings yet
Heart Disease Prediction! ?
52 pages
Prac 1 AP
No ratings yet
Prac 1 AP
9 pages
BT PTTKNC
No ratings yet
BT PTTKNC
5 pages
Pool
No ratings yet
Pool
13 pages
SKG Assignemnet
No ratings yet
SKG Assignemnet
3 pages
ECO311 Stata
100% (1)
ECO311 Stata
111 pages
Machine Learning-Lecture 2 (Student)
No ratings yet
Machine Learning-Lecture 2 (Student)
9 pages
Project of Biostatistics#02-RaeesaAli-MS - BIOTECH
No ratings yet
Project of Biostatistics#02-RaeesaAli-MS - BIOTECH
27 pages
Assignment - 3.R: 2021-08-02 by Harshith H S 2001115
No ratings yet
Assignment - 3.R: 2021-08-02 by Harshith H S 2001115
12 pages
Final Estimationsa
No ratings yet
Final Estimationsa
7 pages
Lab Wk1soln PDF
No ratings yet
Lab Wk1soln PDF
14 pages
WK 2 DQ1
No ratings yet
WK 2 DQ1
9 pages
Saad Akhtar
No ratings yet
Saad Akhtar
48 pages
Data Science
No ratings yet
Data Science
15 pages
Exam
No ratings yet
Exam
6 pages
kNN with R Caret Package Guide
No ratings yet
kNN with R Caret Package Guide
17 pages
Problem 4.1 A)
No ratings yet
Problem 4.1 A)
11 pages
Bda Assign
No ratings yet
Bda Assign
15 pages
Code
No ratings yet
Code
25 pages
Việt Cường
No ratings yet
Việt Cường
14 pages
2024 Bank Syariah NPL
No ratings yet
2024 Bank Syariah NPL
4 pages
Uji Data NPL, Bopo, Car, Nim, Dan Roa 15 Bank Umum
No ratings yet
Uji Data NPL, Bopo, Car, Nim, Dan Roa 15 Bank Umum
9 pages
STAT-2450 Assignment 1: Name:, Student ID: B00
No ratings yet
STAT-2450 Assignment 1: Name:, Student ID: B00
9 pages
Output Da Record
No ratings yet
Output Da Record
16 pages
Problem Set 3
No ratings yet
Problem Set 3
9 pages
4027 Assignment Q5
No ratings yet
4027 Assignment Q5
12 pages
Apotelesmata 2000+ Robust
No ratings yet
Apotelesmata 2000+ Robust
4 pages
R Commands
No ratings yet
R Commands
5 pages
Assignment 4: Chitresh Kumar
No ratings yet
Assignment 4: Chitresh Kumar
7 pages
Da Lab File 2
No ratings yet
Da Lab File 2
13 pages
Toc
No ratings yet
Toc
14 pages
Brand Preference Regression Analysis
100% (2)
Brand Preference Regression Analysis
18 pages
One Way Anova: SUM 175 195 153 Mean
No ratings yet
One Way Anova: SUM 175 195 153 Mean
4 pages
Hypothesis
No ratings yet
Hypothesis
11 pages
MT271 Chapter
No ratings yet
MT271 Chapter
14 pages
Research (Third Quarter)
No ratings yet
Research (Third Quarter)
3 pages
Advanced Least Squares Techniques
No ratings yet
Advanced Least Squares Techniques
12 pages
Analytical Chemistry MCQ Part 2 of 10
No ratings yet
Analytical Chemistry MCQ Part 2 of 10
11 pages
Unit 4 L3 Analysis of Empirical Data
No ratings yet
Unit 4 L3 Analysis of Empirical Data
29 pages
Section+02+ +Getting+Started+With+AzureML
No ratings yet
Section+02+ +Getting+Started+With+AzureML
33 pages
Epidemiology Statistical Tests Guide
No ratings yet
Epidemiology Statistical Tests Guide
6 pages
Evidence Based Ophthalmology: Noel D. Atienza, MD, MSC
No ratings yet
Evidence Based Ophthalmology: Noel D. Atienza, MD, MSC
59 pages
Probability and Statistics Mock Test
No ratings yet
Probability and Statistics Mock Test
3 pages
Binomial Distribution
No ratings yet
Binomial Distribution
22 pages
Statistical Inference - 1 Notes
No ratings yet
Statistical Inference - 1 Notes
17 pages
ANOVA Examples
No ratings yet
ANOVA Examples
5 pages
Biostatistics
No ratings yet
Biostatistics
3 pages
Males, Ages Birth - 36 Months
No ratings yet
Males, Ages Birth - 36 Months
13 pages
دور المعلم في مراعات الفروق الفردية لدى طلبته في ضوء بعض المتغيرات
No ratings yet
دور المعلم في مراعات الفروق الفردية لدى طلبته في ضوء بعض المتغيرات
23 pages
Probability and Statistics With Examples Using R Siva Athreya, Deepayan Sarkar, and Steve Tanner
No ratings yet
Probability and Statistics With Examples Using R Siva Athreya, Deepayan Sarkar, and Steve Tanner
258 pages
Scan 12 Jun 25 16 17 27
No ratings yet
Scan 12 Jun 25 16 17 27
10 pages
Correlation & Regression Key Answers
No ratings yet
Correlation & Regression Key Answers
2 pages
Week 1 - Introduction To Statistics PDF
No ratings yet
Week 1 - Introduction To Statistics PDF
34 pages
The Mann Whitney or Wilcoxon Rank
No ratings yet
The Mann Whitney or Wilcoxon Rank
6 pages
Project On Travel Sector
No ratings yet
Project On Travel Sector
35 pages
Unit 3 (Hypothesis Testing)
No ratings yet
Unit 3 (Hypothesis Testing)
40 pages
Statistics For Business and Economics (13e) : John Loucks
100% (1)
Statistics For Business and Economics (13e) : John Loucks
48 pages
Unit 6 - CPH 4510 Biostatistics - Reading Assignment
No ratings yet
Unit 6 - CPH 4510 Biostatistics - Reading Assignment
13 pages
Chapter 7 Dynamic Econometric Models
No ratings yet
Chapter 7 Dynamic Econometric Models
15 pages

Heart Disease Prediction Model

Uploaded by

Heart Disease Prediction Model

Uploaded by

HEART DISEASE PREDICTION MODEL

heart <- read_csv("C:/Users/PC/OneDrive/Desktop/Data

duplicated(heart) %>% table()

heart <- heart %>% distinct(.)

#Check the relationship between variables

## age sex cp trestbps chol fbs restecg thalach exang

#Visualize your correlations

#Splitting the dataset

#Build the model

#Age should be removed since it reduces aic after its removal

model <- glm(target~.-chol, training, family = binomial)

#Chol should be removed since it reduces aic after removal.

model <- glm(target~.-fbs, training, family = binomial)

#fbs should not be removed since it increases aic after removal

model <- glm(target~.-restecg, training, family = binomial)

#restecg should be removed since it reduces aic after removal

model <- glm(target~.-exang, training, family = binomial)

#exang should be removed since it reduces aic after removal

model <- glm(target~.-oldpeak, training, family = binomial)

model <- glm(target~.-slope, training, family = binomial)

#slope should not be removed since it reduces aic after removal.

model <- glm(target~.-thal, training, family = binomial)

#Thal should not be removed since it increases aic after removal.

#Final model after optimization

#Check model assumptions

#Test of goodness of fit

## OK: No outliers detected.

check_outliers(model) %>% plot()

## OK: Residuals appear to be independent and not autocorrelated (p = 0.876).

check_residuals(model) %>% plot()

## OK: Residuals appear to be independent and not autocorrelated (p = 0.940).

#Check for multicollinearity

## # Check for Multicollinearity

#Check for independence of residuals

## lag Autocorrelation D-W Statistic p-value

#Interpret model predictions

ggpredict(model, terms = "sex")

## # Predicted probabilities of target

#Visualize model predictions

ggplot(training, aes(sex, fill = target))+

ggplot(training, aes(thalach, fill = target))+

ggplot(training, aes(ca, fill = target))+

ggplot(training, aes(thal, fill = target))+

#Checking variable importance

## Analysis of Deviance Table (Type II tests)

#Produce model equation

## Warning: package 'equatiomatic' was built under R version 4.4.2

## # Indices of model performance

predictions <- predict(model, testing, type = "response")

roc(target~fitted.values(model), data = training,

## Setting levels: control = 0, case = 1

#Data frame with actual target and predicted probabilities

# Classify the scores into risk categories

# Visualize the distribution of risk categories

## Actual Predicted_Probability Risk_Category

You might also like