0% found this document useful (0 votes)
22 views35 pages

Heart Disease Prediction Model

Epidemiología 2
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
22 views35 pages

Heart Disease Prediction Model

Epidemiología 2
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 35

HEART DISEASE PREDICTION MODEL

Enock Bereka

2024-12-14
#Load the required library and the dataset

heart <- read_csv("C:/Users/PC/OneDrive/Desktop/Data


Science/Datasets/Project/heart.csv")

view(heart)

#Data Exploration
head(heart)

## # A tibble: 6 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
<dbl>
## 1 52 1 0 125 212 0 1 168 0 1
2
## 2 53 1 0 140 203 1 0 155 1 3.1
0
## 3 70 1 0 145 174 0 1 125 1 2.6
0
## 4 61 1 0 148 203 0 1 161 0 0
2
## 5 62 0 0 138 294 1 1 106 0 1.9
1
## 6 58 0 0 100 248 0 0 122 0 1
1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, target <dbl>

glimpse(heart)

## Rows: 1,025
## Columns: 14
## $ age <dbl> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51,
52, 3…
## $ sex <dbl> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
1, 1…
## $ cp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1,
2, 2…
## $ trestbps <dbl> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112,
132, 1…
## $ chol <dbl> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149,
341, 2…
## $ fbs <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
1, 0…
## $ restecg <dbl> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
0, 0…
## $ thalach <dbl> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125,
136, 1…
## $ exang <dbl> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
0, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6,
3.0, 0…
## $ slope <dbl> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2,
2, 1…
## $ ca <dbl> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0,
0, 0…
## $ thal <dbl> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2,
2, 2…
## $ target <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
1, 0…

anyNA(heart)

## [1] FALSE

duplicated(heart) %>% table()

## .
## FALSE TRUE
## 302 723

heart <- heart %>% distinct(.)

#Check the relationship between variables


round(cor(heart), 2)

## age sex cp trestbps chol fbs restecg thalach exang


oldpeak
## age 1.00 -0.09 -0.06 0.28 0.21 0.12 -0.11 -0.40 0.09
0.21
## sex -0.09 1.00 -0.05 -0.06 -0.20 0.05 -0.06 -0.05 0.14
0.10
## cp -0.06 -0.05 1.00 0.05 -0.07 0.10 0.04 0.29 -0.39 -
0.15
## trestbps 0.28 -0.06 0.05 1.00 0.13 0.18 -0.12 -0.05 0.07
0.19
## chol 0.21 -0.20 -0.07 0.13 1.00 0.01 -0.15 -0.01 0.06
0.05
## fbs 0.12 0.05 0.10 0.18 0.01 1.00 -0.08 -0.01 0.02
0.00
## restecg -0.11 -0.06 0.04 -0.12 -0.15 -0.08 1.00 0.04 -0.07 -
0.06
## thalach -0.40 -0.05 0.29 -0.05 -0.01 -0.01 0.04 1.00 -0.38 -
0.34
## exang 0.09 0.14 -0.39 0.07 0.06 0.02 -0.07 -0.38 1.00
0.29
## oldpeak 0.21 0.10 -0.15 0.19 0.05 0.00 -0.06 -0.34 0.29
1.00
## slope -0.16 -0.03 0.12 -0.12 0.00 -0.06 0.09 0.38 -0.26 -
0.58
## ca 0.30 0.11 -0.20 0.10 0.09 0.14 -0.08 -0.23 0.13
0.24
## thal 0.07 0.21 -0.16 0.06 0.10 -0.03 -0.01 -0.09 0.21
0.21
## target -0.22 -0.28 0.43 -0.15 -0.08 -0.03 0.13 0.42 -0.44 -
0.43
## slope ca thal target
## age -0.16 0.30 0.07 -0.22
## sex -0.03 0.11 0.21 -0.28
## cp 0.12 -0.20 -0.16 0.43
## trestbps -0.12 0.10 0.06 -0.15
## chol 0.00 0.09 0.10 -0.08
## fbs -0.06 0.14 -0.03 -0.03
## restecg 0.09 -0.08 -0.01 0.13
## thalach 0.38 -0.23 -0.09 0.42
## exang -0.26 0.13 0.21 -0.44
## oldpeak -0.58 0.24 0.21 -0.43
## slope 1.00 -0.09 -0.10 0.34
## ca -0.09 1.00 0.16 -0.41
## thal -0.10 0.16 1.00 -0.34
## target 0.34 -0.41 -0.34 1.00

cr <- round(cor(heart), 2)

#Visualize your correlations


library(ggcorrplot)
ggcorrplot(cr,title = "correlogram",lab_col = "black",
lab = TRUE, legend.title = "Pearson Correlation",
lab_size = 3, ggtheme = theme_classic(),
outline.color = "black",
colors = c("orange", "green", "blue"))
#Setting up factors
heart$sex <- as.factor(heart$sex)
heart$cp <- as.factor(heart$cp)
heart$fbs <- as.factor(heart$fbs)
heart$restecg <- as.factor(heart$restecg)
heart$exang <- as.factor(heart$exang)
heart$slope <- as.factor(heart$slope)
heart$ca <- as.factor(heart$ca)
heart$thal <- as.factor(heart$thal)
heart$target <- as.factor(heart$target)

#Splitting the dataset


library(caTools)
set.seed(2)
split <- sample.split(heart$target, SplitRatio = 0.8)
training <- subset(heart, split == TRUE)
testing <- subset(heart, split == FALSE)

#Build the model


model <- glm(target~., training, family = binomial)
summary(model)

##
## Call:
## glm(formula = target ~ ., family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.295200 4.353818 0.068 0.945943
## age 0.016480 0.030258 0.545 0.585989
## sex1 -2.624782 0.747849 -3.510 0.000448 ***
## cp1 1.306173 0.714180 1.829 0.067413 .
## cp2 2.596896 0.671185 3.869 0.000109 ***
## cp3 2.920285 0.861528 3.390 0.000700 ***
## trestbps -0.035263 0.013776 -2.560 0.010474 *
## chol -0.006496 0.005081 -1.278 0.201115
## fbs1 1.341979 0.831161 1.615 0.106401
## restecg1 0.074422 0.478785 0.155 0.876474
## restecg2 -0.243861 3.353627 -0.073 0.942032
## thalach 0.033606 0.017009 1.976 0.048174 *
## exang1 -0.619649 0.545508 -1.136 0.255993
## oldpeak -0.409794 0.267555 -1.532 0.125615
## slope1 -0.893167 1.000572 -0.893 0.372041
## slope2 0.861472 1.072122 0.804 0.421674
## ca1 -2.077715 0.649907 -3.197 0.001389 **
## ca2 -3.701530 0.989076 -3.742 0.000182 ***
## ca3 -1.845235 1.123495 -1.642 0.100506
## ca4 -1.084200 3.504617 -0.309 0.757045
## thal1 3.856076 2.968733 1.299 0.193980
## thal2 2.734698 2.856244 0.957 0.338342
## thal3 1.295089 2.852677 0.454 0.649835
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 126.82 on 218 degrees of freedom
## AIC: 172.82
##
## Number of Fisher Scoring iterations: 7

#Model optimization
model <- glm(target~.-age, training, family = binomial)
summary(model)

##
## Call:
## glm(formula = target ~ . - age, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.325665 4.011251 0.330 0.741032
## sex1 -2.645056 0.744208 -3.554 0.000379 ***
## cp1 1.314534 0.712747 1.844 0.065136 .
## cp2 2.614185 0.672591 3.887 0.000102 ***
## cp3 2.879652 0.851004 3.384 0.000715 ***
## trestbps -0.033395 0.013231 -2.524 0.011602 *
## chol -0.006145 0.004981 -1.234 0.217310
## fbs1 1.315863 0.823644 1.598 0.110129
## restecg1 0.072474 0.479066 0.151 0.879754
## restecg2 -0.203098 3.107969 -0.065 0.947897
## thalach 0.030176 0.015683 1.924 0.054334 .
## exang1 -0.628631 0.543178 -1.157 0.247142
## oldpeak -0.423506 0.266105 -1.591 0.111497
## slope1 -0.879646 1.000413 -0.879 0.379248
## slope2 0.874920 1.071971 0.816 0.414398
## ca1 -2.029203 0.643416 -3.154 0.001612 **
## ca2 -3.555466 0.944992 -3.762 0.000168 ***
## ca3 -1.760305 1.124240 -1.566 0.117402
## ca4 -1.173104 3.359677 -0.349 0.726960
## thal1 3.911792 3.091294 1.265 0.205720
## thal2 2.791222 2.982312 0.936 0.349311
## thal3 1.356410 2.977998 0.455 0.648766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 127.12 on 219 degrees of freedom
## AIC: 171.12
##
## Number of Fisher Scoring iterations: 7

#Age should be removed since it reduces aic after its removal

model <- glm(target~.-chol, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - chol, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.44404 4.24888 -0.105 0.916766
## age 0.01272 0.02959 0.430 0.667287
## sex1 -2.41489 0.71785 -3.364 0.000768 ***
## cp1 1.22038 0.70464 1.732 0.083288 .
## cp2 2.58580 0.66586 3.883 0.000103 ***
## cp3 2.87907 0.85219 3.378 0.000729 ***
## trestbps -0.03508 0.01383 -2.537 0.011182 *
## fbs1 1.31623 0.81371 1.618 0.105757
## restecg1 0.19375 0.46605 0.416 0.677608
## restecg2 -0.06578 3.01838 -0.022 0.982614
## thalach 0.02989 0.01624 1.840 0.065696 .
## exang1 -0.63343 0.53583 -1.182 0.237144
## oldpeak -0.42392 0.26841 -1.579 0.114249
## slope1 -0.97229 1.00300 -0.969 0.332354
## slope2 0.78275 1.07296 0.730 0.465681
## ca1 -2.09718 0.65763 -3.189 0.001428 **
## ca2 -3.62163 0.98329 -3.683 0.000230 ***
## ca3 -1.83598 1.10052 -1.668 0.095261 .
## ca4 -1.08181 3.47259 -0.312 0.755400
## thal1 3.75233 2.93266 1.279 0.200723
## thal2 2.56257 2.81508 0.910 0.362663
## thal3 1.02733 2.80375 0.366 0.714057
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 128.37 on 219 degrees of freedom
## AIC: 172.37
##
## Number of Fisher Scoring iterations: 7

#Chol should be removed since it reduces aic after removal.

model <- glm(target~.-fbs, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - fbs, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.707565 4.920003 0.144 0.885647
## age 0.014721 0.030413 0.484 0.628355
## sex1 -2.572096 0.747487 -3.441 0.000580 ***
## cp1 1.209278 0.694549 1.741 0.081666 .
## cp2 2.629193 0.654316 4.018 5.86e-05 ***
## cp3 3.003895 0.864801 3.474 0.000514 ***
## trestbps -0.032134 0.013221 -2.431 0.015075 *
## chol -0.006386 0.004972 -1.284 0.199035
## restecg1 0.125134 0.474661 0.264 0.792066
## restecg2 -0.301995 3.265169 -0.092 0.926309
## thalach 0.033734 0.016897 1.996 0.045886 *
## exang1 -0.651458 0.544731 -1.196 0.231725
## oldpeak -0.436764 0.266089 -1.641 0.100710
## slope1 -0.968080 1.006197 -0.962 0.335990
## slope2 0.679868 1.077629 0.631 0.528111
## ca1 -1.906226 0.618536 -3.082 0.002057 **
## ca2 -3.514953 0.973282 -3.611 0.000304 ***
## ca3 -1.524615 1.046876 -1.456 0.145297
## ca4 -0.522147 4.651809 -0.112 0.910628
## thal1 3.303207 3.755174 0.880 0.379054
## thal2 2.051047 3.663348 0.560 0.575559
## thal3 0.705558 3.670620 0.192 0.847572
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 129.54 on 219 degrees of freedom
## AIC: 173.54
##
## Number of Fisher Scoring iterations: 7

#fbs should not be removed since it increases aic after removal

model <- glm(target~.-restecg, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - restecg, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.363025 4.353294 0.083 0.933541
## age 0.016401 0.030232 0.543 0.587459
## sex1 -2.635390 0.740914 -3.557 0.000375 ***
## cp1 1.315717 0.712078 1.848 0.064644 .
## cp2 2.597341 0.671275 3.869 0.000109 ***
## cp3 2.924994 0.859950 3.401 0.000671 ***
## trestbps -0.035573 0.013639 -2.608 0.009104 **
## chol -0.006644 0.004961 -1.339 0.180501
## fbs1 1.346928 0.827585 1.628 0.103622
## thalach 0.033933 0.016907 2.007 0.044748 *
## exang1 -0.621461 0.545736 -1.139 0.254804
## oldpeak -0.410861 0.265914 -1.545 0.122325
## slope1 -0.893861 0.996916 -0.897 0.369919
## slope2 0.870259 1.066995 0.816 0.414719
## ca1 -2.085554 0.647452 -3.221 0.001277 **
## ca2 -3.706346 0.986647 -3.757 0.000172 ***
## ca3 -1.848735 1.127038 -1.640 0.100933
## ca4 -1.088224 3.450364 -0.315 0.752463
## thal1 3.878334 3.008006 1.289 0.197281
## thal2 2.736131 2.900255 0.943 0.345471
## thal3 1.308902 2.894590 0.452 0.651133
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 126.85 on 220 degrees of freedom
## AIC: 168.85
##
## Number of Fisher Scoring iterations: 7

#restecg should be removed since it reduces aic after removal

model <- glm(target~.-exang, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - exang, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.451644 4.102964 -0.110 0.912348
## age 0.017589 0.029896 0.588 0.556301
## sex1 -2.566693 0.733236 -3.501 0.000464 ***
## cp1 1.525123 0.688089 2.216 0.026660 *
## cp2 2.853317 0.637938 4.473 7.72e-06 ***
## cp3 3.060928 0.857519 3.570 0.000358 ***
## trestbps -0.035481 0.013831 -2.565 0.010308 *
## chol -0.006545 0.004967 -1.318 0.187579
## fbs1 1.348230 0.813749 1.657 0.097557 .
## restecg1 0.083141 0.476601 0.174 0.861516
## restecg2 -0.215391 3.169351 -0.068 0.945817
## thalach 0.035485 0.016760 2.117 0.034239 *
## oldpeak -0.445472 0.267789 -1.664 0.096208 .
## slope1 -0.892251 0.989585 -0.902 0.367248
## slope2 0.892124 1.061350 0.841 0.400597
## ca1 -2.087751 0.649074 -3.217 0.001298 **
## ca2 -3.598974 0.974396 -3.694 0.000221 ***
## ca3 -1.855011 1.117170 -1.660 0.096823 .
## ca4 -1.238432 3.346158 -0.370 0.711304
## thal1 3.987878 2.734565 1.458 0.144752
## thal2 2.843159 2.615743 1.087 0.277063
## thal3 1.344503 2.609336 0.515 0.606367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 128.10 on 219 degrees of freedom
## AIC: 172.1
##
## Number of Fisher Scoring iterations: 7

#exang should be removed since it reduces aic after removal

model <- glm(target~.-oldpeak, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - oldpeak, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.381717 4.232311 -0.090 0.928135
## age 0.021295 0.030209 0.705 0.480861
## sex1 -2.711512 0.727086 -3.729 0.000192 ***
## cp1 1.406083 0.718866 1.956 0.050468 .
## cp2 2.398675 0.638389 3.757 0.000172 ***
## cp3 2.734715 0.858382 3.186 0.001443 **
## trestbps -0.036830 0.013751 -2.678 0.007401 **
## chol -0.006730 0.005041 -1.335 0.181857
## fbs1 1.367347 0.790969 1.729 0.083863 .
## restecg1 0.082123 0.473702 0.173 0.862365
## restecg2 -0.600034 2.904674 -0.207 0.836342
## thalach 0.034541 0.016853 2.049 0.040415 *
## exang1 -0.708292 0.538294 -1.316 0.188239
## slope1 -0.507411 0.901779 -0.563 0.573654
## slope2 1.501021 0.926555 1.620 0.105232
## ca1 -2.010400 0.639365 -3.144 0.001664 **
## ca2 -3.909845 0.959312 -4.076 4.59e-05 ***
## ca3 -1.944451 1.082769 -1.796 0.072524 .
## ca4 -0.666123 3.643758 -0.183 0.854945
## thal1 3.552278 2.822071 1.259 0.208121
## thal2 2.589238 2.710921 0.955 0.339520
## thal3 1.103418 2.708624 0.407 0.683735
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 129.31 on 219 degrees of freedom
## AIC: 173.31
##
## Number of Fisher Scoring iterations: 7
#oldpeak should not be removed since it increases aic after removal

model <- glm(target~.-slope, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - slope, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.742790 3.584117 -0.207 0.835818
## age 0.015372 0.030434 0.505 0.613505
## sex1 -2.211146 0.676249 -3.270 0.001077 **
## cp1 1.347198 0.686410 1.963 0.049684 *
## cp2 2.397706 0.625744 3.832 0.000127 ***
## cp3 2.655163 0.806184 3.293 0.000990 ***
## trestbps -0.033253 0.013236 -2.512 0.011994 *
## chol -0.006654 0.004919 -1.353 0.176135
## fbs1 1.030259 0.764565 1.348 0.177816
## restecg1 0.186206 0.464385 0.401 0.688440
## restecg2 -0.357638 3.282078 -0.109 0.913229
## thalach 0.040990 0.015769 2.599 0.009337 **
## exang1 -0.695550 0.532766 -1.306 0.191708
## oldpeak -0.636878 0.253941 -2.508 0.012143 *
## ca1 -1.617110 0.584633 -2.766 0.005674 **
## ca2 -2.934552 0.873648 -3.359 0.000782 ***
## ca3 -1.492465 0.994232 -1.501 0.133324
## ca4 -1.135837 2.763365 -0.411 0.681048
## thal1 3.130701 2.100989 1.490 0.136196
## thal2 2.371865 1.949207 1.217 0.223667
## thal3 0.809129 1.946088 0.416 0.677577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 136.30 on 220 degrees of freedom
## AIC: 178.3
##
## Number of Fisher Scoring iterations: 6

#slope should not be removed since it reduces aic after removal.

model <- glm(target~.-thal, training, family = binomial)


summary(model)

##
## Call:
## glm(formula = target ~ . - thal, family = binomial, data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.609153 3.473813 1.327 0.184566
## age 0.012875 0.029076 0.443 0.657895
## sex1 -2.873978 0.637989 -4.505 6.65e-06 ***
## cp1 1.608956 0.698737 2.303 0.021298 *
## cp2 2.507789 0.615335 4.075 4.59e-05 ***
## cp3 2.777243 0.775936 3.579 0.000345 ***
## trestbps -0.032936 0.013123 -2.510 0.012077 *
## chol -0.009421 0.004888 -1.928 0.053912 .
## fbs1 1.075935 0.724441 1.485 0.137492
## restecg1 -0.054437 0.450792 -0.121 0.903883
## restecg2 -0.729176 2.797157 -0.261 0.794336
## thalach 0.026316 0.014890 1.767 0.077164 .
## exang1 -0.868600 0.527297 -1.647 0.099503 .
## oldpeak -0.392701 0.244089 -1.609 0.107651
## slope1 -0.978026 0.932975 -1.048 0.294506
## slope2 0.769392 1.031594 0.746 0.455771
## ca1 -1.942310 0.589225 -3.296 0.000979 ***
## ca2 -3.318062 0.894370 -3.710 0.000207 ***
## ca3 -2.177570 1.060378 -2.054 0.040016 *
## ca4 -0.998000 2.652173 -0.376 0.706698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 139.70 on 221 degrees of freedom
## AIC: 179.7
##
## Number of Fisher Scoring iterations: 6

#Thal should not be removed since it increases aic after removal.

#Final model after optimization


model <- glm(target~.-age-chol-restecg-exang, training, family = binomial)
summary(model)

##
## Call:
## glm(formula = target ~ . - age - chol - restecg - exang, family =
binomial,
## data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.17499 3.65481 -0.048 0.961812
## sex1 -2.39086 0.69210 -3.454 0.000551 ***
## cp1 1.46593 0.67782 2.163 0.030562 *
## cp2 2.85714 0.63490 4.500 6.79e-06 ***
## cp3 2.98425 0.83328 3.581 0.000342 ***
## trestbps -0.03439 0.01311 -2.623 0.008729 **
## fbs1 1.30984 0.78043 1.678 0.093277 .
## thalach 0.02941 0.01465 2.008 0.044618 *
## oldpeak -0.47145 0.26303 -1.792 0.073080 .
## slope1 -0.96292 0.97900 -0.984 0.325326
## slope2 0.83988 1.05150 0.799 0.424442
## ca1 -2.08207 0.64692 -3.218 0.001289 **
## ca2 -3.38838 0.91279 -3.712 0.000206 ***
## ca3 -1.80585 1.09189 -1.654 0.098153 .
## ca4 -1.31700 3.06447 -0.430 0.667367
## thal1 3.97623 2.87657 1.382 0.166886
## thal2 2.73084 2.76286 0.988 0.322950
## thal3 1.14514 2.74812 0.417 0.676900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 332.26 on 240 degrees of freedom
## Residual deviance: 130.19 on 223 degrees of freedom
## AIC: 166.19
##
## Number of Fisher Scoring iterations: 6

#Check model assumptions


library(car)

library(performance)

#Test of goodness of fit


check_predictions(model)
#The models predicted data fits the observed data hence the
#assumption of goodness of fit is not violated.

#Check outliers
check_outliers(model)

## OK: No outliers detected.


## - Based on the following method and threshold: cook (0.9).
## - For variable: (Whole model)

check_outliers(model) %>% plot()


plot(model, which = 4)
#Check for linearity
check_autocorrelation(model)

## OK: Residuals appear to be independent and not autocorrelated (p = 0.876).

check_residuals(model) %>% plot()

#Check autocorrelation
check_autocorrelation(model)

## OK: Residuals appear to be independent and not autocorrelated (p = 0.940).

#Check for multicollinearity


check_collinearity(model)

## # Check for Multicollinearity


##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## sex 1.80 [1.55, 2.16] 1.34 0.56 [0.46, 0.65]
## cp 1.85 [1.59, 2.23] 1.36 0.54 [0.45, 0.63]
## trestbps 1.19 [1.08, 1.44] 1.09 0.84 [0.70, 0.93]
## fbs 1.23 [1.11, 1.48] 1.11 0.81 [0.68, 0.90]
## thalach 1.43 [1.26, 1.71] 1.19 0.70 [0.59, 0.79]
## oldpeak 1.45 [1.28, 1.74] 1.21 0.69 [0.58, 0.78]
## slope 1.90 [1.63, 2.29] 1.38 0.53 [0.44, 0.61]
## ca 2.11 [1.79, 2.55] 1.45 0.47 [0.39, 0.56]
## thal 1.75 [1.51, 2.11] 1.32 0.57 [0.47, 0.66]

#Check for independence of residuals


durbinWatsonTest(model)

## lag Autocorrelation D-W Statistic p-value


## 1 -0.009517289 2.018653 0.964
## Alternative hypothesis: rho != 0

#Interpret model predictions


library(ggeffects)
ggeffect(model)

##
## $sex
## # Predicted probabilities of target
##
## sex | Predicted | 95% CI
## ----------------------------
## 0 | 0.86 | 0.68, 0.95
## 1 | 0.37 | 0.24, 0.52
##
##
## $cp
## # Predicted probabilities of target
##
## cp | Predicted | 95% CI
## ---------------------------
## 0 | 0.25 | 0.15, 0.40
## 1 | 0.60 | 0.31, 0.83
## 2 | 0.86 | 0.69, 0.94
## 3 | 0.87 | 0.62, 0.96
##
##
## $trestbps
## # Predicted probabilities of target
##
## trestbps | Predicted | 95% CI
## ---------------------------------
## 94 | 0.82 | 0.62, 0.93
## 107 | 0.75 | 0.58, 0.87
## 121 | 0.65 | 0.52, 0.76
## 134 | 0.54 | 0.42, 0.65
## 147 | 0.43 | 0.29, 0.58
## 160 | 0.33 | 0.17, 0.54
## 173 | 0.24 | 0.09, 0.50
## 200 | 0.11 | 0.02, 0.43

##
## $fbs
## # Predicted probabilities of target
##
## fbs | Predicted | 95% CI
## ----------------------------
## 0 | 0.51 | 0.39, 0.63
## 1 | 0.80 | 0.48, 0.94

##
## $thalach
## # Predicted probabilities of target
##
## thalach | Predicted | 95% CI
## --------------------------------
## 70 | 0.11 | 0.01, 0.57
## 85 | 0.16 | 0.03, 0.57
## 105 | 0.25 | 0.08, 0.58
## 120 | 0.35 | 0.16, 0.59
## 135 | 0.45 | 0.30, 0.61
## 155 | 0.60 | 0.48, 0.70
## 170 | 0.70 | 0.53, 0.82
## 205 | 0.87 | 0.56, 0.97

## $oldpeak
## # Predicted probabilities of target
##
## oldpeak | Predicted | 95% CI
## --------------------------------
## 0.00 | 0.67 | 0.51, 0.80
## 0.50 | 0.62 | 0.49, 0.73
## 1.50 | 0.50 | 0.37, 0.64
## 2.50 | 0.39 | 0.20, 0.62
## 3.00 | 0.33 | 0.14, 0.62
## 3.50 | 0.28 | 0.09, 0.62
## 4.50 | 0.20 | 0.04, 0.62
## 6.00 | 0.11 | 0.01, 0.63

##
## $slope
## # Predicted probabilities of target
##
## slope | Predicted | 95% CI
## ------------------------------
## 0 | 0.57 | 0.17, 0.89
## 1 | 0.33 | 0.19, 0.51
## 2 | 0.75 | 0.58, 0.87
##
##
## $ca
## # Predicted probabilities of target
##
## ca | Predicted | 95% CI
## ---------------------------
## 0 | 0.78 | 0.65, 0.87
## 1 | 0.30 | 0.13, 0.55
## 2 | 0.11 | 0.02, 0.37
## 3 | 0.36 | 0.07, 0.81
## 4 | 0.48 | 0.00, 1.00
##
##
## $thal
## # Predicted probabilities of target
##
## thal | Predicted | 95% CI
## -----------------------------
## 0 | 0.13 | 0.00, 0.97
## 1 | 0.89 | 0.59, 0.98
## 2 | 0.69 | 0.54, 0.81
## 3 | 0.31 | 0.18, 0.50
##
## ## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "model"

ggpredict(model, terms = "sex")

## # Predicted probabilities of target


##
## sex | Predicted | 95% CI
## ----------------------------
## 0 | 0.31 | 0.00, 0.99
## 1 | 0.04 | 0.00, 0.93
##
## Adjusted for:
## * age = 54.52
## * cp = 0
## * trestbps = 131.91
## * chol = 244.69
## * fbs = 0
## * restecg = 0
## * thalach = 149.77
## * exang = 0
## * oldpeak = 1.03
## * slope = 0
## * ca = 0
## * thal = 0

#Visualize model predictions


theme_set(theme_bw()+theme(title = element_text
(face = "bold",
colour = "steelblue")))
ggplot(heart, aes(target, fill = target))+
geom_bar(width = 0.7)+
labs(title = "Incidence of Heart Disease",
y = "Frequency",
x = "Disease Outcome")

ggplot(training, aes(sex, fill = target))+


geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Sex",
y = "Frequency",
x = "Sex",
caption = "Africa CDC(2024)")
ggplot(training, aes(cp, fill = target))+
geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Chest pain type",
y = "Frequency",
x = "Chest Pain Type",
caption = "Africa CDC(2024)")
ggplot(training, aes(trestbps, fill = target))+
geom_histogram(binwidth = 9, position = "dodge")+
labs(title = "Incidence of Heart Disease based on Resting Blood Pressure",
y = "Frequency",
x = "Resting Blood Pressure",
caption = "Africa CDC(2024)")
ggplot(training, aes(fbs, fill = target))+
geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Fasting blood sugar",
y = "Frequency",
x = "Fasting blood sugar",
caption = "Africa CDC(2024)")
table(heart$target, heart$fbs)

##
## 0 1
## 0 116 22
## 1 141 23

ggplot(training, aes(thalach, fill = target))+


geom_histogram(binwidth = 10, position = "dodge")+
labs(title = "Incidence of Heart Disease based on Heart rate",
y = "Frequency",
x = "Maximum heart rate",
caption = "Africa CDC(2024)")
ggplot(training, aes(oldpeak, fill = target))+
geom_histogram(binwidth = 1, position = "dodge")+
labs(title = "Incidence of Heart Disease based on ST Depression",
y = "Frequency",
x = "ST Depression induced dy Excercise",
caption = "Africa CDC(2024)")
ggplot(training, aes(slope, fill = target))+
geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Slope of Peak exercise",
y = "Frequency",
x = "Slope of the peak exercise",
caption = "Africa CDC(2024)")
table(heart$target, heart$slope)

##
## 0 1 2
## 0 12 91 35
## 1 9 49 106

ggplot(training, aes(ca, fill = target))+


geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on Vessels colored by
floursopy",
y = "Frequency",
x = "Number of major vessels",
caption = "Africa CDC(2024)")
table(heart$target, heart$ca)

##
## 0 1 2 3 4
## 0 45 44 31 17 1
## 1 130 21 7 3 3

ggplot(training, aes(thal, fill = target))+


geom_bar(position = "dodge")+
labs(title = "Incidence of Heart Disease based on thal",
y = "Frequency",
caption = "Africa CDC(2024)")
table(heart$target, heart$thal)

##
## 0 1 2 3
## 0 1 12 36 89
## 1 1 6 129 28

#Checking variable importance


library(vip)

car::Anova(model)

## Analysis of Deviance Table (Type II tests)


##
## Response: target
## LR Chisq Df Pr(>Chisq)
## sex 14.541 1 0.0001372 ***
## cp 32.465 3 4.176e-07 ***
## trestbps 7.608 1 0.0058124 **
## fbs 2.931 1 0.0868757 .
## thalach 4.597 1 0.0320293 *
## oldpeak 3.401 1 0.0651615 .
## slope 10.335 2 0.0056982 **
## ca 25.131 4 4.736e-05 ***
## thal 16.569 3 0.0008666 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vip(model)

#Produce model equation


library(equatiomatic)

## Warning: package 'equatiomatic' was built under R version 4.4.2

extract_eq(model)

𝑃(target = 1)
log [ ]
1 − 𝑃(target = 1)
= 𝛼 + 𝛽1 (sex1 ) + 𝛽2 (cp1 ) + 𝛽3 (cp2 ) + 𝛽4 (cp3 ) + 𝛽5 (trestbps) + 𝛽6 (fbs1 ) + 𝛽7 (thalach)
+ 𝛽8 (oldpeak) + 𝛽9 (slope1 ) + 𝛽10 (slope2 ) + 𝛽11 (ca1 ) + 𝛽12 (ca2 ) + 𝛽13 (ca3 ) + 𝛽14 (ca4 )
+ 𝛽15 (thal1 ) + 𝛽16 (thal2 ) + 𝛽17 (thal3 )
#How well our model fits the data
library(performance)
performance(model)

## # Indices of model performance


##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss |
Score_log
## --------------------------------------------------------------------------
----
## 166.193 | 169.274 | 228.920 | 0.665 | 0.287 | 1.000 | 0.270 |
-Inf
##
## AIC | Score_spherical | PCP
## ---------------------------------
## 166.193 | 0.025 | 0.834

library(effectsize)
interpret_r2(0.665)

## [1] "substantial"
## (Rules: cohen1988)

#Making Predictions

predictions <- predict(model, testing, type = "response")


results <- data.frame(Actual = testing$target,
Predicted = predictions)
tail(results)

## Actual Predicted
## 56 0 0.05760362
## 57 1 0.07685093
## 58 1 0.98546924
## 59 0 0.75525302
## 60 0 0.05270169
## 61 1 0.95253725

#Model Evaluation
table(Actual = testing$target, Predicted = predictions > 0.3)

## Predicted
## Actual FALSE TRUE
## 0 21 7
## 1 4 29

#Accuracy
((21+29)/(21+7+4+29))*100

## [1] 81.96721

#Recall
((29)/(7+29))*100

## [1] 80.55556

#Precision
((29)/(29+4))*100

## [1] 87.87879

#F1-Score
((80.56*87.88)/(80.56+87.88))*2

## [1] 84.06095
#MCC
((29*21)-(4*7))/911.9

## [1] 0.6371313

library(pROC)

roc(target~fitted.values(model), data = training,


plot = TRUE, legacy.axes = TRUE,
print.auc = TRUE, ci = TRUE)

## Setting levels: control = 0, case = 1


## Setting direction: controls < cases

##
## Call:
## roc.formula(formula = target ~ fitted.values(model), data = training,
plot = TRUE, legacy.axes = TRUE, print.auc = TRUE, ci = TRUE)
##
## Data: fitted.values(model) in 110 controls (target 0) < 131 cases (target
1).
## Area under the curve: 0.9555
## 95% CI: 0.9318-0.9792 (DeLong)

#AUC
#0.956
#Generate model probabilities
probs <- predict(model, testing, type = "response")

#Data frame with actual target and predicted probabilities


scores <- data.frame(
Actual = testing$target,
Predicted_Probability = probs
)

# Classify the scores into risk categories


# Define risk categories based on probability thresholds
scores <- scores %>%
mutate(
Risk_Category = case_when(
Predicted_Probability < 0.3 ~ "Low Risk",
Predicted_Probability >= 0.3 & Predicted_Probability < 0.6 ~ "Medium
Risk",
Predicted_Probability >= 0.6 ~ "High Risk"
)
)

# Visualize the distribution of risk categories


library(ggplot2)
ggplot(scores, aes(x = Predicted_Probability, fill = Risk_Category)) +
geom_histogram(binwidth = 0.05, color = "black") +
labs(title = "Distribution of Predicted Probability Scores",
x = "Predicted Probability",
y = "Frequency")
# Review the first few rows of the score data
head(scores, 30)

## Actual Predicted_Probability Risk_Category


## 1 0 0.1936875725 Low Risk
## 2 0 0.0934782022 Low Risk
## 3 0 0.2997032177 Low Risk
## 4 0 0.9294019274 High Risk
## 5 0 0.8761094468 High Risk
## 6 1 0.0028066440 Low Risk
## 7 0 0.1435069905 Low Risk
## 8 1 0.8063566989 High Risk
## 9 1 0.9472678160 High Risk
## 10 0 0.0036480026 Low Risk
## 11 1 0.5850682700 Medium Risk
## 12 0 0.1976715032 Low Risk
## 13 0 0.0132086376 Low Risk
## 14 1 0.9759977332 High Risk
## 15 0 0.0016043285 Low Risk
## 16 1 0.9659359017 High Risk
## 17 0 0.0667607931 Low Risk
## 18 1 0.9706073626 High Risk
## 19 0 0.0002886308 Low Risk
## 20 1 0.9930553909 High Risk
## 21 0 0.5857953139 Medium Risk
## 22 0 0.0153843213 Low Risk
## 23 1 0.9572845471 High Risk
## 24 1 0.3382015221 Medium Risk
## 25 0 0.0004665334 Low Risk
## 26 1 0.7879620087 High Risk
## 27 0 0.0228474896 Low Risk
## 28 1 0.9777087138 High Risk
## 29 1 0.5422930068 Medium Risk
## 30 0 0.0021259245 Low Risk

You might also like