-
Notifications
You must be signed in to change notification settings - Fork 1
/
outlier_funs.R
272 lines (219 loc) · 8.01 KB
/
outlier_funs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#' Get the left and right Median Absolute Deviations (MAD) from the median for asymmetric distributions
#'
#' Suited to find outliers in asymetric distributions (in contrast to the standard mad() function which works for symmetric distributions only)
#' The function splits the values along the median and returns separate MADs for the left and the right side of the distribution.
#' https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/
#'
#' @param x A vector of numeric values.
#' @param zero_mad_action Determines the action in the event of an MAD of zero.
#' Defaults to NULL. The options are:
#' * \strong{NULL}: process runs with no warning
#' * \strong{"warn"}: a warning will be displayed
#' * \strong{"stop"}: process is stopped
#'
#' @return A numeric vector of length 2.
#'
#' @importFrom stats median
#'
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' double_mad(x)
#'
double_mad <- function(x, zero_mad_action = NULL){
if (!is.numeric(x)) {
stop("Your input must be numeric.")
}
# drop all NAs
x <- x[!is.na(x)]
# calculate the median
median_x <- stats::median(x)
# calculate the absolute deviations
abs_dev <- abs(x - median_x)
# calculate the left and the right MADs
left_mad <- stats::median(abs_dev[x <= median_x])
right_mad <- stats::median(abs_dev[x >= median_x])
# handling of MAD = 0
if (!is.null(zero_mad_action) && (left_mad == 0 || right_mad == 0)) {
if (zero_mad_action == "stop") stop("MAD is 0")
if (zero_mad_action == "warn") warning("MAD is 0")
}
return(c(left_mad, right_mad))
}
#' Calculate the distance of a value from the median of a distribution in relation to its Median Absolute Deviation (MAD)
#'
#' This function is suited to find outliers in asymetric distributions (in contrast to the standard mad() function which works for
#' symetric distributions only). The function splits the values along the median and returns the distance for every value from the
#' median, relative to the left or right side MAD.
#' https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/
#'
#' @inheritParams double_mad
#'
#' @importFrom stats median
#'
#' @return A numeric vector of length \code{length(x)}.
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' double_mad_from_median(x)
#'
double_mad_from_median <- function(x, zero_mad_action = NULL){
# get left/right MAD
two_sided_mad <- double_mad(x, zero_mad_action)
# calculate the median
median_x <- stats::median(x)
# create vector of left/right MADs (don't do it with length(x)/2 since it is possible that x == median_x exists multiple times in the vector)
left_mad <- rep(two_sided_mad[1], length(x[x <= median_x]))
right_mad <- rep(two_sided_mad[2], length(x[x > median_x]))
x_mad <- c(left_mad, right_mad)
# calculate MAD distance, that is distance of every value to the median, relative to the left/right MAD
mad_distance <- abs(x - median_x) / x_mad
mad_distance[x == median_x] <- 0
return(mad_distance)
}
#' Detect outliers using MAD from the median for asymmetric distributions
#'
#' Outlier detection based on Median Absolute Deviation (MAD) for asymmetric distributions. The function calculates the distance
#' to the median for every value in the distribution relative to the left or right side MAD. It then compares the value to your
#' threshold and labels the outliers.
#'
#' @inheritParams double_mad
#' @param threshold Z-score threshold (defaults to 3.5).
#'
#' @return A logical vector.
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' is_outlier_double_mad(x)
#'
is_outlier_double_mad <- function(x, zero_mad_action = NULL, threshold = 3.5){
ifelse(double_mad_from_median(x, zero_mad_action) >= threshold, TRUE, FALSE)
}
#' Get boundaries beyond which a value is an outlier via MAD from the median for asymmetric distributions and IQR
#'
#' Outlier detection based on Median Absolute Deviation (MAD) for asymetric distributions and interquartile range. The function
#' calculates the distance to the median for every value in the distribution relative to the left or right side MAD. It then
#' compares the value to your threshold and labels the outliers.
#'
#' @inheritParams is_outlier_double_mad
#' @param percent Indicator for the scale of the data. If function is run for percantage data, the lower limit will not be negative
#' while the upper limit does not exceed 100 percent. Defaults to TRUE.
#'
#' @importFrom stats median
#' @importFrom stats IQR
#'
#' @return A data.frame with numeric range.
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' outlier_range(x)
#'
outlier_range <- function(x, zero_mad_action = NULL, threshold = 3.5, percent = TRUE){
if (!is.numeric(x)) {
stop("Your input must be numeric.")
}
# create table
data <- data.frame(
median = stats::median(x),
iqr = stats::IQR(x),
lower = round(stats::median(x) - double_mad(x)[1] * threshold, 2),
upper = round(stats::median(x) + double_mad(x)[1] * threshold, 2)
)
# limit bandwidth in case of percentage scale
if(percent == TRUE){
data["lower"] <- ifelse(data$lower < 0, 0, data$lower)
data["upper"] <- ifelse(data$upper > 100, 100, data$upper)
}
# add label column
data["label"] <- paste0(data$lower, " - ", data$upper)
# return the table
return(data)
}
#' Detect outliers using Z-score with MAD for symmetric distributions
#'
#' Outlier detection based on Median Absolute Deviation (MAD) for symmetric distributions. The function calculates the distance
#' to the median for every value in the distribution relative to the MAD. It then compares the value to your threshold and
#' labels the outliers.
#'
#'
#' @inheritParams double_mad
#' @param threshold Z-score threshold (defaults to 3).
#' @param na.rm Remove NAs, defaults to TRUE.
#'
#' @importFrom stats median
#' @importFrom stats mad
#'
#' @return A logical vector.
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' is_outlier_single_mad(x)
#'
is_outlier_single_mad <- function(x, threshold = 3, na.rm = TRUE) {
abs(x - stats::median(x, na.rm = na.rm)) > threshold * stats::mad(x, na.rm = na.rm)
}
#' Detect outliers using classic Z-scores for symmetric distributions
#'
#' Outlier detection based on Z-scores for symetric distributions. The function calculates the Z-score, i. e. the distance of a value
#' from the mean in number of standard deviations.
#'
#'
#' @inheritParams is_outlier_single_mad
#'
#' @importFrom stats sd
#'
#' @return A logical vector.
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' is_outlier_z(x)
#'
is_outlier_z <- function(x, threshold = 3, na.rm = TRUE) {
abs(x - mean(x, na.rm = na.rm)) > threshold * stats::sd(x, na.rm = na.rm)
}
#' Detect outliers using turkey's fences
#'
#' Outlier detection based on turkey's fences. Tukey’s fences is a technique used in box plots. The non-outlier range is defined as
#' Q1−k(Q3−Q1), Q3+k(Q3−Q1), where Q1 and Q3 are the lower and upper quartiles respectively and k - some non-negative constant
#' (popular choice is 1.5).
#'
#'
#' @inheritParams double_mad
#' @param threshold Multiplier for the IQR to set outlier boundaries. Higher values widen the range; default is 1.5.
#' @param na.rm if TRUE, removes NA values before calculations. Default is TRUE.
#'
#' @importFrom stats quantile
#'
#' @return A logical vector.
#'
#' @export
#'
#' @examples
#'
#' x <- c(1, 2, 3, 3, 4, 4, 4, 5, 5.5, 6, 6, 6.5, 7, 7, 7.5, 8, 9, 12, 52, 90)
#'
#' is_outlier_turkey(x)
#'
is_outlier_turkey <- function(x, threshold = 1.5, na.rm = TRUE) {
quar <- stats::quantile(x, probs = c(0.25, 0.75), na.rm = na.rm)
iqr <- diff(quar)
(quar[1] - threshold * iqr > x) | (x > quar[2] + threshold * iqr) # must not be >= or <= since identical values would be counted as outliers
}