Skip to content

Generalized Pareto, also need param_estimate and stats_tbl #474

@spsanderson

Description

@spsanderson

Param Estimate

Function:

#' Estimate Generalized Pareto Parameters
#'
#' @family Parameter Estimation
#' @family Generalized Pareto
#' 
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the generalized Pareto shape1, shape2, and rate
#' parameters given some vector of values.
#'
#' @description The function will return a list output by default, and if the parameter
#' `.auto_gen_empirical` is set to `TRUE` then the empirical data given to the
#' parameter `.x` will be run through the `tidy_empirical()` function and combined
#' with the estimated generalized Pareto data.
#'
#' @param .x The vector of data to be passed to the function.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' set.seed(123)
#' x <- tidy_generalized_pareto(100, .shape1 = 1, .shape2 = 2, .scale = 3)[["y"]]
#' output <- util_generalized_pareto_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl %>%
#'   tidy_combined_autoplot()
#'
#' @return
#' A tibble/list
#' 
#' @name util_generalized_pareto_param_estimate
NULL

#' @export
#' @rdname util_generalized_pareto_param_estimate

util_generalized_pareto_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  
  # Checks ----
  if (!is.vector(x_term, mode = "numeric") || is.factor(x_term)) {
    rlang::abort(
      message = "'.x' must be a numeric vector.",
      use_cli_format = TRUE
    )
  }
  
  if (n < 2 || any(x_term <= 0)) {
    rlang::abort(
      message = "'.x' must contain at least two non-missing distinct values. All values of '.x' must be positive.",
      use_cli_format = TRUE
    )
  }
  
  # Negative log-likelihood function for generalized Pareto distribution
  genpareto_lik <- function(params, data) {
    shape1 <- params[1]
    shape2 <- params[2]
    scale <- params[3]
    -sum(actuar::dgenpareto(data, shape1 = shape1, shape2 = shape2, scale = scale, log = TRUE))
  }
  
  # Initial parameter guesses
  initial_params <- c(shape1 = 1, shape2 = 1, scale = 1)
  
  # Optimize to minimize the negative log-likelihood
  opt_result <- optim(
    par = initial_params,
    fn = genpareto_lik,
    data = x_term,
    method = "L-BFGS-B",
    lower = c(1e-5, 1e-5, 1e-5)
  )
  
  shape1 <- opt_result$par[1]
  shape2 <- opt_result$par[2]
  scale <- opt_result$par[3]
  rate <- 1 / scale
  
  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    td <- tidy_generalized_pareto(
      .n = n, 
      .shape1 = round(shape1, 3), 
      .shape2 = round(shape2, 3), 
      .rate = round(rate, 3)
      )
    combined_tbl <- tidy_combine_distributions(te, td)
  }
  
  ret <- dplyr::tibble(
    dist_type = "Generalized Pareto",
    samp_size = n,
    min = min(x_term),
    max = max(x_term),
    mean = mean(x_term),
    shape1 = shape1,
    shape2 = shape2,
    rate = rate,
    scale = scale
  )
  
  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "generalized_pareto"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n
  
  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }
  
  return(output)
}

Example:

> set.seed(123)
> x <- tidy_generalized_pareto(100, .shape1 = 1, .shape2 = 2, .scale = 3)[["y"]]
> output <- util_generalized_pareto_param_estimate(x)
> 
> output$parameter_tbl
# A tibble: 1 × 9
  dist_type          samp_size   min   max  mean shape1 shape2  rate scale
  <chr>                  <int> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl>
1 Generalized Pareto       100 0.688  412.  21.5   1.08   5.47  1.01 0.993
> 
> output$combined_data_tbl %>%
+   tidy_combined_autoplot()

image

Stats Tibble

Function:

#' Distribution Statistics
#'
#' @family Generalized Pareto
#' @family Distribution Statistics
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will take in a tibble and return the statistics
#' of the given type of `tidy_` distribution. It is required that data be
#' passed from a `tidy_` distribution function.
#'
#' @description Returns distribution statistics in a tibble.
#'
#' @param .data The data being passed from a `tidy_` distribution function.
#'
#' @examples
#' library(dplyr)
#'
#' tidy_generalized_pareto() |>
#'   util_generalized_pareto_stats_tbl() |>
#'   glimpse()
#'
#' @return
#' A tibble
#'
#' @name util_generalized_pareto_stats_tbl
NULL
#' @export
#' @rdname util_generalized_pareto_stats_tbl
util_generalized_pareto_stats_tbl <- function(.data) {
  
  # Immediate check for tidy_ distribution function
  if (!"tibble_type" %in% names(attributes(.data))) {
    rlang::abort(
      message = "You must pass data from the 'tidy_dist' function.",
      use_cli_format = TRUE
    )
  }
  
  if (attributes(.data)$tibble_type != "tidy_generalized_pareto") {
    rlang::abort(
      message = "You must use 'tidy_generalized_pareto()'",
      use_cli_format = TRUE
    )
  }
  
  # Data
  data_tbl <- dplyr::as_tibble(.data)
  
  atb <- attributes(data_tbl)
  shape1 <- atb$.shape1
  shape2 <- atb$.shape2
  rate <- atb$.rate
  scale <- 1 / rate
  
  stat_mean <- ifelse(shape1 <= 1, Inf, scale * shape1 / (shape1 - 1))
  stat_mode <- scale * (shape1 - 1) / shape1
  stat_median <- scale * actuar::qgenpareto(0.5, shape1 = shape1, shape2 = shape2, scale = scale)
  stat_var <- ifelse(shape1 <= 2, Inf, (scale^2 * shape1) / ((shape1 - 1)^2 * (shape1 - 2)))
  stat_skewness <- ifelse(shape1 <= 3, "undefined", 2 * (1 + shape1) / (shape1 - 3) * sqrt((shape1 - 2) / shape1))
  stat_kurtosis <- ifelse(shape1 <= 4, "undefined", 6 * (shape1^3 + shape1^2 - 6 * shape1 - 2) / (shape1 * (shape1 - 3) * (shape1 - 4)))
  
  # Data Tibble
  ret <- dplyr::tibble(
    tidy_function = atb$tibble_type,
    function_call = atb$dist_with_params,
    distribution = dist_type_extractor(atb$tibble_type),
    distribution_type = atb$distribution_family_type,
    points = atb$.n,
    simulations = atb$.num_sims,
    mean = stat_mean,
    mode = stat_mode,
    median = stat_median,
    coeff_var = sqrt(stat_var) / stat_mean,
    skewness = stat_skewness,
    kurtosis = stat_kurtosis,
    computed_std_skew = tidy_skewness_vec(data_tbl$y),
    computed_std_kurt = tidy_kurtosis_vec(data_tbl$y),
    ci_lo = ci_lo(data_tbl$y),
    ci_hi = ci_hi(data_tbl$y)
  )
  
  # Return
  return(ret)
}

Example:

> tidy_generalized_pareto() |>
+   util_generalized_pareto_stats_tbl() |>
+   glimpse()
Rows: 1
Columns: 16
$ tidy_function     <chr> "tidy_generalized_pareto"
$ function_call     <chr> "Generalized Pareto c(1, 1, 1, 1)"
$ distribution      <chr> "Generalized Pareto"
$ distribution_type <chr> "continuous"
$ points            <dbl> 50
$ simulations       <dbl> 1
$ mean              <dbl> Inf
$ mode              <dbl> 0
$ median            <dbl> 1
$ coeff_var         <dbl> NaN
$ skewness          <chr> "undefined"
$ kurtosis          <chr> "undefined"
$ computed_std_skew <dbl> 5.314218
$ computed_std_kurt <dbl> 32.13978
$ ci_lo             <dbl> 0.02209331
$ ci_hi             <dbl> 55.2022

AIC

Function:

#' Calculate Akaike Information Criterion (AIC) for Generalized Pareto Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a generalized Pareto distribution fitted to the provided data.
#'
#' @family Utility
#' 
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the shape1, shape2, and rate parameters of a generalized Pareto distribution
#' from the provided data using maximum likelihood estimation,
#' and then calculates the AIC value based on the fitted distribution.
#'
#' @param .x A numeric vector containing the data to be fitted to a generalized Pareto distribution.
#'
#' @details
#' This function fits a generalized Pareto distribution to the provided data using maximum
#' likelihood estimation. It estimates the shape1, shape2, and rate parameters
#' of the generalized Pareto distribution using maximum likelihood estimation. Then, it
#' calculates the AIC value based on the fitted distribution.
#'
#' Initial parameter estimates: The function uses the method of moments estimates
#' as starting points for the shape1, shape2, and rate parameters of the generalized Pareto distribution.
#'
#' Optimization method: The function uses the optim function for optimization.
#' You might explore different optimization methods within optim for potentially
#' better performance.
#'
#' Goodness-of-fit: While AIC is a useful metric for model comparison, it's
#' recommended to also assess the goodness-of-fit of the chosen model using
#' visualization and other statistical tests.
#'
#' @examples
#' # Example 1: Calculate AIC for a sample dataset
#' set.seed(123)
#' x <- actuar::rgenpareto(100, shape1 = 1, shape2 = 2, scale = 3)
#' util_generalized_pareto_aic(x)
#'
#' @return
#' The AIC value calculated based on the fitted generalized Pareto distribution to the provided data.
#'
#' @name util_generalized_pareto_aic
NULL

#' @export
#' @rdname util_generalized_pareto_aic
util_generalized_pareto_aic <- function(.x) {
  # Tidyeval
  x <- as.numeric(.x)
  
  # Negative log-likelihood function for generalized Pareto distribution
  neg_log_lik_genpareto <- function(par, data) {
    shape1 <- par[1]
    shape2 <- par[2]
    scale <- par[3]
    -sum(actuar::dgenpareto(data, shape1 = shape1, shape2 = shape2, scale = scale, log = TRUE))
  }
  
  # Initial parameter estimates
  pe <- TidyDensity::util_generalized_pareto_param_estimate(x)$parameter_tbl
  shape1_init <- pe$shape1
  shape2_init <- pe$shape2
  scale_init <- pe$scale
  initial_params <- c(shape1 = shape1_init, shape2 = shape2_init, scale = scale_init)
  
  # Fit generalized Pareto distribution using optim
  fit_genpareto <- stats::optim(
    par = initial_params,
    fn = neg_log_lik_genpareto,
    data = x,
    method = "L-BFGS-B",
    lower = c(1e-5, 1e-5, 1e-5)
  )
  
  # Extract log-likelihood and number of parameters
  logLik_genpareto <- -fit_genpareto$value
  k_genpareto <- 3 # Number of parameters for generalized Pareto distribution (shape1, shape2, and scale)
  
  # Calculate AIC
  AIC_genpareto <- 2 * k_genpareto - 2 * logLik_genpareto
  
  # Return AIC
  return(AIC_genpareto)
}

Example:

> set.seed(123)
> x <- actuar::rgenpareto(100, shape1 = 1, shape2 = 2, scale = 3)
> util_generalized_pareto_aic(x)
[1] 742.2865

Metadata

Metadata

Assignees

Labels

enhancementNew feature or request

Projects

Status

Done

Relationships

None yet

Development

No branches or pull requests

Issue actions