In this vignette we show a pipeline for predicting individual claim counts using the ReSurv package.
This vignette was created for the 2024 Reserving Call Paper Program on Technology and the Reserving Actuary of the Casualty Actuarial Society (CAS) and the CAS Reserves Working Group. We would like to thank the team of reviewers whose feedback improved our package.
To install the developer version of the ReSurv package:
library(devtools)
devtools::install_github("edhofman/resurv")
library(ReSurv)
packageVersion("ReSurv")
To handle the Python dependencies at the first package installation, we suggest creating a dedicated virtual environment. Users who are not interested in our neural-network-based models can disregard the remainder of this section.
The default name of the virtual environment is "pyresurv".
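The package does not mandate a specific tool for creating the environment; a minimal sketch using reticulate (an assumption, not the only option):

library(reticulate)
# create a virtual environment matching the package's default name
virtualenv_create(envname = "pyresurv")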
We then suggest refreshing the R session and importing the ReSurv package in R using library(ReSurv).
We simulate a synthetic data set from scenario Alpha.
input_data <- data_generator(random_seed = 7,
scenario = "alpha",
time_unit = 1 / 360,
years = 4,
period_exposure = 200)
We inspect the data set.
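For instance (a minimal sketch; any inspection tool works):

# first records and column types of the simulated data
head(input_data)
str(input_data)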
The data is pre-processed using the IndividualDataPP function.
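The remainder of the vignette assumes a quarterly-output pre-processing stored in individual_data. A minimal sketch, mirroring the yearly and monthly IndividualDataPP calls shown later (output_time_granularity = "quarters" is our assumption, consistent with the quarterly results below):

individual_data <- IndividualDataPP(input_data,
                                    id = "claim_number",
                                    continuous_features = "AP_i",
                                    categorical_features = "claim_type",
                                    accident_period = "AP",
                                    calendar_period = "RP",
                                    input_time_granularity = "days",
                                    output_time_granularity = "quarters",
                                    years = 4,
                                    continuous_features_spline = NULL,
                                    calendar_period_extrapolation = FALSE)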
This section first shows the usage of the ReSurvCV function for cross-validation. Second, we show how to combine the ReSurvCV function with the ParBayesianOptimization package routine to perform Bayesian optimization of the hyperparameters.
Example of K-Fold cross-validation (CV) for the XGB model.
resurv_cv_xgboost <- ReSurvCV(IndividualDataPP = individual_data,
model = "XGB",
hparameters_grid = list(booster = "gbtree",
eta = c(.001),
max_depth = c(3),
subsample = c(1),
alpha = c(0),
lambda = c(0),
min_child_weight = c(.5)),
print_every_n = 1L,
nrounds = 1,
verbose = FALSE,
verbose.cv = TRUE,
early_stopping_rounds = 1,
folds = 5,
parallel = TRUE,
ncores = 2,
random_seed = 1)
Example of K-Fold cross-validation (CV) for the NN model.
resurv_cv_nn <- ReSurvCV(IndividualDataPP = individual_data,
model = "NN",
hparameters_grid = list(num_layers = c(1, 2),
num_nodes = c(2, 4),
optim = "Adam",
activation = "ReLU",
lr = .5,
xi = .5,
eps = .5,
tie = "Efron",
batch_size = as.integer(5000),
early_stopping = "TRUE",
patience = 20),
epochs = as.integer(300),
num_workers = 0,
verbose = FALSE,
verbose.cv = TRUE,
folds = 3,
parallel = FALSE,
random_seed = as.integer(Sys.time()))
Example of Bayesian Optimization of Hyperparameters for the XGB model.
bounds <- list(eta = c(0, 1),
max_depth = c(1L, 25L),
min_child_weight = c(0, 50),
subsample = c(0.51, 1),
lambda = c(0, 15),
alpha = c(0, 15))
obj_func <- function(eta,
max_depth,
min_child_weight,
subsample,
lambda,
alpha) {
xgbcv <- ReSurvCV(
IndividualDataPP = individual_data,
model = "XGB",
hparameters_grid = list(
booster = "gbtree",
eta = eta,
max_depth = max_depth,
subsample = subsample,
alpha = alpha,
lambda = lambda,
min_child_weight = min_child_weight
),
print_every_n = 1L,
nrounds = 500,
verbose = FALSE,
verbose.cv = TRUE,
early_stopping_rounds = 30,
folds = 3,
parallel = FALSE,
random_seed = as.integer(Sys.time())
)
lst <- list(Score = -xgbcv$out.cv.best.oos$test.lkh,
train.lkh = xgbcv$out.cv.best.oos$train.lkh)
return(lst)
}
bayes_out <- bayesOpt(FUN = obj_func,
bounds = bounds,
initPoints = 50,
iters.n = 1000,
iters.k = 50,
otherHalting = list(timeLimit = 18000))
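The best hyperparameter set found by the run can be retrieved with getBestPars from the ParBayesianOptimization package (best_pars_xgb is our name for the result):

best_pars_xgb <- getBestPars(bayes_out)
best_pars_xgb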
Example of Bayesian Optimization of Hyperparameters for the NN model.
bounds <- list(num_layers = c(2L, 10L),
num_nodes = c(2L, 10L),
optim = c(1L, 2L),
activation = c(1L, 2L),
lr = c(0.005, 0.5),
xi = c(0, 0.5),
eps = c(0, 0.5))
obj_func <- function(num_layers,
num_nodes,
optim,
activation,
lr,
xi,
eps) {
optim <- switch(optim,
"Adam",
"SGD")
activation <- switch(activation, "LeakyReLU", "SELU")
batch_size <- as.integer(5000L)
number_layers <- as.integer(num_layers)
num_nodes <- as.integer(num_nodes)
deepsurv_cv <- ReSurvCV(IndividualDataPP = individual_data,
model = "NN",
hparameters_grid = list(num_layers = number_layers,
num_nodes = num_nodes,
optim = optim,
activation = activation,
lr = lr,
xi = xi,
eps = eps,
tie = "Efron",
batch_size = batch_size,
early_stopping = "TRUE",
patience = 20),
epochs = as.integer(300),
num_workers = 0,
verbose = FALSE,
verbose.cv = TRUE,
folds = 3,
parallel = FALSE,
random_seed = as.integer(Sys.time()))
lst <- list(Score = -deepsurv_cv$out.cv.best.oos$test.lkh,
train.lkh = deepsurv_cv$out.cv.best.oos$train.lkh)
return(lst)
}
bayes_out <- bayesOpt(FUN = obj_func,
bounds = bounds,
initPoints = 50,
iters.n = 1000,
iters.k = 50,
otherHalting = list(timeLimit = 18000))
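As before, the winning configuration can be retrieved with getBestPars(bayes_out). Note that optim and activation come back as the integer codes used in bounds and must be mapped to labels with the same switch calls as in obj_func; a minimal sketch:

best_pars_nn <- getBestPars(bayes_out)
# map the integer-coded choices back to their labels
best_pars_nn$optim <- switch(best_pars_nn$optim, "Adam", "SGD")
best_pars_nn$activation <- switch(best_pars_nn$activation, "LeakyReLU", "SELU")
best_pars_nn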
In this section we show the usage of the ReSurv method for the estimation of our models' parameters. The hyperparameters for NN and XGB were derived with the Bayesian optimization procedure.
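The prediction code below also uses a fitted Cox model, resurv_fit_cox. A minimal sketch of that fit, assuming "COX" is the corresponding hazard_model label (consistent with the plot titles later in this vignette):

resurv_fit_cox <- ReSurv(individual_data,
                         hazard_model = "COX")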
hparameters_xgb <- list(
params = list(
booster = "gbtree",
eta = 0.9611239,
subsample = 0.62851,
alpha = 5.836211,
lambda = 15,
min_child_weight = 29.18158,
max_depth = 1
),
print_every_n = 0,
nrounds = 3000,
verbose = FALSE,
early_stopping_rounds = 500
)
resurv_fit_xgb <- ReSurv(individual_data,
hazard_model = "XGB",
hparameters = hparameters_xgb)
hparameters_nn <- list(num_layers = 2,
                       early_stopping = TRUE,
                       patience = 350,
                       verbose = FALSE,
                       network_structure = NULL,
                       num_nodes = 10,
                       activation = "LeakyReLU",
                       optim = "SGD",
                       lr = 0.02226655,
                       xi = 0.4678993,
                       epsilon = 0,
                       batch_size = 5000L,
                       epochs = 5500L,
                       num_workers = 0,
                       tie = "Efron")
resurv_fit_nn <- ReSurv(individual_data,
hazard_model = "NN",
hparameters = hparameters_nn)
In this section we show how to use our models for predictions with the predict method and for different output time granularities (quarterly, yearly, and monthly). The output can be displayed using the summary and print methods.
resurv_fit_predict_q <- predict(resurv_fit_cox)
individual_data_y <- IndividualDataPP(input_data,
id = "claim_number",
continuous_features = "AP_i",
categorical_features = "claim_type",
accident_period = "AP",
calendar_period = "RP",
input_time_granularity = "days",
output_time_granularity = "years",
years = 4,
continuous_features_spline = NULL,
calendar_period_extrapolation = FALSE)
resurv_fit_predict_y <- predict(resurv_fit_cox,
newdata = individual_data_y,
grouping_method = "probability")
individual_data_m <- IndividualDataPP(input_data,
id = "claim_number",
continuous_features = "AP_i",
categorical_features = "claim_type",
accident_period = "AP",
calendar_period = "RP",
input_time_granularity = "days",
output_time_granularity = "months",
years = 4,
continuous_features_spline = NULL,
calendar_period_extrapolation = FALSE)
resurv_fit_predict_m <- predict(resurv_fit_cox,
newdata = individual_data_m,
grouping_method = "probability")
model_s <- summary(resurv_fit_predict_y)
print(model_s)
Similar functions apply to the other models.
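The scoring code at the end of this vignette references resurv_predict_xgb and resurv_predict_nn; a minimal sketch producing the default (quarterly) predictions for the XGB and NN fits:

resurv_predict_xgb <- predict(resurv_fit_xgb)
resurv_predict_nn <- predict(resurv_fit_nn)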
In this section we show an example of a development-factor plot for the COX model, together with the CRPS calculation using the survival_crps method.
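A minimal usage sketch of the CRPS computation, assuming survival_crps accepts a fitted ReSurv object and returns record-level scores:

# record-level CRPS (assumed interface)
crps_cox <- survival_crps(resurv_fit_cox)
head(crps_cox)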
We first extract the output group code for an arbitrary group.
resurv_fit_predict_q$long_triangle_format_out$output_granularity %>%
filter(AP_o==15 & claim_type == 1) %>%
filter(row_number()==1) %>%
select(group_o)
We use the group code to plot the feature dependent development factors.
p1 <- input_data %>%
as.data.frame() %>%
mutate(claim_type = as.factor(claim_type)) %>%
ggplot(aes(x = RT - AT, color = claim_type)) +
stat_ecdf(linewidth = 1) +
labs(title = "", x = "Notification delay (in days)", y = "ECDF") +
xlim(0.01, 1500) +
scale_color_manual(
values = c("royalblue", "#a71429"),
labels = c("Claim type 0", "Claim type 1")
) +
scale_linetype_manual(values = c(1, 3),
labels = c("Claim type 0", "Claim type 1")) +
guides(
color = guide_legend(
title = "Claim type",
override.aes = list(
color = c("royalblue", "#a71429"),
linewidth = 2
)
),
linetype = guide_legend(
title = "Claim type",
override.aes = list(linetype = c(1, 3), linewidth = 0.7)
)
) +
theme_bw() +
theme(
axis.text = element_text(size = 20),
axis.title.y = element_text(size = 20),
axis.title.x = element_text(size = 20),
legend.text = element_text(size = 20)
)
p1
p2 <- input_data %>%
as.data.frame() %>%
mutate(claim_type = as.factor(claim_type)) %>%
ggplot(aes(x = claim_type, fill = claim_type)) +
geom_bar() +
scale_fill_manual(
values = c("royalblue", "#a71429"),
labels = c("Claim type 0", "Claim type 1")
) +
guides(fill = guide_legend(title = "Claim type")) +
theme_bw() +
labs(title = "", x = "Claim Type", y = "") +
theme(
axis.text = element_text(size = 20),
axis.title.y = element_text(size = 20),
axis.title.x = element_text(size = 20),
legend.text = element_text(size = 20)
)
p2
Chain-Ladder case.
clmodel <- resurv_fit_cox$IndividualDataPP$training.data %>%
mutate(DP_o = 16 -
DP_rev_o + 1) %>%
group_by(AP_o, DP_o) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(AP_o) %>%
arrange(DP_o) %>%
mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>%
ungroup() %>%
group_by(DP_o) %>%
reframe(df_o = sum(I_cum * (
AP_o <= max(resurv_fit_cox$IndividualDataPP$training.data$AP_o) - DP_o +
1
)) /
sum(I_cum_lag * (
AP_o <= max(resurv_fit_cox$IndividualDataPP$training.data$AP_o) - DP_o +
1
)),
I = sum(I * (
AP_o <= max(resurv_fit_cox$IndividualDataPP$training.data$AP_o) - DP_o
))) %>%
mutate(DP_o_join = DP_o - 1) %>%
as.data.frame()
clmodel %>%
filter(DP_o > 1) %>%
ggplot(aes(x = DP_o, y = df_o)) +
geom_line(linewidth = 2.5,
color = "#454555") +
labs(title = "Chain ladder",
x = "Development quarter",
y = "Development factor") +
ylim(1, 3. + .01) +
theme_bw(base_size = rel(5)) +
theme(plot.title = element_text(size = 20))
## Chain ladder, monthly output
clmodel_months <- individual_data_m$training.data %>%
mutate(DP_o = 48 -
DP_rev_o + 1) %>%
group_by(AP_o, DP_o) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(AP_o) %>%
arrange(DP_o) %>%
mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>%
ungroup() %>%
group_by(DP_o) %>%
reframe(df_o = sum(I_cum * (
AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1
)) /
sum(I_cum_lag * (
AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1
)),
I = sum(I * (
AP_o <= max(individual_data_m$training.data$AP_o) - DP_o
))) %>%
mutate(DP_o_join = DP_o - 1) %>%
as.data.frame()
ticks_at <- seq(1, 48, 4)
labels_as <- as.character(ticks_at)
clmodel_months %>%
filter(DP_o > 1) %>%
ggplot(aes(x = DP_o,
y = df_o)) +
geom_line(linewidth = 2.5,
color = "#454555") +
labs(title = "Chain ladder",
x = "Development month",
y = "Development factor") +
ylim(1, 2.5 + .01) +
scale_x_continuous(breaks = ticks_at, labels = labels_as) +
theme_bw(base_size = rel(5)) +
theme(plot.title = element_text(size = 20))
## Chain ladder, yearly output
clmodel_years <- individual_data_y$training.data %>%
mutate(DP_o = 4 -
DP_rev_o + 1) %>%
group_by(AP_o, DP_o) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(AP_o) %>%
arrange(DP_o) %>%
mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>%
ungroup() %>%
group_by(DP_o) %>%
reframe(df_o = sum(I_cum * (
AP_o <= max(individual_data_y$training.data$AP_o) - DP_o + 1
)) /
sum(I_cum_lag * (
AP_o <= max(individual_data_y$training.data$AP_o) - DP_o + 1
)),
I = sum(I * (
AP_o <= max(individual_data_y$training.data$AP_o) - DP_o
))) %>%
mutate(DP_o_join = DP_o - 1) %>%
as.data.frame()
ticks_at <- seq(1, 4, 1)
labels_as <- as.character(ticks_at)
clmodel_years %>%
filter(DP_o > 1) %>%
ggplot(aes(x = DP_o,
y = df_o)) +
geom_line(linewidth = 2.5,
color = "#454555") +
labs(title = "Chain ladder",
x = "Development year",
y = "Development factor") +
ylim(1, 2.5 + .01) +
scale_x_continuous(breaks = ticks_at, labels = labels_as) +
theme_bw(base_size = rel(5)) +
theme(plot.title = element_text(size = 20))
COX, quarterly output.
ct <- 0
ap <- 12
resurv_fit_predict_q$long_triangle_format_out$output_granularity %>%
filter(AP_o == ap & claim_type == ct) %>%
filter(row_number() == 1) %>%
select(group_o)
plot(
resurv_fit_predict_q,
granularity = "output",
title_par = "COX: Accident Quarter 12 Claim Type 0",
group_code = 23
)
ct <- 0
ap <- 36
resurv_fit_predict_m$long_triangle_format_out$output_granularity %>%
filter(AP_o == ap & claim_type == ct) %>%
filter(row_number() == 1) %>%
select(group_o)
plot(
resurv_fit_predict_m,
granularity = "output",
color_par = "#a71429",
title_par = "COX: Accident Month 36 Claim Type 0",
group_code = 71
)
ct <- 1
ap <- 7
resurv_fit_predict_m$long_triangle_format_out$output_granularity %>%
filter(AP_o == ap & claim_type == ct) %>%
filter(row_number() == 1) %>%
select(group_o)
plot(
resurv_fit_predict_m,
granularity = "output",
color_par = "#a71429",
title_par = "COX: Accident Month 7 Claim Type 1",
group_code = 14
)
ct <- 0
ap <- 2
resurv_fit_predict_y$long_triangle_format_out$output_granularity %>%
filter(AP_o == ap & claim_type == ct) %>%
filter(row_number() == 1) %>%
select(group_o)
plot(
resurv_fit_predict_y,
granularity = "output",
color_par = "#FF6A7A",
title_par = "COX: Accident Year 2 Claim Type 0",
group_code = 3
)
ct <- 1
ap <- 3
resurv_fit_predict_y$long_triangle_format_out$output_granularity %>%
filter(AP_o == ap & claim_type == ct) %>%
filter(row_number() == 1) %>%
select(group_o)
plot(
resurv_fit_predict_y,
granularity = "output",
color_par = "#FF6A7A",
title_par = "COX: Accident Year 3 Claim Type 1",
group_code = 6
)
The code below can be used to compute the performance metrics in Table 6 of the manuscript.
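In our notation (an interpretation of the code, not taken from the manuscript), the two metrics are

\[
\mathrm{ARE}^{\mathrm{TOT}} = \frac{\sum_{i,j}\bigl|I_{i,j}-\widehat{I}_{i,j}\bigr|}{\sum_{i,j} I_{i,j}},
\qquad
\mathrm{ARE}^{\mathrm{CAL}} = \frac{\sum_{i,j}\bigl|\widehat{C}_{i,j}-C_{i,j}\bigr|}{\sum_{i,j} I_{i,j}},
\]

where \(I_{i,j}\) and \(\widehat{I}_{i,j}\) are the observed and predicted incremental counts for accident period \(i\) and development period \(j\), \(C_{i,j}\) and \(\widehat{C}_{i,j}\) the corresponding cumulative counts, and the sums run over the out-of-sample cells.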
conversion_factor <- individual_data$conversion_factor
max_dp_i <- 1440
true_output <- resurv_fit_cox$IndividualDataPP$full.data %>%
mutate(
DP_rev_o = floor(max_dp_i * conversion_factor) -
ceiling(
DP_i * conversion_factor +
((AP_i - 1) %% (
1 / conversion_factor
)) * conversion_factor) + 1,
AP_o = ceiling(AP_i * conversion_factor),
TR_o = AP_o - 1
) %>%
filter(DP_rev_o <= TR_o) %>%
group_by(claim_type, AP_o, DP_rev_o) %>%
mutate(claim_type = as.character(claim_type)) %>%
summarize(I = sum(I), .groups = "drop") %>%
filter(DP_rev_o > 0)
out_list <- resurv_fit_predict_q$long_triangle_format_out
out <- out_list$output_granularity
#Total output
score_total <- out[, c("claim_type", "AP_o", "DP_o", "expected_counts")] %>%
mutate(DP_rev_o = 16 - DP_o + 1) %>%
inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>%
mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>%
# aggregate over claim types to compute the total ARE
ungroup() %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave = abs(sum(ave)), I = sum(I))
are_tot <- sum(score_total$abs_ave) / sum(score_total$I)
dfs_output <- out %>%
mutate(DP_rev_o = 16 - DP_o + 1) %>%
select(AP_o, claim_type, DP_rev_o, f_o) %>%
mutate(DP_rev_o = DP_rev_o) %>%
distinct()
# Cash flow on the output scale, i.e. quarterly cash-flow development
score_diagonal <- resurv_fit_cox$IndividualDataPP$full.data %>%
mutate(
DP_rev_o = floor(max_dp_i * conversion_factor) -
ceiling(
DP_i * conversion_factor +
((AP_i - 1) %% (
1 / conversion_factor
)) * conversion_factor) + 1,
AP_o = ceiling(AP_i * conversion_factor)
) %>%
group_by(claim_type, AP_o, DP_rev_o) %>%
mutate(claim_type = as.character(claim_type)) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(claim_type, AP_o) %>%
arrange(desc(DP_rev_o)) %>%
mutate(I_cum = cumsum(I)) %>%
mutate(I_cum_lag = lag(I_cum, default = 0)) %>%
left_join(dfs_output, by = c("AP_o", "claim_type", "DP_rev_o")) %>%
mutate(I_cum_hat = I_cum_lag * f_o,
RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>%
inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct()
, by = c("AP_o", "DP_rev_o")) %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I))
are_cal_q <- sum(score_diagonal$abs_ave2_diag) / sum(score_diagonal$I)
# scoring XGB ----
out_xgb <- resurv_predict_xgb$long_triangle_format_out$output_granularity
score_total_xgb <- out_xgb[, c("claim_type",
"AP_o",
"DP_o",
"expected_counts")] %>%
mutate(DP_rev_o = 16 - DP_o + 1) %>%
inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>%
mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>%
# aggregate over claim types to compute the total ARE
ungroup() %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave = abs(sum(ave)), I = sum(I))
are_tot_xgb <- sum(score_total_xgb$abs_ave) / sum(score_total_xgb$I)
dfs_output_xgb <- out_xgb %>%
mutate(DP_rev_o = 16 - DP_o + 1) %>%
select(AP_o, claim_type, DP_rev_o, f_o) %>%
mutate(DP_rev_o = DP_rev_o) %>%
distinct()
# Cash flow on the output scale, i.e. quarterly cash-flow development
score_diagonal_xgb <- resurv_fit_cox$IndividualDataPP$full.data %>%
mutate(
DP_rev_o = floor(max_dp_i * conversion_factor) -
ceiling(
DP_i * conversion_factor +
((AP_i - 1) %% (
1 / conversion_factor
)) * conversion_factor) + 1,
AP_o = ceiling(AP_i * conversion_factor)
) %>%
group_by(claim_type, AP_o, DP_rev_o) %>%
mutate(claim_type = as.character(claim_type)) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(claim_type, AP_o) %>%
arrange(desc(DP_rev_o)) %>%
mutate(I_cum = cumsum(I)) %>%
mutate(I_cum_lag = lag(I_cum, default = 0)) %>%
left_join(dfs_output_xgb, by = c("AP_o", "claim_type", "DP_rev_o")) %>%
mutate(I_cum_hat = I_cum_lag * f_o,
RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>%
inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct()
, by = c("AP_o", "DP_rev_o")) %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I))
are_cal_q_xgb <- sum(score_diagonal_xgb$abs_ave2_diag) / sum(score_diagonal_xgb$I)
# scoring NN ----
out_nn <- resurv_predict_nn$long_triangle_format_out$output_granularity
score_total_nn <- out_nn[, c("claim_type",
"AP_o",
"DP_o",
"expected_counts")] %>%
mutate(DP_rev_o = 16 - DP_o + 1) %>%
inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>%
mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>%
# aggregate over claim types to compute the total ARE
ungroup() %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave = abs(sum(ave)), I = sum(I))
are_tot_nn <- sum(score_total_nn$abs_ave) / sum(score_total_nn$I)
dfs_output_nn <- out_nn %>%
mutate(DP_rev_o = 16 - DP_o + 1) %>%
select(AP_o, claim_type, DP_rev_o, f_o) %>%
mutate(DP_rev_o = DP_rev_o) %>%
distinct()
# Cash flow on the output scale, i.e. quarterly cash-flow development
score_diagonal_nn <- resurv_fit_cox$IndividualDataPP$full.data %>%
mutate(
DP_rev_o = floor(max_dp_i * conversion_factor) -
ceiling(
DP_i * conversion_factor +
((AP_i - 1) %% (
1 / conversion_factor
)) * conversion_factor) + 1,
AP_o = ceiling(AP_i * conversion_factor)
) %>%
group_by(claim_type, AP_o, DP_rev_o) %>%
mutate(claim_type = as.character(claim_type)) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(claim_type, AP_o) %>%
arrange(desc(DP_rev_o)) %>%
mutate(I_cum = cumsum(I)) %>%
mutate(I_cum_lag = lag(I_cum, default = 0)) %>%
left_join(dfs_output_nn, by = c("AP_o", "claim_type", "DP_rev_o")) %>%
mutate(I_cum_hat = I_cum_lag * f_o,
RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>%
inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct()
, by = c("AP_o", "DP_rev_o")) %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I))
are_cal_q_nn <- sum(score_diagonal_nn$abs_ave2_diag) / sum(score_diagonal_nn$I)
# Scoring Chain-Ladder ----
true_output_cl <- individual_data$full.data %>%
mutate(
DP_rev_o = floor(max_dp_i * conversion_factor) -
ceiling(
DP_i * conversion_factor +
((AP_i - 1) %% (
1 / conversion_factor
)) * conversion_factor) + 1,
AP_o = ceiling(AP_i * conversion_factor),
# TR_o is needed by the filter below; it was missing from this mutate
TR_o = AP_o - 1
) %>%
filter(DP_rev_o <= TR_o) %>%
mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>%
group_by(AP_o, DP_o, DP_rev_o) %>%
summarize(I = sum(I), .groups = "drop") %>%
filter(DP_rev_o > 0)
latest_observed <- individual_data$training.data %>%
filter(DP_rev_o >= TR_o) %>%
mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>%
group_by(AP_o) %>%
mutate(DP_max = max(DP_o)) %>%
group_by(AP_o, DP_max) %>%
summarize(I = sum(I), .groups = "drop")
clmodel <- individual_data$training.data %>%
mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>%
group_by(AP_o, DP_o) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(AP_o) %>%
arrange(DP_o) %>%
mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>%
ungroup() %>%
group_by(DP_o) %>%
reframe(df = sum(I_cum * (
AP_o <= max(individual_data$training.data$AP_o) - DP_o + 1
)) /
sum(I_cum_lag * (
AP_o <= max(individual_data$training.data$AP_o) - DP_o + 1
)), I = sum(I * (
AP_o <= max(individual_data$training.data$AP_o) - DP_o
))) %>%
mutate(DP_o_join = DP_o) %>%
mutate(DP_rev_o = max(DP_o) - DP_o + 1)
predictions <- expand.grid(AP_o = latest_observed$AP_o,
DP_o = clmodel$DP_o_join) %>%
left_join(clmodel[, c("DP_o_join", "df")], by = c("DP_o" = "DP_o_join")) %>%
left_join(latest_observed, by = "AP_o") %>%
rowwise() %>%
filter(DP_o > DP_max) %>%
ungroup() %>%
group_by(AP_o) %>%
arrange(DP_o) %>%
mutate(df_cum = cumprod(df)) %>%
mutate(I_expected = I * df_cum - I * lag(df_cum, default = 1)) %>%
select(DP_o, AP_o, I_expected)
conversion_factor <- individual_data$conversion_factor
max_dp_i <- 1440
score_total <- predictions %>%
inner_join(true_output_cl, by = c("AP_o", "DP_o")) %>%
mutate(ave = I - I_expected, abs_ave = abs(ave)) %>%
# aggregate to compute the total ARE
ungroup() %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave = abs(sum(ave)), I = sum(I))
are_tot <- sum(score_total$abs_ave) / sum(score_total$I)
score_diagonal <- individual_data$full.data %>%
mutate(
DP_rev_o = floor(max_dp_i * conversion_factor) -
ceiling(
DP_i * conversion_factor +
((AP_i - 1) %% (
1 / conversion_factor
)) * conversion_factor) + 1,
AP_o = ceiling(AP_i * conversion_factor)
) %>%
group_by(claim_type, AP_o, DP_rev_o) %>%
mutate(claim_type = as.character(claim_type)) %>%
summarize(I = sum(I), .groups = "drop") %>%
group_by(claim_type, AP_o) %>%
arrange(desc(DP_rev_o)) %>%
mutate(I_cum = cumsum(I)) %>%
mutate(I_cum_lag = lag(I_cum, default = 0)) %>%
mutate(DP_o = max(individual_data$training.data$DP_rev_o) - DP_rev_o + 1) %>%
left_join(clmodel[, c("DP_o", "df")], by = "DP_o") %>%
mutate(I_cum_hat = I_cum_lag * df,
RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>%
inner_join(true_output_cl[, c("AP_o", "DP_rev_o")] %>% distinct()
, by = c("AP_o", "DP_rev_o")) %>%
group_by(AP_o, DP_rev_o) %>%
reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I))
are_cal_q <- sum(score_diagonal$abs_ave2_diag) / sum(score_diagonal$I)
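A compact comparison of the metrics can then be assembled (a sketch; the Cox values of are_tot and are_cal_q are overwritten by the chain-ladder block above, so they should be stored under distinct names beforehand if needed):

# side-by-side comparison; are_tot / are_cal_q here hold the chain-ladder values
data.frame(model     = c("CL", "XGB", "NN"),
           are_tot   = c(are_tot, are_tot_xgb, are_tot_nn),
           are_cal_q = c(are_cal_q, are_cal_q_xgb, are_cal_q_nn))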