# Example Usage of autoFlagR
# This script demonstrates the main features of the autoFlagR package

library(autoFlagR)
library(dplyr)

# ============================================================================
# Example 1: Basic Anomaly Detection
# ============================================================================

# Simulate a reproducible set of 1,000 EHR-style patient records.
# The seed is fixed first so every random draw below is repeatable.
set.seed(123)
n <- 1000
data <- data.frame(
  patient_id = seq_len(n),
  age = rnorm(n, mean = 50, sd = 15),
  cost = rnorm(n, mean = 10000, sd = 5000),
  length_of_stay = rpois(n, lambda = 5),
  gender = sample(c("M", "F"), size = n, replace = TRUE),
  diagnosis_code = sample(c("A", "B", "C", "D"), size = n, replace = TRUE)
)

# Plant known anomalies for the detectors to find:
#   rows 1-10  -> cost multiplied by ten (unusually high costs)
#   rows 11-15 -> ages that cannot occur in practice
data[1:10, "cost"] <- 10 * data[1:10, "cost"]
data[11:15, "age"] <- c(150, 200, 180, 190, 170)

# Score the example data with the Isolation Forest method
scored_data <- score_anomaly(
  data,
  method = "iforest",
  contamination = 0.05
)

# Flag the top anomalies using the same 5% contamination setting
flagged_data <- flag_top_anomalies(
  scored_data,
  contamination = 0.05
)

# Retrieve the top 20 anomalies and print only the columns of interest
top_anomalies <- get_top_anomalies(scored_data, n = 20)
display_cols <- c("patient_id", "age", "cost", "anomaly_score")
print(top_anomalies[, display_cols])

# ============================================================================
# Example 2: Benchmarking with Ground Truth
# ============================================================================

# Build a second reproducible data set that carries its own ground truth:
# is_error is 1 for roughly 5% of rows (sampling probability 0.05), else 0
set.seed(456)
n <- 1000
data_with_truth <- data.frame(
  patient_id = seq_len(n),
  age = rnorm(n, mean = 50, sd = 15),
  cost = rnorm(n, mean = 10000, sd = 5000),
  is_error = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.95, 0.05))
)

# Corrupt only the rows labelled as errors:
#   cost is multiplied by five, and
#   age is shifted by a Normal(50, 20) draw, then made non-negative via abs()
error_indices <- which(data_with_truth$is_error == 1)
age_shift <- rnorm(length(error_indices), mean = 50, sd = 20)
data_with_truth[error_indices, "cost"] <-
  5 * data_with_truth[error_indices, "cost"]
data_with_truth[error_indices, "age"] <-
  abs(data_with_truth[error_indices, "age"] + age_shift)

# Score with Isolation Forest, naming is_error as the ground-truth column
scored_with_truth <- score_anomaly(
  data_with_truth,
  ground_truth_col = "is_error",
  method = "iforest",
  contamination = 0.05
)

# Extract the benchmarking metrics from the scored result
metrics <- extract_benchmark_metrics(scored_with_truth)

# Report the two AUC values on one line each, then the Top-K recall element
cat("AUC-ROC:", metrics$auc_roc, "\n")
cat("AUC-PR:", metrics$auc_pr, "\n")
cat("Top-K Recall:\n")
print(metrics$top_k_recall)

# ============================================================================
# Example 3: Generate PDF Report
# ============================================================================

# Build the audit report from the Example 1 data:
# PDF output, Isolation Forest scoring, 5% contamination, top_n = 50
generate_audit_report(
  data = data,
  method = "iforest",
  contamination = 0.05,
  top_n = 50,
  output_format = "pdf",
  filename = "example_audit_report"
)

# ============================================================================
# Example 4: Using Local Outlier Factor (LOF)
# ============================================================================

# Score the Example 1 data again, this time with the LOF method (k = 10),
# and flag the top anomalies at the same 5% contamination setting
scored_lof <- score_anomaly(
  data,
  method = "lof",
  k = 10,
  contamination = 0.05
)
flagged_lof <- flag_top_anomalies(
  scored_lof,
  contamination = 0.05
)

# Take the top 10 anomalies from each method's scored output
top_iforest <- get_top_anomalies(scored_data, n = 10)
top_lof <- get_top_anomalies(scored_lof, n = 10)

# Print the patient IDs from both methods so they can be compared
cat("Top 10 anomalies (Isolation Forest):\n")
print(top_iforest[["patient_id"]])
cat("\nTop 10 anomalies (LOF):\n")
print(top_lof[["patient_id"]])

# ============================================================================
# Example 5: Custom Preprocessing
# ============================================================================

# Call the preprocessing step directly with explicit settings:
# patient_id is passed as an ID column, nothing extra is excluded,
# and the "mad" scaling method is requested
prep_result <- prep_for_anomaly(
  data,
  scale_method = "mad",
  id_cols = "patient_id",
  exclude_cols = NULL
)

# Pull the two components used below out of the returned result
metadata <- prep_result$metadata
prepared_matrix <- prep_result$prepared_data

# Print the numeric and categorical column names recorded in the metadata
cat("Numeric columns:", toString(metadata$numeric_cols), "\n")
cat("Categorical columns:", toString(metadata$categorical_cols), "\n")

