SNP Claims Outlier Report
Author: Jared Walker
Department: Child Nutrition Programs
Department: Child Nutrition Programs
This document describes the statistical method and R implementation used to identify sponsor- and site-level claim anomalies.
Purpose:
To flag unusual claim patterns for
review.
1. Conceptual Definitions
The analysis evaluates monthly reimbursement claims using a rolling historical panel. Two complementary deviation metrics are computed:
- Year-over-year deviation — deviation from the prior year value.
- Deviation from the mean — deviation from the 12-month mean.
2. Dispersion Measure
Modified z-scores are calculated using mean absolute deviation (from a reference value), a robust alternative to standard deviation.
Statistical Justification:
Reimbursement data
are highly skewed. Standard deviation–based z-scores are sensitive to
outliers. Mean absolute deviation (MAD) provides a more stable
dispersion measure under these conditions.
Deviation From the Prior Year
(current claim − prior year claim) / mean absolute deviation [from prior year claim]
Deviation From the Mean
(current claim − 12-month mean) / mean absolute deviation [from the 12 month mean]
Time Window
Rolling window defined as T-2 to T-14 months.
3. Expected Output
Interpretation:
Large absolute values indicate unusual behavior.
Large absolute values indicate unusual behavior.
- Positive values → unusually high reimbursement
- Negative values → unusually low reimbursement
Tables
- 5 highest and 5 lowest scores per metric per entity are retained for review
- Tableau table
- Full site-level cross-table
- Full sponsor-level cross-table
- Raw data retained for traceability
4. R Implementation
Environment Setup
rm(list = ls(all.names = T)) #clear all objects from environment.
start.time <- Sys.time() # Log start time.
gc() # Free up memory and report the memory usage.
options(max.print=100) # Limit output.
options(scipen=999) # Prohibit use of scientific notation
library(lubridate)
library(janitor)
library(tidyverse)
library(openxlsx)
library(naniar)
library(data.table)
library(readr)
Date Handling
# Assign current-month date
curr_date <- as.Date(format(Sys.Date(), "%Y-%m-01"))
# Set time window
report_offset <- -2 # Reporting month or start month is 2 months in the past
window_start <- report_offset - 12 # End month is 14 months in the past
report_date <- curr_date %m+% months(report_offset)
# Reformat curr_date for downstream automation
mmm_yyyy <- paste(month.abb[as.numeric(format(report_date, "%m"))],
format(report_date, "%Y"))
Import Data and Clean
# Site level data
data1197_s <- read_csv(
"C:/Users/Jared Walker/Downloads/Data1197.csv",
col_types = cols("Claim Date" = col_date(format = "%m/%d/%Y"))) %>%
clean_names() %>%
replace_with_na(replace = list(total_reimbursement = 0.0)) %>% # $0 reimbursement == NA
filter(claim_status != "Error") %>%
mutate(operating_sites = 1)
Site-Level Crosstable
# Pivot site claims into crosstable
ct_s <- data1197_s %>%
select(sponsor_name:site_number, claim_date, total_reimbursement) %>%
filter(claim_date >= curr_date %m+% months(window_start)) %>%
arrange(claim_date) %>%
pivot_wider(names_from = claim_date,
values_from = total_reimbursement,
values_fill = NA)
Site-Level Calculations
# Month indexing function
col_idx_cts <- function(offset) {
nm <- as.character(curr_date %m+% months(offset))
i <- match(nm, names(ct_s))
if (is.na(i)) stop("Missing month column in ct_s: ", nm)
i
}
# Column indexing
idx_prior <- col_idx_cts(window_start) # -14
idx_months <- vapply((window_start + 1):report_offset, col_idx_cts, integer(1)) ##### -13:-2
idx_current <- col_idx_cts(report_offset) # -2
# Reference columns for current claim and prior year claim
ct_s$prior_year <- ct_s[[idx_prior]]
ct_s$current <- ct_s[[idx_current]]
# Mean claim value
ct_s$mean_claim <- rowMeans(ct_s[, idx_months, drop = FALSE], na.rm = TRUE)
ct_s$counts <- rowSums(!is.na(ct_s[, idx_months, drop = FALSE]))
# Absolute deviation from the prior year
abs_dev_prior <- abs(ct_s[, idx_months, drop = FALSE] - ct_s$prior_year)
ct_s$sums_prior <- rowSums(abs_dev_prior, na.rm = TRUE)
ct_s$mad_yoy <- ct_s$sums_prior / ct_s$counts
# Z-score calculation
ct_s$z_yoy <- (ct_s$current - ct_s$prior_year) / ct_s$mad_yoy
ct_s$z_yoy <- fifelse(
ct_s$prior_year == 0 | is.na(ct_s$prior_year) | ct_s$current == 0 | is.na(ct_s$current),
NA,
ct_s$z_yoy)
# Absolute deviation from the mean
abs_dev_mean <- abs(ct_s[, idx_months, drop = FALSE] - ct_s$mean_claim)
ct_s$sums_mean <- rowSums(abs_dev_mean, na.rm = TRUE)
ct_s$mad_mean <- ct_s$sums_mean / ct_s$counts
# Z-score calculation
ct_s$z_mean <- (ct_s$current - ct_s$mean_claim) / ct_s$mad_mean
ct_s$z_mean <- fifelse(
ct_s$prior_year == 0 | is.na(ct_s$prior_year) | ct_s$current == 0 | is.na(ct_s$current),
NA,
ct_s$z_mean)
site_var <- ct_s
site_var_ref <- site_var
Top Ten Sites
# Convert to absolute values for ranking
site_var$abs_z_yoy <- abs(site_var$z_yoy)
top_yoy_s <- site_var[with(site_var, order(-z_yoy)), ]
top_yoy_s1 <- top_yoy_s[1:5, ]
top_yoy_s <- site_var[with(site_var, order(z_yoy)), ]
top_yoy_s2 <- top_yoy_s[1:5, ] %>%
arrange(desc(z_yoy))
top_yoy_s <- rbind(top_yoy_s1, top_yoy_s2) %>%
arrange(desc(abs_z_yoy)) %>%
select(-abs_z_yoy)
drop <- c("mean_claim", "z_mean", "mad_mean")
top_yoy_s <- top_yoy_s[, !(names(top_yoy_s) %in% drop)]
site_var$abs_z_mean <- abs(site_var$z_mean)
top_dm_s <- site_var[with(site_var, order(-z_mean)), ]
top_dm_s1 <- top_dm_s[1:5, ]
top_dm_s <- site_var[with(site_var, order(z_mean)), ]
top_dm_s2 <- top_dm_s[1:5, ] %>%
arrange(desc(z_mean))
top_dm_s <- rbind(top_dm_s1, top_dm_s2) %>%
mutate(abs_M = abs(z_mean)) %>%
arrange(desc(abs_M)) %>%
select(-abs_M)
top_yoy_s_ref <- top_yoy_s
top_dm_s_ref <- top_dm_s
site_var_ref2 <- site_var
Sponsor-Level Cross Table
# Aggregate site-level data to sponsor-level
data1197 <- data1197_s %>%
select(-c(site_name:site_id, claim_status:brk_type,
afterschool_snack_participation,
asp_jul:sso_snk_participation, x80)) %>% # Remove Categorical Data
aggregate(. ~ sponsor_number + sponsor_name + claim_date, FUN=sum) %>%
as.data.frame()
# Pivot sponsor claims into crosstable
ct <- data1197 %>%
select(sponsor_number, sponsor_name, claim_date, total_reimbursement) %>%
filter(claim_date >= curr_date %m+% months(window_start)) %>%
arrange(claim_date) %>%
pivot_wider(names_from = claim_date,
values_from = total_reimbursement,
values_fill = NA)
Sponsor-Level Calculations
# Month indexing function
col_idx_ct <- function(offset) {
nm <- as.character(curr_date %m+% months(offset))
i <- match(nm, names(ct))
if (is.na(i)) stop("Missing month column in ct: ", nm)
i
}
# Column indexing
idx_prior <- col_idx_ct(window_start) # -14
idx_months <- vapply((window_start + 1):report_offset, col_idx_ct, integer(1)) # -13:-2
idx_current <- col_idx_ct(report_offset) # -2
# Reference values
ct$prior_year <- ct[[idx_prior]]
ct$current <- ct[[idx_current]]
# Mean claim value
ct$mean_claim <- rowMeans(ct[, idx_months, drop = FALSE], na.rm = TRUE)
ct$counts <- rowSums(!is.na(ct[, idx_months, drop = FALSE]))
# Absolute deviation from prior year
abs_dev_prior <- abs(ct[, idx_months, drop = FALSE] - ct$prior_year)
ct$sums_prior <- rowSums(abs_dev_prior, na.rm = TRUE)
ct$mad_yoy <- ct$sums_prior / ct$counts
# Z-score calculation
ct$z_yoy <- (ct$current - ct$prior_year) / ct$mad_yoy
ct$z_yoy <- fifelse(
ct$prior_year == 0 | is.na(ct$prior_year) | ct$current == 0 | is.na(ct$current),
NA,
ct$z_yoy)
# Absolute deviation from the mean
abs_dev_mean <- abs(ct[, idx_months, drop = FALSE] - ct$mean_claim)
ct$sums_mean <- rowSums(abs_dev_mean, na.rm = TRUE)
ct$mad_mean <- ct$sums_mean / ct$counts
# Z-score calculation
ct$z_mean <- (ct$current - ct$mean_claim) / ct$mad_mean
ct$z_mean <- fifelse(
ct$prior_year == 0 | is.na(
ct$prior_year) | ct$current == 0 | is.na(ct$current),
NA,
ct$z_mean)
spon_var <- ct
spon_var_ref <- spon_var
Top Ten Sponsors
# Convert to absolute values for ranking
spon_var <- as.data.frame(spon_var_ref)
spon_var <- do.call(data.frame, spon_var)
top_yoy <- spon_var[with(spon_var, order(-z_yoy)), ]
top_yoy1 <- top_yoy[1:5, ]
top_yoy <- spon_var[with(spon_var, order(z_yoy)), ]
top_yoy2 <- top_yoy[1:5, ] %>%
arrange(desc(z_yoy))
top_yoy <- rbind(top_yoy1, top_yoy2) %>%
mutate(abs_YOY = abs(z_yoy)) %>%
arrange(desc(abs_YOY)) %>%
select(-abs_YOY)
top_dm <- spon_var[with(spon_var, order(-z_mean)), ]
top_dm1 <- top_dm[1:5, ]
top_dm <- spon_var[with(spon_var, order(z_mean)), ]
top_dm2 <- top_dm[1:5, ] %>%
arrange(desc(z_mean))
top_dm <- rbind(top_dm1, top_dm2) %>%
mutate(abs_M = abs(z_mean)) %>%
arrange(desc(abs_M)) %>%
select(-abs_M)
spon_var_ref <- spon_var
top_yoy_ref <- top_yoy
top_dm_ref <- top_dm
5. Generate Report
Build Tableau Data
# Reformat as tibble
top_yoy <- as_tibble(top_yoy_ref)
top_dm <- as_tibble(top_dm_ref)
top_yoy_s <- as_tibble(top_yoy_s_ref)
top_dm_s <- as_tibble(top_dm_s_ref)
spon_var <- as_tibble(spon_var_ref)
site_var <- as_tibble(site_var_ref2)
# Flags for Tableau sheet
top_dm_y <- top_dm %>%
select(sponsor_name)
top_dm_y$date <- format(report_date, "%m/%d/%Y")
top_dm_y$unique <- paste(top_dm_y$date, top_dm_y$sponsor_name)
top_dm_y$top_dm_y <- "Y"
top_dm_y <- top_dm_y[, !(names(top_dm_y) %in% c("sponsor_name", "date"))]
top_yoy_y <- top_yoy %>%
select(sponsor_name)
top_yoy_y$date <- format(report_date, "%m/%d/%Y")
top_yoy_y$unique <- paste(top_yoy_y$date, top_yoy_y$sponsor_name)
top_yoy_y$top_yoy_y <- "Y"
top_yoy_y <- top_yoy_y[, !(names(top_yoy_y) %in% c("sponsor_name", "date"))]
# Tableau base
tab1 <- data1197 %>%
select(sponsor_name, sponsor_number, claim_date, total_eligible_students,
adp_brk, adp_lunch, adp_sso_brk, adp_sso_lunch, breakfast_operating_days,
lunch_operating_days, sso_operating_days, free_lunches, rdc_lunches,
paid_lunches, free_breakfasts, rdc_breakfasts, paid_breakfasts, sso_lunches,
sso_breakfasts, total_reimbursement, operating_sites)
tab1$adp <- tab1$adp_brk + tab1$adp_lunch +
tab1$adp_sso_brk + tab1$adp_sso_lunch
tab1$days <- tab1$breakfast_operating_days +
tab1$lunch_operating_days + tab1$sso_operating_days
tab1$meals <- tab1$free_lunches + tab1$rdc_lunches + tab1$paid_lunches +
tab1$free_breakfasts + tab1$rdc_breakfasts + tab1$paid_breakfasts +
tab1$sso_lunches + tab1$sso_breakfasts
tab1$uniq <- paste0(tab1$sponsor_number, tab1$claim_date)
# Attach outlier metrics
spon_var$claim_date <- format(report_date, "%m/%d/%Y")
spon_var$uniq <- paste0(spon_var$sponsor_number, spon_var$claim_date)
spon_var_mean <- spon_var[!is.na(spon_var$z_mean), ] %>%
rename(current_mean = current,
sponsor_name_mean = sponsor_name) %>%
select(sponsor_number, sponsor_name_mean, current_mean, mean_claim, mad_mean, z_mean, uniq)
spon_var_yoy <- spon_var[!is.na(spon_var$z_yoy), ] %>%
rename(current_YOY = current,
sponsor_name_YOY = sponsor_name) %>%
select(sponsor_number, sponsor_name_YOY, prior_year, current_YOY, mad_yoy, z_yoy, uniq)
tab2 <- merge(tab1, spon_var_yoy, by = "uniq", all.x = TRUE)
tab3 <- merge(tab2, spon_var_mean, by = "uniq", all = TRUE)
tab3$unique <- paste(tab3$claim_date, tab3$sponsor_name)
tab4 <- merge(tab3, top_dm_y, by = "unique", all.x = TRUE)
tab5 <- merge(tab4, top_yoy_y, by = "unique", all.x = TRUE)
tab5$top_20 <- as.character(NA)
tab5$top_20 <- fifelse(!is.na(tab5$top_dm_y), "Y", tab5$top_20)
tab5$top_20 <- fifelse(!is.na(tab5$top_yoy_y), "Y", tab5$top_20)
tab5 <- tab5[, !(names(tab5) %in% c("unique"))]
Column Names to Upper Case
# Column Names (Title Case)
colnames(tab5) <- stringr::str_to_title(gsub("*", " ", colnames(tab5)))
colnames(data1197) <- stringr::str_to_title(gsub("*", " ", colnames(data1197)))
colnames(data1197_s)<- stringr::str_to_title(gsub("*", " ", colnames(data1197_s)))
colnames(top_dm_s) <- stringr::str_to_title(gsub("*", " ", colnames(top_dm_s)))
colnames(top_yoy_s) <- stringr::str_to_title(gsub("*", " ", colnames(top_yoy_s)))
colnames(site_var) <- stringr::str_to_title(gsub("*", " ", colnames(site_var)))
colnames(top_dm) <- stringr::str_to_title(gsub("*", " ", colnames(top_dm)))
colnames(top_yoy) <- stringr::str_to_title(gsub("*", " ", colnames(top_yoy)))
colnames(spon_var) <- stringr::str_to_title(gsub("_", " ", colnames(spon_var)))
Build Main Tables
SNP_list.1 <- list(
"DEVIATION FROM THE MEAN" = top_dm,
"DEVIATION FROM PRIOR YEAR" = top_yoy)
variances <- createWorkbook()
addWorksheet(variances, "Top Ten Sponsors")
curr_row <- 3
for (i in seq_along(SNP_list.1)) {
writeData(variances, "Top Ten Sponsors", names(SNP_list.1)[i], startCol = 1, startRow = curr_row)
writeData(variances, "Top Ten Sponsors", SNP_list.1[[i]], startCol = 1, startRow = curr_row + 1)
curr_row <- curr_row + nrow(SNP_list.1[[i]]) + 6
}
SNP_list.2 <- list(
"DEVIATION FROM THE MEAN" = top_dm_s,
"DEVIATION FROM PRIOR YEAR" = top_yoy_s)
addWorksheet(variances, "Top Ten Sites")
curr_row <- 3
for (i in seq_along(SNP_list.2)) {
writeData(variances, "Top Ten Sites", names(SNP_list.2)[i], startCol = 1, startRow = curr_row)
writeData(variances, "Top Ten Sites", SNP_list.2[[i]], startCol = 1, startRow = curr_row + 1)
curr_row <- curr_row + nrow(SNP_list.2[[i]]) + 5
}
Add Sheets to Workbook
addWorksheet(variances, "Sponsor Variances")
writeData(variances, "Sponsor Variances", spon_var)
addWorksheet(variances, "Site Variances")
writeData(variances, "Site Variances", site_var)
addWorksheet(variances, "Raw Data 1197 (Sponsor Level)")
writeData(variances, "Raw Data 1197 (Sponsor Level)", data1197)
addWorksheet(variances, "Raw Data 1197 (Site Level)")
writeData(variances, "Raw Data 1197 (Site Level)", data1197_s)
addWorksheet(variances, "Tableau Extract")
writeData(variances, "Tableau Extract", tab5)
Save Workbook
title <- paste0(mmm_yyyy, " SNP Claims Outlier Report.xlsx")
saveWorkbook(variances, file = title, overwrite = TRUE)
6. Log Runtime
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken