Contents

    SNP Claims Outlier Report

    Author: Jared Walker
    Department: Child Nutrition Programs

    This document describes the statistical method and R implementation used to identify sponsor- and site-level claim anomalies.

    Purpose:
    To flag unusual claim patterns for review.

    1. Conceptual Definitions

    The analysis evaluates monthly reimbursement claims using a rolling historical panel. Two complementary deviation metrics are computed:

    • Year-over-year deviation — deviation from the prior year value.
    • Deviation from the mean — deviation from the 12-month mean.

    2. Dispersion Measure

    Modified z-scores are calculated using mean absolute deviation (from a reference value), a robust alternative to standard deviation.

    Statistical Justification:
    Reimbursement data are highly skewed. Standard deviation–based z-scores are sensitive to outliers. Mean absolute deviation (MAD) provides a more stable dispersion measure under these conditions.

    Deviation From the Prior Year

    (current claim − prior year claim) / mean absolute deviation [from prior year claim]

    Deviation From the Mean

    (current claim − 12-month mean) / mean absolute deviation [from the 12 month mean]

    Time Window

    Rolling window defined as T-2 to T-14 months.

    3. Expected Output

    Interpretation:
    Large absolute values indicate unusual behavior.
    • Positive values → unusually high reimbursement
    • Negative values → unusually low reimbursement

    Tables

    • 5 highest and 5 lowest scores per metric per entity are retained for review
    • Tableau table
    • Full site-level cross-table
    • Full sponsor-level cross-table
    • Raw data retained for traceability

    4. R Implementation

    Environment Setup

    rm(list = ls(all.names = T)) #clear all objects from environment.
    start.time <- Sys.time()   # Log start time.
    gc() # Free up memory and report the memory usage.
    options(max.print=100) # Limit output.
    options(scipen=999) # Prohibit use of scientific notation
    
    library(lubridate)
    library(janitor)
    library(tidyverse)
    library(openxlsx)
    library(naniar)
    library(data.table)
    library(readr)

    Date Handling

    # Assign current-month date
    curr_date  <- as.Date(format(Sys.Date(), "%Y-%m-01"))
    
    # Set time window
    report_offset <- -2 # Reporting month or start month is 2 months in the past
    window_start <- report_offset - 12 # End month is 14 months in the past
    
    report_date <- curr_date %m+% months(report_offset)
    
    # Reformat curr_date for downstream automation
    mmm_yyyy <- paste(month.abb[as.numeric(format(report_date, "%m"))],
                      format(report_date, "%Y"))

    Import Data and Clean

    # Site level data 
    data1197_s <- read_csv(
      "C:/Users/Jared Walker/Downloads/Data1197.csv",
      col_types = cols("Claim Date" = col_date(format = "%m/%d/%Y"))) %>%
      clean_names() %>%
      replace_with_na(replace = list(total_reimbursement = 0.0)) %>%  # $0 reimbursement == NA
      filter(claim_status != "Error") %>%
      mutate(operating_sites = 1)

    Site-Level Crosstable

    # Pivot site claims into crosstable
    ct_s <- data1197_s %>%
      select(sponsor_name:site_number, claim_date, total_reimbursement) %>%
      filter(claim_date >= curr_date %m+% months(window_start)) %>%
      arrange(claim_date) %>%
      pivot_wider(names_from  = claim_date,
                  values_from = total_reimbursement,
                  values_fill = NA)

    Site-Level Calculations

    # Month indexing function
    col_idx_cts <- function(offset) {
      nm <- as.character(curr_date %m+% months(offset))
      i <- match(nm, names(ct_s))
      if (is.na(i)) stop("Missing month column in ct_s: ", nm)
      i
    }
    
    # Column indexing
    idx_prior  <- col_idx_cts(window_start)              # -14
    idx_months <- vapply((window_start + 1):report_offset, col_idx_cts, integer(1)) ##### -13:-2
    idx_current <- col_idx_cts(report_offset)            # -2
    
    # Reference columns for current claim and prior year claim
    ct_s$prior_year <- ct_s[[idx_prior]]
    ct_s$current    <- ct_s[[idx_current]]
    
    # Mean claim value
    ct_s$mean_claim <- rowMeans(ct_s[, idx_months, drop = FALSE], na.rm = TRUE)
    ct_s$counts     <- rowSums(!is.na(ct_s[, idx_months, drop = FALSE]))
    
    # Absolute deviation from the prior year
    abs_dev_prior   <- abs(ct_s[, idx_months, drop = FALSE] - ct_s$prior_year)
    ct_s$sums_prior <- rowSums(abs_dev_prior, na.rm = TRUE)
    ct_s$mad_yoy    <- ct_s$sums_prior / ct_s$counts
    
    # Z-score calculation
    ct_s$z_yoy <- (ct_s$current - ct_s$prior_year) / ct_s$mad_yoy
    ct_s$z_yoy <- fifelse(
      ct_s$prior_year == 0 | is.na(ct_s$prior_year) | ct_s$current == 0 | is.na(ct_s$current), 
      NA, 
      ct_s$z_yoy)
    
    # Absolute deviation from the mean
    abs_dev_mean   <- abs(ct_s[, idx_months, drop = FALSE] - ct_s$mean_claim)
    ct_s$sums_mean <- rowSums(abs_dev_mean, na.rm = TRUE)
    ct_s$mad_mean  <- ct_s$sums_mean / ct_s$counts
    
    # Z-score calculation
    ct_s$z_mean <- (ct_s$current - ct_s$mean_claim) / ct_s$mad_mean
    ct_s$z_mean <- fifelse(
      ct_s$prior_year == 0 | is.na(ct_s$prior_year) | ct_s$current == 0 | is.na(ct_s$current), 
      NA, 
      ct_s$z_mean)
    
    site_var <- ct_s
    site_var_ref <- site_var

    Top Ten Sites

    # Convert to absolute values for ranking
    
    site_var$abs_z_yoy <- abs(site_var$z_yoy)
    top_yoy_s  <- site_var[with(site_var, order(-z_yoy)), ]
    top_yoy_s1 <- top_yoy_s[1:5, ]
    top_yoy_s  <- site_var[with(site_var, order(z_yoy)), ]
    top_yoy_s2 <- top_yoy_s[1:5, ] %>% 
      arrange(desc(z_yoy))
    top_yoy_s  <- rbind(top_yoy_s1, top_yoy_s2) %>%
      arrange(desc(abs_z_yoy)) %>%
      select(-abs_z_yoy)
    
    drop <- c("mean_claim", "z_mean", "mad_mean")
    top_yoy_s <- top_yoy_s[, !(names(top_yoy_s) %in% drop)]
    
    site_var$abs_z_mean <- abs(site_var$z_mean)
    top_dm_s  <- site_var[with(site_var, order(-z_mean)), ]
    top_dm_s1 <- top_dm_s[1:5, ]
    top_dm_s  <- site_var[with(site_var, order(z_mean)), ]
    top_dm_s2 <- top_dm_s[1:5, ] %>% 
      arrange(desc(z_mean))
    top_dm_s  <- rbind(top_dm_s1, top_dm_s2) %>%
      mutate(abs_M = abs(z_mean)) %>%
      arrange(desc(abs_M)) %>%
      select(-abs_M)
    
    top_yoy_s_ref <- top_yoy_s
    top_dm_s_ref  <- top_dm_s
    site_var_ref2 <- site_var

    Top Ten Sponsors

    # Convert to absolute values for ranking
    
    spon_var <- as.data.frame(spon_var_ref)
    spon_var <- do.call(data.frame, spon_var)
    
    top_yoy  <- spon_var[with(spon_var, order(-z_yoy)), ]
    top_yoy1 <- top_yoy[1:5, ]
    top_yoy  <- spon_var[with(spon_var, order(z_yoy)), ]
    top_yoy2 <- top_yoy[1:5, ] %>% 
      arrange(desc(z_yoy))
    top_yoy  <- rbind(top_yoy1, top_yoy2) %>%
      mutate(abs_YOY = abs(z_yoy)) %>%
      arrange(desc(abs_YOY)) %>%
      select(-abs_YOY)
    
    top_dm  <- spon_var[with(spon_var, order(-z_mean)), ]
    top_dm1 <- top_dm[1:5, ]
    top_dm  <- spon_var[with(spon_var, order(z_mean)), ]
    top_dm2 <- top_dm[1:5, ] %>% 
      arrange(desc(z_mean))
    top_dm  <- rbind(top_dm1, top_dm2) %>%
      mutate(abs_M = abs(z_mean)) %>%
      arrange(desc(abs_M)) %>%
      select(-abs_M)
    
    spon_var_ref <- spon_var
    top_yoy_ref  <- top_yoy
    top_dm_ref   <- top_dm

    5. Generate Report

    Build Tableau Data

    # Reformat as tibble
    top_yoy   <- as_tibble(top_yoy_ref)
    top_dm    <- as_tibble(top_dm_ref)
    top_yoy_s <- as_tibble(top_yoy_s_ref)
    top_dm_s  <- as_tibble(top_dm_s_ref)
    
    spon_var  <- as_tibble(spon_var_ref)
    site_var  <- as_tibble(site_var_ref2)
    
    # Flags for Tableau sheet
    top_dm_y <- top_dm %>% 
      select(sponsor_name)
    top_dm_y$date <- format(report_date, "%m/%d/%Y")
    top_dm_y$unique <- paste(top_dm_y$date, top_dm_y$sponsor_name)
    top_dm_y$top_dm_y <- "Y"
    top_dm_y <- top_dm_y[, !(names(top_dm_y) %in% c("sponsor_name", "date"))]
    
    top_yoy_y <- top_yoy %>% 
      select(sponsor_name)
    top_yoy_y$date <- format(report_date, "%m/%d/%Y")
    top_yoy_y$unique <- paste(top_yoy_y$date, top_yoy_y$sponsor_name)
    top_yoy_y$top_yoy_y <- "Y"
    top_yoy_y <- top_yoy_y[, !(names(top_yoy_y) %in% c("sponsor_name", "date"))]
    
    # Tableau base
    tab1 <- data1197 %>%
      select(sponsor_name, sponsor_number, claim_date, total_eligible_students,
             adp_brk, adp_lunch, adp_sso_brk, adp_sso_lunch, breakfast_operating_days,
             lunch_operating_days, sso_operating_days, free_lunches, rdc_lunches,
             paid_lunches, free_breakfasts, rdc_breakfasts, paid_breakfasts, sso_lunches,
             sso_breakfasts, total_reimbursement, operating_sites)
    
    tab1$adp  <- tab1$adp_brk + tab1$adp_lunch + 
      tab1$adp_sso_brk + tab1$adp_sso_lunch
    tab1$days <- tab1$breakfast_operating_days + 
      tab1$lunch_operating_days + tab1$sso_operating_days
    tab1$meals <- tab1$free_lunches + tab1$rdc_lunches + tab1$paid_lunches + 
      tab1$free_breakfasts + tab1$rdc_breakfasts + tab1$paid_breakfasts + 
      tab1$sso_lunches + tab1$sso_breakfasts
    
    tab1$uniq <- paste0(tab1$sponsor_number, tab1$claim_date)
    
    # Attach outlier metrics
    spon_var$claim_date <- format(report_date, "%m/%d/%Y")
    spon_var$uniq <- paste0(spon_var$sponsor_number, spon_var$claim_date)
    
    spon_var_mean <- spon_var[!is.na(spon_var$z_mean), ] %>%
      rename(current_mean = current,
             sponsor_name_mean = sponsor_name) %>%
      select(sponsor_number, sponsor_name_mean, current_mean, mean_claim, mad_mean, z_mean, uniq)
    
    spon_var_yoy <- spon_var[!is.na(spon_var$z_yoy), ] %>%
      rename(current_YOY = current,
             sponsor_name_YOY = sponsor_name) %>%
      select(sponsor_number, sponsor_name_YOY, prior_year, current_YOY, mad_yoy, z_yoy, uniq)
    
    tab2 <- merge(tab1, spon_var_yoy, by = "uniq", all.x = TRUE)
    tab3 <- merge(tab2, spon_var_mean, by = "uniq", all = TRUE)
    
    tab3$unique <- paste(tab3$claim_date, tab3$sponsor_name)
    tab4 <- merge(tab3, top_dm_y,  by = "unique", all.x = TRUE)
    tab5 <- merge(tab4, top_yoy_y, by = "unique", all.x = TRUE)
    
    tab5$top_20 <- as.character(NA)
    tab5$top_20 <- fifelse(!is.na(tab5$top_dm_y), "Y", tab5$top_20)
    tab5$top_20 <- fifelse(!is.na(tab5$top_yoy_y), "Y", tab5$top_20)
    
    tab5 <- tab5[, !(names(tab5) %in% c("unique"))]

    Column Names to Upper Case

    # Column Names (Title Case)
    colnames(tab5)      <- stringr::str_to_title(gsub("*", " ", colnames(tab5)))
    colnames(data1197)  <- stringr::str_to_title(gsub("*", " ", colnames(data1197)))
    colnames(data1197_s)<- stringr::str_to_title(gsub("*", " ", colnames(data1197_s)))
    colnames(top_dm_s)  <- stringr::str_to_title(gsub("*", " ", colnames(top_dm_s)))
    colnames(top_yoy_s) <- stringr::str_to_title(gsub("*", " ", colnames(top_yoy_s)))
    colnames(site_var)  <- stringr::str_to_title(gsub("*", " ", colnames(site_var)))
    colnames(top_dm)    <- stringr::str_to_title(gsub("*", " ", colnames(top_dm)))
    colnames(top_yoy)   <- stringr::str_to_title(gsub("*", " ", colnames(top_yoy)))
    colnames(spon_var)  <- stringr::str_to_title(gsub("_", " ", colnames(spon_var)))

    Build Main Tables

    SNP_list.1 <- list(
      "DEVIATION FROM THE MEAN"      = top_dm,
      "DEVIATION FROM PRIOR YEAR"    = top_yoy)
    
    variances <- createWorkbook()
    addWorksheet(variances, "Top Ten Sponsors")
    
    curr_row <- 3
    
    for (i in seq_along(SNP_list.1)) {
      writeData(variances, "Top Ten Sponsors", names(SNP_list.1)[i], startCol = 1, startRow = curr_row)
      writeData(variances, "Top Ten Sponsors", SNP_list.1[[i]], startCol = 1, startRow = curr_row + 1)
      curr_row <- curr_row + nrow(SNP_list.1[[i]]) + 6
      }
    
    SNP_list.2 <- list(
      "DEVIATION FROM THE MEAN"      = top_dm_s,
      "DEVIATION FROM PRIOR YEAR"    = top_yoy_s)
    
    addWorksheet(variances, "Top Ten Sites")
    
    curr_row <- 3
    
    for (i in seq_along(SNP_list.2)) {
      writeData(variances, "Top Ten Sites", names(SNP_list.2)[i], startCol = 1, startRow = curr_row)
      writeData(variances, "Top Ten Sites", SNP_list.2[[i]], startCol = 1, startRow = curr_row + 1)
      curr_row <- curr_row + nrow(SNP_list.2[[i]]) + 5
      }

    Add Sheets to Workbook

    addWorksheet(variances, "Sponsor Variances")
    writeData(variances, "Sponsor Variances", spon_var)
    
    addWorksheet(variances, "Site Variances")
    writeData(variances, "Site Variances", site_var)
    
    addWorksheet(variances, "Raw Data 1197 (Sponsor Level)")
    writeData(variances, "Raw Data 1197 (Sponsor Level)", data1197)
    
    addWorksheet(variances, "Raw Data 1197 (Site Level)")
    writeData(variances, "Raw Data 1197 (Site Level)", data1197_s)
    
    addWorksheet(variances, "Tableau Extract")
    writeData(variances, "Tableau Extract", tab5)

    Save Workbook

    title <- paste0(mmm_yyyy, " SNP Claims Outlier Report.xlsx")
    saveWorkbook(variances, file = title, overwrite = TRUE)

    6. Log Runtime

    end.time <- Sys.time()
    time.taken <- end.time - start.time
    time.taken