###################################################################
# Functions to use with estimationRt_OSIRIS script
###################################################################

library(tidyverse)
library(lubridate)

get_symptomonset2reporting <- function(start_date = as.Date("2020-03-13"), end_date = today(), IC = FALSE) {
  
  # get all case data files from Osiris
  files <- list.files("/___UITZONDERINGSGROND_6___Previous", full.names = TRUE)
  files <- c(files, list.files("/___UITZONDERINGSGROND_6___Geschoond", full.names = TRUE))
  files <- files[grepl(files, pattern = "rds")]

  report_dates <- seq(start_date, end_date, by = "day") %>% as.character

  SO2rep <- rep(0,40)
  for(report_date in report_dates) {
    pattern_date <- report_date %>% str_split(pattern = "-") %>% unlist %>% paste0(collapse = "")
    tmpfiles <- files[grep(files, pattern = pattern_date)]
    file.name <- tmpfiles[tmpfiles %>% file.info %>% pull(ctime) %>% which.max]

    # if today's data is not yet available
    if(length(file.name)==0) {
      print(paste("only files up to ", report_date, "used"))
      return(cumsum(rev(SO2rep)/sum(SO2rep)))
    }
    
    newdata <- read_rds(file.name) %>% 
      {if(IC) filter(., NCOVopnameICU == "J") else filter(., NCOVpatZhs == "J")} %>% 
      filter(!is.na(ZIE1eZiekteDt) & ZIE1eZiekteDt <= end_date) %>%
      group_by(ZIE1eZiekteDt) %>%
      summarise(incidence = n()) %>% 
      full_join(tibble(ZIE1eZiekteDt = seq(min(min(.$ZIE1eZiekteDt), as.Date(report_date)-length(SO2rep)+1), as.Date(report_date), by = "day"))) %>%
      replace_na(list(incidence = 0)) %>%
      arrange(ZIE1eZiekteDt)

    if(as.Date(report_date) > start_date) {
      newdata <- full_join(prevdata, newdata, by = "ZIE1eZiekteDt") %>% 
        replace_na(list(incidence.x = 0, incidence.y = 0)) %>%
        arrange(ZIE1eZiekteDt) %>% 
        mutate(newentries = incidence.y - incidence.x) 
      SO2rep <- SO2rep + (newdata %>% pull(newentries) %>% tail(40))
      newdata <- newdata %>% rename(incidence = incidence.y)  %>% select(ZIE1eZiekteDt, incidence)
    }
    
    prevdata <- newdata
  }
  cdf <- cumsum(rev(SO2rep)/sum(SO2rep))
  cdf[cdf > 1] <- 1
  return(cdf)
}

get_symptomonset2reporting_perprov <- function(provincie, start_date = as.Date("2020-03-13"), end_date = today(), IC = FALSE) {
  
  # get all case data files from Osiris
  files <- list.files("/___UITZONDERINGSGROND_6___Previous", full.names = TRUE)
  files <- c(files, list.files("/___UITZONDERINGSGROND_6___Geschoond", full.names = TRUE))
  files <- files[grepl(files, pattern = "rds")]
  
  report_dates <- seq(start_date, end_date, by = "day") %>% as.character
  
  SO2rep <- rep(0,40)
  for(report_date in report_dates) {
    pattern_date <- report_date %>% str_split(pattern = "-") %>% unlist %>% paste0(collapse = "")
    tmpfiles <- files[grep(files, pattern = pattern_date)]
    file.name <- tmpfiles[tmpfiles %>% file.info %>% pull(ctime) %>% which.max]
    
    # if today's data is not yet available
    if(length(file.name)==0) {
      print(paste("only files up to ", report_date, "used"))
      return(cumsum(rev(SO2rep)/sum(SO2rep)))
    }
    
    newdata <- read_rds(file.name) %>% 
      filter(Provincie == provincie) %>%
    {if(IC) filter(., NCOVopnameICU == "J") else filter(., NCOVpatZhs == "J")} %>% 
      filter(!is.na(ZIE1eZiekteDt) & ZIE1eZiekteDt <= end_date) %>%
      group_by(ZIE1eZiekteDt) %>%
      summarise(incidence = n()) %>% 
      full_join(tibble(ZIE1eZiekteDt = seq(min(min(.$ZIE1eZiekteDt), as.Date(report_date)-length(SO2rep)+1), as.Date(report_date), by = "day"))) %>%
      replace_na(list(incidence = 0)) %>%
      arrange(ZIE1eZiekteDt)
    
    if(as.Date(report_date) > start_date) {
      newdata <- full_join(prevdata, newdata, by = "ZIE1eZiekteDt") %>% 
        replace_na(list(incidence.x = 0, incidence.y = 0)) %>%
        arrange(ZIE1eZiekteDt) %>% 
        mutate(newentries = incidence.y - incidence.x) 
      SO2rep <- SO2rep + (newdata %>% pull(newentries) %>% tail(40))
      newdata <- newdata %>% rename(incidence = incidence.y)  %>% select(ZIE1eZiekteDt, incidence)
    }
    
    prevdata <- newdata
  }
  cdf <- cumsum(rev(SO2rep)/sum(SO2rep))
  cdf[cdf > 1] <- 1
  return(cdf)
}

extract_epicurve <- function(data, IC = FALSE, report_date = today(), SOtoRep = NULL) {
  data <- data %>%
  {if(IC) filter(., NCOVopnameICU == "J") else filter(., NCOVpatZhs == "J")} %>%
    mutate(bron = if("Bron" %in% names(data)) Bron else Bron_land)

  epicurve <- data %>% 
    filter(!is.na(ZIE1eZiekteDt) & ZIE1eZiekteDt <= today()) %>% # omit symptom onset dates in future
    group_by(ZIE1eZiekteDt) %>%
    summarise(incidenceABROAD = sum(bron == "Buitenland", na.rm = TRUE),
              incidenceTOT = n()) %>%
    mutate(incidenceNL = incidenceTOT - incidenceABROAD) %>% 
    full_join(tibble(ZIE1eZiekteDt = seq(min(.$ZIE1eZiekteDt), as.Date(report_date), by = "day"))) %>%
    replace_na(list(incidenceNL = 0, incidenceTOT = 0, incidenceABROAD = 0)) %>%
    arrange(ZIE1eZiekteDt) %>%
    mutate(dates = ZIE1eZiekteDt) %>%
    select(dates, incidenceNL, incidenceABROAD, incidenceTOT)

  if(length(SOtoRep) == 0) {
    # default SO to reporting delay over last 7 days
    reporting_delay <- get_symptomonset2reporting(start_date = as.Date(report_date) - 7, end_date = as.Date(report_date), IC = IC)
  } else {
    reporting_delay = SOtoRep
  }
  if(nrow(epicurve) > length(reporting_delay)) {
    reporting_delay <- c(rep(1, nrow(epicurve)-length(reporting_delay)), rev(reporting_delay))
  } else {
    reporting_delay <- rev(reporting_delay[1:nrow(epicurve)])
  }
  epicurve <- epicurve %>% mutate(p = reporting_delay)
  
  # if most recent days are zero, give them one case and a p to reflect total number of last non-zero day
  max_nonzero_inc_day <- max(which(epicurve$incidenceTOT != 0))
  if(max_nonzero_inc_day < nrow(epicurve)) {
    epicurve$incidenceTOT[(max_nonzero_inc_day + 1):nrow(epicurve)] <- 1
    epicurve$incidenceNL[(max_nonzero_inc_day + 1):nrow(epicurve)] <- 1
    epicurve$p[(max_nonzero_inc_day + 1):nrow(epicurve)] <- epicurve$p[max_nonzero_inc_day]/epicurve$incidenceTOT[max_nonzero_inc_day]
  }
  return(epicurve)
}

calculate_Ru_Osiris <- function(epicurve, nsample = 1000) {
  
  pdfSI <- dnorm(x = 1:7, mean = 6.8/2, sd = 1.27)
  pdfSI <- pdfSI/sum(pdfSI)
  g <- pdfSI
  
  # sample non-observed cases due to reporting delay from neg binomial
  incidence <- matrix(0, nrow = nrow(epicurve), ncol = nsample)
  for(j in 1:nrow(epicurve)) {
    if(epicurve$incidenceTOT[j] != 0) {
      incidence[j, ] <- rnbinom(n = nsample, size = epicurve$incidenceTOT[j], prob = epicurve$p[j]) + epicurve$incidenceTOT[j]
    }
  }

  # construct instantaneous Rt from incidence
  Rt <- matrix(0, nrow = nrow(epicurve), ncol = nsample)
  for(t in 2:nrow(epicurve)) {
    g2 <- g[1:min(length(g), t-1)]
    Rt[t,] <- as.numeric((incidence[t, ] - epicurve$incidenceABROAD[t])/colSums(incidence[t - (1:length(g2)), , drop = FALSE]*g2))
  }
  
  # construct case Ru from instantaneous Rt 
  Ru <- matrix(0, nrow = nrow(epicurve), ncol = nsample)
  for(u in 1:(nrow(Rt)-1)) {
    g2 <- g[1:min(length(g), nrow(Rt)-u)]
    Ru[u,] <- colSums(g2*Rt[u+1:length(g2), ,drop = FALSE])/sum(g2) # divide by sum(g2) to account for not observed Rt's in future
  }
  
  # construct confidence bounds by sampling from poisson
  Rtupper <- rep(0, nrow(incidence))
  Rtlower <- rep(0, nrow(incidence))
  for(j in 1:nrow(incidence)) {
    tmp <- sapply(1:nsample, function(i) if(incidence[j,i] != 0 & Rt[j,i] !=Inf) rpois(100, incidence[j,i]*Rt[j,i])/incidence[j,i])
    Rtlower[j] <- quantile(unlist(tmp), probs = 0.025)
    Rtupper[j] <- quantile(unlist(tmp), probs = 0.975)
  }

  # construct confidence bounds by sampling from poisson
  Ruupper <- rep(0, nrow(incidence))
  Rulower <- rep(0, nrow(incidence))
  for(j in 1:nrow(incidence)) {
    tmp <- sapply(1:nsample, function(i) if(incidence[j,i] != 0 & Ru[j,i] !=Inf) rpois(100, incidence[j,i]*Ru[j,i])/incidence[j,i])
    Rulower[j] <- quantile(unlist(tmp), probs = 0.025)
    Ruupper[j] <- quantile(unlist(tmp), probs = 0.975)
  }

  epicurve <- epicurve %>% mutate(mean_incidence = apply(incidence, MARGIN = 1, mean),
                                  instR = apply(Rt, MARGIN = 1, mean),
                                  caseR = apply(Ru, MARGIN = 1, mean),
                                  instRlower = Rtlower,
                                  instRupper = Rtupper,
                                  caseRlower = Rulower,
                                  caseRupper = Ruupper)
  return(epicurve)
}

plot_epicurve_Osiris <- function(epicurve, IC = FALSE) {
p <- 
    ggplot(data = epicurve,
           mapping = aes(x = dates, y = incidenceTOT/p)) +
    geom_bar(stat = "identity",
             fill = "grey") +
    geom_bar(mapping = aes(y = incidenceTOT),
             stat = "identity",
             fill = "#01689b") +
    geom_bar(mapping = aes(y = incidenceABROAD),
             stat = "identity",
             fill = 2) +
    scale_x_date(
      limits = c(as.Date("2020-02-27"), NA),
      breaks = as.Date("2020-02-17") + seq(0, 50, 7),
      labels = format(as.Date("2020-02-17") + seq(0, 50, 7), "%d %b")) +
    labs(x = "eerste ziektedag",
         y = "aantal per dag",
         subtitle = if(IC) paste("gebaseerd op IC opnames uit OSIRIS data", last(epicurve$dates)) else paste("gebaseerd op ziekenhuisopnames uit OSIRIS data", last(epicurve$dates))) +
    theme_minimal()  
  
  return(p)
}

plot_Reff <- function(epicurve, caseReff = TRUE, IC = TRUE) {
  p <- 
    ggplot(data = epicurve, # %>% {if(caseReff) slice(., -nrow(.)) else .},
           mapping = aes(x = dates, 
                         y = if(caseReff) caseR else instR, 
                         ymin = if(caseReff) caseRlower else instRlower, 
                         ymax = if(caseReff) caseRupper else instRupper)) + 
    geom_ribbon(fill = "grey", col = NA) +
    geom_line(col = 1) +
    geom_line(y = 1, lty = 2) +
    coord_cartesian(
      ylim = c(0,4)) +
    scale_x_date(
      limits = c(as.Date("2020-02-27"), if(caseReff) max(epicurve$dates)-1 else max(epicurve$dates)),
      breaks = as.Date("2020-02-17") + seq(0, 50, 7),
      labels = format(as.Date("2020-02-17") + seq(0, 50, 7), "%d %b")) +
    labs(x = "eerste ziektedag",
         y = if(caseReff) "effectieve R" else "instantane R",
         subtitle = if(IC) paste("gebaseerd op IC opnames uit OSIRIS data", last(epicurve$dates)) else paste("gebaseerd op ziekenhuisopnames uit OSIRIS data", last(epicurve$dates))) +
    theme_minimal()
  
  return(p)
}