############################################################ # Functions to determine reporting delays in Osiris and NICE # from previous downloads # 06-04-2020 ___UITZONDERINGSGROND_2___ ############################################################ # functions get_reportingdelay_Osiris <- function(var_name = NULL, type = NULL, start_date = as.Date("2020-03-13"), end_date = today()) { if(length(var_name) == 0) stop("provide Osiris name to analyse") if(length(type) == 0) stop("provide type of reports (\"reported\", \"hosp\" or \"IC\")") if(type == "IC") start_date <- max(start_date, as.Date("2020-03-21")) # get official NL confirmed and dead cases per day data_NL <- read_tsv("data/dataNL.txt", col_types = cols( date = col_date(format = ""), conf = col_double(), dead = col_double() )) %>% mutate(date = as.Date(date)) # get filenames of all Osiris downloads files <- list.files("___UITZONDERINGSGROND_6___", full.names = TRUE) files <- c(files, list.files("___UITZONDERINGSGROND_6___", full.names = TRUE)) files <- files[grepl(files, pattern = "rds")] files <- tibble(file_name = files, date = files %>% str_extract_all(pattern = "(Data_[0-9]+)") %>% unlist %>% gsub(pattern = "Data_", replacement = "") %>% as.Date(format = "%Y%m%d"), time = files %>% file.info %>% pull(ctime)) files <- files %>% left_join(data_NL, by = "date") report_dates <- seq(start_date, end_date, by = "day") %>% as.character rep_delay <- rep(0,100) for(report_date in report_dates) { file_name <- files %>% filter(date == report_date) %>% mutate(gemeld = sapply(.$file_name, function(f) f %>% read_rds %>% nrow)) %>% filter(conf == gemeld) %>% # get only files with official number reported filter(time == max(time)) %>% # get most recent file (if multiple files exist) pull(file_name) # if today's data is not yet available if(length(file_name)==0) { print(paste("only files up to ", report_date, "used")) return(cumsum(rev(rep_delay)/sum(rep_delay))) } newdata <- read_rds(file_name) %>% {if(type == "IC") filter(., NCOVopnameICU == "J") else if(type == "hosp") filter(., NCOVpatZhs == "J") else .} %>% mutate(var_date = pull(., var_name)) %>% mutate(var_date = as.Date(var_date, tryFormats = c("%d-%m-%Y", "%Y-%m-%d"))) %>% filter(!is.na(var_date) & var_date >= as.Date("2020-02-01") & var_date <= end_date) %>% group_by(var_date) %>% summarise(incidence = n()) %>% full_join(tibble(var_date = seq(min(min(.$var_date), as.Date(report_date)-length(rep_delay)+1), as.Date(report_date), by = "day")), by = "var_date") %>% replace_na(list(incidence = 0)) %>% arrange(var_date) if(as.Date(report_date) > start_date) { newdata <- full_join(prevdata, newdata, by = "var_date") %>% replace_na(list(incidence.x = 0, incidence.y = 0)) %>% arrange(var_date) %>% mutate(newentries = incidence.y - incidence.x) %>% mutate(newentries = ifelse(newentries < 0, 0, newentries)) # later corrections can cause decrease in cumulative numbers, ignored here rep_delay <- rep_delay + (newdata %>% pull(newentries) %>% tail(length(rep_delay))) newdata <- newdata %>% rename(incidence = incidence.y) %>% select(var_date, incidence) } prevdata <- newdata } return(cumsum(rev(rep_delay)/sum(rep_delay))) } get_reportingdelay_NICE <- function(start_date = as.Date("2020-03-28"), end_date = today(), IC = TRUE, discharge = FALSE) { coltypesNICE <- cols( pid = col_double(), seq = col_double(), name = col_character(), age = col_character(), adm_date_icu = col_character(), dis_date_icu = col_character(), discharged_to = col_character(), is_from_other_ic = col_character(), naar_zkh = col_character(), is_ic = col_double(), died_in_hospital = col_character(), covid19status = col_character() ) files <- grep("NICEID_", list.files("data"), value = TRUE) report_dates <- as.Date(substr(files, 8, 15), "%d%m%Y") files <- files[order(report_dates)] report_dates <- sort(report_dates) rep_delay <- rep(0, 100) for(i in 1:length(files)) { newdata <- read_csv(paste0("data/", files[i]), col_types = coltypesNICE) %>% {if(discharge) mutate(., adm_date_icu = as.Date(dis_date_icu, "%d/%m/%Y")) else mutate(., adm_date_icu = as.Date(adm_date_icu, "%d/%m/%Y"))} %>% # mutate(adm_date_icu = as.Date(adm_date_icu, "%d/%m/%Y")) %>% filter(covid19status %in% c("lab", "ct")) %>% filter(adm_date_icu >= as.Date("2020-02-27")) %>% {if(IC) filter(., is_ic==1) else filter(., is_ic == 0)} %>% group_by(pid) %>% summarise(adm_date_icu = min(adm_date_icu)) %>% group_by(adm_date_icu) %>% summarise(incidence = n()) %>% full_join(tibble(adm_date_icu = seq(min(min(.$adm_date_icu), report_dates[i]-length(rep_delay)+1), report_dates[i], by = "day")), by = "adm_date_icu") %>% replace_na(list(incidence = 0)) %>% arrange(adm_date_icu) if(i > 2) { newdata <- full_join(prevdata, newdata, by = "adm_date_icu") %>% replace_na(list(incidence.x = 0, incidence.y = 0)) %>% arrange(adm_date_icu) %>% mutate(newentries = incidence.y - incidence.x) #%>% #mutate(newentries = ifelse(newentries < 0, 0, newentries)) # later corrections can cause decrease in cumulative numbers, ignored here rep_delay <- rep_delay + (newdata %>% pull(newentries) %>% tail(length(rep_delay))) newdata <- newdata %>% rename(incidence = incidence.y) %>% select(adm_date_icu, incidence) } prevdata <- newdata } return(cumsum(rev(rep_delay)/sum(rep_delay))) }