# Uncomments these lines to get latest packages and script
# install.packages("admiral")
# admiral::use_ad_template("adsl")
# devtools::install_github("https://github.com/pharmaverse/logrx", ref = "dev")

# Name: ADSL
#
# Label: Subject Level Analysis Dataset
#
# Input: dm, ex, ds
library(admiral)
library(admiral.test) # Contains example datasets from the CDISC pilot project
library(dplyr)
library(lubridate)
library(stringr)
library(diffdf)

# Load source datasets ----

# Use e.g. haven::read_sas to read in .sas7bdat, or other suitable functions
# as needed and assign to the variables below.
# For illustration purposes read in admiral test data

data("admiral_dm")
data("admiral_ds")
data("admiral_ex")
data("admiral_ae")
data("admiral_lb")

dm <- admiral_dm
ds <- admiral_ds
ex <- admiral_ex
ae <- admiral_ae
lb <- admiral_lb

# When SAS datasets are imported into R using haven::read_sas(), missing
# character values from SAS appear as "" characters in R, instead of appearing
# as NA values. Further details can be obtained via the following link:
# https://pharmaverse.github.io/admiral/cran-release/articles/admiral.html#handling-of-missing-values

dm <- convert_blanks_to_na(dm)
ds <- convert_blanks_to_na(ds)
ex <- convert_blanks_to_na(ex)
ae <- convert_blanks_to_na(ae)
lb <- convert_blanks_to_na(lb)

# User defined functions ----

# Here are some examples of how you can create your own functions that
#  operates on vectors, which can be used in `mutate`.

# Grouping
format_racegr1 <- function(x) {
   case_when(
      x == "WHITE" ~ "White",
      x != "WHITE" ~ "Non-white",
      TRUE ~ "Missing"
   )
}

format_agegr1 <- function(x) {
   case_when(
      x < 18 ~ "<18",
      between(x, 18, 64) ~ "18-64",
      x > 64 ~ ">64",
      TRUE ~ "Missing"
   )
}

format_region1 <- function(x) {
   case_when(
      x %in% c("CAN", "USA") ~ "NA",
      !is.na(x) ~ "RoW",
      TRUE ~ "Missing"
   )
}

format_lddthgr1 <- function(x) {
   case_when(
      x <= 30 ~ "<= 30",
      x > 30 ~ "> 30",
      TRUE ~ NA_character_
   )
}

# EOSSTT mapping
format_eoxxstt <- function(x) {
   case_when(
      x %in% c("COMPLETED") ~ "COMPLETED",
      !(x %in% c("COMPLETED", "SCREEN FAILURE")) & !is.na(x) ~ "DISCONTINUED",
      x %in% c("SCREEN FAILURE") ~ NA_character_,
      TRUE ~ "ONGOING"
   )
}

# Derivations ----
# impute start and end time of exposure to first and last respectively, do not impute date
ex_ext <- ex %>%
   derive_vars_dtm(
      dtc = EXSTDTC,
      new_vars_prefix = "EXST"
   ) %>%
   derive_vars_dtm(
      dtc = EXENDTC,
      new_vars_prefix = "EXEN",
      time_imputation = "last"
   )

adsl <- dm %>%
   ## derive treatment variables (TRT01P, TRT01A) ----
# See also the "Visit and Period Variables" vignette
# (https://pharmaverse.github.io/admiral/cran-release/articles/visits_periods.html#treatment_adsl)
mutate(TRT01P = ARM, TRT01A = ACTARM) %>%
   ## derive treatment start date (TRTSDTM) ----
derive_vars_merged(
   dataset_add = ex_ext,
   filter_add = (EXDOSE > 0 |
                    (EXDOSE == 0 &
                        str_detect(EXTRT, "PLACEBO"))) &
      !is.na(EXSTDTM),
   new_vars = vars(TRTSDTM = EXSTDTM, TRTSTMF = EXSTTMF),
   order = vars(EXSTDTM, EXSEQ),
   mode = "first",
   by_vars = vars(STUDYID, USUBJID)
) %>%
   ## derive treatment end date (TRTEDTM) ----
derive_vars_merged(
   dataset_add = ex_ext,
   filter_add = (EXDOSE > 0 |
                    (EXDOSE == 0 &
                        str_detect(EXTRT, "PLACEBO"))) & !is.na(EXENDTM),
   new_vars = vars(TRTEDTM = EXENDTM, TRTETMF = EXENTMF),
   order = vars(EXENDTM, EXSEQ),
   mode = "last",
   by_vars = vars(STUDYID, USUBJID)
) %>%
   ## Derive treatment end/start date TRTSDT/TRTEDT ----
derive_vars_dtm_to_dt(source_vars = vars(TRTSDTM, TRTEDTM)) %>%
   ## derive treatment duration (TRTDURD) ----
derive_var_trtdurd()

## Disposition dates, status ----
# convert character date to numeric date without imputation
ds_ext <- derive_vars_dt(
   ds,
   dtc = DSSTDTC,
   new_vars_prefix = "DSST"
)

# Screen fail date
adsl <- adsl %>%
   derive_vars_merged(
      dataset_add = ds_ext,
      by_vars = vars(STUDYID, USUBJID),
      new_vars = vars(SCRFDT = DSSTDT),
      filter_add = DSCAT == "DISPOSITION EVENT" & DSDECOD == "SCREEN FAILURE"
   ) %>%
   derive_vars_merged(
      dataset_add = ds_ext,
      by_vars = vars(STUDYID, USUBJID),
      new_vars = vars(EOSDT = DSSTDT),
      filter_add = DSCAT == "DISPOSITION EVENT" & DSDECOD != "SCREEN FAILURE"
   ) %>%
   # EOS status
   derive_var_disposition_status(
      dataset_ds = ds_ext,
      new_var = EOSSTT,
      status_var = DSDECOD,
      format_new_var = format_eoxxstt,
      filter_ds = DSCAT == "DISPOSITION EVENT"
   ) %>%
   # Last retrieval date
   derive_vars_merged(
      dataset_add = ds_ext,
      by_vars = vars(STUDYID, USUBJID),
      new_vars = vars(FRVDT = DSSTDT),
      filter_add = DSCAT == "OTHER EVENT" & DSDECOD == "FINAL RETRIEVAL VISIT"
   ) %>%
   # Derive Randomization Date
   derive_vars_merged(
      dataset_add = ds_ext,
      filter_add = DSDECOD == "RANDOMIZED",
      by_vars = vars(STUDYID, USUBJID),
      new_vars = vars(RANDDT = DSSTDT)
   ) %>%
   # Death date - impute partial date to first day/month
   derive_vars_dt(
      new_vars_prefix = "DTH",
      dtc = DTHDTC,
      highest_imputation = "M",
      date_imputation = "first"
   ) %>%
   # Relative Day of Death
   derive_vars_duration(
      new_var = DTHADY,
      start_date = TRTSDT,
      end_date = DTHDT
   ) %>%
   # Elapsed Days from Last Dose to Death
   derive_vars_duration(
      new_var = LDDTHELD,
      start_date = TRTEDT,
      end_date = DTHDT,
      add_one = FALSE
   )

## Last known alive date ----
ae_start_date <- date_source(
   dataset_name = "ae",
   date = AESTDT
)
ae_end_date <- date_source(
   dataset_name = "ae",
   date = AEENDT
)
lb_date <- date_source(
   dataset_name = "lb",
   date = LBDT,
   filter = !is.na(LBDT)
)
trt_end_date <- date_source(
   dataset_name = "adsl",
   date = TRTEDT
)

# impute AE start and end date to first
ae_ext <- ae %>%
   derive_vars_dt(
      dtc = AESTDTC,
      new_vars_prefix = "AEST",
      highest_imputation = "M"
   ) %>%
   derive_vars_dt(
      dtc = AEENDTC,
      new_vars_prefix = "AEEN",
      highest_imputation = "M"
   )

# impute LB date to first
lb_ext <- derive_vars_dt(
   lb,
   dtc = LBDTC,
   new_vars_prefix = "LB",
   highest_imputation = "M"
)

adsl <- adsl %>%
   derive_var_extreme_dt(
      new_var = LSTALVDT,
      ae_start_date, ae_end_date, lb_date, trt_end_date,
      source_datasets = list(ae = ae_ext, lb = lb_ext, adsl = adsl),
      mode = "last"
   ) %>%
   derive_var_merged_exist_flag(
      dataset_add = ex,
      by_vars = vars(STUDYID, USUBJID),
      new_var = SAFFL,
      condition = (EXDOSE > 0 | (EXDOSE == 0 & str_detect(EXTRT, "PLACEBO")))
   ) %>%
   ## Groupings and others variables ----
mutate(
   RACEGR1 = format_racegr1(RACE),
   AGEGR1 = format_agegr1(AGE),
   REGION1 = format_region1(COUNTRY),
   LDDTHGR1 = format_lddthgr1(LDDTHELD),
   DTH30FL = if_else(LDDTHGR1 == "<= 30", "Y", NA_character_),
   DTHA30FL = if_else(LDDTHGR1 == "> 30", "Y", NA_character_),
   DTHB30FL = if_else(DTHDT <= TRTSDT + 30, "Y", NA_character_),
   DOMAIN = NULL
)


# Save output ----

dir <- tempdir() # Change to whichever directory you want to save the dataset in
saveRDS(adsl, file = file.path(dir, "adsl.rds"), compress = "bzip2")

adsl %>% slice(1:3) %>% glimpse()

# We create a mock QC/Double Programming  dataset to showcase diffdf and the use of
# the package for capturing warnings for differences in the datasets

# Variable removed.
adsl_qc <- adsl %>% select(-ETHNIC)

# Simple comparision between objects.  Check the file for what was captured.
adsl_lst <- diffdf(adsl, adsl_qc, file = "adsl.lst")

message("Please check for this important message!!")

# ---- Save output ----

dir <- tempdir() # Change to whichever directory you want to save the dataset in
save(adsl, file = file.path(dir, "adsl.rda"), compress = "bzip2")