Skip to contents

assign_datetime() maps one or more variables with date/time components in a raw dataset to a target SDTM variable following the ISO8601 format.

Usage

assign_datetime(
  tgt_dat = NULL,
  tgt_var,
  raw_dat,
  raw_var,
  raw_fmt,
  raw_unk = c("UN", "UNK"),
  id_vars = oak_id_vars(),
  .warn = TRUE
)

Arguments

tgt_dat

Target dataset: a data frame to be merged against raw_dat by the variables indicated in id_vars. This parameter is optional; see the Value section for how the output changes depending on this argument's value.

tgt_var

The target SDTM variable: a single string indicating the name of the variable to be derived.

raw_dat

The raw dataset (dataframe); must include the variables passed in id_vars and raw_var.

raw_var

The raw variable(s): a character vector indicating the name(s) of the raw variable(s) in raw_dat with date or time components to be parsed into an ISO8601 format variable in tgt_var.

raw_fmt

A date/time parsing format. Either a character vector or a list of character vectors. If a character vector is passed, then each element is taken as the parsing format for the variable in the same position in raw_var. If a list is provided, then each element must be a character vector of formats: the first vector of formats is used for parsing the first variable in raw_var, and so on.

raw_unk

A character vector of string literals to be regarded as missing values during parsing.

id_vars

Key variables to be used in the join between the raw dataset (raw_dat) and the target data set (tgt_dat).

.warn

Whether to warn about parsing failures.

Value

The returned data set depends on the value of tgt_dat:

  • If no target dataset is supplied, meaning that tgt_dat defaults to NULL, then the returned data set is raw_dat, restricted to the variables indicated in id_vars, plus one new column: the derived variable, as indicated in tgt_var.

  • If the target dataset is provided, then it is merged with the raw data set raw_dat by the variables indicated in id_vars, with a new column: the derived variable, as indicated in tgt_var.

Examples

# `md1`: an example raw data set.
md1 <-
  tibble::tribble(
    ~oak_id, ~raw_source, ~patient_number, ~MDBDR,        ~MDEDR,        ~MDETM,
    1L,      "MD1",       375,             NA,            NA,            NA,
    2L,      "MD1",       375,             "15-Sep-20",   NA,            NA,
    3L,      "MD1",       376,             "17-Feb-21",   "17-Feb-21",   NA,
    4L,      "MD1",       377,             "4-Oct-20",    NA,            NA,
    5L,      "MD1",       377,             "20-Jan-20",   "20-Jan-20",   "10:00:00",
    6L,      "MD1",       377,             "UN-UNK-2019", "UN-UNK-2019", NA,
    7L,      "MD1",       377,             "20-UNK-2019", "20-UNK-2019", NA,
    8L,      "MD1",       378,             "UN-UNK-2020", "UN-UNK-2020", NA,
    9L,      "MD1",       378,             "26-Jan-20",   "26-Jan-20",   "07:00:00",
    10L,     "MD1",       378,             "28-Jan-20",   "1-Feb-20",    NA,
    11L,     "MD1",       378,             "12-Feb-20",   "18-Feb-20",   NA,
    12L,     "MD1",       379,             "10-UNK-2020", "20-UNK-2020", NA,
    13L,     "MD1",       379,             NA,            NA,            NA,
    14L,     "MD1",       379,             NA,            "17-Feb-20",   NA
  )

# Using the raw data set `md1`, derive the variable CMSTDTC from MDBDR using
# the parsing format (`raw_fmt`) `"d-m-y"` (day-month-year), while allowing
# for the presence of special date component values (e.g. `"UN"` or `"UNK"`),
# indicating that these values are missing/unknown (unk).
cm1 <-
  assign_datetime(
    tgt_var = "CMSTDTC",
    raw_dat = md1,
    raw_var = "MDBDR",
    raw_fmt = "d-m-y",
    raw_unk = c("UN", "UNK")
  )

cm1
#> # A tibble: 14 × 4
#>    oak_id raw_source patient_number CMSTDTC   
#>     <int> <chr>               <dbl> <iso8601> 
#>  1      1 MD1                   375 NA        
#>  2      2 MD1                   375 2020-09-15
#>  3      3 MD1                   376 2021-02-17
#>  4      4 MD1                   377 2020-10-04
#>  5      5 MD1                   377 2020-01-20
#>  6      6 MD1                   377 2019      
#>  7      7 MD1                   377 2019---20 
#>  8      8 MD1                   378 2020      
#>  9      9 MD1                   378 2020-01-26
#> 10     10 MD1                   378 2020-01-28
#> 11     11 MD1                   378 2020-02-12
#> 12     12 MD1                   379 2020---10 
#> 13     13 MD1                   379 NA        
#> 14     14 MD1                   379 NA        

# Inspect parsing failures associated with derivation of CMSTDTC.
problems(cm1$CMSTDTC)
#> # A tibble: 3 × 2
#>     ..i MDBDR
#>   <int> <chr>
#> 1     1 NA   
#> 2    13 NA   
#> 3    14 NA   

# `cm_inter`: an example target data set.
cm_inter <-
  tibble::tibble(
    oak_id = 1L:14L,
    raw_source = "MD1",
    patient_number = c(
      375, 375, 376, 377, 377, 377, 377, 378,
      378, 378, 378, 379, 379, 379
    ),
    CMTRT = c(
      "BABY ASPIRIN",
      "CORTISPORIN",
      "ASPIRIN",
      "DIPHENHYDRAMINE HCL",
      "PARCETEMOL",
      "VOMIKIND",
      "ZENFLOX OZ",
      "AMITRYPTYLINE",
      "BENADRYL",
      "DIPHENHYDRAMINE HYDROCHLORIDE",
      "TETRACYCLINE",
      "BENADRYL",
      "SOMINEX",
      "ZQUILL"
    ),
    CMINDC = c(
      "NA",
      "NAUSEA",
      "ANEMIA",
      "NAUSEA",
      "PYREXIA",
      "VOMITINGS",
      "DIARHHEA",
      "COLD",
      "FEVER",
      "LEG PAIN",
      "FEVER",
      "COLD",
      "COLD",
      "PAIN"
    )
  )

# Same derivation as above but now involving the merging with the target
# data set `cm_inter`.
cm2 <-
  assign_datetime(
    tgt_dat = cm_inter,
    tgt_var = "CMSTDTC",
    raw_dat = md1,
    raw_var = "MDBDR",
    raw_fmt = "d-m-y"
  )

cm2
#> # A tibble: 14 × 6
#>    oak_id raw_source patient_number CMTRT                         CMINDC CMSTDTC
#>     <int> <chr>               <dbl> <chr>                         <chr>  <iso86>
#>  1      1 MD1                   375 BABY ASPIRIN                  NA     NA    …
#>  2      2 MD1                   375 CORTISPORIN                   NAUSEA 2020-0…
#>  3      3 MD1                   376 ASPIRIN                       ANEMIA 2021-0…
#>  4      4 MD1                   377 DIPHENHYDRAMINE HCL           NAUSEA 2020-1…
#>  5      5 MD1                   377 PARCETEMOL                    PYREX… 2020-0…
#>  6      6 MD1                   377 VOMIKIND                      VOMIT… 2019  …
#>  7      7 MD1                   377 ZENFLOX OZ                    DIARH… 2019--…
#>  8      8 MD1                   378 AMITRYPTYLINE                 COLD   2020  …
#>  9      9 MD1                   378 BENADRYL                      FEVER  2020-0…
#> 10     10 MD1                   378 DIPHENHYDRAMINE HYDROCHLORIDE LEG P… 2020-0…
#> 11     11 MD1                   378 TETRACYCLINE                  FEVER  2020-0…
#> 12     12 MD1                   379 BENADRYL                      COLD   2020--…
#> 13     13 MD1                   379 SOMINEX                       COLD   NA    …
#> 14     14 MD1                   379 ZQUILL                        PAIN   NA    …

# Inspect parsing failures associated with derivation of CMSTDTC.
problems(cm2$CMSTDTC)
#> # A tibble: 3 × 2
#>     ..i MDBDR
#>   <int> <chr>
#> 1     1 NA   
#> 2    13 NA   
#> 3    14 NA   

# Derive CMSTDTC using both MDEDR and MDETM variables.
# Note that the format `"d-m-y"` is used for parsing MDEDR and `"H:M:S"` for
# MDETM (correspondence is by positional matching).
cm3 <-
  assign_datetime(
    tgt_var = "CMSTDTC",
    raw_dat = md1,
    raw_var = c("MDEDR", "MDETM"),
    raw_fmt = c("d-m-y", "H:M:S"),
    raw_unk = c("UN", "UNK")
  )

cm3
#> # A tibble: 14 × 4
#>    oak_id raw_source patient_number CMSTDTC            
#>     <int> <chr>               <dbl> <iso8601>          
#>  1      1 MD1                   375 NA                 
#>  2      2 MD1                   375 NA                 
#>  3      3 MD1                   376 2021-02-17         
#>  4      4 MD1                   377 NA                 
#>  5      5 MD1                   377 2020-01-20T10:00:00
#>  6      6 MD1                   377 2019               
#>  7      7 MD1                   377 2019---20          
#>  8      8 MD1                   378 2020               
#>  9      9 MD1                   378 2020-01-26T07:00:00
#> 10     10 MD1                   378 2020-02-01         
#> 11     11 MD1                   378 2020-02-18         
#> 12     12 MD1                   379 2020---20          
#> 13     13 MD1                   379 NA                 
#> 14     14 MD1                   379 2020-02-17         

# Inspect parsing failures associated with derivation of CMSTDTC.
problems(cm3$CMSTDTC)
#> # A tibble: 12 × 3
#>      ..i MDEDR       MDETM
#>    <int> <chr>       <chr>
#>  1     1 NA          NA   
#>  2     2 NA          NA   
#>  3     3 17-Feb-21   NA   
#>  4     4 NA          NA   
#>  5     6 UN-UNK-2019 NA   
#>  6     7 20-UNK-2019 NA   
#>  7     8 UN-UNK-2020 NA   
#>  8    10 1-Feb-20    NA   
#>  9    11 18-Feb-20   NA   
#> 10    12 20-UNK-2020 NA   
#> 11    13 NA          NA   
#> 12    14 17-Feb-20   NA