This check looks for duplicate patient demographics records in DM: repeated USUBJID values, and the same patient number appearing under different USUBJIDs.

check_dm_usubjid_dup(DM)

Arguments

DM

Demographics SDTM dataset with variable USUBJID

Value

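Boolean value indicating whether the check passed (TRUE) or failed (FALSE). If the check failed, a 'msg' attribute describes the problem and, when duplicate records are found, a 'data' attribute lists the flagged USUBJIDs.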

Author

Madeleine Ma, Stella Banjo (HackR 2021)

Examples


## duplicates and same patient number across sites for 3-part USUBJID
DM <- data.frame(USUBJID = c("GO12345-00000-1000",
                             "GO12345-11111-1000",
                             "GO12345-00000-1000",
                             "GO12345-00000-1001"),
      stringsAsFactors = FALSE)

check_dm_usubjid_dup(DM)
#> [1] FALSE
#> attr(,"msg")
#> [1] "Duplicate USUBJID and/or same Patient number across different USUBJIDs"
#> attr(,"data")
#> # A tibble: 3 × 2
#>   USUBJID            FLAG                                        
#>   <chr>              <chr>                                       
#> 1 GO12345-00000-1000 Same Patient Number Across Different USUBJID
#> 2 GO12345-11111-1000 Same Patient Number Across Different USUBJID
#> 3 GO12345-00000-1000 Duplicate USUBJID                           


## no duplicate IDs in the dataframe for 3-part USUBJID
DM2 <- data.frame(USUBJID = c("GO12345-00000-1000",
                              "GO12345-11111-1001",
                              "GO12345-11111-1002"),
             stringsAsFactors = FALSE)

check_dm_usubjid_dup(DM2)
#> [1] TRUE


## duplicates for 2-part USUBJID
DM3 <- data.frame(USUBJID = c("GO12345-1000",
                              "GO12345-1000"),
            stringsAsFactors = FALSE)

check_dm_usubjid_dup(DM3)
#> [1] FALSE
#> attr(,"msg")
#> [1] "Duplicate USUBJID and/or same Patient number across different USUBJIDs"
#> attr(,"data")
#> # A tibble: 1 × 2
#>   USUBJID      FLAG             
#>   <chr>        <chr>            
#> 1 GO12345-1000 Duplicate USUBJID


##  no duplicate IDs in the dataframe for 2-part USUBJID
DM4 <- data.frame(USUBJID = c("GO12345-1000",
                              "GO12345-1001",
                              "GO12345-1002"),
             stringAsFactors = FALSE)

check_dm_usubjid_dup(DM4)
#> [1] TRUE

## duplicate USUBJID in a dataframe with additional variables whose values differ between the duplicated rows
DM5 <- data.frame(USUBJID = c("GO12345-1000",
                              "GO12345-1000"),
                  SEX = c("M", "F"),
                  AGE = c(18, 60),
         stringsAsFactors = FALSE)

check_dm_usubjid_dup(DM5)
#> [1] FALSE
#> attr(,"msg")
#> [1] "Duplicate USUBJID and/or same Patient number across different USUBJIDs"
#> attr(,"data")
#> # A tibble: 1 × 2
#>   USUBJID      FLAG             
#>   <chr>        <chr>            
#> 1 GO12345-1000 Duplicate USUBJID

## dataframe in which USUBJID is not present
DM6 <- data.frame(
         STUDYID = c("GO12345"),
         SEX = c("M"),
         AGE = c(72),
     stringsAsFactors = FALSE)

check_dm_usubjid_dup(DM6)
#> [1] FALSE
#> attr(,"msg")
#> [1] "DM is missing the variable: USUBJID"