TransWikia.com

Interactive join in r based on different variables

Stack Overflow Asked on November 4, 2021

I have two data frames as follows:

 # Primary table: one row per `id`.
 # The second column is spelled `identifer` in this version of the data.
 df <- data.frame(
   id = c("1-1", "2-2", "3-3", "4-4", "5-5", "6-6"),
   identifer = c(1, 2, 3, 4, 5, 6),
   key = c("A", "B", "C", "D", "E", "F"),
   product = c(
     "productA", "productB", "productC",
     "productD", "productE", "productF"
   ),
   ingredient = c(
     "ingredientA", "ingredientB", "ingredientC",
     "ingredientD", "ingredientE", "ingredientF"
   ),
   DF = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule")
 )

 # Table to be joined onto `df`: identifer 2 occurs twice and 5 is absent.
 df_2 <- data.frame(
   identifer = c(1, 2, 2, 3, 4, 6),
   key = c("A", "B", "B", "C", "D", "F"),
   product = c(
     "productA", "productB", "productB",
     "productCC", "productDD", "productFF"
   ),
   ingredient = c(
     "ingredientA", "ingredientBB", "ingredientB",
     "ingredientC", "ingredientDD", "ingredeintFF"
   ),
   DF = c("Tablet", "Powder", "Powder", "Suspension", "injection", "tablet"),
   Route = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", "oral")
 )

I want to join these two datasets first on the following variables + create a new column called “match” that describes the join:

1) identifier,key, product, ingredient,DF
match="identifier,key, product, ingredient,DF"

Then, I want to join the REMAINING rows on these variables:

2)identifier, key, product, DF
match="identifier,key, product,DF"

Then I want to join the rows still remaining after step 2 on these variables, and so on and so forth:

3) identifier, key, ingredient, DF
4) identifier, key, DF
5) identifier, key, product, ingredient
6) identifier, key, product
7) identifier, key, ingredient
8) identifier, key

And I want to return the rows that do not have a match as well. I know how to do this stepwise but I’m wondering if there is an easier way to do this?

this is the expected output:

# Expected result: one row per row of `df`, with the `df` values suffixed
# "_1", the joined `df_2` values suffixed "_2" (NA where nothing joined),
# and `Match` naming the columns the join was made on.
df_out <- data.frame(
  identifer = c(1, 2, 3, 4, 5, 6),
  key = c("A", "B", "C", "D", "E", "F"),
  product_1 = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient_1 = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF_1 = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule"),
  product_2 = c("productA", "productB", "productCC", "productDD", NA, "productFF"),
  ingredient_2 = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientDD", NA, "ingredeintFF"
  ),
  DF_2 = c("Tablet", "Powder", "Suspension", "injection", NA, "tablet"),
  Route_2 = c("ORAL", "INHALATION", "topical", "injecatable", NA, "oral"),
  Match = c(
    "identifer+key+product+ingredient+DF",
    "identifier+key+product+ingredient+DF",
    "identifier+key+ingredient+DF",
    "identifer+key",
    "None",
    "identifer+key+product+ingredient"
  )
)

2 Answers

Here is an option using data.table:

library(data.table)
setDT(df)
setDT(df_2)

# Extra join columns for each tier, listed from strictest to loosest. Every
# tier is joined together with `identifier` and `key`; the trailing c() (NULL)
# is the keys-only tier.
keyord <- list(
    c("product", "ingredient", "DF"),
    c("product", "DF"),
    c("ingredient", "DF"),
    "DF",
    c("product", "ingredient"),
    "product",
    "ingredient",
    c()
)

# Columns copied over from df_2; they are stored in df with a "_2" suffix.
cols <- c("product", "ingredient", "DF", "Route")
# Rows that no tier matches keep Match = NA.
df[, Match := NA_character_]

# Walk the tiers from loosest to strictest. Each update join overwrites the
# "_2" values, `check` and `Match` of the rows it hits, so what remains at the
# end comes from the strictest tier that matched the row. Looping the other
# way round would leave the "_2" values of the loosest join next to the
# Match label of the strictest one.
for (v in rev(keyord)) {
    k <- c("identifier", "key", v)
    df[df_2, on = k, c(paste0(cols, "_2"), "check", "Match") :=
        c(mget(paste0("i.", cols)), list(TRUE, toString(k)))]
}
# Suffix df's own columns with "_1"; "Route" does not exist in df, hence
# skip_absent.
setnames(df, cols, paste0(cols, "_1"), skip_absent=TRUE)

output:

    id identifier key product_1 ingredient_1       DF_1                                    Match product_2 ingredient_2       DF_2     Route_2 check
1: 1-1          1   A  productA  ingredientA     Tablet identifier, key, product, ingredient, DF  productA  ingredientA     Tablet        ORAL  TRUE
2: 2-2          2   B  productB  ingredientB     Powder identifier, key, product, ingredient, DF  productB  ingredientB     Powder  INHALATION  TRUE
3: 3-3          3   C  productC  ingredientC Suspension          identifier, key, ingredient, DF productCC  ingredientC Suspension     topical  TRUE
4: 4-4          4   D  productD  ingredientD     System                          identifier, key productDD ingredientDD  injection injecatable  TRUE
5: 5-5          5   E  productE  ingredientE    Capsule                                     <NA>      <NA>         <NA>       <NA>        <NA>    NA
6: 6-6          6   F  productF  ingredientF    Capsule     identifier, key, product, ingredient  productF  ingredientF     tablet        oral  TRUE

data after fixing some typos in OP:

# Question data with the column name spelled `identifier` and the
# product/ingredient values of the last df_2 row aligned with df.
df <- data.frame(
  id = c("1-1", "2-2", "3-3", "4-4", "5-5", "6-6"),
  identifier = c(1, 2, 3, 4, 5, 6),
  key = c("A", "B", "C", "D", "E", "F"),
  product = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule")
)

# identifier 2 occurs twice and identifier 5 has no row here.
df_2 <- data.frame(
  identifier = c(1, 2, 2, 3, 4, 6),
  key = c("A", "B", "B", "C", "D", "F"),
  product = c(
    "productA", "productB", "productB",
    "productCC", "productDD", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientBB", "ingredientB",
    "ingredientC", "ingredientDD", "ingredientF"
  ),
  DF = c("Tablet", "Powder", "Powder", "Suspension", "injection", "tablet"),
  Route = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", "oral")
)

edit for multiple matches:

# Variant of df_2 with a second candidate for identifier 4 / key "D"
# ("productDd", with NA ingredient, DF and Route).
df_2 <- data.frame( identifier=c(1,2,2,3,4,4,6), key=c("A","B","B","C","D","D","F"), product=c("productA","productB","productB","productCC","productDD","productDd","productF"), ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD",NA,"ingredientF"), DF=c("Tablet","Powder","Powder","Suspension","injection",NA,"tablet"), Route=c("ORAL","INHALATION","INHALATION","topical","injecatable",NA,"oral") )
setDT(df_2)
# Reset the bookkeeping columns: no match label, and no row marked as matched.
# NOTE(review): the joins below use df's un-suffixed product/ingredient/DF
# columns, so this presumably starts from a fresh df rather than one whose
# columns were already renamed to "*_1" -- confirm.
df[, c("Match", "check") := .(NA_character_, FALSE)]

# Every non-key column that appears in some key set of `keyord`.
ocols <- unique(unlist(keyord))
# For each key set in turn, join the still-unmatched rows of df to df_2 and
# stack the per-key-set results into one table.
rbindlist(lapply(keyord, function(v) {
    k <- c("identifier", "key", v)
    # df_2 is x and the unmatched part of df is i, so "i." columns are df's
    # values (renamed to "*_1") and "x." columns are df_2's (renamed to
    # "*_2"). nomatch=0L drops df rows without a partner; a df row with
    # several partners yields one output row per partner.
    a <- df_2[df[(!check)], on=k, nomatch=0L, c(.(id=id),
        setNames(mget(paste0("i.", ocols)), paste0(ocols, "_1")), 
        setNames(mget(paste0("x.", c(ocols, "Route"))), paste0(c(ocols, "Route"), "_2"))) 
    ]
    # Flag the rows just matched so that the following key sets skip them.
    df[id %chin% a$id, check := TRUE]
    a
}), use.names=TRUE)

output:

    id product_1 ingredient_1       DF_1 product_2 ingredient_2       DF_2     Route_2
1: 1-1  productA  ingredientA     Tablet  productA  ingredientA     Tablet        ORAL
2: 2-2  productB  ingredientB     Powder  productB  ingredientB     Powder  INHALATION
3: 3-3  productC  ingredientC Suspension productCC  ingredientC Suspension     topical
4: 6-6  productF  ingredientF    Capsule  productF  ingredientF     tablet        oral
5: 4-4  productD  ingredientD     System productDD ingredientDD  injection injecatable
6: 4-4  productD  ingredientD     System productDd         <NA>       <NA>        <NA>

Answered by chinsoon12 on November 4, 2021

Here is a solution that might feel slightly over-engineered but achieves the expected outcome:

library(dplyr)
library(purrr)
library(stringr)

# Label every row of a joined table with the columns on which its two sides
# agree.
#
# data: data frame that holds, for each name in `cols`, a `<name>_1` and a
#       `<name>_2` column.
# cols: character vector of column stems to compare.
# keys: character vector of join-key names, used as the label prefix.
#
# Returns a character vector with one label per row: the keys joined by "+"
# (or "None" when the two sides have a different number of missing values
# across `cols`), followed by every stem whose `_1` and `_2` values are equal,
# e.g. "identifer+key+product+DF".
get_match <- function(data, cols, keys) {
  left <- paste0(cols, "_1")
  right <- paste0(cols, "_2")

  # A different number of NAs on the two sides is taken as the sign that the
  # keys found no partner row.
  keys_hit <- rowSums(is.na(data[left])) == rowSums(is.na(data[right]))
  prefix <- ifelse(keys_hit, paste(keys, collapse = "+"), "None")

  # One logical vector per stem: TRUE where both values are present and equal.
  agree <- lapply(seq_along(cols), function(j) {
    a <- as.character(data[[left[j]]])
    b <- as.character(data[[right[j]]])
    !is.na(a) & !is.na(b) & a == b
  })

  # Assemble the agreeing stems row by row. Pasting the stems and deleting
  # "NA" with a regex afterwards would also delete the separators around an
  # interior NA ("product+NA+DF" -> "productDF").
  matched <- vapply(seq_len(nrow(data)), function(i) {
    hit <- vapply(agree, function(flag) flag[[i]], logical(1))
    paste(cols[hit], collapse = "+")
  }, character(1))

  # Drop the dangling "+" left when no stem agrees. The backslash has to be
  # doubled: "\+" is not a valid escape in an R string literal.
  sub("\\+$", "", paste(prefix, matched, sep = "+"))
}

# Join on the two key columns, label each candidate pair with the columns it
# agrees on, then keep the best-scoring pair(s) for every `id`.
df_out <- left_join(df, df_2, by = c("identifer", "key"), suffix = c("_1", "_2")) %>%
  mutate(
    Match = get_match(., cols = c("product", "ingredient", "DF"), keys = c("identifer", "key")),
    # Number of "+" separators in the label. The backslash has to be doubled:
    # "\+" is not a valid escape in an R string literal.
    match_strength = str_count(Match, "\\+")
  ) %>%
  group_by(id) %>%
  filter(match_strength == max(match_strength, na.rm = TRUE)) %>%
  # Return a plain (ungrouped) table so later verbs are not applied per id.
  ungroup()

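dplyr::left_join() keeps only one copy of the `by` key columns, so there are no `_1`/`_2` versions of the keys to compare. The only way I found to tell whether the keys matched was to compare the number of missing values in the `_1` columns with the number in the `_2` columns. I could have used the keep=TRUE option instead and removed/renamed the extra key columns afterwards, though...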

Answered by Dan Chaltiel on November 4, 2021

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP