Interactive join in r based on different variables

Question

I have two data frames as follows:

# First table: one row per product.
# The key column is spelled `identifer` (sic); later code joins on that name.
df <- data.frame(
  id = paste(1:6, 1:6, sep = "-"),
  identifer = as.numeric(1:6),
  key = LETTERS[1:6],
  product = paste0("product", LETTERS[1:6]),
  ingredient = paste0("ingredient", LETTERS[1:6]),
  DF = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule")
)

# Second table: candidate rows to match against `df`.
# identifer 2 / key "B" appears twice and identifer 5 is absent.
df_2 <- data.frame(
  identifer = c(1, 2, 2, 3, 4, 6),
  key = c("A", "B", "B", "C", "D", "F"),
  product = c(
    "productA", "productB", "productB",
    "productCC", "productDD", "productFF"
  ),
  ingredient = c(
    "ingredientA", "ingredientBB", "ingredientB",
    "ingredientC", "ingredientDD", "ingredeintFF"
  ),
  DF = c("Tablet", "Powder", "Powder", "Suspension", "injection", "tablet"),
  Route = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", "oral")
)

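I want to join these two datasets in several passes. The first pass joins on the following variables and creates a new column called "match" that describes the join (note that the identifier column is spelled `identifer` in the data above):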

1) identifier,key, product, ingredient,DF
match="identifier,key, product, ingredient,DF"

Then, I want to join the REMAINING rows on these variables:

2)identifier, key, product, DF
match="identifier,key, product,DF"

Then I want to join the rows still remaining after step 2 on the next set of variables, and so on and so forth:

3) identifier, key, ingredient, DF
4) identifier, key, DF
5) identifier, key, product, ingredient
6) identifier, key, product
7) identifier, key, ingredient
8) identifier, key

And I want to return the rows that do not have a match as well. I know how to do this stepwise but I'm wondering if there is an easier way to do this?

This is the expected output:

# Expected result: every row of `df`, with `_1` columns taken from `df`,
# `_2` columns taken from the matched `df_2` row (NA when nothing matched),
# and `Match` naming the variables the row was matched on.
df_out <- data.frame(
  identifer = c(1, 2, 3, 4, 5, 6),
  key = c("A", "B", "C", "D", "E", "F"),
  product_1 = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient_1 = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF_1 = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule"),
  product_2 = c("productA", "productB", "productCC", "productDD", NA, "productFF"),
  ingredient_2 = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientDD", NA, "ingredeintFF"
  ),
  DF_2 = c("Tablet", "Powder", "Suspension", "injection", NA, "tablet"),
  Route_2 = c("ORAL", "INHALATION", "topical", "injecatable", NA, "oral"),
  Match = c(
    "identifer+key+product+ingredient+DF",
    "identifier+key+product+ingredient+DF",
    "identifier+key+ingredient+DF",
    "identifer+key",
    "None",
    "identifer+key+product+ingredient"
  )
)

chinsoon12 · Answer

# Here is an option using data.table:
library(data.table)
setDT(df)
setDT(df_2)

# Extra join columns (on top of identifier and key) for each pass, in the
# order the passes are tried. The final NULL entry joins on identifier and
# key alone.
keyord <- list(
  c("product", "ingredient", "DF"),
  c("product", "DF"),
  c("ingredient", "DF"),
  "DF",
  c("product", "ingredient"),
  "product",
  "ingredient",
  NULL
)
cols <- c("product", "ingredient", "DF", "Route")
df[, Match := NA_character_]
for (v in keyord) {
  k <- c("identifier", "key", v)
  # Update join: for rows of df that match df_2 on k, copy df_2's columns
  # into the *_2 columns and set check to TRUE.
  # NOTE(review): this update is not restricted to rows that are still
  # unlabelled, so a later, looser pass can overwrite the *_2 values of a row
  # labelled earlier -- confirm this is acceptable when df_2 holds several
  # rows per identifier/key.
  df[
    df_2,
    on = k,
    c(paste0(cols, "_2"), "check") := c(mget(paste0("i.", cols)), .(TRUE))
  ]
  # Only rows without a label yet receive the label of the current pass.
  df[is.na(Match) & check, Match := toString(k)]
}
setnames(df, cols, paste0(cols, "_1"), skip_absent = TRUE)

# output:
#
#     id identifier key product_1 ingredient_1       DF_1                                    Match product_2 ingredient_2       DF_2     Route_2 check
# 1: 1-1          1   A  productA  ingredientA     Tablet identifier, key, product, ingredient, DF  productA  ingredientA     Tablet        ORAL  TRUE
# 2: 2-2          2   B  productB  ingredientB     Powder identifier, key, product, ingredient, DF  productB  ingredientB     Powder  INHALATION  TRUE
# 3: 3-3          3   C  productC  ingredientC Suspension          identifier, key, ingredient, DF productCC  ingredientC Suspension     topical  TRUE
# 4: 4-4          4   D  productD  ingredientD     System                          identifier, key productDD ingredientDD  injection injecatable  TRUE
# 5: 5-5          5   E  productE  ingredientE    Capsule                                     <NA>      <NA>         <NA>       <NA>        <NA>    NA
# 6: 6-6          6   F  productF  ingredientF    Capsule    identifier, key, product, ingredient  productF  ingredientF     tablet        oral  TRUE

# data after fixing some typos in OP:
df <- data.frame(
  id = c("1-1", "2-2", "3-3", "4-4", "5-5", "6-6"),
  identifier = c(1, 2, 3, 4, 5, 6),
  key = c("A", "B", "C", "D", "E", "F"),
  product = c(
    "productA", "productB", "productC",
    "productD", "productE", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientB", "ingredientC",
    "ingredientD", "ingredientE", "ingredientF"
  ),
  DF = c("Tablet", "Powder", "Suspension", "System", "Capsule", "Capsule")
)

df_2 <- data.frame(
  identifier = c(1, 2, 2, 3, 4, 6),
  key = c("A", "B", "B", "C", "D", "F"),
  product = c(
    "productA", "productB", "productB",
    "productCC", "productDD", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientBB", "ingredientB",
    "ingredientC", "ingredientDD", "ingredientF"
  ),
  DF = c("Tablet", "Powder", "Powder", "Suspension", "injection", "tablet"),
  Route = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", "oral")
)

# edit for multiple matches:
df_2 <- data.frame(
  identifier = c(1, 2, 2, 3, 4, 4, 6),
  key = c("A", "B", "B", "C", "D", "D", "F"),
  product = c(
    "productA", "productB", "productB", "productCC",
    "productDD", "productDd", "productF"
  ),
  ingredient = c(
    "ingredientA", "ingredientBB", "ingredientB", "ingredientC",
    "ingredientDD", NA, "ingredientF"
  ),
  DF = c("Tablet", "Powder", "Powder", "Suspension", "injection", NA, "tablet"),
  Route = c("ORAL", "INHALATION", "INHALATION", "topical", "injecatable", NA, "oral")
)
setDT(df_2)
df[, c("Match", "check") := .(NA_character_, FALSE)]
ocols <- unique(unlist(keyord))
rbindlist(
  lapply(keyord, function(v) {
    join_cols <- c("identifier", "key", v)
    # Inner join of df_2 with the rows of df not yet flagged; keeps every
    # matching df_2 row. i.* columns come from df, x.* columns from df_2.
    matched <- df_2[
      df[(!check)],
      on = join_cols,
      nomatch = 0L,
      c(
        .(id = id),
        setNames(mget(paste0("i.", ocols)), paste0(ocols, "_1")),
        setNames(
          mget(paste0("x.", c(ocols, "Route"))),
          paste0(c(ocols, "Route"), "_2")
        )
      )
    ]
    # Flag the ids just matched so later passes skip them.
    df[id %chin% matched$id, check := TRUE]
    matched
  }),
  use.names = TRUE
)

# output:
#
#     id product_1 ingredient_1       DF_1 product_2 ingredient_2       DF_2     Route_2
# 1: 1-1  productA  ingredientA     Tablet  productA  ingredientA     Tablet        ORAL
# 2: 2-2  productB  ingredientB     Powder  productB  ingredientB     Powder  INHALATION
# 3: 3-3  productC  ingredientC Suspension productCC  ingredientC Suspension     topical
# 4: 6-6  productF  ingredientF    Capsule  productF  ingredientF     tablet        oral
# 5: 4-4  productD  ingredientD     System productDD ingredientDD  injection injecatable
# 6: 4-4  productD  ingredientD     System productDd         <NA>       <NA>        <NA>

Dan Chaltiel · Answer

Here is a solution that might feel slightly over-engineered but achieves the expected outcome:

library(dplyr)
library(purrr)
library(stringr)

#' Describe, row by row, which variables two joined tables agree on
#'
#' @param data Data frame resulting from a join on `keys`. It must hold every
#'   column named in `cols` twice: once suffixed `_1` and once suffixed `_2`.
#' @param cols Character vector of un-suffixed column names to compare.
#' @param keys Character vector of join key names; used only to build the
#'   label, the key columns themselves are not read.
#' @return Character vector with one element per row of `data`. Each element
#'   starts with the `keys` joined by "+" (or "None" when the keys are deemed
#'   unmatched), followed by "+<col>" for every column of `cols` whose `_1`
#'   and `_2` values are equal, in the order given by `cols`.
get_match <- function(data, cols, keys) {
  # The key columns carry no suffix after the join, so they cannot be compared
  # directly. As a proxy, a row counts as matched on the keys when both sides
  # have the same number of missing values among `cols`.
  n_missing_1 <- rowSums(is.na(data[paste0(cols, "_1")]))
  n_missing_2 <- rowSums(is.na(data[paste0(cols, "_2")]))
  label <- rep("None", nrow(data))
  label[n_missing_1 == n_missing_2] <- paste(keys, collapse = "+")

  # Append each agreeing column name directly. Building "a+NA+c" and deleting
  # the "NA" afterwards would fuse the neighbours into "ac".
  for (col in cols) {
    same <- as.character(data[[paste0(col, "_1")]]) ==
      as.character(data[[paste0(col, "_2")]])
    hit <- !is.na(same) & same # a comparison involving NA is not a match
    label[hit] <- paste(label[hit], col, sep = "+")
  }

  label
}

# Join on the keys only, label every candidate pair with the variables it
# agrees on, then keep the strongest candidate(s) for each `id`.
df_out <- left_join(df, df_2, by = c("identifer", "key"), suffix = c("_1", "_2")) %>%
  mutate(
    Match = get_match(
      .,
      cols = c("product", "ingredient", "DF"),
      keys = c("identifer", "key")
    ),
    # Strength = number of "+" separators in the label. "+" is a regex
    # metacharacter, so it is escaped (written "\\+" inside an R string).
    match_strength = str_count(Match, "\\+")
  ) %>%
  group_by(id) %>%
  filter(match_strength == max(match_strength, na.rm = TRUE)) %>%
  # Drop the grouping so later verbs on df_out do not run per id by accident.
  ungroup()

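`dplyr::left_join()` keeps only a single copy of the `by` keys, so there are no `_1`/`_2` versions of them to compare. The only way I found to decide whether the keys matched was to check whether the `_1` columns and the `_2` columns have the same number of missing values. I could have used the `keep = TRUE` option and removed/renamed the duplicated key columns afterwards, though...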

Interactive join in r based on different variables

2 Answers

Add your own answers!

Ask a Question