R Calculate sample weights based on two socio-demographic variables and aggregate results

Question

After conducting a short survey, I have collected the results in the form of a dataframe. This is a reproducible version of what the actual data frame looks like.

library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
               "1105","1106","1107","1108",
               "1109","1110","1111","1112",
               "1113","1114","1115","1116",
               "1117","1118","1119","1120",
               "1121","1122","1123","1124",
               "1125","1126","1127","1128",
               "1129","1130","1131","1132",
               "1133","1134","1135","1136",
               "1137","1138","1139","1140",
               "1141","1142","1143","1144",
               "1145","1146","1147","1148",
               "1149","1150","1151","1152",
               "1153","1154","1155","1156"),
          Country=c("US","UK","Canada","Mexico",
                    "India","US","Peru","China",
                    "US","UK","Canada","Mexico",
                    "Portugal","India","Portugal","Mexico",
                    "Peru","India","Canada","Mexico",
                    "India","UK","India","Canada",
                    "US","UK","China","India",
                    "US","Mexico","Canada","Mexico",
                    "Canada","China","Canada","Canada",
                    "China","China","India","Mexico",
                    "Portugal","Portugal","Portugal","Portugal",
                    "UK","UK","UK","Peru",
                    "Peru","Mexico","US","US",
                    "Peru","Mexico","Peru","Mexico"),
          Gender=c("Male","Male","Male","Female",
                    "Female","Female","Male","Female",
                    "Female","Female","Male","Female",
                    "Male","Male","Female","Female",
                    "Female","Male","Female","Female",
                    "Female","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Female","Male","Female","Female",
                    "Male","Male","Male","Female",
                    "Male","Male","Female","Female",
                    "Male","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Male","Female","Male","Female",
                    "Male","Male","Male","Male"),
          Age=c("<25","25-35","25-35","36-45",
                ">55",">55","25-35",">55",
                "<25","25-35","25-35","36-45",
                "25-35","25-35","25-35","36-45",
                ">55","36-45","46-55","36-45",
                ">55","46-55","25-35","46-55",
                "<25","46-55","25-35","46-55",
                "25-35","25-35","46-55","36-45",
                "<25","<25",">55","36-45",
                "36-45","46-55","<25","<25",
                "<25",">55","36-45","46-55",
                "<25",">55","36-45","46-55",
                "36-45",">55","36-45","46-55",
                "<25","46-55","<25","46-55"),
          Score_Q1=c(4,4,3,2,
                  1,1,4,2,
                  1,1,1,2,
                  2,1,4,3,
                  4,3,1,1,
                  1,2,1,1,
                  1,4,1,4,
                  3,4,3,3,
                  1,3,3,1,
                  1,1,2,1,
                  1,2,1,2,
                  1,1,1,1,
                  2,2,2,2,
                  1,2,3,4),
          Score_Q2=c(1,4,1,1,
                     1,2,1,1,
                     1,4,4,4,
                     2,1,1,3,
                     4,3,1,1,
                     1,3,3,3,
                     2,4,1,2,
                     4,4,4,4,
                     1,1,1,1,
                     1,2,3,4,
                     4,4,2,1,
                     1,2,3,2,
                     1,2,1,2,
                     4,3,2,1))

The survey is split into the following parts-

1) ID: A respodent ID

2) Country: Country of origin of respondent

3) Gender: The gender of the respondent

4) Age: The age of the respondent

5) Score_Q1: The satisfaction score for Q1, on a scale from 1 (Very satisfied) to 4(Very dissatisfied).

6) Score_Q2: The satisfaction score for Q2, on a scale from 1(Very satisfied) to 4(Very dissatisfied).

First, I convert the columns Age, Gender and Country to factors

#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)

Next, I check my data to see what the ratios by socio-demographic variables looks like –

I begin with Gender, and see what the sample ratios are for gender by Country

#1) Gender by Country: Sample Ratio
split_gender=df %>% select(Country,Gender) %>%
  group_by(Gender,Country) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  select(Country,Gender,n) %>%
  group_by(Country,add=TRUE) %>%
  spread(Country,n)

split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"

I do the same for Age by Country

#2) Age by Country: Sample Ratio
split_age=df %>% select(Country,Age) %>%
  group_by(Age,Country) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  select(Country,Age,n) %>%
  group_by(Country,add=TRUE) %>%
  spread(Country,n)

split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"

#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))

The above two steps give me two data frames – split_age_sample & split_gender_sample. These dataframes contain the sample ratios for age and gender by country for my 56 respondents.

My Objective: Calculating Sampling Weights Based on Gender & Age

In order to make my data frame more representative of reality, I would like to attribute weights to my respondents based on the official population ratios for age and gender by country.

Following some online research, these are the latest population ratios I found for the countries I surveyed.

#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
                                 Canada=c(0.4,0.6),
                                 China=c(0.3,0.7),
                                 India=c(0.3,0.7),
                                 Mexico=c(0.5,0.5),
                                 Peru=c(0.6,0.4),
                                 Portugal=c(0.5,0.5),
                                 UK=c(0.4,0.6),
                                 US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
                                 Canada=c(0.1,0.3,0.3,0.2,0.1),
                                 China=c(0.3,0.05,0.35,0.1,0.2),
                                 India=c(0.5,0.05,0.35,0.05,0.05),
                                 Mexico=c(0.2,0.3,0.2,0.1,0.2),
                                 Peru=c(0.1,0.3,0.2,0.2,0.2),
                                 Portugal=c(0.2,0.1,0.05,0.05,0.6),
                                 UK=c(0.2,0.3,0.1,0.3,0.1),
                                 US=c(0.2,0.3,0.1,0.3,0.1))

Desired Output

Based on my sample ratios and the offical population ratios for both age & gender, I’d like to attribute weights to my respondents, in a separate column called weights.

Currently I am unable to figure out how to do this calculation.

Then, once the weights are calculated, I’d like to summarize the scores using the weights column. The aggregation would look something like this (except with the weights included in the calculation) –

#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
  select(-Age) %>%
  group_by(Country,Gender) %>%
  filter(Country=='UK') %>%
  summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
            Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))

I’d really appreciate any help I can get on the weight calculation and its usage in the aggregation step that follows.

r sample weighted data

Knio · Answer

You can use ?melt to convert your official ratios to a long format, and then ?merge with the original dataframe to get weights. summarizing is easily done with data.table

#Convert to long dataframe format

split_age_official <- melt(split_age_official, variable.name="Country", value.name="popratio.Age")
split_gender_official <- melt(split_gender_official , variable.name="Country", value.name="popratio.Gender")

# merge with original dataframe

df <- merge(df, split_age_official, by=c("Country", "Age"))
df <- merge(df, split_gender_official, by=c("Country", "Gender"))

# Calculated weighted scores
df$Weighted_Score_Q1.Gender <- df$Score_Q1 * df$popratio.Gender
df$Weighted_Score_Q1.Age <- df$Score_Q1 * df$popratio.Age

df$Weighted_Score_Q2.Gender <- df$Score_Q2 * df$popratio.Gender
df$Weighted_Score_Q2.Age <- df$Score_Q2 * df$popratio.Age

I usually prefer datatable for summarization, but you can use dplyr as well of course.

A data.table summarization would be something like:

library(data.table) # install.packages("data.table")
df <- data.table(df)

Weighted_Gender <- df[,list(Q1_KPI=sum(Weighted_Score_Q1.Gender, na.rm=TRUE), Q2_KPI=sum(Weighted_Score_Q2.Gender, na.rm=TRUE)), by=c("Country", "Gender")]
Weighted_Age <- df[,list(Q1_KPI=sum(Weighted_Score_Q1.Age, na.rm=TRUE), Q2_KPI=sum(Weighted_Score_Q2.Age, na.rm=TRUE)), by=c("Country", "Age")]

You can subset by the country you want, for example:

Weighted_Gender[Country=="UK"]

R Calculate sample weights based on two socio-demographic variables and aggregate results

One Answer

Add your own answers!

Ask a Question