Cross Validated Asked on January 3, 2022
After conducting a short survey, I have collected the results in the form of a dataframe. This is a reproducible version of what the actual data frame looks like.
library(dplyr)
library(tidyr)
# Reproducible stand-in for the survey results: one row per respondent,
# 56 respondents in total. All demographic columns start out as character
# vectors; the two score columns are numeric.
# Respondent ID
df=data.frame(ID=c("1101","1102","1103","1104",
"1105","1106","1107","1108",
"1109","1110","1111","1112",
"1113","1114","1115","1116",
"1117","1118","1119","1120",
"1121","1122","1123","1124",
"1125","1126","1127","1128",
"1129","1130","1131","1132",
"1133","1134","1135","1136",
"1137","1138","1139","1140",
"1141","1142","1143","1144",
"1145","1146","1147","1148",
"1149","1150","1151","1152",
"1153","1154","1155","1156"),
# Country of origin of the respondent
Country=c("US","UK","Canada","Mexico",
"India","US","Peru","China",
"US","UK","Canada","Mexico",
"Portugal","India","Portugal","Mexico",
"Peru","India","Canada","Mexico",
"India","UK","India","Canada",
"US","UK","China","India",
"US","Mexico","Canada","Mexico",
"Canada","China","Canada","Canada",
"China","China","India","Mexico",
"Portugal","Portugal","Portugal","Portugal",
"UK","UK","UK","Peru",
"Peru","Mexico","US","US",
"Peru","Mexico","Peru","Mexico"),
# Gender of the respondent
Gender=c("Male","Male","Male","Female",
"Female","Female","Male","Female",
"Female","Female","Male","Female",
"Male","Male","Female","Female",
"Female","Male","Female","Female",
"Female","Female","Male","Female",
"Male","Female","Male","Female",
"Female","Male","Female","Female",
"Male","Male","Male","Female",
"Male","Male","Female","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Male","Male","Male"),
# Age band of the respondent: "<25", "25-35", "36-45", "46-55" or ">55"
Age=c("<25","25-35","25-35","36-45",
">55",">55","25-35",">55",
"<25","25-35","25-35","36-45",
"25-35","25-35","25-35","36-45",
">55","36-45","46-55","36-45",
">55","46-55","25-35","46-55",
"<25","46-55","25-35","46-55",
"25-35","25-35","46-55","36-45",
"<25","<25",">55","36-45",
"36-45","46-55","<25","<25",
"<25",">55","36-45","46-55",
"<25",">55","36-45","46-55",
"36-45",">55","36-45","46-55",
"<25","46-55","<25","46-55"),
# Satisfaction score for Q1: 1 (very satisfied) to 4 (very dissatisfied)
Score_Q1=c(4,4,3,2,
1,1,4,2,
1,1,1,2,
2,1,4,3,
4,3,1,1,
1,2,1,1,
1,4,1,4,
3,4,3,3,
1,3,3,1,
1,1,2,1,
1,2,1,2,
1,1,1,1,
2,2,2,2,
1,2,3,4),
# Satisfaction score for Q2: 1 (very satisfied) to 4 (very dissatisfied)
Score_Q2=c(1,4,1,1,
1,2,1,1,
1,4,4,4,
2,1,1,3,
4,3,1,1,
1,3,3,3,
2,4,1,2,
4,4,4,4,
1,1,1,1,
1,2,3,4,
4,4,2,1,
1,2,3,2,
1,2,1,2,
4,3,2,1))
The survey is split into the following parts-
1) ID: A respondent ID
2) Country: Country of origin of respondent
3) Gender: The gender of the respondent
4) Age: The age of the respondent
5) Score_Q1: The satisfaction score for Q1, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
6) Score_Q2: The satisfaction score for Q2, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
First, I convert the columns Age, Gender and Country to factors:
# Recode the three demographic columns as factors in a single step
df <- transform(
  df,
  Country = as.factor(Country),
  Gender = as.factor(Gender),
  Age = as.factor(Age)
)
Next, I check my data to see what the ratios by socio-demographic variable look like. I begin with Gender and compute the sample ratios for gender by Country:
# 1) Gender by Country: sample ratio
# Cross-tabulate respondents (rows = Gender, columns = Country). table()
# reports a combination that never occurs as 0 rather than NA, so the
# column totals below are always defined.
split_gender <- table(df$Gender, df$Country)

# Divide every country column by its own total so each column sums to 1,
# then drop the table class to get an ordinary data frame of proportions.
split_gender_sample <- as.data.frame(
  unclass(prop.table(split_gender, margin = 2))
)

# Take the Gender labels from the table itself instead of typing them in by
# row position, and put them in the first column.
split_gender_sample <- data.frame(
  Gender = rownames(split_gender_sample),
  split_gender_sample,
  stringsAsFactors = FALSE
)
rownames(split_gender_sample) <- NULL
I do the same for Age by Country:
# 2) Age by Country: sample ratio
# Cross-tabulate respondents (rows = Age band, columns = Country). table()
# reports an age band with no respondents in a country as 0 rather than NA.
split_age <- table(df$Age, df$Country)

# Divide every country column by its own total so each column sums to 1,
# then drop the table class to get an ordinary data frame of proportions.
split_age_sample <- as.data.frame(
  unclass(prop.table(split_age, margin = 2))
)

# Take the Age labels from the table itself. The sort order of strings such
# as "<25" and "25-35" depends on the session's collation locale, so labels
# assigned by row position can end up on the wrong rows.
split_age_sample <- data.frame(
  Age = rownames(split_age_sample),
  split_age_sample,
  stringsAsFactors = FALSE
)
rownames(split_age_sample) <- NULL

# Clean up unwanted intermediates
rm(list = c("split_age", "split_gender"))
The above two steps give me two data frames – split_age_sample and split_gender_sample. These data frames contain the sample ratios for age and gender by country for my 56 respondents.
My Objective: Calculating Sampling Weights Based on Gender & Age
In order to make my data frame more representative of reality, I would like to attribute weights to my respondents based on the official population ratios for age and gender by country.
Following some online research, these are the latest population ratios I found for the countries I surveyed.
# Official population share of each gender, one column per country
split_gender_official <- data.frame(
  Gender   = c("Female", "Male"),
  Canada   = c(0.4, 0.6),
  China    = c(0.3, 0.7),
  India    = c(0.3, 0.7),
  Mexico   = c(0.5, 0.5),
  Peru     = c(0.6, 0.4),
  Portugal = c(0.5, 0.5),
  UK       = c(0.4, 0.6),
  US       = c(0.4, 0.6)
)

# Official population share of each age band, one column per country
split_age_official <- data.frame(
  Age      = c("<25", ">55", "25-35", "36-45", "46-55"),
  Canada   = c(0.1, 0.3, 0.3, 0.2, 0.1),
  China    = c(0.3, 0.05, 0.35, 0.1, 0.2),
  India    = c(0.5, 0.05, 0.35, 0.05, 0.05),
  Mexico   = c(0.2, 0.3, 0.2, 0.1, 0.2),
  Peru     = c(0.1, 0.3, 0.2, 0.2, 0.2),
  Portugal = c(0.2, 0.1, 0.05, 0.05, 0.6),
  UK       = c(0.2, 0.3, 0.1, 0.3, 0.1),
  US       = c(0.2, 0.3, 0.1, 0.3, 0.1)
)
Desired Output
Based on my sample ratios and the official population ratios for both age & gender, I’d like to attribute weights to my respondents, in a separate column called weights.
Currently I am unable to figure out how to do this calculation.
Then, once the weights are calculated, I’d like to summarize the scores using the weights column. The aggregation would look something like this (except with the weights included in the calculation) –
# Overall scores by Country & Gender, example UK: the KPI is the share of
# respondents in each group who answered 1 or 2. No weights are applied yet.
weighted_aggregated_scores_gender <- df %>%
  filter(Country == "UK") %>%
  select(-Age) %>%
  group_by(Country, Gender) %>%
  summarise(
    Q1_KPI = round(sum((Score_Q1 %in% c(1, 2)) / n()), 2),
    Q2_KPI = round(sum((Score_Q2 %in% c(1, 2)) / n()), 2)
  )
I’d really appreciate any help I can get on the weight calculation and its usage in the aggregation step that follows.
You can use ?melt to convert your official ratios to a long format, and then ?merge with the original dataframe to get weights. Summarizing is easily done with data.table.
# Convert the official ratios to long format: one row per (level, Country).
# tidyr is already attached at the top of the script, whereas no package
# providing melt() has been attached at this point. The label column is
# named explicitly instead of being guessed.
split_age_official <- as.data.frame(tidyr::pivot_longer(
  split_age_official,
  cols = -Age,
  names_to = "Country",
  values_to = "popratio.Age"
))
split_gender_official <- as.data.frame(tidyr::pivot_longer(
  split_gender_official,
  cols = -Gender,
  names_to = "Country",
  values_to = "popratio.Gender"
))

# Attach the population ratios to every respondent. merge() keeps only rows
# whose (Country, Age) / (Country, Gender) pair exists in the official table.
df <- merge(df, split_age_official, by = c("Country", "Age"))
df <- merge(df, split_gender_official, by = c("Country", "Gender"))

# Scores multiplied by the matching population ratio.
# NOTE(review): popratio.* are raw population shares, not post-stratification
# weights (population share / sample share) -- confirm this is the intended
# weighting before interpreting the sums below.
df$Weighted_Score_Q1.Gender <- df$Score_Q1 * df$popratio.Gender
df$Weighted_Score_Q1.Age <- df$Score_Q1 * df$popratio.Age
df$Weighted_Score_Q2.Gender <- df$Score_Q2 * df$popratio.Gender
df$Weighted_Score_Q2.Age <- df$Score_Q2 * df$popratio.Age
I usually prefer data.table for summarization, but you can of course use dplyr as well.
A data.table summarization would be something like:
library(data.table) # install.packages("data.table")
df <- data.table(df)

# Sum of the gender-weighted scores for each Country x Gender cell
Weighted_Gender <- df[
  ,
  .(
    Q1_KPI = sum(Weighted_Score_Q1.Gender, na.rm = TRUE),
    Q2_KPI = sum(Weighted_Score_Q2.Gender, na.rm = TRUE)
  ),
  by = c("Country", "Gender")
]

# Sum of the age-weighted scores for each Country x Age cell
Weighted_Age <- df[
  ,
  .(
    Q1_KPI = sum(Weighted_Score_Q1.Age, na.rm = TRUE),
    Q2_KPI = sum(Weighted_Score_Q2.Age, na.rm = TRUE)
  ),
  by = c("Country", "Age")
]
You can subset by the country you want, for example:
# Keep only the UK rows of the gender summary
Weighted_Gender[Country == "UK"]
Answered by Knio on January 3, 2022
1 Asked on February 14, 2021 by pol
chi squared distribution distributions non central probability
1 Asked on February 13, 2021
0 Asked on February 13, 2021 by comte
feature engineering feature selection neural networks time series
0 Asked on February 13, 2021 by rando
0 Asked on February 12, 2021 by itsallpurple
matrix decomposition moment generating function multivariate normal distribution normal distribution quadratic form
0 Asked on February 12, 2021 by user294496
2 Asked on February 11, 2021 by desperate-about-statistics
1 Asked on February 11, 2021 by user3676846
0 Asked on February 11, 2021 by iterator516
continuous data distributions expected value integral probability
2 Asked on February 10, 2021 by d-b
0 Asked on February 10, 2021 by lisa-ann
1 Asked on February 10, 2021 by user136083
2 Asked on February 9, 2021 by naveen-y
1 Asked on February 9, 2021 by mads-lumholdt
agreement statistics anova bland altman plot interpretation r
2 Asked on February 9, 2021 by user26067
1 Asked on February 8, 2021 by mishe-mitasek
0 Asked on February 8, 2021 by amit-s
1 Asked on February 6, 2021 by rkabra
0 Asked on February 6, 2021 by pst0102
Get help from others!
Recent Questions
Recent Answers
© 2023 AnswerBun.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP