Cross Validated Asked on January 3, 2022
After conducting a short survey, I have collected the results in the form of a dataframe. This is a reproducible version of what the actual data frame looks like.
library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
"1105","1106","1107","1108",
"1109","1110","1111","1112",
"1113","1114","1115","1116",
"1117","1118","1119","1120",
"1121","1122","1123","1124",
"1125","1126","1127","1128",
"1129","1130","1131","1132",
"1133","1134","1135","1136",
"1137","1138","1139","1140",
"1141","1142","1143","1144",
"1145","1146","1147","1148",
"1149","1150","1151","1152",
"1153","1154","1155","1156"),
Country=c("US","UK","Canada","Mexico",
"India","US","Peru","China",
"US","UK","Canada","Mexico",
"Portugal","India","Portugal","Mexico",
"Peru","India","Canada","Mexico",
"India","UK","India","Canada",
"US","UK","China","India",
"US","Mexico","Canada","Mexico",
"Canada","China","Canada","Canada",
"China","China","India","Mexico",
"Portugal","Portugal","Portugal","Portugal",
"UK","UK","UK","Peru",
"Peru","Mexico","US","US",
"Peru","Mexico","Peru","Mexico"),
Gender=c("Male","Male","Male","Female",
"Female","Female","Male","Female",
"Female","Female","Male","Female",
"Male","Male","Female","Female",
"Female","Male","Female","Female",
"Female","Female","Male","Female",
"Male","Female","Male","Female",
"Female","Male","Female","Female",
"Male","Male","Male","Female",
"Male","Male","Female","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Male","Male","Male"),
Age=c("<25","25-35","25-35","36-45",
">55",">55","25-35",">55",
"<25","25-35","25-35","36-45",
"25-35","25-35","25-35","36-45",
">55","36-45","46-55","36-45",
">55","46-55","25-35","46-55",
"<25","46-55","25-35","46-55",
"25-35","25-35","46-55","36-45",
"<25","<25",">55","36-45",
"36-45","46-55","<25","<25",
"<25",">55","36-45","46-55",
"<25",">55","36-45","46-55",
"36-45",">55","36-45","46-55",
"<25","46-55","<25","46-55"),
Score_Q1=c(4,4,3,2,
1,1,4,2,
1,1,1,2,
2,1,4,3,
4,3,1,1,
1,2,1,1,
1,4,1,4,
3,4,3,3,
1,3,3,1,
1,1,2,1,
1,2,1,2,
1,1,1,1,
2,2,2,2,
1,2,3,4),
Score_Q2=c(1,4,1,1,
1,2,1,1,
1,4,4,4,
2,1,1,3,
4,3,1,1,
1,3,3,3,
2,4,1,2,
4,4,4,4,
1,1,1,1,
1,2,3,4,
4,4,2,1,
1,2,3,2,
1,2,1,2,
4,3,2,1))
The survey is split into the following parts-
1) ID: A respodent ID
2) Country: Country of origin of respondent
3) Gender: The gender of the respondent
4) Age: The age of the respondent
5) Score_Q1: The satisfaction score for Q1, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
6) Score_Q2: The satisfaction score for Q2, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
First, I convert the columns Age
, Gender
and Country
to factors
#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)
Next, I check my data to see what the ratios by socio-demographic variables looks like –
I begin with Gender
, and see what the sample ratios are for gender by Country
#1) Gender by Country: Sample Ratio
split_gender=df %>% select(Country,Gender) %>%
group_by(Gender,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Gender,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"
I do the same for Age
by Country
#2) Age by Country: Sample Ratio
split_age=df %>% select(Country,Age) %>%
group_by(Age,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Age,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"
#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))
The above two steps give me two data frames – split_age_sample
& split_gender_sample
. These dataframes contain the sample ratios for age and gender by country for my 56 respondents.
My Objective: Calculating Sampling Weights Based on Gender & Age
In order to make my data frame more representative of reality, I would like to attribute weights to my respondents based on the official population ratios for age and gender by country.
Following some online research, these are the latest population ratios I found for the countries I surveyed.
#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
Canada=c(0.4,0.6),
China=c(0.3,0.7),
India=c(0.3,0.7),
Mexico=c(0.5,0.5),
Peru=c(0.6,0.4),
Portugal=c(0.5,0.5),
UK=c(0.4,0.6),
US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
Canada=c(0.1,0.3,0.3,0.2,0.1),
China=c(0.3,0.05,0.35,0.1,0.2),
India=c(0.5,0.05,0.35,0.05,0.05),
Mexico=c(0.2,0.3,0.2,0.1,0.2),
Peru=c(0.1,0.3,0.2,0.2,0.2),
Portugal=c(0.2,0.1,0.05,0.05,0.6),
UK=c(0.2,0.3,0.1,0.3,0.1),
US=c(0.2,0.3,0.1,0.3,0.1))
Desired Output
Based on my sample ratios and the offical population ratios for both age & gender, I’d like to attribute weights to my respondents, in a separate column called weights
.
Currently I am unable to figure out how to do this calculation.
Then, once the weights are calculated, I’d like to summarize the scores using the weights
column. The aggregation would look something like this (except with the weights included in the calculation) –
#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
select(-Age) %>%
group_by(Country,Gender) %>%
filter(Country=='UK') %>%
summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))
I’d really appreciate any help I can get on the weight calculation and its usage in the aggregation step that follows.
You can use ?melt to convert your official ratios to a long format, and then ?merge with the original dataframe to get weights. summarizing is easily done with data.table
#Convert to long dataframe format
split_age_official <- melt(split_age_official, variable.name="Country", value.name="popratio.Age")
split_gender_official <- melt(split_gender_official , variable.name="Country", value.name="popratio.Gender")
# merge with original dataframe
df <- merge(df, split_age_official, by=c("Country", "Age"))
df <- merge(df, split_gender_official, by=c("Country", "Gender"))
# Calculated weighted scores
df$Weighted_Score_Q1.Gender <- df$Score_Q1 * df$popratio.Gender
df$Weighted_Score_Q1.Age <- df$Score_Q1 * df$popratio.Age
df$Weighted_Score_Q2.Gender <- df$Score_Q2 * df$popratio.Gender
df$Weighted_Score_Q2.Age <- df$Score_Q2 * df$popratio.Age
I usually prefer datatable for summarization, but you can use dplyr as well of course.
A data.table summarization would be something like:
library(data.table) # install.packages("data.table")
df <- data.table(df)
Weighted_Gender <- df[,list(Q1_KPI=sum(Weighted_Score_Q1.Gender, na.rm=TRUE), Q2_KPI=sum(Weighted_Score_Q2.Gender, na.rm=TRUE)), by=c("Country", "Gender")]
Weighted_Age <- df[,list(Q1_KPI=sum(Weighted_Score_Q1.Age, na.rm=TRUE), Q2_KPI=sum(Weighted_Score_Q2.Age, na.rm=TRUE)), by=c("Country", "Age")]
You can subset by the country you want, for example:
Weighted_Gender[Country=="UK"]
Answered by Knio on January 3, 2022
0 Asked on November 20, 2021
forecasting intermittent time series multivariate regression prophet time series
1 Asked on November 18, 2021
1 Asked on November 18, 2021 by charles-orlando
2 Asked on November 18, 2021
intuition neural networks semi supervised learning transfer learning unsupervised learning
6 Asked on November 18, 2021
1 Asked on November 18, 2021 by imagineerthat
1 Asked on November 18, 2021 by crimson_idiot
0 Asked on November 18, 2021
2 Asked on November 16, 2021 by shial-de
2 Asked on November 16, 2021 by bruce-rawlings
1 Asked on November 16, 2021
0 Asked on November 16, 2021 by dimitriy
0 Asked on November 16, 2021
3 Asked on November 16, 2021
0 Asked on November 16, 2021 by user291976
1 Asked on November 16, 2021 by ben-s
1 Asked on November 16, 2021
1 Asked on November 16, 2021 by ashirwad
Get help from others!
Recent Questions
Recent Answers
© 2023 AnswerBun.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP