Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
fertilityfacebook/fertilityfacebook.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
755 lines (607 sloc)
23.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################################### | |
###################### Mater Certa Est, Pater Numquam ############################### | |
###### What Can Facebook Advertising Platform Tell Us about Male Fertility? ########### | |
####################################################################################### | |
################################# Packages ############################################ | |
#install.packages("tidyverse") | |
library(tidyverse) | |
#install.packages("plyr") | |
library(plyr) | |
#install.packages("readr") | |
library(readr) | |
#install.packages("data.table") | |
library(data.table) | |
#install.packages("rworldmap") | |
library(rworldmap) | |
#install.packages("MLmetrics") | |
library(MLmetrics) | |
################################# Data ################################################ | |
# The data from Facebook were collected on the 2nd of January 2018 | |
fb <- read_csv("fb.csv") | |
# Read all the numbers in audience | |
options(scipen = 50) | |
################################# FEMALE ################################################ | |
# We first focus on the Women | |
women <- filter(fb, genders==2, age!="15-49", fertility=="Child") | |
women$mage<-revalue(women$age,c( | |
"15-19"="17.5", | |
"20-24"="22.5", | |
"25-29"="27.5", | |
"30-34"="32.5", | |
"35-39"="37.5", | |
"40-44"="42.5", | |
"45-49"="47.5" | |
)) | |
# We transformed the vraibale mage and audience into numeric variables | |
women$mage<-as.numeric(women$mage) | |
women$audience<-as.numeric(women$audience) | |
# We create the variable ca which is the product between mage and the fb audience estimate | |
women$ca=(women$mage*women$audience) | |
# We sort the file | |
sort.people <- with(women, women[order(country) , ]) | |
# The following function computes for every single country the Mean Age at Childbearing | |
macfunction <- function(country.name) { | |
data.country <- subset(sort.people, sort.people$country==country.name) | |
ca <- (data.country$mage*data.country$audience) | |
numerator <- sum(ca) | |
# denominator births for each country | |
denominator <- sum(data.country$audience) | |
# mean age at childbearing by country | |
MAC<- numerator/denominator | |
return(MAC) | |
} | |
the.names <- names(table((sort.people$country))) | |
all.MAC <- lapply(the.names, macfunction) | |
names(all.MAC) <- the.names | |
all.MAC | |
all.mac <- ldply (all.MAC, data.frame) | |
colnames(all.mac)<-c("country", "fb_mac") | |
# all.mac contains all the mean age at childbearing computed with the macfunction | |
# UN data for mac | |
UNdata_macw <- read_csv("UNdata_macw.csv") | |
head(UNdata_macw) | |
# Merge the fb data with the UN data | |
comparison_macw <- merge(all.mac,UNdata_macw, by="country") | |
head(comparison_macw) | |
# 135 country in the paper 143 in this analysis, since we checked again the code linking the country | |
comparison_macw <-subset(comparison_macw, comparison_macw$fb_mac!=32.50) | |
###################################################################################################### | |
# Overall Correlation | |
class(comparison_macw$un_mac) | |
comparison_macw$un_mac<-as.numeric(as.character(comparison_macw$un_mac)) | |
cor(comparison_macw$un_mac, comparison_macw$fb_mac, method="pearson") | |
cor(comparison_macw$un_mac, comparison_macw$fb_mac, method="spearman") | |
cor.test(comparison_macw$fb_mac,comparison_macw$un_mac, method="spearman") | |
# 0.462538 in the paper it is reported to be 0.47 | |
# p-value = 0.000000006032 | |
MAE(comparison_macw$un_mac, comparison_macw$fb_mac) | |
MAPE(comparison_macw$un_mac, comparison_macw$fb_mac) | |
MSE(comparison_macw$un_mac, comparison_macw$fb_mac) | |
###################################################################################################### | |
# We now compute the Correlation and the Mape for each continent, | |
# and the leave-one-out-cross validation | |
###################################################################################################### | |
# Africa | |
macc_africa<-comparison_macw[which(comparison_macw$continent=='Africa'),] | |
# Correlation | |
cor(macc_africa$fb_mac,macc_africa$un_mac, method='spearman') | |
cor.test(macc_africa$fb_mac,macc_africa$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_africa$fb_mac,macc_africa$un_mac)*100 | |
# LOOCV | |
results_africaw <- list(NULL) | |
for (i in 1:(nrow(macc_africa))) { | |
leave.out.row <- macc_africa[-c(i),] | |
only.the.row <- macc_africa[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_africaw[[i]] <- frame | |
} | |
results_africaw | |
# Added two countries in Africa | |
df <- data.frame(matrix(unlist(results_africaw), nrow=30, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_africa<-cbind(macc_africa,df) | |
# CORRELATION loocv | |
cor(macc_africa$un_mac, macc_africa$pred, method='spearman') | |
cor.test(macc_africa$un_mac, macc_africa$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_africa$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Asia | |
macc_asia<-comparison_macw[which(comparison_macw$continent=='Asia'),] | |
# Correlation | |
cor(macc_asia$fb_mac,macc_africa$un_mac, method='spearman') | |
cor.test(macc_asia$fb_mac,macc_asia$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_asia$fb_mac,macc_asia$un_mac)*100 | |
# LOOCV | |
results_asiaw <- list(NULL) | |
for (i in 1:(nrow(macc_asia))) { | |
leave.out.row <- macc_asia[-c(i),] | |
only.the.row <- macc_asia[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_asiaw[[i]] <- frame | |
} | |
results_asiaw | |
# Two more Asian countries | |
df <- data.frame(matrix(unlist(results_asiaw), nrow=35, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_asia<-cbind(macc_asia,df) | |
# CORRELATION loocv | |
cor(macc_asia$un_mac, macc_asia$pred, method='spearman') | |
cor.test(macc_asia$un_mac, macc_asia$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_asia$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Europe | |
macc_europe<-comparison_macw[which(comparison_macw$continent=='Europe'),] | |
# Correlation | |
cor(macc_europe$fb_mac,macc_europe$un_mac, method='spearman') | |
cor.test(macc_europe$fb_mac,macc_europe$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_europe$fb_mac,macc_europe$un_mac)*100 | |
# LOOCV | |
results_europew <- list(NULL) | |
for (i in 1:(nrow(macc_europe))) { | |
leave.out.row <- macc_europe[-c(i),] | |
only.the.row <- macc_europe[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_europew[[i]] <- frame | |
} | |
results_europew | |
# 41 as in the paper | |
df <- data.frame(matrix(unlist(results_europew), nrow=41, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_europe<-cbind(macc_europe,df) | |
# CORRELATION loocv | |
cor(macc_europe$un_mac, macc_europe$pred, method='spearman') | |
cor.test(macc_europe$un_mac, macc_europe$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_europe$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# North America | |
macc_na<-comparison_macw[which(comparison_macw$continent=='North America'),] | |
# Correlation | |
cor(macc_na$fb_mac,macc_na$un_mac, method='spearman') | |
cor.test(macc_na$fb_mac,macc_na$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_na$fb_mac,macc_na$un_mac)*100 | |
# LOOCV | |
results_naw <- list(NULL) | |
for (i in 1:(nrow(macc_na))) { | |
leave.out.row <- macc_na[-c(i),] | |
only.the.row <- macc_na[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_naw[[i]] <- frame | |
} | |
results_naw | |
# One additional country | |
df <- data.frame(matrix(unlist(results_naw), nrow=17, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_na<-cbind(macc_na,df) | |
# CORRELATION loocv | |
cor(macc_na$un_mac, macc_na$pred, method='spearman') | |
cor.test(macc_na$un_mac, macc_na$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_na$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Oceania | |
macc_oceania<-comparison_macw[which(comparison_macw$continent=='Oceania'),] | |
# Correlation | |
cor(macc_oceania$fb_mac,macc_oceania$un_mac, method='spearman') | |
cor.test(macc_oceania$fb_mac,macc_oceania$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_oceania$fb_mac,macc_oceania$un_mac)*100 | |
# LOOCV | |
results_oceaniaw <- list(NULL) | |
for (i in 1:(nrow(macc_oceania))) { | |
leave.out.row <- macc_oceania[-c(i),] | |
only.the.row <- macc_oceania[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_oceaniaw[[i]] <- frame | |
} | |
results_oceaniaw | |
# 8 country as in the paper | |
df <- data.frame(matrix(unlist(results_oceaniaw), nrow=8, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_oceania<-cbind(macc_oceania,df) | |
# CORRELATION loocv | |
cor(macc_oceania$un_mac, macc_oceania$pred, method='spearman') | |
cor.test(macc_oceania$un_mac, macc_oceania$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_oceania$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# South America | |
macc_sa<-comparison_macw[which(comparison_macw$continent=='South America'),] | |
# Correlation | |
cor(macc_sa$fb_mac,macc_sa$un_mac, method='spearman') | |
cor.test(macc_sa$fb_mac,macc_sa$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_asa$fb_mac,macc_sa$un_mac)*100 | |
# LOOCV | |
results_saw <- list(NULL) | |
for (i in 1:(nrow(macc_sa))) { | |
leave.out.row <- macc_sa[-c(i),] | |
only.the.row <- macc_sa[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_saw[[i]] <- frame | |
} | |
results_saw | |
# 12 countries as in the paper | |
df <- data.frame(matrix(unlist(results_saw), nrow=12, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_sa<-cbind(macc_sa,df) | |
# CORRELATION loocv | |
cor(macc_sa$un_mac, macc_sa$pred, method='spearman') | |
cor.test(macc_sa$un_mac, macc_sa$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_sa$mape) | |
average_out_of_sample_mape*100 | |
################################# MALE ################################################ | |
# Men | |
men <- filter(fb, genders==1, age!="15-49", fertility=="Child") | |
men$mage<-revalue(men$age,c( | |
"15-19"="17.5", | |
"20-24"="22.5", | |
"25-29"="27.5", | |
"30-34"="32.5", | |
"35-39"="37.5", | |
"40-44"="42.5", | |
"45-49"="47.5" | |
)) | |
men$mage<-as.numeric(men$mage) | |
class(men$mage) | |
men$audience<-as.numeric(men$audience) | |
men$ca=(men$mage*men$audience) | |
sort.people <- with(men, men[order(country) , ]) | |
the.names <- names(table((sort.people$country))) | |
all.MAC.men <- lapply(the.names, macfunction) | |
names(all.MAC.men) <- the.names | |
all.MAC.men | |
all.mac.men <- ldply (all.MAC.men, data.frame) | |
colnames(all.mac.men)<-c("country", "fb_mac") | |
all.mac.men<-subset(all.mac.men, fb_mac!=32.5) | |
# all.mac contains all the mean age at childbearing computed with the macfunction | |
# UN data for mac | |
UNdata_macm <- read_delim("UNdata_macm.csv", ";", escape_double = FALSE, trim_ws = TRUE) | |
head(UNdata_macm) | |
# Merge the fb data with the UN data | |
comparison_macm <- merge(all.mac.men,UNdata_macm, by="country") | |
head(comparison_macm) | |
# 78 countries, 82 in the paper, but there were mismatch. | |
###################################################################################################### | |
# Overall Correlation | |
class(comparison_macm$un_mac) | |
cor(comparison_macm$un_mac, comparison_macm$fb_mac, method="pearson") | |
cor(comparison_macm$un_mac, comparison_macm$fb_mac, method="spearman") | |
cor.test(comparison_macm$fb_mac,comparison_macm$un_mac, method="spearman") | |
# As in the paper, the correlation is equal to 0.79, however, there are 4 countries less than | |
# in the calculation for the paper. | |
MAE(comparison_macm$un_mac, comparison_macm$fb_mac) | |
MAPE(comparison_macm$un_mac, comparison_macm$fb_mac) | |
MSE(comparison_macm$un_mac, comparison_macm$fb_mac) | |
###################################################################################################### | |
# We now compute the Correlation and the Mape for each continent, | |
# and the leave-one-out-cross validation | |
###################################################################################################### | |
# Africa | |
macc_africa<-comparison_macm[which(comparison_macm$continent=='Africa'),] | |
# CORRELATION | |
cor(macc_africa$fb_mac,macc_africa$un_mac, method='spearman') | |
cor.test(macc_africa$fb_mac,macc_africa$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_africa$fb_mac,macc_africa$un_mac)*100 | |
# LOOCV | |
results_africam <- list(NULL) | |
for (i in 1:(nrow(macc_africa))) { | |
leave.out.row <- macc_africa[-c(i),] | |
only.the.row <- macc_africa[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_africam[[i]] <- frame | |
} | |
results_africam | |
# There are only two Africa countries: Egypt and Reunion. The UN provides the | |
# data for Mauritius as well, but it was not possible to include Mauritus in the analaysis | |
# since it has a FB mac of 32.5 | |
# Table 11: https://unstats.un.org/unsd/demographic/products/dyb/dyb2014.htm | |
df <- data.frame(matrix(unlist(results_africam), nrow=2, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_africa<-cbind(macc_africa,df) | |
# CORRELATION loocv | |
cor(macc_africa$un_mac, macc_africa$pred, method='spearman') | |
cor.test(macc_africa$un_mac, macc_africa$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_africa$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Asia | |
macc_asia<-comparison_macm[which(comparison_macm$continent=='Asia'),] | |
# CORRELATION | |
cor(macc_asia$fb_mac,macc_asia$un_mac, method='spearman') | |
cor.test(macc_asia$fb_mac,macc_asia$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_asia$fb_mac,macc_asia$un_mac)*100 | |
results_asiam <- list(NULL) | |
for (i in 1:(nrow(macc_asia))) { | |
leave.out.row <- macc_asia[-c(i),] | |
only.the.row <- macc_asia[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_asiam[[i]] <- frame | |
} | |
results_asiam | |
# Two more Asian countries | |
df <- data.frame(matrix(unlist(results_asiam), nrow=16, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_asia<-cbind(macc_asia,df) | |
# CORRELATION loocv | |
cor(macc_asia$un_mac, macc_asia$pred, method='spearman') | |
cor.test(macc_asia$un_mac, macc_asia$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_asia$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Europe | |
macc_europe<-comparison_macm[which(comparison_macm$continent=='Europe'),] | |
# CORRELATION | |
cor(macc_europe$fb_mac,macc_europe$un_mac, method='spearman') | |
cor.test(macc_europe$fb_mac,macc_europe$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_europe$fb_mac,macc_europe$un_mac)*100 | |
results_europem <- list(NULL) | |
for (i in 1:(nrow(macc_europe))) { | |
leave.out.row <- macc_europe[-c(i),] | |
only.the.row <- macc_europe[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_europem[[i]] <- frame | |
} | |
results_europem | |
# 40 as in the paper | |
df <- data.frame(matrix(unlist(results_europem), nrow=40, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_europe<-cbind(macc_europe,df) | |
# CORRELATION loocv | |
cor(macc_europe$un_mac, macc_europe$pred, method='spearman') | |
cor.test(macc_europe$un_mac, macc_europe$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_europe$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# North America | |
macc_na<-comparison_macm[which(comparison_macm$continent=='North America'),] | |
# CORRELATION | |
cor(macc_na$fb_mac,macc_na$un_mac, method='spearman') | |
cor.test(macc_na$fb_mac,macc_na$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_na$fb_mac,macc_na$un_mac)*100 | |
results_nam <- list(NULL) | |
for (i in 1:(nrow(macc_na))) { | |
leave.out.row <- macc_na[-c(i),] | |
only.the.row <- macc_na[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_nam[[i]] <- frame | |
} | |
results_nam | |
# One country less | |
df <- data.frame(matrix(unlist(results_nam), nrow=12, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_na<-cbind(macc_na,df) | |
# CORRELATION loocv | |
cor(macc_na$un_mac, macc_na$pred, method='spearman') | |
cor.test(macc_na$un_mac, macc_na$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_na$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Oceania | |
macc_oceania<-comparison_macm[which(comparison_macm$continent=='Oceania'),] | |
# CORRELATION | |
cor(macc_oceania$fb_mac,macc_oceania$un_mac, method='spearman') | |
cor.test(macc_oceania$fb_mac,macc_oceania$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_oceania$fb_mac,macc_oceania$un_mac)*100 | |
results_oceaniam <- list(NULL) | |
for (i in 1:(nrow(macc_oceania))) { | |
leave.out.row <- macc_oceania[-c(i),] | |
only.the.row <- macc_oceania[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_oceaniam[[i]] <- frame | |
} | |
results_oceaniam | |
# Change number of row | |
df <- data.frame(matrix(unlist(results_oceaniam), nrow=2, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_oceania<-cbind(macc_oceania,df) | |
# CORRELATION loocv | |
cor(macc_oceania$un_mac, macc_oceania$pred, method='spearman') | |
cor.test(macc_oceania$un_mac, macc_oceania$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_oceania$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# South America | |
macc_sa<-comparison_macm[which(comparison_macm$continent=='South America'),] | |
# CORRELATION | |
cor(macc_sa$fb_mac,macc_sa$un_mac, method='spearman') | |
cor.test(macc_sa$fb_mac,macc_sa$un_mac, method="spearman") | |
# MAPE | |
MAPE(macc_sa$fb_mac,macc_sa$un_mac)*100 | |
results_sam <- list(NULL) | |
for (i in 1:(nrow(macc_sa))) { | |
leave.out.row <- macc_sa[-c(i),] | |
only.the.row <- macc_sa[i,] | |
model <- lm(un_mac ~ fb_mac, data=leave.out.row) | |
pred <- predict(model, newdata=only.the.row) | |
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac) | |
frame <- data.frame(pred=pred, mape=mape) | |
results_sam[[i]] <- frame | |
} | |
results_sam | |
# Change number of row | |
df <- data.frame(matrix(unlist(results_sam), nrow=6, byrow=T)) | |
colnames(df) <- c("pred","mape") | |
macc_sa<-cbind(macc_sa,df) | |
# CORRELATION loocv | |
cor(macc_sa$un_mac, macc_sa$pred, method='spearman') | |
cor.test(macc_sa$un_mac, macc_sa$pred, method="spearman") | |
# MAPE loocv | |
average_out_of_sample_mape<- mean(macc_sa$mape) | |
average_out_of_sample_mape*100 | |
###################################################################################################### | |
# Regression | |
unfbmac_male <- read_delim("unfbmac_allmale.csv", ";", escape_double = FALSE, trim_ws = TRUE) | |
# linear regression | |
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac) | |
summary(m1) | |
summary(m1)$adj.r.squared | |
pre<-predict.lm(m1) | |
sample<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac) | |
summary(m1)$adj.r.squared | |
pre<-predict.lm(m1) | |
funcor<- function(sample) { | |
# sample | |
sample<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
# regression | |
m1<-lm(un_mac~fb_mac, data=sample) | |
# prediction | |
pre<-predict.lm(m1) | |
return(pre) | |
} | |
the.names.country <- names(table((unfbmac_male$country))) | |
all.pre <- lapply(the.names.country, funcor) | |
names(all.pre) <- the.names.country | |
all.pre | |
# We sample removing 10 observations from the total number of male observations. | |
#------ | |
#1 | |
sample1<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m1<-lm(un_mac~fb_mac, data=sample1) | |
adjr1<-summary(m1)$adj.r.squared | |
pre<-predict.lm(m1) | |
#2 | |
sample2<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m2<-lm(un_mac~fb_mac, data=sample2) | |
adjr2<-summary(m2)$adj.r.squared | |
pre<-predict.lm(m2) | |
#3 | |
sample3<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m3<-lm(un_mac~fb_mac, data=sample3) | |
adjr3<-summary(m3)$adj.r.squared | |
pre<-predict.lm(m3) | |
#4 | |
sample4<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m4<-lm(un_mac~fb_mac, data=sample4) | |
adjr4<-summary(m4)$adj.r.squared | |
pre<-predict.lm(m4) | |
#5 | |
sample5<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m5<-lm(un_mac~fb_mac, data=sample5) | |
adjr5<-summary(m5)$adj.r.squared | |
pre<-predict.lm(m5) | |
#6 | |
sample6<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m6<-lm(un_mac~fb_mac, data=sample6) | |
adjr6<-summary(m6)$adj.r.squared | |
pre<-predict.lm(m6) | |
#7 | |
sample7<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m7<-lm(un_mac~fb_mac, data=sample7) | |
adjr7<-summary(m7)$adj.r.squared | |
pre<-predict.lm(m7) | |
#8 | |
sample8<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m8<-lm(un_mac~fb_mac, data=sample8) | |
adjr8<-summary(m8)$adj.r.squared | |
pre<-predict.lm(m8) | |
#9 | |
sample9<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m9<-lm(un_mac~fb_mac, data=sample9) | |
adjr9<-summary(m9)$adj.r.squared | |
pre<-predict.lm(m9) | |
#10 | |
sample10<-unfbmac_male[sample(nrow(unfbmac_male), 68), ] | |
m10<-lm(un_mac~fb_mac, data=sample10) | |
adjr10<-summary(m10)$adj.r.squared | |
pre<-predict.lm(m10) | |
#adjr | |
adjr<-rbind(adjr1, adjr2,adjr3,adjr4,adjr5, adjr6, adjr7, adjr8, adjr9, adjr10) | |
adjr<-data.frame(adjr) | |
summary(adjr$adjr) | |
############################# | |
## Out of sample analysis ### | |
############################# | |
MAPE_values<- NULL | |
for(ii in 1:10){ | |
unfbmac_male$pred <- NA | |
training_data<-unfbmac_male[!with(unfbmac_male,is.na(fb_mac) | is.na(un_mac)),] | |
test_data<- unfbmac_male[is.na(unfbmac_male$un_mac),] | |
model<-lm(un_mac ~ fb_mac, data=training_data) | |
training_data$pred<-predict(model,newdata=training_data) | |
mape_pred_single_run<- mean(abs(training_data$pred - training_data$un_mac)/training_data$un_mac) | |
MAPE_values<- c(MAPE_values,mape_pred_single_run) | |
} | |
MAPE_values | |
average_out_of_sample_mape<- mean(MAPE_values) | |
average_out_of_sample_mape | |
average_out_of_sample_mape*100 | |
################################################################################################ | |
## Model summary with all the observations for predicting the MAC | |
## for the countries not included in the UN data | |
################################################################################################ | |
unfbmac_male <- read_delim("unfbmac_allmale.csv", ";", escape_double = FALSE, trim_ws = TRUE) | |
# linear regression | |
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac) | |
summary(m1) | |
summary(m1)$adj.r.squared | |
pre<-predict(m1) | |
leave_out_rows<-unfbmac_male[is.na(unfbmac_male$un_mac),] | |
training_data<- na.omit(unfbmac_male) | |
test_data<- unfbmac_male[is.na(unfbmac_male$un_mac),] | |
model<-lm(un_mac ~ fb_mac, data=training_data) | |
unfbmac_male$pred<-predict(model,newdata=unfbmac_male) | |
unfbmac_male$pred | |
map_data<-unfbmac_male[is.na(unfbmac_male$un_mac),] | |
map_data | |
data(countryExData) | |
sPDF <- joinCountryData2Map(map_data, joinCode = "ISO3", nameJoinColumn = "ISO3V10") | |
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i") | |
mapParams <- mapCountryData( sPDF, nameColumnToPlot="pred" , addLegend=TRUE, mapTitle="") | |
# Prediction on 82 countries (we exclude China) | |