Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#######################################################################################
###################### Mater Certa Est, Pater Numquam ###############################
###### What Can Facebook Advertising Platform Tell Us about Male Fertility? ###########
#######################################################################################
################################# Packages ############################################
#install.packages("tidyverse")
library(tidyverse)
#install.packages("plyr")
library(plyr)
#install.packages("readr")
library(readr)
#install.packages("data.table")
library(data.table)
#install.packages("rworldmap")
library(rworldmap)
#install.packages("MLmetrics")
library(MLmetrics)
################################# Data ################################################
# The data from Facebook were collected on the 2nd of January 2018
fb <- read_csv("fb.csv")
# Read all the numbers in audience
options(scipen = 50)
################################# FEMALE ################################################
# We first focus on the Women
women <- filter(fb, genders==2, age!="15-49", fertility=="Child")
women$mage<-revalue(women$age,c(
"15-19"="17.5",
"20-24"="22.5",
"25-29"="27.5",
"30-34"="32.5",
"35-39"="37.5",
"40-44"="42.5",
"45-49"="47.5"
))
# We transformed the vraibale mage and audience into numeric variables
women$mage<-as.numeric(women$mage)
women$audience<-as.numeric(women$audience)
# We create the variable ca which is the product between mage and the fb audience estimate
women$ca=(women$mage*women$audience)
# We sort the file
sort.people <- with(women, women[order(country) , ])
# The following function computes for every single country the Mean Age at Childbearing
macfunction <- function(country.name) {
data.country <- subset(sort.people, sort.people$country==country.name)
ca <- (data.country$mage*data.country$audience)
numerator <- sum(ca)
# denominator births for each country
denominator <- sum(data.country$audience)
# mean age at childbearing by country
MAC<- numerator/denominator
return(MAC)
}
the.names <- names(table((sort.people$country)))
all.MAC <- lapply(the.names, macfunction)
names(all.MAC) <- the.names
all.MAC
all.mac <- ldply (all.MAC, data.frame)
colnames(all.mac)<-c("country", "fb_mac")
# all.mac contains all the mean age at childbearing computed with the macfunction
# UN data for mac
UNdata_macw <- read_csv("UNdata_macw.csv")
head(UNdata_macw)
# Merge the fb data with the UN data
comparison_macw <- merge(all.mac,UNdata_macw, by="country")
head(comparison_macw)
# 135 country in the paper 143 in this analysis, since we checked again the code linking the country
comparison_macw <-subset(comparison_macw, comparison_macw$fb_mac!=32.50)
######################################################################################################
# Overall Correlation
class(comparison_macw$un_mac)
comparison_macw$un_mac<-as.numeric(as.character(comparison_macw$un_mac))
cor(comparison_macw$un_mac, comparison_macw$fb_mac, method="pearson")
cor(comparison_macw$un_mac, comparison_macw$fb_mac, method="spearman")
cor.test(comparison_macw$fb_mac,comparison_macw$un_mac, method="spearman")
# 0.462538 in the paper it is reported to be 0.47
# p-value = 0.000000006032
MAE(comparison_macw$un_mac, comparison_macw$fb_mac)
MAPE(comparison_macw$un_mac, comparison_macw$fb_mac)
MSE(comparison_macw$un_mac, comparison_macw$fb_mac)
######################################################################################################
# We now compute the Correlation and the Mape for each continent,
# and the leave-one-out-cross validation
######################################################################################################
# Africa
macc_africa<-comparison_macw[which(comparison_macw$continent=='Africa'),]
# Correlation
cor(macc_africa$fb_mac,macc_africa$un_mac, method='spearman')
cor.test(macc_africa$fb_mac,macc_africa$un_mac, method="spearman")
# MAPE
MAPE(macc_africa$fb_mac,macc_africa$un_mac)*100
# LOOCV
results_africaw <- list(NULL)
for (i in 1:(nrow(macc_africa))) {
leave.out.row <- macc_africa[-c(i),]
only.the.row <- macc_africa[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_africaw[[i]] <- frame
}
results_africaw
# Added two countries in Africa
df <- data.frame(matrix(unlist(results_africaw), nrow=30, byrow=T))
colnames(df) <- c("pred","mape")
macc_africa<-cbind(macc_africa,df)
# CORRELATION loocv
cor(macc_africa$un_mac, macc_africa$pred, method='spearman')
cor.test(macc_africa$un_mac, macc_africa$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_africa$mape)
average_out_of_sample_mape*100
######################################################################################################
# Asia
macc_asia<-comparison_macw[which(comparison_macw$continent=='Asia'),]
# Correlation
cor(macc_asia$fb_mac,macc_africa$un_mac, method='spearman')
cor.test(macc_asia$fb_mac,macc_asia$un_mac, method="spearman")
# MAPE
MAPE(macc_asia$fb_mac,macc_asia$un_mac)*100
# LOOCV
results_asiaw <- list(NULL)
for (i in 1:(nrow(macc_asia))) {
leave.out.row <- macc_asia[-c(i),]
only.the.row <- macc_asia[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_asiaw[[i]] <- frame
}
results_asiaw
# Two more Asian countries
df <- data.frame(matrix(unlist(results_asiaw), nrow=35, byrow=T))
colnames(df) <- c("pred","mape")
macc_asia<-cbind(macc_asia,df)
# CORRELATION loocv
cor(macc_asia$un_mac, macc_asia$pred, method='spearman')
cor.test(macc_asia$un_mac, macc_asia$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_asia$mape)
average_out_of_sample_mape*100
######################################################################################################
# Europe
macc_europe<-comparison_macw[which(comparison_macw$continent=='Europe'),]
# Correlation
cor(macc_europe$fb_mac,macc_europe$un_mac, method='spearman')
cor.test(macc_europe$fb_mac,macc_europe$un_mac, method="spearman")
# MAPE
MAPE(macc_europe$fb_mac,macc_europe$un_mac)*100
# LOOCV
results_europew <- list(NULL)
for (i in 1:(nrow(macc_europe))) {
leave.out.row <- macc_europe[-c(i),]
only.the.row <- macc_europe[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_europew[[i]] <- frame
}
results_europew
# 41 as in the paper
df <- data.frame(matrix(unlist(results_europew), nrow=41, byrow=T))
colnames(df) <- c("pred","mape")
macc_europe<-cbind(macc_europe,df)
# CORRELATION loocv
cor(macc_europe$un_mac, macc_europe$pred, method='spearman')
cor.test(macc_europe$un_mac, macc_europe$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_europe$mape)
average_out_of_sample_mape*100
######################################################################################################
# North America
macc_na<-comparison_macw[which(comparison_macw$continent=='North America'),]
# Correlation
cor(macc_na$fb_mac,macc_na$un_mac, method='spearman')
cor.test(macc_na$fb_mac,macc_na$un_mac, method="spearman")
# MAPE
MAPE(macc_na$fb_mac,macc_na$un_mac)*100
# LOOCV
results_naw <- list(NULL)
for (i in 1:(nrow(macc_na))) {
leave.out.row <- macc_na[-c(i),]
only.the.row <- macc_na[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_naw[[i]] <- frame
}
results_naw
# One additional country
df <- data.frame(matrix(unlist(results_naw), nrow=17, byrow=T))
colnames(df) <- c("pred","mape")
macc_na<-cbind(macc_na,df)
# CORRELATION loocv
cor(macc_na$un_mac, macc_na$pred, method='spearman')
cor.test(macc_na$un_mac, macc_na$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_na$mape)
average_out_of_sample_mape*100
######################################################################################################
# Oceania
macc_oceania<-comparison_macw[which(comparison_macw$continent=='Oceania'),]
# Correlation
cor(macc_oceania$fb_mac,macc_oceania$un_mac, method='spearman')
cor.test(macc_oceania$fb_mac,macc_oceania$un_mac, method="spearman")
# MAPE
MAPE(macc_oceania$fb_mac,macc_oceania$un_mac)*100
# LOOCV
results_oceaniaw <- list(NULL)
for (i in 1:(nrow(macc_oceania))) {
leave.out.row <- macc_oceania[-c(i),]
only.the.row <- macc_oceania[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_oceaniaw[[i]] <- frame
}
results_oceaniaw
# 8 country as in the paper
df <- data.frame(matrix(unlist(results_oceaniaw), nrow=8, byrow=T))
colnames(df) <- c("pred","mape")
macc_oceania<-cbind(macc_oceania,df)
# CORRELATION loocv
cor(macc_oceania$un_mac, macc_oceania$pred, method='spearman')
cor.test(macc_oceania$un_mac, macc_oceania$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_oceania$mape)
average_out_of_sample_mape*100
######################################################################################################
# South America
macc_sa<-comparison_macw[which(comparison_macw$continent=='South America'),]
# Correlation
cor(macc_sa$fb_mac,macc_sa$un_mac, method='spearman')
cor.test(macc_sa$fb_mac,macc_sa$un_mac, method="spearman")
# MAPE
MAPE(macc_asa$fb_mac,macc_sa$un_mac)*100
# LOOCV
results_saw <- list(NULL)
for (i in 1:(nrow(macc_sa))) {
leave.out.row <- macc_sa[-c(i),]
only.the.row <- macc_sa[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_saw[[i]] <- frame
}
results_saw
# 12 countries as in the paper
df <- data.frame(matrix(unlist(results_saw), nrow=12, byrow=T))
colnames(df) <- c("pred","mape")
macc_sa<-cbind(macc_sa,df)
# CORRELATION loocv
cor(macc_sa$un_mac, macc_sa$pred, method='spearman')
cor.test(macc_sa$un_mac, macc_sa$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_sa$mape)
average_out_of_sample_mape*100
################################# MALE ################################################
# Men
men <- filter(fb, genders==1, age!="15-49", fertility=="Child")
men$mage<-revalue(men$age,c(
"15-19"="17.5",
"20-24"="22.5",
"25-29"="27.5",
"30-34"="32.5",
"35-39"="37.5",
"40-44"="42.5",
"45-49"="47.5"
))
men$mage<-as.numeric(men$mage)
class(men$mage)
men$audience<-as.numeric(men$audience)
men$ca=(men$mage*men$audience)
sort.people <- with(men, men[order(country) , ])
the.names <- names(table((sort.people$country)))
all.MAC.men <- lapply(the.names, macfunction)
names(all.MAC.men) <- the.names
all.MAC.men
all.mac.men <- ldply (all.MAC.men, data.frame)
colnames(all.mac.men)<-c("country", "fb_mac")
all.mac.men<-subset(all.mac.men, fb_mac!=32.5)
# all.mac contains all the mean age at childbearing computed with the macfunction
# UN data for mac
UNdata_macm <- read_delim("UNdata_macm.csv", ";", escape_double = FALSE, trim_ws = TRUE)
head(UNdata_macm)
# Merge the fb data with the UN data
comparison_macm <- merge(all.mac.men,UNdata_macm, by="country")
head(comparison_macm)
# 78 countries, 82 in the paper, but there were mismatch.
######################################################################################################
# Overall Correlation
class(comparison_macm$un_mac)
cor(comparison_macm$un_mac, comparison_macm$fb_mac, method="pearson")
cor(comparison_macm$un_mac, comparison_macm$fb_mac, method="spearman")
cor.test(comparison_macm$fb_mac,comparison_macm$un_mac, method="spearman")
# As in the paper, the correlation is equal to 0.79, however, there are 4 countries less than
# in the calculation for the paper.
MAE(comparison_macm$un_mac, comparison_macm$fb_mac)
MAPE(comparison_macm$un_mac, comparison_macm$fb_mac)
MSE(comparison_macm$un_mac, comparison_macm$fb_mac)
######################################################################################################
# We now compute the Correlation and the Mape for each continent,
# and the leave-one-out-cross validation
######################################################################################################
# Africa
macc_africa<-comparison_macm[which(comparison_macm$continent=='Africa'),]
# CORRELATION
cor(macc_africa$fb_mac,macc_africa$un_mac, method='spearman')
cor.test(macc_africa$fb_mac,macc_africa$un_mac, method="spearman")
# MAPE
MAPE(macc_africa$fb_mac,macc_africa$un_mac)*100
# LOOCV
results_africam <- list(NULL)
for (i in 1:(nrow(macc_africa))) {
leave.out.row <- macc_africa[-c(i),]
only.the.row <- macc_africa[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_africam[[i]] <- frame
}
results_africam
# There are only two Africa countries: Egypt and Reunion. The UN provides the
# data for Mauritius as well, but it was not possible to include Mauritus in the analaysis
# since it has a FB mac of 32.5
# Table 11: https://unstats.un.org/unsd/demographic/products/dyb/dyb2014.htm
df <- data.frame(matrix(unlist(results_africam), nrow=2, byrow=T))
colnames(df) <- c("pred","mape")
macc_africa<-cbind(macc_africa,df)
# CORRELATION loocv
cor(macc_africa$un_mac, macc_africa$pred, method='spearman')
cor.test(macc_africa$un_mac, macc_africa$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_africa$mape)
average_out_of_sample_mape*100
######################################################################################################
# Asia
macc_asia<-comparison_macm[which(comparison_macm$continent=='Asia'),]
# CORRELATION
cor(macc_asia$fb_mac,macc_asia$un_mac, method='spearman')
cor.test(macc_asia$fb_mac,macc_asia$un_mac, method="spearman")
# MAPE
MAPE(macc_asia$fb_mac,macc_asia$un_mac)*100
results_asiam <- list(NULL)
for (i in 1:(nrow(macc_asia))) {
leave.out.row <- macc_asia[-c(i),]
only.the.row <- macc_asia[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_asiam[[i]] <- frame
}
results_asiam
# Two more Asian countries
df <- data.frame(matrix(unlist(results_asiam), nrow=16, byrow=T))
colnames(df) <- c("pred","mape")
macc_asia<-cbind(macc_asia,df)
# CORRELATION loocv
cor(macc_asia$un_mac, macc_asia$pred, method='spearman')
cor.test(macc_asia$un_mac, macc_asia$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_asia$mape)
average_out_of_sample_mape*100
######################################################################################################
# Europe
macc_europe<-comparison_macm[which(comparison_macm$continent=='Europe'),]
# CORRELATION
cor(macc_europe$fb_mac,macc_europe$un_mac, method='spearman')
cor.test(macc_europe$fb_mac,macc_europe$un_mac, method="spearman")
# MAPE
MAPE(macc_europe$fb_mac,macc_europe$un_mac)*100
results_europem <- list(NULL)
for (i in 1:(nrow(macc_europe))) {
leave.out.row <- macc_europe[-c(i),]
only.the.row <- macc_europe[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_europem[[i]] <- frame
}
results_europem
# 40 as in the paper
df <- data.frame(matrix(unlist(results_europem), nrow=40, byrow=T))
colnames(df) <- c("pred","mape")
macc_europe<-cbind(macc_europe,df)
# CORRELATION loocv
cor(macc_europe$un_mac, macc_europe$pred, method='spearman')
cor.test(macc_europe$un_mac, macc_europe$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_europe$mape)
average_out_of_sample_mape*100
######################################################################################################
# North America
macc_na<-comparison_macm[which(comparison_macm$continent=='North America'),]
# CORRELATION
cor(macc_na$fb_mac,macc_na$un_mac, method='spearman')
cor.test(macc_na$fb_mac,macc_na$un_mac, method="spearman")
# MAPE
MAPE(macc_na$fb_mac,macc_na$un_mac)*100
results_nam <- list(NULL)
for (i in 1:(nrow(macc_na))) {
leave.out.row <- macc_na[-c(i),]
only.the.row <- macc_na[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_nam[[i]] <- frame
}
results_nam
# One country less
df <- data.frame(matrix(unlist(results_nam), nrow=12, byrow=T))
colnames(df) <- c("pred","mape")
macc_na<-cbind(macc_na,df)
# CORRELATION loocv
cor(macc_na$un_mac, macc_na$pred, method='spearman')
cor.test(macc_na$un_mac, macc_na$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_na$mape)
average_out_of_sample_mape*100
######################################################################################################
# Oceania
macc_oceania<-comparison_macm[which(comparison_macm$continent=='Oceania'),]
# CORRELATION
cor(macc_oceania$fb_mac,macc_oceania$un_mac, method='spearman')
cor.test(macc_oceania$fb_mac,macc_oceania$un_mac, method="spearman")
# MAPE
MAPE(macc_oceania$fb_mac,macc_oceania$un_mac)*100
results_oceaniam <- list(NULL)
for (i in 1:(nrow(macc_oceania))) {
leave.out.row <- macc_oceania[-c(i),]
only.the.row <- macc_oceania[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_oceaniam[[i]] <- frame
}
results_oceaniam
# Change number of row
df <- data.frame(matrix(unlist(results_oceaniam), nrow=2, byrow=T))
colnames(df) <- c("pred","mape")
macc_oceania<-cbind(macc_oceania,df)
# CORRELATION loocv
cor(macc_oceania$un_mac, macc_oceania$pred, method='spearman')
cor.test(macc_oceania$un_mac, macc_oceania$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_oceania$mape)
average_out_of_sample_mape*100
######################################################################################################
# South America
macc_sa<-comparison_macm[which(comparison_macm$continent=='South America'),]
# CORRELATION
cor(macc_sa$fb_mac,macc_sa$un_mac, method='spearman')
cor.test(macc_sa$fb_mac,macc_sa$un_mac, method="spearman")
# MAPE
MAPE(macc_sa$fb_mac,macc_sa$un_mac)*100
results_sam <- list(NULL)
for (i in 1:(nrow(macc_sa))) {
leave.out.row <- macc_sa[-c(i),]
only.the.row <- macc_sa[i,]
model <- lm(un_mac ~ fb_mac, data=leave.out.row)
pred <- predict(model, newdata=only.the.row)
mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
frame <- data.frame(pred=pred, mape=mape)
results_sam[[i]] <- frame
}
results_sam
# Change number of row
df <- data.frame(matrix(unlist(results_sam), nrow=6, byrow=T))
colnames(df) <- c("pred","mape")
macc_sa<-cbind(macc_sa,df)
# CORRELATION loocv
cor(macc_sa$un_mac, macc_sa$pred, method='spearman')
cor.test(macc_sa$un_mac, macc_sa$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_sa$mape)
average_out_of_sample_mape*100
######################################################################################################
# Regression
unfbmac_male <- read_delim("unfbmac_allmale.csv", ";", escape_double = FALSE, trim_ws = TRUE)
# linear regression
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac)
summary(m1)
summary(m1)$adj.r.squared
pre<-predict.lm(m1)
sample<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac)
summary(m1)$adj.r.squared
pre<-predict.lm(m1)
funcor<- function(sample) {
# sample
sample<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
# regression
m1<-lm(un_mac~fb_mac, data=sample)
# prediction
pre<-predict.lm(m1)
return(pre)
}
the.names.country <- names(table((unfbmac_male$country)))
all.pre <- lapply(the.names.country, funcor)
names(all.pre) <- the.names.country
all.pre
# We sample removing 10 observations from the total number of male observations.
#------
#1
sample1<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m1<-lm(un_mac~fb_mac, data=sample1)
adjr1<-summary(m1)$adj.r.squared
pre<-predict.lm(m1)
#2
sample2<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m2<-lm(un_mac~fb_mac, data=sample2)
adjr2<-summary(m2)$adj.r.squared
pre<-predict.lm(m2)
#3
sample3<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m3<-lm(un_mac~fb_mac, data=sample3)
adjr3<-summary(m3)$adj.r.squared
pre<-predict.lm(m3)
#4
sample4<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m4<-lm(un_mac~fb_mac, data=sample4)
adjr4<-summary(m4)$adj.r.squared
pre<-predict.lm(m4)
#5
sample5<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m5<-lm(un_mac~fb_mac, data=sample5)
adjr5<-summary(m5)$adj.r.squared
pre<-predict.lm(m5)
#6
sample6<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m6<-lm(un_mac~fb_mac, data=sample6)
adjr6<-summary(m6)$adj.r.squared
pre<-predict.lm(m6)
#7
sample7<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m7<-lm(un_mac~fb_mac, data=sample7)
adjr7<-summary(m7)$adj.r.squared
pre<-predict.lm(m7)
#8
sample8<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m8<-lm(un_mac~fb_mac, data=sample8)
adjr8<-summary(m8)$adj.r.squared
pre<-predict.lm(m8)
#9
sample9<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m9<-lm(un_mac~fb_mac, data=sample9)
adjr9<-summary(m9)$adj.r.squared
pre<-predict.lm(m9)
#10
sample10<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m10<-lm(un_mac~fb_mac, data=sample10)
adjr10<-summary(m10)$adj.r.squared
pre<-predict.lm(m10)
#adjr
adjr<-rbind(adjr1, adjr2,adjr3,adjr4,adjr5, adjr6, adjr7, adjr8, adjr9, adjr10)
adjr<-data.frame(adjr)
summary(adjr$adjr)
#############################
## Out of sample analysis ###
#############################
MAPE_values<- NULL
for(ii in 1:10){
unfbmac_male$pred <- NA
training_data<-unfbmac_male[!with(unfbmac_male,is.na(fb_mac) | is.na(un_mac)),]
test_data<- unfbmac_male[is.na(unfbmac_male$un_mac),]
model<-lm(un_mac ~ fb_mac, data=training_data)
training_data$pred<-predict(model,newdata=training_data)
mape_pred_single_run<- mean(abs(training_data$pred - training_data$un_mac)/training_data$un_mac)
MAPE_values<- c(MAPE_values,mape_pred_single_run)
}
MAPE_values
average_out_of_sample_mape<- mean(MAPE_values)
average_out_of_sample_mape
average_out_of_sample_mape*100
################################################################################################
## Model summary with all the observations for predicting the MAC
## for the countries not included in the UN data
################################################################################################
unfbmac_male <- read_delim("unfbmac_allmale.csv", ";", escape_double = FALSE, trim_ws = TRUE)
# linear regression
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac)
summary(m1)
summary(m1)$adj.r.squared
pre<-predict(m1)
leave_out_rows<-unfbmac_male[is.na(unfbmac_male$un_mac),]
training_data<- na.omit(unfbmac_male)
test_data<- unfbmac_male[is.na(unfbmac_male$un_mac),]
model<-lm(un_mac ~ fb_mac, data=training_data)
unfbmac_male$pred<-predict(model,newdata=unfbmac_male)
unfbmac_male$pred
map_data<-unfbmac_male[is.na(unfbmac_male$un_mac),]
map_data
data(countryExData)
sPDF <- joinCountryData2Map(map_data, joinCode = "ISO3", nameJoinColumn = "ISO3V10")
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i")
mapParams <- mapCountryData( sPDF, nameColumnToPlot="pred" , addLegend=TRUE, mapTitle="")
# Prediction on 82 countries (we exclude China)