fertilityfacebook.R

#######################################################################################
###################### Mater Certa Est, Pater Numquam   ###############################
###### What Can Facebook Advertising Platform Tell Us about Male Fertility? ###########
#######################################################################################

################################# Packages ############################################

#install.packages("tidyverse")
library(tidyverse)
#install.packages("plyr")
library(plyr)
#install.packages("readr")
library(readr)
#install.packages("data.table")
library(data.table)
#install.packages("rworldmap")
library(rworldmap)
#install.packages("MLmetrics")
library(MLmetrics)

################################# Data ################################################

# The data from Facebook were collected on the 2nd of January 2018
fb <- read_csv("fb.csv")
# Read all the numbers in audience
options(scipen = 50)


################################# FEMALE ################################################

# We first focus on the Women

women <- filter(fb, genders==2, age!="15-49", fertility=="Child")

women$mage<-revalue(women$age,c(
  "15-19"="17.5",
  "20-24"="22.5",
  "25-29"="27.5",
  "30-34"="32.5",
  "35-39"="37.5",
  "40-44"="42.5",
  "45-49"="47.5"
))

# We transformed the vraibale mage and audience into numeric variables
women$mage<-as.numeric(women$mage)
women$audience<-as.numeric(women$audience)

# We create the variable ca which is the product between mage and the fb audience estimate
women$ca=(women$mage*women$audience)

# We sort the file
sort.people <- with(women,  women[order(country) , ])

# The following function computes for every single country the Mean Age at Childbearing
macfunction <- function(country.name) {
  data.country <- subset(sort.people, sort.people$country==country.name)
  ca <- (data.country$mage*data.country$audience)
  numerator <- sum(ca)
  # denominator births for each country
  denominator <- sum(data.country$audience)
  # mean age at childbearing by country
  MAC<- numerator/denominator
  return(MAC)
}

the.names <- names(table((sort.people$country)))
all.MAC <- lapply(the.names, macfunction)
names(all.MAC) <- the.names
all.MAC
all.mac <- ldply (all.MAC, data.frame)
colnames(all.mac)<-c("country", "fb_mac")

# all.mac contains all the mean age at childbearing computed with the macfunction

# UN data for mac
UNdata_macw <- read_csv("UNdata_macw.csv")
head(UNdata_macw)

# Merge the fb data with the UN data
comparison_macw <- merge(all.mac,UNdata_macw, by="country")
head(comparison_macw)


# 135 country in the paper 143 in this analysis, since we checked again the code linking the country
comparison_macw <-subset(comparison_macw, comparison_macw$fb_mac!=32.50)

######################################################################################################
# Overall Correlation
class(comparison_macw$un_mac)
comparison_macw$un_mac<-as.numeric(as.character(comparison_macw$un_mac))

cor(comparison_macw$un_mac, comparison_macw$fb_mac, method="pearson")
cor(comparison_macw$un_mac, comparison_macw$fb_mac, method="spearman")
cor.test(comparison_macw$fb_mac,comparison_macw$un_mac, method="spearman")
# 0.462538 in the paper it is reported to be 0.47
#  p-value = 0.000000006032

MAE(comparison_macw$un_mac, comparison_macw$fb_mac)
MAPE(comparison_macw$un_mac, comparison_macw$fb_mac)
MSE(comparison_macw$un_mac, comparison_macw$fb_mac)

######################################################################################################
# We now compute the Correlation and the Mape for each continent,
# and the leave-one-out-cross validation
######################################################################################################
# Africa
macc_africa<-comparison_macw[which(comparison_macw$continent=='Africa'),]

# Correlation
cor(macc_africa$fb_mac,macc_africa$un_mac, method='spearman')
cor.test(macc_africa$fb_mac,macc_africa$un_mac, method="spearman")

# MAPE
MAPE(macc_africa$fb_mac,macc_africa$un_mac)*100

# LOOCV
results_africaw <- list(NULL)
for (i in 1:(nrow(macc_africa))) {
  leave.out.row <- macc_africa[-c(i),]
  only.the.row <- macc_africa[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_africaw[[i]] <- frame
}

results_africaw
# Added two countries in Africa
df <- data.frame(matrix(unlist(results_africaw), nrow=30, byrow=T))
colnames(df) <- c("pred","mape")
macc_africa<-cbind(macc_africa,df)

# CORRELATION loocv
cor(macc_africa$un_mac, macc_africa$pred, method='spearman')
cor.test(macc_africa$un_mac, macc_africa$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_africa$mape)
average_out_of_sample_mape*100
######################################################################################################
# Asia
macc_asia<-comparison_macw[which(comparison_macw$continent=='Asia'),]

# Correlation
cor(macc_asia$fb_mac,macc_africa$un_mac, method='spearman')
cor.test(macc_asia$fb_mac,macc_asia$un_mac, method="spearman")

# MAPE
MAPE(macc_asia$fb_mac,macc_asia$un_mac)*100

# LOOCV

results_asiaw <- list(NULL)
for (i in 1:(nrow(macc_asia))) {
  leave.out.row <- macc_asia[-c(i),]
  only.the.row <- macc_asia[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_asiaw[[i]] <- frame
}

results_asiaw

# Two more Asian countries
df <- data.frame(matrix(unlist(results_asiaw), nrow=35, byrow=T))
colnames(df) <- c("pred","mape")
macc_asia<-cbind(macc_asia,df)

# CORRELATION loocv
cor(macc_asia$un_mac, macc_asia$pred, method='spearman')
cor.test(macc_asia$un_mac, macc_asia$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_asia$mape)
average_out_of_sample_mape*100

######################################################################################################
# Europe
macc_europe<-comparison_macw[which(comparison_macw$continent=='Europe'),]

# Correlation
cor(macc_europe$fb_mac,macc_europe$un_mac, method='spearman')
cor.test(macc_europe$fb_mac,macc_europe$un_mac, method="spearman")

# MAPE
MAPE(macc_europe$fb_mac,macc_europe$un_mac)*100

# LOOCV

results_europew <- list(NULL)
for (i in 1:(nrow(macc_europe))) {
  leave.out.row <- macc_europe[-c(i),]
  only.the.row <- macc_europe[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_europew[[i]] <- frame
}

results_europew

# 41 as in the paper
df <- data.frame(matrix(unlist(results_europew), nrow=41, byrow=T))
colnames(df) <- c("pred","mape")
macc_europe<-cbind(macc_europe,df)

# CORRELATION loocv
cor(macc_europe$un_mac, macc_europe$pred, method='spearman')
cor.test(macc_europe$un_mac, macc_europe$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_europe$mape)
average_out_of_sample_mape*100
######################################################################################################
# North America
macc_na<-comparison_macw[which(comparison_macw$continent=='North America'),]

# Correlation
cor(macc_na$fb_mac,macc_na$un_mac, method='spearman')
cor.test(macc_na$fb_mac,macc_na$un_mac, method="spearman")

# MAPE
MAPE(macc_na$fb_mac,macc_na$un_mac)*100

# LOOCV
results_naw <- list(NULL)
for (i in 1:(nrow(macc_na))) {
  leave.out.row <- macc_na[-c(i),]
  only.the.row <- macc_na[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_naw[[i]] <- frame
}

results_naw

# One additional country
df <- data.frame(matrix(unlist(results_naw), nrow=17, byrow=T))
colnames(df) <- c("pred","mape")
macc_na<-cbind(macc_na,df)

# CORRELATION loocv
cor(macc_na$un_mac, macc_na$pred, method='spearman')
cor.test(macc_na$un_mac, macc_na$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_na$mape)
average_out_of_sample_mape*100
######################################################################################################
# Oceania
macc_oceania<-comparison_macw[which(comparison_macw$continent=='Oceania'),]

# Correlation
cor(macc_oceania$fb_mac,macc_oceania$un_mac, method='spearman')
cor.test(macc_oceania$fb_mac,macc_oceania$un_mac, method="spearman")

# MAPE
MAPE(macc_oceania$fb_mac,macc_oceania$un_mac)*100

# LOOCV

results_oceaniaw <- list(NULL)
for (i in 1:(nrow(macc_oceania))) {
  leave.out.row <- macc_oceania[-c(i),]
  only.the.row <- macc_oceania[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_oceaniaw[[i]] <- frame
}

results_oceaniaw

# 8 country as in the paper
df <- data.frame(matrix(unlist(results_oceaniaw), nrow=8, byrow=T))
colnames(df) <- c("pred","mape")
macc_oceania<-cbind(macc_oceania,df)

# CORRELATION loocv
cor(macc_oceania$un_mac, macc_oceania$pred, method='spearman')
cor.test(macc_oceania$un_mac, macc_oceania$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_oceania$mape)
average_out_of_sample_mape*100
######################################################################################################
# South America
macc_sa<-comparison_macw[which(comparison_macw$continent=='South America'),]

# Correlation
cor(macc_sa$fb_mac,macc_sa$un_mac, method='spearman')
cor.test(macc_sa$fb_mac,macc_sa$un_mac, method="spearman")

# MAPE
MAPE(macc_asa$fb_mac,macc_sa$un_mac)*100

# LOOCV
results_saw <- list(NULL)
for (i in 1:(nrow(macc_sa))) {
  leave.out.row <- macc_sa[-c(i),]
  only.the.row <- macc_sa[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_saw[[i]] <- frame
}

results_saw

# 12 countries as in the paper
df <- data.frame(matrix(unlist(results_saw), nrow=12, byrow=T))
colnames(df) <- c("pred","mape")
macc_sa<-cbind(macc_sa,df)

# CORRELATION loocv
cor(macc_sa$un_mac, macc_sa$pred, method='spearman')
cor.test(macc_sa$un_mac, macc_sa$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_sa$mape)
average_out_of_sample_mape*100
################################# MALE ################################################
# Men
men <- filter(fb, genders==1, age!="15-49", fertility=="Child")

men$mage<-revalue(men$age,c(
  "15-19"="17.5",
  "20-24"="22.5",
  "25-29"="27.5",
  "30-34"="32.5",
  "35-39"="37.5",
  "40-44"="42.5",
  "45-49"="47.5"
))


men$mage<-as.numeric(men$mage)
class(men$mage)
men$audience<-as.numeric(men$audience)
men$ca=(men$mage*men$audience)

sort.people <- with(men,  men[order(country) , ])

the.names <- names(table((sort.people$country)))
all.MAC.men <- lapply(the.names, macfunction)
names(all.MAC.men) <- the.names
all.MAC.men
all.mac.men <- ldply (all.MAC.men, data.frame)
colnames(all.mac.men)<-c("country", "fb_mac")
all.mac.men<-subset(all.mac.men, fb_mac!=32.5)

# all.mac contains all the mean age at childbearing computed with the macfunction

# UN data for mac
UNdata_macm <- read_delim("UNdata_macm.csv", ";", escape_double = FALSE, trim_ws = TRUE)
head(UNdata_macm)

# Merge the fb data with the UN data
comparison_macm <- merge(all.mac.men,UNdata_macm, by="country")
head(comparison_macm)
# 78 countries, 82 in the paper, but there were mismatch.

######################################################################################################
# Overall Correlation
class(comparison_macm$un_mac)

cor(comparison_macm$un_mac, comparison_macm$fb_mac, method="pearson")
cor(comparison_macm$un_mac, comparison_macm$fb_mac, method="spearman")
cor.test(comparison_macm$fb_mac,comparison_macm$un_mac, method="spearman")
# As in the paper, the correlation is equal to 0.79, however, there are 4 countries less than
# in the calculation for the paper.

MAE(comparison_macm$un_mac, comparison_macm$fb_mac)
MAPE(comparison_macm$un_mac, comparison_macm$fb_mac)
MSE(comparison_macm$un_mac, comparison_macm$fb_mac)

######################################################################################################
# We now compute the Correlation and the Mape for each continent,
# and the leave-one-out-cross validation
######################################################################################################
# Africa
macc_africa<-comparison_macm[which(comparison_macm$continent=='Africa'),]

# CORRELATION
cor(macc_africa$fb_mac,macc_africa$un_mac, method='spearman')
cor.test(macc_africa$fb_mac,macc_africa$un_mac, method="spearman")

# MAPE
MAPE(macc_africa$fb_mac,macc_africa$un_mac)*100

# LOOCV
results_africam <- list(NULL)
for (i in 1:(nrow(macc_africa))) {
  leave.out.row <- macc_africa[-c(i),]
  only.the.row <- macc_africa[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_africam[[i]] <- frame
}

results_africam
# There are only two Africa countries: Egypt and Reunion. The UN provides the
# data for Mauritius as well, but it was not possible to include Mauritus in the analaysis
# since it has a FB mac of 32.5
# Table 11: https://unstats.un.org/unsd/demographic/products/dyb/dyb2014.htm
df <- data.frame(matrix(unlist(results_africam), nrow=2, byrow=T))
colnames(df) <- c("pred","mape")
macc_africa<-cbind(macc_africa,df)

# CORRELATION loocv
cor(macc_africa$un_mac, macc_africa$pred, method='spearman')
cor.test(macc_africa$un_mac, macc_africa$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_africa$mape)
average_out_of_sample_mape*100
######################################################################################################
# Asia
macc_asia<-comparison_macm[which(comparison_macm$continent=='Asia'),]

# CORRELATION
cor(macc_asia$fb_mac,macc_asia$un_mac, method='spearman')
cor.test(macc_asia$fb_mac,macc_asia$un_mac, method="spearman")

# MAPE
MAPE(macc_asia$fb_mac,macc_asia$un_mac)*100

results_asiam <- list(NULL)
for (i in 1:(nrow(macc_asia))) {
  leave.out.row <- macc_asia[-c(i),]
  only.the.row <- macc_asia[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_asiam[[i]] <- frame
}

results_asiam

# Two more Asian countries
df <- data.frame(matrix(unlist(results_asiam), nrow=16, byrow=T))
colnames(df) <- c("pred","mape")
macc_asia<-cbind(macc_asia,df)

# CORRELATION loocv
cor(macc_asia$un_mac, macc_asia$pred, method='spearman')
cor.test(macc_asia$un_mac, macc_asia$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_asia$mape)
average_out_of_sample_mape*100

######################################################################################################
# Europe
macc_europe<-comparison_macm[which(comparison_macm$continent=='Europe'),]

# CORRELATION
cor(macc_europe$fb_mac,macc_europe$un_mac, method='spearman')
cor.test(macc_europe$fb_mac,macc_europe$un_mac, method="spearman")

# MAPE
MAPE(macc_europe$fb_mac,macc_europe$un_mac)*100

results_europem <- list(NULL)
for (i in 1:(nrow(macc_europe))) {
  leave.out.row <- macc_europe[-c(i),]
  only.the.row <- macc_europe[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_europem[[i]] <- frame
}

results_europem

# 40  as in the paper
df <- data.frame(matrix(unlist(results_europem), nrow=40, byrow=T))
colnames(df) <- c("pred","mape")
macc_europe<-cbind(macc_europe,df)

# CORRELATION loocv
cor(macc_europe$un_mac, macc_europe$pred, method='spearman')
cor.test(macc_europe$un_mac, macc_europe$pred, method="spearman")
# MAPE loocv
average_out_of_sample_mape<- mean(macc_europe$mape)
average_out_of_sample_mape*100

######################################################################################################
# North America
macc_na<-comparison_macm[which(comparison_macm$continent=='North America'),]

# CORRELATION
cor(macc_na$fb_mac,macc_na$un_mac, method='spearman')
cor.test(macc_na$fb_mac,macc_na$un_mac, method="spearman")

# MAPE
MAPE(macc_na$fb_mac,macc_na$un_mac)*100

results_nam <- list(NULL)
for (i in 1:(nrow(macc_na))) {
  leave.out.row <- macc_na[-c(i),]
  only.the.row <- macc_na[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_nam[[i]] <- frame
}

results_nam

# One country less
df <- data.frame(matrix(unlist(results_nam), nrow=12, byrow=T))
colnames(df) <- c("pred","mape")
macc_na<-cbind(macc_na,df)

# CORRELATION loocv
cor(macc_na$un_mac, macc_na$pred, method='spearman')
cor.test(macc_na$un_mac, macc_na$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_na$mape)
average_out_of_sample_mape*100

######################################################################################################
# Oceania
macc_oceania<-comparison_macm[which(comparison_macm$continent=='Oceania'),]

# CORRELATION
cor(macc_oceania$fb_mac,macc_oceania$un_mac, method='spearman')
cor.test(macc_oceania$fb_mac,macc_oceania$un_mac, method="spearman")

# MAPE
MAPE(macc_oceania$fb_mac,macc_oceania$un_mac)*100

results_oceaniam <- list(NULL)
for (i in 1:(nrow(macc_oceania))) {
  leave.out.row <- macc_oceania[-c(i),]
  only.the.row <- macc_oceania[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_oceaniam[[i]] <- frame
}

results_oceaniam

# Change number of row
df <- data.frame(matrix(unlist(results_oceaniam), nrow=2, byrow=T))
colnames(df) <- c("pred","mape")
macc_oceania<-cbind(macc_oceania,df)

# CORRELATION loocv
cor(macc_oceania$un_mac, macc_oceania$pred, method='spearman')
cor.test(macc_oceania$un_mac, macc_oceania$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_oceania$mape)
average_out_of_sample_mape*100

######################################################################################################
# South America
macc_sa<-comparison_macm[which(comparison_macm$continent=='South America'),]

# CORRELATION
cor(macc_sa$fb_mac,macc_sa$un_mac, method='spearman')
cor.test(macc_sa$fb_mac,macc_sa$un_mac, method="spearman")

# MAPE
MAPE(macc_sa$fb_mac,macc_sa$un_mac)*100


results_sam <- list(NULL)
for (i in 1:(nrow(macc_sa))) {
  leave.out.row <- macc_sa[-c(i),]
  only.the.row <- macc_sa[i,]
  model <- lm(un_mac ~ fb_mac, data=leave.out.row)
  pred <- predict(model, newdata=only.the.row)
  mape <- (abs(pred -only.the.row$un_mac)/only.the.row$un_mac)
  frame <- data.frame(pred=pred, mape=mape)
  results_sam[[i]] <- frame
}

results_sam

# Change number of row
df <- data.frame(matrix(unlist(results_sam), nrow=6, byrow=T))
colnames(df) <- c("pred","mape")
macc_sa<-cbind(macc_sa,df)

# CORRELATION loocv
cor(macc_sa$un_mac, macc_sa$pred, method='spearman')
cor.test(macc_sa$un_mac, macc_sa$pred, method="spearman")

# MAPE loocv
average_out_of_sample_mape<- mean(macc_sa$mape)
average_out_of_sample_mape*100

######################################################################################################

# Regression

unfbmac_male <- read_delim("unfbmac_allmale.csv", ";", escape_double = FALSE, trim_ws = TRUE)

# linear regression
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac)
summary(m1)
summary(m1)$adj.r.squared
pre<-predict.lm(m1)

sample<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac)
summary(m1)$adj.r.squared
pre<-predict.lm(m1)
funcor<- function(sample) {
  # sample
  sample<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
  # regression
  m1<-lm(un_mac~fb_mac, data=sample)
  # prediction
  pre<-predict.lm(m1)
  return(pre)
}

the.names.country <- names(table((unfbmac_male$country)))
all.pre <- lapply(the.names.country, funcor)
names(all.pre) <- the.names.country
all.pre

# We sample removing 10 observations from the total number of male observations.
#------
#1
sample1<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m1<-lm(un_mac~fb_mac, data=sample1)
adjr1<-summary(m1)$adj.r.squared
pre<-predict.lm(m1)
#2
sample2<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m2<-lm(un_mac~fb_mac, data=sample2)
adjr2<-summary(m2)$adj.r.squared
pre<-predict.lm(m2)
#3
sample3<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m3<-lm(un_mac~fb_mac, data=sample3)
adjr3<-summary(m3)$adj.r.squared
pre<-predict.lm(m3)
#4
sample4<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m4<-lm(un_mac~fb_mac, data=sample4)
adjr4<-summary(m4)$adj.r.squared
pre<-predict.lm(m4)
#5
sample5<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m5<-lm(un_mac~fb_mac, data=sample5)
adjr5<-summary(m5)$adj.r.squared
pre<-predict.lm(m5)
#6
sample6<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m6<-lm(un_mac~fb_mac, data=sample6)
adjr6<-summary(m6)$adj.r.squared
pre<-predict.lm(m6)
#7
sample7<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m7<-lm(un_mac~fb_mac, data=sample7)
adjr7<-summary(m7)$adj.r.squared
pre<-predict.lm(m7)
#8
sample8<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m8<-lm(un_mac~fb_mac, data=sample8)
adjr8<-summary(m8)$adj.r.squared
pre<-predict.lm(m8)
#9
sample9<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m9<-lm(un_mac~fb_mac, data=sample9)
adjr9<-summary(m9)$adj.r.squared
pre<-predict.lm(m9)
#10
sample10<-unfbmac_male[sample(nrow(unfbmac_male), 68), ]
m10<-lm(un_mac~fb_mac, data=sample10)
adjr10<-summary(m10)$adj.r.squared
pre<-predict.lm(m10)

#adjr

adjr<-rbind(adjr1, adjr2,adjr3,adjr4,adjr5, adjr6, adjr7, adjr8, adjr9, adjr10)
adjr<-data.frame(adjr)
summary(adjr$adjr)

#############################
## Out of sample analysis ###
#############################

MAPE_values<- NULL

for(ii in 1:10){
  unfbmac_male$pred <- NA
  training_data<-unfbmac_male[!with(unfbmac_male,is.na(fb_mac) | is.na(un_mac)),]
  test_data<- unfbmac_male[is.na(unfbmac_male$un_mac),]
  model<-lm(un_mac ~ fb_mac, data=training_data)
  training_data$pred<-predict(model,newdata=training_data)

  mape_pred_single_run<- mean(abs(training_data$pred - training_data$un_mac)/training_data$un_mac)

  MAPE_values<- c(MAPE_values,mape_pred_single_run)
}

MAPE_values

average_out_of_sample_mape<- mean(MAPE_values)

average_out_of_sample_mape
average_out_of_sample_mape*100

################################################################################################
## Model summary with all the observations for predicting the MAC
## for the countries not included in the UN data
################################################################################################

unfbmac_male <- read_delim("unfbmac_allmale.csv", ";", escape_double = FALSE, trim_ws = TRUE)

# linear regression
m1<-lm(unfbmac_male$un_mac~unfbmac_male$fb_mac)
summary(m1)
summary(m1)$adj.r.squared
pre<-predict(m1)

leave_out_rows<-unfbmac_male[is.na(unfbmac_male$un_mac),]
training_data<- na.omit(unfbmac_male)
test_data<- unfbmac_male[is.na(unfbmac_male$un_mac),]
model<-lm(un_mac ~ fb_mac, data=training_data)
unfbmac_male$pred<-predict(model,newdata=unfbmac_male)
unfbmac_male$pred

map_data<-unfbmac_male[is.na(unfbmac_male$un_mac),]
map_data

data(countryExData)
sPDF <- joinCountryData2Map(map_data, joinCode = "ISO3", nameJoinColumn = "ISO3V10")
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i")
mapParams <- mapCountryData( sPDF, nameColumnToPlot="pred" , addLegend=TRUE, mapTitle="")
# Prediction on 82 countries (we exclude China)