Preparing data
Libraries
fpackage.check <- function(packages) { # (c) Jochem Tolsma
lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
})
}
packages = c("tidyverse","kableExtra", "ggplot2", "patchwork", "foreach", "ggpattern")
fpackage.check(packages)
Import results and data
Import the NSUM data and recreate the NSUM module.
#import nells file.
load(file = "data_analysis/data/data_processed/nells_data/2022-11-09_nells-nsum-prepped-data.rds")
Import the model estimates from the estimated NSUM models. I have
chosen the model which uses Ibrahim for the ethnic names.
if (file.exists(
"data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds"
)) {
load(file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
} else {
list_files <-
as.list(
dir(
"data_analysis/results/nsum_output/main/model/",
full.names = T
)
)
#create loop lists
kds <- list()
kdssd <- list()
data <- list()
list_df <- list()
#loop to extract information
for (i in 1:length(list_files)) {
#i = 1
print(paste0("Number ", i, " of ", length(list_files)))
load(list_files[[i]])
kds[[i]] <-
rowMeans(degree$d.values, na.rm = TRUE) # calculate rowmean of netsize iterations: so the retained chains
kdssd[[i]] <-
matrixStats::rowSds(degree$d.values) # calculate sd of 4k estimates per row: sd for those values
data[[i]] <- cbind(kds[[i]], kdssd[[i]]) # combine them
list_df[[i]] <-
cbind(as_tibble(data[[i]]), nells_nsum$id) # add NELLS id variable
strings <-
str_split(str_extract(list_files[[i]][1], pattern = "estimates.+"),
pattern = "_") # add holdout number
list_df[[i]] <- list_df[[i]] %>%
mutate(
holdout = as.numeric(str_extract(strings[[1]][2], pattern = "[[:digit:]]{1,}")))
}
#combine results and save
df_models_nsum_long <- list_df %>%
bind_rows() %>%
rename(mean = V1,
sd = V2,
id = 3)
#save image
save(df_models_nsum_long, file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
}
We use Ibrahim as population for the size estimates, so let’s combine
the size estimates from holdout 10 with the other NSUM information.
#select holdout ten 10
size_selection <- df_models_nsum_long %>%
dplyr::filter(holdout == 10)
#add netsize data to NELLS data
nells_df <- size_selection %>%
left_join(nells_nsum, by = "id")
Selection of respondents
We remove 32 observations as they deviate more than 3 SD from the
mean.
nells_df <- nells_df %>%
mutate(mean_size = mean(mean, na.rm = T),
sd_size = sd(mean, na.rm = T),
z = (mean - mean_size)/sd_size) %>%
filter(z < 3)
#filter out other
nells_df <- nells_df %>%
filter(migration_background_fac != "Other")
Describing network size
First of all, we want to show the density distribution of extended
network size. We also show the median size of extended networks. These
estimates are in line with previous estimates that have been found of
extended network size.
options(scipen = 999)
size_density_plot <- nells_df %>%
ggplot(aes(x = mean)) +
geom_density(alpha = 0.4,
colour = "black",
fill = "grey") +
geom_vline(xintercept = median(nells_df$mean, na.rm = T),
colour = "red") +
annotate(
"text",
x = 1500,
y = 0.0008,
label = paste("Median:", as.character(round(
median(nells_df$mean, na.rm = T), 3
))),
colour = "black"
) +
#facet_wrap(vars(migration_background_fac)) +
scale_fill_viridis_d() +
scale_color_viridis_d() +
theme(
panel.background = element_rect(fill = "#FFFFFF"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_text(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#A9A9A9"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
legend.position = "right",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(x = "Extended social network size", y = "Density")
#show plot
size_density_plot
#export plot
ggsave(size_density_plot,
file = "data_analysis/plots/descriptive/density_network_size.jpg",
width = 5,
height = 4,
dpi = 320)
Group comparison
To present differences in network size
Extended groups boxplot
nells_df <- nells_df %>%
mutate(
migration_background_fac = fct_relevel(migration_background_fac,
"Dutch",
"1st gen Turkish",
"2nd gen Turkish",
"1st gen Moroccan",
"2nd gen Moroccan"),
migration_background_fac = factor(as.numeric(migration_background_fac),
levels = 1:5,
labels = c("Dutch Majority",
"1st gen Turkish-Dutch",
"2nd gen Turkish-Dutch",
"1st gen Moroccan-Dutch",
"2nd gen Moroccan-Dutch")),
migration_background_simple_fac = case_when(
migration_background_fac == "1st gen Turkish-Dutch" ~ 2,
migration_background_fac == "2nd gen Turkish-Dutch" ~ 2,
migration_background_fac == "1st gen Moroccan-Dutch" ~ 3,
migration_background_fac == "2nd gen Moroccan-Dutch" ~ 3,
migration_background_fac == "Dutch Majority" ~ 1
),
migration_background_simple_fac = factor(
migration_background_simple_fac,
levels = 1:3,
labels = c("Dutch Majority", "Turkish-Dutch", "Moroccan-Dutch")
),
migrant_generation = case_when(
str_detect(migration_background_fac, "1st") ~ 1,
str_detect(migration_background_fac, "2nd") ~ 2
)
)
Boxplot
Create panel with complete groups (no generation distinction).
#set custom pallet
pal <- c("#66c2a5",
"#fc8d62",
"#8da0cb")
#create simple boxplot
boxplot_size_simple_groups <- nells_df %>%
ggplot(aes(x = fct_rev(migration_background_simple_fac),
y = mean,
fill = migration_background_simple_fac,
colour = migration_background_simple_fac
)) +
geom_boxplot(alpha = 0.6) +
coord_flip() +
scale_colour_manual(
values = pal,
aesthetics = c("colour", "fill")
) +
theme(
panel.background = element_rect(fill = "#FFFFFF"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_text(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#A9A9A9"),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(y = "",
x = "")
Create panel with generation distinction.
#set custom pallet
pal <- c("#fc8d62",
"#fc8d62",
"#8da0cb",
"#8da0cb")
#create extended boplot
boxplot_size_extended_groups <- nells_df %>%
filter(migration_background_fac != "Dutch Majority") %>%
ggplot(aes(x = fct_rev(migration_background_fac),
y = mean,
fill = migration_background_fac,
colour = migration_background_fac
)) +
geom_boxplot_pattern(aes(pattern_density = as.factor(migrant_generation)),
alpha = 0.6,
pattern = "circle"
) +
coord_flip() +
scale_colour_manual(
values = pal,
aesthetics = c("colour", "fill")
) +
scale_pattern_density_manual(values = c("1" = 0, "2"=0.1)) +
theme(
panel.background = element_rect(fill = "#FFFFFF"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_text(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#A9A9A9"),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(y = "",
x = "")
Combine panels in multipanel plot.
## Panel plot
hom_size_panel <- boxplot_size_simple_groups +
boxplot_size_extended_groups +
plot_annotation(tag_levels ='a',
tag_prefix = '(',
tag_suffix = ')') +
plot_layout(ncol = 1,
guides = "collect",
heights = c(1,2)) &
theme(legend.position='none')
hom_size_panel
ggsave(hom_size_panel,
file = "data_analysis/plots/descriptive/size_plot_panel.jpg",
width = 8,
height = 5,
dpi = 320)
Ethnic Homogeneity
Multipanel boxplot
Prepare data for ethnic homogeneity plot.
#weighted by name frequency.
nells_df <- nells_df %>%
mutate(
sum_dutch_w = knows_daan_boundary/22704 +
knows_kevin_boundary/23167 +
knows_emma_boundary/18730 +
knows_linda_boundary/29955 +
knows_albert_boundary/31767 +
knows_edwin_boundary/21866 +
knows_willemina_boundary/17133 +
knows_ingrid_boundary/31323,
sum_turkish_w = knows_ibrahim_boundary/2099 +
knows_esra_boundary/1878,
sum_moroccan_w = knows_mohammed_boundary/13448 +
knows_fatima_boundary/2808,
sum_total_w = sum_dutch_w + sum_turkish_w + sum_moroccan_w,
per_dutch_w = (sum_dutch_w / sum_total_w) * 100,
per_turkish_w = (sum_turkish_w / sum_total_w) * 100,
per_moroccan_w = (sum_moroccan_w / sum_total_w) * 100
)
#assign correct percentage co-ethnic to each group
nells_df <- nells_df %>%
mutate(per_ingroup_w = case_when(
str_detect(migration_background_fac, "kish") ~ per_turkish_w,
str_detect(migration_background_fac, "occan") ~ per_moroccan_w,
migration_background_fac == "Dutch Majority" ~ per_dutch_w,
migration_background_fac == "Other" ~ per_dutch_w
))
Create panel with complete groups (no generation distinction).
#set custom pallet
pal <- c("#66c2a5",
"#fc8d62",
"#8da0cb")
#create graph for simple groups
boxplot_hom_simple_groups <- nells_df %>%
ggplot(aes(x = fct_rev(migration_background_simple_fac),
y = per_ingroup_w,
fill = migration_background_simple_fac,
colour = migration_background_simple_fac
)) +
geom_boxplot(alpha = 0.6) +
coord_flip() +
scale_colour_manual(
values = pal,
aesthetics = c("colour", "fill")
) +
theme(
panel.background = element_rect(fill = "#FFFFFF"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_text(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#A9A9A9"),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(y = "",
x = "")
Create panel with generation distinction.
#set custom pallet
pal <- c("#fc8d62",
"#fc8d62",
"#8da0cb",
"#8da0cb")
#create boxplot for extended groups
boxplot_hom_extended_groups <- nells_df %>%
filter(migration_background_fac != "Dutch Majority") %>%
ggplot(aes(x = fct_rev(migration_background_fac),
y = per_ingroup_w,
colour = migration_background_fac,
fill = migration_background_fac,
)) +
geom_boxplot_pattern(aes(pattern_density = as.factor(migrant_generation)),
alpha = 0.6,
pattern = "circle"
) +
coord_flip() +
scale_colour_manual(
values = pal,
aesthetics = c("colour", "fill")
) +
scale_pattern_density_manual(values = c("1" = 0, "2"=0.1)) +
theme(
panel.background = element_rect(fill = "#FFFFFF"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_text(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#A9A9A9"),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(y = "",
x = "")
Create multipanel plot.
## Panel plot
hom_plot_panel <- boxplot_hom_simple_groups +
boxplot_hom_extended_groups +
plot_annotation(tag_levels ='a',
tag_prefix = '(',
tag_suffix = ')') +
plot_layout(ncol = 1,
guides = "collect",
heights = c(1,2)) &
theme(legend.position='none')
#show plot
hom_plot_panel
#save plor
ggsave(hom_plot_panel,
file = "data_analysis/plots/descriptive/hom_plot_panel.jpg",
width = 8,
height = 5,
dpi = 320)
Name differences
For every x (for now names) we can estimate an NB regression to see
differences between migration backgrounds. Please note: this does not
take into account naming frequency in the population. Differences
between different ethnic groups may indeed be larger or smaller for
different names.
# use a loop.
#set var_names to use in loop.
variable_names_model <- c("knows_daan",
"knows_kevin",
"knows_edwin",
"knows_albert",
"knows_emma",
"knows_linda",
"knows_ingrid",
"knows_willemina",
"knows_mohammed",
"knows_fatima",
"knows_ibrahim",
"knows_esra")
#start analysis loop
model_results <- list()
for(i in 1:length(variable_names_model)) {#i = 1
fm <- as.formula(paste(variable_names_model[[i]], "~", "migration_background_fac"))
model_results[[i]] <- MASS::glm.nb(fm,
data = nells_df)
}
#clean output with tidy r
model_results_df_list <- model_results %>%
purrr::map(.x =.,
.f = ~ broom::tidy(.x))
#add var_names to model_results
for(i in 1:length(model_results_df_list)){
model_results_df_list[[i]] <- model_results_df_list[[i]] %>%
mutate(dep_var = variable_names_model[i])
}
#combine model dfs.
model_results_df <- model_results_df_list %>%
bind_rows()
#set correct variable names
model_results_df <- model_results_df %>%
mutate(
term = case_when(
str_detect(term, "2nd gen Moroccan") ~ "2nd gen Moroccan-Dutch",
str_detect(term, "2nd gen Turkish") ~ "2nd gen Turkish-Dutch",
str_detect(term, "1st gen Moroccan") ~ "1st gen Moroccan-Dutch",
str_detect(term, "1st gen Turkish") ~ "1st gen Turkish-Dutch",
term == "(Intercept)" ~ "Intercept"
)
)
#Set correct names
correct_names <- model_results_df %>%
pull(dep_var) %>%
str_replace(., pattern = "knows_", replacement = "") %>%
str_to_title()
#drop old names and add the correct names
model_results_df <- model_results_df %>%
select(-dep_var) %>%
mutate(dep_var = correct_names)
Predicted counts plot for names and ethnicity
pred_nb_f <- function(nb_model, names){#nb_model = model_results[[1]], names = variable_names_model[[1]]
pred <- predict(object = nb_model,
type = "response",
se.fit = T
)
plot_df <- nells_df %>%
select(id, migration_background_fac) %>%
bind_cols(pred) %>%
mutate(dep_var = names)
return(plot_df)
}
model_pred_list <- map2(.x = model_results,
.y = variable_names_model,
.f = ~pred_nb_f(nb_model = .x,
names = .y))
model_pred_df <- model_pred_list %>%
bind_rows()
#Set correct names
correct_names <- model_pred_df %>%
pull(dep_var) %>%
str_replace(., pattern = "knows_", replacement = "") %>%
str_to_title()
#drop old names and add the correct names
model_pred_df <- model_pred_df %>%
select(-dep_var) %>%
mutate(dep_var = correct_names)
#set custom pallet
pal <- c("#66c2a5",
"#fc8d62",
"#fc8d62",
"#8da0cb",
"#8da0cb")
#crete plot with minority names
ethnic_names_pred_plot <- model_pred_df %>%
filter(dep_var %in% c("Mohammed",
"Fatima",
"Ibrahim",
"Esra")) %>%
ggplot(aes(x = dep_var,
y = fit,
shape = migration_background_fac)) +
geom_linerange(aes(ymin = fit - (se.fit *1.96),
ymax = fit + (se.fit *1.96)),
position = position_dodge(width = 1)) +
geom_point(aes(colour = migration_background_fac,
fill = migration_background_fac),
position = position_dodge(width = 1)) +
facet_wrap(vars(dep_var),
scales = "free",
ncol = 2) +
scale_colour_manual(
values = pal,
aesthetics = c("colour", "fill")
) +
scale_shape_manual(values = c(21,22,24,22,24)) +
theme(
panel.background = element_rect(fill = "#FFFFFF",
colour = "black"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_blank(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#FFFFFF"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(x = "",
y = "",
colour = "",
shape = "",
fill = ""
)
#set custom pallet
pal <- c("#66c2a5",
"#fc8d62",
"#fc8d62",
"#8da0cb",
"#8da0cb")
#create plot for majority names
non_ethnic_names_pred_plot <- model_pred_df %>%
filter(!dep_var %in% c("Mohammed",
"Fatima",
"Ibrahim",
"Esra")) %>%
ggplot(aes(x = dep_var,
y = fit,
shape = migration_background_fac)) +
geom_linerange(aes(ymin = fit - (se.fit *1.96),
ymax = fit + (se.fit *1.96)),
position = position_dodge(width = 1)) +
geom_point(aes(colour = migration_background_fac,
fill = migration_background_fac),
position = position_dodge(width = 1)) +
facet_wrap(vars(dep_var),
scales = "free_x",
ncol = 2) +
scale_colour_manual(
values = pal,
aesthetics = c("colour", "fill")
) +
scale_shape_manual(values = c(21,22,24,22,24)) +
theme(
panel.background = element_rect(fill = "#FFFFFF",
colour = "black"),
plot.background = element_rect(fill = "#FFFFFF"),
panel.grid = element_line(colour = "grey"),
text = element_text(family = "sans", size = 12),
axis.title.x = element_text(hjust = 0.9, face = "bold"),
axis.text.x = element_blank(),
axis.line = element_blank(),
axis.title.y = element_text(hjust = 0.9, face = "bold"),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "#FFFFFF"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
legend.position = "none",
legend.title = element_blank(),
legend.background = element_rect(fill = "#FFFFFF"),
legend.key = element_rect(fill = "#FFFFFF")
) +
labs(x = "",
y = "",
colour = "",
shape = "",
fill = ""
)
#Combine plots in multipanel plot
names_pred_het_panel <- ethnic_names_pred_plot +
non_ethnic_names_pred_plot +
plot_annotation(
tag_levels = 'a',
tag_prefix = '(',
tag_suffix = ')'
) +
plot_layout(ncol = 1,
heights = c(1, 3),
guides = 'collect',
) &
theme(legend.position = c(-2,-5),
legend.direction = 'vertical')
#preview plot
names_pred_het_panel
#save plot
ggsave(names_pred_het_panel,
file = "data_analysis/plots/descriptive/names_het_pred_panel.jpg",
width = 6,
height = 8,
dpi = 320)
---
title: "Describing extended network size and ethnic composition"
author: "Thijmen Jeroense"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    toc: TRUE
    toc_depth: 3
    toc_float: TRUE
    code_folding: show
    code_download: TRUE
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(cache = TRUE, message = FALSE, warning = FALSE, results = "asis",
                      fig.align = "center")
```

# Goal
Recreate the descriptive analysis and plots used in the published research paper.

# Preparing data

## Libraries

```{r libraries and files, results = 'hide'}
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  })
}
packages = c("tidyverse","kableExtra", "ggplot2", "patchwork", "foreach", "ggpattern")
fpackage.check(packages)

```

## Import results and data

Import the NSUM data and recreate the NSUM module. 

```{r files}
#import nells file.
load(file = "data_analysis/data/data_processed/nells_data/2022-11-09_nells-nsum-prepped-data.rds")

```

Import the model estimates from the estimated NSUM models. I have chosen the model which uses Ibrahim for the ethnic names.

```{r import results}

if (file.exists(
  "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds"
)) {
  load(file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
} else {
  list_files <-
    as.list(
      dir(
        "data_analysis/results/nsum_output/main/model/",
        full.names = T
      )
    )
  #create loop lists
  kds <- list()
  kdssd <- list()
  data <- list()
  list_df <- list()
  
  #loop to extract information
  for (i in 1:length(list_files)) {
    #i = 1
    print(paste0("Number ", i, " of ", length(list_files)))
    load(list_files[[i]])
    kds[[i]] <-
      rowMeans(degree$d.values, na.rm = TRUE) # calculate rowmean of netsize iterations: so the retained chains
    kdssd[[i]] <-
      matrixStats::rowSds(degree$d.values) # calculate sd of 4k estimates per row: sd for those values
    data[[i]] <- cbind(kds[[i]], kdssd[[i]]) # combine them
    list_df[[i]] <-
      cbind(as_tibble(data[[i]]), nells_nsum$id) # add NELLS id variable
    strings <-
      str_split(str_extract(list_files[[i]][1], pattern = "estimates.+"),
                pattern = "_")  # add holdout number
    list_df[[i]] <- list_df[[i]] %>%
      mutate(
        holdout = as.numeric(str_extract(strings[[1]][2], pattern = "[[:digit:]]{1,}")))
  }
    #combine results and save
    df_models_nsum_long <- list_df %>%
      bind_rows() %>%
      rename(mean = V1,
             sd = V2,
             id = 3)
    #save image
    save(df_models_nsum_long, file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
}
```

We use Ibrahim as population for the size estimates, so let's combine the size estimates from holdout 10 with the other NSUM information. 

```{r holdout 2, fig.width=7, fig.height=7}
#select holdout ten 10
size_selection <- df_models_nsum_long %>% 
  dplyr::filter(holdout == 10) 

#add netsize data to NELLS data
nells_df <- size_selection %>% 
  left_join(nells_nsum, by = "id")

```

## Selection of respondents

We remove 32 observations as they deviate more than 3 SD from the mean. 

```{r delete outliers}
nells_df <- nells_df %>% 
    mutate(mean_size = mean(mean, na.rm = T),
         sd_size = sd(mean, na.rm = T),
         z = (mean - mean_size)/sd_size) %>% 
  filter(z < 3) 

#filter out other
nells_df <- nells_df %>% 
  filter(migration_background_fac != "Other")


```

# Describing network size 

First of all, we want to show the density distribution of extended network size. We also show the median size of extended networks. These estimates are in line with previous estimates that have been found of extended network size. 

```{r density plot sample, width = 4, heigth = 4}
options(scipen = 999)

size_density_plot <- nells_df %>%
  ggplot(aes(x = mean)) +
  geom_density(alpha = 0.4,
               colour = "black",
               fill = "grey") +
  geom_vline(xintercept = median(nells_df$mean, na.rm = T),
             colour = "red") +
  annotate(
    "text",
    x = 1500,
    y = 0.0008,
    label = paste("Median:", as.character(round(
      median(nells_df$mean, na.rm = T), 3
    ))),
    colour = "black"
  ) +
  #facet_wrap(vars(migration_background_fac)) +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme(
    panel.background = element_rect(fill = "#FFFFFF"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_text(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#A9A9A9"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "right",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
  labs(x = "Extended social network size", y = "Density")

#show plot
size_density_plot

#export plot
ggsave(size_density_plot,
       file = "data_analysis/plots/descriptive/density_network_size.jpg",
       width = 5,
       height = 4,
       dpi = 320)


```

## Group comparison

To present differences in network size

### Extended groups boxplot

```{r prepare factor labels}
nells_df <- nells_df %>%
  mutate(
    migration_background_fac = fct_relevel(migration_background_fac,
                                                "Dutch",
                                                "1st gen Turkish",
                                                "2nd gen Turkish",
                                                "1st gen Moroccan",
                                                "2nd gen Moroccan"),
         migration_background_fac = factor(as.numeric(migration_background_fac),
         levels = 1:5,
         labels = c("Dutch Majority",
                                                "1st gen Turkish-Dutch",
                                                "2nd gen Turkish-Dutch",
                                                "1st gen Moroccan-Dutch",
                                                "2nd gen Moroccan-Dutch")),
    migration_background_simple_fac = case_when(
      migration_background_fac == "1st gen Turkish-Dutch" ~ 2,
      migration_background_fac == "2nd gen Turkish-Dutch" ~ 2,
      migration_background_fac == "1st gen Moroccan-Dutch" ~ 3,
      migration_background_fac == "2nd gen Moroccan-Dutch" ~ 3,
      migration_background_fac == "Dutch Majority" ~ 1
    ),
    migration_background_simple_fac = factor(
      migration_background_simple_fac,
      levels = 1:3,
      labels = c("Dutch Majority", "Turkish-Dutch", "Moroccan-Dutch")
    ),
    migrant_generation = case_when(
      str_detect(migration_background_fac, "1st") ~ 1,
      str_detect(migration_background_fac, "2nd") ~ 2
    )
  )

```

#### Boxplot 

Create panel with complete groups (no generation distinction).

```{r compare simple groups size}
#set custom pallet
pal <- c("#66c2a5",
         "#fc8d62",
         "#8da0cb")

#create simple boxplot
boxplot_size_simple_groups <- nells_df %>%
  ggplot(aes(x = fct_rev(migration_background_simple_fac),
             y = mean,
             fill = migration_background_simple_fac,
             colour = migration_background_simple_fac
             )) +
  geom_boxplot(alpha = 0.6) +
  coord_flip() +
  scale_colour_manual(
    values = pal,
    aesthetics = c("colour", "fill")
  ) +
   theme(
    panel.background = element_rect(fill = "#FFFFFF"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_text(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#A9A9A9"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
  labs(y  = "",
       x = "")
```

Create panel with generation distinction.

```{r compare ethnic size boxplot extended, fig.width=5, fig.height=5}
#set custom pallet
pal <- c("#fc8d62",
         "#fc8d62",
         "#8da0cb",
         "#8da0cb")

#create extended boplot
boxplot_size_extended_groups <- nells_df %>%
  filter(migration_background_fac != "Dutch Majority") %>% 
  ggplot(aes(x = fct_rev(migration_background_fac),
             y = mean,
             fill = migration_background_fac,
             colour = migration_background_fac
             )) +
  geom_boxplot_pattern(aes(pattern_density = as.factor(migrant_generation)),
                       alpha = 0.6,
                       pattern = "circle"
                       ) +
  coord_flip() +
  scale_colour_manual(
    values = pal,
    aesthetics = c("colour", "fill")
  )  +
  scale_pattern_density_manual(values = c("1" = 0, "2"=0.1)) +
   theme(
    panel.background = element_rect(fill = "#FFFFFF"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_text(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#A9A9A9"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
  labs(y  = "",
       x = "")
```

Combine panels in multipanel plot.

```{r compare size plots}

## Panel plot
hom_size_panel <- boxplot_size_simple_groups + 
  boxplot_size_extended_groups + 
  plot_annotation(tag_levels ='a',
                  tag_prefix = '(',
                  tag_suffix = ')') +
  plot_layout(ncol = 1,
              guides = "collect",
              heights = c(1,2)) & 
  theme(legend.position='none')


hom_size_panel

ggsave(hom_size_panel,
       file = "data_analysis/plots/descriptive/size_plot_panel.jpg",
       width = 8,
       height = 5,
       dpi = 320)

```

# Ethnic Homogeneity

## Multipanel boxplot

Prepare data for ethnic homogeneity plot.

```{r create ethnic homogeneity}
#weighted by name frequency.
nells_df <- nells_df %>%
  mutate(
    sum_dutch_w = knows_daan_boundary/22704 + 
      knows_kevin_boundary/23167 +
      knows_emma_boundary/18730 + 
      knows_linda_boundary/29955 + 
      knows_albert_boundary/31767 + 
      knows_edwin_boundary/21866 + 
      knows_willemina_boundary/17133 + 
      knows_ingrid_boundary/31323,
    sum_turkish_w = knows_ibrahim_boundary/2099 + 
      knows_esra_boundary/1878,
    sum_moroccan_w = knows_mohammed_boundary/13448 + 
      knows_fatima_boundary/2808,
    sum_total_w = sum_dutch_w + sum_turkish_w + sum_moroccan_w,
    per_dutch_w = (sum_dutch_w / sum_total_w) * 100,
    per_turkish_w = (sum_turkish_w / sum_total_w) * 100,
    per_moroccan_w = (sum_moroccan_w / sum_total_w) * 100
  )

#assign correct percentage co-ethnic to each group
nells_df <- nells_df %>% 
  mutate(per_ingroup_w = case_when(
    str_detect(migration_background_fac, "kish") ~ per_turkish_w,
    str_detect(migration_background_fac, "occan") ~ per_moroccan_w,
    migration_background_fac == "Dutch Majority" ~ per_dutch_w,
    migration_background_fac == "Other" ~ per_dutch_w
  ))

```

Create panel with complete groups (no generation distinction).

```{r compare simple groups hom}
#set custom pallet
pal <- c("#66c2a5",
         "#fc8d62",
         "#8da0cb")

#create graph for simple groups
boxplot_hom_simple_groups <- nells_df %>%
  ggplot(aes(x = fct_rev(migration_background_simple_fac),
             y = per_ingroup_w,
             fill = migration_background_simple_fac,
             colour = migration_background_simple_fac
             )) +
  geom_boxplot(alpha = 0.6) +
  coord_flip() +
  scale_colour_manual(
    values = pal,
    aesthetics = c("colour", "fill")
  ) +
   theme(
    panel.background = element_rect(fill = "#FFFFFF"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_text(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#A9A9A9"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
  labs(y  = "",
       x = "")
```

Create panel with generation distinction.

```{r compare ethnic hom boxplot extended, fig.width=5, fig.height=5}
#set custom pallet
pal <- c("#fc8d62",
         "#fc8d62",
         "#8da0cb",
         "#8da0cb")

#create boxplot for extended groups
boxplot_hom_extended_groups <- nells_df %>%
  filter(migration_background_fac != "Dutch Majority") %>% 
  ggplot(aes(x = fct_rev(migration_background_fac),
             y = per_ingroup_w,
             colour = migration_background_fac,
             fill = migration_background_fac,
             )) +
    geom_boxplot_pattern(aes(pattern_density = as.factor(migrant_generation)),
                       alpha = 0.6,
                       pattern = "circle"
                       ) +
  coord_flip() +
  scale_colour_manual(
    values = pal,
    aesthetics = c("colour", "fill")
  )  +
  scale_pattern_density_manual(values = c("1" = 0, "2"=0.1)) +
   theme(
    panel.background = element_rect(fill = "#FFFFFF"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_text(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#A9A9A9"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
  labs(y  = "",
       x = "")
```

Create multipanel plot.

```{r compare plots hom}

## Panel plot
hom_plot_panel <- boxplot_hom_simple_groups + 
  boxplot_hom_extended_groups + 
  plot_annotation(tag_levels ='a',
                  tag_prefix = '(',
                  tag_suffix = ')') +
  plot_layout(ncol = 1,
              guides = "collect",
              heights = c(1,2)) & 
  theme(legend.position='none')

#show plot
hom_plot_panel

#save plor
ggsave(hom_plot_panel,
       file = "data_analysis/plots/descriptive/hom_plot_panel.jpg",
       width = 8,
       height = 5,
       dpi = 320)

```


## Name differences

For every x (for now names) we can estimate an NB regression to see differences between migration backgrounds. Please note: this does not take into account naming frequency in the population. Differences between different ethnic groups may indeed be larger or smaller for different names. 


```{r analysis of x categories}
# use a loop.
#set var_names to use in loop. 
variable_names_model <- c("knows_daan",
  "knows_kevin", 
  "knows_edwin",
  "knows_albert",
  "knows_emma",
  "knows_linda",
  "knows_ingrid",
  "knows_willemina",
  "knows_mohammed",
  "knows_fatima",
  "knows_ibrahim",
  "knows_esra")

#start analysis loop
model_results <- list()

for(i in 1:length(variable_names_model)) {#i = 1
  fm <- as.formula(paste(variable_names_model[[i]], "~", "migration_background_fac"))
  model_results[[i]] <- MASS::glm.nb(fm,
     data = nells_df)
}

#clean output with tidy r
model_results_df_list <- model_results %>% 
  purrr::map(.x =., 
             .f = ~ broom::tidy(.x))

#add var_names to model_results
for(i in 1:length(model_results_df_list)){
  model_results_df_list[[i]] <- model_results_df_list[[i]] %>% 
    mutate(dep_var = variable_names_model[i])
}

#combine model dfs.
model_results_df <- model_results_df_list %>%
  bind_rows() 

#set correct variable names
model_results_df <- model_results_df %>%
  mutate(
    term = case_when(
      str_detect(term, "2nd gen Moroccan") ~ "2nd gen Moroccan-Dutch",
      str_detect(term, "2nd gen Turkish") ~ "2nd gen Turkish-Dutch",
      str_detect(term, "1st gen Moroccan") ~ "1st gen Moroccan-Dutch",
      str_detect(term, "1st gen Turkish") ~ "1st gen Turkish-Dutch",
      term == "(Intercept)" ~ "Intercept"
    )
  )

#Set correct names
correct_names <- model_results_df %>% 
  pull(dep_var) %>% 
  str_replace(., pattern = "knows_", replacement = "") %>% 
  str_to_title()

#drop old names and add the correct names
model_results_df <- model_results_df %>% 
  select(-dep_var) %>% 
  mutate(dep_var = correct_names)

```

## Predicted counts plot for names and ethnicity

```{r pred count names }
pred_nb_f <- function(nb_model, names){#nb_model = model_results[[1]], names = variable_names_model[[1]]
pred <- predict(object = nb_model,
                type = "response",
                se.fit = T
        )

plot_df <- nells_df %>% 
  select(id, migration_background_fac) %>% 
  bind_cols(pred) %>% 
  mutate(dep_var = names)

return(plot_df)
}

model_pred_list <- map2(.x = model_results,
     .y = variable_names_model,
     .f = ~pred_nb_f(nb_model = .x,
                     names = .y))

model_pred_df <- model_pred_list %>% 
  bind_rows()


#Set correct names
correct_names <- model_pred_df %>% 
  pull(dep_var) %>% 
  str_replace(., pattern = "knows_", replacement = "") %>% 
  str_to_title()


#drop old names and add the correct names
model_pred_df <- model_pred_df %>% 
  select(-dep_var) %>% 
  mutate(dep_var = correct_names)
```


```{r ethnic names plot pred}
#set custom pallet
pal <- c("#66c2a5",
         "#fc8d62",
         "#fc8d62",
         "#8da0cb",
         "#8da0cb")

#crete plot with minority names
ethnic_names_pred_plot <- model_pred_df %>% 
  filter(dep_var %in% c("Mohammed",
  "Fatima",
  "Ibrahim",
  "Esra")) %>% 
  ggplot(aes(x = dep_var,
             y = fit,
             shape = migration_background_fac)) +
  geom_linerange(aes(ymin = fit - (se.fit *1.96),
                      ymax =  fit + (se.fit *1.96)),
                  position = position_dodge(width = 1)) +
  geom_point(aes(colour = migration_background_fac,
                 fill = migration_background_fac),
            position = position_dodge(width = 1)) +
  facet_wrap(vars(dep_var),
             scales = "free",
             ncol = 2) +
  scale_colour_manual(
    values = pal,
    aesthetics = c("colour", "fill")
  ) +
  scale_shape_manual(values = c(21,22,24,22,24)) +
  theme(
    panel.background = element_rect(fill = "#FFFFFF",
                                    colour = "black"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_blank(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#FFFFFF"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
    labs(x = "", 
       y = "",
       colour = "",
       shape = "",
       fill = ""
       )
```

```{r non ethnic names plot pred}
#set custom pallet
pal <- c("#66c2a5",
         "#fc8d62",
         "#fc8d62",
         "#8da0cb",
         "#8da0cb")

#create plot for majority names
non_ethnic_names_pred_plot <- model_pred_df %>% 
  filter(!dep_var %in% c("Mohammed",
  "Fatima",
  "Ibrahim",
  "Esra")) %>% 
  ggplot(aes(x = dep_var,
             y = fit,
             shape = migration_background_fac)) +
  geom_linerange(aes(ymin = fit - (se.fit *1.96),
                      ymax =  fit + (se.fit *1.96)),
                  position = position_dodge(width = 1)) +
  geom_point(aes(colour = migration_background_fac,
                 fill = migration_background_fac),
            position = position_dodge(width = 1)) +
  facet_wrap(vars(dep_var),
             scales = "free_x",
             ncol = 2) +
  scale_colour_manual(
    values = pal,
    aesthetics = c("colour", "fill")
  ) +
  scale_shape_manual(values = c(21,22,24,22,24)) +
  theme(
    panel.background = element_rect(fill = "#FFFFFF",
                                    colour = "black"),
    plot.background = element_rect(fill = "#FFFFFF"),
    panel.grid = element_line(colour = "grey"),
    text = element_text(family = "sans", size = 12),
    axis.title.x = element_text(hjust = 0.9, face = "bold"),
    axis.text.x = element_blank(),
    axis.line = element_blank(),
    axis.title.y = element_text(hjust = 0.9, face = "bold"),
    axis.ticks = element_blank(),
    strip.background = element_rect(fill = "#FFFFFF"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "#FFFFFF"),
    legend.key = element_rect(fill = "#FFFFFF")
  ) +
    labs(x = "", 
       y = "",
       colour = "",
       shape = "",
       fill = ""
       )

```

```{r pred names panel plot, fig.width=6, fig.height=8}
#Combine plots in multipanel plot
names_pred_het_panel <- ethnic_names_pred_plot +
  non_ethnic_names_pred_plot + 
  plot_annotation(
    tag_levels = 'a',
    tag_prefix = '(',
    tag_suffix = ')'
  ) +
  plot_layout(ncol = 1,
              heights = c(1, 3),
              guides = 'collect',
  ) &
  theme(legend.position = c(-2,-5),
        legend.direction = 'vertical')

#preview plot
names_pred_het_panel

#save plot
ggsave(names_pred_het_panel,
        file = "data_analysis/plots/descriptive/names_het_pred_panel.jpg",
       width = 6,
       height = 8,
       dpi = 320)
```




