Initial set up

fpackage.check <- function(packages) { # (c) Jochem Tolsma
  lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  })
}
packages = c("tidyverse", 
             "viridis", "kableExtra","ggridges", "viridis", "ggdark", "ggplot2", "patchwork", "mice","qualvar",
             "oaxaca", "foreach"
)
fpackage.check(packages)

Goal

Create the descriptive statistics table and estimate the decomposition analysis.

Data preperation

NELLS and NSUM data import

Import the NSUM data and recreate the NSUM module.

#import NELLS file.
load(file = "data_analysis/data/data_processed/nells_data/2023-05-08_nells-nsum-prepped-data.rds")

Import the model estimates from the estimated NSUM models. We have chosen the model which uses Ibrahim for the ethnic names.

if (file.exists(
  "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds"
)) {
  load(file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
} else {
  list_files <-
    as.list(
      dir(
        "data_analysis/results/nsum_output/main/model/",
        full.names = T
      )
    )
  #create loop lists
  kds <- list()
  kdssd <- list()
  data <- list()
  list_df <- list()
  
  #loop to extract information
  for (i in 1:length(list_files)) {
    #i = 1
    print(paste0("Number ", i, " of ", length(list_files)))
    load(list_files[[i]])
    kds[[i]] <-
      rowMeans(degree$d.values, na.rm = TRUE) # calculate rowmean of netsize iterations: so the retained chains
    kdssd[[i]] <-
      matrixStats::rowSds(degree$d.values) # calculate sd of 4k estimates per row: sd for those values
    data[[i]] <- cbind(kds[[i]], kdssd[[i]]) # combine them
    list_df[[i]] <-
      cbind(as_tibble(data[[i]]), nells_nsum$id) # add NELLS id variable
    strings <-
      str_split(str_extract(list_files[[i]][1], pattern = "estimates.+"),
                pattern = "_")  # add holdout and tauk number
    list_df[[i]] <- list_df[[i]] %>%
      mutate(
        holdout = as.numeric(str_extract(strings[[1]][2], pattern = "[[:digit:]]{1,}")))
  }
    #combine results and save
    df_models_nsum_long <- list_df %>%
      bind_rows() %>%
      rename(mean = V1,
             sd = V2,
             id = 3)
    #save image
    save(df_models_nsum_long, file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
}

Prepare data for analysis

Final data preperation for analysis

#extract results of NSUM 
nsum_estimates <- df_models_nsum_long %>% 
  filter(holdout == 10) 

#add nsum results to NELLS data
nells_df <- nsum_estimates %>% 
  left_join(nells_nsum)

#data prep
nells_df <- nells_df %>% 
  mutate(has_child = if_else(nr_children == "0", 0, 1),
         religious = if_else(religious_denom == "None", 0, 1),
         education_num = as.numeric(education_completed_highest),
         education_recoded_numeric = case_when(
           education_num == 1 ~ 1,
           education_num %in% c(2,13) ~ 2,
           education_num %in% c(3,4,14) ~ 3,
           education_num %in% c(5,6) ~ 4,
           education_num %in% c(7,8) ~ 5,
           education_num %in% c(9,15) ~ 6,
           education_num %in% c(10,11,12) ~ 7
         )
  )

#create inwoner coethnic
nells_df <- nells_df %>% 
  mutate(inwoner_coethnic_pc4 = case_when(
    str_detect(migration_background_fac, "kish") ~ per_TKY_mig_a_pc4,
    str_detect(migration_background_fac, "occan") ~ per_MOR_mig_a_pc4,
    migration_background_fac == "Dutch-Majority" ~ per_NL_mig_a_pc4,
    migration_background_fac == "Other" ~ per_MOR_mig_a_pc4 + per_TKY_mig_a_pc4
  ),
  inwoner_coethnic_pc4 = inwoner_coethnic_pc4*100,
  inwoner_coethnic_gem = case_when(
    str_detect(migration_background_fac, "kish") ~ per_TKY_mig_a_gem,
    str_detect(migration_background_fac, "occan") ~ per_MOR_mig_a_gem,
    migration_background_fac == "Dutch-Majority" ~ per_NL_mig_a_gem,
    migration_background_fac == "Other" ~ per_MOR_mig_a_gem + per_TKY_mig_a_gem
  ),
  inwoner_coethnic_gem = inwoner_coethnic_gem*100,
  per_NW_mig_a_gem = per_NW_mig_a_gem*100,
  per_NW_mig_a_pc4 = per_NW_mig_a_pc4*100
  )

#thermometer score outgroup
nells_df <- nells_df %>% 
   mutate(therm_outgroup_mean = case_when(
    str_detect(migration_background_fac, "kish") ~ (therm_dutch_maj + therm_mor)/2,
    str_detect(migration_background_fac, "occan") ~ (therm_dutch_maj + therm_tur)/2,
    migration_background_fac == "Dutch-Majority" ~ (therm_mor + therm_tur)/2,
    migration_background_fac == "Other" ~ (therm_dutch_maj + therm_tur + therm_mor)/3
  ),
  therm_outgroup_diff = case_when(
    str_detect(migration_background_fac, "kish") ~ ((therm_dutch_maj + therm_mor)/2) - therm_tur,
    str_detect(migration_background_fac, "occan") ~ ((therm_dutch_maj + therm_tur)/2) - therm_mor,
    migration_background_fac == "Dutch-Majority" ~ ((therm_mor + therm_tur)/2) - therm_dutch_maj,
    migration_background_fac == "Other" ~ (therm_dutch_maj + therm_tur + therm_mor)/3
  )) 

Prepare ethnic homogeneity measure.

nells_df <- nells_df %>%
  mutate(
    sum_dutch = knows_daan_boundary + knows_kevin_boundary + knows_emma_boundary + knows_linda_boundary + knows_albert_boundary + knows_edwin_boundary + knows_willemina_boundary + knows_ingrid_boundary,
    sum_turkish = knows_ibrahim_boundary + knows_esra_boundary,
    sum_moroccan = knows_mohammed_boundary + knows_fatima_boundary,
    sum_cat = knows_prison + knows_mbo + knows_secundary,
    +knows_hbo + knows_university + knows_unemployed + knows_secondhome,
    sum_total = sum_dutch + sum_turkish + sum_moroccan,
    per_dutch = (sum_dutch / sum_total) * 100,
    per_turkish = (sum_turkish / sum_total) * 100,
    per_moroccan = (sum_moroccan / sum_total) * 100
  )

#assign correct percentage co-ethnic to each group
nells_df <- nells_df %>% 
  mutate(per_ingroup = case_when(
    str_detect(migration_background_fac, "kish") ~ per_turkish,
    str_detect(migration_background_fac, "occan") ~ per_moroccan,
    migration_background_fac == "Dutch-Majority" ~ per_dutch,
    migration_background_fac == "Other" ~ per_dutch
  ))

#weighted by name frequency.
nells_df <- nells_df %>%
  mutate(
    sum_dutch_w = knows_daan_boundary/22704 + 
      knows_kevin_boundary/23167 +
      knows_emma_boundary/18730 + 
      knows_linda_boundary/29955 + 
      knows_albert_boundary/31767 + 
      knows_edwin_boundary/21866 + 
      knows_willemina_boundary/17133 + 
      knows_ingrid_boundary/31323,
    sum_turkish_w = knows_ibrahim_boundary/2099 + 
      knows_esra_boundary/1878,
    sum_moroccan_w = knows_mohammed_boundary/13448 + 
      knows_fatima_boundary/2808,
    sum_total_w = sum_dutch_w + sum_turkish_w + sum_moroccan_w,
    per_dutch_w = (sum_dutch_w / sum_total_w) * 100,
    per_turkish_w = (sum_turkish_w / sum_total_w) * 100,
    per_moroccan_w = (sum_moroccan_w / sum_total_w) * 100
  )

#assign correct percentage co-ethnic to each group
nells_df <- nells_df %>% 
  mutate(per_ingroup_w = case_when(
    str_detect(migration_background_fac, "kish") ~ per_turkish_w,
    str_detect(migration_background_fac, "occan") ~ per_moroccan_w,
    migration_background_fac == "Dutch-Majority" ~ per_dutch_w,
    migration_background_fac == "Other" ~ per_dutch_w
  ))

Create homogeneity measure based on naive NSUM method. First create the function.

#get the frequencies
frequencies <- read_csv("data_analysis/2022-08-26_namefrequencies.csv")

name_df <- nells_nsum %>% 
  select(id, contains("boundary")) %>% 
  select(-contains("prison"))

#nsum function
function_nsum <- function(dat, #data field of x with ID
                          popsize, #population size
                          freq) {#frequency vector
  #freq extract
  freq <- freq %>% 
    pull(number)
  
  #create df with ID
  dat_id <- dat %>%
    select(id)
  
  #create df without ID
  dat <- dat %>% select(-id)
  
  #extract names
  names_dat <- names(dat) %>%
    str_match(., "_(.*)_")
  
  #select correct list
  names_dat <- names_dat[, 2]
  
  
  #loop to apply naive function for all elements in the DF
  nsum_df <- foreach(a = 1:ncol(dat),
                     .combine = cbind) %do% {
                       dat %>%
                         select(.data[[names(dat)[a]]]) %>%
                         mutate(!!paste0(names_dat[a], "_predicted") := (.data[[names(dat)[a]]] /
                                                                           freq[a]) * popsize)
                     }
  #create NSUM predicted estimate (averaged over all individual scale ups)
  nsum_df <- nsum_df %>%
    bind_cols(dat_id) %>% #add ID
    rowwise() %>% #rowwise means
    mutate(ave_predicted = mean(x = c_across(cols = contains("predicted")),
                                na.rm = T)) %>%
    ungroup()
  
  return(nsum_df)
}

Apply function to NELLS data.

#Create naive scale up estimates with all names. 
name_all_df <- function_nsum(dat = name_df,
              popsize = 17407585,
              freq = frequencies) 

#rename all names and delete individual predictions
name_all_df <- name_all_df %>% 
  rename(all_pred = ave_predicted) %>% 
  rowwise() %>% 
  mutate(maj_pred = mean(x = c_across(cols = c(
    daan_predicted,
    kevin_predicted,
    edwin_predicted,
    albert_predicted,
    linda_predicted,
    emma_predicted,
    ingrid_predicted,
    willemina_predicted
  )),
  na.rm = T),
  min_pred = mean(x = c_across(cols = c(
    esra_predicted,
    mohammed_predicted,
    ibrahim_predicted,
    fatima_predicted,
  )),
  na.rm = T),
  ) %>% 
  ungroup()

#combine information
nells_df <- nells_df %>% 
  left_join(name_all_df, by = "id")

Delete outliers and filter out “Other” mig background

nells_df <- nells_df %>% 
    mutate(mean_size = mean(mean, na.rm = T),
         sd_size = sd(mean, na.rm = T),
         z = (mean - mean_size)/sd_size) %>% 
  filter(z < 3) %>% 
  filter(migration_background_fac != "Other")
#create standardized variables
nells_df <- nells_df %>% 
  mutate(sd_maj_pred = sd(maj_pred, na.rm = T),
         sd_min_pred = sd(min_pred, na.rm = T),
         z_maj_pred = (maj_pred - mean(maj_pred, na.rm = T))/sd_maj_pred,
         z_min_pred = (min_pred - mean(min_pred, na.rm = T))/sd_min_pred,
         z_value_ingroup = ifelse(migration_background_fac == "Dutch-Majority", z_maj_pred, z_min_pred),
         z_value_outgroup = ifelse(migration_background_fac == "Dutch-Majority", z_min_pred, z_maj_pred),
         z_distance_groups = z_value_outgroup - z_value_ingroup,
         z_distance_min_maj = z_min_pred - z_maj_pred) 

Missing data imputation

nells_df <- nells_df %>% 
  mutate(therm_diff_missing = ifelse(is.na(therm_outgroup_diff), 1, 0),
         therm_mean_missing = ifelse(is.na(therm_outgroup_mean), 1, 0),
         pc4_missing = ifelse(is.na(Inwoner_pc4), 1, 0))

imp <- nells_df %>% 
  select(migration_background_fac,
      age,
      gender,
      in_education,
      education_recoded_numeric,
      partner,
      partner_extended,
      has_child,
      paid_work,
      religious,
      religious_denom,
      rel_attendance,
      inwoner_coethnic_pc4,
      inwoner_coethnic_gem,
      therm_outgroup_mean,
      therm_outgroup_diff,
      therm_dutch_maj,
      therm_mor,
      therm_tur,
      starts_with("per"),
      woz_pc4,
      mean_woz_gem,
      Inwoner_gem,
      Inwoner_pc4,
      per_NW_mig_a_pc4) %>%  
  select(-per_ingroup, -per_ingroup_w) %>% 
  mice(method = "mean") %>% 
  complete()

imp <- nells_df %>%
  select(
    id,
    mean,
    per_ingroup,
    per_ingroup_w,
    therm_diff_missing,
    therm_mean_missing,
    pc4_missing,
    z_maj_pred,
    z_min_pred,
    z_value_ingroup,
    z_value_outgroup,
    z_distance_groups,
    z_distance_min_maj
  ) %>%
  bind_cols(imp)

Mean centering of continouos variables

imp <- imp %>% 
  mutate(education_recoded_numeric = education_recoded_numeric - mean(education_recoded_numeric, na.rm = T),
         age = age - mean(age, na.rm = T),
         woz_pc4 = woz_pc4 - mean(woz_pc4, na.rm = T),
         mean_woz_gem = mean_woz_gem - mean(mean_woz_gem, na.rm = T),
         Inwoner_gem = Inwoner_gem - mean(Inwoner_gem, na.rm = T),
         Inwoner_pc4 = Inwoner_pc4 - mean(Inwoner_pc4, na.rm = T),
         therm_outgroup_mean = therm_outgroup_mean - mean(therm_outgroup_mean, na.rm = T),
         therm_outgroup_diff = therm_outgroup_diff - mean(therm_outgroup_diff, na.rm = T),
         inwoner_coethnic_pc4 = inwoner_coethnic_pc4 - mean(inwoner_coethnic_pc4, na.rm = T),
         inwoner_coethnic_gem = inwoner_coethnic_gem - mean(inwoner_coethnic_gem, na.rm = T),
        per_NW_mig_a_pc4 = per_NW_mig_a_pc4 - mean(per_NW_mig_a_pc4, na.rm = T)
         )

Set correct labels of categorical variables.

#relabel the factor variables. 
imp <- imp %>%
  mutate(
    migration_background_fac = fct_relevel(migration_background_fac,
                                                "Dutch-Majority",
                                                "1st gen Turkish",
                                                "2nd gen Turkish",
                                                "1st gen Moroccan",
                                                "2nd gen Moroccan"),
         migration_background_fac = factor(as.numeric(migration_background_fac),
         levels = 1:5,
         labels = c("Dutch Majority",
                                                "1st gen Turkish-Dutch",
                                                "2nd gen Turkish-Dutch",
                                                "1st gen Moroccan-Dutch",
                                                "2nd gen Moroccan-Dutch")),
    migration_background_simple_fac = case_when(
      migration_background_fac == "1st gen Turkish-Dutch" ~ 2,
      migration_background_fac == "2nd gen Turkish-Dutch" ~ 2,
      migration_background_fac == "1st gen Moroccan-Dutch" ~ 3,
      migration_background_fac == "2nd gen Moroccan-Dutch" ~ 3,
      migration_background_fac == "Dutch Majority" ~ 1
    ),
    migration_background_simple_fac = factor(
      migration_background_simple_fac,
      levels = 1:3,
      labels = c("Dutch Majority", "Turkish-Dutch", "Moroccan-Dutch")
    )
  )

#not all subgroups have complete cases for all the factor variables. Hence, the need to combine some categories. 
imp <- imp %>%
  mutate(
    rel_attendance_num = if_else(as.numeric(rel_attendance) > 3, 4, as.numeric(rel_attendance)),
    rel_fac_2 = factor(
      rel_attendance_num,
      levels = 1:4,
      labels = c("Never", "1-2 Year", "3-11 Year", "Monthly or more")
    ),
    partner = if_else(partner_extended == "Does not have a partner", 0, 1)
  )  

Descriptive statistics table

Desstats table function and table creation

prepare_desstats <- function(x, full.sample) {
  if (full.sample == T) {
    desstats <- x %>%
      mutate(
        ln_size = log(mean),
        Dutch = if_else(migration_background_fac == "Dutch Majority", 1, 0),
        `2nd_gen_Moroccan` = if_else(migration_background_fac == "2nd gen Moroccan-Dutch", 1, 0),
        `2nd_gen_Turkish` = if_else(migration_background_fac == "2nd gen Turkish-Dutch", 1, 0),
        `1st_gen_Turkish` = if_else(migration_background_fac == "1st gen Turkish-Dutch", 1, 0),
        `1st_gen_Moroccan` = if_else(migration_background_fac == "1st gen Moroccan-Dutch", 1, 0),
        age = as.numeric(age),
        female = if_else(gender == "Female", 1, 0),
        gender_other = if_else(gender == "Other", 1, 0),
        no_partner = if_else(partner_extended != "Does not have a partner", 1, 0),
        partner_seperate = if_else(partner_extended == "Has a partner, lives seperately", 1, 0),
        partner_together = if_else(partner_extended == "Lives together, unmarried", 1, 0),
        partner_married = if_else(partner_extended == "Married", 1, 0),
        rel_never = if_else(rel_attendance == "Never", 1, 0),
        rel_12_year = if_else(rel_attendance == "1-2 a year", 1, 0),
        rel_311_year = if_else(rel_attendance == "3-11 a year", 1, 0),
        rel_month = if_else(rel_attendance == "Once a month", 1, 0),
        rel_23_month = if_else(rel_attendance == "2-3 a month", 1, 0),
        rel_weekly = if_else(rel_attendance == "Every week", 1, 0),
        rel_mul_weekly = if_else(rel_attendance == "Multiple times a week", 1, 0),
        rel_month_more = if_else(rel_fac_2 == "Monthly or more", 1, 0)
      ) %>%
      rename(size = mean) %>%
      select(
        size,
        ln_size,
        per_ingroup_w,
        #per_ingroup,
        #ada_value,
        Dutch,
        `1st_gen_Turkish`,
        `2nd_gen_Turkish`,
        `1st_gen_Moroccan`,
        `2nd_gen_Moroccan`,
        age,
        female,
        education_recoded_numeric,
        no_partner,
        paid_work,
        in_education,
        rel_never,
        rel_12_year,
        rel_311_year,
        rel_month_more,
        therm_outgroup_diff,
        per_NW_mig_a_pc4,
        woz_pc4,
        Inwoner_pc4
      ) %>%
      psych::describe() %>% 
      round(., digits = 3)
  }
  else {
    desstats <- x %>%
      mutate(
        ln_size = log(mean),
        age = as.numeric(age),
        female = if_else(gender == "Female", 1, 0),
        gender_other = if_else(gender == "Other", 1, 0),
        no_partner = if_else(partner_extended != "Does not have a partner", 1, 0),
        partner_seperate = if_else(partner_extended == "Has a partner, lives seperately", 1, 0),
        partner_together = if_else(partner_extended == "Lives together, unmarried", 1, 0),
        partner_married = if_else(partner_extended == "Married", 1, 0),
        rel_never = if_else(rel_attendance == "Never", 1, 0),
        rel_12_year = if_else(rel_attendance == "1-2 a year", 1, 0),
        rel_311_year = if_else(rel_attendance == "3-11 a year", 1, 0),
        rel_month = if_else(rel_attendance == "Once a month", 1, 0),
        rel_23_month = if_else(rel_attendance == "2-3 a month", 1, 0),
        rel_weekly = if_else(rel_attendance == "Every week", 1, 0),
        rel_mul_weekly = if_else(rel_attendance == "Multiple times a week", 1, 0),
        rel_month_more = if_else(rel_fac_2 == "Monthly or more", 1, 0)
      ) %>%
      rename(size = mean) %>%
      select(
        size,
        ln_size,
        per_ingroup_w,
        #ada_value,
        age,
        female,
        education_recoded_numeric,
        no_partner,
        paid_work,
        in_education,
        rel_never,
        rel_12_year,
        rel_311_year,
        rel_month_more,
        therm_outgroup_diff,
        per_NW_mig_a_pc4,
        woz_pc4,
        Inwoner_pc4
      ) %>%
      psych::describe() %>% 
      round(., digits = 3)
  }
  
}

var_names_sample <- c(
  "Network size",
  "Network size (log)",
  "Ethnic homogeneity",
  "Age",
  "Gender [Female]",
  "Educational attainment",
  "Partner [Has a partner]",
  "Paid work",
  "Currently in education",
  "Religious attendance[Never]",
  "Religious attendance[1-2 year]",
  "Religious attendance[3-11 a year]",
  "Religious attendance[Once a month or more]",
  "Outgroup attitude",
  "Neighbourhood: % Non-western migrant",
  "Neighbourhood: mean property valuation",
  "Neighbourhood: # inhabitants"
)

var_names_full<- c(
  "Network size",
  "Network size (log)",
  "Ethnic homogeneity",
  "Group[Dutch majority]",
  "Group[1st gen Turkish-Dutch]",
  "Group[2nd gen Turkish-Dutch]",
  "Group[1st gen Moroccan-Dutch]",
  "Group[2nd gen Moroccan-Dutch]",
  "Age",
  "Gender [Female]",
  "Educational attainment",
  "Partner [Has a partner]",
  "Paid work",
  "Currently in education",
  "Religious attendance[Never]",
  "Religious attendance[1-2 year]",
  "Religious attendance[3-11 a year]",
  "Religious attendance[Once a month or more]",
  "Outgroup attitude",
  "Neighbourhood: % Non-western migrant",
  "Neighbourhood: mean property valuation",
  "Neighbourhood: # inhabitants"
)

desstats <- imp %>% 
  prepare_desstats(x = ., full.sample = T)

desstats_sample <- imp %>% 
  group_split(migration_background_fac) %>% 
  map(.x = ., .f = ~ prepare_desstats(x = .x, full.sample = F)) %>% 
  map(.x = ., .f = ~ dplyr::select(.data = .x, mean, sd))


#custom function to set rownames in pipe.
set_rownames <- function(df,names) {
row.names(df) <- names
return(df)
}

Descriptive statistics table full sample

desstats %>% 
  bind_cols(.name_repair = "minimal") %>% 
  select(mean, sd, median, min, max) %>% 
set_rownames(df = ., names = var_names_full) %>% 
    kbl(row.names = T) %>% 
  kable_classic()
mean sd median min max
Network size 943.463 639.821 778.346 48.220 3592.104
Network size (log) 6.633 0.674 6.657 3.876 8.186
Ethnic homogeneity 58.828 31.522 57.368 0.000 100.000
Group[Dutch majority] 0.599 0.490 1.000 0.000 1.000
Group[1st gen Turkish-Dutch] 0.128 0.335 0.000 0.000 1.000
Group[2nd gen Turkish-Dutch] 0.122 0.327 0.000 0.000 1.000
Group[1st gen Moroccan-Dutch] 0.066 0.249 0.000 0.000 1.000
Group[2nd gen Moroccan-Dutch] 0.084 0.277 0.000 0.000 1.000
Age 0.000 8.804 0.290 -15.710 23.290
Gender [Female] 0.540 0.499 1.000 0.000 1.000
Educational attainment 0.000 1.516 -0.189 -4.189 1.811
Partner [Has a partner] 0.655 0.476 1.000 0.000 1.000
Paid work 0.820 0.385 1.000 0.000 1.000
Currently in education 0.259 0.438 0.000 0.000 1.000
Religious attendance[Never] 0.602 0.490 1.000 0.000 1.000
Religious attendance[1-2 year] 0.170 0.376 0.000 0.000 1.000
Religious attendance[3-11 a year] 0.073 0.260 0.000 0.000 1.000
Religious attendance[Once a month or more] 0.155 0.362 0.000 0.000 1.000
Outgroup attitude 0.000 20.804 1.808 -88.192 92.808
Neighbourhood: % Non-western migrant 0.000 16.696 -5.692 -17.748 65.773
Neighbourhood: mean property valuation 0.000 97.381 -17.765 -157.765 910.235
Neighbourhood: # inhabitants 0.000 4788.646 -282.805 -9112.805 19347.195

Descriptive statistics by group

desstats_sample %>% 
  bind_cols(.name_repair = "minimal") %>%
  set_rownames(df = ., names = var_names_sample) %>% 
  kbl(row.names = T) %>% 
  kable_classic() %>%
  add_header_above(c(" " = 1,"Dutch" = 2, "1st gen Turkish" = 2, "2nd gen Turkish" = 2, "1st gen Moroccan" = 2, "2nd gen Moroccan" = 2)) %>% 
   footnote(general = "Size of the different groups ",
           number = c("Dutch: 658", "2nd gen Moroccan: 92",
                      "2nd gen Turkish: 134", "1st gen Turkish: 141",
                      "1st gen Moroccan: 73")) 
Dutch
1st gen Turkish
2nd gen Turkish
1st gen Moroccan
2nd gen Moroccan
mean sd mean sd mean sd mean sd mean sd
Network size 989.318 618.266 731.901 639.703 945.282 667.750 873.671 689.242 992.469 653.562
Network size (log) 6.711 0.623 6.282 0.795 6.639 0.651 6.491 0.767 6.716 0.603
Ethnic homogeneity 64.341 32.601 47.669 34.283 55.233 25.408 47.997 24.524 50.325 21.868
Age -0.572 8.742 4.907 6.931 -1.956 8.305 5.834 7.104 -5.210 8.238
Gender [Female] 0.538 0.499 0.489 0.502 0.619 0.487 0.466 0.502 0.576 0.497
Educational attainment 0.115 1.337 0.392 1.805 -0.234 1.440 -0.491 1.831 -0.689 1.732
Partner [Has a partner] 0.679 0.467 0.823 0.383 0.485 0.502 0.781 0.417 0.370 0.485
Paid work 0.877 0.329 0.709 0.456 0.716 0.452 0.726 0.449 0.804 0.399
Currently in education 0.257 0.437 0.149 0.357 0.313 0.466 0.123 0.331 0.467 0.502
Religious attendance[Never] 0.711 0.454 0.560 0.498 0.410 0.494 0.356 0.482 0.359 0.482
Religious attendance[1-2 year] 0.160 0.366 0.121 0.327 0.224 0.418 0.219 0.417 0.207 0.407
Religious attendance[3-11 a year] 0.038 0.191 0.113 0.318 0.119 0.325 0.096 0.296 0.174 0.381
Religious attendance[Once a month or more] 0.091 0.288 0.206 0.406 0.246 0.432 0.329 0.473 0.261 0.442
Outgroup attitude -1.590 22.075 4.993 22.287 -0.315 18.033 3.704 14.412 1.237 15.028
Neighbourhood: % Non-western migrant -6.437 10.871 8.574 17.408 9.419 19.864 10.843 21.265 10.577 18.839
Neighbourhood: mean property valuation 8.592 99.445 1.546 112.147 -22.556 82.258 -15.285 74.288 -18.841 86.491
Neighbourhood: # inhabitants -1022.266 4462.235 1484.925 4821.204 1563.426 4917.847 1619.798 4828.722 1473.173 4924.390
Note:
Size of the different groups
1 Dutch: 658
2 2nd gen Moroccan: 92
3 2nd gen Turkish: 134
4 1st gen Turkish: 141
5 1st gen Moroccan: 73

Inferential analysis

We now turn to testing group differences. Different methods are used. First we can check group differences with an OLS and dummies. Second, we can use an anova to check the differences. Third, we will use a decomposition analysis to explain group differences.

OLS model with dummy

size_group_model <- imp %>%  
  filter(gender != "Other") %>% 
  mutate(
  mean = as.integer(mean)
) %>%
  lm(
    log(mean) ~ migration_background_fac,
    data = .
  )
hom_group_model <- imp %>%
  filter(gender != "Other") %>%
  lm(
    per_ingroup_w ~ migration_background_fac,
    data = .
  )

#show table of OLS results
sjPlot::tab_model(size_group_model,
                  hom_group_model,
          pred.labels = c(
            "Constant",
            "Group[1st gen Turkish-Dutch]",
            "Group[2nd gen Turkish-Dutch]",
            "Group[1st gen Moroccan-Dutch]",
            "Group[2nd gen Moroccan-Dutch]"
            ),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat"),
          p.style = "scientific"
          #file = "reg.doc"
          )
  Dependent variable Dependent variable
Predictors Estimates Statistic Estimates Statistic
Constant 6.711 260.834 64.341 53.604
Group[1st gen Turkish-Dutch] -0.424 -6.908 -16.918 -5.904
Group[2nd gen Turkish-Dutch] -0.073 -1.162 -9.108 -3.121
Group[1st gen Moroccan-Dutch] -0.221 -2.713 -16.344 -4.303
Group[2nd gen Moroccan-Dutch] -0.000 -0.005 -13.951 -4.051
Observations 1096 1096
R2 / R2 adjusted 0.046 / 0.042 0.050 / 0.047

Anova

Size analysis. Complete groups.

# Compute the analysis of variance
res_aov_size <- aov(log(mean) ~ migration_background_simple_fac, data = imp)

# Summary of the analysis
results_size <- summary(res_aov_size)

#look at group differences
anova_diff_df_size  <- as_tibble(TukeyHSD(res_aov_size)$migration_background_simple_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_size)$migration_background_simple_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference,diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "ln(Size)")

# Compute the analysis of variance
res_aov_hom <- aov(per_ingroup_w ~ migration_background_simple_fac, data = imp)

# Summary of the analysis
results_hom <- summary(res_aov_hom)

#look at group differences
anova_diff_df_hom  <- as_tibble(TukeyHSD(res_aov_hom)$migration_background_simple_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_hom)$migration_background_simple_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference, diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "Homogeneity")

#add sig. indicator
anova_diff_df <- anova_diff_df_size %>% 
  bind_rows(anova_diff_df_hom) %>% 
  mutate(sig = ifelse(p < 0.05, 1, 0),
         sig_f = ifelse(sig == 1, "*", ""))


#create anova plot
anova_diff_simple_plot <- anova_diff_df %>% 
  mutate(diff_fac = fct_rev(factor(rep(1:3,2),
                           levels = 1:3,
                           labels = c("TKY - NED",
                                      "MOR - NED",
                                      "MOR - TKY")
                                      ))

         ) %>% 
  ggplot(aes(x = value, 
             y = diff_fac)) +
  geom_col(aes(fill = variable,
           colour = variable),
           alpha = 0.6) +
  geom_text(aes(label = sig_f),
            size = 6,
            hjust = 1) +
  facet_wrap(vars(variable),
             scales = "free_x") +
    theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "none",
        legend.title = element_blank(),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black")) +
  labs(y = "",
       x = "",
       caption = "")

Migrant generations split.

# Compute the analysis of variance
res_aov_size <- aov(log(mean) ~ migration_background_fac, data = imp)

# Summary of the analysis
results_size <- summary(res_aov_size)

#look at group differences
anova_diff_df_size  <- as_tibble(TukeyHSD(res_aov_size)$migration_background_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_size)$migration_background_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference,diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "ln(Size)")

# Compute the analysis of variance
res_aov_hom <- aov(per_ingroup_w ~ migration_background_fac, data = imp)

# Summary of the analysis
results_hom <- summary(res_aov_hom)

#look at group differences
anova_diff_df_hom  <- as_tibble(TukeyHSD(res_aov_hom)$migration_background_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_hom)$migration_background_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference, diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "Homogeneity")

#add sig. indicator
anova_diff_df <- anova_diff_df_size %>% 
  bind_rows(anova_diff_df_hom) %>% 
  mutate(sig = ifelse(p < 0.05, 1, 0),
         sig_f = ifelse(sig == 1, "*", ""))

#create plot
anova_diff_extended_plot <- anova_diff_df %>% 
  mutate(diff_fac = factor(rep(1:10,2),
                           levels = 1:10,
                           labels = c("1st gen TKY - NED",
                                      "2nd gen TKY - NED",
                                      "1st gen MOR - NED",
                                      "2nd gen MOR - NED",
                                      "2nd gen TKY - 1st gen TKY",
                                      "1st gen MOR - 1st gen TKY",
                                      "2nd gen MOR - 1st gen TKY",
                                      "1st gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 1st gen MOR")

                                      ),
         diff_fac = fct_rev(fct_relevel(diff_fac,
                                "1st gen TKY - NED",
                                      "2nd gen TKY - NED",
                                      "1st gen MOR - NED",
                                      "2nd gen MOR - NED",
                                      "2nd gen TKY - 1st gen TKY",
                                      "2nd gen MOR - 1st gen MOR",
                                      "1st gen MOR - 1st gen TKY",
                                      "2nd gen MOR - 1st gen TKY",
                                      "1st gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 2nd gen TKY")

         )) %>% 
  filter(!diff_fac %in% c("1st gen MOR - 1st gen TKY",
                                      "2nd gen MOR - 1st gen TKY",
                                      "1st gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 2nd gen TKY")) %>% 
  ggplot(aes(x = value, 
             y = diff_fac)) +
  geom_col(aes(fill = variable,
           colour = variable),
           alpha = 0.6) +
  geom_text(aes(label = sig_f),
            size = 6,
            hjust = 1) +
  facet_wrap(vars(variable),
             scales = "free_x") +
    theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "none",
        legend.title = element_blank(),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black")) +
  labs(y = "",
       x = "Coefficient")
#create combined plot
anova_plot <- anova_diff_simple_plot + 
anova_diff_extended_plot +
  plot_annotation(tag_levels ='a',
                  tag_prefix = '(',
                  tag_suffix = ')',
                  caption = "* p < 0.05") +
  plot_layout(ncol = 1,
              heights = c(1,3))

#show final plot
anova_plot

#save final anova plot
ggsave(anova_plot,
       file = "data_analysis/plots/results/anova_plot.jpg",
       width = 7,
       height = 5,
       dpi = 320)

Pooled regression model

Recreate the pooled regression model

#estimate size model
size_pooled_model <- imp %>%  
  filter(gender != "Other") %>% 
  mutate(
  mean = as.integer(mean)
) %>%
  lm(
    log(mean) ~ migration_background_fac +
        age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4 +
        therm_outgroup_diff +
      therm_diff_missing,
    data = .
  )

#estiamte homogeneity model
hom_pooled_model <- imp %>%
  filter(gender != "Other") %>%
  lm(
    per_ingroup_w ~ migration_background_fac +
      age +
      gender +
      education_recoded_numeric +
      partner +
      in_education +
      paid_work +
      rel_fac_2 +
      per_NW_mig_a_pc4 +
      woz_pc4 +
      Inwoner_pc4 +
      therm_outgroup_diff +
      therm_diff_missing,
    data = .
  )

#create table
sjPlot::tab_model(size_pooled_model,
                  hom_pooled_model,
          pred.labels = c(
            "Constant",
            "Group[1st gen Turkish-Dutch]",
            "Group[2nd gen Turkish-Dutch]",
            "Group[1st gen Moroccan-Dutch]",
            "Group[2nd gen Moroccan-Dutch]",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Neighborhood: % Non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants",
            "Outgroup attitude",
            "Outgroup attitude missing"
            ),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat"),
          p.style = "stars"
          #file = "reg.doc"
          )
  Dependent variable Dependent variable
Predictors Estimates Statistic Estimates Statistic
Constant 6.514 *** 94.603 62.240 *** 19.189
Group[1st gen Turkish-Dutch] -0.375 *** -5.590 -15.014 *** -4.745
Group[2nd gen Turkish-Dutch] -0.091 -1.348 -8.994 ** -2.829
Group[1st gen Moroccan-Dutch] -0.225 * -2.561 -15.700 *** -3.799
Group[2nd gen Moroccan-Dutch] -0.075 -0.943 -13.496 *** -3.613
Age 0.000 0.100 0.034 0.231
Gender [Fenale] -0.048 -1.200 0.085 0.045
Educational attainment -0.018 -1.226 0.102 0.145
Partner[Has a partner] -0.010 -0.200 -1.170 -0.506
Paid work 0.168 ** 2.871 0.180 0.065
Currently in education 0.155 ** 2.879 0.661 0.260
Religious attendance[1-2 year] 0.152 ** 2.763 4.934 1.908
Religious attendance[3-11 a year] 0.129 1.623 0.191 0.051
Religious attendance[Once a month or more] 0.181 ** 3.027 5.670 * 2.012
Neighborhood: % Non-western migrant -0.000 -0.073 -0.085 -1.188
Neighborhood: mean property valuation -0.000 -0.513 0.000 0.025
Neighborhood: # inhabitants -0.000 -1.460 0.000 0.257
Outgroup attitude -0.001 -1.212 -0.188 *** -4.167
Outgroup attitude missing -0.015 -0.282 0.983 0.384
Observations 1096 1096
R2 / R2 adjusted 0.087 / 0.072 0.074 / 0.058
  • p<0.05   ** p<0.01   *** p<0.001

Decomposition analysis

Decomposition dataprep

Create separate df frames to use in the decomposition analyses.

#Dutch - TKY 1st gen
data_dutch_turkish_1st <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch - TKY 2nd gen
data_dutch_turkish_2nd <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "2nd gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch - moroccan 2nd gen
data_dutch_moroccan_2nd <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "2nd gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch - moroccan 1st gen
data_dutch_moroccan_1st <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Turkish generation difference
data_turkish_diff <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("1st gen Turkish-Dutch", "2nd gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "2nd gen Turkish-Dutch", 1, 0)
         ) 

#Moroccan generation difference
data_moroccan_diff <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("1st gen Moroccan-Dutch", "2nd gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "2nd gen Moroccan-Dutch", 1, 0)
         )

#Dutch turkish minority
data_dutch_turkish <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Turkish-Dutch","2nd gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch moroccan minority
data_dutch_moroccan <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Moroccan-Dutch","2nd gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#combine elements in list
data_list <- list(data_dutch_turkish_1st,
                  data_dutch_turkish_2nd,
                  data_dutch_moroccan_2nd,
                  data_dutch_moroccan_1st,
                  data_turkish_diff,
                  data_moroccan_diff,
                  data_dutch_turkish,
                  data_dutch_moroccan)

Size ln

Decomposition estimation

#set seed
set.seed(2702)

#create the analysis function
oaxaca_spec_f <- function(x) {
  results <- x  %>%
  oaxaca(
    formula = log(mean) ~ age +
      gender +
      education_recoded_numeric +
      partner +
      in_education +
      paid_work +
      rel_fac_2 +
      per_NW_mig_a_pc4 +
      woz_pc4 +
      Inwoner_pc4 +
      therm_outgroup_diff | z,
  reg.fun = lm,
  data = .)
}

#create model list
oaxaca_model_list  <- map(.x = data_list, 
    .f = ~ oaxaca_spec_f(x = .x))


#create empty lists to store information in
predicted_list <- list()
two_fold_results_plot_list <- list()
two_fold_results_table_list <- list()
two_fold_variables_list <- list()


#select correct information from the model object and store in list
for(i in 1:length(oaxaca_model_list)) { #i = 8
  model_object <- oaxaca_model_list[[i]]
  
  #get the predicted network size of both groups
  predicted_list[[i]] <- tibble(y_a = model_object$y$y.A,
  y_b = model_object$y$y.B,
  y_diff = model_object$y$y.diff,
  model = i)
  
  #get the results of the twofold analysis for the plot
  two_fold_results_plot_list[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == 1) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8) %>% 
    pivot_longer(data = .,
                 cols = 1:8,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
    mutate(model = i)
  
  #get the results of the twofold analysis for the table
  two_fold_results_table_list[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == -2) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8)
  
  #get the results of the twofold analysis for the variable plot
  two_fold_variables_list[[i]] <- model_object$twofold$variables[[6]] %>% 
    as_tibble() %>% 
    mutate(variables = row.names(model_object$twofold$variables[[6]])) %>% 
    filter(group.weight == -2) %>% 
    select(10,2:5) %>% 
    mutate(model = i)
}

Twofold decomposition results table

#extract twofold information
twofold_table <- two_fold_results_table_list %>% 
  bind_rows() %>% 
  filter(row_number()> 4) %>% 
  mutate(model = factor(1:4,
                        levels = 1:4,
                        labels = c(
                          "1st gen Turkish-Dutch -  2nd gen Turkish-Dutch",
                          "1st gen Moroccan-Dutch -  2nd gen Moroccan-Dutch",
                          "Turkish-Dutch -  Dutch-Majority",
                          "Moroccan-Dutch -  Dutch-Majority"
                        ))) %>% 
  select(9,1:4) %>% 
  mutate(z_explained = coef_explained/se_explained,
         z_unexplained = coef_unexplained/se_unexplained) %>% 
  select(model, coef_explained, z_explained, coef_unexplained, z_unexplained)

predicted_df <- predicted_list %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  select(1:3)

#show results in table
twofold_table %>% 
  bind_cols(predicted_df) %>% 
  select(1,6:8, 2:5) %>% 
  kbl(digits = 3,
      align = 'l') %>% 
  kable_paper()
model y_a y_b y_diff coef_explained z_explained coef_unexplained z_unexplained
1st gen Turkish-Dutch - 2nd gen Turkish-Dutch 6.288 6.639 -0.351 -0.061 -1.045 -0.290 -3.240
1st gen Moroccan-Dutch - 2nd gen Moroccan-Dutch 6.491 6.711 -0.220 0.074 0.800 -0.294 -2.154
Turkish-Dutch - Dutch-Majority 6.459 6.711 -0.252 -0.025 -0.722 -0.227 -4.103
Moroccan-Dutch - Dutch-Majority 6.613 6.711 -0.098 0.007 0.167 -0.106 -1.760

Twofold panel plot

#create plot df
variables_plot_df <- two_fold_variables_list %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  mutate(model = factor(model, 
                        levels = 5:8,
                        labels = c(
                          "Turkish-Dutch 1st gen - 2nd gen",
                          "Moroccan-Dutch 1st gen - 2nd gen",
                          "Turkish-Dutch -  Dutch-Majority",
                          "Moroccan-Dutch -  Dutch-Majority"
                        ))) %>% 
  rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`) %>%
  pivot_longer(data = .,
                 cols = 2:5,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
  pivot_wider(names_from = var,
              values_from = value) %>% 
  mutate(abs_z = abs(coef/se),
         sig = ifelse(abs_z > 1.95, 1, 0),
         sig_f = factor(sig,
                        levels = 0:1,
                        labels = c("", "*")))

#create vector to filter variables
indep_names <- c(
  "rel_fac_2Monthly or more",
  "rel_fac_23-11 Year",
  "rel_fac_21-2 Year",
  "paid_work",
  "in_education",
  "per_NW_mig_a_pc4",
  "Inwoner_pc4",
  "education_recoded_numeric",
  "therm_outgroup_diff",
  "age",
  "partner",
  "genderFemale",
  "woz_pc4")

#create plot
variables_decomp_size_plot <- variables_plot_df %>% #dataprep
  filter(variables %in% indep_names) %>%
  mutate(
    var_fac = case_when(
      variables == "paid_work" ~ 1,
      variables == "in_education" ~ 2,
      variables == "rel_fac_21-2 Year" ~ 3,
      variables == "rel_fac_23-11 Year" ~ 4,
      variables == "rel_fac_2Monthly or more" ~ 5,
      variables == "education_recoded_numeric" ~ 6,
      variables == "Inwoner_pc4" ~ 7,
      variables == "per_NW_mig_a_pc4" ~ 8,
      variables == "woz_pc4" ~ 9,
      variables == "therm_outgroup_diff" ~ 10,
      variables == "age" ~ 11,
      variables == "partner" ~ 12,
      variables == "genderFemale" ~ 13
    ),
    var_fac = factor(
      var_fac,
      levels = 1:13,
      labels = c(
        "Paid work",
        "Currently in education",
        "Religious attendance[1-2 year]",
        "Religious attendance[3-11 a year]",
        "Religious attendance[Once a month or more]",
        "Educational attainment",
        "Neighbourhood: # inhabitants",
        "Neighbourhood: mean property valuation",
        "Neighbourhood: % non-western migrant",
        "Outgroup attitudes",
        "Age",
        "Partner[no partner]",
        "Gender[Male]"
      )
    )
  ) %>% #plot
  ggplot(aes(x = coef, y = var_fac)) +
  geom_col(aes(colour = name,
             fill = name),
           alpha = 0.6) + 
  geom_errorbar(aes(xmin = coef - (se*1.96),
                    xmax = coef + (se*1.96))) +
  geom_vline(xintercept = 0) +
  geom_text(aes(label = sig_f),
            hjust = 2,
            size = 6) +
  facet_grid(cols = vars(model),
             rows = vars(name)) +
  theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black")) +
  labs(y = "Independent Variables",
       x = "Coefficient",
       caption = "* p < 0.05")

#save plot
ggsave(variables_decomp_size_plot,
        file = "data_analysis/plots/results/variables_decomp_size_plot.jpg",
       width = 12,
       height = 8,
       dpi = 320)

#show plot
variables_decomp_size_plot

Regression model results per group

All (sub)groups

#estimate model for different groups
size_pooled_model <- imp %>%
  filter(gender != "Other") %>%
   group_split(migration_background_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      log(mean) ~ age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4 +
        therm_outgroup_diff,
      data = .x
    )
  )

#Show results in table
sjPlot::tab_model(size_pooled_model,
          pred.labels = c(
            "Constant",
           "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants"
            ),
          dv.labels = c("Dutch", "1st gen Turkish", "2nd gen Turkish", "1st gen Moroccan","2nd gen Moroccan"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )
  Dutch 1st gen Turkish 2nd gen Turkish 1st gen Moroccan 2nd gen Moroccan
Predictors Estimates Statistic Estimates Statistic Estimates Statistic Estimates Statistic Estimates Statistic
Constant 6.544 76.814 5.950 25.410 6.622 39.647 6.254 16.001 6.309 26.814
Age -0.002 -0.574 -0.010 -0.923 0.006 0.559 0.008 0.447 0.017 1.410
Gender [Fenale] -0.003 -0.070 -0.053 -0.397 -0.076 -0.607 -0.132 -0.628 -0.182 -1.275
Educational attainment -0.024 -1.147 0.024 0.530 -0.016 -0.344 -0.059 -0.927 -0.010 -0.203
Partner[Has a partner] 0.014 0.240 -0.076 -0.438 -0.177 -1.215 0.007 0.026 0.073 0.441
Paid work 0.205 2.850 0.114 0.506 0.031 0.197 0.130 0.330 0.314 1.835
Currently in education 0.033 0.436 0.256 1.691 0.206 1.560 0.246 0.923 0.419 2.398
Religious attendance[1-2 year] 0.233 3.513 0.399 1.764 -0.128 -0.776 -0.022 -0.080 0.057 0.322
Religious attendance[3-11 a year] 0.074 0.589 0.340 1.506 -0.153 -0.800 0.182 0.494 0.182 0.954
Religious attendance[Once a month or more] 0.047 0.556 0.515 2.541 0.221 1.391 0.094 0.344 0.131 0.743
Outgroup attitude -0.003 -1.432 0.002 0.535 -0.001 -0.225 0.003 0.435 -0.004 -0.896
Neighborhood: % non-western migrant 0.000 1.446 -0.001 -2.061 -0.000 -0.554 -0.001 -0.546 -0.001 -1.329
Neighborhood: mean property valuation -0.000 -0.965 0.000 0.974 -0.000 -0.465 -0.000 -1.488 0.000 0.693
Neighborhood: # inhabitants -0.003 -2.665 0.007 2.089 -0.002 -0.558 -0.000 -0.048 0.003 0.662
Observations 658 140 134 73 91
R2 / R2 adjusted 0.072 / 0.053 0.167 / 0.082 0.108 / 0.011 0.111 / -0.085 0.189 / 0.052

Only major groups.

results_list <- imp %>%
  filter(gender != "Other") %>% 
  group_split(migration_background_simple_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      log(mean) ~  age +
        gender + 
        education_recoded_numeric +
        therm_outgroup_diff +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4,
      data = .x
    )
  )

sjPlot::tab_model(results_list,
          pred.labels = c(
            "Constant",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants"
            ),
          dv.labels = c("Dutch","Turkish-Dutch", "Moroccan-Dutch"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )
  Dutch Turkish-Dutch Moroccan-Dutch
Predictors Estimates Statistic Estimates Statistic Estimates Statistic
Constant 6.544 76.814 6.286 46.427 6.256 32.820
Age -0.002 -0.574 -0.007 -1.014 0.007 0.787
Gender [Fenale] -0.003 -0.070 -0.049 -0.547 -0.121 -1.037
Educational attainment -0.024 -1.147 -0.006 -0.183 -0.024 -0.682
Partner[Has a partner] -0.003 -2.665 0.002 0.895 0.001 0.268
Paid work 0.014 0.240 -0.128 -1.188 0.017 0.121
Currently in education 0.205 2.850 0.032 0.245 0.270 1.679
Religious attendance[1-2 year] 0.033 0.436 0.236 2.357 0.322 2.357
Religious attendance[3-11 a year] 0.233 3.513 0.084 0.623 0.018 0.119
Religious attendance[Once a month or more] 0.074 0.589 0.102 0.699 0.182 1.053
Outgroup attitude 0.047 0.556 0.325 2.593 0.093 0.633
Neighborhood: % non-western migrant -0.003 -1.432 0.000 0.093 0.000 0.002
Neighborhood: mean property valuation 0.000 1.446 -0.001 -2.335 -0.001 -1.559
Neighborhood: # inhabitants -0.000 -0.965 0.000 0.085 -0.000 -0.564
Observations 658 274 164
R2 / R2 adjusted 0.072 / 0.053 0.114 / 0.070 0.102 / 0.024

Ethnic homogeneity

Decomposition estimation

#set seed
set.seed(2702)

#create the analysis function
oaxaca_spec_f <- function(x) {
  results <- x  %>%
  oaxaca(
    formula = per_ingroup_w ~  age +
        gender + 
        education_recoded_numeric +
        therm_outgroup_diff +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4 | z ,
  reg.fun = lm,
  data = .)
}

#create model list
oaxaca_model_list_homogeneity  <- map(.x = data_list, 
    .f = ~ oaxaca_spec_f(x = .x))

predicted_list_hom <- list()
two_fold_results_plot_list_hom <- list()
two_fold_results_table_list_hom <- list()
two_fold_variables_list_hom <- list()

#select correct information from results
for(i in 1:length(oaxaca_model_list_homogeneity)) { #i = 8
  model_object <- oaxaca_model_list_homogeneity[[i]]
  
  predicted_list_hom[[i]] <- tibble(y_a = model_object$y$y.A,
  y_b = model_object$y$y.B,
  y_diff = model_object$y$y.diff,
  model = i)
  
  two_fold_results_plot_list_hom[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == -2) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8) %>% 
    pivot_longer(data = .,
                 cols = 1:8,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
    mutate(model = i)
  
  two_fold_results_table_list_hom[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == -2) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8)
  
  
  two_fold_variables_list_hom[[i]] <- model_object$twofold$variables[[6]] %>% 
    as_tibble() %>% 
    mutate(variables = row.names(model_object$twofold$variables[[2]])) %>% 
    filter(group.weight == -2) %>% 
    select(10,2:5) %>% 
    mutate(model = i)
  
}

Decomposition results table

#extract twofold information
twofold_table_hom <- two_fold_results_table_list_hom %>% 
  bind_rows() %>% 
  filter(row_number()> 4) %>% 
  mutate(model = factor(1:4,
                        levels = 1:4,
                        labels = c(
                          "1st gen Turkish -  2nd gen Turkish",
                          "1st gen Moroccan -  2nd gen Moroccan",
                          "Turkish -  Dutch-Majority",
                          "Moroccan -  Dutch-Majority"
                        ))) %>% 
  select(9,1:4) %>% 
  mutate(z_explained = coef_explained/se_explained,
         z_unexplained = coef_unexplained/se_unexplained) %>% 
  select(model,coef_explained, z_explained, coef_unexplained, z_unexplained)


predicted_df_hom <- predicted_list_hom %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  select(1:3)

#create resuls table
twofold_table_hom %>% 
  bind_cols(predicted_df_hom) %>% 
  select(1,6:8, 2:5) %>% 
    kbl(digits = 3,
      align = 'l') %>% 
  kable_paper()
model y_a y_b y_diff coef_explained z_explained coef_unexplained z_unexplained
1st gen Turkish - 2nd gen Turkish 47.423 55.233 -7.810 -2.507 -1.112 -5.304 -1.239
1st gen Moroccan - 2nd gen Moroccan 47.997 50.390 -2.393 1.993 0.654 -4.385 -0.973
Turkish - Dutch-Majority 51.243 64.341 -13.098 -2.570 -1.621 -10.528 -3.402
Moroccan - Dutch-Majority 49.325 64.341 -15.016 -4.782 -2.974 -10.234 -4.058

Decomposition results plot

#create plot df
variables_plot_df_hom <- two_fold_variables_list_hom %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  mutate(model = factor(model, 
                        levels = 5:8,
                        labels = c(
                          "Turkish-Dutch 1st gen - 2nd gen",
                          "Moroccan-Dutch 1st gen - 2nd gen",
                          "Turkish-Dutch -  Dutch-Majority",
                          "Moroccan-Dutch -  Dutch-Majority"
                        ))) %>% 
  rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`) %>%
  pivot_longer(data = .,
                 cols = 2:5,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
  pivot_wider(names_from = var,
              values_from = value) %>% 
  mutate(abs_z = abs(coef/se),
         sig = ifelse(abs_z > 1.95, 1, 0),
         sig_f = factor(sig,
                        levels = 0:1,
                        labels = c("", "*")))

#create selection variable
indep_names <- c(
  "rel_fac_2Monthly or more",
  "rel_fac_23-11 Year",
  "rel_fac_21-2 Year",
  "paid_work",
  "in_education",
  "per_NW_mig_a_pc4",
  "Inwoner_pc4",
  "education_recoded_numeric",
  "therm_outgroup_diff",
  "age",
  "partner",
  "genderFemale",
  "woz_pc4")

#create decomposition homogeneity plot
variables_decomp_hom_plot <- variables_plot_df_hom %>%
  filter(variables %in% indep_names) %>%
  mutate(
    var_fac = case_when(
      variables == "paid_work" ~ 1,
      variables == "in_education" ~ 2,
      variables == "rel_fac_21-2 Year" ~ 3,
      variables == "rel_fac_23-11 Year" ~ 4,
      variables == "rel_fac_2Monthly or more" ~ 5,
      variables == "education_recoded_numeric" ~ 6,
      variables == "Inwoner_pc4" ~ 7,
      variables == "per_NW_mig_a_pc4" ~ 8,
      variables == "woz_pc4" ~ 9,
      variables == "therm_outgroup_diff" ~ 10,
      variables == "age" ~ 11,
      variables == "partner" ~ 12,
      variables == "genderFemale" ~ 13
    ),
    var_fac = factor(
      var_fac,
      levels = 1:13,
      labels = c(
        "Paid work",
        "Currently in education",
        "Religious attendance[1-2 year]",
        "Religious attendance[3-11 a year]",
        "Religious attendance[Once a month or more]",
        "Educational attainment",
        "Neighbourhood: # inhabitants",
        "Neighbourhood: % non-western migrant",
        "Neighbourhood: mean property valuation",
        "Outgroup attitudes",
        "Age",
        "Partner[no partner]",
        "Gender[Male]"
      )
    )
  ) %>% 
  ggplot(aes(x = coef, y = var_fac)) +
  geom_col(aes(colour = name,
             fill = name),
           alpha = 0.6) + 
  geom_errorbar(aes(xmin = coef - (se*1.96),
                    xmax = coef + (se*1.96))) +
  facet_grid(cols = vars(model),
             rows = vars(name)) +
  geom_vline(xintercept = 0) +
  geom_text(aes(label = sig_f),
            hjust = 2,
            size = 6) +
  theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "bottom",
        #plot.background = element_rect(fill = '#404040', colour = '#404040'),
        #panel.background = element_rect(fill = '#404040', colour = '#404040'),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black"),
        legend.title = element_blank()) +
  labs(y = "Independent Variables",
       x = "Coefficient",
       caption = "* p < 0.05")

#save plot
  ggsave(variables_decomp_hom_plot,
        file = "data_analysis/plots/results/variables_decomp_hom_plot.jpg",
       width = 12,
       height = 8,
       dpi = 320)

  #show plot
variables_decomp_hom_plot

Regression model results per group

All (sub)groups

#create list with regression by migration group
results_list <- imp %>%
  filter(gender != "Other") %>% 
  group_split(migration_background_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      per_ingroup_w ~  age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        therm_outgroup_diff +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4,
      data = .x
    )
  )

#show table with results
sjPlot::tab_model(results_list,
          pred.labels = c(
            "Constant",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants"
            ),
          dv.labels = c("Dutch","1st gen Turkish", "2nd gen Turkish", "1st gen Moroccan", "2nd gen Moroccan"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )
  Dutch 1st gen Turkish 2nd gen Turkish 1st gen Moroccan 2nd gen Moroccan
Predictors Estimates Statistic Estimates Statistic Estimates Statistic Estimates Statistic Estimates Statistic
Constant 58.544 13.007 43.347 4.098 48.772 7.732 59.849 4.912 37.304 4.394
Age -0.251 -1.301 0.425 0.835 0.108 0.287 0.645 1.175 0.500 1.143
Gender [Fenale] -0.663 -0.258 -0.958 -0.159 1.840 0.387 -2.541 -0.388 14.452 2.799
Educational attainment 2.308 2.069 -0.368 -0.182 -3.198 -1.857 0.179 0.090 0.631 0.357
Partner[Has a partner] -1.168 -0.372 -0.608 -0.077 -3.592 -0.653 -14.754 -1.778 -3.992 -0.673
Paid work -0.198 -0.052 6.658 0.657 -3.138 -0.522 -8.770 -0.714 4.279 0.692
Currently in education 3.530 0.893 -3.897 -0.570 0.008 0.002 -6.621 -0.798 5.834 0.926
Religious attendance[1-2 year] 2.352 0.671 16.399 1.606 1.246 0.201 7.031 0.820 9.305 1.447
Religious attendance[3-11 a year] -5.375 -0.811 -2.216 -0.217 13.634 1.892 5.392 0.468 -1.516 -0.220
Religious attendance[Once a month or more] -0.646 -0.146 15.779 1.722 14.742 2.459 5.172 0.608 0.914 0.144
Outgroup attitude -0.192 -3.349 0.029 0.204 -0.215 -1.781 -0.387 -1.610 0.005 0.034
Neighborhood: % non-western migrant -0.472 -3.689 -0.074 -0.355 0.059 0.431 0.164 0.863 0.201 1.384
Neighborhood: mean property valuation 0.006 0.450 -0.032 -1.014 -0.007 -0.230 0.038 0.857 0.047 1.435
Neighborhood: # inhabitants 0.000 0.034 0.000 0.372 0.001 1.485 -0.000 -0.372 -0.001 -0.861
Observations 658 140 134 73 91
R2 / R2 adjusted 0.054 / 0.034 0.087 / -0.007 0.164 / 0.074 0.156 / -0.030 0.203 / 0.068

All major groups.

#list withr results of each group
results_list <- imp %>%
  filter(gender != "Other") %>% 
  group_split(migration_background_simple_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      per_ingroup_w ~  age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        therm_outgroup_diff +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4,
      data = .x
    )
  )

#show results in a table
sjPlot::tab_model(results_list,
          pred.labels = c(
            "Constant",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % co-ethnic",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants",
            "Therm missing"
            ),
          dv.labels = c("Dutch","Turkish-Dutch", "Moroccan-Dutch"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )
  Dutch Turkish-Dutch Moroccan-Dutch
Predictors Estimates Statistic Estimates Statistic Estimates Statistic
(Intercept) 58.544 13.007 48.506 8.663 45.919 7.176
age -0.251 -1.301 0.073 0.252 0.453 1.608
genderFemale -0.663 -0.258 -0.103 -0.028 7.324 1.867
education_recoded_numeric 2.308 2.069 -0.913 -0.722 0.174 0.145
partner -1.168 -0.372 -2.559 -0.574 -7.029 -1.509
in_education -0.198 -0.052 1.017 0.186 2.206 0.409
paid_work 3.530 0.893 -2.846 -0.686 0.164 0.036
rel_fac_21-2 Year 2.352 0.671 9.702 1.747 6.540 1.312
rel_fac_23-11 Year -5.375 -0.811 6.451 1.065 -0.482 -0.083
rel_fac_2Monthly or more -0.646 -0.146 16.219 3.124 1.782 0.360
therm_outgroup_diff -0.192 -3.349 -0.080 -0.866 -0.210 -1.658
per_NW_mig_a_pc4 -0.472 -3.689 -0.078 -0.662 0.205 1.826
woz_pc4 0.006 0.450 -0.029 -1.398 0.034 1.374
Inwoner_pc4 0.000 0.034 0.001 1.131 -0.000 -0.267
Observations 658 274 164
R2 / R2 adjusted 0.054 / 0.034 0.088 / 0.042 0.103 / 0.026
---
title: "Inferential analysis: Blinder-Oaxaca decomposition"
author: "Thijmen Jeroense"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    toc: TRUE
    toc_depth: 3
    toc_float: TRUE
    code_folding: show
    code_download: TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(cache = TRUE, message = FALSE, warning = FALSE, results = "asis",
                      fig.align = "center")
```


# Initial set up

```{r libraries and files, results = 'hide'}
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  })
}
packages = c("tidyverse", 
             "viridis", "kableExtra","ggridges", "viridis", "ggdark", "ggplot2", "patchwork", "mice","qualvar",
             "oaxaca", "foreach"
)
fpackage.check(packages)
```


# Goal
Create the descriptive statistics table and estimate the decomposition analysis.

# Data preperation

## NELLS and NSUM data import

Import the NSUM data and recreate the NSUM module. 

```{r files}
#import NELLS file.
load(file = "data_analysis/data/data_processed/nells_data/2023-05-08_nells-nsum-prepped-data.rds")

```

Import the model estimates from the estimated NSUM models. We have chosen the model which uses Ibrahim for the ethnic names. 

```{r import results}

if (file.exists(
  "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds"
)) {
  load(file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
} else {
  list_files <-
    as.list(
      dir(
        "data_analysis/results/nsum_output/main/model/",
        full.names = T
      )
    )
  #create loop lists
  kds <- list()
  kdssd <- list()
  data <- list()
  list_df <- list()
  
  #loop to extract information
  for (i in 1:length(list_files)) {
    #i = 1
    print(paste0("Number ", i, " of ", length(list_files)))
    load(list_files[[i]])
    kds[[i]] <-
      rowMeans(degree$d.values, na.rm = TRUE) # calculate rowmean of netsize iterations: so the retained chains
    kdssd[[i]] <-
      matrixStats::rowSds(degree$d.values) # calculate sd of 4k estimates per row: sd for those values
    data[[i]] <- cbind(kds[[i]], kdssd[[i]]) # combine them
    list_df[[i]] <-
      cbind(as_tibble(data[[i]]), nells_nsum$id) # add NELLS id variable
    strings <-
      str_split(str_extract(list_files[[i]][1], pattern = "estimates.+"),
                pattern = "_")  # add holdout and tauk number
    list_df[[i]] <- list_df[[i]] %>%
      mutate(
        holdout = as.numeric(str_extract(strings[[1]][2], pattern = "[[:digit:]]{1,}")))
  }
    #combine results and save
    df_models_nsum_long <- list_df %>%
      bind_rows() %>%
      rename(mean = V1,
             sd = V2,
             id = 3)
    #save image
    save(df_models_nsum_long, file = "data_analysis/results/nsum_output/main/combined_data/df_models_nsum_long.rds")
}
```


## Prepare data for analysis

Final data preperation for analysis

```{r holdout 10}

#extract results of NSUM 
nsum_estimates <- df_models_nsum_long %>% 
  filter(holdout == 10) 

#add nsum results to NELLS data
nells_df <- nsum_estimates %>% 
  left_join(nells_nsum)

#data prep
nells_df <- nells_df %>% 
  mutate(has_child = if_else(nr_children == "0", 0, 1),
         religious = if_else(religious_denom == "None", 0, 1),
         education_num = as.numeric(education_completed_highest),
         education_recoded_numeric = case_when(
           education_num == 1 ~ 1,
           education_num %in% c(2,13) ~ 2,
           education_num %in% c(3,4,14) ~ 3,
           education_num %in% c(5,6) ~ 4,
           education_num %in% c(7,8) ~ 5,
           education_num %in% c(9,15) ~ 6,
           education_num %in% c(10,11,12) ~ 7
         )
  )

#create inwoner coethnic
nells_df <- nells_df %>% 
  mutate(inwoner_coethnic_pc4 = case_when(
    str_detect(migration_background_fac, "kish") ~ per_TKY_mig_a_pc4,
    str_detect(migration_background_fac, "occan") ~ per_MOR_mig_a_pc4,
    migration_background_fac == "Dutch-Majority" ~ per_NL_mig_a_pc4,
    migration_background_fac == "Other" ~ per_MOR_mig_a_pc4 + per_TKY_mig_a_pc4
  ),
  inwoner_coethnic_pc4 = inwoner_coethnic_pc4*100,
  inwoner_coethnic_gem = case_when(
    str_detect(migration_background_fac, "kish") ~ per_TKY_mig_a_gem,
    str_detect(migration_background_fac, "occan") ~ per_MOR_mig_a_gem,
    migration_background_fac == "Dutch-Majority" ~ per_NL_mig_a_gem,
    migration_background_fac == "Other" ~ per_MOR_mig_a_gem + per_TKY_mig_a_gem
  ),
  inwoner_coethnic_gem = inwoner_coethnic_gem*100,
  per_NW_mig_a_gem = per_NW_mig_a_gem*100,
  per_NW_mig_a_pc4 = per_NW_mig_a_pc4*100
  )

#thermometer score outgroup
nells_df <- nells_df %>% 
   mutate(therm_outgroup_mean = case_when(
    str_detect(migration_background_fac, "kish") ~ (therm_dutch_maj + therm_mor)/2,
    str_detect(migration_background_fac, "occan") ~ (therm_dutch_maj + therm_tur)/2,
    migration_background_fac == "Dutch-Majority" ~ (therm_mor + therm_tur)/2,
    migration_background_fac == "Other" ~ (therm_dutch_maj + therm_tur + therm_mor)/3
  ),
  therm_outgroup_diff = case_when(
    str_detect(migration_background_fac, "kish") ~ ((therm_dutch_maj + therm_mor)/2) - therm_tur,
    str_detect(migration_background_fac, "occan") ~ ((therm_dutch_maj + therm_tur)/2) - therm_mor,
    migration_background_fac == "Dutch-Majority" ~ ((therm_mor + therm_tur)/2) - therm_dutch_maj,
    migration_background_fac == "Other" ~ (therm_dutch_maj + therm_tur + therm_mor)/3
  )) 

```

Prepare ethnic homogeneity measure.

```{r ethnic hom names}
nells_df <- nells_df %>%
  mutate(
    sum_dutch = knows_daan_boundary + knows_kevin_boundary + knows_emma_boundary + knows_linda_boundary + knows_albert_boundary + knows_edwin_boundary + knows_willemina_boundary + knows_ingrid_boundary,
    sum_turkish = knows_ibrahim_boundary + knows_esra_boundary,
    sum_moroccan = knows_mohammed_boundary + knows_fatima_boundary,
    sum_cat = knows_prison + knows_mbo + knows_secundary,
    +knows_hbo + knows_university + knows_unemployed + knows_secondhome,
    sum_total = sum_dutch + sum_turkish + sum_moroccan,
    per_dutch = (sum_dutch / sum_total) * 100,
    per_turkish = (sum_turkish / sum_total) * 100,
    per_moroccan = (sum_moroccan / sum_total) * 100
  )

#assign correct percentage co-ethnic to each group
nells_df <- nells_df %>% 
  mutate(per_ingroup = case_when(
    str_detect(migration_background_fac, "kish") ~ per_turkish,
    str_detect(migration_background_fac, "occan") ~ per_moroccan,
    migration_background_fac == "Dutch-Majority" ~ per_dutch,
    migration_background_fac == "Other" ~ per_dutch
  ))

#weighted by name frequency.
nells_df <- nells_df %>%
  mutate(
    sum_dutch_w = knows_daan_boundary/22704 + 
      knows_kevin_boundary/23167 +
      knows_emma_boundary/18730 + 
      knows_linda_boundary/29955 + 
      knows_albert_boundary/31767 + 
      knows_edwin_boundary/21866 + 
      knows_willemina_boundary/17133 + 
      knows_ingrid_boundary/31323,
    sum_turkish_w = knows_ibrahim_boundary/2099 + 
      knows_esra_boundary/1878,
    sum_moroccan_w = knows_mohammed_boundary/13448 + 
      knows_fatima_boundary/2808,
    sum_total_w = sum_dutch_w + sum_turkish_w + sum_moroccan_w,
    per_dutch_w = (sum_dutch_w / sum_total_w) * 100,
    per_turkish_w = (sum_turkish_w / sum_total_w) * 100,
    per_moroccan_w = (sum_moroccan_w / sum_total_w) * 100
  )

#assign correct percentage co-ethnic to each group
nells_df <- nells_df %>% 
  mutate(per_ingroup_w = case_when(
    str_detect(migration_background_fac, "kish") ~ per_turkish_w,
    str_detect(migration_background_fac, "occan") ~ per_moroccan_w,
    migration_background_fac == "Dutch-Majority" ~ per_dutch_w,
    migration_background_fac == "Other" ~ per_dutch_w
  ))

```

Create homogeneity measure based on naive NSUM method. First create the function.

```{r nusm naive frq function}
#get the frequencies
frequencies <- read_csv("data_analysis/2022-08-26_namefrequencies.csv")

name_df <- nells_nsum %>% 
  select(id, contains("boundary")) %>% 
  select(-contains("prison"))

#nsum function
function_nsum <- function(dat, #data field of x with ID
                          popsize, #population size
                          freq) {#frequency vector
  #freq extract
  freq <- freq %>% 
    pull(number)
  
  #create df with ID
  dat_id <- dat %>%
    select(id)
  
  #create df without ID
  dat <- dat %>% select(-id)
  
  #extract names
  names_dat <- names(dat) %>%
    str_match(., "_(.*)_")
  
  #select correct list
  names_dat <- names_dat[, 2]
  
  
  #loop to apply naive function for all elements in the DF
  nsum_df <- foreach(a = 1:ncol(dat),
                     .combine = cbind) %do% {
                       dat %>%
                         select(.data[[names(dat)[a]]]) %>%
                         mutate(!!paste0(names_dat[a], "_predicted") := (.data[[names(dat)[a]]] /
                                                                           freq[a]) * popsize)
                     }
  #create NSUM predicted estimate (averaged over all individual scale ups)
  nsum_df <- nsum_df %>%
    bind_cols(dat_id) %>% #add ID
    rowwise() %>% #rowwise means
    mutate(ave_predicted = mean(x = c_across(cols = contains("predicted")),
                                na.rm = T)) %>%
    ungroup()
  
  return(nsum_df)
}

```

Apply function to NELLS data.

```{r ethnic composition plot}
#Create naive scale up estimates with all names. 
name_all_df <- function_nsum(dat = name_df,
              popsize = 17407585,
              freq = frequencies) 

#rename all names and delete individual predictions
name_all_df <- name_all_df %>% 
  rename(all_pred = ave_predicted) %>% 
  rowwise() %>% 
  mutate(maj_pred = mean(x = c_across(cols = c(
    daan_predicted,
    kevin_predicted,
    edwin_predicted,
    albert_predicted,
    linda_predicted,
    emma_predicted,
    ingrid_predicted,
    willemina_predicted
  )),
  na.rm = T),
  min_pred = mean(x = c_across(cols = c(
    esra_predicted,
    mohammed_predicted,
    ibrahim_predicted,
    fatima_predicted,
  )),
  na.rm = T),
  ) %>% 
  ungroup()

#combine information
nells_df <- nells_df %>% 
  left_join(name_all_df, by = "id")

```


## Delete outliers and filter out "Other" mig background

```{r delete outliers}
nells_df <- nells_df %>% 
    mutate(mean_size = mean(mean, na.rm = T),
         sd_size = sd(mean, na.rm = T),
         z = (mean - mean_size)/sd_size) %>% 
  filter(z < 3) %>% 
  filter(migration_background_fac != "Other")
```

```{r create standardized variables}
#create standardized variables
nells_df <- nells_df %>% 
  mutate(sd_maj_pred = sd(maj_pred, na.rm = T),
         sd_min_pred = sd(min_pred, na.rm = T),
         z_maj_pred = (maj_pred - mean(maj_pred, na.rm = T))/sd_maj_pred,
         z_min_pred = (min_pred - mean(min_pred, na.rm = T))/sd_min_pred,
         z_value_ingroup = ifelse(migration_background_fac == "Dutch-Majority", z_maj_pred, z_min_pred),
         z_value_outgroup = ifelse(migration_background_fac == "Dutch-Majority", z_min_pred, z_maj_pred),
         z_distance_groups = z_value_outgroup - z_value_ingroup,
         z_distance_min_maj = z_min_pred - z_maj_pred) 
```


## Missing data imputation

```{r Missing imputation: simple mean, results='hide'}
nells_df <- nells_df %>% 
  mutate(therm_diff_missing = ifelse(is.na(therm_outgroup_diff), 1, 0),
         therm_mean_missing = ifelse(is.na(therm_outgroup_mean), 1, 0),
         pc4_missing = ifelse(is.na(Inwoner_pc4), 1, 0))

imp <- nells_df %>% 
  select(migration_background_fac,
      age,
      gender,
      in_education,
      education_recoded_numeric,
      partner,
      partner_extended,
      has_child,
      paid_work,
      religious,
      religious_denom,
      rel_attendance,
      inwoner_coethnic_pc4,
      inwoner_coethnic_gem,
      therm_outgroup_mean,
      therm_outgroup_diff,
      therm_dutch_maj,
      therm_mor,
      therm_tur,
      starts_with("per"),
      woz_pc4,
      mean_woz_gem,
      Inwoner_gem,
      Inwoner_pc4,
      per_NW_mig_a_pc4) %>%  
  select(-per_ingroup, -per_ingroup_w) %>% 
  mice(method = "mean") %>% 
  complete()

imp <- nells_df %>%
  select(
    id,
    mean,
    per_ingroup,
    per_ingroup_w,
    therm_diff_missing,
    therm_mean_missing,
    pc4_missing,
    z_maj_pred,
    z_min_pred,
    z_value_ingroup,
    z_value_outgroup,
    z_distance_groups,
    z_distance_min_maj
  ) %>%
  bind_cols(imp)
```

## Mean centering of continouos variables

```{r mean subtraction}
imp <- imp %>% 
  mutate(education_recoded_numeric = education_recoded_numeric - mean(education_recoded_numeric, na.rm = T),
         age = age - mean(age, na.rm = T),
         woz_pc4 = woz_pc4 - mean(woz_pc4, na.rm = T),
         mean_woz_gem = mean_woz_gem - mean(mean_woz_gem, na.rm = T),
         Inwoner_gem = Inwoner_gem - mean(Inwoner_gem, na.rm = T),
         Inwoner_pc4 = Inwoner_pc4 - mean(Inwoner_pc4, na.rm = T),
         therm_outgroup_mean = therm_outgroup_mean - mean(therm_outgroup_mean, na.rm = T),
         therm_outgroup_diff = therm_outgroup_diff - mean(therm_outgroup_diff, na.rm = T),
         inwoner_coethnic_pc4 = inwoner_coethnic_pc4 - mean(inwoner_coethnic_pc4, na.rm = T),
         inwoner_coethnic_gem = inwoner_coethnic_gem - mean(inwoner_coethnic_gem, na.rm = T),
        per_NW_mig_a_pc4 = per_NW_mig_a_pc4 - mean(per_NW_mig_a_pc4, na.rm = T)
         )

```

## Set correct labels of categorical variables. 

```{r data prep decom}
#relabel the factor variables. 
imp <- imp %>%
  mutate(
    migration_background_fac = fct_relevel(migration_background_fac,
                                                "Dutch-Majority",
                                                "1st gen Turkish",
                                                "2nd gen Turkish",
                                                "1st gen Moroccan",
                                                "2nd gen Moroccan"),
         migration_background_fac = factor(as.numeric(migration_background_fac),
         levels = 1:5,
         labels = c("Dutch Majority",
                                                "1st gen Turkish-Dutch",
                                                "2nd gen Turkish-Dutch",
                                                "1st gen Moroccan-Dutch",
                                                "2nd gen Moroccan-Dutch")),
    migration_background_simple_fac = case_when(
      migration_background_fac == "1st gen Turkish-Dutch" ~ 2,
      migration_background_fac == "2nd gen Turkish-Dutch" ~ 2,
      migration_background_fac == "1st gen Moroccan-Dutch" ~ 3,
      migration_background_fac == "2nd gen Moroccan-Dutch" ~ 3,
      migration_background_fac == "Dutch Majority" ~ 1
    ),
    migration_background_simple_fac = factor(
      migration_background_simple_fac,
      levels = 1:3,
      labels = c("Dutch Majority", "Turkish-Dutch", "Moroccan-Dutch")
    )
  )

#not all subgroups have complete cases for all the factor variables. Hence, the need to combine some categories. 
imp <- imp %>%
  mutate(
    rel_attendance_num = if_else(as.numeric(rel_attendance) > 3, 4, as.numeric(rel_attendance)),
    rel_fac_2 = factor(
      rel_attendance_num,
      levels = 1:4,
      labels = c("Never", "1-2 Year", "3-11 Year", "Monthly or more")
    ),
    partner = if_else(partner_extended == "Does not have a partner", 0, 1)
  )  



```


# Descriptive statistics table

## Desstats table function and table creation
```{r descriptive statistics table function}
prepare_desstats <- function(x, full.sample) {
  if (full.sample == T) {
    desstats <- x %>%
      mutate(
        ln_size = log(mean),
        Dutch = if_else(migration_background_fac == "Dutch Majority", 1, 0),
        `2nd_gen_Moroccan` = if_else(migration_background_fac == "2nd gen Moroccan-Dutch", 1, 0),
        `2nd_gen_Turkish` = if_else(migration_background_fac == "2nd gen Turkish-Dutch", 1, 0),
        `1st_gen_Turkish` = if_else(migration_background_fac == "1st gen Turkish-Dutch", 1, 0),
        `1st_gen_Moroccan` = if_else(migration_background_fac == "1st gen Moroccan-Dutch", 1, 0),
        age = as.numeric(age),
        female = if_else(gender == "Female", 1, 0),
        gender_other = if_else(gender == "Other", 1, 0),
        no_partner = if_else(partner_extended != "Does not have a partner", 1, 0),
        partner_seperate = if_else(partner_extended == "Has a partner, lives seperately", 1, 0),
        partner_together = if_else(partner_extended == "Lives together, unmarried", 1, 0),
        partner_married = if_else(partner_extended == "Married", 1, 0),
        rel_never = if_else(rel_attendance == "Never", 1, 0),
        rel_12_year = if_else(rel_attendance == "1-2 a year", 1, 0),
        rel_311_year = if_else(rel_attendance == "3-11 a year", 1, 0),
        rel_month = if_else(rel_attendance == "Once a month", 1, 0),
        rel_23_month = if_else(rel_attendance == "2-3 a month", 1, 0),
        rel_weekly = if_else(rel_attendance == "Every week", 1, 0),
        rel_mul_weekly = if_else(rel_attendance == "Multiple times a week", 1, 0),
        rel_month_more = if_else(rel_fac_2 == "Monthly or more", 1, 0)
      ) %>%
      rename(size = mean) %>%
      select(
        size,
        ln_size,
        per_ingroup_w,
        #per_ingroup,
        #ada_value,
        Dutch,
        `1st_gen_Turkish`,
        `2nd_gen_Turkish`,
        `1st_gen_Moroccan`,
        `2nd_gen_Moroccan`,
        age,
        female,
        education_recoded_numeric,
        no_partner,
        paid_work,
        in_education,
        rel_never,
        rel_12_year,
        rel_311_year,
        rel_month_more,
        therm_outgroup_diff,
        per_NW_mig_a_pc4,
        woz_pc4,
        Inwoner_pc4
      ) %>%
      psych::describe() %>% 
      round(., digits = 3)
  }
  else {
    desstats <- x %>%
      mutate(
        ln_size = log(mean),
        age = as.numeric(age),
        female = if_else(gender == "Female", 1, 0),
        gender_other = if_else(gender == "Other", 1, 0),
        no_partner = if_else(partner_extended != "Does not have a partner", 1, 0),
        partner_seperate = if_else(partner_extended == "Has a partner, lives seperately", 1, 0),
        partner_together = if_else(partner_extended == "Lives together, unmarried", 1, 0),
        partner_married = if_else(partner_extended == "Married", 1, 0),
        rel_never = if_else(rel_attendance == "Never", 1, 0),
        rel_12_year = if_else(rel_attendance == "1-2 a year", 1, 0),
        rel_311_year = if_else(rel_attendance == "3-11 a year", 1, 0),
        rel_month = if_else(rel_attendance == "Once a month", 1, 0),
        rel_23_month = if_else(rel_attendance == "2-3 a month", 1, 0),
        rel_weekly = if_else(rel_attendance == "Every week", 1, 0),
        rel_mul_weekly = if_else(rel_attendance == "Multiple times a week", 1, 0),
        rel_month_more = if_else(rel_fac_2 == "Monthly or more", 1, 0)
      ) %>%
      rename(size = mean) %>%
      select(
        size,
        ln_size,
        per_ingroup_w,
        #ada_value,
        age,
        female,
        education_recoded_numeric,
        no_partner,
        paid_work,
        in_education,
        rel_never,
        rel_12_year,
        rel_311_year,
        rel_month_more,
        therm_outgroup_diff,
        per_NW_mig_a_pc4,
        woz_pc4,
        Inwoner_pc4
      ) %>%
      psych::describe() %>% 
      round(., digits = 3)
  }
  
}

var_names_sample <- c(
  "Network size",
  "Network size (log)",
  "Ethnic homogeneity",
  "Age",
  "Gender [Female]",
  "Educational attainment",
  "Partner [Has a partner]",
  "Paid work",
  "Currently in education",
  "Religious attendance[Never]",
  "Religious attendance[1-2 year]",
  "Religious attendance[3-11 a year]",
  "Religious attendance[Once a month or more]",
  "Outgroup attitude",
  "Neighbourhood: % Non-western migrant",
  "Neighbourhood: mean property valuation",
  "Neighbourhood: # inhabitants"
)

var_names_full<- c(
  "Network size",
  "Network size (log)",
  "Ethnic homogeneity",
  "Group[Dutch majority]",
  "Group[1st gen Turkish-Dutch]",
  "Group[2nd gen Turkish-Dutch]",
  "Group[1st gen Moroccan-Dutch]",
  "Group[2nd gen Moroccan-Dutch]",
  "Age",
  "Gender [Female]",
  "Educational attainment",
  "Partner [Has a partner]",
  "Paid work",
  "Currently in education",
  "Religious attendance[Never]",
  "Religious attendance[1-2 year]",
  "Religious attendance[3-11 a year]",
  "Religious attendance[Once a month or more]",
  "Outgroup attitude",
  "Neighbourhood: % Non-western migrant",
  "Neighbourhood: mean property valuation",
  "Neighbourhood: # inhabitants"
)

desstats <- imp %>% 
  prepare_desstats(x = ., full.sample = T)

desstats_sample <- imp %>% 
  group_split(migration_background_fac) %>% 
  map(.x = ., .f = ~ prepare_desstats(x = .x, full.sample = F)) %>% 
  map(.x = ., .f = ~ dplyr::select(.data = .x, mean, sd))


#custom function to set rownames in pipe.
set_rownames <- function(df,names) {
row.names(df) <- names
return(df)
}
```

## Descriptive statistics table full sample

```{r descriptive statistics full sample}
desstats %>% 
  bind_cols(.name_repair = "minimal") %>% 
  select(mean, sd, median, min, max) %>% 
set_rownames(df = ., names = var_names_full) %>% 
    kbl(row.names = T) %>% 
  kable_classic()
  
```

## Descriptive statistics by group

```{r descriptive statistics by group}
desstats_sample %>% 
  bind_cols(.name_repair = "minimal") %>%
  set_rownames(df = ., names = var_names_sample) %>% 
  kbl(row.names = T) %>% 
  kable_classic() %>%
  add_header_above(c(" " = 1,"Dutch" = 2, "1st gen Turkish" = 2, "2nd gen Turkish" = 2, "1st gen Moroccan" = 2, "2nd gen Moroccan" = 2)) %>% 
   footnote(general = "Size of the different groups ",
           number = c("Dutch: 658", "2nd gen Moroccan: 92",
                      "2nd gen Turkish: 134", "1st gen Turkish: 141",
                      "1st gen Moroccan: 73")) 


```


# Inferential analysis

We now turn to testing group differences. Different methods are used. First we can check group differences with an OLS and dummies. Second, we can use an anova to check the differences. Third, we will use a decomposition analysis to explain group differences. 

## OLS model with dummy
```{r group differecces}
size_group_model <- imp %>%  
  filter(gender != "Other") %>% 
  mutate(
  mean = as.integer(mean)
) %>%
  lm(
    log(mean) ~ migration_background_fac,
    data = .
  )
hom_group_model <- imp %>%
  filter(gender != "Other") %>%
  lm(
    per_ingroup_w ~ migration_background_fac,
    data = .
  )

#show table of OLS results
sjPlot::tab_model(size_group_model,
                  hom_group_model,
          pred.labels = c(
            "Constant",
            "Group[1st gen Turkish-Dutch]",
            "Group[2nd gen Turkish-Dutch]",
            "Group[1st gen Moroccan-Dutch]",
            "Group[2nd gen Moroccan-Dutch]"
            ),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat"),
          p.style = "scientific"
          #file = "reg.doc"
          )


```

## Anova

Size analysis. 
Complete groups.

```{r simple size}
# Compute the analysis of variance
res_aov_size <- aov(log(mean) ~ migration_background_simple_fac, data = imp)

# Summary of the analysis
results_size <- summary(res_aov_size)

#look at group differences
anova_diff_df_size  <- as_tibble(TukeyHSD(res_aov_size)$migration_background_simple_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_size)$migration_background_simple_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference,diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "ln(Size)")

# Compute the analysis of variance
res_aov_hom <- aov(per_ingroup_w ~ migration_background_simple_fac, data = imp)

# Summary of the analysis
results_hom <- summary(res_aov_hom)

#look at group differences
anova_diff_df_hom  <- as_tibble(TukeyHSD(res_aov_hom)$migration_background_simple_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_hom)$migration_background_simple_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference, diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "Homogeneity")

#add sig. indicator
anova_diff_df <- anova_diff_df_size %>% 
  bind_rows(anova_diff_df_hom) %>% 
  mutate(sig = ifelse(p < 0.05, 1, 0),
         sig_f = ifelse(sig == 1, "*", ""))


#create anova plot
anova_diff_simple_plot <- anova_diff_df %>% 
  mutate(diff_fac = fct_rev(factor(rep(1:3,2),
                           levels = 1:3,
                           labels = c("TKY - NED",
                                      "MOR - NED",
                                      "MOR - TKY")
                                      ))

         ) %>% 
  ggplot(aes(x = value, 
             y = diff_fac)) +
  geom_col(aes(fill = variable,
           colour = variable),
           alpha = 0.6) +
  geom_text(aes(label = sig_f),
            size = 6,
            hjust = 1) +
  facet_wrap(vars(variable),
             scales = "free_x") +
    theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "none",
        legend.title = element_blank(),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black")) +
  labs(y = "",
       x = "",
       caption = "")
```

Migrant generations split.

```{r anova size and homogeneity}
# Compute the analysis of variance
res_aov_size <- aov(log(mean) ~ migration_background_fac, data = imp)

# Summary of the analysis
results_size <- summary(res_aov_size)

#look at group differences
anova_diff_df_size  <- as_tibble(TukeyHSD(res_aov_size)$migration_background_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_size)$migration_background_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference,diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "ln(Size)")

# Compute the analysis of variance
res_aov_hom <- aov(per_ingroup_w ~ migration_background_fac, data = imp)

# Summary of the analysis
results_hom <- summary(res_aov_hom)

#look at group differences
anova_diff_df_hom  <- as_tibble(TukeyHSD(res_aov_hom)$migration_background_fac) %>% 
  mutate(Difference = row.names(TukeyHSD(res_aov_hom)$migration_background_fac)) %>% 
  select(5, 1:4) %>% 
  select(Difference, diff, `p adj`) %>% 
  rename(value = diff,
         p = `p adj`) %>% 
  mutate(variable = "Homogeneity")

#add sig. indicator
anova_diff_df <- anova_diff_df_size %>% 
  bind_rows(anova_diff_df_hom) %>% 
  mutate(sig = ifelse(p < 0.05, 1, 0),
         sig_f = ifelse(sig == 1, "*", ""))

#create plot
anova_diff_extended_plot <- anova_diff_df %>% 
  mutate(diff_fac = factor(rep(1:10,2),
                           levels = 1:10,
                           labels = c("1st gen TKY - NED",
                                      "2nd gen TKY - NED",
                                      "1st gen MOR - NED",
                                      "2nd gen MOR - NED",
                                      "2nd gen TKY - 1st gen TKY",
                                      "1st gen MOR - 1st gen TKY",
                                      "2nd gen MOR - 1st gen TKY",
                                      "1st gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 1st gen MOR")

                                      ),
         diff_fac = fct_rev(fct_relevel(diff_fac,
                                "1st gen TKY - NED",
                                      "2nd gen TKY - NED",
                                      "1st gen MOR - NED",
                                      "2nd gen MOR - NED",
                                      "2nd gen TKY - 1st gen TKY",
                                      "2nd gen MOR - 1st gen MOR",
                                      "1st gen MOR - 1st gen TKY",
                                      "2nd gen MOR - 1st gen TKY",
                                      "1st gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 2nd gen TKY")

         )) %>% 
  filter(!diff_fac %in% c("1st gen MOR - 1st gen TKY",
                                      "2nd gen MOR - 1st gen TKY",
                                      "1st gen MOR - 2nd gen TKY",
                                      "2nd gen MOR - 2nd gen TKY")) %>% 
  ggplot(aes(x = value, 
             y = diff_fac)) +
  geom_col(aes(fill = variable,
           colour = variable),
           alpha = 0.6) +
  geom_text(aes(label = sig_f),
            size = 6,
            hjust = 1) +
  facet_wrap(vars(variable),
             scales = "free_x") +
    theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "none",
        legend.title = element_blank(),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black")) +
  labs(y = "",
       x = "Coefficient")
#create combined plot
anova_plot <- anova_diff_simple_plot + 
anova_diff_extended_plot +
  plot_annotation(tag_levels ='a',
                  tag_prefix = '(',
                  tag_suffix = ')',
                  caption = "* p < 0.05") +
  plot_layout(ncol = 1,
              heights = c(1,3))

#show final plot
anova_plot

#save final anova plot
ggsave(anova_plot,
       file = "data_analysis/plots/results/anova_plot.jpg",
       width = 7,
       height = 5,
       dpi = 320)

  
```

## Pooled regression model 

Recreate the pooled regression model

```{r pooled model size}
#estimate size model
size_pooled_model <- imp %>%  
  filter(gender != "Other") %>% 
  mutate(
  mean = as.integer(mean)
) %>%
  lm(
    log(mean) ~ migration_background_fac +
        age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4 +
        therm_outgroup_diff +
      therm_diff_missing,
    data = .
  )

#estiamte homogeneity model
hom_pooled_model <- imp %>%
  filter(gender != "Other") %>%
  lm(
    per_ingroup_w ~ migration_background_fac +
      age +
      gender +
      education_recoded_numeric +
      partner +
      in_education +
      paid_work +
      rel_fac_2 +
      per_NW_mig_a_pc4 +
      woz_pc4 +
      Inwoner_pc4 +
      therm_outgroup_diff +
      therm_diff_missing,
    data = .
  )

#create table
sjPlot::tab_model(size_pooled_model,
                  hom_pooled_model,
          pred.labels = c(
            "Constant",
            "Group[1st gen Turkish-Dutch]",
            "Group[2nd gen Turkish-Dutch]",
            "Group[1st gen Moroccan-Dutch]",
            "Group[2nd gen Moroccan-Dutch]",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Neighborhood: % Non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants",
            "Outgroup attitude",
            "Outgroup attitude missing"
            ),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat"),
          p.style = "stars"
          #file = "reg.doc"
          )
```


# Decomposition analysis

## Decomposition dataprep

Create separate df frames to use in the decomposition analyses.

```{r oaxaca data prep}
#Dutch - TKY 1st gen
data_dutch_turkish_1st <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch - TKY 2nd gen
data_dutch_turkish_2nd <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "2nd gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch - moroccan 2nd gen
data_dutch_moroccan_2nd <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "2nd gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch - moroccan 1st gen
data_dutch_moroccan_1st <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Turkish generation difference
data_turkish_diff <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("1st gen Turkish-Dutch", "2nd gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "2nd gen Turkish-Dutch", 1, 0)
         ) 

#Moroccan generation difference
data_moroccan_diff <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("1st gen Moroccan-Dutch", "2nd gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "2nd gen Moroccan-Dutch", 1, 0)
         )

#Dutch turkish minority
data_dutch_turkish <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Turkish-Dutch","2nd gen Turkish-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#Dutch moroccan minority
data_dutch_moroccan <- imp %>%
  filter(gender != "Other") %>% 
  filter(migration_background_fac %in% c("Dutch Majority", "1st gen Moroccan-Dutch","2nd gen Moroccan-Dutch")) %>% 
  mutate(z = if_else(migration_background_fac == "Dutch Majority", 1, 0)
         ) 

#combine elements in list
data_list <- list(data_dutch_turkish_1st,
                  data_dutch_turkish_2nd,
                  data_dutch_moroccan_2nd,
                  data_dutch_moroccan_1st,
                  data_turkish_diff,
                  data_moroccan_diff,
                  data_dutch_turkish,
                  data_dutch_moroccan)

```

## Size ln
### Decomposition estimation
```{r oaxaca formula size}
#set seed
set.seed(2702)

#create the analysis function
oaxaca_spec_f <- function(x) {
  results <- x  %>%
  oaxaca(
    formula = log(mean) ~ age +
      gender +
      education_recoded_numeric +
      partner +
      in_education +
      paid_work +
      rel_fac_2 +
      per_NW_mig_a_pc4 +
      woz_pc4 +
      Inwoner_pc4 +
      therm_outgroup_diff | z,
  reg.fun = lm,
  data = .)
}

#create model list
oaxaca_model_list  <- map(.x = data_list, 
    .f = ~ oaxaca_spec_f(x = .x))


#create empty lists to store information in
predicted_list <- list()
two_fold_results_plot_list <- list()
two_fold_results_table_list <- list()
two_fold_variables_list <- list()


#select correct information from the model object and store in list
for(i in 1:length(oaxaca_model_list)) { #i = 8
  model_object <- oaxaca_model_list[[i]]
  
  #get the predicted network size of both groups
  predicted_list[[i]] <- tibble(y_a = model_object$y$y.A,
  y_b = model_object$y$y.B,
  y_diff = model_object$y$y.diff,
  model = i)
  
  #get the results of the twofold analysis for the plot
  two_fold_results_plot_list[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == 1) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8) %>% 
    pivot_longer(data = .,
                 cols = 1:8,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
    mutate(model = i)
  
  #get the results of the twofold analysis for the table
  two_fold_results_table_list[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == -2) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8)
  
  #get the results of the twofold analysis for the variable plot
  two_fold_variables_list[[i]] <- model_object$twofold$variables[[6]] %>% 
    as_tibble() %>% 
    mutate(variables = row.names(model_object$twofold$variables[[6]])) %>% 
    filter(group.weight == -2) %>% 
    select(10,2:5) %>% 
    mutate(model = i)
}


```


### Twofold decomposition results table

```{r results tables size}
#extract twofold information
twofold_table <- two_fold_results_table_list %>% 
  bind_rows() %>% 
  filter(row_number()> 4) %>% 
  mutate(model = factor(1:4,
                        levels = 1:4,
                        labels = c(
                          "1st gen Turkish-Dutch -  2nd gen Turkish-Dutch",
                          "1st gen Moroccan-Dutch -  2nd gen Moroccan-Dutch",
                          "Turkish-Dutch -  Dutch-Majority",
                          "Moroccan-Dutch -  Dutch-Majority"
                        ))) %>% 
  select(9,1:4) %>% 
  mutate(z_explained = coef_explained/se_explained,
         z_unexplained = coef_unexplained/se_unexplained) %>% 
  select(model, coef_explained, z_explained, coef_unexplained, z_unexplained)

predicted_df <- predicted_list %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  select(1:3)

#show results in table
twofold_table %>% 
  bind_cols(predicted_df) %>% 
  select(1,6:8, 2:5) %>% 
  kbl(digits = 3,
      align = 'l') %>% 
  kable_paper()



```

### Twofold panel plot

```{r panel plot size, fig.width=12,fig.height = 8}
#create plot df
variables_plot_df <- two_fold_variables_list %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  mutate(model = factor(model, 
                        levels = 5:8,
                        labels = c(
                          "Turkish-Dutch 1st gen - 2nd gen",
                          "Moroccan-Dutch 1st gen - 2nd gen",
                          "Turkish-Dutch -  Dutch-Majority",
                          "Moroccan-Dutch -  Dutch-Majority"
                        ))) %>% 
  rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`) %>%
  pivot_longer(data = .,
                 cols = 2:5,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
  pivot_wider(names_from = var,
              values_from = value) %>% 
  mutate(abs_z = abs(coef/se),
         sig = ifelse(abs_z > 1.95, 1, 0),
         sig_f = factor(sig,
                        levels = 0:1,
                        labels = c("", "*")))

#create vector to filter variables
indep_names <- c(
  "rel_fac_2Monthly or more",
  "rel_fac_23-11 Year",
  "rel_fac_21-2 Year",
  "paid_work",
  "in_education",
  "per_NW_mig_a_pc4",
  "Inwoner_pc4",
  "education_recoded_numeric",
  "therm_outgroup_diff",
  "age",
  "partner",
  "genderFemale",
  "woz_pc4")

#create plot
variables_decomp_size_plot <- variables_plot_df %>% #dataprep
  filter(variables %in% indep_names) %>%
  mutate(
    var_fac = case_when(
      variables == "paid_work" ~ 1,
      variables == "in_education" ~ 2,
      variables == "rel_fac_21-2 Year" ~ 3,
      variables == "rel_fac_23-11 Year" ~ 4,
      variables == "rel_fac_2Monthly or more" ~ 5,
      variables == "education_recoded_numeric" ~ 6,
      variables == "Inwoner_pc4" ~ 7,
      variables == "per_NW_mig_a_pc4" ~ 8,
      variables == "woz_pc4" ~ 9,
      variables == "therm_outgroup_diff" ~ 10,
      variables == "age" ~ 11,
      variables == "partner" ~ 12,
      variables == "genderFemale" ~ 13
    ),
    var_fac = factor(
      var_fac,
      levels = 1:13,
      labels = c(
        "Paid work",
        "Currently in education",
        "Religious attendance[1-2 year]",
        "Religious attendance[3-11 a year]",
        "Religious attendance[Once a month or more]",
        "Educational attainment",
        "Neighbourhood: # inhabitants",
        "Neighbourhood: mean property valuation",
        "Neighbourhood: % non-western migrant",
        "Outgroup attitudes",
        "Age",
        "Partner[no partner]",
        "Gender[Male]"
      )
    )
  ) %>% #plot
  ggplot(aes(x = coef, y = var_fac)) +
  geom_col(aes(colour = name,
             fill = name),
           alpha = 0.6) + 
  geom_errorbar(aes(xmin = coef - (se*1.96),
                    xmax = coef + (se*1.96))) +
  geom_vline(xintercept = 0) +
  geom_text(aes(label = sig_f),
            hjust = 2,
            size = 6) +
  facet_grid(cols = vars(model),
             rows = vars(name)) +
  theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black")) +
  labs(y = "Independent Variables",
       x = "Coefficient",
       caption = "* p < 0.05")

#save plot
ggsave(variables_decomp_size_plot,
        file = "data_analysis/plots/results/variables_decomp_size_plot.jpg",
       width = 12,
       height = 8,
       dpi = 320)

#show plot
variables_decomp_size_plot
```


### Regression model results per group

All (sub)groups

```{r different model results size}
#estimate model for different groups
size_pooled_model <- imp %>%
  filter(gender != "Other") %>%
   group_split(migration_background_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      log(mean) ~ age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4 +
        therm_outgroup_diff,
      data = .x
    )
  )

#Show results in table
sjPlot::tab_model(size_pooled_model,
          pred.labels = c(
            "Constant",
           "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants"
            ),
          dv.labels = c("Dutch", "1st gen Turkish", "2nd gen Turkish", "1st gen Moroccan","2nd gen Moroccan"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )

```

Only major groups.

```{r different model results size simple}
results_list <- imp %>%
  filter(gender != "Other") %>% 
  group_split(migration_background_simple_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      log(mean) ~  age +
        gender + 
        education_recoded_numeric +
        therm_outgroup_diff +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4,
      data = .x
    )
  )

sjPlot::tab_model(results_list,
          pred.labels = c(
            "Constant",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants"
            ),
          dv.labels = c("Dutch","Turkish-Dutch", "Moroccan-Dutch"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )

```

## Ethnic homogeneity

### Decomposition estimation

```{r oaxaca formula homogeneity }
#set seed
set.seed(2702)

#create the analysis function
oaxaca_spec_f <- function(x) {
  results <- x  %>%
  oaxaca(
    formula = per_ingroup_w ~  age +
        gender + 
        education_recoded_numeric +
        therm_outgroup_diff +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4 | z ,
  reg.fun = lm,
  data = .)
}

#create model list
oaxaca_model_list_homogeneity  <- map(.x = data_list, 
    .f = ~ oaxaca_spec_f(x = .x))

predicted_list_hom <- list()
two_fold_results_plot_list_hom <- list()
two_fold_results_table_list_hom <- list()
two_fold_variables_list_hom <- list()

#select correct information from results
for(i in 1:length(oaxaca_model_list_homogeneity)) { #i = 8
  model_object <- oaxaca_model_list_homogeneity[[i]]
  
  predicted_list_hom[[i]] <- tibble(y_a = model_object$y$y.A,
  y_b = model_object$y$y.B,
  y_diff = model_object$y$y.diff,
  model = i)
  
  two_fold_results_plot_list_hom[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == -2) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8) %>% 
    pivot_longer(data = .,
                 cols = 1:8,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
    mutate(model = i)
  
  two_fold_results_table_list_hom[[i]] <- model_object$twofold$overall %>% 
    as_tibble() %>% 
    filter(group.weight == -2) %>% 
    rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`,
           coef_unexplained.a = `coef(unexplained A)`,
           se_unexplained.a = `se(unexplained A)`,
           coef_unexplained.b = `coef(unexplained B)`,
           se_unexplained.b = `se(unexplained B)`) %>% 
    select(-group.weight) %>% 
    select(1:8)
  
  
  two_fold_variables_list_hom[[i]] <- model_object$twofold$variables[[6]] %>% 
    as_tibble() %>% 
    mutate(variables = row.names(model_object$twofold$variables[[2]])) %>% 
    filter(group.weight == -2) %>% 
    select(10,2:5) %>% 
    mutate(model = i)
  
}


```

### Decomposition results table

```{r results table homogeneity}
#extract twofold information
twofold_table_hom <- two_fold_results_table_list_hom %>% 
  bind_rows() %>% 
  filter(row_number()> 4) %>% 
  mutate(model = factor(1:4,
                        levels = 1:4,
                        labels = c(
                          "1st gen Turkish -  2nd gen Turkish",
                          "1st gen Moroccan -  2nd gen Moroccan",
                          "Turkish -  Dutch-Majority",
                          "Moroccan -  Dutch-Majority"
                        ))) %>% 
  select(9,1:4) %>% 
  mutate(z_explained = coef_explained/se_explained,
         z_unexplained = coef_unexplained/se_unexplained) %>% 
  select(model,coef_explained, z_explained, coef_unexplained, z_unexplained)


predicted_df_hom <- predicted_list_hom %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  select(1:3)

#create resuls table
twofold_table_hom %>% 
  bind_cols(predicted_df_hom) %>% 
  select(1,6:8, 2:5) %>% 
    kbl(digits = 3,
      align = 'l') %>% 
  kable_paper()

```

### Decomposition results plot

```{r panel plot hom, fig.width=12,fig.height = 8}
#create plot df
variables_plot_df_hom <- two_fold_variables_list_hom %>% 
  bind_rows() %>% 
  filter(model > 4) %>% 
  mutate(model = factor(model, 
                        levels = 5:8,
                        labels = c(
                          "Turkish-Dutch 1st gen - 2nd gen",
                          "Moroccan-Dutch 1st gen - 2nd gen",
                          "Turkish-Dutch -  Dutch-Majority",
                          "Moroccan-Dutch -  Dutch-Majority"
                        ))) %>% 
  rename(coef_explained = `coef(explained)`,
           se_explained = `se(explained)`,
           coef_unexplained = `coef(unexplained)`,
           se_unexplained = `se(unexplained)`) %>%
  pivot_longer(data = .,
                 cols = 2:5,
                 names_to = c("var", "name"),
                 values_to = "value",
                 names_sep = "_") %>% 
  pivot_wider(names_from = var,
              values_from = value) %>% 
  mutate(abs_z = abs(coef/se),
         sig = ifelse(abs_z > 1.95, 1, 0),
         sig_f = factor(sig,
                        levels = 0:1,
                        labels = c("", "*")))

#create selection variable
indep_names <- c(
  "rel_fac_2Monthly or more",
  "rel_fac_23-11 Year",
  "rel_fac_21-2 Year",
  "paid_work",
  "in_education",
  "per_NW_mig_a_pc4",
  "Inwoner_pc4",
  "education_recoded_numeric",
  "therm_outgroup_diff",
  "age",
  "partner",
  "genderFemale",
  "woz_pc4")

#create decomposition homogeneity plot
variables_decomp_hom_plot <- variables_plot_df_hom %>%
  filter(variables %in% indep_names) %>%
  mutate(
    var_fac = case_when(
      variables == "paid_work" ~ 1,
      variables == "in_education" ~ 2,
      variables == "rel_fac_21-2 Year" ~ 3,
      variables == "rel_fac_23-11 Year" ~ 4,
      variables == "rel_fac_2Monthly or more" ~ 5,
      variables == "education_recoded_numeric" ~ 6,
      variables == "Inwoner_pc4" ~ 7,
      variables == "per_NW_mig_a_pc4" ~ 8,
      variables == "woz_pc4" ~ 9,
      variables == "therm_outgroup_diff" ~ 10,
      variables == "age" ~ 11,
      variables == "partner" ~ 12,
      variables == "genderFemale" ~ 13
    ),
    var_fac = factor(
      var_fac,
      levels = 1:13,
      labels = c(
        "Paid work",
        "Currently in education",
        "Religious attendance[1-2 year]",
        "Religious attendance[3-11 a year]",
        "Religious attendance[Once a month or more]",
        "Educational attainment",
        "Neighbourhood: # inhabitants",
        "Neighbourhood: % non-western migrant",
        "Neighbourhood: mean property valuation",
        "Outgroup attitudes",
        "Age",
        "Partner[no partner]",
        "Gender[Male]"
      )
    )
  ) %>% 
  ggplot(aes(x = coef, y = var_fac)) +
  geom_col(aes(colour = name,
             fill = name),
           alpha = 0.6) + 
  geom_errorbar(aes(xmin = coef - (se*1.96),
                    xmax = coef + (se*1.96))) +
  facet_grid(cols = vars(model),
             rows = vars(name)) +
  geom_vline(xintercept = 0) +
  geom_text(aes(label = sig_f),
            hjust = 2,
            size = 6) +
  theme_minimal() +
  scale_fill_viridis_d(option = "D") +
  scale_colour_viridis_d(option = "D") +
  theme(legend.position = "bottom",
        #plot.background = element_rect(fill = '#404040', colour = '#404040'),
        #panel.background = element_rect(fill = '#404040', colour = '#404040'),
        axis.title.y = element_text(hjust = 1),
        axis.title.x = element_text(hjust = 1),
        text = element_text(colour = "black"),
        strip.text = element_text(colour = "black"),
        legend.title = element_blank()) +
  labs(y = "Independent Variables",
       x = "Coefficient",
       caption = "* p < 0.05")

#save plot
  ggsave(variables_decomp_hom_plot,
        file = "data_analysis/plots/results/variables_decomp_hom_plot.jpg",
       width = 12,
       height = 8,
       dpi = 320)

  #show plot
variables_decomp_hom_plot

```

### Regression model results per group

All (sub)groups

```{r different model results homogeneity gen diff}
#create list with regression by migration group
results_list <- imp %>%
  filter(gender != "Other") %>% 
  group_split(migration_background_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      per_ingroup_w ~  age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        therm_outgroup_diff +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4,
      data = .x
    )
  )

#show table with results
sjPlot::tab_model(results_list,
          pred.labels = c(
            "Constant",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % non-western migrant",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants"
            ),
          dv.labels = c("Dutch","1st gen Turkish", "2nd gen Turkish", "1st gen Moroccan", "2nd gen Moroccan"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )
```

All major groups.

```{r different model results homogeneity simple}
#list withr results of each group
results_list <- imp %>%
  filter(gender != "Other") %>% 
  group_split(migration_background_simple_fac) %>%
  map(
    .x = .,
    .f = ~ lm(
      per_ingroup_w ~  age +
        gender + 
        education_recoded_numeric +
        partner + 
        in_education +
        paid_work +
        rel_fac_2 +
        therm_outgroup_diff +
        per_NW_mig_a_pc4 +
        woz_pc4 +
        Inwoner_pc4,
      data = .x
    )
  )

#show results in a table
sjPlot::tab_model(results_list,
          pred.labels = c(
            "Constant",
            "Age",
            "Gender [Fenale]",
            "Educational attainment",
            "Partner[Has a partner]",
            "Paid work",
            "Currently in education",
            "Religious attendance[1-2 year]",
            "Religious attendance[3-11 a year]",
            "Religious attendance[Once a month or more]",
            "Outgroup attitude",
            "Neighborhood: % co-ethnic",
            "Neighborhood: mean property valuation",
            "Neighborhood: # inhabitants",
            "Therm missing"
            ),
          dv.labels = c("Dutch","Turkish-Dutch", "Moroccan-Dutch"),
          digits = 3,
          show.ci = F,
          show.stat = T,
          collapse.se  = F,
          col.order = c("est", "stat")
          #file = "reg.doc"
          )

```


Copyright © 2024 Jeroense Thijmen