Data preparation script NELLS module for analyses

Libraries

# Attach every package named in `packages`, installing a package first when
# it cannot be attached.
#
# packages: character vector of package names.
# Returns the list produced by lapply(), one element per package.
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  attach_or_install <- function(pkg) {
    is_available <- require(pkg, character.only = TRUE)
    if (!is_available) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }
  lapply(packages, FUN = attach_or_install)
}
# Packages used by this script; fpackage.check() attaches each of them.
packages <- c("readr", "tidyverse")
fpackage.check(packages)

NELLS combined import

Import the NELLS datafile with selected neighbourhood variables.

# The file is restored with load(), so despite its .rds extension it is a
# save()-format file. Presumably it provides `nells_nsum`, which the rest of
# the script uses without defining it -- TODO confirm.
load("data_analysis/data/data_processed/nells_data/2022-05-08_nells_nsum_data.rds")

Clean NSUM module data

First select the correct NSUM question. For the cleaning we use the variables which still have the original coding. Then we listwise delete missing values from these data. By doing so we go from 1595 responses to 1240.

# Select the NSUM items in their original coding: the id plus every "knows"
# column that does not end in "boundary". Rows are ordered on the personal id
# code and rows with any missing value are dropped (listwise deletion).
raw_nsum_module <- nells_nsum %>%
  arrange(id) %>%
  select(id, starts_with("knows") & !ends_with("boundary")) %>%
  na.omit()

The second task is to delete responses which have an SD of zero across the name variables or across the category variables. These are respondents who filled in the same response for each X question. We lose 45 responses for names and 15 for categories. After deleting those responses we are left with 1187 responses.

# Create filter: per respondent, the standard deviation of the answers over
# two positional column ranges of raw_nsum_module.
# NOTE(review): the ranges are positional. 2:12 spans 11 columns while the
# script works with 12 name items elsewhere, and 3:24 overlaps 2:12 instead of
# covering only the category items. Confirm against the column order of
# raw_nsum_module before relying on the counts reported in the text.
raw_nsum_module <- raw_nsum_module %>%
  rowwise() %>% # one sd per row
  mutate(
    sd_names = sd(c_across(2:12), na.rm = TRUE),
    sd_categories = sd(c_across(3:24), na.rm = TRUE)
  ) %>%
  ungroup()

# How many do you lose this way?
# raw_nsum_module %>% #N = 45 for names
#   filter(sd_names == 0) %>%
#   count()

# raw_nsum_module %>% #N = 15 for categories
#   filter(sd_categories == 0) %>%
#   count()

# Filter: drop respondents whose answers do not vary at all.
raw_nsum_module <- raw_nsum_module %>%
  filter(sd_categories != 0, sd_names != 0)

The third cleaning task is to filter out responses that have a high frequency of 50/35/15. We chose to filter out respondents who indicate that they know 35 people for more than two names, or 50 people for more than one name. We remove 8 responses this way.

# Count how often `value` occurs in `object`, ignoring missing values.
#
# object: vector of responses.
# value:  the response value to count.
# Returns the number of non-missing elements of `object` equal to `value`,
# as a visible integer (0 for an empty or all-NA `object`).
count_frequency_value_f <- function(object, value) {
  # !is.na() masks the NA that `==` yields for missing elements
  sum(!is.na(object) & object == value, na.rm = TRUE)
}

# The twelve name items over which the response values are counted.
name_items <- c(
  "knows_daan", "knows_kevin", "knows_edwin", "knows_albert",
  "knows_emma", "knows_linda", "knows_ingrid", "knows_willemina",
  "knows_mohammed", "knows_esra", "knows_ibrahim", "knows_fatima"
)

# Per respondent, count how many name items were answered with 50, 35 and 15.
# ungroup() drops the rowwise grouping again so later verbs work on a plain
# data frame.
raw_nsum_module <- raw_nsum_module %>%
  rowwise() %>%
  mutate(
    count_50_names = count_frequency_value_f(
      object = c_across(all_of(name_items)),
      value = 50
    ),
    count_35_names = count_frequency_value_f(
      object = c_across(all_of(name_items)),
      value = 35
    ),
    count_15_names = count_frequency_value_f(
      object = c_across(all_of(name_items)),
      value = 15
    )
  ) %>%
  ungroup()

# Keep the ids of respondents who answered 35 for at most two names and 50
# for at most one name; everyone else is removed.
selection <- raw_nsum_module %>%
  filter(count_35_names < 3, count_50_names < 2) %>%
  pull(id)

# Restrict the NELLS NSUM file to the selected respondents, ordered on id.
nells_nsum <- arrange(
  filter(nells_nsum, id %in% selection),
  id
)

Ethnic names

# Combine the four ethnic name items into one weighted item. Each weight is a
# count divided by 20233, the sum of the four counts
# (13448 + 2808 + 1878 + 2099), so the weights add up to 1.
# NOTE(review): because the weights already sum to 1, the final /4 shrinks the
# weighted mean by a factor of four -- confirm this is intended.
# NOTE(review): the inputs are the knows_* columns without the "_boundary"
# suffix, while the result is named *_boundary -- confirm the intended inputs.
nells_nsum <- nells_nsum %>% 
  mutate(knows_ethnic_boundary = ((knows_mohammed*(13448/20233)) + (knows_fatima*(2808/20233)) + (knows_esra*(1878/20233)) + (knows_ibrahim*(2099/20233)))/4) 

Create NSUM module

We do not use all of the questions from the NSUM module. We created a module with more names, and also more categories that are specific to people with a migration background, than we actually need or use. If we use more names or categories that measure knowing so-called migrants, we inflate the networks of these groups. So we come to a final selection of 16 populations for the main NSUM analysis.

# Relabel the detailed migration background and derive a simplified version.
# Step 1 puts the five listed levels first; step 2 keeps only level positions
# 1 to 5 (anything else becomes NA) and gives them new labels.
detailed_labels <- c(
  "Dutch-Majority",
  "1st gen Turkish-Dutch",
  "2nd gen Turkish-Dutch",
  "1st gen Moroccan-Dutch",
  "2nd gen Moroccan-Dutch"
)

nells_nsum <- nells_nsum %>%
  mutate(
    migration_background_fac = fct_relevel(
      migration_background_fac,
      "Dutch", "1st gen Turkish", "2nd gen Turkish",
      "1st gen Moroccan", "2nd gen Moroccan"
    ),
    migration_background_fac = factor(
      as.numeric(migration_background_fac),
      levels = 1:5,
      labels = detailed_labels
    )
  ) %>%
  # Step 3: collapse the two generations into one group per origin.
  mutate(
    migration_background_simple_fac = case_when(
      migration_background_fac %in%
        c("1st gen Turkish-Dutch", "2nd gen Turkish-Dutch") ~ 2,
      migration_background_fac %in%
        c("1st gen Moroccan-Dutch", "2nd gen Moroccan-Dutch") ~ 3,
      migration_background_fac == "Dutch-Majority" ~ 1
    ),
    migration_background_simple_fac = factor(
      migration_background_simple_fac,
      levels = 1:3,
      labels = c("Dutch-Majority", "Turkish-Dutch", "Moroccan-Dutch")
    )
  )

Extract information for NSUM analysis.

# Build the input of the main NSUM analysis: a response matrix plus the known
# population sizes, and save both to disk.

# Columns of the response matrix, in model order.
main_columns <- c(
  "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
  "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
  "knows_ingrid_boundary", "knows_willemina_boundary",
  "knows_ibrahim_boundary", "knows_prison_boundary",
  "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
  "knows_secondhome", "knows_unemployed"
)

# Select observations from nells_nsum, ordered on id.
nsum_model_selection <- nells_nsum %>%
  arrange(id) %>%
  select(all_of(main_columns))

# Observations as a matrix.
mat <- as.matrix(nsum_model_selection)

# Known population information: import frequencies.
nsum_frequencies <- read_csv(file = "data_analysis/2022-08-26_namefrequencies.csv")

# The frequency file labels populations without the "_boundary" suffix.
main_populations <- sub("_boundary$", "", main_columns)

# Look the sizes up by name so that known_vector[i] belongs to column i of
# mat, whatever the row order of the csv file is.
main_rows <- match(main_populations, nsum_frequencies[["name"]])
if (anyNA(main_rows)) {
  stop(
    "No known population size for: ",
    paste(main_populations[is.na(main_rows)], collapse = ", "),
    call. = FALSE
  )
}
known_vector <- nsum_frequencies[["number"]][main_rows]
##########################

# Export analysis matrix and known vector.
# NOTE: save() writes the RData format, so this file must be read back with
# load() even though its name ends in .rds.
analysis_files_selection <- list(mat, known_vector)

save(analysis_files_selection, file = "data_analysis/data/data_processed/nsum_input/2023-04-11_analysis_files_selection.rds")
# Second NSUM input: identical to the main one, except that the Ibrahim item
# is replaced by the combined ethnic item (knows_ethnic_boundary).

# Columns of the response matrix, in model order.
columns_2 <- c(
  "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
  "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
  "knows_ingrid_boundary", "knows_willemina_boundary",
  "knows_ethnic_boundary", "knows_prison_boundary",
  "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
  "knows_secondhome", "knows_unemployed"
)

# Select observations from nells_nsum, ordered on id.
nsum_model_selection_2 <- nells_nsum %>%
  arrange(id) %>%
  select(all_of(columns_2))

# Observations as a matrix.
mat_2 <- as.matrix(nsum_model_selection_2)

# The combined ethnic item takes the slot of "knows_ibrahim" in the frequency
# table, with its size overwritten by 20233.
frequencies_2 <- nsum_frequencies %>%
  mutate(number = ifelse(name == "knows_ibrahim", 20233, number))

populations_2 <- sub("_boundary$", "", columns_2)
populations_2[populations_2 == "knows_ethnic"] <- "knows_ibrahim"

# Look the sizes up by name so that known_vector_2[i] belongs to column i of
# mat_2, whatever the row order of the frequency table is.
rows_2 <- match(populations_2, frequencies_2[["name"]])
if (anyNA(rows_2)) {
  stop(
    "No known population size for: ",
    paste(populations_2[is.na(rows_2)], collapse = ", "),
    call. = FALSE
  )
}
known_vector_2 <- frequencies_2[["number"]][rows_2]
##########################

# Export analysis matrix and known vector.
# NOTE: save() writes the RData format, so this file must be read back with
# load() even though its name ends in .rds.
analysis_files_selection_2 <- list(mat_2, known_vector_2)

save(analysis_files_selection_2, file = "data_analysis/data/data_processed/nsum_input/2023-04-11_analysis_files_selection_2.rds")

Create model file for each subgroup

Full model

# Prepare the NSUM input for the full model (names plus categories).
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_full_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
    "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
    "knows_ingrid_boundary", "knows_willemina_boundary",
    "knows_ibrahim_boundary", "knows_prison_boundary",
    "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
    "knows_secondhome", "knows_unemployed"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Create list for loop: one data frame per simplified migration background.
# Respondents without a simplified background (NA) are kept out of the
# subgroups explicitly instead of being overwritten by position.
group_list <- nells_nsum %>%
  filter(!is.na(migration_background_simple_fac)) %>%
  group_split(migration_background_simple_fac)

# The names assigned below are only correct when all three subgroups exist.
stopifnot(length(group_list) == 3L)

# Add the full sample as the fourth element.
group_list <- c(as.list(group_list), list(nells_nsum))

# Apply the preparation function to every group and save the result.
full_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_full_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Add names.
names(full_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)

Model with only categories

# Prepare the NSUM input for the model that uses only the category items.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_category_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_prison_boundary",
    "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
    "knows_secondhome", "knows_unemployed"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the category-only preparation on every element of group_list.
cat_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_category_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(cat_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)

Model with only ethnic categories

# Prepare the NSUM input for the model that uses only the four ethnic names.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_ethnicnames_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_mohammed_boundary", "knows_fatima_boundary",
    "knows_esra_boundary", "knows_ibrahim_boundary"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the ethnic-names preparation on every element of group_list.
ethnicnames_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_ethnicnames_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(ethnicnames_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)

Model with only majority categories

# Prepare the NSUM input for the model that uses only the eight majority names.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_maj_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
    "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
    "knows_ingrid_boundary", "knows_willemina_boundary"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the majority-names preparation on every element of group_list.
maj_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_maj_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(maj_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)

Model with all names

# Prepare the NSUM input for the model that uses all twelve name items.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_names_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
    "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
    "knows_ingrid_boundary", "knows_willemina_boundary",
    "knows_mohammed_boundary", "knows_fatima_boundary",
    "knows_esra_boundary", "knows_ibrahim_boundary"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the all-names preparation on every element of group_list.
names_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_names_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(names_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)

Combine robustness data

# Bundle the five model inputs in one list, named after the objects they
# come from, and write the bundle to disk.
robustness_dfs <- list(
  full_model_groups = full_model_groups,
  cat_model_groups = cat_model_groups,
  ethnicnames_model_groups = ethnicnames_model_groups,
  maj_model_groups = maj_model_groups,
  names_model_groups = names_model_groups
)

save(
  robustness_dfs,
  file = "data_analysis/data/data_processed/nsum_input/2023-04-21_robustness-dfs.rda"
)

Export data

# Export the prepared data file.
# NOTE: save() writes the RData format, so this file must be read back with
# load() even though its name ends in .rds.
save(
  list = "nells_nsum",
  file = "data_analysis/data/data_processed/nells_data/2023-05-08_nells-nsum-prepped-data.rds"
)
---
title: "NSUM preparation"
author: "Thijmen Jeroense"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    toc: TRUE
    toc_depth: 3
    toc_float: TRUE
    code_folding: show
    code_download: TRUE
---

```{r setup, include=FALSE}
# Default chunk options for the whole document.
knitr::opts_chunk$set(
  cache = TRUE,
  message = FALSE,
  warning = FALSE,
  results = "asis",
  fig.align = "center"
)
```


# Data preparation script NELLS module for analyses

## Libraries

```{r libraries, results='hide'}
# Attach every package named in `packages`, installing a package first when
# it cannot be attached.
#
# packages: character vector of package names.
# Returns the list produced by lapply(), one element per package.
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  attach_or_install <- function(pkg) {
    is_available <- require(pkg, character.only = TRUE)
    if (!is_available) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }
  lapply(packages, FUN = attach_or_install)
}
# Packages used by this script; fpackage.check() attaches each of them.
packages <- c("readr", "tidyverse")
fpackage.check(packages)
```

## NELLS combined import

Import the NELLS datafile with selected neighbourhood variables.

```{r data import}
# The file is restored with load(), so despite its .rds extension it is a
# save()-format file. Presumably it provides `nells_nsum`, which the rest of
# the script uses without defining it -- TODO confirm.
load("data_analysis/data/data_processed/nells_data/2022-05-08_nells_nsum_data.rds")
```


# Clean NSUM module data

First select the correct NSUM question. For the cleaning we use the variables which still have the original coding. Then we listwise delete missing values from these data. By doing so we go from 1595 responses to 1240. 

```{r listwise deletion}
# Select the NSUM items in their original coding: the id plus every "knows"
# column that does not end in "boundary". Rows are ordered on the personal id
# code and rows with any missing value are dropped (listwise deletion).
raw_nsum_module <- nells_nsum %>%
  arrange(id) %>%
  select(id, starts_with("knows") & !ends_with("boundary")) %>%
  na.omit()

```


The second task is to delete responses which have an SD of zero across the name variables or across the category variables. These are respondents who filled in the same response for each X question. We lose 45 responses for names and 15 for categories. After deleting those responses we are left with 1187 responses.

```{r deselect on sd}
# Create filter: per respondent, the standard deviation of the answers over
# two positional column ranges of raw_nsum_module.
# NOTE(review): the ranges are positional. 2:12 spans 11 columns while the
# script works with 12 name items elsewhere, and 3:24 overlaps 2:12 instead of
# covering only the category items. Confirm against the column order of
# raw_nsum_module before relying on the counts reported in the text.
raw_nsum_module <- raw_nsum_module %>%
  rowwise() %>% # one sd per row
  mutate(
    sd_names = sd(c_across(2:12), na.rm = TRUE),
    sd_categories = sd(c_across(3:24), na.rm = TRUE)
  ) %>%
  ungroup()

# How many do you lose this way?
# raw_nsum_module %>% #N = 45 for names
#   filter(sd_names == 0) %>%
#   count()

# raw_nsum_module %>% #N = 15 for categories
#   filter(sd_categories == 0) %>%
#   count()

# Filter: drop respondents whose answers do not vary at all.
raw_nsum_module <- raw_nsum_module %>%
  filter(sd_categories != 0, sd_names != 0)

```

The third cleaning task is to filter out responses that have a high frequency of 50/35/15. We chose to filter out respondents who indicate that they know 35 people for more than two names, or 50 people for more than one name. We remove 8 responses this way.

```{r check high frequencies}

# Count how often `value` occurs in `object`, ignoring missing values.
#
# object: vector of responses.
# value:  the response value to count.
# Returns the number of non-missing elements of `object` equal to `value`,
# as a visible integer (0 for an empty or all-NA `object`).
count_frequency_value_f <- function(object, value) {
  # !is.na() masks the NA that `==` yields for missing elements
  sum(!is.na(object) & object == value, na.rm = TRUE)
}

# The twelve name items over which the response values are counted.
name_items <- c(
  "knows_daan", "knows_kevin", "knows_edwin", "knows_albert",
  "knows_emma", "knows_linda", "knows_ingrid", "knows_willemina",
  "knows_mohammed", "knows_esra", "knows_ibrahim", "knows_fatima"
)

# Per respondent, count how many name items were answered with 50, 35 and 15.
# ungroup() drops the rowwise grouping again so later verbs work on a plain
# data frame.
raw_nsum_module <- raw_nsum_module %>%
  rowwise() %>%
  mutate(
    count_50_names = count_frequency_value_f(
      object = c_across(all_of(name_items)),
      value = 50
    ),
    count_35_names = count_frequency_value_f(
      object = c_across(all_of(name_items)),
      value = 35
    ),
    count_15_names = count_frequency_value_f(
      object = c_across(all_of(name_items)),
      value = 15
    )
  ) %>%
  ungroup()

# Keep the ids of respondents who answered 35 for at most two names and 50
# for at most one name; everyone else is removed.
selection <- raw_nsum_module %>%
  filter(count_35_names < 3, count_50_names < 2) %>%
  pull(id)

# Restrict the NELLS NSUM file to the selected respondents, ordered on id.
nells_nsum <- arrange(
  filter(nells_nsum, id %in% selection),
  id
)
```

# Ethnic names
```{r ethnicity weighted}
# Combine the four ethnic name items into one weighted item. Each weight is a
# count divided by 20233, the sum of the four counts
# (13448 + 2808 + 1878 + 2099), so the weights add up to 1.
# NOTE(review): because the weights already sum to 1, the final /4 shrinks the
# weighted mean by a factor of four -- confirm this is intended.
# NOTE(review): the inputs are the knows_* columns without the "_boundary"
# suffix, while the result is named *_boundary -- confirm the intended inputs.
nells_nsum <- nells_nsum %>% 
  mutate(knows_ethnic_boundary = ((knows_mohammed*(13448/20233)) + (knows_fatima*(2808/20233)) + (knows_esra*(1878/20233)) + (knows_ibrahim*(2099/20233)))/4) 

```


# Create NSUM module

We do not use all of the questions from the NSUM module. We created a module with more names, and also more categories that are specific to people with a migration background, than we actually need or use. If we use more names or categories that measure knowing so-called migrants, we inflate the networks of these groups. So we come to a final selection of 16 populations for the main NSUM analysis.


```{r migration dataprep}
# Relabel the detailed migration background and derive a simplified version.
# Step 1 puts the five listed levels first; step 2 keeps only level positions
# 1 to 5 (anything else becomes NA) and gives them new labels.
detailed_labels <- c(
  "Dutch-Majority",
  "1st gen Turkish-Dutch",
  "2nd gen Turkish-Dutch",
  "1st gen Moroccan-Dutch",
  "2nd gen Moroccan-Dutch"
)

nells_nsum <- nells_nsum %>%
  mutate(
    migration_background_fac = fct_relevel(
      migration_background_fac,
      "Dutch", "1st gen Turkish", "2nd gen Turkish",
      "1st gen Moroccan", "2nd gen Moroccan"
    ),
    migration_background_fac = factor(
      as.numeric(migration_background_fac),
      levels = 1:5,
      labels = detailed_labels
    )
  ) %>%
  # Step 3: collapse the two generations into one group per origin.
  mutate(
    migration_background_simple_fac = case_when(
      migration_background_fac %in%
        c("1st gen Turkish-Dutch", "2nd gen Turkish-Dutch") ~ 2,
      migration_background_fac %in%
        c("1st gen Moroccan-Dutch", "2nd gen Moroccan-Dutch") ~ 3,
      migration_background_fac == "Dutch-Majority" ~ 1
    ),
    migration_background_simple_fac = factor(
      migration_background_simple_fac,
      levels = 1:3,
      labels = c("Dutch-Majority", "Turkish-Dutch", "Moroccan-Dutch")
    )
  )
```

Extract information for NSUM analysis. 

```{r nsum input prep}
# Build the input of the main NSUM analysis: a response matrix plus the known
# population sizes, and save both to disk.

# Columns of the response matrix, in model order.
main_columns <- c(
  "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
  "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
  "knows_ingrid_boundary", "knows_willemina_boundary",
  "knows_ibrahim_boundary", "knows_prison_boundary",
  "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
  "knows_secondhome", "knows_unemployed"
)

# Select observations from nells_nsum, ordered on id.
nsum_model_selection <- nells_nsum %>%
  arrange(id) %>%
  select(all_of(main_columns))

# Observations as a matrix.
mat <- as.matrix(nsum_model_selection)

# Known population information: import frequencies.
nsum_frequencies <- read_csv(file = "data_analysis/2022-08-26_namefrequencies.csv")

# The frequency file labels populations without the "_boundary" suffix.
main_populations <- sub("_boundary$", "", main_columns)

# Look the sizes up by name so that known_vector[i] belongs to column i of
# mat, whatever the row order of the csv file is.
main_rows <- match(main_populations, nsum_frequencies[["name"]])
if (anyNA(main_rows)) {
  stop(
    "No known population size for: ",
    paste(main_populations[is.na(main_rows)], collapse = ", "),
    call. = FALSE
  )
}
known_vector <- nsum_frequencies[["number"]][main_rows]
##########################

# Export analysis matrix and known vector.
# NOTE: save() writes the RData format, so this file must be read back with
# load() even though its name ends in .rds.
analysis_files_selection <- list(mat, known_vector)

save(analysis_files_selection, file = "data_analysis/data/data_processed/nsum_input/2023-04-11_analysis_files_selection.rds")
```


```{r nsum input prep 2}
# Second NSUM input: identical to the main one, except that the Ibrahim item
# is replaced by the combined ethnic item (knows_ethnic_boundary).

# Columns of the response matrix, in model order.
columns_2 <- c(
  "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
  "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
  "knows_ingrid_boundary", "knows_willemina_boundary",
  "knows_ethnic_boundary", "knows_prison_boundary",
  "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
  "knows_secondhome", "knows_unemployed"
)

# Select observations from nells_nsum, ordered on id.
nsum_model_selection_2 <- nells_nsum %>%
  arrange(id) %>%
  select(all_of(columns_2))

# Observations as a matrix.
mat_2 <- as.matrix(nsum_model_selection_2)

# The combined ethnic item takes the slot of "knows_ibrahim" in the frequency
# table, with its size overwritten by 20233.
frequencies_2 <- nsum_frequencies %>%
  mutate(number = ifelse(name == "knows_ibrahim", 20233, number))

populations_2 <- sub("_boundary$", "", columns_2)
populations_2[populations_2 == "knows_ethnic"] <- "knows_ibrahim"

# Look the sizes up by name so that known_vector_2[i] belongs to column i of
# mat_2, whatever the row order of the frequency table is.
rows_2 <- match(populations_2, frequencies_2[["name"]])
if (anyNA(rows_2)) {
  stop(
    "No known population size for: ",
    paste(populations_2[is.na(rows_2)], collapse = ", "),
    call. = FALSE
  )
}
known_vector_2 <- frequencies_2[["number"]][rows_2]
##########################

# Export analysis matrix and known vector.
# NOTE: save() writes the RData format, so this file must be read back with
# load() even though its name ends in .rds.
analysis_files_selection_2 <- list(mat_2, known_vector_2)

save(analysis_files_selection_2, file = "data_analysis/data/data_processed/nsum_input/2023-04-11_analysis_files_selection_2.rds")
```


# Create model file for each subgroup

## Full model

```{r create file for loop (full model)}
# Prepare the NSUM input for the full model (names plus categories).
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_full_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
    "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
    "knows_ingrid_boundary", "knows_willemina_boundary",
    "knows_ibrahim_boundary", "knows_prison_boundary",
    "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
    "knows_secondhome", "knows_unemployed"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Create list for loop: one data frame per simplified migration background.
# Respondents without a simplified background (NA) are kept out of the
# subgroups explicitly instead of being overwritten by position.
group_list <- nells_nsum %>%
  filter(!is.na(migration_background_simple_fac)) %>%
  group_split(migration_background_simple_fac)

# The names assigned below are only correct when all three subgroups exist.
stopifnot(length(group_list) == 3L)

# Add the full sample as the fourth element.
group_list <- c(as.list(group_list), list(nells_nsum))

# Apply the preparation function to every group and save the result.
full_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_full_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Add names.
names(full_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)
```


##  Model with only categories

```{r create file for loop (categories)}
# Prepare the NSUM input for the model that uses only the category items.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_category_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_prison_boundary",
    "knows_mbo", "knows_hbo", "knows_university", "knows_secundary",
    "knows_secondhome", "knows_unemployed"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the category-only preparation on every element of group_list.
cat_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_category_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(cat_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)
```


##  Model with only ethnic categories

```{r create file for loop (migrant)}
# Prepare the NSUM input for the model that uses only the four ethnic names.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_ethnicnames_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_mohammed_boundary", "knows_fatima_boundary",
    "knows_esra_boundary", "knows_ibrahim_boundary"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the ethnic-names preparation on every element of group_list.
ethnicnames_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_ethnicnames_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(ethnicnames_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)
```


##  Model with only majority categories

```{r create file for loop (majority)}
# Prepare the NSUM input for the model that uses only the eight majority names.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_maj_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
    "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
    "knows_ingrid_boundary", "knows_willemina_boundary"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the majority-names preparation on every element of group_list.
maj_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_maj_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(maj_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)
```

## Model with all names

```{r create file for loop (all names)}
# Prepare the NSUM input for the model that uses all twelve name items.
#
# dataframe: respondent-level data with `id` and the response columns listed
#            below, e.g. one element of group_list.
# known_df:  data frame with a `name` column (population label) and a
#            `number` column (known population size).
# Returns an unnamed list: [[1]] the response matrix with rows ordered on id,
#   [[2]] the known sizes in the same order as the matrix columns.
# Stops when a population has no entry in known_df.
nsum_names_model_prepare <- function(dataframe, known_df) {
  model_columns <- c(
    "knows_daan_boundary", "knows_kevin_boundary", "knows_edwin_boundary",
    "knows_albert_boundary", "knows_emma_boundary", "knows_linda_boundary",
    "knows_ingrid_boundary", "knows_willemina_boundary",
    "knows_mohammed_boundary", "knows_fatima_boundary",
    "knows_esra_boundary", "knows_ibrahim_boundary"
  )
  # known_df labels populations without the "_boundary" suffix
  populations <- sub("_boundary$", "", model_columns)

  mat <- dataframe %>%
    arrange(id) %>% # arrange on id so we can merge the results
    select(all_of(model_columns)) %>%
    as.matrix()

  # look up by name so the sizes line up with the matrix columns
  known_rows <- match(populations, known_df[["name"]])
  if (anyNA(known_rows)) {
    stop(
      "No known population size for: ",
      paste(populations[is.na(known_rows)], collapse = ", "),
      call. = FALSE
    )
  }
  known_vector <- known_df[["number"]][known_rows]

  list(mat, known_vector)
}

# Run the all-names preparation on every element of group_list.
names_model_groups <- map(
  group_list,
  function(group_df) {
    nsum_names_model_prepare(dataframe = group_df, known_df = nsum_frequencies)
  }
)

# Label the four results.
names(names_model_groups) <- c(
  "Dutch-Majority",
  "Turkish-Dutch",
  "Moroccan-Dutch",
  "Full sample"
)
```

## Combine robustness data 

```{r combine robustness datafiles}
# Bundle the five model inputs in one list, named after the objects they
# come from, and write the bundle to disk.
robustness_dfs <- list(
  full_model_groups = full_model_groups,
  cat_model_groups = cat_model_groups,
  ethnicnames_model_groups = ethnicnames_model_groups,
  maj_model_groups = maj_model_groups,
  names_model_groups = names_model_groups
)

save(
  robustness_dfs,
  file = "data_analysis/data/data_processed/nsum_input/2023-04-21_robustness-dfs.rda"
)

```


# Export data

```{r export data }
# Export the prepared data file.
# NOTE: save() writes the RData format, so this file must be read back with
# load() even though its name ends in .rds.
save(
  list = "nells_nsum",
  file = "data_analysis/data/data_processed/nells_data/2023-05-08_nells-nsum-prepped-data.rds"
)

```










Copyright © 2024 Jeroense Thijmen