#libraries
library(tidyverse) #data transformation (dplyr, tidyr, purrr, stringr)
library(data.table) #data transformation
library(igraph) #for egonet variables (degree and density)
library(furrr) #for parallel computing (future_map, future_map2)
library(future) #for parallel computing (plan)
#input data
# NOTE(review): load() reads .rda/.RData images; the first file has an .rds
# extension but is apparently an image written with save() — if it were a
# true .rds file, readRDS() would be required. Verify which format it is.
load("datafiles/data-processed/common_data/0623_v5_liss_merged_core_file.rds")
load(file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-repeated-risk-alter-ego-data.rda")
#age recode
# Recode age in years into 13 five-year brackets:
#   1 = 15 or younger, 2 = 16-20, 3 = 21-25, ..., 12 = 66-70, 13 = 71+.
#
# findInterval() with left-open intervals ((15, 20], (20, 25], ...) reproduces
# the original ifelse() chain exactly, including its boundary behavior for
# fractional ages and NA propagation (NA in -> NA out).
#
# Args:
#   x: numeric vector of ages in years.
# Returns: numeric vector of bracket codes 1-13 (NA where x is NA).
fage_rec <- function(x) {
  findInterval(x, seq(15, 70, by = 5), left.open = TRUE) + 1
}
#similarity functions
#education (also reused for age)
# Dyadic similarity: 1 minus the absolute difference between two values,
# scaled by the maximum possible difference z. Identical values give 1,
# maximally different values give 0. Fully vectorized.
#
# Args:
#   x, y: numeric vectors of attribute values (e.g. ego and alter education).
#   z   : maximum possible absolute difference on the attribute's scale.
f_sim <- function(x, y, z) {
  1 - abs(x - y) / z
}
#ei index function
# Adjusted EI index of a focal value y against a network vector x:
# (number of non-missing x unequal to y minus number equal to y), divided by
# the number of non-missing x. Ranges from -1 (all internal/same) to
# 1 (all external/different). Missing entries of x are excluded throughout.
fEIindexAdjusted <- function(x, y) {
  observed <- !is.na(x)
  n_external <- length(x[observed & x != y])
  n_internal <- length(x[observed & x == y])
  (n_external - n_internal) / length(x[observed])
}
#make egonet function
#source of function https://bookdown.org/markhoff/social_network_analysis/ego-networks.html
# Convert one row of pairwise alter-alter closeness scores (the 10 values of
# the lower triangle of a 5x5 matrix) into an undirected, weighted igraph
# ego network. Alters whose ties are all missing are dropped from the matrix.
#
# Args:
#   tie: one row (10 values) of pairwise tie strengths, lower-triangle order.
# Returns: an igraph object (possibly with fewer than 5 vertices).
make_ego_nets <- function(tie) {
  # make the matrix
  mat <- matrix(nrow = 5, ncol = 5)
  # assign the tie values to the lower triangle
  mat[lower.tri(mat)] <- as.numeric(tie)
  # symmetrize: mirror the lower triangle into the upper triangle
  mat[upper.tri(mat)] <- t(mat)[upper.tri(mat)]
  # identify rows (alters) where all values are missing
  non_missing_rows <- rowSums(is.na(mat)) < nrow(mat)
  if (any(!non_missing_rows)) {
    # drop = FALSE keeps the matrix class when only one alter remains;
    # without it a 1x1 selection collapses to a scalar and the adjacency
    # conversion below would fail (latent bug in the original).
    mat <- mat[non_missing_rows, non_missing_rows, drop = FALSE]
  }
  # no self-ties
  diag(mat) <- 0
  # graph_from_adjacency_matrix() is the current name of the deprecated
  # graph.adjacency(); weighted = TRUE stores tie strength as edge weight.
  ego_net <- graph_from_adjacency_matrix(mat, mode = "undirected", weighted = TRUE)
  return(ego_net)
}
#create network variables function.
# For one ego network (all dyads of a single respondent in a single wave),
# compute three alter-level and three ego-level network measures and suffix
# the output column names with `variable` (e.g. "educ", "age"):
#   avsim_*    : mean dyadic similarity with the other network members
#                (RSiena-style "avSim")
#   avealter_* : focal value * sum(other values) / network size
#                (RSiena-style "average alter")
#   ei_*       : adjusted EI index via fEIindexAdjusted()
# Relies on column POSITION: column 4 = ego's value, column 5 = alter's value.
#
# Args:
#   df       : tibble with one row per dyad of one respondent-wave; must
#              contain nomem_encr, survey_wave, dyad_id plus the value columns.
#   range    : maximum possible absolute difference, scales dyadic similarity.
#   variable : character suffix appended to the output column names.
# Returns: tibble, one row per dyad; ego measures joined on by nomem_encr.
f_make_net_variables <- function(df, range, variable) {
  #store input in df (no-op, kept from the original)
  df <- df
  #create a list to store one result tibble per alter
  neteffects_alter <- list()
  #create the network effects for every alter in turn
  for (i in 1:nrow(df)) {
    #select the focal alter's value (column 5)
    alter <- as.numeric(df[i,5])
    #if the alter's value is missing, all alter measures are NA for this dyad
    if (is.na(alter)){
      #extract dyad and nomem_encr id.
      dyad_id <- df[i,] %>%
        pull(dyad_id)
      nomem_encr <- df[i,] %>%
        pull(nomem_encr)
      #network effects alter: NA placeholders, renamed with the variable suffix
      neteffects_alter[[i]] <- tibble(avsim_alter = NA,
                                      avealter_alter = NA,
                                      ei_alter = NA,
                                      dyad_id = dyad_id,
                                      nomem_encr = nomem_encr) %>%
        rename(!!paste0("avsim_alter", "_", variable) := avsim_alter,
               !!paste0("avealter_alter", "_", variable) := avealter_alter,
               !!paste0("ei_alter", "_", variable) := ei_alter)
    } else{
      #drop the focal alter from the group to create the network vector
      df_net <- df[-i,]
      #extract the remaining alters' values (column 5) as a plain vector
      net <- as.vector(t(df_net[,5]))
      #calculate the EI score of the focal alter against the rest
      ei_score_alter <- fEIindexAdjusted(x = net, y = alter)
      #calculate avsim and average alter score.
      net_df <- tibble(net, alter)
      # NOTE(review): nrow(net_df) inside mutate() below is looked up in the
      # environment, i.e. it refers to net_df as it existed BEFORE this
      # pipeline — including the rows removed by filter(). Confirm that the
      # denominator is meant to count missing alters as well.
      net_df <- net_df %>%
        filter(!is.na(net) & !is.na(alter)) %>% #filter out missings
        mutate(dyad_sim = 1 - (abs(alter - net)/range), #dyadic sim
               avsim_alter = mean(dyad_sim), #ave sim (RSiena)
               avealter_alter = alter * sum(net)/nrow(net_df), # average alter (RSiena)
               ei_alter = ei_score_alter, #EI index
               dyad_id = df[i,] %>%
                 pull(dyad_id),
               nomem_encr = df[i,] %>%
                 pull(nomem_encr))
      #keep one row per dyad and suffix the measure names with the variable
      neteffects_alter[[i]] <- net_df %>%
        select(nomem_encr, dyad_id, avsim_alter, avealter_alter, ei_alter) %>%
        distinct() %>%
        #create names specific for variable
        rename(!!paste0("avsim_alter", "_", variable) := avsim_alter,
               !!paste0("avealter_alter", "_", variable) := avealter_alter,
               !!paste0("ei_alter", "_", variable) := ei_alter)
    }
  }
  #stack the per-alter results into one tibble
  neteffects_alter <- neteffects_alter %>%
    bind_rows()
  #check whether the ego value (column 4; constant within the group) is missing.
  if(is.na(as.numeric(df[1,4]))){
    #extract wave and nomem_encr id.
    # NOTE(review): `i` here is the leftover index from the for-loop above
    # (== nrow(df)). This works only because ids are constant within df;
    # df[1,] would state the intent more clearly.
    survey_wave <- df[i,] %>%
      pull(survey_wave)
    nomem_encr <- df[i,] %>%
      pull(nomem_encr)
    #network effects ego: NA placeholders
    neteffects_ego <- tibble(avsim_ego = NA,
                             avealter_ego = NA,
                             ei_ego = NA,
                             nomem_encr = nomem_encr,
                             survey_wave = survey_wave) %>%
      rename(!!paste0("avsim_ego", "_", variable) := avsim_ego,
             !!paste0("avealter_ego", "_", variable) := avealter_ego,
             !!paste0("ei_ego", "_", variable) := ei_ego)
  } else{
    #neteffects for ego
    #all alters' values
    net <- as.vector(t(df[,5]))
    #ego's value
    ego <- as.numeric(df[1,4])
    #calculate the EI score of ego against all alters
    ei_score_ego <- fEIindexAdjusted(x = net, y = ego)
    net_df <- df %>%
      select(nomem_encr, survey_wave) %>%
      bind_cols(tibble(net, ego))
    # NOTE(review): as above, nrow(net_df) resolves to the pre-filter tibble.
    neteffects_ego <- net_df %>%
      filter(!is.na(net) & !is.na(ego)) %>%
      mutate(dyad_sim = 1 - (abs(ego - net)/range),
             avsim_ego = mean(dyad_sim),
             avealter_ego = ego * sum(net)/nrow(net_df),
             ei_ego = ei_score_ego) %>%
      select(nomem_encr, survey_wave, avsim_ego, avealter_ego, ei_ego) %>%
      distinct() %>%
      rename(!!paste0("avsim_ego", "_", variable) := avsim_ego,
             !!paste0("avealter_ego", "_", variable) := avealter_ego,
             !!paste0("ei_ego", "_", variable) := ei_ego)
  }
  #join the (single-respondent) ego measures to every alter row
  neteffects <- neteffects_alter %>%
    left_join(neteffects_ego, by = "nomem_encr")
  return(neteffects)
}
#function for calculating degree of each alter and store it in a tibble with dyad id info.
# Args:
#   egonet     : one-row tibble with nomem_encr, survey_wave and the five
#                dyad-id columns at positions 3-7 (see net_info_df_list).
#   degree_net : the matching igraph ego network for the same respondent-wave.
# Returns: long tibble with one row per alter: id, wave, dyad_id, degree.
# Assumes the alter order in `degree_net` matches the dyad-id column order —
# TODO confirm against how ego_nets and net_info_df_list are both sorted.
F_degree_calculation <- function(egonet, degree_net) {
  #calculate degree for each alter that survived make_ego_nets()
  degree_df <- tibble(degree = degree(degree_net))
  #columns of `egonet` to pivot: the first nrow(degree_df) dyad-id columns,
  #which start at position 3 (after nomem_encr and survey_wave)
  if(nrow(degree_df) == 0){
    # NOTE(review): with an empty graph this still pivots column 3 into one
    # row while degree_df has zero rows, so the bind_cols() below has
    # mismatched row counts — verify whether empty networks can occur here.
    total_alters <- 3
  }else{
    total_alters <- 3:(nrow(degree_df)+2)}
  #add degree to dyad id (positional: pivot rows and degree_df rows must align)
  egonet_df <- egonet %>%
    pivot_longer(cols = all_of(total_alters),
                 names_to = "alter",
                 values_to = "dyad_id") %>%
    select(nomem_encr, survey_wave, dyad_id) %>%
    bind_cols(degree_df)
  #return egonet_df as result of function.
  return(egonet_df)
}
First, we recode the alter and ego variables so that they are comparable.
#network variables
#mean education, age and gender of the alters per respondent-wave network
repeated_event_data <- repeated_event_data %>%
  rename(censor = censor_process,
         times_dropped_earlier = times_dropped_rec) %>%
  #a nomem_encr x survey_wave combination identifies one ego network
  group_by(nomem_encr, survey_wave) %>%
  mutate(net_educ   = mean(educ_alter, na.rm = TRUE),
         net_age    = mean(age_alter, na.rm = TRUE),
         net_gender = mean(gender_alter, na.rm = TRUE)) %>%
  ungroup()
#similarity variables
#some data prep
repeated_event_data <- repeated_event_data %>%
  #recode ego's age in years into the same 13 brackets as age_alter
  mutate(age_rec = fage_rec(as.numeric(leeftijd)),
         #gender code 3 is set to missing for both sides — presumably a
         #"other/unknown" code; verify against the codebook.
         # NOTE(review): a bare NA in if_else() requires dplyr >= 1.1;
         # older versions need a typed NA (NA_real_/NA_integer_).
         gender_alter = if_else(gender_alter == 3, NA, gender_alter),
         gender = if_else(gender == 3, NA, gender)) %>%
  #create dyadic similarity variables (1 = identical, 0 = maximally different)
  mutate(dyad_educ_sim = f_sim(educ_alter, educ_ego, 12),
         #exact match for the categorical attributes
         dyad_gender_sim = ifelse(gender_alter == gender, 1, 0),
         #age brackets run 1-13, so the maximum absolute difference is 12
         dyad_age_sim = f_sim(age_alter, age_rec, 12),
         dyad_ethnicity_sim = ifelse(origin_rec_nar == origin_alter_rec, 1, 0),
         dyad_age_sim_rec = dyad_age_sim/age_rec) #age sim divided by ego's age bracket
#recode alter_dear.
# NOTE(review): the second ifelse() can never fire — after the first line
# dear_alter_rec no longer contains NA, so level 3 ("Missing") is dead code.
# Probably one of the two lines was meant to test a different source
# variable; confirm the intended mapping of "Not Asked" vs "Missing".
repeated_event_data <- repeated_event_data %>%
  mutate(dear_alter_rec = ifelse(is.na(dear_alter), 2, dear_alter),
         dear_alter_rec = ifelse(is.na(dear_alter_rec), 3, dear_alter_rec),
         dear_alter_fac = factor(dear_alter_rec,
                                 levels = 0:3,
                                 labels = c("not_dear", "dear", "Not Asked", "Missing")))
We need to clean the data of faulty recurrences. For instance, some alters change from being male to female between waves, and others change from being someone's partner to being their parent. Such transitions should be impossible.
#gender check
#first check on the gender variable whether alters change gender across waves.
check_data <- repeated_event_data %>%
  filter(dropped == 0) %>%
  select(nomem_encr, dyad_id, process_id, gender_alter, rel_alter, survey_wave)
#mean of gender (coded 1/2) over time per dyad: any value that is not exactly
#1 or 2 means the reported gender varied between waves.
check_data_gender <- check_data %>%
  select(nomem_encr, dyad_id, survey_wave, gender_alter, rel_alter) %>%
  distinct() %>%
  arrange(nomem_encr, dyad_id, survey_wave) %>%
  group_by(dyad_id) %>%
  # NOTE(review): no na.rm — a single missing gender_alter makes the mean NA,
  # and NA rows silently pass through the filter below (NA != 1 is NA);
  # such dyads are never flagged. Confirm this is intended.
  mutate(mean_gender = mean(gender_alter)) %>%
  ungroup()
#dyads whose reported gender is inconsistent across waves
# NOTE(review): gender_fault_ids is not used anywhere further down in this
# script — unlike faulty_dyads below, these dyads are never filtered out.
gender_fault_ids <- check_data_gender %>%
  filter(mean_gender != 1 & mean_gender != 2) %>%
  select(dyad_id) %>%
  distinct()
#relationship check. Use paste0 to create a new variable which contains the
#wave-to-wave transition (current relationship code pasted to the previous
#wave's code); impossible transitions identify faulty dyads.
check_data <- check_data %>%
  group_by(dyad_id) %>%
  select(nomem_encr, dyad_id, survey_wave, rel_alter) %>%
  filter(!is.na(rel_alter)) %>%
  #first observation per dyad yields e.g. "1NA" (lag is NA) and never matches
  mutate(rel_check = paste0(rel_alter, lag(rel_alter))) %>% #create unique variable
  ungroup()
#set impossible combinations. See codebook for the meaning of these.
#(numeric on purpose: %in% coerces both sides to character, so "12" matches 12)
impossible_combinations <- c(12,13,14,15,16,17,18,110,
                             21,23,24,25,26,27,28,29,210,
                             31,32,34,35,36,37,38,39,310,
                             41,42,43,45,46,47,48,49,410,
                             51,52,53,54,56,57,58,59,510,
                             62,63,64,65,
                             72,73,74,75,
                             82,83,84,85,
                             92,93,94,95,
                             102,103,104,105)
#select the respondents with at least one impossible combination
faulty_dyads <- check_data %>%
  filter(rel_check %in% impossible_combinations) %>%
  select(nomem_encr) %>%
  distinct()
#filter out the networks with impossible combinations
#(note: this drops ALL networks of a respondent with any faulty dyad)
repeated_event_data <- repeated_event_data %>%
  filter(!nomem_encr %in% faulty_dyads$nomem_encr)
#from 240000 to 204786 rows
We need to construct a number of network variables. The first of these are network size and density.
#calculate network density
#one row per respondent-wave, only the pairwise alter-alter closeness items
# NOTE(review): the density values computed from this sorted table are later
# matched back to liss_long positionally — the two row orders must agree;
# confirm this alignment.
net_density <- liss_long %>%
  mutate(survey_wave = as.numeric(survey_wave)) %>%
  arrange(nomem_encr, survey_wave) %>%
  select(starts_with("close_")) %>%
  #recode 3 to 0 — presumably 3 means "no tie", so it must not count as an
  #edge; verify against the codebook
  mutate(across(starts_with("close"), ~ ifelse(. == 3, 0, .)))
#create networks: one igraph ego network per respondent-wave row
ego_nets <- lapply(
  # seq_len() is safe when the table has zero rows (1:nrow would give c(1, 0))
  seq_len(nrow(net_density)),
  FUN = function(x)
    make_ego_nets(net_density[x, ])
)
#density of each ego network (igraph)
# edge_density() is the current name of the deprecated graph.density();
# vapply() replaces lapply() + unlist() and guarantees a numeric result.
densities <- vapply(ego_nets, edge_density, numeric(1))
#attach each density back to its respondent id and wave
# The densities vector was computed from net_density, which is sorted by
# nomem_encr and NUMERIC survey_wave — so sort liss_long the same way before
# the positional cbind(); otherwise densities can be paired with the wrong
# rows whenever liss_long is in a different order.
net_density_data <- liss_long %>%
  mutate(survey_wave = as.numeric(survey_wave)) %>%
  arrange(nomem_encr, survey_wave) %>%
  select(nomem_encr, survey_wave) %>%
  cbind(densities)
#create new tibble with the network density variable
net_data <- net_density_data %>%
  rename(net_density = densities)
#add the density variable to the repeated event data
repeated_event_data <- left_join(repeated_event_data, net_data,
                                 by = c("nomem_encr", "survey_wave"))
#------------------------------ Education ------------------------------#
#create file name to cache the education network variables in.
# NOTE(review): the extension is .rds but the file is written with save()
# and read back with get(load()) — i.e. it is really an .rda/RData file.
# saveRDS()/readRDS() would match the extension (same pattern in the
# origin/age/gender blocks below). Works as-is, just misleading.
file_name <- "datafiles/data-processed/disaggregated_data/education_nets.rds"
#compute the alter/ego network variables only if not cached yet
if(!file.exists(file_name)){
  #extract education data: one row per at-risk dyad, ego and alter education
  educ_net_df <- repeated_event_data %>%
    filter(dropped == 0) %>%
    arrange(nomem_encr, survey_wave) %>%
    select(nomem_encr, survey_wave, dyad_id, educ_ego, educ_alter)
  #create count variable (number of dyads per respondent-wave network)
  count <- educ_net_df %>%
    arrange(nomem_encr, survey_wave) %>%
    distinct() %>%
    group_by(nomem_encr, survey_wave) %>%
    count()
  #add count to educ_net_df
  # NOTE(review): the resulting column n is not used by f_make_net_variables
  # (which reads the value columns positionally at 4 and 5) — confirm
  # whether this join is still needed.
  educ_net_df <- educ_net_df %>%
    left_join(count, by = c("nomem_encr", "survey_wave"))
  #create a list with group_split: one tibble per respondent-wave network
  educ_net_list <- educ_net_df %>%
    group_split(nomem_encr, survey_wave)
  #plan parallel session
  plan(multisession, workers = 7)
  #compute the network variables per network in parallel;
  #range = 12 matches the dyad_educ_sim scaling earlier in the script
  educ_net_list_results <- educ_net_list %>%
    future_map(.f = ~ f_make_net_variables(df = ., variable = "educ", range = 12),
               .progress = T)
  #store results in df
  educ_net_df_results <- educ_net_list_results %>%
    bind_rows()
  #save intermediate results
  save(educ_net_df_results,
       file = file_name)
  #stop parallel session
  plan(sequential)
} else {
  #reuse the cached results
  educ_net_df_results <- get(load(file = file_name))
}
#------------------------------ Origin ------------------------------#
#create file name to cache the origin/ethnicity network variables in.
file_name <- "datafiles/data-processed/disaggregated_data/origin_nets.rds"
#compute the alter/ego network variables only if not cached yet
if(!file.exists(file_name)){
  #extract origin data (the original comment said "education" — copy-paste slip)
  origin_net_df <- repeated_event_data %>%
    filter(dropped == 0) %>%
    arrange(nomem_encr, survey_wave) %>%
    select(nomem_encr, survey_wave, dyad_id, origin_rec_nar, origin_alter_rec)
  #split into one tibble per respondent-wave network
  origin_net_list <- origin_net_df %>%
    group_split(nomem_encr, survey_wave)
  #start parallel session
  plan(multisession, workers = 7)
  #compute the network variables per network in parallel
  # NOTE(review): range = 2 implies origin codes differ by at most 2 —
  # verify against the recoded origin variable's codebook.
  origin_net_list <- origin_net_list %>%
    future_map(.x = .,
               .f = ~ f_make_net_variables(df = .x, variable = "ethnicity", range = 2),
               .progress = T)
  #store results
  origin_net_df_results <- origin_net_list %>%
    bind_rows()
  #save intermediate results (RData format despite the .rds name; see above)
  save(origin_net_df_results,
       file = file_name)
  #stop parallel session
  plan(sequential)
} else {
  #reuse the cached results
  origin_net_df_results <- get(load(file = file_name))
}
#------------------------------ Age ------------------------------#
#create file name to cache the age network variables in.
file_name <- "datafiles/data-processed/disaggregated_data/age_nets.rds"
#compute the alter/ego network variables only if not cached yet
if(!file.exists(file_name)){
  #extract age data: ego's recoded age bracket and the alter's age bracket
  age_net_df <- repeated_event_data %>%
    filter(dropped == 0) %>%
    arrange(nomem_encr, survey_wave) %>%
    select(nomem_encr, survey_wave, dyad_id, age_rec, age_alter)
  #create count variable (number of dyads per respondent-wave network)
  count <- age_net_df %>%
    arrange(nomem_encr, survey_wave) %>%
    distinct() %>%
    group_by(nomem_encr, survey_wave) %>%
    count()
  #add count to age_net_df
  # NOTE(review): as in the education block, the n column appears unused by
  # f_make_net_variables — confirm whether this join is still needed.
  age_net_df <- age_net_df %>%
    left_join(count, by = c("nomem_encr", "survey_wave"))
  #create a list with group_split: one tibble per respondent-wave network
  age_net_list <- age_net_df %>%
    group_split(nomem_encr, survey_wave)
  #start parallel session
  plan(multisession, workers = 7)
  #compute the network variables per network in parallel;
  #range = 12: age brackets run 1-13, so the maximum difference is 12
  age_net_list_results <- age_net_list %>%
    future_map(.f = ~ f_make_net_variables(df = ., variable = "age", range = 12),
               .progress = T)
  #store results
  age_net_df_results <- age_net_list_results %>%
    bind_rows()
  #save intermediate results (RData format despite the .rds name; see above)
  save(age_net_df_results,
       file = file_name)
  #stop parallel session
  plan(sequential)
} else {
  #reuse the cached results
  age_net_df_results <- get(load(file = file_name))
}
#------------------------------ Gender ------------------------------#
#file used to cache the gender network variables
file_name <- "datafiles/data-processed/disaggregated_data/gender_nets.rds"
if (file.exists(file_name)) {
  #reuse the cached results (file was written with save(), hence get(load()))
  gender_net_df_results <- get(load(file = file_name))
} else {
  #one row per at-risk dyad, with ego's and the alter's gender
  gender_net_df <- repeated_event_data %>%
    filter(dropped == 0) %>%
    arrange(nomem_encr, survey_wave) %>%
    select(nomem_encr, survey_wave, dyad_id, gender, gender_alter)
  #split into one tibble per respondent-wave network
  gender_net_list <- gender_net_df %>%
    group_split(nomem_encr, survey_wave)
  #compute the network variables per network in parallel
  #(range = 1: gender is coded 1/2, so the maximum difference is 1)
  plan(multisession, workers = 7)
  gender_net_df_results <- gender_net_list %>%
    future_map(.f = ~ f_make_net_variables(df = ., variable = "gender", range = 1),
               .progress = TRUE) %>%
    bind_rows()
  #cache the results, then close the workers
  save(gender_net_df_results,
       file = file_name)
  plan(sequential)
}
#-------------------------- Merging -----------------------------#
#add the four sets of network variables (education, age, gender, origin)
#to repeated_event_data, joining each on the dyad-wave-respondent key
repeated_event_data <- list(educ_net_df_results,
                            age_net_df_results,
                            gender_net_df_results,
                            origin_net_df_results) %>%
  reduce(left_join,
         by = c("dyad_id", "survey_wave", "nomem_encr"),
         .init = repeated_event_data)
#create a list with network info for each respondent-year combination.
#result: one one-row tibble per respondent-wave with columns
#nomem_encr, survey_wave and the five dyad-id columns ("1".."5"),
#sorted like net_density/ego_nets above so the lists can be zipped.
net_info_df_list <- liss_long %>%
  select(nomem_encr, starts_with("alter_id"), survey_wave) %>%
  pivot_longer(cols = 2:6,
               names_to = "var",
               values_to = "alter_id") %>%
  #dyad id = respondent id pasted to alter id (empty alter slots stay NA)
  mutate(dyad_id = ifelse(is.na(alter_id), NA, paste0(nomem_encr, alter_id)),
         survey_wave = as.numeric(survey_wave)) %>%
  select(-alter_id) %>%
  #recover the alter slot number 1-5 from the column name
  mutate(order = case_when(
    var == "alter_id_1" ~ 1,
    var == "alter_id_2" ~ 2,
    var == "alter_id_3" ~ 3,
    var == "alter_id_4" ~ 4,
    var == "alter_id_5" ~ 5,
  )) %>%
  select(-var) %>%
  pivot_wider(names_from = order,
              values_from = dyad_id) %>%
  arrange(nomem_encr, survey_wave) %>%
  #one list element per row; grouping by row_number() adds a `row_number()`
  #column to each piece, which F_degree_calculation's select() drops again
  group_split(row_number())
#use the degree calculation function with the ego_nets list and the network
#info list (the two lists are parallel: same sort order, same length)
#plan future session, parallel computing
plan(multisession, workers = 7)
#pair each id/wave info tibble with its ego network and compute alter degrees
degree_egonet_list <- future_map2(.x = net_info_df_list,
                                  .y = ego_nets,
                                  .f = ~ F_degree_calculation(egonet = .x,
                                                              degree_net = .y),
                                  .progress = TRUE)
plan(sequential)
#stack the per-network degree tibbles and align the wave key's type
degree_egonet_df <- bind_rows(degree_egonet_list) %>%
  mutate(survey_wave = as.numeric(survey_wave))
#add the degree variable to the repeated event data
repeated_event_data <- left_join(repeated_event_data, degree_egonet_df,
                                 by = c("dyad_id", "survey_wave", "nomem_encr"))
#normalized degree and network size variable
size_degree_nor_df <- repeated_event_data %>%
  arrange(nomem_encr, survey_wave) %>%
  filter(dropped == 0) %>%
  select(nomem_encr, survey_wave, dyad_id, degree) %>%
  group_by(nomem_encr, survey_wave) %>%
  #network size = number of dyad rows per respondent-wave
  mutate(size = n()) %>%
  ungroup() %>%
  #normalized degree = degree / (maximum possible degree in the network)
  # NOTE(review): for a one-alter network (size == 1) this is 0/0 = NaN —
  # decide whether NaN, NA or 0 is wanted for singleton networks.
  mutate(degree_normalized = degree / (size - 1)) %>%
  select(nomem_encr, survey_wave, dyad_id, degree_normalized, size)
#add normalized degree and size to the data
repeated_event_data <- repeated_event_data %>%
  left_join(size_degree_nor_df, by = c("dyad_id", "survey_wave", "nomem_encr"))
#clean the global environment: keep only the three objects that should end
#up in the saved image (everything else, including the helper functions
#defined above, is deleted — intentional here, since save.image() below
#writes the whole environment)
rm(list=ls()[! ls() %in% c("repeated_event_data", "liss_long", "liss_wide")])
#save the remaining objects as the dyad survival analysis file.
save.image("datafiles/data-processed/disaggregated_data/2023-06-12_dyad-survival-data.rda")
Copyright © 2023 Jeroense Thijmen