library(mice)
library(tidyverse)
library(data.table)
# Load the prepared dyad-level survival data into the global environment.
# The code further down expects this file to provide `repeated_event_data`.
load(file = "datafiles/data-processed/disaggregated_data/2023-06-12_dyad-survival-data.rda")

# Disabled: loading of the separate ego and alter data files.
# load(file = "datafiles/data-processed/disaggregated_data/2023-01-17_ego-data.rds")
# load(file = "datafiles/data-processed/disaggregated_data/2023-01-17_alter-data.rds")

# Strongly penalise scientific notation so numbers print in fixed notation.
options(scipen = 999)

Main analysis dataset

# Collect the ids of every dyad that has at least one record with
# `rel_alter` below 6 (presumably the kin relation codes -- TODO confirm
# against the codebook). Records with a missing `rel_alter` never satisfy the
# filter, so they cannot flag a dyad.
kin_ids_df <- repeated_event_data %>%
  filter(rel_alter < 6) %>%
  distinct(dyad_id)

# Keep only the records of dyads that were not flagged above.
# Author's note: this step drops about half of the data.
nonkin_repeated_event_data <- repeated_event_data %>%
  filter(!(dyad_id %in% kin_ids_df[["dyad_id"]]))
# Build the analysis frame: drop every "recode" column, keep the variables
# used later on, and attach the next record's `dropped` value as the outcome.
nonkin_survival_data_lead_dependent <-
  nonkin_repeated_event_data %>%
  select(!contains("recode")) %>%
  select(
    # identifiers, timing and current drop status
    nomem_encr, process_id, dyad_id, survey_wave, time, dropped,
    # ego factors
    gender_fac, divorced_fac, moving_fac,
    # alter characteristics
    educ_alter, age_alter, gender_alter_fac, dear_alter_rec, rel_alter_rec,
    # origin of ego and alter
    origin_rec_nar, origin_rec_nar_fac, origin_alter_rec, origin_alter_rec_fac,
    # dyad similarity
    dyad_educ_sim, dyad_gender_sim, dyad_age_sim, dyad_ethnicity_sim,
    # tie history
    times_dropped_earlier, censor, length,
    # ego education and age
    educ_ego, leeftijd, age_sq,
    # network composition
    net_educ, net_density, size, net_age, net_gender,
    starts_with("avealter"), starts_with("ei_"), starts_with("avsim"),
    degree, degree_normalized,
    length_rel_member, length_rel_total,
    # life events (event and transition indicators)
    divorced_event, divorced_transition,
    move_event, move_transition,
    new_municipality_event, new_municipality_transition,
    new_residence_event, new_residence_transition,
    first_child, first_child_event, first_child_transition,
    employment_status, paid_work,
    employment, employment_event, employment_transition,
    retired, retired_event, retired_transition,
    unemployment, unemployed_event, unemployed_transition,
    lost_partner, lost_partner_event, lost_partner_transition,
    divorced_seperated
  ) %>%
  # lead() follows the current row order within each process_id; no explicit
  # sort is applied here, so the input is assumed to be ordered already.
  group_by(process_id) %>%
  mutate(dropped_lead = lead(dropped)) %>%
  ungroup() %>%
  # The last record of each process has no following record, hence no outcome.
  filter(!is.na(dropped_lead))

Imputation of missing values

Data preparation. Missing values in the numeric variables are imputed with the variable mean.

# Recode a data error: where `move_event` is NA although
# `new_residence_event` is observed, the NA should have been coded as 0.
#
# mutate() evaluates its arguments in order, so the condition is computed
# once, up front, from the original `move_event`. Testing is.na(move_event)
# again after it has been overwritten can never be TRUE for these rows, which
# would leave `new_municipality_event` unrecoded.
nonkin_survival_data_lead_dependent <- nonkin_survival_data_lead_dependent %>%
  mutate(
    move_na_to_zero = is.na(move_event) & !is.na(new_residence_event),
    move_event = ifelse(move_na_to_zero, 0, move_event),
    # Only missing values are replaced; observed values are left untouched.
    new_municipality_event = ifelse(
      move_na_to_zero & is.na(new_municipality_event),
      0,
      new_municipality_event
    )
  ) %>%
  # Drop the temporary helper flag again.
  select(-move_na_to_zero)

# Create missingness dummies (1 = missing, 0 = observed) for the life-event
# variables. Each dummy is named `<variable>_missing` and is derived from its
# own source variable. Generating them with across() avoids the copy-paste
# slip where `new_residence_event_missing` was computed from `move_event`.
nonkin_survival_data_lead_dependent <- nonkin_survival_data_lead_dependent %>%
  mutate(
    across(
      c(
        first_child_event,
        divorced_event,
        move_event,
        new_municipality_event,
        new_residence_event,
        employment_event,
        unemployed_event,
        lost_partner_event,
        retired_event
      ),
      ~ if_else(is.na(.x), 1, 0),
      .names = "{.col}_missing"
    )
  )


# Fix the random seed ahead of the imputation step.
set.seed(123)

# Mean imputation (for now).
# Collect the variables that will be imputed and coerce each one to numeric.
nonkin_survival_data_lead_dependent_numeric <-
  nonkin_survival_data_lead_dependent %>%
  select(
    # ego and alter education / age
    educ_ego, leeftijd, age_sq, educ_alter, age_alter,
    # dyad similarity
    dyad_educ_sim, dyad_gender_sim, dyad_age_sim, dyad_ethnicity_sim,
    # network composition
    net_educ, net_density, size, net_age, net_gender,
    degree_normalized, degree,
    starts_with("avealter"), starts_with("ei_"), starts_with("avsim"),
    # life events (event and transition indicators)
    divorced_event, divorced_transition,
    move_event, move_transition,
    new_municipality_event, new_municipality_transition,
    new_residence_event, new_residence_transition,
    first_child_transition,
    employment_event, employment_transition,
    unemployed_event, unemployed_transition,
    lost_partner_event, lost_partner_transition,
    retired_event, retired_transition
  ) %>%
  mutate(across(everything(), as.numeric))

# Run mice with the "mean" method, producing a single imputed dataset (m = 1)
# in a single iteration (maxit = 1).
nonkin_survival_data_lead_dependent_numeric_imputed <- mice(
  data = nonkin_survival_data_lead_dependent_numeric,
  m = 1,
  method = "mean",
  maxit = 1
)

iter imp variable 1 1 educ_ego leeftijd age_sq educ_alter age_alter dyad_educ_sim dyad_gender_sim dyad_age_sim dyad_ethnicity_sim net_educ net_density net_age degree avealter_alter_educ avealter_ego_educ avealter_alter_age avealter_ego_age avealter_alter_gender avealter_ego_gender avealter_alter_ethnicity avealter_ego_ethnicity ei_alter_educ ei_ego_educ ei_alter_age ei_ego_age ei_alter_gender ei_ego_gender ei_alter_ethnicity ei_ego_ethnicity avsim_alter_educ avsim_ego_educ avsim_alter_age avsim_ego_age avsim_alter_ethnicity avsim_ego_ethnicity divorced_event divorced_transition new_municipality_event new_municipality_transition new_residence_event new_residence_transition first_child_transition employment_event employment_transition unemployed_event unemployed_transition lost_partner_event lost_partner_transition retired_event retired_transition

# Select the variables that are not mean-imputed (identifiers, outcome,
# factors, missingness dummies, categorical life-event indicators) and turn
# the categorical indicators into factors. Missing values in those indicators
# are not imputed; they are recoded to an explicit extra category instead.
nonkin_survival_data_lead_dependent <- nonkin_survival_data_lead_dependent %>%
  select(
    nomem_encr,
    process_id,
    dyad_id,
    time,
    dropped_lead,
    gender_fac,
    divorced_fac,
    moving_fac,
    gender_alter_fac,
    dear_alter_rec,
    rel_alter_rec,
    times_dropped_earlier,
    censor,
    length,
    length_rel_member,
    length_rel_total,
    origin_rec_nar,
    origin_rec_nar_fac,
    origin_alter_rec,
    origin_alter_rec_fac,
    survey_wave,
    first_child,
    first_child_event_missing,
    divorced_event_missing,
    move_event_missing,
    new_municipality_event_missing,
    new_residence_event_missing,
    employment_event_missing,
    employment_status,
    employment,
    unemployment,
    retired,
    lost_partner,
    unemployed_event_missing,
    retired_event_missing,
    lost_partner_event_missing,
    paid_work
  ) %>%
  mutate(
    # NA becomes code 2 ("Missing"). The factor must be built from the recoded
    # helper column: building it from the raw `paid_work` would keep the NAs
    # and leave the "Missing" level empty.
    paid_work_fac = ifelse(is.na(paid_work), 2, paid_work),
    paid_work_fac = factor(
      paid_work_fac,
      levels = 0:2,
      labels = c("No paid work", "Paid work", "Missing")
    ),
    # NA becomes code 8 ("missing").
    employment_status_fac = ifelse(
      is.na(employment_status),
      8,
      as.numeric(employment_status)
    ),
    employment_status_fac = factor(
      employment_status_fac,
      levels = 1:8,
      labels = c(
        "employed",
        "self-employed",
        "unemployed",
        "student",
        "housework",
        "retired",
        "other",
        "missing"
      )
    ),
    # For the indicators below the source column itself is overwritten with
    # the recoded values (NA -> 2) before the factor version is created.
    first_child = ifelse(is.na(first_child), 2, first_child),
    first_child_fac = factor(
      first_child,
      levels = 0:2,
      labels = c("No change", "First child born", "Missing")
    ),
    retired = ifelse(is.na(retired), 2, retired),
    retired_fac = factor(
      retired,
      levels = 0:2,
      labels = c("No change", "Retired", "Missing")
    ),
    employment = ifelse(is.na(employment), 2, employment),
    employment_fac = factor(
      employment,
      levels = 0:2,
      labels = c("No change", "Employed", "Missing")
    ),
    unemployment = ifelse(is.na(unemployment), 2, unemployment),
    # Level 1 is labelled "Unemployed"; the earlier "Employed" label was a
    # copy of the employment_fac labels.
    unemployment_fac = factor(
      unemployment,
      levels = 0:2,
      labels = c("No change", "Unemployed", "Missing")
    ),
    lost_partner = ifelse(is.na(lost_partner), 2, lost_partner),
    lost_partner_fac = factor(
      lost_partner,
      levels = 0:2,
      labels = c("No change", "Lost partner", "Missing")
    )
  )

# Append the completed (imputed) numeric columns to the non-numeric columns.
# cbind() pairs rows purely by position, so both inputs must still be in the
# same row order.
nonkin_survival_data_lead_dependent_imputed <- cbind(
  nonkin_survival_data_lead_dependent,
  complete(nonkin_survival_data_lead_dependent_numeric_imputed)
)

# Add polynomial terms of `time` (powers 2 through 10) and a dummy that marks
# networks of size 1. Author's note: networks of size 1 have missing values
# on the ave_alter variables.
nonkin_survival_data_lead_dependent_imputed <-
  nonkin_survival_data_lead_dependent_imputed %>%
  mutate(
    time_2 = time * time,
    time_3 = time^3,
    time_4 = time^4,
    time_5 = time^5,
    time_6 = time^6,
    time_7 = time^7,
    time_8 = time^8,
    time_9 = time^9,
    time_10 = time^10,
    size_net_one = ifelse(size == 1, 1, 0)
  )


# Standardise the numeric predictors with scale() and turn `length` into a
# labelled three-level factor.
nonkin_survival_data_lead_dependent_imputed <-
  nonkin_survival_data_lead_dependent_imputed %>%
  mutate(
    # ego and alter
    educ_ego_cen = scale(educ_ego),
    age_cen = scale(leeftijd),
    age_sq_cen = scale(age_sq),
    educ_alter_cen = scale(educ_alter),
    age_alter_cen = scale(age_alter),
    # dyad similarity
    dyad_educ_sim_cen = scale(dyad_educ_sim),
    dyad_age_sim_cen = scale(dyad_age_sim),
    dyad_gender_sim_cen = scale(dyad_gender_sim),
    dyad_ethnicity_sim_cen = scale(dyad_ethnicity_sim),
    # network and tie history
    net_density_cen = scale(net_density),
    net_size_cen = scale(size),
    times_dropped_earlier_cen = scale(times_dropped_earlier),
    # codes 1, 2, 3 of `length` map onto the three labels below
    length_fac = factor(
      length,
      levels = 1:3,
      labels = c("< 3 years", "3 - 6 years", "> 6 years")
    )
  )
# Write the prepared, imputed dataset to disk as an .rda file.
save(
  list = "nonkin_survival_data_lead_dependent_imputed",
  file = "datafiles/data-processed/disaggregated_data/2022-09-20_dyad-survival-data-imputed.rda"
)



Copyright © 2023 Jeroense Thijmen