Data preparation. We use mean imputation to fill in missing values in the
numeric variables.
# Recode a data error: NA should have been coded as 0 on the move indicators
# in rows where new_residence_event was observed.
# mutate() evaluates its arguments in order, so each indicator is tested
# against its OWN missingness. Re-testing is.na(move_event) for the second
# recode can never match, because move_event has already been filled in by
# the first recode.
nonkin_survival_data_lead_dependent <- nonkin_survival_data_lead_dependent %>%
  mutate(
    move_event = ifelse(
      is.na(move_event) & !is.na(new_residence_event),
      0,
      move_event
    ),
    new_municipality_event = ifelse(
      is.na(new_municipality_event) & !is.na(new_residence_event),
      0,
      new_municipality_event
    )
  )
# Create imputation dummies for the life-event variables: 1 when the event
# variable is missing (and will therefore be imputed), 0 otherwise.
# Each dummy is derived from its own event variable.
nonkin_survival_data_lead_dependent <- nonkin_survival_data_lead_dependent %>%
  mutate(
    first_child_event_missing = if_else(is.na(first_child_event), 1, 0),
    divorced_event_missing = if_else(is.na(divorced_event), 1, 0),
    move_event_missing = if_else(is.na(move_event), 1, 0),
    new_municipality_event_missing = if_else(
      is.na(new_municipality_event), 1, 0
    ),
    # Was previously computed from move_event (copy-paste error).
    new_residence_event_missing = if_else(is.na(new_residence_event), 1, 0),
    employment_event_missing = if_else(is.na(employment_event), 1, 0),
    unemployed_event_missing = if_else(is.na(unemployed_event), 1, 0),
    lost_partner_event_missing = if_else(is.na(lost_partner_event), 1, 0),
    retired_event_missing = if_else(is.na(retired_event), 1, 0)
  )
# Fix the random seed before the imputation step.
set.seed(123)

# Mean imputation (for now).
# Gather the variables that go into the imputation and coerce every one of
# them to numeric.
# NOTE(review): first_child_event is not selected here although a
# first_child_event_missing dummy is created for it -- confirm this is intended.
nonkin_survival_data_lead_dependent_numeric <- mutate(
  select(
    nonkin_survival_data_lead_dependent,
    # ego and alter characteristics
    educ_ego,
    leeftijd,
    age_sq,
    educ_alter,
    age_alter,
    # dyad similarity
    dyad_educ_sim,
    dyad_gender_sim,
    dyad_age_sim,
    dyad_ethnicity_sim,
    # network characteristics
    net_educ,
    net_density,
    size,
    net_age,
    net_gender,
    degree_normalized,
    degree,
    starts_with("avealter"),
    starts_with("ei_"),
    starts_with("avsim"),
    # life events and transitions
    divorced_event,
    divorced_transition,
    move_event,
    move_transition,
    new_municipality_event,
    new_municipality_transition,
    new_residence_event,
    new_residence_transition,
    first_child_transition,
    employment_event,
    employment_transition,
    unemployed_event,
    unemployed_transition,
    lost_partner_event,
    lost_partner_transition,
    retired_event,
    retired_transition
  ),
  across(everything(), as.numeric)
)
# Impute with mice using the "mean" method: a single imputed data set
# (m = 1) produced in one iteration (maxit = 1).
nonkin_survival_data_lead_dependent_numeric_imputed <- mice(
  data = nonkin_survival_data_lead_dependent_numeric,
  method = "mean",
  m = 1,
  maxit = 1
)
iter imp variable 1 1 educ_ego leeftijd age_sq educ_alter age_alter
dyad_educ_sim dyad_gender_sim dyad_age_sim dyad_ethnicity_sim net_educ
net_density net_age degree avealter_alter_educ avealter_ego_educ
avealter_alter_age avealter_ego_age avealter_alter_gender
avealter_ego_gender avealter_alter_ethnicity avealter_ego_ethnicity
ei_alter_educ ei_ego_educ ei_alter_age ei_ego_age ei_alter_gender
ei_ego_gender ei_alter_ethnicity ei_ego_ethnicity avsim_alter_educ
avsim_ego_educ avsim_alter_age avsim_ego_age avsim_alter_ethnicity
avsim_ego_ethnicity divorced_event divorced_transition
new_municipality_event new_municipality_transition new_residence_event
new_residence_transition first_child_transition employment_event
employment_transition unemployed_event unemployed_transition
lost_partner_event lost_partner_transition retired_event
retired_transition
# Keep the identifiers, non-numeric variables and missingness dummies, then
# build factor versions of the categorical variables in which NA becomes an
# explicit "Missing"/"missing" level.
nonkin_survival_data_lead_dependent <- nonkin_survival_data_lead_dependent %>%
  select(
    nomem_encr,
    process_id,
    dyad_id,
    time,
    dropped_lead,
    gender_fac,
    divorced_fac,
    moving_fac,
    gender_alter_fac,
    dear_alter_rec,
    rel_alter_rec,
    times_dropped_earlier,
    censor,
    length,
    length_rel_member,
    length_rel_total,
    origin_rec_nar,
    origin_rec_nar_fac,
    origin_alter_rec,
    origin_alter_rec_fac,
    survey_wave,
    first_child,
    first_child_event_missing,
    divorced_event_missing,
    move_event_missing,
    new_municipality_event_missing,
    new_residence_event_missing,
    employment_event_missing,
    employment_status,
    employment,
    unemployment,
    retired,
    lost_partner,
    unemployed_event_missing,
    retired_event_missing,
    lost_partner_event_missing,
    paid_work
  ) %>%
  mutate(
    # Recode NA to 2 first, then build the factor from the RECODED values.
    # Building it from the raw paid_work column would leave NA as NA and the
    # "Missing" level would stay empty.
    paid_work_fac = ifelse(is.na(paid_work), 2, paid_work),
    paid_work_fac = factor(
      paid_work_fac,
      levels = 0:2,
      labels = c(
        "No paid work",
        "Paid work",
        "Missing"
      )
    ),
    # Missing employment status becomes its own category (code 8).
    employment_status_fac = ifelse(
      is.na(employment_status),
      8,
      as.numeric(employment_status)
    ),
    employment_status_fac = factor(
      employment_status_fac,
      levels = 1:8,
      labels = c(
        "employed",
        "self-employed",
        "unemployed",
        "student",
        "housework",
        "retired",
        "other",
        "missing"
      )
    ),
    # For the transition indicators below, the source column itself is
    # overwritten with NA recoded to 2 before the factor is created.
    first_child = ifelse(is.na(first_child), 2, first_child),
    first_child_fac = factor(
      first_child,
      levels = 0:2,
      labels = c("No change", "First child born", "Missing")
    ),
    retired = ifelse(is.na(retired), 2, retired),
    retired_fac = factor(
      retired,
      levels = 0:2,
      labels = c("No change", "Retired", "Missing")
    ),
    employment = ifelse(is.na(employment), 2, employment),
    employment_fac = factor(
      employment,
      levels = 0:2,
      labels = c("No change", "Employed", "Missing")
    ),
    unemployment = ifelse(is.na(unemployment), 2, unemployment),
    # Label corrected: this level was previously "Employed", copied from
    # employment_fac.
    unemployment_fac = factor(
      unemployment,
      levels = 0:2,
      labels = c("No change", "Unemployed", "Missing")
    ),
    lost_partner = ifelse(is.na(lost_partner), 2, lost_partner),
    lost_partner_fac = factor(
      lost_partner,
      levels = 0:2,
      labels = c("No change", "Lost partner", "Missing")
    )
  )
# Add the imputed data: column-bind the completed numeric variables onto the
# retained non-numeric columns.
nonkin_survival_data_lead_dependent_imputed <- cbind(
  nonkin_survival_data_lead_dependent,
  complete(nonkin_survival_data_lead_dependent_numeric_imputed)
)
# Derived variables:
#   * time_2 ... time_10: powers of time (2nd through 10th).
#   * size_net_one: dummy for a network of size 1. These cases have missings
#     on the ave_alter variables.
nonkin_survival_data_lead_dependent_imputed <- mutate(
  nonkin_survival_data_lead_dependent_imputed,
  time_2 = time * time,
  time_3 = time^3,
  time_4 = time^4,
  time_5 = time^5,
  time_6 = time^6,
  time_7 = time^7,
  time_8 = time^8,
  time_9 = time^9,
  time_10 = time^10,
  size_net_one = ifelse(size == 1, 1, 0)
)
# Center and scale the numeric predictors (stored as *_cen columns) and turn
# relationship length into a labelled factor.
# NOTE(review): scale() returns a one-column matrix, so each *_cen column is
# stored as a matrix column -- confirm downstream code is fine with that.
nonkin_survival_data_lead_dependent_imputed <- mutate(
  nonkin_survival_data_lead_dependent_imputed,
  # ego and alter
  educ_ego_cen = scale(educ_ego),
  age_cen = scale(leeftijd),
  age_sq_cen = scale(age_sq),
  educ_alter_cen = scale(educ_alter),
  age_alter_cen = scale(age_alter),
  # dyad similarity
  dyad_educ_sim_cen = scale(dyad_educ_sim),
  dyad_age_sim_cen = scale(dyad_age_sim),
  dyad_gender_sim_cen = scale(dyad_gender_sim),
  dyad_ethnicity_sim_cen = scale(dyad_ethnicity_sim),
  # network
  net_density_cen = scale(net_density),
  net_size_cen = scale(size),
  times_dropped_earlier_cen = scale(times_dropped_earlier),
  # relationship length categories
  length_fac = factor(
    length,
    levels = 1:3,
    labels = c("< 3 years", "3 - 6 years", "> 6 years")
  )
)
# Write the prepared, imputed data set to disk.
save(
  nonkin_survival_data_lead_dependent_imputed,
  file = "datafiles/data-processed/disaggregated_data/2022-09-20_dyad-survival-data-imputed.rda"
)