Final data preparation for RI-CLPM analysis.
#load packages
library(tidyverse)
library(doParallel)
library(parallel)
library(data.table)
#disable the scientific notation in R (else all the id's will be in scientific notation)
options(scipen = 999)
#Data import
load("data/data-processed/lisscdn_cl-ready_240816.Rdata")
#------------------------- Functions for recoding of data -------------------------#
#function to recode education into education years
feduc_ego <- function(x) {
x2 <- ifelse(x == 1, 6, x)
x3 <- ifelse(x == 2, 10, x2)
x4 <- ifelse(x == 3, 11.5, x3)
x5 <- ifelse(x == 4, 10.5, x4)
x6 <- ifelse(x == 5, 15, x5)
x7 <- ifelse(x == 6, 16, x6)
x8 <- ifelse(x == 7, NA, x7)
x9 <- ifelse(x == 8, 4, x8)
x10 <- ifelse(x == 9, 0, x9)
return(x10)
}
feduc_alter <- function(x) {
x2 <- ifelse(x == 1, 4, x)
x3 <- ifelse(x == 2, 6, x2)
x4 <- ifelse(x == 3, 10, x3)
x5 <- ifelse(x == 4, 11.5, x4)
x6 <- ifelse(x == 5, 10.5, x5)
x7 <- ifelse(x == 6, 15, x6)
x8<- ifelse(x == 7, 16, x7)
return(x8)
}
feduc_alter_orig <- function(x) {
x1 <- ifelse(x == 4, 5, x)
x2 <- ifelse(x == 5, 4, x1)
return(x2)
}
#poltalk reverse code
fpoltalk <- function(x) {
y <- 7 - x
return(y)
}
#create function for similarity score (numerical variables) (see Rsiena Manual)
feduc_sim <- function (x,y) {
result <- 1 - (abs(x - y)/12)
return(result)
}
#create function
fcat_sim <- function (x,y) {
result <- 1 - (abs(x - y)/1)
return(result)
}
#recode the age of ego into the same categories as the confidant.
fage_rec <- function (x) {
y <- ifelse(x < 16, 1, x)
y <- ifelse(x > 15 & x < 21, 2, y)
y <- ifelse(x > 20 & x < 26, 3, y)
y <- ifelse(x > 25 & x < 31, 4, y)
y <- ifelse(x > 30 & x < 36, 5, y)
y <- ifelse(x > 35 & x < 41, 6, y)
y <- ifelse(x > 40 & x < 46, 7, y)
y <- ifelse(x > 45 & x < 51, 8, y)
y <- ifelse(x > 50 & x < 56, 9, y)
y <- ifelse(x > 55 & x < 61, 10, y)
y <- ifelse(x > 60 & x < 66, 11, y)
y <- ifelse(x > 65 & x < 71, 12, y)
y <- ifelse(x > 70, 13, y)
return(y)
}
#create function
fage_sim <- function (x,y) {
result <- 1 - (abs(x - y)/13)
return(result)
}
#recode functions
forigin_rec_alter <- function(x) {
y <- ifelse((x > 1) & (x < 8), 0, 1)
}
forigin_rec_ego <- function(x) {
y <- ifelse(x == 0, 1, x)
y <- ifelse(x == 201 | x == 101, 1, y)
y <- ifelse(x == 102 | x == 202, 0, y)
}
#create function to recode talk alter.
ftalk_rec <- function(x) {
y <- ifelse(x == 5, NA, x)
z <- 4 - y
return(z)
}
#create function for similarity score (numerical variables) (see Rsiena Manual)
feduc_sim <- function (x,y) {
result <- 1 - (abs(x - y)/12)
return(result)
}
#create distance function. I will subtract alter from ego.
feduc_distance <- function(x,y) {
z <- y - x
return(z)
}
#create categorical similarity function
fcat_sim <- function (x,y) {
result <- 1 - (abs(x - y)/1)
return(result)
}
#create function
fage_sim <- function (x,y) {
result <- 1 - (abs(x - y)/13)
return(result)
}
#age distance function. Subtract the score of x from y. So if y is lower the score is negative.
fage_dis <- function(x,y){
result <- y - x
return(result)
}
#create function.
#rl times reciprocal of n (number of waves respondent is in; more waves more chance for alter to pop up.
frl_normalize <- function(x,y) {
z <- (1/x)*y
return(z)
}
#work recode
fwork <- function(x){
ifelse(x < 3, 1, 0)}
#Function for EI index of segregation.
fEIindex <- function(x, y){(length(x[!is.na(x) & x == y]) - length(x[!is.na(x) & x!=y]))/length(x[!is.na(x)])}
#Ego Control variables.
data_long <- data_long %>%
mutate(female = gender - 1, #create female dummy
work = ifelse(belbezig < 4, 1, 0), #create paidwork
married = ifelse(burgstat == 1, 1, 0), #create married dummy
educ_orig = educ, #save old education coding
educ = feduc_ego(educ), #change educ coding
inc_ln = log(inc_ln + 1), #log transform
urban = 5 - urban, #reverse coding
cult = 5 - cult_inc, #reverse coding
eu = 5 - eu, #reverse coding
inc_diff = inc_diff - 1,
age_rec = fage_rec(age),
origin = forigin_rec_ego(origin)) #reverse coding
#alter variables
data_long <- data_long %>%
mutate(educ_orig_a.1 = educ_a.1,
educ_orig_a.2 = educ_a.2,
educ_orig_a.3 = educ_a.3,
educ_orig_a.4 = educ_a.4,
educ_orig_a.5 = educ_a.5,
across(starts_with("educ_a"), ~ feduc_alter(.x)),
across(starts_with("poltalk_a"), ~ fpoltalk(.x)),
across(starts_with("educ_orig_a"), ~ feduc_alter_orig(.x)),)
#create net data long. With alters in survey ego combinations.
net_data_long <- data_long %>%
pivot_longer(col = contains("_a."),
names_to = c("measure", "alter"),
names_pattern = "(.+)\\.(.+)",
values_to = "value"
) %>%
pivot_wider(names_from = measure,
values_from = value)
# Create similarity scores for the alter variables
net_data_long <- net_data_long %>%
group_by(nomem_encr, wave) %>%
mutate(net_size = n()) %>% #create network size variable
ungroup() %>%
mutate(g_a = g_a - 1,
orig_a = forigin_rec_alter(orig_a), #origin recode
talk_a = ftalk_rec(talk_a), #talk recode
work_a = fwork(work_a), #work recode
rln_a = frl_normalize(x = net_size, y = rl_a), #normalized rl var
educ_sim = feduc_sim(x = educ, y = educ_a), #educ sim
educ_dist = feduc_distance(x = educ, y = educ_a), #educ dist
g_sim = fcat_sim(female, g_a), #gender sim
age_sim = fage_sim(x = age_rec, y = age_a), #age sim
age_dist = fage_dis(x = age_rec, y = age_a), #age distance
orig_sim = fcat_sim(x = origin, y = orig_a), #origin sim
ave_sim = (g_sim + age_sim)/2,
rll_a = rl_a,
rl_a = ifelse(rl_a == 1, 1, 0)) #ave sim
#Network measurs
net_data_list <- net_data_long %>%
group_split(nomem_encr, wave)
# paralellize the estimation
numCores <- detectCores()
registerDoParallel(core=numCores-1)
#output list
output <- list()
output <- foreach(i = 1:length(net_data_list),
.packages = c("tidyverse"),
.combine = rbind) %dopar% {#i =1
df <- net_data_list[[i]]
#create ei index
net <- as.vector(t(as.numeric(df$educ_a)))
ego <- as.vector(t(df[1, 2]))
output[[i]] <- df %>%
mutate(ei_educ = fEIindex(x = net, y = ego))
}
#stop parralellization
stopImplicitCluster()
#set all variables to numeric and reset labels.
net_data_result <- output
#extract ego data
ego_data <- net_data_result %>%
select(nomem_encr,
wave,
educ,
educ_orig,
age,
age_rec,
female,
work,
origin,
inc_ln,
inc_diff,
burgstat,
married,
eu,
cult_inc,
cult,
ei_educ) %>%
distinct()
#extract alter data
alter_data <- net_data_result %>%
mutate(across(.cols = 2:46,
.fns = ~ as.numeric(x = .))) %>%
pivot_wider(id_cols = c("nomem_encr", "wave"),
names_from = "alter",
values_from = c(contains("_a"),
contains("_dist"),
contains("_sim")),
names_sep = ".")
#combine ego and alter data
mlsem_data <- ego_data %>%
left_join(alter_data, by = c("nomem_encr", "wave"))
#create wide file
mlsem_data <- mlsem_data %>%
pivot_wider(id_cols = "nomem_encr",
names_from = "wave",
values_from = 3:117,
names_sep = "_")
# Household ID selection
# Based on IDS that were selected using the following code.
# CAVEAT: unfortunately the random generator I used did not respond to the set.seed() function
# #select
# df <- data_long %>%
# select(nomem_encr, nohouse_encr) %>%
# distinct() %>%
# na.omit()
#
# #rename collumn into x
# names(df)[2] <- "x"
#
# #randomly select only one respondent per hh.
# df$Chosen <- 0
#
# #first set the seed so we can reproduce the outcomes.
# set.seed(50)
#
# #apply
# df[-tapply(-seq_along(df$x),df$x, sample, size=1),]$Chosen <- 1
#
# #so finally we select 6728 people.
# table(df$Chosen)
# In order to exactly replicate the findings please load the ids_analysis.rds file.
load(file = "data/data-processed/ml_sem_data/ids_analysis.rds")
#complete data
mlsem_data_compl <- mlsem_data
#selection of data
mlsem_data <- mlsem_data %>%
filter(nomem_encr %in% sample_ids$nomem_encr)
#save in list
mlsem_datafiles <- list(mlsem_data_compl,
mlsem_data)
#export data
save(mlsem_datafiles,
file = "data/data-processed/ml_sem_data/240816_lisscdn-mlsem-panel-data-cleaned.Rdata")
Copyright © 2024 Jeroense Thijmen