Load libraries that we need for the preparation of the data.
#library
library(tidyverse)
library(data.table)
Import the merged LISS core files data.
load(file = "datafiles/data-processed/common_data/0623_v5_liss_merged_core_file.rds")
An overview of the custom function I use in the preparation of the data.
## Person-Level Person-Period Converter Function
PLPP <- function(data, id, period, event, direction = c("period", "level")) {
## Data Checking and Verification Steps
stopifnot(is.matrix(data) || is.data.frame(data))
stopifnot(c(id, period, event) %in% c(colnames(data), 1:ncol(data)))
if (any(is.na(data[, c(id, period, event)]))) {
stop("PLPP cannot currently handle missing data in the id, period, or event variables")
}
## Do the conversion
switch(match.arg(direction),
period = {
index <- rep(1:nrow(data), data[, period])
idmax <- cumsum(data[, period])
reve <- !data[, event]
dat <- data[index, ]
dat[, period] <- ave(dat[, period], dat[, id], FUN = seq_along)
dat[, event] <- 0
dat[idmax, event] <- reve},
level = {
tmp <- cbind(data[, c(period, id)], i = 1:nrow(data))
index <- as.vector(by(tmp, tmp[, id],
FUN = function(x) x[which.max(x[, period]), "i"]))
dat <- data[index, ]
dat[, event] <- as.integer(!dat[, event])
})
rownames(dat) <- NULL
return(dat)
}
#education recode function
func1 <- function(x) {
x2 <- ifelse(x == 1, 6, x)
x3 <- ifelse(x == 2, 10, x2)
x4 <- ifelse(x == 3, 11.5, x3)
x5 <- ifelse(x == 4, 10.5, x4)
x6 <- ifelse(x == 5, 15, x5)
x7 <- ifelse(x == 6, 16, x6)
x8 <- ifelse(x == 7, NA, x7)
x9 <- ifelse(x == 8, 4, x8)
x10 <- ifelse(x == 9, 0, x9)
return(x10)
}
#age recode
fage_rec <- function (x) {
y <- ifelse(x < 16, 1, x)
y <- ifelse(x > 15 & x < 21, 2, y)
y <- ifelse(x > 20 & x < 26, 3, y)
y <- ifelse(x > 25 & x < 31, 4, y)
y <- ifelse(x > 30 & x < 36, 5, y)
y <- ifelse(x > 35 & x < 41, 6, y)
y <- ifelse(x > 40 & x < 46, 7, y)
y <- ifelse(x > 45 & x < 51, 8, y)
y <- ifelse(x > 50 & x < 56, 9, y)
y <- ifelse(x > 55 & x < 61, 10, y)
y <- ifelse(x > 60 & x < 66, 11, y)
y <- ifelse(x > 65 & x < 71, 12, y)
y <- ifelse(x > 70, 13, y)
return(y)
}
#similarity functions
#education
feduc_sim <- function (x,y) {
result <- 1 - (abs(x - y)/16)
return(result)
}
#categorical similarity function
fcat_sim <- function (x,y) {
result <- 1 - (abs(x - y)/1)
return(result)
}
#age
fage_sim <- function (x,y) {
result <- 1 - (abs(x - y)/13)
return(result)
}
First start with creation of event file. Select the alter ids from the data and reshape the file so we can identify when a dyad is selected.
#select the alter data from the liss long file
event_data <- liss_long %>%
select(nomem_encr, alter_id_1:alter_id_5, survey_wave) %>%
mutate(survey_wave = as.numeric(survey_wave))
#create a dyad id variable.
event_data <- event_data %>%
pivot_longer(cols = alter_id_1:alter_id_5,
names_to = "name",
values_to = "alter_id") %>%
mutate(
alter_id = ifelse(alter_id == -9, NA, alter_id),
#set alter id to NA if -9
dyad_id = paste0(nomem_encr, alter_id),
#create new dyad id with paste.
dyad_id = ifelse(is.na(alter_id), NA, dyad_id)
) #if alter id is na dyad id na.
#create a selection variable.
event_data <- event_data %>%
select(nomem_encr, dyad_id, alter_id, survey_wave) %>%
arrange(dyad_id, survey_wave) %>% # sort on dyad id and surveywave.
mutate(selected = ifelse(!is.na(dyad_id), 1, 0)) %>% #if not na, then dyad id is selected
filter(!is.na(dyad_id)) #filter our missing dyad ids.
This should not be possible but is a fault of the data collection.
#to be safe, delete complete ego networks where this happens.
event_data <- event_data %>%
group_by(nomem_encr, survey_wave, dyad_id) %>%
add_count(dyad_id) %>%
ungroup() %>%
group_by(nomem_encr) %>%
mutate(duplicates_network_ego = max(n)) %>%
filter(duplicates_network_ego == 1) %>%
ungroup()
#reshape to wide file
event_data <- event_data %>%
pivot_wider(names_from = survey_wave,
values_from = selected)
#transform variables
#rename the selection variables
event_data <- event_data %>%
rename(selected_1 = '1',
selected_2 = '2',
selected_3 = '3',
selected_4 = '4',
selected_5 = '5',
selected_6 = '6',
selected_7 = '7',
selected_8 = '8',
selected_9 = '9',
selected_10 = '10',
selected_11 = '11')
#recode so NA == 0
recoded <- event_data %>%
select(starts_with("selected")) %>%
map_df(.f = ~ ifelse(is.na(.), 0, .))
#add the recoded collumns to the tibble
event_data <- event_data %>%
select(1:2) %>%
cbind(recoded)
First we need to identify when a respondent has participated in the survey.
#create a long file.
event_data <- event_data %>%
pivot_longer(cols = 3:13,
names_to = c("variables", "survey_wave"),
values_to = "selected",
names_sep = "_") %>%
mutate(survey_wave = as.numeric(survey_wave)) %>%
arrange(nomem_encr, dyad_id, survey_wave)
#code what the possible end date is for each respondent (as alters are nested in respondents).
#First step is to identify in which rounds there are no valid responses
no_participation <- liss_long %>%
select(nomem_encr, survey_wave, leisure_part) %>%
mutate(noparticipation = ifelse(is.na(leisure_part), 1, 0),
survey_wave = as.numeric(survey_wave)) %>%
select(nomem_encr, survey_wave, noparticipation)
Second, we can identify the start of the respondent spell and the end of the respondent spell. Then we know whether someone can be selected or dropped at a given time.
#for every respondent code when the enter and leave the data.
ego_start_end_year <- no_participation %>%
filter(noparticipation == 0) %>%
group_by(nomem_encr) %>%
mutate(survey_wave = as.numeric(survey_wave)) %>%
mutate(start_year = min(survey_wave), #set start year of nomem_encr
end_year = max(survey_wave)) %>% #set end year of nomem_encr
ungroup() %>%
select(nomem_encr, end_year, start_year) %>% #keep selection of variables.
distinct() #keep unique observations.
#add start and end year to the event data.
event_data <- event_data %>%
left_join(ego_start_end_year, by = "nomem_encr")
#add start and end year to the event data.
event_data <- event_data %>%
left_join(no_participation, by = c("nomem_encr","survey_wave"))
#set selected to NA if respondent is not in the data
event_data <- event_data %>%
mutate(selected = ifelse(survey_wave > end_year, NA, selected),
selected = ifelse(survey_wave < start_year, NA, selected),
selected = ifelse(noparticipation == 1, NA, selected))
#calculate for each alter what the starting wave is.
#So when is he/she first at risk to be deselected
entered_network <- event_data %>%
group_by(dyad_id) %>%
filter(selected == 1) %>%
mutate(entered_network = min(survey_wave)) %>% #year dyad entered network.
select(dyad_id, entered_network) %>%
ungroup() %>%
distinct()
#add entered network variable to the event data.
event_data <- event_data %>%
left_join(entered_network, by = "dyad_id")
With this information we can identify the first and last drop of a dyad but also when the last time is that they are selected.
#drop when selected is missing and smaller then endyear and bigger than startyear
event_data <- event_data %>%
filter((survey_wave <= end_year) & (survey_wave >= start_year)) %>%
filter(!is.na(selected))
#calculate for each alter the first year in which he or she is dropped from the network
first_drop <- event_data %>%
group_by(dyad_id) %>%
mutate(transition = selected - lag(selected), #create transition variable. Just a lag diff.
dropped = ifelse(transition == -1, 1, 0)) %>% #use transition to identify drop. (1 to 0)
filter(dropped == 1) %>% #select observations that are dropped.
mutate(first_drop = min(survey_wave)) %>% #first year dropped is first drop.
ungroup() %>%
select(dyad_id, first_drop) %>% #select first drop variables.
distinct()
#add first drop data to event data.
event_data <- event_data %>%
left_join(first_drop, by = "dyad_id")
#calculate for each alter the final time they are dropped from the network.
last_drop <- event_data %>%
group_by(dyad_id) %>%
mutate(transition = selected - lag(selected),
dropped = ifelse(transition == -1, 1, 0)) %>%
filter(dropped == 1) %>%
mutate(last_drop = max(survey_wave)) %>% #last time they are are dropped from the network.
ungroup() %>%
select(dyad_id, last_drop) %>%
distinct()
#add last drop data to event data.
event_data <- event_data %>%
left_join(last_drop, by = "dyad_id")
#calculate for each alter the final time they are selected.
final_selected <- event_data %>%
group_by(dyad_id) %>%
filter(selected == 1) %>%
mutate(final_selected = max(survey_wave)) %>%
ungroup() %>%
select(dyad_id, final_selected) %>%
distinct()
event_data <- event_data %>%
left_join(final_selected, by = "dyad_id")
Compute for every dyad when they reenter the data. Also create censored variable and a time variable which describes the range between time of entering data and time of final drop.
#calculate for each alter when they reenter the network.
event_data <- event_data %>%
group_by(dyad_id) %>%
mutate(transition = selected - lag(selected),
re_entrance = ifelse((transition == 1) & (survey_wave > entered_network), 1, 0)) %>%
ungroup()
#create censor variable
event_data <- event_data %>%
mutate(censor = ifelse(final_selected == end_year, 1, 0))
#create new time variable. What is the time of final deselection after entering the network.
event_data <- event_data %>%
group_by(dyad_id) %>%
mutate(range = ifelse(censor == 0,
(max(last_drop) - entered_network) + 1,
(final_selected - entered_network) + 1)) %>%
ungroup()
#create person level data, not repeated risk
person_level <- event_data %>%
select(dyad_id, nomem_encr, range, censor) %>%
distinct()
#person period
person_period <- PLPP(data = as.data.frame(person_level),
id = "dyad_id",
period = "range",
event = "censor",
direction = "period")
Export and save the risk data and the person_period data.
#save event data as 2022-07-01_risk-data.rds
save(event_data, file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-risk-data.rds")
#save person period data
save(person_period, file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-person_period.rds")
#clean global environment
rm(list=ls()[! ls() %in% c("event_data", "liss_long", "liss_wide",
"person_level", "person_period")])
#save the data.
save.image("datafiles/data-processed/disaggregated_data/2023-06-12_liss_event_data.rds")
Copyright © 2023 Jeroense Thijmen