Repeated Risk Data
With this code we will now create a repeated risk data set. So we can
identify how often a person is in the data and in what spell he or she
is in.
Step 1: count number of drops per dyad
We first start with creation of such a count variable. This will
count the number of dyad spells.
#for every dyad, count the number of drops
nr_transition <- event_data %>%
filter(transition == -1) %>%
group_by(dyad_id) %>%
count(dyad_id) %>%
rename(number_transition = n) %>%
ungroup()
#add the number of transitions to event_data
event_data <- event_data %>%
left_join(nr_transition, by = "dyad_id")
Step 2: create spell count variable
Now we know the number of dyad spells we can start with our custom
function for each dyad. For each dyad it will count the number of spells
and for each observation of the dyad it will also note in which spell it
is currently in.
#create the variable dropped
#based on transition, when transition is -1, alter is dropped.
data_list <- event_data %>%
ungroup() %>%
mutate(dropped = ifelse(transition == -1, 1, 0)) %>%
mutate(dropped = ifelse(is.na(dropped), 0, dropped)) %>%
group_split(dyad_id)
plan(multisession, workers = 6)
data_list <- data_list %>%
future_map(.f = ~ Ftimes_dropped(.x),
.progress = T)
#stop parallel session
plan(sequential)
#reset data to event_data df
repeated_event_data <- data_list %>%
rbindlist()
#create a new repeated risk dataset
repeated_event_data <- repeated_event_data %>%
select(!c(range, censor)) %>%
mutate(
times_dropped_rec = ifelse(survey_wave == 1, 0, times_dropped_rec),
dropped = ifelse(is.na(selected), NA, dropped),
dropped = ifelse(survey_wave < entered_network, NA, dropped),
times_dropped_rec = ifelse(is.na(selected), NA, times_dropped_rec),
times_dropped_rec = ifelse(survey_wave < entered_network, NA, times_dropped_rec),
times_dropped_rec = ifelse(survey_wave == entered_network, 0, times_dropped_rec),
process_id = paste0(dyad_id, times_dropped_rec)
) %>%
filter(!is.na(dropped))
Step 3: Delete empty observations
We have created a counter for each dyad spell and also created a
process id, which identifies the dyad spell. However, right now we also
have observations in the data who are dropped and not in the data. These
cannot be at the risk of deselection. Hence we need to drop them from
the data as reocurrence is not part of study.
## filter out observations in which the dyad_id is not dropped from the network and also not selected
# in this instances the dyad is not at risk of being deselected.
repeated_event_data <- repeated_event_data %>%
mutate(filter = dropped + selected) %>%
filter(filter > 0)
Step 4: create a new censor variable.
#create censor variable for process_id
censor_process_id <- repeated_event_data %>%
select(dyad_id, process_id, survey_wave, dropped, selected) %>%
group_by(process_id) %>%
mutate(max_wave = max(survey_wave)) %>%
mutate(censor = ifelse((max_wave == survey_wave) & (selected == 1), 1, 0)) %>%
summarize(censor_process = max(censor)) %>%
ungroup()
#add censor to the data
repeated_event_data <- repeated_event_data %>%
left_join(censor_process_id, by = "process_id")
Step 5: create a new time variable
We need a new time variable that calculates for every process_id how
long it is and for every observation in which period it is of the full
time range.
#calculate for every process id how long the process is
process_length <- repeated_event_data %>%
group_by(process_id) %>%
count() %>%
rename(range = n) %>%
ungroup()
## add the process length to the repeated_data
repeated_event_data <- repeated_event_data %>%
#select(!range) %>%
left_join(process_length, by = "process_id")
#create new time variable
repeated_event_data <- repeated_event_data %>%
group_by(process_id) %>%
mutate(time = seq(from = 1, to = max(range))) %>%
ungroup()
Step 6: create relationship length variable
I also want a variable which denotes how long an alter and an ego
know each other. Two variations on this theme: first, calculate the
total amount of time between entering the network and the final drop.
Second, calculate the total number of times an alter was present in the
network.
relationship_length <- repeated_event_data %>%
select(nomem_encr, survey_wave, dyad_id, time, entered_network, end_year, dropped) %>%
filter(dropped == 0) %>%
group_by(dyad_id) %>%
mutate(length_rel_member = 1:n(),
length_rel_total = survey_wave - entered_network + 1) %>%
ungroup() %>%
select(nomem_encr, survey_wave, dyad_id, length_rel_member, length_rel_total)
repeated_event_data <- repeated_event_data %>%
left_join(relationship_length, by = c("nomem_encr", "survey_wave", "dyad_id"))
#create a selection of the data
repeated_person_level <- repeated_event_data %>%
select(process_id, dyad_id, nomem_encr, times_dropped_rec, censor_process, range) %>%
distinct() %>%
rename(times_dropped_earlier = times_dropped_rec)
Data export
Export the repeated risk data.
#save event data as 2022-09-02_risk-data.rds
save(repeated_event_data, file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-repeated-risk-data.rda")
#save person period data
save(repeated_person_level, file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-repeated-person-level.rda")
---
title: 'Repeated Risk Data Preperation'
author: "Thijmen Jeroense"
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    toc: TRUE
    toc_depth: 3
    toc_float: TRUE
    code_folding: show
    code_download: TRUE
    
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(cache = TRUE, message = FALSE, warning = FALSE, results = "asis",
                      fig.align = "center")
```

# Set up

## Packages

```{r load data library}
#library
library(tidyverse)
library(data.table)
library(igraph)#for egonetdata
library(furrr)
```

## Import 

```{r data import}
load(file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-risk-data.rds")

```

## Custom functions

```{r custom functions}

#create function in which we can distinguish for dyads different spells
#so spell of first time drop, second time, dropped etc. 
# This way we can identify different dyad spells and also add
# how often an alter is already dropped. 

Ftimes_dropped <- function(x) {
  #x = data_list[[1]]
  #select variable that we need to use (in this case dropped)
  y <- x %>%
    select(dropped)
  
  #create empty list to loop over
  y_list <- list()
  
  #initiate loop
  for (i in 1:nrow(y)) {
    #i = 9
    if (i == 1) {
      a <- y[i, ]
      y_list[[i]] <- a
    }
    else {
      a <- y[i, ] + y_list[[i - 1]]
      y_list[[i]] <- a
    }
  }
  
  #reduce to one df
  y <- y_list %>%
    rbindlist() %>%
    rename(times_dropped = dropped) %>%
    mutate(times_dropped_rec = lag(times_dropped))
  
  #add df to group split
  return(cbind(x, y))
}



```


# Repeated Risk Data

With this code we will now create a repeated risk data set. So we can identify how often a person is in the data and in what spell he or she is in. 

## Step 1: count number of drops per dyad

We first start with creation of such a count variable. This will count the number of dyad spells. 

```{r repeated risk data 1}
#for every dyad, count the number of drops 
nr_transition <- event_data %>%
  filter(transition == -1) %>%
  group_by(dyad_id) %>%
  count(dyad_id) %>%
  rename(number_transition = n) %>%
  ungroup()

#add the number of transitions to event_data
event_data <- event_data %>%
  left_join(nr_transition, by = "dyad_id")

```

## Step 2: create spell count variable

Now we know the number of dyad spells we can start with our custom function for each dyad. For each dyad it will count the number of spells and for each observation of the dyad it will also note in which spell it is currently in. 

```{r repeated risk data 1.1}

#create the variable dropped
#based on transition, when transition is -1, alter is dropped.
data_list <- event_data %>%
  ungroup() %>% 
  mutate(dropped = ifelse(transition == -1, 1, 0)) %>%
  mutate(dropped = ifelse(is.na(dropped), 0, dropped)) %>%
  group_split(dyad_id)

plan(multisession, workers = 6)

data_list <- data_list %>%
  future_map(.f = ~ Ftimes_dropped(.x),
                 .progress = T)


#stop parallel session
plan(sequential)

#reset data to event_data df
repeated_event_data <- data_list %>%
  rbindlist()

#create a new repeated risk dataset
repeated_event_data <- repeated_event_data %>%
  select(!c(range, censor)) %>%
  mutate(
    times_dropped_rec = ifelse(survey_wave == 1, 0, times_dropped_rec),
    dropped = ifelse(is.na(selected), NA, dropped),
    dropped = ifelse(survey_wave < entered_network, NA, dropped),
    times_dropped_rec = ifelse(is.na(selected), NA, times_dropped_rec),
    times_dropped_rec = ifelse(survey_wave < entered_network, NA, times_dropped_rec),
    times_dropped_rec = ifelse(survey_wave == entered_network, 0, times_dropped_rec),
    process_id = paste0(dyad_id, times_dropped_rec)
  ) %>%
  filter(!is.na(dropped))

```


## Step 3: Delete empty observations

We have created a counter for each dyad spell and also created a process id, which identifies the dyad spell. However, right now we also have observations in the data who are dropped and not in the data. These cannot be at the risk of deselection. Hence we need to drop them from the data as reocurrence is not part of study.


```{r repeated risk data 2}

## filter out observations in which the dyad_id is not dropped from the network and also not selected
# in this instances the dyad is not at risk of being deselected.
repeated_event_data <- repeated_event_data %>%
  mutate(filter = dropped + selected) %>%
  filter(filter > 0)

```

## Step 4:  create a new censor variable. 

```{r repeated risk data 3}
#create censor variable for process_id
censor_process_id <- repeated_event_data %>%
  select(dyad_id, process_id, survey_wave, dropped, selected) %>%
  group_by(process_id) %>%
  mutate(max_wave = max(survey_wave)) %>%
  mutate(censor = ifelse((max_wave == survey_wave) & (selected == 1), 1, 0)) %>%
  summarize(censor_process = max(censor)) %>%
  ungroup()

#add censor to the data
repeated_event_data <- repeated_event_data %>%
  left_join(censor_process_id, by = "process_id")

```

## Step 5: create a new time variable

We need a new time variable that calculates for every process_id how long it is and for every observation in which period it is of the full time range. 

```{r repeated risk data 4}
#calculate for every process id how long the process is
process_length <- repeated_event_data %>%
  group_by(process_id) %>%
  count() %>%
  rename(range = n) %>%
  ungroup()

## add the process length to the repeated_data
repeated_event_data <- repeated_event_data %>%
  #select(!range) %>%
  left_join(process_length, by = "process_id")

#create new time variable 
repeated_event_data <- repeated_event_data %>%
  group_by(process_id) %>%
  mutate(time = seq(from = 1, to = max(range))) %>%
  ungroup()
```


## Step 6: create relationship length variable

I also want a variable which denotes how long an alter and an ego know each other. Two variations on this theme: first, calculate the total amount of time between entering the network and the final drop. Second, calculate the total number of times an alter was present in the network.

```{r repeated risk 5}
relationship_length <- repeated_event_data %>% 
  select(nomem_encr, survey_wave, dyad_id, time, entered_network, end_year, dropped) %>%
  filter(dropped == 0) %>% 
  group_by(dyad_id) %>% 
  mutate(length_rel_member = 1:n(),
         length_rel_total = survey_wave - entered_network + 1) %>% 
  ungroup() %>% 
  select(nomem_encr, survey_wave, dyad_id, length_rel_member, length_rel_total)

repeated_event_data <- repeated_event_data %>% 
  left_join(relationship_length, by = c("nomem_encr", "survey_wave", "dyad_id"))


#create a selection of the data
repeated_person_level <- repeated_event_data %>%
  select(process_id, dyad_id, nomem_encr, times_dropped_rec, censor_process, range) %>%
  distinct() %>%
  rename(times_dropped_earlier = times_dropped_rec)

```

# Data export

Export the repeated risk data.

```{r export data}
#save event data as 2022-09-02_risk-data.rds
save(repeated_event_data, file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-repeated-risk-data.rda")

#save person period data
save(repeated_person_level, file = "datafiles/data-processed/disaggregated_data/2023-06-12_liss-repeated-person-level.rda")

```


