Set up
Libraries
# get packages needed
fpackage.check <- function(packages) { # (c) Jochem Tolsma
lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
})
}
packages = c("haven", "NSUM","matrixStats", "parallel",
"MASS", "doParallel", "tictoc", "beepr", "dplyr", "purrr",
"readr")
fpackage.check(packages)
Import the NSUM prep files.
#import nsum input
load(file = "data_analysis/data/data_processed/nsum_input/analysis_files_selection.rds")
#extract mat and known vector
mat <- analysis_files_selection[[1]]
known_vector <- analysis_files_selection[[2]]
NSUM analysis
We will initialize the NSUM estimation. First we will set the seed to
make it repricable. Second, we set the number of iterations on 40000,
the burin at 1000, the number of retains results at 4000, the population
size, and the number of models we need to run (holdouts)
#set seed
set.seed(220914)
#------------------------------------------------------------------------------------------------
# some info to work with
iters <- 40000# number of iterations (40k)
burns <- 1000 # burnin size (1k)
retain <- 4000 # how many chains do we want to retain? (4k)
popsize <- 17407585 # population size
holdouts <- 1:length(known_vector)
#------------------------------------------------------------------------------------------------
We will use parallel computing to speed up the NSUM calculations. My
work lapop has 8 cores so we can use 7 cores for overnight computation.
There are 16 models to estimate so we need 3 runs.
# paralellize the estimation
numCores <- detectCores()
if (numCores > length(holdouts-1)) {
registerDoParallel(cores = length(holdouts))
} else {
print("Too few cores to do in one run!")
}
[1] “Too few cores to do in one run!”
# so we need to do in seperate runs
registerDoParallel(core=numCores-1)
Main analysis loop. We use the killworth start function to create
starting values. We then use these in the analysis loop. We calculate
for each respondent the average and sd over the 4000 retained chains.
These are stored in separate datafiles.
#------------------------------------------------------------------------------------------------
# fill up empty lists
kds <- list()
kdssd <- list()
data_list <- list()
tic()
foreach(i = holdouts[1:16]) %dopar% { # change the number here on own max core you want to use #i = 1
# 16 models to estimate. I use 7 cores, each core solves one per night.
# so 16/7 = 2.3 = 3 nights.
cd <- mat[, -c(i)] # take out pop of interest and make a new mat
# calculate starting values
z <- NSUM::killworth.start(cbind(cd, mat[, c(i)]), # paste the "takenout" at the END of matrix
known_vector[-c(i)], # this is the known pop, WITHOUT unknown pop
popsize) # population size
file_name_data <- paste0("data_analysis/results/nsum_output/main/data/estimates_holdout", i, ".txt") #create file.name for data
file_name_model <- paste0("data_analysis/results/nsum_output/main/model/estimates_holdout", i, ".rds") #create file.name for model
if(!file.exists(file_name_model)){#check for file, if it does not exist run the NSUM model
# run the Bayesian estimation
degree <- NSUM::nsum.mcmc(cbind(cd, mat[, c(i)]), # gets pasted at the last column
known_vector[-c(i)], # here we again take out the "holdout", or artificial "unknown" pop
popsize, model = "combined", # combined control for transmission and recall errors
indices.k = length(holdouts), # notice that "holdout" gets pasted as last column
iterations = iters, burnin = burns, size = retain, # 40k iterations, retain 4k chains
d.start = z$d.start, mu.start = z$mu.start, # starting values from simple estimator
sigma.start = z$sigma.start, NK.start = z$NK.start)
#store model_results
save(degree, file = file_name_model)
} else {
load(file_name_model)
}
#store mean and sd in df
# calculate rowmean and it's SD of the retained 4k chains
kds[[i]] <- rowMeans(degree$d.values,na.rm = TRUE) # calculate rowmean of netsize iterations: so the retained chains
kdssd[[i]] <- matrixStats::rowSds(degree$d.values) # calculate sd of 4k estimates per row: sd for those values
data_list[[i]] <- data.frame(cbind(kds[[i]],kdssd[[i]])) # combine and put in df
# Save the data, new .txt for each iteration, if something goes wrong, we can always start at prior one and combine
write.table(data_list[[i]], file = file_name_data, row.names = F) #store them in results.
}
toc() # time passed?
beep() # done!
LS0tDQp0aXRsZTogIk5TVU0gbWFpbiBhbmFseXNpcyINCmF1dGhvcjogIlRoaWptZW4gSmVyb2Vuc2UiDQpkYXRlOiAiTGFzdCBjb21waWxlZCBvbiBgciBmb3JtYXQoU3lzLnRpbWUoKSwgJyVkICVCLCAlWScpYCINCm91dHB1dDoNCiAgaHRtbF9kb2N1bWVudDoNCiAgICB0b2M6IFRSVUUNCiAgICB0b2NfZGVwdGg6IDMNCiAgICB0b2NfZmxvYXQ6IFRSVUUNCiAgICBjb2RlX2ZvbGRpbmc6IHNob3cNCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFDQotLS0NCg0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoY2FjaGUgPSBUUlVFLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSwgcmVzdWx0cyA9ICJhc2lzIiwNCiAgICAgICAgICAgICAgICAgICAgICBmaWcuYWxpZ24gPSAiY2VudGVyIikNCmBgYA0KDQoNCiMgU2V0IHVwDQoNCiMjIExpYnJhcmllcw0KDQpgYGB7ciBsaWJyYXJpZXMsIHJlc3VsdHM9J2hpZGUnfQ0KIyBnZXQgcGFja2FnZXMgbmVlZGVkDQpmcGFja2FnZS5jaGVjayA8LSBmdW5jdGlvbihwYWNrYWdlcykgeyAjIChjKSBKb2NoZW0gVG9sc21hDQogIGxhcHBseShwYWNrYWdlcywgRlVOID0gZnVuY3Rpb24oeCkgew0KICAgIGlmICghcmVxdWlyZSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpKSB7DQogICAgICBpbnN0YWxsLnBhY2thZ2VzKHgsIGRlcGVuZGVuY2llcyA9IFRSVUUpDQogICAgICBsaWJyYXJ5KHgsIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkNCiAgICB9DQogIH0pDQp9DQpwYWNrYWdlcyA9IGMoImhhdmVuIiwgIk5TVU0iLCJtYXRyaXhTdGF0cyIsICJwYXJhbGxlbCIsIA0KICAgICAgICAgICAgICJNQVNTIiwgImRvUGFyYWxsZWwiLCAidGljdG9jIiwgImJlZXByIiwgImRwbHlyIiwgInB1cnJyIiwNCiAgICAgICAgICAgICAicmVhZHIiKQ0KZnBhY2thZ2UuY2hlY2socGFja2FnZXMpDQpgYGANCg0KDQojIyBJbXBvcnQgdGhlIE5TVU0gcHJlcCBmaWxlcy4NCg0KYGBge3IgaW1wb3J0IGZpbGVzfQ0KI2ltcG9ydCBuc3VtIGlucHV0DQpsb2FkKGZpbGUgPSAiZGF0YV9hbmFseXNpcy9kYXRhL2RhdGFfcHJvY2Vzc2VkL25zdW1faW5wdXQvYW5hbHlzaXNfZmlsZXNfc2VsZWN0aW9uLnJkcyIpDQoNCiNleHRyYWN0IG1hdCBhbmQga25vd24gdmVjdG9yDQptYXQgPC0gYW5hbHlzaXNfZmlsZXNfc2VsZWN0aW9uW1sxXV0NCmtub3duX3ZlY3RvciA8LSBhbmFseXNpc19maWxlc19zZWxlY3Rpb25bWzJdXQ0KYGBgDQoNCiMgTlNVTSBhbmFseXNpcw0KDQpXZSB3aWxsIGluaXRpYWxpemUgdGhlIE5TVU0gZXN0aW1hdGlvbi4gRmlyc3Qgd2Ugd2lsbCBzZXQgdGhlIHNlZWQgdG8gbWFrZSBpdCByZXByaWNhYmxlLiBTZWNvbmQsIHdlIHNldCB0aGUgbnVtYmVyIG9mIGl0ZXJhdGlvbnMgb24gNDAwMDAsIHRoZSBidXJpbiBhdCAxMDAwLCB0aGUgbnVtYmVyIG9mIHJldGFpbnMgcmVzdWx0cyBhdCA0MDAwLCB0aGUgcG9wdWxhdGlvbiBzaXplLCBhbmQgdGhlIG51bWJlciBvZiBtb2RlbHMgd2UgbmVlZCB0byBydW4gKGhvbGRvdXRzKQ0KDQpgYGB7ciBmdW5jdGlvbn0NCiNzZXQgc2VlZA0Kc2V0LnNlZWQoMjIwOTE0KQ0KDQojLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQojIHNvbWUgaW5mbyB0byB3b3JrIHdpdGgNCml0ZXJzIDwtIDQwMDAwIyBudW1iZXIgb2YgaXRlcmF0aW9ucyAoNDBrKQ0KYnVybnMgPC0gMTAwMCAjIGJ1cm5pbiBzaXplICAoMWspDQpyZXRhaW4gPC0gNDAwMCAjIGhvdyBtYW55IGNoYWlucyBkbyB3ZSB3YW50IHRvIHJldGFpbj8gKDRrKQ0KcG9wc2l6ZSA8LSAxNzQwNzU4NSAjIHBvcHVsYXRpb24gc2l6ZQ0KaG9sZG91dHMgPC0gMTpsZW5ndGgoa25vd25fdmVjdG9yKSAgDQoNCiMtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCg0KYGBgDQoNCldlIHdpbGwgdXNlIHBhcmFsbGVsIGNvbXB1dGluZyB0byBzcGVlZCB1cCB0aGUgTlNVTSBjYWxjdWxhdGlvbnMuIE15IHdvcmsgbGFwb3AgaGFzIDggY29yZXMgc28gd2UgY2FuIHVzZSA3IGNvcmVzIGZvciBvdmVybmlnaHQgY29tcHV0YXRpb24uIFRoZXJlIGFyZSAxNiBtb2RlbHMgdG8gZXN0aW1hdGUgc28gd2UgbmVlZCAzIHJ1bnMuIA0KDQpgYGB7ciBwYXJhbGxlemllIHNlc3Npb259DQojIHBhcmFsZWxsaXplIHRoZSBlc3RpbWF0aW9uDQpudW1Db3JlcyA8LSBkZXRlY3RDb3JlcygpDQppZiAobnVtQ29yZXMgPiBsZW5ndGgoaG9sZG91dHMtMSkpIHsNCiAgcmVnaXN0ZXJEb1BhcmFsbGVsKGNvcmVzID0gbGVuZ3RoKGhvbGRvdXRzKSkNCn0gZWxzZSB7ICAgIA0KICBwcmludCgiVG9vIGZldyBjb3JlcyB0byBkbyBpbiBvbmUgcnVuISIpIA0KfSANCiMgc28gd2UgbmVlZCB0byBkbyBpbiBzZXBlcmF0ZSBydW5zDQpyZWdpc3RlckRvUGFyYWxsZWwoY29yZT1udW1Db3Jlcy0xKQ0KDQpgYGANCg0KTWFpbiBhbmFseXNpcyBsb29wLiBXZSB1c2UgdGhlIGtpbGx3b3J0aCBzdGFydCBmdW5jdGlvbiB0byBjcmVhdGUgc3RhcnRpbmcgdmFsdWVzLiBXZSB0aGVuIHVzZSB0aGVzZSBpbiB0aGUgYW5hbHlzaXMgbG9vcC4gV2UgY2FsY3VsYXRlIGZvciBlYWNoIHJlc3BvbmRlbnQgdGhlIGF2ZXJhZ2UgYW5kIHNkIG92ZXIgdGhlIDQwMDAgcmV0YWluZWQgY2hhaW5zLiBUaGVzZSBhcmUgc3RvcmVkIGluIHNlcGFyYXRlIGRhdGFmaWxlcy4gDQoNCmBgYHtyIGFuYXlzaXMgbG9vcCwgcmVzdWx0cz0naGlkZSd9DQoNCiMtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiMgZmlsbCB1cCBlbXB0eSBsaXN0cw0Ka2RzIDwtIGxpc3QoKQ0Ka2Rzc2QgPC0gbGlzdCgpDQpkYXRhX2xpc3QgPC0gbGlzdCgpDQoNCnRpYygpDQpmb3JlYWNoKGkgPSBob2xkb3V0c1sxOjE2XSkgJWRvcGFyJSB7ICMgY2hhbmdlIHRoZSBudW1iZXIgaGVyZSBvbiBvd24gbWF4IGNvcmUgeW91IHdhbnQgdG8gdXNlICNpID0gMQ0KICAjIDE2IG1vZGVscyB0byBlc3RpbWF0ZS4gSSB1c2UgNyBjb3JlcywgZWFjaCBjb3JlIHNvbHZlcyBvbmUgcGVyIG5pZ2h0Lg0KICAjIHNvIDE2LzcgPSAyLjMgPSAzIG5pZ2h0cy4NCiAgDQogIGNkIDwtIG1hdFssIC1jKGkpXSAjIHRha2Ugb3V0IHBvcCBvZiBpbnRlcmVzdCBhbmQgbWFrZSBhIG5ldyBtYXQNCiAgDQogICMgY2FsY3VsYXRlIHN0YXJ0aW5nIHZhbHVlcw0KICB6IDwtIE5TVU06OmtpbGx3b3J0aC5zdGFydChjYmluZChjZCwgbWF0WywgYyhpKV0pLCAjIHBhc3RlIHRoZSAidGFrZW5vdXQiIGF0IHRoZSBFTkQgb2YgbWF0cml4DQogICAgICAgICAgICAgICAgICAgICAgICAgICAgIGtub3duX3ZlY3RvclstYyhpKV0sICMgdGhpcyBpcyB0aGUga25vd24gcG9wLCBXSVRIT1VUIHVua25vd24gcG9wDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBvcHNpemUpICMgcG9wdWxhdGlvbiBzaXplDQogIA0KICAgIGZpbGVfbmFtZV9kYXRhIDwtIHBhc3RlMCgiZGF0YV9hbmFseXNpcy9yZXN1bHRzL25zdW1fb3V0cHV0L21haW4vZGF0YS9lc3RpbWF0ZXNfaG9sZG91dCIsIGksICIudHh0IikgI2NyZWF0ZSBmaWxlLm5hbWUgZm9yIGRhdGENCiAgICBmaWxlX25hbWVfbW9kZWwgPC0gcGFzdGUwKCJkYXRhX2FuYWx5c2lzL3Jlc3VsdHMvbnN1bV9vdXRwdXQvbWFpbi9tb2RlbC9lc3RpbWF0ZXNfaG9sZG91dCIsIGksICIucmRzIikgI2NyZWF0ZSBmaWxlLm5hbWUgZm9yIG1vZGVsDQogICAgaWYoIWZpbGUuZXhpc3RzKGZpbGVfbmFtZV9tb2RlbCkpeyNjaGVjayBmb3IgZmlsZSwgaWYgaXQgZG9lcyBub3QgZXhpc3QgcnVuIHRoZSBOU1VNIG1vZGVsDQogICAgICAjIHJ1biB0aGUgQmF5ZXNpYW4gZXN0aW1hdGlvbg0KICAgICAgZGVncmVlIDwtIE5TVU06Om5zdW0ubWNtYyhjYmluZChjZCwgbWF0WywgYyhpKV0pLCAjIGdldHMgcGFzdGVkIGF0IHRoZSBsYXN0IGNvbHVtbg0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBrbm93bl92ZWN0b3JbLWMoaSldLCAjIGhlcmUgd2UgYWdhaW4gdGFrZSBvdXQgdGhlICJob2xkb3V0Iiwgb3IgYXJ0aWZpY2lhbCAidW5rbm93biIgcG9wDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBvcHNpemUsIG1vZGVsID0gImNvbWJpbmVkIiwgIyBjb21iaW5lZCBjb250cm9sIGZvciB0cmFuc21pc3Npb24gYW5kIHJlY2FsbCBlcnJvcnMNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgaW5kaWNlcy5rID0gbGVuZ3RoKGhvbGRvdXRzKSwgIyBub3RpY2UgdGhhdCAiaG9sZG91dCIgZ2V0cyBwYXN0ZWQgYXMgbGFzdCBjb2x1bW4NCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgaXRlcmF0aW9ucyA9IGl0ZXJzLCBidXJuaW4gPSBidXJucywgc2l6ZSA9IHJldGFpbiwgIyA0MGsgaXRlcmF0aW9ucywgcmV0YWluIDRrIGNoYWlucw0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkLnN0YXJ0ID0geiRkLnN0YXJ0LCBtdS5zdGFydCA9IHokbXUuc3RhcnQsICMgc3RhcnRpbmcgdmFsdWVzIGZyb20gc2ltcGxlIGVzdGltYXRvcg0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzaWdtYS5zdGFydCA9IHokc2lnbWEuc3RhcnQsIE5LLnN0YXJ0ID0geiROSy5zdGFydCkNCiAgICAgIA0KICAgICAgI3N0b3JlIG1vZGVsX3Jlc3VsdHMNCiAgICAgIHNhdmUoZGVncmVlLCBmaWxlID0gZmlsZV9uYW1lX21vZGVsKQ0KICAgICAgDQogICAgfSBlbHNlIHsNCiAgICAgIGxvYWQoZmlsZV9uYW1lX21vZGVsKQ0KICAgIH0NCg0KICAgICNzdG9yZSBtZWFuIGFuZCBzZCBpbiBkZg0KICAgICMgY2FsY3VsYXRlIHJvd21lYW4gYW5kIGl0J3MgU0Qgb2YgdGhlIHJldGFpbmVkIDRrIGNoYWlucw0KICAgIGtkc1tbaV1dIDwtIHJvd01lYW5zKGRlZ3JlZSRkLnZhbHVlcyxuYS5ybSA9IFRSVUUpICMgY2FsY3VsYXRlIHJvd21lYW4gb2YgbmV0c2l6ZSBpdGVyYXRpb25zOiBzbyB0aGUgcmV0YWluZWQgY2hhaW5zDQogICAga2Rzc2RbW2ldXSA8LSBtYXRyaXhTdGF0czo6cm93U2RzKGRlZ3JlZSRkLnZhbHVlcykgIyBjYWxjdWxhdGUgc2Qgb2YgNGsgZXN0aW1hdGVzIHBlciByb3c6IHNkIGZvciB0aG9zZSB2YWx1ZXMNCiAgICBkYXRhX2xpc3RbW2ldXSA8LSBkYXRhLmZyYW1lKGNiaW5kKGtkc1tbaV1dLGtkc3NkW1tpXV0pKSAjIGNvbWJpbmUgYW5kIHB1dCBpbiBkZg0KICAgIA0KICAgICMgU2F2ZSB0aGUgZGF0YSwgbmV3IC50eHQgZm9yIGVhY2ggaXRlcmF0aW9uLCBpZiBzb21ldGhpbmcgZ29lcyB3cm9uZywgd2UgY2FuIGFsd2F5cyBzdGFydCBhdCBwcmlvciBvbmUgYW5kIGNvbWJpbmUNCiAgICB3cml0ZS50YWJsZShkYXRhX2xpc3RbW2ldXSwgZmlsZSA9IGZpbGVfbmFtZV9kYXRhLCByb3cubmFtZXMgPSBGKSAjc3RvcmUgdGhlbSBpbiByZXN1bHRzLg0KfQ0KDQp0b2MoKSAjIHRpbWUgcGFzc2VkPw0KYmVlcCgpICMgZG9uZSENCmBgYA0KDQoNCg0K