Set up
Libraries
# Attach every package named in `packages`. A package that cannot be loaded
# is installed (with its dependencies) and then attached.
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  attach_or_install <- function(pkg) {
    # require() returns FALSE instead of erroring when the package is missing
    loaded <- require(pkg, character.only = TRUE)
    if (!loaded) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }
  package_list <- lapply(packages, FUN = attach_or_install)
}
# Packages used by this script; attach them, installing any that are missing.
packages <- c("rvest", "httr", "polite", "tidyverse", "data.table")
fpackage.check(packages)
Meertens scrape
Scrape function
# Fetch (or load from cache) the raw page data for one first name and return
# its per-year values together with the overall frequency of the name.
#
# Arguments:
#   session   - polite session created with bow() for the host; only used
#               when no cache file exists yet.
#   name      - first name to look up; it is appended to "/nvb/naam/is/".
#   file.name - path of the .rda cache file. If the file exists the raw data
#               is loaded from it, otherwise the page is scraped and the raw
#               data is saved to this path.
#
# Returns a tibble in long format with the columns n_total, year_list,
# value_list and names.
get_name_year_frequency <- function(session, name, file.name) { # name = "Elif"
  if (file.exists(file.name)) {
    # restores the object `raw_data` saved by an earlier run
    load(file.name)
  } else {
    # set the path for the specific name's webpage
    # NOTE(review): `name` is not URL-encoded here - confirm that names with
    # special characters resolve correctly.
    name_path <- paste0("/nvb/naam/is/", name)

    # agree changing of the path with the host (assuming the higher-level
    # path has already been "bowed" to)
    name_session <- polite::nod(session, path = name_path)

    # get the page for this name
    name_page <- polite::scrape(name_session)

    # keep the page text and the html tables; the per-year data is extracted
    # from the text, the overall frequency from the first table
    page <- rvest::html_text(name_page)
    table <- rvest::html_table(name_page)

    # save raw data in a list so the page does not have to be requested again
    raw_data <- list(page, table)
    save(raw_data, file = file.name)
  }

  page <- raw_data[[1]]
  table <- raw_data[[2]]

  # everything after the first "<!--" marker, reduced to its parenthesised
  # parts ("." does not match a newline, so at most one match per line)
  after_marker <- stringr::str_split(page, "<!--")[[1]][2]
  bracketed <- stringr::str_extract_all(after_marker, "\\(.*\\)")[[1]]

  # extracting year: four-digit numbers in the first parenthesised part
  year_list <- as.numeric(
    stringr::str_extract_all(bracketed[1], "[:digit:]{4}")[[1]]
  )

  # extracting births: whole digit runs in the fourth parenthesised part.
  # "+" keeps a count such as 1234 as one number instead of splitting it
  # into 123 and 4, which would misalign values and years.
  value_list <- as.numeric(
    stringr::str_extract_all(bracketed[4], "[:digit:]+")[[1]]
  )

  # extract overall frequency of the name; "--" stands for no occurrences.
  # presumably rows 2 and 6 of column X3 are the totals for men and women -
  # TODO confirm against the page layout
  frequency_column <- table[[1]][["X3"]]
  n_m <- stringr::str_replace(frequency_column[2], "--", "0")
  n_v <- stringr::str_replace(frequency_column[6], "--", "0")
  n_total <- as.numeric(n_m) + as.numeric(n_v)

  # return all data in a long format
  tibble::tibble(n_total, year_list, value_list, names = name)
}
Initialize scrape
# check permissions and introduce myself to the host
session <- bow(
  "https://www.meertens.knaw.nl/nvb/",
  user_agent = "Thijmen Jeroense, Radboud University Nijmegen",
  delay = 1
)
session

# set archive for scrape. This way we can store the raw data for future usage.
archive <- "data_analysis/data/data_raw/ethnic_names/bloothooft_groot_names/"

# import the list of names to scrape.
namelist_df <- read_delim(
  file = "data_analysis/data/data_processed/ethnic_names/bloothooft_groot_ethnicnames.csv",
  delim = ";"
)
namelist <- namelist_df %>%
  select(names)

# empty list that will hold one data frame per name
dfs <- list()

# create progress bar. It starts at 0 because txtProgressBar() requires
# max > min, which a start value of 1 violates when there is only one name;
# max is kept at 1 or more for the same reason.
pb <- txtProgressBar(
  min = 0,
  max = max(length(namelist$names), 1L),
  initial = 0,
  char = "-",
  width = 70,
  style = 3
)
Scrape
# start main loop: one request (or cache read) per name.
# seq_along() yields no iterations for an empty name list, whereas
# 1:length(x) would run with i = 1 and i = 0.
for (i in seq_along(namelist$names)) {
  # i = 3
  setTxtProgressBar(pb, i)

  # each name gets its own cache file inside the archive folder
  file.name <- paste0(archive, namelist$names[i], ".rda")

  dfs[[i]] <- get_name_year_frequency(
    session = session,
    name = namelist$names[i],
    file.name = file.name
  )
}
# finish the progress bar output
close(pb)
Combine results of scrape
# combine data: stack the per-name results and join the columns of
# namelist_df onto them by name
df_frequency_ethnic_names <- left_join(
  rbindlist(dfs),
  namelist_df,
  by = "names"
)
Export results
# write the combined frequencies to a csv file
write_csv(
  x = df_frequency_ethnic_names,
  file = "data_analysis/data/data_processed/meertens_scrape/bloothooft_names_frequency_18802016.csv"
)
LS0tDQp0aXRsZTogIk1lZXJ0ZW5zIHNjcmFwZTogZXRobmljIG5hbWVzIg0KYXV0aG9yOiAiVGhpam1lbiBKZXJvZW5zZSINCmRhdGU6ICJMYXN0IGNvbXBpbGVkIG9uIGByIGZvcm1hdChTeXMudGltZSgpLCAnJWQgJUIsICVZJylgIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIGNvZGVfZm9sZGluZzogc2hvdw0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCi0tLQ0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoY2FjaGUgPSBUUlVFLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSwgcmVzdWx0cyA9ICJhc2lzIiwNCiAgICAgICAgICAgICAgICAgICAgICBmaWcuYWxpZ24gPSAiY2VudGVyIikNCmBgYA0KDQojIFNldCB1cA0KDQojIyBMaWJyYXJpZXMNCmBgYHtyIGxpYnJhcmllc30NCiMgZ2V0IHBhY2thZ2VzIG5lZWRlZA0KZnBhY2thZ2UuY2hlY2sgPC0gZnVuY3Rpb24ocGFja2FnZXMpIHsgIyAoYykgSm9jaGVtIFRvbHNtYQ0KICBwYWNrYWdlX2xpc3QgIDwtIGxhcHBseShwYWNrYWdlcywgRlVOID0gZnVuY3Rpb24oeCkgew0KICAgIGlmICghcmVxdWlyZSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpKSB7DQogICAgICBpbnN0YWxsLnBhY2thZ2VzKHgsIGRlcGVuZGVuY2llcyA9IFRSVUUpDQogICAgICBsaWJyYXJ5KHgsIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkNCiAgICB9DQogIH0pDQp9DQpwYWNrYWdlcyA9IGMoInJ2ZXN0IiwgImh0dHIiLCJwb2xpdGUiLCAidGlkeXZlcnNlIiwgDQogICAgICAgICAgICAgImRhdGEudGFibGUiKQ0KZnBhY2thZ2UuY2hlY2socGFja2FnZXMpDQpgYGANCg0KDQojIE1lZXJ0ZW5zIHNjcmFwZQ0KDQojIyBTY3JhcGUgZnVuY3Rpb24NCg0KYGBge3Igc2NyYXBlIGZ1bmN0aW9ufQ0KZ2V0X25hbWVfeWVhcl9mcmVxdWVuY3kgPC0gZnVuY3Rpb24oc2Vzc2lvbiwgbmFtZSwgZmlsZS5uYW1lKSB7I25hbWUgPSAiRWxpZiINCiAgaWYgKGZpbGUuZXhpc3RzKGZpbGUubmFtZSkpIHsNCiAgICBsb2FkKGZpbGUubmFtZSkNCiAgfSBlbHNlIHsNCiAgICBuYW1lX3BhdGggPSBwYXN0ZSgiL252Yi9uYWFtL2lzLyIsIG5hbWUsc2VwPSIiKSAjIHNldCB0aGUgcGF0aCBmb3IgdGhlIHNwZWNpZmljIG5hbWUncyB3ZWJwYWdlDQogICAgDQogICAgbmFtZV9zZXNzaW9uIDwtbm9kKHNlc3Npb24sIHBhdGggPSBuYW1lX3BhdGgpICMgYWdyZWUgY2hhbmdpbmcgb2YgdGhlIHBhdGggd2l0aCB0aGUgaG9zdCAoYXNzdW1pbmcgSSBoYXZlIGFscmVhZHkgImJvd2VkIiBmb3IgdGhlIGhpZ2hlci1sZXZlbCBwYXRoKQ0KICAgIA0KICAgIG5hbWVfcGFnZSA8LSBzY3JhcGUobmFtZV9zZXNzaW9uKSAjIGdldCB0aGUgcGFnZSBmb3IgdGhpcyB5ZWFyDQogICAgDQogICAgIyBnZXQgdGhlIHRhYmxlIGRhdGEgYW5kIHRoZSBodG1sX3RleHQgdG8gZXh0cmFjdCBm
cmVxdWVuY3kgYW5kIGZyZXF1YW5jeSBwZXIgeWVhciBkYXRhDQogICAgcGFnZSA8LSBodG1sX3RleHQobmFtZV9wYWdlKQ0KICAgIHRhYmxlIDwtIGh0bWxfdGFibGUobmFtZV9wYWdlKQ0KICAgIA0KICAgICNzYXZlIHJhdyBkYXRhIGluIGEgbGlzdA0KICAgIHJhd19kYXRhIDwtIGxpc3QocGFnZSx0YWJsZSkNCiAgICANCiAgICBzYXZlKHJhd19kYXRhLCBmaWxlID0gZmlsZS5uYW1lKQ0KICB9DQoNCiNleHRyYWN0IHRoZSBkYXRhIGZyb20gdGhlIEhUTUwgY29kZSB3aXRoIHN0cmluZ3INCnBhZ2UgPC0gcmF3X2RhdGFbWzFdXQ0KdGFibGUgPC0gcmF3X2RhdGFbWzJdXQ0KDQojZXh0cmFjdCB0aGUgZGF0YSBmcm9tIHRoZSBIVE1MIGNvZGUgd2l0aCBzdHJpbmdyDQpzdWJzZXQgPC0gcGFnZSAlPiUgDQogIHN0cl9zcGxpdCgiPCEtLSIpDQoNCnN1YnNldCA8LSBzdWJzZXRbWzFdXVsyXQ0KDQpzdWJzZXQgPC0gc3Vic2V0ICU+JQ0KICBzdHJfZXh0cmFjdF9hbGwoIlxcKC4qXFwpIikNCg0KI2V4dHJhY3RpbmcgeWVhcg0KeWVhcl9saXN0IDwtIHN1YnNldFtbMV1dWzFdDQoNCnllYXJfbGlzdCA8LSB5ZWFyX2xpc3QgJT4lDQogIHN0cl9leHRyYWN0X2FsbCgiWzpkaWdpdDpdezR9IikNCg0KeWVhcl9saXN0IDwtIGFzLm51bWVyaWMoeWVhcl9saXN0W1sxXV0pDQoNCiNleHRyYWN0aW5nIGJpcnRocw0KdmFsdWVfbGlzdCA8LSBzdWJzZXRbWzFdXVs0XVtbMV1dDQoNCnZhbHVlX2xpc3QgPC0gdmFsdWVfbGlzdCAgJT4lDQogIHN0cl9leHRyYWN0X2FsbCgiWzpkaWdpdDpdezEsM30iKQ0KDQp2YWx1ZV9saXN0IDwtIGFzLm51bWVyaWModmFsdWVfbGlzdFtbMV1dKQ0KDQojZXh0cmFjdCBvdmVyYWxsIGZyZXF1ZW5jeSBvZiBuYW1lcy4gDQpuX20gPC0gdGFibGVbWzFdXVtbIlgzIl1dWzJdICU+JSANCnN0cl9yZXBsYWNlKCItLSIsIjAiKQ0KDQpuX3YgPC0gdGFibGVbWzFdXVtbIlgzIl1dWzZdICU+JSANCiAgc3RyX3JlcGxhY2UoIi0tIiwiMCIpDQoNCm5fdG90YWwgPC0gYXMubnVtZXJpYyhuX20pICsgYXMubnVtZXJpYyhuX3YpDQoNCiNzYXZlIGFsbCBkYXRhIGluIGEgbG9uZyBmb3JtYXRlDQpkZiA8LSB0aWJibGU6OnRpYmJsZShuX3RvdGFsLCB5ZWFyX2xpc3QsIHZhbHVlX2xpc3QsIG5hbWVzID0gbmFtZSkNCg0KcmV0dXJuKGRmKQ0KfQ0KYGBgDQoNCiMjIEluaXRpYWxpemUgc2NyYXBlDQoNCmBgYHtyIGluaXRhbGl6ZSBzY3JhcGUsIHJlc3VsdHM9J2hpZGUnfQ0KIyBjaGVjayBwZXJtaXNzaW9ucyBhbmQgaW50cm9kdWNlIG15c2VsZiB0byB0aGUgaG9zdA0Kc2Vzc2lvbiA8LSBib3coImh0dHBzOi8vd3d3Lm1lZXJ0ZW5zLmtuYXcubmwvbnZiLyIsIHVzZXJfYWdlbnQgPSAgIlRoaWptZW4gSmVyb2Vuc2UsIFJhZGJvdWQgVW5pdmVyc2l0eSBOaWptZWdlbiIsIGRlbGF5ID0gMSkNCnNlc3Npb24NCg0KI3NldCBhcmNoaXZlIGZvciBzY3JhcGUuIFRoaXMgd2F5IHdlIGNhbiBzdG9yZSB0aGUgZGF0YSBmb3IgZnV0dXJlIHVzYWdl
Lg0KYXJjaGl2ZSA8LSAiZGF0YV9hbmFseXNpcy9kYXRhL2RhdGFfcmF3L2V0aG5pY19uYW1lcy9ibG9vdGhvb2Z0X2dyb290X25hbWVzLyINCg0KI2V4cG9ydCB0aGUgZGF0YS4gDQpuYW1lbGlzdF9kZiA8LSByZWFkX2RlbGltKGZpbGUgPSAiZGF0YV9hbmFseXNpcy9kYXRhL2RhdGFfcHJvY2Vzc2VkL2V0aG5pY19uYW1lcy9ibG9vdGhvb2Z0X2dyb290X2V0aG5pY25hbWVzLmNzdiIsDQogICAgICAgICAgICAgICAgICAgICAgICBkZWxpbSA9ICI7IikNCm5hbWVsaXN0IDwtIG5hbWVsaXN0X2RmICU+JQ0KICBzZWxlY3QobmFtZXMpDQoNCiNlbXB0eSBkZnMNCmRmcyA8LSBsaXN0KCkNCg0KI2NyZWF0ZSBwcm9ncmVzc2Jhcg0KcGIgPC0gdHh0UHJvZ3Jlc3NCYXIobWluID0gMSwgbWF4ID0gbGVuZ3RoKG5hbWVsaXN0JG5hbWVzKSwNCiAgICAgICAgICAgICAgICAgICAgIGluaXRpYWwgPSAxLCBjaGFyID0gIi0iLA0KICAgICAgICAgICAgICAgICAgICAgd2lkdGggPSA3MCwgc3R5bGUgPSAzKQ0KDQpgYGANCg0KDQojIyBTY3JhcGUgDQpgYGB7ciBzY3JhcGUsIHJlc3VsdHM9J2hpZGUnfQ0KDQojc3RhcnQgbWFpbiBsb29wLiANCmZvcihpIGluIDE6bGVuZ3RoKG5hbWVsaXN0JG5hbWVzKSkgew0KICAjaSA9IDMNCiAgc2V0VHh0UHJvZ3Jlc3NCYXIocGIsIGkpDQogIA0KICBmaWxlLm5hbWUgIDwtIHBhc3RlMChhcmNoaXZlLCBuYW1lbGlzdCRuYW1lc1tpXSwgIi5yZGEiKQ0KICANCiAgZGZzW1tpXV0gPC0gZ2V0X25hbWVfeWVhcl9mcmVxdWVuY3koc2Vzc2lvbiA9IHNlc3Npb24sIG5hbWUgPSBuYW1lbGlzdCRuYW1lc1tpXSwgZmlsZS5uYW1lID0gZmlsZS5uYW1lKQ0KfQ0KDQpgYGANCg0KIyMgQ29tYmluZSByZXN1bHRzIG9mIHNjcmFwZQ0KDQpgYGB7ciBzY3JhcGUgcmVzdWx0c30NCiNjb21iaW5lIGRhdGENCmRmX2ZyZXF1ZW5jeV9ldGhuaWNfbmFtZXMgPC0gZGZzICU+JQ0KICByYmluZGxpc3QoKSAlPiUNCiAgbGVmdF9qb2luKG5hbWVsaXN0X2RmLCBieSA9ICJuYW1lcyIpDQoNCmBgYA0KDQoNCiMgRXhwb3J0IHJlc3VsdHMNCmBgYHtyIGV4cG9ydH0NCndyaXRlX2NzdihkZl9mcmVxdWVuY3lfZXRobmljX25hbWVzLCBmaWxlID0gImRhdGFfYW5hbHlzaXMvZGF0YS9kYXRhX3Byb2Nlc3NlZC9tZWVydGVuc19zY3JhcGUvYmxvb3Rob29mdF9uYW1lc19mcmVxdWVuY3lfMTg4MDIwMTYuY3N2IikNCmBgYA0KDQo=
Copyright © 2024 Jeroense Thijmen