# Set up

## Libraries

# get packages needed
#
# Make sure every package in `packages` is installed and attached.
#
# Arguments:
#   packages : character vector of package names.
#
# Returns:
#   `packages`, invisibly. The function is called for its side effects:
#   installing the packages that are missing and attaching all of them.
#
# Based on fpackage.check (c) Jochem Tolsma
fpackage.check <- function(packages) {
  for (pkg in packages) {
    # requireNamespace() checks availability without attaching the package and
    # without the warning that require() emits when a package is missing.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, dependencies = TRUE)
    }
    # library() stops with an error if the package is still unavailable, so a
    # failed installation is not silently ignored.
    library(pkg, character.only = TRUE)
  }
  invisible(packages)
}
# packages this script relies on; installed (if needed) and attached below
packages <- c(
  "rvest",
  "httr",
  "polite",
  "tidyverse",
  "data.table"
)
fpackage.check(packages)

# Meertens scrape

## Scrape function

# Get the raw page data for one name (from the cache file if present,
# otherwise by scraping) and return its per-year values in long format.
#
# Arguments:
#   session   : session object for the host, as created with bow(). It is only
#               used when no cache file exists yet.
#   name      : the name to look up, e.g. "Elif". It is appended to the path
#               "/nvb/naam/is/".
#   file.name : path of the .rda cache file for this name. If the file exists,
#               `raw_data` is loaded from it; otherwise the page is scraped
#               and `raw_data` is saved to it.
#
# Returns:
#   A tibble with the columns n_total, year_list, value_list and names.
#   n_total and names are constant within one call; year_list and value_list
#   hold the extracted years and the extracted values.
#
# Errors:
#   Stops if the cache file does not contain `raw_data`, or if scraping
#   returned no page.
get_name_year_frequency <- function(session, name, file.name) {
  if (file.exists(file.name)) {
    # Load into a private environment: a bare load() would put every object
    # stored in the file into this function's frame, where it could overwrite
    # `name` or `session`.
    cache_env <- new.env(parent = emptyenv())
    load(file.name, envir = cache_env)
    if (!exists("raw_data", envir = cache_env, inherits = FALSE)) {
      stop("Cache file does not contain `raw_data`: ", file.name,
           call. = FALSE)
    }
    raw_data <- cache_env[["raw_data"]]
  } else {
    # set the path for the specific name's webpage
    name_path <- paste0("/nvb/naam/is/", name)

    # agree the change of path with the host (assuming the session has already
    # "bowed" for the higher-level path)
    name_session <- polite::nod(session, path = name_path)

    # get the page for this name
    name_page <- polite::scrape(name_session)
    # NOTE(review): scrape() appears to return NULL instead of a page when the
    # request is not allowed or fails -- confirm against the polite docs. Fail
    # here with a clear message rather than inside html_text().
    if (is.null(name_page)) {
      stop("No page could be scraped for name '", name, "'", call. = FALSE)
    }

    # keep both the page text and the parsed tables as raw data
    raw_data <- list(
      rvest::html_text(name_page),
      rvest::html_table(name_page)
    )

    # make sure the archive folder exists so the scraped page is not lost to a
    # failing save()
    cache_dir <- dirname(file.name)
    if (!dir.exists(cache_dir)) {
      dir.create(cache_dir, recursive = TRUE)
    }
    save(raw_data, file = file.name)
  }

  page_text <- raw_data[[1]]
  page_tables <- raw_data[[2]]

  # Take the text that follows the first "<!--" marker and collect every
  # parenthesised group in it ("." does not cross line breaks here, so each
  # line yields its own group).
  after_marker <- stringr::str_split(page_text, "<!--")[[1]][2]
  paren_groups <- stringr::str_extract_all(after_marker, "\\(.*\\)")[[1]]

  # extracting year: every 4-digit run in the first group
  year_list <- as.numeric(
    stringr::str_extract_all(paren_groups[1], "[:digit:]{4}")[[1]]
  )

  # extracting births: every run of 1 to 3 digits in the fourth group
  # NOTE(review): a run of more than three digits (or a decimal number) is
  # split into several values by this pattern -- confirm the source values
  # never exceed 999.
  value_list <- as.numeric(
    stringr::str_extract_all(paren_groups[4], "[:digit:]{1,3}")[[1]]
  )

  # extract overall frequency of the name: rows 2 and 6 of column X3 in the
  # first table, where "--" stands for zero
  # (presumably the counts for men and women -- verify against the page)
  overall_counts <- page_tables[[1]][["X3"]]
  n_m <- stringr::str_replace(overall_counts[2], "--", "0")
  n_v <- stringr::str_replace(overall_counts[6], "--", "0")
  n_total <- as.numeric(n_m) + as.numeric(n_v)

  # return all data in long format (column names are taken from the variables)
  tibble::tibble(n_total, year_list, value_list, names = name)
}

## Initialize scrape

# check permissions and introduce myself to the host
session <- bow(
  "https://www.meertens.knaw.nl/nvb/",
  user_agent = "Thijmen Jeroense, Radboud University Nijmegen",
  delay = 1
)
session

# set archive for scrape. This way we can store the data for future usage.
archive <- "data_analysis/data/data_raw/ethnic_names/bloothooft_groot_names/"

# import the list of names to look up (the file is ";"-delimited)
namelist_df <- read_delim(
  file = "data_analysis/data/data_processed/ethnic_names/bloothooft_groot_ethnicnames.csv",
  delim = ";"
)
namelist <- namelist_df %>%
  select(names)

# one (still empty) result slot per name
dfs <- vector("list", length(namelist$names))

# create progress bar
# The bar runs from 0 to the number of names. txtProgressBar() requires
# max > min, so starting at 1 would fail for a list with a single name; the
# max() guard covers an empty list in the same way.
pb <- txtProgressBar(
  min = 0, max = max(length(namelist$names), 1),
  initial = 0, char = "-",
  width = 70, style = 3
)

## Scrape

# start main loop: one iteration per name
# seq_along() gives an empty sequence for an empty name list, whereas
# 1:length(x) would run over c(1, 0) and try to scrape a name `NA`.
for (i in seq_along(namelist$names)) {
  # i = 3  (set by hand to step through a single name interactively)
  setTxtProgressBar(pb, i)

  # cache file for this name inside the archive folder
  file.name <- paste0(archive, namelist$names[i], ".rda")

  dfs[[i]] <- get_name_year_frequency(
    session = session,
    name = namelist$names[i],
    file.name = file.name
  )
}

# finish the progress bar once all names are done
close(pb)

## Combine results of scrape

# combine data: stack the per-name results into one table, then add the
# columns of the name list by matching on "names"
df_frequency_ethnic_names <- left_join(
  rbindlist(dfs),
  namelist_df,
  by = "names"
)

# Export results

# write the combined frequency table to the processed-data folder as CSV
write_csv(
  df_frequency_ethnic_names,
  file = "data_analysis/data/data_processed/meertens_scrape/bloothooft_names_frequency_18802016.csv"
)
LS0tDQp0aXRsZTogIk1lZXJ0ZW5zIHNjcmFwZTogZXRobmljIG5hbWVzIg0KYXV0aG9yOiAiVGhpam1lbiBKZXJvZW5zZSINCmRhdGU6ICJMYXN0IGNvbXBpbGVkIG9uIGByIGZvcm1hdChTeXMudGltZSgpLCAnJWQgJUIsICVZJylgIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIGNvZGVfZm9sZGluZzogc2hvdw0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCi0tLQ0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoY2FjaGUgPSBUUlVFLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSwgcmVzdWx0cyA9ICJhc2lzIiwNCiAgICAgICAgICAgICAgICAgICAgICBmaWcuYWxpZ24gPSAiY2VudGVyIikNCmBgYA0KDQojIFNldCB1cA0KDQojIyBMaWJyYXJpZXMNCmBgYHtyIGxpYnJhcmllc30NCiMgZ2V0IHBhY2thZ2VzIG5lZWRlZA0KZnBhY2thZ2UuY2hlY2sgPC0gZnVuY3Rpb24ocGFja2FnZXMpIHsgIyAoYykgSm9jaGVtIFRvbHNtYQ0KICBwYWNrYWdlX2xpc3QgIDwtIGxhcHBseShwYWNrYWdlcywgRlVOID0gZnVuY3Rpb24oeCkgew0KICAgIGlmICghcmVxdWlyZSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpKSB7DQogICAgICBpbnN0YWxsLnBhY2thZ2VzKHgsIGRlcGVuZGVuY2llcyA9IFRSVUUpDQogICAgICBsaWJyYXJ5KHgsIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkNCiAgICB9DQogIH0pDQp9DQpwYWNrYWdlcyA9IGMoInJ2ZXN0IiwgImh0dHIiLCJwb2xpdGUiLCAidGlkeXZlcnNlIiwgDQogICAgICAgICAgICAgImRhdGEudGFibGUiKQ0KZnBhY2thZ2UuY2hlY2socGFja2FnZXMpDQpgYGANCg0KDQojIE1lZXJ0ZW5zIHNjcmFwZQ0KDQojIyBTY3JhcGUgZnVuY3Rpb24NCg0KYGBge3Igc2NyYXBlIGZ1bmN0aW9ufQ0KZ2V0X25hbWVfeWVhcl9mcmVxdWVuY3kgPC0gZnVuY3Rpb24oc2Vzc2lvbiwgbmFtZSwgZmlsZS5uYW1lKSB7I25hbWUgPSAiRWxpZiINCiAgaWYgKGZpbGUuZXhpc3RzKGZpbGUubmFtZSkpIHsNCiAgICBsb2FkKGZpbGUubmFtZSkNCiAgfSBlbHNlIHsNCiAgICBuYW1lX3BhdGggPSBwYXN0ZSgiL252Yi9uYWFtL2lzLyIsIG5hbWUsc2VwPSIiKSAjIHNldCB0aGUgcGF0aCBmb3IgdGhlIHNwZWNpZmljIG5hbWUncyB3ZWJwYWdlDQogICAgDQogICAgbmFtZV9zZXNzaW9uIDwtbm9kKHNlc3Npb24sIHBhdGggPSBuYW1lX3BhdGgpICMgYWdyZWUgY2hhbmdpbmcgb2YgdGhlIHBhdGggd2l0aCB0aGUgaG9zdCAoYXNzdW1pbmcgSSBoYXZlIGFscmVhZHkgImJvd2VkIiBmb3IgdGhlIGhpZ2hlci1sZXZlbCBwYXRoKQ0KICAgIA0KICAgIG5hbWVfcGFnZSA8LSBzY3JhcGUobmFtZV9zZXNzaW9uKSAjIGdldCB0aGUgcGFnZSBmb3IgdGhpcyB5ZWFyDQogICAgDQogICAgIyBnZXQgdGhlIHRhYmxlIGRhdGEgYW5kIHRoZSBodG1sX3RleHQgdG8gZXh0cmFjdCBm
cmVxdWVuY3kgYW5kIGZyZXF1YW5jeSBwZXIgeWVhciBkYXRhDQogICAgcGFnZSA8LSBodG1sX3RleHQobmFtZV9wYWdlKQ0KICAgIHRhYmxlIDwtIGh0bWxfdGFibGUobmFtZV9wYWdlKQ0KICAgIA0KICAgICNzYXZlIHJhdyBkYXRhIGluIGEgbGlzdA0KICAgIHJhd19kYXRhIDwtIGxpc3QocGFnZSx0YWJsZSkNCiAgICANCiAgICBzYXZlKHJhd19kYXRhLCBmaWxlID0gZmlsZS5uYW1lKQ0KICB9DQoNCiNleHRyYWN0IHRoZSBkYXRhIGZyb20gdGhlIEhUTUwgY29kZSB3aXRoIHN0cmluZ3INCnBhZ2UgPC0gcmF3X2RhdGFbWzFdXQ0KdGFibGUgPC0gcmF3X2RhdGFbWzJdXQ0KDQojZXh0cmFjdCB0aGUgZGF0YSBmcm9tIHRoZSBIVE1MIGNvZGUgd2l0aCBzdHJpbmdyDQpzdWJzZXQgPC0gcGFnZSAlPiUgDQogIHN0cl9zcGxpdCgiPCEtLSIpDQoNCnN1YnNldCA8LSBzdWJzZXRbWzFdXVsyXQ0KDQpzdWJzZXQgPC0gc3Vic2V0ICU+JQ0KICBzdHJfZXh0cmFjdF9hbGwoIlxcKC4qXFwpIikNCg0KI2V4dHJhY3RpbmcgeWVhcg0KeWVhcl9saXN0IDwtIHN1YnNldFtbMV1dWzFdDQoNCnllYXJfbGlzdCA8LSB5ZWFyX2xpc3QgJT4lDQogIHN0cl9leHRyYWN0X2FsbCgiWzpkaWdpdDpdezR9IikNCg0KeWVhcl9saXN0IDwtIGFzLm51bWVyaWMoeWVhcl9saXN0W1sxXV0pDQoNCiNleHRyYWN0aW5nIGJpcnRocw0KdmFsdWVfbGlzdCA8LSBzdWJzZXRbWzFdXVs0XVtbMV1dDQoNCnZhbHVlX2xpc3QgPC0gdmFsdWVfbGlzdCAgJT4lDQogIHN0cl9leHRyYWN0X2FsbCgiWzpkaWdpdDpdezEsM30iKQ0KDQp2YWx1ZV9saXN0IDwtIGFzLm51bWVyaWModmFsdWVfbGlzdFtbMV1dKQ0KDQojZXh0cmFjdCBvdmVyYWxsIGZyZXF1ZW5jeSBvZiBuYW1lcy4gDQpuX20gPC0gdGFibGVbWzFdXVtbIlgzIl1dWzJdICU+JSANCnN0cl9yZXBsYWNlKCItLSIsIjAiKQ0KDQpuX3YgPC0gdGFibGVbWzFdXVtbIlgzIl1dWzZdICU+JSANCiAgc3RyX3JlcGxhY2UoIi0tIiwiMCIpDQoNCm5fdG90YWwgPC0gYXMubnVtZXJpYyhuX20pICsgYXMubnVtZXJpYyhuX3YpDQoNCiNzYXZlIGFsbCBkYXRhIGluIGEgbG9uZyBmb3JtYXRlDQpkZiA8LSB0aWJibGU6OnRpYmJsZShuX3RvdGFsLCB5ZWFyX2xpc3QsIHZhbHVlX2xpc3QsIG5hbWVzID0gbmFtZSkNCg0KcmV0dXJuKGRmKQ0KfQ0KYGBgDQoNCiMjIEluaXRpYWxpemUgc2NyYXBlDQoNCmBgYHtyIGluaXRhbGl6ZSBzY3JhcGUsIHJlc3VsdHM9J2hpZGUnfQ0KIyBjaGVjayBwZXJtaXNzaW9ucyBhbmQgaW50cm9kdWNlIG15c2VsZiB0byB0aGUgaG9zdA0Kc2Vzc2lvbiA8LSBib3coImh0dHBzOi8vd3d3Lm1lZXJ0ZW5zLmtuYXcubmwvbnZiLyIsIHVzZXJfYWdlbnQgPSAgIlRoaWptZW4gSmVyb2Vuc2UsIFJhZGJvdWQgVW5pdmVyc2l0eSBOaWptZWdlbiIsIGRlbGF5ID0gMSkNCnNlc3Npb24NCg0KI3NldCBhcmNoaXZlIGZvciBzY3JhcGUuIFRoaXMgd2F5IHdlIGNhbiBzdG9yZSB0aGUgZGF0YSBmb3IgZnV0dXJlIHVzYWdl
Lg0KYXJjaGl2ZSA8LSAiZGF0YV9hbmFseXNpcy9kYXRhL2RhdGFfcmF3L2V0aG5pY19uYW1lcy9ibG9vdGhvb2Z0X2dyb290X25hbWVzLyINCg0KI2V4cG9ydCB0aGUgZGF0YS4gDQpuYW1lbGlzdF9kZiA8LSByZWFkX2RlbGltKGZpbGUgPSAiZGF0YV9hbmFseXNpcy9kYXRhL2RhdGFfcHJvY2Vzc2VkL2V0aG5pY19uYW1lcy9ibG9vdGhvb2Z0X2dyb290X2V0aG5pY25hbWVzLmNzdiIsDQogICAgICAgICAgICAgICAgICAgICAgICBkZWxpbSA9ICI7IikNCm5hbWVsaXN0IDwtIG5hbWVsaXN0X2RmICU+JQ0KICBzZWxlY3QobmFtZXMpDQoNCiNlbXB0eSBkZnMNCmRmcyA8LSBsaXN0KCkNCg0KI2NyZWF0ZSBwcm9ncmVzc2Jhcg0KcGIgPC0gdHh0UHJvZ3Jlc3NCYXIobWluID0gMSwgbWF4ID0gbGVuZ3RoKG5hbWVsaXN0JG5hbWVzKSwNCiAgICAgICAgICAgICAgICAgICAgIGluaXRpYWwgPSAxLCBjaGFyID0gIi0iLA0KICAgICAgICAgICAgICAgICAgICAgd2lkdGggPSA3MCwgc3R5bGUgPSAzKQ0KDQpgYGANCg0KDQojIyBTY3JhcGUgDQpgYGB7ciBzY3JhcGUsIHJlc3VsdHM9J2hpZGUnfQ0KDQojc3RhcnQgbWFpbiBsb29wLiANCmZvcihpIGluIDE6bGVuZ3RoKG5hbWVsaXN0JG5hbWVzKSkgew0KICAjaSA9IDMNCiAgc2V0VHh0UHJvZ3Jlc3NCYXIocGIsIGkpDQogIA0KICBmaWxlLm5hbWUgIDwtIHBhc3RlMChhcmNoaXZlLCBuYW1lbGlzdCRuYW1lc1tpXSwgIi5yZGEiKQ0KICANCiAgZGZzW1tpXV0gPC0gZ2V0X25hbWVfeWVhcl9mcmVxdWVuY3koc2Vzc2lvbiA9IHNlc3Npb24sIG5hbWUgPSBuYW1lbGlzdCRuYW1lc1tpXSwgZmlsZS5uYW1lID0gZmlsZS5uYW1lKQ0KfQ0KDQpgYGANCg0KIyMgQ29tYmluZSByZXN1bHRzIG9mIHNjcmFwZQ0KDQpgYGB7ciBzY3JhcGUgcmVzdWx0c30NCiNjb21iaW5lIGRhdGENCmRmX2ZyZXF1ZW5jeV9ldGhuaWNfbmFtZXMgPC0gZGZzICU+JQ0KICByYmluZGxpc3QoKSAlPiUNCiAgbGVmdF9qb2luKG5hbWVsaXN0X2RmLCBieSA9ICJuYW1lcyIpDQoNCmBgYA0KDQoNCiMgRXhwb3J0IHJlc3VsdHMNCmBgYHtyIGV4cG9ydH0NCndyaXRlX2NzdihkZl9mcmVxdWVuY3lfZXRobmljX25hbWVzLCBmaWxlID0gImRhdGFfYW5hbHlzaXMvZGF0YS9kYXRhX3Byb2Nlc3NlZC9tZWVydGVuc19zY3JhcGUvYmxvb3Rob29mdF9uYW1lc19mcmVxdWVuY3lfMTg4MDIwMTYuY3N2IikNCmBgYA0KDQo=


Copyright © 2024 Jeroense Thijmen