Set up

Libraries

# get packages needed
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  package_list  <- lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      install.packages(x, dependencies = TRUE)
      library(x, character.only = TRUE)
    }
  })
}
packages = c("rvest", "httr","polite", "tidyverse", 
             "data.table")
fpackage.check(packages)

Scrape

Function

Scrape information from name page: yearly birth frequency and total frequency.

get_name_year_frequency <- function(session, name, file.name) {#name = "Lisa"
  if (file.exists(file.name)) {
    load(file.name)
  } else {
    name_path = paste("/nvb/naam/is/", name,sep="") # set the path for the specific name's webpage
    
    name_session <-nod(session, path = name_path) # agree changing of the path with the host (assuming I have already "bowed" for the higher-level path)
    
    name_page <- scrape(name_session) # get the page for this year
    
    # get the table data and the html_text to extract frequency and frequancy per year data
    page <- html_text(name_page)
    table <- html_table(name_page)
    
    #save raw data in a list
    raw_data <- list(page,table)
    
    save(raw_data, file = file.name)
  }
  
  #extract the data from the HTML code with stringr
  page <- raw_data[[1]]
  table <- raw_data[[2]]
  
  #extract the data from the HTML code with stringr
  subset <- page %>% 
    str_split("<!--")
  
  subset <- subset[[1]][2]
  
  subset <- subset %>%
    str_extract_all("\\(.*\\)")
  
  #extracting year
  year_list <- subset[[1]][1]
  
  year_list <- year_list %>%
    str_extract_all("[:digit:]{4}")
  
  year_list <- as.numeric(year_list[[1]])
  
  #extracting births
  value_list <- subset[[1]][4][[1]]
  
  value_list <- value_list  %>%
    str_extract_all("[:digit:]{1,5}")
  
  value_list <- as.numeric(value_list[[1]])
  
  #extract overall frequency of names. 
  n_m <- table[[1]][["X3"]][2] %>% 
    str_replace("--","0")
  
  n_v <- table[[1]][["X3"]][6] %>% 
    str_replace("--","0")
  
  n_total <- as.numeric(n_m) + as.numeric(n_v)
  
  #save all data in a long formate
  df <- tibble::tibble(n_total, year_list, value_list, names = name)
  
  return(df)
}

Initialize scrape

# check permissions and introduce myself to the host
session <- bow("https://www.meertens.knaw.nl/nvb/", user_agent =  "Thijmen Jeroense, Radboud University Nijmegen", delay = 1)
session

#set archive for scrape. This way we can store the data for future usage.
archive <- "data_analysis/data/data_raw/meertens_name_V2//"

#import the data. 
namelist_df <- read_delim(file = "data_analysis/data/data_processed/meertens_scrape/name_numbers.csv",
                          delim = ",")
namelist <- namelist_df %>%
  rename(names = name) %>%
  select(names) %>% 
  distinct()

#empty dfs
dfs <- list()

#create progressbar
pb <- txtProgressBar(min = 1, max = length(namelist$names),
                     initial = 1, char = "-",
                     width = 70, style = 3)

Scrape the page

#start main loop. 
for(i in 1:length(namelist$names)) {
  #i = 10
  setTxtProgressBar(pb, i)
  
  file.name  <- paste0(archive, namelist$names[i], ".rda")
  
  dfs[[i]] <- get_name_year_frequency(session = session, name = namelist$names[i], file.name = file.name)
}

#rename name into names
namelist_df <- namelist_df %>%
  rename(names = name)

#combine data
df_frequency_ethnic_names <- dfs %>%
  rbindlist() %>%
  left_join(namelist_df, by = "names")

Export results

#export results
write_csv(df_frequency_ethnic_names, file = "data_analysis/data/data_processed/meertens_scrape/dutch_names_frequency_18802016.csv")
LS0tDQp0aXRsZTogJ01lZXJ0ZW5zIHNjcmFwZTogbmFtZSBzY3JhcGUnDQphdXRob3I6ICJUaGlqbWVuIEplcm9lbnNlIg0KZGF0ZTogIkxhc3QgY29tcGlsZWQgb24gYHIgZm9ybWF0KFN5cy50aW1lKCksICclZCAlQiwgJVknKWAiDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2RlcHRoOiAzDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9mb2xkaW5nOiBzaG93DQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KLS0tDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChjYWNoZSA9IFRSVUUsIG1lc3NhZ2UgPSBGQUxTRSwgd2FybmluZyA9IEZBTFNFLCByZXN1bHRzID0gImFzaXMiLA0KICAgICAgICAgICAgICAgICAgICAgIGZpZy5hbGlnbiA9ICJjZW50ZXIiKQ0KYGBgDQoNCiMgU2V0IHVwDQoNCiMjIExpYnJhcmllcw0KYGBge3IgbGlicmFyaWVzfQ0KIyBnZXQgcGFja2FnZXMgbmVlZGVkDQpmcGFja2FnZS5jaGVjayA8LSBmdW5jdGlvbihwYWNrYWdlcykgeyAjIChjKSBKb2NoZW0gVG9sc21hDQogIHBhY2thZ2VfbGlzdCAgPC0gbGFwcGx5KHBhY2thZ2VzLCBGVU4gPSBmdW5jdGlvbih4KSB7DQogICAgaWYgKCFyZXF1aXJlKHgsIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkpIHsNCiAgICAgIGluc3RhbGwucGFja2FnZXMoeCwgZGVwZW5kZW5jaWVzID0gVFJVRSkNCiAgICAgIGxpYnJhcnkoeCwgY2hhcmFjdGVyLm9ubHkgPSBUUlVFKQ0KICAgIH0NCiAgfSkNCn0NCnBhY2thZ2VzID0gYygicnZlc3QiLCAiaHR0ciIsInBvbGl0ZSIsICJ0aWR5dmVyc2UiLCANCiAgICAgICAgICAgICAiZGF0YS50YWJsZSIpDQpmcGFja2FnZS5jaGVjayhwYWNrYWdlcykNCmBgYA0KDQojIFNjcmFwZQ0KDQojIyBGdW5jdGlvbg0KDQpTY3JhcGUgaW5mb3JtYXRpb24gZnJvbSBuYW1lIHBhZ2U6IHllYXJseSBiaXJ0aCBmcmVxdWVuY3kgYW5kIHRvdGFsIGZyZXF1ZW5jeS4gDQoNCmBgYHtyfQ0KDQpnZXRfbmFtZV95ZWFyX2ZyZXF1ZW5jeSA8LSBmdW5jdGlvbihzZXNzaW9uLCBuYW1lLCBmaWxlLm5hbWUpIHsjbmFtZSA9ICJMaXNhIg0KICBpZiAoZmlsZS5leGlzdHMoZmlsZS5uYW1lKSkgew0KICAgIGxvYWQoZmlsZS5uYW1lKQ0KICB9IGVsc2Ugew0KICAgIG5hbWVfcGF0aCA9IHBhc3RlKCIvbnZiL25hYW0vaXMvIiwgbmFtZSxzZXA9IiIpICMgc2V0IHRoZSBwYXRoIGZvciB0aGUgc3BlY2lmaWMgbmFtZSdzIHdlYnBhZ2UNCiAgICANCiAgICBuYW1lX3Nlc3Npb24gPC1ub2Qoc2Vzc2lvbiwgcGF0aCA9IG5hbWVfcGF0aCkgIyBhZ3JlZSBjaGFuZ2luZyBvZiB0aGUgcGF0aCB3aXRoIHRoZSBob3N0IChhc3N1bWluZyBJIGhhdmUgYWxyZWFkeSAiYm93ZWQiIGZvciB0aGUgaGlnaGVyLWxldmVsIHBhdGgpDQogICAgDQogICAgbmFtZV9wYWdlIDwtIHNjcmFwZShuYW1lX3Nlc3Npb24pICMgZ2V0IHRoZSBwYWdlIGZvciB0aGlzIHllYXINCiAgICANCiAgICAjIGdldCB0aGUgdGFibGUgZGF0YSBhbmQgdGhlIGh0bWxfdGV4dCB0byBleHRyYWN0IGZyZXF1ZW5jeSBhbmQgZnJlcXVhbmN5IHBlciB5ZWFyIGRhdGENCiAgICBwYWdlIDwtIGh0bWxfdGV4dChuYW1lX3BhZ2UpDQogICAgdGFibGUgPC0gaHRtbF90YWJsZShuYW1lX3BhZ2UpDQogICAgDQogICAgI3NhdmUgcmF3IGRhdGEgaW4gYSBsaXN0DQogICAgcmF3X2RhdGEgPC0gbGlzdChwYWdlLHRhYmxlKQ0KICAgIA0KICAgIHNhdmUocmF3X2RhdGEsIGZpbGUgPSBmaWxlLm5hbWUpDQogIH0NCiAgDQogICNleHRyYWN0IHRoZSBkYXRhIGZyb20gdGhlIEhUTUwgY29kZSB3aXRoIHN0cmluZ3INCiAgcGFnZSA8LSByYXdfZGF0YVtbMV1dDQogIHRhYmxlIDwtIHJhd19kYXRhW1syXV0NCiAgDQogICNleHRyYWN0IHRoZSBkYXRhIGZyb20gdGhlIEhUTUwgY29kZSB3aXRoIHN0cmluZ3INCiAgc3Vic2V0IDwtIHBhZ2UgJT4lIA0KICAgIHN0cl9zcGxpdCgiPCEtLSIpDQogIA0KICBzdWJzZXQgPC0gc3Vic2V0W1sxXV1bMl0NCiAgDQogIHN1YnNldCA8LSBzdWJzZXQgJT4lDQogICAgc3RyX2V4dHJhY3RfYWxsKCJcXCguKlxcKSIpDQogIA0KICAjZXh0cmFjdGluZyB5ZWFyDQogIHllYXJfbGlzdCA8LSBzdWJzZXRbWzFdXVsxXQ0KICANCiAgeWVhcl9saXN0IDwtIHllYXJfbGlzdCAlPiUNCiAgICBzdHJfZXh0cmFjdF9hbGwoIls6ZGlnaXQ6XXs0fSIpDQogIA0KICB5ZWFyX2xpc3QgPC0gYXMubnVtZXJpYyh5ZWFyX2xpc3RbWzFdXSkNCiAgDQogICNleHRyYWN0aW5nIGJpcnRocw0KICB2YWx1ZV9saXN0IDwtIHN1YnNldFtbMV1dWzRdW1sxXV0NCiAgDQogIHZhbHVlX2xpc3QgPC0gdmFsdWVfbGlzdCAgJT4lDQogICAgc3RyX2V4dHJhY3RfYWxsKCJbOmRpZ2l0Ol17MSw1fSIpDQogIA0KICB2YWx1ZV9saXN0IDwtIGFzLm51bWVyaWModmFsdWVfbGlzdFtbMV1dKQ0KICANCiAgI2V4dHJhY3Qgb3ZlcmFsbCBmcmVxdWVuY3kgb2YgbmFtZXMuIA0KICBuX20gPC0gdGFibGVbWzFdXVtbIlgzIl1dWzJdICU+JSANCiAgICBzdHJfcmVwbGFjZSgiLS0iLCIwIikNCiAgDQogIG5fdiA8LSB0YWJsZVtbMV1dW1siWDMiXV1bNl0gJT4lIA0KICAgIHN0cl9yZXBsYWNlKCItLSIsIjAiKQ0KICANCiAgbl90b3RhbCA8LSBhcy5udW1lcmljKG5fbSkgKyBhcy5udW1lcmljKG5fdikNCiAgDQogICNzYXZlIGFsbCBkYXRhIGluIGEgbG9uZyBmb3JtYXRlDQogIGRmIDwtIHRpYmJsZTo6dGliYmxlKG5fdG90YWwsIHllYXJfbGlzdCwgdmFsdWVfbGlzdCwgbmFtZXMgPSBuYW1lKQ0KICANCiAgcmV0dXJuKGRmKQ0KfQ0KYGBgDQoNCg0KIyMgSW5pdGlhbGl6ZSBzY3JhcGUNCg0KYGBge3IgaW5pdGFsaXplLCByZXN1bHRzPSdoaWRlJ30NCiMgY2hlY2sgcGVybWlzc2lvbnMgYW5kIGludHJvZHVjZSBteXNlbGYgdG8gdGhlIGhvc3QNCnNlc3Npb24gPC0gYm93KCJodHRwczovL3d3dy5tZWVydGVucy5rbmF3Lm5sL252Yi8iLCB1c2VyX2FnZW50ID0gICJUaGlqbWVuIEplcm9lbnNlLCBSYWRib3VkIFVuaXZlcnNpdHkgTmlqbWVnZW4iLCBkZWxheSA9IDEpDQpzZXNzaW9uDQoNCiNzZXQgYXJjaGl2ZSBmb3Igc2NyYXBlLiBUaGlzIHdheSB3ZSBjYW4gc3RvcmUgdGhlIGRhdGEgZm9yIGZ1dHVyZSB1c2FnZS4NCmFyY2hpdmUgPC0gImRhdGFfYW5hbHlzaXMvZGF0YS9kYXRhX3Jhdy9tZWVydGVuc19uYW1lX1YyLy8iDQoNCiNpbXBvcnQgdGhlIGRhdGEuIA0KbmFtZWxpc3RfZGYgPC0gcmVhZF9kZWxpbShmaWxlID0gImRhdGFfYW5hbHlzaXMvZGF0YS9kYXRhX3Byb2Nlc3NlZC9tZWVydGVuc19zY3JhcGUvbmFtZV9udW1iZXJzLmNzdiIsDQogICAgICAgICAgICAgICAgICAgICAgICAgIGRlbGltID0gIiwiKQ0KbmFtZWxpc3QgPC0gbmFtZWxpc3RfZGYgJT4lDQogIHJlbmFtZShuYW1lcyA9IG5hbWUpICU+JQ0KICBzZWxlY3QobmFtZXMpICU+JSANCiAgZGlzdGluY3QoKQ0KDQojZW1wdHkgZGZzDQpkZnMgPC0gbGlzdCgpDQoNCiNjcmVhdGUgcHJvZ3Jlc3NiYXINCnBiIDwtIHR4dFByb2dyZXNzQmFyKG1pbiA9IDEsIG1heCA9IGxlbmd0aChuYW1lbGlzdCRuYW1lcyksDQogICAgICAgICAgICAgICAgICAgICBpbml0aWFsID0gMSwgY2hhciA9ICItIiwNCiAgICAgICAgICAgICAgICAgICAgIHdpZHRoID0gNzAsIHN0eWxlID0gMykNCmBgYA0KDQojIyBTY3JhcGUgdGhlIHBhZ2UNCg0KYGBge3IgbWFpbiBzY3JhcGUgbG9vcCwgcmVzdWx0cz0naGlkZSd9DQojc3RhcnQgbWFpbiBsb29wLiANCmZvcihpIGluIDE6bGVuZ3RoKG5hbWVsaXN0JG5hbWVzKSkgew0KICAjaSA9IDEwDQogIHNldFR4dFByb2dyZXNzQmFyKHBiLCBpKQ0KICANCiAgZmlsZS5uYW1lICA8LSBwYXN0ZTAoYXJjaGl2ZSwgbmFtZWxpc3QkbmFtZXNbaV0sICIucmRhIikNCiAgDQogIGRmc1tbaV1dIDwtIGdldF9uYW1lX3llYXJfZnJlcXVlbmN5KHNlc3Npb24gPSBzZXNzaW9uLCBuYW1lID0gbmFtZWxpc3QkbmFtZXNbaV0sIGZpbGUubmFtZSA9IGZpbGUubmFtZSkNCn0NCg0KI3JlbmFtZSBuYW1lIGludG8gbmFtZXMNCm5hbWVsaXN0X2RmIDwtIG5hbWVsaXN0X2RmICU+JQ0KICByZW5hbWUobmFtZXMgPSBuYW1lKQ0KDQojY29tYmluZSBkYXRhDQpkZl9mcmVxdWVuY3lfZXRobmljX25hbWVzIDwtIGRmcyAlPiUNCiAgcmJpbmRsaXN0KCkgJT4lDQogIGxlZnRfam9pbihuYW1lbGlzdF9kZiwgYnkgPSAibmFtZXMiKQ0KYGBgDQoNCiMgRXhwb3J0IHJlc3VsdHMNCg0KYGBge3IgZXhwb3J0IHJlc3VsdHN9DQojZXhwb3J0IHJlc3VsdHMNCndyaXRlX2NzdihkZl9mcmVxdWVuY3lfZXRobmljX25hbWVzLCBmaWxlID0gImRhdGFfYW5hbHlzaXMvZGF0YS9kYXRhX3Byb2Nlc3NlZC9tZWVydGVuc19zY3JhcGUvZHV0Y2hfbmFtZXNfZnJlcXVlbmN5XzE4ODAyMDE2LmNzdiIpDQoNCmBgYA0KDQoNCg==


Copyright © 2024 Jeroense Thijmen