Set up

Libraries

# get packages needed
# Attach every package named in `packages`, installing any that cannot be
# attached yet. (c) Jochem Tolsma
#
# Arguments:
#   packages  character vector of package names.
#
# Returns (invisibly) a list with one element per package: NULL when the
# package could be attached straight away, otherwise the value of the
# library() call made after installing it.
fpackage.check <- function(packages) {
  attach_or_install <- function(pkg) {
    # require() returns FALSE instead of erroring when the package is missing,
    # which is what lets us fall through to the install step.
    if (require(pkg, character.only = TRUE)) {
      return(invisible(NULL))
    }
    install.packages(pkg, dependencies = TRUE)
    library(pkg, character.only = TRUE)
  }

  invisible(lapply(packages, FUN = attach_or_install))
}
# Packages required by this script; fpackage.check() installs any that are
# missing before attaching them.
packages <- c(
  "rvest",
  "httr",
  "polite",
  "tidyverse",
  "data.table"
)
fpackage.check(packages)

Scrape

Function

Get the popularity list for each year and store the name data in a long file.

# Return one year's popularity list as a long data frame.
#
# The parsed tables for a year are cached in `file.name`, an .rda file that
# holds a single object called `all_names` (the list of tables parsed from the
# page). When that file exists it is read instead of contacting the host;
# otherwise the page is fetched, parsed, validated and then cached.
#
# Arguments:
#   session    polite session for the site root (as returned by bow()); only
#              used when there is no cached file for this year.
#   year       a single year; appended to the URL path and stored in the
#              `year` column of the result.
#   file.name  path of the .rda cache file for this year.
#
# Returns a data.frame with the columns rank, name, count, is_girl_name and
# year. Rows from the third table (is_girl_name = 1) come first, followed by
# the rows from the second table (is_girl_name = 0).
#
# Stops with an error when the cache file or the page does not contain at
# least three tables, or when a name table does not have exactly three columns.
get_year_names <- function(session, year, file.name) {
  stopifnot(length(year) == 1L)

  # Fail early, with a readable message, when the page layout is not the
  # expected "list of three tables".
  check_tables <- function(tables, origin) {
    if (!is.list(tables) || length(tables) < 3L) {
      stop("expected at least three tables for year ", year, " in ", origin,
           call. = FALSE)
    }
    tables
  }

  # Give one name table the standard column names and add the gender dummy.
  label_table <- function(tbl, is_girl) {
    if (!is.data.frame(tbl) || ncol(tbl) != 3L) {
      stop("expected a three-column name table for year ", year,
           call. = FALSE)
    }
    colnames(tbl) <- c("rank", "name", "count")
    tbl <- as.data.frame(tbl)
    tbl$is_girl_name <- rep(is_girl, nrow(tbl))
    tbl
  }

  if (file.exists(file.name)) {
    # Load into a scratch environment: loading straight into this function's
    # frame would let any object stored in the file overwrite `year`,
    # `session` or `file.name`.
    cache_env <- new.env(parent = emptyenv())
    load(file.name, envir = cache_env)
    if (!exists("all_names", envir = cache_env, inherits = FALSE)) {
      stop("cache file '", file.name, "' does not contain 'all_names'",
           call. = FALSE)
    }
    tables <- check_tables(get("all_names", envir = cache_env), file.name)
  } else {
    # Path of this year's page, relative to the host the session was set up for.
    yr_path <- paste0("nvb/topnamen/land/Nederland/", as.character(year))

    # Agree the change of path with the host (assumes the caller has already
    # "bowed" for the higher-level path).
    year_session <- nod(session, path = yr_path)
    year_page <- scrape(year_session)

    # The page parses to a list of tables; numbers 2 and 3 hold the names.
    # Validate before caching so a malformed page is not stored permanently.
    tables <- check_tables(html_table(year_page), yr_path)

    # save() does not create missing directories, so make sure the target
    # directory exists before writing the cache.
    dir.create(dirname(file.name), recursive = TRUE, showWarnings = FALSE)
    all_names <- tables # keep the object name that cached files are read by
    save(all_names, file = file.name) # caches the parsed tables, not raw html
  }

  boy_names <- label_table(tables[[2]], 0)  # second table from the list
  girl_names <- label_table(tables[[3]], 1) # third table from the list

  combined <- rbind(girl_names, boy_names)
  combined$year <- rep(year, nrow(combined))
  combined
}

Initialize scrape

Scrape all the popularity lists for the years 1920 through 2014.

# First and last year of the range to scrape.
startyear <- 1920
endyear <- 2014

# Empty results table with the column layout that the per-year data frames
# are appended to.
all_names <- data.frame(
  rank = integer(0),
  name = character(0),
  count = integer(0),
  is_girl_name = integer(0),
  year = integer(0)
)

# Check permissions and introduce myself to the host, then print the session
# so the outcome is visible.
session <- bow(
  "https://www.meertens.knaw.nl/nvb/",
  user_agent = "Thijmen Jeroense, Radboud Universiteit Nijmegen",
  delay = 1
)
session

Scrape

Scrape all the pages and store the results in a single data frame.

# Set up the archive: directory plus file-name stem under which the parsed
# tables of every year are stored for future use (one .rda file per year).
archive <- "data_analysis/data/data_raw/meertens_pop_by_year/year"

# save() does not create missing directories, so make sure the archive
# directory exists before the first year is written.
dir.create(dirname(archive), recursive = TRUE, showWarnings = FALSE)

# Collect the per-year data frames in a preallocated list and bind them once
# at the end, instead of growing `all_names` with rbind() on every iteration.
years <- endyear:startyear # most recent year first
yearly_names <- vector("list", length(years))

for (k in seq_along(years)) { # loop over all years
  i <- years[k]
  print(paste("scraping year", i))

  file.name <- paste0(archive, "_", i, ".rda")

  names_year <- get_year_names(session, i, file.name)
  yearly_names[[k]] <- names_year
}

# Keep whatever `all_names` already holds as the first block of rows.
all_names <- do.call(rbind, c(list(all_names), yearly_names))

Export results

# Write the combined table to a CSV file without row names.
# NOTE(review): start and end year are pasted together without a separator
# (e.g. "all_names_19202014.csv"); the name is kept as is because other
# scripts may read the file under this name.
out_file <- paste0(
  "data_analysis/data/data_processed/meertens_scrape/all_names_",
  as.character(startyear), as.character(endyear), ".csv"
)

# write.csv() cannot create missing directories; make sure the target exists
# so the export does not fail after the whole scrape has finished.
dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)

write.csv(all_names, file = out_file, row.names = FALSE)
LS0tDQp0aXRsZTogIk1lZXJ0ZW5zIHNjcmFwZTogbmFtZSBwb3B1bGFyaXR5IGxpc3RzIg0KYXV0aG9yOiAiVGhpam1lbiBKZXJvZW5zZSINCmRhdGU6ICJMYXN0IGNvbXBpbGVkIG9uIGByIGZvcm1hdChTeXMudGltZSgpLCAnJWQgJUIsICVZJylgIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIGNvZGVfZm9sZGluZzogc2hvdw0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCi0tLQ0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoY2FjaGUgPSBUUlVFLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSwgcmVzdWx0cyA9ICJhc2lzIiwNCiAgICAgICAgICAgICAgICAgICAgICBmaWcuYWxpZ24gPSAiY2VudGVyIikNCmBgYA0KDQojIFNldCB1cA0KDQojIyBMaWJyYXJpZXMNCmBgYHtyIGxpYnJhcmllc30NCiMgZ2V0IHBhY2thZ2VzIG5lZWRlZA0KZnBhY2thZ2UuY2hlY2sgPC0gZnVuY3Rpb24ocGFja2FnZXMpIHsgIyAoYykgSm9jaGVtIFRvbHNtYQ0KICBwYWNrYWdlX2xpc3QgIDwtIGxhcHBseShwYWNrYWdlcywgRlVOID0gZnVuY3Rpb24oeCkgew0KICAgIGlmICghcmVxdWlyZSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpKSB7DQogICAgICBpbnN0YWxsLnBhY2thZ2VzKHgsIGRlcGVuZGVuY2llcyA9IFRSVUUpDQogICAgICBsaWJyYXJ5KHgsIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkNCiAgICB9DQogIH0pDQp9DQpwYWNrYWdlcyA9IGMoInJ2ZXN0IiwgImh0dHIiLCJwb2xpdGUiLCAidGlkeXZlcnNlIiwgDQogICAgICAgICAgICAgImRhdGEudGFibGUiKQ0KZnBhY2thZ2UuY2hlY2socGFja2FnZXMpDQpgYGANCg0KIyBTY3JhcGUNCg0KIyMgRnVuY3Rpb24NCg0KR2V0IHRoZSBwb3B1bGFyaXR5IGxpc3QgZm9yIGVhY2ggeWVhciBhbmQgc3RvcmUgdGhlIG5hbWUgZGF0YSBpbiBhIGxvbmcgZmlsZS4NCg0KYGBge3IgZnVuY3Rpb259DQpnZXRfeWVhcl9uYW1lcyA8LSBmdW5jdGlvbihzZXNzaW9uLCB5ZWFyLCBmaWxlLm5hbWUpew0KICAjeWVhciA9IDIwMTQNCiAgaWYgKGZpbGUuZXhpc3RzKGZpbGUubmFtZSkpIHsNCiAgbG9hZChmaWxlLm5hbWUpDQogIH0gZWxzZSB7DQogIHlyX3BhdGggPSBwYXN0ZSgibnZiL3RvcG5hbWVuL2xhbmQvTmVkZXJsYW5kLyIsIGFzLmNoYXJhY3Rlcih5ZWFyKSxzZXA9IiIpICMgc2V0IHRoZSBwYXRoIGZvciB0aGUgc3BlY2lmaWMgeWVhcidzIHdlYnBhZ2UNCiAgDQogIHllYXJfc2Vzc2lvbiA8LW5vZChzZXNzaW9uLCBwYXRoID0geXJfcGF0aCkgIyBhZ3JlZSBjaGFuZ2luZyBvZiB0aGUgcGF0aCB3aXRoIHRoZSBob3N0IChhc3N1bWluZyBJIGhhdmUgYWxyZWFkeSAiYm93ZWQiIGZvciB0aGUgaGlnaGVyLWxldmVsIHBhdGgpDQogIA0KICB5ZWFyX3BhZ2UgPC0gc2NyYXBlKHllYXJfc2Vzc2lvbikgIyBnZXQgdGhlIHBhZ2UgZm9yIHRo
aXMgeWVhcg0KICANCiAgYWxsX25hbWVzIDwtIHllYXJfcGFnZSAlPiUgICMgcGFyc2UgdGhlIHBhZ2UgYXMgYSB0YWJsZS4gVHVybnMgb3V0IHRoaXMgaXMgYSBsaXN0IG9mIHRocmVlIHRhYmxlczsgd2UgbmVlZCBudW1iZXJzIDIgYW5kIDMNCiAgICBodG1sX3RhYmxlKCkNCiAgDQogIHNhdmUoYWxsX25hbWVzLCBmaWxlID0gZmlsZS5uYW1lKSMgc2F2ZSB0aGUgcmF3IGh0bWwgY29kZQ0KICB9DQogIA0KICBib3lfbmFtZXMgPC0gYWxsX25hbWVzW1syXV0gIyBzZWNvbmQgdGFibGUgZnJvbSB0aGUgbGlzdCBvZiB0aHJlZQ0KICBjb2xuYW1lcyhib3lfbmFtZXMpIDwtIGMoInJhbmsiLCAibmFtZSIsICJjb3VudCIpICNzZWVtcyBsaWtlIHRoaXMgc2hvdWxkIGJlIGVhc2llci4uLg0KICBib3lfbmFtZXMgPC0gYm95X25hbWVzICU+JSANCiAgICBhcy5kYXRhLmZyYW1lKCkgJT4lIA0KICAgIG11dGF0ZShpc19naXJsX25hbWUgPSAwKSAjIGFkZCBhIGdlbmRlciBkdW1teQ0KICANCiAgZ2lybF9uYW1lcyA8LSBhbGxfbmFtZXNbWzNdXSAjIHRoaXJkIHRhYmxlIGZyb20gdGhlIGxpc3Qgb2YgdGhyZWUNCiAgY29sbmFtZXMoZ2lybF9uYW1lcykgPC0gYygicmFuayIsICJuYW1lIiwgImNvdW50IikgI3NlZW1zIGxpa2UgdGhpcyBzaG91bGQgYmUgZWFzaWVyLi4uDQogIGdpcmxfbmFtZXMgPC0gZ2lybF9uYW1lcyAlPiUgDQogICAgYXMuZGF0YS5mcmFtZSgpICU+JSANCiAgICBtdXRhdGUoaXNfZ2lybF9uYW1lID0gMSkgICMgYWRkIGEgZ2VuZGVyIGR1bW15DQogIA0KICBhbGxfbmFtZXMgPC0gcmJpbmQoZ2lybF9uYW1lcywgYm95X25hbWVzKSAlPiUgIyBjb21iaW5lIHRoZSB0d28gYXMgYSBuZXcgZGF0YSBmcmFtZQ0KICAgIG11dGF0ZSh5ZWFyID0geWVhcikgIA0KICANCiAgcmV0dXJuKGFsbF9uYW1lcykNCn0NCmBgYA0KDQoNCiMjIEluaXRpYWxpemUgc2NyYXBlDQoNClNjcmFwZSBhbGwgdGhlIHBvcHVsYXJpdHkgbGlzdHMgZm9yIHllYXIgMTkyMCB1bnRpbGwgMjAxNC4NCg0KYGBge3IgaW5pdGFsaXplIHNjcmFwZSwgcmVzdWx0cz0naGlkZSd9DQojIGNoZWNrIHBlcm1pc3Npb25zIGFuZCBpbnRyb2R1Y2UgbXlzZWxmIHRvIHRoZSBob3N0DQpzZXNzaW9uIDwtIGJvdygiaHR0cHM6Ly93d3cubWVlcnRlbnMua25hdy5ubC9udmIvIiwgdXNlcl9hZ2VudCA9ICAiVGhpam1lbiBKZXJvZW5zZSwgUmFkYm91ZCBVbml2ZXJzaXRlaXQgTmlqbWVnZW4iLCBkZWxheSA9IDEpDQpzZXNzaW9uDQoNCmFsbF9uYW1lcyA8LSBkYXRhLmZyYW1lKCAjIGluaXRpYWxpemUgdGhlIGVtcHR5IGRhdGEgZnJhbWUgZm9yIHRoZSByZXN1bHRzDQogIHJhbmsgPSBpbnRlZ2VyKCksDQogIG5hbWUgPSBjaGFyYWN0ZXIoKSwNCiAgY291bnQgPSBpbnRlZ2VyKCksDQogIGlzX2dpcmxfbmFtZSA9IGludGVnZXIoKSwNCiAgeWVhciA9IGludGVnZXIoKQ0KKQ0KDQpzdGFydHllYXIgPSAxOTIwDQplbmR5ZWFyID0gMjAxNA0KYGBgDQoNCiMjIFNjcmFwZQ0KDQpT
Y3JhcGUgYWxsIHRoZSBwYWdlcyBhbmQgc3RvcmUgaW4gdGhlIGRmcy4NCg0KYGBge3Igc2NyYXBlLCByZXN1bHRzPSdoaWRlJ30NCg0KI3NldCB1cCBhcmNoaXZlDQojaW4gdGhpcyBJIHdpbGwgc3RvcmUgdGhlIHJhdyBodG1sIGNvZGUgZm9yIGZ1dHVyIHVzZS4gDQphcmNoaXZlIDwtICJkYXRhX2FuYWx5c2lzL2RhdGEvZGF0YV9yYXcvbWVlcnRlbnNfcG9wX2J5X3llYXIveWVhciINCg0KZm9yKGkgaW4gZW5keWVhcjpzdGFydHllYXIpeyAjIGxvb3Agb3ZlciBhbGwgeWVhcnMNCiAgcHJpbnQocGFzdGUoInNjcmFwaW5nIHllYXIiLGkpKQ0KICANCiAgZmlsZS5uYW1lIDwtIHBhc3RlMChhcmNoaXZlLCJfIiwgaSwgIi5yZGEiKQ0KICANCiAgbmFtZXNfeWVhciA8LSBnZXRfeWVhcl9uYW1lcyhzZXNzaW9uLCBpLCBmaWxlLm5hbWUpDQogIGFsbF9uYW1lcyA8LSByYmluZChhbGxfbmFtZXMsIG5hbWVzX3llYXIpDQp9DQoNCmBgYA0KDQojIEV4cG9ydCByZXN1bHRzDQoNCmBgYHtyIGV4cG9ydCByZXN1bHRzfQ0Kd3JpdGUuY3N2KGFsbF9uYW1lcywgDQogICAgICAgICAgZmlsZSA9IHBhc3RlKCJkYXRhX2FuYWx5c2lzL2RhdGEvZGF0YV9wcm9jZXNzZWQvbWVlcnRlbnNfc2NyYXBlL2FsbF9uYW1lc18iLGFzLmNoYXJhY3RlcihzdGFydHllYXIpLGFzLmNoYXJhY3RlcihlbmR5ZWFyKSwiLmNzdiIsIHNlcD0iIiksDQogICAgICAgICAgcm93Lm5hbWVzPUZBTFNFKQ0KDQpgYGANCg0KDQoNCg0K


Copyright © 2024 Jeroense Thijmen