Set up
Libraries
# Attach every package named in `packages`, installing any that cannot be
# attached yet.
#
# packages: character vector of package names.
# Returns (invisibly) a list with one element per package: NULL when the
# package could be attached directly, otherwise the value of the library()
# call made after installation.
fpackage.check <- function(packages) { # (c) Jochem Tolsma
  # Try to attach a single package; install it with its dependencies and
  # attach it afterwards when the first attempt fails.
  attach_or_install <- function(pkg) {
    attached <- require(pkg, character.only = TRUE)
    if (!attached) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }

  package_list <- lapply(packages, FUN = attach_or_install)
}
# Names of the packages this script attaches (and installs when missing).
packages <- c(
  "rvest", "httr", "polite", "tidyverse",
  "data.table"
)
fpackage.check(packages)
Scrape
Function
Get the popularity list for each year and store the name data in a
long file.
# Get the boy and girl name popularity tables for one year.
#
# When the cache file `file.name` exists, the parsed tables are read from it;
# otherwise the year's page is scraped through the polite `session`, parsed
# with html_table() and the parsed tables are cached in `file.name`.
#
# session:   polite session created with bow(); only used when there is no
#            cache file yet.
# year:      the year to fetch (a single value).
# file.name: path of the .rda cache file that holds the `all_names` list of
#            parsed tables for this year.
#
# Returns a data frame with the columns rank, name, count, is_girl_name
# (1 for girl names, 0 for boy names) and year; girl names come first.
# Stops with an error when the cache file holds no `all_names` object, when
# fewer than three tables are available, or when a name table does not have
# exactly three columns.
get_year_names <- function(session, year, file.name) {
  # Rename the columns of one parsed table and add the gender dummy.
  label_table <- function(tbl, is_girl) {
    tbl <- as.data.frame(tbl)
    if (ncol(tbl) != 3L) {
      stop(
        "Expected a name table with 3 columns for year ", year,
        ", got ", ncol(tbl), ".",
        call. = FALSE
      )
    }
    names(tbl) <- c("rank", "name", "count")
    tbl$is_girl_name <- rep(is_girl, length.out = nrow(tbl))
    tbl
  }

  from_cache <- file.exists(file.name)
  if (from_cache) {
    # Load into a separate environment so that objects stored in the cache
    # file can never overwrite this function's arguments or locals.
    cache <- new.env(parent = emptyenv())
    load(file.name, envir = cache)
    if (!exists("all_names", envir = cache, inherits = FALSE)) {
      stop(
        "Cache file '", file.name, "' does not contain `all_names`.",
        call. = FALSE
      )
    }
    all_names <- get("all_names", envir = cache, inherits = FALSE)
  } else {
    # Path of the specific year's webpage.
    yr_path <- paste0("nvb/topnamen/land/Nederland/", as.character(year))
    # Agree the change of path with the host (the session has already been
    # introduced for the higher-level path with bow()).
    year_session <- nod(session, path = yr_path)
    year_page <- scrape(year_session)
    # Parse every table on the page; the result is a list of tables.
    all_names <- html_table(year_page)
  }

  # The boy names are taken from the second table and the girl names from
  # the third, so at least three tables are needed.
  if (!is.list(all_names) || length(all_names) < 3L) {
    stop(
      "Expected at least 3 tables for year ", year, ", got ",
      if (is.list(all_names)) length(all_names) else 0L, ".",
      call. = FALSE
    )
  }

  if (!from_cache) {
    # Cache the parsed tables (not the raw HTML). The directory is created
    # first so that a missing folder does not waste the request just made.
    dir.create(dirname(file.name), recursive = TRUE, showWarnings = FALSE)
    save(all_names, file = file.name)
  }

  girl_names <- label_table(all_names[[3]], 1)
  boy_names <- label_table(all_names[[2]], 0)

  combined <- rbind(girl_names, boy_names)
  combined$year <- rep(year, length.out = nrow(combined))
  combined
}
Initialize scrape
Scrape all the popularity lists for the years 1920 through 2014.
# Check permissions and introduce this scraper to the host.
session <- bow(
  "https://www.meertens.knaw.nl/nvb/",
  user_agent = "Thijmen Jeroense, Radboud Universiteit Nijmegen",
  delay = 1
)
session

# Empty, typed data frame that the yearly results are appended to.
all_names <- data.frame(
  rank = integer(0),
  name = character(0),
  count = integer(0),
  is_girl_name = integer(0),
  year = integer(0)
)

# First and last year of the range to scrape.
startyear <- 1920
endyear <- 2014
Scrape
Scrape all the pages and store the results in one data frame.
# Set up the archive: the parsed tables of every year are stored under this
# path prefix for future use.
archive <- "data_analysis/data/data_raw/meertens_pop_by_year/year"

# Collect every year's result in a preallocated list and bind once at the
# end; calling rbind() on `all_names` in each iteration copies the growing
# data frame every time.
years <- endyear:startyear
yearly_names <- vector("list", length(years))

for (k in seq_along(years)) { # loop over all years, most recent first
  i <- years[k]
  print(paste("scraping year", i))

  file.name <- paste0(archive, "_", i, ".rda")

  names_year <- get_year_names(session, i, file.name)
  yearly_names[[k]] <- names_year
}

# The list is left unnamed on purpose: named arguments to rbind() would be
# used as row-name prefixes.
all_names <- do.call(rbind, c(list(all_names), yearly_names))
Export results
# Write the combined table to a CSV file whose name ends in the first and
# last year of the range (concatenated directly), without row names.
write.csv(
  all_names,
  file = paste0(
    "data_analysis/data/data_processed/meertens_scrape/all_names_",
    as.character(startyear),
    as.character(endyear),
    ".csv"
  ),
  row.names = FALSE
)
LS0tDQp0aXRsZTogIk1lZXJ0ZW5zIHNjcmFwZTogbmFtZSBwb3B1bGFyaXR5IGxpc3RzIg0KYXV0aG9yOiAiVGhpam1lbiBKZXJvZW5zZSINCmRhdGU6ICJMYXN0IGNvbXBpbGVkIG9uIGByIGZvcm1hdChTeXMudGltZSgpLCAnJWQgJUIsICVZJylgIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19kZXB0aDogMw0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIGNvZGVfZm9sZGluZzogc2hvdw0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCi0tLQ0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoY2FjaGUgPSBUUlVFLCBtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSwgcmVzdWx0cyA9ICJhc2lzIiwNCiAgICAgICAgICAgICAgICAgICAgICBmaWcuYWxpZ24gPSAiY2VudGVyIikNCmBgYA0KDQojIFNldCB1cA0KDQojIyBMaWJyYXJpZXMNCmBgYHtyIGxpYnJhcmllc30NCiMgZ2V0IHBhY2thZ2VzIG5lZWRlZA0KZnBhY2thZ2UuY2hlY2sgPC0gZnVuY3Rpb24ocGFja2FnZXMpIHsgIyAoYykgSm9jaGVtIFRvbHNtYQ0KICBwYWNrYWdlX2xpc3QgIDwtIGxhcHBseShwYWNrYWdlcywgRlVOID0gZnVuY3Rpb24oeCkgew0KICAgIGlmICghcmVxdWlyZSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpKSB7DQogICAgICBpbnN0YWxsLnBhY2thZ2VzKHgsIGRlcGVuZGVuY2llcyA9IFRSVUUpDQogICAgICBsaWJyYXJ5KHgsIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkNCiAgICB9DQogIH0pDQp9DQpwYWNrYWdlcyA9IGMoInJ2ZXN0IiwgImh0dHIiLCJwb2xpdGUiLCAidGlkeXZlcnNlIiwgDQogICAgICAgICAgICAgImRhdGEudGFibGUiKQ0KZnBhY2thZ2UuY2hlY2socGFja2FnZXMpDQpgYGANCg0KIyBTY3JhcGUNCg0KIyMgRnVuY3Rpb24NCg0KR2V0IHRoZSBwb3B1bGFyaXR5IGxpc3QgZm9yIGVhY2ggeWVhciBhbmQgc3RvcmUgdGhlIG5hbWUgZGF0YSBpbiBhIGxvbmcgZmlsZS4NCg0KYGBge3IgZnVuY3Rpb259DQpnZXRfeWVhcl9uYW1lcyA8LSBmdW5jdGlvbihzZXNzaW9uLCB5ZWFyLCBmaWxlLm5hbWUpew0KICAjeWVhciA9IDIwMTQNCiAgaWYgKGZpbGUuZXhpc3RzKGZpbGUubmFtZSkpIHsNCiAgbG9hZChmaWxlLm5hbWUpDQogIH0gZWxzZSB7DQogIHlyX3BhdGggPSBwYXN0ZSgibnZiL3RvcG5hbWVuL2xhbmQvTmVkZXJsYW5kLyIsIGFzLmNoYXJhY3Rlcih5ZWFyKSxzZXA9IiIpICMgc2V0IHRoZSBwYXRoIGZvciB0aGUgc3BlY2lmaWMgeWVhcidzIHdlYnBhZ2UNCiAgDQogIHllYXJfc2Vzc2lvbiA8LW5vZChzZXNzaW9uLCBwYXRoID0geXJfcGF0aCkgIyBhZ3JlZSBjaGFuZ2luZyBvZiB0aGUgcGF0aCB3aXRoIHRoZSBob3N0IChhc3N1bWluZyBJIGhhdmUgYWxyZWFkeSAiYm93ZWQiIGZvciB0aGUgaGlnaGVyLWxldmVsIHBhdGgpDQogIA0KICB5ZWFyX3BhZ2UgPC0gc2NyYXBlKHllYXJfc2Vzc2lvbikgIyBnZXQgdGhlIHBhZ2UgZm9yIHRo
aXMgeWVhcg0KICANCiAgYWxsX25hbWVzIDwtIHllYXJfcGFnZSAlPiUgICMgcGFyc2UgdGhlIHBhZ2UgYXMgYSB0YWJsZS4gVHVybnMgb3V0IHRoaXMgaXMgYSBsaXN0IG9mIHRocmVlIHRhYmxlczsgd2UgbmVlZCBudW1iZXJzIDIgYW5kIDMNCiAgICBodG1sX3RhYmxlKCkNCiAgDQogIHNhdmUoYWxsX25hbWVzLCBmaWxlID0gZmlsZS5uYW1lKSMgc2F2ZSB0aGUgcmF3IGh0bWwgY29kZQ0KICB9DQogIA0KICBib3lfbmFtZXMgPC0gYWxsX25hbWVzW1syXV0gIyBzZWNvbmQgdGFibGUgZnJvbSB0aGUgbGlzdCBvZiB0aHJlZQ0KICBjb2xuYW1lcyhib3lfbmFtZXMpIDwtIGMoInJhbmsiLCAibmFtZSIsICJjb3VudCIpICNzZWVtcyBsaWtlIHRoaXMgc2hvdWxkIGJlIGVhc2llci4uLg0KICBib3lfbmFtZXMgPC0gYm95X25hbWVzICU+JSANCiAgICBhcy5kYXRhLmZyYW1lKCkgJT4lIA0KICAgIG11dGF0ZShpc19naXJsX25hbWUgPSAwKSAjIGFkZCBhIGdlbmRlciBkdW1teQ0KICANCiAgZ2lybF9uYW1lcyA8LSBhbGxfbmFtZXNbWzNdXSAjIHRoaXJkIHRhYmxlIGZyb20gdGhlIGxpc3Qgb2YgdGhyZWUNCiAgY29sbmFtZXMoZ2lybF9uYW1lcykgPC0gYygicmFuayIsICJuYW1lIiwgImNvdW50IikgI3NlZW1zIGxpa2UgdGhpcyBzaG91bGQgYmUgZWFzaWVyLi4uDQogIGdpcmxfbmFtZXMgPC0gZ2lybF9uYW1lcyAlPiUgDQogICAgYXMuZGF0YS5mcmFtZSgpICU+JSANCiAgICBtdXRhdGUoaXNfZ2lybF9uYW1lID0gMSkgICMgYWRkIGEgZ2VuZGVyIGR1bW15DQogIA0KICBhbGxfbmFtZXMgPC0gcmJpbmQoZ2lybF9uYW1lcywgYm95X25hbWVzKSAlPiUgIyBjb21iaW5lIHRoZSB0d28gYXMgYSBuZXcgZGF0YSBmcmFtZQ0KICAgIG11dGF0ZSh5ZWFyID0geWVhcikgIA0KICANCiAgcmV0dXJuKGFsbF9uYW1lcykNCn0NCmBgYA0KDQoNCiMjIEluaXRpYWxpemUgc2NyYXBlDQoNClNjcmFwZSBhbGwgdGhlIHBvcHVsYXJpdHkgbGlzdHMgZm9yIHllYXIgMTkyMCB1bnRpbGwgMjAxNC4NCg0KYGBge3IgaW5pdGFsaXplIHNjcmFwZSwgcmVzdWx0cz0naGlkZSd9DQojIGNoZWNrIHBlcm1pc3Npb25zIGFuZCBpbnRyb2R1Y2UgbXlzZWxmIHRvIHRoZSBob3N0DQpzZXNzaW9uIDwtIGJvdygiaHR0cHM6Ly93d3cubWVlcnRlbnMua25hdy5ubC9udmIvIiwgdXNlcl9hZ2VudCA9ICAiVGhpam1lbiBKZXJvZW5zZSwgUmFkYm91ZCBVbml2ZXJzaXRlaXQgTmlqbWVnZW4iLCBkZWxheSA9IDEpDQpzZXNzaW9uDQoNCmFsbF9uYW1lcyA8LSBkYXRhLmZyYW1lKCAjIGluaXRpYWxpemUgdGhlIGVtcHR5IGRhdGEgZnJhbWUgZm9yIHRoZSByZXN1bHRzDQogIHJhbmsgPSBpbnRlZ2VyKCksDQogIG5hbWUgPSBjaGFyYWN0ZXIoKSwNCiAgY291bnQgPSBpbnRlZ2VyKCksDQogIGlzX2dpcmxfbmFtZSA9IGludGVnZXIoKSwNCiAgeWVhciA9IGludGVnZXIoKQ0KKQ0KDQpzdGFydHllYXIgPSAxOTIwDQplbmR5ZWFyID0gMjAxNA0KYGBgDQoNCiMjIFNjcmFwZQ0KDQpT
Y3JhcGUgYWxsIHRoZSBwYWdlcyBhbmQgc3RvcmUgaW4gdGhlIGRmcy4NCg0KYGBge3Igc2NyYXBlLCByZXN1bHRzPSdoaWRlJ30NCg0KI3NldCB1cCBhcmNoaXZlDQojaW4gdGhpcyBJIHdpbGwgc3RvcmUgdGhlIHJhdyBodG1sIGNvZGUgZm9yIGZ1dHVyIHVzZS4gDQphcmNoaXZlIDwtICJkYXRhX2FuYWx5c2lzL2RhdGEvZGF0YV9yYXcvbWVlcnRlbnNfcG9wX2J5X3llYXIveWVhciINCg0KZm9yKGkgaW4gZW5keWVhcjpzdGFydHllYXIpeyAjIGxvb3Agb3ZlciBhbGwgeWVhcnMNCiAgcHJpbnQocGFzdGUoInNjcmFwaW5nIHllYXIiLGkpKQ0KICANCiAgZmlsZS5uYW1lIDwtIHBhc3RlMChhcmNoaXZlLCJfIiwgaSwgIi5yZGEiKQ0KICANCiAgbmFtZXNfeWVhciA8LSBnZXRfeWVhcl9uYW1lcyhzZXNzaW9uLCBpLCBmaWxlLm5hbWUpDQogIGFsbF9uYW1lcyA8LSByYmluZChhbGxfbmFtZXMsIG5hbWVzX3llYXIpDQp9DQoNCmBgYA0KDQojIEV4cG9ydCByZXN1bHRzDQoNCmBgYHtyIGV4cG9ydCByZXN1bHRzfQ0Kd3JpdGUuY3N2KGFsbF9uYW1lcywgDQogICAgICAgICAgZmlsZSA9IHBhc3RlKCJkYXRhX2FuYWx5c2lzL2RhdGEvZGF0YV9wcm9jZXNzZWQvbWVlcnRlbnNfc2NyYXBlL2FsbF9uYW1lc18iLGFzLmNoYXJhY3RlcihzdGFydHllYXIpLGFzLmNoYXJhY3RlcihlbmR5ZWFyKSwiLmNzdiIsIHNlcD0iIiksDQogICAgICAgICAgcm93Lm5hbWVzPUZBTFNFKQ0KDQpgYGANCg0KDQoNCg0K
Copyright © 2024 Jeroense Thijmen