# get packages needed
# Attach every package named in `packages`; a package that cannot be
# attached is installed (with its dependencies) and then attached.
# (c) Jochem Tolsma
#
# packages: character vector of package names.
# Returns (invisibly) a list with one entry per package: NULL when the
# package was already available, otherwise the value of `library()`.
fpackage.check <- function(packages) {
  attach_or_install <- function(pkg) {
    if (require(pkg, character.only = TRUE)) {
      # Already installed and now attached: nothing more to do.
      return(invisible(NULL))
    }
    install.packages(pkg, dependencies = TRUE)
    library(pkg, character.only = TRUE)
  }
  invisible(lapply(packages, FUN = attach_or_install))
}
# Packages this script relies on; attach them, installing any that are missing.
packages <- c(
  "rvest", "httr", "polite", "tidyverse",
  "data.table"
)
fpackage.check(packages)
# Scrape information from a name page: yearly birth frequency and total frequency.
# Collect the yearly and overall frequency data for a single name.
#
# session:   polite session obtained from `bow()` on the host.
# name:      the name to look up, e.g. "Lisa".
# file.name: path of the .rda cache file for this name. When it exists it is
#            loaded (it must define `raw_data`); otherwise the page is scraped
#            and the raw result is saved to this path.
#
# Returns a tibble with columns `n_total`, `year_list`, `value_list`, `names`.
get_name_year_frequency <- function(session, name, file.name) {
  if (!file.exists(file.name)) {
    # No cached copy yet: negotiate the name-specific path with the host
    # (the higher-level path was already agreed via `bow()`), then scrape it.
    name_path <- paste0("/nvb/naam/is/", name)
    name_session <- nod(session, path = name_path)
    name_page <- scrape(name_session)
    # Keep both the page text and the parsed tables as the raw record.
    raw_data <- list(html_text(name_page), html_table(name_page))
    save(raw_data, file = file.name)
  } else {
    # Restores `raw_data` into this function's environment.
    load(file.name)
  }

  page_text <- raw_data[[1]]
  page_tables <- raw_data[[2]]

  # The series live in the text after the first "<!--" marker, each one
  # wrapped in parentheses on its own line.
  text_pieces <- str_split(page_text, "<!--")
  series_text <- text_pieces[[1]][2]
  bracketed <- str_extract_all(series_text, "\\(.*\\)")

  # First parenthesised run: the four-digit years.
  year_matches <- str_extract_all(bracketed[[1]][1], "[:digit:]{4}")
  year_list <- as.numeric(year_matches[[1]])

  # Fourth parenthesised run: the births per year.
  birth_matches <- str_extract_all(bracketed[[1]][4][[1]], "[:digit:]{1,5}")
  value_list <- as.numeric(birth_matches[[1]])

  # Overall frequency: rows 2 and 6 of column X3 in the first table, where
  # "--" stands for zero.
  # NOTE(review): n_m / n_v presumably are the male / female counts - confirm.
  n_m <- str_replace(page_tables[[1]][["X3"]][2], "--", "0")
  n_v <- str_replace(page_tables[[1]][["X3"]][6], "--", "0")
  n_total <- as.numeric(n_m) + as.numeric(n_v)

  # Long format: one row per year, with the total and the name repeated.
  tibble::tibble(n_total, year_list, value_list, names = name)
}
# Check permissions and introduce ourselves to the host before scraping.
session <- bow(
  "https://www.meertens.knaw.nl/nvb/",
  user_agent = "Thijmen Jeroense, Radboud University Nijmegen",
  delay = 1
)
session
# Archive folder for the raw scrape results, so they can be reused later
# without contacting the host again.
archive <- "data_analysis/data/data_raw/meertens_name_V2//"
# Import the table of names to look up.
namelist_df <- read_delim(
  file = "data_analysis/data/data_processed/meertens_scrape/name_numbers.csv",
  delim = ","
)
# Unique names only, with the column called `names`.
namelist <- distinct(
  select(
    rename(namelist_df, names = name),
    names
  )
)
# Collect one result table per name.
# The result list is preallocated rather than grown inside the loop.
n_names <- length(namelist$names)
dfs <- vector("list", n_names)
# Guard the empty case: `1:length(x)` would iterate over c(1, 0), and
# txtProgressBar() requires max > min (so min = 1 also failed for one name).
if (n_names > 0) {
  # Progress bar counting completed names out of the total.
  pb <- txtProgressBar(
    min = 0, max = n_names,
    initial = 0, char = "-",
    width = 70, style = 3
  )
  # Main loop: scrape each name, or load it from the archive when cached.
  for (i in seq_len(n_names)) {
    file.name <- paste0(archive, namelist$names[i], ".rda")
    dfs[[i]] <- get_name_year_frequency(
      session = session,
      name = namelist$names[i],
      file.name = file.name
    )
    # Update after the work so the bar reflects finished names.
    setTxtProgressBar(pb, i)
  }
  # Finish the progress line so later console output starts cleanly.
  close(pb)
}
# Align the key column with the scraped tables: `name` becomes `names`.
namelist_df <- rename(namelist_df, names = name)
# Stack the per-name tables and attach the original name information.
df_frequency_ethnic_names <- left_join(
  rbindlist(dfs),
  namelist_df,
  by = "names"
)
# Export the combined result.
write_csv(
  df_frequency_ethnic_names,
  file = "data_analysis/data/data_processed/meertens_scrape/dutch_names_frequency_18802016.csv"
)
# Copyright © 2024 Jeroense Thijmen