-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmalesian.R
32 lines (29 loc) · 917 Bytes
/
malesian.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
library(tidyverse)
# Extract the person names from the HTML of the individual collector pages.
# The pages are stored in one folder per letter, except that X and Y share the
# folder "XY".
#
# Result: `names`, a tibble with a single character column `names` holding one
# row per page, with the summary pages (header contains "Cyclopaedia of")
# removed.
setwd("D:/apm/STSM mobilise/data")
setwd("cyclopaedia-malesian-collectors-master")

# Folder names: A-W, the combined folder "XY", and Z.
az <- LETTERS
az <- az[-25]
az[24] <- "XY"

# Return the text of the first <h2> header found in the file `path`, with all
# HTML tags stripped, or NA_character_ when the file has no complete <h2>.
extract_h2_text <- function(path) {
  pars <- readLines(path)
  open_ids <- grep("<h2", pars, fixed = TRUE)
  if (length(open_ids) == 0) {
    return(NA_character_)
  }
  first_open <- open_ids[1]
  # Pair the opening tag with the first closing tag at or after it, so that a
  # page with several <h2> headers does not run on into the next header.
  close_ids <- grep("</h2", pars, fixed = TRUE)
  close_ids <- close_ids[close_ids >= first_open]
  if (length(close_ids) == 0) {
    return(NA_character_)
  }
  # The header may span several lines: join them, then remove the HTML tags.
  naam <- paste(pars[first_open:close_ids[1]], collapse = "")
  gsub("<[^<>]*>", "", naam)
}

# Collect the names per folder in a preallocated list instead of growing a
# tibble row by row; files are addressed by path so the working directory
# never changes while reading.
found <- vector("list", length(az))
for (j in seq_along(az)) {
  if (!dir.exists(az[j])) {
    stop("Folder not found: ", az[j], call. = FALSE)
  }
  di <- list.files(az[j], full.names = TRUE)
  found[[j]] <- vapply(di, extract_h2_text, character(1), USE.NAMES = FALSE)
  no_header <- is.na(found[[j]])
  if (any(no_header)) {
    warning(
      "No <h2> header found in: ", paste(di[no_header], collapse = ", "),
      call. = FALSE
    )
  }
  print(az[j])
}

# Drop the pages without a header and the summary pages.
names <- tibble(names = as.character(unlist(found)))
names <- filter(names, !is.na(names), !grepl("Cyclopaedia of", names))