#find-all-collectors.R
library(tidyverse)
library(WikidataQueryServiceR)
library(jsonlite)
#modified function based on query_wikidata in the WikidataQueryServiceR package
#this one avoids the encoding error in text extraction
#it also reads every column as character, to avoid faulty column type
#guessing by readr for sparse identifiers, e.g. ORCIDs
querki <- function(query, h = "text/csv") {
  response <- httr::GET(url = "https://query.wikidata.org/sparql",
                        query = list(query = query),
                        httr::add_headers(Accept = h),
                        httr::user_agent("Matdillen"))
  return(httr::content(response, type = h, col_types = cols(.default = "c")))
}
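#example use (a minimal sketch; any SELECT query against the endpoint
#returns an all-character tibble):
# humans = querki('SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . } LIMIT 5')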
#convert logical columns to character for compatibility when joining
#columns default to logical if all their values are empty
#lapply (not sapply) keeps the result a list of columns, so non-logical
#columns are left untouched instead of being coerced via a matrix
tochar <- function(df) {
  df2 = as_tibble(lapply(df,
                         function(x) if (is.logical(x)) {
                           as.character(x)
                         } else {
                           x
                         }))
  return(df2)
}
#process the different source SPARQL queries
raw = list()
iter = 1
#entomologists of the world
query <- 'SELECT ?item ?itemLabel ?entom_id ?orcid ?viaf ?isni ?yob ?yod WHERE {
?item wdt:P5370 ?entom_id. #entomologists of the world
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P496 ?orcid .}
OPTIONAL { ?item wdt:P214 ?viaf .}
OPTIONAL { ?item wdt:P213 ?isni .}
OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}'
raw[[iter]] = querki(query)
iter = iter + 1
#BHL creator
query <- 'SELECT ?item ?itemLabel ?bhl_id ?orcid ?viaf ?isni ?yob ?yod WHERE {
?item wdt:P4081 ?bhl_id. #BHL creator
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P496 ?orcid .}
OPTIONAL { ?item wdt:P214 ?viaf .}
OPTIONAL { ?item wdt:P213 ?isni .}
OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}'
#raw[[iter]] = query_wikidata(query)
raw[[iter]] = querki(query)
iter = iter + 1
#Harvard
query <- 'SELECT ?item ?itemLabel ?harv_id ?orcid ?viaf ?isni ?yob ?yod WHERE {
?item wdt:P6264 ?harv_id. #Harvard index of botanists
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P496 ?orcid .}
OPTIONAL { ?item wdt:P214 ?viaf .}
OPTIONAL { ?item wdt:P213 ?isni .}
OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}'
raw[[iter]] = querki(query)
iter = iter + 1
#ZooBank
query <- 'SELECT ?item ?itemLabel ?zoo_id ?orcid ?viaf ?isni ?yob ?yod WHERE {
?item wdt:P2006 ?zoo_id. #zoobank
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P496 ?orcid .}
OPTIONAL { ?item wdt:P214 ?viaf .}
OPTIONAL { ?item wdt:P213 ?isni .}
OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}'
raw[[iter]] = querki(query)
iter = iter + 1
#IPNI
query <- 'SELECT ?item ?itemLabel ?ipni_id ?orcid ?viaf ?isni ?yob ?yod WHERE {
?item wdt:P586 ?ipni_id. #ipni
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P496 ?orcid .}
OPTIONAL { ?item wdt:P214 ?viaf .}
OPTIONAL { ?item wdt:P213 ?isni .}
OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}'
raw[[iter]] = querki(query)
iter = iter + 1
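#the five ID-based queries above differ only in the property and the
#variable name; a minimal sketch of a builder that could generate them
#(make_id_query is a hypothetical helper, not part of any package):
make_id_query <- function(prop, idvar) {
  paste0('SELECT ?item ?itemLabel ?', idvar, ' ?orcid ?viaf ?isni ?yob ?yod WHERE {
  ?item wdt:', prop, ' ?', idvar, '.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
  OPTIONAL { ?item wdt:P496 ?orcid .}
  OPTIONAL { ?item wdt:P214 ?viaf .}
  OPTIONAL { ?item wdt:P213 ?isni .}
  OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
  OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}')
}
#e.g. make_id_query("P586", "ipni_id") rebuilds the IPNI query above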
#Wikispecies: humans with an article on Wikispecies
query <- 'SELECT ?item ?itemLabel ?article ?orcid ?viaf ?isni ?yob ?yod
WHERE
{
?item wdt:P31 wd:Q5 .
?article schema:about ?item ;
schema:isPartOf <https://species.wikimedia.org/> .
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P496 ?orcid .}
OPTIONAL { ?item wdt:P214 ?viaf .}
OPTIONAL { ?item wdt:P213 ?isni .}
OPTIONAL { ?item wdt:P569 ?dob . BIND(YEAR(?dob) as ?yob) }
OPTIONAL { ?item wdt:P570 ?dod . BIND(YEAR(?dod) as ?yod) }
}'
raw[[iter]] = querki(query)
iter = iter + 1
#all biologists
# query <- 'SELECT ?item ?itemLabel ?occupation ?orcid ?viaf ?isni WHERE {
# ?item wdt:P106/wdt:P279* wd:Q864503 . #all biologists incl subclasses
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
# OPTIONAL { ?item wdt:P496 ?orcid .}
# OPTIONAL { ?item wdt:P214 ?viaf .}
# OPTIONAL { ?item wdt:P213 ?isni .}
# OPTIONAL { ?item wdt:P106 ?occupation .} #not the label, as that timed out
# }'
# raw[[iter]] = querki(query)
# iter = iter + 1
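#a sketch of running the timed-out query above in chunks via LIMIT/OFFSET
#paging (untested; deep offsets may still time out on this endpoint):
# bio = list()
# offset = 0
# repeat {
#   chunk = querki(paste0('SELECT ?item ?orcid WHERE {
#   ?item wdt:P106/wdt:P279* wd:Q864503 .
#   OPTIONAL { ?item wdt:P496 ?orcid .}
# } LIMIT 100000 OFFSET ', offset))
#   if (dim(chunk)[1] == 0) break
#   bio[[length(bio) + 1]] = chunk
#   offset = offset + 100000
# }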
#for comparison: query results using GUI
# setwd("D:/apm/STSM mobilise/data/select")
# di = list.files(pattern="*.tsv")
# raw2=list()
# for (i in 1:length(di)) {
# raw2[[i]] = read_tsv(di[i],col_types = cols(.default = "c"))
# }
#fix column types
for (i in 1:length(raw)) {
raw[[i]] = tochar(raw[[i]])
}
#full join
allb = full_join(raw[[1]],raw[[2]])
if (length(raw)>2) {
for (i in 3:length(raw)) {
allb = full_join(allb,raw[[i]])
}
}
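#equivalent one-liner with purrr (part of the tidyverse), folding
#full_join over the list of sources:
# allb = reduce(raw, full_join)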
#deduplication, start with occupation
#find all occupations
#this didn't work: wdt:P279 alone doesn't recurse into subclasses
#(the subclasses of zoologist, for instance, are missed)
# query <- 'SELECT ?item ?itemLabel WHERE {
# ?item wdt:P279 wd:Q864503 .
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
# }'
# occup = querki(query)
# allb$occupation2 = gsub("http://www.wikidata.org/entity/","",allb$occupation)
# for (i in 1:dim(occup)[1]) {
# allb$occupation2[allb$occupation==occup$item[i]] = occup$itemLabel[i]
# }
# allb$occupation2[allb$occupation=="http://www.wikidata.org/entity/Q864503"] = "biologist"
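#a sketch of the recursive variant, using the wdt:P279* property path to
#walk all subclass levels (untested here; may be slow):
# query <- 'SELECT ?item ?itemLabel WHERE {
#   ?item wdt:P279* wd:Q864503 .
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
# }'
# occup = querki(query)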
#deduplication try 2
allb.names = filter(allb, !duplicated(item))
#count how many deduplicated items carry each identifier column
for (i in 3:length(allb.names)) {
  z = sum(!is.na(allb.names[[i]]))
  print(paste0(colnames(allb.names)[i], ": ", z))
}
#count items carrying each pairwise combination of identifier columns
twoways = t(combn(3:length(allb.names), 2))
for (i in 1:dim(twoways)[1]) {
  z = sum(!is.na(allb.names[[twoways[i, 1]]]) &
            !is.na(allb.names[[twoways[i, 2]]]))
  print(paste0(colnames(allb.names)[twoways[i, 1]],
               ", ",
               colnames(allb.names)[twoways[i, 2]],
               ": ",
               z))
}
#test for items with no English label: the label service falls back to
#the Q-identifier when no label exists in the requested languages
allb.names$id = gsub("http://www.wikidata.org/entity/", "", allb.names$item)
View(filter(allb.names, id == itemLabel))
#surname: everything after the last whitespace in the label
allb.names$sname = gsub("^(.*[\\s])", "", allb.names$itemLabel, perl = T)
#try the IPNI beta API
ipni_info = list()
for (i in 1:dim(raw[[5]])[1]) {
ipni_info[[i]] = fromJSON(paste0("https://beta.ipni.org/api/1/a/urn:lsid:ipni.org:authors:",raw[[5]]$ipni_id[i]))
if (i%%5000==0) {
print(i)
}
}
#timed out after 317 requests; in an evening run, after 2933 requests
#maybe feasible with tryCatch, but it will take hours!
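#a minimal sketch of a more robust fetch against the same endpoint:
#failed requests return NULL instead of aborting the loop, and a short
#pause keeps the request rate down (still hours for the full list)
# fetch_ipni <- function(id) {
#   tryCatch(fromJSON(paste0("https://beta.ipni.org/api/1/a/urn:lsid:ipni.org:authors:", id)),
#            error = function(e) NULL)
# }
# for (i in 1:dim(raw[[5]])[1]) {
#   ipni_info[[i]] = fetch_ipni(raw[[5]]$ipni_id[i])
#   Sys.sleep(0.1)
# }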
####Match BG-Base collectors to Wikidata items
setwd("D:/apm/STSM mobilise/data")
bgb = read_csv("COLLECTORS_24APR19.csv",locale = locale(encoding = "latin1"))
#View(filter(allb.names,sname%in%bgb$LNAME))
#make date ranges
allb.names$date = paste(allb.names$yob,allb.names$yod,sep="-")
allb.names$date = gsub("NA","",allb.names$date)
#extract the year: the last four characters of the BG-Base date strings
#(substr and nchar are vectorized, so no loop is needed)
bgb$date1 = substr(bgb$BIRTH_DT, nchar(bgb$BIRTH_DT) - 3, nchar(bgb$BIRTH_DT))
bgb$date2 = substr(bgb$DEATH_DT, nchar(bgb$DEATH_DT) - 3, nchar(bgb$DEATH_DT))
bgb$date = paste(bgb$date1,bgb$date2,sep="-")
bgb$date = gsub("NA","",bgb$date)
bgb$date[bgb$date=="-"]="NA"
#try wikidata api
# allb.names$id = gsub("http://www.wikidata.org/entity/","",allb.names$item)
# tst50 = paste(allb.names$id[1:50],collapse="|")
# test = fromJSON(paste0("https://www.wikidata.org/w/api.php?action=wbgetentities&ids=",tst50,"&format=json"))
#
# library(httr)
# tst100 = paste(allb.names$id[51:100],collapse="|")
# test3 = httr::GET(url = paste0("https://www.wikidata.org/w/api.php?action=wbgetentities&ids=",tst100,"&format=json"),
# httr::user_agent("Matdillen"))
# test3.c = httr::content(test3,type="application/json")
#
# for (i in seq(from=1,to=dim(allb.names)[1],by=50)) {
# #too big for memory, will be difficult to effectively parse
# }
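#a sketch of batched wbgetentities calls that keep only the English
#labels, to avoid holding the full JSON responses in memory; 50 ids is
#the API maximum per request for anonymous users (%||% comes from purrr)
# labels = list()
# for (i in seq(from = 1, to = dim(allb.names)[1], by = 50)) {
#   ids = paste(allb.names$id[i:min(i + 49, dim(allb.names)[1])], collapse = "|")
#   ent = fromJSON(paste0("https://www.wikidata.org/w/api.php?action=wbgetentities&ids=",
#                         ids, "&props=labels&languages=en&format=json"))
#   labels[[length(labels) + 1]] = sapply(ent$entities,
#                                         function(e) e$labels$en$value %||% NA)
# }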
bgb$fullname = paste(bgb$FNAME,bgb$MNAME,bgb$LNAME)
bgb$fullname = gsub("NA ","",bgb$fullname)
#View(filter(bgb,fullname%in%allb.names$itemLabel))
#Find how many specimens a COLL_ID collected
#based on the latest BG-Base specimen export, imported beforehand as
#'exp' (data from May 2019); 'speccoll' holds specimen counts per COLL_ID
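#a sketch of how 'speccoll' could be derived from 'exp' (assumption: the
#export has one row per specimen with a COLL_ID column):
# speccoll = count(exp, COLL_ID)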
bgb$n = NA
bgb$mndate = NA
bgb$mxdate = NA
for (i in 1:dim(bgb)[1]) {
k = speccoll[speccoll$COLL_ID==bgb$COLL_ID[i],]
if (is.na(bgb$date1[i])) {
dates = filter(exp,bgb$COLL_ID[i]==COLL_ID,is.na(COLL_DT)==F)$COLL_DT
if (length(dates)>0) {
bgb$mndate[i] = min(as.numeric(substr(dates,8,11)))
bgb$mxdate[i] = max(as.numeric(substr(dates,8,11)))
}
}
bgb$n[i]=k$n[1]
if (i%%100==0) {
print(i)
}
}
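#a sketch of the same date-range computation with dplyr grouping instead
#of the row-by-row loop, assuming the same fixed date format as above:
# ranges = exp %>%
#   filter(!is.na(COLL_DT)) %>%
#   mutate(year = as.numeric(substr(COLL_DT, 8, 11))) %>%
#   group_by(COLL_ID) %>%
#   summarise(mndate = min(year), mxdate = max(year))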
#discard ranges over 100 years: probably an error in one of the dates or associations
for (i in 1:dim(bgb)[1]) {
if (is.na(bgb$mndate[i])==F&is.na(bgb$mxdate[i])==F) {
dur = bgb$mxdate[i] - bgb$mndate[i]
if (dur > 100) {
bgb$mndate[i] = NA
bgb$mxdate[i] = NA
}
}
}
#only people who have collected specimens
bgb2 = filter(bgb,n>0)
#only those who have some sort of date
bgb3 = filter(bgb2,is.na(date1)==F|is.na(mndate)==F)
bgb3 = filter(bgb3,is.na(LNAME)==F) #remove no collector specified
#allow a 15 year margin for childhood: assume no collecting before age 15
bgb3$mndate2 = bgb3$mndate - 15
bgb3$options = NA
bgb3$lopt = NA
#create initials for the wikidata item labels
#strip quotes, drop every character not preceded by whitespace, then
#prepend the first letter, e.g. "John R. Smith" -> "JRS"
allb.names$init = gsub("\'", "", allb.names$itemLabel, fixed = T)
allb.names$init = gsub("\"", "", allb.names$init, fixed = T)
allb.names$init = gsub("(?<!\\s).", "", allb.names$init, perl = T)
allb.names$init = paste0(substr(allb.names$itemLabel, 1, 1), allb.names$init)
#match (see documentation for more explanation); the cascade is:
#1) exact full-name match; 2) exact year of birth plus (fuzzy) name
#matching; 3) a plausible lifespan around the collecting dates plus
#(fuzzy) name matching
for (i in 1:dim(bgb3)[1]) {
truem = filter(allb.names,itemLabel==bgb3$fullname[i])
if (dim(truem)[1]>0) {
bgb3$lopt[i] = dim(truem)[1]
bgb3$options[i] = paste(truem$itemLabel,collapse="|")
next
}
yob = filter(allb.names,yob==bgb3$date1[i])
if (dim(yob)[1]>0) {
yob = filter(yob,agrepl(bgb3$LNAME[i],itemLabel))
len = dim(yob)[1]
if (is.na(bgb3$FNAME[i])==F) {
if (grepl(".",bgb3$FNAME[i],fixed=T)==F) {
yob = filter(yob,agrepl(bgb3$FNAME[i],itemLabel))
len = dim(yob)[1]
}
if (grepl(".",bgb3$FNAME[i],fixed=T)) {
yob = filter(yob,grepl(gsub("\\.","",bgb3$FNAME[i]),init))
len=dim(yob)[1]
}
}
if (len > 1&is.na(bgb3$MNAME[i])==F) {
yob = filter(yob,agrepl(bgb3$MNAME[i],itemLabel))
len = dim(yob)[1]
}
bgb3$lopt[i] = len
bgb3$options[i] = paste(yob$itemLabel,collapse="|")
print(i)
next
}
if (is.na(bgb3$mndate2[i])==F) {
#next
yob = filter(allb.names,yob<bgb3$mndate2[i])
yob = filter(yob,is.na(yod)|yod>bgb3$mxdate[i])
if (dim(yob)[1]>0) {
yob = filter(yob,agrepl(bgb3$LNAME[i],itemLabel))
len = dim(yob)[1]
if (is.na(bgb3$FNAME[i])==F) {
if (grepl(".",bgb3$FNAME[i],fixed=T)==F) {
yob = filter(yob,agrepl(bgb3$FNAME[i],itemLabel))
len = dim(yob)[1]
}
if (grepl(".",bgb3$FNAME[i],fixed=T)) {
yob = filter(yob,grepl(gsub("\\.","",bgb3$FNAME[i]),init))
len=dim(yob)[1]
}
}
if (len > 1&is.na(bgb3$MNAME[i])==F) {
yob = filter(yob,agrepl(bgb3$MNAME[i],itemLabel))
len = dim(yob)[1]
}
bgb3$lopt[i] = len
bgb3$options[i] = paste(yob$itemLabel,collapse="|")
print(i)
next
}
}
}
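#the name-filtering block appears twice in the loop above; a sketch of a
#helper that could replace both copies (refine_by_name is hypothetical):
# refine_by_name <- function(cand, lname, fname, mname) {
#   cand = filter(cand, agrepl(lname, itemLabel))
#   if (!is.na(fname)) {
#     if (grepl(".", fname, fixed = T)) {
#       #dotted first name: match initials against the init column
#       cand = filter(cand, grepl(gsub("\\.", "", fname), init))
#     } else {
#       cand = filter(cand, agrepl(fname, itemLabel))
#     }
#   }
#   if (dim(cand)[1] > 1 & !is.na(mname)) {
#     cand = filter(cand, agrepl(mname, itemLabel))
#   }
#   return(cand)
# }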
#unmatched: wikidata items without a year of birth, and collectors without a unique match
allb.nd = filter(allb.names,is.na(yob))
bgb4 = filter(bgb3,lopt!=1)
#get pub info from wikidata (single-item template; item is a Q-identifier)
# query <- paste0('SELECT ?item ?itemLabel ?pub WHERE
# {
# ?item wdt:P50 wd:',item,'.
# SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
# OPTIONAL { ?item wdt:P577 ?pub.}
# }')
# pubs = querki(query)
#initialize with a dummy row (removed again in post processing)
pubst = tibble(item = 0, itemLabel = 0, pub = 0, id = 0)
#one SPARQL query per item: works, but hammers the endpoint
#(10001:20000 is the batch currently being processed)
for (i in 10001:20000) {
query <- paste0('SELECT ?item ?itemLabel ?pub WHERE
{
?item wdt:P50 wd:',allb.nd$id[i],'.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
OPTIONAL { ?item wdt:P577 ?pub.}
}')
pubs = querki(query)
if (dim(pubs)[1]>0) {
pubs$id = allb.nd$id[i]
pubst = rbind(pubst,pubs)
}
print(i)
}
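#repeated rbind() grows the table quadratically; a sketch of a list-based
#alternative that collects the results and binds once at the end:
# pubsl = list()
# for (i in 10001:20000) {
#   query <- paste0('SELECT ?item ?itemLabel ?pub WHERE
# {
#   ?item wdt:P50 wd:', allb.nd$id[i], '.
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
#   OPTIONAL { ?item wdt:P577 ?pub.}
# }')
#   pubs = querki(query)
#   if (dim(pubs)[1] > 0) {
#     pubs$id = allb.nd$id[i]
#     pubsl[[length(pubsl) + 1]] = pubs
#   }
# }
# pubst = bind_rows(pubsl)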
#post processing
#one row per author id; drop the dummy initialization row
pubst.uni = filter(pubst, !duplicated(id))
pubst.uni = pubst.uni[-1, ]
pubst2 = pubst
pubst2$year = substr(pubst2$pub, 1, 4)
pubst2 = pubst2[-1, ]
#publication year range per author (NA if any publication lacks a date)
pubst.uni$mndate = NA
pubst.uni$mxdate = NA
for (i in 1:dim(pubst.uni)[1]) {
  giv = filter(pubst2, id == pubst.uni$id[i])
  pubst.uni$mndate[i] = min(as.numeric(giv$year))
  pubst.uni$mxdate[i] = max(as.numeric(giv$year))
}