forked from karpathy/arxiv-sanity-preserver
-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathmake_cache.py
114 lines (96 loc) · 3.7 KB
/
make_cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
Computes various caches on top of the pickled paper database (db.p) so that
the server (running from serve.py) can start up and serve faster when restarted.
This script should be run whenever db.p is updated; it creates db2.p, which
can be read by the server.
"""
import os
import json
import pymongo
import time
import pickle
import dateutil.parser
from sqlite3 import dbapi2 as sqlite3
from utils import safe_pickle_dump, Config
# Connection to the SQLite database at Config.database_path.
sqldb = sqlite3.connect(Config.database_path)
sqldb.row_factory = sqlite3.Row  # rows can be indexed by column name, not only by position

# Container for everything this script precomputes.
CACHE = {}

print('loading the paper database', Config.db_path)
# NOTE: unpickling can execute arbitrary code; Config.db_path must point at a
# trusted, locally produced file.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open(Config.db_path, 'rb') as db_file:
    db = pickle.load(db_file)

print('loading tfidf_meta', Config.meta_path)
# NOTE(review): the tfidf metadata load below is disabled, so the names `vocab`
# and `idf` are never defined by this script even though the message above is
# still printed.
# meta = pickle.load(open(Config.meta_path, "rb"))
# vocab = meta['vocab']
# idf = meta['idf']
print('decorating the database with additional information...')
# Parse every timestamp exactly once. time.mktime() on the parsed time tuple
# gives local-time epoch seconds, equivalent to the previous strftime("%s")
# call, without relying on a platform-specific format directive.
updated_epoch = {}
for pid, p in db.items():
    updated_epoch[pid] = time.mktime(dateutil.parser.parse(p['updated']).timetuple())
    p['time_updated'] = int(updated_epoch[pid])  # store in struct for future convenience
    published = dateutil.parser.parse(p['published'])
    p['time_published'] = int(time.mktime(published.timetuple()))  # store in struct for future convenience

print('computing min/max time for all papers...')
tts = list(updated_epoch.values())
# An empty database has no min/max; fall back to 0.0 instead of raising ValueError.
ttmin = min(tts) * 1.0 if tts else 0.0
ttmax = max(tts) * 1.0 if tts else 0.0
ttspan = ttmax - ttmin
for pid, p in db.items():
    # tscore is the update time rescaled to [0, 1] (oldest paper -> 0, newest -> 1).
    # When all papers share one timestamp the span is zero; score them 0.0
    # rather than dividing by zero.
    p['tscore'] = (updated_epoch[pid] - ttmin) / ttspan if ttspan > 0 else 0.0
print('precomputing papers date sorted...')
# Paper ids ordered from most to least recently updated. sorted() is stable,
# so papers with equal 'time_updated' keep the order in which db yields them.
CACHE['date_sorted_pids'] = sorted(
    db,
    key=lambda paper_id: db[paper_id]['time_updated'],
    reverse=True,
)
# rank papers by how many rows of the `library` table reference them
print('computing top papers...')
libs = sqldb.execute('''select * from library''').fetchall()

# tally one hit per library row, keyed by paper id
counts = {}
for entry in libs:
    paper = entry['paper_id']
    if paper in counts:
        counts[paper] += 1
    else:
        counts[paper] = 1

# (count, paper id) pairs, largest first; ties fall back to descending paper id
top_paper_counts = [(hits, paper) for paper, hits in counts.items() if hits > 0]
top_paper_counts.sort(reverse=True)
CACHE['top_sorted_pids'] = [paper for _, paper in top_paper_counts]
# helpers used when tokenizing text for the search index
# characters stripped from text before splitting into words; the hyphen is
# deliberately kept so hyphenated terms stay intact
punc = "'!\"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'" # removed hyphen from string.punctuation
# str.translate table mapping each of those characters to None (i.e. delete it)
trans_table = str.maketrans('', '', punc)
def makedict(s, forceidf=None, scale=1.0):
    """Map every distinct word of *s* to a weight.

    The text is lower-cased, stripped of the characters in the module-level
    ``trans_table`` and split on whitespace.

    s        -- text to tokenize
    forceidf -- when not None, every word gets exactly this weight
                (``scale`` is not applied in that case)
    scale    -- multiplier applied to looked-up / default idf values

    With ``forceidf`` left as None the weight is ``idf[vocab[w]] * scale`` for
    words present in the module-level ``vocab``, else ``1.0 * scale``.
    NOTE(review): nothing in this file currently defines ``vocab`` or ``idf``
    (their load is commented out), so that path needs them supplied first.
    """
    tokens = set(s.lower().translate(trans_table).strip().split())

    # a forced weight short-circuits the vocabulary lookup entirely
    if forceidf is not None:
        return {tok: forceidf for tok in tokens}

    # todo: if we're using bigrams in vocab then this won't search over them
    # words without a known idf are assumed to have idf 1.0 (low)
    return {
        tok: (idf[vocab[tok]] if tok in vocab else 1.0) * scale
        for tok in tokens
    }
def merge_dicts(dlist):
    """Combine the dicts in *dlist* into one new dict, summing the values of shared keys.

    The input dicts are not modified; an empty *dlist* yields an empty dict.
    """
    totals = {}
    # walk all (key, value) pairs of all dicts in order
    pairs = (item for weights in dlist for item in weights.items())
    for term, weight in pairs:
        totals[term] = totals.get(term, 0) + weight
    return totals
print('building an index for faster search...')
# The per-paper index construction below is disabled, so the index stored in
# the cache is an empty dict.
search_dict = CACHE['search_dict'] = {}
# for pid,p in db.items():
#     dict_title = makedict(p['title'], forceidf=5, scale=3)
#     dict_authors = makedict(' '.join(x['name'] for x in p['authors']), forceidf=5)
#     dict_categories = {x['term'].lower():5 for x in p['tags']}
#     if 'and' in dict_authors:
#         # special case for "and" handling in authors list
#         del dict_authors['and']
#     dict_summary = makedict(p['summary'])
#     search_dict[pid] = merge_dicts([dict_title, dict_authors, dict_categories, dict_summary])
# Mirror the decorated papers into MongoDB: database `arxiv`, collection `papers`.
client = pymongo.MongoClient()
mdb = client.arxiv
papers = mdb.papers

for key, val in db.items():
    val['_id'] = key  # the paper id doubles as the MongoDB document id

# Empty the collection before reloading it. Collection.remove() was deprecated
# in PyMongo 3 and removed in PyMongo 4; delete_many({}) is the replacement.
papers.delete_many({})

# insert_many() rejects an empty document list, so skip it when db has no papers.
if db:
    papers.insert_many(list(db.values()))
# persist the precomputed cache first, then the decorated paper database
for payload, target in ((CACHE, Config.serve_cache_path), (db, Config.db_serve_path)):
    print('writing', target)
    safe_pickle_dump(payload, target)