-
Notifications
You must be signed in to change notification settings - Fork 2
/
4_draw_graphs.py
143 lines (114 loc) · 4.96 KB
/
4_draw_graphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import matplotlib;
matplotlib.use('pdf')
import matplotlib.pyplot as plt;
import pickle
import os
import logging
import argparse
#from datetime import datetime
#from itertools import groupby
from collections import Counter
import pandas as pd;
import seaborn as sns;
import numpy as np
from mpl_toolkits.axes_grid.inset_locator import inset_axes
from estimators import Estimate, unravel
from metrics import conv
from graph_tool.all import *
# Progress messages are emitted at INFO level, each prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
)

# Command-line interface. Every option carries a default, so the script
# can be started without any arguments.
parser = argparse.ArgumentParser(
    description='Run the estimators on observations.',
)
parser.add_argument(
    '--infile', '-f',
    help='the estimations to load',
    default="data/estimates_wikidatawiki-20181001-pages.pickle",
)
parser.add_argument(
    '--outpath', '-o',
    help='the outpath of the graphics',
    default="docs/",
)
parser.add_argument(
    '--ingraph', '-g',
    help='the graph to load (only necessary for nice labels)',
    default="data/wikidata-20180813-all.json.bz2.universe.noattr.gt.bz2",
)
parser.add_argument(
    '--results', '-r',
    help='all results',
    default="data/results_wikidatawiki-20181001-pages.pickle",
)
args = parser.parse_args()
def plot(df, title, ax, idx):
    """Draw the estimator curves on ``ax`` and summarise them in one row.

    Every series in the fixed list below is plotted against the first
    column of ``df``.  For each series other than ``'Distinct'`` the value
    ``conv(series, distinct)`` is stored under the key ``'<series> ρ'``.
    The maximum of ``'Distinct'`` is stored last, as ``numpy.float64``.

    Args:
        df: frame whose first column is the x axis and which holds the
            columns 'N1‒UNIF', 'Chao92', 'Jack1', 'SOR' and 'Distinct'.
        title: axes title; also used as the index label of the returned row.
        ax: matplotlib axes to draw on.
        idx: not read by this function.

    Returns:
        A one-row ``DataFrame`` indexed by ``title`` whose columns are the
        four ``'<series> ρ'`` metrics followed by ``'Distinct'``.
    """
    reference = 'Distinct'
    series_names = ('N1‒UNIF', 'Chao92', 'Jack1', 'SOR', reference)
    x_column = df.columns[0]

    ax.set_title(title + '\n')

    summary = {}
    column_order = []
    for position, series in enumerate(series_names):
        # the position of the series is handed to seaborn as ``markers``
        sns.lineplot(x=x_column, y=series, data=df, label=series, ax=ax,
                     markers=position)
        if series == reference:
            # the reference curve is not compared against itself
            continue
        key = series + ' ρ'
        column_order.append(key)
        summary[key] = conv(list(df[series]), list(df[reference]))

    ax.set(xlabel='Sample Periods', ylabel='Cardinality')

    # the observed maximum of the reference curve closes the row
    column_order.append(reference)
    summary[reference] = df[reference].max().astype(np.float64)

    # one row named after the title, columns in the order collected above
    row = pd.DataFrame({title: summary}).transpose()
    return row.reindex(columns=column_order)
def plotInd(df, title, ax, idx):
    """Plot the normalised indicator curves ``$f_1$`` and ``Distinct`` on ``ax``.

    Both columns are divided by the maximum of ``Distinct`` before they are
    drawn against the first column of ``df``.  The scaling is applied to a
    copy, so the caller's frame keeps its raw values and can still be used
    afterwards (e.g. passed to ``plot``) regardless of call order.

    Args:
        df: frame whose first column is the x axis and which holds the
            columns '$f_1$' and 'Distinct'.
        title: not read by this function.
        ax: matplotlib axes to draw on.
        idx: not read by this function.

    Returns:
        None.
    """
    # indicators
    peak = df['Distinct'].max().astype(np.float64)

    # work on a copy: writing the scaled columns back into ``df`` would
    # silently replace the caller's data with the normalised values
    scaled = df.copy()
    scaled['$f_1$'] = df['$f_1$'] / peak
    scaled['Distinct'] = df['Distinct'] / peak

    x_column = scaled.columns[0]
    for column in ('$f_1$', 'Distinct'):
        sns.lineplot(x=x_column, y=column, data=scaled, label=column, ax=ax)
    ax.set(xlabel='Sample Periods', ylabel='Indicators')
# ---------------------------------------------------------------------------
# Script body: load the estimates, render one figure (PDF + PNG) per class,
# collect the per-class summary rows, and write results pickle + index.html.
# ---------------------------------------------------------------------------

universe = None
if args.ingraph != '':
    # load graph, only need for the titles here
    logging.info("load graph: " + args.ingraph)
    universe = load_graph(args.ingraph)
    # map the value of the ``q`` vertex property to its vertex, split by
    # the boolean ``item`` vertex property
    q2v = {}
    p2v = {}
    for v in universe.vertices():
        if universe.vp.item[v]:  # items => Q
            q2v[universe.vp.q[v]] = v
        else:  # property => P
            p2v[universe.vp.q[v]] = v

logging.info("loading estimates: " + args.infile)
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by a trusted run of this pipeline.
with open(args.infile, "rb") as estimates_file:
    estimates = pickle.load(estimates_file)

# os.path.join works whether or not --outpath ends with a slash, and the
# figure directory is created up front so savefig cannot fail on a fresh
# output directory.
figure_dir = os.path.join(args.outpath, 'figures')
os.makedirs(figure_dir, exist_ok=True)

# The page is assembled from parts and joined once at the end.  The charset
# declaration matches the UTF-8 encoding used when writing the file; the
# result tables contain non-ASCII column headers.
html_parts = ['''
<html>
<head>
<meta charset="utf-8">
<title>Non-Parametric Class Completeness Estimators for Collaborative Knowledge Graphs - The Case of Wikidata</title>
<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" rel="stylesheet">
</head>
<body>
<div class="container">
<h1>Cardinal</h1>
<p>This is the companian material for the Paper "Non-Parametric Class Completeness Estimators for Collaborative Knowledge Graphs - The Case of Wikidata".</p>
<p>Below you find for all classes with at least 5000 observations the convergence metric and their respective estimator graph. (Please refer to the paper for a detailed description of the graph.)</p>
<p>Please find the code and additional datasets on <a href="https://github.com/eXascaleInfolab/cardinal/">GitHub</a>.</p>
<h2 a="results">Results</h2>''']

# One summary row per processed class.  Collected in a list and concatenated
# once: DataFrame.append no longer exists in pandas >= 2.0, and the list is
# well-defined even when no class was processed.
result_frames = []

for estimate in sorted(estimates):
    # Ctrl-C stops the loop but still lets everything processed so far be
    # written out below.
    try:
        name = ''
        if universe:
            name = universe.vp.label[q2v[estimate]]
        heading = name + " (Q" + str(estimate) + ")"

        dfEstimate = pd.DataFrame(
            estimates[estimate],
            columns=['month', 'N1‒UNIF', 'Chao92', 'SOR', 'Jack1', 'Jack2', 'Distinct', '$f_1$'],
        )

        f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=[8, 4])
        try:
            result = plot(dfEstimate, heading, ax1, 0)
            plotInd(dfEstimate, heading, ax2, 0)
            f.savefig(os.path.join(figure_dir, str(estimate) + '.pdf'))
            f.savefig(os.path.join(figure_dir, str(estimate) + '.png'))
        finally:
            # release the figure even when plotting or saving fails
            plt.close(f)
        logging.info("saved " + str(estimate) + '.pdf')

        html_parts.append('''
<h2>Q''' + str(estimate) + ''': ''' + name + '''</h2><br>
''')
        html_parts.append(result.to_html())
        html_parts.append('''
<img src="figures/''' + str(estimate) + '''.png">
''')

        result_frames.append(result)
    except KeyboardInterrupt:
        break

# pd.concat refuses an empty list, so fall back to an empty frame.
results = pd.concat(result_frames) if result_frames else pd.DataFrame()
with open(args.results, "wb") as results_file:
    pickle.dump(results, results_file)

html_parts.append('''
</div>
</body>
</html>''')
html = ''.join(html_parts)

# Explicit UTF-8: labels and table headers may contain non-ASCII characters,
# which would fail or be mis-encoded under a non-UTF-8 platform default.
with open(os.path.join(args.outpath, "index.html"), "w", encoding="utf-8") as file:
    file.write(html)