This repository has been archived by the owner on Aug 28, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
article_parser.py
executable file
·105 lines (85 loc) · 3.42 KB
/
article_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
import fileinput, re, sys, traceback
from mwlib.parser.nodes import *
from mwlib.uparser import parseString, simpleparse
from BeautifulSoup import BeautifulStoneSoup
from xml.sax.saxutils import unescape
def replace_nested(regex, text):
    """Replace every match of *regex* in *text* with a space until nothing changes.

    A single substitution pass only removes the innermost matches of a nested
    construct, so the pass is repeated until the text reaches a fixed point.
    Returns the fully reduced text.
    """
    reduced = re.sub(regex, ' ', text)
    while reduced != text:
        # The last pass changed something; run another pass on its output.
        text = reduced
        reduced = re.sub(regex, ' ', text)
    return reduced
def unescape_full(text):
    """Return *text* with XML character entities decoded.

    ``xml.sax.saxutils.unescape`` handles ``&amp;``, ``&lt;`` and ``&gt;`` on
    its own; the extra mapping adds the apostrophe and double-quote entities,
    which it does not decode by default.
    """
    # Keys must be the literal entity text; mapping a quote character to
    # itself would leave &apos; / &quot; untouched in the output.
    return unescape(text, {"&apos;": "'", "&quot;": '"'})
def meta_article(link):
    """Return True when *link* starts with one of the non-article prefixes.

    The prefixes checked are File, Category, Wikipedia, Template, Portal,
    Book and MediaWiki (each followed by a colon, case-sensitive). A link
    without any colon is rejected immediately.
    """
    meta_prefixes = (
        'File:',
        'Category:',
        'Wikipedia:',
        'Template:',
        'Portal:',
        'Book:',
        'MediaWiki:',
    )
    # Cheap colon test first; only then compare against the prefix list.
    return ":" in link and link.startswith(meta_prefixes)
def internal_link(link):
    """Return True when *link* begins with ``#``."""
    begins_with_hash = link.startswith('#')
    return begins_with_hash
def valid_link(link, title):
    """Normalise a link target and return it if it is usable, else None.

    Normalisation drops everything from the first ``#`` or ``|`` onwards,
    turns underscores into spaces and upper-cases the first character.
    The result is rejected (None) when it is empty, is a meta article, is an
    internal link, or equals *title*.
    """
    without_suffix = re.sub(r'[\#\|].*', '', link)
    candidate = re.sub('_', ' ', without_suffix)
    if not candidate:
        return None
    # make sure the first letter is a capital
    candidate = candidate[0].upper() + candidate[1:]
    if meta_article(candidate) or internal_link(candidate) or candidate == title:
        return None
    return candidate
# Main record loop: each input line is parsed as one XML article record, and
# the first acceptable article link found outside parentheses and italics is
# printed as "title<TAB>link" on stdout.
# NOTE(review): the "reporter:counter:..." lines written to stderr look like
# Hadoop Streaming counter updates -- confirm against how this script is run.
for line in fileinput.input():
    # Reset per record so the error handler below never reads an unbound
    # name (first record) or reports the previous record's title.
    title = None
    try:
        xml = BeautifulStoneSoup(line)
        title = unescape_full(xml.find('title').string)
        if meta_article(title):
            sys.stderr.write("reporter:counter:parse,ignore_meta_title,1\n")
            continue
        text = unescape_full(xml.find('text').string)
        # Strip brace-delimited markup, innermost first, before parsing.
        text = replace_nested('{[^{]*?}', text)
        link = None
        parantheses_depth = 0
        nodes = [parseString(title='', raw=text)]
        while nodes:
            node = nodes.pop(0)
            # Exact type comparison (not isinstance) is used throughout.
            node_type = type(node)
            if node_type in [Article, Paragraph, Node]:
                # Prepend the children so the tree is walked in document order.
                nodes = node.children + nodes
                continue
            if node_type is Text:
                node_text = str(node).encode('utf-8')
                # Track bracket nesting across text nodes so links that sit
                # inside parentheses can be skipped.
                for c in node_text:
                    if c == '(':
                        parantheses_depth += 1
                    elif c == ')':
                        parantheses_depth -= 1
                        if parantheses_depth < 0:
                            sys.stderr.write("reporter:counter:parse,neg_parantheses_depth,1\n")
                            # Encode the title directly: str() on a non-ASCII
                            # unicode title raises under Python 2.
                            sys.stderr.write("warning! -ve parantheses_depth for ["+title.encode('utf-8')+"]\n")
                            parantheses_depth = 0
            if parantheses_depth > 0:
                # we're still in brackets, ignore
                continue
            elif node_type is Style:
                if node.caption != "''":  # ignore italics
                    nodes = node.children + nodes
            elif node_type is ArticleLink:
                link = valid_link(node.target, title)
                if link:
                    break
        if link:
            # Python 2 print statement: emits the UTF-8 encoded record.
            print (title+"\t"+link.strip()).encode('utf-8')
        else:
            sys.stderr.write("reporter:counter:parse,no_link,1\n")
    except Exception:
        # Best effort: count and log the failure, then move on to the next
        # record. KeyboardInterrupt/SystemExit are deliberately not caught.
        sys.stderr.write("reporter:counter:parse,exception_parsing_article,1\n")
        if title is None:
            title_for_log = "<unknown title>"
        else:
            title_for_log = title.encode('utf-8')
        sys.stderr.write("ERROR problem processing article for ["+title_for_log+"] "+str(sys.exc_info()[0])+"\n")
        traceback.print_exc(file=sys.stderr)