-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathPartsOfSpeech-MBSP-annotate.py
135 lines (117 loc) · 5.08 KB
/
PartsOfSpeech-MBSP-annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/python
#
# This script provides parts of speech analysis using the MBSP parser from the CLiPS project.
#
# Each line is tokenized into sentences -- a few lines are not correctly split into sentences by cc-segment-stories.
# Note that MBSP now skips lines with utf8 errors -- they are fairly common, though most are likely musical notes and copyright symbols.
#
# 20140710235922.257|20140710235926.928|POS_01|DUDE/NNP/I-NP/O|,/,/O/O|WE/PRP/I-NP/O|HAVE/VBP/I-VP/O|THE/DT/I-NP/O|MUNCHIES/NNS/I-NP/O|!/./O/O
# Start time|End time|Primary tag(|Word parts of speech tags)*
#
# Memory-Based Shallow Parser (MBSP)
# http://www.clips.ua.ac.be/pages/MBSP
#
# MBSP uses the Penn Treebank II tag set, see http://www.clips.ua.ac.be/pages/mbsp-tags
#
# MBSP starts four data servers that require quite a bit of memory
# (CHUNK: 80MB, LEMMA: 10MB, RELATION: 160MB, PREPOSITION: 210MB).
# Only the CHUNK server (which gives you the part-of-speech tags)
# is mandatory. The optional servers can be disabled in config.py
# to reduce the memory usage, for example:
#
# servers = ['chunk', 'lemma']
#
# Note python/MBSP runs on ports 607x and src/MBSP on 606x, to increase capacity.
#
# Written by FFS, 2014-07-28
#
# Changelog:
#
# 2014-08-09 Add lemma (lowercase first), clean NULL as octal (freezes MBSP)
# 2014-08-08 Add port argument
# 2014-08-04 Renamed from MBSP-pos.py to PartsOfSpeech-MBSP-01.py
# 2014-07-31 Each line is tokenized into sentences
# 2014-07-28 Forked from SentiWordNet-03.py
#
# ------------------------------------------------
# User input
import sys, os.path
scriptname = os.path.basename(sys.argv[0])


# Help screen
def help_text(name):
    """Return the help screen, with *name* as the command in the usage examples."""
    return "".join([
        "\n", "\t", "This is a production script for parts-of-speech analysis with the MBSP tagger.", "\n",
        "\n", "\t", "MBSP ports on roma and cartago include 6040, 6050, 6060, 6070, 6080, and 6090.", "\n", "\n",
        "\t", "\t", name, " $PORT $FIL.seg > $FIL.pos or", "\n",
        "\t", "\t", name, " 6050 2013-01-02_2000_US_CNN_Newsroom.seg | sponge 2013-01-02_2000_US_CNN_Newsroom.seg", "\n",
        "\n", "\t", "or use seg-PartsOfSpeech-MBSP for bulk processing.", "\n", "\n",
    ])


def read_port(argv, name):
    """Return the MBSP port given as the first command-line argument.

    argv -- the full argument list, program name first (as in sys.argv)
    name -- the command name to show in the help screen

    Exits instead of returning in two cases:
      * no argument at all: help screen on stderr, exit status 2
        (this used to die with an IndexError traceback)
      * "-h" or "--help": help screen on stdout, exit status 0
    """
    if len(argv) < 2:
        sys.stderr.write(help_text(name))
        sys.exit(2)
    if argv[1] in ("-h", "--help"):
        sys.stdout.write(help_text(name))
        sys.exit(0)
    return argv[1]


# Only read the command line when run as a script
if __name__ == "__main__":
    port = read_port(sys.argv, scriptname)
# Libraries
import datetime, re
# Select the MBSP port
# The MBSP package directory is named after the port, so adding it to
# sys.path decides which copy the import below picks up.
MODULE = "".join(["/tvspare/software/python/MBSP-", str(port)])
if MODULE not in sys.path:
    sys.path.append(MODULE)
try:
    import MBSP
except ImportError:
    # Report on stderr and exit non-zero: stdout is normally redirected into
    # the .pos file (or piped to sponge), where an error message would be
    # mistaken for annotation output.
    sys.stderr.write("".join([
        "\n", "\t", "MBSP failed to load on port ", port,
        " -- see '", scriptname, " -h' for available ports.", "\n", "\n",
    ]))
    sys.exit(1)
# Debug
# print MBSP.parse('I ate pizza with a friend.')
# Text cleaning and main loop
#
# NOTE(review): the '\x..' escapes below are raw bytes; this assumes the file
# is read as Python 2 byte strings, as the rest of the script does -- confirm
# before porting.

# Leading caption chevrons, commas and spaces (at most six characters)
LEADING_CHEVRONS = re.compile(r'^[>, ]{0,6}')

# UTF-8 bytes of U+FFFD, the Unicode replacement character (not a BOM)
REPLACEMENT_CHAR = '\xef\xbf\xbd'

# Byte sequences removed from the caption text before parsing. They are
# applied in this order, which matters: multi-byte sequences are listed
# before the single bytes they contain ('\xc3\xaf' before '\xc3', and so on).
UNWANTED_BYTES = (
    '\x00 ', '\xef\xbf\xbd',
    '\xf7', '\xc3\xba', '\xb6', '\xa9', '\xe2\x99\xaa',
    '\xc3\xaf', '\x5c', '\xf1', '\xe1', '\xe7', '\xfa',
    '\xf3', '\xed', '\xe9', '\xe0', '\xae', '\xc2',
    '\xc3', '\xa2', '\xbf',
)


def clean_caption_text(raw):
    """Return the caption text *raw* prepared for the MBSP parser.

    Leading chevrons are stripped, the byte sequences in UNWANTED_BYTES are
    removed, and text that is entirely upper case is lowercased (the
    changelog notes that lemmatisation wants lowercase input).
    """
    text = LEADING_CHEVRONS.sub('', raw)
    if REPLACEMENT_CHAR in text:
        # A replacement character is present: keep plain ASCII only
        text = ''.join([ch for ch in text if ord(ch) < 128])
    for junk in UNWANTED_BYTES:
        text = text.replace(junk, '')
    if text.isupper():
        text = text.lower()
    return text


def write_line(out, line):
    """Write *line* to *out*, adding the newline if the line lacks one.

    Only the last line of a file can lack it. Without this, the first POS
    row for such a line ended up glued onto the caption line itself.
    """
    out.write(line if line.endswith("\n") else line + "\n")


def annotate(lines, chunk, out, program):
    """Copy *lines* to *out*, adding a POS_01 row per sentence of each caption.

    lines   -- iterable of pipe-delimited lines (start|end|tag|text...)
    chunk   -- the parser, called as chunk(text, tokenize=True, lemmata=True);
               its result is split into one sentence per line
    out     -- object with a write() method receiving the output
    program -- script name recorded in the POS_01 credit line

    Lines whose first field is not an 18-character timestamp, SEG lines,
    lines whose tag is not three characters long and lines with too few
    fields are passed through unchanged. A caption the parser cannot handle
    gets a single "POS_01|NA" row so it can be found and repaired later.
    """
    credit_written = False
    for line in lines:
        # B. Split each line into fields
        field = line.split("|")

        # C. Header and footer
        if len(field[0]) != 18:
            write_line(out, line)
            continue

        # D. Program credit, once, ahead of the first timestamped line
        if not credit_written:
            out.write("".join([
                "POS_01|", datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
                "|Source_Program=MBSP 1.4, ", program,
                "|Source_Person=Walter Daelemans, FFS|Codebook=Treebank II", "\n",
            ]))
            credit_written = True

        # E. Segment tags, other non-caption tags, and lines without a text
        #    field (a line with fewer than three fields used to crash here)
        if len(field) < 4 or field[2] == "SEG" or len(field[2]) != 3:
            write_line(out, line)
            continue

        # F. Get the text ready for the parser
        text = clean_caption_text(field[3])

        # G. Remove clearly wrong characters from the caption line itself --
        #    NULL plus space, and the replacement character
        line = line.replace('\x00 ', '').replace(REPLACEMENT_CHAR, '')
        write_line(out, line)

        # H. Parts of speech with MBSP -- one output row per sentence
        prefix = "".join([field[0], "|", field[1], "|POS_01", "|"])
        try:
            for sentence in chunk(text, tokenize=True, lemmata=True).splitlines():
                out.write("".join([prefix, str(sentence).replace(' ', '|'), "\n"]))
        except (UnicodeDecodeError, UnicodeEncodeError, IndexError, AssertionError):
            # Tag failed UTF-8 lines NA to enable repair
            out.write("".join([prefix, "NA", "\n"]))


# A. Get the lines from the file named on the command line
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("".join([
            "\n", "\t", "Missing input file -- see '", scriptname, " -h' for usage.", "\n", "\n",
        ]))
        sys.exit(2)
    filename = sys.argv[2]
    with open(filename) as fp:
        annotate(fp, MBSP.chunk, sys.stdout, scriptname)
# EOF