-
Notifications
You must be signed in to change notification settings - Fork 0
/
XmlNlpLoad.py
executable file
·122 lines (97 loc) · 3.52 KB
/
XmlNlpLoad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
#
# XmlNlpLoad.py: Pull text from XML and arrange for NLP tools.
# 2018-05-07: Written by Steven J. DeRose.
#
import xml.dom
import xml.dom.minidom
from DomExtensions import DomExtensions
from alogging import ALogger
# Module-level logger instance from the alogging package.
# NOTE(review): the argument 1 is presumably a verbosity level -- confirm
# against ALogger's constructor.
lg = ALogger(1)
# Descriptive metadata for this module (title, authorship, dates, license).
# The "modified" entry doubles as the module version (see __version__ below).
__metadata__ = {
"title" : "XmlNlpLoad",
"description" : "Pull text from XML and arrange for NLP tools.",
"rightsHolder" : "Steven J. DeRose",
"creator" : "http://viaf.org/viaf/50334488",
"type" : "http://purl.org/dc/dcmitype/Software",
"language" : "Python 3.7",
"created" : "2018-05-07",
"modified" : "2021-03-03",
"publisher" : "http://github.com/sderose",
"license" : "https://creativecommons.org/licenses/by-sa/3.0/"
}
# The version string is the last-modified date recorded in the metadata.
__version__ = __metadata__["modified"]
# Module description text with "=To Do=" and "=History=" sections. The text
# itself flags the module as far from finished. Nothing in the visible part of
# this file reads this string.
descr = """
[far from finished]
=To Do=
* Add DOMExtensions support for untagElements.
=History=
2018-05-07, 2018-08-16: Written by Steven J. DeRose.
2021-03-03: New layout.
"""
###############################################################################
#
class XmlNlpLoad:
    """Load an XML document, discard parts not of interest, and pull text
    out of it as a list of text samples.

    Typical use: configure the element lists (directly, or through
    setHTMLConventions()), supply the input with open() or readFrom(),
    call loadDOM(), then fetch text with getTextByTagName().

    Attributes:
        runTidyFirst, selectElements, justText: Stored options; no method
            of this class consults them yet.
        noSpaceElements: Tag names handed to the DOM's addElementSpaces().
        dropElements: Tag names removed from the DOM by loadDOM().
        untagElements: Tag names to untag; not yet supported by loadDOM().
        mergeElements: Mapping of old tag name -> new tag name, applied by
            loadDOM() as renames.
        thePath: Path given to open(), or None when input came from
            readFrom().
        theFH: The current input file object, or None.
        theEncoding: Encoding used by open() to decode the file.
        theDOM: The document produced by the last loadDOM(), or None.

    To do:
        Allow drop and untag to say whether to put in spaces.
        Write setHTMLConventions
        Check for xml:lang, html meta encoding
    """

    def __init__(self):
        """Create a loader with no input attached and no element lists set."""
        self.runTidyFirst = False
        self.noSpaceElements = None
        self.dropElements = None
        self.untagElements = None
        self.mergeElements = None
        self.selectElements = None
        self.justText = False

        self.thePath = None
        self.theFH = None
        self.theEncoding = 'utf-8'
        self.theDOM = None

    def setHTMLConventions(self):
        """Preset the element lists with defaults intended for HTML input.

        Sets noSpaceElements (inline elements), dropElements and
        untagElements, and switches runTidyFirst on.

        Note: this sets untagElements, which loadDOM() does not support
        yet, so loadDOM() raises until untagElements is cleared again.
        """
        inlines = (
            "a abbr acronym b bdo big cite code dfn em i img input "
            "kbd q s small span strike strong sub sup tt var "
            "applet center dir font samp w"
        )
        # split() without a separator collapses runs of whitespace; splitting
        # on a single space produced empty-string "tag names" wherever the
        # source string had leading or doubled spaces.
        self.noSpaceElements = inlines.split()
        self.dropElements = "del object img head form".split()
        self.untagElements = "ins a".split()
        self.runTidyFirst = True

    def open(self, path):
        """Open the file at `path` and make it the current input.

        The file is opened in text mode and decoded with self.theEncoding.

        Args:
            path: Path of the XML file to read.

        Returns:
            The open file object (also stored in self.theFH).

        Raises:
            OSError: If the file cannot be opened.
        """
        self.thePath = path
        # Inside a method body the bare name `open` is the builtin, not this
        # method. Text mode is used because binary mode rejects `encoding`.
        self.theFH = open(self.thePath, "r", encoding=self.theEncoding)
        return self.theFH

    def close(self):
        """Drop the loaded DOM and close the current input, if any.

        Safe to call when nothing has been opened or loaded.
        """
        # TODO: release the DOM's internal links before dropping it
        # (originally sketched as self.theDOM.breakLinks()).
        self.theDOM = None
        if self.theFH is not None:
            self.theFH.close()
            self.theFH = None

    def readFrom(self, fh):
        """Use an already-open file object as the current input.

        Args:
            fh: File object to parse in loadDOM(). thePath is reset to None
                because no path is known for it.
        """
        self.thePath = None
        self.theFH = fh

    def loadDOM(self):
        """Parse the current input and apply the configured element edits.

        Steps, in order: remove every element type in dropElements, rename
        element types per mergeElements, then call addElementSpaces() with
        noSpaceElements. The removeByTagName / renameByTagName /
        addElementSpaces methods are not part of minidom; presumably
        DomExtensions.patchDOM() supplies them -- confirm.

        Returns:
            The parsed document (also stored in self.theDOM).

        Raises:
            NotImplementedError: If untagElements is set (not supported yet).
        """
        DomExtensions.patchDOM()
        theDOM = xml.dom.minidom.parse(self.theFH)

        if self.dropElements:
            for etype in self.dropElements:
                theDOM.removeByTagName(etype)

        if self.untagElements:
            # Raised explicitly (not via assert) so the check is not stripped
            # when Python runs with -O.
            raise NotImplementedError("untagElements not yet implemented.")
            # for etype in self.untagElements:
            #     theDOM.untagByTagName(etype)

        if self.mergeElements:
            for fromType, toType in self.mergeElements.items():
                theDOM.renameByTagName(fromType, toType)

        # Operate on the freshly parsed document: self.theDOM is still None
        # (or a previous document) until the assignment below.
        theDOM.addElementSpaces(self.noSpaceElements)
        self.theDOM = theDOM
        return self.theDOM

    # TODO: Make this an iterator
    def getTextByTagName(self, etype):
        """Collect the text of every element of type `etype` in the DOM.

        Args:
            etype: Tag name to look up with getElementsByTagName().

        Returns:
            A list with the `innertext` value of each matching node, in the
            order getElementsByTagName() returned them.
        """
        nodes = self.theDOM.getElementsByTagName(etype)
        return [node.innertext for node in nodes]