#!/usr/bin/python
# -*- coding: utf-8 -*-
# MultiChaptersBlockTextExtract.py
import codecs
import os
import re
import shutil

import cv2
import numpy as np
import pytesseract
try:
    import Image
except ImportError:
    from PIL import Image
# --- Block size filters, expressed as a percentage of the page dimensions ---
MAX_HEIGHT = 83  # a block taller than this % of the page height is discarded
MAX_WIDTH = 78  # a block wider than this % of the page width is discarded
MIN_HEIGHT = 1.4  # a block shorter than this % of the page height is discarded
MIN_WIDTH = 7.8  # a block narrower than this % of the page width is discarded

# --- Image processing ---
# Number of dilatations needed around text to create valid contours.
DILATATION_ITERATIONS = 15
# Border width, in pixels, added on each side of every cropped image.
TOP = BOTTOM = LEFT = RIGHT = 10

# --- OCR ---
# When a block starts with this value, its first line is excluded from the OCR
# output.
TEXT_EXCLUSION = 'MENU'
# Language passed to Tesseract.
TEXT_LANGUAGE = 'frm'

# --- Optional outputs (string flags, compared case-insensitively to 'TRUE') ---
# Save the images with contours indicating each area being OCRed.
SAVE_CONTOURED_FILES = 'false'
# Save the OCR as generated by Tesseract, before postprocessing.
SAVE_RAW_OCR = 'false'
# Sort key that orders file names by the numbers they contain
# (300 before 1600, for instance) instead of character by character.
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Return a sort key for *value* in which digit runs are integers.

    The string is split on runs of digits.  Because the pattern has a
    capturing group, the digit runs are kept and always land on the odd
    indices of the split result; those are converted to ``int`` while the
    text in between is left untouched.
    """
    pieces = numbers.split(value)
    return [int(piece) if index % 2 else piece
            for index, piece in enumerate(pieces)]
# ---------------------------------------------------------------------------
# Processing pipeline.
#
# Every sub-folder of the current directory is treated as one chapter: the
# text blocks of each PNG page are cropped into a temporary "cropped" folder,
# the crops are OCRed in reading order, the text is post-processed and finally
# written to "<folder name>.txt" in the current directory.
# ---------------------------------------------------------------------------

# Ordered (pattern, replacement) pairs applied by cleanOcrText().  The order
# matters: each rule sees the output of the previous ones.
OCR_CLEANUP_RULES = [
    # Removing empty lines
    (r'([a-zàé,:])(\n\n)([a-zſ&])', r'\1\n\3'),  # empty line between words
    (r'([a-zàé]\-)(\n\n)([a-zſ])', r'\1\n\3'),  # empty line inside a hyphenation
    (r'([a-zàé]\—)(\n\n)([a-zſ])', r'\1\n\3'),  # same, with an em dash
    (r'(\n)(\n\n)(\n)', r'\n'),  # four consecutive newlines become one
    # Fixing invalid spacing with comma, colon and semicolon
    (r'([a-zA-Z0-9\)àâéèù])(,|;|:)([a-zA-Zſfiàâéèù&0-9])', r'\1\2 \3'),
    (r'([a-zA-Z0-9\)àâéèù])( , )([a-zA-Zſàâéèù&0-9])', r'\1, \3'),
    (r'([a-zA-Z0-9\)àâéèù])( ,)(\n)', r'\1,\3'),
    (r'(\s)(,)([a-zA-Zſàâéèù&0-9])', r', \3'),
    (r'(\s)(;|:)([a-zA-Zſàâéèù&])', r'\2 \3'),
    (r'(\s)(;|:)(\s)', r'\2\3'),
    # Fixing spaces before and after parenthesis
    (r'(\()(\s)([a-zA-Zſ\*])', r'\1\3'),
    (r'([a-zA-Z0-9\.\*])(\s)(\))', r'\1\3'),
    # Fixing spaces before and after hyphens
    (r'([a-zàâéù])(- )([a-zſàâéèù])', r'\1-\3'),
    (r'([a-zàâéù])( -)([a-zſàâéèù])', r'\1-\3'),
    (r'([a-zàâéù])( - )([a-zzſàâéèù])', r'\1-\3'),
    # Fixing spaces before and after ampersands
    (r'([a-z0-9àâéù])(& )([a-zA-Z0-9ſàâéèù])', r'\1 & \3'),
    (r'([a-z0-9àâéù])( &)([a-zA-Z0-9ſàâéèù])', r'\1 & \3'),
    (r'(\n)(&)([a-zA-Z0-9ſàâéèù])', r'\1& \3'),
    # Fixing spaces after periods
    (r'(\.)([A-Z0-9])', r'\1 \2'),
    # Cleaning extra characters at end of paragraphs
    (r'\._', r'.'),
    (r'\. \.‘', r'.'),
    # Cleaning extra characters at beginning of paragraphs
    (r'\n(\.|-)(\s)([a-zA-Z])', r'\n\3'),
    # Fixing Os read as zeros
    (r'([a-zA-Zé’])(0)', r'\1o'),
    # The letter that follows the zero is group 2; the previous replacement
    # (r'o\1') re-inserted the zero and dropped that letter.
    (r'(0)([a-zA-Zé’])', r'o\2'),
]


def cleanOcrText(text):
    """Return *text* after applying every rule of OCR_CLEANUP_RULES in order."""
    for pattern, replacement in OCR_CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)
    return text


def loadOcrFixes(path):
    """Read the search|replace dictionary stored in *path*.

    Each useful line has the form ``search|replacement``; the search part is
    used as a regular expression by applyOcrDictionary().  Lines without a
    ``|`` (blank lines for instance) and lines with an empty search part are
    skipped instead of aborting the run.  Only the first ``|`` separates the
    two parts, so the replacement may itself contain ``|``.

    Returns a dict mapping each search expression to its replacement.
    """
    rep = {}
    with open(path) as source:
        for line in source:
            line = line.rstrip('\r\n')
            if '|' not in line:
                continue
            key, val = line.split('|', 1)
            # An empty pattern matches between every character.
            if key:
                rep[key] = val
    return rep


def applyOcrDictionary(text, rep):
    """Replace, in a single pass over *text*, every match of a key of *rep*.

    Keys are deliberately not escaped so that they can use regular-expression
    syntax.  The matched text is therefore not necessarily equal to a key:
    when the direct lookup fails, the first key whose expression matches the
    whole matched text supplies the replacement.  If no key can be identified
    (a key relying on surrounding context, for instance) the matched text is
    left unchanged.

    An empty *rep* returns *text* untouched.
    """
    if not rep:
        return text
    keys = list(rep)
    pattern = re.compile("|".join(keys))

    def substitute(match):
        found = match.group(0)
        if found in rep:
            return rep[found]
        for key in keys:
            if re.match("(?:%s)\\Z" % key, found):
                return rep[key]
        return found

    return pattern.sub(substitute, text)


def ensureFolder(folder, label):
    """Create *folder* when it does not exist, trying up to 100 times.

    *label* only customises the message printed after a failed attempt.
    Nothing is raised when every attempt fails.
    """
    for _ in range(100):
        if os.path.exists(folder):
            return
        try:
            os.makedirs(folder)
            return
        except OSError:
            print(label + " folder creation failed, retrying...")


def saveCrop(image, x0, y0, x1, y1, destination):
    """Write image[y0:y1, x0:x1], surrounded by a white border, to *destination*.

    Nothing is written when the requested region lies outside the image.
    """
    crop = image[y0:y1, x0:x1]
    if crop.size == 0:
        return
    # white borders can help the OCR
    bordered = cv2.copyMakeBorder(crop, TOP, BOTTOM, LEFT, RIGHT,
                                  cv2.BORDER_CONSTANT, value=[255, 255, 255])
    cv2.imwrite(destination, bordered)


def cropTemplateMatches(image, gray, template, stem, cropfolder):
    """Crop the areas of the page located by template matching.

    Every match is filled with grey on *gray* so that the contour search
    ignores it.  For each match located more than 20 pixels below the last
    one that was cropped, the band starting 45 pixels below the match is
    saved to *cropfolder* and the matched area is filled with white on
    *image*.  Both images are modified in place.
    """
    w, h = template.shape[::-1]
    if gray.shape[0] < h or gray.shape[1] < w:
        # the template does not fit in this page, nothing can match
        return
    res = cv2.matchTemplate(gray, template, cv2.TM_CCOEFF_NORMED)
    threshold = 0.6
    loc = np.where(res >= threshold)
    prevpt = 0
    for px, py in zip(*loc[::-1]):
        px, py = int(px), int(py)
        # fill the matched area with grey on grayscale image.
        cv2.rectangle(gray, (px, py), (px + w, py + h + 50), 230, -1)
        if py > prevpt + 20:
            # NOTE(review): the white fill and the prevpt update are assumed
            # to belong to this branch - confirm against the original layout.
            print('Template match found at ' + str((px, py)))
            cropfilename = stem + "_cropped_%d_%d.png" % (py, px)
            saveCrop(image, px, py + 45, px + w, py + h + 45,
                     os.path.join(cropfolder, cropfilename))
            # fill the matched area with white on original image.
            cv2.rectangle(image, (px, py), (px + w, py + h + 50),
                          [255, 255, 255], -1)
            prevpt = py


def cropContourBlocks(image, gray, stem, cropfolder):
    """Crop every plausible block of text found on the page.

    The grayscale page is thresholded and dilated, then the bounding box of
    each external contour is kept when its size lies within the MIN_* / MAX_*
    percentages of the page.  Kept boxes are saved to *cropfolder* and then
    outlined on *image* (modified in place).

    Returns the number of blocks saved.
    """
    height, width = image.shape[:2]
    _, thresh = cv2.threshold(gray, 125, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (5, 3))
    # dilate x times, or more if the page is larger; the floor division keeps
    # the iteration count an integer on every Python version
    iterations = max(DILATATION_ITERATIONS * width // 1900,
                     DILATATION_ITERATIONS)
    dilated = cv2.dilate(thresh, kernel, iterations=iterations)
    # findContours returns 3 values with OpenCV 3 and 2 values with the other
    # releases; the contours are always the second item from the end.
    contours = cv2.findContours(dilated, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_NONE)[-2]
    contourcount = 0
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # discard areas that are too large
        if h > (float(MAX_HEIGHT) / 100 * height) or w > (float(MAX_WIDTH) / 100 * width):
            continue
        # discard areas that are too small
        if h < (float(MIN_HEIGHT) / 100 * height) or w < (float(MIN_WIDTH) / 100 * width):
            continue
        contourcount += 1
        # x is part of the name so that two blocks starting on the same row
        # do not overwrite each other; the numerical sort used for the OCR
        # still orders the crops of a page from top to bottom.
        cropfilename = stem + "_cropped_%d_%d.png" % (y, x)
        saveCrop(image, x, y, x + w, y + h,
                 os.path.join(cropfolder, cropfilename))
        # draw rectangle around contour on original image
        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 3)
    return contourcount


def saveContourPreview(image, stem, contoursubfolder):
    """Save a JPEG copy of *image*, resized to a width of 800 pixels."""
    ensureFolder(contoursubfolder, "Contoured")
    height, width = image.shape[:2]
    W = 800.
    imgScale = W / width
    newimg = cv2.resize(image, (int(width * imgScale), int(height * imgScale)))
    contourfilename = stem + "_countoured.jpg"
    cv2.imwrite(os.path.join(contoursubfolder, contourfilename), newimg,
                [int(cv2.IMWRITE_JPEG_QUALITY), 50])


def cropPage(filepath, filename, cropfolder, contoursubfolder, template):
    """Crop the text blocks of one page image into *cropfolder*.

    *contoursubfolder* is the folder receiving the contoured preview, or None
    when no preview must be saved.  *template* is the grayscale template
    image, or None when template matching is disabled.  A file that OpenCV
    cannot read is reported and skipped.
    """
    print("Cropping images for " + str(filename))
    image = cv2.imread(os.path.join(filepath, filename))
    if image is None:
        print("Unable to read " + str(filename) + ", skipping")
        return
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # grayscale
    stem = os.path.splitext(filename)[0]
    # template matching and replacement
    if template is not None:
        cropTemplateMatches(image, gray, template, stem, cropfolder)
    # retrieve contours for remaining blocks
    contourcount = cropContourBlocks(image, gray, stem, cropfolder)
    # write original image with added contours to disk
    if contoursubfolder is not None:
        saveContourPreview(image, stem, contoursubfolder)
    if contourcount == 0:
        print ('No block of text found, writing the entire image')
        cv2.imwrite(os.path.join(cropfolder, filename), image)


def ocrCroppedBlocks(cropfolder):
    """Run Tesseract on every image of *cropfolder* and return the joined text.

    Images are processed in numerical file-name order and each block is
    followed by an empty line.  When a block starts with TEXT_EXCLUSION its
    first line is dropped.
    """
    blocks = []
    for root, _dirs, filenames in os.walk(cropfolder):
        for name in sorted(filenames, key=numericalSort):
            print("Performing OCR on " + str(name))
            ocrtext = pytesseract.image_to_string(
                Image.open(os.path.join(root, name)),
                lang=TEXT_LANGUAGE,
                config="-c tessedit_char_blacklist=\\%]!_‘—{€/…")
            # The rest of the pipeline works on the native str type: encode
            # only when the result is not a str (Python 2 unicode) instead
            # of unconditionally turning it into bytes.
            if not isinstance(ocrtext, str):
                ocrtext = ocrtext.encode('utf-8')
            if TEXT_EXCLUSION and ocrtext.startswith(TEXT_EXCLUSION):
                print('Excluding text starting with ' + TEXT_EXCLUSION)
                # keep what follows the first line (nothing if single-line)
                ocrtext = ocrtext.partition("\n")[2]
            blocks.append(ocrtext + '\n\n')
    return ''.join(blocks)


def processChapter(filepath, foldername, template):
    """Crop, OCR and post-process the PNG pages stored in *filepath*.

    The result is written to "<foldername>.txt" in the current directory.
    The temporary "cropped" folder is emptied before the pages are processed
    and removed once the text has been written.
    """
    print("\nProcessing files in cropfolder : " + foldername)
    # create subdirectory for cropped images
    cropfolder = "cropped"
    shutil.rmtree(cropfolder, ignore_errors=True)
    ensureFolder(cropfolder, "Cropped")
    contoursubfolder = None
    if SAVE_CONTOURED_FILES.upper() == 'TRUE':
        contourfolder = "contoured"
        ensureFolder(contourfolder, "Contoured")
        contoursubfolder = os.path.join(contourfolder, foldername)

    # Find regions of interest and save them to separate file
    imagecount = 0
    for name in os.listdir(filepath):
        if not os.path.isfile(os.path.join(filepath, name)):
            continue
        # exact comparison: "in ('.png')" was a substring test on a string
        # and also accepted files without any extension
        if os.path.splitext(name)[1].lower() != '.png':
            continue
        imagecount += 1
        cropPage(filepath, name, cropfolder, contoursubfolder, template)
    if imagecount == 0:
        print ('There is no png to process in this folder')
        return

    # Perform OCR on cropped files
    text = ocrCroppedBlocks(cropfolder)
    # write original ocr text to disk
    if SAVE_RAW_OCR.upper() == 'TRUE':
        with open(foldername + "_ocr.txt", "w") as rawfile:
            rawfile.write(text)

    # Fixing common OCR errors
    print('Fixing common OCR errors')
    # Performing search and replace with dictionary OCRFix.txt
    if os.path.isfile('OCRFix.txt'):
        text = applyOcrDictionary(text, loadOcrFixes('OCRFix.txt'))
    text = cleanOcrText(text)

    finalfile = foldername + ".txt"
    with open(finalfile, 'w') as output:
        print('Writing final OCR to ' + str(finalfile))
        output.write(text)
    # Cleanup of cropped images
    shutil.rmtree(cropfolder, ignore_errors=True)


def main():
    """Process every folder found below the current working directory."""
    # get current folder info
    dirpath = os.getcwd()
    print("\nProcessing all folders in : " + dirpath)
    # Verify if template matching is used
    template = cv2.imread('opencv-template-for-matching.jpg', 0)
    if template is None:
        print("The file opencv-template-for-matching.jpg is missing. No template matching will be performed.")
    # Verify if OCRFix dictionary is used
    if not os.path.isfile('OCRFix.txt'):
        print("The file OCRFix.txt is missing. No dictionary replacement will be made.")
    # process folders as individual chapters, skipping the working folders
    exclude = {'cropped', 'exclusion', 'contoured'}
    for root, subdirs, _files in os.walk(dirpath, topdown=True):
        # pruning in place also stops os.walk from descending into them
        subdirs[:] = [d for d in subdirs if d not in exclude]
        for foldername in subdirs:
            processChapter(os.path.join(root, foldername), foldername, template)


if __name__ == "__main__":
    main()