clean_pdfs.py
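# Overview: convert scanned journal PDFs into per-line JPEG crops.
# Each selected page is straightened using the red margin line, trimmed at the
# top, and sliced along the blue ruled lines; the resulting line images are
# written to data/lines/ for downstream OCR.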
from pdf2image import convert_from_path
import os
from PIL import Image, ImageDraw
import numpy as np
import math
from tqdm.notebook import tqdm
os.chdir("C:/Users/pithy/Documents/journalocr")

def is_red(pixel):
    # Check if pixel falls within the red color range
    r, g, b = pixel
    return r - max(g, b) > 10

def get_reds(image_path):
    image = Image.open(image_path).convert('RGB')
    top = []
    bottom = []
    # Get image dimensions
    width, height = image.size
    # Scan a band near the top of the page (10-30% of width, 10-15% of height) for red pixels
    for x in range(round(0.1*width), round(0.3*width)):
        for y in range(round(0.1*height), round(0.15*height)):
            pixel = image.getpixel((x, y))
            if is_red(pixel):
                top.append(x)
    # Scan a matching band near the bottom of the page (80-85% of height)
    for x in range(round(0.1*width), round(0.3*width)):
        for y in range(round(0.8*height), round(0.85*height)):
            pixel = image.getpixel((x, y))
            if is_red(pixel):
                bottom.append(x)
    return [top, bottom]

def is_blue(pixel, threshold):
    # Check if pixel is blue or green (lines sometimes greenish blue on the scan)
    r, g, b = pixel
    return min(g, b) - r > threshold

def get_blues(image, threshold):
    # image = Image.open(image_path).convert('RGB')
    l1 = []
    # Get image dimensions
    width, height = image.size
    first_line = round(height * 1.125/9.75)
    second_line = round(height * (1.125 + 0.3125)/9.75)
    # Iterate through each pixel, recording the y-coordinate of every blue pixel
    for x in range(width):
        for y in range(height):
            pixel = image.getpixel((x, y))
            if is_blue(pixel, threshold):
                l1.append(y)
    return l1

def crop_top(image, width, height):
    # Check brightness at top of page on the left and right ends
    left_bright = np.mean(np.array(image.convert('L').crop((100, 0, 150, 10))))
    right_bright = np.mean(np.array(image.convert('L').crop((width-150, 0, width-100, 10))))
    # If the top of the page is white on both halves, you're done
    if left_bright > 200 and right_bright > 200:
        return image
    # Chop a little off the top and try again (pass the reduced height so the
    # recursive crop box stays inside the shrunken image)
    image = image.crop((0, 10, width, height))
    return crop_top(image, width, height - 10)

def split_lines(image, threshold, journal_no, page_no):
    # TODO: what if you miss a line or two at the top, as in 2-120?
    splits = []
    # Loop through the page in bite-size chunks and detect lines
    for chunk in range(100, image.height, 40):
        blues = get_blues(image.crop((100, chunk, image.width-100, chunk+40)), threshold)
        if len(blues) > 25:
            line_center = round(np.median(blues))
            splits.append(chunk+line_center)
    # TODO: Resolve duplicates
    distances = [splits[i] - splits[i-1] for i in range(1, len(splits))]
    split_distance = round(np.median(distances))
    # deviances = [distances[i] + splits[i+1] - splits[i] for i in range(1, len(splits)-1)]
    # to_remove = []
    # for i in range(1, len(splits)-1):
    #     # if abs(deviances[i] - 2*split_distance) < 5:
    #     if deviances[i-1] < 50:
    #         to_remove.append(splits[i-1])
    # splits = [split not in to_remove for split in splits]
    # np.delete(splits, to_remove).tolist()
    # img1 = ImageDraw.Draw(image)
    # for split in splits:
    #     img1.line([(0, split), (image.width, split)], fill='blue', width=1)
    # image.show()
    # Split the image and save each line
    for line, split in enumerate(splits):
        # TODO: finish logic for filenames
        image.crop((0, split-split_distance+10, image.width, split+25)).save(f'data/lines/{journal_no}-{page_no}-{line}.jpeg', 'JPEG')

def rotate_and_cleave_page(image_path, threshold, journal_no, page_no):
    # Open image
    image = Image.open(image_path).convert('RGB')
    # Estimate the skew of the red margin line and rotate the page to vertical
    before = get_reds(image_path)
    top_mean = np.mean(before[0])
    bottom_mean = np.mean(before[1])
    width, height = image.size
    top = round(0.15*height)
    bottom = round(0.85*height)
    if round(top_mean) - round(bottom_mean) > 3:
        slope = (top - bottom) / (top_mean - bottom_mean)
        angle = math.degrees(math.atan(slope)) + 90
        image = image.rotate(angle, resample=Image.BICUBIC, expand=True)
    # Trim the top of the page to remove any black from poor scan/rotation
    image = crop_top(image, width, height)
    # Get line splits for sidebar and body simultaneously
    split_lines(image, threshold, journal_no, page_no)
    # Split rotated image into sidebar and body text (not yet saved)
    split_point = np.mean(get_reds(image_path)[0])
    sidebar = image.crop(box=(0, 0, split_point, height))
    body = image.crop(box=(split_point, 0, width, height))

# Example usage
# input_image_path = "data/1-121.jpg"
# rotate_and_cleave_page(input_image_path, 10, 1, 121)

# Set contrast thresholds for blue line detection per notebook
thresholds = [8, 17, 8]
# Set page ranges to save from each notebook
page_ranges = {1: [*range(14, 154)], 2: [*range(2, 152)], 3: [*range(2, 155)]}
# This will decouple the filenames from the page numbers, so adjust for that
# page_offsets = [13, 1, 1]
# print(page_ranges[1])

# TODO: fix issue where loop breaks after every notebook's final page
for journal_index, journal_no in enumerate([1, 2, 3]):
    images = convert_from_path(f'data/originals/{journal_no}.pdf')
    for page_no, img in enumerate(images):
        # print(page_no)
        # TODO: consider refactoring image/path so that you don't need to save the intact pages as intermediaries?
        img.save(f'data/full_pages/{journal_no}-{page_no}.jpg', 'JPEG')
        if page_no in page_ranges[journal_no]:
            rotate_and_cleave_page(f'data/full_pages/{journal_no}-{page_no}.jpg', thresholds[journal_index], journal_no, page_no)
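
# Note: the paths above imply a directory layout like
#   data/originals/<journal_no>.pdf   (input scans)
#   data/full_pages/                  (intermediate full-page JPEGs)
#   data/lines/                       (output per-line crops)
# The script does not create these directories, so they must exist before running.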