Skip to content

Commit e3cd11e

Browse files
authored
feat: replace slate3k with pdfminer.six (#20)
1 parent 6ca13d0 commit e3cd11e

File tree

3 files changed

+13
-35
lines changed

3 files changed

+13
-35
lines changed

requirements.txt

+2-3
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
1-
-e git+https://github.com/Wazzabeee/slate3k#egg=slate3k
21
beautifulsoup4==4.10.0
32
nltk==3.6.6
43
odfpy==1.4.1
5-
pdfplumber==0.5.28
64
tabulate==0.8.9
7-
tqdm==4.66.3
5+
tqdm==4.66.3
6+
pdfminer.six==20200517

scripts/processing_files.py

+10-30
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,9 @@
44
import zipfile
55
from os import path
66

7-
import pdfplumber
8-
import slate3k as slate
97
from odf import text, teletype
108
from odf.opendocument import load
9+
from pdfminer.high_level import extract_text
1110

1211

1312
def get_file_extension(filepath: str) -> str:
@@ -39,38 +38,19 @@ def file_extension_call(file: str) -> list:
3938

4039

4140
def get_words_from_pdf_file(pdf_path: str) -> list:
42-
"""Return list of words from pdf file at specified path"""
41+
"""Return list of words from pdf file at specified path using pdfminer.six."""
4342

44-
with open(pdf_path, "rb") as file:
45-
extracted_text = slate.PDF(file)
43+
# Extract text from the PDF file using pdfminer
44+
extracted_text = extract_text(pdf_path)
4645

47-
nested_lists_length_sum = sum(len(temp) for temp in extracted_text)
48-
count_line_return = sum(string.count("\n") for string in extracted_text)
46+
# Clean up the extracted text
47+
cleaned_text = re.sub(r"\s+", " ", extracted_text)
48+
cleaned_text = re.sub(r"<(.|\n)*?>", "", cleaned_text)
4949

50-
# Check \n ratio compared to length of text
51-
if nested_lists_length_sum / count_line_return > 10:
52-
for i, _ in enumerate(extracted_text):
53-
extracted_text[i] = extracted_text[i].replace("\n", " ")
54-
extracted_text[i] = re.sub("<(.|\n)*?>", "", str(extracted_text[i]))
55-
extracted_text[i] = re.findall(r"\w+", extracted_text[i].lower())
50+
# Extract words from the cleaned text
51+
words = re.findall(r"\w+", cleaned_text.lower())
5652

57-
return [item for sublist in extracted_text for item in sublist]
58-
59-
# Pdf format is not readable by Slate library
60-
return get_words_from_special_pdf(pdf_path)
61-
62-
63-
def get_words_from_special_pdf(pdf_path: str) -> list:
64-
"""Return list of words from a PDF file when the Slate library can't scrape it"""
65-
66-
with pdfplumber.open(pdf_path) as file:
67-
concat_string = ""
68-
for page in file.pages:
69-
text_page = page.extract_text() + "\n"
70-
concat_string += text_page
71-
72-
# Split the string into words and return as a list
73-
return concat_string.replace("\xa0", " ").strip().split()
53+
return words
7454

7555

7656
def get_words_from_txt_file(txt_path: str) -> list:

setup.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,9 @@ def get_version():
1414
"beautifulsoup4==4.10.0",
1515
"nltk==3.6.6",
1616
"odfpy==1.4.1",
17-
"pdfplumber==0.5.28",
1817
"tabulate==0.8.9",
1918
"tqdm==4.66.3",
20-
"slate3k @ git+https://github.com/Wazzabeee/slate3k#egg=slate3k",
19+
"pdfminer.six==20200517",
2120
],
2221
extras_require={
2322
"lint": ["pylint==3.0.2", "mypy==1.7.1", "flake8==6.1.0", "black==24.3.0", "types-tabulate"],

0 commit comments

Comments (0)