|
4 | 4 | import zipfile
|
5 | 5 | from os import path
|
6 | 6 |
|
7 |
| -import pdfplumber |
8 |
| -import slate3k as slate |
9 | 7 | from odf import text, teletype
|
10 | 8 | from odf.opendocument import load
|
| 9 | +from pdfminer.high_level import extract_text |
11 | 10 |
|
12 | 11 |
|
13 | 12 | def get_file_extension(filepath: str) -> str:
|
@@ -39,38 +38,19 @@ def file_extension_call(file: str) -> list:
|
39 | 38 |
|
40 | 39 |
|
def get_words_from_pdf_file(pdf_path: str) -> list:
    """Return the lowercase words found in the PDF file at ``pdf_path``.

    The document text is obtained with pdfminer.six's ``extract_text``.
    Every run of whitespace is collapsed to a single space, any span that
    starts with ``<`` and ends at the next ``>`` is removed, and the
    remaining text is lowercased and split into runs of word characters.
    """

    text_content = extract_text(pdf_path)

    # Clean-up passes, applied in order: whitespace normalisation first,
    # then removal of <...> spans.
    cleanup_passes = (
        (r"\s+", " "),
        (r"<(.|\n)*?>", ""),
    )
    for pattern, replacement in cleanup_passes:
        text_content = re.sub(pattern, replacement, text_content)

    return re.findall(r"\w+", text_content.lower())
74 | 54 |
|
75 | 55 |
|
76 | 56 | def get_words_from_txt_file(txt_path: str) -> list:
|
|
0 commit comments