Skip to content

Commit f11d2fc

Browse files
authored
feat: add tqdm and custom errors (#13)
1 parent 16f8f52 commit f11d2fc

File tree

2 files changed

+85
-62
lines changed

2 files changed

+85
-62
lines changed

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ nltk==3.6.6
44
odfpy==1.4.1
55
pdfplumber==0.5.28
66
tabulate==0.8.9
7+
tqdm==4.66.2

scripts/main.py

+84-62
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,38 @@
77
It can also use Jaccard Similarity, words counting, overlapping words for similarity
88
99
"""
10-
import sys
1110
import webbrowser
1211
from datetime import datetime
1312
from os import listdir, path
1413
from typing import List
1514

15+
from tqdm import tqdm
16+
1617
from scripts.html_writing import add_links_to_html_table, results_to_html, papers_comparison
1718
from scripts.html_utils import writing_results
1819
from scripts.processing_files import file_extension_call
1920
from scripts.similarity import difflib_overlap
2021
from scripts.utils import wait_for_file, get_student_names, parse_options
2122

2223

24+
class MinimumFilesError(Exception):
    """Signal that the input directory holds fewer than two comparable entries.

    ``main`` raises this when, after filtering the directory listing, fewer
    than two items remain, so no pairwise comparison can be made.
    """
28+
29+
30+
class UnsupportedFileError(Exception):
    """Signal that a file in the input directory could not be processed.

    ``main`` raises this when ``file_extension_call`` returns a falsy result
    for a file, asking the user to remove files that are not txt, pdf, docx
    or odt.
    """
34+
35+
36+
class PathNotFoundError(Exception):
    """Signal that the requested input directory path does not exist.

    ``main`` raises this before any processing when ``path.exists`` reports
    that the input directory is missing.
    """
40+
41+
2342
def main() -> None:
2443
"""
2544
Main function to process and compare text files.
@@ -36,69 +55,72 @@ def main() -> None:
3655
args = parse_options()
3756
in_dir, out_dir, block_size = args.in_dir, args.out_dir, args.block_size
3857

39-
if path.exists(in_dir): # Check if specified path exists
40-
if not path.isabs(in_dir):
41-
in_dir = path.abspath(in_dir)
42-
if len(listdir(in_dir)) > 1: # Check if there are at least 2 files at specified path
43-
filenames, processed_files = [], []
44-
students_names = get_student_names(in_dir)
45-
for ind, direc in enumerate(listdir(in_dir)):
46-
if path.isdir(path.join(in_dir, direc)):
47-
for file in listdir(path.join(in_dir, direc)):
48-
file_words = file_extension_call(str(path.join(in_dir, direc, file)))
49-
50-
if file_words: # If all files have supported format
51-
processed_files.append(file_words)
52-
filenames.append(students_names[ind])
53-
else: # At least one file was not supported
54-
print("Remove files which are not txt, pdf, docx or odt and run the script again.")
55-
sys.exit()
56-
if out_dir is not None and path.exists(out_dir):
57-
if not path.isabs(out_dir):
58-
out_dir = path.abspath(out_dir)
59-
results_directory = out_dir
60-
else:
61-
# Create new directory for storing html files
62-
results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))
63-
64-
difflib_scores: List[List[float]] = [[] for _ in range(len(processed_files))]
65-
file_ind = 0
66-
67-
for i, text in enumerate(processed_files):
68-
for j, text_bis in enumerate(processed_files):
69-
if i != j:
70-
# Append to the list the similarity score between text and text_bis
71-
difflib_scores[i].append(difflib_overlap(text, text_bis))
72-
73-
# Write text with matching blocks colored in results directory
74-
papers_comparison(
75-
results_directory,
76-
file_ind,
77-
text,
78-
text_bis,
79-
(filenames[i], filenames[j]),
80-
block_size,
81-
)
82-
file_ind += 1
83-
else:
84-
difflib_scores[i].append(-1)
85-
86-
results_directory = path.join(results_directory, "_results.html")
87-
print(results_directory)
88-
89-
results_to_html(difflib_scores, filenames, results_directory)
90-
91-
if wait_for_file(results_directory, 60): # Wait for file to be created
92-
add_links_to_html_table(results_directory)
93-
webbrowser.open(results_directory) # Open results HTML table
58+
if not path.exists(in_dir):
59+
raise PathNotFoundError(f"The specified path does not exist: {in_dir}")
60+
61+
if not path.isabs(in_dir):
62+
in_dir = path.abspath(in_dir)
63+
64+
files = [
65+
f for f in listdir(in_dir) if path.isdir(path.join(in_dir, f)) or f.endswith(("txt", "pdf", "docx", "odt"))
66+
]
67+
68+
if len(files) < 2:
69+
raise MinimumFilesError(
70+
"Minimum number of files is not present. Please check that there are at least two files to compare."
71+
)
72+
73+
filenames, processed_files = [], []
74+
students_names = get_student_names(in_dir)
75+
76+
for ind, direc in enumerate(tqdm(listdir(in_dir), desc="Processing Directories")):
77+
if path.isdir(path.join(in_dir, direc)):
78+
for file in listdir(path.join(in_dir, direc)):
79+
file_words = file_extension_call(str(path.join(in_dir, direc, file)))
80+
if file_words: # If all files have supported format
81+
processed_files.append(file_words)
82+
filenames.append(students_names[ind])
83+
else:
84+
raise UnsupportedFileError(
85+
"Remove files which are not txt, pdf, docx, or odt and run the script again."
86+
)
87+
88+
if out_dir is not None and path.exists(out_dir):
89+
if not path.isabs(out_dir):
90+
out_dir = path.abspath(out_dir)
91+
results_directory = out_dir
92+
else:
93+
results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))
94+
95+
difflib_scores: List[List[float]] = [[] for _ in range(len(processed_files))]
96+
file_ind = 0
97+
98+
for i, text in enumerate(tqdm(processed_files, desc="Comparing Files")):
99+
for j, text_bis in enumerate(processed_files):
100+
if i != j:
101+
difflib_scores[i].append(difflib_overlap(text, text_bis))
102+
papers_comparison(
103+
results_directory,
104+
file_ind,
105+
text,
106+
text_bis,
107+
(filenames[i], filenames[j]),
108+
block_size,
109+
)
110+
file_ind += 1
94111
else:
95-
print("Results file was not created...")
96-
else:
97-
print("Minimum number of files is not present. Please check that there are at least two files to compare.")
98-
sys.exit()
112+
difflib_scores[i].append(-1)
113+
114+
results_directory = path.join(results_directory, "_results.html")
115+
print(f"Results saved at: {results_directory}")
116+
117+
results_to_html(difflib_scores, filenames, results_directory)
118+
119+
if wait_for_file(results_directory, 60): # Wait for file to be created
120+
add_links_to_html_table(results_directory)
121+
webbrowser.open(results_directory) # Open results HTML table
99122
else:
100-
print("The specified path does not exist : " + in_dir)
101-
sys.exit()
123+
raise RuntimeError("Results file was not created...")
102124

103125

104126
if __name__ == "__main__":

0 commit comments

Comments
 (0)