7
7
It can also use Jaccard similarity, word counting, and overlapping words to measure similarity.
8
8
9
9
"""
10
- import sys
11
10
import webbrowser
12
11
from datetime import datetime
13
12
from os import listdir , path
14
13
from typing import List
15
14
15
+ from tqdm import tqdm
16
+
16
17
from scripts .html_writing import add_links_to_html_table , results_to_html , papers_comparison
17
18
from scripts .html_utils import writing_results
18
19
from scripts .processing_files import file_extension_call
19
20
from scripts .similarity import difflib_overlap
20
21
from scripts .utils import wait_for_file , get_student_names , parse_options
21
22
22
23
24
class MinimumFilesError(Exception):
    """Raised when there are fewer than two files for comparison."""
28
+
29
+
30
class UnsupportedFileError(Exception):
    """Raised when there are unsupported files in the input directory."""
34
+
35
+
36
class PathNotFoundError(Exception):
    """Raised when the specified input directory path does not exist."""
40
+
41
+
23
42
def main() -> None:
    """
    Main function to process and compare text files.

    Reads ``in_dir``, ``out_dir`` and ``block_size`` from ``parse_options()``,
    extracts the text of every file found in each sub-directory of ``in_dir``,
    scores every ordered pair of documents with ``difflib_overlap``, writes one
    comparison page per pair through ``papers_comparison`` and a
    ``_results.html`` summary table through ``results_to_html``, then opens the
    table in the web browser.

    Raises:
        PathNotFoundError: If ``in_dir`` does not exist.
        MinimumFilesError: If ``in_dir`` holds fewer than two candidate entries,
            or fewer than two documents could be extracted from it.
        UnsupportedFileError: If ``file_extension_call`` returns nothing for a file.
        RuntimeError: If the results file is still missing after waiting for it.
    """
    args = parse_options()
    in_dir, out_dir, block_size = args.in_dir, args.out_dir, args.block_size

    if not path.exists(in_dir):
        raise PathNotFoundError(f"The specified path does not exist: {in_dir}")

    if not path.isabs(in_dir):
        in_dir = path.abspath(in_dir)

    # List the input directory once and reuse it for both the sanity check and
    # the processing loop.
    entries = listdir(in_dir)

    # Dotted suffixes so that a name merely ending in "txt" (no extension dot)
    # is not counted as a supported document.
    supported_suffixes = (".txt", ".pdf", ".docx", ".odt")
    candidates = [
        entry for entry in entries if path.isdir(path.join(in_dir, entry)) or entry.endswith(supported_suffixes)
    ]

    if len(candidates) < 2:
        raise MinimumFilesError(
            "Minimum number of files is not present. Please check that there are at least two files to compare."
        )

    filenames, processed_files = [], []
    # NOTE(review): indexed below by the position of each entry in the directory
    # listing - presumably get_student_names lists in_dir in the same order; confirm.
    students_names = get_student_names(in_dir)

    for ind, direc in enumerate(tqdm(entries, desc="Processing Directories")):
        sub_dir = path.join(in_dir, direc)
        if not path.isdir(sub_dir):
            # Only sub-directories are processed; top-level files are skipped.
            continue
        for file in listdir(sub_dir):
            file_words = file_extension_call(str(path.join(sub_dir, file)))
            if not file_words:
                raise UnsupportedFileError(
                    "Remove files which are not txt, pdf, docx, or odt and run the script again."
                )
            processed_files.append(file_words)
            filenames.append(students_names[ind])

    # The first check only counts directory entries; make sure there really are
    # two extracted documents before comparing anything.
    if len(processed_files) < 2:
        raise MinimumFilesError(
            "Minimum number of files is not present. Please check that there are at least two files to compare."
        )

    if out_dir is not None and path.exists(out_dir):
        if not path.isabs(out_dir):
            out_dir = path.abspath(out_dir)
        results_directory = out_dir
    else:
        # No usable output directory given: let writing_results provide one,
        # keyed by the current timestamp.
        results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))

    difflib_scores: List[List[float]] = [[] for _ in processed_files]
    file_ind = 0

    for i, text in enumerate(tqdm(processed_files, desc="Comparing Files")):
        for j, text_bis in enumerate(processed_files):
            if i == j:
                # Placeholder score for a document compared with itself.
                difflib_scores[i].append(-1)
                continue
            difflib_scores[i].append(difflib_overlap(text, text_bis))
            papers_comparison(
                results_directory,
                file_ind,
                text,
                text_bis,
                (filenames[i], filenames[j]),
                block_size,
            )
            file_ind += 1

    results_directory = path.join(results_directory, "_results.html")
    print(f"Results saved at: {results_directory}")

    results_to_html(difflib_scores, filenames, results_directory)

    if wait_for_file(results_directory, 60):  # Wait for file to be created
        add_links_to_html_table(results_directory)
        webbrowser.open(results_directory)  # Open results HTML table
    else:
        raise RuntimeError("Results file was not created...")
102
124
103
125
104
126
if __name__ == "__main__" :
0 commit comments