Skip to content

Commit 988085b

Browse files
authored
feat: change folder structure to get all files in input_dir (#14)
1 parent f11d2fc commit 988085b

File tree

7 files changed

+31
-40
lines changed

7 files changed

+31
-40
lines changed

README.md

+9-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ $ copy-spotter [-s] [-o] [-h] input_directory
1919
***Positional Arguments:***
2020
* `input_directory`: Directory that directly contains the files to compare (txt, pdf, docx or odt), laid out as shown below (see `data/pdf/plagiarism` for an example)
2121

22+
```
23+
input_directory/
24+
25+
├── file_1.docx
26+
├── file_2.pdf
27+
└── file_3.pdf
28+
```
29+
2230
***Optional Arguments:***
2331
* `-s`, `--block-size`: Set the minimum number of consecutive similar words to detect. (Default is 2)
2432
* `-o`, `--out_dir`: Set the output directory for HTML files. (By default, a new directory called `results` is created)
@@ -72,8 +80,4 @@ $ python -m scripts.main [-s] [-o] [-h] input_directory
7280
---
7381
- Add more tests on existing functions
7482
- Implement OCR with tesseract for scanned documents
75-
- Add info in console for timing (tqdm)
76-
- Add CSS to HTML Template to make the results better looking
77-
- Add support for other folder structures (right now the package is expecting one pdf files per folder)
78-
- Add custom naming option for pdf files
79-
- Fix Slate3k by installing custom fork (check if still relevant)
83+
- Add custom naming option for pdf files

data/pdf/plagiarism/Axel Mare_report/report.txt

-1
This file was deleted.

data/pdf/plagiarism/John Doe_report/report_2.txt

-1
This file was deleted.

data/pdf/plagiarism/Lucas Pelipe_report/random_txt.txt

-1
This file was deleted.

data/pdf/plagiarism/Marie Pole_report/final_version.txt

-1
This file was deleted.

scripts/main.py

+10-15
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from scripts.html_utils import writing_results
1919
from scripts.processing_files import file_extension_call
2020
from scripts.similarity import difflib_overlap
21-
from scripts.utils import wait_for_file, get_student_names, parse_options
21+
from scripts.utils import wait_for_file, parse_options
2222

2323

2424
class MinimumFilesError(Exception):
@@ -62,7 +62,7 @@ def main() -> None:
6262
in_dir = path.abspath(in_dir)
6363

6464
files = [
65-
f for f in listdir(in_dir) if path.isdir(path.join(in_dir, f)) or f.endswith(("txt", "pdf", "docx", "odt"))
65+
f for f in listdir(in_dir) if path.isfile(path.join(in_dir, f)) and f.endswith(("txt", "pdf", "docx", "odt"))
6666
]
6767

6868
if len(files) < 2:
@@ -71,19 +71,14 @@ def main() -> None:
7171
)
7272

7373
filenames, processed_files = [], []
74-
students_names = get_student_names(in_dir)
75-
76-
for ind, direc in enumerate(tqdm(listdir(in_dir), desc="Processing Directories")):
77-
if path.isdir(path.join(in_dir, direc)):
78-
for file in listdir(path.join(in_dir, direc)):
79-
file_words = file_extension_call(str(path.join(in_dir, direc, file)))
80-
if file_words: # If all files have supported format
81-
processed_files.append(file_words)
82-
filenames.append(students_names[ind])
83-
else:
84-
raise UnsupportedFileError(
85-
"Remove files which are not txt, pdf, docx, or odt and run the script again."
86-
)
74+
75+
for file in tqdm(files, desc="Processing Files"):
76+
file_words = file_extension_call(str(path.join(in_dir, file)))
77+
if file_words: # If all files have supported format
78+
processed_files.append(file_words)
79+
filenames.append(path.splitext(file)[0])
80+
else:
81+
raise UnsupportedFileError("Remove files which are not txt, pdf, docx, or odt and run the script again.")
8782

8883
if out_dir is not None and path.exists(out_dir):
8984
if not path.isabs(out_dir):

scripts/processing_files.py

+12-16
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,29 @@
1313
def get_file_extension(filepath: str) -> str:
1414
"""Return the file extension of the file at the specified path"""
1515
if not path.isfile(filepath):
16-
print("Invalid file path")
17-
return ""
16+
raise ValueError(f"Invalid file path: {filepath}")
1817

1918
try:
2019
return path.splitext(filepath)[1]
2120
except IndexError:
22-
print("File extension error")
23-
return ""
21+
raise ValueError(f"File extension error for file: {filepath}")
2422

2523

2624
def file_extension_call(file: str) -> list:
2725
"""Map file extension to appropriate function"""
2826

2927
extension = get_file_extension(file)
3028

31-
if extension:
32-
if extension == ".pdf":
33-
return get_words_from_pdf_file(file)
34-
if extension == ".docx":
35-
return get_words_from_docx_file(file)
36-
if extension == ".odt":
37-
return get_words_from_odt_file(file)
38-
if extension == ".txt":
39-
return get_words_from_txt_file(file)
40-
41-
print("File format is not supported. Please convert to pdf, docx, odt or txt")
42-
return []
29+
if extension == ".pdf":
30+
return get_words_from_pdf_file(file)
31+
elif extension == ".docx":
32+
return get_words_from_docx_file(file)
33+
elif extension == ".odt":
34+
return get_words_from_odt_file(file)
35+
elif extension == ".txt":
36+
return get_words_from_txt_file(file)
37+
else:
38+
raise ValueError(f"File format not supported for file: {file}. " f"Please convert to pdf, docx, odt, or txt")
4339

4440

4541
def get_words_from_pdf_file(pdf_path: str) -> list:

0 commit comments

Comments
 (0)