-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathczytacz.py
72 lines (52 loc) · 2.07 KB
/
czytacz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pytesseract
from pdf2image import convert_from_path
import glob
import os.path
# conda install -c conda-forge poppler
# conda install -c conda-forge pytesseract
# conda install -c conda-forge tesseract
# pip install pdf2image
# pipreqs
# pip install -r requirements.txt
# python -v czytacz.py
# https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html#simplest-invocation-to-ocr-an-image
# from PIL import Image
# from googletrans import Translator
# from PyPDF2 import PdfFileReader
# OCR every page of the matched PDF file(s) with Tesseract and write the
# recognized text of each page to ./output/<pdf name>_page<N>.txt
# (N is the zero-based page index).

# Print the installed Tesseract version as a quick sanity check that the
# OCR engine is reachable before any heavy work starts.
print(pytesseract.get_tesseract_version())
# print(pytesseract.get_languages())

# On Windows, pdf2image may need an explicit Poppler location, e.g.
# poppler_path = r"C:\path\to\poppler-xx\bin"
pdfs = glob.glob(r"./rosliny.pdf")

# Create the output directory up front; opening a file for writing inside
# a missing directory would otherwise raise FileNotFoundError.
OUTPUT_DIR = "./output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for pdf_path in pdfs:
    # Use only the file name without directory and extension, so the output
    # path stays inside OUTPUT_DIR regardless of where the PDF was found.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    # Render each PDF page to an image at 500 DPI.
    pages = convert_from_path(pdf_path, dpi=500)

    for page_num, page_image in enumerate(pages):
        # 'pol' selects the Polish language data; other Tesseract language
        # codes can be used instead (see
        # https://pypi.org/project/pytesseract/).
        text = pytesseract.image_to_string(page_image, lang='pol')

        out_path = os.path.join(OUTPUT_DIR, f"{pdf_stem}_page{page_num}.txt")
        # Write UTF-8 explicitly: the platform default encoding may not be
        # able to represent Polish diacritics.
        with open(out_path, 'w', encoding='utf-8') as the_file:
            the_file.write(text)

print("ZMIANA PDF NA TEKST ZAKONCZONA")
# https://stackoverflow.com/questions/66995340/pdf-to-text-convert-using-python-pytesseract
# with open(doc_file, mode='w') as the_file:
# the_file.write(text)
# cd /tmp
# # curl https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh --output anaconda.sh
# curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh --output anaconda.sh
# bash anaconda.sh
# source ~/.bashrc
# conda search "^python$"
# conda create --name czytacz python=3.10.11
# conda activate czytacz
# python --version
# conda deactivate
# conda install -c conda-forge pytesseract
# conda install -c conda-forge tesseract
# pip install pdf2image
#
# # on every use:
# conda activate czytacz
# python -v czytacz.py
# /home/abc/gity/US/OCR_useme/TESERAKT_STANDALONE
# tesseract --tessdata-dir /usr/share/tesseract-ocr/5/tessdata ./eurotext.png eurotext-eng -l eng --psm 3 pdf