-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
41 lines (29 loc) · 1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import gcp_utils
import os
import doc_ai_utils
import pandas as pd
# Run configuration. All four values ship empty and must be filled in
# before the script can do anything useful.
# bucket_name is handed to gcp_utils.list_blobs / gcp_utils.download_blob;
# the other three are forwarded to doc_ai_utils.process_document.
bucket_name = ""
project_id = ""
location = ""
processor_id = ""
# Step 1: fetch every blob whose name ends in ".pdf" from the bucket.
# The generator is lazy, so listing and downloading stay interleaved.
pdf_blob_names = (
    name for name in gcp_utils.list_blobs(bucket_name) if name.endswith(".pdf")
)
for pdf in pdf_blob_names:
    print(f'Downloading {pdf}')
    # The blob name is passed twice: once as the source blob and once as
    # the third argument (presumably the local destination path -- confirm
    # against gcp_utils.download_blob).
    gcp_utils.download_blob(bucket_name, pdf, pdf)
# Step 2: run the first local PDF through doc_ai_utils.process_document.
# os.listdir() returns entries in arbitrary order, so "first" simply means
# whichever ".pdf" entry the OS happens to report first.
for pdf in os.listdir():
    if pdf.endswith(".pdf"):
        document = doc_ai_utils.process_document(
            project_id=project_id,
            location=location,
            processor_id=processor_id,
            file=pdf,
        )
        print(document)
        break  # just need to process 1 document as a POC
else:
    # The loop finished without hitting `break`, i.e. no PDF was found.
    # Stop here with a clear message instead of letting the later
    # `document.entities` access fail with an unrelated NameError.
    raise SystemExit(
        "No .pdf file found in the current working directory; "
        "nothing to process."
    )
# Step 3: flatten the entities of the processed document into one record
# per entity and print them as a table.
dict_row = [
    {
        "Entity_type": entity.type_,
        "mentioned_text": entity.mention_text,
        "parsed_text": entity.normalized_value.text,
        # Scaled by 100 -- presumably turning a 0-1 score into a percentage.
        "confidence": entity.confidence * 100,
    }
    for entity in document.entities
]
print(pd.DataFrame(dict_row))