-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
203 lines (167 loc) · 7.57 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import base64
import os
import pprint
from datetime import datetime
from textwrap import dedent
import polars as pl
import structlog
from dotenv import load_dotenv
from openai import OpenAI
# from openinference.instrumentation.openai import OpenAIInstrumentor
# from phoenix.otel import register
from pydantic import BaseModel
def _format_rescuer(name: str | None, city: str | None) -> str:
    """Join the rescuer's name and city with a single space, skipping missing parts.

    Both fields are optional on the extracted form. Formatting them directly
    in an f-string would write the literal text "None" into the output cell,
    so only the parts that are actually present are joined. Returns an empty
    string when neither is present.
    """
    return " ".join(part for part in (name, city) if part)


def main():
    """Extract intake-form data from every input image and write it to a CSV.

    Loads environment variables from `.env`, finds the images to process,
    sends each one through `process_image`, appends one row per returned
    intake form to a dataframe seeded from the template CSV, and finally
    writes the result to a timestamped file under `outputs/`.
    """
    load_dotenv(override=True)
    log = structlog.get_logger()

    # Tracing is currently disabled; the setup is kept here for reference.
    # tracer_provider = register(
    #     # endpoint="http://localhost:6006/v1/traces",
    #     # endpoint="http://host.docker.internal:6007/v1/traces",
    #     endpoint=os.getenv("PHOENIX_ENDPOINT"),
    # )
    # OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

    image_paths = find_image_paths()
    log.info("Found images to process", count=len(image_paths), image_paths=image_paths)

    client = OpenAI()

    template_csv_name = "outputs/output.template.csv"
    current_time = datetime.now().strftime("%Y%m%d_%H%M")
    output_csv_name = f"outputs/output_{current_time}.csv"

    # Start from the template so the output keeps its column layout.
    df = pl.read_csv(template_csv_name)

    for image_path in image_paths:
        log.info("Processing image", image_path=image_path)
        intake_forms: IntakeForms = process_image(client, image_path)
        log.info("Received result", result=pprint.pformat(intake_forms.model_dump()))

        # A single image may describe several IDs, so it can yield several rows.
        for intake_form in intake_forms.list_of_intake_forms:
            df = df.vstack(
                pl.DataFrame(
                    {
                        "id_number": [intake_form.id_number],
                        "species": [intake_form.species],
                        "condition": [intake_form.condition],
                        "intake_date": [intake_form.intake_date],
                        # Making these wrap in one cell, separated by spaces and not newlines, to meet requirements
                        "rescuer_name": [
                            _format_rescuer(
                                intake_form.rescuer_name, intake_form.rescuer_city
                            )
                        ],
                        "county_found": [intake_form.county_found],
                        "final_disposition": [intake_form.final_disposition],
                        "county_released": [intake_form.county_released],
                        "disposition_date": [intake_form.disposition_date],
                    }
                )
            )
        log.info("Appended to dataframe", df_length=len(df), image_path=image_path)

    log.info("Finished processing images")
    log.info(
        "Writing to output CSV", output_csv_name=output_csv_name, df_length=len(df)
    )
    df.write_csv(output_csv_name)
def find_image_paths(
    images_directory: str = "inputs/images",
    extensions: tuple[str, ...] = (".jpg",),
) -> list[str]:
    """Return the paths of the image files in a directory, sorted by filename.

    Args:
        images_directory: Directory to scan (not recursive). Defaults to the
            project's `inputs/images` folder.
        extensions: Filename suffixes to accept, matched case-insensitively.
            Defaults to `.jpg` only.

    Returns:
        Paths formed by joining `images_directory` with each matching
        filename, ordered by filename.

    Raises:
        OSError: If `images_directory` cannot be listed.
    """
    # str.endswith needs a real tuple to test several suffixes at once.
    suffixes = tuple(extension.lower() for extension in extensions)
    filenames = sorted(
        filename
        for filename in os.listdir(images_directory)
        if filename.lower().endswith(suffixes)
    )
    return [os.path.join(images_directory, filename) for filename in filenames]
def encode_image(image_path):
    """Read the file at `image_path` and return its contents as base64 text."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
class IntakeForm(BaseModel):
    # One extracted intake record for a single animal ID.
    # The expected content of each field is described to the model in the
    # system prompt built by process_image.

    # Intake ID; the prompt asks for the "{year}-0001" .. "{year}-2000" form.
    id_number: str
    # Species name; the prompt expands abbreviations such as CAGO, GHOW and RTH.
    species: str
    # Condition notes, styled after the previous years' report.
    condition: str
    # Intake date; the prompt asks for MM.DD.YY.
    intake_date: str
    # Main rescuer's name; main() writes it into one cell together with rescuer_city.
    rescuer_name: str | None
    # Rescuer's city; the prompt asks for Indianapolis to be written as "Indpls".
    rescuer_city: str | None
    # Written as "Co Found" on the form.
    county_found: str | None
    # Written as "Final Disp" on the form; the prompt allows D, R, E, DOA, T or P.
    final_disposition: str | None
    # Written as "Co Rel" on the form; the prompt asks for "N/A" when the
    # final disposition is "D" or "E".
    county_released: str | None
    # Written as "DT" on the form; the prompt asks for MM.DD.YY.
    disposition_date: str | None
class IntakeForms(BaseModel):
    # Top-level response model passed as `response_format` in process_image.
    # One image may cover several IDs, so the result is a list of records.
    list_of_intake_forms: list[IntakeForm]
def _require_env(name: str) -> str:
    """Return the value of environment variable `name`, or raise if it is unset or empty."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value


def process_image(
    client: OpenAI,
    image_path: str,
) -> IntakeForms:
    """Send one intake-form image to the model and return the parsed records.

    The model name is read from `OPENAI_MODEL` and the two-digit year used
    throughout the prompt from `YEAR`. Lists of known conditions and species
    are read from a previous year's report and embedded in the system prompt
    as style and vocabulary guidance.

    Args:
        client: OpenAI client used to make the request.
        image_path: Path of the image file to send; it is sent as a JPEG
            data URL.

    Returns:
        The `IntakeForms` object parsed from the model's response.

    Raises:
        RuntimeError: If `OPENAI_MODEL` or `YEAR` is unset or empty, or if the
            response contains no parsed result (for example, a refusal).
    """
    # Fail early: a missing YEAR would otherwise put the text "None" into the
    # prompt, and a missing model would be sent to the API as None.
    model = _require_env("OPENAI_MODEL")
    year = _require_env("YEAR")

    base64_image = encode_image(image_path)

    # Get a list of conditions and species from the previous years' reports
    df = pl.read_csv("inputs/previous_years_reports/DNR-2020.csv")
    conditions = df["Condition"].unique().to_list()
    species = df["Species"].unique().to_list()

    system_prompt = dedent(
        f"""
        You are a helpful assistant who is helping a user to extract data from pictures of intake forms.
        Each intake form image could apply to just one ID or multiple IDs.
        In the case of multiple IDs, return an entry for each, with the same data if differences aren't specified.
        The ID numbers MUST range from 1 to 2000, alternately written as {year}-0001 to {year}-2000.
        For example, if you see an ID of 081-084, you should return a list of IntakeForm objects with IDs of {year}-0081, {year}-0082, {year}-0083, and {year}-0084.
        Additional notes:
        - CAGO is an abbreviation for Canada Goose
        - GHOW is an abbreviation for Great Horned Owl
        - RTH is an abbreviation for Red-Tailed Hawk
        - If the species is just "duck", use "Mallard" instead
        - You MUST return dates in the format MM.DD.YY, like 11.30.{year}
        - You MUST abbreviate Indianapolis as Indpls
        - You MUST write Indy as Indpls
        - If the final_disposition is "D" or "E", then the county_released MUST be "N/A"
        - The final_disposition MUST be one of the following: D, R, E, DOA, T, or P
        - The counties MUST all be counties from the state of Indiana
        - The city MUST be a city from the state of Indiana
        - The id_number MUST be between {year}-0001 and {year}-2000
        - On the form, county_found is written as "Co Found"
        - On the form, county_released is written as "Co Rel"
        - On the form, final_disposition is written as "Final Disp"
        - On the form, disposition_date is written as "DT"
        - All of the dates should be for 20{year}, written as {year} in the format MM.DD.YY
        - If the rescuer name doesn't have a last name, use NoLastName
        - If the condition is spread over multiple lines, join phrases with a comma where it makes sense, instead of using a hyphen
        - If the condition is something like "gosling" or "duckling", then make the condition "orphan"
        - If the city is Indianapolis, but no county is listed, then make a best guess at the county based on the city or other location notes
        - If the county found is something like "Brown Co", then make the county found just "Brown"
        - If the county is "unk", then make the county found "Unknown", but if it's blank, just leave it blank
        - For the rescuer name, enter just the main person's name, and not "+ 2 others" or any extra information
        - If the species is hatchling, then change it to Songbird
        Refer to a list of conditions from previous years' report and follow the style of the condition notes:
        {conditions}
        The species SHOULD be one of the following:
        {species}
        Thank you so much for your help!
        """
    )

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_prompt,
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What is in this image?",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            },
        ],
        response_format=IntakeForms,
    )

    message = completion.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Without this check the caller would receive None and fail later
        # with an unrelated-looking AttributeError.
        refusal = getattr(message, "refusal", None)
        raise RuntimeError(
            f"No parsed intake forms returned for {image_path}: "
            f"{refusal or 'no refusal message provided'}"
        )
    return parsed
if __name__ == "__main__":
    # Run the pipeline only when this file is executed as a script, not on import.
    main()