Merge pull request #12 from AmberSahdev/feature/gpt-4o-support
GPT-4o Support added, modularized LLM code better
AmberSahdev authored Jun 20, 2024
2 parents 0fe2c54 + e600ef7 commit 511b42f
Showing 15 changed files with 437 additions and 144 deletions.
4 changes: 2 additions & 2 deletions app/core.py
@@ -20,8 +20,8 @@ def __init__(self):
self.llm = None
try:
self.llm = LLM()
except OpenAIError as _:
self.status_queue.put('Set your OpenAI API Key in Settings and Restart the App')
except OpenAIError as e:
self.status_queue.put(f'Set your OpenAI API Key in Settings and Restart the App. Error: {e}')

def execute_user_request(self, user_request: str) -> None:
self.stop_previous_request()
130 changes: 40 additions & 90 deletions app/llm.py
@@ -1,15 +1,13 @@
import json
import os
from pathlib import Path
from typing import Any

from openai import ChatCompletion
from openai import OpenAI

from models.factory import ModelFactory
from utils import local_info
from utils.screen import Screen
from utils.settings import Settings

DEFAULT_MODEL_NAME = 'gpt-4o'


class LLM:
"""
@@ -43,106 +41,58 @@ class LLM:
"done": ...
}
function is the function name to call in the executor.
function is the function name to call in the executer.
parameters are the parameters of the above function.
human_readable_justification is what we can use to debug in case the program fails somewhere, or to explain to the user why we're doing what we're doing.
done is null if the user request is not complete, and it's a string when it's complete that either contains the
information that the user asked for, or just acknowledges completion of the user-requested task. This is going
to be communicated to the user if it's present.
Note: Use code below to check whether gpt4v has assistant support yet.
from openai import OpenAI
client = OpenAI()
assistant = client.beta.assistants.create(
name="bot",
instructions="bot",
model="gpt-4-vision-preview",
tools=[{"type": "code_interpreter"}]
)
"""

def __init__(self):
settings_dict: dict[str, str] = Settings().get_dict()
self.settings_dict: dict[str, str] = Settings().get_dict()
model_name, base_url, api_key = self.get_settings_values()

self.model_name = model_name
context = self.read_context_txt_file()

self.model = ModelFactory.create_model(self.model_name, base_url, api_key, context)

base_url = settings_dict.get('base_url', 'https://api.openai.com/v1/').rstrip('/') + '/'
api_key = settings_dict.get('api_key')
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
def get_settings_values(self) -> tuple[str, str, str]:
model_name = self.settings_dict.get('model')
if not model_name:
model_name = DEFAULT_MODEL_NAME

base_url = self.settings_dict.get('base_url', '')
if not base_url:
base_url = 'https://api.openai.com/v1/'
base_url = base_url.rstrip('/') + '/'

api_key = self.settings_dict.get('api_key')

return model_name, base_url, api_key

def read_context_txt_file(self) -> str:
# Construct context for the assistant by reading context.txt and adding extra system information
context = ''
path_to_context_file = Path(__file__).resolve().parent.joinpath('resources', 'context.txt')
with open(path_to_context_file, 'r') as file:
self.context = file.read()
context += file.read()

self.context += f' Locally installed apps are {",".join(local_info.locally_installed_apps)}.'
self.context += f' OS is {local_info.operating_system}.'
self.context += f' Primary screen size is {Screen().get_size()}.\n'
context += f' Locally installed apps are {",".join(local_info.locally_installed_apps)}.'
context += f' OS is {local_info.operating_system}.'
context += f' Primary screen size is {Screen().get_size()}.\n'

if 'default_browser' in settings_dict.keys() and settings_dict['default_browser']:
self.context += f'\nDefault browser is {settings_dict["default_browser"]}.'
if 'default_browser' in self.settings_dict.keys() and self.settings_dict['default_browser']:
context += f'\nDefault browser is {self.settings_dict["default_browser"]}.'

if 'custom_llm_instructions' in settings_dict:
self.context += f'\nCustom user-added info: {settings_dict["custom_llm_instructions"]}.'
if 'custom_llm_instructions' in self.settings_dict:
context += f'\nCustom user-added info: {self.settings_dict["custom_llm_instructions"]}.'

self.client = OpenAI()

self.model = settings_dict.get('model')
if not self.model:
self.model = 'gpt-4-vision-preview'
self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url=base_url)
return context

def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
message: list[dict[str, Any]] = self.create_message_for_llm(original_user_request, step_num)
llm_response = self.send_message_to_llm(message)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json(llm_response)

return json_instructions

def create_message_for_llm(self, original_user_request, step_num) -> list[dict[str, Any]]:
base64_img: str = Screen().get_screenshot_in_base64()

request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})

# We have to add context every request for now which is expensive because our chosen model doesn't have a
# stateful/Assistant mode yet.
message = [
{'type': 'text', 'text': self.context + request_data},
{'type': 'image_url',
'image_url': {
'url': f'data:image/jpeg;base64,{base64_img}'
}
}
]

return message

def send_message_to_llm(self, message) -> ChatCompletion:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{
'role': 'user',
'content': message,
}
],
max_tokens=800,
)
return response

def convert_llm_response_to_json(self, llm_response: ChatCompletion) -> dict[str, Any]:
llm_response_data: str = llm_response.choices[0].message.content.strip()

# Our current LLM model does not guarantee a JSON response hence we manually parse the JSON part of the response
# Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')

try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}

return json_response
return self.model.get_instructions_for_objective(original_user_request, step_num)

def cleanup(self):
self.model.cleanup()
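
The LLM docstring above spells out the JSON shape the model is expected to return for each step. As a concrete illustration, a single step's parsed instructions might look like the dict below; the specific function name and parameter values are hypothetical and not taken from the repository:

# Hypothetical parsed instructions for one step; keys follow the documented schema,
# values are illustrative only.
example_instructions = {
    'function': 'press_hotkey',          # hypothetical executor function name
    'parameters': {'keys': ['cmd', 'space']},
    'human_readable_justification': 'Opening Spotlight so the browser can be launched.',
    'done': None,                        # stays null until the user request is complete
}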
Empty file added app/models/__init__.py
13 changes: 13 additions & 0 deletions app/models/factory.py
@@ -0,0 +1,13 @@
from models.gpt4o import GPT4o
from models.gpt4v import GPT4v


class ModelFactory:
@staticmethod
def create_model(model_name, *args):
if model_name == 'gpt-4o':
return GPT4o(model_name, *args)
elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
return GPT4v(model_name, *args)
else:
raise ValueError(f'Unsupported model type {model_name}. Create entry in app/models/')
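
A minimal usage sketch of the factory, assuming the (model_name, base_url, api_key, context) constructor signature seen in models/gpt4o.py applies to every model the factory creates; the argument values below are placeholders and would normally come from Settings().get_dict() via app/llm.py:

from models.factory import ModelFactory

# Placeholder values for illustration only.
model = ModelFactory.create_model(
    'gpt-4o',                        # model_name routes to the GPT4o class above
    'https://api.openai.com/v1/',    # base_url
    'sk-placeholder',                # api_key
    'You control the computer ...',  # context string built in LLM.read_context_txt_file
)
instructions = model.get_instructions_for_objective('Open a browser', step_num=0)
model.cleanup()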
132 changes: 132 additions & 0 deletions app/models/gpt4o.py
@@ -0,0 +1,132 @@
import json
import time
from typing import Any

from models.model import Model
from openai.types.beta.threads.message import Message
from utils.screen import Screen


# TODO
# [ ] Function calling with assistants api - https://platform.openai.com/docs/assistants/tools/function-calling/quickstart

class GPT4o(Model):
def __init__(self, model_name, base_url, api_key, context):
super().__init__(model_name, base_url, api_key, context)

# GPT4o has Assistant Mode enabled that we can utilize to make Open Interface be more contextually aware
self.assistant = self.client.beta.assistants.create(
name='Open Interface Backend',
instructions=self.context,
# tools=[],
model='gpt-4o',
)

self.thread = self.client.beta.threads.create()

# IDs of images uploaded to OpenAI for use with the assistants API, can be cleaned up once thread is no longer needed
self.list_of_image_ids = []

def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
# Upload screenshot to OpenAI - Note: Don't delete files from openai while the thread is active
openai_screenshot_file_id = self.upload_screenshot_and_get_file_id()

self.list_of_image_ids.append(openai_screenshot_file_id)

# Format user request to send to LLM
formatted_user_request = self.format_user_request_for_llm(original_user_request, step_num,
openai_screenshot_file_id)

# Read response
llm_response = self.send_message_to_llm(formatted_user_request)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json_instructions(llm_response)

return json_instructions

def send_message_to_llm(self, formatted_user_request) -> Message:
message = self.client.beta.threads.messages.create(
thread_id=self.thread.id,
role='user',
content=formatted_user_request
)

run = self.client.beta.threads.runs.create_and_poll(
thread_id=self.thread.id,
assistant_id=self.assistant.id,
instructions=''
)

while run.status != 'completed':
print(f'Waiting for response, sleeping for 1. run.status={run.status}')
time.sleep(1)

if run.status == 'failed':
print(f'failed run run.required_action:{run.required_action} run.last_error: {run.last_error}\n\n')
return None

if run.status == 'completed':
# NOTE: Apparently right now the API doesn't have a way to retrieve just the last message???
# So instead you get all messages and take the latest one
response = self.client.beta.threads.messages.list(
thread_id=self.thread.id
)

return response.data[0]
else:
print('Run did not complete successfully.')
return None

def upload_screenshot_and_get_file_id(self):
# Files are used to upload documents like images that can be used with features like Assistants
# Assistants API cannot take base64 images like chat.completions API
filepath = Screen().get_screenshot_file()

response = self.client.files.create(
file=open(filepath, 'rb'),
purpose='vision'
)
return response.id

def format_user_request_for_llm(self, original_user_request, step_num, openai_screenshot_file_id) -> list[
dict[str, Any]]:
request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})

content = [
{
'type': 'text',
'text': request_data
},
{
'type': 'image_file',
'image_file': {
'file_id': openai_screenshot_file_id
}
}
]

return content

def convert_llm_response_to_json_instructions(self, llm_response: Message) -> dict[str, Any]:
llm_response_data: str = llm_response.content[0].text.value.strip()

# Our current LLM model does not guarantee a JSON response hence we manually parse the JSON part of the response
# Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')

try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}

return json_response

def cleanup(self):
# Note: Cannot delete screenshots while the thread is active. Cleanup during shut down.
for id in self.list_of_image_ids:
self.client.files.delete(id)
self.thread = self.client.beta.threads.create() # Using old thread even by accident would cause Image errors
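
A rough sketch of how a caller might drive GPT4o step by step with the methods above. The loop, the step cap, and the placeholder credentials are assumptions for illustration; in the app itself this orchestration happens in code outside this diff:

# Hypothetical driver loop: request one instruction step at a time until 'done' is set.
model = GPT4o('gpt-4o', 'https://api.openai.com/v1/', 'sk-placeholder', 'context string')
for step_num in range(10):  # arbitrary safety cap for the sketch
    instructions = model.get_instructions_for_objective('Open a browser', step_num)
    if not instructions or instructions.get('done'):
        break
    # ... hand instructions['function'] / instructions['parameters'] to the executor here ...
model.cleanup()  # delete uploaded screenshots and start a fresh thread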
64 changes: 64 additions & 0 deletions app/models/gpt4v.py
@@ -0,0 +1,64 @@
import json
from typing import Any

from models.model import Model
from openai import ChatCompletion
from utils.screen import Screen


class GPT4v(Model):
def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
message: list[dict[str, Any]] = self.format_user_request_for_llm(original_user_request, step_num)
llm_response = self.send_message_to_llm(message)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json_instructions(llm_response)
return json_instructions

def format_user_request_for_llm(self, original_user_request, step_num) -> list[dict[str, Any]]:
base64_img: str = Screen().get_screenshot_in_base64()

request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})

# We have to add context every request for now which is expensive because our chosen model doesn't have a
# stateful/Assistant mode yet.
message = [
{'type': 'text', 'text': self.context + request_data},
{'type': 'image_url',
'image_url': {
'url': f'data:image/jpeg;base64,{base64_img}'
}
}
]

return message

def send_message_to_llm(self, message) -> ChatCompletion:
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{
'role': 'user',
'content': message,
}
],
max_tokens=800,
)
return response

def convert_llm_response_to_json_instructions(self, llm_response: ChatCompletion) -> dict[str, Any]:
llm_response_data: str = llm_response.choices[0].message.content.strip()

# Our current LLM model does not guarantee a JSON response hence we manually parse the JSON part of the response
# Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')

try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}

return json_response
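
Both GPT4o and GPT4v recover the JSON payload by slicing between the first '{' and the last '}' of the model's reply, since a JSON-only response isn't guaranteed. A small standalone illustration of that parsing, using a made-up model reply:

import json

# Made-up model reply that wraps the JSON in extra prose.
llm_response_data = 'Sure, here is the next step:\n{"function": "sleep", "parameters": {"secs": 1}, "done": null}\nLet me know!'
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')
parsed = json.loads(llm_response_data[start_index:end_index + 1])
print(parsed['function'])  # -> sleep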