Merge pull request #12 from AmberSahdev/feature/gpt-4o-support
GPT-4o Support added, modularized LLM code better
AmberSahdev authored Jun 20, 2024
2 parents 0fe2c54 + e600ef7 commit 511b42f
Showing 15 changed files with 437 additions and 144 deletions.
4 changes: 2 additions & 2 deletions app/core.py
@@ -20,8 +20,8 @@ def __init__(self):
self.llm = None
try:
self.llm = LLM()
except OpenAIError as _:
self.status_queue.put('Set your OpenAI API Key in Settings and Restart the App')
except OpenAIError as e:
self.status_queue.put(f'Set your OpenAI API Key in Settings and Restart the App. Error: {e}')

def execute_user_request(self, user_request: str) -> None:
self.stop_previous_request()
130 changes: 40 additions & 90 deletions app/llm.py
@@ -1,15 +1,13 @@
import json
import os
from pathlib import Path
from typing import Any

from openai import ChatCompletion
from openai import OpenAI

from models.factory import ModelFactory
from utils import local_info
from utils.screen import Screen
from utils.settings import Settings

DEFAULT_MODEL_NAME = 'gpt-4o'


class LLM:
"""
@@ -43,106 +41,58 @@ class LLM:
"done": ...
}
function is the function name to call in the executor.
function is the function name to call in the executer.
parameters are the parameters of the above function.
human_readable_justification is what we can use to debug in case the program fails somewhere, or to explain to the user why we're doing what we're doing.
done is null if the user request is not complete, and it's a string when it's complete that either contains the
information that the user asked for, or just acknowledges completion of the user-requested task. This is going
to be communicated to the user if it's present.
Note: Use code below to check whether gpt4v has assistant support yet.
from openai import OpenAI
client = OpenAI()
assistant = client.beta.assistants.create(
name="bot",
instructions="bot",
model="gpt-4-vision-preview",
tools=[{"type": "code_interpreter"}]
)
"""

def __init__(self):
settings_dict: dict[str, str] = Settings().get_dict()
self.settings_dict: dict[str, str] = Settings().get_dict()
model_name, base_url, api_key = self.get_settings_values()

self.model_name = model_name
context = self.read_context_txt_file()

self.model = ModelFactory.create_model(self.model_name, base_url, api_key, context)

base_url = settings_dict.get('base_url', 'https://api.openai.com/v1/').rstrip('/') + '/'
api_key = settings_dict.get('api_key')
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
def get_settings_values(self) -> tuple[str, str, str]:
model_name = self.settings_dict.get('model')
if not model_name:
model_name = DEFAULT_MODEL_NAME

base_url = self.settings_dict.get('base_url', '')
if not base_url:
base_url = 'https://api.openai.com/v1/'
base_url = base_url.rstrip('/') + '/'

api_key = self.settings_dict.get('api_key')

return model_name, base_url, api_key

def read_context_txt_file(self) -> str:
# Construct context for the assistant by reading context.txt and adding extra system information
context = ''
path_to_context_file = Path(__file__).resolve().parent.joinpath('resources', 'context.txt')
with open(path_to_context_file, 'r') as file:
self.context = file.read()
context += file.read()

self.context += f' Locally installed apps are {",".join(local_info.locally_installed_apps)}.'
self.context += f' OS is {local_info.operating_system}.'
self.context += f' Primary screen size is {Screen().get_size()}.\n'
context += f' Locally installed apps are {",".join(local_info.locally_installed_apps)}.'
context += f' OS is {local_info.operating_system}.'
context += f' Primary screen size is {Screen().get_size()}.\n'

if 'default_browser' in settings_dict.keys() and settings_dict['default_browser']:
self.context += f'\nDefault browser is {settings_dict["default_browser"]}.'
if 'default_browser' in self.settings_dict.keys() and self.settings_dict['default_browser']:
context += f'\nDefault browser is {self.settings_dict["default_browser"]}.'

if 'custom_llm_instructions' in settings_dict:
self.context += f'\nCustom user-added info: {settings_dict["custom_llm_instructions"]}.'
if 'custom_llm_instructions' in self.settings_dict:
context += f'\nCustom user-added info: {self.settings_dict["custom_llm_instructions"]}.'

self.client = OpenAI()

self.model = settings_dict.get('model')
if not self.model:
self.model = 'gpt-4-vision-preview'
self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url=base_url)
return context

def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
message: list[dict[str, Any]] = self.create_message_for_llm(original_user_request, step_num)
llm_response = self.send_message_to_llm(message)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json(llm_response)

return json_instructions

def create_message_for_llm(self, original_user_request, step_num) -> list[dict[str, Any]]:
base64_img: str = Screen().get_screenshot_in_base64()

request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})

# We have to add context every request for now which is expensive because our chosen model doesn't have a
# stateful/Assistant mode yet.
message = [
{'type': 'text', 'text': self.context + request_data},
{'type': 'image_url',
'image_url': {
'url': f'data:image/jpeg;base64,{base64_img}'
}
}
]

return message

def send_message_to_llm(self, message) -> ChatCompletion:
response = self.client.chat.completions.create(
model=self.model,
messages=[
{
'role': 'user',
'content': message,
}
],
max_tokens=800,
)
return response

def convert_llm_response_to_json(self, llm_response: ChatCompletion) -> dict[str, Any]:
llm_response_data: str = llm_response.choices[0].message.content.strip()

# Our current LLM model does not guarantee a JSON response hence we manually parse the JSON part of the response
# Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')

try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}

return json_response
return self.model.get_instructions_for_objective(original_user_request, step_num)

def cleanup(self):
self.model.cleanup()
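
The LLM docstring above spells out the JSON shape the model is expected to return for each step. As a concrete illustration, a single step's parsed instructions might look like the dict below; the specific function name and parameter values are hypothetical and not taken from the repository:

# Hypothetical parsed instructions for one step; keys follow the documented schema,
# values are illustrative only.
example_instructions = {
    'function': 'press_hotkey',          # hypothetical executor function name
    'parameters': {'keys': ['cmd', 'space']},
    'human_readable_justification': 'Opening Spotlight so the browser can be launched.',
    'done': None,                        # stays null until the user request is complete
}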
Empty file added app/models/__init__.py
13 changes: 13 additions & 0 deletions app/models/factory.py
@@ -0,0 +1,13 @@
from models.gpt4o import GPT4o
from models.gpt4v import GPT4v


class ModelFactory:
@staticmethod
def create_model(model_name, *args):
if model_name == 'gpt-4o':
return GPT4o(model_name, *args)
elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
return GPT4v(model_name, *args)
else:
raise ValueError(f'Unsupported model type {model_name}. Create entry in app/models/')
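
A minimal usage sketch of the factory, assuming the (model_name, base_url, api_key, context) constructor signature seen in models/gpt4o.py applies to every model the factory creates; the argument values below are placeholders and would normally come from Settings().get_dict() via app/llm.py:

from models.factory import ModelFactory

# Placeholder values for illustration only.
model = ModelFactory.create_model(
    'gpt-4o',                        # model_name routes to the GPT4o class above
    'https://api.openai.com/v1/',    # base_url
    'sk-placeholder',                # api_key
    'You control the computer ...',  # context string built in LLM.read_context_txt_file
)
instructions = model.get_instructions_for_objective('Open a browser', step_num=0)
model.cleanup()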
132 changes: 132 additions & 0 deletions app/models/gpt4o.py
@@ -0,0 +1,132 @@
import json
import time
from typing import Any

from models.model import Model
from openai.types.beta.threads.message import Message
from utils.screen import Screen


# TODO
# [ ] Function calling with assistants api - https://platform.openai.com/docs/assistants/tools/function-calling/quickstart

class GPT4o(Model):
def __init__(self, model_name, base_url, api_key, context):
super().__init__(model_name, base_url, api_key, context)

# GPT4o has Assistant Mode enabled that we can utilize to make Open Interface be more contextually aware
self.assistant = self.client.beta.assistants.create(
name='Open Interface Backend',
instructions=self.context,
# tools=[],
model='gpt-4o',
)

self.thread = self.client.beta.threads.create()

# IDs of images uploaded to OpenAI for use with the assistants API, can be cleaned up once thread is no longer needed
self.list_of_image_ids = []

def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
# Upload screenshot to OpenAI - Note: Don't delete files from openai while the thread is active
openai_screenshot_file_id = self.upload_screenshot_and_get_file_id()

self.list_of_image_ids.append(openai_screenshot_file_id)

# Format user request to send to LLM
formatted_user_request = self.format_user_request_for_llm(original_user_request, step_num,
openai_screenshot_file_id)

# Read response
llm_response = self.send_message_to_llm(formatted_user_request)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json_instructions(llm_response)

return json_instructions

def send_message_to_llm(self, formatted_user_request) -> Message:
message = self.client.beta.threads.messages.create(
thread_id=self.thread.id,
role='user',
content=formatted_user_request
)

run = self.client.beta.threads.runs.create_and_poll(
thread_id=self.thread.id,
assistant_id=self.assistant.id,
instructions=''
)

while run.status != 'completed':
print(f'Waiting for response, sleeping for 1. run.status={run.status}')
time.sleep(1)

if run.status == 'failed':
print(f'failed run run.required_action:{run.required_action} run.last_error: {run.last_error}\n\n')
return None

if run.status == 'completed':
# NOTE: Apparently right now the API doesn't have a way to retrieve just the last message???
# So instead you get all messages and take the latest one
response = self.client.beta.threads.messages.list(
thread_id=self.thread.id
)

return response.data[0]
else:
print('Run did not complete successfully.')
return None

def upload_screenshot_and_get_file_id(self):
# Files are used to upload documents like images that can be used with features like Assistants
# Assistants API cannot take base64 images like chat.completions API
filepath = Screen().get_screenshot_file()

response = self.client.files.create(
file=open(filepath, 'rb'),
purpose='vision'
)
return response.id

def format_user_request_for_llm(self, original_user_request, step_num, openai_screenshot_file_id) -> list[
dict[str, Any]]:
request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})

content = [
{
'type': 'text',
'text': request_data
},
{
'type': 'image_file',
'image_file': {
'file_id': openai_screenshot_file_id
}
}
]

return content

def convert_llm_response_to_json_instructions(self, llm_response: Message) -> dict[str, Any]:
llm_response_data: str = llm_response.content[0].text.value.strip()

# Our current LLM model does not guarantee a JSON response hence we manually parse the JSON part of the response
# Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')

try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}

return json_response

def cleanup(self):
# Note: Cannot delete screenshots while the thread is active. Cleanup during shut down.
for id in self.list_of_image_ids:
self.client.files.delete(id)
self.thread = self.client.beta.threads.create() # Using old thread even by accident would cause Image errors
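
A rough sketch of how a caller might drive GPT4o step by step with the methods above. The loop, the step cap, and the placeholder credentials are assumptions for illustration; in the app itself this orchestration happens in code outside this diff:

# Hypothetical driver loop: request one instruction step at a time until 'done' is set.
model = GPT4o('gpt-4o', 'https://api.openai.com/v1/', 'sk-placeholder', 'context string')
for step_num in range(10):  # arbitrary safety cap for the sketch
    instructions = model.get_instructions_for_objective('Open a browser', step_num)
    if not instructions or instructions.get('done'):
        break
    # ... hand instructions['function'] / instructions['parameters'] to the executor here ...
model.cleanup()  # delete uploaded screenshots and start a fresh thread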
64 changes: 64 additions & 0 deletions app/models/gpt4v.py
@@ -0,0 +1,64 @@
import json
from typing import Any

from models.model import Model
from openai import ChatCompletion
from utils.screen import Screen


class GPT4v(Model):
def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
message: list[dict[str, Any]] = self.format_user_request_for_llm(original_user_request, step_num)
llm_response = self.send_message_to_llm(message)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json_instructions(llm_response)
return json_instructions

def format_user_request_for_llm(self, original_user_request, step_num) -> list[dict[str, Any]]:
base64_img: str = Screen().get_screenshot_in_base64()

request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})

# We have to add context every request for now which is expensive because our chosen model doesn't have a
# stateful/Assistant mode yet.
message = [
{'type': 'text', 'text': self.context + request_data},
{'type': 'image_url',
'image_url': {
'url': f'data:image/jpeg;base64,{base64_img}'
}
}
]

return message

def send_message_to_llm(self, message) -> ChatCompletion:
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{
'role': 'user',
'content': message,
}
],
max_tokens=800,
)
return response

def convert_llm_response_to_json_instructions(self, llm_response: ChatCompletion) -> dict[str, Any]:
llm_response_data: str = llm_response.choices[0].message.content.strip()

# Our current LLM model does not guarantee a JSON response hence we manually parse the JSON part of the response
# Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')

try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}

return json_response
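
Both GPT4o and GPT4v recover the JSON payload by slicing between the first '{' and the last '}' of the model's reply, since a JSON-only response isn't guaranteed. A small standalone illustration of that parsing, using a made-up model reply:

import json

# Made-up model reply that wraps the JSON in extra prose.
llm_response_data = 'Sure, here is the next step:\n{"function": "sleep", "parameters": {"secs": 1}, "done": null}\nLet me know!'
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')
parsed = json.loads(llm_response_data[start_index:end_index + 1])
print(parsed['function'])  # -> sleep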