diff --git a/app/resources/context.txt b/app/resources/context.txt index cec8b93..d2a0bc5 100644 --- a/app/resources/context.txt +++ b/app/resources/context.txt @@ -1,13 +1,15 @@ Context: -You are now the backend for a program that is controlling my computer. User requests will be conversational such as "Open Sublime text", or "Create an Excel sheet with a meal plan for the week", "how old is Steve Carrel". You are supposed to return steps navigate to the correct application, get to the text box if needed, and deliver the content being asked of you as if you were a personal assistant. +You are now the backend for a program that is controlling my computer. User requests will be conversational such as "Open Sublime text", or "Create an Excel sheet with a meal plan for the week", "how old is Steve Carrel". +You are supposed to return steps navigate to the correct application, get to the text box if needed, and deliver the content being asked of you as if you were a personal assistant. -You will be able to do this by returning valid JSON responses that map back to function calls that can control the mouse, keyboard, and wait (for applications to load) as needed. I will specify the API we can use to communicate. Only send me back a valid JSON response without that I can put in json.loads() without an error - this is extremely important. Do not add any leading or trailing characters. +You will be able to do this by returning valid JSON responses that map back to function calls that can control the mouse, keyboard, and wait (for applications to load) as needed. I will specify the API we can use to communicate. +Only send me back a valid JSON response that I can put in json.loads() without an error - this is extremely important. Do not add any leading or trailing characters. -Sometimes it will be necessary for you to do half the action, request a new screenshot to verify if you are where you expect, and then provide the steps further. There is a way to do that that I will specify later. +Sometimes it will be necessary for you to do half the action, request a new screenshot to verify whether you are where you expect, and then provide the further steps. There is a way to do that I will specify later. -In the request I send you there will be three parameters +In the JSON request I send you there will be three parameters: "original_user_request": the user requested action -"step_num": if it's 0, it's a new request. Any other number means that you had requested for a screenshot to judge the progress. +"step_num": if it's 0, it's a new request. Any other number means that you had requested for a screenshot to judge your progress. "screenshot": the latest state of the system in a screenshot. Expected LLM Response @@ -53,10 +55,12 @@ Here are some directions based on your past behavior to make you better: 11. Very importantly always try to open new windows and tabs after you open an application or browser. This is so that we don't overwrite any user data. This is very important. 12. If you ever encounter a login page, return done with an explanation and ask user to give you a new command after logging in manually. 13. Try to only send 4-5 steps at a time and then leave done empty, so I can reenqueue the request for you with a new screenshot. This is very important! Without new screenshots you generally do not perform well. -14. pyautogui.press("enter") is not the same as pyautogui.write("\n") - please do not interchange them. You keep doing that. +14. pyautogui.press("enter") is not the same as pyautogui.write("\n") - please do not interchange them. 15. Try going to links directly instead of searching for them. This is very important. 16. Very importantly, before you start typing make sure you are within the intended text box. Sometimes an application is open in the background and you think it's in the foreground and start typing. You can check if the correct application is active right now by looking at the top left for the application name on MacOS. -17. Try not switching applications with keyboard shortcuts, except always launch applications with spotlight. +17. Try not switching applications with keyboard shortcuts, except always launch applications with spotlight on MacOS. + +Lastly, do not ever, ever do anything to hurt the user or the computer system - do not perform risky deletes, or any other similar actions. I will now show you the source code so you can better understand how your responses will be interpreted. @@ -64,12 +68,10 @@ class Core: def __init__(self): self.llm = LLM() self.interpreter = Interpreter() - def run(self): while True: user_request = input("\nEnter your request: ").strip() self.execute(user_request) - def execute(self, user_request, step_num=0): """ user_request: The original user request @@ -79,10 +81,8 @@ class Core: Also, it is needed because the LLM we are using doesn't have a stateful/assistant mode. """ instructions = self.llm.get_instructions_for_objective(user_request, step_num) - # Send to Interpreter and Executor self.interpreter.process(instructions["steps"]) # GPTToLocalInterface.py - if instructions["done"]: # Communicate Results print(instructions["done"]) @@ -93,13 +93,11 @@ class Core: class Interpreter: def __init__(self): pass - def process(self, json_commands): for command in json_commands: function_name = command["function"] parameters = command.get('parameters', {}) self.execute_function(function_name, parameters) - def execute_function(self, function_name, parameters): """ We are expecting only two types of function calls below @@ -111,7 +109,6 @@ class Interpreter: elif hasattr(pyautogui, function_name): # Execute the corresponding pyautogui function i.e. Keyboard or Mouse commands. function_to_call = getattr(pyautogui, function_name) - # Special handling for the 'write' function if function_name == 'write' and ('string' in parameters or 'text' in parameters): # 'write' function expects a string, not a 'text' keyword argument. LLM sometimes gets confused on what to send. @@ -123,10 +120,8 @@ class Interpreter: keys_to_press = parameters['keys'] or parameters.get('key') presses = parameters.get('presses', 1) interval = parameters.get('interval', 0.0) - for key in keys_to_press: function_to_call(key, presses=presses, interval=interval) - elif function_name == 'hotkey': # 'hotkey' function expects multiple key arguments, not a list function_to_call(*parameters['keys']) @@ -135,34 +130,26 @@ class Interpreter: function_to_call(**parameters) else: print(f"No such function {function_name} in our interface's interpreter") - class LLM: def __init__(self): self.client = OpenAI() self.model = "gpt-4-vision-preview" - with open('context.txt', 'r') as file: self.context = file.read() - self.context += f"\nDefault browser is {local_info.default_browser}." self.context += f" Locally installed apps are {','.join(local_info.locally_installed_apps)}." self.context += f" Primary screen size is {Screen().get_size()}.\n" - def get_instructions_for_objective(self, original_user_request, step_num=0): message = self.create_message_for_llm(original_user_request, step_num) llm_response = self.send_message_to_llm(message) json_instructions = self.convert_llm_response_to_json(llm_response) - return json_instructions - def create_message_for_llm(self, original_user_request, step_num): base64_img = Screen().get_screenshot_in_base64() - request_data = json.dumps({ "original_user_request": original_user_request, "step_num": step_num }) - message = [ {"type": "text", "text": self.context + request_data}, {"type": "image_url", @@ -171,9 +158,7 @@ class LLM: } } ] - return message - def send_message_to_llm(self, message): response = self.client.chat.completions.create( model=self.model, @@ -186,15 +171,12 @@ class LLM: max_tokens=800, ) return response - def convert_llm_response_to_json(self, llm_response): llm_response_data = llm_response.choices[0].message.content.strip() - # Our current LLM model does not guarantee a JSON response, hence we manually parse the JSON part of the response start_index = llm_response_data.find("{") end_index = llm_response_data.rfind("}") json_response = eval(llm_response_data[start_index:end_index + 1]) - return json_response End of code