
Commit 604f28c

Include the token usage for every conversation and workspace
Related: #418

This PR introduces the changes necessary to track the tokens used per request and then process them so they can be returned by the API. Specific changes:

- Make sure we process the whole stream and record to the database at the very end.
- Include the flag `"stream_options": {"include_usage": True}` so the providers respond with the token counts (see the example request below).
- Add the necessary processing for the API.
- Modify the initial API models to display the tokens and their price correctly.
1 parent f11e28d commit 604f28c
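For reference, `stream_options` is the OpenAI-style request parameter that asks a streaming provider to append a final chunk with token counts. A minimal sketch of the kind of payload involved (illustrative model name and values, not code taken from this PR):

```python
# Illustrative only: the real request construction happens inside the provider pipeline.
request_body = {
    "model": "gpt-4o-mini",  # example model name
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
    # The flag added by this PR; providers then report usage in the final streamed chunk.
    "stream_options": {"include_usage": True},
}

# With the flag set, the last chunk of the stream typically carries a usage block such as:
# {"usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}}
```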

15 files changed, +169 -72 lines changed

src/codegate/api/v1.py (+2 −2)

@@ -477,11 +477,11 @@ def version_check():
     tags=["Workspaces", "Token Usage"],
     generate_unique_id_function=uniq_name,
 )
-async def get_workspace_token_usage(workspace_name: str) -> v1_models.TokenUsage:
+async def get_workspace_token_usage(workspace_name: str) -> v1_models.TokenUsageAggregate:
     """Get the token usage of a workspace."""
     # TODO: This is a dummy implementation. In the future, we should have a proper
     # implementation that fetches the token usage from the database.
-    return v1_models.TokenUsage(
+    return v1_models.TokenUsageAggregate(
         used_tokens=50,
         tokens_by_model=[
             v1_models.TokenUsageByModel(

src/codegate/api/v1_models.py (+70 −15)

@@ -1,12 +1,17 @@
 import datetime
 from enum import Enum
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import pydantic
+import requests
+from cachetools import TTLCache

 from codegate.db import models as db_models
 from codegate.pipeline.base import CodeSnippet

+# 1 day cache. Not keep all the models in the cache. Just the ones we have used recently.
+model_cost_cache = TTLCache(maxsize=2000, ttl=1 * 24 * 60 * 60)
+

 class Workspace(pydantic.BaseModel):
     name: str
@@ -105,15 +110,6 @@ class PartialQuestions(pydantic.BaseModel):
     type: QuestionType


-class PartialQuestionAnswer(pydantic.BaseModel):
-    """
-    Represents a partial conversation.
-    """
-
-    partial_questions: PartialQuestions
-    answer: Optional[ChatMessage]
-
-
 class ProviderType(str, Enum):
     """
     Represents the different types of providers we support.
@@ -124,24 +120,83 @@ class ProviderType(str, Enum):
     vllm = "vllm"


+class TokenUsage(pydantic.BaseModel):
+    input_tokens: int = 0
+    output_tokens: int = 0
+    input_cost: float = 0
+    output_cost: float = 0
+
+    @classmethod
+    def from_dict(cls, usage_dict: Dict) -> "TokenUsage":
+        return cls(
+            input_tokens=usage_dict.get("prompt_tokens", 0) or usage_dict.get("input_tokens", 0),
+            output_tokens=usage_dict.get("completion_tokens", 0)
+            or usage_dict.get("output_tokens", 0),
+            input_cost=0,
+            output_cost=0,
+        )
+
+    def __add__(self, other: "TokenUsage") -> "TokenUsage":
+        return TokenUsage(
+            input_tokens=self.input_tokens + other.input_tokens,
+            output_tokens=self.output_tokens + other.output_tokens,
+            input_cost=self.input_cost + other.input_cost,
+            output_cost=self.output_cost + other.output_cost,
+        )
+
+    def update_token_cost(self, model: str) -> None:
+        if not model_cost_cache:
+            model_cost = requests.get(
+                "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
+            )
+            model_cost_cache.update(model_cost.json())
+        model_cost = model_cost_cache.get(model, {})
+        input_cost_per_token = model_cost.get("input_cost_per_token", 0)
+        output_cost_per_token = model_cost.get("output_cost_per_token", 0)
+        self.input_cost = self.input_tokens * input_cost_per_token
+        self.output_cost = self.output_tokens * output_cost_per_token
+
+    def update_costs_based_on_model(self, model: str):
+        pass
+
+
 class TokenUsageByModel(pydantic.BaseModel):
     """
     Represents the tokens used by a model.
     """

     provider_type: ProviderType
     model: str
-    used_tokens: int
+    token_usage: TokenUsage


-class TokenUsage(pydantic.BaseModel):
+class TokenUsageAggregate(pydantic.BaseModel):
     """
     Represents the tokens used. Includes the information of the tokens used by model.
     `used_tokens` are the total tokens used in the `tokens_by_model` list.
     """

-    tokens_by_model: List[TokenUsageByModel]
-    used_tokens: int
+    tokens_by_model: Dict[str, TokenUsageByModel]
+    token_usage: TokenUsage
+
+    def add_model_token_usage(self, model_token_usage: TokenUsageByModel) -> None:
+        if model_token_usage.model in self.tokens_by_model:
+            self.tokens_by_model[
+                model_token_usage.model
+            ].token_usage += model_token_usage.token_usage
+        else:
+            self.tokens_by_model[model_token_usage.model] = model_token_usage
+        self.token_usage += model_token_usage.token_usage
+
+
+class PartialQuestionAnswer(pydantic.BaseModel):
+    """
+    Represents a partial conversation.
+    """
+
+    partial_questions: PartialQuestions
+    answer: Optional[ChatMessage]
+    model_token_usage: TokenUsageByModel


 class Conversation(pydantic.BaseModel):
@@ -154,7 +209,7 @@ class Conversation(pydantic.BaseModel):
     type: QuestionType
     chat_id: str
     conversation_timestamp: datetime.datetime
-    token_usage: Optional[TokenUsage]
+    token_usage_agg: Optional[TokenUsageAggregate]


 class AlertConversation(pydantic.BaseModel):
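A short usage sketch of the new models above (built only from the definitions in this diff; `"openai"` as a `ProviderType` value and the model name are assumptions for illustration):

```python
from codegate.api import v1_models

# Two usage payloads for the same model, in OpenAI-style and Anthropic-style key naming.
usage_a = v1_models.TokenUsage.from_dict({"prompt_tokens": 10, "completion_tokens": 20})
usage_b = v1_models.TokenUsage.from_dict({"input_tokens": 5, "output_tokens": 7})

by_model = v1_models.TokenUsageByModel(
    provider_type="openai",         # assumed ProviderType value
    model="gpt-4o-mini",            # example model name
    token_usage=usage_a + usage_b,  # __add__ sums tokens and costs field by field
)

agg = v1_models.TokenUsageAggregate(tokens_by_model={}, token_usage=v1_models.TokenUsage())
agg.add_model_token_usage(by_model)  # keyed by model name; repeated models accumulate
```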

src/codegate/api/v1_processing.py (+47 −20)

@@ -14,6 +14,9 @@
     PartialQuestionAnswer,
     PartialQuestions,
     QuestionAnswer,
+    TokenUsage,
+    TokenUsageAggregate,
+    TokenUsageByModel,
 )
 from codegate.db.connection import alert_queue
 from codegate.db.models import Alert, GetPromptWithOutputsRow
@@ -57,16 +60,17 @@ async def _is_system_prompt(message: str) -> bool:
     return False


-async def parse_request(request_str: str) -> Optional[str]:
+async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]:
     """
-    Parse the request string from the pipeline and return the message.
+    Parse the request string from the pipeline and return the message and the model.
     """
     try:
         request = json.loads(request_str)
     except Exception as e:
         logger.warning(f"Error parsing request: {request_str}. {e}")
-        return None
+        return None, ""

+    model = request.get("model", "")
     messages = []
     for message in request.get("messages", []):
         role = message.get("role")
@@ -91,57 +95,60 @@ async def parse_request(request_str: str) -> Optional[str]:
         if message_prompt and not await _is_system_prompt(message_prompt):
             messages.append(message_prompt)

-    # If still we don't have anything, return empty string
+    # If still we don't have anything, return None string
     if not messages:
-        return None
+        return None, model

-    # Only respond with the latest message
-    return messages
+    # Respond with the messages and the model
+    return messages, model


-async def parse_output(output_str: str) -> Optional[str]:
+async def parse_output(output_str: str) -> Tuple[Optional[str], TokenUsage]:
     """
     Parse the output string from the pipeline and return the message.
     """
     try:
         if output_str is None:
-            return None
+            return None, TokenUsage()

         output = json.loads(output_str)
     except Exception as e:
         logger.warning(f"Error parsing output: {output_str}. {e}")
-        return None
+        return None, TokenUsage()

-    def _parse_single_output(single_output: dict) -> str:
+    def _parse_single_output(single_output: dict) -> Tuple[str, TokenUsage]:
         single_output_message = ""
         for choice in single_output.get("choices", []):
             if not isinstance(choice, dict):
                 continue
             content_dict = choice.get("delta", {}) or choice.get("message", {})
             single_output_message += content_dict.get("content", "")
-        return single_output_message
+        return single_output_message, TokenUsage.from_dict(single_output.get("usage", {}))

     full_output_message = ""
+    full_token_usage = TokenUsage()
     if isinstance(output, list):
         for output_chunk in output:
             output_message = ""
+            token_usage = TokenUsage()
             if isinstance(output_chunk, dict):
-                output_message = _parse_single_output(output_chunk)
+                output_message, token_usage = _parse_single_output(output_chunk)
             elif isinstance(output_chunk, str):
                 try:
                     output_decoded = json.loads(output_chunk)
-                    output_message = _parse_single_output(output_decoded)
+                    output_message, token_usage = _parse_single_output(output_decoded)
                 except Exception:
                     logger.error(f"Error reading chunk: {output_chunk}")
             else:
                 logger.warning(
                     f"Could not handle output: {output_chunk}", out_type=type(output_chunk)
                 )
             full_output_message += output_message
+            full_token_usage += token_usage
     elif isinstance(output, dict):
-        full_output_message = _parse_single_output(output)
+        full_output_message, full_token_usage = _parse_single_output(output)

-    return full_output_message
+    return full_output_message, full_token_usage


 async def _get_question_answer(row: GetPromptWithOutputsRow) -> Optional[PartialQuestionAnswer]:
@@ -154,8 +161,8 @@ async def _get_question_answer(row: GetPromptWithOutputsRow) -> Optional[Partial
         request_task = tg.create_task(parse_request(row.request))
         output_task = tg.create_task(parse_output(row.output))

-    request_user_msgs = request_task.result()
-    output_msg_str = output_task.result()
+    request_user_msgs, model = request_task.result()
+    output_msg_str, token_usage = output_task.result()

     # If we couldn't parse the request, return None
     if not request_user_msgs:
@@ -176,7 +183,23 @@
         )
     else:
         output_message = None
-    return PartialQuestionAnswer(partial_questions=request_message, answer=output_message)
+
+    # Use the model to update the token cost
+    token_usage.update_token_cost(model)
+    provider = row.provider
+    # TODO: This should come from the database. For now, we are manually changing copilot to openai
+    # Change copilot provider to openai
+    if provider == "copilot":
+        provider = "openai"
+    model_token_usage = TokenUsageByModel(
+        model=model, token_usage=token_usage, provider_type=provider
+    )
+
+    return PartialQuestionAnswer(
+        partial_questions=request_message,
+        answer=output_message,
+        model_token_usage=model_token_usage,
+    )


 def parse_question_answer(input_text: str) -> str:
@@ -304,6 +327,7 @@ async def match_conversations(
     map_q_id_to_conversation = {}
     for group in grouped_partial_questions:
         questions_answers: List[QuestionAnswer] = []
+        token_usage_agg = TokenUsageAggregate(tokens_by_model={}, token_usage=TokenUsage())
         first_partial_qa = None
         for partial_question in sorted(group, key=lambda x: x.timestamp):
             # Partial questions don't contain the answer, so we need to find the corresponding
@@ -322,16 +346,19 @@
             qa = _get_question_answer_from_partial(selected_partial_qa)
             qa.question.message = parse_question_answer(qa.question.message)
             questions_answers.append(qa)
+            token_usage_agg.add_model_token_usage(selected_partial_qa.model_token_usage)

         # only add conversation if we have some answers
         if len(questions_answers) > 0 and first_partial_qa is not None:
+            if token_usage_agg.token_usage.input_tokens == 0:
+                token_usage_agg = None
             conversation = Conversation(
                 question_answers=questions_answers,
                 provider=first_partial_qa.partial_questions.provider,
                 type=first_partial_qa.partial_questions.type,
                 chat_id=first_partial_qa.partial_questions.message_id,
                 conversation_timestamp=first_partial_qa.partial_questions.timestamp,
-                token_usage=None,
+                token_usage_agg=token_usage_agg,
             )
             for qa in questions_answers:
                 map_q_id_to_conversation[qa.question.message_id] = conversation
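To make the new `parse_output` contract concrete, a rough sketch of how a stored streaming output would now be parsed (example data; assumes the module is importable as `codegate.api.v1_processing`):

```python
import asyncio
import json

from codegate.api.v1_processing import parse_output

# Example of a stored stream: two content chunks plus a final chunk carrying usage.
stored_output = json.dumps(
    [
        {"choices": [{"delta": {"content": "Hello"}}]},
        {"choices": [{"delta": {"content": " world"}}]},
        {"choices": [], "usage": {"prompt_tokens": 12, "completion_tokens": 2}},
    ]
)

message, usage = asyncio.run(parse_output(stored_output))
# message == "Hello world"; usage.input_tokens == 12, usage.output_tokens == 2
```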

src/codegate/inference/inference_engine.py (+1 −1)

@@ -35,7 +35,7 @@ def _close_models(self):
             model._sampler.close()
             model.close()

-    async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers=0):
+    async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers=0) -> Llama:
        """
        Returns Llama model object from __models if present. Otherwise, the model
        is loaded and added to __models and returned.

src/codegate/pipeline/output.py (+5 −11)

@@ -113,9 +113,6 @@ def _store_chunk_content(self, chunk: ModelResponse) -> None:
             if choice.delta is not None and choice.delta.content is not None:
                 self._context.processed_content.append(choice.delta.content)

-    async def _record_to_db(self):
-        await self._db_recorder.record_context(self._input_context)
-
     async def process_stream(
         self, stream: AsyncIterator[ModelResponse], cleanup_sensitive: bool = True
     ) -> AsyncIterator[ModelResponse]:
@@ -144,13 +141,6 @@ async def process_stream(

                 current_chunks = processed_chunks

-                # **Needed for Copilot**. This is a hacky way of recording in DB the context
-                # when we see the last chunk. Ideally this should be done in a `finally` or
-                # `StopAsyncIteration` but Copilot streams in an infite while loop so is not
-                # possible
-                if len(chunk.choices) > 0 and chunk.choices[0].get("finish_reason", "") == "stop":
-                    await self._record_to_db()
-
                 # Yield all processed chunks
                 for c in current_chunks:
                     self._store_chunk_content(c)
@@ -164,12 +154,13 @@ async def process_stream(
         finally:
             # Don't flush the buffer if we assume we'll call the pipeline again
             if cleanup_sensitive is False:
+                await self._db_recorder.record_context(self._input_context)
                 return

             # Process any remaining content in buffer when stream ends
             if self._context.buffer:
                 final_content = "".join(self._context.buffer)
-                yield ModelResponse(
+                chunk = ModelResponse(
                     id=self._buffered_chunk.id,
                     choices=[
                         StreamingChoices(
@@ -185,8 +176,11 @@ async def process_stream(
                     model=self._buffered_chunk.model,
                     object="chat.completion.chunk",
                 )
+                self._input_context.add_output(chunk)
+                yield chunk
                 self._context.buffer.clear()

+            await self._db_recorder.record_context(self._input_context)
            # Cleanup sensitive data through the input context
            if cleanup_sensitive and self._input_context and self._input_context.sensitive:
                self._input_context.sensitive.secure_cleanup()
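The change above moves database recording out of the `finish_reason == "stop"` check and into the `finally` path, once the stream has been fully consumed. This matters for token tracking because, with `include_usage` set, the usage-bearing chunk generally arrives after the "stop" chunk, so recording on "stop" would miss it. A minimal sketch of the pattern (simplified names, not the actual pipeline classes):

```python
from typing import Any, AsyncIterator


async def process_stream(stream: AsyncIterator[Any], recorder, context) -> AsyncIterator[Any]:
    """Simplified sketch: consume the whole stream, then record once at the very end."""
    try:
        async for chunk in stream:
            context.add_output(chunk)  # the trailing usage chunk is captured as well
            yield chunk
    finally:
        # Recording here, rather than on the "stop" chunk, ensures the usage chunk has
        # already been added to the context before it is persisted.
        await recorder.record_context(context)
```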

src/codegate/providers/base.py (−2)

@@ -193,8 +193,6 @@ async def _cleanup_after_streaming(
                 yield item
         finally:
             if context:
-                # Record to DB the objects captured during the stream
-                await self._db_recorder.record_context(context)
                 # Ensure sensitive data is cleaned up after the stream is consumed
                 if context.sensitive:
                     context.sensitive.secure_cleanup()
