Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: feat: Initial code to load workspaces from a specific container path #583

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion sql/schema/schema.sql
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
-- Schema for codegate database using SQLite

-- Workspaces table
-- One row per git repository discovered under the scanned container path.
-- Populated at startup by the workspace scanner.
CREATE TABLE workspaces (
id TEXT PRIMARY KEY, -- UUID stored as TEXT
name TEXT,
folder_tree_json TEXT -- JSON stored as TEXT; serialized directory tree of the repository
);

-- Prompts table
-- NOTE: the original diff left a trailing comma after the FOREIGN KEY clause
-- (and duplicated the `type` column line), which is a syntax error in SQLite.
CREATE TABLE prompts (
    id TEXT PRIMARY KEY, -- UUID stored as TEXT
    workspace_id TEXT NOT NULL,
    timestamp DATETIME NOT NULL,
    provider TEXT, -- VARCHAR(255)
    request TEXT NOT NULL, -- Record the full request that arrived to the server
    type TEXT NOT NULL, -- VARCHAR(50) (e.g. "fim", "chat")
    FOREIGN KEY (workspace_id) REFERENCES workspaces(id)
);

-- Outputs table
Expand Down Expand Up @@ -41,6 +50,7 @@ CREATE TABLE settings (
);

-- Create indexes for foreign keys and frequently queried columns
CREATE INDEX idx_prompts_workspace_id ON prompts(workspace_id); -- supports the prompts -> workspaces FK added in this change
CREATE INDEX idx_outputs_prompt_id ON outputs(prompt_id);
CREATE INDEX idx_alerts_prompt_id ON alerts(prompt_id);
CREATE INDEX idx_prompts_timestamp ON prompts(timestamp);
Expand Down
2 changes: 2 additions & 0 deletions src/codegate/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from codegate.providers.copilot.provider import CopilotProvider
from codegate.server import init_app
from codegate.storage.utils import restore_storage_backup
from codegate.workspaces.workspaces import Workspaces


class UvicornServer:
Expand Down Expand Up @@ -318,6 +319,7 @@ def serve(
else:
click.echo("Existing Certificates are already present.")

Workspaces().read_workspaces('/app/codegate_workspaces', cfg.ignore_paths_workspaces)
# Initialize secrets manager and pipeline factory
secrets_manager = SecretsManager()
pipeline_factory = PipelineFactory(secrets_manager)
Expand Down
3 changes: 3 additions & 0 deletions src/codegate/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ class Config:
force_certs: bool = False

max_fim_hash_lifetime: int = 60 * 5 # Time in seconds. Default is 5 minutes.
# Path components skipped when scanning workspaces. These are the defaults;
# intended to become CLI-configurable later (per PR discussion).
# Annotated with a default_factory so the dataclass machinery treats it as a
# real per-instance field (consistent with provider_urls below) instead of a
# shared mutable class attribute.
ignore_paths_workspaces: List[str] = field(
    default_factory=lambda: [
        ".git",
        "__pycache__",
        ".venv",
        ".DS_Store",
        "node_modules",
        ".pytest_cache",
        ".ruff_cache",
    ]
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it make sense to just include the contents of gitignore?
Sounds like we should make this configurable down the road.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did consider using the contents of .gitignore, but relying on it would mean skipping files that may contain secrets and could still be leaked to LLMs.

I was planning to make ignore_paths_workspaces configurable through the CLI; I just didn't have time to do so. The values here would be the defaults.


# Provider URLs with defaults
provider_urls: Dict[str, str] = field(default_factory=lambda: DEFAULT_PROVIDER_URLS.copy())
Expand Down
28 changes: 28 additions & 0 deletions src/codegate/db/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
GetPromptWithOutputsRow,
Output,
Prompt,
Workspace,
)
from codegate.pipeline.base import PipelineContext

Expand Down Expand Up @@ -252,6 +253,33 @@ async def record_context(self, context: Optional[PipelineContext]) -> None:
except Exception as e:
logger.error(f"Failed to record context: {context}.", error=str(e))

async def record_workspaces(self, workspaces: List[Workspace]) -> List[Workspace]:
    """Insert the given workspaces concurrently and return the recorded rows.

    Args:
        workspaces: Workspace models to persist.

    Returns:
        The list of workspaces actually recorded (rows returned by the
        INSERT ... RETURNING statement). Empty list when nothing was inserted.
    """
    if not workspaces:
        # Was a bare `return` (None), which violated the declared return type.
        return []
    sql = text(
        """
        INSERT INTO workspaces (id, name, folder_tree_json)
        VALUES (:id, :name, :folder_tree_json)
        RETURNING *
        """
    )
    workspaces_tasks = []
    async with asyncio.TaskGroup() as tg:
        for workspace in workspaces:
            try:
                result = tg.create_task(self._execute_update_pydantic_model(workspace, sql))
                workspaces_tasks.append(result)
            except Exception as e:
                # Was "Failed to record alert" — copy-paste from record_alerts.
                # NOTE(review): exceptions raised *inside* the tasks surface at
                # TaskGroup exit as an ExceptionGroup, not here — confirm callers
                # handle that.
                logger.error(f"Failed to record workspace: {workspace}.", error=str(e))

    recorded_workspaces = []
    for workspace_coro in workspaces_tasks:
        workspace_recorded = workspace_coro.result()
        if workspace_recorded:
            recorded_workspaces.append(workspace_recorded)

    return recorded_workspaces


class DbReader(DbCodeGate):

Expand Down
6 changes: 6 additions & 0 deletions src/codegate/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ class Setting(pydantic.BaseModel):
other_settings: Optional[Any]


class Workspace(pydantic.BaseModel):
    # Mirrors the `workspaces` table in sql/schema/schema.sql.
    # NOTE(review): id holds a UUID serialized as a string (TEXT column); it is
    # typed Any here — narrowing to `str` would change pydantic validation
    # behavior, so confirm all callers before tightening.
    id: Any
    name: str
    # JSON-encoded folder tree of the repository (stored as TEXT).
    folder_tree_json: str


# Models for select queries


Expand Down
Empty file.
80 changes: 80 additions & 0 deletions src/codegate/workspaces/workspaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import asyncio
import json
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Union

from pydantic import BaseModel

from codegate.db.connection import DbRecorder
from codegate.db.models import Workspace


class Folder(BaseModel):
    # File names (not full paths) directly contained in this folder.
    # The mutable default is safe here: pydantic copies field defaults
    # per instance, unlike plain class attributes.
    files: List[str] = []


class Repository(BaseModel):
    # Repository directory name (the folder containing `.git`).
    name: str
    # Map of folder path relative to the repo root -> Folder listing its files.
    folder_tree: Dict[str, Folder]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the intent to store the whole directory tree of a repository?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about storing the root of the repo instead of the whole filesystem?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the intent is to store the whole directory tree of a repository. The reasoning behind it is to do fast lookups when we see a path in the received code snippets. Right now, we get the path of a code snippet if it was supplied for context to the LLM. Example:

{
  "messages": [
    {
      "role": "user",
      "content": "\n\n```py codegate/src/codegate/pipeline/factory.py (1-57)\nfrom typing import List\n\nfrom codegate.config import Config\nfrom codegate.pipeline.base import PipelineStep, SequentialPipelineProcessor\nfrom codegate.pipeline.codegate_context_retriever.codegate import CodegateContextRetriever\nfrom codegate.pipeline.extract_snippets.extract_snippets import CodeSnippetExtractor\nfrom codegate.pipeline.extract_snippets.output import CodeCommentStep\nfrom codegate.pipeline.output import OutputPipelineProcessor, OutputPipelineStep\nfrom codegate.pipeline.secrets.manager import SecretsManager\nfrom codegate.pipeline.secrets.secrets import (\n    CodegateSecrets,\n    SecretRedactionNotifier,\n    SecretUnredactionStep,\n)\nfrom codegate.pipeline.system_prompt.codegate import SystemPrompt\nfrom codegate.pipeline.version.version import CodegateVersion\n\n\nclass PipelineFactory:\n    def __init__(self, secrets_manager: SecretsManager):\n        self.secrets_manager = secrets_manager\n\n    def create_input_pipeline(self) -> SequentialPipelineProcessor:\n        input_steps: List[PipelineStep] = [\n            # make sure that this step is always first in the pipeline\n            # the other steps might send the request to a LLM for it to be analyzed\n            # and without obfuscating the secrets, we'd leak the secrets during those\n            # later steps\n            CodegateSecrets(),\n            CodegateVersion(),\n            CodeSnippetExtractor(),\n            CodegateContextRetriever(),\n            SystemPrompt(Config.get_config().prompts.default_chat),\n        ]\n        return SequentialPipelineProcessor(input_steps, self.secrets_manager, is_fim=False)\n\n    def create_fim_pipeline(self) -> SequentialPipelineProcessor:\n        fim_steps: List[PipelineStep] = [\n            CodegateSecrets(),\n        ]\n        return SequentialPipelineProcessor(fim_steps, self.secrets_manager, is_fim=True)\n\n    def create_output_pipeline(self) -> 
OutputPipelineProcessor:\n        output_steps: List[OutputPipelineStep] = [\n            SecretRedactionNotifier(),\n            SecretUnredactionStep(),\n            CodeCommentStep(),\n        ]\n        return OutputPipelineProcessor(output_steps)\n\n    def create_fim_output_pipeline(self) -> OutputPipelineProcessor:\n        fim_output_steps: List[OutputPipelineStep] = [\n            # temporarily disabled\n            # SecretUnredactionStep(),\n        ]\n        return OutputPipelineProcessor(fim_output_steps)\n\n```\nwhats this code doing?"
    }
  ],
  "model": "hosted_vllm/unsloth/Qwen2.5-Coder-32B-Instruct",
  "max_tokens": 4096,
  "stream": true,
  "base_url": "https://inference.codegate.ai/v1"
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha, that makes sense. Let's give this a little bit of thought; every time a file is added/removed we'd have to rewrite the JSON blob into the database and that's not optimal either.



class FolderRepoScanner:
    """Scans a directory tree for git repositories and captures their folder structure."""

    def __init__(self, ignore_paths: Optional[List[str]] = None):
        # Normalize None to an empty list so _should_skip can iterate safely.
        if ignore_paths is None:
            ignore_paths = []
        self.ignore_paths = ignore_paths

    def _should_skip(self, path: Path) -> bool:
        """Return True if any component of `path` matches an ignored name."""
        return any(part in path.parts for part in self.ignore_paths)

    def _read_repository_structure(self, repo_path: Path) -> Dict[str, Folder]:
        """Walk `repo_path` recursively, mapping each relative folder path to its Folder."""
        folder_tree: Dict[str, Folder] = {}
        for path in repo_path.rglob('*'):
            if self._should_skip(path):
                continue

            relative_path = path.relative_to(repo_path)
            if path.is_dir():
                folder_tree[str(relative_path)] = Folder()
            else:
                # A file can be visited before its parent directory entry
                # exists; create the Folder on demand.
                parent_dir = str(relative_path.parent)
                if parent_dir not in folder_tree:
                    folder_tree[parent_dir] = Folder()
                folder_tree[parent_dir].files.append(path.name)
        return folder_tree

    def read(self, path_str: Union[str, Path]) -> List[Repository]:
        """Find git repositories (directories containing `.git`) under `path_str`.

        Returns an empty list when `path_str` is not a directory.
        """
        path_dir = Path(path_str)
        if not path_dir.is_dir():
            print(f"Path {path_dir} is not a directory")
            return []

        found_repos = []
        for child_path in path_dir.rglob('*'):
            # BUG FIX: the ignore list was only applied when reading a repo's
            # structure, not while locating repos — so discovery still descended
            # into node_modules, .venv, etc. Apply it here too.
            if self._should_skip(child_path):
                continue
            if child_path.is_dir() and (child_path / ".git").exists():
                repo_structure = self._read_repository_structure(child_path)
                new_repo = Repository(name=child_path.name, folder_tree=repo_structure)
                found_repos.append(new_repo)
                print(f"Found repository at {child_path}.")

        return found_repos

class Workspaces:
    """Discovers git repositories on disk and records them as workspaces in the DB."""

    def __init__(self):
        self._db_recorder = DbRecorder()

    def read_workspaces(self, path: str, ignore_paths: Optional[List[str]] = None) -> None:
        """Scan `path` for repositories and persist one Workspace row per repo.

        Args:
            path: Root directory to scan (e.g. the mounted container path).
            ignore_paths: Path components to skip while scanning.
        """
        repos = FolderRepoScanner(ignore_paths).read(path)
        workspaces = [
            Workspace(
                # NOTE(review): a fresh UUID per run means restarting the
                # container creates a duplicate workspace per repo — dedupe by
                # repo name/path before this ships (acknowledged in PR review).
                id=str(uuid.uuid4()),
                name=repo.name,
                # BUG FIX: folder_tree values are pydantic Folder models, which
                # json.dumps cannot serialize directly (TypeError). Convert each
                # model to a plain dict first. `.dict()` works on pydantic v1
                # and v2 (deprecated in v2 — switch to model_dump() once the
                # project pins v2).
                folder_tree_json=json.dumps(
                    {folder_path: folder.dict() for folder_path, folder in repo.folder_tree.items()}
                ),
            )
            for repo in repos
        ]
        if not workspaces:
            # Nothing to record; skip spinning up an event loop.
            return
        asyncio.run(self._db_recorder.record_workspaces(workspaces))
Loading