Skip to content

Commit

Permalink
remove unnecessary validation check
Browse files Browse the repository at this point in the history
  • Loading branch information
jgbradley1 committed Jan 29, 2025
1 parent 854238b commit c3d7f91
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 123 deletions.
12 changes: 6 additions & 6 deletions backend/graphrag_app/api/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
StorageNameList,
)
from graphrag_app.utils.common import (
delete_blob_container,
delete_cosmos_container_item,
delete_cosmos_container_item_if_exist,
delete_storage_container_if_exist,
desanitize_name,
get_blob_container_client,
get_cosmos_container_store_client,
Expand Down Expand Up @@ -178,10 +178,10 @@ async def delete_files(sanitized_container_name: str = Depends(sanitize_name)):
# sanitized_container_name = sanitize_name(container_name)
original_container_name = desanitize_name(sanitized_container_name)
try:
# delete container in Azure Storage
delete_blob_container(sanitized_container_name)
# delete entry from container-store in cosmosDB
delete_cosmos_container_item("container-store", sanitized_container_name)
delete_storage_container_if_exist(sanitized_container_name)
delete_cosmos_container_item_if_exist(
"container-store", sanitized_container_name
)
except Exception:
logger = load_pipeline_logger()
logger.error(
Expand Down
59 changes: 26 additions & 33 deletions backend/graphrag_app/api/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@
from graphrag_app.typing.pipeline import PipelineJobState
from graphrag_app.utils.azure_clients import AzureClientManager
from graphrag_app.utils.common import (
delete_blob_container,
delete_cosmos_container_item,
delete_cosmos_container_item_if_exist,
delete_storage_container_if_exist,
desanitize_name,
get_cosmos_container_store_client,
sanitize_name,
validate_blob_container_name,
)
from graphrag_app.utils.pipeline import PipelineJob

Expand All @@ -50,8 +49,8 @@
responses={200: {"model": BaseResponse}},
)
async def schedule_index_job(
storage_name: str,
index_name: str,
storage_container_name: str,
index_container_name: str,
entity_extraction_prompt: UploadFile | None = None,
entity_summarization_prompt: UploadFile | None = None,
community_summarization_prompt: UploadFile | None = None,
Expand All @@ -61,21 +60,16 @@ async def schedule_index_job(
pipelinejob = PipelineJob()

# validate index name against blob container naming rules
sanitized_index_name = sanitize_name(index_name)
try:
validate_blob_container_name(sanitized_index_name)
except ValueError:
raise HTTPException(
status_code=500,
detail=f"Invalid index name: {index_name}",
)
sanitized_index_container_name = sanitize_name(index_container_name)

# check for data container existence
sanitized_storage_name = sanitize_name(storage_name)
if not blob_service_client.get_container_client(sanitized_storage_name).exists():
sanitized_storage_container_name = sanitize_name(storage_container_name)
if not blob_service_client.get_container_client(
sanitized_storage_container_name
).exists():
raise HTTPException(
status_code=500,
detail=f"Storage blob container {storage_name} does not exist",
detail=f"Storage container '{storage_container_name}' does not exist",
)

# check for prompts
Expand All @@ -98,19 +92,20 @@ async def schedule_index_job(
# check for existing index job
# it is okay if job doesn't exist, but if it does,
# it must not be scheduled or running
if pipelinejob.item_exist(sanitized_index_name):
existing_job = pipelinejob.load_item(sanitized_index_name)
if pipelinejob.item_exist(sanitized_index_container_name):
existing_job = pipelinejob.load_item(sanitized_index_container_name)
if (PipelineJobState(existing_job.status) == PipelineJobState.SCHEDULED) or (
PipelineJobState(existing_job.status) == PipelineJobState.RUNNING
):
raise HTTPException(
status_code=202, # request has been accepted for processing but is not complete.
detail=f"Index '{index_name}' already exists and has not finished building.",
detail=f"Index '{index_container_name}' already exists and has not finished building.",
)
# if indexing job is in a failed state, delete the associated K8s job and pod to allow for a new job to be scheduled
if PipelineJobState(existing_job.status) == PipelineJobState.FAILED:
_delete_k8s_job(
f"indexing-job-{sanitized_index_name}", os.environ["AKS_NAMESPACE"]
f"indexing-job-{sanitized_index_container_name}",
os.environ["AKS_NAMESPACE"],
)
# reset the pipeline job details
existing_job._status = PipelineJobState.SCHEDULED
Expand All @@ -128,9 +123,9 @@ async def schedule_index_job(
existing_job.update_db()
else:
pipelinejob.create_item(
id=sanitized_index_name,
human_readable_index_name=index_name,
human_readable_storage_name=storage_name,
id=sanitized_index_container_name,
human_readable_index_name=index_container_name,
human_readable_storage_name=storage_container_name,
entity_extraction_prompt=entity_extraction_prompt_content,
entity_summarization_prompt=entity_summarization_prompt_content,
community_summarization_prompt=community_summarization_prompt_content,
Expand Down Expand Up @@ -218,22 +213,20 @@ async def delete_index(
sanitized_container_name: str = Depends(sanitize_name),
):
"""
Delete a specified index.
Delete a specified index and all associated metadata.
"""
try:
# kill indexing job if it is running
if os.getenv("KUBERNETES_SERVICE_HOST"): # only found if in AKS
_delete_k8s_job(f"indexing-job-{sanitized_container_name}", "graphrag")

# delete blob container
delete_blob_container(sanitized_container_name)

# delete entry from "container-store" container in cosmosDB
delete_cosmos_container_item("container-store", sanitized_container_name)

# delete entry from "jobs" container in cosmosDB
delete_cosmos_container_item("jobs", sanitized_container_name)
delete_storage_container_if_exist(sanitized_container_name)
delete_cosmos_container_item_if_exist(
"container-store", sanitized_container_name
)
delete_cosmos_container_item_if_exist("jobs", sanitized_container_name)

# delete associated AI Search index
index_client = SearchIndexClient(
endpoint=os.environ["AI_SEARCH_URL"],
credential=DefaultAzureCredential(),
Expand Down Expand Up @@ -263,7 +256,7 @@ async def delete_index(
summary="Track the status of an indexing job",
response_model=IndexStatusResponse,
)
async def get_index_job_status(sanitized_container_name: str = Depends(sanitize_name)):
async def get_index_status(sanitized_container_name: str = Depends(sanitize_name)):
pipelinejob = PipelineJob()
if pipelinejob.item_exist(sanitized_container_name):
pipeline_job = pipelinejob.load_item(sanitized_container_name)
Expand Down
23 changes: 13 additions & 10 deletions backend/graphrag_app/api/prompt_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,50 @@
import yaml
from fastapi import (
APIRouter,
Depends,
HTTPException,
)
from graphrag.config.create_graphrag_config import create_graphrag_config

from graphrag_app.logger.load_logger import load_pipeline_logger
from graphrag_app.utils.azure_clients import AzureClientManager
from graphrag_app.utils.common import sanitize_name
from graphrag_app.utils.common import desanitize_name, sanitize_name

prompt_tuning_route = APIRouter(prefix="/index/config", tags=["Prompt Tuning"])


@prompt_tuning_route.get(
"/prompts",
summary="Generate prompts from user-provided data.",
summary="Generate custom graphrag prompts based on user-provided data.",
description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",
)
async def generate_prompts(storage_name: str, limit: int = 5):
async def generate_prompts(
limit: int = 5,
sanitized_container_name: str = Depends(sanitize_name),
):
"""
    Automatically generate custom prompts for entity extraction,
community reports, and summarize descriptions based on a sample of provided data.
"""
# check for storage container existence
azure_client_manager = AzureClientManager()
blob_service_client = azure_client_manager.get_blob_service_client()
sanitized_storage_name = sanitize_name(storage_name)
if not blob_service_client.get_container_client(sanitized_storage_name).exists():
original_container_name = desanitize_name(sanitized_container_name)
if not blob_service_client.get_container_client(sanitized_container_name).exists():
raise HTTPException(
status_code=500,
detail=f"Data container '{storage_name}' does not exist.",
detail=f"Storage container '{original_container_name}' does not exist.",
)

# load pipeline configuration file (settings.yaml) for input data and other settings
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
with (ROOT_DIR / "scripts/settings.yaml").open("r") as f:
data = yaml.safe_load(f)
data["input"]["container_name"] = sanitized_storage_name
data["input"]["container_name"] = sanitized_container_name
graphrag_config = create_graphrag_config(values=data, root_dir=".")

# generate prompts
try:
# NOTE: we need to call api.generate_indexing_prompts
prompts: tuple[str, str, str] = await api.generate_indexing_prompts(
config=graphrag_config,
root=".",
Expand All @@ -58,7 +61,7 @@ async def generate_prompts(storage_name: str, limit: int = 5):
except Exception as e:
logger = load_pipeline_logger()
error_details = {
"storage_name": storage_name,
"storage_name": original_container_name,
}
logger.error(
message="Auto-prompt generation failed.",
Expand All @@ -68,7 +71,7 @@ async def generate_prompts(storage_name: str, limit: int = 5):
)
raise HTTPException(
status_code=500,
detail=f"Error generating prompts for data in '{storage_name}'. Please try a lower limit.",
detail=f"Error generating prompts for data in '{original_container_name}'. Please try a lower limit.",
)

prompt_content = {
Expand Down
54 changes: 2 additions & 52 deletions backend/graphrag_app/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import hashlib
import os
import re

import pandas as pd
from azure.core.exceptions import ResourceNotFoundError
Expand Down Expand Up @@ -41,7 +40,7 @@ def pandas_storage_options() -> dict:
return options


def delete_blob_container(container_name: str):
def delete_storage_container_if_exist(container_name: str):
"""
Delete a blob container. If it does not exist, do nothing.
If exception is raised, the calling function should catch it.
Expand All @@ -55,7 +54,7 @@ def delete_blob_container(container_name: str):
pass


def delete_cosmos_container_item(container: str, item_id: str):
def delete_cosmos_container_item_if_exist(container: str, item_id: str):
"""
Delete an item from a cosmosdb container. If it does not exist, do nothing.
If exception is raised, the calling function should catch it.
Expand Down Expand Up @@ -109,55 +108,6 @@ def validate_index_file_exist(sanitized_container_name: str, file_name: str):
)


def validate_blob_container_name(container_name: str):
"""
Check if container name is valid based on Azure resource naming rules.
- A blob container name must be between 3 and 63 characters in length.
- Start with a letter or number
- All letters used in blob container names must be lowercase.
- Contain only letters, numbers, or the hyphen.
- Consecutive hyphens are not permitted.
- Cannot end with a hyphen.
Args:
-----
container_name (str)
The blob container name to be validated.
Raises: ValueError
"""
# Check the length of the name
if len(container_name) < 3 or len(container_name) > 63:
raise ValueError(
f"Container name must be between 3 and 63 characters in length. Name provided was {len(container_name)} characters long."
)

# Check if the name starts with a letter or number
if not container_name[0].isalnum():
raise ValueError(
f"Container name must start with a letter or number. Starting character was {container_name[0]}."
)

# Check for valid characters (letters, numbers, hyphen) and lowercase letters
if not re.match("^[a-z0-9-]+$", container_name):
raise ValueError(
f"Container name must only contain:\n- lowercase letters\n- numbers\n- or hyphens\nName provided was {container_name}."
)

# Check for consecutive hyphens
if "--" in container_name:
raise ValueError(
f"Container name cannot contain consecutive hyphens. Name provided was {container_name}."
)

# Check for hyphens at the end of the name
if container_name[-1] == "-":
raise ValueError(
f"Container name cannot end with a hyphen. Name provided was {container_name}."
)


def get_cosmos_container_store_client() -> ContainerProxy:
try:
azure_client_manager = AzureClientManager()
Expand Down
22 changes: 0 additions & 22 deletions backend/tests/unit/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,10 @@
from graphrag_app.utils.common import (
desanitize_name,
sanitize_name,
validate_blob_container_name,
validate_index_file_exist,
)


def test_validate_blob_container_name():
"""Test the graphrag_app.utils.common.validate_blob_container_name function."""
# test valid container name
assert validate_blob_container_name("validcontainername") is None
# test invalid container name
with pytest.raises(ValueError):
validate_blob_container_name("invalidContainerName")
with pytest.raises(ValueError):
validate_blob_container_name(
"invalidcontainernameinvalidcontainernameinvalidcontainerinvalids"
)
with pytest.raises(ValueError):
validate_blob_container_name("*invalidContainerName")
with pytest.raises(ValueError):
validate_blob_container_name("invalid+ContainerName")
with pytest.raises(ValueError):
validate_blob_container_name("invalid--containername")
with pytest.raises(ValueError):
validate_blob_container_name("invalidcontainername-")


def test_desanitize_name(container_with_graphml_file):
"""Test the graphrag_app.utils.common.desanitize_name function."""
# test retrieving a valid container name
Expand Down

0 comments on commit c3d7f91

Please sign in to comment.