Skip to content

Commit

Permalink
remove unnecessary validation check
Browse files Browse the repository at this point in the history
  • Loading branch information
jgbradley1 committed Jan 29, 2025
1 parent 854238b commit c3d7f91
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 123 deletions.
12 changes: 6 additions & 6 deletions backend/graphrag_app/api/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
StorageNameList,
)
from graphrag_app.utils.common import (
delete_blob_container,
delete_cosmos_container_item,
delete_cosmos_container_item_if_exist,
delete_storage_container_if_exist,
desanitize_name,
get_blob_container_client,
get_cosmos_container_store_client,
Expand Down Expand Up @@ -178,10 +178,10 @@ async def delete_files(sanitized_container_name: str = Depends(sanitize_name)):
# sanitized_container_name = sanitize_name(container_name)
original_container_name = desanitize_name(sanitized_container_name)
try:
# delete container in Azure Storage
delete_blob_container(sanitized_container_name)
# delete entry from container-store in cosmosDB
delete_cosmos_container_item("container-store", sanitized_container_name)
delete_storage_container_if_exist(sanitized_container_name)
delete_cosmos_container_item_if_exist(
"container-store", sanitized_container_name
)
except Exception:
logger = load_pipeline_logger()
logger.error(
Expand Down
59 changes: 26 additions & 33 deletions backend/graphrag_app/api/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@
from graphrag_app.typing.pipeline import PipelineJobState
from graphrag_app.utils.azure_clients import AzureClientManager
from graphrag_app.utils.common import (
delete_blob_container,
delete_cosmos_container_item,
delete_cosmos_container_item_if_exist,
delete_storage_container_if_exist,
desanitize_name,
get_cosmos_container_store_client,
sanitize_name,
validate_blob_container_name,
)
from graphrag_app.utils.pipeline import PipelineJob

Expand All @@ -50,8 +49,8 @@
responses={200: {"model": BaseResponse}},
)
async def schedule_index_job(
storage_name: str,
index_name: str,
storage_container_name: str,
index_container_name: str,
entity_extraction_prompt: UploadFile | None = None,
entity_summarization_prompt: UploadFile | None = None,
community_summarization_prompt: UploadFile | None = None,
Expand All @@ -61,21 +60,16 @@ async def schedule_index_job(
pipelinejob = PipelineJob()

# validate index name against blob container naming rules
sanitized_index_name = sanitize_name(index_name)
try:
validate_blob_container_name(sanitized_index_name)
except ValueError:
raise HTTPException(
status_code=500,
detail=f"Invalid index name: {index_name}",
)
sanitized_index_container_name = sanitize_name(index_container_name)

# check for data container existence
sanitized_storage_name = sanitize_name(storage_name)
if not blob_service_client.get_container_client(sanitized_storage_name).exists():
sanitized_storage_container_name = sanitize_name(storage_container_name)
if not blob_service_client.get_container_client(
sanitized_storage_container_name
).exists():
raise HTTPException(
status_code=500,
detail=f"Storage blob container {storage_name} does not exist",
detail=f"Storage container '{storage_container_name}' does not exist",
)

# check for prompts
Expand All @@ -98,19 +92,20 @@ async def schedule_index_job(
# check for existing index job
# it is okay if job doesn't exist, but if it does,
# it must not be scheduled or running
if pipelinejob.item_exist(sanitized_index_name):
existing_job = pipelinejob.load_item(sanitized_index_name)
if pipelinejob.item_exist(sanitized_index_container_name):
existing_job = pipelinejob.load_item(sanitized_index_container_name)
if (PipelineJobState(existing_job.status) == PipelineJobState.SCHEDULED) or (
PipelineJobState(existing_job.status) == PipelineJobState.RUNNING
):
raise HTTPException(
status_code=202, # request has been accepted for processing but is not complete.
detail=f"Index '{index_name}' already exists and has not finished building.",
detail=f"Index '{index_container_name}' already exists and has not finished building.",
)
# if indexing job is in a failed state, delete the associated K8s job and pod to allow for a new job to be scheduled
if PipelineJobState(existing_job.status) == PipelineJobState.FAILED:
_delete_k8s_job(
f"indexing-job-{sanitized_index_name}", os.environ["AKS_NAMESPACE"]
f"indexing-job-{sanitized_index_container_name}",
os.environ["AKS_NAMESPACE"],
)
# reset the pipeline job details
existing_job._status = PipelineJobState.SCHEDULED
Expand All @@ -128,9 +123,9 @@ async def schedule_index_job(
existing_job.update_db()
else:
pipelinejob.create_item(
id=sanitized_index_name,
human_readable_index_name=index_name,
human_readable_storage_name=storage_name,
id=sanitized_index_container_name,
human_readable_index_name=index_container_name,
human_readable_storage_name=storage_container_name,
entity_extraction_prompt=entity_extraction_prompt_content,
entity_summarization_prompt=entity_summarization_prompt_content,
community_summarization_prompt=community_summarization_prompt_content,
Expand Down Expand Up @@ -218,22 +213,20 @@ async def delete_index(
sanitized_container_name: str = Depends(sanitize_name),
):
"""
Delete a specified index.
Delete a specified index and all associated metadata.
"""
try:
# kill indexing job if it is running
if os.getenv("KUBERNETES_SERVICE_HOST"): # only found if in AKS
_delete_k8s_job(f"indexing-job-{sanitized_container_name}", "graphrag")

# delete blob container
delete_blob_container(sanitized_container_name)

# delete entry from "container-store" container in cosmosDB
delete_cosmos_container_item("container-store", sanitized_container_name)

# delete entry from "jobs" container in cosmosDB
delete_cosmos_container_item("jobs", sanitized_container_name)
delete_storage_container_if_exist(sanitized_container_name)
delete_cosmos_container_item_if_exist(
"container-store", sanitized_container_name
)
delete_cosmos_container_item_if_exist("jobs", sanitized_container_name)

# delete associated AI Search index
index_client = SearchIndexClient(
endpoint=os.environ["AI_SEARCH_URL"],
credential=DefaultAzureCredential(),
Expand Down Expand Up @@ -263,7 +256,7 @@ async def delete_index(
summary="Track the status of an indexing job",
response_model=IndexStatusResponse,
)
async def get_index_job_status(sanitized_container_name: str = Depends(sanitize_name)):
async def get_index_status(sanitized_container_name: str = Depends(sanitize_name)):
pipelinejob = PipelineJob()
if pipelinejob.item_exist(sanitized_container_name):
pipeline_job = pipelinejob.load_item(sanitized_container_name)
Expand Down
23 changes: 13 additions & 10 deletions backend/graphrag_app/api/prompt_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,50 @@
import yaml
from fastapi import (
APIRouter,
Depends,
HTTPException,
)
from graphrag.config.create_graphrag_config import create_graphrag_config

from graphrag_app.logger.load_logger import load_pipeline_logger
from graphrag_app.utils.azure_clients import AzureClientManager
from graphrag_app.utils.common import sanitize_name
from graphrag_app.utils.common import desanitize_name, sanitize_name

prompt_tuning_route = APIRouter(prefix="/index/config", tags=["Prompt Tuning"])


@prompt_tuning_route.get(
"/prompts",
summary="Generate prompts from user-provided data.",
summary="Generate custom graphrag prompts based on user-provided data.",
description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",
)
async def generate_prompts(storage_name: str, limit: int = 5):
async def generate_prompts(
limit: int = 5,
sanitized_container_name: str = Depends(sanitize_name),
):
"""
    Automatically generate custom prompts for entity extraction,
community reports, and summarize descriptions based on a sample of provided data.
"""
# check for storage container existence
azure_client_manager = AzureClientManager()
blob_service_client = azure_client_manager.get_blob_service_client()
sanitized_storage_name = sanitize_name(storage_name)
if not blob_service_client.get_container_client(sanitized_storage_name).exists():
original_container_name = desanitize_name(sanitized_container_name)
if not blob_service_client.get_container_client(sanitized_container_name).exists():
raise HTTPException(
status_code=500,
detail=f"Data container '{storage_name}' does not exist.",
detail=f"Storage container '{original_container_name}' does not exist.",
)

# load pipeline configuration file (settings.yaml) for input data and other settings
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
with (ROOT_DIR / "scripts/settings.yaml").open("r") as f:
data = yaml.safe_load(f)
data["input"]["container_name"] = sanitized_storage_name
data["input"]["container_name"] = sanitized_container_name
graphrag_config = create_graphrag_config(values=data, root_dir=".")

# generate prompts
try:
# NOTE: we need to call api.generate_indexing_prompts
prompts: tuple[str, str, str] = await api.generate_indexing_prompts(
config=graphrag_config,
root=".",
Expand All @@ -58,7 +61,7 @@ async def generate_prompts(storage_name: str, limit: int = 5):
except Exception as e:
logger = load_pipeline_logger()
error_details = {
"storage_name": storage_name,
"storage_name": original_container_name,
}
logger.error(
message="Auto-prompt generation failed.",
Expand All @@ -68,7 +71,7 @@ async def generate_prompts(storage_name: str, limit: int = 5):
)
raise HTTPException(
status_code=500,
detail=f"Error generating prompts for data in '{storage_name}'. Please try a lower limit.",
detail=f"Error generating prompts for data in '{original_container_name}'. Please try a lower limit.",
)

prompt_content = {
Expand Down
54 changes: 2 additions & 52 deletions backend/graphrag_app/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import hashlib
import os
import re

import pandas as pd
from azure.core.exceptions import ResourceNotFoundError
Expand Down Expand Up @@ -41,7 +40,7 @@ def pandas_storage_options() -> dict:
return options


def delete_blob_container(container_name: str):
def delete_storage_container_if_exist(container_name: str):
"""
Delete a blob container. If it does not exist, do nothing.
If exception is raised, the calling function should catch it.
Expand All @@ -55,7 +54,7 @@ def delete_blob_container(container_name: str):
pass


def delete_cosmos_container_item(container: str, item_id: str):
def delete_cosmos_container_item_if_exist(container: str, item_id: str):
"""
Delete an item from a cosmosdb container. If it does not exist, do nothing.
If exception is raised, the calling function should catch it.
Expand Down Expand Up @@ -109,55 +108,6 @@ def validate_index_file_exist(sanitized_container_name: str, file_name: str):
)


def validate_blob_container_name(container_name: str):
"""
Check if container name is valid based on Azure resource naming rules.
- A blob container name must be between 3 and 63 characters in length.
- Start with a letter or number
- All letters used in blob container names must be lowercase.
- Contain only letters, numbers, or the hyphen.
- Consecutive hyphens are not permitted.
- Cannot end with a hyphen.
Args:
-----
container_name (str)
The blob container name to be validated.
Raises: ValueError
"""
# Check the length of the name
if len(container_name) < 3 or len(container_name) > 63:
raise ValueError(
f"Container name must be between 3 and 63 characters in length. Name provided was {len(container_name)} characters long."
)

# Check if the name starts with a letter or number
if not container_name[0].isalnum():
raise ValueError(
f"Container name must start with a letter or number. Starting character was {container_name[0]}."
)

# Check for valid characters (letters, numbers, hyphen) and lowercase letters
if not re.match("^[a-z0-9-]+$", container_name):
raise ValueError(
f"Container name must only contain:\n- lowercase letters\n- numbers\n- or hyphens\nName provided was {container_name}."
)

# Check for consecutive hyphens
if "--" in container_name:
raise ValueError(
f"Container name cannot contain consecutive hyphens. Name provided was {container_name}."
)

# Check for hyphens at the end of the name
if container_name[-1] == "-":
raise ValueError(
f"Container name cannot end with a hyphen. Name provided was {container_name}."
)


def get_cosmos_container_store_client() -> ContainerProxy:
try:
azure_client_manager = AzureClientManager()
Expand Down
22 changes: 0 additions & 22 deletions backend/tests/unit/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,10 @@
from graphrag_app.utils.common import (
desanitize_name,
sanitize_name,
validate_blob_container_name,
validate_index_file_exist,
)


def test_validate_blob_container_name():
"""Test the graphrag_app.utils.common.validate_blob_container_name function."""
# test valid container name
assert validate_blob_container_name("validcontainername") is None
# test invalid container name
with pytest.raises(ValueError):
validate_blob_container_name("invalidContainerName")
with pytest.raises(ValueError):
validate_blob_container_name(
"invalidcontainernameinvalidcontainernameinvalidcontainerinvalids"
)
with pytest.raises(ValueError):
validate_blob_container_name("*invalidContainerName")
with pytest.raises(ValueError):
validate_blob_container_name("invalid+ContainerName")
with pytest.raises(ValueError):
validate_blob_container_name("invalid--containername")
with pytest.raises(ValueError):
validate_blob_container_name("invalidcontainername-")


def test_desanitize_name(container_with_graphml_file):
"""Test the graphrag_app.utils.common.desanitize_name function."""
# test retrieving a valid container name
Expand Down

0 comments on commit c3d7f91

Please sign in to comment.