Remove semantic caching (for now) from zero-to-prod
simonkurtz-MSFT committed Feb 7, 2025
1 parent 1c76ab0 commit 88c3fa9
Showing 2 changed files with 36 additions and 112 deletions.
93 changes: 4 additions & 89 deletions labs/zero-to-production/main.bicep
@@ -14,14 +14,6 @@ param openAIModelCapacity int
param openAIAPIVersion string = '2024-02-01'
param policyXml string

param embeddingsDeploymentName string = 'text-embedding-ada-002'
param embeddingsModelName string = 'text-embedding-ada-002'
param embeddingsModelVersion string = '2'

param redisCacheName string = 'rediscache'
param redisCacheSKU string = 'Balanced_B0'
param redisCachePort int = 10000

// ------------------
// VARIABLES
// ------------------
@@ -56,35 +48,7 @@ module appInsightsModule '../../modules/monitor/v1/appinsights.bicep' = {
var appInsightsId = appInsightsModule.outputs.id
var appInsightsInstrumentationKey = appInsightsModule.outputs.instrumentationKey

// 3. Redis Cache
// 2/4/25: 2024-10-01 is not yet available in all regions. 2024-09-01-preview is more widely available.

// https://learn.microsoft.com/azure/templates/microsoft.cache/redisenterprise
resource redisEnterprise 'Microsoft.Cache/redisEnterprise@2024-09-01-preview' = {
  name: '${redisCacheName}-${resourceSuffix}'
  location: resourceGroup().location
  sku: {
    name: redisCacheSKU
  }
}

// https://learn.microsoft.com/azure/templates/microsoft.cache/redisenterprise/databases
resource redisCache 'Microsoft.Cache/redisEnterprise/databases@2024-09-01-preview' = {
  name: 'default'
  parent: redisEnterprise
  properties: {
    evictionPolicy: 'NoEviction'
    clusteringPolicy: 'EnterpriseCluster'
    modules: [
      {
        name: 'RediSearch'
      }
    ]
    port: redisCachePort
  }
}

// 4. API Management
// 3. API Management
module apimModule '../../modules/apim/v1/apim.bicep' = {
  name: 'apimModule'
  params: {
@@ -94,7 +58,7 @@ module apimModule '../../modules/apim/v1/apim.bicep' = {
}
}

// 5. Cognitive Services
// 4. Cognitive Services
module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
  name: 'openAIModule'
  params: {
@@ -109,30 +73,7 @@ module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
}
}

resource cognitiveService 'Microsoft.CognitiveServices/accounts@2024-10-01' existing = {
  name: '${openAIConfig[0].name}-${resourceSuffix}'
}

resource embeddingsDeployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01' = {
  name: embeddingsDeploymentName
  parent: cognitiveService
  properties: {
    model: {
      format: (length(openAIModule.outputs.extendedOpenAIConfig) > 0) ? 'OpenAI' : ''
      name: embeddingsModelName
      version: embeddingsModelVersion
    }
  }
  sku: {
    name: 'Standard'
    capacity: 20
  }
  dependsOn: [
    cognitiveService
  ]
}

// 6. APIM OpenAI API
// 5. APIM OpenAI API
module openAIAPIModule '../../modules/apim/v1/openai-api.bicep' = {
  name: 'openAIAPIModule'
  params: {
@@ -144,7 +85,7 @@ module openAIAPIModule '../../modules/apim/v1/openai-api.bicep' = {
}
}

// 7. Create New APIM Subscriptions
// 6. Create New APIM Subscriptions

// We presume the APIM resource has been created as part of this bicep flow.
resource apim 'Microsoft.ApiManagement/service@2024-06-01-preview' existing = {
@@ -177,27 +118,6 @@ resource apimSubscriptions 'Microsoft.ApiManagement/service/subscriptions@2024-0
]
}]

// https://learn.microsoft.com/azure/templates/microsoft.apimanagement/service/caches
resource apimCache 'Microsoft.ApiManagement/service/caches@2024-06-01-preview' = {
  name: 'Default'
  parent: apim
  properties: {
    connectionString: '${redisEnterprise.properties.hostName}:${redisCachePort},password=${redisCache.listKeys().primaryKey},ssl=True,abortConnect=False'
    useFromLocation: 'Default'
    description: redisEnterprise.properties.hostName
  }
}

resource backendEmbeddings 'Microsoft.ApiManagement/service/backends@2024-06-01-preview' = {
  name: 'embeddings-backend' // this name is hard coded in the policy.xml file
  parent: apim
  properties: {
    description: 'Embeddings Backend'
    url: '${openAIModule.outputs.extendedOpenAIConfig[0].endpoint}openai/deployments/${embeddingsDeploymentName}/embeddings'
    protocol: 'http'
  }
}

// ------------------
// MARK: OUTPUTS
// ------------------
@@ -215,8 +135,3 @@ output apimSubscription1Key string = apimSubscriptions[0].listSecrets().primaryK
output apimSubscription2Key string = apimSubscriptions[1].listSecrets().primaryKey
#disable-next-line outputs-should-not-contain-secrets
output apimSubscription3Key string = apimSubscriptions[2].listSecrets().primaryKey

output redisCacheHost string = redisEnterprise.properties.hostName
#disable-next-line outputs-should-not-contain-secrets
output redisCacheKey string = redisCache.listKeys().primaryKey
output redisCachePort int = redisCachePort
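
For reference, a minimal sketch of reading the remaining deployment outputs with the Azure CLI after this change. The resource group and deployment names below are placeholders, not values from this commit:

```python
# Minimal sketch: read the outputs of the trimmed deployment via the Azure CLI.
# Assumes `az` is installed and logged in; resource group and deployment names
# are placeholders.
import json
import subprocess

resource_group_name = "lab-zero-to-production"  # placeholder
deployment_name = "zero-to-production"          # placeholder

result = subprocess.run(
    [
        "az", "deployment", "group", "show",
        "--resource-group", resource_group_name,
        "--name", deployment_name,
        "--query", "properties.outputs",
    ],
    capture_output=True, text=True, check=True,
)

outputs = json.loads(result.stdout)
print(sorted(outputs))  # redisCacheHost/redisCacheKey/redisCachePort are no longer present
print(outputs["apimSubscription1Key"]["value"])  # one of the remaining APIM outputs
```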
55 changes: 32 additions & 23 deletions labs/zero-to-production/zero-to-production.ipynb
@@ -24,8 +24,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='0'></a>\n",
"### 0️⃣ Initialize notebook variables\n",
"<a id='1'></a>\n",
"### 1) Initialize notebook variables\n",
"\n",
"- Resources will be suffixed by a unique string based on your subscription id.\n",
"- Adjust the location parameters according your preferences and on the [product availability by Azure region.](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/?cdn=disable&products=cognitive-services,api-management)\n",
@@ -43,7 +43,7 @@
"import utils\n",
"\n",
"deployment_name = os.path.basename(os.path.dirname(globals()['__vsc_ipynb_file__']))\n",
"resource_group_name = f\"lab-{deployment_name}\" # change the name to match your naming style\n",
"resource_group_name = f\"lab-{deployment_name}-g2\" # change the name to match your naming style\n",
"resource_group_location = \"eastus2\"\n",
"\n",
"apim_sku = 'Basicv2'\n",
@@ -58,27 +58,29 @@
"openai_deployment_name = \"gpt-4o-mini\"\n",
"openai_model_name = \"gpt-4o-mini\"\n",
"openai_model_version = \"2024-07-18\"\n",
"openai_model_capacity = 8\n",
"openai_model_capacity = 20\n",
"openai_model_sku = 'Standard'\n",
"openai_api_version = \"2024-02-01\"\n",
"\n",
"backend_id = 'openai-backend-pool' if len(openai_resources) > 1 else openai_resources[0]['name']\n",
"\n",
"# The provisioning of the Redis (and embedding) resources takes several additional minutes. Therefore, we provide a flag to skip the creation of these resources, which means that the Semantic Caching portion of the notebook will not work.\n",
"create_semantic_caching_resources = True\n",
"embeddings_deployment_name = \"text-embedding-ada-002\"\n",
"embeddings_model_name = \"text-embedding-ada-002\"\n",
"embeddings_model_version = \"2\"\n",
"rediscache_name = \"rediscache\"\n",
"rediscache_sku = \"Balanced_B0\" # By default it uses the most cost efficient. Check the docs to choose the right SKU: https://learn.microsoft.com/en-us/azure/azure-cache-for-redis/managed-redis/managed-redis-overview#choosing-the-right-tier\n",
"\n",
"backend_id = 'openai-backend-pool' if len(openai_resources) > 1 else openai_resources[0]['name']\n",
"\n",
"utils.print_ok('Notebook initialized')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='1'></a>\n",
"### 1️⃣ Verify the Azure CLI and the connected Azure subscription\n",
"<a id='2'></a>\n",
"### 2) Verify the Azure CLI and the connected Azure subscription\n",
"\n",
"The following commands ensure that you have the latest version of the Azure CLI and that the Azure CLI is connected to your Azure subscription."
]
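
A minimal standalone sketch of these checks, assuming only that the standard Azure CLI is installed:

```python
# Minimal sketch: confirm the Azure CLI version and the connected subscription.
# `az version` and `az account show` are standard Azure CLI commands.
import subprocess

print(subprocess.run(["az", "version", "--output", "json"],
                     capture_output=True, text=True).stdout)
print(subprocess.run(["az", "account", "show", "--query", "name", "--output", "tsv"],
                     capture_output=True, text=True).stdout)
```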
@@ -105,14 +107,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='2'></a>\n",
"### 2️⃣ Policy 1 - Load Balancing\n",
"<a id='3'></a>\n",
"### 3) Policy 1 - Load Balancing\n",
"\n",
"This lab uses [Bicep](https://learn.microsoft.com/azure/azure-resource-manager/bicep/overview?tabs=bicep) to declarative define all the resources that will be deployed in the specified resource group. Change the parameters or the [main.bicep](main.bicep) directly to try different configurations.\n",
"\n",
"`openAIModelCapacity` is set intentionally low to `8` (8k tokens per minute) to trigger the retry logic in the load balancer (transparent to the user) as well as the priority failover from priority 1 to 2.\n",
"\n",
"<a id='2deployment'></a>\n",
"#### Create deployment using 🦾 Bicep\n",
"\n",
"The `retry-count` parameter should have a value that represents one less than the total number of backends. For example, if we have three defined Azure OpenAI backends, we want to try initially, then have up to two retries, so long as we have remaining, active backends. This ensures that we cover all available backends."
@@ -143,6 +144,7 @@
" \"openAIModelCapacity\": { \"value\": openai_model_capacity },\n",
" \"openAIModelSKU\": { \"value\": openai_model_sku },\n",
" \"openAIAPIVersion\": { \"value\": openai_api_version },\n",
" \"createSemanticCachingResources\": { \"value\": create_semantic_caching_resources },\n",
" \"embeddingsModelName\": { \"value\": embeddings_model_name },\n",
" \"embeddingsModelVersion\": { \"value\": embeddings_model_version },\n",
" \"redisCacheName\": { \"value\": rediscache_name },\n",
@@ -362,12 +364,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='3'></a>\n",
"### 3️⃣ Policy 2 - Token Emitting\n",
"<a id='4'></a>\n",
"### 4) Policy 2 - Token Emitting\n",
"\n",
"We now add token emitting to the existing API policy in order to track token usage by subscriptions. This aids usage and cost analysis and chargeback models inside organizations.\n",
"\n",
"<a id='3deployment'></a>\n",
"#### Update deployment using 🦾 Bicep"
]
},
@@ -405,7 +406,7 @@
"import time\n",
"from openai import AzureOpenAI\n",
"\n",
"runs = 10\n",
"runs = 5\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
"\n",
@@ -482,7 +483,7 @@
"<a id='3kql'></a>\n",
"#### 🔍 Analyze Application Insights custom metrics with a KQL query\n",
"\n",
"With this query you can get the custom metrics that were emitted by Azure APIM. **Note that it may take a few minutes for data to become available.**"
"With this query you can get the custom metrics that were emitted by Azure APIM. **Note that it may take a few minutes for data to become available.** If you see the metrics in the Azure Portal, you don't have to wait for the data here and plotting it to continue. Come back to this section later, if you like."
]
},
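
For a programmatic alternative to the portal, a sketch using the `azure-monitor-query` package. It assumes a workspace-based Application Insights resource and that the policy emits a custom metric named "Total Tokens" with a "Subscription ID" dimension; both names are assumptions, so adjust them to your setup:

```python
# Sketch: query emitted token metrics from the Log Analytics workspace behind
# a workspace-based Application Insights resource. Metric and dimension names
# are assumptions; adjust them to match the emit policy.
from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import LogsQueryClient

workspace_id = "<log-analytics-workspace-id>"  # placeholder

client = LogsQueryClient(DefaultAzureCredential())
query = """
customMetrics
| where name == 'Total Tokens'
| summarize totalTokens = sum(valueSum) by tostring(customDimensions['Subscription ID'])
"""

response = client.query_workspace(workspace_id, query, timespan=timedelta(hours=1))
for table in response.tables:
    for row in table.rows:
        print(row)
```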
{
@@ -543,12 +544,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='4'></a>\n",
"### 4️⃣ Policy 3 - Token Rate Limiting\n",
"<a id='5'></a>\n",
"### 5) Policy 3 - Token Rate Limiting\n",
"\n",
"Adding rate limiting for subscriptions is a sensible way to limit runaway usage.\n",
"\n",
"<a id='4deployment'></a>\n",
"#### Update deployment using 🦾 Bicep"
]
},
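
Once the limit policy is deployed, over-limit calls surface as HTTP 429; a client-side handling sketch, with placeholder endpoint and key values:

```python
# Sketch: back off when the token-rate-limit policy returns 429. Endpoint and
# key are placeholders; the Retry-After fallback of 1s is an assumption.
import time

from openai import AzureOpenAI, RateLimitError

client = AzureOpenAI(
    azure_endpoint="<apim-gateway-url>",  # e.g. apim_resource_gateway_url from this notebook
    api_key="<apim-subscription-key>",    # one of the subscription keys output by the deployment
    api_version="2024-02-01",
)

try:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Ping"}],
    )
    print(completion.choices[0].message.content)
except RateLimitError as e:
    retry_after = int(e.response.headers.get("Retry-After", "1"))
    print(f"429 received; backing off for {retry_after}s")
    time.sleep(retry_after)
```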
@@ -587,7 +587,7 @@
"import time\n",
"from openai import AzureOpenAI\n",
"\n",
"runs = 10\n",
"runs = 5\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
"\n",
@@ -687,14 +687,19 @@
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import random, time\n",
"from openai import AzureOpenAI\n",
"\n",
"runs = 5\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
"api_runs = [] # Response Times for each run\n",
"\n",
"questions = [\"How to Brew the Perfect Cup of Coffee?\",\n",
" \"What are the steps to Craft the Ideal Espresso?\",\n",
" \"Tell me how to create the best steaming Java?\",\n",
" \"Explain how to make a caffeinated brewed beverage?\"]\n",
"\n",
"clients = [\n",
" AzureOpenAI(\n",
" azure_endpoint = apim_resource_gateway_url,\n",
@@ -717,19 +722,23 @@
" print(f\"▶️ Run {i+1}/{runs}:\")\n",
"\n",
" for j in range(0, 3):\n",
" print(f\"🔑 Subscription {j+1}\")\n",
" random_question = random.choice(questions)\n",
" print(\"💬 \", random_question)\n",
"\n",
" start_time = time.time()\n",
"\n",
" raw_response = clients[j].chat.completions.with_raw_response.create(\n",
" model = openai_model_name,\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
" {\"role\": \"user\", \"content\": random_question}\n",
" ],\n",
" extra_headers = {\"x-user-id\": \"alex\"}\n",
" )\n",
"\n",
" response_time = time.time() - start_time\n",
" print(f\"🔑 Subscription {j+1}\")\n",
"\n",
" print(f\"⌚ {response_time:.2f} seconds\")\n",
" print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
"\n",