Remove semantic caching (for now) from zero-to-prod
simonkurtz-MSFT committed Feb 7, 2025
1 parent 1c76ab0 commit 88c3fa9
Showing 2 changed files with 36 additions and 112 deletions.
93 changes: 4 additions & 89 deletions labs/zero-to-production/main.bicep
@@ -14,14 +14,6 @@ param openAIModelCapacity int
param openAIAPIVersion string = '2024-02-01'
param policyXml string

param embeddingsDeploymentName string = 'text-embedding-ada-002'
param embeddingsModelName string = 'text-embedding-ada-002'
param embeddingsModelVersion string = '2'

param redisCacheName string = 'rediscache'
param redisCacheSKU string = 'Balanced_B0'
param redisCachePort int = 10000

// ------------------
// VARIABLES
// ------------------
@@ -56,35 +48,7 @@ module appInsightsModule '../../modules/monitor/v1/appinsights.bicep' = {
var appInsightsId = appInsightsModule.outputs.id
var appInsightsInstrumentationKey = appInsightsModule.outputs.instrumentationKey

// 3. Redis Cache
// 2/4/25: 2024-10-01 is not yet available in all regions. 2024-09-01-preview is more widely available.

// https://learn.microsoft.com/azure/templates/microsoft.cache/redisenterprise
resource redisEnterprise 'Microsoft.Cache/redisEnterprise@2024-09-01-preview' = {
  name: '${redisCacheName}-${resourceSuffix}'
  location: resourceGroup().location
  sku: {
    name: redisCacheSKU
  }
}

// https://learn.microsoft.com/azure/templates/microsoft.cache/redisenterprise/databases
resource redisCache 'Microsoft.Cache/redisEnterprise/databases@2024-09-01-preview' = {
  name: 'default'
  parent: redisEnterprise
  properties: {
    evictionPolicy: 'NoEviction'
    clusteringPolicy: 'EnterpriseCluster'
    modules: [
      {
        name: 'RediSearch'
      }
    ]
    port: redisCachePort
  }
}

// 4. API Management
// 3. API Management
module apimModule '../../modules/apim/v1/apim.bicep' = {
  name: 'apimModule'
  params: {
@@ -94,7 +58,7 @@ module apimModule '../../modules/apim/v1/apim.bicep' = {
}
}

// 5. Cognitive Services
// 4. Cognitive Services
module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
  name: 'openAIModule'
  params: {
@@ -109,30 +73,7 @@ module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
}
}

resource cognitiveService 'Microsoft.CognitiveServices/accounts@2024-10-01' existing = {
  name: '${openAIConfig[0].name}-${resourceSuffix}'
}

resource embeddingsDeployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01' = {
  name: embeddingsDeploymentName
  parent: cognitiveService
  properties: {
    model: {
      format: (length(openAIModule.outputs.extendedOpenAIConfig) > 0) ? 'OpenAI' : ''
      name: embeddingsModelName
      version: embeddingsModelVersion
    }
  }
  sku: {
    name: 'Standard'
    capacity: 20
  }
  dependsOn: [
    cognitiveService
  ]
}

// 6. APIM OpenAI API
// 5. APIM OpenAI API
module openAIAPIModule '../../modules/apim/v1/openai-api.bicep' = {
  name: 'openAIAPIModule'
  params: {
@@ -144,7 +85,7 @@ module openAIAPIModule '../../modules/apim/v1/openai-api.bicep' = {
}
}

// 7. Create New APIM Subscriptions
// 6. Create New APIM Subscriptions

// We presume the APIM resource has been created as part of this bicep flow.
resource apim 'Microsoft.ApiManagement/service@2024-06-01-preview' existing = {
@@ -177,27 +118,6 @@ resource apimSubscriptions 'Microsoft.ApiManagement/service/subscriptions@2024-0
]
}]

// https://learn.microsoft.com/azure/templates/microsoft.apimanagement/service/caches
resource apimCache 'Microsoft.ApiManagement/service/caches@2024-06-01-preview' = {
  name: 'Default'
  parent: apim
  properties: {
    connectionString: '${redisEnterprise.properties.hostName}:${redisCachePort},password=${redisCache.listKeys().primaryKey},ssl=True,abortConnect=False'
    useFromLocation: 'Default'
    description: redisEnterprise.properties.hostName
  }
}

resource backendEmbeddings 'Microsoft.ApiManagement/service/backends@2024-06-01-preview' = {
  name: 'embeddings-backend' // this name is hard coded in the policy.xml file
  parent: apim
  properties: {
    description: 'Embeddings Backend'
    url: '${openAIModule.outputs.extendedOpenAIConfig[0].endpoint}openai/deployments/${embeddingsDeploymentName}/embeddings'
    protocol: 'http'
  }
}

// ------------------
// MARK: OUTPUTS
// ------------------
@@ -215,8 +135,3 @@ output apimSubscription1Key string = apimSubscriptions[0].listSecrets().primaryK
output apimSubscription2Key string = apimSubscriptions[1].listSecrets().primaryKey
#disable-next-line outputs-should-not-contain-secrets
output apimSubscription3Key string = apimSubscriptions[2].listSecrets().primaryKey

output redisCacheHost string = redisEnterprise.properties.hostName
#disable-next-line outputs-should-not-contain-secrets
output redisCacheKey string = redisCache.listKeys().primaryKey
output redisCachePort int = redisCachePort
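
For reference, a minimal sketch of reading the remaining deployment outputs with the Azure CLI after this change. The resource group and deployment names below are placeholders, not values from this commit:

```python
# Minimal sketch: read the outputs of the trimmed deployment via the Azure CLI.
# Assumes `az` is installed and logged in; resource group and deployment names
# are placeholders.
import json
import subprocess

resource_group_name = "lab-zero-to-production"  # placeholder
deployment_name = "zero-to-production"          # placeholder

result = subprocess.run(
    [
        "az", "deployment", "group", "show",
        "--resource-group", resource_group_name,
        "--name", deployment_name,
        "--query", "properties.outputs",
    ],
    capture_output=True, text=True, check=True,
)

outputs = json.loads(result.stdout)
print(sorted(outputs))  # redisCacheHost/redisCacheKey/redisCachePort are no longer present
print(outputs["apimSubscription1Key"]["value"])  # one of the remaining APIM outputs
```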
55 changes: 32 additions & 23 deletions labs/zero-to-production/zero-to-production.ipynb
@@ -24,8 +24,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='0'></a>\n",
"### 0️⃣ Initialize notebook variables\n",
"<a id='1'></a>\n",
"### 1) Initialize notebook variables\n",
"\n",
"- Resources will be suffixed by a unique string based on your subscription id.\n",
"- Adjust the location parameters according your preferences and on the [product availability by Azure region.](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/?cdn=disable&products=cognitive-services,api-management)\n",
@@ -43,7 +43,7 @@
"import utils\n",
"\n",
"deployment_name = os.path.basename(os.path.dirname(globals()['__vsc_ipynb_file__']))\n",
"resource_group_name = f\"lab-{deployment_name}\" # change the name to match your naming style\n",
"resource_group_name = f\"lab-{deployment_name}-g2\" # change the name to match your naming style\n",
"resource_group_location = \"eastus2\"\n",
"\n",
"apim_sku = 'Basicv2'\n",
@@ -58,27 +58,29 @@
"openai_deployment_name = \"gpt-4o-mini\"\n",
"openai_model_name = \"gpt-4o-mini\"\n",
"openai_model_version = \"2024-07-18\"\n",
"openai_model_capacity = 8\n",
"openai_model_capacity = 20\n",
"openai_model_sku = 'Standard'\n",
"openai_api_version = \"2024-02-01\"\n",
"\n",
"backend_id = 'openai-backend-pool' if len(openai_resources) > 1 else openai_resources[0]['name']\n",
"\n",
"# The provisioning of the Redis (and embedding) resources takes several additional minutes. Therefore, we provide a flag to skip the creation of these resources, which means that the Semantic Caching portion of the notebook will not work.\n",
"create_semantic_caching_resources = True\n",
"embeddings_deployment_name = \"text-embedding-ada-002\"\n",
"embeddings_model_name = \"text-embedding-ada-002\"\n",
"embeddings_model_version = \"2\"\n",
"rediscache_name = \"rediscache\"\n",
"rediscache_sku = \"Balanced_B0\" # By default it uses the most cost efficient. Check the docs to choose the right SKU: https://learn.microsoft.com/en-us/azure/azure-cache-for-redis/managed-redis/managed-redis-overview#choosing-the-right-tier\n",
"\n",
"backend_id = 'openai-backend-pool' if len(openai_resources) > 1 else openai_resources[0]['name']\n",
"\n",
"utils.print_ok('Notebook initialized')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='1'></a>\n",
"### 1️⃣ Verify the Azure CLI and the connected Azure subscription\n",
"<a id='2'></a>\n",
"### 2) Verify the Azure CLI and the connected Azure subscription\n",
"\n",
"The following commands ensure that you have the latest version of the Azure CLI and that the Azure CLI is connected to your Azure subscription."
]
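
A minimal standalone sketch of these checks, assuming only that the standard Azure CLI is installed:

```python
# Minimal sketch: confirm the Azure CLI version and the connected subscription.
# `az version` and `az account show` are standard Azure CLI commands.
import subprocess

print(subprocess.run(["az", "version", "--output", "json"],
                     capture_output=True, text=True).stdout)
print(subprocess.run(["az", "account", "show", "--query", "name", "--output", "tsv"],
                     capture_output=True, text=True).stdout)
```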
@@ -105,14 +107,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='2'></a>\n",
"### 2️⃣ Policy 1 - Load Balancing\n",
"<a id='3'></a>\n",
"### 3) Policy 1 - Load Balancing\n",
"\n",
"This lab uses [Bicep](https://learn.microsoft.com/azure/azure-resource-manager/bicep/overview?tabs=bicep) to declarative define all the resources that will be deployed in the specified resource group. Change the parameters or the [main.bicep](main.bicep) directly to try different configurations.\n",
"\n",
"`openAIModelCapacity` is set intentionally low to `8` (8k tokens per minute) to trigger the retry logic in the load balancer (transparent to the user) as well as the priority failover from priority 1 to 2.\n",
"\n",
"<a id='2deployment'></a>\n",
"#### Create deployment using 🦾 Bicep\n",
"\n",
"The `retry-count` parameter should have a value that represents one less than the total number of backends. For example, if we have three defined Azure OpenAI backends, we want to try initially, then have up to two retries, so long as we have remaining, active backends. This ensures that we cover all available backends."
@@ -143,6 +144,7 @@
" \"openAIModelCapacity\": { \"value\": openai_model_capacity },\n",
" \"openAIModelSKU\": { \"value\": openai_model_sku },\n",
" \"openAIAPIVersion\": { \"value\": openai_api_version },\n",
" \"createSemanticCachingResources\": { \"value\": create_semantic_caching_resources },\n",
" \"embeddingsModelName\": { \"value\": embeddings_model_name },\n",
" \"embeddingsModelVersion\": { \"value\": embeddings_model_version },\n",
" \"redisCacheName\": { \"value\": rediscache_name },\n",
@@ -362,12 +364,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='3'></a>\n",
"### 3️⃣ Policy 2 - Token Emitting\n",
"<a id='4'></a>\n",
"### 4) Policy 2 - Token Emitting\n",
"\n",
"We now add token emitting to the existing API policy in order to track token usage by subscriptions. This aids usage and cost analysis and chargeback models inside organizations.\n",
"\n",
"<a id='3deployment'></a>\n",
"#### Update deployment using 🦾 Bicep"
]
},
@@ -405,7 +406,7 @@
"import time\n",
"from openai import AzureOpenAI\n",
"\n",
"runs = 10\n",
"runs = 5\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
"\n",
@@ -482,7 +483,7 @@
"<a id='3kql'></a>\n",
"#### 🔍 Analyze Application Insights custom metrics with a KQL query\n",
"\n",
"With this query you can get the custom metrics that were emitted by Azure APIM. **Note that it may take a few minutes for data to become available.**"
"With this query you can get the custom metrics that were emitted by Azure APIM. **Note that it may take a few minutes for data to become available.** If you see the metrics in the Azure Portal, you don't have to wait for the data here and plotting it to continue. Come back to this section later, if you like."
]
},
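
For a programmatic alternative to the portal, a sketch using the `azure-monitor-query` package. It assumes a workspace-based Application Insights resource and that the policy emits a custom metric named "Total Tokens" with a "Subscription ID" dimension; both names are assumptions, so adjust them to your setup:

```python
# Sketch: query emitted token metrics from the Log Analytics workspace behind
# a workspace-based Application Insights resource. Metric and dimension names
# are assumptions; adjust them to match the emit policy.
from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import LogsQueryClient

workspace_id = "<log-analytics-workspace-id>"  # placeholder

client = LogsQueryClient(DefaultAzureCredential())
query = """
customMetrics
| where name == 'Total Tokens'
| summarize totalTokens = sum(valueSum) by tostring(customDimensions['Subscription ID'])
"""

response = client.query_workspace(workspace_id, query, timespan=timedelta(hours=1))
for table in response.tables:
    for row in table.rows:
        print(row)
```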
{
@@ -543,12 +544,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='4'></a>\n",
"### 4️⃣ Policy 3 - Token Rate Limiting\n",
"<a id='5'></a>\n",
"### 5) Policy 3 - Token Rate Limiting\n",
"\n",
"Adding rate limiting for subscriptions is a sensible way to limit runaway usage.\n",
"\n",
"<a id='4deployment'></a>\n",
"#### Update deployment using 🦾 Bicep"
]
},
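
Once the limit policy is deployed, over-limit calls surface as HTTP 429; a client-side handling sketch, with placeholder endpoint and key values:

```python
# Sketch: back off when the token-rate-limit policy returns 429. Endpoint and
# key are placeholders; the Retry-After fallback of 1s is an assumption.
import time

from openai import AzureOpenAI, RateLimitError

client = AzureOpenAI(
    azure_endpoint="<apim-gateway-url>",  # e.g. apim_resource_gateway_url from this notebook
    api_key="<apim-subscription-key>",    # one of the subscription keys output by the deployment
    api_version="2024-02-01",
)

try:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Ping"}],
    )
    print(completion.choices[0].message.content)
except RateLimitError as e:
    retry_after = int(e.response.headers.get("Retry-After", "1"))
    print(f"429 received; backing off for {retry_after}s")
    time.sleep(retry_after)
```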
@@ -587,7 +587,7 @@
"import time\n",
"from openai import AzureOpenAI\n",
"\n",
"runs = 10\n",
"runs = 5\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
"\n",
@@ -687,14 +687,19 @@
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import random, time\n",
"from openai import AzureOpenAI\n",
"\n",
"runs = 5\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
"api_runs = [] # Response Times for each run\n",
"\n",
"questions = [\"How to Brew the Perfect Cup of Coffee?\",\n",
" \"What are the steps to Craft the Ideal Espresso?\",\n",
" \"Tell me how to create the best steaming Java?\",\n",
" \"Explain how to make a caffeinated brewed beverage?\"]\n",
"\n",
"clients = [\n",
" AzureOpenAI(\n",
" azure_endpoint = apim_resource_gateway_url,\n",
@@ -717,19 +722,23 @@
" print(f\"▶️ Run {i+1}/{runs}:\")\n",
"\n",
" for j in range(0, 3):\n",
" print(f\"🔑 Subscription {j+1}\")\n",
" random_question = random.choice(questions)\n",
" print(\"💬 \", random_question)\n",
"\n",
" start_time = time.time()\n",
"\n",
" raw_response = clients[j].chat.completions.with_raw_response.create(\n",
" model = openai_model_name,\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
" {\"role\": \"user\", \"content\": random_question}\n",
" ],\n",
" extra_headers = {\"x-user-id\": \"alex\"}\n",
" )\n",
"\n",
" response_time = time.time() - start_time\n",
" print(f\"🔑 Subscription {j+1}\")\n",
"\n",
" print(f\"⌚ {response_time:.2f} seconds\")\n",
" print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
"\n",