Add semantic caching, update model and locations

Azure-Samples · Feb 7, 2025 · 2987fb5 · 2987fb5
1 parent a532a8d
commit 2987fb5
Show file tree

Hide file tree

Showing 3 changed files with 375 additions and 52 deletions.
diff --git a/labs/zero-to-production/main.bicep b/labs/zero-to-production/main.bicep
@@ -9,10 +9,19 @@ param openAIConfig array = []
 param openAIModelName string
 param openAIModelVersion string
 param openAIDeploymentName string
+param openAIModelSKU string
 param openAIModelCapacity int
 param openAIAPIVersion string = '2024-02-01'
 param policyXml string
 
+param embeddingsDeploymentName string = 'text-embedding-ada-002'
+param embeddingsModelName string = 'text-embedding-ada-002'
+param embeddingsModelVersion string = '2'
+
+param redisCacheName string = 'rediscache'
+param redisCacheSKU string = 'Balanced_B0'
+param redisCachePort int = 10000
+
 // ------------------
 //    VARIABLES
 // ------------------
@@ -47,7 +56,35 @@ module appInsightsModule '../../modules/monitor/v1/appinsights.bicep' = {
 var appInsightsId = appInsightsModule.outputs.id
 var appInsightsInstrumentationKey = appInsightsModule.outputs.instrumentationKey
 
-// 3. API Management
+// 3. Redis Cache
+// 2/4/25: 2024-10-01 is not yet available in all regions. 2024-09-01-preview is more widely available.
+
+// https://learn.microsoft.com/azure/templates/microsoft.cache/redisenterprise
+// Redis Enterprise cluster resource; the `redisCache` database is declared as its child.
+// Deployed into the same location as the resource group.
+resource redisEnterprise 'Microsoft.Cache/redisEnterprise@2024-09-01-preview' = {
+  // Cluster name is the redisCacheName parameter joined with resourceSuffix.
+  name: '${redisCacheName}-${resourceSuffix}'
+  location: resourceGroup().location
+  sku: {
+    // SKU comes from the redisCacheSKU parameter.
+    name: redisCacheSKU
+  }
+}
+
+// https://learn.microsoft.com/azure/templates/microsoft.cache/redisenterprise/databases
+// Database named 'default' on the redisEnterprise cluster.
+resource redisCache 'Microsoft.Cache/redisEnterprise/databases@2024-09-01-preview' = {
+  name: 'default'
+  parent: redisEnterprise
+  properties: {
+    evictionPolicy: 'NoEviction'
+    clusteringPolicy: 'EnterpriseCluster'
+    // NOTE(review): RediSearch is presumably enabled for the vector search that
+    // backs APIM semantic caching - confirm against the APIM requirements.
+    modules: [
+      {
+        name: 'RediSearch'
+      }
+    ]
+    // Port comes from the redisCachePort parameter; the same parameter is used
+    // when building the APIM cache connection string.
+    port: redisCachePort
+  }
+}
+
+// 4. API Management
 module apimModule '../../modules/apim/v1/apim.bicep' = {
   name: 'apimModule'
   params: {
@@ -57,21 +94,45 @@ module apimModule '../../modules/apim/v1/apim.bicep' = {
   }
 }
 
-// 4. Cognitive Services
+// 5. Cognitive Services
+// Invokes the shared openai.bicep module with the account configuration, the model
+// deployment settings (name, version, SKU, capacity), the APIM principal ID, and the
+// Log Analytics workspace ID.
 module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
   name: 'openAIModule'
   params: {
     openAIConfig: openAIConfig
     openAIDeploymentName: openAIDeploymentName
     openAIModelName: openAIModelName
     openAIModelVersion: openAIModelVersion
+    openAIModelSKU: openAIModelSKU
     openAIModelCapacity: openAIModelCapacity
     apimPrincipalId: apimModule.outputs.principalId
     lawId: lawId
   }
 }
 
-// 5. APIM OpenAI API
+// Reference to an existing Cognitive Services account whose name is the first
+// openAIConfig entry's name joined with resourceSuffix.
+// NOTE(review): presumably this is the account created by openAIModule - confirm the
+// name here matches the module's naming scheme.
+resource cognitiveService 'Microsoft.CognitiveServices/accounts@2024-10-01' existing = {
+  name: '${openAIConfig[0].name}-${resourceSuffix}'
+}
+
+// Embeddings model deployment on the first Azure OpenAI account. Its deployment name
+// is what the 'embeddings-backend' APIM backend URL points at.
+resource embeddingsDeployment 'Microsoft.CognitiveServices/accounts/deployments@2023-05-01' = {
+  name: embeddingsDeploymentName
+  parent: cognitiveService
+  properties: {
+    model: {
+      format: 'OpenAI'
+      name: embeddingsModelName
+      version: embeddingsModelVersion
+    }
+  }
+  sku: {
+    name: 'Standard'
+    capacity: 20
+  }
+  // The parent is declared `existing`, so Bicep adds no ordering on it. State the
+  // dependency on openAIModule explicitly instead of implying it through a
+  // conditional expression on the module outputs.
+  dependsOn: [
+    openAIModule
+  ]
+}
+
+// 6. APIM OpenAI API
 module openAIAPIModule '../../modules/apim/v1/openai-api.bicep' = {
   name: 'openAIAPIModule'
   params: {
@@ -83,7 +144,7 @@ module openAIAPIModule '../../modules/apim/v1/openai-api.bicep' = {
   }
 }
 
-// 6. Create New APIM Subscriptions
+// 7. Create New APIM Subscriptions
 
 // We presume the APIM resource has been created as part of this bicep flow.
 resource apim 'Microsoft.ApiManagement/service@2024-06-01-preview' existing = {
@@ -116,6 +177,27 @@ resource apimSubscriptions 'Microsoft.ApiManagement/service/subscriptions@2024-0
   ]
 }]
 
+// https://learn.microsoft.com/azure/templates/microsoft.apimanagement/service/caches
+// Registers the Redis database as a cache named 'Default' on the API Management service.
+resource apimCache 'Microsoft.ApiManagement/service/caches@2024-06-01-preview' = {
+  name: 'Default'
+  parent: apim
+  properties: {
+    // Connection string is built from the cluster host name, the redisCachePort
+    // parameter, and the database's primary key, with SSL enabled.
+    connectionString: '${redisEnterprise.properties.hostName}:${redisCachePort},password=${redisCache.listKeys().primaryKey},ssl=True,abortConnect=False'
+    useFromLocation: 'Default'
+    // The Redis host name doubles as the description.
+    description: redisEnterprise.properties.hostName
+  }
+}
+
+// APIM backend that targets the embeddings deployment on the first Azure OpenAI endpoint.
+resource backendEmbeddings 'Microsoft.ApiManagement/service/backends@2024-06-01-preview' = {
+  name: 'embeddings-backend' // this name is hard coded in the policy.xml file
+  parent: apim
+  properties: {
+    description: 'Embeddings Backend'
+    // NOTE(review): the endpoint is concatenated directly with 'openai/...', so this
+    // assumes the endpoint value ends with '/' - confirm against the module output.
+    url: '${openAIModule.outputs.extendedOpenAIConfig[0].endpoint}openai/deployments/${embeddingsDeploymentName}/embeddings'
+    protocol: 'http'
+  }
+}
+
 // ------------------
 //    MARK: OUTPUTS
 // ------------------
@@ -133,3 +215,8 @@ output apimSubscription1Key string = apimSubscriptions[0].listSecrets().primaryK
 output apimSubscription2Key string = apimSubscriptions[1].listSecrets().primaryKey
 #disable-next-line outputs-should-not-contain-secrets
 output apimSubscription3Key string = apimSubscriptions[2].listSecrets().primaryKey
+
+output redisCacheHost string = redisEnterprise.properties.hostName
+#disable-next-line outputs-should-not-contain-secrets
+output redisCacheKey string = redisCache.listKeys().primaryKey
+output redisCachePort int = redisCachePort
diff --git a/labs/zero-to-production/policy-4.xml b/labs/zero-to-production/policy-4.xml
@@ -0,0 +1,46 @@
+<policies>
+    <!-- Values in braces ({backend-id}, {tpm}, {retry-count}) are placeholders; presumably they are substituted before this policy is applied - confirm against the deployment flow. -->
+    <inbound>
+        <base />
+        <!--Policy 4 - Semantic Caching-->
+        <!-- Check the embeddings in the Redis cache for a cached prompt response according to the score threshold. If successful, the policy automatically moves past the azure-openai-semantic-cache-store policy. -->
+        <azure-openai-semantic-cache-lookup score-threshold="0.8" embeddings-backend-id ="embeddings-backend" embeddings-backend-auth ="system-assigned" />
+        <!-- Authenticate to Azure OpenAI with API Management's managed identity -->
+        <authentication-managed-identity resource="https://cognitiveservices.azure.com" output-token-variable-name="managed-id-access-token" ignore-error="false" />
+        <!-- Send the acquired token to the backend as a Bearer Authorization header, replacing any existing value. -->
+        <set-header name="Authorization" exists-action="override">
+            <value>@("Bearer " + (string)context.Variables["managed-id-access-token"])</value>
+        </set-header>
+        <set-backend-service backend-id="{backend-id}" />
+        <!--Policy 3 - Limit the tokens per subscription-->
+        <azure-openai-token-limit counter-key="@(context.Subscription.Id)" tokens-per-minute="{tpm}" estimate-prompt-tokens="false" remaining-tokens-variable-name="remainingTokens" />
+        <!--Policy 2 - Emit the Azure OpenAI Token Metrics -->
+        <azure-openai-emit-token-metric namespace="openai">
+            <dimension name="Subscription ID" value="@(context.Subscription.Id)" />
+            <dimension name="Client IP" value="@(context.Request.IpAddress)" />
+            <dimension name="API ID" value="@(context.Api.Id)" />
+            <!-- Falls back to "N/A" when the caller does not send an x-user-id header. -->
+            <dimension name="User ID" value="@(context.Request.Headers.GetValueOrDefault("x-user-id", "N/A"))" />
+        </azure-openai-emit-token-metric>
+    </inbound>
+    <backend>
+        <!--Policy 1 - Apply load-balancing and retry mechanisms -->
+        <!--Set count to one less than the number of backends in the pool to try all backends until the backend pool is temporarily unavailable.-->
+        <!-- Retries on 429, and on 503 unless the status reason mentions "Backend pool" or "is temporarily unavailable". -->
+        <retry count="{retry-count}" interval="0" first-fast-retry="true" condition="@(context.Response.StatusCode == 429 || (context.Response.StatusCode == 503 && !context.Response.StatusReason.Contains("Backend pool") && !context.Response.StatusReason.Contains("is temporarily unavailable")))">
+            <forward-request buffer-request-body="true" />
+        </retry>
+    </backend>
+    <outbound>
+        <!-- Cache the Gen AI response in Redis for 2 minutes -->
+        <azure-openai-semantic-cache-store duration="120" />
+        <base />
+    </outbound>
+    <on-error>
+        <base />
+        <choose>
+            <!--Return a generic error that does not reveal backend pool details.-->
+            <when condition="@(context.Response.StatusCode == 503)">
+                <return-response>
+                    <set-status code="503" reason="Service Unavailable" />
+                </return-response>
+            </when>
+        </choose>
+    </on-error>
+</policies>