diff --git a/labs/zero-to-production/main.bicep b/labs/zero-to-production/main.bicep
index 958b57f..55f59fa 100644
--- a/labs/zero-to-production/main.bicep
+++ b/labs/zero-to-production/main.bicep
@@ -10,7 +10,6 @@ param openAIModelName string
param openAIModelVersion string
param openAIDeploymentName string
param openAIModelSKU string
-param openAIModelCapacity int
param openAIAPIVersion string = '2024-02-01'
param policyXml string
@@ -59,7 +58,7 @@ module apimModule '../../modules/apim/v1/apim.bicep' = {
}
// 4. Cognitive Services
-module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
+module openAIModule '../../modules/cognitive-services/v2/openai.bicep' = {
name: 'openAIModule'
params: {
openAIConfig: openAIConfig
@@ -67,7 +66,6 @@ module openAIModule '../../modules/cognitive-services/v1/openai.bicep' = {
openAIModelName: openAIModelName
openAIModelVersion: openAIModelVersion
openAIModelSKU: openAIModelSKU
- openAIModelCapacity: openAIModelCapacity
apimPrincipalId: apimModule.outputs.principalId
lawId: lawId
}
diff --git a/labs/zero-to-production/zero-to-production.ipynb b/labs/zero-to-production/zero-to-production.ipynb
index a6f4c5e..f57a294 100644
--- a/labs/zero-to-production/zero-to-production.ipynb
+++ b/labs/zero-to-production/zero-to-production.ipynb
@@ -8,7 +8,13 @@
"\n",
"## Zero-to-Production lab\n",
"\n",
- "Playground to create a combination of several policies in an iterative approach. We start with load balancing, then progressively add token emitting, rate limiting, and, eventually, semantic caching. Each of these sets of policies is derived from other labs in this repo.\n",
+ "This playground lab creates a combination of several policies in an iterative approach. We will make three successive policy additions:\n",
+ "\n",
+ "1) Add load balancing across multiple regions\n",
+ "1) Add token emitting to observe token usage\n",
+ "1) Apply token rate limiting to avoid runaway token usage scenarios\n",
+ "\n",
+ "Each of these sets of policies is derived from other labs in this repo.\n",
"\n",
"### Prerequisites\n",
"- [Python 3.12 or later version](https://www.python.org/) installed\n",
@@ -24,12 +30,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### 1) Initialize notebook variables\n",
+ "### Initialize notebook variables\n",
"\n",
"- Resources will be suffixed by a unique string based on your subscription id.\n",
- "- Adjust the location parameters according your preferences and on the [product availability by Azure region.](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/?cdn=disable&products=cognitive-services,api-management)\n",
- "- Adjust the OpenAI model and version according the [availability by region.](https://learn.microsoft.com/azure/ai-services/openai/concepts/models) "
+ "- Change the location parameters according to your preferences and the [product availability by Azure region.](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/?cdn=disable&products=cognitive-services,api-management)\n",
+ "- Adjust the OpenAI model and version according to the [availability by region.](https://learn.microsoft.com/azure/ai-services/openai/concepts/models)\n",
+ "- Experiment with the priority, weight, and capacity OpenAI parameters to affect the load balancing.\n",
+ "- `capacity` is set intentionally low - the unit is thousands of tokens per minute - to trigger the retry logic in the load balancer (transparent to the user) as well as the priority failover from priority 1 to 2."
]
},
{
@@ -43,35 +50,26 @@
"import utils\n",
"\n",
"deployment_name = os.path.basename(os.path.dirname(globals()['__vsc_ipynb_file__']))\n",
- "resource_group_name = f\"lab-{deployment_name}-g2\" # change the name to match your naming style\n",
+ "resource_group_name = f\"lab-{deployment_name}\" # change the name to match your naming style\n",
"resource_group_location = \"eastus2\"\n",
"\n",
"apim_sku = 'Basicv2'\n",
"\n",
"# Prioritize East US until exhaustion (simulate PTU with TPM), then equally distribute between Sweden and West US (consumption fallback)\n",
"openai_resources = [\n",
- " {\"name\": \"openai1\", \"location\": \"eastus\", \"priority\": 1},\n",
- " {\"name\": \"openai2\", \"location\": \"swedencentral\", \"priority\": 2, \"weight\": 50},\n",
- " {\"name\": \"openai3\", \"location\": \"westus\", \"priority\": 2, \"weight\": 50}\n",
+ " {\"name\": \"openai1\", \"location\": \"eastus\", \"priority\": 1, \"weight\": 100, \"capacity\": 4},\n",
+ " {\"name\": \"openai2\", \"location\": \"swedencentral\", \"priority\": 2, \"weight\": 50, \"capacity\": 8},\n",
+ " {\"name\": \"openai3\", \"location\": \"westus\", \"priority\": 2, \"weight\": 50, \"capacity\": 8}\n",
"]\n",
"\n",
"openai_deployment_name = \"gpt-4o-mini\"\n",
"openai_model_name = \"gpt-4o-mini\"\n",
"openai_model_version = \"2024-07-18\"\n",
- "openai_model_capacity = 20\n",
"openai_model_sku = 'Standard'\n",
"openai_api_version = \"2024-02-01\"\n",
"\n",
"backend_id = 'openai-backend-pool' if len(openai_resources) > 1 else openai_resources[0]['name']\n",
"\n",
- "# The provisioning of the Redis (and embedding) resources takes several additional minutes. Therefore, we provide a flag to skip the creation of these resources, which means that the Semantic Caching portion of the notebook will not work.\n",
- "create_semantic_caching_resources = True\n",
- "embeddings_deployment_name = \"text-embedding-ada-002\"\n",
- "embeddings_model_name = \"text-embedding-ada-002\"\n",
- "embeddings_model_version = \"2\"\n",
- "rediscache_name = \"rediscache\"\n",
- "rediscache_sku = \"Balanced_B0\" # By default it uses the most cost efficient. Check the docs to choose the right SKU: https://learn.microsoft.com/en-us/azure/azure-cache-for-redis/managed-redis/managed-redis-overview#choosing-the-right-tier\n",
- "\n",
"utils.print_ok('Notebook initialized')"
]
},
@@ -79,8 +77,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### 2) Verify the Azure CLI and the connected Azure subscription\n",
+ "### Verify the Azure CLI and the connected Azure subscription\n",
"\n",
"The following commands ensure that you have the latest version of the Azure CLI and that the Azure CLI is connected to your Azure subscription."
]
@@ -107,12 +104,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### 3) Policy 1 - Load Balancing\n",
- "\n",
- "This lab uses [Bicep](https://learn.microsoft.com/azure/azure-resource-manager/bicep/overview?tabs=bicep) to declarative define all the resources that will be deployed in the specified resource group. Change the parameters or the [main.bicep](main.bicep) directly to try different configurations.\n",
+ "### Policy 1 - Load Balancing\n",
"\n",
- "`openAIModelCapacity` is set intentionally low to `8` (8k tokens per minute) to trigger the retry logic in the load balancer (transparent to the user) as well as the priority failover from priority 1 to 2.\n",
+ "This lab uses [Bicep](https://learn.microsoft.com/azure/azure-resource-manager/bicep/overview?tabs=bicep) to declaratively define all resources that will be deployed in the specified resource group. Change the parameters or the [main.bicep](main.bicep) directly to try different configurations.\n",
"\n",
"#### Create deployment using ๐ฆพ Bicep\n",
"\n",
@@ -141,14 +135,8 @@
" \"openAIDeploymentName\": { \"value\": openai_deployment_name },\n",
" \"openAIModelName\": { \"value\": openai_model_name },\n",
" \"openAIModelVersion\": { \"value\": openai_model_version },\n",
- " \"openAIModelCapacity\": { \"value\": openai_model_capacity },\n",
" \"openAIModelSKU\": { \"value\": openai_model_sku },\n",
- " \"openAIAPIVersion\": { \"value\": openai_api_version },\n",
- " \"createSemanticCachingResources\": { \"value\": create_semantic_caching_resources },\n",
- " \"embeddingsModelName\": { \"value\": embeddings_model_name },\n",
- " \"embeddingsModelVersion\": { \"value\": embeddings_model_version },\n",
- " \"redisCacheName\": { \"value\": rediscache_name },\n",
- " \"redisCacheSKU\": { \"value\": rediscache_sku }\n",
+ " \"openAIAPIVersion\": { \"value\": openai_api_version }\n",
" }\n",
"}\n",
"\n",
@@ -188,19 +176,17 @@
" apim_subscription1_key = utils.get_deployment_output(output, 'apimSubscription1Key', 'APIM Subscription 1 Key (masked)', True)\n",
" apim_subscription2_key = utils.get_deployment_output(output, 'apimSubscription2Key', 'APIM Subscription 2 Key (masked)', True)\n",
" apim_subscription3_key = utils.get_deployment_output(output, 'apimSubscription3Key', 'APIM Subscription 3 Key (masked)', True)\n",
- " app_insights_name = utils.get_deployment_output(output, 'applicationInsightsName', 'Application Insights Name')\n",
- " rediscache_host = utils.get_deployment_output(output, 'redisCacheHost', 'Redis Cache Host')\n",
- " rediscache_key = utils.get_deployment_output(output, 'redisCacheKey', 'Redis Cache Key (masked)', True)\n",
- " rediscache_port = int(utils.get_deployment_output(output, 'redisCachePort', 'Redis Cache Port'))"
+ " app_insights_name = utils.get_deployment_output(output, 'applicationInsightsName', 'Application Insights Name')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "#### ๐งช Test the API using a direct HTTP call\n",
- "Requests is an elegant and simple HTTP library for Python that will be used here to make raw API requests and inspect the responses. \n",
+ "\n",
+ "#### ๐งช Test the API using the Azure OpenAI Python SDK\n",
+ "\n",
+ "Use the OpenAI Python SDK to make requests to API Management, which routes them to the Azure OpenAI backends. Note that we return the `x-ms-region` header to show the calling client which backend served the request. You may not want to expose that in a production scenario.\n",
"\n",
"You will not see HTTP 429s returned as API Management's `retry` policy will select an available backend. If no backends are viable, an HTTP 503 will be returned.\n",
"\n",
@@ -213,47 +199,48 @@
"metadata": {},
"outputs": [],
"source": [
- "import requests, time\n",
+ "import time\n",
+ "from openai import AzureOpenAI\n",
"\n",
"runs = 20\n",
"sleep_time_ms = 100\n",
- "url = f\"{apim_resource_gateway_url}/openai/deployments/{openai_deployment_name}/chat/completions?api-version={openai_api_version}\"\n",
- "messages = {\"messages\": [\n",
- " {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
- " {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
- "]}\n",
+ "total_tokens_all_runs = 0\n",
"api_runs = []\n",
"\n",
- "# Initialize a session for connection pooling and set any default headers\n",
- "session = requests.Session()\n",
- "session.headers.update({'api-key': apim_subscription1_key})\n",
+ "client = AzureOpenAI(\n",
+ " azure_endpoint = apim_resource_gateway_url,\n",
+ " api_key = apim_subscription1_key,\n",
+ " api_version = openai_api_version\n",
+ ")\n",
"\n",
- "try:\n",
- " for i in range(runs):\n",
- " print(f\"โถ๏ธ Run {i+1}/{runs}:\")\n",
+ "for i in range(runs):\n",
+ " print(f\"โถ๏ธ Run {i+1}/{runs}:\")\n",
"\n",
- " start_time = time.time()\n",
- " response = session.post(url, json = messages)\n",
- " response_time = time.time() - start_time\n",
- " print(f\"โ {response_time:.2f} seconds\")\n",
- "\n",
- " utils.print_response_code(response)\n",
- "\n",
- " if \"x-ms-region\" in response.headers:\n",
- " print(f\"x-ms-region: \\x1b[1;32m{response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
- " api_runs.append((response_time, response.headers.get(\"x-ms-region\")))\n",
- "\n",
- " if (response.status_code == 200):\n",
- " data = json.loads(response.text)\n",
- " print(f\"Token usage: {json.dumps(dict(data.get(\"usage\")), indent = 4)}\\n\")\n",
- " print(f\"๐ฌ {data.get(\"choices\")[0].get(\"message\").get(\"content\")}\\n\")\n",
- " else:\n",
- " print(f\"{response.text}\\n\")\n",
- "\n",
- " time.sleep(sleep_time_ms/1000)\n",
- "finally:\n",
- " # Close the session to release the connection\n",
- " session.close()"
+ " start_time = time.time()\n",
+ " raw_response = client.chat.completions.with_raw_response.create(\n",
+ " model = openai_model_name,\n",
+ " messages = [\n",
+ " {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
+ " {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
+ " ])\n",
+ " response_time = time.time() - start_time\n",
+ "\n",
+ " print(f\"โ {response_time:.2f} seconds\")\n",
+ "\n",
+ " if \"x-ms-region\" in raw_response.headers:\n",
+ " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
+ " api_runs.append((response_time, raw_response.headers.get(\"x-ms-region\")))\n",
+ "\n",
+ " response = raw_response.parse()\n",
+ "\n",
+ " if response.usage:\n",
+ " total_tokens_all_runs += response.usage.total_tokens\n",
+ " print(f\"Token usage:\\n Total tokens: {response.usage.total_tokens}\\n Prompt tokens: {response.usage.prompt_tokens}\\n Completion tokens: {response.usage.completion_tokens}\\n Total tokens all runs: {total_tokens_all_runs}\\n\")\n",
+ "\n",
+ "\n",
+ " print(f\"๐ฌ {response.choices[0].message.content}\\n\")\n",
+ "\n",
+ " time.sleep(sleep_time_ms/1000)"
]
},
{
@@ -308,68 +295,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "#### ๐งช Test the API using the Azure OpenAI Python SDK\n",
- "\n",
- "Repeat the same test using the Python SDK to ensure compatibility. Note that we do not know what region served the response; we only see that we obtained a response."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "from openai import AzureOpenAI\n",
+ "### Policy 2 - Token Emitting\n",
"\n",
- "runs = 20\n",
- "sleep_time_ms = 100\n",
- "total_tokens_all_runs = 0\n",
+ "We now add token emitting to the existing API policy in order to track token usage by subscription. This aids usage analysis, cost analysis, and chargeback models inside organizations. You can see the policy to be added in the `policy-2.xml` file in this folder, and a sketch of the key fragment below.\n",
"\n",
- "client = AzureOpenAI(\n",
- " azure_endpoint = apim_resource_gateway_url,\n",
- " api_key = apim_subscription1_key,\n",
- " api_version = openai_api_version\n",
- ")\n",
- "\n",
- "for i in range(runs):\n",
- " print(f\"โถ๏ธ Run {i+1}/{runs}:\")\n",
- "\n",
- " start_time = time.time()\n",
- " raw_response = client.chat.completions.with_raw_response.create(\n",
- " model = openai_model_name,\n",
- " messages = [\n",
- " {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
- " {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
- " ])\n",
- " response_time = time.time() - start_time\n",
- "\n",
- " print(f\"โ {response_time:.2f} seconds\")\n",
- " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
- "\n",
- " response = raw_response.parse()\n",
- "\n",
- " if response.usage:\n",
- " total_tokens_all_runs += response.usage.total_tokens\n",
- " print(f\"Token usage:\\n Total tokens: {response.usage.total_tokens}\\n Prompt tokens: {response.usage.prompt_tokens}\\n Completion tokens: {response.usage.completion_tokens}\\n Total tokens all runs: {total_tokens_all_runs}\\n\")\n",
- "\n",
- "\n",
- " print(f\"๐ฌ {response.choices[0].message.content}\\n\")\n",
- "\n",
- " time.sleep(sleep_time_ms/1000)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "### 4) Policy 2 - Token Emitting\n",
- "\n",
- "We now add token emitting to the existing API policy in order to track token usage by subscriptions. This aids usage and cost analysis and chargeback models inside organizations.\n",
- "\n",
- "#### Update deployment using ๐ฆพ Bicep"
+ "#### Update the API Management policy via the REST API"
]
},
{
@@ -447,7 +377,9 @@
" response_time = time.time() - start_time\n",
" print(f\"๐ Subscription {j+1}\")\n",
" print(f\"โ {response_time:.2f} seconds\")\n",
- " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
+ "\n",
+ " if \"x-ms-region\" in raw_response.headers:\n",
+ " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
"\n",
" response = raw_response.parse()\n",
"\n",
@@ -471,7 +403,15 @@
"\n",
"### ๐ See the metrics on the Azure Portal\n",
"\n",
- "Open the Application Insights resource, navigate to the Metrics blade, then select the defined namespace (openai). Choose the metric \"Total Tokens\" with a Sum aggregation. Then, apply splitting by 'Subscription Id' to view values for each dimension. For better visibility switch to an area chart.\n",
+ "One way to see the newly added token metrics in the Azure Portal:\n",
+ "\n",
+ "1) Open the _Application Insights_ resource in the resource group.\n",
+ "1) Navigate to the _Metrics_ blade.\n",
+ "1) Change the timespan to the last 30 minutes with a 1-minute time granularity.\n",
+ "1) Select the _openai_ metric namespace.\n",
+ "1) Choose the _Total Tokens_ metric.\n",
+ "1) Select the _Sum_ aggregation.\n",
+ "1) Apply splitting by _Subscription Id_ to view values for each dimension. For better visibility, switch to an area chart.\n",
"\n",
"![result](result.png)\n"
]
@@ -483,7 +423,7 @@
"\n",
"#### ๐ Analyze Application Insights custom metrics with a KQL query\n",
"\n",
- "With this query you can get the custom metrics that were emitted by Azure APIM. **Note that it may take a few minutes for data to become available.** If you see the metrics in the Azure Portal, you don't have to wait for the data here and plotting it to continue. Come back to this section later, if you like."
+ "Alternatively, you can query the custom metrics that Azure API Management emitted. **Note that it may take a few minutes for data to become available.** If you already see the metrics in the Azure Portal, you don't have to wait for the data here; come back to this section later, if you like."
]
},
{
@@ -544,12 +484,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### 5) Policy 3 - Token Rate Limiting\n",
+ "### Policy 3 - Token Rate Limiting\n",
"\n",
- "Adding rate limiting for subscriptions is a sensible way to limit runaway usage.\n",
+ "Emitting token metrics is valuable for understanding usage. Even more valuable is adding per-subscription rate limiting as a sensible guard against runaway usage.\n",
"\n",
- "#### Update deployment using ๐ฆพ Bicep"
+ "Please note that the results will be somewhat skewed because we execute these requests sequentially, but they should convey what is happening from a rate-limiting perspective. Threading may be considered in the future. A sketch of the rate-limiting fragment follows.\n",
+ "\n",
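+ "As a rough sketch, the rate-limiting fragment keys a token counter to the calling subscription (attributes follow the API Management `azure-openai-token-limit` policy reference; `{tpm}` stands for the tokens-per-minute limit the update cell substitutes, and the actual lab policy may differ in detail):\n",
+ "\n",
+ "```xml\n",
+ "<!-- Throttle each subscription once it exceeds its tokens-per-minute budget -->\n",
+ "<azure-openai-token-limit tokens-per-minute=\"{tpm}\" counter-key=\"@(context.Subscription.Id)\"\n",
+ "    estimate-prompt-tokens=\"false\" remaining-tokens-header-name=\"x-ratelimit-remaining-tokens\" />\n",
+ "```\n",
+ "\n",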
+ "#### Update the API Management policy via the REST API"
]
},
{
@@ -575,7 +516,7 @@
"\n",
"#### ๐งช Execute multiple runs for each subscription using the Azure OpenAI Python SDK\n",
"\n",
- "We will send requests for each subscription. Adjust the `sleep_time_ms` and the number of `runs` to your test scenario. You should be able to observe a significant pause over 20 runs as the requests will hit the tokens-per-minute limit that we defined earlier. This is expected and validation of the policies working.\n"
+ "We will send requests for each subscription. Adjust the `sleep_time_ms` and the number of `runs` to fit your test scenario. You should observe a significant pause after a few runs as requests hit the tokens-per-minute limit that we defined earlier. This is expected and validates that the policies are working.\n"
]
},
{
@@ -587,118 +528,10 @@
"import time\n",
"from openai import AzureOpenAI\n",
"\n",
- "runs = 5\n",
- "sleep_time_ms = 100\n",
- "total_tokens_all_runs = [0, 0, 0]\n",
- "\n",
- "clients = [\n",
- " AzureOpenAI(\n",
- " azure_endpoint = apim_resource_gateway_url,\n",
- " api_key = apim_subscription1_key,\n",
- " api_version = openai_api_version\n",
- " ),\n",
- " AzureOpenAI(\n",
- " azure_endpoint = apim_resource_gateway_url,\n",
- " api_key = apim_subscription2_key,\n",
- " api_version = openai_api_version\n",
- " ),\n",
- " AzureOpenAI(\n",
- " azure_endpoint = apim_resource_gateway_url,\n",
- " api_key = apim_subscription3_key,\n",
- " api_version = openai_api_version\n",
- " )\n",
- "]\n",
- "\n",
- "for i in range(runs):\n",
- " print(f\"โถ๏ธ Run {i+1}/{runs}:\")\n",
- "\n",
- " for j in range(0, 3):\n",
- " start_time = time.time()\n",
- "\n",
- " raw_response = clients[j].chat.completions.with_raw_response.create(\n",
- " model = openai_model_name,\n",
- " messages = [\n",
- " {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
- " {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
- " ],\n",
- " extra_headers = {\"x-user-id\": \"alex\"}\n",
- " )\n",
- "\n",
- " response_time = time.time() - start_time\n",
- " print(f\"๐ Subscription {j+1}\")\n",
- " print(f\"โ {response_time:.2f} seconds\")\n",
- " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
- "\n",
- " response = raw_response.parse()\n",
- "\n",
- " if response.usage:\n",
- " total_tokens_all_runs[j] += response.usage.total_tokens\n",
- " print(f\"Token usage:\\n Total tokens: {response.usage.total_tokens}\\n Prompt tokens: {response.usage.prompt_tokens}\\n Completion tokens: {response.usage.completion_tokens}\\n Total tokens all runs: {total_tokens_all_runs[j]}\\n\")\n",
- "\n",
- " print(f\"๐ฌ {response.choices[0].message.content}\\n\")\n",
- "\n",
- " print()\n",
- "\n",
- " time.sleep(sleep_time_ms/1000)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "### 5๏ธโฃ Policy 4 - Semantic Caching\n",
- "\n",
- "The azure-openai-semantic-cache-lookup policy conducts a cache lookup of responses on Azure OpenAI Chat Completion API and Completion API requests from a pre-configured external cache. It operates by comparing the vector proximity of the prompt to prior requests and using a specific similarity score threshold. Caching responses helps reduce bandwidth and processing demands on the backend Azure OpenAI API, thus reducing latency perceived by API consumers. \n",
- "๐๐ฝ Update: The [Bicep file](main.bicep) was updated to use the new [Azure Managed Redis](https://azure.microsoft.com/en-us/products/managed-redis/) for improved cost efficiency.\n",
- "\n",
- "\n",
- "#### Update deployment using ๐ฆพ Bicep"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "policy_xml_file = \"policy-4.xml\"\n",
- "\n",
- "with open(policy_xml_file, 'r') as file:\n",
- " policy_xml = file.read()\n",
- " policy_xml = policy_xml.replace('{backend-id}', backend_id).replace('{retry-count}', str(len(openai_resources) - 1)).replace('{tpm}', str(tokens_per_minute))\n",
- "\n",
- "utils.update_api_policy(subscription_id, resource_group_name, apim_service_name, \"openai\", policy_xml)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "#### ๐งช Execute multiple runs for each subscription using the Azure OpenAI Python SDK\n",
- "\n",
- "We will send requests for each subscription. Adjust the `sleep_time_ms` and the number of `runs` to your test scenario. You should be able to observe a significant pause over 20 runs as the requests will hit the tokens-per-minute limit that we defined earlier. This is expected and validation of the policies working.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import random, time\n",
- "from openai import AzureOpenAI\n",
- "\n",
- "runs = 5\n",
+ "runs = 3\n",
"sleep_time_ms = 100\n",
"total_tokens_all_runs = [0, 0, 0]\n",
- "api_runs = [] # Response Times for each run\n",
- "\n",
- "questions = [\"How to Brew the Perfect Cup of Coffee?\",\n",
- " \"What are the steps to Craft the Ideal Espresso?\",\n",
- " \"Tell me how to create the best steaming Java?\",\n",
- " \"Explain how to make a caffeinated brewed beverage?\"]\n",
+ "api_runs = []\n",
"\n",
"clients = [\n",
" AzureOpenAI(\n",
@@ -722,124 +555,75 @@
" print(f\"โถ๏ธ Run {i+1}/{runs}:\")\n",
"\n",
" for j in range(0, 3):\n",
- " print(f\"๐ Subscription {j+1}\")\n",
- " random_question = random.choice(questions)\n",
- " print(\"๐ฌ \", random_question)\n",
- "\n",
" start_time = time.time()\n",
"\n",
- " raw_response = clients[j].chat.completions.with_raw_response.create(\n",
- " model = openai_model_name,\n",
- " messages = [\n",
- " {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
- " {\"role\": \"user\", \"content\": random_question}\n",
- " ],\n",
- " extra_headers = {\"x-user-id\": \"alex\"}\n",
- " )\n",
+ " try:\n",
+ " raw_response = clients[j].chat.completions.with_raw_response.create(\n",
+ " model = openai_model_name,\n",
+ " messages = [\n",
+ " {\"role\": \"system\", \"content\": \"You are a sarcastic, unhelpful assistant.\"},\n",
+ " {\"role\": \"user\", \"content\": \"Can you tell me the time, please?\"}\n",
+ " ],\n",
+ " extra_headers = {\"x-user-id\": \"alex\"}\n",
+ " )\n",
"\n",
- " response_time = time.time() - start_time\n",
+ " response_time = time.time() - start_time\n",
+ " print(f\"๐ Subscription {j+1}\")\n",
+ " print(f\"โ {response_time:.2f} seconds\")\n",
+ " api_runs.append((response_time, j+1))\n",
"\n",
- " print(f\"โ {response_time:.2f} seconds\")\n",
- " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
+ " if \"x-ms-region\" in raw_response.headers:\n",
+ " print(f\"x-ms-region: \\x1b[1;32m{raw_response.headers.get(\"x-ms-region\")}\\x1b[0m\") # this header is useful to determine the region of the backend that served the request\n",
"\n",
- " response = raw_response.parse()\n",
+ " response = raw_response.parse()\n",
"\n",
- " if response.usage:\n",
- " total_tokens_all_runs[j] += response.usage.total_tokens\n",
- " print(f\"Token usage:\\n Total tokens: {response.usage.total_tokens}\\n Prompt tokens: {response.usage.prompt_tokens}\\n Completion tokens: {response.usage.completion_tokens}\\n Total tokens all runs: {total_tokens_all_runs[j]}\\n\")\n",
+ " if response.usage:\n",
+ " total_tokens_all_runs[j] += response.usage.total_tokens\n",
+ " print(f\"Token usage:\\n Total tokens: {response.usage.total_tokens}\\n Prompt tokens: {response.usage.prompt_tokens}\\n Completion tokens: {response.usage.completion_tokens}\\n Total tokens all runs: {total_tokens_all_runs[j]}\\n\")\n",
"\n",
- " print(f\"๐ฌ {response.choices[0].message.content}\\n\")\n",
- " api_runs.append(response_time)\n",
+ " print(f\"๐ฌ {response.choices[0].message.content}\\n\")\n",
+ " except Exception as e:\n",
+ " print(e)\n",
"\n",
" print()\n",
"\n",
" time.sleep(sleep_time_ms/1000)\n"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "### ๐ Analyze Semantic Caching performance\n",
- "\n",
- "The first request should take a longer time as it makes it all the way to the Azure OpenAI backend. The subsequent requests should be much quicker as they draw from the semantic cache. Note that making more than 20 requests may result in spikes similar to the first request. As we are using the cheapest, smallest Basic Redis cache (B0), the cache server will eventually return a 429, forcing API Management to make a longer request to the Azure OpenAI backend. This is expected as B0 is not intended for load scenarios."
- ]
- },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# plot the results\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
+ "from matplotlib.patches import Rectangle as pltRectangle\n",
"import matplotlib as mpl\n",
"\n",
- "mpl.rcParams['figure.figsize'] = [15, 5]\n",
- "df = pd.DataFrame(api_runs, columns=['Response Time'])\n",
+ "mpl.rcParams['figure.figsize'] = [15, 7]\n",
+ "df = pd.DataFrame(api_runs, columns = ['Response Time', 'Subscription'])\n",
"df['Run'] = range(1, len(df) + 1)\n",
- "df.plot(kind='bar', x='Run', y='Response Time', legend=False)\n",
- "plt.title('Semantic Caching Performance')\n",
- "plt.xlabel('Runs')\n",
- "plt.ylabel('Response Time (s)')\n",
- "plt.xticks(rotation=0) # Set x-axis ticks to be the run numbers\n",
- "\n",
- "average = df['Response Time'].mean()\n",
- "plt.axhline(y=average, color='r', linestyle='--', label=f'Average: {average:.2f}')\n",
- "plt.legend()\n",
- "\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "### ๐ Show Redis Cache information"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "import matplotlib as mpl\n",
- "\n",
- "import redis.asyncio as redis\n",
- "\n",
- "async def get_redis_info():\n",
- " r = await redis.from_url(\n",
- " f\"rediss://:{rediscache_key}@{rediscache_host}:{rediscache_port}\"\n",
- " )\n",
- "\n",
- " info = await r.info()\n",
"\n",
- " print(\"Redis Server Information:\")\n",
- " print(f\"Used Memory : {info['used_memory_human']}\")\n",
- " # Display the Redis info in a pandas DataFrame and plot it\n",
+ "# Define a color map for each subscription\n",
+ "color_map = {1: 'lightpink', 2: 'lightyellow', 3: 'lightblue'} # Add more subscriptions and colors as needed\n",
"\n",
- " redis_info = {\n",
- " 'Metric': ['Cache Hits', 'Cache Misses', 'Evicted Keys', 'Expired Keys'],\n",
- " 'Value': [info['keyspace_hits'], info['keyspace_misses'], info['evicted_keys'], info['expired_keys']]\n",
- " }\n",
+ "# Plot the dataframe with colored bars\n",
+ "ax = df.plot(kind = 'bar', x = 'Run', y = 'Response Time', color = [color_map.get(subscription, 'gray') for subscription in df['Subscription']], legend = False)\n",
"\n",
- " df_redis_info = pd.DataFrame(redis_info)\n",
- " df_redis_info.plot(kind='barh', x='Metric', y='Value', legend=False)\n",
+ "# Add legend\n",
+ "legend_labels = [pltRectangle((0, 0), 1, 1, color = color_map[subscription]) for subscription in color_map.keys()]\n",
+ "ax.legend(legend_labels, ['Subscription 1', 'Subscription 2', 'Subscription 3'])\n",
"\n",
- " plt.title('Redis Server Information')\n",
- " plt.xlabel('Value')\n",
- " plt.ylabel('Metric')\n",
- " plt.show()\n",
+ "plt.title('Token Rate Limiting by Subscription Results (high response times indicate likely throttling)')\n",
+ "plt.xlabel('Run #')\n",
+ "plt.ylabel('Response Time (s)')\n",
+ "plt.xticks(rotation = 0)\n",
"\n",
- " await r.aclose()\n",
+ "average = df['Response Time'].mean()\n",
+ "plt.axhline(y = average, color = 'r', linestyle = '--', label = f'Average: {average:.2f}')\n",
"\n",
- "await get_redis_info()"
+ "plt.show()"
]
},
{
diff --git a/modules/cognitive-services/v2/openai.bicep b/modules/cognitive-services/v2/openai.bicep
new file mode 100644
index 0000000..fd071d4
--- /dev/null
+++ b/modules/cognitive-services/v2/openai.bicep
@@ -0,0 +1,127 @@
+/**
+ * @module openai-v2
+ * @description This module defines the Azure Cognitive Services OpenAI resources using Bicep.
+ * This is version 2 (v2) of the OpenAI Bicep module.
+ */
+
+// ------------------
+// PARAMETERS
+// ------------------
+
+@description('Azure OpenAI Sku')
+@allowed([
+ 'S0'
+])
+param openAISku string = 'S0'
+
+@description('Azure OpenAI Deployment Name')
+param openAIDeploymentName string
+
+@description('Model Name')
+param openAIModelName string
+
+@description('Model Version')
+param openAIModelVersion string
+
+@description('Model SKU')
+param openAIModelSKU string = 'Standard'
+
+@description('Configuration array for OpenAI resources')
+param openAIConfig array = []
+
+@description('Log Analytics Workspace Id')
+param lawId string = ''
+
+@description('APIM Principal Id')
+param apimPrincipalId string
+
+// ------------------
+// VARIABLES
+// ------------------
+
+var resourceSuffix = uniqueString(subscription().id, resourceGroup().id)
+var azureRoles = loadJsonContent('../../azure-roles.json')
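+// Built-in 'Cognitive Services OpenAI User' role; its GUID is loaded from azure-roles.json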
+var cognitiveServicesOpenAIUserRoleDefinitionID = resourceId('Microsoft.Authorization/roleDefinitions', azureRoles.CognitiveServicesOpenAIUser)
+
+
+// ------------------
+// RESOURCES
+// ------------------
+
+resource cognitiveServices 'Microsoft.CognitiveServices/accounts@2024-10-01' = [for config in openAIConfig: if(length(openAIConfig) > 0) {
+ name: '${config.name}-${resourceSuffix}'
+ location: config.location
+ sku: {
+ name: openAISku
+ }
+ kind: 'OpenAI'
+ properties: {
+ apiProperties: {
+ statisticsEnabled: false
+ }
+ customSubDomainName: toLower('${config.name}-${resourceSuffix}')
+ }
+}]
+
+// https://learn.microsoft.com/azure/templates/microsoft.insights/diagnosticsettings
+resource diagnosticSettings 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = [for (config, i) in openAIConfig: if(length(openAIConfig) > 0 && lawId != '') {
+ name: '${cognitiveServices[i].name}-diagnostics'
+ scope: cognitiveServices[i]
+ properties: {
+ workspaceId: lawId != '' ? lawId : null
+ logs: []
+ metrics: [
+ {
+ category: 'AllMetrics'
+ enabled: true
+ }
+ ]
+ }
+}]
+
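+// @batchSize(1) serializes the loop below, creating one model deployment at a time to avoid transient conflicts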
+@batchSize(1)
+resource deployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = [for (config, i) in openAIConfig: if(length(openAIConfig) > 0) {
+ name: openAIDeploymentName
+ parent: cognitiveServices[i]
+ properties: {
+ model: {
+ format: 'OpenAI'
+ name: openAIModelName
+ version: openAIModelVersion
+ }
+ }
+ sku: {
+ name: openAIModelSKU
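+    // per-resource capacity from the openAIConfig entry (unit: thousands of tokens per minute)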
+ capacity: config.capacity
+ }
+}]
+
+resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for (config, i) in openAIConfig: if(length(openAIConfig) > 0) {
+ scope: cognitiveServices[i]
+ name: guid(subscription().id, resourceGroup().id, config.name, cognitiveServicesOpenAIUserRoleDefinitionID)
+ properties: {
+ roleDefinitionId: cognitiveServicesOpenAIUserRoleDefinitionID
+ principalId: apimPrincipalId
+ principalType: 'ServicePrincipal'
+ }
+}]
+
+
+// ------------------
+// OUTPUTS
+// ------------------
+
+output extendedOpenAIConfig array = [for (config, i) in openAIConfig: {
+ // Original openAIConfig properties
+ name: config.name
+ location: config.location
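+  // priority and weight are optional; the '.?' safe-dereference yields null when a property is absent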
+ priority: config.?priority
+ weight: config.?weight
+ // Additional properties
+ sku: openAISku
+ deploymentName: openAIDeploymentName
+ modelName: openAIModelName
+ modelVersion: openAIModelVersion
+ cognitiveService: cognitiveServices[i]
+ endpoint: cognitiveServices[i].properties.endpoint
+}]