|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {},
|
6 | 6 | "source": [
|
7 |
| - "The first step is to import the libraries and set the OpenAI API key and endpoint. You'll need to set the following environment variables:\n", |
8 |
| - "\n", |
9 |
| - "- `AZURE_OPENAI_API_KEY` - Your OpenAI API key\n", |
10 |
| - "- `AZURE_OPENAI_ENDPOINT` - Your OpenAI endpoint" |
| 7 | + "In order to run the following notebooks, if you haven't done so yet, you need to deploy a model that uses `text-embedding-ada-002` as its base model and set its deployment name inside the .env file as `AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT`"
11 | 8 | ]
|
12 | 9 | },
|
13 | 10 | {
|
14 | 11 | "cell_type": "code",
|
15 |
| - "execution_count": null, |
| 12 | + "execution_count": 1, |
16 | 13 | "metadata": {},
|
17 | 14 | "outputs": [],
|
18 | 15 | "source": [
|
19 | 16 | "import os\n",
|
20 | 17 | "import pandas as pd\n",
|
21 |
| - "import openai\n", |
22 |
| - "from openai.embeddings_utils import cosine_similarity, get_embedding\n", |
| 18 | + "import numpy as np\n", |
| 19 | + "from openai import AzureOpenAI\n", |
| 20 | + "from dotenv import load_dotenv\n", |
23 | 21 | "\n",
|
24 |
| - "OPENAI_EMBEDDING_ENGINE = \"text-embedding-ada-002\"\n", |
25 |
| - "SIMILARITIES_RESULTS_THRESHOLD = 0.75\n", |
26 |
| - "DATASET_NAME = \"embedding_index_3m.json\"\n", |
| 22 | + "from sklearn.metrics.pairwise import cosine_similarity\n", |
| 23 | + "load_dotenv()\n", |
27 | 24 | "\n",
|
28 |
| - "openai.api_type = \"azure\"\n", |
29 |
| - "openai.api_key = os.environ[\"AZURE_OPENAI_API_KEY\"]\n", |
30 |
| - "openai.api_base = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n", |
31 |
| - "openai.api_version = \"2023-07-01-preview\"\n", |
| 25 | + "client = AzureOpenAI(\n", |
| 26 | + " api_key=os.environ['AZURE_OPENAI_KEY'], # loaded from the .env file by load_dotenv() above\n",
| 27 | + " api_version = \"2023-05-15\"\n", |
| 28 | + " )\n", |
32 | 29 | "\n",
|
33 |
| - "OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.environ[\n", |
34 |
| - " \"AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME\"\n", |
35 |
| - "]" |
| 30 | + "model = os.environ['AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT']\n", |
| 31 | + "\n", |
| 32 | + "SIMILARITIES_RESULTS_THRESHOLD = 0.75\n", |
| 33 | + "DATASET_NAME = \"embedding_index_3m.json\"" |
36 | 34 | ]
|
37 | 35 | },
|
38 | 36 | {
|
|
44 | 42 | },
|
45 | 43 | {
|
46 | 44 | "cell_type": "code",
|
47 |
| - "execution_count": null, |
| 45 | + "execution_count": 2, |
48 | 46 | "metadata": {},
|
49 | 47 | "outputs": [],
|
50 | 48 | "source": [
|
|
69 | 67 | },
|
70 | 68 | {
|
71 | 69 | "cell_type": "code",
|
72 |
| - "execution_count": null, |
| 70 | + "execution_count": 3, |
73 | 71 | "metadata": {},
|
74 | 72 | "outputs": [],
|
75 | 73 | "source": [
|
|
79 | 77 | " # create a copy of the dataset\n",
|
80 | 78 | " video_vectors = dataset.copy()\n",
|
81 | 79 | "\n",
|
82 |
| - " # get the embeddings for the query\n", |
83 |
| - " query_embeddings = get_embedding(query, OPENAI_EMBEDDING_ENGINE)\n", |
| 80 | + " # get the embeddings for the query \n", |
| 81 | + " query_embeddings = client.embeddings.create(input=query, model=model).data[0].embedding\n", |
84 | 82 | "\n",
|
85 | 83 | " # create a new column with the calculated similarity for each row\n",
|
86 | 84 | " video_vectors[\"similarity\"] = video_vectors[\"ada_v2\"].apply(\n",
|
87 |
| - " lambda x: cosine_similarity(query_embeddings, x)\n", |
| 85 | + " lambda x: cosine_similarity(np.array(query_embeddings).reshape(1,-1), np.array(x).reshape(1,-1))\n", |
88 | 86 | " )\n",
|
89 | 87 | "\n",
|
90 | 88 | " # filter the videos by similarity\n",
|
|
109 | 107 | },
|
110 | 108 | {
|
111 | 109 | "cell_type": "code",
|
112 |
| - "execution_count": null, |
| 110 | + "execution_count": 4, |
113 | 111 | "metadata": {},
|
114 | 112 | "outputs": [],
|
115 | 113 | "source": [
|
|
119 | 117 | " return f\"https://youtu.be/{video_id}?t={seconds}\"\n",
|
120 | 118 | "\n",
|
121 | 119 | " print(f\"\\nVideos similar to '{query}':\")\n",
|
122 |
| - " for index, row in videos.iterrows():\n", |
| 120 | + " for _, row in videos.iterrows():\n",
123 | 121 | " youtube_url = _gen_yt_url(row[\"videoId\"], row[\"seconds\"])\n",
|
124 | 122 | " print(f\" - {row['title']}\")\n",
|
125 | 123 | " print(f\" Summary: {' '.join(row['summary'].split()[:15])}...\")\n",
|
|
153 | 151 | },
|
154 | 152 | {
|
155 | 153 | "cell_type": "code",
|
156 |
| - "execution_count": null, |
| 154 | + "execution_count": 5, |
157 | 155 | "metadata": {},
|
158 | 156 | "outputs": [],
|
159 | 157 | "source": [
|
|
190 | 188 | "name": "python",
|
191 | 189 | "nbconvert_exporter": "python",
|
192 | 190 | "pygments_lexer": "ipython3",
|
193 |
| - "version": "3.11.6" |
| 191 | + "version": "3.10.8" |
194 | 192 | }
|
195 | 193 | },
|
196 | 194 | "nbformat": 4,
|
|
0 commit comments