diff --git a/notebooks/Quickstart.ipynb b/notebooks/Quickstart.ipynb
new file mode 100644
index 00000000..1dde95e4
--- /dev/null
+++ b/notebooks/Quickstart.ipynb
@@ -0,0 +1,445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# GraphRAG Quickstart"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prerequisite installs to run the quickstart notebook\n",
+    "Install third-party packages that are not part of the Python Standard Library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install devtools pandas python-magic requests tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import json\n",
+    "import sys\n",
+    "import time\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import magic\n",
+    "import pandas as pd\n",
+    "import requests\n",
+    "from devtools import pprint\n",
+    "from tqdm import tqdm"
+   ]
+  },
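+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check: the helper functions below use `X | Y` union type hints (PEP 604), which are evaluated when the functions are defined and require Python 3.10 or newer, so it is worth failing fast here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The `requests.Response | list[Path]` style annotations used below need\n",
+    "# Python 3.10+; fail fast rather than midway through the notebook.\n",
+    "assert sys.version_info >= (3, 10), \"This notebook requires Python 3.10+\""
+   ]
+  },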
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Configuration - API Key, file directories and API endpoints"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Get API Key for API Management Service\n",
+    "For authentication, the API requires a *subscription key* to be passed in the header of all requests. To find this key, visit the Azure Portal and open the APIM instance in your GraphRAG resource group. The API subscription key is located under `<your APIM resource> --> Subscriptions --> <your subscription> --> Primary Key`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ocp_apim_subscription_key = getpass.getpass(\n",
+    "    \"Enter the subscription key to the GraphRAG APIM:\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Set up directories and API endpoint\n",
+    "\n",
+    "The following parameters are required to access and use the GraphRAG solution accelerator API:\n",
+    "* file_directory\n",
+    "* storage_name\n",
+    "* index_name\n",
+    "* apim_url\n",
+    "\n",
+    "For demonstration purposes, you may use the provided `get-wiki-articles.py` script to download a small set of Wikipedia articles, or provide your own data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "These parameters must be defined by the user:\n",
+    "\n",
+    "- file_directory: local directory where data files of interest are stored.\n",
+    "- storage_name: unique name for an Azure blob storage container where files will be uploaded.\n",
+    "- index_name: unique name for a single knowledge graph construction. Multiple indexes can be created from the same blob container of data.\n",
+    "- apim_url: the endpoint URL for the GraphRAG service (this is the Gateway URL found in the APIM resource).\n",
+    "\"\"\"\n",
+    "\n",
+    "file_directory = \"\"\n",
+    "storage_name = \"\"\n",
+    "index_name = \"\"\n",
+    "apim_url = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert (\n",
+    "    file_directory != \"\" and storage_name != \"\" and index_name != \"\" and apim_url != \"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "\"Ocp-Apim-Subscription-Key\":\n",
+    "    This is a custom HTTP header used by the Azure API Management service (APIM) to\n",
+    "    authenticate API requests. The value for this key should be set to the subscription\n",
+    "    key provided by the Azure APIM instance in your GraphRAG resource group.\n",
+    "\"\"\"\n",
+    "\n",
+    "headers = {\"Ocp-Apim-Subscription-Key\": ocp_apim_subscription_key}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Upload Data Files to Storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def upload_files(\n",
+    "    file_directory: str,\n",
+    "    storage_name: str,\n",
+    "    batch_size: int = 100,\n",
+    "    overwrite: bool = True,\n",
+    "    max_retries: int = 5,\n",
+    ") -> requests.Response | list[Path]:\n",
+    "    \"\"\"\n",
+    "    Upload files to a blob storage container.\n",
+    "\n",
+    "    Args:\n",
+    "        file_directory - a local directory of .txt files to upload. All files must be in utf-8 encoding.\n",
+    "        storage_name - a unique name for the Azure storage container.\n",
+    "        batch_size - the number of files to upload in a single batch.\n",
+    "        overwrite - whether or not to overwrite files if they already exist in the storage container.\n",
+    "        max_retries - the maximum number of times to retry uploading a batch of files if the API is busy.\n",
+    "\n",
+    "    NOTE: Uploading files may sometimes fail if the blob container was recently deleted\n",
+    "    (i.e. a few seconds before). The solution in practice is to sleep a few seconds and try again.\n",
+    "    \"\"\"\n",
+    "    url = apim_url + \"/data\"\n",
+    "\n",
+    "    def upload_batch(\n",
+    "        files: list, storage_name: str, overwrite: bool, max_retries: int\n",
+    "    ) -> requests.Response:\n",
+    "        for _ in range(max_retries):\n",
+    "            response = requests.post(\n",
+    "                url=url,\n",
+    "                files=files,\n",
+    "                params={\"storage_name\": storage_name, \"overwrite\": overwrite},\n",
+    "                headers=headers,\n",
+    "            )\n",
+    "            # API may be busy, retry\n",
+    "            if response.status_code == 500:\n",
+    "                print(\"API busy. Sleeping and will try again.\")\n",
+    "                time.sleep(10)\n",
+    "                continue\n",
+    "            return response\n",
+    "        return response\n",
+    "\n",
+    "    batch_files = []\n",
+    "    accepted_file_types = [\"text/plain\"]\n",
+    "    filepaths = list(Path(file_directory).iterdir())\n",
+    "    for file in tqdm(filepaths):\n",
+    "        # validate that the path is a file with a .txt extension and an accepted mime type\n",
+    "        if (\n",
+    "            not file.is_file()\n",
+    "            or file.suffix != \".txt\"\n",
+    "            or magic.from_file(str(file), mime=True) not in accepted_file_types\n",
+    "        ):\n",
+    "            print(f\"Skipping invalid file: {file}\")\n",
+    "            continue\n",
+    "        # open and decode file as utf-8, ignoring bad characters\n",
+    "        batch_files.append(\n",
+    "            (\"files\", open(file=file, mode=\"r\", encoding=\"utf-8\", errors=\"ignore\"))\n",
+    "        )\n",
+    "        # upload batch of files\n",
+    "        if len(batch_files) == batch_size:\n",
+    "            response = upload_batch(batch_files, storage_name, overwrite, max_retries)\n",
+    "            # if response is not ok, return early\n",
+    "            if not response.ok:\n",
+    "                return response\n",
+    "            batch_files.clear()\n",
+    "    # upload remaining files\n",
+    "    if len(batch_files) > 0:\n",
+    "        response = upload_batch(batch_files, storage_name, overwrite, max_retries)\n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = upload_files(\n",
+    "    file_directory=file_directory,\n",
+    "    storage_name=storage_name,\n",
+    "    batch_size=100,\n",
+    "    overwrite=True,\n",
+    ")\n",
+    "if not response.ok:\n",
+    "    print(response.text)\n",
+    "else:\n",
+    "    print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create an Index\n",
+    "\n",
+    "After data files have been uploaded, it is now possible to construct a knowledge graph by creating a search index. If an entity configuration is not provided, a default entity configuration that has been shown to generally work well will be used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_index(\n",
+    "    storage_name: str,\n",
+    "    index_name: str,\n",
+    ") -> requests.Response:\n",
+    "    \"\"\"Create a search index.\n",
+    "\n",
+    "    This function kicks off a job that builds a knowledge graph (KG) index from files located in a blob storage container.\n",
+    "    \"\"\"\n",
+    "    url = apim_url + \"/index\"\n",
+    "    request = {\"storage_name\": storage_name, \"index_name\": index_name}\n",
+    "    return requests.post(url, json=request, headers=headers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = build_index(\n",
+    "    storage_name=storage_name,\n",
+    "    index_name=index_name,\n",
+    ")\n",
+    "print(response)\n",
+    "if response.ok:\n",
+    "    print(response.text)\n",
+    "else:\n",
+    "    print(f\"Failed to submit job.\\nStatus: {response.text}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Check the status of an indexing job\n",
+    "\n",
+    "Please wait for your index to reach 100 percent completion before continuing to the next section to run queries."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def index_status(index_name: str) -> requests.Response:\n",
+    "    url = apim_url + f\"/index/status/{index_name}\"\n",
+    "    return requests.get(url, headers=headers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = index_status(index_name)\n",
+    "\n",
+    "pprint(response.json())"
+   ]
+  },
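+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, poll the status endpoint until the job finishes rather than re-running the cell above by hand. This is a minimal sketch: it assumes the status payload includes a `percent_complete` field; check the `pprint` output above and adjust the field name if your deployment reports progress differently."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal polling sketch. ASSUMPTION: the status JSON contains a\n",
+    "# \"percent_complete\" field -- verify against the pprint output above.\n",
+    "def wait_for_index(index_name: str, poll_seconds: int = 30) -> None:\n",
+    "    while True:\n",
+    "        status = index_status(index_name).json()\n",
+    "        percent = status.get(\"percent_complete\", 0)\n",
+    "        print(f\"Indexing progress: {percent}%\")\n",
+    "        if percent >= 100:\n",
+    "            break\n",
+    "        time.sleep(poll_seconds)\n",
+    "\n",
+    "\n",
+    "# wait_for_index(index_name)  # uncomment to block until the index is ready"
+   ]
+  },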
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Query\n",
+    "\n",
+    "After an indexing job has completed, the knowledge graph is ready to query. Two types of queries (global and local) are currently supported. In addition, you can issue a query over a single index or across multiple indexes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Helper function to parse the result out of a query response.\"\"\"\n",
+    "\n",
+    "\n",
+    "def parse_query_response(\n",
+    "    response: requests.Response, return_context_data: bool = False\n",
+    ") -> requests.Response | dict[str, list[dict]]:\n",
+    "    \"\"\"\n",
+    "    Print the response['result'] value and optionally\n",
+    "    return the associated context data.\n",
+    "    \"\"\"\n",
+    "    if response.ok:\n",
+    "        print(json.loads(response.text)[\"result\"])\n",
+    "        if return_context_data:\n",
+    "            return json.loads(response.text)[\"context_data\"]\n",
+    "        return response\n",
+    "    else:\n",
+    "        print(response.reason)\n",
+    "        print(response.content)\n",
+    "        return response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Global Query\n",
+    "\n",
+    "Global search queries are resource-intensive, but give good responses to questions that require an understanding of the dataset as a whole."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def global_search(index_name: str | list[str], query: str) -> requests.Response:\n",
+    "    \"\"\"Run a global query over the knowledge graph(s) associated with one or more indexes.\"\"\"\n",
+    "    url = apim_url + \"/query/global\"\n",
+    "    request = {\"index_name\": index_name, \"query\": query}\n",
+    "    return requests.post(url, json=request, headers=headers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# pass a single index name as a string, or a list to query across\n",
+    "# multiple indexes, e.g. index_name=[\"myindex1\", \"myindex2\"]\n",
+    "global_response = global_search(\n",
+    "    index_name=index_name, query=\"Summarize the main topics of this data\"\n",
+    ")\n",
+    "# print the result and save context data in a variable\n",
+    "global_response_data = parse_query_response(global_response, return_context_data=True)\n",
+    "global_response_data"
+   ]
+  },
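+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The context data returned alongside the answer is a mapping of table names to lists of records (see `parse_query_response` above), which is convenient to inspect as tables. A small sketch using the `pandas` package installed earlier; the table names present depend on your data, so this simply iterates over whatever keys are returned."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# View each context-data table as a DataFrame. ASSUMPTION: context data\n",
+    "# has the dict[str, list[dict]] shape returned by parse_query_response.\n",
+    "if isinstance(global_response_data, dict):\n",
+    "    for table_name, records in global_response_data.items():\n",
+    "        print(f\"--- {table_name} ---\")\n",
+    "        display(pd.DataFrame(records).head())"
+   ]
+  },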
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Local Query\n",
+    "\n",
+    "Local search queries are best suited for narrowly focused questions that require an understanding of specific entities mentioned in the documents (e.g. \"What are the healing properties of chamomile?\")."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def local_search(index_name: str | list[str], query: str) -> requests.Response:\n",
+    "    \"\"\"Run a local query over the knowledge graph(s) associated with one or more indexes.\"\"\"\n",
+    "    url = apim_url + \"/query/local\"\n",
+    "    request = {\"index_name\": index_name, \"query\": query}\n",
+    "    return requests.post(url, json=request, headers=headers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# pass a single index name as a string, or a list to query across\n",
+    "# multiple indexes, e.g. index_name=[\"myindex1\", \"myindex2\"]\n",
+    "local_response = local_search(\n",
+    "    index_name=index_name, query=\"Who are the primary actors in these communities?\"\n",
+    ")\n",
+    "# print the result and save context data in a variable\n",
+    "local_response_data = parse_query_response(local_response, return_context_data=True)\n",
+    "local_response_data"
+   ]
+  },
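+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Both query helpers also accept a list of index names, as noted in the comments above. As a runnable sketch, the list below contains only the single index built in this notebook; append additional index names (the example name in the comment is hypothetical) to search several knowledge graphs at once."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Query across multiple indexes by passing a list of index names.\n",
+    "# Shown with a one-element list so it runs as-is; extend the list with\n",
+    "# other existing indexes, e.g. \"my_other_index\" (hypothetical).\n",
+    "multi_index_response = global_search(\n",
+    "    index_name=[index_name],\n",
+    "    query=\"Summarize the main topics of this data\",\n",
+    ")\n",
+    "parse_query_response(multi_index_response)"
+   ]
+  }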
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}