added code generation example #164

Merged
merged 3 commits on Mar 20, 2025
1,081 changes: 1,081 additions & 0 deletions data/astrapy.jsonl

Large diffs are not rendered by default.

817 changes: 817 additions & 0 deletions docs/examples/code-generation.ipynb

Large diffs are not rendered by default.

18 changes: 17 additions & 1 deletion docs/examples/index.md
@@ -10,4 +10,20 @@
It loads Wikipedia articles and traverses based on links ("mentions") and named entities (extracted from the content). It retrieves a large number of articles, groups them by community, and extracts claims from each community. The best claims are used to answer the question.

[:material-fast-forward: Lazy Graph RAG Example](lazy-graph-rag.ipynb)
</div>

- :material-code-braces-box:{ .lg .middle } __Code Generation__

---
This example notebook shows how to load documentation for Python packages into a
vector store so that it can be used to provide context to an LLM for code generation.

It uses LangChain and `langchain-graph-retriever` with a custom traversal `Strategy`
in order to improve LLM-generated code output. It shows that using GraphRAG can
provide a significant increase in quality over using either an LLM alone or standard
RAG.

GraphRAG traverses the documentation much as a software engineer would, in order to
determine how to solve a coding problem.

[:material-fast-forward: Code Generation Example](code-generation.ipynb)
</div>
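As a rough illustration of the pattern this new index entry describes, a minimal sketch of loading the fetched documentation into a vector store and traversing it with `langchain-graph-retriever` might look like the following. The embedding model, edge key, strategy parameters, and helper import path are assumptions for illustration; the notebook itself defines a custom traversal `Strategy` rather than the stock `Eager` strategy used here.

```python
# Hedged sketch only -- the edge key ("references"), strategy parameters, and
# helper import path are illustrative assumptions, not the notebook's setup.
from graph_retriever.strategies import Eager
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_graph_retriever import GraphRetriever
from langchain_openai import OpenAIEmbeddings

# Assumed public path for the helper added in this PR.
from graph_rag_example_helpers.datasets.code_generation import fetch_documents

# Embed the AstraPy documentation into an in-memory vector store.
store = InMemoryVectorStore.from_documents(
    documents=fetch_documents(),
    embedding=OpenAIEmbeddings(),
)

# Traverse outward from the best vector matches along links between
# documentation items, much as an engineer follows references between API pages.
retriever = GraphRetriever(
    store=store,
    edges=[("references", "$id")],  # assumed metadata key for doc-to-doc links
    strategy=Eager(k=8, start_k=3, max_depth=2),
)

docs = retriever.invoke("How do I create a collection and insert documents with AstraPy?")
```

The retrieved documents would then be formatted (for example with the `format_docs` helper added below) and passed to an LLM as context for code generation.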
7 changes: 7 additions & 0 deletions docs/examples/lazy-graph-rag.ipynb
@@ -71,6 +71,13 @@
"The last package -- `graph-rag-example-helpers` -- includes some helpers for setting up environment helpers and allowing the loading of wikipedia data to be restarted if it fails."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
2 changes: 2 additions & 0 deletions packages/graph-rag-example-helpers/pyproject.toml
@@ -42,6 +42,7 @@ dependencies = [
"astrapy>=1.5.2",
"backoff>=2.2.1",
"graph-retriever",
"griffe>=1.5.7",
"httpx>=0.28.1",
"langchain-core>=0.3.29",
"python-dotenv>=1.0.1",
@@ -54,6 +55,7 @@ dependencies = [
astrapy = "astrapy"
backoff = "backoff"
graph-retriever = "graph_retriever"
griffe = "griffe"
httpx = "httpx"
langchain-core = "langchain_core"
mypy = "mypy"
@@ -0,0 +1,9 @@
from ...examples.code_generation.format import add_tabs, format_docs, format_document
from .fetch import fetch_documents

__all__ = [
    "fetch_documents",
    "add_tabs",
    "format_document",
    "format_docs",
]
@@ -0,0 +1,40 @@
import json

import requests
from langchain_core.documents import Document

# TODO: revert to main branch before code generation is merged
# ASTRAPY_JSONL_URL = "https://raw.githubusercontent.com/datastax/graph-rag/refs/heads/main/data/astrapy.jsonl"
ASTRAPY_JSONL_URL = "https://raw.githubusercontent.com/datastax/graph-rag/refs/heads/code_generation/data/astrapy.jsonl"


def fetch_documents() -> list[Document]:
    """
    Download and parse a list of Documents for use with Graph Retriever.

    This dataset contains the documentation for the AstraPy project as of version 1.5.2.

    This function downloads the dataset each time -- generally it is preferable
    to invoke this only once and store the documents in memory or a vector
    store.

    Returns
    -------
    :
        The fetched astra-py documentation Documents.

    Notes
    -----
    - The dataset is set up so that the path of each item is the `id`, its pydoc
      description is the `page_content`, and its other attributes are stored in the
      `metadata`.
    - Many documents contain an `id` and `metadata`, but no `page_content`.
    """
    response = requests.get(ASTRAPY_JSONL_URL)
    response.raise_for_status()  # Ensure we got a valid response

    return [
        Document(id=data["id"], page_content=data["text"], metadata=data["metadata"])
        for line in response.text.splitlines()
        if (data := json.loads(line))
    ]
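Per the Notes above, callers will typically want to separate the entries that carry prose from the structural stubs before embedding. A minimal sketch of that split follows; how the stubs are handled (embedded, kept only as graph nodes, or dropped) is a design choice of the example notebook, not something this helper enforces.

```python
# Hedged sketch: partition the fetched docs into entries with prose and
# metadata-only stubs (id and metadata but no page_content).
docs = fetch_documents()

with_text = [doc for doc in docs if doc.page_content]
stubs = [doc for doc in docs if not doc.page_content]

print(f"{len(with_text)} documents with content, {len(stubs)} metadata-only stubs")
```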
@@ -0,0 +1,7 @@
from .format import add_tabs, format_docs, format_document

__all__ = [
    "add_tabs",
    "format_document",
    "format_docs",
]