feat: add filter for query in ts templates (#172)

--------- Co-authored-by: Marcus Schiesser <[email protected]>
run-llama · Jul 22, 2024 · bd4714c · bd4714c
1 parent 455ab68
commit bd4714c
Show file tree

Hide file tree

Showing 16 changed files with 164 additions and 106 deletions.
diff --git a/.changeset/curvy-penguins-work.md b/.changeset/curvy-penguins-work.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Filter private documents for TypeScript (using MetadataFilters) and update to LlamaIndexTS 0.5.6
diff --git a/templates/components/engines/typescript/agent/chat.ts b/templates/components/engines/typescript/agent/chat.ts
@@ -1,4 +1,10 @@
-import { BaseToolWithCall, OpenAIAgent, QueryEngineTool } from "llamaindex";
+import {
+  BaseToolWithCall,
+  MetadataFilter,
+  MetadataFilters,
+  OpenAIAgent,
+  QueryEngineTool,
+} from "llamaindex";
 import fs from "node:fs/promises";
 import path from "node:path";
 import { getDataSource } from "./index";
@@ -14,7 +20,7 @@ export async function createChatEngine(documentIds?: string[]) {
     tools.push(
       new QueryEngineTool({
         queryEngine: index.asQueryEngine({
-          preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters)
+          preFilters: generateFilters(documentIds || []),
         }),
         metadata: {
           name: "data_query_engine",
@@ -41,3 +47,27 @@ export async function createChatEngine(documentIds?: string[]) {
     systemPrompt: process.env.SYSTEM_PROMPT,
   });
 }
+
+function generateFilters(documentIds: string[]): MetadataFilters | undefined {
+  // public documents don't have the "private" field or it's set to "false"
+  const publicDocumentsFilter: MetadataFilter = {
+    key: "private",
+    value: ["true"],
+    operator: "nin",
+  };
+
+  // if no documentIds are provided, only retrieve information from public documents
+  if (!documentIds.length) return { filters: [publicDocumentsFilter] };
+
+  const privateDocumentsFilter: MetadataFilter = {
+    key: "doc_id",
+    value: documentIds,
+    operator: "in",
+  };
+
+  // if documentIds are provided, retrieve information from public and private documents
+  return {
+    filters: [publicDocumentsFilter, privateDocumentsFilter],
+    condition: "or",
+  };
+}
diff --git a/templates/components/llamaindex/typescript/streaming/annotations.ts b/templates/components/llamaindex/typescript/streaming/annotations.ts
@@ -35,9 +35,9 @@ export function retrieveDocumentIds(annotations?: JSONValue[]): string[] {
     ) {
       const files = data.files as DocumentFile[];
       for (const file of files) {
-        if (Array.isArray(file.content)) {
+        if (Array.isArray(file.content.value)) {
           // it's an array, so it's an array of doc IDs
-          for (const id of file.content) {
+          for (const id of file.content.value) {
             ids.push(id);
           }
         }

diff --git a/templates/components/llamaindex/typescript/streaming/events.ts b/templates/components/llamaindex/typescript/streaming/events.ts
@@ -8,20 +8,18 @@ import {
 } from "llamaindex";
 import { LLamaCloudFileService } from "./service";
 
-export async function appendSourceData(
+export function appendSourceData(
   data: StreamData,
   sourceNodes?: NodeWithScore<Metadata>[],
 ) {
   if (!sourceNodes?.length) return;
   try {
-    const nodes = await Promise.all(
-      sourceNodes.map(async (node) => ({
-        ...node.node.toMutableJSON(),
-        id: node.node.id_,
-        score: node.score ?? null,
-        url: await getNodeUrl(node.node.metadata),
-      })),
-    );
+    const nodes = sourceNodes.map((node) => ({
+      ...node.node.toMutableJSON(),
+      id: node.node.id_,
+      score: node.score ?? null,
+      url: getNodeUrl(node.node.metadata),
+    }));
     data.appendMessageAnnotation({
       type: "sources",
       data: {
@@ -76,18 +74,19 @@ export function createStreamTimeout(stream: StreamData) {
 export function createCallbackManager(stream: StreamData) {
   const callbackManager = new CallbackManager();
 
-  callbackManager.on("retrieve-end", async (data) => {
-    const { nodes, query } = data.detail.payload;
-    await appendSourceData(stream, nodes);
+  callbackManager.on("retrieve-end", (data) => {
+    const { nodes, query } = data.detail;
+    appendSourceData(stream, nodes);
     appendEventData(stream, `Retrieving context for query: '${query}'`);
     appendEventData(
       stream,
       `Retrieved ${nodes.length} sources to use as context for the query`,
     );
+    LLamaCloudFileService.downloadFiles(nodes); // don't await to avoid blocking chat streaming
   });
 
   callbackManager.on("llm-tool-call", (event) => {
-    const { name, input } = event.detail.payload.toolCall;
+    const { name, input } = event.detail.toolCall;
     const inputString = Object.entries(input)
       .map(([key, value]) => `${key}: ${value}`)
       .join(", ");
@@ -98,14 +97,14 @@ export function createCallbackManager(stream: StreamData) {
   });
 
   callbackManager.on("llm-tool-result", (event) => {
-    const { toolCall, toolResult } = event.detail.payload;
+    const { toolCall, toolResult } = event.detail;
     appendToolData(stream, toolCall, toolResult);
   });
 
   return callbackManager;
 }
 
-async function getNodeUrl(metadata: Metadata) {
+function getNodeUrl(metadata: Metadata) {
   if (!process.env.FILESERVER_URL_PREFIX) {
     console.warn(
       "FILESERVER_URL_PREFIX is not set. File URLs will not be generated.",
@@ -114,13 +113,11 @@ async function getNodeUrl(metadata: Metadata) {
   const fileName = metadata["file_name"];
   if (fileName && process.env.FILESERVER_URL_PREFIX) {
     // file_name exists and file server is configured
-    const isLocalFile = metadata["is_local_file"] === "true";
     const pipelineId = metadata["pipeline_id"];
-    if (pipelineId && !isLocalFile) {
+    if (pipelineId && metadata["private"] == null) {
       // file is from LlamaCloud and was not ingested locally
-      // TODO trigger but don't await file download and just use convention to generate the URL (see Python code)
-      // return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`;
-      return await LLamaCloudFileService.getFileUrl(fileName, pipelineId);
+      const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName);
+      return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`;
     }
     const isPrivate = metadata["private"] === "true";
     const folder = isPrivate ? "output/uploaded" : "data";

diff --git a/templates/components/llamaindex/typescript/streaming/service.ts b/templates/components/llamaindex/typescript/streaming/service.ts
@@ -1,86 +1,76 @@
+import { Metadata, NodeWithScore } from "llamaindex";
 import fs from "node:fs";
 import https from "node:https";
 import path from "node:path";
 
 const LLAMA_CLOUD_OUTPUT_DIR = "output/llamacloud";
 const LLAMA_CLOUD_BASE_URL = "https://cloud.llamaindex.ai/api/v1";
+const FILE_DELIMITER = "$"; // delimiter between pipelineId and filename
 
-export interface LlamaCloudFile {
+interface LlamaCloudFile {
   name: string;
   file_id: string;
   project_id: string;
 }
 
 export class LLamaCloudFileService {
-  static async getFiles(pipelineId: string): Promise<LlamaCloudFile[]> {
-    const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
-    const headers = {
-      Accept: "application/json",
-      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
-    };
-    const response = await fetch(url, { method: "GET", headers });
-    const data = await response.json();
-    return data;
+  public static async downloadFiles(nodes: NodeWithScore<Metadata>[]) {
+    const files = this.nodesToDownloadFiles(nodes);
+    if (!files.length) return;
+    console.log("Downloading files from LlamaCloud...");
+    for (const file of files) {
+      await this.downloadFile(file.pipelineId, file.fileName);
+    }
   }
 
-  static async getFileDetail(
-    projectId: string,
-    fileId: string,
-  ): Promise<{ url: string }> {
-    const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
-    const headers = {
-      Accept: "application/json",
-      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
-    };
-    const response = await fetch(url, { method: "GET", headers });
-    const data = (await response.json()) as { url: string };
-    return data;
+  public static toDownloadedName(pipelineId: string, fileName: string) {
+    return `${pipelineId}${FILE_DELIMITER}${fileName}`;
   }
 
-  static async getFileUrl(
-    name: string,
-    pipelineId: string,
-  ): Promise<string | null> {
-    try {
-      const files = await this.getFiles(pipelineId);
-      for (const file of files) {
-        if (file.name === name) {
-          const fileId = file.file_id;
-          const projectId = file.project_id;
-          const fileDetail = await this.getFileDetail(projectId, fileId);
-          const localFileUrl = this.downloadFile(fileDetail.url, fileId, name);
-          return localFileUrl;
-        }
+  /**
+   * Returns an array of the unique files to download from LlamaCloud.
+   * Only files that were uploaded directly to LlamaCloud data sources (no `private` key in their metadata) are downloaded.
+   * Files uploaded directly to LlamaCloud data sources don't have `private` in their metadata (public docs).
+   * Files uploaded from local via the `generate` command have `private=false` (public docs).
+   * Files uploaded from local via the `/chat/upload` endpoint have `private=true` (private docs).
+   *
+   * @param nodes
+   * @returns list of unique files to download
+   */
+  private static nodesToDownloadFiles(nodes: NodeWithScore<Metadata>[]) {
+    const downloadFiles: Array<{
+      pipelineId: string;
+      fileName: string;
+    }> = [];
+    for (const node of nodes) {
+      const isLocalFile = node.node.metadata["private"] != null;
+      const pipelineId = node.node.metadata["pipeline_id"];
+      const fileName = node.node.metadata["file_name"];
+      if (isLocalFile || !pipelineId || !fileName) continue;
+      const isDuplicate = downloadFiles.some(
+        (f) => f.pipelineId === pipelineId && f.fileName === fileName,
+      );
+      if (!isDuplicate) {
+        downloadFiles.push({ pipelineId, fileName });
       }
-      return null;
-    } catch (error) {
-      console.error("Error fetching file from LlamaCloud:", error);
-      return null;
     }
+    return downloadFiles;
   }
 
-  static downloadFile(url: string, fileId: string, filename: string) {
-    const FILE_DELIMITER = "$"; // delimiter between fileId and filename
-    const downloadedFileName = `${fileId}${FILE_DELIMITER}${filename}`;
-    const downloadedFilePath = path.join(
-      LLAMA_CLOUD_OUTPUT_DIR,
-      downloadedFileName,
-    );
-    const urlPrefix = `${process.env.FILESERVER_URL_PREFIX}/${LLAMA_CLOUD_OUTPUT_DIR}`;
-    const fileUrl = `${urlPrefix}/${downloadedFileName}`;
-
+  private static async downloadFile(pipelineId: string, fileName: string) {
     try {
+      const downloadedName = this.toDownloadedName(pipelineId, fileName);
+      const downloadedPath = path.join(LLAMA_CLOUD_OUTPUT_DIR, downloadedName);
+
       // Check if file already exists
-      if (fs.existsSync(downloadedFilePath)) return fileUrl;
+      if (fs.existsSync(downloadedPath)) return;
 
-      // Create directory if it doesn't exist
-      if (!fs.existsSync(LLAMA_CLOUD_OUTPUT_DIR)) {
-        fs.mkdirSync(LLAMA_CLOUD_OUTPUT_DIR, { recursive: true });
-      }
+      const urlToDownload = await this.getFileUrlByName(pipelineId, fileName);
+      if (!urlToDownload) throw new Error("File not found in LlamaCloud");
 
-      const file = fs.createWriteStream(downloadedFilePath);
+      const file = fs.createWriteStream(downloadedPath);
       https
-        .get(url, (response) => {
+        .get(urlToDownload, (response) => {
           response.pipe(file);
           file.on("finish", () => {
             file.close(() => {
@@ -89,15 +79,50 @@ export class LLamaCloudFileService {
           });
         })
         .on("error", (err) => {
-          fs.unlink(downloadedFilePath, () => {
+          fs.unlink(downloadedPath, () => {
             console.error("Error downloading file:", err);
             throw err;
           });
         });
-
-      return fileUrl;
     } catch (error) {
       throw new Error(`Error downloading file from LlamaCloud: ${error}`);
     }
   }
+
+  private static async getFileUrlByName(
+    pipelineId: string,
+    name: string,
+  ): Promise<string | null> {
+    const files = await this.getAllFiles(pipelineId);
+    const file = files.find((file) => file.name === name);
+    if (!file) return null;
+    return await this.getFileUrlById(file.project_id, file.file_id);
+  }
+
+  private static async getFileUrlById(
+    projectId: string,
+    fileId: string,
+  ): Promise<string> {
+    const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
+    const headers = {
+      Accept: "application/json",
+      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
+    };
+    const response = await fetch(url, { method: "GET", headers });
+    const data = (await response.json()) as { url: string };
+    return data.url;
+  }
+
+  private static async getAllFiles(
+    pipelineId: string,
+  ): Promise<LlamaCloudFile[]> {
+    const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
+    const headers = {
+      Accept: "application/json",
+      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
+    };
+    const response = await fetch(url, { method: "GET", headers });
+    const data = await response.json();
+    return data;
+  }
 }
diff --git a/templates/components/vectordbs/python/llamacloud/generate.py b/templates/components/vectordbs/python/llamacloud/generate.py
@@ -30,9 +30,9 @@ def generate_datasource():
 
     documents = get_documents()
 
-    # Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
+    # Set private=false to mark the document as public (required for filtering)
     for doc in documents:
-        doc.metadata["is_local_file"] = "true"
+        doc.metadata["private"] = "false"
 
     LlamaCloudIndex.from_documents(
         documents=documents,

diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py
@@ -21,6 +21,9 @@ def generate_datasource():
     storage_dir = os.environ.get("STORAGE_DIR", "storage")
     # load the documents and create the index
     documents = get_documents()
+    # Set private=false to mark the document as public (required for filtering)
+    for doc in documents:
+        doc.metadata["private"] = "false"
     index = VectorStoreIndex.from_documents(
         documents,
     )

diff --git a/templates/components/vectordbs/typescript/llamacloud/generate.ts b/templates/components/vectordbs/typescript/llamacloud/generate.ts
@@ -9,11 +9,11 @@ dotenv.config();
 
 async function loadAndIndex() {
   const documents = await getDocuments();
-  // Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
+  // Set private=false to mark the document as public (required for filtering)
   for (const document of documents) {
     document.metadata = {
       ...document.metadata,
-      is_local_file: "true",
+      private: "false",
     };
   }
   await getDataSource();

diff --git a/templates/components/vectordbs/typescript/none/generate.ts b/templates/components/vectordbs/typescript/none/generate.ts
@@ -25,6 +25,11 @@ async function generateDatasource() {
       persistDir: STORAGE_CACHE_DIR,
     });
     const documents = await getDocuments();
+    //  Set private=false to mark the document as public (required for filtering)
+    documents.forEach((doc) => {
+      doc.metadata["private"] = "false";
+    });
+
     await VectorStoreIndex.fromDocuments(documents, {
       storageContext,
     });

diff --git a/templates/types/streaming/express/package.json b/templates/types/streaming/express/package.json
@@ -20,7 +20,7 @@
     "dotenv": "^16.3.1",
     "duck-duck-scrape": "^2.2.5",
     "express": "^4.18.2",
-    "llamaindex": "0.4.14",
+    "llamaindex": "0.5.6",
     "pdf2json": "3.0.5",
     "ajv": "^8.12.0",
     "@e2b/code-interpreter": "^0.0.5",