Skip to content

Commit

Permalink
feat: add filter for query in ts templates (#172)
Browse files Browse the repository at this point in the history
---------
Co-authored-by: Marcus Schiesser <[email protected]>
  • Loading branch information
thucpn authored Jul 22, 2024
1 parent 455ab68 commit bd4714c
Show file tree
Hide file tree
Showing 16 changed files with 164 additions and 106 deletions.
5 changes: 5 additions & 0 deletions .changeset/curvy-penguins-work.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

Filter private documents for TypeScript (using MetadataFilters) and update to LlamaIndexTS 0.5.6
34 changes: 32 additions & 2 deletions templates/components/engines/typescript/agent/chat.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import { BaseToolWithCall, OpenAIAgent, QueryEngineTool } from "llamaindex";
import {
BaseToolWithCall,
MetadataFilter,
MetadataFilters,
OpenAIAgent,
QueryEngineTool,
} from "llamaindex";
import fs from "node:fs/promises";
import path from "node:path";
import { getDataSource } from "./index";
Expand All @@ -14,7 +20,7 @@ export async function createChatEngine(documentIds?: string[]) {
tools.push(
new QueryEngineTool({
queryEngine: index.asQueryEngine({
preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters)
preFilters: generateFilters(documentIds || []),
}),
metadata: {
name: "data_query_engine",
Expand All @@ -41,3 +47,27 @@ export async function createChatEngine(documentIds?: string[]) {
systemPrompt: process.env.SYSTEM_PROMPT,
});
}

/**
 * Builds the metadata filters passed as `preFilters` to the query engine.
 *
 * - No document IDs: only the "not private" filter is returned.
 * - With document IDs: the "not private" filter is OR-ed with a filter that
 *   matches the requested `doc_id` values.
 *
 * @param documentIds IDs of the documents the request is allowed to read in
 *   addition to public documents; may be empty.
 * @returns the filter set described above.
 */
function generateFilters(documentIds: string[]): MetadataFilters | undefined {
  // Public documents either lack the "private" field or have it set to "false",
  // so they are selected by excluding private === "true".
  const excludePrivate: MetadataFilter = {
    key: "private",
    value: ["true"],
    operator: "nin",
  };

  if (documentIds.length > 0) {
    // Explicitly requested documents are retrieved alongside the public ones.
    const matchRequestedDocs: MetadataFilter = {
      key: "doc_id",
      value: documentIds,
      operator: "in",
    };
    return {
      filters: [excludePrivate, matchRequestedDocs],
      condition: "or",
    };
  }

  // Nothing was requested explicitly: restrict retrieval to public documents.
  return { filters: [excludePrivate] };
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ export function retrieveDocumentIds(annotations?: JSONValue[]): string[] {
) {
const files = data.files as DocumentFile[];
for (const file of files) {
if (Array.isArray(file.content)) {
if (Array.isArray(file.content.value)) {
// it's an array, so it's an array of doc IDs
for (const id of file.content) {
for (const id of file.content.value) {
ids.push(id);
}
}
Expand Down
37 changes: 17 additions & 20 deletions templates/components/llamaindex/typescript/streaming/events.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,18 @@ import {
} from "llamaindex";
import { LLamaCloudFileService } from "./service";

export async function appendSourceData(
export function appendSourceData(
data: StreamData,
sourceNodes?: NodeWithScore<Metadata>[],
) {
if (!sourceNodes?.length) return;
try {
const nodes = await Promise.all(
sourceNodes.map(async (node) => ({
...node.node.toMutableJSON(),
id: node.node.id_,
score: node.score ?? null,
url: await getNodeUrl(node.node.metadata),
})),
);
const nodes = sourceNodes.map((node) => ({
...node.node.toMutableJSON(),
id: node.node.id_,
score: node.score ?? null,
url: getNodeUrl(node.node.metadata),
}));
data.appendMessageAnnotation({
type: "sources",
data: {
Expand Down Expand Up @@ -76,18 +74,19 @@ export function createStreamTimeout(stream: StreamData) {
export function createCallbackManager(stream: StreamData) {
const callbackManager = new CallbackManager();

callbackManager.on("retrieve-end", async (data) => {
const { nodes, query } = data.detail.payload;
await appendSourceData(stream, nodes);
callbackManager.on("retrieve-end", (data) => {
const { nodes, query } = data.detail;
appendSourceData(stream, nodes);
appendEventData(stream, `Retrieving context for query: '${query}'`);
appendEventData(
stream,
`Retrieved ${nodes.length} sources to use as context for the query`,
);
LLamaCloudFileService.downloadFiles(nodes); // don't await to avoid blocking chat streaming
});

callbackManager.on("llm-tool-call", (event) => {
const { name, input } = event.detail.payload.toolCall;
const { name, input } = event.detail.toolCall;
const inputString = Object.entries(input)
.map(([key, value]) => `${key}: ${value}`)
.join(", ");
Expand All @@ -98,14 +97,14 @@ export function createCallbackManager(stream: StreamData) {
});

callbackManager.on("llm-tool-result", (event) => {
const { toolCall, toolResult } = event.detail.payload;
const { toolCall, toolResult } = event.detail;
appendToolData(stream, toolCall, toolResult);
});

return callbackManager;
}

async function getNodeUrl(metadata: Metadata) {
function getNodeUrl(metadata: Metadata) {
if (!process.env.FILESERVER_URL_PREFIX) {
console.warn(
"FILESERVER_URL_PREFIX is not set. File URLs will not be generated.",
Expand All @@ -114,13 +113,11 @@ async function getNodeUrl(metadata: Metadata) {
const fileName = metadata["file_name"];
if (fileName && process.env.FILESERVER_URL_PREFIX) {
// file_name exists and file server is configured
const isLocalFile = metadata["is_local_file"] === "true";
const pipelineId = metadata["pipeline_id"];
if (pipelineId && !isLocalFile) {
if (pipelineId && metadata["private"] == null) {
// file is from LlamaCloud and was not ingested locally
// TODO trigger but don't await file download and just use convention to generate the URL (see Python code)
// return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`;
return await LLamaCloudFileService.getFileUrl(fileName, pipelineId);
const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName);
return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`;
}
const isPrivate = metadata["private"] === "true";
const folder = isPrivate ? "output/uploaded" : "data";
Expand Down
145 changes: 85 additions & 60 deletions templates/components/llamaindex/typescript/streaming/service.ts
Original file line number Diff line number Diff line change
@@ -1,86 +1,76 @@
import { Metadata, NodeWithScore } from "llamaindex";
import fs from "node:fs";
import https from "node:https";
import path from "node:path";

const LLAMA_CLOUD_OUTPUT_DIR = "output/llamacloud";
const LLAMA_CLOUD_BASE_URL = "https://cloud.llamaindex.ai/api/v1";
const FILE_DELIMITER = "$"; // delimiter between pipelineId and filename

export interface LlamaCloudFile {
interface LlamaCloudFile {
name: string;
file_id: string;
project_id: string;
}

export class LLamaCloudFileService {
static async getFiles(pipelineId: string): Promise<LlamaCloudFile[]> {
const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
const headers = {
Accept: "application/json",
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
};
const response = await fetch(url, { method: "GET", headers });
const data = await response.json();
return data;
/**
 * Downloads the LlamaCloud files referenced by the given nodes, one file at a time.
 *
 * A failed download is logged and the loop continues with the next file, so
 * download errors do not reject the returned promise. This matters because the
 * method is meant to be started without awaiting it (to avoid blocking chat
 * streaming); a rejection would then surface as an unhandled promise rejection.
 *
 * @param nodes retrieved nodes whose metadata determines which files to download
 */
public static async downloadFiles(nodes: NodeWithScore<Metadata>[]) {
  const files = this.nodesToDownloadFiles(nodes);
  if (!files.length) return;
  console.log("Downloading files from LlamaCloud...");
  for (const file of files) {
    try {
      await this.downloadFile(file.pipelineId, file.fileName);
    } catch (error) {
      // One failed file must not abort the remaining downloads.
      console.error(
        `Failed to download file "${file.fileName}" from LlamaCloud:`,
        error,
      );
    }
  }
}

static async getFileDetail(
projectId: string,
fileId: string,
): Promise<{ url: string }> {
const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
const headers = {
Accept: "application/json",
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
};
const response = await fetch(url, { method: "GET", headers });
const data = (await response.json()) as { url: string };
return data;
/**
 * Builds the name under which a LlamaCloud file is stored locally:
 * the pipeline ID, then FILE_DELIMITER, then the original file name.
 *
 * @param pipelineId ID of the pipeline the file belongs to
 * @param fileName original name of the file
 * @returns the combined local file name
 */
public static toDownloadedName(pipelineId: string, fileName: string) {
  const pipelinePrefix = `${pipelineId}${FILE_DELIMITER}`;
  return `${pipelinePrefix}${fileName}`;
}

static async getFileUrl(
name: string,
pipelineId: string,
): Promise<string | null> {
try {
const files = await this.getFiles(pipelineId);
for (const file of files) {
if (file.name === name) {
const fileId = file.file_id;
const projectId = file.project_id;
const fileDetail = await this.getFileDetail(projectId, fileId);
const localFileUrl = this.downloadFile(fileDetail.url, fileId, name);
return localFileUrl;
}
/**
* Returns the list of unique files to download from LlamaCloud.
* Only files uploaded directly to LlamaCloud data sources are downloaded (they don't have `private` in their metadata):
* - Files uploaded directly to LlamaCloud data sources don't have `private` in their metadata (public docs)
* - Files uploaded locally via the `generate` command have `private=false` (public docs)
* - Files uploaded locally via the `/chat/upload` endpoint have `private=true` (private docs)
*
* @param nodes
* @returns list of unique files to download
*/
private static nodesToDownloadFiles(nodes: NodeWithScore<Metadata>[]) {
const downloadFiles: Array<{
pipelineId: string;
fileName: string;
}> = [];
for (const node of nodes) {
const isLocalFile = node.node.metadata["private"] != null;
const pipelineId = node.node.metadata["pipeline_id"];
const fileName = node.node.metadata["file_name"];
if (isLocalFile || !pipelineId || !fileName) continue;
const isDuplicate = downloadFiles.some(
(f) => f.pipelineId === pipelineId && f.fileName === fileName,
);
if (!isDuplicate) {
downloadFiles.push({ pipelineId, fileName });
}
return null;
} catch (error) {
console.error("Error fetching file from LlamaCloud:", error);
return null;
}
return downloadFiles;
}

static downloadFile(url: string, fileId: string, filename: string) {
const FILE_DELIMITER = "$"; // delimiter between fileId and filename
const downloadedFileName = `${fileId}${FILE_DELIMITER}${filename}`;
const downloadedFilePath = path.join(
LLAMA_CLOUD_OUTPUT_DIR,
downloadedFileName,
);
const urlPrefix = `${process.env.FILESERVER_URL_PREFIX}/${LLAMA_CLOUD_OUTPUT_DIR}`;
const fileUrl = `${urlPrefix}/${downloadedFileName}`;

private static async downloadFile(pipelineId: string, fileName: string) {
try {
const downloadedName = this.toDownloadedName(pipelineId, fileName);
const downloadedPath = path.join(LLAMA_CLOUD_OUTPUT_DIR, downloadedName);

// Check if file already exists
if (fs.existsSync(downloadedFilePath)) return fileUrl;
if (fs.existsSync(downloadedPath)) return;

// Create directory if it doesn't exist
if (!fs.existsSync(LLAMA_CLOUD_OUTPUT_DIR)) {
fs.mkdirSync(LLAMA_CLOUD_OUTPUT_DIR, { recursive: true });
}
const urlToDownload = await this.getFileUrlByName(pipelineId, fileName);
if (!urlToDownload) throw new Error("File not found in LlamaCloud");

const file = fs.createWriteStream(downloadedFilePath);
const file = fs.createWriteStream(downloadedPath);
https
.get(url, (response) => {
.get(urlToDownload, (response) => {
response.pipe(file);
file.on("finish", () => {
file.close(() => {
Expand All @@ -89,15 +79,50 @@ export class LLamaCloudFileService {
});
})
.on("error", (err) => {
fs.unlink(downloadedFilePath, () => {
fs.unlink(downloadedPath, () => {
console.error("Error downloading file:", err);
throw err;
});
});

return fileUrl;
} catch (error) {
throw new Error(`Error downloading file from LlamaCloud: ${error}`);
}
}

/**
 * Looks up a file by name among the files of a pipeline and resolves its
 * content URL.
 *
 * @param pipelineId ID of the pipeline whose files are searched
 * @param name file name to match exactly
 * @returns the content URL of the first file with that name, or `null` when
 *   no file matches
 */
private static async getFileUrlByName(
  pipelineId: string,
  name: string,
): Promise<string | null> {
  const pipelineFiles = await this.getAllFiles(pipelineId);
  for (const candidate of pipelineFiles) {
    if (candidate.name === name) {
      // First match wins, same as a linear search over the list.
      return await this.getFileUrlById(candidate.project_id, candidate.file_id);
    }
  }
  return null;
}

/**
 * Fetches the content URL of a LlamaCloud file.
 *
 * Sends an authenticated GET request (bearer token from `LLAMA_CLOUD_API_KEY`)
 * to the file-content endpoint and reads the `url` field of the JSON response.
 *
 * @param projectId ID of the project the file belongs to
 * @param fileId ID of the file
 * @returns the URL reported by the API
 * @throws Error when the request fails with a non-2xx status or the response
 *   does not contain a `url` string
 */
private static async getFileUrlById(
  projectId: string,
  fileId: string,
): Promise<string> {
  const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
  const headers = {
    Accept: "application/json",
    Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
  };
  const response = await fetch(url, { method: "GET", headers });
  // fetch() only rejects on network errors; HTTP errors must be checked explicitly.
  if (!response.ok) {
    throw new Error(
      `Failed to get content URL for file ${fileId}: ${response.status} ${response.statusText}`,
    );
  }
  const data = (await response.json()) as { url?: unknown } | null;
  const fileUrl = data?.url;
  if (typeof fileUrl !== "string" || !fileUrl) {
    // Fail here with a clear message instead of returning a non-string as the URL.
    throw new Error(`LlamaCloud returned no content URL for file ${fileId}`);
  }
  return fileUrl;
}

/**
 * Lists the files of a LlamaCloud pipeline.
 *
 * Sends an authenticated GET request (bearer token from `LLAMA_CLOUD_API_KEY`)
 * to the pipeline-files endpoint.
 *
 * @param pipelineId ID of the pipeline whose files are listed
 * @returns the files reported by the API
 * @throws Error when the request fails with a non-2xx status or the response
 *   body is not an array
 */
private static async getAllFiles(
  pipelineId: string,
): Promise<LlamaCloudFile[]> {
  const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
  const headers = {
    Accept: "application/json",
    Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
  };
  const response = await fetch(url, { method: "GET", headers });
  // fetch() only rejects on network errors; HTTP errors must be checked explicitly.
  if (!response.ok) {
    throw new Error(
      `Failed to list files of pipeline ${pipelineId}: ${response.status} ${response.statusText}`,
    );
  }
  const data: unknown = await response.json();
  if (!Array.isArray(data)) {
    // An error payload would otherwise be handed to callers as the file list.
    throw new Error(
      `Unexpected response when listing files of pipeline ${pipelineId}`,
    );
  }
  return data as LlamaCloudFile[];
}
}
4 changes: 2 additions & 2 deletions templates/components/vectordbs/python/llamacloud/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def generate_datasource():

documents = get_documents()

# Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
# Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["is_local_file"] = "true"
doc.metadata["private"] = "false"

LlamaCloudIndex.from_documents(
documents=documents,
Expand Down
3 changes: 3 additions & 0 deletions templates/components/vectordbs/python/none/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def generate_datasource():
storage_dir = os.environ.get("STORAGE_DIR", "storage")
# load the documents and create the index
documents = get_documents()
# Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["private"] = "false"
index = VectorStoreIndex.from_documents(
documents,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ dotenv.config();

async function loadAndIndex() {
const documents = await getDocuments();
// Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
// Set private=false to mark the document as public (required for filtering)
for (const document of documents) {
document.metadata = {
...document.metadata,
is_local_file: "true",
private: "false",
};
}
await getDataSource();
Expand Down
5 changes: 5 additions & 0 deletions templates/components/vectordbs/typescript/none/generate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ async function generateDatasource() {
persistDir: STORAGE_CACHE_DIR,
});
const documents = await getDocuments();
// Set private=false to mark the document as public (required for filtering)
documents.forEach((doc) => {
doc.metadata["private"] = "false";
});

await VectorStoreIndex.fromDocuments(documents, {
storageContext,
});
Expand Down
2 changes: 1 addition & 1 deletion templates/types/streaming/express/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"dotenv": "^16.3.1",
"duck-duck-scrape": "^2.2.5",
"express": "^4.18.2",
"llamaindex": "0.4.14",
"llamaindex": "0.5.6",
"pdf2json": "3.0.5",
"ajv": "^8.12.0",
"@e2b/code-interpreter": "^0.0.5",
Expand Down
Loading

0 comments on commit bd4714c

Please sign in to comment.