feat: add filter for query in ts templates #172

Merged · 17 commits · Jul 22, 2024
5 changes: 5 additions & 0 deletions .changeset/curvy-penguins-work.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

add filter for query in ts templates
7 changes: 7 additions & 0 deletions helpers/typescript.ts
@@ -55,6 +55,12 @@ export const installTSTemplate = async ({
nextConfigJson.output = "export";
nextConfigJson.images = { unoptimized: true };
console.log("\nUsing static site generation\n");

// for static site generation (no backend), overwrite next.config.mjs with next.config.simple.mjs
await fs.copyFile(
path.join(root, "next.config.simple.mjs"),
path.join(root, "next.config.mjs"),
);
} else {
if (vectorDb === "milvus") {
nextConfigJson.experimental.serverComponentsExternalPackages =
@@ -64,6 +70,7 @@ export const installTSTemplate = async ({
);
}
}
await fs.rm(path.join(root, "next.config.simple.mjs"));
await fs.writeFile(
nextConfigJsonFile,
JSON.stringify(nextConfigJson, null, 2) + os.EOL,
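In short, a sketch of the resulting config handling, as implied by this hunk:

// static export (frontend only): next.config.simple.mjs is copied over next.config.mjs
// with a backend: next.config.mjs is kept as-is
// in both cases: the template-only next.config.simple.mjs is then removed via fs.rm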
34 changes: 32 additions & 2 deletions templates/components/engines/typescript/agent/chat.ts
@@ -1,4 +1,10 @@
import { BaseToolWithCall, OpenAIAgent, QueryEngineTool } from "llamaindex";
import {
BaseToolWithCall,
MetadataFilter,
MetadataFilters,
OpenAIAgent,
QueryEngineTool,
} from "llamaindex";
import fs from "node:fs/promises";
import path from "node:path";
import { getDataSource } from "./index";
@@ -14,7 +20,7 @@ export async function createChatEngine(documentIds?: string[]) {
tools.push(
new QueryEngineTool({
queryEngine: index.asQueryEngine({
preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters)
preFilters: generateFilters(documentIds || []),
}),
metadata: {
name: "data_query_engine",
@@ -41,3 +47,27 @@ export async function createChatEngine(documentIds?: string[]) {
systemPrompt: process.env.SYSTEM_PROMPT,
});
}

function generateFilters(documentIds: string[]): MetadataFilters | undefined {
// public documents don't have the "private" field or it's set to "false"
const publicDocumentsFilter: MetadataFilter = {
key: "private",
value: ["true"],
operator: "nin",
};

// if no documentIds are provided, only retrieve information from public documents
if (!documentIds.length) return { filters: [publicDocumentsFilter] };

const privateDocumentsFilter: MetadataFilter = {
key: "doc_id",
value: documentIds,
operator: "in",
};

// if documentIds are provided, retrieve information from public and private documents
return {
filters: [publicDocumentsFilter, privateDocumentsFilter],
condition: "or",
};
}
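For reference, a quick sketch of what `generateFilters` returns in each case (the document IDs are illustrative):

// No document IDs: retrieve from public documents only.
generateFilters([]);
// => { filters: [{ key: "private", value: ["true"], operator: "nin" }] }

// With document IDs: retrieve from public documents OR the selected private documents.
generateFilters(["doc-1", "doc-2"]);
// => {
//      filters: [
//        { key: "private", value: ["true"], operator: "nin" },
//        { key: "doc_id", value: ["doc-1", "doc-2"], operator: "in" },
//      ],
//      condition: "or",
//    }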
@@ -18,6 +18,7 @@ export async function runPipeline(documents: Document[], filename: string) {
for (const document of documents) {
document.metadata = {
...document.metadata,
doc_id: document.id_,
file_name: filename,
private: "true", // to separate from other public documents
};
@@ -35,9 +35,9 @@ export function retrieveDocumentIds(annotations?: JSONValue[]): string[] {
) {
const files = data.files as DocumentFile[];
for (const file of files) {
if (Array.isArray(file.content)) {
if (Array.isArray(file.content.value)) {
// it's an array, so it's an array of doc IDs
for (const id of file.content) {
for (const id of file.content.value) {
ids.push(id);
}
}
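The switch from `file.content` to `file.content.value` implies the `DocumentFile` annotation now wraps its payload in an object. A hypothetical shape consistent with this hunk (the real `DocumentFile` type is defined elsewhere and not shown in the diff):

// Hypothetical types inferred from the hunk above; every field name except
// `content` and `value` is an assumption.
interface DocumentFileContent {
  type: string; // e.g. a discriminator for what `value` holds
  value: string | string[]; // string[] => an array of doc IDs to collect
}
interface DocumentFile {
  content: DocumentFileContent;
}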
37 changes: 17 additions & 20 deletions templates/components/llamaindex/typescript/streaming/events.ts
@@ -8,20 +8,18 @@ import {
} from "llamaindex";
import { LLamaCloudFileService } from "./service";

export async function appendSourceData(
export function appendSourceData(
data: StreamData,
sourceNodes?: NodeWithScore<Metadata>[],
) {
if (!sourceNodes?.length) return;
try {
const nodes = await Promise.all(
sourceNodes.map(async (node) => ({
...node.node.toMutableJSON(),
id: node.node.id_,
score: node.score ?? null,
url: await getNodeUrl(node.node.metadata),
})),
);
const nodes = sourceNodes.map((node) => ({
...node.node.toMutableJSON(),
id: node.node.id_,
score: node.score ?? null,
url: getNodeUrl(node.node.metadata),
}));
data.appendMessageAnnotation({
type: "sources",
data: {
@@ -76,18 +74,19 @@ export function createStreamTimeout(stream: StreamData) {
export function createCallbackManager(stream: StreamData) {
const callbackManager = new CallbackManager();

callbackManager.on("retrieve-end", async (data) => {
const { nodes, query } = data.detail.payload;
await appendSourceData(stream, nodes);
callbackManager.on("retrieve-end", (data) => {
const { nodes, query } = data.detail;
appendSourceData(stream, nodes);
appendEventData(stream, `Retrieving context for query: '${query}'`);
appendEventData(
stream,
`Retrieved ${nodes.length} sources to use as context for the query`,
);
LLamaCloudFileService.downloadFiles(nodes); // don't await to avoid blocking chat streaming
});

callbackManager.on("llm-tool-call", (event) => {
const { name, input } = event.detail.payload.toolCall;
const { name, input } = event.detail.toolCall;
const inputString = Object.entries(input)
.map(([key, value]) => `${key}: ${value}`)
.join(", ");
@@ -98,14 +97,14 @@ export function createCallbackManager(stream: StreamData) {
});

callbackManager.on("llm-tool-result", (event) => {
const { toolCall, toolResult } = event.detail.payload;
const { toolCall, toolResult } = event.detail;
appendToolData(stream, toolCall, toolResult);
});

return callbackManager;
}

async function getNodeUrl(metadata: Metadata) {
function getNodeUrl(metadata: Metadata) {
if (!process.env.FILESERVER_URL_PREFIX) {
console.warn(
"FILESERVER_URL_PREFIX is not set. File URLs will not be generated.",
@@ -114,13 +113,11 @@ async function getNodeUrl(metadata: Metadata) {
const fileName = metadata["file_name"];
if (fileName && process.env.FILESERVER_URL_PREFIX) {
// file_name exists and file server is configured
const isLocalFile = metadata["is_local_file"] === "true";
const pipelineId = metadata["pipeline_id"];
if (pipelineId && !isLocalFile) {
if (pipelineId && metadata["private"] == null) {
// file is from LlamaCloud and was not ingested locally
// TODO trigger but don't await file download and just use convention to generate the URL (see Python code)
// return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`;
return await LLamaCloudFileService.getFileUrl(fileName, pipelineId);
const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName);
return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`;
}
const isPrivate = metadata["private"] === "true";
const folder = isPrivate ? "output/uploaded" : "data";
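Taken together, `getNodeUrl` is now synchronous and builds file URLs by naming convention instead of querying LlamaCloud per node. Assuming `FILESERVER_URL_PREFIX` is set to `http://localhost:3000/api/files`, the three cases presumably resolve to:

// LlamaCloud file (no "private" metadata), pipeline "pl-1", file "report.pdf":
//   http://localhost:3000/api/files/output/llamacloud/pl-1$report.pdf
// Locally uploaded private file (private === "true"):
//   http://localhost:3000/api/files/output/uploaded/report.pdf
// Locally generated public file (private === "false"):
//   http://localhost:3000/api/files/data/report.pdf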
145 changes: 85 additions & 60 deletions templates/components/llamaindex/typescript/streaming/service.ts
@@ -1,86 +1,76 @@
import { Metadata, NodeWithScore } from "llamaindex";
import fs from "node:fs";
import https from "node:https";
import path from "node:path";

const LLAMA_CLOUD_OUTPUT_DIR = "output/llamacloud";
const LLAMA_CLOUD_BASE_URL = "https://cloud.llamaindex.ai/api/v1";
const FILE_DELIMITER = "$"; // delimiter between pipelineId and filename

export interface LlamaCloudFile {
interface LlamaCloudFile {
name: string;
file_id: string;
project_id: string;
}

export class LLamaCloudFileService {
static async getFiles(pipelineId: string): Promise<LlamaCloudFile[]> {
const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
const headers = {
Accept: "application/json",
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
};
const response = await fetch(url, { method: "GET", headers });
const data = await response.json();
return data;
public static async downloadFiles(nodes: NodeWithScore<Metadata>[]) {
const files = this.nodesToDownloadFiles(nodes);
if (!files.length) return;
console.log("Downloading files from LlamaCloud...");
for (const file of files) {
await this.downloadFile(file.pipelineId, file.fileName);
}
}

static async getFileDetail(
projectId: string,
fileId: string,
): Promise<{ url: string }> {
const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
const headers = {
Accept: "application/json",
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
};
const response = await fetch(url, { method: "GET", headers });
const data = (await response.json()) as { url: string };
return data;
public static toDownloadedName(pipelineId: string, fileName: string) {
return `${pipelineId}${FILE_DELIMITER}${fileName}`;
}

static async getFileUrl(
name: string,
pipelineId: string,
): Promise<string | null> {
try {
const files = await this.getFiles(pipelineId);
for (const file of files) {
if (file.name === name) {
const fileId = file.file_id;
const projectId = file.project_id;
const fileDetail = await this.getFileDetail(projectId, fileId);
const localFileUrl = this.downloadFile(fileDetail.url, fileId, name);
return localFileUrl;
}
/**
* This function returns an array of unique files to download from LlamaCloud.
* We only download files that were uploaded directly to LlamaCloud datasources; those have no `private` key in their metadata (public docs).
* Files uploaded locally via the `generate` command have `private=false` (public docs).
* Files uploaded locally via the `/chat/upload` endpoint have `private=true` (private docs).
*
* @param nodes
* @returns list of unique files to download
*/
private static nodesToDownloadFiles(nodes: NodeWithScore<Metadata>[]) {
const downloadFiles: Array<{
pipelineId: string;
fileName: string;
}> = [];
for (const node of nodes) {
const isLocalFile = node.node.metadata["private"] != null;
const pipelineId = node.node.metadata["pipeline_id"];
const fileName = node.node.metadata["file_name"];
if (isLocalFile || !pipelineId || !fileName) continue;
const isDuplicate = downloadFiles.some(
(f) => f.pipelineId === pipelineId && f.fileName === fileName,
);
if (!isDuplicate) {
downloadFiles.push({ pipelineId, fileName });
}
return null;
} catch (error) {
console.error("Error fetching file from LlamaCloud:", error);
return null;
}
return downloadFiles;
}

static downloadFile(url: string, fileId: string, filename: string) {
const FILE_DELIMITER = "$"; // delimiter between fileId and filename
const downloadedFileName = `${fileId}${FILE_DELIMITER}${filename}`;
const downloadedFilePath = path.join(
LLAMA_CLOUD_OUTPUT_DIR,
downloadedFileName,
);
const urlPrefix = `${process.env.FILESERVER_URL_PREFIX}/${LLAMA_CLOUD_OUTPUT_DIR}`;
const fileUrl = `${urlPrefix}/${downloadedFileName}`;

private static async downloadFile(pipelineId: string, fileName: string) {
try {
const downloadedName = this.toDownloadedName(pipelineId, fileName);
const downloadedPath = path.join(LLAMA_CLOUD_OUTPUT_DIR, downloadedName);

// Check if file already exists
if (fs.existsSync(downloadedFilePath)) return fileUrl;
if (fs.existsSync(downloadedPath)) return;

// Create directory if it doesn't exist
if (!fs.existsSync(LLAMA_CLOUD_OUTPUT_DIR)) {
fs.mkdirSync(LLAMA_CLOUD_OUTPUT_DIR, { recursive: true });
}
const urlToDownload = await this.getFileUrlByName(pipelineId, fileName);
if (!urlToDownload) throw new Error("File not found in LlamaCloud");

const file = fs.createWriteStream(downloadedFilePath);
const file = fs.createWriteStream(downloadedPath);
https
.get(url, (response) => {
.get(urlToDownload, (response) => {
response.pipe(file);
file.on("finish", () => {
file.close(() => {
@@ -89,15 +79,50 @@ export class LLamaCloudFileService {
});
})
.on("error", (err) => {
fs.unlink(downloadedFilePath, () => {
fs.unlink(downloadedPath, () => {
console.error("Error downloading file:", err);
throw err;
});
});

return fileUrl;
} catch (error) {
throw new Error(`Error downloading file from LlamaCloud: ${error}`);
}
}

private static async getFileUrlByName(
pipelineId: string,
name: string,
): Promise<string | null> {
const files = await this.getAllFiles(pipelineId);
const file = files.find((file) => file.name === name);
if (!file) return null;
return await this.getFileUrlById(file.project_id, file.file_id);
}

private static async getFileUrlById(
projectId: string,
fileId: string,
): Promise<string> {
const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
const headers = {
Accept: "application/json",
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
};
const response = await fetch(url, { method: "GET", headers });
const data = (await response.json()) as { url: string };
return data.url;
}

private static async getAllFiles(
pipelineId: string,
): Promise<LlamaCloudFile[]> {
const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
const headers = {
Accept: "application/json",
Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
};
const response = await fetch(url, { method: "GET", headers });
const data = await response.json();
return data;
}
}
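As a usage sketch, `nodesToDownloadFiles` deduplicates by `(pipelineId, fileName)` and skips locally ingested nodes; the metadata values below are illustrative:

// Illustrative retrieved nodes; only the metadata matters here.
const nodes = [
  { node: { metadata: { pipeline_id: "pl-1", file_name: "a.pdf" } } },
  { node: { metadata: { pipeline_id: "pl-1", file_name: "a.pdf" } } }, // duplicate => skipped
  { node: { metadata: { private: "false", file_name: "b.pdf" } } }, // ingested locally => skipped
] as unknown as NodeWithScore<Metadata>[];

// Downloads "pl-1$a.pdf" into output/llamacloud once; the retrieve-end callback
// calls this without awaiting so chat streaming isn't blocked.
void LLamaCloudFileService.downloadFiles(nodes);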
4 changes: 2 additions & 2 deletions templates/components/vectordbs/python/llamacloud/generate.py
@@ -30,9 +30,9 @@ def generate_datasource():

documents = get_documents()

# Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
# Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["is_local_file"] = "true"
doc.metadata["private"] = "false"

LlamaCloudIndex.from_documents(
documents=documents,
3 changes: 3 additions & 0 deletions templates/components/vectordbs/python/none/generate.py
@@ -21,6 +21,9 @@ def generate_datasource():
storage_dir = os.environ.get("STORAGE_DIR", "storage")
# load the documents and create the index
documents = get_documents()
# Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["private"] = "false"
index = VectorStoreIndex.from_documents(
documents,
)