Skip to content

Commit d15f1a3

Browse files
Merge pull request #70 from upstash/add-llama-parse
feat: add llama parse
2 parents 10c90ea + 76e6891 commit d15f1a3

File tree

4 files changed

+89
-49
lines changed

4 files changed

+89
-49
lines changed

bun.lockb

178 KB
Binary file not shown.

package.json

+1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"d3-dsv": "^3.0.1",
6969
"html-to-text": "^9.0.5",
7070
"langchain": "^0.2.0",
71+
"llamaindex": "^0.5.20",
7172
"nanoid": "^5.0.7",
7273
"pdf-parse": "^1.1.1",
7374
"unstructured-client": "^0.15.1"

src/database.ts

+7-4
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,17 @@ import { DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_TOP_K } from "./constants";
66
import { FileDataLoader } from "./file-loader";
77
import type { AddContextOptions } from "./types";
88
import type { UnstructuredLoaderOptions } from "@langchain/community/document_loaders/fs/unstructured";
9+
import type { LlamaParseReader } from "llamaindex";
910

1011
export type FilePath = string;
1112
export type URL = string;
1213

13-
export type ProcessorType = {
14-
name: "unstructured";
15-
options: UnstructuredLoaderOptions;
16-
};
14+
/**
 * External processor selection for file sources.
 *
 * Discriminated by `name`; `options` carries the settings forwarded to the
 * matching processor:
 * - `"unstructured"`: `UnstructuredLoaderOptions`
 * - `"llama-parse"`: any subset of `LlamaParseReader` properties
 */
export type ProcessorType =
  | { name: "unstructured"; options: UnstructuredLoaderOptions }
  | { name: "llama-parse"; options: Partial<LlamaParseReader> };
1720

1821
export type DatasWithFileSource =
1922
| {

src/file-loader.ts

+81-45
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
1212
import { nanoid } from "nanoid";
1313
import { UnstructuredClient } from "unstructured-client";
1414
import type { DatasWithFileSource, FilePath, ProcessorType, URL } from "./database";
15+
import { LlamaParseReader } from "llamaindex";
1516

1617
type Element = {
1718
type: string;
@@ -37,51 +38,7 @@ export class FileDataLoader {
3738

3839
private async createLoader(args: any) {
3940
if (hasProcessor(this.config)) {
40-
const client = new UnstructuredClient({
41-
serverURL: "https://api.unstructuredapp.io",
42-
security: {
43-
apiKeyAuth: this.config.processor.options.apiKey,
44-
},
45-
});
46-
47-
//@ts-expect-error TS can't pick up the correct type due to complex union
48-
const fileData = await Bun.file(this.config.fileSource).text();
49-
const response = await client.general.partition({
50-
//@ts-expect-error Will be fixed soon
51-
partitionParameters: {
52-
files: {
53-
content: fileData,
54-
//@ts-expect-error TS can't pick up the correct type due to complex union
55-
fileName: this.config.fileSource,
56-
},
57-
...this.config.processor.options,
58-
},
59-
});
60-
const elements = response.elements?.filter(
61-
(element) => typeof element.text === "string"
62-
) as Element[];
63-
64-
return {
65-
// eslint-disable-next-line @typescript-eslint/require-await
66-
load: async (): Promise<Document[]> => {
67-
const documents: Document[] = [];
68-
for (const element of elements) {
69-
const { metadata, text } = element;
70-
if (typeof text === "string" && text !== "") {
71-
documents.push(
72-
new Document({
73-
pageContent: text,
74-
metadata: {
75-
...metadata,
76-
category: element.type,
77-
},
78-
})
79-
);
80-
}
81-
}
82-
return documents;
83-
},
84-
};
41+
return await this.createLoaderForProcessors();
8542
}
8643
switch (this.config.type) {
8744
case "pdf": {
@@ -115,6 +72,84 @@ export class FileDataLoader {
11572
}
11673
}
11774

75+
/**
 * Builds a document loader backed by the external processor configured on
 * `this.config` ("unstructured" or "llama-parse").
 *
 * The file is handed to the processor while this method is awaited; the
 * returned object's `load()` only converts the already-fetched results into
 * `Document`s, skipping entries whose text is missing or empty.
 *
 * @returns An object exposing an async `load()` that resolves to the documents.
 * @throws Error when `this.config` has no processor, or when the processor
 *   name is not one of the supported values.
 */
private async createLoaderForProcessors() {
  // Without this check typescript complains about types because of unions
  if (!hasProcessor(this.config)) throw new Error("Only processors are allowed");

  switch (this.config.processor.name) {
    case "unstructured": {
      const client = new UnstructuredClient({
        serverURL: "https://api.unstructuredapp.io",
        security: {
          apiKeyAuth: this.config.processor.options.apiKey,
        },
      });

      // NOTE(review): the file is read as text before upload; confirm this is
      // intended for binary formats (e.g. PDF), which may not survive decoding.
      //@ts-expect-error TS can't pick up the correct type due to complex union
      const fileData = await Bun.file(this.config.fileSource).text();
      const response = await client.general.partition({
        //@ts-expect-error Will be fixed soon
        partitionParameters: {
          files: {
            content: fileData,
            //@ts-expect-error TS can't pick up the correct type due to complex union
            fileName: this.config.fileSource,
          },
          ...this.config.processor.options,
        },
      });
      // `elements` may be absent on the response; fall back to an empty list so
      // `load()` yields no documents instead of throwing while iterating `undefined`.
      const elements = (response.elements ?? []).filter(
        (element) => typeof element.text === "string"
      ) as Element[];

      return {
        // eslint-disable-next-line @typescript-eslint/require-await
        load: async (): Promise<Document[]> => {
          const documents: Document[] = [];
          for (const element of elements) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
              documents.push(
                new Document({
                  pageContent: text,
                  metadata: {
                    ...metadata,
                    // Keep the element type alongside the processor's metadata.
                    category: element.type,
                  },
                })
              );
            }
          }
          return documents;
        },
      };
    }
    case "llama-parse": {
      const reader = new LlamaParseReader(this.config.processor.options);
      //@ts-expect-error TS can't pick up the correct type due to complex union
      const parsedDocuments = await reader.loadData(this.config.fileSource);
      return {
        // eslint-disable-next-line @typescript-eslint/require-await
        load: async (): Promise<Document[]> => {
          const documents: Document[] = [];
          for (const element of parsedDocuments) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
              documents.push(
                new Document({
                  pageContent: text,
                  metadata,
                })
              );
            }
          }
          return documents;
        },
      };
    }
    default: {
      // Reachable only when an untyped caller passes an unknown processor name.
      // Fail loudly rather than returning `undefined` as the loader. Only the
      // name is reported, so processor options (API keys) never reach the message.
      const unsupportedName = (this.config.processor as { name: string }).name;
      throw new Error(`Unsupported processor: ${unsupportedName}`);
    }
  }
}
152+
118153
/**
 * Type guard: reports whether `source` is a string that begins with "http".
 * Non-string sources (such as a `Blob`) are never treated as URLs.
 */
private isURL(source: FilePath | Blob): source is URL {
  if (typeof source !== "string") return false;
  return source.startsWith("http");
}
@@ -158,6 +193,7 @@ export class FileDataLoader {
158193
return mapDocumentsIntoInsertPayload(newDocuments);
159194
}
160195

196+
// Processors will be handled here. E.g. "unstructured", "llama-parse"
161197
case undefined: {
162198
const documents_ = documents.map(
163199
(item) => new Document({ pageContent: item.pageContent, metadata: item.metadata })

0 commit comments

Comments
 (0)