@@ -12,6 +12,7 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
12
12
import { nanoid } from "nanoid" ;
13
13
import { UnstructuredClient } from "unstructured-client" ;
14
14
import type { DatasWithFileSource , FilePath , ProcessorType , URL } from "./database" ;
15
+ import { LlamaParseReader } from "llamaindex" ;
15
16
16
17
type Element = {
17
18
type : string ;
@@ -37,51 +38,7 @@ export class FileDataLoader {
37
38
38
39
private async createLoader ( args : any ) {
39
40
if ( hasProcessor ( this . config ) ) {
40
- const client = new UnstructuredClient ( {
41
- serverURL : "https://api.unstructuredapp.io" ,
42
- security : {
43
- apiKeyAuth : this . config . processor . options . apiKey ,
44
- } ,
45
- } ) ;
46
-
47
- //@ts -expect-error TS can't pick up the correct type due to complex union
48
- const fileData = await Bun . file ( this . config . fileSource ) . text ( ) ;
49
- const response = await client . general . partition ( {
50
- //@ts -expect-error Will be fixed soon
51
- partitionParameters : {
52
- files : {
53
- content : fileData ,
54
- //@ts -expect-error TS can't pick up the correct type due to complex union
55
- fileName : this . config . fileSource ,
56
- } ,
57
- ...this . config . processor . options ,
58
- } ,
59
- } ) ;
60
- const elements = response . elements ?. filter (
61
- ( element ) => typeof element . text === "string"
62
- ) as Element [ ] ;
63
-
64
- return {
65
- // eslint-disable-next-line @typescript-eslint/require-await
66
- load : async ( ) : Promise < Document [ ] > => {
67
- const documents : Document [ ] = [ ] ;
68
- for ( const element of elements ) {
69
- const { metadata, text } = element ;
70
- if ( typeof text === "string" && text !== "" ) {
71
- documents . push (
72
- new Document ( {
73
- pageContent : text ,
74
- metadata : {
75
- ...metadata ,
76
- category : element . type ,
77
- } ,
78
- } )
79
- ) ;
80
- }
81
- }
82
- return documents ;
83
- } ,
84
- } ;
41
+ return await this . createLoaderForProcessors ( ) ;
85
42
}
86
43
switch ( this . config . type ) {
87
44
case "pdf" : {
@@ -115,6 +72,84 @@ export class FileDataLoader {
115
72
}
116
73
}
117
74
75
/**
 * Builds a loader for configs that declare an external processor.
 *
 * The file is sent to the selected processor immediately, while this method
 * runs; the returned object's `load()` only converts the already-received
 * result into `Document` instances, skipping entries whose text is empty.
 *
 * @returns An object with an async `load()` that resolves to the documents.
 * @throws Error if the config has no processor, or if the processor name is
 *   not one of the cases handled below.
 */
private async createLoaderForProcessors() {
  // Without this check typescript complains about types because of unions
  if (!hasProcessor(this.config)) throw new Error("Only processors are allowed");

  switch (this.config.processor.name) {
    case "unstructured": {
      const client = new UnstructuredClient({
        serverURL: "https://api.unstructuredapp.io",
        security: {
          apiKeyAuth: this.config.processor.options.apiKey,
        },
      });

      // NOTE(review): the file is read as text; confirm this is safe for
      // binary inputs before relying on it for non-text formats.
      //@ts-expect-error TS can't pick up the correct type due to complex union
      const fileData = await Bun.file(this.config.fileSource).text();
      const response = await client.general.partition({
        //@ts-expect-error Will be fixed soon
        partitionParameters: {
          files: {
            content: fileData,
            //@ts-expect-error TS can't pick up the correct type due to complex union
            fileName: this.config.fileSource,
          },
          ...this.config.processor.options,
        },
      });
      // `elements` is accessed optionally, so it may be absent. Fall back to
      // an empty list so that `load()` yields no documents instead of throwing
      // when iterating `undefined`.
      const elements = (response.elements?.filter(
        (element) => typeof element.text === "string"
      ) ?? []) as Element[];

      return {
        // eslint-disable-next-line @typescript-eslint/require-await
        load: async (): Promise<Document[]> => {
          const documents: Document[] = [];
          for (const element of elements) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
              documents.push(
                new Document({
                  pageContent: text,
                  metadata: {
                    ...metadata,
                    // Keep the element type alongside the processor metadata.
                    category: element.type,
                  },
                })
              );
            }
          }
          return documents;
        },
      };
    }
    case "llama-parse": {
      const reader = new LlamaParseReader(this.config.processor.options);
      //@ts-expect-error TS can't pick up the correct type due to complex union
      const parsedDocuments = await reader.loadData(this.config.fileSource);
      return {
        // eslint-disable-next-line @typescript-eslint/require-await
        load: async (): Promise<Document[]> => {
          const documents: Document[] = [];
          for (const element of parsedDocuments) {
            const { metadata, text } = element;
            if (typeof text === "string" && text !== "") {
              documents.push(
                new Document({
                  pageContent: text,
                  metadata,
                })
              );
            }
          }
          return documents;
        },
      };
    }
    default: {
      // Fail loudly rather than implicitly returning `undefined`, which would
      // only surface later when the caller tries to use the loader.
      throw new Error("Unsupported processor");
    }
  }
}
152
+
118
153
/**
 * Type guard for the file source: true only when `source` is a string that
 * begins with "http". Any non-string source is reported as not a URL.
 */
private isURL(source: FilePath | Blob): source is URL {
  if (typeof source !== "string") {
    return false;
  }
  return source.startsWith("http");
}
@@ -158,6 +193,7 @@ export class FileDataLoader {
158
193
return mapDocumentsIntoInsertPayload ( newDocuments ) ;
159
194
}
160
195
196
+ // Processors will be handled here. E.g. "unstructured", "llama-parse"
161
197
case undefined : {
162
198
const documents_ = documents . map (
163
199
( item ) => new Document ( { pageContent : item . pageContent , metadata : item . metadata } )
0 commit comments