Skip to content

Commit 8242697

Browse files
authored
Merge pull request #986 from WolframResearch/feature/misc-rag-updates
Updated RAG framework to allow easier addition of new sources
2 parents 67eef22 + 6a7766d commit 8242697

18 files changed

+429
-143
lines changed

.gitattributes

Lines changed: 0 additions & 1 deletion
This file was deleted.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
Assets/VectorDatabases/**/*.usearch
33
Assets/VectorDatabases/**/*.wxf
44
build
5+
Developer/VectorDatabases/SourceData/*.jsonl
56
Source/Chatbook/64Bit/Chatbook.mx

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"CICD",
1313
"Componentwise",
1414
"Connor",
15+
"datarepository",
1516
"deepseek",
1617
"Deflatten",
1718
"Deinitialization",
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<|
2+
"Name" -> "DataRepositoryURIs",
3+
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/DataRepositoryURIs/1.0.0/DataRepositoryURIs.jsonl" ]
4+
|>

Developer/VectorDatabases/SourceData/DocumentationURIs.jsonl

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<|
2+
"Name" -> "DocumentationURIs",
3+
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/DocumentationURIs/1.3.0/DocumentationURIs.jsonl" ]
4+
|>
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<|
2+
"Name" -> "FunctionRepositoryURIs",
3+
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/FunctionRepositoryURIs/1.0.0/FunctionRepositoryURIs.jsonl" ]
4+
|>

Developer/VectorDatabases/SourceData/WolframAlphaQueries.jsonl

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<|
2+
"Name" -> "WolframAlphaQueries",
3+
"Location" -> CloudObject[ "https://www.wolframcloud.com/obj/wolframai-content/VectorDatabases/WolframAlphaQueries/1.3.0/WolframAlphaQueries.jsonl" ]
4+
|>

Developer/VectorDatabases/VectorDatabaseBuilder.wl

Lines changed: 118 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@ $$vectorDatabase = _VectorDatabaseObject? System`Private`ValidQ;
4848
(* ::**************************************************************************************************************:: *)
4949
(* ::Subsection::Closed:: *)
5050
(*Vector Databases*)
51-
$vectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName, "SourceData" };
52-
$vectorDBTargetDirectory = FileNameJoin @ { DirectoryName[ $InputFileName, 3 ], "Assets", "VectorDatabases" };
(* Default location of source data: the "SourceData" folder next to this file. *)
51+
$defaultVectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName, "SourceData" };
(* Delayed assignment, so the directory is obtained from getVectorDBSourceDirectory each time this symbol is used. *)
52+
$vectorDBSourceDirectory := getVectorDBSourceDirectory[ ];
(* Build output location: "Assets/VectorDatabases" under the path three DirectoryName levels above this file. *)
53+
$vectorDBTargetDirectory = FileNameJoin @ { DirectoryName[ $InputFileName, 3 ], "Assets", "VectorDatabases" };
5354

5455
$incrementalBuildBatchSize = 512;
5556
$dbConnectivity = 16;
@@ -86,29 +87,36 @@ $embeddingCache = <| |>;
8687
ImportVectorDatabaseData // ClearAll;
8788

8889
ImportVectorDatabaseData[ name_String ] :=
89-
Enclose @ Module[ { file, data },
90-
file = ConfirmBy[ FileNameJoin @ { $vectorDBSourceDirectory, name<>".jsonl" }, FileExistsQ, "File" ];
91-
data = ConfirmMatch[ jsonlImport @ file, { ___Association? AssociationQ }, "Data" ];
92-
data
90+
Enclose @ Module[ { file },
91+
file = ConfirmBy[ getVectorDBSourceFile @ name, FileExistsQ, "File" ];
92+
ImportVectorDatabaseData @ File @ file
9393
];
9494

(* Imports vector database data directly from the given File. *)
(* The imported result must be a (possibly empty) list of associations; otherwise Enclose returns a failure tagged "Data". *)
(* jsonlImport is defined outside this view; presumably it reads one JSON object per line -- TODO confirm. *)
95+
ImportVectorDatabaseData[ file_File ] :=
96+
Enclose @ ConfirmMatch[ jsonlImport @ file, { ___Association? AssociationQ }, "Data" ];
97+
9598
(* ::**************************************************************************************************************:: *)
9699
(* ::Subsection::Closed:: *)
97100
(*ExportVectorDatabaseData*)
98101
ExportVectorDatabaseData // ClearAll;
99102

100-
ExportVectorDatabaseData[ name_String, data0_List ] :=
101-
Enclose @ Module[ { data, dir, file },
102-
data = ConfirmBy[ toDBData @ data0, dbDataQ, "Data" ];
(* Exports data for the named vector database to <name>.jsonl inside $vectorDBSourceDirectory. *)
(* This form only resolves the target path; the export itself is delegated to the File form of this function. *)
(* Any failed confirmation is returned as a failure by Enclose. *)
103+
ExportVectorDatabaseData[ name_String, data_List ] :=
104+
Enclose @ Module[ { dir, file },
(* ensureDirectory is defined outside this view; its result must be an existing directory. *)
103105
dir = ConfirmBy[ ensureDirectory @ $vectorDBSourceDirectory, DirectoryQ, "Directory" ];
104106
file = ConfirmBy[ FileNameJoin @ { dir, name<>".jsonl" }, StringQ, "File" ];
107+
ExportVectorDatabaseData[ File @ file, data ]
108+
];
109+
(* Exports data to an explicit File target. *)
(* The input list is first passed through toDBData and must then satisfy dbDataQ. *)
(* Both helpers are defined outside this view; presumably they normalize and validate the entries -- TODO confirm. *)
(* Returns the result of jsonlExport, which must satisfy FileExistsQ; otherwise Enclose returns a failure. *)
110+
ExportVectorDatabaseData[ file_File, data0_List ] :=
111+
Enclose @ Module[ { data },
112+
data = ConfirmBy[ toDBData @ data0, dbDataQ, "Data" ];
105113
ConfirmBy[ jsonlExport[ file, data ], FileExistsQ, "Export" ]
106114
];
107115

108116
(* ::**************************************************************************************************************:: *)
109117
(* ::Subsection::Closed:: *)
110118
(*AddToVectorDatabaseData*)
111-
AddToVectorDatabaseData // beginDefinition;
119+
AddToVectorDatabaseData // ClearAll;
112120
AddToVectorDatabaseData // Options = { "Tag" -> "TextLiteral", "Rebuild" -> False };
113121

114122
AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] :=
@@ -128,8 +136,6 @@ AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] :=
128136
<| "Exported" -> exported, "Rebuilt" -> rebuilt |>
129137
];
130138

131-
AddToVectorDatabaseData // endDefinition;
132-
133139
(* ::**************************************************************************************************************:: *)
134140
(* ::Subsection::Closed:: *)
135141
(*BuildVectorDatabase*)
@@ -147,7 +153,7 @@ BuildVectorDatabase[ All, opts: OptionsPattern[ ] ] :=
147153
$dbExpansionAdd = OptionValue[ "ExpansionAdd" ],
148154
$dbExpansionSearch = OptionValue[ "ExpansionSearch" ]
149155
},
150-
AssociationMap[ BuildVectorDatabase, FileBaseName /@ FileNames[ "*.jsonl", $vectorDBSourceDirectory ] ]
156+
AssociationMap[ BuildVectorDatabase, FileBaseName /@ getVectorDBSourceFile @ All ]
151157
];
152158

153159
BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[
@@ -169,13 +175,13 @@ BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[
169175
buildVectorDatabase // ClearAll;
170176

171177
buildVectorDatabase[ name_String ] :=
172-
Enclose @ Catch @ Module[ { dir, rel, src, db, valueBag, count, n, stream, values },
178+
Enclose @ Catch @ Module[ { dir, rel, src, db, valueBag, count, n, stream, values, built },
173179

174180
loadEmbeddingCache[ ];
175181

176182
dir = ConfirmBy[ ensureDirectory @ { $vectorDBTargetDirectory, name }, DirectoryQ, "Directory" ];
177183
rel = ConfirmBy[ ResourceFunction[ "RelativePath" ][ dir ], DirectoryQ, "Relative" ];
178-
src = ConfirmBy[ FileNameJoin @ { $vectorDBSourceDirectory, name<>".jsonl" }, FileExistsQ, "File" ];
184+
src = ConfirmBy[ getVectorDBSourceFile @ name, FileExistsQ, "File" ];
179185

180186
DeleteFile /@ FileNames[ { "*.wxf", "*.usearch" }, dir ];
181187
ConfirmAssert[ FileNames[ { "*.wxf", "*.usearch" }, dir ] === { }, "ClearedFilesCheck" ];
@@ -198,59 +204,65 @@ buildVectorDatabase[ name_String ] :=
198204
valueBag = Internal`Bag[ ];
199205
count = ConfirmMatch[ lineCount @ src, _Integer? Positive, "LineCount" ];
200206
n = 0;
201-
stream = ConfirmMatch[ OpenRead @ src, _InputStream, "Stream" ];
202-
203-
withProgress[
204-
While[
205-
NumericArrayQ @ ConfirmMatch[ addBatch[ db, stream, valueBag ], _NumericArray|EndOfFile, "Add" ],
206-
n = Internal`BagLength @ valueBag
207-
],
208-
<|
209-
"Text" -> "Building database \""<>name<>"\"",
210-
"ElapsedTime" -> Automatic,
211-
"RemainingTime" -> Automatic,
212-
"ItemTotal" :> count,
213-
"ItemCurrent" :> n,
214-
"Progress" :> Automatic
215-
|>,
216-
"Delay" -> 0,
217-
UpdateInterval -> 1
218-
];
219-
220-
saveEmbeddingCache[ ];
221-
222-
values = Internal`BagPart[ valueBag, All ];
223-
224-
ConfirmAssert[ Length @ values === count, "ValueCount" ];
225-
ConfirmAssert[ First @ db[ "Dimensions" ] === count, "VectorCount" ];
226-
227-
ConfirmBy[
228-
writeWXFFile[ FileNameJoin @ { dir, "Values.wxf" }, values, PerformanceGoal -> "Size" ],
229-
FileExistsQ,
230-
"Values"
231-
];
207+
WithCleanup[
208+
stream = ConfirmMatch[ OpenRead @ src, _InputStream, "Stream" ],
232209

233-
ConfirmBy[
234-
writeWXFFile[
235-
FileNameJoin @ { dir, "EmbeddingInformation.wxf" },
210+
withProgress[
211+
While[
212+
NumericArrayQ @ ConfirmMatch[ addBatch[ db, stream, valueBag ], _NumericArray|EndOfFile, "Add" ],
213+
n = Internal`BagLength @ valueBag
214+
],
236215
<|
237-
"Dimension" -> $embeddingDimension,
238-
"Type" -> $embeddingType,
239-
"Model" -> $embeddingModel,
240-
"Service" -> $embeddingService
241-
|>
216+
"Text" -> "Building database \""<>name<>"\"",
217+
"ElapsedTime" -> Automatic,
218+
"RemainingTime" -> Automatic,
219+
"ItemTotal" :> count,
220+
"ItemCurrent" :> n,
221+
"Progress" :> Automatic
222+
|>,
223+
"Delay" -> 0,
224+
UpdateInterval -> 1
225+
];
226+
227+
saveEmbeddingCache[ ];
228+
229+
values = Internal`BagPart[ valueBag, All ];
230+
231+
ConfirmBy[ rewriteDBData[ rel, name ], FileExistsQ, "Rewrite" ];
232+
233+
built = ConfirmMatch[
234+
VectorDatabaseObject @ File @ FileNameJoin @ { rel, name <> ".wxf" },
235+
$$vectorDatabase,
236+
"Result"
237+
];
238+
239+
ConfirmAssert[ Length @ values === count, "ValueCount" ];
240+
ConfirmAssert[ First @ built[ "Dimensions" ] === count, "VectorCount" ];
241+
242+
ConfirmBy[
243+
writeWXFFile[ FileNameJoin @ { dir, "Values.wxf" }, values, PerformanceGoal -> "Size" ],
244+
FileExistsQ,
245+
"Values"
246+
];
247+
248+
ConfirmBy[
249+
writeWXFFile[
250+
FileNameJoin @ { dir, "EmbeddingInformation.wxf" },
251+
<|
252+
"Dimension" -> $embeddingDimension,
253+
"Type" -> $embeddingType,
254+
"Model" -> $embeddingModel,
255+
"Service" -> $embeddingService
256+
|>
257+
],
258+
FileExistsQ,
259+
"EmbeddingInformation"
242260
],
243-
FileExistsQ,
244-
"EmbeddingInformation"
245-
];
246261

247-
ConfirmBy[ rewriteDBData[ rel, name ], FileExistsQ, "Rewrite" ];
262+
Close @ stream
263+
];
248264

249-
ConfirmMatch[
250-
VectorDatabaseObject @ File @ FileNameJoin @ { rel, name <> ".wxf" },
251-
$$vectorDatabase,
252-
"Result"
253-
]
265+
ConfirmMatch[ built, $$vectorDatabase, "Result" ]
254266
];
255267

256268
(* ::**************************************************************************************************************:: *)
@@ -274,7 +286,7 @@ setDBDefaults[ dir_, name_String ] :=
274286
addBatch // ClearAll;
275287

276288
addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] :=
277-
Enclose @ Catch @ Module[ { batch, text, values, embeddings },
289+
Enclose @ Catch @ Module[ { batch, text, values, embeddings, added },
278290

279291
batch = ConfirmMatch[
280292
readJSONLines[ stream, $incrementalBuildBatchSize ],
@@ -289,9 +301,9 @@ addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] :
289301
values = ConfirmMatch[ batch[[ All, "Value" ]], { __ }, "Values" ];
290302
embeddings = ConfirmBy[ $lastEmbedding = GetEmbedding @ text, NumericArrayQ, "Embeddings" ];
291303
ConfirmAssert[ Length @ values === Length @ embeddings, "LengthCheck" ];
292-
Confirm[ $lastAdded = AddToVectorDatabase[ db, embeddings ], "AddToVectorDatabase" ];
304+
added = Confirm[ $lastAdded = AddToVectorDatabase[ db, embeddings ], "AddToVectorDatabase" ];
293305
Internal`StuffBag[ valueBag, values, 1 ];
294-
ConfirmMatch[ db[ "Dimensions" ], { Internal`BagLength @ valueBag, $embeddingDimension }, "DimensionCheck" ];
306+
ConfirmMatch[ added[ "Dimensions" ], { Internal`BagLength @ valueBag, $embeddingDimension }, "DimensionCheck" ];
295307
embeddings
296308
];
297309

@@ -729,6 +741,46 @@ embeddingHash[ string_String ] :=
729741
(* ::Section::Closed:: *)
730742
(*Misc Utilities*)
731743

744+
(* ::**************************************************************************************************************:: *)
745+
(* ::Subsection::Closed:: *)
746+
(*getVectorDBSourceDirectory*)
(* Resolves the directory that contains the vector database source data. *)
(* The first candidate that satisfies DirectoryQ is used: *)
(*   1. the value of PersistentSymbol "ChatbookDeveloper/VectorDatabaseSourceDirectory" *)
(*   2. $defaultVectorDBSourceDirectory *)
(* A successful result is stored as a definition for getVectorDBSourceDirectory[ ], so later calls reuse it. *)
(* If no candidate is a directory, Confirm fails on $Failed and Enclose returns a failure; nothing is stored. *)
747+
getVectorDBSourceDirectory // ClearAll;
748+

749+
getVectorDBSourceDirectory[ ] := Enclose[
750+
getVectorDBSourceDirectory[ ] = Confirm @ SelectFirst[
751+
{
(* ReleaseHold suggests the persistent value may be stored in held form -- TODO confirm how it is set. *)
752+
ReleaseHold @ PersistentSymbol[ "ChatbookDeveloper/VectorDatabaseSourceDirectory" ],
(* Both list elements are evaluated before SelectFirst runs, so EnsureDirectory is called on the default *)
(* directory even when the persistent value is usable. Presumably it creates the directory if missing. *)
753+
GeneralUtilities`EnsureDirectory @ $defaultVectorDBSourceDirectory
754+
},
755+
DirectoryQ,
756+
$Failed
757+
]
758+
];
759+
760+
(* ::**************************************************************************************************************:: *)
761+
(* ::Subsection::Closed:: *)
762+
(*getVectorDBSourceFile*)
(* Returns the path of the .jsonl source file for a vector database, downloading it when it is not present locally. *)
(* getVectorDBSourceFile[ name ] looks for <name>.jsonl in the source directory. If it is missing, <name>.wl from *)
(* the same directory is loaded with Get; it must yield an association whose "Location" is downloaded to <name>.jsonl. *)
(* getVectorDBSourceFile[ All ] applies the single-name form to every base name that has a .jsonl or .wl file. *)
(* Any failed confirmation is returned as a failure by Enclose. *)
763+
getVectorDBSourceFile // ClearAll;
764+
765+
getVectorDBSourceFile[ name_String ] :=
766+
Enclose @ Catch @ Module[ { dir, jsonl, wl, as, url, downloaded },
767+
dir = ConfirmBy[ getVectorDBSourceDirectory[ ], DirectoryQ, "Directory" ];
768+
jsonl = FileNameJoin @ { dir, name<>".jsonl" };
(* An existing local .jsonl is returned right away through Throw to the enclosing Catch. *)
769+
If[ FileExistsQ @ jsonl, Throw @ jsonl ];
(* Otherwise the .wl descriptor must exist and evaluate to an association. *)
770+
wl = ConfirmBy[ FileNameJoin @ { dir, name<>".wl" }, FileExistsQ, "File" ];
771+
as = ConfirmBy[ Get @ wl, AssociationQ, "Data" ];
(* The download location must be a string, a CloudObject, or a URL. *)
772+
url = ConfirmMatch[ as[ "Location" ], _String|_CloudObject|_URL, "URL" ];
(* NOTE(review): downloaded is assigned here but not read afterwards in this definition. *)
773+
downloaded = ConfirmBy[ URLDownload[ url, jsonl ], FileExistsQ, "Download" ];
774+
ConfirmBy[ jsonl, FileExistsQ, "Result" ]
775+
];
776+
777+
getVectorDBSourceFile[ All ] :=
778+
Enclose @ Module[ { dir, names },
779+
dir = ConfirmBy[ getVectorDBSourceDirectory[ ], DirectoryQ, "Directory" ];
(* Union removes duplicates for names that have both a .jsonl and a .wl file. *)
780+
names = Union[ FileBaseName /@ FileNames[ { "*.jsonl", "*.wl" }, dir ] ];
(* NOTE(review): individual results are not confirmed here, so a name that fails to resolve would appear *)
(* in the returned list as a failure rather than a path -- confirm that callers handle this. *)
781+
getVectorDBSourceFile /@ names
782+
];
783+
732784
(* ::**************************************************************************************************************:: *)
733785
(* ::Subsection::Closed:: *)
734786
(*withProgress*)

0 commit comments

Comments
 (0)