@@ -48,8 +48,9 @@ $$vectorDatabase = _VectorDatabaseObject? System`Private`ValidQ;
48
48
(* ::**************************************************************************************************************:: *)
49
49
(* ::Subsection::Closed:: *)
50
50
(*Vector Databases*)
51
- $vectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName , "SourceData" };
52
- $vectorDBTargetDirectory = FileNameJoin @ { DirectoryName [ $InputFileName , 3 ], "Assets" , "VectorDatabases" };
51
+ $defaultVectorDBSourceDirectory = FileNameJoin @ { DirectoryName @ $InputFileName , "SourceData" };
52
+ $vectorDBSourceDirectory := getVectorDBSourceDirectory [ ];
53
+ $vectorDBTargetDirectory = FileNameJoin @ { DirectoryName [ $InputFileName , 3 ], "Assets" , "VectorDatabases" };
53
54
54
55
$incrementalBuildBatchSize = 512 ;
55
56
$dbConnectivity = 16 ;
@@ -86,29 +87,36 @@ $embeddingCache = <| |>;
86
87
ImportVectorDatabaseData // ClearAll ;
87
88
88
89
ImportVectorDatabaseData [ name_ String ] :=
89
- Enclose @ Module [ { file , data },
90
- file = ConfirmBy [ FileNameJoin @ { $vectorDBSourceDirectory , name <> ".jsonl" }, FileExistsQ , "File" ];
91
- data = ConfirmMatch [ jsonlImport @ file , { ___ Association ? AssociationQ }, "Data" ];
92
- data
90
+ Enclose @ Module [ { file },
91
+ file = ConfirmBy [ getVectorDBSourceFile @ name , FileExistsQ , "File" ];
92
+ ImportVectorDatabaseData @ File @ file
93
93
];
94
94
95
+ ImportVectorDatabaseData [ file_ File ] :=
96
+ Enclose @ ConfirmMatch [ jsonlImport @ file , { ___ Association ? AssociationQ }, "Data" ];
97
+
95
98
(* ::**************************************************************************************************************:: *)
96
99
(* ::Subsection::Closed:: *)
97
100
(*ExportVectorDatabaseData*)
98
101
ExportVectorDatabaseData // ClearAll ;
99
102
100
- ExportVectorDatabaseData [ name_ String , data0_ List ] :=
101
- Enclose @ Module [ { data , dir , file },
102
- data = ConfirmBy [ toDBData @ data0 , dbDataQ , "Data" ];
103
+ ExportVectorDatabaseData [ name_ String , data_ List ] :=
104
+ Enclose @ Module [ { dir , file },
103
105
dir = ConfirmBy [ ensureDirectory @ $vectorDBSourceDirectory , DirectoryQ , "Directory" ];
104
106
file = ConfirmBy [ FileNameJoin @ { dir , name <> ".jsonl" }, StringQ , "File" ];
107
+ ExportVectorDatabaseData [ File @ file , data ]
108
+ ];
109
+
110
+ ExportVectorDatabaseData [ file_ File , data0_ List ] :=
111
+ Enclose @ Module [ { data },
112
+ data = ConfirmBy [ toDBData @ data0 , dbDataQ , "Data" ];
105
113
ConfirmBy [ jsonlExport [ file , data ], FileExistsQ , "Export" ]
106
114
];
107
115
108
116
(* ::**************************************************************************************************************:: *)
109
117
(* ::Subsection::Closed:: *)
110
118
(*AddToVectorDatabaseData*)
111
- AddToVectorDatabaseData // beginDefinition ;
119
+ AddToVectorDatabaseData // ClearAll ;
112
120
AddToVectorDatabaseData // Options = { "Tag" -> "TextLiteral" , "Rebuild" -> False };
113
121
114
122
AddToVectorDatabaseData [ name_ String , data_ List , opts : OptionsPattern [ ] ] :=
@@ -128,8 +136,6 @@ AddToVectorDatabaseData[ name_String, data_List, opts: OptionsPattern[ ] ] :=
128
136
< | "Exported" -> exported , "Rebuilt" -> rebuilt |>
129
137
];
130
138
131
- AddToVectorDatabaseData // endDefinition ;
132
-
133
139
(* ::**************************************************************************************************************:: *)
134
140
(* ::Subsection::Closed:: *)
135
141
(*BuildVectorDatabase*)
@@ -147,7 +153,7 @@ BuildVectorDatabase[ All, opts: OptionsPattern[ ] ] :=
147
153
$dbExpansionAdd = OptionValue [ "ExpansionAdd" ],
148
154
$dbExpansionSearch = OptionValue [ "ExpansionSearch" ]
149
155
},
150
- AssociationMap [ BuildVectorDatabase , FileBaseName /@ FileNames [ "*.jsonl" , $vectorDBSourceDirectory ] ]
156
+ AssociationMap [ BuildVectorDatabase , FileBaseName /@ getVectorDBSourceFile @ All ]
151
157
];
152
158
153
159
BuildVectorDatabase [ name_ String , opts : OptionsPattern [ ] ] := Enclose [
@@ -169,13 +175,13 @@ BuildVectorDatabase[ name_String, opts: OptionsPattern[ ] ] := Enclose[
169
175
buildVectorDatabase // ClearAll ;
170
176
171
177
buildVectorDatabase [ name_ String ] :=
172
- Enclose @ Catch @ Module [ { dir , rel , src , db , valueBag , count , n , stream , values },
178
+ Enclose @ Catch @ Module [ { dir , rel , src , db , valueBag , count , n , stream , values , built },
173
179
174
180
loadEmbeddingCache [ ];
175
181
176
182
dir = ConfirmBy [ ensureDirectory @ { $vectorDBTargetDirectory , name }, DirectoryQ , "Directory" ];
177
183
rel = ConfirmBy [ ResourceFunction [ "RelativePath" ][ dir ], DirectoryQ , "Relative" ];
178
- src = ConfirmBy [ FileNameJoin @ { $vectorDBSourceDirectory , name <> ".jsonl" } , FileExistsQ , "File" ];
184
+ src = ConfirmBy [ getVectorDBSourceFile @ name , FileExistsQ , "File" ];
179
185
180
186
DeleteFile /@ FileNames [ { "*.wxf" , "*.usearch" }, dir ];
181
187
ConfirmAssert [ FileNames [ { "*.wxf" , "*.usearch" }, dir ] === { }, "ClearedFilesCheck" ];
@@ -198,59 +204,65 @@ buildVectorDatabase[ name_String ] :=
198
204
valueBag = Internal ` Bag [ ];
199
205
count = ConfirmMatch [ lineCount @ src , _ Integer ? Positive , "LineCount" ];
200
206
n = 0 ;
201
- stream = ConfirmMatch [ OpenRead @ src , _ InputStream , "Stream" ];
202
-
203
- withProgress [
204
- While [
205
- NumericArrayQ @ ConfirmMatch [ addBatch [ db , stream , valueBag ], _ NumericArray |EndOfFile , "Add" ],
206
- n = Internal ` BagLength @ valueBag
207
- ],
208
- < |
209
- "Text" -> "Building database \" " <> name <> "\" " ,
210
- "ElapsedTime" -> Automatic ,
211
- "RemainingTime" -> Automatic ,
212
- "ItemTotal" :> count ,
213
- "ItemCurrent" :> n ,
214
- "Progress" :> Automatic
215
- |> ,
216
- "Delay" -> 0 ,
217
- UpdateInterval -> 1
218
- ];
219
-
220
- saveEmbeddingCache [ ];
221
-
222
- values = Internal ` BagPart [ valueBag , All ];
223
-
224
- ConfirmAssert [ Length @ values === count , "ValueCount" ];
225
- ConfirmAssert [ First @ db [ "Dimensions" ] === count , "VectorCount" ];
226
-
227
- ConfirmBy [
228
- writeWXFFile [ FileNameJoin @ { dir , "Values.wxf" }, values , PerformanceGoal -> "Size" ],
229
- FileExistsQ ,
230
- "Values"
231
- ];
207
+ WithCleanup [
208
+ stream = ConfirmMatch [ OpenRead @ src , _ InputStream , "Stream" ],
232
209
233
- ConfirmBy [
234
- writeWXFFile [
235
- FileNameJoin @ { dir , "EmbeddingInformation.wxf" },
210
+ withProgress [
211
+ While [
212
+ NumericArrayQ @ ConfirmMatch [ addBatch [ db , stream , valueBag ], _ NumericArray |EndOfFile , "Add" ],
213
+ n = Internal ` BagLength @ valueBag
214
+ ],
236
215
< |
237
- "Dimension" -> $embeddingDimension ,
238
- "Type" -> $embeddingType ,
239
- "Model" -> $embeddingModel ,
240
- "Service" -> $embeddingService
241
- |>
216
+ "Text" -> "Building database \" " <> name <> "\" " ,
217
+ "ElapsedTime" -> Automatic ,
218
+ "RemainingTime" -> Automatic ,
219
+ "ItemTotal" :> count ,
220
+ "ItemCurrent" :> n ,
221
+ "Progress" :> Automatic
222
+ |> ,
223
+ "Delay" -> 0 ,
224
+ UpdateInterval -> 1
225
+ ];
226
+
227
+ saveEmbeddingCache [ ];
228
+
229
+ values = Internal ` BagPart [ valueBag , All ];
230
+
231
+ ConfirmBy [ rewriteDBData [ rel , name ], FileExistsQ , "Rewrite" ];
232
+
233
+ built = ConfirmMatch [
234
+ VectorDatabaseObject @ File @ FileNameJoin @ { rel , name <> ".wxf" },
235
+ $$vectorDatabase ,
236
+ "Result"
237
+ ];
238
+
239
+ ConfirmAssert [ Length @ values === count , "ValueCount" ];
240
+ ConfirmAssert [ First @ built [ "Dimensions" ] === count , "VectorCount" ];
241
+
242
+ ConfirmBy [
243
+ writeWXFFile [ FileNameJoin @ { dir , "Values.wxf" }, values , PerformanceGoal -> "Size" ],
244
+ FileExistsQ ,
245
+ "Values"
246
+ ];
247
+
248
+ ConfirmBy [
249
+ writeWXFFile [
250
+ FileNameJoin @ { dir , "EmbeddingInformation.wxf" },
251
+ < |
252
+ "Dimension" -> $embeddingDimension ,
253
+ "Type" -> $embeddingType ,
254
+ "Model" -> $embeddingModel ,
255
+ "Service" -> $embeddingService
256
+ |>
257
+ ],
258
+ FileExistsQ ,
259
+ "EmbeddingInformation"
242
260
],
243
- FileExistsQ ,
244
- "EmbeddingInformation"
245
- ];
246
261
247
- ConfirmBy [ rewriteDBData [ rel , name ], FileExistsQ , "Rewrite" ];
262
+ Close @ stream
263
+ ];
248
264
249
- ConfirmMatch [
250
- VectorDatabaseObject @ File @ FileNameJoin @ { rel , name <> ".wxf" },
251
- $$vectorDatabase ,
252
- "Result"
253
- ]
265
+ ConfirmMatch [ built , $$vectorDatabase , "Result" ]
254
266
];
255
267
256
268
(* ::**************************************************************************************************************:: *)
@@ -274,7 +286,7 @@ setDBDefaults[ dir_, name_String ] :=
274
286
addBatch // ClearAll ;
275
287
276
288
addBatch [ db_ VectorDatabaseObject , stream_ InputStream , valueBag_ Internal ` Bag ] :=
277
- Enclose @ Catch @ Module [ { batch , text , values , embeddings },
289
+ Enclose @ Catch @ Module [ { batch , text , values , embeddings , added },
278
290
279
291
batch = ConfirmMatch [
280
292
readJSONLines [ stream , $incrementalBuildBatchSize ],
@@ -289,9 +301,9 @@ addBatch[ db_VectorDatabaseObject, stream_InputStream, valueBag_Internal`Bag ] :
289
301
values = ConfirmMatch [ batch [[ All , "Value" ]], { __ }, "Values" ];
290
302
embeddings = ConfirmBy [ $lastEmbedding = GetEmbedding @ text , NumericArrayQ , "Embeddings" ];
291
303
ConfirmAssert [ Length @ values === Length @ embeddings , "LengthCheck" ];
292
- Confirm [ $lastAdded = AddToVectorDatabase [ db , embeddings ], "AddToVectorDatabase" ];
304
+ added = Confirm [ $lastAdded = AddToVectorDatabase [ db , embeddings ], "AddToVectorDatabase" ];
293
305
Internal ` StuffBag [ valueBag , values , 1 ];
294
- ConfirmMatch [ db [ "Dimensions" ], { Internal ` BagLength @ valueBag , $embeddingDimension }, "DimensionCheck" ];
306
+ ConfirmMatch [ added [ "Dimensions" ], { Internal ` BagLength @ valueBag , $embeddingDimension }, "DimensionCheck" ];
295
307
embeddings
296
308
];
297
309
@@ -729,6 +741,46 @@ embeddingHash[ string_String ] :=
729
741
(* ::Section::Closed:: *)
730
742
(*Misc Utilities*)
731
743
744
+ (* ::**************************************************************************************************************:: *)
745
+ (* ::Subsection::Closed:: *)
746
+ (*getVectorDBSourceDirectory*)
747
+ getVectorDBSourceDirectory // ClearAll ;
748
+
749
+ getVectorDBSourceDirectory [ ] := Enclose [
750
+ getVectorDBSourceDirectory [ ] = Confirm @ SelectFirst [
751
+ {
752
+ ReleaseHold @ PersistentSymbol [ "ChatbookDeveloper/VectorDatabaseSourceDirectory" ],
753
+ GeneralUtilities ` EnsureDirectory @ $defaultVectorDBSourceDirectory
754
+ },
755
+ DirectoryQ ,
756
+ $Failed
757
+ ]
758
+ ];
759
+
760
+ (* ::**************************************************************************************************************:: *)
761
+ (* ::Subsection::Closed:: *)
762
+ (*getVectorDBSourceFile*)
763
+ getVectorDBSourceFile // ClearAll ;
764
+
765
+ getVectorDBSourceFile [ name_ String ] :=
766
+ Enclose @ Catch @ Module [ { dir , jsonl , wl , as , url , downloaded },
767
+ dir = ConfirmBy [ getVectorDBSourceDirectory [ ], DirectoryQ , "Directory" ];
768
+ jsonl = FileNameJoin @ { dir , name <> ".jsonl" };
769
+ If [ FileExistsQ @ jsonl , Throw @ jsonl ];
770
+ wl = ConfirmBy [ FileNameJoin @ { dir , name <> ".wl" }, FileExistsQ , "File" ];
771
+ as = ConfirmBy [ Get @ wl , AssociationQ , "Data" ];
772
+ url = ConfirmMatch [ as [ "Location" ], _ String |_ CloudObject |_ URL , "URL" ];
773
+ downloaded = ConfirmBy [ URLDownload [ url , jsonl ], FileExistsQ , "Download" ];
774
+ ConfirmBy [ jsonl , FileExistsQ , "Result" ]
775
+ ];
776
+
777
+ getVectorDBSourceFile [ All ] :=
778
+ Enclose @ Module [ { dir , names },
779
+ dir = ConfirmBy [ getVectorDBSourceDirectory [ ], DirectoryQ , "Directory" ];
780
+ names = Union [ FileBaseName /@ FileNames [ { "*.jsonl" , "*.wl" }, dir ] ];
781
+ getVectorDBSourceFile /@ names
782
+ ];
783
+
732
784
(* ::**************************************************************************************************************:: *)
733
785
(* ::Subsection::Closed:: *)
734
786
(*withProgress*)
0 commit comments