4
4
import sys
5
5
import warnings
6
6
7
+ import requests
8
+
7
9
import openai
8
10
from openai .upload_progress import BufferReader
9
11
from openai .validators import (
@@ -200,7 +202,10 @@ def create(cls, args):
200
202
with open (args .file , "rb" ) as file_reader :
201
203
buffer_reader = BufferReader (file_reader .read (), desc = "Upload progress" )
202
204
resp = openai .File .create (
203
- file = buffer_reader , purpose = args .purpose , model = args .model
205
+ file = buffer_reader ,
206
+ purpose = args .purpose ,
207
+ model = args .model ,
208
+ user_provided_filename = args .file ,
204
209
)
205
210
print (resp )
206
211
@@ -238,52 +243,102 @@ def list(cls, args):
238
243
print (resp )
239
244
240
245
@classmethod
241
- def _get_or_upload (cls , file , check_if_file_exists = True ):
242
- try :
243
- openai .File .retrieve (file )
244
- except openai .error .InvalidRequestError as e :
245
- if e .http_status == 404 and os .path .isfile (file ):
246
- matching_files = openai .File .find_matching_files (
247
- file = open (file ), purpose = "fine-tune"
246
+ def _is_url (cls , file : str ):
247
+ return file .lower ().startswith ("http" )
248
+
249
@classmethod
def _download_file_from_public_url(cls, url: str) -> Optional[bytes]:
    """Fetch ``url`` with an HTTP GET and return the response body.

    Args:
        url: Publicly reachable location of the file to download.

    Returns:
        The raw response bytes when the server answers with status 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged when the request
            itself fails (connection error, timeout, ...).
    """
    # Without an explicit timeout `requests` waits indefinitely, so a
    # stalled server would hang the CLI. The tuple is (connect, read);
    # the read timeout bounds the gap between received chunks, not the
    # total download time, so large files still download fine.
    resp = requests.get(url, timeout=(10, 60))
    if resp.status_code == 200:
        return resp.content
    return None
256
+
257
@classmethod
def _maybe_upload_file(
    cls,
    file: Optional[str] = None,
    content: Optional[bytes] = None,
    user_provided_file: Optional[str] = None,
    check_if_file_exists: bool = True,
):
    """Upload fine-tune data, optionally reusing an already uploaded file.

    Args:
        file: Path of a local file to read and upload.
        content: Raw bytes to upload instead of reading ``file``.
        user_provided_file: Name to report for the upload; falls back to
            ``file`` when not given.
        check_if_file_exists: When true, ask the API for files with the
            same name, size and purpose first and let the user pick one
            of them interactively instead of uploading again.

    Returns:
        The ID of the reused or newly created file.

    Raises:
        ValueError: If both or neither of ``file`` and ``content`` are
            provided.
    """
    # Exactly one of `file` or `content` must be provided
    if (file is None) == (content is None):
        raise ValueError("Exactly one of `file` or `content` must be provided")

    if content is None:
        # Guaranteed by the check above; kept to narrow the type.
        assert file is not None
        with open(file, "rb") as f:
            content = f.read()

    # Same expression as the upload call below. The previous lookup read
    # `f.name`, which is unbound when `content` is passed directly.
    display_name = user_provided_file or file

    if check_if_file_exists:
        size_in_bytes = len(content)
        matching_files = openai.File.find_matching_files(
            name=display_name, bytes=size_in_bytes, purpose="fine-tune"
        )
        if len(matching_files) > 0:
            file_ids = [match["id"] for match in matching_files]
            sys.stdout.write(
                "Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n ".format(
                    name=os.path.basename(matching_files[0]["filename"]),
                    size=matching_files[0]["bytes"],
                )
            )
            sys.stdout.write(" \n ".join(file_ids))
            # Keep prompting until the user names one of the listed IDs
            # or submits an empty line to force a fresh upload.
            while True:
                sys.stdout.write(
                    "\n Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
                )
                inp = sys.stdin.readline().strip()
                if inp in file_ids:
                    sys.stdout.write(
                        "Reusing already uploaded file: {id} \n ".format(id=inp)
                    )
                    return inp
                elif inp == "":
                    break
                else:
                    sys.stdout.write(
                        "File id '{id}' is not among the IDs of the potentially duplicated files\n ".format(
                            id=inp
                        )
                    )

    buffer_reader = BufferReader(content, desc="Upload progress")
    resp = openai.File.create(
        file=buffer_reader,
        purpose="fine-tune",
        user_provided_filename=display_name,
    )
    sys.stdout.write(
        "Uploaded file from {file}: {id}\n ".format(
            file=display_name, id=resp["id"]
        )
    )
    return resp["id"]
319
+
320
@classmethod
def _get_or_upload(cls, file, check_if_file_exists=True):
    """Resolve ``file`` to a file ID, uploading its content when needed.

    Resolution order:
      1. ``openai.File.retrieve`` succeeds: ``file`` is returned as-is.
      2. ``file`` is a path on the local filesystem: it is uploaded.
      3. ``file`` looks like a URL and can be downloaded: the downloaded
         bytes are uploaded under the URL as the reported name.
    In every other case ``file`` is returned unchanged.
    """
    try:
        openai.File.retrieve(file)
    except openai.error.InvalidRequestError:
        # Retrieval was rejected; fall through to the path and URL cases.
        pass
    else:
        return file

    if os.path.isfile(file):
        return cls._maybe_upload_file(
            file=file,
            check_if_file_exists=check_if_file_exists,
        )

    if not cls._is_url(file):
        return file

    downloaded = cls._download_file_from_public_url(file)
    if downloaded is None:
        # Download did not succeed; hand the original value back.
        return file

    return cls._maybe_upload_file(
        content=downloaded,
        check_if_file_exists=check_if_file_exists,
        user_provided_file=file,
    )
288
343
289
344
@classmethod
@@ -737,15 +792,15 @@ def help(args):
737
792
"--training_file" ,
738
793
required = True ,
739
794
help = "JSONL file containing prompt-completion examples for training. This can "
740
- "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
741
- "or a local file path." ,
795
+ "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
796
+ ' a local file path, or a URL that starts with "http".' ,
742
797
)
743
798
sub .add_argument (
744
799
"-v" ,
745
800
"--validation_file" ,
746
801
help = "JSONL file containing prompt-completion examples for validation. This can "
747
- "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
748
- "or a local file path." ,
802
+ "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
803
+ ' a local file path, or a URL that starts with "http".' ,
749
804
)
750
805
sub .add_argument (
751
806
"--no_check_if_files_exist" ,
@@ -780,7 +835,7 @@ def help(args):
780
835
type = float ,
781
836
help = "The learning rate multiplier to use for training. The fine-tuning "
782
837
"learning rate is determined by the original learning rate used for "
783
- "pretraining multiplied by this value" ,
838
+ "pretraining multiplied by this value. " ,
784
839
)
785
840
sub .add_argument (
786
841
"--use_packing" ,
@@ -796,15 +851,15 @@ def help(args):
796
851
"--no_packing" ,
797
852
action = "store_false" ,
798
853
dest = "use_packing" ,
799
- help = "Disables the packing flag (see --use_packing for description)" ,
854
+ help = "Disables the packing flag (see --use_packing for description). " ,
800
855
)
801
856
sub .set_defaults (use_packing = None )
802
857
sub .add_argument (
803
858
"--prompt_loss_weight" ,
804
859
type = float ,
805
860
help = "The weight to use for the prompt loss. The optimum value here depends "
806
861
"depends on your use case. This determines how much the model prioritizes "
807
- "learning from prompt tokens vs learning from completion tokens" ,
862
+ "learning from prompt tokens vs learning from completion tokens. " ,
808
863
)
809
864
sub .add_argument (
810
865
"--compute_classification_metrics" ,
@@ -817,13 +872,13 @@ def help(args):
817
872
"--classification_n_classes" ,
818
873
type = int ,
819
874
help = "The number of classes in a classification task. This parameter is "
820
- "required for multiclass classification" ,
875
+ "required for multiclass classification. " ,
821
876
)
822
877
sub .add_argument (
823
878
"--classification_positive_class" ,
824
879
help = "The positive class in binary classification. This parameter is needed "
825
880
"to generate precision, recall and F-1 metrics when doing binary "
826
- "classification" ,
881
+ "classification. " ,
827
882
)
828
883
sub .add_argument (
829
884
"--classification_betas" ,
0 commit comments