Skip to content

Commit f4f6f90

Browse files
authored
[fine-tuning] accept file URLs as train & validation files (#50)
Also includes a few fixes: setting the correct filename for file uploads made through files.create, reinstating the progress meter when uploading files for the fine-tuning endpoint, and standardizing punctuation in the fine-tuning (FT) help strings.
1 parent 1f32472 commit f4f6f90

File tree

3 files changed

+118
-65
lines changed

openai/api_resources/file.py

+14-16
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def create(
2020
api_base=None,
2121
api_version=None,
2222
organization=None,
23+
user_provided_filename=None,
2324
):
2425
if purpose != "search" and model is not None:
2526
raise ValueError("'model' is only meaningful if 'purpose' is 'search'")
@@ -32,9 +33,13 @@ def create(
3233
url = cls.class_url()
3334
# Set the filename on 'purpose' and 'model' to None so they are
3435
# interpreted as form data.
35-
files = [("file", file), ("purpose", (None, purpose))]
36+
files = [("purpose", (None, purpose))]
3637
if model is not None:
3738
files.append(("model", (None, model)))
39+
if user_provided_filename is not None:
40+
files.append(("file", (user_provided_filename, file)))
41+
else:
42+
files.append(("file", file))
3843
response, _, api_key = requestor.request("post", url, files=files)
3944
return util.convert_to_openai_object(
4045
response, api_key, api_version, organization
@@ -65,37 +70,30 @@ def download(
6570
@classmethod
6671
def find_matching_files(
6772
cls,
73+
name,
74+
bytes,
75+
purpose,
6876
api_key=None,
6977
api_base=None,
7078
api_version=None,
7179
organization=None,
72-
file=None,
73-
purpose=None,
7480
):
75-
if file is None:
76-
raise openai.error.InvalidRequestError(
77-
"'file' is a required property", "file"
78-
)
79-
if purpose is None:
80-
raise openai.error.InvalidRequestError(
81-
"'purpose' is a required property", "purpose"
82-
)
81+
"""Find already uploaded files with the same name, size, and purpose."""
8382
all_files = cls.list(
8483
api_key=api_key,
8584
api_base=api_base or openai.api_base,
8685
api_version=api_version,
8786
organization=organization,
8887
).get("data", [])
8988
matching_files = []
89+
basename = os.path.basename(name)
9090
for f in all_files:
9191
if f["purpose"] != purpose:
9292
continue
93-
if not hasattr(file, "name") or f["filename"] != file.name:
93+
file_basename = os.path.basename(f["filename"])
94+
if file_basename != basename:
9495
continue
95-
file.seek(0, os.SEEK_END)
96-
if f["bytes"] != file.tell():
97-
file.seek(0)
96+
if f["bytes"] != bytes:
9897
continue
99-
file.seek(0)
10098
matching_files.append(f)
10199
return matching_files

openai/cli.py

+103-48
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import sys
55
import warnings
66

7+
import requests
8+
79
import openai
810
from openai.upload_progress import BufferReader
911
from openai.validators import (
@@ -200,7 +202,10 @@ def create(cls, args):
200202
with open(args.file, "rb") as file_reader:
201203
buffer_reader = BufferReader(file_reader.read(), desc="Upload progress")
202204
resp = openai.File.create(
203-
file=buffer_reader, purpose=args.purpose, model=args.model
205+
file=buffer_reader,
206+
purpose=args.purpose,
207+
model=args.model,
208+
user_provided_filename=args.file,
204209
)
205210
print(resp)
206211

@@ -238,52 +243,102 @@ def list(cls, args):
238243
print(resp)
239244

240245
@classmethod
241-
def _get_or_upload(cls, file, check_if_file_exists=True):
242-
try:
243-
openai.File.retrieve(file)
244-
except openai.error.InvalidRequestError as e:
245-
if e.http_status == 404 and os.path.isfile(file):
246-
matching_files = openai.File.find_matching_files(
247-
file=open(file), purpose="fine-tune"
246+
def _is_url(cls, file: str):
247+
return file.lower().startswith("http")
248+
249+
@classmethod
250+
def _download_file_from_public_url(cls, url: str) -> Optional[bytes]:
251+
resp = requests.get(url)
252+
if resp.status_code == 200:
253+
return resp.content
254+
else:
255+
return None
256+
257+
@classmethod
258+
def _maybe_upload_file(
259+
cls,
260+
file: Optional[str] = None,
261+
content: Optional[bytes] = None,
262+
user_provided_file: Optional[str] = None,
263+
check_if_file_exists: bool = True,
264+
):
265+
# Exactly one of `file` or `content` must be provided
266+
if (file is None) == (content is None):
267+
raise ValueError("Exactly one of `file` or `content` must be provided")
268+
269+
if content is None:
270+
assert file is not None
271+
with open(file, "rb") as f:
272+
content = f.read()
273+
274+
if check_if_file_exists:
275+
bytes = len(content)
276+
matching_files = openai.File.find_matching_files(
277+
name=user_provided_file or f.name, bytes=bytes, purpose="fine-tune"
278+
)
279+
if len(matching_files) > 0:
280+
file_ids = [f["id"] for f in matching_files]
281+
sys.stdout.write(
282+
"Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
283+
name=os.path.basename(matching_files[0]["filename"]),
284+
size=matching_files[0]["bytes"],
285+
)
248286
)
249-
if len(matching_files) > 0 and check_if_file_exists:
250-
file_ids = [f["id"] for f in matching_files]
287+
sys.stdout.write("\n".join(file_ids))
288+
while True:
251289
sys.stdout.write(
252-
"Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n".format(
253-
name=matching_files[0]["filename"],
254-
size=matching_files[0]["bytes"],
255-
)
290+
"\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
256291
)
257-
sys.stdout.write("\n".join(file_ids))
258-
while True:
292+
inp = sys.stdin.readline().strip()
293+
if inp in file_ids:
259294
sys.stdout.write(
260-
"\nEnter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
295+
"Reusing already uploaded file: {id}\n".format(id=inp)
261296
)
262-
inp = sys.stdin.readline().strip()
263-
if inp in file_ids:
264-
sys.stdout.write(
265-
"Using your file {file}: {id}\n".format(
266-
file=file, id=inp
267-
)
268-
)
269-
return inp
270-
elif inp == "":
271-
break
272-
else:
273-
sys.stdout.write(
274-
"File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
275-
id=inp
276-
)
297+
return inp
298+
elif inp == "":
299+
break
300+
else:
301+
sys.stdout.write(
302+
"File id '{id}' is not among the IDs of the potentially duplicated files\n".format(
303+
id=inp
277304
)
305+
)
278306

279-
resp = openai.File.create(
280-
file=open(file),
281-
purpose="fine-tune",
282-
)
283-
sys.stdout.write(
284-
"Uploaded file from {file}: {id}\n".format(file=file, id=resp["id"])
307+
buffer_reader = BufferReader(content, desc="Upload progress")
308+
resp = openai.File.create(
309+
file=buffer_reader,
310+
purpose="fine-tune",
311+
user_provided_filename=user_provided_file or file,
312+
)
313+
sys.stdout.write(
314+
"Uploaded file from {file}: {id}\n".format(
315+
file=user_provided_file or file, id=resp["id"]
316+
)
317+
)
318+
return resp["id"]
319+
320+
@classmethod
321+
def _get_or_upload(cls, file, check_if_file_exists=True):
322+
try:
323+
# 1. If it's a valid file, use it
324+
openai.File.retrieve(file)
325+
return file
326+
except openai.error.InvalidRequestError:
327+
pass
328+
if os.path.isfile(file):
329+
# 2. If it's a file on the filesystem, upload it
330+
return cls._maybe_upload_file(
331+
file=file, check_if_file_exists=check_if_file_exists
332+
)
333+
if cls._is_url(file):
334+
# 3. If it's a URL, download it temporarily
335+
content = cls._download_file_from_public_url(file)
336+
if content is not None:
337+
return cls._maybe_upload_file(
338+
content=content,
339+
check_if_file_exists=check_if_file_exists,
340+
user_provided_file=file,
285341
)
286-
return resp["id"]
287342
return file
288343

289344
@classmethod
@@ -737,15 +792,15 @@ def help(args):
737792
"--training_file",
738793
required=True,
739794
help="JSONL file containing prompt-completion examples for training. This can "
740-
"be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
741-
"or a local file path.",
795+
"be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
796+
'a local file path, or a URL that starts with "http".',
742797
)
743798
sub.add_argument(
744799
"-v",
745800
"--validation_file",
746801
help="JSONL file containing prompt-completion examples for validation. This can "
747-
"be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
748-
"or a local file path.",
802+
"be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
803+
'a local file path, or a URL that starts with "http".',
749804
)
750805
sub.add_argument(
751806
"--no_check_if_files_exist",
@@ -780,7 +835,7 @@ def help(args):
780835
type=float,
781836
help="The learning rate multiplier to use for training. The fine-tuning "
782837
"learning rate is determined by the original learning rate used for "
783-
"pretraining multiplied by this value",
838+
"pretraining multiplied by this value.",
784839
)
785840
sub.add_argument(
786841
"--use_packing",
@@ -796,15 +851,15 @@ def help(args):
796851
"--no_packing",
797852
action="store_false",
798853
dest="use_packing",
799-
help="Disables the packing flag (see --use_packing for description)",
854+
help="Disables the packing flag (see --use_packing for description).",
800855
)
801856
sub.set_defaults(use_packing=None)
802857
sub.add_argument(
803858
"--prompt_loss_weight",
804859
type=float,
805860
help="The weight to use for the prompt loss. The optimum value here depends "
806861
"depends on your use case. This determines how much the model prioritizes "
807-
"learning from prompt tokens vs learning from completion tokens",
862+
"learning from prompt tokens vs learning from completion tokens.",
808863
)
809864
sub.add_argument(
810865
"--compute_classification_metrics",
@@ -817,13 +872,13 @@ def help(args):
817872
"--classification_n_classes",
818873
type=int,
819874
help="The number of classes in a classification task. This parameter is "
820-
"required for multiclass classification",
875+
"required for multiclass classification.",
821876
)
822877
sub.add_argument(
823878
"--classification_positive_class",
824879
help="The positive class in binary classification. This parameter is needed "
825880
"to generate precision, recall and F-1 metrics when doing binary "
826-
"classification",
881+
"classification.",
827882
)
828883
sub.add_argument(
829884
"--classification_betas",

openai/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
VERSION = "0.11.1"
1+
VERSION = "0.11.2"

0 commit comments

Comments
 (0)