Binary file removed .DS_Store
47 changes: 46 additions & 1 deletion datatorch/api/api.py
@@ -163,11 +163,55 @@ def upload_to_default_filesource(
        )
        print(r.text + " " + endpoint)

    def upload_to_filesource(
        self,
        project: Project,
        file: IO,
        storageId: str = None,
        storageFolderName=None,
        dataset: Dataset = None,
        **kwargs,
    ):
        """
        Uploads a file to the provided `storageId` if available;
        otherwise, retrieves the default storage ID (DataTorch Storage) from the project.
        """
        # Retrieve the default storageId if not explicitly provided
        if storageId is None:
            storageId = project.storage_link_default().id

        storageFolderName = "" if storageFolderName is None else storageFolderName
        datasetId = "" if dataset is None else dataset.id
        importFiles = "false" if dataset is None else "true"

        # Construct the endpoint
        endpoint = f"{self.api_url}/file/v1/upload/{storageId}?path={storageFolderName}&import={importFiles}&datasetId={datasetId}"

        # Determine the MIME type (python-magic if available, else by extension)
        if magic:
            tell = file.tell()
            mimetype = magic.from_buffer(file.read(1024), mime=True)
            file.seek(tell)
        else:
            mimetype = mimetypes.guess_type(file.name)[0]

        # Make the POST request
        r = requests.post(
            endpoint,
            files={"file": (os.path.basename(file.name), file, mimetype)},
            headers={self.token_header: self._api_token},
            stream=True,
        )

        # Raise an error for failed requests
        r.raise_for_status()

    def glob_upload_folder(
        self,
        project: Project,
        uploadingFromGlob: str,
        storageFolderName: str,
        storageId: str = None,
        folderSplit=1000,
        dataset: Dataset = None,
        recursive=False,
@@ -192,9 +236,10 @@ def glob_upload_folder(
        folderIndex += 1
        uploadFolderName = storageFolderName + "_" + str(folderIndex)
        file = open(file, "rb")
        self.upload_to_default_filesource(
        self.upload_to_filesource(
            project=project,
            file=file,
            storageId=storageId,
            storageFolderName=uploadFolderName,
            dataset=dataset,
        )
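For reference, a minimal usage sketch of the new method (a sketch only, assuming an already-authenticated `ApiClient` instance `client`, a fetched `project`, and a `dataset`; these variable names are illustrative, not part of the change):

# Storage-only upload: with no dataset, the endpoint is called with import=false.
with open("image.jpg", "rb") as f:
    client.upload_to_filesource(project=project, file=f)

# Upload and import into a dataset: the endpoint is called with import=true.
with open("image.jpg", "rb") as f:
    client.upload_to_filesource(project=project, file=f, dataset=dataset)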
3 changes: 3 additions & 0 deletions datatorch/cli/groups.py
@@ -12,6 +12,7 @@
from .pipeline import pipeline
from .action import action
from .import_cmds import import_cmd
from .upload import upload


@click.group()
@@ -29,3 +30,5 @@ def main():
main.add_command(agent)
main.add_command(action)
main.add_command(import_cmd)

main.add_command(upload)
10 changes: 10 additions & 0 deletions datatorch/cli/upload/__init__.py
@@ -0,0 +1,10 @@
import click
from .folder import folder


@click.group(help="Commands for managing uploads.")
def upload():
pass


upload.add_command(folder)
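With the `folder` subcommand registered on this group, the CLI should be invokable along the lines of `datatorch upload folder <folder_path> <project_id>` (assuming `datatorch` is the installed console entry point); the two positional arguments are defined in folder.py below.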
131 changes: 131 additions & 0 deletions datatorch/cli/upload/folder.py
@@ -0,0 +1,131 @@
import os
import click
from datatorch.core.settings import UserSettings
from datatorch.api.api import ApiClient
from datatorch.api.entity.project import Project
from ..spinner import Spinner


@click.command("folder")
@click.argument("folder_path", type=click.Path(exists=True, file_okay=False))
@click.argument("project_id", type=str)
def folder(folder_path, project_id):
"""Bulk upload files to a specified project."""

# Get the list of files to upload
files = [f for f in os.listdir(folder_path)
if os.path.isfile(os.path.join(folder_path, f))]
total_files = len(files)

if total_files == 0:
click.echo("No files found in the specified folder.")
return

# Load user settings
user_settings = UserSettings()
api_key = user_settings.api_key
api_url = user_settings.api_url

if not api_key or not api_url:
click.echo("You are not logged in. "
"Please log in using the `login` command.")
return

    # Initialize the API client
    client = ApiClient(api_url=api_url, api_key=api_key)

    # Validate the endpoint
    if not client.validate_endpoint():
        click.echo("Error: Invalid API endpoint.")
        return
    click.echo("Valid API endpoint verified.")

    # Retrieve the project by ID
    try:
        project = client.project(project_id)
        click.echo(f"Retrieved project: {project.name}")
    except Exception as e:
        click.echo(f"Error: Unable to retrieve "
                   f"project with ID '{project_id}'. {e}")
        return

    # Display available datasets
    try:
        datasets = project.datasets()
        if datasets:
            click.echo("\nAvailable Datasets:")
            for idx, dataset in enumerate(datasets, start=1):
                click.echo(f"{idx}. {dataset.name} (ID: {dataset.id})")

            # Prompt user to select a dataset
            choice = click.prompt(
                "Enter the number of the dataset",
                type=int,
                default=1,
            )
            if 1 <= choice <= len(datasets):
                selected_dataset = datasets[choice - 1]
                click.echo(f"Selected Dataset: {selected_dataset.name} "
                           f"(ID: {selected_dataset.id})")
            else:
                click.echo(f"Invalid choice. Please select a number "
                           f"between 1 and {len(datasets)}.")
                return
        else:
            # No datasets found; ask whether to continue with a global upload
            selected_dataset = None
            continue_upload = click.confirm("No datasets found for this project. "
                                            "Do you want to continue with a global upload?",
                                            default=False)
            if not continue_upload:
                click.echo("Ending...")
                return
    except Exception as e:
        click.echo(f"Error retrieving datasets: {e}")
        return

    # Display available storage links and prompt user selection
    try:
        storage_links = project.storage_links()
        if not storage_links:
            click.echo("No storage available for this project.")
            return

        click.echo("\nAvailable Storages:")
        for idx, storage_link in enumerate(storage_links):
            click.echo(f"{idx + 1}. {storage_link.name} "
                       f"(ID: {storage_link.id})")

        # Prompt user to select a storage link
        choice = click.prompt(
            "Enter the number of the storage to use",
            type=int,
            default=1,
        )
        if 1 <= choice <= len(storage_links):
            selected_storage_link = storage_links[choice - 1]
        else:
            click.echo(f"Invalid choice. Please select a number "
                       f"between 1 and {len(storage_links)}.")
            return

        click.echo(f"Selected Storage: {selected_storage_link.name} "
                   f"(ID: {selected_storage_link.id})")
    except Exception as e:
        click.echo(f"Error retrieving storage: {e}")
        return

    # Initialize the spinner
    spinner = Spinner(f"Uploading files (0/{total_files})")

    # Upload files to the selected storage and dataset using their IDs
    try:
        for idx, file_name in enumerate(files, start=1):
            file_path = os.path.join(folder_path, file_name)
            spinner.set_text(f"Uploading file ({idx}/{total_files})")
            with open(file_path, "rb") as file:
                client.upload_to_filesource(
                    project=project,
                    file=file,
                    storageId=selected_storage_link.id,
                    storageFolderName=None,
                    dataset=selected_dataset,
                )
        spinner.done(f"Uploaded all {total_files} files successfully!")
    except Exception as e:
        spinner.done(f"Error during upload: {e}")
        return
28 changes: 28 additions & 0 deletions examples/upload_files.py
@@ -0,0 +1,28 @@
import os
import datatorch as dt

api = dt.api.ApiClient('your-api-key')
proj = api.project('user-name/project-name')
dset = proj.dataset('data-set-name')

folder_to_upload = 'uploadme'
upload_to_storage_id = 'your-storage-id'

# Get all the file names in the folder
files = [f for f in os.listdir(folder_to_upload)
         if os.path.isfile(os.path.join(folder_to_upload, f))]

# Upload files to the selected storage and dataset using their IDs
try:
    for file_name in files:
        file_path = os.path.join(folder_to_upload, file_name)
        with open(file_path, "rb") as file:
            api.upload_to_filesource(
                project=proj,
                file=file,
                storageId=upload_to_storage_id,
                storageFolderName=None,
                dataset=dset,
            )
except Exception as e:
    print(f"Error Uploading: {e}")
Binary file added examples/uploadme/1copy.jpg
Binary file added examples/uploadme/2copy.jpg
Binary file added examples/uploadme/3copy.jpg
10 changes: 5 additions & 5 deletions setup.py
@@ -1,8 +1,8 @@
from setuptools import setup, find_packages
import sys

# Ensure the Python version is 3.13 or higher
assert sys.version_info >= (3, 13, 0), "DataTorch requires Python 3.13+"
# Ensure the Python version is 3.12 or higher
assert sys.version_info >= (3, 12, 0), "DataTorch requires Python 3.12+"

with open("README.md", "r", encoding="utf-8") as fp:
long_description = fp.read()
@@ -33,7 +33,7 @@

setup(
    name="datatorch",
    version="0.4.8.4",
    version="0.4.8.5",
    description="A CLI and library for interacting with DataTorch.",
    author="DataTorch",
    author_email="[email protected]",
@@ -45,7 +45,7 @@
    long_description=long_description,
    long_description_content_type="text/markdown",
    install_requires=requirements,
    python_requires=">=3.13",
    python_requires=">=3.12",
    license="MIT license",
    zip_safe=False,
    include_package_data=True,
@@ -55,7 +55,7 @@
"Framework :: Pytest",
"Intended Audience :: Developers",
"Natural Language :: English",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
],