Skip to content

Commit 133f9e2

Browse files
committed
feat: add commands
1 parent e3507c5 commit 133f9e2

File tree

8 files changed

+114
-37
lines changed

8 files changed

+114
-37
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ clean:
7070
rm -rf venv node_modules azure_ai/__pycache__ package-lock.json
7171

7272
analyze:
73-
cloc . --exclude-ext=svg,json,zip --vcs=git
73+
cloc . --exclude-ext=svg,json,zip,csv --vcs=git
7474

7575
release:
7676
git commit -m "fix: force a new release" --allow-empty && git push
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
"""
Get or create a compute cluster in Azure AI Studio and print its
serialized representation as pretty-printed JSON.

Usage:
    python3 -m azure_ai.commands.compute_cluster [cluster-name]
"""

import argparse
import json

from azure_ai.ml_studio import AzureAIMLStudioComputeCluster


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get or create a compute cluster in Azure AI Studio.")
    # nargs="?" keeps the previous no-argument invocation working: the
    # old behavior hard-coded "standard-cluster", which is now the default.
    parser.add_argument(
        "cluster_name",
        type=str,
        nargs="?",
        default="standard-cluster",
        help="A compute cluster name, e.g., standard-cluster",
    )
    args = parser.parse_args()

    compute_cluster = AzureAIMLStudioComputeCluster(cluster_name=args.cluster_name).cluster
    serialized = compute_cluster.serialize()  # type: ignore[no-untyped-call]
    print(json.dumps(serialized, indent=4))
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
"""
Create a dataset from a local file.

Usage:
    python3 -m azure_ai.commands.dataset_from_file maths \
        ~/Desktop/gh/fswl/azureml-example/azure_ai/tests/data/maths.csv
"""

import argparse
import os

from azure_ai.ml_studio import AzureAIMLAssetsDataset


# Directory of this module; relative file paths are resolved against it.
HERE = os.path.abspath(os.path.dirname(__file__))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create a dataset from a local file.")
    parser.add_argument(
        "dataset_name",
        type=str,
        help="A dataset name, e.g., maths",
    )
    parser.add_argument(
        "file_path",
        type=str,
        help="A path to a local file, e.g., ~/Desktop/gh/fswl/azureml-example/azure_ai/tests/data/maths.csv",
    )
    args = parser.parse_args()

    # expanduser() first so "~/..." paths (as shown in the help text) resolve
    # to the home directory; os.path.join then discards HERE for absolute
    # paths and only anchors genuinely relative ones to this module.
    file_path = os.path.join(HERE, os.path.expanduser(args.file_path))
    dataset_name = args.dataset_name
    data_set = AzureAIMLAssetsDataset(dataset_name=dataset_name, file_name=file_path)
    pandas_df = data_set.dataset_to_dataframe()
    print(pandas_df.head(n=5))
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
"""
Create a dataset from Kaggle.
https://www.kaggle.com/datasets/heptapod/titanic

Usage:
    python3 -m azure_ai.commands.dataset_from_kaggle titanic heptapod/titanic
"""

import argparse

from azure_ai.ml_studio import AzureAIMLAssetsDataset


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create a dataset from Kaggle.")
    parser.add_argument(
        "dataset_name",
        type=str,
        help="A dataset name, e.g., titanic",
    )
    # Renamed from a duplicate "dataset_name" positional: both positionals
    # shared one dest, so the Kaggle slug silently overwrote the dataset name.
    parser.add_argument(
        "kaggle_dataset",
        type=str,
        help="A Kaggle dataset name, e.g., heptapod/titanic",
    )
    args = parser.parse_args()

    data_set = AzureAIMLAssetsDataset(dataset_name=args.dataset_name, kaggle_dataset=args.kaggle_dataset)

    pandas_df = data_set.dataset_to_dataframe()
    print(pandas_df.head(n=5))

azure_ai/commands/example.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

azure_ai/commands/help.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
"""
Print the available azure_ai command-line entry points with example arguments.
"""

if __name__ == "__main__":

    print("=" * 80)
    print("python3 -m azure_ai.commands.workspace")
    print("python3 -m azure_ai.commands.compute_cluster cluster-name")
    print(
        "python3 -m azure_ai.commands.dataset_from_file maths ~/Desktop/gh/fswl/azureml-example/azure_ai/tests/data/maths.csv"
    )
    print("python3 -m azure_ai.commands.dataset_from_kaggle titanic heptapod/titanic")
    print("python3 -m azure_ai.commands.help")
    print("=" * 80)

azure_ai/commands/workspace.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
"""
Get an existing workspace from Azure AI Studio and print it.
"""

from azure_ai.ml_studio import AzureAIMLWorkspace


if __name__ == "__main__":

    # AzureAIMLWorkspace resolves/attaches the workspace on construction;
    # we only surface its string representation here.
    workspace = AzureAIMLWorkspace().workspace
    print(str(workspace))

azure_ai/ml_studio.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,9 @@ def __init__(
9292
self.kaggle_dataset = kaggle_dataset
9393
self.file_name = file_name
9494
self.description = description
95-
self.dataset = self.get_or_create()
95+
96+
if dataset_name is not None or source_data is not None or kaggle_dataset is not None or file_name is not None:
97+
self.dataset = self.get_or_create()
9698

9799
@classmethod
98100
def get_default_name(cls, workspace: Workspace) -> str:
@@ -161,12 +163,12 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
161163
logger.info("Using provided DataFrame with shape: %s", df.shape)
162164
elif self.kaggle_dataset:
163165
df = self.from_kaggle()
164-
logger.info("Downloaded Kaggle dataset: %s", self.kaggle_dataset)
165166
elif self.file_name:
166167
df = self.from_file(self.file_name)
167-
logger.info("Loaded dataset from file: %s", self.file_name)
168168
else:
169169
raise ValueError("Either source_data or kaggle_dataset or file_name must be provided")
170+
if df is None or df.empty:
171+
raise ValueError(f"File {self.file_name} does not contain valid data")
170172

171173
# Get default datastore
172174
datastore = self.workspace.get_default_datastore()
@@ -176,21 +178,21 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
176178
# Save DataFrame to temporary CSV file
177179
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp_file:
178180
df.to_csv(temp_file.name, index=False)
179-
temp_file_path = temp_file.name
180-
logger.info("Saved DataFrame to temporary file: %s", temp_file_path)
181+
temp_file.close()
182+
logger.info("Saved DataFrame to temporary file: %s", temp_file.name)
181183

182184
try:
183-
logger.info("Uploading temporary file to datastore: %s", temp_file_path)
185+
logger.info("Uploading temporary file to datastore: %s", temp_file.name)
184186
target_path = f"datasets/{self.dataset_name}/"
185187

186188
# Create a temporary directory with only our CSV file
187189
with tempfile.TemporaryDirectory() as upload_dir:
188190
# Copy our CSV file to the clean upload directory
189-
csv_filename = f"{self.dataset_name}.csv"
191+
csv_filename = f"{os.path.basename(self.dataset_name)}.csv"
190192
upload_file_path = os.path.join(upload_dir, csv_filename)
191193

192194
# Copy the CSV content to the new file
193-
shutil.copy2(temp_file_path, upload_file_path)
195+
shutil.copy2(temp_file.name, upload_file_path)
194196

195197
# Now upload only this clean directory
196198
FileDatasetFactory.upload_directory(
@@ -217,7 +219,8 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
217219

218220
finally:
219221
# Clean up temporary file
220-
os.unlink(temp_file_path)
222+
if os.path.exists(temp_file.name):
223+
os.unlink(temp_file.name)
221224

222225
def dataset_to_dataframe(self) -> pd.DataFrame:
223226
"""
@@ -273,7 +276,7 @@ def from_file(self, file_path: str) -> pd.DataFrame:
273276
logger.info("Successfully loaded dataset from %s with shape: %s", file_path, df.shape)
274277
return df
275278

276-
def from_kaggle(self) -> pd.DataFrame:
279+
def from_kaggle(self, dataset: Optional[str] = None) -> pd.DataFrame:
277280
"""
278281
Download and load a Kaggle dataset as a pandas DataFrame.
279282
@@ -287,6 +290,7 @@ def from_kaggle(self) -> pd.DataFrame:
287290
Raises:
288291
Exception: If dataset download or loading fails
289292
"""
293+
self.kaggle_dataset = dataset or self.kaggle_dataset
290294
try:
291295
with tempfile.TemporaryDirectory() as temp_dir:
292296
logger.info("Downloading Kaggle dataset: %s", self.kaggle_dataset)

0 commit comments

Comments
 (0)