Skip to content

Commit 133f9e2

Browse files
committed
feat: add commands
1 parent e3507c5 commit 133f9e2

File tree

8 files changed

+114
-37
lines changed

8 files changed

+114
-37
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ clean:
7070
rm -rf venv node_modules azure_ai/__pycache__ package-lock.json
7171

7272
analyze:
73-
cloc . --exclude-ext=svg,json,zip --vcs=git
73+
cloc . --exclude-ext=svg,json,zip,csv --vcs=git
7474

7575
release:
7676
git commit -m "fix: force a new release" --allow-empty && git push
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
"""
Get or create a compute cluster in Azure AI Studio and print its
serialized representation as pretty-printed JSON.

Usage:
    python3 -m azure_ai.commands.compute_cluster [cluster-name]
"""

import argparse
import json

from azure_ai.ml_studio import AzureAIMLStudioComputeCluster


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get or create a compute cluster in Azure AI Studio.")
    # nargs="?" keeps the previous no-argument invocation working: the
    # old behavior hard-coded "standard-cluster", which is now the default.
    parser.add_argument(
        "cluster_name",
        type=str,
        nargs="?",
        default="standard-cluster",
        help="A compute cluster name, e.g., standard-cluster",
    )
    args = parser.parse_args()

    compute_cluster = AzureAIMLStudioComputeCluster(cluster_name=args.cluster_name).cluster
    serialized = compute_cluster.serialize()  # type: ignore[no-untyped-call]
    print(json.dumps(serialized, indent=4))
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
"""
Create a dataset from a local file.

Usage:
    python3 -m azure_ai.commands.dataset_from_file maths \
        ~/Desktop/gh/fswl/azureml-example/azure_ai/tests/data/maths.csv
"""

import argparse
import os

from azure_ai.ml_studio import AzureAIMLAssetsDataset


# Directory of this module; relative file paths are resolved against it.
HERE = os.path.abspath(os.path.dirname(__file__))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create a dataset from a local file.")
    parser.add_argument(
        "dataset_name",
        type=str,
        help="A dataset name, e.g., maths",
    )
    parser.add_argument(
        "file_path",
        type=str,
        help="A path to a local file, e.g., ~/Desktop/gh/fswl/azureml-example/azure_ai/tests/data/maths.csv",
    )
    args = parser.parse_args()

    # expanduser() first so "~/..." paths (as shown in the help text) resolve
    # to the home directory; os.path.join then discards HERE for absolute
    # paths and only anchors genuinely relative ones to this module.
    file_path = os.path.join(HERE, os.path.expanduser(args.file_path))
    dataset_name = args.dataset_name
    data_set = AzureAIMLAssetsDataset(dataset_name=dataset_name, file_name=file_path)
    pandas_df = data_set.dataset_to_dataframe()
    print(pandas_df.head(n=5))
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
"""
Create a dataset from Kaggle.
https://www.kaggle.com/datasets/heptapod/titanic

Usage:
    python3 -m azure_ai.commands.dataset_from_kaggle titanic heptapod/titanic
"""

import argparse

from azure_ai.ml_studio import AzureAIMLAssetsDataset


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create a dataset from Kaggle.")
    parser.add_argument(
        "dataset_name",
        type=str,
        help="A dataset name, e.g., titanic",
    )
    # Renamed from a duplicate "dataset_name" positional: both positionals
    # shared one dest, so the Kaggle slug silently overwrote the dataset name.
    parser.add_argument(
        "kaggle_dataset",
        type=str,
        help="A Kaggle dataset name, e.g., heptapod/titanic",
    )
    args = parser.parse_args()

    data_set = AzureAIMLAssetsDataset(dataset_name=args.dataset_name, kaggle_dataset=args.kaggle_dataset)

    pandas_df = data_set.dataset_to_dataframe()
    print(pandas_df.head(n=5))

azure_ai/commands/example.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

azure_ai/commands/help.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
"""
Print the available azure_ai command-line entry points with example arguments.
"""

if __name__ == "__main__":

    print("=" * 80)
    print("python3 -m azure_ai.commands.workspace")
    print("python3 -m azure_ai.commands.compute_cluster cluster-name")
    print(
        "python3 -m azure_ai.commands.dataset_from_file maths ~/Desktop/gh/fswl/azureml-example/azure_ai/tests/data/maths.csv"
    )
    print("python3 -m azure_ai.commands.dataset_from_kaggle titanic heptapod/titanic")
    print("python3 -m azure_ai.commands.help")
    print("=" * 80)

azure_ai/commands/workspace.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
"""
Get an existing workspace from Azure AI Studio and print it.
"""

from azure_ai.ml_studio import AzureAIMLWorkspace


if __name__ == "__main__":

    # AzureAIMLWorkspace resolves/attaches the workspace on construction;
    # we only surface its string representation here.
    workspace = AzureAIMLWorkspace().workspace
    print(str(workspace))

azure_ai/ml_studio.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,9 @@ def __init__(
9292
self.kaggle_dataset = kaggle_dataset
9393
self.file_name = file_name
9494
self.description = description
95-
self.dataset = self.get_or_create()
95+
96+
if dataset_name is not None or source_data is not None or kaggle_dataset is not None or file_name is not None:
97+
self.dataset = self.get_or_create()
9698

9799
@classmethod
98100
def get_default_name(cls, workspace: Workspace) -> str:
@@ -161,12 +163,12 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
161163
logger.info("Using provided DataFrame with shape: %s", df.shape)
162164
elif self.kaggle_dataset:
163165
df = self.from_kaggle()
164-
logger.info("Downloaded Kaggle dataset: %s", self.kaggle_dataset)
165166
elif self.file_name:
166167
df = self.from_file(self.file_name)
167-
logger.info("Loaded dataset from file: %s", self.file_name)
168168
else:
169169
raise ValueError("Either source_data or kaggle_dataset or file_name must be provided")
170+
if df is None or df.empty:
171+
raise ValueError(f"File {self.file_name} does not contain valid data")
170172

171173
# Get default datastore
172174
datastore = self.workspace.get_default_datastore()
@@ -176,21 +178,21 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
176178
# Save DataFrame to temporary CSV file
177179
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as temp_file:
178180
df.to_csv(temp_file.name, index=False)
179-
temp_file_path = temp_file.name
180-
logger.info("Saved DataFrame to temporary file: %s", temp_file_path)
181+
temp_file.close()
182+
logger.info("Saved DataFrame to temporary file: %s", temp_file.name)
181183

182184
try:
183-
logger.info("Uploading temporary file to datastore: %s", temp_file_path)
185+
logger.info("Uploading temporary file to datastore: %s", temp_file.name)
184186
target_path = f"datasets/{self.dataset_name}/"
185187

186188
# Create a temporary directory with only our CSV file
187189
with tempfile.TemporaryDirectory() as upload_dir:
188190
# Copy our CSV file to the clean upload directory
189-
csv_filename = f"{self.dataset_name}.csv"
191+
csv_filename = f"{os.path.basename(self.dataset_name)}.csv"
190192
upload_file_path = os.path.join(upload_dir, csv_filename)
191193

192194
# Copy the CSV content to the new file
193-
shutil.copy2(temp_file_path, upload_file_path)
195+
shutil.copy2(temp_file.name, upload_file_path)
194196

195197
# Now upload only this clean directory
196198
FileDatasetFactory.upload_directory(
@@ -217,7 +219,8 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
217219

218220
finally:
219221
# Clean up temporary file
220-
os.unlink(temp_file_path)
222+
if os.path.exists(temp_file.name):
223+
os.unlink(temp_file.name)
221224

222225
def dataset_to_dataframe(self) -> pd.DataFrame:
223226
"""
@@ -273,7 +276,7 @@ def from_file(self, file_path: str) -> pd.DataFrame:
273276
logger.info("Successfully loaded dataset from %s with shape: %s", file_path, df.shape)
274277
return df
275278

276-
def from_kaggle(self) -> pd.DataFrame:
279+
def from_kaggle(self, dataset: Optional[str] = None) -> pd.DataFrame:
277280
"""
278281
Download and load a Kaggle dataset as a pandas DataFrame.
279282
@@ -287,6 +290,7 @@ def from_kaggle(self) -> pd.DataFrame:
287290
Raises:
288291
Exception: If dataset download or loading fails
289292
"""
293+
self.kaggle_dataset = dataset or self.kaggle_dataset
290294
try:
291295
with tempfile.TemporaryDirectory() as temp_dir:
292296
logger.info("Downloading Kaggle dataset: %s", self.kaggle_dataset)

0 commit comments

Comments
 (0)