@@ -92,7 +92,9 @@ def __init__(
9292 self .kaggle_dataset = kaggle_dataset
9393 self .file_name = file_name
9494 self .description = description
95- self .dataset = self .get_or_create ()
95+
96+ if dataset_name is not None or source_data is not None or kaggle_dataset is not None or file_name is not None :
97+ self .dataset = self .get_or_create ()
9698
9799 @classmethod
98100 def get_default_name (cls , workspace : Workspace ) -> str :
@@ -161,12 +163,12 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
161163 logger .info ("Using provided DataFrame with shape: %s" , df .shape )
162164 elif self .kaggle_dataset :
163165 df = self .from_kaggle ()
164- logger .info ("Downloaded Kaggle dataset: %s" , self .kaggle_dataset )
165166 elif self .file_name :
166167 df = self .from_file (self .file_name )
167- logger .info ("Loaded dataset from file: %s" , self .file_name )
168168 else :
169169 raise ValueError ("Either source_data or kaggle_dataset or file_name must be provided" )
170+ if df is None or df .empty :
171+ raise ValueError (f"File { self .file_name } does not contain valid data" )
170172
171173 # Get default datastore
172174 datastore = self .workspace .get_default_datastore ()
@@ -176,21 +178,21 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
176178 # Save DataFrame to temporary CSV file
177179 with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".csv" , delete = False ) as temp_file :
178180 df .to_csv (temp_file .name , index = False )
179- temp_file_path = temp_file .name
180- logger .info ("Saved DataFrame to temporary file: %s" , temp_file_path )
181+ temp_file .close ()
182+ logger .info ("Saved DataFrame to temporary file: %s" , temp_file . name )
181183
182184 try :
183- logger .info ("Uploading temporary file to datastore: %s" , temp_file_path )
185+ logger .info ("Uploading temporary file to datastore: %s" , temp_file . name )
184186 target_path = f"datasets/{ self .dataset_name } /"
185187
186188 # Create a temporary directory with only our CSV file
187189 with tempfile .TemporaryDirectory () as upload_dir :
188190 # Copy our CSV file to the clean upload directory
189- csv_filename = f"{ self .dataset_name } .csv"
191+ csv_filename = f"{ os . path . basename ( self .dataset_name ) } .csv"
190192 upload_file_path = os .path .join (upload_dir , csv_filename )
191193
192194 # Copy the CSV content to the new file
193- shutil .copy2 (temp_file_path , upload_file_path )
195+ shutil .copy2 (temp_file . name , upload_file_path )
194196
195197 # Now upload only this clean directory
196198 FileDatasetFactory .upload_directory (
@@ -217,7 +219,8 @@ def get_or_create(self) -> Union[FileDataset, TabularDataset]:
217219
218220 finally :
219221 # Clean up temporary file
220- os .unlink (temp_file_path )
222+ if os .path .exists (temp_file .name ):
223+ os .unlink (temp_file .name )
221224
222225 def dataset_to_dataframe (self ) -> pd .DataFrame :
223226 """
@@ -273,7 +276,7 @@ def from_file(self, file_path: str) -> pd.DataFrame:
273276 logger .info ("Successfully loaded dataset from %s with shape: %s" , file_path , df .shape )
274277 return df
275278
276- def from_kaggle (self ) -> pd .DataFrame :
279+ def from_kaggle (self , dataset : Optional [ str ] = None ) -> pd .DataFrame :
277280 """
278281 Download and load a Kaggle dataset as a pandas DataFrame.
279282
@@ -287,6 +290,7 @@ def from_kaggle(self) -> pd.DataFrame:
287290 Raises:
288291 Exception: If dataset download or loading fails
289292 """
293+ self .kaggle_dataset = dataset or self .kaggle_dataset
290294 try :
291295 with tempfile .TemporaryDirectory () as temp_dir :
292296 logger .info ("Downloading Kaggle dataset: %s" , self .kaggle_dataset )
0 commit comments