@@ -1089,6 +1089,7 @@ def __init__(
         start_method: Optional[str] = None,
         storage_options: dict[str, Any] = {},
         keep_data_ordered: bool = True,
+        verbose: bool = True,
     ):
         """Provides an efficient way to process data across multiple machines into chunks to make training faster.

@@ -1115,6 +1116,7 @@ def __init__(
             inside an interactive shell like IPython.
             storage_options: Storage options for the cloud provider.
             keep_data_ordered: Whether to use a shared queue for the workers or not.
+            verbose: Whether to print the progress & logs of the workers. Defaults to True.
         """
         # spawn doesn't work in IPython
         start_method = start_method or ("fork" if in_notebook() else "spawn")
@@ -1124,7 +1126,8 @@ def __init__(
             msg += "Tip: Libraries relying on lock can hang with `fork`. To use `spawn` in notebooks, "
             msg += "move your code to files and import it within the notebook."

-            print(msg)
+            if verbose:
+                print(msg)

         multiprocessing.set_start_method(start_method, force=True)

@@ -1166,9 +1169,13 @@ def __init__(
         if self.output_dir:
             # Ensure the output dir is the same across all nodes
             self.output_dir = broadcast_object("output_dir", self.output_dir, rank=_get_node_rank())
-            print(f"Storing the files under {self.output_dir.path if self.output_dir.path else self.output_dir.url}")
+            if verbose:
+                print(
+                    f"Storing the files under {self.output_dir.path if self.output_dir.path else self.output_dir.url}"
+                )

         self.random_seed = random_seed
+        self.verbose = verbose

     def run(self, data_recipe: DataRecipe) -> None:
         """Triggers the data recipe processing over your dataset."""
@@ -1179,7 +1186,8 @@ def run(self, data_recipe: DataRecipe) -> None:
             self._cleanup_checkpoints()

         t0 = time()
-        print(f"Setup started with fast_dev_run={self.fast_dev_run}.")
+        if self.verbose:
+            print(f"Setup started with fast_dev_run={self.fast_dev_run}.")

         # Force random seed to be fixed
         random.seed(self.random_seed)
@@ -1231,7 +1239,8 @@ def run(self, data_recipe: DataRecipe) -> None:
             if isinstance(user_items, list)
             else "Using a Queue to process items on demand."
         )
-        print(f"Setup finished in {round(time() - t0, 3)} seconds. {msg}")
+        if self.verbose:
+            print(f"Setup finished in {round(time() - t0, 3)} seconds. {msg}")

         if self.use_checkpoint:
             if isinstance(user_items, multiprocessing.queues.Queue):
@@ -1244,49 +1253,56 @@ def run(self, data_recipe: DataRecipe) -> None:
                 # Checkpoint feature is not supported for generators for now.
                 raise ValueError("Checkpoint feature is not supported for generators, yet.")
             # get the last checkpoint details
-            print("Resuming from last saved checkpoint...")
+            if self.verbose:
+                print("Resuming from last saved checkpoint...")
             self._load_checkpoint_config(workers_user_items)

             assert isinstance(self.checkpoint_next_index, list)

             if all(self.checkpoint_next_index[i] == 0 for i in range(self.num_workers)):
                 # save the current configuration in the checkpoints.json file
-                print("No checkpoints found. Saving current configuration...")
+                if self.verbose:
+                    print("No checkpoints found. Saving current configuration...")
                 self._save_current_config(workers_user_items)
             else:
                 # load the last checkpoint details
                 assert isinstance(self.checkpoint_next_index, list)
                 workers_user_items = [w[self.checkpoint_next_index[i] :] for i, w in enumerate(workers_user_items)]
-                print("Checkpoints loaded successfully.")
+                if self.verbose:
+                    print("Checkpoints loaded successfully.")

         if self.fast_dev_run and not isinstance(user_items, multiprocessing.queues.Queue):
             assert isinstance(workers_user_items, list)

             items_to_keep = self.fast_dev_run if isinstance(self.fast_dev_run, int) else _DEFAULT_FAST_DEV_RUN_ITEMS
             workers_user_items = [w[:items_to_keep] for w in workers_user_items]
-            print(f"Fast dev run is enabled. Limiting to {items_to_keep} items per process.")
+            if self.verbose:
+                print(f"Fast dev run is enabled. Limiting to {items_to_keep} items per process.")

         self._cleanup_cache()

         num_items = sum([len(items) for items in workers_user_items]) if workers_user_items is not None else -1

-        if workers_user_items is not None:
-            print(
-                f"Starting {self.num_workers} workers with {num_items} items."
-                f" The progress bar is only updated when a worker finishes."
-            )
-        else:
-            print(f"Starting {self.num_workers} workers with a Queue to process items on demand.")
+        if self.verbose:
+            if workers_user_items is not None:
+                print(
+                    f"Starting {self.num_workers} workers with {num_items} items."
+                    f" The progress bar is only updated when a worker finishes."
+                )
+            else:
+                print(f"Starting {self.num_workers} workers with a Queue to process items on demand.")

         if self.input_dir is None and self.src_resolver is not None and self.input_dir:
             self.input_dir = self.src_resolver(self.input_dir)
-            print(f"The remote_dir is `{self.input_dir}`.")
+            if self.verbose:
+                print(f"The remote_dir is `{self.input_dir}`.")

         signal.signal(signal.SIGINT, self._signal_handler)

         self._create_process_workers(data_recipe, workers_user_items)

-        print("Workers are ready ! Starting data processing...")
+        if self.verbose:
+            print("Workers are ready ! Starting data processing...")

         current_total = 0
         if _TQDM_AVAILABLE:
@@ -1306,7 +1322,8 @@ def run(self, data_recipe: DataRecipe) -> None:
         total_num_items = len(user_items) if isinstance(user_items, list) else -1

         while True:
-            flush_msg_queue(self.msg_queue, pbar if _TQDM_AVAILABLE else None)
+            if self.verbose:
+                flush_msg_queue(self.msg_queue, pbar if _TQDM_AVAILABLE else None)

             # Exit early if all the workers are done.
             # This means either there were some kind of errors, or the optimize function was very small.
@@ -1315,7 +1332,8 @@ def run(self, data_recipe: DataRecipe) -> None:
                 error = self.error_queue.get(timeout=0.01)
                 self._exit_on_error(error)
             except Empty:
-                print("All workers are done. Exiting!")
+                if self.verbose:
+                    print("All workers are done. Exiting!")
                 break

             try:
@@ -1349,13 +1367,15 @@ def run(self, data_recipe: DataRecipe) -> None:
             with open("status.json", "w") as f:
                 json.dump({"progress": str(100 * current_total * num_nodes / total_num_items) + "%"}, f)

-        flush_msg_queue(self.msg_queue, pbar if _TQDM_AVAILABLE else None)
+        if self.verbose:
+            flush_msg_queue(self.msg_queue, pbar if _TQDM_AVAILABLE else None)

         if _TQDM_AVAILABLE:
             pbar.clear()
             pbar.close()

-        print("Workers are finished.")
+        if self.verbose:
+            print("Workers are finished.")
         size = len(workers_user_items) if workers_user_items is not None else None
         result = data_recipe._done(size, self.delete_cached_files, self.output_dir)

@@ -1375,8 +1395,8 @@ def run(self, data_recipe: DataRecipe) -> None:
                 num_chunks=result.num_chunks,
                 num_bytes_per_chunk=result.num_bytes_per_chunk,
             )
-
-        print("Finished data processing!")
+        if self.verbose:
+            print("Finished data processing!")

         if self.use_checkpoint and isinstance(data_recipe, DataChunkRecipe):
             # clean up checkpoints
             self._cleanup_checkpoints()
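
For context, a minimal usage sketch of the new flag. The `DataProcessor` class name and the gated prints come from this diff; the import path and the other constructor arguments shown here are assumptions for illustration, not confirmed by the patch:

# Hypothetical usage sketch. The import path and the input/output
# arguments are assumptions inferred from this diff, not from docs.
from litdata.processing.data_processor import DataProcessor  # assumed path

processor = DataProcessor(
    input_dir="s3://my-bucket/raw",      # hypothetical input location
    output_dir="s3://my-bucket/chunks",  # hypothetical output location
    verbose=False,  # new flag: suppresses the prints gated in this diff
)
processor.run(my_recipe)  # my_recipe: a user-defined DataRecipe instance

With `verbose=False`, the setup, checkpoint, progress, and worker-status messages gated above are suppressed, while the error path (`self.error_queue` / `self._exit_on_error`) is unaffected.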