13
13
from inspect import getsourcelines
14
14
from traceback import print_tb as print_traceback
15
15
from io import open
16
+ from fnmatch import fnmatchcase
16
17
import argparse
17
18
import subprocess
18
19
import re
@@ -1209,7 +1210,7 @@ def mop(args):
1209
1210
workspace_name = workspace ['workspace' ]['name' ]
1210
1211
1211
1212
if args .verbose :
1212
- print ("{0 } -- {1 }" .format (workspace_name , bucket_prefix ))
1213
+ print ("{} -- {}" .format (workspace_name , bucket_prefix ))
1213
1214
1214
1215
referenced_files = set ()
1215
1216
for value in workspace ['workspace' ]['attributes' ].values ():
@@ -1219,23 +1220,47 @@ def mop(args):
1219
1220
# TODO: Make this more efficient with a native api call?
1220
1221
# # Now run a gsutil ls to list files present in the bucket
1221
1222
try :
1222
- gsutil_args = ['gsutil' , 'ls' , 'gs://' + bucket + '/**' ]
1223
+ gsutil_args = ['gsutil' , 'ls' , '-l' , bucket_prefix + '/**' ]
1223
1224
if args .verbose :
1224
1225
print (' ' .join (gsutil_args ))
1225
- bucket_files = subprocess .check_output (gsutil_args , stderr = subprocess .PIPE )
1226
+ bucket_files = subprocess .check_output (gsutil_args , stderr = subprocess .STDOUT )
1226
1227
# Check output produces a string in Py2, Bytes in Py3, so decode if necessary
1227
1228
if type (bucket_files ) == bytes :
1228
1229
bucket_files = bucket_files .decode ()
1229
-
1230
+
1231
+ # Store size of each file in bucket to report recovered space
1232
+ bucket_file_sizes = {}
1233
+ for listing in bucket_files .split ('\n ' ):
1234
+ listing = listing .strip ().split (' ' )
1235
+ if len (listing ) != 3 :
1236
+ break
1237
+ bucket_file_sizes [listing [2 ]] = int (listing [0 ])
1238
+
1239
+ # Now make a call to the API for the user's submission information.
1240
+ user_submission_request = fapi .list_submissions (args .project , args .workspace )
1241
+
1242
+ # Check if API call was successful, in the case of failure, the function will return an error
1243
+ fapi ._check_response_code (user_submission_request , 200 )
1244
+
1245
+ # Collect user submission ids into a set for future bucket file verification
1246
+ submission_ids = set (item ['submissionId' ] for item in user_submission_request .json ())
1247
+
1248
+ # Check whether the bucket file path contains the user's submission id
1249
+ # to ensure deletion of files in the submission directories only.
1250
+ # Splits the bucket file: "gs://bucket_Id/submission_id/file_path", by the '/' symbol
1251
+ # and stores values in a 5 length array: ['gs:', '' , 'bucket_Id', submission_id, file_path]
1252
+ # to extract the submission id from the 4th element (index 3) of the array
1253
+ bucket_files = set (bucket_file for bucket_file in bucket_file_sizes if bucket_file .split ('/' , 4 )[3 ] in submission_ids )
1254
+
1230
1255
except subprocess .CalledProcessError as e :
1231
- eprint ("Error retrieving files from bucket: " + str (e ))
1256
+ eprint ("Error retrieving files from bucket:" +
1257
+ "\n \t {}\n \t {}" .format (str (e ), e .output ))
1232
1258
return 1
1233
1259
1234
- bucket_files = set (bucket_files .strip ().split ('\n ' ))
1235
1260
if args .verbose :
1236
1261
num = len (bucket_files )
1237
1262
if args .verbose :
1238
- print ("Found {0 } files in bucket {1 }" .format (num , bucket ))
1263
+ print ("Found {} files in bucket {}" .format (num , bucket ))
1239
1264
1240
1265
# Now build a set of files that are referenced in the bucket
1241
1266
# 1. Get a list of the entity types in the workspace
@@ -1260,51 +1285,85 @@ def mop(args):
1260
1285
1261
1286
if args .verbose :
1262
1287
num = len (referenced_files )
1263
- print ("Found {0 } referenced files in workspace {1 }" .format (num , workspace_name ))
1288
+ print ("Found {} referenced files in workspace {}" .format (num , workspace_name ))
1264
1289
1265
1290
# Set difference shows files in bucket that aren't referenced
1266
1291
unreferenced_files = bucket_files - referenced_files
1267
1292
1268
1293
# Filter out files like .logs and rc.txt
1269
1294
def can_delete(f):
    '''Return True if this unreferenced file may be deleted in a mop.

    Only the basename of ``f`` (the text after the last '/') is inspected.
    Job bookkeeping files are always kept: logs, return codes, the tool's
    exec.sh / script, and stdout / stderr / output.

    For every other file the decision uses ``args`` from the enclosing
    ``mop`` scope:
      * if ``args.include`` is set, the file is deletable only when its
        basename matches at least one of the given globs;
      * otherwise, if ``args.exclude`` is set, the file is deletable only
        when its basename matches none of the given globs;
      * otherwise the file is deletable.

    Glob matching is case-sensitive (``fnmatchcase``).
    '''
    filename = f.rsplit('/', 1)[-1]
    # Don't delete logs or return codes from jobs
    if filename.endswith(('.log', '-rc.txt')) or filename == "rc":
        return False
    # Don't delete tool's exec.sh or script; keep stdout, stderr, and output
    if filename in ('exec.sh', 'script', 'stderr', 'stdout', 'output'):
        return False
    # Only delete specified unreferenced files
    if args.include:
        return any(fnmatchcase(filename, glob) for glob in args.include)
    # Don't delete specified unreferenced files
    if args.exclude:
        return not any(fnmatchcase(filename, glob) for glob in args.exclude)

    return True
1282
1324
1283
- deleteable_files = [f for f in unreferenced_files if can_delete (f )]
1325
+ deletable_files = [f for f in unreferenced_files if can_delete (f )]
1284
1326
1285
- if len (deleteable_files ) == 0 :
1327
+ if len (deletable_files ) == 0 :
1286
1328
if args .verbose :
1287
1329
print ("No files to mop in " + workspace ['workspace' ]['name' ])
1288
1330
return 0
1331
+
1332
units = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']

def human_readable_size(size_in_bytes):
    '''Takes a bytes value and returns a human-readable string with an
    appropriate unit conversion.

    The value is divided by 1024 until it drops below 1024 or the largest
    entry of ``units`` is reached. Values that were never divided are
    rendered with ``str()``; scaled values are rendered with two decimals.
    '''
    reduce_count = 0
    # Bound the loop by the unit table so the index below can never overflow
    largest_unit = len(units) - 1
    while size_in_bytes >= 1024.0 and reduce_count < largest_unit:
        size_in_bytes /= 1024.0
        reduce_count += 1
    if reduce_count > 0:
        size_str = "{:.2f}".format(size_in_bytes)
    else:
        size_str = str(size_in_bytes)
    return "{} {}".format(size_str, units[reduce_count])
1342
+
1343
+ deletable_size = human_readable_size (sum (bucket_file_sizes [f ]
1344
+ for f in deletable_files ))
1289
1345
1290
1346
if args .verbose or args .dry_run :
1291
- print ("Found {0} files to delete:\n " .format (len (deleteable_files ))
1292
- + "\n " .join (deleteable_files ) + '\n ' )
1293
-
1294
- message = "WARNING: Delete {0} files in {1} ({2})" .format (
1295
- len (deleteable_files ), bucket_prefix , workspace ['workspace' ]['name' ])
1347
+ print ("Found {} files to delete:\n " .format (len (deletable_files )) +
1348
+ "\n " .join ("{} {}" .format (human_readable_size (bucket_file_sizes [f ]).rjust (11 ), f )
1349
+ for f in deletable_files ) +
1350
+ '\n Total Size: {}\n ' .format (deletable_size ))
1351
+
1352
+ message = "WARNING: Delete {} files totaling {} in {} ({})" .format (
1353
+ len (deletable_files ), deletable_size , bucket_prefix ,
1354
+ workspace ['workspace' ]['name' ])
1296
1355
if args .dry_run or (not args .yes and not _confirm_prompt (message )):
1297
1356
return 0
1298
1357
1299
- # Pipe the deleteable_files into gsutil rm to remove them
1358
+ # Pipe the deletable_files into gsutil rm to remove them
1300
1359
gsrm_args = ['gsutil' , '-m' , 'rm' , '-I' ]
1301
1360
PIPE = subprocess .PIPE
1302
1361
STDOUT = subprocess .STDOUT
1303
1362
if args .verbose :
1304
1363
print ("Deleting files with gsutil..." )
1305
1364
gsrm_proc = subprocess .Popen (gsrm_args , stdin = PIPE , stdout = PIPE , stderr = STDOUT )
1306
- # Pipe the deleteable_files into gsutil
1307
- result = gsrm_proc .communicate (input = '\n ' .join (deleteable_files ).encode ())[0 ]
1365
+ # Pipe the deletable_files into gsutil
1366
+ result = gsrm_proc .communicate (input = '\n ' .join (deletable_files ).encode ())[0 ]
1308
1367
if args .verbose :
1309
1368
if type (result ) == bytes :
1310
1369
result = result .decode ()
@@ -1314,7 +1373,7 @@ def can_delete(f):
1314
1373
@fiss_cmd
def noop(args):
    '''Do nothing except, in verbose mode, print the project and workspace.

    ``project`` and ``workspace`` are read from ``args`` with getattr and
    fall back to "unspecified" when the attribute is absent. Always
    returns 0.
    '''
    if args.verbose:
        proj = getattr(args, "project", "unspecified")
        space = getattr(args, "workspace", "unspecified")
        print('fiss no-op command: Project=%s, Space=%s' % (proj, space))
    return 0
@@ -2411,6 +2470,14 @@ def main(argv=None):
2411
2470
parents = [workspace_parent ])
2412
2471
subp .add_argument ('--dry-run' , action = 'store_true' ,
2413
2472
help = 'Show deletions that would be performed' )
2473
+ group = subp .add_mutually_exclusive_group ()
2474
+ group .add_argument ('-i' , '--include' , nargs = '+' , metavar = "glob" ,
2475
+ help = "Only delete unreferenced files matching the " +
2476
+ "given UNIX glob-style pattern(s)" )
2477
+ group .add_argument ('-x' , '--exclude' , nargs = '+' , metavar = "glob" ,
2478
+ help = "Only delete unreferenced files that don't match" +
2479
+ " the given UNIX glob-style pattern(s)" )
2480
+
2414
2481
subp .set_defaults (func = mop )
2415
2482
2416
2483
subp = subparsers .add_parser ('noop' ,
0 commit comments