Skip to content

Commit 3870bee

Browse files
authored
v0.16.25 (#129)
Multiple enhancements to mop: * Only delete unreferenced files generated by workflows. (#127 by @Marianie-Simeon) * Ignore more files generated by the execution engine (e.g. rc, script, stdout, stderr). (closes #128) * Report file sizes in summaries and listings. (closes #118) * Allow user to include/exclude files via glob. (closes #116)
1 parent ec3aab5 commit 3870bee

File tree

3 files changed

+96
-24
lines changed

3 files changed

+96
-24
lines changed

changelog.txt

+5
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@ Change Log for FISSFC: the (Fi)recloud (S)ervice (S)elector
33
=======================================================================
44
Terms used below: HL = high level interface, LL = low level interface
55

6+
v0.16.25 - HL: multiple enhancements to mop; only delete unreferenced files
7+
generated by workflows, ignore more files generated by the execution
8+
engine (e.g. rc, script, stdout, stderr), report file sizes, allow
9+
user to include/exclude files via glob.
10+
611
v0.16.24 - Hotfix: corrected error in api.py due to difference in user ID
712
location when run from a Google Cloud VM; setup.py updated to
813
explicitly designate long_description_content_type as text/plain.

firecloud/__about__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# Package version
2-
__version__ = "0.16.24"
2+
__version__ = "0.16.25"

firecloud/fiss.py

+90-23
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from inspect import getsourcelines
1414
from traceback import print_tb as print_traceback
1515
from io import open
16+
from fnmatch import fnmatchcase
1617
import argparse
1718
import subprocess
1819
import re
@@ -1209,7 +1210,7 @@ def mop(args):
12091210
workspace_name = workspace['workspace']['name']
12101211

12111212
if args.verbose:
1212-
print("{0} -- {1}".format(workspace_name, bucket_prefix))
1213+
print("{} -- {}".format(workspace_name, bucket_prefix))
12131214

12141215
referenced_files = set()
12151216
for value in workspace['workspace']['attributes'].values():
@@ -1219,23 +1220,47 @@ def mop(args):
12191220
# TODO: Make this more efficient with a native api call?
12201221
# # Now run a gsutil ls to list files present in the bucket
12211222
try:
1222-
gsutil_args = ['gsutil', 'ls', 'gs://' + bucket + '/**']
1223+
gsutil_args = ['gsutil', 'ls', '-l', bucket_prefix + '/**']
12231224
if args.verbose:
12241225
print(' '.join(gsutil_args))
1225-
bucket_files = subprocess.check_output(gsutil_args, stderr=subprocess.PIPE)
1226+
bucket_files = subprocess.check_output(gsutil_args, stderr=subprocess.STDOUT)
12261227
# Check output produces a string in Py2, Bytes in Py3, so decode if necessary
12271228
if type(bucket_files) == bytes:
12281229
bucket_files = bucket_files.decode()
1229-
1230+
1231+
# Store size of each file in bucket to report recovered space
1232+
bucket_file_sizes = {}
1233+
for listing in bucket_files.split('\n'):
1234+
listing = listing.strip().split(' ')
1235+
if len(listing) != 3:
1236+
break
1237+
bucket_file_sizes[listing[2]] = int(listing[0])
1238+
1239+
# Now make a call to the API for the user's submission information.
1240+
user_submission_request = fapi.list_submissions(args.project, args.workspace)
1241+
1242+
# Check if API call was successful, in the case of failure, the function will return an error
1243+
fapi._check_response_code(user_submission_request, 200)
1244+
1245+
# Collect the user's submission ids into a set for future bucket file verification
1246+
submission_ids = set(item['submissionId'] for item in user_submission_request.json())
1247+
1248+
# Check whether each bucket file path contains one of the user's submission ids
1249+
# to ensure deletion of files in the submission directories only.
1250+
# Splits the bucket file: "gs://bucket_Id/submission_id/file_path", by the '/' symbol
1251+
# and stores the values in a list of at most 5 elements: ['gs:', '', 'bucket_Id', submission_id, file_path]
1252+
# to extract the submission id from the 4th element (index 3) of the array
1253+
bucket_files = set(bucket_file for bucket_file in bucket_file_sizes if bucket_file.split('/', 4)[3] in submission_ids)
1254+
12301255
except subprocess.CalledProcessError as e:
1231-
eprint("Error retrieving files from bucket: " + str(e))
1256+
eprint("Error retrieving files from bucket:" +
1257+
"\n\t{}\n\t{}".format(str(e), e.output))
12321258
return 1
12331259

1234-
bucket_files = set(bucket_files.strip().split('\n'))
12351260
if args.verbose:
12361261
num = len(bucket_files)
12371262
if args.verbose:
1238-
print("Found {0} files in bucket {1}".format(num, bucket))
1263+
print("Found {} files in bucket {}".format(num, bucket))
12391264

12401265
# Now build a set of files that are referenced in the bucket
12411266
# 1. Get a list of the entity types in the workspace
@@ -1260,51 +1285,85 @@ def mop(args):
12601285

12611286
if args.verbose:
12621287
num = len(referenced_files)
1263-
print("Found {0} referenced files in workspace {1}".format(num, workspace_name))
1288+
print("Found {} referenced files in workspace {}".format(num, workspace_name))
12641289

12651290
# Set difference shows files in bucket that aren't referenced
12661291
unreferenced_files = bucket_files - referenced_files
12671292

12681293
# Filter out files like .logs and rc.txt
12691294
def can_delete(f):
12701295
'''Return True if this file may be deleted in a mop.'''
1296+
filename = f.rsplit('/', 1)[-1]
12711297
# Don't delete logs
1272-
if f.endswith('.log'):
1298+
if filename.endswith('.log'):
12731299
return False
12741300
# Don't delete return codes from jobs
1275-
if f.endswith('-rc.txt'):
1301+
if filename.endswith('-rc.txt'):
1302+
return False
1303+
if filename == "rc":
12761304
return False
1277-
# Don't delete tool's exec.sh
1278-
if f.endswith('exec.sh'):
1305+
# Don't delete tool's exec.sh or script
1306+
if filename in ('exec.sh', 'script'):
12791307
return False
1308+
# keep stdout, stderr, and output
1309+
if filename in ('stderr', 'stdout', 'output'):
1310+
return False
1311+
# Only delete specified unreferenced files
1312+
if args.include:
1313+
for glob in args.include:
1314+
if fnmatchcase(filename, glob):
1315+
return True
1316+
return False
1317+
# Don't delete specified unreferenced files
1318+
if args.exclude:
1319+
for glob in args.exclude:
1320+
if fnmatchcase(filename, glob):
1321+
return False
12801322

12811323
return True
12821324

1283-
deleteable_files = [f for f in unreferenced_files if can_delete(f)]
1325+
deletable_files = [f for f in unreferenced_files if can_delete(f)]
12841326

1285-
if len(deleteable_files) == 0:
1327+
if len(deletable_files) == 0:
12861328
if args.verbose:
12871329
print("No files to mop in " + workspace['workspace']['name'])
12881330
return 0
1331+
1332+
units = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']
1333+
def human_readable_size(size_in_bytes):
1334+
'''Takes a bytes value and returns a human-readable string with an
1335+
appropriate unit conversion'''
1336+
reduce_count = 0
1337+
while size_in_bytes >= 1024.0 and reduce_count < 5:
1338+
size_in_bytes /= 1024.0
1339+
reduce_count += 1
1340+
size_str = "{:.2f}".format(size_in_bytes) if reduce_count > 0 else str(size_in_bytes)
1341+
return "{} {}".format(size_str, units[reduce_count])
1342+
1343+
deletable_size = human_readable_size(sum(bucket_file_sizes[f]
1344+
for f in deletable_files))
12891345

12901346
if args.verbose or args.dry_run:
1291-
print("Found {0} files to delete:\n".format(len(deleteable_files))
1292-
+ "\n".join(deleteable_files ) + '\n')
1293-
1294-
message = "WARNING: Delete {0} files in {1} ({2})".format(
1295-
len(deleteable_files), bucket_prefix, workspace['workspace']['name'])
1347+
print("Found {} files to delete:\n".format(len(deletable_files)) +
1348+
"\n".join("{} {}".format(human_readable_size(bucket_file_sizes[f]).rjust(11), f)
1349+
for f in deletable_files) +
1350+
'\nTotal Size: {}\n'.format(deletable_size))
1351+
1352+
message = "WARNING: Delete {} files totaling {} in {} ({})".format(
1353+
len(deletable_files), deletable_size, bucket_prefix,
1354+
workspace['workspace']['name'])
12961355
if args.dry_run or (not args.yes and not _confirm_prompt(message)):
12971356
return 0
12981357

1299-
# Pipe the deleteable_files into gsutil rm to remove them
1358+
# Pipe the deletable_files into gsutil rm to remove them
13001359
gsrm_args = ['gsutil', '-m', 'rm', '-I']
13011360
PIPE = subprocess.PIPE
13021361
STDOUT=subprocess.STDOUT
13031362
if args.verbose:
13041363
print("Deleting files with gsutil...")
13051364
gsrm_proc = subprocess.Popen(gsrm_args, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
1306-
# Pipe the deleteable_files into gsutil
1307-
result = gsrm_proc.communicate(input='\n'.join(deleteable_files).encode())[0]
1365+
# Pipe the deletable_files into gsutil
1366+
result = gsrm_proc.communicate(input='\n'.join(deletable_files).encode())[0]
13081367
if args.verbose:
13091368
if type(result) == bytes:
13101369
result = result.decode()
@@ -1314,7 +1373,7 @@ def can_delete(f):
13141373
@fiss_cmd
13151374
def noop(args):
13161375
if args.verbose:
1317-
proj = getattr(args, "project","unspecified")
1376+
proj = getattr(args, "project", "unspecified")
13181377
space = getattr(args, "workspace", "unspecified")
13191378
print('fiss no-op command: Project=%s, Space=%s' % (proj, space))
13201379
return 0
@@ -2411,6 +2470,14 @@ def main(argv=None):
24112470
parents=[workspace_parent])
24122471
subp.add_argument('--dry-run', action='store_true',
24132472
help='Show deletions that would be performed')
2473+
group = subp.add_mutually_exclusive_group()
2474+
group.add_argument('-i', '--include', nargs='+', metavar="glob",
2475+
help="Only delete unreferenced files matching the " +
2476+
"given UNIX glob-style pattern(s)")
2477+
group.add_argument('-x', '--exclude', nargs='+', metavar="glob",
2478+
help="Only delete unreferenced files that don't match" +
2479+
" the given UNIX glob-style pattern(s)")
2480+
24142481
subp.set_defaults(func=mop)
24152482

24162483
subp = subparsers.add_parser('noop',

0 commit comments

Comments
 (0)