Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Partial fileset (Issue #87) #91

Merged
merged 7 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,9 @@ ENV/
*~
\#*\#

.venv
.venv

# vscode
settings.json
# any additional data being used for development
data/
27 changes: 16 additions & 11 deletions ifcb/data/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@ class Fileset(object):
"""
Represents a set of three raw data files
"""
def __init__(self, basepath):
def __init__(self, basepath, require_roi_files=True):
"""
:param basepath: the base path of the files (no extension)
"""
self.basepath = basepath
self.require_roi_files = require_roi_files
@property
def adc_path(self):
"""
Expand Down Expand Up @@ -61,13 +62,14 @@ def exists(self):
"""
Checks for existence of all three raw data files.

The .roi file is only required when ``self.require_roi_files`` is true (set in the constructor); ``exists()`` itself takes no arguments.
:returns bool: whether or not all files exist
"""
if not os.path.exists(self.adc_path):
return False
if not os.path.exists(self.hdr_path):
return False
if not os.path.exists(self.roi_path):
if self.require_roi_files and not os.path.exists(self.roi_path):
return False
return True
# metrics
Expand Down Expand Up @@ -252,7 +254,7 @@ def validate_path(filepath, blacklist=DEFAULT_BLACKLIST, whitelist=DEFAULT_WHITE
return False
return True

def list_filesets(dirpath, blacklist=DEFAULT_BLACKLIST, whitelist=DEFAULT_WHITELIST, sort=True, validate=True):
def list_filesets(dirpath, blacklist=DEFAULT_BLACKLIST, whitelist=DEFAULT_WHITELIST, sort=True, validate=True, require_roi_files=True):
"""
Iterate over entire directory tree and yield a Fileset
object for each .adc/.hdr/.roi fileset found. Warning: for
Expand All @@ -263,6 +265,7 @@ def list_filesets(dirpath, blacklist=DEFAULT_BLACKLIST, whitelist=DEFAULT_WHITEL
do not match a file's basename
:param sort: whether to sort output (sorts by alpha)
:param validate: whether to validate each path
:param require_roi_files: bool, whether to require the .roi file
"""
if not set(blacklist).isdisjoint(set(whitelist)):
raise ValueError('whitelist and blacklist must be disjoint')
Expand All @@ -275,7 +278,7 @@ def list_filesets(dirpath, blacklist=DEFAULT_BLACKLIST, whitelist=DEFAULT_WHITEL
filenames.sort(reverse=True)
for f in filenames:
basename, extension = f[:-4], f[-3:]
if extension == 'adc' and basename+'.hdr' in filenames and basename+'.roi' in filenames:
if extension == 'adc' and basename+'.hdr' in filenames and (not require_roi_files or basename+'.roi' in filenames):
if validate:
reldir = dp[len(dirpath)+1:]
if not validate_path(os.path.join(reldir,basename), whitelist=whitelist, blacklist=blacklist):
Expand Down Expand Up @@ -306,7 +309,7 @@ def list_data_dirs(dirpath, blacklist=DEFAULT_BLACKLIST, sort=True, prune=True):
if os.path.isdir(child):
yield from list_data_dirs(child, sort=sort, prune=prune)

def find_fileset(dirpath, lid, whitelist=['data'], blacklist=['skip','beads']):
def find_fileset(dirpath, lid, whitelist=['data'], blacklist=['skip','beads'], require_roi_files=True):
"""
Find a fileset anywhere below the given directory path
given the bin's lid. This assumes that the file's path
Expand All @@ -318,10 +321,10 @@ def find_fileset(dirpath, lid, whitelist=['data'], blacklist=['skip','beads']):
for name in dirlist:
if name == lid + '.adc':
basepath = os.path.join(dirpath,lid)
return Fileset(basepath)
return Fileset(basepath, require_roi_files=require_roi_files)
elif name in whitelist or name in lid:
# is the name whitelisted or contains part of the lid?
fs = find_fileset(os.path.join(dirpath,name), lid, whitelist=whitelist, blacklist=blacklist)
fs = find_fileset(os.path.join(dirpath,name), lid, whitelist=whitelist, blacklist=blacklist, require_roi_files=require_roi_files)
if fs is not None:
return fs
# not found
Expand All @@ -333,23 +336,25 @@ class DataDirectory(object):

Provides a dict-like interface allowing access to FilesetBins by LID.
"""
def __init__(self, path='.', whitelist=DEFAULT_WHITELIST, blacklist=DEFAULT_BLACKLIST, filter=lambda x: True):
def __init__(self, path='.', whitelist=DEFAULT_WHITELIST, blacklist=DEFAULT_BLACKLIST, filter=lambda x: True, require_roi_files=True):
"""
:param path: the path of the data directory
:param whitelist: a list of directory names to allow
:param blacklist: a list of directory names to disallow
:param require_roi_files: bool, whether to require the .roi file
"""
self.path = path
self.whitelist = whitelist
self.blacklist = blacklist
self.filter = filter
self.require_roi_files=require_roi_files
def list_filesets(self):
"""
Yield all filesets.
"""
for dirpath, basename in list_filesets(self.path, whitelist=self.whitelist, blacklist=self.blacklist):
for dirpath, basename in list_filesets(self.path, whitelist=self.whitelist, blacklist=self.blacklist, require_roi_files=self.require_roi_files):
basepath = os.path.join(dirpath, basename)
fs = Fileset(basepath)
fs = Fileset(basepath, require_roi_files=self.require_roi_files)
if self.filter(fs):
yield fs
def find_fileset(self, lid):
Expand All @@ -360,7 +365,7 @@ def find_fileset(self, lid):
:type lid: str
:returns Fileset: the fileset, or None if not found
"""
fs = find_fileset(self.path, lid, whitelist=self.whitelist, blacklist=self.blacklist)
fs = find_fileset(self.path, lid, whitelist=self.whitelist, blacklist=self.blacklist, require_roi_files=self.require_roi_files)
if fs is None:
return None
elif self.filter(fs):
Expand Down
7 changes: 7 additions & 0 deletions ifcb/tests/data/test_data/partial/IFCB5_2012_028_081515.adc
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1,.3105469,.06072998046875,-.61004638671875,-.1776123046875,-3.6199951171875,-.29541015625,0,.2597656,202,673,96,45,1,-.27557373046875,
2,.4296875,-.25726318359375,-6.6448974609375,-8.79486083984375,-10,-.701904296875,.3105469,.3808594,252,290,238,137,4321,-1.22467041015625,
3,.7207031,.02685546875,-1.846923828125,.0457763671875,-1.15936279296875,-.61248779296875,.4296875,.6503906,210,432,180,86,36927,-.45074462890625,
3,.7207031,.02685546875,-1.846923828125,.0457763671875,-1.15936279296875,-.61248779296875,.4296875,.6503906,360,457,113,45,52407,-.45074462890625,
4,1.041016,.069580078125,-.0439453125,.07293701171875,.04913330078125,-.12969970703125,.7207031,1.001953,160,357,105,45,57492,-.0146484375,
5,1.232422,.05096435546875,-.76263427734375,-.113525390625,-4.2095947265625,-.3924560546875,1.041016,1.181641,218,632,197,45,62217,-.34698486328125,
6,1.300781,.05340576171875,-.4962158203125,-.1116943359375,-3.8262939453125,-.27801513671875,1.232422,1.251953,-11,-11,0,0,71082,-.411376953125,
6 changes: 6 additions & 0 deletions ifcb/tests/data/test_data/partial/IFCB5_2012_028_081515.hdr
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"Imaging FlowCytobot Acquisition Software version 1.0; October 2005"
"Heidi M. Sosik and Robert J. Olson"
"Woods Hole Oceanographic Institution"
"SyringeStatus = 0"
"Temp Humidity BinarizeThresh PMT1hv(ssc) PMT2hv(chl) BlobSizeThresh"
" 11.4799732421875"," 32.167437512207"," 30"," .675"," .6"," 10"
15 changes: 13 additions & 2 deletions ifcb/tests/data/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,17 @@ def test_list_filesets(self):
"""test with validation off and search"""
paths = list(files.list_filesets(self.data_dir, whitelist=WHITELIST, validate=False))
assert len(paths) == 5
partial_paths = list(files.list_filesets(self.data_dir, whitelist=WHITELIST, validate=False,require_roi_files=False))
assert len(partial_paths) == 6

class TestDataDirectory(unittest.TestCase):
def setUp(self):
self.data_dir = data_dir()
self.default = files.DataDirectory(self.data_dir)
self.whitelist = files.DataDirectory(self.data_dir, whitelist=WHITELIST)
self.blacklist = files.DataDirectory(self.data_dir, blacklist=['skip','invalid','empty'])
partial_data_dir = os.path.join(self.data_dir, 'partial')
self.partial = files.DataDirectory(partial_data_dir, require_roi_files=False)
def test_iteration(self):
fss = list(self.default)
assert len(fss) == 1 # only one whitelisted by default
Expand All @@ -46,6 +50,13 @@ def test_exists(self):
assert os.path.exists(fs.adc_path)
assert os.path.exists(fs.hdr_path)
assert os.path.exists(fs.roi_path)

Copy link
Collaborator Author

@shravani-whoi shravani-whoi Jan 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@joefutrelle I need help configuring this test: the partial test dataset directory needs to be included here. I added the flag to the exists() method, but should I also add it to the Fileset constructor?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

partial_fss = [b.fileset for b in self.partial]
for fs in partial_fss:
assert fs.exists()
assert os.path.exists(fs.adc_path)
assert os.path.exists(fs.hdr_path)
assert not os.path.exists(fs.roi_path)
def test_lids(self):
fss = [b.fileset for b in self.whitelist]
lids = [fs.lid for fs in fss]
Expand All @@ -60,8 +71,8 @@ def test_getsizes(self):
assert fs.getsizes() == sizes
assert fs.getsize() == sum(sizes.values())
def test_descendants(self):
assert len(list(self.default.list_descendants())) == 4
assert len(list(self.blacklist.list_descendants())) == 2
assert len(list(self.default.list_descendants())) == 5
assert len(list(self.blacklist.list_descendants())) == 3

class TestFilesetBin(unittest.TestCase):
def _bins(self):
Expand Down