Skip to content

Commit c52a2a1

Browse files
DanielaSchacherer authored and fedorov committed
ENH added functionality and tests to list available indices and fetch sm-specific ones.
1 parent adde133 commit c52a2a1

File tree

2 files changed

+111
-7
lines changed

2 files changed

+111
-7
lines changed

idc_index/index.py

+101-7
Original file line numberDiff line numberDiff line change
@@ -58,20 +58,19 @@ def client(cls) -> IDCClient:
5858
return cls._client
5959

6060
def __init__(self):
61+
# Read main index file
6162
file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH
62-
63-
# Read index file
6463
logger.debug(f"Reading index file v{idc_index_data.__version__}")
6564
self.index = pd.read_parquet(file_path)
6665
# self.index = self.index.astype(str).replace("nan", "")
6766
self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
6867
self.collection_summary = self.index.groupby("collection_id").agg(
6968
{"Modality": pd.Series.unique, "series_size_MB": "sum"}
7069
)
70+
self.indices_overview = self.list_indices()
7171

7272
# Lookup s5cmd
7373
self.s5cmdPath = shutil.which("s5cmd")
74-
7574
if self.s5cmdPath is None:
7675
# Workaround to support environment without a properly setup PATH
7776
# See https://github.com/Slicer/Slicer/pull/7587
@@ -80,16 +79,12 @@ def __init__(self):
8079
if str(script).startswith("s5cmd/bin/s5cmd"):
8180
self.s5cmdPath = script.locate().resolve(strict=True)
8281
break
83-
8482
if self.s5cmdPath is None:
8583
raise FileNotFoundError(
8684
"s5cmd executable not found. Please install s5cmd from https://github.com/peak/s5cmd#installation"
8785
)
88-
8986
self.s5cmdPath = str(self.s5cmdPath)
90-
9187
logger.debug(f"Found s5cmd executable: {self.s5cmdPath}")
92-
9388
# ... and check it can be executed
9489
subprocess.check_call([self.s5cmdPath, "--help"], stdout=subprocess.DEVNULL)
9590

@@ -177,6 +172,105 @@ def get_idc_version():
177172
idc_version = Version(idc_index_data.__version__).major
178173
return f"v{idc_version}"
179174

175+
@staticmethod
def _get_latest_idc_index_data_release_assets():
    """
    Retrieves a list of the idc-index-data release assets.

    The GitHub releases API is queried for the release whose tag equals
    ``idc_index_data.__version__``. Any failure (request error, non-200
    status code, response body that is not valid JSON) is logged and an
    empty list is returned instead of raising.

    Returns:
        release_assets (list): List of tuples (asset_name, asset_url).
    """
    url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}"
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            logger.error(f"Failed to fetch releases: {response.status_code}")
            return []
        release_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # When the request itself fails there is no response object to
        # inspect, so the exception is reported rather than a status code.
        # ValueError covers a body that cannot be decoded as JSON.
        logger.error(f"Failed to fetch releases: {exc}")
        return []

    return [
        (asset["name"], asset["browser_download_url"])
        for asset in release_data.get("assets", [])
    ]
201+
202+
def list_indices(self):
    """
    Lists all available indices including their installation status.

    Installed indices are discovered from the files of the
    ``idc-index-data`` distribution whose names end in ``index.parquet``.
    Further indices are taken from the ``.parquet`` assets returned by
    ``_get_latest_idc_index_data_release_assets``; those not already
    installed are recorded with their download URL. The result is also
    stored on ``self.indices_overview``.

    Returns:
        indices_overview (pd.DataFrame): DataFrame containing information per index.
    """
    indices_overview = {}

    # Find installed indices
    # NOTE(review): assumes `distribution` is importlib.metadata.distribution,
    # whose `files` is None when the package ships no file listing - confirm.
    for file in distribution("idc-index-data").files or []:
        if not str(file).endswith("index.parquet"):
            continue
        index_name = os.path.splitext(str(file).rsplit("/", maxsplit=1)[-1])[0]
        indices_overview[index_name] = {
            "description": None,
            "installed": True,
            "local_path": os.path.join(
                idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0],
                f"{index_name}.parquet",
            ),
        }

    # Find available indices from idc-index-data
    release_assets = self._get_latest_idc_index_data_release_assets()
    for asset_name, asset_url in release_assets:
        if not asset_name.endswith(".parquet"):
            continue
        index_name = os.path.splitext(asset_name)[0]
        # Installed entries take precedence over release assets.
        if index_name not in indices_overview:
            indices_overview[index_name] = {
                "description": None,
                "installed": False,
                "url": asset_url,
            }

    self.indices_overview = pd.DataFrame.from_dict(indices_overview, orient="index")

    return self.indices_overview
245+
246+
def fetch_index(self, index) -> None:
    """
    Downloads requested index.

    Nothing is downloaded when the index is unknown to
    ``self.indices_overview`` or is already marked as installed; both cases
    are only logged. On a successful download the file is written next to
    the main index parquet file and the overview row is updated.

    Args:
        index (str): Name of the index to be downloaded.
    """
    known_indices = self.indices_overview.index.tolist()
    if index not in known_indices:
        logger.error(f"Index {index} is not available and can not be fetched.")
        return

    if self.indices_overview.loc[index, "installed"]:
        logger.warning(
            f"Index {index} already installed and will not be fetched again."
        )
        return

    download_url = self.indices_overview.loc[index, "url"]
    reply = requests.get(download_url, timeout=30)
    if reply.status_code != 200:
        logger.error(f"Failed to fetch index: {reply.status_code}")
        return

    # Store the downloaded file in the same directory as the main index.
    target_dir = idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0]
    target_path = os.path.join(target_dir, f"{index}.parquet")
    with open(target_path, mode="wb") as out:
        out.write(reply.content)

    # Record the new installation state in the overview.
    self.indices_overview.loc[index, "installed"] = True
    self.indices_overview.loc[index, "local_path"] = target_path
273+
180274
def get_collections(self):
181275
"""
182276
Returns the collections present in IDC

tests/idcindex.py

+10
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,16 @@ def test_cli_download(self):
461461
)
462462
assert len(os.listdir(Path.cwd())) != 0
463463

464+
def test_list_indices(self):
    """A freshly constructed client exposes a non-empty indices overview."""
    client = IDCClient()
    overview = client.indices_overview
    assert not overview.empty  # the overview DataFrame was created and has rows
467+
468+
def test_fetch_index(self):
    """Fetching ``sm_index`` switches its ``installed`` flag in the overview."""
    i = IDCClient()
    # Use .loc for the (row, column) lookup: ``df["sm_index", "installed"]``
    # is a column lookup for a tuple label and raises KeyError.
    # Truthiness is asserted instead of ``is False`` / ``is True`` because the
    # value read back may be a NumPy boolean, which is never identical to the
    # Python singletons.
    assert not i.indices_overview.loc["sm_index", "installed"]
    i.fetch_index("sm_index")
    assert i.indices_overview.loc["sm_index", "installed"]
473+
464474

465475
if __name__ == "__main__":
466476
unittest.main()

0 commit comments

Comments
 (0)