Skip to content

Commit b12d593

Browse files
authored
Merge pull request #162 from fedorov/161-instance-url
Add get_instance_file_URL
2 parents b478e58 + 3300978 commit b12d593

File tree

3 files changed

+74
-13
lines changed

3 files changed

+74
-13
lines changed

docs/column_descriptions.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,8 @@ includes DICOM instances of the slide microscopy modality.
105105
obtained
106106
- `staining_usingSubstance`: describes staining steps the specimen underwent
107107
before the image was obtained
108-
- `max_TotalPixelMatrixColumns`: width of the image at the maximum resolution
109-
- `max_TotalMatrixRows`: height of the image at the maximum resolution
108+
- `TotalPixelMatrixColumns`: width of the image
109+
- `TotalMatrixRows`: height of the image
110110
- `PixelSpacing_0`: pixel spacing in mm
111111
- `ImageType`: specifies further characteristics of the image in a list,
112112
including as the third value whether it is a VOLUME, LABEL, OVERVIEW or

idc_index/index.py

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ def __init__(self):
133133
},
134134
}
135135

136+
# these will point to the dataframes containing the respective indices, once installed
137+
self.sm_index = None
138+
self.sm_instance_index = None
139+
self.clinical_index = None
140+
136141
# Lookup s5cmd
137142
self.s5cmdPath = shutil.which("s5cmd")
138143
if self.s5cmdPath is None:
@@ -355,7 +360,9 @@ def fetch_index(self, index_name) -> None:
355360
# self.index[["series_aws_url", "SeriesInstanceUID"]],
356361
# on="SeriesInstanceUID", how="left"
357362
# )
358-
setattr(self.__class__, index_name, index_table)
363+
# TODO: consider switching to class variable!
364+
# setattr(self.__class__, index_name, index_table)
365+
setattr(self, index_name, index_table)
359366
self.indices_overview[index_name]["installed"] = True
360367
self.indices_overview[index_name]["file_path"] = filepath
361368

@@ -676,6 +683,54 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
676683

677684
return file_names
678685

686+
def get_instance_file_URL(self, sopInstanceUID, source_bucket_location="aws"):
    """
    Get the bucket URL of the file corresponding to a given SOPInstanceUID.

    This function will only return the URL for the Slide Microscopy (SM) instances,
    which are maintained in the `sm_instance_index` table.

    Args:
        sopInstanceUID: string containing the value of DICOM SOPInstanceUID
        source_bucket_location: string containing the source bucket location, either "aws" or "gcp"

    Returns:
        string containing the bucket URL of the file corresponding to the SOPInstanceUID,
        or None if `sm_instance_index` is still unavailable after attempting to fetch it

    Raises:
        ValueError: if the SOPInstanceUID is not present in `sm_instance_index`
    """

    # sm_instance_index is required to complete this operation - install it!
    self.fetch_index("sm_instance_index")

    sm_instance_index = self.sm_instance_index
    if sm_instance_index is None:
        logger.error(
            "sm_instance_index could not be installed. Please install it first using fetch_index."
        )
        return None

    # Filter the instance-level index once; an empty selection means the
    # SOPInstanceUID is unknown, so no separate membership scan is needed.
    selected_instance_df = sm_instance_index.loc[
        sm_instance_index["SOPInstanceUID"] == sopInstanceUID,
        ["SeriesInstanceUID", "SOPInstanceUID", "crdc_instance_uuid"],
    ]
    if selected_instance_df.empty:
        raise ValueError("SOPInstanceUID not found in IDC sm_instance_index.")

    # merge with the main index to get series_aws_url
    selected_instance_df = pd.merge(
        selected_instance_df,
        self.index,
        on="SeriesInstanceUID",
        how="left",
    )

    if source_bucket_location == "gcp":
        # replace AWS with the GCP bucket; the return value is not used, so the
        # helper is expected to rewrite the column of the passed frame in place
        self._replace_aws_with_gcp_buckets(selected_instance_df, "series_aws_url")

    # instance files are named using crdc_instance_uuid
    first_match = selected_instance_df.iloc[0]
    # drop the last character of the series URL before appending the file name
    # (presumably a trailing "*" wildcard - confirm against the main index)
    series_url = first_match["series_aws_url"][:-1]
    instance_uuid = first_match["crdc_instance_uuid"]
    return series_url + instance_uuid + ".dcm"
733+
679734
def get_viewer_URL(
680735
self, seriesInstanceUID=None, studyInstanceUID=None, viewer_selector=None
681736
):
@@ -1721,8 +1776,8 @@ def download_from_selection(
17211776
# If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index
17221777
sm_instance_index = None
17231778
if sopInstanceUID:
1724-
if hasattr(
1725-
self, "sm_instance_index"
1779+
if (
1780+
self.sm_instance_index is not None
17261781
): # check if instance-level index is installed
17271782
download_df = self.sm_instance_index
17281783
sm_instance_index = self.sm_instance_index
@@ -2138,12 +2193,12 @@ def sql_query(self, sql_query):
21382193
logger.debug("Executing SQL query: " + sql_query)
21392194
# TODO: find a more elegant way to automate the following: https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ
21402195
index = self.index
2141-
if hasattr(self, "sm_index"):
2196+
if self.sm_index is not None:
21422197
sm_index = self.sm_index
2143-
if hasattr(self, "sm_instance_index"):
2198+
if self.sm_instance_index is not None:
21442199
sm_instance_index = self.sm_instance_index
2145-
if hasattr(self, "clinical_index"):
2200+
if self.clinical_index is not None:
21462201
clinical_index = self.clinical_index
2147-
if hasattr(self, "prior_versions_index"):
2202+
if self.prior_versions_index is not None:
21482203
prior_versions_index = self.prior_versions_index
21492204
return duckdb.query(sql_query).to_df()

tests/idcindex.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,7 @@ def test_download_dicom_series(self):
190190
self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3)
191191

192192
def test_download_dicom_instance(self):
193-
i = IDCClient()
194-
i.fetch_index("sm_instance_index")
193+
self.client.fetch_index("sm_instance_index")
195194
with tempfile.TemporaryDirectory() as temp_dir:
196195
self.client.download_dicom_instance(
197196
sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0",
@@ -210,8 +209,7 @@ def test_download_dicom_series_gcs(self):
210209
self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3)
211210

212211
def test_download_dicom_instance_gcs(self):
213-
i = IDCClient()
214-
i.fetch_index("sm_instance_index")
212+
self.client.fetch_index("sm_instance_index")
215213
with tempfile.TemporaryDirectory() as temp_dir:
216214
self.client.download_dicom_instance(
217215
sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0",
@@ -597,6 +595,14 @@ def test_series_files_URLs(self):
597595
assert len(files_aws) > 0
598596
assert len(files_gcp) == len(files_aws)
599597

598+
def test_instance_file_URLs(self):
    """Check that an SM instance resolves to the expected file URL for both bucket locations."""
    client = IDCClient()
    sop_instance_uid = "1.3.6.1.4.1.5962.99.1.1900325859.924065538.1719887277027.10.0"
    expected_url = "s3://idc-open-data/763fe058-7d25-4ba7-9b29-fd3d6c41dc4b/210f0529-c767-4795-9acf-bad2f4877427.dcm"

    # Each call returns a single URL string, requested once per bucket location.
    aws_url = client.get_instance_file_URL(sop_instance_uid, "aws")
    gcp_url = client.get_instance_file_URL(sop_instance_uid, "gcp")

    # Both locations are expected to yield the same URL for this instance.
    assert aws_url == gcp_url
    assert gcp_url == expected_url
605+
600606

601607
if __name__ == "__main__":
602608
unittest.main()

0 commit comments

Comments
 (0)