Merge pull request #162 from fedorov/161-instance-url

fedorov · web-flow · commit b12d5939155f · 2025-05-16T18:08:08.000-04:00
Add get_instance_file_URL
diff --git a/docs/column_descriptions.md b/docs/column_descriptions.md
@@ -105,8 +105,8 @@ includes DICOM instances of the slide microscopy modality.
     obtained
   - `staining_usingSubstance`: describes staining steps the specimen underwent
     before the image was obtained
-  - `max_TotalPixelMatrixColumns`: width of the image at the maximum resolution
-  - `max_TotalMatrixRows`: height of the image at the maximum resolution
+  - `TotalPixelMatrixColumns`: width of the image
+  - `TotalPixelMatrixRows`: height of the image
   - `PixelSpacing_0`: pixel spacing in mm
   - `ImageType`: specifies further characteristics of the image in a list,
     including as the third value whether it is a VOLUME, LABEL, OVERVIEW or
diff --git a/idc_index/index.py b/idc_index/index.py
@@ -133,6 +133,11 @@ def __init__(self):
             },
         }
 
+        # these will point to the dataframes containing the respective indices, once installed
+        self.sm_index = None
+        self.sm_instance_index = None
+        self.clinical_index = None
+
         # Lookup s5cmd
         self.s5cmdPath = shutil.which("s5cmd")
         if self.s5cmdPath is None:
@@ -355,7 +360,9 @@ def fetch_index(self, index_name) -> None:
                 #    self.index[["series_aws_url", "SeriesInstanceUID"]],
                 #    on="SeriesInstanceUID", how="left"
                 # )
-                setattr(self.__class__, index_name, index_table)
+                # TODO: consider switching back to a class-level attribute so a fetched index is shared by all instances
+                # setattr(self.__class__, index_name, index_table)
+                setattr(self, index_name, index_table)
                 self.indices_overview[index_name]["installed"] = True
                 self.indices_overview[index_name]["file_path"] = filepath
 
@@ -676,6 +683,54 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
 
         return file_names
 
+    def get_instance_file_URL(self, sopInstanceUID, source_bucket_location="aws"):
+        """
+        Get the bucket URL of the file corresponding to a given SOPInstanceUID.
+
+        This function will only return the URL for the Slide Microscopy (SM) instances,
+        which are maintained in the `sm_instance_index` table.
+
+        Args:
+            sopInstanceUID: string containing the value of DICOM SOPInstanceUID
+            source_bucket_location: string containing the source bucket location, either "aws" or "gcp"
+
+        Returns:
+            string containing the bucket URL of the file corresponding to the SOPInstanceUID,
+            or None if sm_instance_index is unavailable (ValueError is raised for an unknown SOPInstanceUID)
+        """
+
+        # sm_instance_index is required to complete this operation - install it!
+        self.fetch_index("sm_instance_index")
+
+        if self.sm_instance_index is None:
+            logger.error(
+                "sm_instance_index could not be installed. Please install it first using fetch_index."
+            )
+            return None
+
+        if sopInstanceUID not in self.sm_instance_index["SOPInstanceUID"].values:  # pylint: disable=unsubscriptable-object
+            raise ValueError("SOPInstanceUID not found in IDC sm_instance_index.")
+
+        # merge with the main index to get series_aws_url
+        selected_instance_df = self.sm_instance_index[  # pylint: disable=unsubscriptable-object
+            self.sm_instance_index["SOPInstanceUID"] == sopInstanceUID  # pylint: disable=unsubscriptable-object
+        ].copy()[["SeriesInstanceUID", "SOPInstanceUID", "crdc_instance_uuid"]]
+        selected_instance_df = pd.merge(
+            selected_instance_df,
+            self.index,
+            on="SeriesInstanceUID",
+            how="left",
+        )
+
+        if source_bucket_location == "gcp":
+            # replace AWS with the GCP bucket
+            self._replace_aws_with_gcp_buckets(selected_instance_df, "series_aws_url")
+
+        # instance files are named <crdc_instance_uuid>.dcm; [:-1] drops the last character of series_aws_url (presumably a trailing "*" wildcard - confirm)
+        series_url = selected_instance_df.iloc[0]["series_aws_url"][:-1]
+        instance_uuid = selected_instance_df.iloc[0]["crdc_instance_uuid"]
+        return series_url + instance_uuid + ".dcm"
+
     def get_viewer_URL(
         self, seriesInstanceUID=None, studyInstanceUID=None, viewer_selector=None
     ):
@@ -1721,8 +1776,8 @@ def download_from_selection(
         # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index
         sm_instance_index = None
         if sopInstanceUID:
-            if hasattr(
-                self, "sm_instance_index"
+            if (
+                self.sm_instance_index is not None
             ):  # check if instance-level index is installed
                 download_df = self.sm_instance_index
                 sm_instance_index = self.sm_instance_index
@@ -2138,12 +2193,12 @@ def sql_query(self, sql_query):
         logger.debug("Executing SQL query: " + sql_query)
         # TODO: find a more elegant way to automate the following:  https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ
         index = self.index
-        if hasattr(self, "sm_index"):
+        if self.sm_index is not None:
             sm_index = self.sm_index
-        if hasattr(self, "sm_instance_index"):
+        if self.sm_instance_index is not None:
             sm_instance_index = self.sm_instance_index
-        if hasattr(self, "clinical_index"):
+        if self.clinical_index is not None:
             clinical_index = self.clinical_index
-        if hasattr(self, "prior_versions_index"):
+        if self.prior_versions_index is not None:
             prior_versions_index = self.prior_versions_index
         return duckdb.query(sql_query).to_df()
diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -190,8 +190,7 @@ def test_download_dicom_series(self):
             self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3)
 
     def test_download_dicom_instance(self):
-        i = IDCClient()
-        i.fetch_index("sm_instance_index")
+        self.client.fetch_index("sm_instance_index")
         with tempfile.TemporaryDirectory() as temp_dir:
             self.client.download_dicom_instance(
                 sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0",
@@ -210,8 +209,7 @@ def test_download_dicom_series_gcs(self):
             self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3)
 
     def test_download_dicom_instance_gcs(self):
-        i = IDCClient()
-        i.fetch_index("sm_instance_index")
+        self.client.fetch_index("sm_instance_index")
         with tempfile.TemporaryDirectory() as temp_dir:
             self.client.download_dicom_instance(
                 sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0",
@@ -597,6 +595,14 @@ def test_series_files_URLs(self):
         assert len(files_aws) > 0
         assert len(files_gcp) == len(files_aws)
 
+    def test_instance_file_URLs(self):
+        c = IDCClient()
+        sopInstanceUID = "1.3.6.1.4.1.5962.99.1.1900325859.924065538.1719887277027.10.0"
+        file_url = "s3://idc-open-data/763fe058-7d25-4ba7-9b29-fd3d6c41dc4b/210f0529-c767-4795-9acf-bad2f4877427.dcm"
+        files_aws = c.get_instance_file_URL(sopInstanceUID, "aws")
+        files_gcp = c.get_instance_file_URL(sopInstanceUID, "gcp")
+        assert files_aws == files_gcp == file_url
+
 
 if __name__ == "__main__":
     unittest.main()