21
21
22
22
aws_endpoint_url = "https://s3.amazonaws.com"
23
23
gcp_endpoint_url = "https://storage.googleapis.com"
24
+ asset_endpoint_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{ idc_index_data .__version__ } "
24
25
25
26
logging .basicConfig (format = "%(asctime)s - %(message)s" , level = logging .INFO )
26
27
logger = logging .getLogger (__name__ )
@@ -67,7 +68,24 @@ def __init__(self):
67
68
self .collection_summary = self .index .groupby ("collection_id" ).agg (
68
69
{"Modality" : pd .Series .unique , "series_size_MB" : "sum" }
69
70
)
70
- self .indices_overview = self .list_indices ()
71
+
72
+ self .indices_overview = pd .DataFrame (
73
+ {
74
+ "index" : {"description" : None , "installed" : True , "url" : None },
75
+ "sm_index" : {
76
+ "description" : None ,
77
+ "installed" : True ,
78
+ "url" : os .path .join (asset_endpoint_url , "sm_index.parquet" ),
79
+ },
80
+ "sm_instance_index" : {
81
+ "description" : None ,
82
+ "installed" : True ,
83
+ "url" : os .path .join (
84
+ asset_endpoint_url , "sm_instance_index.parquet"
85
+ ),
86
+ },
87
+ }
88
+ )
71
89
72
90
# Lookup s5cmd
73
91
self .s5cmdPath = shutil .which ("s5cmd" )
@@ -172,33 +190,6 @@ def get_idc_version():
172
190
idc_version = Version (idc_index_data .__version__ ).major
173
191
return f"v{ idc_version } "
174
192
175
- @staticmethod
176
- def _get_latest_idc_index_data_release_assets ():
177
- """
178
- Retrieves a list of the latest idc-index-data release assets.
179
-
180
- Returns:
181
- release_assets (list): List of tuples (asset_name, asset_url).
182
- """
183
- release_assets = []
184
- url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{ idc_index_data .__version__ } "
185
- try :
186
- response = requests .get (url , timeout = 30 )
187
- if response .status_code == 200 :
188
- release_data = response .json ()
189
- assets = release_data .get ("assets" , [])
190
- for asset in assets :
191
- release_assets .append (
192
- (asset ["name" ], asset ["browser_download_url" ])
193
- )
194
- else :
195
- logger .error (f"Failed to fetch releases: { response .status_code } " )
196
-
197
- except FileNotFoundError :
198
- logger .error (f"Failed to fetch releases: { response .status_code } " )
199
-
200
- return release_assets
201
-
202
193
def list_indices (self ):
203
194
"""
204
195
Lists all available indices including their installation status.
@@ -207,40 +198,6 @@ def list_indices(self):
207
198
indices_overview (pd.DataFrame): DataFrame containing information per index.
208
199
"""
209
200
210
- if "indices_overview" not in locals ():
211
- indices_overview = {}
212
- # Find installed indices
213
- for file in distribution ("idc-index-data" ).files :
214
- if str (file ).endswith ("index.parquet" ):
215
- index_name = os .path .splitext (
216
- str (file ).rsplit ("/" , maxsplit = 1 )[- 1 ]
217
- )[0 ]
218
-
219
- indices_overview [index_name ] = {
220
- "description" : None ,
221
- "installed" : True ,
222
- "local_path" : os .path .join (
223
- idc_index_data .IDC_INDEX_PARQUET_FILEPATH .parents [0 ],
224
- f"{ index_name } .parquet" ,
225
- ),
226
- }
227
-
228
- # Find available indices from idc-index-data
229
- release_assets = self ._get_latest_idc_index_data_release_assets ()
230
- for asset_name , asset_url in release_assets :
231
- if asset_name .endswith (".parquet" ):
232
- asset_name = os .path .splitext (asset_name )[0 ]
233
- if asset_name not in indices_overview :
234
- indices_overview [asset_name ] = {
235
- "description" : None ,
236
- "installed" : False ,
237
- "url" : asset_url ,
238
- }
239
-
240
- self .indices_overview = pd .DataFrame .from_dict (
241
- indices_overview , orient = "index"
242
- )
243
-
244
201
return self .indices_overview
245
202
246
203
def fetch_index (self , index ) -> None :
@@ -251,23 +208,22 @@ def fetch_index(self, index) -> None:
251
208
index (str): Name of the index to be downloaded.
252
209
"""
253
210
254
- if index not in self .indices_overview .index . tolist ():
211
+ if index not in self .indices_overview .keys ():
255
212
logger .error (f"Index { index } is not available and can not be fetched." )
256
- elif self .indices_overview . loc [index , "installed" ]:
213
+ elif self .indices_overview [index ][ "installed" ]:
257
214
logger .warning (
258
215
f"Index { index } already installed and will not be fetched again."
259
216
)
260
217
else :
261
- response = requests .get (self .indices_overview . loc [index , "url" ], timeout = 30 )
218
+ response = requests .get (self .indices_overview [index ][ "url" ], timeout = 30 )
262
219
if response .status_code == 200 :
263
220
filepath = os .path .join (
264
221
idc_index_data .IDC_INDEX_PARQUET_FILEPATH .parents [0 ],
265
222
f"{ index } .parquet" ,
266
223
)
267
224
with open (filepath , mode = "wb" ) as file :
268
225
file .write (response .content )
269
- self .indices_overview .loc [index , "installed" ] = True
270
- self .indices_overview .loc [index , "local_path" ] = filepath
226
+ self .indices_overview [index ]["installed" ] = True
271
227
else :
272
228
logger .error (f"Failed to fetch index: { response .status_code } " )
273
229
@@ -668,8 +624,8 @@ def _validate_update_manifest_and_get_download_size(
668
624
# create a copy of the index
669
625
index_df_copy = self .index
670
626
671
- # Extract s3 url and crdc_instance_uuid from the manifest copy commands
672
- # Next, extract crdc_instance_uuid from aws_series_url in the index and
627
+ # Extract s3 url and crdc_series_uuid from the manifest copy commands
628
+ # Next, extract crdc_series_uuid from aws_series_url in the index and
673
629
# try to verify if every series in the manifest is present in the index
674
630
675
631
# TODO: need to remove the assumption that manifest commands will have 'cp'
@@ -697,8 +653,9 @@ def _validate_update_manifest_and_get_download_size(
697
653
seriesInstanceuid,
698
654
s3_url,
699
655
series_size_MB,
700
- index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match,
656
+ index_crdc_series_uuid is not NULL as crdc_series_uuid_match,
701
657
s3_url==series_aws_url AS s3_url_match,
658
+ manifest_temp.manifest_cp_cmd,
702
659
CASE
703
660
WHEN s3_url==series_aws_url THEN 'aws'
704
661
ELSE
@@ -717,19 +674,23 @@ def _validate_update_manifest_and_get_download_size(
717
674
718
675
endpoint_to_use = None
719
676
720
- if validate_manifest :
721
- # Check if crdc_instance_uuid is found in the index
722
- if not all (merged_df ["crdc_series_uuid_match" ]):
723
- missing_manifest_cp_cmds = merged_df .loc [
724
- ~ merged_df ["crdc_series_uuid_match" ], "manifest_cp_cmd"
725
- ]
726
- missing_manifest_cp_cmds_str = f"The following manifest copy commands do not have any associated series in the index: { missing_manifest_cp_cmds .tolist ()} "
727
- raise ValueError (missing_manifest_cp_cmds_str )
677
+ # Check if any crdc_series_uuid are not found in the index
678
+ if not all (merged_df ["crdc_series_uuid_match" ]):
679
+ missing_manifest_cp_cmds = merged_df .loc [
680
+ ~ merged_df ["crdc_series_uuid_match" ], "manifest_cp_cmd"
681
+ ]
682
+ logger .error (
683
+ "The following manifest copy commands are not recognized as referencing any associated series in the index.\n "
684
+ "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n "
685
+ f"different from { self .get_idc_version ()} used in this version of idc-index. The corresponding files will not be downloaded.\n "
686
+ )
687
+ logger .error ("\n " + "\n " .join (missing_manifest_cp_cmds .tolist ()))
728
688
729
- # Check if there are more than one endpoints
689
+ if validate_manifest :
690
+ # Check if there is more than one endpoint
730
691
if len (merged_df ["endpoint" ].unique ()) > 1 :
731
692
raise ValueError (
732
- "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. If so, please use urls from one provider only "
693
+ "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. "
733
694
)
734
695
735
696
if (
0 commit comments