Skip to content

Commit 80c67b3

Browse files
committed
ENH: support selection of gcs for manifest creation
Re #102
1 parent dfcc6cc commit 80c67b3

File tree

1 file changed

+21
-22
lines changed

1 file changed

+21
-22
lines changed

idc_index/index.py

+21-22
Original file line numberDiff line numberDiff line change
@@ -788,7 +788,8 @@ def _validate_update_manifest_and_get_download_size(
788788
index_df_copy = self.index[
789789
[
790790
"SeriesInstanceUID",
791-
"series_aws_url",
791+
"aws_bucket",
792+
"crdc_series_uuid",
792793
"series_size_MB",
793794
"PatientID",
794795
"collection_id",
@@ -800,6 +801,7 @@ def _validate_update_manifest_and_get_download_size(
800801
[
801802
"SeriesInstanceUID",
802803
"series_aws_url",
804+
"crdc_series_uuid",
803805
"series_size_MB",
804806
"PatientID",
805807
"collection_id",
@@ -817,22 +819,20 @@ def _validate_update_manifest_and_get_download_size(
817819
hierarchy = f"CONCAT('{downloadDir}')"
818820

819821
# Extract s3 url and crdc_series_uuid from the manifest copy commands
820-
# Next, extract crdc_series_uuid from aws_series_url in the index and
822+
# Next, construct series_aws_url in the index and
821823
# try to verify if every series in the manifest is present in the index
822824

823-
# TODO: need to remove the assumption that manifest commands will have 'cp'
824-
# and need to parse S3 URL directly
825825
# ruff: noqa
826826
sql = f"""
827827
PRAGMA disable_progress_bar;
828828
WITH
829829
index_temp AS (
830830
SELECT
831831
seriesInstanceUID,
832-
series_aws_url,
832+
CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url,
833833
series_size_MB,
834834
{hierarchy} AS path,
835-
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid
835+
crdc_series_uuid AS index_crdc_series_uuid
836836
FROM
837837
index_df_copy),
838838
manifest_temp AS (
@@ -885,15 +885,17 @@ def _validate_update_manifest_and_get_download_size(
885885
logger.debug(
886886
"Checking if the requested data is available in other idc versions "
887887
)
888+
888889
missing_series_sql = f"""
889890
PRAGMA disable_progress_bar;
890891
WITH
891-
combined_index AS
892+
index_temp AS
892893
(SELECT
893-
seriesInstanceUID,
894-
series_aws_url,
894+
seriesInstanceUID,
895+
CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url,
895896
series_size_MB,
896897
{hierarchy} AS path,
898+
crdc_series_uuid AS index_crdc_series_uuid
897899
FROM
898900
index_df_copy
899901
union by name
@@ -902,19 +904,11 @@ def _validate_update_manifest_and_get_download_size(
902904
series_aws_url,
903905
series_size_MB,
904906
{hierarchy} AS path,
907+
crdc_series_uuid AS index_crdc_series_uuid
905908
FROM
906909
prior_versions_index_df_copy pvip
907910
908911
),
909-
index_temp AS (
910-
SELECT
911-
seriesInstanceUID,
912-
series_aws_url,
913-
series_size_MB,
914-
path,
915-
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid
916-
FROM
917-
combined_index),
918912
manifest_temp AS (
919913
SELECT
920914
manifest_cp_cmd,
@@ -947,6 +941,7 @@ def _validate_update_manifest_and_get_download_size(
947941
index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
948942
"""
949943
merged_df = duckdb.sql(missing_series_sql).df()
944+
print(merged_df)
950945
if not all(merged_df["crdc_series_uuid_match"]):
951946
missing_manifest_cp_cmds = merged_df.loc[
952947
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
@@ -1034,6 +1029,8 @@ def _validate_update_manifest_and_get_download_size(
10341029
"cp " + merged_df["s3_url"] + " " + '"' + downloadDir + '"'
10351030
)
10361031

1032+
print(merged_df["s5cmd_cmd"])
1033+
10371034
# Combine all commands into a single string with newline separators
10381035
commands = "\n".join(merged_df["s5cmd_cmd"])
10391036

@@ -1702,6 +1699,7 @@ def download_from_selection(
17021699
"StudyInstanceUID",
17031700
"SeriesInstanceUID",
17041701
"crdc_series_uuid",
1702+
"aws_bucket",
17051703
"series_aws_url",
17061704
"series_size_MB",
17071705
]
@@ -1714,6 +1712,7 @@ def download_from_selection(
17141712
"StudyInstanceUID",
17151713
"SeriesInstanceUID",
17161714
"crdc_series_uuid",
1715+
"aws_bucket",
17171716
"series_aws_url",
17181717
"series_size_MB",
17191718
]
@@ -1785,8 +1784,8 @@ def download_from_selection(
17851784
)
17861785
SELECT
17871786
series_aws_url,
1788-
CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_aws_url,
1789-
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid,
1787+
CONCAT('s3://', aws_bucket, '/', crdc_series_uuid,'/', crdc_instance_uuid, ".dcm") as instance_aws_url,
1788+
crdc_series_uuid index_crdc_series_uuid,
17901789
{hierarchy} as path
17911790
FROM
17921791
temp
@@ -1805,8 +1804,8 @@ def download_from_selection(
18051804
result_df
18061805
)
18071806
SELECT
1808-
series_aws_url,
1809-
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid,
1807+
CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url,
1808+
crdc_series_uuid AS index_crdc_series_uuid,
18101809
series_size_MB,
18111810
{hierarchy} as path
18121811
FROM

0 commit comments

Comments
 (0)