@@ -788,7 +788,8 @@ def _validate_update_manifest_and_get_download_size(
788
788
index_df_copy = self .index [
789
789
[
790
790
"SeriesInstanceUID" ,
791
- "series_aws_url" ,
791
+ "aws_bucket" ,
792
+ "crdc_series_uuid" ,
792
793
"series_size_MB" ,
793
794
"PatientID" ,
794
795
"collection_id" ,
@@ -800,6 +801,7 @@ def _validate_update_manifest_and_get_download_size(
800
801
[
801
802
"SeriesInstanceUID" ,
802
803
"series_aws_url" ,
804
+ "crdc_series_uuid" ,
803
805
"series_size_MB" ,
804
806
"PatientID" ,
805
807
"collection_id" ,
@@ -817,22 +819,20 @@ def _validate_update_manifest_and_get_download_size(
817
819
hierarchy = f"CONCAT('{ downloadDir } ')"
818
820
819
821
# Extract s3 url and crdc_series_uuid from the manifest copy commands
820
- # Next, extract crdc_series_uuid from aws_series_url in the index and
822
+ # Next, construct series_aws_url in the index and
821
823
# try to verify if every series in the manifest is present in the index
822
824
823
- # TODO: need to remove the assumption that manifest commands will have 'cp'
824
- # and need to parse S3 URL directly
825
825
# ruff: noqa
826
826
sql = f"""
827
827
PRAGMA disable_progress_bar;
828
828
WITH
829
829
index_temp AS (
830
830
SELECT
831
831
seriesInstanceUID,
832
- series_aws_url,
832
+ CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url,
833
833
series_size_MB,
834
834
{ hierarchy } AS path,
835
- REGEXP_EXTRACT(series_aws_url, '(?:.*? \\ /){{3}}([^ \\ /?#]+)', 1) index_crdc_series_uuid
835
+ crdc_series_uuid AS index_crdc_series_uuid
836
836
FROM
837
837
index_df_copy),
838
838
manifest_temp AS (
@@ -885,15 +885,17 @@ def _validate_update_manifest_and_get_download_size(
885
885
logger .debug (
886
886
"Checking if the requested data is available in other idc versions "
887
887
)
888
+
888
889
missing_series_sql = f"""
889
890
PRAGMA disable_progress_bar;
890
891
WITH
891
- combined_index AS
892
+ index_temp AS
892
893
(SELECT
893
- seriesInstanceUID,
894
- series_aws_url,
894
+ seriesInstanceUID,
895
+ CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url,
895
896
series_size_MB,
896
897
{ hierarchy } AS path,
898
+ crdc_series_uuid AS index_crdc_series_uuid
897
899
FROM
898
900
index_df_copy
899
901
union by name
@@ -902,19 +904,11 @@ def _validate_update_manifest_and_get_download_size(
902
904
series_aws_url,
903
905
series_size_MB,
904
906
{ hierarchy } AS path,
907
+ crdc_series_uuid AS index_crdc_series_uuid
905
908
FROM
906
909
prior_versions_index_df_copy pvip
907
910
908
911
),
909
- index_temp AS (
910
- SELECT
911
- seriesInstanceUID,
912
- series_aws_url,
913
- series_size_MB,
914
- path,
915
- REGEXP_EXTRACT(series_aws_url, '(?:.*?\\ /){{3}}([^\\ /?#]+)', 1) index_crdc_series_uuid
916
- FROM
917
- combined_index),
918
912
manifest_temp AS (
919
913
SELECT
920
914
manifest_cp_cmd,
@@ -947,6 +941,7 @@ def _validate_update_manifest_and_get_download_size(
947
941
index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
948
942
"""
949
943
merged_df = duckdb .sql (missing_series_sql ).df ()
944
+ print (merged_df )
950
945
if not all (merged_df ["crdc_series_uuid_match" ]):
951
946
missing_manifest_cp_cmds = merged_df .loc [
952
947
~ merged_df ["crdc_series_uuid_match" ], "manifest_cp_cmd"
@@ -1034,6 +1029,8 @@ def _validate_update_manifest_and_get_download_size(
1034
1029
"cp " + merged_df ["s3_url" ] + " " + '"' + downloadDir + '"'
1035
1030
)
1036
1031
1032
+ print (merged_df ["s5cmd_cmd" ])
1033
+
1037
1034
# Combine all commands into a single string with newline separators
1038
1035
commands = "\n " .join (merged_df ["s5cmd_cmd" ])
1039
1036
@@ -1702,6 +1699,7 @@ def download_from_selection(
1702
1699
"StudyInstanceUID" ,
1703
1700
"SeriesInstanceUID" ,
1704
1701
"crdc_series_uuid" ,
1702
+ "aws_bucket" ,
1705
1703
"series_aws_url" ,
1706
1704
"series_size_MB" ,
1707
1705
]
@@ -1714,6 +1712,7 @@ def download_from_selection(
1714
1712
"StudyInstanceUID" ,
1715
1713
"SeriesInstanceUID" ,
1716
1714
"crdc_series_uuid" ,
1715
+ "aws_bucket" ,
1717
1716
"series_aws_url" ,
1718
1717
"series_size_MB" ,
1719
1718
]
@@ -1785,8 +1784,8 @@ def download_from_selection(
1785
1784
)
1786
1785
SELECT
1787
1786
series_aws_url,
1788
- CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, ' .dcm' ) as instance_aws_url,
1789
- REGEXP_EXTRACT(series_aws_url, '(?:.*? \\ /){{3}}([^ \\ /?#]+)', 1) index_crdc_series_uuid,
1787
+ CONCAT('s3://', aws_bucket, '/', crdc_series_uuid,'/', crdc_instance_uuid, " .dcm" ) as instance_aws_url,
1788
+ crdc_series_uuid index_crdc_series_uuid,
1790
1789
{ hierarchy } as path
1791
1790
FROM
1792
1791
temp
@@ -1805,8 +1804,8 @@ def download_from_selection(
1805
1804
result_df
1806
1805
)
1807
1806
SELECT
1808
- series_aws_url,
1809
- REGEXP_EXTRACT(series_aws_url, '(?:.*? \\ /){{3}}([^ \\ /?#]+)', 1) index_crdc_series_uuid,
1807
+ CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url,
1808
+ crdc_series_uuid AS index_crdc_series_uuid,
1810
1809
series_size_MB,
1811
1810
{ hierarchy } as path
1812
1811
FROM
0 commit comments