@@ -39,41 +39,22 @@ class Split:
39
39
40
40
class Dataset :
41
41
42
- data_format_params = {
43
- "samples" : (
44
- "improve_sample_id" , "cancer_type" , "model_type" , "common_name" ,
45
- "other_id" , "other_names" , "id_source" , "species"
46
- ),
47
- "transcriptomics" : (
48
- "improve_sample_id" , "entrez_id" , "transcriptomics"
49
- ),
50
- "proteomics" : ("improve_sample_id" , "entrez_id" , "proteomics" ),
51
- "mutations" : ("improve_sample_id" , "entrez_id" , "mutation" ),
52
- "copy_number" : ("improve_sample_id" , "entrez_id" , "copy_number" ),
53
- "methylation" : ("improve_sample_id" , "entrez_id" , "methylation" ),
54
- "experiments" : (
55
- "improve_sample_id" , "improve_drug_id" , "dose_response_value"
56
- ),
57
- "drugs" : ("improve_drug_id" , "chem_name" , "isoSMILES" ),
58
- "genes" : ("entrez_id" , "gene_symbol" , "other_id" )
59
- }
60
-
61
42
def __init__ (
62
43
self ,
63
- name : str = None ,
64
- transcriptomics : pd .DataFrame = None ,
65
- proteomics : pd .DataFrame = None ,
66
- mutations : pd .DataFrame = None ,
67
- copy_number : pd .DataFrame = None ,
68
- samples : pd .DataFrame = None ,
69
- drugs : pd .DataFrame = None ,
70
- drug_descriptors : pd .DataFrame = None ,
71
- mirna : pd .DataFrame = None ,
72
- experiments : pd .DataFrame = None ,
73
- methylation : pd .DataFrame = None ,
74
- metabolomics : pd .DataFrame = None ,
75
- genes : pd .DataFrame = None ,
76
- combinations : pd .DataFrame = None ,
44
+ name : Optional [ str ] = None ,
45
+ transcriptomics : Optional [ pd .DataFrame ] = None ,
46
+ proteomics : Optional [ pd .DataFrame ] = None ,
47
+ mutations : Optional [ pd .DataFrame ] = None ,
48
+ copy_number : Optional [ pd .DataFrame ] = None ,
49
+ samples : Optional [ pd .DataFrame ] = None ,
50
+ drugs : Optional [ pd .DataFrame ] = None ,
51
+ drug_descriptors : Optional [ pd .DataFrame ] = None ,
52
+ mirna : Optional [ pd .DataFrame ] = None ,
53
+ experiments : Optional [ pd .DataFrame ] = None ,
54
+ methylation : Optional [ pd .DataFrame ] = None ,
55
+ metabolomics : Optional [ pd .DataFrame ] = None ,
56
+ genes : Optional [ pd .DataFrame ] = None ,
57
+ combinations : Optional [ pd .DataFrame ] = None ,
77
58
):
78
59
"""
79
60
Load datasets of a specific type into predefined attributes of this class instance.
@@ -131,12 +112,6 @@ def __init__(
131
112
# getters / setters & deleters
132
113
# ----------------------------
133
114
134
-
135
- @property
136
- def data_format_params (self ):
137
- return self ._data_format_params
138
-
139
-
140
115
@property
141
116
def name (self ):
142
117
return self ._name
@@ -330,10 +305,10 @@ def format(
330
305
'experiments' , 'combinations' , 'drug_descriptor' , 'drugs' ,
331
306
'genes' , 'samples' ,
332
307
],
333
- use_polars : bool = False ,
308
+ remove_na : bool = False ,
334
309
** kwargs : dict ,
335
310
):
336
- return format (self , data_type = data_type , use_polars = use_polars , ** kwargs )
311
+ return format (self , data_type = data_type , remove_na = False , ** kwargs )
337
312
338
313
339
314
def split_train_other (
@@ -470,6 +445,21 @@ def load(
470
445
_description_
471
446
"""
472
447
448
+ data_types_to_load = (
449
+ 'transcriptomics' ,
450
+ 'proteomics' ,
451
+ 'mutations' ,
452
+ 'copy_number' ,
453
+ 'samples' ,
454
+ 'drugs' ,
455
+ 'drug_descriptors' ,
456
+ 'mirna' ,
457
+ 'experiments' ,
458
+ 'methylation' ,
459
+ 'metabolomics' ,
460
+ 'genes' ,
461
+ )
462
+
473
463
if type (local_path ) is not Path :
474
464
try :
475
465
local_path = Path (local_path )
@@ -487,30 +477,63 @@ def load(
487
477
dataset = Dataset (name )
488
478
accepted_file_endings = ('.csv' , '.tsv' , '.csv.gz' , '.tsv.gz' )
489
479
print (f"Importing raw data ..." , file = sys .stderr )
490
- for child in local_path .iterdir ():
491
- if child .name in ["genes.csv" , "genes.csv.gz" ]:
480
+
481
+ # generating the file list that contains all files that need to
482
+ # be imported based on the Dataset name
483
+ files = {}
484
+ for p in local_path .glob (f'{ name } _*' ):
485
+ if p .name .endswith (accepted_file_endings ) and p .is_file ():
486
+ dataset_type = p .name [len (name )+ 1 :].split ('.' )[0 ]
487
+ files [dataset_type ] = p
488
+ for p in local_path .glob (f'genes*' ):
489
+ if p .name .endswith (accepted_file_endings ) and p .is_file ():
490
+ files ['genes' ] = p
491
+
492
+ for dataset_type in data_types_to_load :
493
+ if dataset_type not in files :
492
494
print (
493
- f"Importing 'genes' from { child } ... " ,
494
- end = ' ' ,
495
+ f"' { dataset_type } ' not available for { name } " ,
496
+ end = '\n ' ,
495
497
file = sys .stderr
496
498
)
497
- dataset .genes = _load_file (child )
498
- print ("DONE" , file = sys .stderr )
499
-
500
- if (
501
- child .name .startswith (name )
502
- and child .name .endswith (accepted_file_endings )
503
- ):
504
-
505
- dataset_type = child .name [len (name )+ 1 :].split ('.' )[0 ]
499
+ continue
500
+ file = files [dataset_type ]
501
+ if dataset_type != 'genes' :
506
502
print (
507
- f"Importing '{ dataset_type } ' from { child } ..." ,
503
+ f"Importing '{ dataset_type } ' from { file } ..." ,
508
504
end = ' ' ,
509
505
file = sys .stderr
510
506
)
511
507
if hasattr (dataset , dataset_type ):
512
- setattr (dataset , dataset_type , _load_file (child ))
508
+ setattr (dataset , dataset_type , _load_file (file ))
513
509
print ("DONE" , file = sys .stderr )
510
+ else :
511
+ '''
512
+ The genes dataset available in the online repository is
513
+ universal and contains information on genes of all
514
+ datasets. To that end it needs to be subsetted to only
515
+ those genes that are associated with a specific cancer
516
+ dataset.
517
+ '''
518
+ print (
519
+ f"Importing 'genes' from { file } ..." ,
520
+ end = ' ' ,
521
+ file = sys .stderr
522
+ )
523
+ dataset .genes = _load_file (file )
524
+
525
+ entrez_ids = set ()
526
+ for dataset_type in ('transcriptomics' , 'proteomics' ,
527
+ 'mutations' , 'copy_number' ):
528
+ if getattr (dataset , dataset_type ) is not None :
529
+ entrez_ids .update (list (
530
+ getattr (dataset , dataset_type )['entrez_id' ].unique ()
531
+ ))
532
+ dataset .genes = dataset .genes [
533
+ dataset .genes ['entrez_id' ].isin (entrez_ids )
534
+ ]
535
+ print ("DONE" , file = sys .stderr )
536
+
514
537
print (f"Importing raw data ... DONE" , file = sys .stderr )
515
538
return dataset
516
539
@@ -526,6 +549,7 @@ def load(
526
549
dataset = pickle .load (file = file )
527
550
print ("DONE" , file = sys .stderr )
528
551
return dataset
552
+ raise FileNotFoundError ("No suitable pickle file found." )
529
553
530
554
531
555
@@ -536,7 +560,7 @@ def format(
536
560
'experiments' , 'combinations' , 'drug_descriptor' , 'drugs' ,
537
561
'genes' , 'samples' ,
538
562
],
539
- use_polars : bool = False ,
563
+ remove_na : bool = False ,
540
564
** kwargs : dict ,
541
565
):
542
566
@@ -642,6 +666,8 @@ def format(
642
666
columns = 'dose_response_metric' ,
643
667
values = 'dose_response_value'
644
668
).reset_index ().rename_axis (None , axis = 1 )
669
+ if remove_na :
670
+ ret .dropna (axis = 'index' , inplace = True )
645
671
elif shape == 'matrix' :
646
672
if len (metrics ) > 1 :
647
673
raise ValueError (
@@ -654,7 +680,6 @@ def format(
654
680
index = 'improve_drug_id' ,
655
681
columns = 'improve_sample_id'
656
682
)
657
- return ret
658
683
659
684
elif data_type == "combinations" :
660
685
raise NotImplementedError (
@@ -771,7 +796,7 @@ def split_train_test_validate(
771
796
train , other = _split_two_way (
772
797
data = data ,
773
798
split_type = split_type ,
774
- ratio = [ ratio [0 ], ratio [1 ] + ratio [2 ]] ,
799
+ ratio = ( ratio [0 ], ratio [1 ] + ratio [2 ]) ,
775
800
stratify_by = stratify_by ,
776
801
balance = balance ,
777
802
random_state = random_state ,
@@ -781,7 +806,7 @@ def split_train_test_validate(
781
806
test , val = _split_two_way (
782
807
data = other ,
783
808
split_type = split_type ,
784
- ratio = [ ratio [1 ], ratio [2 ]] ,
809
+ ratio = ( ratio [1 ], ratio [2 ]) ,
785
810
stratify_by = stratify_by ,
786
811
balance = balance ,
787
812
random_state = random_state ,
@@ -993,10 +1018,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
993
1018
return data_ret
994
1019
995
1020
def _balance_data (
996
- data : pd .Dataframe ,
1021
+ data : pd .DataFrame ,
997
1022
random_state : Optional [Union [int ,RandomState ]]= None ,
998
1023
# oversample: bool=False,
999
- ) -> pd .Dataframe :
1024
+ ) -> pd .DataFrame :
1000
1025
tmp = deepcopy (data )
1001
1026
counts = tmp .value_counts ('split_class' )
1002
1027
ret_df = (
@@ -1012,7 +1037,7 @@ def _create_classes(
1012
1037
metric : str ,
1013
1038
num_classes : int = 2 ,
1014
1039
quantiles : bool = True ,
1015
- thresh : float = None ,
1040
+ thresh : Optional [ float ] = None ,
1016
1041
) -> pd .DataFrame :
1017
1042
"""
1018
1043
Helper function that bins experiment data into a number of defined
@@ -1101,7 +1126,7 @@ def _split_two_way(
1101
1126
split_type : Literal [
1102
1127
'mixed-set' , 'drug-blind' , 'cancer-blind'
1103
1128
]= 'mixed-set' ,
1104
- ratio : tuple [int , int , int ]= (8 ,2 ),
1129
+ ratio : tuple [int , int ]= (8 ,2 ),
1105
1130
balance : bool = False ,
1106
1131
stratify_by : Optional [str ]= None ,
1107
1132
random_state : Optional [Union [int ,RandomState ]]= None ,
@@ -1207,7 +1232,8 @@ def _split_two_way(
1207
1232
columns = 'dose_response_metric' ,
1208
1233
values = 'dose_response_value'
1209
1234
).reset_index ()
1210
-
1235
+ if stratify_by is not None :
1236
+ df_full .dropna (axis = 'index' , subset = [stratify_by ], inplace = True )
1211
1237
# Defining the split sizes.
1212
1238
train_size = float (ratio [0 ]) / sum (ratio )
1213
1239
test_val_size = float (ratio [1 ]) / sum (ratio )
0 commit comments