Skip to content

Commit b29b940

Browse files
committed
Updates to documentation
Added new tutorial and adjusted how the tutorial is displayed on the site. Added new functions from stats.py. Updated the README and requirements to reflect changes.
1 parent 2aba3e0 commit b29b940

File tree

14 files changed

+2090
-18913
lines changed

14 files changed

+2090
-18913
lines changed

coderdata/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,14 @@
1313

1414

1515
from .utils.utils import version
16-
from .utils.utils import list_datasets
16+
from .utils.utils import list_datasets
17+
18+
try:
19+
import matplotlib
20+
import seaborn as sns
21+
except ModuleNotFoundError:
22+
pass
23+
else:
24+
from .utils.stats import summarize_response_metric
25+
from .utils.stats import plot_response_metric
26+
from .utils.stats import plot_2d_respones_metric

coderdata/dataset/dataset.py

Lines changed: 91 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -39,41 +39,22 @@ class Split:
3939

4040
class Dataset:
4141

42-
data_format_params = {
43-
"samples": (
44-
"improve_sample_id", "cancer_type", "model_type", "common_name",
45-
"other_id", "other_names", "id_source", "species"
46-
),
47-
"transcriptomics": (
48-
"improve_sample_id", "entrez_id", "transcriptomics"
49-
),
50-
"proteomics": ("improve_sample_id", "entrez_id", "proteomics"),
51-
"mutations": ("improve_sample_id", "entrez_id", "mutation"),
52-
"copy_number": ("improve_sample_id", "entrez_id", "copy_number"),
53-
"methylation": ("improve_sample_id", "entrez_id", "methylation"),
54-
"experiments": (
55-
"improve_sample_id", "improve_drug_id", "dose_response_value"
56-
),
57-
"drugs": ("improve_drug_id", "chem_name", "isoSMILES"),
58-
"genes": ("entrez_id", "gene_symbol", "other_id")
59-
}
60-
6142
def __init__(
6243
self,
63-
name: str=None,
64-
transcriptomics: pd.DataFrame=None,
65-
proteomics: pd.DataFrame=None,
66-
mutations: pd.DataFrame=None,
67-
copy_number: pd.DataFrame=None,
68-
samples: pd.DataFrame=None,
69-
drugs: pd.DataFrame=None,
70-
drug_descriptors: pd.DataFrame=None,
71-
mirna: pd.DataFrame=None,
72-
experiments: pd.DataFrame=None,
73-
methylation: pd.DataFrame=None,
74-
metabolomics: pd.DataFrame=None,
75-
genes: pd.DataFrame=None,
76-
combinations: pd.DataFrame=None,
44+
name: Optional[str]=None,
45+
transcriptomics: Optional[pd.DataFrame]=None,
46+
proteomics: Optional[pd.DataFrame]=None,
47+
mutations: Optional[pd.DataFrame]=None,
48+
copy_number: Optional[pd.DataFrame]=None,
49+
samples: Optional[pd.DataFrame]=None,
50+
drugs: Optional[pd.DataFrame]=None,
51+
drug_descriptors: Optional[pd.DataFrame]=None,
52+
mirna: Optional[pd.DataFrame]=None,
53+
experiments: Optional[pd.DataFrame]=None,
54+
methylation: Optional[pd.DataFrame]=None,
55+
metabolomics: Optional[pd.DataFrame]=None,
56+
genes: Optional[pd.DataFrame]=None,
57+
combinations: Optional[pd.DataFrame]=None,
7758
):
7859
"""
7960
Load datasets of a specific type into predefined attributes of this class instance.
@@ -131,12 +112,6 @@ def __init__(
131112
# getters / setters & deleters
132113
# ----------------------------
133114

134-
135-
@property
136-
def data_format_params(self):
137-
return self._data_format_params
138-
139-
140115
@property
141116
def name(self):
142117
return self._name
@@ -330,10 +305,10 @@ def format(
330305
'experiments', 'combinations', 'drug_descriptor', 'drugs',
331306
'genes', 'samples',
332307
],
333-
use_polars: bool=False,
308+
remove_na: bool=False,
334309
**kwargs: dict,
335310
):
336-
return format(self, data_type=data_type, use_polars=use_polars, **kwargs)
311+
return format(self, data_type=data_type, remove_na=False, **kwargs)
337312

338313

339314
def split_train_other(
@@ -470,6 +445,21 @@ def load(
470445
_description_
471446
"""
472447

448+
data_types_to_load = (
449+
'transcriptomics',
450+
'proteomics',
451+
'mutations',
452+
'copy_number',
453+
'samples',
454+
'drugs',
455+
'drug_descriptors',
456+
'mirna',
457+
'experiments',
458+
'methylation',
459+
'metabolomics',
460+
'genes',
461+
)
462+
473463
if type(local_path) is not Path:
474464
try:
475465
local_path = Path(local_path)
@@ -487,30 +477,63 @@ def load(
487477
dataset = Dataset(name)
488478
accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz')
489479
print(f"Importing raw data ...", file=sys.stderr)
490-
for child in local_path.iterdir():
491-
if child.name in ["genes.csv", "genes.csv.gz"]:
480+
481+
# generating the file list that contains all files that need to
482+
# be imported based on the Dataset name
483+
files = {}
484+
for p in local_path.glob(f'{name}_*'):
485+
if p.name.endswith(accepted_file_endings) and p.is_file():
486+
dataset_type = p.name[len(name)+1:].split('.')[0]
487+
files[dataset_type] = p
488+
for p in local_path.glob(f'genes*'):
489+
if p.name.endswith(accepted_file_endings) and p.is_file():
490+
files['genes'] = p
491+
492+
for dataset_type in data_types_to_load:
493+
if dataset_type not in files:
492494
print(
493-
f"Importing 'genes' from {child} ...",
494-
end=' ',
495+
f"'{dataset_type}' not available for {name}",
496+
end='\n',
495497
file=sys.stderr
496498
)
497-
dataset.genes = _load_file(child)
498-
print("DONE", file=sys.stderr)
499-
500-
if (
501-
child.name.startswith(name)
502-
and child.name.endswith(accepted_file_endings)
503-
):
504-
505-
dataset_type = child.name[len(name)+1:].split('.')[0]
499+
continue
500+
file = files[dataset_type]
501+
if dataset_type != 'genes':
506502
print(
507-
f"Importing '{dataset_type}' from {child} ...",
503+
f"Importing '{dataset_type}' from {file} ...",
508504
end=' ',
509505
file=sys.stderr
510506
)
511507
if hasattr(dataset, dataset_type):
512-
setattr(dataset, dataset_type, _load_file(child))
508+
setattr(dataset, dataset_type, _load_file(file))
513509
print("DONE", file=sys.stderr)
510+
else:
511+
'''
512+
The genes dataset available in the online repository is
513+
universal and contains information on genes of all
514+
datasets. To that end it needs to be subsetted to only
515+
those genes that are associate with a specific cancer
516+
dataset.
517+
'''
518+
print(
519+
f"Importing 'genes' from {file} ...",
520+
end=' ',
521+
file=sys.stderr
522+
)
523+
dataset.genes = _load_file(file)
524+
525+
entrez_ids = set()
526+
for dataset_type in ('transcriptomics', 'proteomics',
527+
'mutations', 'copy_number'):
528+
if getattr(dataset, dataset_type) is not None:
529+
entrez_ids.update(list(
530+
getattr(dataset, dataset_type)['entrez_id'].unique()
531+
))
532+
dataset.genes = dataset.genes[
533+
dataset.genes['entrez_id'].isin(entrez_ids)
534+
]
535+
print("DONE", file=sys.stderr)
536+
514537
print(f"Importing raw data ... DONE", file=sys.stderr)
515538
return dataset
516539

@@ -526,6 +549,7 @@ def load(
526549
dataset = pickle.load(file=file)
527550
print("DONE", file=sys.stderr)
528551
return dataset
552+
raise FileNotFoundError("No suitable pickle file found.")
529553

530554

531555

@@ -536,7 +560,7 @@ def format(
536560
'experiments', 'combinations', 'drug_descriptor', 'drugs',
537561
'genes', 'samples',
538562
],
539-
use_polars: bool=False,
563+
remove_na: bool=False,
540564
**kwargs: dict,
541565
):
542566

@@ -642,6 +666,8 @@ def format(
642666
columns = 'dose_response_metric',
643667
values = 'dose_response_value'
644668
).reset_index().rename_axis(None, axis=1)
669+
if remove_na:
670+
ret.dropna(axis='index', inplace=True)
645671
elif shape == 'matrix':
646672
if len(metrics) > 1:
647673
raise ValueError(
@@ -654,7 +680,6 @@ def format(
654680
index='improve_drug_id',
655681
columns='improve_sample_id'
656682
)
657-
return ret
658683

659684
elif data_type == "combinations":
660685
raise NotImplementedError(
@@ -771,7 +796,7 @@ def split_train_test_validate(
771796
train, other = _split_two_way(
772797
data=data,
773798
split_type=split_type,
774-
ratio=[ratio[0], ratio[1] + ratio[2]],
799+
ratio=(ratio[0], ratio[1] + ratio[2]),
775800
stratify_by=stratify_by,
776801
balance=balance,
777802
random_state=random_state,
@@ -781,7 +806,7 @@ def split_train_test_validate(
781806
test, val = _split_two_way(
782807
data=other,
783808
split_type=split_type,
784-
ratio=[ratio[1], ratio[2]],
809+
ratio=(ratio[1], ratio[2]),
785810
stratify_by=stratify_by,
786811
balance=balance,
787812
random_state=random_state,
@@ -993,10 +1018,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
9931018
return data_ret
9941019

9951020
def _balance_data(
996-
data: pd.Dataframe,
1021+
data: pd.DataFrame,
9971022
random_state: Optional[Union[int,RandomState]]=None,
9981023
# oversample: bool=False,
999-
) -> pd.Dataframe:
1024+
) -> pd.DataFrame:
10001025
tmp = deepcopy(data)
10011026
counts = tmp.value_counts('split_class')
10021027
ret_df = (
@@ -1012,7 +1037,7 @@ def _create_classes(
10121037
metric: str,
10131038
num_classes: int=2,
10141039
quantiles: bool=True,
1015-
thresh: float=None,
1040+
thresh: Optional[float]=None,
10161041
) -> pd.DataFrame:
10171042
"""
10181043
Helper function that bins experiment data into a number of defined
@@ -1101,7 +1126,7 @@ def _split_two_way(
11011126
split_type: Literal[
11021127
'mixed-set', 'drug-blind', 'cancer-blind'
11031128
]='mixed-set',
1104-
ratio: tuple[int, int, int]=(8,2),
1129+
ratio: tuple[int, int]=(8,2),
11051130
balance: bool=False,
11061131
stratify_by: Optional[str]=None,
11071132
random_state: Optional[Union[int,RandomState]]=None,
@@ -1207,7 +1232,8 @@ def _split_two_way(
12071232
columns = 'dose_response_metric',
12081233
values = 'dose_response_value'
12091234
).reset_index()
1210-
1235+
if stratify_by is not None:
1236+
df_full.dropna(axis='index', subset=[stratify_by], inplace=True)
12111237
# Defining the split sizes.
12121238
train_size = float(ratio[0]) / sum(ratio)
12131239
test_val_size = float(ratio[1]) / sum(ratio)

coderdata/download/downloader.py

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# coderdata/download/downloader.py
22

33
from importlib import resources
4+
from hashlib import md5
45
from pathlib import Path
56
from os import PathLike
67
import os
@@ -86,22 +87,40 @@ def download(
8687

8788
for file_name, file_data in unique_files.items():
8889
file_info = file_data['file_info']
89-
file_url = file_info['download_url']
90-
90+
file_id = str(file_info['id'])
91+
file_url = "https://api.figshare.com/v2/file/download/" + file_id
92+
file_md5sum = file_info['supplied_md5']
93+
retry_count = 10
9194
# Download the file
92-
with requests.get(file_url, stream=True) as r:
93-
r.raise_for_status()
94-
if file_name.exists() and not exist_ok:
95+
while retry_count > 0:
96+
with requests.get(file_url, stream=True) as r:
97+
r.raise_for_status()
98+
if file_name.exists() and not exist_ok:
99+
warnings.warn(
100+
f"{file_name} already exists. Use argument 'exist_ok=True'"
101+
"to overwrite existing file."
102+
)
103+
else:
104+
with open(file_name, 'wb') as f:
105+
for chunk in r.iter_content(chunk_size=8192):
106+
f.write(chunk)
107+
with open(file_name, 'rb') as f:
108+
check_md5sum = md5(f.read()).hexdigest()
109+
if file_md5sum == check_md5sum:
110+
break
111+
elif retry_count > 0:
95112
warnings.warn(
96-
f"{file_name} already exists. Use argument 'exist_ok=True'"
97-
"to overwrite existing file."
98-
)
99-
else:
100-
with open(file_name, 'wb') as f:
101-
for chunk in r.iter_content(chunk_size=8192):
102-
f.write(chunk)
103-
104-
print(f"Downloaded '{file_url}' to '{file_name}'")
113+
f"{file_name} could not be downloaded successfully. "
114+
f"(expected md5sum: {file_md5sum} - "
115+
f"calculated md5sum: {check_md5sum})... retrying..."
116+
)
117+
retry_count = retry_count - 1
118+
if retry_count == 0:
119+
warnings.warn(
120+
f"{file_name} could not be downloaded. Try again."
121+
)
122+
else:
123+
print(f"Downloaded '{file_url}' to '{file_name}'")
105124

106125
return
107126

coderdata/utils/__init__.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,17 @@
11
from .utils import version
2-
from .utils import list_datasets
2+
from .utils import list_datasets
3+
4+
try:
5+
import matplotlib
6+
import seaborn as sns
7+
except ModuleNotFoundError:
8+
import warnings
9+
warnings.warn(
10+
"package was not availble. To use coderdata.utils.stats functions "
11+
"please make sure 'matplotlib' & 'seaborn' are available in the "
12+
"environment."
13+
)
14+
else:
15+
from .stats import summarize_response_metric
16+
from .stats import plot_response_metric
17+
from .stats import plot_2d_respones_metric

0 commit comments

Comments
 (0)