-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path01-gather-data.Rmd
98 lines (83 loc) · 6.04 KB
/
01-gather-data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
---
title: "Match Data"
output: html_notebook
---
```{bash, download-metadata}
mkdir -p data
# download the sample metadata v1.3 from GitHub, make sure you download this into "sample_metadata_file" defined in .Rprofile
wget -q -O data/IHEC_metadata_harmonization.v1.3.extended.csv https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/9b195bfe18a9e1a8a74096ed16716ffe87ce0c29/openrefine/v1.3/IHEC_metadata_harmonization.v1.3.extended.csv
mkdir -p processed_data
```
```{r, find-epirr-ids}
# Load sample metadata
metadata <- fread(sample_metadata_file)
### make a list of all epirr_ids which contain RNA-Seq, WGBS and observed or imputed ChIP-Seq data
# first subset the metadata to only include the columns epirr_id and automated_experiments*
experiment_cols <- metadata[, c("epirr_id_without_version", grep("automated_experiments", names(metadata), value = TRUE))]
experiment_dt <- metadata[, ..experiment_cols]
# remove imputed WGBS imputed entries, because we do not use them
experiment_dt[automated_experiments_WGBS_standard == 'imputed', automated_experiments_WGBS_standard := ""]
# fill empty cells in WGBS column with PBAT column
experiment_dt[automated_experiments_WGBS_standard == "", automated_experiments_WGBS_standard := automated_experiments_WGBS_PBAT]
# create RNA-Seq column with mRNA-seq and total RNA-seq if mRNA-Seq is empty
experiment_dt[, `automated_experiments_RNA-Seq` := `automated_experiments_RNA-Seq_mRNA-Seq`]
experiment_dt[`automated_experiments_RNA-Seq` == "", `automated_experiments_RNA-Seq` := `automated_experiments_RNA-Seq_total-RNA-Seq`]
# get chip-seq cols
chip_seq_cols <- grep("automated_experiments_ChIP-Seq", names(experiment_dt), value = TRUE)
# check if all chip-seq columns are filled
full_chip <- rowSums(experiment_dt[, lapply(.SD, function(x) x == ""), .SDcols = chip_seq_cols]) == 0
# get epirr_ids which have all three data types
epirr_ids_to_use <- experiment_dt[full_chip & automated_experiments_WGBS_standard != "" & `automated_experiments_RNA-Seq` != "", epirr_id_without_version]
```
```{r, select-data-to-download}
# subset metadata again using the epirr_ids we want to use
download_metadata <- metadata[epirr_id_without_version %in% epirr_ids_to_use, ..experiment_cols]
# melt data table
download_metadata <- melt(download_metadata, id.vars = "epirr_id_without_version", variable.name = "column", value.name = "uuid", na.rm = TRUE)
# split the column names to get the assay type and experiment type
download_metadata <- download_metadata[, c("assay_type", "experiment_type"):=tstrsplit(column, "_", fixed = TRUE, keep=c(3, 4))]
# remove imputed WGBS data
download_metadata[assay_type == 'WGBS' & uuid == 'imputed', uuid := NA]
# set empty uuids to NA
download_metadata[uuid == "", uuid := NA]
# get the epirr_ids which have both RNA-Seq types and set the totalRN to NA
both_RNA <- download_metadata[assay_type == 'RNA-Seq', if(.SD[, sum(!is.na(uuid)) > 1]).(both_RNA=TRUE), by=epirr_id_without_version]
download_metadata[epirr_id_without_version %in% both_RNA[, epirr_id_without_version] & experiment_type == 'total-RNA-Seq', uuid := NA]
# remove empty uuids
download_metadata <- download_metadata[!is.na(uuid)]
# make sure we have 405 entries with 3 different assays and 8 different entries
stopifnot(download_metadata[, .(unique_assays=uniqueN(assay_type), unique_experiments=uniqueN(experiment_type)), by = epirr_id_without_version][, all(unique_assays == 3) & all(unique_experiments == 8)])
```
```{r, make-ftp-list}
# read experiment metadata for observed and imputed data
experiment_metadata <- fread("data/EpiATLAS_experiment_metadata.csv") # from https://ihec.sd4h.ca/IHEC/readme/EpiATLAS_experiment_metadata.csv
# add epirr_id for the specific version and software version
download_metadata[experiment_metadata, on=.NATURAL, c("epirr_id", "software_version"):=.(epirr_id, software_version)]
imputed_experiment_metadata <- fread("data/EpiATLAS_imputed_experiment_metadata.csv") # from https://ihec.sd4h.ca/IHEC/readme/EpiATLAS_imputed_experiment_metadata.csv
# add epirr_id for the specific version and data_path
imputed_experiment_metadata[, epirr_id_without_version:=tstrsplit(epirr_id, ".", fixed=TRUE, keep=1)]
imputed_experiment_metadata[, uuid := 'imputed']
imputed_experiment_metadata[, experiment_type := mark]
download_metadata[imputed_experiment_metadata, on=c('epirr_id_without_version', 'uuid', 'experiment_type'), c("epirr_id", "data_file_path"):=.(i.epirr_id, data_file_path)]
# create the file paths from filenames
download_metadata[assay_type == 'ChIP-Seq', filename := paste("ihec.chipseq.ihec-chipseq-containerv1.1.4", epirr_id, uuid ,"pval.signal.bigwig", sep = '.')]
download_metadata[assay_type == 'RNA-Seq', filename := paste("ihec.rna-seq.ihec-grapenf-containerv1.1.0", epirr_id, uuid ,"isoforms.results", sep = '.')]
download_metadata[, file_path := file.path('IHEC/base_data', epirr_id_without_version, experiment_type, filename)]
# different path for imputed ChIP-Seq data
download_metadata[uuid == 'imputed', file_path := sub('incoming', "IHEC/post_processed_data", data_file_path, fixed = TRUE)]
# write the downloaded files to file for later
fwrite(download_metadata, "processed_data/file_table.csv.gz")
# we download WGBS data using the post processed WGBS matrices
wgbs_matrices <- paste0("IHEC/post_processed_data/WGBS_matrices/meth10/chr", seq.int(22), ".meth10.csv.gz")
final_paths <- c(download_metadata[assay_type != 'WGBS', file_path], wgbs_matrices)
writeLines(final_paths, "processed_data/ftp_paths.txt")
# now you can download the files in ftp_paths.txt
```
here we need to download the files from the sftp
```{r, check-downloaded-files}
# check whether all files are downloaded:
# rna:
stopifnot("some RNA-Seq files are not downloaded yet"=all(file.exists(download_metadata[assay_type == 'RNA-Seq', file.path(rna_data_dir, basename(file_path))])),
"some ChIP-Seq files are not downloaded yet"=all(file.exists(download_metadata[assay_type == 'ChIP-Seq', file.path(chip_data_dir, basename(file_path))])),
"some WGBS matrices are not downloaded yet"=all(file.exists(file.path(wgbs_matrices_data_dir, basename(wgbs_matrices)))))
```