
Commit

Move data h5 (#46)
* ENCODE data processing now running with h5

* tests pass with aws data
akmorrow13 authored Jan 27, 2021
1 parent 2e22426 commit a0f7cc6
Showing 24 changed files with 5,725 additions and 2,561 deletions.
3,962 changes: 3,505 additions & 457 deletions data/ChIP_target_types.csv

Large diffs are not rendered by default.

519 changes: 44 additions & 475 deletions data/download_chip_atlas.py

Large diffs are not rendered by default.

484 changes: 74 additions & 410 deletions data/download_encode.py

Large diffs are not rendered by default.

460 changes: 460 additions & 0 deletions data/download_functions.py

Large diffs are not rendered by default.

163 changes: 163 additions & 0 deletions data/parallel_download.py
@@ -0,0 +1,163 @@
# Code for running file downloads in parallel

# imports
from multiprocessing import Pool
import multiprocessing as mp
import argparse
from download_functions import *
import os
import pandas as pd
import numpy as np
import pyranges as pr
import h5py

logger = set_logger()

# number of threads
threads = mp.cpu_count()
logger.info("%i threads available for processing" % threads)

##############################################################################################
############################################# PARSE USER ARGUMENTS ###########################
##############################################################################################

# Parser for user specific locations
parser = argparse.ArgumentParser(description='Downloads bed files in parallel and processes them to npy files')

parser.add_argument('download_path', help='Temporary path to download temporary files to.', type=str)
parser.add_argument('assembly', help='assembly to filter files in metadata.tsv file by.', choices=['ce10', 'ce11', 'dm3', 'dm6', 'hg19', 'hg38', 'GRCh38', 'mm10', 'mm9', 'rn6', 'sacCer3'], type=str)

parser.add_argument('--metadata_path', type=str,
                    help='Path to ChIP-Atlas metadata csv file.')

parser.add_argument('--min_chip_per_cell', help='Minimum ChIP-seq experiments for each cell type.', type=int, default=1)
parser.add_argument('--min_cells_per_chip', help='Minimum cells a given ChIP-seq target must be observed in.', type=int, default=3)

parser.add_argument('--all_regions_file', help='File to read regions from', type=str, default=None)
parser.add_argument('--bigBedToBed', help='Path to bigBedToBed executable, downloaded from http://hgdownload.cse.ucsc.edu/admin/exe/', type=str, default='bigBedToBed')

args = parser.parse_args()

download_path = args.download_path
all_regions_file_unfiltered = args.all_regions_file
metadata_path = args.metadata_path
min_chip_per_cell = args.min_chip_per_cell
min_cells_per_chip = args.min_cells_per_chip
assembly = args.assembly
bigBedToBed = args.bigBedToBed

# where to temporarily store np files
tmp_download_path = os.path.join(download_path, "tmp_np")
bed_download_path = os.path.join(download_path, "downloads")

# make paths if they do not exist
if not os.path.exists(tmp_download_path):
    os.makedirs(tmp_download_path)
if not os.path.exists(bed_download_path):
    os.makedirs(bed_download_path)

replicate_groups = get_metadata_groups(metadata_path,
                                       assembly,
                                       min_chip_per_cell = min_chip_per_cell,
                                       min_cells_per_chip = min_cells_per_chip)

# create matrix or load in existing
matrix_path_all = os.path.join(download_path, 'train_total.h5') # all sites

# collect all regions and merge by chromosome, count number of 200bp bins
pyDF = pr.read_bed(all_regions_file_unfiltered)

# get number of genomic regions in all.pos.bed file
nregions = len(pyDF)
logger.info("Counted %i total unfiltered regions" % nregions)

def processGroups(n):
    '''
    Process a group of enumerated dataframe rows for an (antigen, cell type) pair.
    Args:
        :param n: row from a grouped dataframe, ((antigen, celltype), samples)
        :param tmp_download_path: where npz files should be saved to
    :return tuple: tuple of (tmp_file_save, cell, target)
    '''
    target, cell = n[0] # tuple of ((antigen, celltype), samples)
    samples = n[1]

    id_ = samples.iloc[0]['Experimental ID'] # just use the first ID for the filename

    if target == 'DNase-Seq' or target == 'DNase-seq' or target == "ATAC-seq":
        target = target.split("-")[0] # remove "Seq/seq"

    # create a temporary file
    # save appends '.npz' to the end of the filename
    tmp_file_save = os.path.join(tmp_download_path, id_)

    # if there is data for this row, it was already written, so skip it.
    if os.path.exists(tmp_file_save + ".npz"):
        logger.info("Skipping %s, %s, already written to %s" % (target, cell, tmp_file_save))
        arr = np.load(tmp_file_save + ".npz", allow_pickle=True)['data'].astype('i1') # int8
    else:
        logger.info("writing into matrix for %s, %s" % (target, cell))

        if samples.iloc[0]['Source'] == "ENCODE":
            downloaded_files = [download_ENCODE_url(sample, bed_download_path, bigBedToBed) for i, sample in samples.iterrows()]
        else:
            downloaded_files = [download_CHIPAtlas_url(sample, bed_download_path) for i, sample in samples.iterrows()]

        downloaded_files = list(filter(lambda x: x is not None, downloaded_files))

        # filter out bed files with fewer than 200 peaks
        downloaded_files = list(filter(lambda x: count_lines(x) > 200, downloaded_files))

        arr = lojs_overlap(downloaded_files, pyDF)

        np.savez_compressed(tmp_file_save, data=arr)

    if np.sum(arr) == 0:
        return None
    else:
        return (tmp_file_save, cell, target)



with Pool(threads) as p:
    # list of tuples for each group, where each tuple is (tmp_file_save, cell, target)
    results = p.map(processGroups, replicate_groups)

results = [i for i in results if i is not None]

# load cells and targets into a dataframe
cellTypes = [i[1] for i in results]
targets = [i[2] for i in results]
row_df = pd.DataFrame({'cellType': cellTypes, 'target': targets})

### save matrix
if os.path.exists(matrix_path_all):
    # appending: open the existing file and make sure the dataset shape has not changed
    h5_file = h5py.File(matrix_path_all, "r+")
    matrix = h5_file['data']
    assert matrix.shape[1] == nregions, "%s has wrong ncols %i, should be %i" % (matrix_path_all, matrix.shape[1], nregions)
    assert matrix.shape[0] == len(results), "%s has wrong nrows %i, should be %i" % (matrix_path_all, matrix.shape[0], len(results))

else:
    h5_file = h5py.File(matrix_path_all, "w")
    matrix = h5_file.create_dataset("data", (len(results), nregions), dtype='i1', # int8
                                    compression='gzip', compression_opts=9)

for i, (f, cell, target) in enumerate(results):

    matrix[i,:] = np.load(f + ".npz", allow_pickle=True)['data'].astype('i1') # int8

    if i % 100 == 0:
        logger.info("Writing %i, feature %s,%s..." % (i, cell, target))

h5_file.close()

# save row_df to be loaded into main
row_df.to_csv(os.path.join(download_path, "row_df.csv"))

logger.info("Done saving sparse data")


# the matrix can be read back in using:
# > import h5py
# > tmp = h5py.File(os.path.join(download_path, 'train_total.h5'), "r")
# > tmp['data']
4 changes: 3 additions & 1 deletion docs/index.rst
@@ -13,14 +13,16 @@ Epitome is a computational model that leverages chromatin accessibility data to
:maxdepth: 2

installation/source
usage/data
usage/dataset

.. toctree::
:caption: Usage and Examples
:maxdepth: 2

usage/dataset
usage/train
usage/predict
usage/create_dataset
usage/api


61 changes: 0 additions & 61 deletions docs/usage/data.rst

This file was deleted.

74 changes: 74 additions & 0 deletions docs/usage/dataset.rst
@@ -0,0 +1,74 @@
Creating an Epitome Dataset
===========================

This section explains how to load an Epitome Dataset. If you
are interested in pre-processing your own dataset from ENCODE or
ChIP-Atlas, see `Configuring data <./create_dataset.html>`__.

First, import EpitomeDataset:

.. code:: python

    from epitome.dataset import *

Create an Epitome Dataset
-------------------------

First, create an Epitome Dataset. In the dataset, you will define the
ChIP-seq targets you want to predict, the cell types you want to train from,
and the assays you want to use to compute cell type similarity.

.. code:: python

    targets = ['CTCF','RAD21','SMC3']
    celltypes = ['K562', 'A549', 'GM12878']
    dataset = EpitomeDataset(targets, celltypes)

Note that you do not have to define ``celltypes``. If you leave ``celltypes``
blank, the Epitome dataset will choose cell types that have coverage for the
ChIP-seq targets chosen. The parameters ``min_cells_per_target`` and ``min_targets_per_cell``
specify the minimum number of cells required for a ChIP-seq target, and the minimum
number of ChIP-seq targets required to include a celltype. By default,
``min_cells_per_target = 3`` and ``min_targets_per_cell = 2``.


.. code:: python

    targets = ['CTCF','RAD21','SMC3']
    dataset = EpitomeDataset(targets,
                             min_cells_per_target = 4, # requires that each ChIP-seq target has data from at least 4 cell types
                             min_targets_per_cell = 3) # requires that each cell type has data for all three ChIP-seq targets

Note that by default, EpitomeDataset uses DNase-seq (DNase) to compute
cell type similarity between cell types. To use a different set of assays for
cell type similarity, specify them when constructing the Epitome dataset:

.. code:: python

    dataset = EpitomeDataset(targets, celltypes, similarity_targets = ['DNase', 'H3K27ac'])

You can then visualize the ChIP-seq targets and cell types in your dataset by
using the ``view()`` function:

.. code:: python

    dataset.view()

To list all of the ChIP-seq targets that an Epitome dataset has available data for,
you can define an Epitome Dataset without specifying ``targets`` or ``cells``.
You can then use the ``list_targets()`` function to print all available ChIP-seq targets
in the dataset:

.. code:: python

    dataset = EpitomeDataset()
    dataset.list_targets() # prints > 200 ChIP-seq targets

You can now use your ``dataset`` in an Epitome model.
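For example, a minimal sketch of passing the dataset to a model, assuming the ``VLP`` model class shown in the prediction docs is importable from ``epitome.models``:

.. code:: python

    from epitome.dataset import *
    from epitome.models import VLP   # assumed import path for the model class

    targets = ['CTCF','RAD21','SMC3']
    dataset = EpitomeDataset(targets)

    model = VLP(dataset)   # build a model from the dataset
    model.train(5000)      # train for 5000 iterations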
39 changes: 33 additions & 6 deletions docs/usage/predict.rst
@@ -1,5 +1,5 @@
Predicting peaks for ChIP-seq targets
=====================================


Once you have `trained a model <./train.html>`__, you can predict on your own cell types.
@@ -11,20 +11,47 @@ To get predictions on the whole genome, run:

.. code:: python

    peak_result = model.score_whole_genome(peak_files,  # list of bed files containing similarity data (either chromatin accessibility, histone modifications, or other)
                                           output_path, # where to save results
                                           chrs=["chr8","chr9"]) # chromosomes you would like to score. Leave blank to score the whole genome.

**Note:** Scoring on the whole genome scores about 7 million regions and takes about 1.5 hours.

Using histone modifications to compute cell type similarity
-----------------------------------------------------------
``peak_files`` is a list of bed or narrowPeak files. Each file represents a different
assay from your cell type of interest that is used to compute cell type similarity.
If you only use DNase-seq to compute cell type similarity, ``peak_files`` should contain a single
bed file of either ATAC-seq or DNase-seq peaks. If you use additional assays to compute
cell type similarity, such as histone modifications, include a separate bed file
for each assay.

For example, the following example trains an Epitome model using DNase-seq and H3K27ac to compute cell type
similarity, and then predicts using the ``score_whole_genome`` function:

.. code:: python

    # define the dataset, using DNase-seq and H3K27ac to compute similarity
    targets = ['CTCF', 'RAD21', 'SMC3']
    dataset = EpitomeDataset(targets, similarity_targets=['DNase', 'H3K27ac'])

    # create and train model
    model = VLP(dataset)
    model.train(5000)

    # list of paths to bed files for similarity assays for a cell type of interest
    peak_files = ['my_DNase_peaks.bed', 'my_H3K27ac_peaks.bed']

    peak_result = model.score_whole_genome(peak_files,  # list of bed files containing similarity data (either chromatin accessibility, histone modifications, or other)
                                           output_path, # where to save results
                                           chrs=["chr8","chr9"]) # chromosomes you would like to score. Leave blank to score the whole genome.

You can also get predictions on specific genomic regions:

.. code:: python

    results = model.score_peak_file(peak_files,   # list of bed files containing similarity data for your cell type of interest
                                    regions_file) # bed file of regions to score

This method returns a dataframe of the scored predictions.
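Because the result is a standard pandas dataframe, you can inspect or save it with the usual pandas methods. A minimal sketch (the exact columns depend on the regions and ChIP-seq targets in your dataset):

.. code:: python

    results = model.score_peak_file(peak_files, regions_file)

    print(results.head())                            # preview the first few scored regions
    results.to_csv('predictions.csv', index=False)   # save predictions for downstream analysis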