
Commit

Move data h5 (#46)
* ENCODE data processing now running with h5

* tests pass with aws data
akmorrow13 authored Jan 27, 2021
1 parent 2e22426 commit a0f7cc6
Showing 24 changed files with 5,725 additions and 2,561 deletions.
3,962 changes: 3,505 additions & 457 deletions data/ChIP_target_types.csv

Large diffs are not rendered by default.

519 changes: 44 additions & 475 deletions data/download_chip_atlas.py

Large diffs are not rendered by default.

484 changes: 74 additions & 410 deletions data/download_encode.py

Large diffs are not rendered by default.

460 changes: 460 additions & 0 deletions data/download_functions.py

Large diffs are not rendered by default.

163 changes: 163 additions & 0 deletions data/parallel_download.py
@@ -0,0 +1,163 @@
# Code for running file downloads in parallel

# imports
from multiprocessing import Pool
import multiprocessing as mp
import argparse
from download_functions import *
import os
import pandas as pd
import numpy as np
import pyranges as pr
import h5py

logger = set_logger()

# number of threads
threads = mp.cpu_count()
logger.info("%i threads available for processing" % threads)

##############################################################################################
############################################# PARSE USER ARGUMENTS ###########################
##############################################################################################

# Parser for user specific locations
parser = argparse.ArgumentParser(description='Downloads bed files in parallel and processes them to npy files')

parser.add_argument('download_path', help='Temporary path to download temporary files to.', type=str)
parser.add_argument('assembly', help='assembly to filter files in metadata.tsv file by.', choices=['ce10', 'ce11', 'dm3', 'dm6', 'hg19', 'hg38', 'GRCh38', 'mm10', 'mm9', 'rn6', 'sacCer3'], type=str)

parser.add_argument('--metadata_path', type=str,
                    help='Path to ChIP-Atlas metadata csv file.')

parser.add_argument('--min_chip_per_cell', help='Minimum ChIP-seq experiments for each cell type.', type=int, default=1)
parser.add_argument('--min_cells_per_chip', help='Minimum cells a given ChIP-seq target must be observed in.', type=int, default=3)

parser.add_argument('--all_regions_file', help='File to read regions from', type=str, default=None)
parser.add_argument('--bigBedToBed', help='Path to bigBedToBed executable, downloaded from http://hgdownload.cse.ucsc.edu/admin/exe/', type=str, default='bigBedToBed')

args = parser.parse_args()

download_path = args.download_path
all_regions_file_unfiltered = args.all_regions_file
metadata_path = args.metadata_path
min_chip_per_cell = args.min_chip_per_cell
min_cells_per_chip = args.min_cells_per_chip
assembly = args.assembly
bigBedToBed = args.bigBedToBed

# where to temporarily store np files
tmp_download_path = os.path.join(download_path, "tmp_np")
bed_download_path = os.path.join(download_path, "downloads")

# make paths if they do not exist
if not os.path.exists(tmp_download_path):
    os.makedirs(tmp_download_path)
if not os.path.exists(bed_download_path):
    os.makedirs(bed_download_path)

replicate_groups = get_metadata_groups(metadata_path,
                                       assembly,
                                       min_chip_per_cell = min_chip_per_cell,
                                       min_cells_per_chip = min_cells_per_chip)

# create matrix or load in existing
matrix_path_all = os.path.join(download_path, 'train_total.h5') # all sites

# collect all regions and merge by chromosome, count number of 200bp bins
pyDF = pr.read_bed(all_regions_file_unfiltered)

# get number of genomic regions in all.pos.bed file
nregions = len(pyDF)
logger.info("Counted %i total unfiltered regions" % nregions)

def processGroups(n):
    '''
    Process a group of enumerated dataframe rows for an (antigen, cell type) pair.
    Args:
        :param n: row from a grouped dataframe, ((antigen, celltype), samples)
        :param tmp_download_path: where npz files should be saved to
    :return tuple: tuple of (tmp_file_save, cell, target)
    '''
    target, cell = n[0] # tuple of ((antigen, celltype), samples)
    samples = n[1]

    id_ = samples.iloc[0]['Experimental ID'] # just use the first ID for the filename

    if target == 'DNase-Seq' or target == 'DNase-seq' or target == "ATAC-seq":
        target = target.split("-")[0] # remove "Seq/seq"

    # create a temporary file
    # save appends '.npz' to the end of the filename
    tmp_file_save = os.path.join(tmp_download_path, id_)

    # if there is data for this row, it was already written, so skip it.
    if os.path.exists(tmp_file_save + ".npz"):
        logger.info("Skipping %s, %s, already written to %s" % (target, cell, tmp_file_save))
        arr = np.load(tmp_file_save + ".npz", allow_pickle=True)['data'].astype('i1') # int8
    else:
        logger.info("writing into matrix for %s, %s" % (target, cell))

        if samples.iloc[0]['Source'] == "ENCODE":
            downloaded_files = [download_ENCODE_url(sample, bed_download_path, bigBedToBed) for i, sample in samples.iterrows()]
        else:
            downloaded_files = [download_CHIPAtlas_url(sample, bed_download_path) for i, sample in samples.iterrows()]

        downloaded_files = list(filter(lambda x: x is not None, downloaded_files))

        # filter out bed files with fewer than 200 peaks
        downloaded_files = list(filter(lambda x: count_lines(x) > 200, downloaded_files))

        arr = lojs_overlap(downloaded_files, pyDF)

        np.savez_compressed(tmp_file_save, data=arr)

    if np.sum(arr) == 0:
        return None
    else:
        return (tmp_file_save, cell, target)



with Pool(threads) as p:
    # list of tuples for each group, where each tuple is (tmp_file_save, cell, target)
    results = p.map(processGroups, replicate_groups)

results = [i for i in results if i is not None]

# load cells and targets into a dataframe
cellTypes = [i[1] for i in results]
targets = [i[2] for i in results]
row_df = pd.DataFrame({'cellType': cellTypes, 'target': targets})

### save matrix
if os.path.exists(matrix_path_all):
    # appending: open the existing file and make sure the dataset shape has not changed
    h5_file = h5py.File(matrix_path_all, "r+")
    matrix = h5_file['data']
    assert matrix.shape[1] == nregions, "%s has wrong ncols %i, should be %i" % (matrix_path_all, matrix.shape[1], nregions)
    assert matrix.shape[0] == len(results), "%s has wrong nrows %i, should be %i" % (matrix_path_all, matrix.shape[0], len(results))

else:
    h5_file = h5py.File(matrix_path_all, "w")
    matrix = h5_file.create_dataset("data", (len(results), nregions), dtype='i1', # int8
                                    compression='gzip', compression_opts=9)

for i, (f, cell, target) in enumerate(results):

    matrix[i,:] = np.load(f + ".npz", allow_pickle=True)['data'].astype('i1') # int8

    if i % 100 == 0:
        logger.info("Writing %i, feature %s,%s..." % (i, cell, target))

h5_file.close()

# save row_df to be loaded into main
row_df.to_csv(os.path.join(download_path, "row_df.csv"))

logger.info("Done saving sparse data")


# the matrix can be read back in using:
# > import h5py
# > tmp = h5py.File(os.path.join(download_path, 'train_total.h5'), "r")
# > tmp['data']
4 changes: 3 additions & 1 deletion docs/index.rst
@@ -13,14 +13,16 @@ Epitome is a computational model that leverages chromatin accessibility data to
:maxdepth: 2

installation/source
usage/data
usage/dataset

.. toctree::
:caption: Usage and Examples
:maxdepth: 2

usage/dataset
usage/train
usage/predict
usage/create_dataset
usage/api


61 changes: 0 additions & 61 deletions docs/usage/data.rst

This file was deleted.

74 changes: 74 additions & 0 deletions docs/usage/dataset.rst
@@ -0,0 +1,74 @@
Creating an Epitome Dataset
===========================

This section explains how to load an Epitome Dataset. If you
are interested in pre-processing your own dataset from ENCODE or
ChIP-Atlas, see `Configuring data <./create_dataset.html>`__.

First, import EpitomeDataset:

.. code:: python

    from epitome.dataset import *

Create an Epitome Dataset
-------------------------

First, create an Epitome Dataset. In the dataset, you will define the
ChIP-seq targets you want to predict, the cell types you want to train from,
and the assays you want to use to compute cell type similarity.

.. code:: python

    targets = ['CTCF','RAD21','SMC3']
    celltypes = ['K562', 'A549', 'GM12878']
    dataset = EpitomeDataset(targets, celltypes)

Note that you do not have to define ``celltypes``. If you leave ``celltypes``
blank, the Epitome dataset will choose cell types that have coverage for the
ChIP-seq targets chosen. The parameters ``min_cells_per_target`` and ``min_targets_per_cell``
specify the minimum number of cells required for a ChIP-seq target, and the minimum
number of ChIP-seq targets required to include a celltype. By default,
``min_cells_per_target = 3`` and ``min_targets_per_cell = 2``.


.. code:: python

    targets = ['CTCF','RAD21','SMC3']
    dataset = EpitomeDataset(targets,
                             min_cells_per_target = 4, # requires that each ChIP-seq target has data from at least 4 cell types
                             min_targets_per_cell = 3) # requires that each cell type has data for all three ChIP-seq targets

Note that by default, EpitomeDataset uses DNase-seq (DNase) to compute
cell type similarity between cell types. To use a different set of assays for
cell type similarity, specify them when constructing the Epitome dataset:

.. code:: python

    dataset = EpitomeDataset(targets, celltypes, similarity_targets = ['DNase', 'H3K27ac'])

You can then visualize the ChIP-seq targets and cell types in your dataset by
using the ``view()`` function:

.. code:: python

    dataset.view()

To list all of the ChIP-seq targets that an Epitome dataset has available data for,
you can define an Epitome Dataset without specifying ``targets`` or ``cells``.
You can then use the ``list_targets()`` function to print all available ChIP-seq targets
in the dataset:

.. code:: python

    dataset = EpitomeDataset()
    dataset.list_targets() # prints > 200 ChIP-seq targets

You can now use your ``dataset`` in an Epitome model.
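For example, a minimal sketch of passing the dataset to a model, assuming the ``VLP`` model class shown in the prediction docs is importable from ``epitome.models``:

.. code:: python

    from epitome.dataset import *
    from epitome.models import VLP   # assumed import path for the model class

    targets = ['CTCF','RAD21','SMC3']
    dataset = EpitomeDataset(targets)

    model = VLP(dataset)   # build a model from the dataset
    model.train(5000)      # train for 5000 iterations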
39 changes: 33 additions & 6 deletions docs/usage/predict.rst
@@ -1,5 +1,5 @@
Predicting peaks for ChIP-seq targets
=====================================


Once you have `trained a model <./train.html>`__, you can predict on your own cell types.
@@ -11,20 +11,47 @@ To get predictions on the whole genome, run:

.. code:: python

    peak_result = model.score_whole_genome(peak_files,  # list of bed files containing similarity data (either chromatin accessibility, histone modifications, or other)
                                           output_path, # where to save results
                                           chrs=["chr8","chr9"]) # chromosomes you would like to score. Leave blank to score the whole genome.

**Note:** Scoring on the whole genome scores about 7 million regions and takes about 1.5 hours.

Using histone modifications to compute cell type similarity
-----------------------------------------------------------
``peak_files`` is a list of bed or narrowPeak files. Each file represents a different
assay from your cell type of interest that is used to compute cell type similarity.
If you only use DNase-seq to compute cell type similarity, ``peak_files`` should contain a single
bed file of either ATAC-seq or DNase-seq peaks. If you use additional assays to compute
cell type similarity, such as histone modifications, include a separate bed file
for each assay.

For example, the following example trains an Epitome model using DNase-seq and H3K27ac to compute cell type
similarity, and then predicts using the ``score_whole_genome`` function:

.. code:: python

    # define the dataset, using DNase-seq and H3K27ac to compute similarity
    targets = ['CTCF', 'RAD21', 'SMC3']
    dataset = EpitomeDataset(targets, similarity_targets=['DNase', 'H3K27ac'])

    # create and train model
    model = VLP(dataset)
    model.train(5000)

    # list of paths to bed files for similarity assays for a cell type of interest
    peak_files = ['my_DNase_peaks.bed', 'my_H3K27ac_peaks.bed']

    peak_result = model.score_whole_genome(peak_files,  # list of bed files containing similarity data (either chromatin accessibility, histone modifications, or other)
                                           output_path, # where to save results
                                           chrs=["chr8","chr9"]) # chromosomes you would like to score. Leave blank to score the whole genome.

You can also get predictions on specific genomic regions:

.. code:: python

    results = model.score_peak_file(peak_files,   # list of bed files containing similarity data for your cell type of interest
                                    regions_file) # bed file of regions to score

This method returns a dataframe of the scored predictions.
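Because the result is a standard pandas dataframe, you can inspect or save it with the usual pandas methods. A minimal sketch (the exact columns depend on the regions and ChIP-seq targets in your dataset):

.. code:: python

    results = model.score_peak_file(peak_files, regions_file)

    print(results.head())                            # preview the first few scored regions
    results.to_csv('predictions.csv', index=False)   # save predictions for downstream analysis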