make ENV variable available to specify data location (#31)
* added option to specify data path in ENV variable
akmorrow13 authored Aug 27, 2020
1 parent ec5919d commit ca05f3d
Showing 8 changed files with 150 additions and 67 deletions.
30 changes: 16 additions & 14 deletions data/download_encode.py
@@ -14,6 +14,7 @@
import os
import urllib
import multiprocessing
from multiprocessing import Pool
import subprocess
import math
import argparse
@@ -128,7 +129,7 @@ def loj_overlap(feature_file):
if os.path.normpath(os.path.dirname(all_regions_file_unfiltered)) != os.path.normpath(output_path):
    shutil.copyfile(all_regions_file_unfiltered, os.path.join(output_path, "all.pos_unfiltered.bed"))

# gzipped tmp file
all_regions_file_unfiltered_gz = all_regions_file_unfiltered + ".gz"

# download metadata if it does not exist
@@ -168,6 +169,7 @@ def loj_overlap(feature_file):
filtered_chip = filtered_chip.drop_duplicates(subset=["Biosample term name","Experiment target"] , keep='last')



# only want assays that are shared between more than 3 cells
filtered_chip = filtered_chip.groupby("Experiment target").filter(lambda x: len(x) >= min_cells_per_chip)

@@ -188,12 +190,12 @@ def loj_overlap(feature_file):
##############################################################################################

def download_url(f, tries = 0):

    logger.warning("Trying to download %s for the %ith time..." % (f["File download URL"], tries))

    if tries == 2:
        raise Exception("File accession %s from URL %s failed to download 3 times. Exiting 1..." % (f['File accession'], f["File download URL"]))

    path = f["File download URL"]
    ext = path.split(".")[-1]
    if (ext == "gz" and path.split(".")[-2] == 'bed'):
@@ -236,11 +238,10 @@ def download_url(f, tries = 0):
download_url(f, tries + 1)

# download all files
# TODO AM uncomment
# rows = list(map(lambda x: x[1], filtered_files.iterrows()))
# pool = multiprocessing.Pool(processes=threads)
# pool.map(download_url, rows)
# pool.close()
rows = list(map(lambda x: x[1], filtered_files.iterrows()))
pool = multiprocessing.Pool(processes=threads)
pool.map(download_url, rows)
pool.close()

##############################################################################################
############################# window chromsizes into 200bp ###################################
@@ -320,6 +321,7 @@ def download_url(f, tries = 0):
compression='gzip', compression_opts=9)



bed_files = list(filter(lambda x: x.endswith(".bed") & x.startswith("ENC"), os.listdir(download_path)))
logger.info("Running bedtools on %i files..." % len(bed_files))

@@ -387,7 +389,7 @@ def save_epitome_numpy_data(download_dir, output_path):
# paths to save 0 reduced files to
all_regions_file = os.path.join(output_path, "all.pos.bed")
all_regions_file_gz = all_regions_file + ".gz"

if not os.path.exists(all_regions_file) or not os.path.exists(matrix_path):

    if not os.path.exists(output_path):
@@ -421,11 +423,12 @@ def save_epitome_numpy_data(download_dir, output_path):
matrix = h5_file.create_dataset("data", nonzero_data.shape, dtype='i',
compression='gzip', compression_opts=9)
matrix[:,:] = nonzero_data

h5_file.close()
logger.info("done saving matrix")



# gzip filtered all_regions_file
if not os.path.exists(all_regions_file_gz):
    stdout = open(all_regions_file_gz,"wb")
@@ -482,4 +485,3 @@ def save_epitome_numpy_data(download_dir, output_path):
os.remove(all_regions_file_unfiltered + ".tmp")
# remove h5 file with all zeros
os.remove(matrix_path_all) # remove h5 file with all zeros

44 changes: 35 additions & 9 deletions docs/usage/data.rst
@@ -2,32 +2,58 @@ Configuring data
================

Epitome pre-processes ChIP-seq peaks and DNase-seq peaks from ENCODE for usage
in the Epitome models. Pre-processed datasets are lazily downloaded from `Amazon S3 <../https://epitome-data.s3-us-west-1.amazonaws.com/data.zip>`__ when users run an Epitome model.
in the Epitome models. Pre-processed hg19 datasets are lazily downloaded from `Amazon S3 <https://epitome-data.s3-us-west-1.amazonaws.com/data.zip>`__ when users run an Epitome model.


This dataset contains the following files:

- **train.npz, valid.npz, and test.npz**: compressed numpy data matrices. Valid.npz includes chr7 data, test.npz includes chr8 and chr9, and train.npz includes data from all other chromosomes.

- **all.pos.bed.gz**: gunzipped genomic regions matching the numpy data matrices.
- **all.pos.bed.gz**: gzipped genomic regions matching columns in the numpy data matrices.

- **feature_name**: ChIP-seq and DNase-seq peaks corresponding to the data matrix.
- **feature_name**: ChIP-seq and DNase-seq assays corresponding to rows in the data matrices.
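
A minimal sketch of loading these files (paths are hypothetical; the exact array
names inside the .npz archives may differ):

.. code:: python

    import gzip
    import numpy as np

    data_dir = 'path/to/my/epitome/dataset'  # hypothetical location

    # compressed numpy matrix for the training chromosomes
    train = np.load(f'{data_dir}/train.npz', allow_pickle=True)

    # gzipped genomic regions; one BED record per matrix column
    with gzip.open(f'{data_dir}/all.pos.bed.gz', 'rt') as f:
        regions = [line.strip().split('\t') for line in f]

    # one assay/cell-type label per matrix row
    with open(f'{data_dir}/feature_name') as f:
        feature_names = [line.strip() for line in f]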


Generating data for Epitome
---------------------------

You can generate your own Epitome dataset from ENCODE using the following command:
```download_encode.py```.
``download_encode.py``.

.. code:: bash

    python download_encode.py -h

    usage: download_encode.py [-h] [--metadata_url METADATA_URL]
                              [--min_chip_per_cell MIN_CHIP_PER_CELL]
                              [--regions_file REGIONS_FILE]
                              download_path {hg19,mm10,GRCh38} bigBedToBed
                              output_path

    positional arguments:
      download_path         Temporary path to download bed/bigbed files to.
      {hg19,mm10,GRCh38}    assembly to filter files in metadata.tsv file by.
      output_path           path to save file data to

    optional arguments:
      -h, --help            show this help message and exit
      --metadata_url METADATA_URL
                            ENCODE metadata URL.
      --min_chip_per_cell MIN_CHIP_PER_CELL
                            Minimum ChIP-seq experiments for each cell type.
      --min_cells_per_chip MIN_CELLS_PER_CHIP
                            Minimum cells a given ChIP-seq target must be observed
                            in.
      --regions_file REGIONS_FILE
                            File to read regions from
      --bgzip BGZIP         Path to bgzip executable
      --bigBedToBed BIGBEDTOBED
                            Path to bigBedToBed executable, downloaded from
                            http://hgdownload.cse.ucsc.edu/admin/exe/
To use your own dataset in an Epitome model, set the environment variable
``EPITOME_DATA_PATH`` to point to your custom dataset. This tells Epitome where to load
data from.

.. code:: python

    import os
    os.environ["EPITOME_DATA_PATH"] = 'path/to/my/epitome/dataset'
    ...
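
When ``EPITOME_DATA_PATH`` is unset, Epitome falls back to the default location
``~/.epitome/data`` (see ``GET_DATA_PATH`` in ``epitome/__init__.py`` below).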
TODO: need to add this script as a binary in the module.
24 changes: 18 additions & 6 deletions epitome/__init__.py
@@ -21,14 +21,26 @@

S3_DATA_PATH = 'https://epitome-data.s3-us-west-1.amazonaws.com/data.zip'

# os env that should be set by user to explicitly set the data path
EPITOME_DATA_PATH_ENV="EPITOME_DATA_PATH"

# data files required by epitome
POSITIONS_FILE = "all.pos.bed.gz"
FEATURE_NAME_FILE = "feature_name"
REQUIRED_FILES = [POSITIONS_FILE,"train.npz","valid.npz", FEATURE_NAME_FILE,"test.npz"]

def GET_EPITOME_USER_PATH():
    return os.path.join(os.path.expanduser('~'), '.epitome')

# relative path to data
def GET_DATA_PATH():
    return os.path.join(GET_EPITOME_USER_PATH(),'data')

POSITIONS_FILE = "all.pos.bed.gz"
FEATURE_NAME_FILE = "feature_name"
REQUIRED_FILES = [POSITIONS_FILE,"train.npz","valid.npz", FEATURE_NAME_FILE,"test.npz"]
    """
    Check if user has set env variable that specifies data path.
    Otherwise, use default location.

    Returns:
        location of epitome data with all required files
    """
    if os.environ.get("EPITOME_DATA_PATH") is not None:
        return os.environ["EPITOME_DATA_PATH"]
    else:
        return os.path.join(GET_EPITOME_USER_PATH(),'data')
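
A short sketch of the resulting behavior (illustrative only; the path below is
hypothetical and assumes the epitome package is importable):

    import os
    from epitome import GET_DATA_PATH

    os.environ["EPITOME_DATA_PATH"] = "/mnt/datasets/epitome"  # hypothetical path
    print(GET_DATA_PATH())  # -> /mnt/datasets/epitome

    del os.environ["EPITOME_DATA_PATH"]
    print(GET_DATA_PATH())  # -> ~/.epitome/data (the default)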
31 changes: 30 additions & 1 deletion epitome/functions.py
Expand Up @@ -33,6 +33,7 @@
from .constants import *
import scipy.sparse
import pyranges as pr
from sklearn.metrics import jaccard_score

import warnings
from operator import itemgetter
@@ -398,7 +399,6 @@ def get_assays_from_feature_file(feature_name_file = None,

# finally, make sure that all assays that were specified are in assaymap
# if not, throw an error and print the reason.
print(eligible_assays is not None)
if eligible_assays is not None:

    missing = [i for i in eligible_assays if i not in list(assaymap)]
@@ -633,3 +633,32 @@ def concatenate_all_data(data, region_file):
data[Dataset.VALID], # chr7
data[Dataset.TEST], # chr 8 and 9
data[Dataset.TRAIN][:,chr6_end:]],axis=1) # all the rest of the chromosomes




def order_by_similarity(matrix, cellmap, assaymap, cell, data, compare_assay = 'DNase'):
    """
    Orders list of cellmap names by similarity to comparison cell.

    Args:
        :param matrix: numpy matrix specifying location of each assay/cell in data
        :param cellmap: map of cell: index in matrix
        :param assaymap: map of assay: index in matrix
        :param cell: name of cell type, should be in cellmap
        :param data: numpy matrix of data to run comparison. rows index into assay/cell
        :param compare_assay: assay to use to compare cell types. Default = DNase

    Returns:
        list of cell line names ordered by DNase similarity to cell (most similar is first)
    """

    # data for cell line to compare all other cell lines to
    compare_arr = data[matrix[cellmap[cell], assaymap[compare_assay]],:]

    # calculate jaccard score
    corrs = np.array([jaccard_score(data[matrix[cellmap[c], assaymap[compare_assay]],:], compare_arr) for c in list(cellmap)])

    tmp = sorted(zip(corrs, list(cellmap)), key = lambda x: x[0], reverse=True)
    return list(map(lambda x: x[1],tmp))
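
A toy sketch of the new helper (cell lines and values are hypothetical):

    import numpy as np

    # 3 cell lines x 1 assay; matrix maps (cell, assay) -> row index into data
    cellmap  = {'K562': 0, 'HepG2': 1, 'GM12878': 2}
    assaymap = {'DNase': 0}
    matrix   = np.array([[0], [1], [2]])
    data = np.array([[1, 1, 0, 0, 1, 0],   # K562 DNase peaks over 6 regions
                     [1, 1, 0, 1, 1, 0],   # HepG2
                     [0, 0, 1, 1, 0, 1]])  # GM12878

    print(order_by_similarity(matrix, cellmap, assaymap, 'K562', data))
    # ['K562', 'HepG2', 'GM12878']  (jaccard scores: 1.0, 0.75, 0.0)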
