make ENV variable available to specify data location (#31)
* added option to specify data path in ENV variable
akmorrow13 authored Aug 27, 2020
1 parent ec5919d commit ca05f3d
Showing 8 changed files with 150 additions and 67 deletions.
30 changes: 16 additions & 14 deletions data/download_encode.py
@@ -14,6 +14,7 @@
import os
import urllib
import multiprocessing
from multiprocessing import Pool
import subprocess
import math
import argparse
@@ -128,7 +129,7 @@ def loj_overlap(feature_file):
if os.path.normpath(os.path.dirname(all_regions_file_unfiltered)) != os.path.normpath(output_path):
    shutil.copyfile(all_regions_file_unfiltered, os.path.join(output_path, "all.pos_unfiltered.bed"))

# gzipped tmp file
all_regions_file_unfiltered_gz = all_regions_file_unfiltered + ".gz"

# download metadata if it does not exist
@@ -168,6 +169,7 @@ def loj_overlap(feature_file):
filtered_chip = filtered_chip.drop_duplicates(subset=["Biosample term name","Experiment target"] , keep='last')



# only want assays that are shared between more than 3 cells
filtered_chip = filtered_chip.groupby("Experiment target").filter(lambda x: len(x) >= min_cells_per_chip)

@@ -188,12 +190,12 @@ def loj_overlap(feature_file):
##############################################################################################

def download_url(f, tries = 0):

    logger.warning("Trying to download %s for the %ith time..." % (f["File download URL"], tries))

    if tries == 2:
        raise Exception("File accession %s from URL %s failed to download 3 times. Exiting 1..." % (f['File accession'], f["File download URL"]))

    path = f["File download URL"]
    ext = path.split(".")[-1]
    if (ext == "gz" and path.split(".")[-2] == 'bed'):
@@ -236,11 +238,10 @@ def download_url(f, tries = 0):
download_url(f, tries + 1)

# download all files
# TODO AM uncomment
# rows = list(map(lambda x: x[1], filtered_files.iterrows()))
# pool = multiprocessing.Pool(processes=threads)
# pool.map(download_url, rows)
# pool.close()
rows = list(map(lambda x: x[1], filtered_files.iterrows()))
pool = multiprocessing.Pool(processes=threads)
pool.map(download_url, rows)
pool.close()

##############################################################################################
############################# window chromsizes into 200bp ###################################
@@ -320,6 +321,7 @@ def download_url(f, tries = 0):
compression='gzip', compression_opts=9)



bed_files = list(filter(lambda x: x.endswith(".bed") & x.startswith("ENC"), os.listdir(download_path)))
logger.info("Running bedtools on %i files..." % len(bed_files))

@@ -387,7 +389,7 @@ def save_epitome_numpy_data(download_dir, output_path):
# paths to save 0 reduced files to
all_regions_file = os.path.join(output_path, "all.pos.bed")
all_regions_file_gz = all_regions_file + ".gz"

if not os.path.exists(all_regions_file) or not os.path.exists(matrix_path):

    if not os.path.exists(output_path):
@@ -421,11 +423,12 @@ def save_epitome_numpy_data(download_dir, output_path):
matrix = h5_file.create_dataset("data", nonzero_data.shape, dtype='i',
compression='gzip', compression_opts=9)
matrix[:,:] = nonzero_data

h5_file.close()
logger.info("done saving matrix")



# gzip filtered all_regions_file
if not os.path.exists(all_regions_file_gz):
    stdout = open(all_regions_file_gz,"wb")
@@ -482,4 +485,3 @@ def save_epitome_numpy_data(download_dir, output_path):
os.remove(all_regions_file_unfiltered + ".tmp")
# remove h5 file with all zeros
os.remove(matrix_path_all) # remove h5 file with all zeros

44 changes: 35 additions & 9 deletions docs/usage/data.rst
@@ -2,32 +2,58 @@ Configuring data
================

Epitome pre-processes ChIP-seq peaks and DNase-seq peaks from ENCODE for usage
in the Epitome models. Pre-processed datasets are lazily downloaded from `Amazon S3 <../https://epitome-data.s3-us-west-1.amazonaws.com/data.zip>`__ when users run an Epitome model.
in the Epitome models. Pre-processed hg19 datasets are lazily downloaded from `Amazon S3 <https://epitome-data.s3-us-west-1.amazonaws.com/data.zip>`__ when users run an Epitome model.


This dataset contains the following files:

- **train.npz, valid.npz, and test.npz**: compressed numpy data matrices. Valid.npz includes chr7 data, test.npz includes chr8 and chr9, and train.npz includes data from all other chromosomes.

- **all.pos.bed.gz**: gunzipped genomic regions matching the numpy data matrices.
- **all.pos.bed.gz**: gzipped genomic regions matching columns in the numpy data matrices.

- **feature_name**: ChIP-seq and DNase-seq peaks corresponding to the data matrix.
- **feature_name**: ChIP-seq and DNase-seq assays corresponding to rows in the data matrices.
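
A minimal sketch of loading these files (paths are hypothetical; the exact array
names inside the .npz archives may differ):

.. code:: python

    import gzip
    import numpy as np

    data_dir = 'path/to/my/epitome/dataset'  # hypothetical location

    # compressed numpy matrix for the training chromosomes
    train = np.load(f'{data_dir}/train.npz', allow_pickle=True)

    # gzipped genomic regions; one BED record per matrix column
    with gzip.open(f'{data_dir}/all.pos.bed.gz', 'rt') as f:
        regions = [line.strip().split('\t') for line in f]

    # one assay/cell-type label per matrix row
    with open(f'{data_dir}/feature_name') as f:
        feature_names = [line.strip() for line in f]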


Generating data for Epitome
---------------------------

You can generate your own Epitome dataset from ENCODE using the following command:
```download_encode.py```.
``download_encode.py``.

.. code:: bash

    python download_encode.py -h

    usage: download_encode.py [-h] [--metadata_url METADATA_URL]
                              [--min_chip_per_cell MIN_CHIP_PER_CELL]
                              [--regions_file REGIONS_FILE]
                              download_path {hg19,mm10,GRCh38} bigBedToBed
                              output_path

    positional arguments:
      download_path         Temporary path to download bed/bigbed files to.
      {hg19,mm10,GRCh38}    assembly to filter files in metadata.tsv file by.
      output_path           path to save file data to

    optional arguments:
      -h, --help            show this help message and exit
      --metadata_url METADATA_URL
                            ENCODE metadata URL.
      --min_chip_per_cell MIN_CHIP_PER_CELL
                            Minimum ChIP-seq experiments for each cell type.
      --min_cells_per_chip MIN_CELLS_PER_CHIP
                            Minimum cells a given ChIP-seq target must be observed
                            in.
      --regions_file REGIONS_FILE
                            File to read regions from
      --bgzip BGZIP         Path to bgzip executable
      --bigBedToBed BIGBEDTOBED
                            Path to bigBedToBed executable, downloaded from
                            http://hgdownload.cse.ucsc.edu/admin/exe/
To use your own dataset in an Epitome model, set the environment variable
``EPITOME_DATA_PATH`` to point to your custom dataset. This tells Epitome where to load
data from.

.. code:: python

    import os
    os.environ["EPITOME_DATA_PATH"] = 'path/to/my/epitome/dataset'
    ...
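
When ``EPITOME_DATA_PATH`` is unset, Epitome falls back to the default location
``~/.epitome/data`` (see ``GET_DATA_PATH`` in ``epitome/__init__.py`` below).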
TODO: need to add this script as a binary in the module.
24 changes: 18 additions & 6 deletions epitome/__init__.py
@@ -21,14 +21,26 @@

S3_DATA_PATH = 'https://epitome-data.s3-us-west-1.amazonaws.com/data.zip'

# os env that should be set by user to explicitly set the data path
EPITOME_DATA_PATH_ENV="EPITOME_DATA_PATH"

# data files required by epitome
POSITIONS_FILE = "all.pos.bed.gz"
FEATURE_NAME_FILE = "feature_name"
REQUIRED_FILES = [POSITIONS_FILE,"train.npz","valid.npz", FEATURE_NAME_FILE,"test.npz"]

def GET_EPITOME_USER_PATH():
    return os.path.join(os.path.expanduser('~'), '.epitome')

# relative path to data
def GET_DATA_PATH():
    return os.path.join(GET_EPITOME_USER_PATH(),'data')

POSITIONS_FILE = "all.pos.bed.gz"
FEATURE_NAME_FILE = "feature_name"
REQUIRED_FILES = [POSITIONS_FILE,"train.npz","valid.npz", FEATURE_NAME_FILE,"test.npz"]
    """
    Check if user has set env variable that specifies data path.
    Otherwise, use default location.

    Returns:
        location of epitome data with all required files
    """
    if os.environ.get("EPITOME_DATA_PATH") is not None:
        return os.environ["EPITOME_DATA_PATH"]
    else:
        return os.path.join(GET_EPITOME_USER_PATH(),'data')
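
A short sketch of the resulting behavior (illustrative only; the path below is
hypothetical and assumes the epitome package is importable):

    import os
    from epitome import GET_DATA_PATH

    os.environ["EPITOME_DATA_PATH"] = "/mnt/datasets/epitome"  # hypothetical path
    print(GET_DATA_PATH())  # -> /mnt/datasets/epitome

    del os.environ["EPITOME_DATA_PATH"]
    print(GET_DATA_PATH())  # -> ~/.epitome/data (the default)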
31 changes: 30 additions & 1 deletion epitome/functions.py
Expand Up @@ -33,6 +33,7 @@
from .constants import *
import scipy.sparse
import pyranges as pr
from sklearn.metrics import jaccard_score

import warnings
from operator import itemgetter
@@ -398,7 +399,6 @@ def get_assays_from_feature_file(feature_name_file = None,

# finally, make sure that all assays that were specified are in assaymap
# if not, throw an error and print the reason.
print(eligible_assays is not None)
if eligible_assays is not None:

    missing = [i for i in eligible_assays if i not in list(assaymap)]
@@ -633,3 +633,32 @@ def concatenate_all_data(data, region_file):
data[Dataset.VALID], # chr7
data[Dataset.TEST], # chr 8 and 9
data[Dataset.TRAIN][:,chr6_end:]],axis=1) # all the rest of the chromosomes




def order_by_similarity(matrix, cellmap, assaymap, cell, data, compare_assay = 'DNase'):
    """
    Orders list of cellmap names by similarity to comparison cell.

    Args:
        :param matrix: numpy matrix specifying location of each assay/cell in data
        :param cellmap: map of cell: index in matrix
        :param assaymap: map of assay: index in matrix
        :param cell: name of cell type, should be in cellmap
        :param data: numpy matrix of data to run comparison. rows index into assay/cell
        :param compare_assay: assay to use to compare cell types. Default = DNase

    Returns:
        list of cell line names ordered by DNase similarity to cell (most similar is first)
    """

    # data for cell line to compare all other cell lines to
    compare_arr = data[matrix[cellmap[cell], assaymap[compare_assay]],:]

    # calculate jaccard score
    corrs = np.array([jaccard_score(data[matrix[cellmap[c], assaymap[compare_assay]],:], compare_arr) for c in list(cellmap)])

    tmp = sorted(zip(corrs, list(cellmap)), key = lambda x: x[0], reverse=True)
    return list(map(lambda x: x[1],tmp))
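
A toy sketch of the new helper (cell lines and values are hypothetical):

    import numpy as np

    # 3 cell lines x 1 assay; matrix maps (cell, assay) -> row index into data
    cellmap  = {'K562': 0, 'HepG2': 1, 'GM12878': 2}
    assaymap = {'DNase': 0}
    matrix   = np.array([[0], [1], [2]])
    data = np.array([[1, 1, 0, 0, 1, 0],   # K562 DNase peaks over 6 regions
                     [1, 1, 0, 1, 1, 0],   # HepG2
                     [0, 0, 1, 1, 0, 1]])  # GM12878

    print(order_by_similarity(matrix, cellmap, assaymap, 'K562', data))
    # ['K562', 'HepG2', 'GM12878']  (jaccard scores: 1.0, 0.75, 0.0)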
