Update README and rename model to EpitomeModel (#66)

* update README * changed model name to EpitomeModel
YosefLab · Feb 8, 2021 · 193c906 · 193c906
1 parent ee966fb
commit 193c906
Show file tree

Hide file tree

Showing 8 changed files with 64 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -4,15 +4,15 @@ Pipeline for predicting ChIP-seq peaks in novel cell types using chromatin acces
 
 ![Epitome Diagram](docs/figures/epitome_diagram_celllines.png)
 
-Epitome leverages chromatin accessibility data to predict transcription factor binding sites on a novel cell type of interest. Epitome computes the chromatin similarity between 11 cell types in ENCODE and the novel cell types, and uses chromatin similarity to transfer binding information in known cell types to a novel cell type of interest.
+Epitome leverages chromatin accessibility (either DNase-seq or ATAC-seq) to predict epigenetic events in a novel cell type of interest. Such epigenetic events include transcription factor binding sites and histone modifications. Epitome computes chromatin accessibility similarity between ENCODE cell types and the novel cell type, and uses this information to transfer known epigenetic signal to the novel cell type of interest.
 
 
-## Requirements:
+## Requirements
 * [conda](https://docs.conda.io/en/latest/miniconda.html)
-* python > 3.6
+* python >= 3.6
 
-## Setup and Installation:
-1. Create and activate a conda venv:
+## Setup and Installation
+1. Create and activate a conda environment:
 ```
 conda create --name EpitomeEnv python=3.6 pip
 source activate EpitomeEnv
@@ -22,58 +22,80 @@ source activate EpitomeEnv
 pip install epitome
 ```
 
-# Install Epitome for development:
-```
-make develop
-```
 
-Note: Epitome is configured for tensorflow 2.1.0/Cuda 9. If you have a different
-version of cuda, update tensorflow-gpu version accordingly.
+## Training a Model
 
-To check your Cuda version:
-```
-nvcc --version
-```
+TODO: link to documentation
+
+First, create an Epitome dataset that defines the cell types and ChIP-seq
+targets you want to train on:
 
-## Training a Model
 
 ```python
 
-    print(list_assays()) # list of available ChIP-seq targets epitome can predict on
+    from epitome.dataset import *
+
+    targets = ['CTCF','RAD21','SMC3']
+    celltypes = ['K562', 'A549', 'GM12878']
+
+    dataset = EpitomeDataset(targets=targets, cells=celltypes)
+
+```
+
+Now, you can create and train your model:
+
+```python
 
     from epitome.models import *
-    model = VLP(['CTCF', 'SMC3', 'RAD21'])
-    model.train(5000) # train for 5000 iterations
+
+    model = EpitomeModel(dataset, test_celltypes = ["K562"])
+    model.train(5000) # train for 5000 batches
 ```
 
 ## Evaluate a Model:
 
 ```python
 
-   model.test(1000) # evaluate how well the model performs on a validation set
+   model.test(1000) # evaluate how well the model performs on a validation chromosome
 
 ```
 
-## Predict using a Model:
+## Using Epitome on your own dataset:
 
 Epitome can perform genome wide predictions or region specific predictions on
-a new DNase-seq or ATAC-seq sample.
+a sample that has either DNase-seq or ATAC-seq.
 
 To score specific regions:
 
 ```python
 
    chromatin_peak_file = ... # path to peak called ATAC-seq or DNase-seq in bed format
    regions_file = ...        # path to bed file of regions to score
-   results = model.score_peak_file(chromatin_peak_file, regions_file)
+   results = model.score_peak_file([chromatin_peak_file], regions_file)
 
 ```
 
 To score on the whole genome:
+
 ```python
 
    chromatin_peak_file = ... # path to peak called ATAC-seq or DNase-seq in bed format
    file_prefix = ...        # file to save compressed numpy predictions to.
-   model.score_peak_file(chromatin_peak_file, file_prefix)
+   model.score_whole_genome([chromatin_peak_file], file_prefix)
+
+```
+
+
+# Install Epitome for development
 
+To build Epitome for development, run:
+
+```
+make develop
+```
+
+## Running unit tests
+
+```
+make test
 ```
diff --git a/docs/figures/epitome_diagram_celllines.png b/docs/figures/epitome_diagram_celllines.png
diff --git a/docs/usage/predict.rst b/docs/usage/predict.rst
@@ -36,7 +36,7 @@ similarity, and then predicts using the ``score_whole_genome`` function:
   dataset = EpitomeDataset(targets, similarity_targets=['DNase', 'H3K27ac'])
 
   # create and train model
-  model = VLP(dataset)
+  model = EpitomeModel(dataset)
   model.train(5000)
 
   # list of paths to bed files for similarity assays for a cell type of interest

diff --git a/docs/usage/train.rst b/docs/usage/train.rst
@@ -32,7 +32,7 @@ Now, you can create a model:
 
 .. code:: python
 
-	model = VLP(dataset, test_celltypes = ["K562"]) # cell line reserved for testing
+	model = EpitomeModel(dataset, test_celltypes = ["K562"]) # cell line reserved for testing
 
 Next, train the model. Here, we train the model for 5000 iterations:
 

diff --git a/epitome/models.py b/epitome/models.py
@@ -7,8 +7,8 @@
 .. autosummary::
   :toctree: _generate/
 
-  VariationalPeakModel
-  VLP
+  PeakModel
+  EpitomeModel
 """
 
 
@@ -34,7 +34,7 @@
 #################### Variational Peak Model ###########################
 #######################################################################
 
-class VariationalPeakModel():
+class PeakModel():
     """
     Model for learning from ChIP-seq peaks.
     """
@@ -574,7 +574,7 @@ def score_peak_file(self, similarity_peak_files, regions_peak_file):
         return pd.concat([compareObject.compare_df(), preds_df], axis=1)
 
 
-class VLP(VariationalPeakModel):
+class EpitomeModel(PeakModel):
     def __init__(self,
              *args,
              **kwargs):
@@ -584,7 +584,7 @@ def __init__(self,
 
             .. code-block:: python
 
-                model = VLP(checkpoint=path_to_saved_model)
+                model = EpitomeModel(checkpoint=path_to_saved_model)
         '''
         self.activation = tf.tanh
         self.layers = 2
@@ -599,7 +599,7 @@ def __init__(self,
             metadata['dataset'] = dataset
             del metadata['dataset_params']
 
-            VariationalPeakModel.__init__(self, **metadata, **kwargs)
+            PeakModel.__init__(self, **metadata, **kwargs)
             file = h5py.File(os.path.join(kwargs["checkpoint"], "weights.h5"), 'r')
 
             # load model weights back in
@@ -610,7 +610,7 @@ def __init__(self,
             file.close()
 
         else:
-            VariationalPeakModel.__init__(self, *args, **kwargs)
+            PeakModel.__init__(self, *args, **kwargs)
 
     def create_model(self, **kwargs):
         '''

diff --git a/epitome/test/__init__.py b/epitome/test/__init__.py
@@ -43,7 +43,7 @@ def makeSmallModel(self):
 			cells = eligible_cells)
 
 
-		return VLP(dataset,
+		return EpitomeModel(dataset,
 			test_celltypes = ['K562'])
 
 

diff --git a/epitome/test/models_test.py b/epitome/test/models_test.py
@@ -1,7 +1,7 @@
 from epitome.test import EpitomeTestCase
 from epitome.constants import Dataset
 import numpy as np
-from epitome.models import VLP
+from epitome.models import EpitomeModel
 import pytest
 import tempfile
 import pyranges as pr
@@ -66,7 +66,7 @@ def test_specify_assays(self):
 		eligible_targets = ['CTCF', 'RAD21', 'CEBPB']
 		dataset = EpitomeDataset(targets = eligible_targets)
 
-		model = VLP(dataset)
+		model = EpitomeModel(dataset)
 		assert(len(model.dataset.targetmap) == 4)
 
 	def test_model_similarity_assays(self):
@@ -75,7 +75,7 @@ def test_model_similarity_assays(self):
 
 		dataset = EpitomeDataset(targets = eligible_targets, similarity_targets = ['H3K27ac'])
 
-		model = VLP(dataset)
+		model = EpitomeModel(dataset)
 		assert(len(model.dataset.targetmap) == 4)
 
 	def test_model_two_similarity_assays(self):
@@ -84,7 +84,7 @@ def test_model_two_similarity_assays(self):
 
 		dataset = EpitomeDataset(targets = eligible_targets, similarity_targets = ['DNase', 'H3K27ac'])
 
-		model = VLP(dataset)
+		model = EpitomeModel(dataset)
 		assert(len(model.dataset.targetmap) == 5)
 
 	def test_model_similarity_assays(self):
@@ -93,7 +93,7 @@ def test_model_similarity_assays(self):
 
 		dataset = EpitomeDataset(targets = eligible_targets, similarity_targets = ['H3K27ac'])
 
-		model = VLP(dataset)
+		model = EpitomeModel(dataset)
 		assert(len(model.dataset.targetmap) == 4)
 
 	def test_eval_vector(self):
@@ -107,7 +107,7 @@ def test_save_model(self):
 		# should save and re-load model
 		tmp_path = self.tmpFile()
 		self.model.save(tmp_path)
-		loaded_model = VLP(checkpoint=tmp_path)
+		loaded_model = EpitomeModel(checkpoint=tmp_path)
 		results = loaded_model.test(self.validation_size)
 		assert(results['preds'].shape[0] == self.validation_size)
 
@@ -203,7 +203,7 @@ def test_correct_weights(self):
 		# this is where the bug was
 		assert np.where(ds.matrix == -1)[0].shape[0] == 0
 
-		model = VLP(ds)
+		model = EpitomeModel(ds)
 		model.train(1)
 		results = model.test(1000, calculate_metrics = True)
 		assert np.where(results['weights']==0)[0].shape[0] == 0
diff --git a/epitome/viz.py b/epitome/viz.py
@@ -149,7 +149,7 @@ def heatmap_aggreement_from_model_weights(model):
     Plots seaborn heatmap for DNase weights of first layer in network.
     Plots one heatmap for each celltype used in the features for training.
 
-    :param VLP model: an Epitome model
+    :param EpitomeModel model: an Epitome model
     '''
 
     # get weights