Speed up generator code (#34)
* updates to speed up generator code
akmorrow13 authored Sep 3, 2020
1 parent 2dfde40 commit 75d9675
Showing 7 changed files with 181 additions and 117 deletions.
18 changes: 14 additions & 4 deletions docs/installation/source.rst
@@ -9,10 +9,20 @@ Requirements

* `conda <https://docs.conda.io/en/latest/miniconda.html>`__
* python 3.7
* `tensorflow 2.3.0 <https://www.tensorflow.org/install/source>`__


Installation
------------
Installing Tensorflow
---------------------

To run Epitome as efficiently as possible, install
`tensorflow from source <https://www.tensorflow.org/install/source>`__.
If tensorflow is not installed before you install Epitome, Epitome will
pull in tensorflow from pip, and that generic build will not be optimized
for your hardware.
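
Before installing Epitome, you can check what you already have. A quick
sanity check (a hypothetical snippet, not part of the Epitome docs):

    import tensorflow as tf
    print(tf.__version__)                     # Epitome expects 2.3.0
    print(tf.config.list_physical_devices())  # confirm your hardware is visible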


Installation from Pip
---------------------

1. Create and activate a python 3.7 conda environment:

@@ -27,8 +37,8 @@ Installation
pip install epitome
From Source
-----------
Installation from Source
------------------------

1. Create and activate a python 3.7 conda environment:

33 changes: 33 additions & 0 deletions epitome/functions.py
@@ -701,7 +701,40 @@ def concatenate_all_data(data, region_file):
data[Dataset.TRAIN][:,chr6_end:]],axis=1) # all the rest of the chromosomes


def get_radius_indices(radii, r, i, max_index):
'''
Gets indices for a given radius r in both directions from center index i.
Used in generator code to get indices in data for a given radius around
genomic locus i.
Args:
:param radii: increasing list of integers indicating radii
:param r: index into radii for the current radius
:param i: center index to access data
:param max_index: max index which can be accessed
Returns:
exclusive indices for this radius
'''
radius = radii[r]

min_radius = max(0, i - radius)
max_radius = min(i+radius+1, max_index)

# do not featurize chromatin regions
# that were considered in smaller radii
if (r != 0):

radius_range_1 = np.arange(min_radius, max(0, i - radii[r-1]+1))
radius_range_2 = np.arange(i+radii[r-1], max_radius)

radius_range = np.concatenate([radius_range_1, radius_range_2])
else:

radius_range = np.arange(min_radius, max_radius)

return radius_range
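
A quick usage sketch (hypothetical values; for r > 0, positions strictly
inside the next-smaller radius are excluded so they are not featurized twice):

    import numpy as np
    from epitome.functions import get_radius_indices

    radii = [2, 5]
    # r = 0: full window around locus i = 10
    get_radius_indices(radii, 0, 10, 100)  # array([ 8,  9, 10, 11, 12])
    # r = 1: wider window, minus positions strictly inside the radii[0] window
    get_radius_indices(radii, 1, 10, 100)  # array([ 5,  6,  7,  8, 12, 13, 14, 15])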

def order_by_similarity(matrix, cellmap, assaymap, cell, data, compare_assay = 'DNase'):
"""
136 changes: 62 additions & 74 deletions epitome/generators.py
@@ -62,7 +62,7 @@ def load_data(data,
similarity_assays = [similarity_assays]
assert('DNase' in similarity_assays)

# get indices for features.rows are cells and cols are assays
# get indices for features. rows are cells and cols are assays
cellmap_idx = [cellmap[c] for c in list(eval_cell_types)]
feature_cell_indices = matrix[cellmap_idx,:]

@@ -141,11 +141,8 @@ def load_data(data,

def g():
for i in indices: # for all records specified
feature_names = []

for (cell) in label_cell_types: # for all cell types to be used in labels
similarities_double_positive = np.empty([len(eval_cell_types),0])
similarities_agreement = np.empty([len(eval_cell_types),0])

# labels for this cell
if (mode != Dataset.RUNTIME):
@@ -171,78 +168,54 @@ def g():
# for cell types that are going to be features
similarity_indices = feature_cell_indices[:, delete_indices]

similarity_labels_agreement = []
similarity_labels_dp = []

for r, radius in enumerate(radii):
# get indices for each radius in radii
radius_ranges = list(map(lambda x: get_radius_indices(radii, x, i, data.shape[-1]), range(len(radii))))

min_radius = max(0, i - radius + 1)
max_radius = min(i+radius, data.shape[1])
if len(radius_ranges) > 0:
radius_indices = np.concatenate(radius_ranges)

# do not featurize chromatin regions
# that were considered in smaller radii
if (r != 0):
radius_range_1 = np.arange(min_radius, max(0, i - radii[r-1]+1))
radius_range_2 = np.arange(i+radii[r-1], max_radius)
cell_train_data = data[similarity_indices[:,:,None],radius_indices]

radius_range = np.concatenate([radius_range_1, radius_range_2])
else:

radius_range = np.arange(min_radius, max_radius)


####################################################################
cell_train_data = data[similarity_indices[:,:,None],radius_range]

# use similarity matrix, if it is provided
if (mode == Dataset.RUNTIME):

# within the radius, fraction of places where they are both 1
similarity_double_positive = np.average(cell_train_data*
similarity_matrix[:,radius_range], axis=-1)
if mode == Dataset.RUNTIME:

# within the radius, fraction of places where they are both equal (0 or 1)
similarity_agreement = np.average(cell_train_data==
similarity_matrix[:,radius_range], axis=-1)
pos = cell_train_data*similarity_matrix[:,radius_indices]
agree = cell_train_data == similarity_matrix[:,radius_indices]

else:
cell_label_data = data[label_cell_indices[delete_indices][:,None],radius_range]

similarity_double_positive = np.average(cell_train_data*
cell_label_data, axis=-1)

# within the radius, fraction of places where they are both equal (0 or 1)
similarity_agreement = np.average(cell_train_data ==
cell_label_data, axis=-1)

similarity_labels_agreement.append('r%i_%s' % (radius, 'agree'))
similarity_labels_dp.append('r%i_%s' % (radius, 'dp'))

similarities_double_positive = np.concatenate([similarities_double_positive,similarity_double_positive],axis=1)
similarities_agreement = np.concatenate([similarities_agreement,similarity_agreement],axis=1)

# rehape agreement assay similarity to Radii by feature_cells
similarities = np.concatenate([similarities_agreement, similarities_double_positive], axis=1)
similarity_labels = np.concatenate([similarity_labels_agreement, similarity_labels_dp])

final = []
for j,c in enumerate(eval_cell_types):
# get indices for this cell that has data
present_indices = feature_cell_indices[j,:]
present_indices = present_indices[present_indices!=-1]

cell_features = data[present_indices,i]
cell_similarities = similarities[j,:]
concat = np.concatenate([cell_features, cell_similarities])
final.append(concat)
cell_label_data = data[label_cell_indices[delete_indices][:,None],radius_indices]

# remove middle dimension and flatten similarity assays
pos = (cell_train_data*cell_label_data)
agree = (cell_train_data == cell_label_data)

# get indices to split on; drop the last cumsum entry, which equals the
# total length and would only produce an empty trailing array
split_indices = np.cumsum([len(r) for r in radius_ranges])[:-1]
# slice arrays by radii
pos_arrays = np.split(pos, split_indices, axis=-1)
agree_arrays = np.split(agree, split_indices, axis=-1)

similarities = np.stack(list(map(lambda x: np.average(x, axis=-1), pos_arrays + agree_arrays)), axis=1)
else:
# no radius, so no similarities. just an empty placeholder
similarities = np.zeros((len(eval_cell_types),0,0))

# concatenate together feature names
tmp = np.array(feature_assays)[feature_cell_indices[j,:] != -1]
al = ['%s_%s' % (c, a) for a in tmp]
sl = ['%s_%s' % (c, s) for s in similarity_labels]
# reshape similarities to flatten the last two dimensions (radius/metric by assay)
# resulting ordering per row (cell): pos for each assay for each radius,
# then agree for each assay for each radius
similarities = similarities.reshape(similarities.shape[0], similarities.shape[1]*similarities.shape[2])

feature_names.append(np.concatenate([al, sl]))
##### Concatenate all cell type features together ####
final_features = np.concatenate([data[feature_cell_indices,i], similarities],axis=1).flatten()

# mask missing data
f_mask = np.concatenate([feature_cell_indices!=-1,
np.ones(similarities.shape)],axis=1).flatten()
final_features = final_features[f_mask != 0]

if (mode != Dataset.RUNTIME):
labels = data[label_cell_indices_no_similarities,i]
@@ -252,22 +225,37 @@ def g():
labels = garbage_labels # all 0's

# append labels and assaymask
final.append(labels.astype(np.float32))
feature_names.append(['lbl_%s_%s' % (cell, a) for a in label_assays]) # of form lbl_cellline_target

final.append(assay_mask.astype(np.float32))
feature_names.append(['mask_%s_%s' % (cell, a) for a in label_assays]) # of form mask_cellline_target
final = tuple([final_features, labels.astype(np.float32), assay_mask.astype(np.float32)])

#### Finish appending feature labels together ####
if (return_feature_names):
yield (tuple(final), tuple(feature_names))
all_labels = []
feature_names = []
similarity_labels_agreement = ['r%i_%s' % (radius, 'agree') for radius in radii]
similarity_labels_dp = ['r%i_%s' % (radius, 'dp') for radius in radii]
similarity_labels = np.concatenate([similarity_labels_agreement, similarity_labels_dp])

# concatenate together feature names
for j,c in enumerate(eval_cell_types):
tmp = np.array(feature_assays)[feature_cell_indices[j,:] != -1]
al = ['%s_%s' % (c, a) for a in tmp]
sl = ['%s_%s' % (c, s) for s in similarity_labels]

feature_names.append(al)
feature_names.append(sl)

all_labels.append(np.concatenate(feature_names))
all_labels.append(['lbl_%s_%s' % (cell, a) for a in label_assays]) # of form lbl_cellline_target
all_labels.append(['mask_%s_%s' % (cell, a) for a in label_assays]) # of form mask_cellline_target

yield (final, tuple(all_labels))
else:
yield tuple(final)
yield final


return g
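
The speedup in g() comes from replacing the per-radius Python loop with one
fancy-indexing gather over all radii, followed by a single np.split. A minimal
sketch of that pattern with toy shapes (names and numbers here are
illustrative, not the actual Epitome data):

    import numpy as np

    data = np.random.randint(0, 2, (6, 50))          # (tracks, genomic positions)
    similarity_indices = np.array([[0, 1], [2, 3]])  # (feature cells, similarity assays)
    radius_ranges = [np.arange(8, 13), np.arange(5, 8)]
    radius_indices = np.concatenate(radius_ranges)

    # one gather instead of a loop over radii: shape (cells, assays, positions)
    cell_train_data = data[similarity_indices[:, :, None], radius_indices]

    # split back into per-radius chunks and average along the position axis
    split_indices = np.cumsum([len(r) for r in radius_ranges])[:-1]
    chunks = np.split(cell_train_data, split_indices, axis=-1)
    per_radius = np.stack([c.mean(axis=-1) for c in chunks], axis=1)  # (cells, radii, assays)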



def generator_to_tf_dataset(g, batch_size, shuffle_size, prefetch_size):
"""
Generates a tensorflow dataset from a data generator.
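The body of this helper is truncated above. For reference, a sketch of the
standard tf.data pattern such a helper presumably wraps (an assumption, not
the actual implementation):

    import tensorflow as tf

    def generator_to_tf_dataset(g, batch_size, shuffle_size, prefetch_size):
        ds = tf.data.Dataset.from_generator(
            g, output_types=(tf.float32, tf.float32, tf.float32))
        return ds.shuffle(shuffle_size).batch(batch_size).prefetch(prefetch_size)
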
32 changes: 16 additions & 16 deletions epitome/models.py
@@ -29,12 +29,6 @@
import pickle
from operator import itemgetter

# TODO RM!
# this is required because tensorflow is running in eager mode
# but keras weights are not, which throws an error
# should be fixed by not running in eager mode
tf.config.experimental_run_functions_eagerly(True)

#######################################################################
#################### Variational Peak Model ###########################
#######################################################################
@@ -299,19 +293,25 @@ def train_step(f):

return elbo_loss, neg_log_likelihood, kl_loss

for step, f in enumerate(self.train_iter.take(num_steps)):
loss = train_step(f)
@tf.function
def loopiter():
for step, f in enumerate(self.train_iter):
loss = train_step(f)

if step % 100 == 0:
tf.compat.v1.logging.info(str(step) + " " + str(tf.reduce_mean(loss[0])) +
str(tf.reduce_mean(loss[1])) +
str(tf.reduce_mean(loss[2])))

if step % 1000 == 0:
if (self.debug):
tf.compat.v1.logging.info("On validation")
_, _, _, _, _ = self.test(40000, log=False)
tf.compat.v1.logging.info("")

tf.compat.v1.logging.info(str(step) + " " + str(tf.reduce_mean(loss[0])) +
str(tf.reduce_mean(loss[1])) +
str(tf.reduce_mean(loss[2])))
if step > num_steps:
break

if (self.debug):
tf.compat.v1.logging.info("On validation")
_, _, _, _, _ = self.test(40000, log=False)
tf.compat.v1.logging.info("")
loopiter()
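
Wrapping the whole loop in @tf.function lets TensorFlow trace it into a graph
once instead of dispatching ops eagerly on every step, which is what makes
removing experimental_run_functions_eagerly(True) above pay off. A
self-contained sketch of the same pattern with a toy model (all names and
shapes hypothetical):

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    opt = tf.keras.optimizers.Adam()
    ds = tf.data.Dataset.from_tensor_slices(
        (tf.random.normal((256, 8)), tf.random.normal((256, 1)))).batch(32).repeat()
    model(tf.zeros((1, 8)))  # build weights eagerly, before tracing

    def train_step(x, y):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean((model(x) - y) ** 2)
        grads = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))
        return loss

    @tf.function  # traced once; the dataset loop compiles to graph ops
    def loopiter(num_steps):
        for step, (x, y) in ds.enumerate():
            loss = train_step(x, y)
            if step % 100 == 0:
                tf.print(step, loss)
            if step >= num_steps:
                break

    loopiter(tf.constant(500, dtype=tf.int64))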

def test(self, num_samples, mode = Dataset.VALID, calculate_metrics=False):
"""
41 changes: 29 additions & 12 deletions epitome/test/generators_test.py
@@ -44,23 +44,40 @@ def test_generator_sparse_data(self):

eligible_cells = ['K562','HepG2','H1','A549','HeLa-S3']
eligible_assays = ['DNase','CTCF','RAD21','LARP7']
matrix, cellmap, assaymap = self.getFeatureData(eligible_assays, eligible_cells)
label_cell_types = ['K562']
matrix, cellmap, assaymap = get_assays_from_feature_file(
eligible_assays = eligible_assays,
eligible_cells = eligible_cells, min_cells_per_assay = 1, min_assays_per_cell = 1)

label_cell_types = ['HepG2']
eligible_cells.remove(label_cell_types[0])

results = load_data(self.data[Dataset.TRAIN],
['K562'],
results = list(load_data(self.data[Dataset.TRAIN],
label_cell_types,
eligible_cells,
matrix,
assaymap,
cellmap,
radii = [],
mode = Dataset.VALID,
indices=np.arange(0,10))()
li_results = list(results)
return_feature_names=True,
indices=np.arange(0,10))())

# get first features
features = results[0][0]

# get labels
labels = results[0][1]

# length should be shorter for first cell because missing LARP7
assert(len(li_results[0][0]) == len(eligible_assays)-1)
# all cell types but K562 are missing LARP7 data
assert(len(features[0]) == len(eligible_cells) * len(eligible_assays) - 3)

# make sure mask is masking out LARP7 for HepG2
assert(np.all(features[-1] == [1., 0., 1.]))

# make sure label names reference the held-out cell HepG2, not a feature cell
assert(labels[-2][0] == 'lbl_HepG2_RAD21')
assert(labels[-2][1] == 'lbl_HepG2_LARP7')
assert(labels[-2][2] == 'lbl_HepG2_CTCF')

def test_generator_radius(self):
eligible_cells = ['K562','HepG2','H1','A549','HeLa-S3']
@@ -83,10 +100,9 @@ def test_generator_radius(self):
li_results = list(results)

# length should include eligible assays and 2* radius for pos and agreement
assert(len(li_results[0][0]) == len(eligible_assays)+len(radii)* 2)

assert(len(li_results[0][0]) == len(eligible_cells) * (len(eligible_assays)+len(radii)* 2))

def test_generator_runtime(self):
def test_generator_multiple_sim(self):
eligible_cells = ['K562','HepG2','H1','A549','HeLa-S3']
eligible_assays = ['DNase','CTCF','RAD21']
matrix, cellmap, assaymap = self.getFeatureData(eligible_assays, eligible_cells)
@@ -111,7 +127,8 @@ def test_generator_runtime(self):
li_results = list(results)

# length should include eligible assays and 2* radius for pos and agreement
assert(len(li_results[0][0]) == len(eligible_assays)+len(radii)* 4)
# for each of the 2 similarity assays
assert(len(li_results[0][0]) == len(eligible_cells) * (len(eligible_assays)+len(radii)* 4))
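
The expected length works out to
n_feature_cells * (n_assays + len(radii) * 2 * n_similarity_assays), where the
factor 2 counts the pos and agree metrics. A worked example with hypothetical
counts:

    # hypothetical: 4 feature cells (5 eligible minus 1 label cell), 3 assays,
    # radii = [1, 3, 10], 2 similarity assays, 2 metrics (pos, agree) per radius
    expected = 4 * (3 + 3 * 2 * 2)  # = 4 * 15 = 60 features per example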


def test_generator_dnase_array(self):
2 changes: 1 addition & 1 deletion requirements.txt
@@ -7,7 +7,7 @@ sklearn
pytabix==0.1
sphinx==2.1.1 # documentation
sphinx_rtd_theme==0.4.3
tensorflow-probability==0.9.0
tensorflow-probability==0.11.0
tqdm
pyranges>=0.0.84
h5sparse==0.0.5
