Bootstrapping nb #1

Open
wants to merge 2 commits into base: master
Empty file added LibraryTesting/BadVariants.txt
Empty file.
32 changes: 25 additions & 7 deletions libs/IMlibs.py
@@ -12,13 +12,13 @@
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import CPlibs
#import CPlibs

def spawnMatlabJob(matlabFunctionCallString):
try:
#construct the command-line matlab call
functionCallString = "try,"
functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/sarah/array_image_tools_SKD/libs', '/home/sarah/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE
functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/namita/Code/array_image_tools_SKD/libs', '/home/namita/Code/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE
functionCallString = functionCallString + matlabFunctionCallString + ';'
functionCallString = functionCallString + "catch e,"
functionCallString = functionCallString + "disp(getReport(e,'extended'));"
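
The rest of spawnMatlabJob is collapsed in this diff. For context, a string assembled this way is normally handed to the MATLAB binary's -r flag; a minimal sketch of that invocation (the helper name and flags here are illustrative, not necessarily what the actual function does):

import subprocess

def run_matlab_string(function_call_string):
    # Illustrative only: run the assembled try/catch snippet non-interactively.
    cmd = ['matlab', '-nodesktop', '-nosplash', '-r', function_call_string]
    return subprocess.check_output(cmd)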
@@ -142,7 +142,8 @@ def getFPfromCPsignal(signalNamesByTileDict):
return bindingSeriesFilenameDict

def pasteTogetherSignal(cpSeqFilename, signal, outfilename):
tmpfilename = 'signal_'+str(time.time())
output_directory = os.path.dirname(outfilename)
tmpfilename = os.path.join(output_directory, 'signal_'+str(time.time()))
np.savetxt(tmpfilename, signal, fmt='%s')
os.system("paste %s %s > %s"%(cpSeqFilename, tmpfilename, outfilename+'.tmp'))
os.system("mv %s %s"%(outfilename+'.tmp', outfilename))
@@ -218,19 +219,19 @@ def sortConcatenateCPsignal(reducedSignalNamesByTileDict, barcode_col, sortedAll
return

def compressBarcodes(sortedAllCPsignalFile, barcode_col, seq_col, compressedBarcodeFile):
script = '/home/sarah/array_image_tools_SKD/scripts/compressBarcodes.py'
script = '/home/namita/Code/array_image_tools_SKD/scripts/compressBarcodes.py'
print "python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col)
os.system("python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col))
return

def barcodeToSequenceMap(compressedBarcodeFile, libraryDesignFile, outFile):
script = '/home/sarah/array_image_tools_SKD/scripts/findSeqDistribution.py'
script = '/home/namita/Code/array_image_tools_SKD/scripts/findSeqDistribution.py'
print "python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile)
os.system("python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile))
return

def matchCPsignalToLibrary(barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename):
script = '/home/sarah/array_image_tools_SKD/scripts/matchCPsignalLibrary.py'
script = '/home/namita/Code/array_image_tools_SKD/scripts/matchCPsignalLibrary.py'
print "python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename)
os.system("python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename))
return
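
All three wrappers now hard-code /home/namita/Code/... script paths. One common way to avoid editing them per machine is to resolve the scripts directory once, for example from an environment variable; an illustrative sketch (the SKD_ROOT variable name is made up, and the fallback is the path currently in the diff):

import os
import subprocess

SKD_ROOT = os.environ.get('SKD_ROOT', '/home/namita/Code/array_image_tools_SKD')

def compress_barcodes(sorted_cpsignal_file, barcode_col, seq_col, compressed_barcode_file):
    # Same call as compressBarcodes above, with the script path resolved from SKD_ROOT.
    script = os.path.join(SKD_ROOT, 'scripts', 'compressBarcodes.py')
    cmd = ['python', script, '-i', sorted_cpsignal_file, '-o', compressed_barcode_file,
           '-c', str(barcode_col), '-C', str(seq_col)]
    print(' '.join(cmd))
    subprocess.check_call(cmd)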
@@ -338,9 +339,18 @@ def loadCPseqSignal(filename):
"""
cols = ['tileID','filter','read1_seq','read1_quality','read2_seq','read2_quality','index1_seq','index1_quality','index2_seq', 'index2_quality','all_cluster_signal','binding_series']
table = pd.read_csv(filename, sep='\t', header=None, names=cols, index_col=False)
#count = 0
#
#binding_series = np.zeros((len(table),8))
#for series in table['binding_series']:
# if (len(np.array(series.split(','), dtype=float)) ==8):
# binding_series[count] = np.array(series.split(','), dtype=float)
# count += 1
binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']])

for col in range(binding_series.shape[1]):
table[col] = binding_series[:, col]

return table
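
The new loop copies each point of 'binding_series' into its own numeric column of the table. An equivalent pandas-only expansion, assuming every row has the same number of comma-separated values and pd is imported as in this module:

# Split "v0,v1,...,v7" into one float column per concentration point (labels 0..N-1).
series_cols = table['binding_series'].str.split(',', expand=True).astype(float)
for col in series_cols.columns:
    table[col] = series_cols[col]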

def loadNullScores(signalNamesByTileDict, filterSet):
@@ -361,7 +371,15 @@ def loadBindingCurveFromCPsignal(filename):
then return the binding series, normalized by all cluster image.
"""
table = pd.read_table(filename, usecols=(10,11))
binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']])
count = 0
binding_series = np.zeros((len(table),8))
for series in table['binding_series']:
if (len(np.array(series.split(','), dtype=float)) ==8):
binding_series[count] = np.array(series.split(','), dtype=float)
count += 1

#binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']])

return binding_series, np.array(table['all_cluster_signal'])
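
The new loop keeps only series with exactly 8 points and packs them from the top of a zero-initialized array. If one row per cluster is needed so that row i stays aligned with all_cluster_signal[i], a NaN-padding variant is one option (sketch; assumes numpy as np and 8 concentrations):

num_points = 8  # number of concentrations in the binding series

def parse_series(series):
    values = np.array(series.split(','), dtype=float)
    # Malformed series become an all-NaN row instead of being skipped,
    # so the result keeps one row per cluster in the original order.
    return values if len(values) == num_points else np.full(num_points, np.nan)

binding_series = np.vstack([parse_series(s) for s in table['binding_series']])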

def loadFittedCPsignal(filename):
Binary file modified libs/IMlibs.pyc
Binary file not shown.
93 changes: 74 additions & 19 deletions libs/variantFun.py
@@ -10,36 +10,85 @@
import plotfun
import scipy.io as sio
import matplotlib.pyplot as plt
import CPlibs
#import CPlibs
import scikits.bootstrap
import scipy as scipy
from multiprocessing import Pool
import functools

class Parameters():
def __init__(self):

# save the units of concentration given in the binding series
self.RT = 0.582 # kcal/mol at 20 degrees celsius
self.min_deltaG = -12
self.max_deltaG = -3



def perVariantInfo(table):
variants = np.arange(0, np.max(table['variant_number']))
columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin']
variants = range(0,int(np.max(table['variant_number'])))
wd = '/home/namita/RigidAnalysis'
filename = os.path.join(wd, 'BadVariants.txt')
f = open(filename, 'w')

columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ]
newtable = pd.DataFrame(columns=columns, index=np.arange(len(variants)))
for i, variant in enumerate(variants):
if i%1000==0: print 'computing iteration %d'%i
sub_table = table[table['variant_number']==variant]
if len(sub_table) > 0:
sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax', 'fmin']]
newtable.iloc[i]['numTests'] = len(sub_table_filtered)
newtable.iloc[i]['numRejects'] = len(sub_table) - len(sub_table_filtered)
newtable.iloc[i]['kd':] = np.median(sub_table_filtered, 0)
newtable.iloc[i][:'total_length'] = sub_table.iloc[0][:'total_length']
return newtable
pool = Pool(processes=20)
datapervar= pool.map(functools.partial(generateVarStats, table, f), variants)
pool.close()
f.close()
print('here!')
for variant in variants:
#this is really slow, there's got to be a better way of unpacking datapervar
if (variant % 1000) == 0:
print(variant)
newtable.iloc[variant] = datapervar[variant].iloc[0]

return newtable
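
On the "better way of unpacking datapervar" comment: since every worker returns a one-row DataFrame with the same columns, a single concatenation is typically much faster than assigning rows in a loop. A sketch that would replace the unpacking loop above:

# Stack the one-row frames from pool.map in one pass and renumber the index 0..N-1.
newtable = pd.concat(datapervar, ignore_index=True)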

def generateVarStats(table,f,variant):
sub_table = table[table['variant_number']==variant]
#how do you intialize a list?
# print(variant)
#doesn't like 15356
columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ]
newtable_pervar = pd.DataFrame(columns=columns, index = xrange(0,1))
if len(sub_table) > 0:
sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax','qvalue', 'fmin']]
if not(sub_table_filtered.empty):
newtable_pervar.iloc[0]['numTests'] = len(sub_table_filtered)
newtable_pervar.iloc[0]['numRejects'] = len(sub_table) - len(sub_table_filtered)
newtable_pervar.iloc[0]['kd':'qvalue'] = np.median(sub_table_filtered, 0)
newtable_pervar.iloc[0][:'total_length'] = sub_table.iloc[0][:'total_length']
if len(sub_table_filtered) > 1:
#get bootstrapped error bars on mean, would have liked to do median but get errors...
try:
if (variant % 1000) == 0:
print(variant)
bootval = scikits.bootstrap.ci(data=sub_table_filtered['dG'], statfunction=np.median, method='pi')
newtable_pervar.iloc[0]['errdG_L'] = bootval[0]-np.median(sub_table_filtered, 0)[1]
newtable_pervar.iloc[0]['errdG_R'] = bootval[1]-np.median(sub_table_filtered, 0)[1]
except IndexError:
print('value error')
print(variant)
indexbad = np.array(variant)
np.savetxt(filename, indexbad)
#could also use
f.write('%d\t\n'%variant)

#save to text file the name of the variant, could do an np.savetxt,


return newtable_pervar
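
Where scikits.bootstrap.ci raises IndexError, a plain-numpy percentile bootstrap of the median is a possible fallback (illustrative sketch; n_boot and the 95% level are assumptions, not values from this code):

import numpy as np

def bootstrap_median_ci(values, n_boot=1000, alpha=0.05):
    values = np.asarray(values, dtype=float)
    # Resample with replacement and take the median of each resample.
    idx = np.random.randint(0, len(values), size=(n_boot, len(values)))
    medians = np.median(values[idx], axis=1)
    # Percentile interval, analogous to method='pi' above.
    return np.percentile(medians, [100 * alpha / 2, 100 * (1 - alpha / 2)])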


def filterFitParameters(sub_table):
sub_table = sub_table[sub_table['fit_success']==1]
#sub_table = sub_table[sub_table['fit_success']==1]
sub_table = sub_table[sub_table['rsq']>0.5]
sub_table = sub_table[sub_table['fraction_consensus']>=67] # 2/3 majority
#find where the binding series starts
firstconc = int(np.where(sub_table.columns == 0)[0])
numconcs = 8
#filters out anything that has a single NaN concentration
sub_table = sub_table[sub_table.iloc[:,firstconc:firstconc+numconcs].isnull().sum(1) == 0 ]
return sub_table
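
The final filter above is equivalent to dropping any cluster with a NaN among the concentration columns; for reference, the same thing with dropna (assuming the firstconc and numconcs values defined in this function):

conc_cols = list(sub_table.columns[firstconc:firstconc + numconcs])
sub_table = sub_table.dropna(subset=conc_cols)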

def bindingCurve(concentrations, kd, fmax=None, fmin=None):
@@ -262,7 +311,6 @@ def plot_length_changes_helices(table, variant_table, topology, loop=None, recep
# choose just one sequence of that topology to look at
seq1 = variant_table[criteria_central]['junction_sequence'].iloc[0]
criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence']) == seq1), axis=0)

sub_table = variant_table[criteria_central]
helix_context = np.unique(sub_table['helix_context'])
total_lengths = np.array([8,9,10,11,12])
@@ -286,4 +334,11 @@
ax.set_ylabel('delta delta G')
ax.set_ylim((-1, 5))
plt.tight_layout()
return
return







8 changes: 6 additions & 2 deletions scripts/analyzeVariants.py
@@ -21,6 +21,8 @@
import pandas as pd
import variantFun
import IMlibs
import multiprocessing
import plotfun
parameters = variantFun.Parameters()

#set up command line argument parser
@@ -38,14 +40,16 @@
args = parser.parse_args()

# outdirectory
imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs'
#imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs'
imageDirectory = '/home/namita/RigidAnalysis/figs'

# load concentrations
xValuesFilename, fluor_dirs_red, fluor_dirs, concentrations = IMlibs.loadMapFile(args.CPfluor_dirs_to_quantify)

# load table
fittedBindingFilename = args.CPfitted
table = IMlibs.loadFittedCPsignal(fittedBindingFilename)
table['dG'] = parameters.RT*np.log(table['kd']*1E-9)
#table['dG'] = parameters.RT*np.log(table['kd']*1E-9)

# get another dict that gives info on a per variant basis
variantFittedFilename = os.path.splitext(fittedBindingFilename)[0]+'.perVariant.fitParameters'
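
For reference, the commented-out line converts a Kd in nM to a free energy via dG = RT * ln(Kd * 1e-9), with RT = 0.582 kcal/mol from variantFun.Parameters. A quick worked check (the 10 nM Kd is just an example value):

import numpy as np

RT = 0.582                        # kcal/mol at ~20 C (variantFun.Parameters)
kd_nM = 10.0                      # example Kd in nM
dG = RT * np.log(kd_nM * 1e-9)    # ln(1e-8) = -18.42, so dG is about -10.7 kcal/mol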
2 changes: 2 additions & 0 deletions scripts/findSeqDistribution.py
@@ -10,6 +10,8 @@
import numpy as np
from optparse import OptionParser
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import sys
import os
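
One caveat on this hunk: with the matplotlib releases in use at the time, matplotlib.use('Agg') generally has to run before pyplot is imported; placed after the pyplot import it tends to warn and keep the already-loaded backend. The usual ordering looks like:

import matplotlib
matplotlib.use('Agg')            # select the non-interactive backend first
import matplotlib.pyplot as plt  # then import pyplot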