Bootstrapping nb #1

Open
wants to merge 2 commits into base: master
Empty file added LibraryTesting/BadVariants.txt
Empty file.
32 changes: 25 additions & 7 deletions libs/IMlibs.py
@@ -12,13 +12,13 @@
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import CPlibs
#import CPlibs

def spawnMatlabJob(matlabFunctionCallString):
try:
#construct the command-line matlab call
functionCallString = "try,"
functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/sarah/array_image_tools_SKD/libs', '/home/sarah/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE
functionCallString = functionCallString + "addpath('{0}', '{1}');".format('/home/namita/Code/array_image_tools_SKD/libs', '/home/namita/Code/array_image_tools_SKD/scripts') #placeholder TEMP DEBUG CHANGE
functionCallString = functionCallString + matlabFunctionCallString + ';'
functionCallString = functionCallString + "catch e,"
functionCallString = functionCallString + "disp(getReport(e,'extended'));"
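
The rest of spawnMatlabJob is collapsed in this diff. For context, a string assembled this way is normally handed to the MATLAB binary's -r flag; a minimal sketch of that invocation (the helper name and flags here are illustrative, not necessarily what the actual function does):

import subprocess

def run_matlab_string(function_call_string):
    # Illustrative only: run the assembled try/catch snippet non-interactively.
    cmd = ['matlab', '-nodesktop', '-nosplash', '-r', function_call_string]
    return subprocess.check_output(cmd)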
@@ -142,7 +142,8 @@ def getFPfromCPsignal(signalNamesByTileDict):
return bindingSeriesFilenameDict

def pasteTogetherSignal(cpSeqFilename, signal, outfilename):
tmpfilename = 'signal_'+str(time.time())
output_directory = os.path.dirname(outfilename)
tmpfilename = os.path.join(output_directory, 'signal_'+str(time.time()))
np.savetxt(tmpfilename, signal, fmt='%s')
os.system("paste %s %s > %s"%(cpSeqFilename, tmpfilename, outfilename+'.tmp'))
os.system("mv %s %s"%(outfilename+'.tmp', outfilename))
@@ -218,19 +219,19 @@ def sortConcatenateCPsignal(reducedSignalNamesByTileDict, barcode_col, sortedAll
return

def compressBarcodes(sortedAllCPsignalFile, barcode_col, seq_col, compressedBarcodeFile):
script = '/home/sarah/array_image_tools_SKD/scripts/compressBarcodes.py'
script = '/home/namita/Code/array_image_tools_SKD/scripts/compressBarcodes.py'
print "python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col)
os.system("python %s -i %s -o %s -c %d -C %d"%(script, sortedAllCPsignalFile, compressedBarcodeFile, barcode_col, seq_col))
return

def barcodeToSequenceMap(compressedBarcodeFile, libraryDesignFile, outFile):
script = '/home/sarah/array_image_tools_SKD/scripts/findSeqDistribution.py'
script = '/home/namita/Code/array_image_tools_SKD/scripts/findSeqDistribution.py'
print "python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile)
os.system("python %s -b %s -l %s -o %s "%(script, compressedBarcodeFile, libraryDesignFile, outFile))
return

def matchCPsignalToLibrary(barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename):
script = '/home/sarah/array_image_tools_SKD/scripts/matchCPsignalLibrary.py'
script = '/home/namita/Code/array_image_tools_SKD/scripts/matchCPsignalLibrary.py'
print "python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename)
os.system("python %s -b %s -i %s -o %s "%(script, barcodeToSequenceFilename, sortedAllCPsignalFile, sequenceToLibraryFilename))
return
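
All three wrappers now hard-code /home/namita/Code/... script paths. One common way to avoid editing them per machine is to resolve the scripts directory once, for example from an environment variable; an illustrative sketch (the SKD_ROOT variable name is made up, and the fallback is the path currently in the diff):

import os
import subprocess

SKD_ROOT = os.environ.get('SKD_ROOT', '/home/namita/Code/array_image_tools_SKD')

def compress_barcodes(sorted_cpsignal_file, barcode_col, seq_col, compressed_barcode_file):
    # Same call as compressBarcodes above, with the script path resolved from SKD_ROOT.
    script = os.path.join(SKD_ROOT, 'scripts', 'compressBarcodes.py')
    cmd = ['python', script, '-i', sorted_cpsignal_file, '-o', compressed_barcode_file,
           '-c', str(barcode_col), '-C', str(seq_col)]
    print(' '.join(cmd))
    subprocess.check_call(cmd)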
@@ -338,9 +339,18 @@ def loadCPseqSignal(filename):
"""
cols = ['tileID','filter','read1_seq','read1_quality','read2_seq','read2_quality','index1_seq','index1_quality','index2_seq', 'index2_quality','all_cluster_signal','binding_series']
table = pd.read_csv(filename, sep='\t', header=None, names=cols, index_col=False)
#count = 0
#
#binding_series = np.zeros((len(table),8))
#for series in table['binding_series']:
# if (len(np.array(series.split(','), dtype=float)) ==8):
# binding_series[count] = np.array(series.split(','), dtype=float)
# count += 1
binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']])

for col in range(binding_series.shape[1]):
table[col] = binding_series[:, col]

return table
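
The new loop copies each point of 'binding_series' into its own numeric column of the table. An equivalent pandas-only expansion, assuming every row has the same number of comma-separated values and pd is imported as in this module:

# Split "v0,v1,...,v7" into one float column per concentration point (labels 0..N-1).
series_cols = table['binding_series'].str.split(',', expand=True).astype(float)
for col in series_cols.columns:
    table[col] = series_cols[col]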

def loadNullScores(signalNamesByTileDict, filterSet):
@@ -361,7 +371,15 @@ def loadBindingCurveFromCPsignal(filename):
then return the binding series, normalized by all cluster image.
"""
table = pd.read_table(filename, usecols=(10,11))
binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']])
count = 0
binding_series = np.zeros((len(table),8))
for series in table['binding_series']:
if (len(np.array(series.split(','), dtype=float)) ==8):
binding_series[count] = np.array(series.split(','), dtype=float)
count += 1

#binding_series = np.array([np.array(series.split(','), dtype=float) for series in table['binding_series']])

return binding_series, np.array(table['all_cluster_signal'])
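
The new loop keeps only series with exactly 8 points and packs them from the top of a zero-initialized array. If one row per cluster is needed so that row i stays aligned with all_cluster_signal[i], a NaN-padding variant is one option (sketch; assumes numpy as np and 8 concentrations):

num_points = 8  # number of concentrations in the binding series

def parse_series(series):
    values = np.array(series.split(','), dtype=float)
    # Malformed series become an all-NaN row instead of being skipped,
    # so the result keeps one row per cluster in the original order.
    return values if len(values) == num_points else np.full(num_points, np.nan)

binding_series = np.vstack([parse_series(s) for s in table['binding_series']])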

def loadFittedCPsignal(filename):
Binary file modified libs/IMlibs.pyc
Binary file not shown.
93 changes: 74 additions & 19 deletions libs/variantFun.py
@@ -10,36 +10,85 @@
import plotfun
import scipy.io as sio
import matplotlib.pyplot as plt
import CPlibs
#import CPlibs
import scikits.bootstrap
import scipy as scipy
from multiprocessing import Pool
import functools

class Parameters():
def __init__(self):

# save the units of concentration given in the binding series
self.RT = 0.582 # kcal/mol at 20 degrees celsius
self.min_deltaG = -12
self.max_deltaG = -3



def perVariantInfo(table):
variants = np.arange(0, np.max(table['variant_number']))
columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin']
variants = range(0,int(np.max(table['variant_number'])))
wd = '/home/namita/RigidAnalysis'
filename = os.path.join(wd, 'BadVariants.txt')
f = open(filename, 'w')

columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ]
newtable = pd.DataFrame(columns=columns, index=np.arange(len(variants)))
for i, variant in enumerate(variants):
if i%1000==0: print 'computing iteration %d'%i
sub_table = table[table['variant_number']==variant]
if len(sub_table) > 0:
sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax', 'fmin']]
newtable.iloc[i]['numTests'] = len(sub_table_filtered)
newtable.iloc[i]['numRejects'] = len(sub_table) - len(sub_table_filtered)
newtable.iloc[i]['kd':] = np.median(sub_table_filtered, 0)
newtable.iloc[i][:'total_length'] = sub_table.iloc[0][:'total_length']
return newtable
pool = Pool(processes=20)
datapervar= pool.map(functools.partial(generateVarStats, table, f), variants)
pool.close()
f.close()
print('here!')
for variant in variants:
#this is really slow, there's got to be a better way of unpacking datapervar
if (variant % 1000) == 0:
print(variant)
newtable.iloc[variant] = datapervar[variant].iloc[0]

return newtable
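
On the "better way of unpacking datapervar" comment: since every worker returns a one-row DataFrame with the same columns, a single concatenation is typically much faster than assigning rows in a loop. A sketch that would replace the unpacking loop above:

# Stack the one-row frames from pool.map in one pass and renumber the index 0..N-1.
newtable = pd.concat(datapervar, ignore_index=True)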

def generateVarStats(table,f,variant):
sub_table = table[table['variant_number']==variant]
#how do you intialize a list?
# print(variant)
#doesn't like 15356
columns = [name for name in table][:12] + ['numTests', 'numRejects', 'kd', 'dG', 'fmax', 'fmin','qvalue', 'errdG_L', 'errdG_R', ]
newtable_pervar = pd.DataFrame(columns=columns, index = xrange(0,1))
if len(sub_table) > 0:
sub_table_filtered = filterFitParameters(sub_table)[['kd', 'dG', 'fmax','qvalue', 'fmin']]
if not(sub_table_filtered.empty):
newtable_pervar.iloc[0]['numTests'] = len(sub_table_filtered)
newtable_pervar.iloc[0]['numRejects'] = len(sub_table) - len(sub_table_filtered)
newtable_pervar.iloc[0]['kd':'qvalue'] = np.median(sub_table_filtered, 0)
newtable_pervar.iloc[0][:'total_length'] = sub_table.iloc[0][:'total_length']
if len(sub_table_filtered) > 1:
#get bootstrapped error bars on mean, would have liked to do median but get errors...
try:
if (variant % 1000) == 0:
print(variant)
bootval = scikits.bootstrap.ci(data=sub_table_filtered['dG'], statfunction=np.median, method='pi')
newtable_pervar.iloc[0]['errdG_L'] = bootval[0]-np.median(sub_table_filtered, 0)[1]
newtable_pervar.iloc[0]['errdG_R'] = bootval[1]-np.median(sub_table_filtered, 0)[1]
except IndexError:
print('value error')
print(variant)
indexbad = np.array(variant)
np.savetxt(filename, indexbad)
#could also use
f.write('%d\t\n'%variant)

#save to text file the name of the variant, could do an np.savetxt,


return newtable_pervar
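
Where scikits.bootstrap.ci raises IndexError, a plain-numpy percentile bootstrap of the median is a possible fallback (illustrative sketch; n_boot and the 95% level are assumptions, not values from this code):

import numpy as np

def bootstrap_median_ci(values, n_boot=1000, alpha=0.05):
    values = np.asarray(values, dtype=float)
    # Resample with replacement and take the median of each resample.
    idx = np.random.randint(0, len(values), size=(n_boot, len(values)))
    medians = np.median(values[idx], axis=1)
    # Percentile interval, analogous to method='pi' above.
    return np.percentile(medians, [100 * alpha / 2, 100 * (1 - alpha / 2)])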


def filterFitParameters(sub_table):
sub_table = sub_table[sub_table['fit_success']==1]
#sub_table = sub_table[sub_table['fit_success']==1]
sub_table = sub_table[sub_table['rsq']>0.5]
sub_table = sub_table[sub_table['fraction_consensus']>=67] # 2/3 majority
#find where the binding series starts
firstconc = int(np.where(sub_table.columns == 0)[0])
numconcs = 8
#filters out anything that has a single NaN concentration
sub_table = sub_table[sub_table.iloc[:,firstconc:firstconc+numconcs].isnull().sum(1) == 0 ]
return sub_table
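
The final filter above is equivalent to dropping any cluster with a NaN among the concentration columns; for reference, the same thing with dropna (assuming the firstconc and numconcs values defined in this function):

conc_cols = list(sub_table.columns[firstconc:firstconc + numconcs])
sub_table = sub_table.dropna(subset=conc_cols)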

def bindingCurve(concentrations, kd, fmax=None, fmin=None):
@@ -262,7 +311,6 @@ def plot_length_changes_helices(table, variant_table, topology, loop=None, recep
# choose just one sequence of that topology to look at
seq1 = variant_table[criteria_central]['junction_sequence'].iloc[0]
criteria_central = np.all((criteria_central, np.array(variant_table['junction_sequence']) == seq1), axis=0)

sub_table = variant_table[criteria_central]
helix_context = np.unique(sub_table['helix_context'])
total_lengths = np.array([8,9,10,11,12])
@@ -286,4 +334,11 @@
ax.set_ylabel('delta delta G')
ax.set_ylim((-1, 5))
plt.tight_layout()
return
return







8 changes: 6 additions & 2 deletions scripts/analyzeVariants.py
@@ -21,6 +21,8 @@
import pandas as pd
import variantFun
import IMlibs
import multiprocessing
import plotfun
parameters = variantFun.Parameters()

#set up command line argument parser
@@ -38,14 +40,16 @@
args = parser.parse_args()

# outdirectory
imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs'
#imageDirectory = 'imageAnalysis/reduced_signals/barcode_mapping/figs'
imageDirectory = '/home/namita/RigidAnalysis/figs'

# load concentrations
xValuesFilename, fluor_dirs_red, fluor_dirs, concentrations = IMlibs.loadMapFile(args.CPfluor_dirs_to_quantify)

# load table
fittedBindingFilename = args.CPfitted
table = IMlibs.loadFittedCPsignal(fittedBindingFilename)
table['dG'] = parameters.RT*np.log(table['kd']*1E-9)
#table['dG'] = parameters.RT*np.log(table['kd']*1E-9)

# get another dict that gives info on a per variant basis
variantFittedFilename = os.path.splitext(fittedBindingFilename)[0]+'.perVariant.fitParameters'
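
For reference, the commented-out line converts a Kd in nM to a free energy via dG = RT * ln(Kd * 1e-9), with RT = 0.582 kcal/mol from variantFun.Parameters. A quick worked check (the 10 nM Kd is just an example value):

import numpy as np

RT = 0.582                        # kcal/mol at ~20 C (variantFun.Parameters)
kd_nM = 10.0                      # example Kd in nM
dG = RT * np.log(kd_nM * 1e-9)    # ln(1e-8) = -18.42, so dG is about -10.7 kcal/mol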
2 changes: 2 additions & 0 deletions scripts/findSeqDistribution.py
@@ -10,6 +10,8 @@
import numpy as np
from optparse import OptionParser
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import sys
import os
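
One caveat on this hunk: with the matplotlib releases in use at the time, matplotlib.use('Agg') generally has to run before pyplot is imported; placed after the pyplot import it tends to warn and keep the already-loaded backend. The usual ordering looks like:

import matplotlib
matplotlib.use('Agg')            # select the non-interactive backend first
import matplotlib.pyplot as plt  # then import pyplot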