9
9
import datetime
10
10
import scipy .stats as st
11
11
import warnings
12
+ import collections
12
13
import pickle
13
14
import seaborn as sns
14
15
import itertools
16
+ import fileFun
15
17
16
18
def filenameMatchesAListOfExtensions (filename , extensionList = None ):
17
19
# from CPlibs
@@ -25,19 +27,31 @@ def findTileFilesInDirectory(dirPath, extensionList, excludedExtensionList=None)
25
27
# from CPlibs
26
28
dirList = os .listdir (dirPath )
27
29
28
- filenameDict = {}
30
+ filenameDict = collections . defaultdict ( list )
29
31
for currFilename in dirList :
30
- if filenameMatchesAListOfExtensions (currFilename ,extensionList ) and not filenameMatchesAListOfExtensions (currFilename ,excludedExtensionList ):
32
+ if (filenameMatchesAListOfExtensions (currFilename ,extensionList ) and not
33
+ filenameMatchesAListOfExtensions (currFilename ,excludedExtensionList )):
31
34
currTileNum = getTileNumberFromFilename (currFilename )
32
35
if currTileNum != '' :
33
- filenameDict [currTileNum ] = os .path .join (dirPath ,currFilename )
36
+ filenameDict [currTileNum ]. append ( os .path .join (dirPath ,currFilename ) )
34
37
if len (filenameDict ) == 0 :
35
38
print ' NONE FOUND'
36
39
else :
37
40
for currTile ,currFilename in filenameDict .items ():
38
- print ' found tile ' + currTile + ': "' + currFilename + '"'
41
+ if len (currFilename ) == 1 :
42
+ print ' found tile ' + currTile + ': "' + currFilename [0 ] + '"'
43
+ filenameDict [currTile ] = currFilename [0 ]
44
+ elif len (currFilename ) > 1 :
45
+ filenameDict [currTile ] = np .sort (currFilename )
46
+
39
47
return filenameDict
40
48
49
+ def addIndexToDir (perTileDict , index = None ):
50
+ d = collections .defaultdict (list )
51
+ for key , values in perTileDict .items ():
52
+ d [key ] = pd .Series (values , index = index )
53
+ return d
54
+
41
55
def getTileNumberFromFilename (inFilename ):
42
56
# from CPlibs
43
57
(path ,filename ) = os .path .split (inFilename ) #split the file into parts
@@ -66,25 +80,23 @@ def loadMapFile(mapFilename):
66
80
num_dirs = len (remainder )
67
81
directories = [os .path .join (root_dir , line .strip ()) for line in remainder ]
68
82
69
- return red_dir , np .array ( directories )
83
+ return pd . Series ( red_dir ), pd . Series ( directories , index = np .arange ( len ( directories )) + 1 )
70
84
71
85
def makeSignalFileName (directory , fluor_filename ):
72
86
return os .path .join (directory , '%s.signal' % os .path .splitext (os .path .basename (fluor_filename ))[0 ])
73
87
74
88
def getFluorFileNames (directories , tileNames ):
75
- """
76
- return dict whose keys are tile numbers and entries are a list of CPfluor files by concentration
77
- """
78
- filenameDict = {}
79
- filesPerTile = ['' ]* len (directories )
80
- for tile in tileNames :
81
- filenameDict [tile ] = ['' ]* len (directories )
82
- for i , directory in enumerate (directories ):
83
- try :
84
- filename = subprocess .check_output ('find %s -maxdepth 1 -name "*CPfluor" -type f | grep tile%s' % (directory , tile ), shell = True ).strip ()
85
- except : filename = ''
86
- filenameDict [tile ][i ] = filename
87
- return filenameDict
89
+ """ Return dict with keys tile numbers and entries list of CPfluor files."""
90
+ d = collections .defaultdict (list )
91
+ for idx , directory in directories .iteritems ():
92
+ new_dict = addIndexToDir ((findTileFilesInDirectory (directory , ['CPfluor' ])), index = [idx ])
93
+ for tile , filenames in new_dict .items ():
94
+ d [tile ].append (filenames )
95
+ # consolidate tiles
96
+ newdict = {}
97
+ for tile , files in d .items ():
98
+ newdict [tile ] = pd .concat (files )
99
+ return newdict
88
100
89
101
def parseTimeStampFromFilename (CPfluorFilename ):
90
102
try :
@@ -102,25 +114,22 @@ def getFluorFileNamesOffrates(directories, tileNames):
102
114
"""
103
115
return dict whose keys are tile numbers and entries are a list of CPfluor files by concentration
104
116
"""
105
- filenameDict = {}
106
117
107
- for tile in tileNames :
108
- filenameDict [tile ] = []
109
- for i , directory in enumerate (directories ):
110
- try :
111
- filenames = subprocess .check_output (('find %s -maxdepth 1 -name "*CPfluor" | ' +
112
- 'grep tile%s | sort' )% (directory , tile ),
113
- shell = True ).split ()
114
- for filename in filenames :
115
- filenameDict [tile ].append (filename )
116
- except :
117
- pass
118
+ filenameDict = collections .defaultdict (list )
119
+ for idx , directory in directories .iteritems ():
120
+ new_dict = addIndexToDir ((findTileFilesInDirectory (directory , ['CPfluor' ])))
121
+ for tile , filenames in new_dict .items ():
122
+ filenameDict [tile ].append (filenames )
123
+
124
+ # make sure is flat
125
+ for tile , filenames in filenameDict .items ():
126
+ filenameDict [tile ] = pd .concat (filenameDict [tile ])
118
127
119
128
# check time stamps
120
129
timeStampDict = {}
121
130
for tile in tileNames :
122
131
timeStampDict [tile ] = [parseTimeStampFromFilename (filename )
123
- for filename in filenameDict [tile ]]
132
+ for idx , filename in filenameDict [tile ]. iteritems () ]
124
133
allTimeStamps = []
125
134
for times in timeStampDict .values ():
126
135
for time in times :
@@ -143,6 +152,17 @@ def getCPsignalDictfromCPseqDict(filteredCPseqFilenameDict, signal_directory):
143
152
for tiles , cpSeqFilename in filteredCPseqFilenameDict .items ():
144
153
signalNamesByTileDict [tiles ] = getCPsignalFileFromCPseq (cpSeqFilename , signal_directory )
145
154
return signalNamesByTileDict
155
+
156
+ def getCPseriesDictfromCPseqDict (filteredCPseqFilenameDict , signal_directory ):
157
+ signalNamesByTileDict = {}
158
+ for tiles , cpSeqFilename in filteredCPseqFilenameDict .items ():
159
+ signalNamesByTileDict [tiles ] = getCPseriesFileFromCPseq (cpSeqFilename , signal_directory )
160
+ return signalNamesByTileDict
161
+
162
+ def getCPseriesFileFromCPseq (cpSeqFilename , directory = None ):
163
+ if directory is None :
164
+ directory = os .path .dirname (cpSeqFilename )
165
+ return os .path .join (directory , os .path .splitext (os .path .basename (cpSeqFilename ))[0 ] + '.CPseries.pkl' )
146
166
147
167
def getCPsignalFileFromCPseq (cpSeqFilename , directory = None ):
148
168
if directory is None :
@@ -162,19 +182,29 @@ def getReducedCPsignalDict(signalNamesByTileDict, filterSet = None, directory =
162
182
reducedSignalNamesByTileDict [tiles ] = os .path .join (directory , os .path .splitext ('__' + os .path .basename (cpSignalFilename ))[0 ] + '_%s.CPsignal' % filterSet )
163
183
return reducedSignalNamesByTileDict
164
184
165
- def getReducedCPsignalFilename (reducedSignalNamesByTileDict ):
166
- filename = reducedSignalNamesByTileDict .values ()[0 ]
167
- dirname = os .path .dirname (filename )
168
- basename = os .path .basename (filename ).lstrip ('_' )
185
+ def getReducedCPsignalFilename (signalFilesByTile , directory , suffix = None ):
186
+ if suffix is None :
187
+ postScript = '_reduced.CPseries.pkl'
188
+ else :
189
+ postScript = '_reduced_%s.CPseries.pkl' % suffix
190
+ startFile = os .path .basename (signalFilesByTile .values ()[0 ])
191
+ noTileFile = startFile [:startFile .find ('tile' )] + startFile [startFile .find ('tile' )+ 8 :]
192
+ return os .path .join (directory , os .path .splitext (noTileFile [:noTileFile .find ('.pkl' )])[0 ] + postScript )
193
+
194
+ def getTileOutputFilename (signalFilesByTile , directory , suffix = None ):
195
+ if suffix is None :
196
+ postScript = '_reduced.CPtiles.pkl'
197
+ else :
198
+ postScript = '_reduced_%s.CPtiles.pkl' % suffix
199
+ startFile = os .path .basename (signalFilesByTile .values ()[0 ])
200
+ noTileFile = startFile [:startFile .find ('tile' )] + startFile [startFile .find ('tile' )+ 8 :]
201
+ return os .path .join (directory , os .path .splitext (noTileFile [:noTileFile .find ('.pkl' )])[0 ] + postScript )
169
202
170
- removeIndex = basename .find ('tile' )
171
- newBasename = basename [:removeIndex ] + basename [removeIndex + 8 :]
172
- return os .path .join (dirname , newBasename )
173
203
174
204
def getSignalFromCPFluor (CPfluorfilename ):
175
- fitResults = pd . read_csv (CPfluorfilename , usecols = range ( 7 , 12 ), sep = ':' , header = None , names = [ 'success' , 'amplitude' , 'sigma' , 'fit_X' , 'fit_Y' ] )
176
- signal = np . array ( 2 * np .pi * fitResults [ ' amplitude' ] * fitResults [ ' sigma' ] * fitResults [ ' sigma' ])
177
- signal [ np . array ( fitResults [ ' success' ] == 0 )] = np .nan
205
+ fitResults = fileFun . loadFile (CPfluorfilename )
206
+ signal = 2 * np .pi * fitResults . amplitude * fitResults . sigma * fitResults . sigma
207
+ signal . loc [ ~ fitResults . success . astype ( bool )] = np .nan
178
208
return signal
179
209
180
210
def getBSfromCPsignal (signalNamesByTileDict ):
@@ -189,6 +219,7 @@ def getFPfromCPsignal(signalNamesByTileDict):
189
219
bindingSeriesFilenameDict [tiles ] = os .path .splitext (cpSignalFilename )[0 ] + '.fit_parameters'
190
220
return bindingSeriesFilenameDict
191
221
222
+
192
223
def findCPsignalFile (cpSeqFilename , redFluors , greenFluors , cpSignalFilename , tile = None ):
193
224
194
225
# find signal in red
@@ -231,6 +262,27 @@ def findCPsignalFile(cpSeqFilename, redFluors, greenFluors, cpSignalFilename, ti
231
262
np .savetxt (cpSignalFilename , np .transpose (np .vstack ((cp_seq , signal_comma_format ))), fmt = '%s' , delimiter = '\t ' )
232
263
return signal_green
233
264
265
+ def makeCPseriesFile (cpseriesfilename , fluorFiles ):
266
+ signals = []
267
+ signal_file_exists = [False ]* len (fluorFiles )
268
+
269
+ # find signal for each fluor file
270
+ for i , (idx , filename ) in enumerate (fluorFiles .iteritems ()):
271
+ if os .path .exists (filename ):
272
+ signal_file_exists [i ] = True
273
+ signals .append (pd .Series (getSignalFromCPFluor (filename ), name = idx ))
274
+
275
+ # check any were made
276
+ if not np .any (signal_file_exists ):
277
+ print 'Error: no CPfluor files found! Are directories in map file correct?'
278
+ return
279
+
280
+ # save
281
+ signal = pd .concat (signals , axis = 1 )
282
+ signal .to_pickle (cpseriesfilename )
283
+ return
284
+
285
+
234
286
def reduceCPsignalFile (cpSignalFilename , reducedCPsignalFilename , filterPos = None ):
235
287
236
288
if filterPos is None :
@@ -249,6 +301,50 @@ def reduceCPsignalFile(cpSignalFilename, reducedCPsignalFilename, filterPos=None
249
301
os .system (to_run )
250
302
return
251
303
304
+ def makeIndexFileNoGrep (filteredCPseqFilenameDict , filterPos , outputFile ):
305
+ # takes about 61 seconds to run on ~2 million indices
306
+ awkFilterText = ' || ' .join (['(a[i]==\" %s\" )' % s for s in filterPos ])
307
+ call = ("cat %s | "
308
+ "awk '{n=split($2, a,\" :\" ); b=0; for (i=1; i<=n; i++) if (%s) b=1; "
309
+ "if (b==1) print $1}' > %s" )% (' ' .join (filteredCPseqFilenameDict .values ()),
310
+ awkFilterText , outputFile )
311
+ print call
312
+ os .system (call )
313
+ return
314
+
315
+ def makeIndexFile (filteredCPseqFilenameDict , filterPos , outputFile ):
316
+ # takes about 35 seconds to run on ~2 million indices
317
+ call = "cat %s | grep -w '%s' | cut -f1 > %s" % (' ' .join (filteredCPseqFilenameDict .values ()),
318
+ '\|' .join (filterPos ),
319
+ outputFile )
320
+ print call
321
+ os .system (call )
322
+ return
323
+
324
+
325
+ def reduceCPseriesFiles (outputFiles , reducedOutputFile , indices = None , tileOutputFile = None ):
326
+ # load all files in dict outputFiles
327
+ allTiles = [fileFun .loadFile (filename ) for filename in outputFiles .values ()]
328
+ a = pd .concat (allTiles )
329
+ a = a .groupby (level = 0 ).first ()
330
+
331
+ if indices is None :
332
+ a .to_pickle (reducedOutputFile )
333
+ else :
334
+ a .loc [indices ].to_pickle (reducedOutputFile )
335
+
336
+ # find tile dict if tile output file is given
337
+ if tileOutputFile is not None :
338
+ tiles = pd .concat ([pd .Series (index = s .index , data = tile )
339
+ for s , tile in itertools .izip (allTiles , outputFiles .keys ())])
340
+ tiles = tiles .groupby (level = 0 ).first ()
341
+ if indices is None :
342
+ tiles .to_pickle (tileOutputFile )
343
+ else :
344
+ tiles .loc [indices ].to_pickle (tileOutputFile )
345
+
346
+ return
347
+
252
348
def saveTimeDeltaDict (filename , timeDeltaDict ):
253
349
with open (filename , "wb" ) as f :
254
350
pickle .dump (timeDeltaDict , f , protocol = pickle .HIGHEST_PROTOCOL )
@@ -463,35 +559,7 @@ def loadBindingCurveFromCPsignal(filename, concentrations=None, subset=None, ind
463
559
usecols = [index_col , 'all_cluster_signal' ])
464
560
return binding_series , all_cluster_signal .all_cluster_signal
465
561
466
- def boundFluorescence (signal , plot = None ):
467
- # take i.e. all cluster signal and bound it
468
- if plot is None : plot = False
469
-
470
- signal = signal .copy ()
471
-
472
- # check if at least one element of signal is not nan
473
- if np .isfinite (signal ).sum () > 0 :
474
- lowerbound = np .percentile (signal .dropna (), 1 )
475
- upperbound = signal .median () + 5 * signal .std ()
476
-
477
- if plot :
478
- binwidth = (upperbound - lowerbound )/ 50.
479
- plt .figure (figsize = (4 ,3 ))
480
- sns .distplot (signal .dropna (), bins = np .arange (signal .min (), signal .max ()+ binwidth , binwidth ), color = 'seagreen' )
481
- ax = plt .gca ()
482
- ax .tick_params (right = 'off' , top = 'off' )
483
- ylim = ax .get_ylim ()
484
- plt .plot ([lowerbound ]* 2 , ylim , 'k:' )
485
- plt .plot ([upperbound ]* 2 , ylim , 'k:' )
486
- plt .xlim (0 , upperbound + 2 * signal .std ())
487
- plt .tight_layout ()
488
- signal .loc [signal < lowerbound ] = lowerbound
489
- signal .loc [signal > upperbound ] = upperbound
490
-
491
- else :
492
- #if they are all nan, set to 1 for division
493
- signal .loc [:] = 1
494
- return signal
562
+
495
563
496
564
def loadFittedCPsignal (fittedBindingFilename , annotatedClusterFile = None ,
497
565
bindingCurveFilename = None , pickled = None ):
0 commit comments