Skip to content

Commit 97cb1c1

Browse files
committed
Merge branch 'develop2'
2 parents c877332 + 97975e6 commit 97cb1c1

31 files changed

+5860
-1060
lines changed

bin/IMlibs.py

+138-70
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@
99
import datetime
1010
import scipy.stats as st
1111
import warnings
12+
import collections
1213
import pickle
1314
import seaborn as sns
1415
import itertools
16+
import fileFun
1517

1618
def filenameMatchesAListOfExtensions(filename, extensionList=None):
1719
# from CPlibs
@@ -25,19 +27,31 @@ def findTileFilesInDirectory(dirPath, extensionList, excludedExtensionList=None)
2527
# from CPlibs
2628
dirList = os.listdir(dirPath)
2729

28-
filenameDict = {}
30+
filenameDict = collections.defaultdict(list)
2931
for currFilename in dirList:
30-
if filenameMatchesAListOfExtensions(currFilename,extensionList) and not filenameMatchesAListOfExtensions(currFilename,excludedExtensionList):
32+
if (filenameMatchesAListOfExtensions(currFilename,extensionList) and not
33+
filenameMatchesAListOfExtensions(currFilename,excludedExtensionList)):
3134
currTileNum = getTileNumberFromFilename(currFilename)
3235
if currTileNum != '':
33-
filenameDict[currTileNum] = os.path.join(dirPath,currFilename)
36+
filenameDict[currTileNum].append(os.path.join(dirPath,currFilename))
3437
if len(filenameDict) == 0:
3538
print ' NONE FOUND'
3639
else:
3740
for currTile,currFilename in filenameDict.items():
38-
print ' found tile ' + currTile + ': "' + currFilename + '"'
41+
if len(currFilename) == 1:
42+
print ' found tile ' + currTile + ': "' + currFilename[0] + '"'
43+
filenameDict[currTile] = currFilename[0]
44+
elif len(currFilename) > 1:
45+
filenameDict[currTile] = np.sort(currFilename)
46+
3947
return filenameDict
4048

49+
def addIndexToDir(perTileDict, index=None):
50+
d = collections.defaultdict(list)
51+
for key, values in perTileDict.items():
52+
d[key] = pd.Series(values, index=index)
53+
return d
54+
4155
def getTileNumberFromFilename(inFilename):
4256
# from CPlibs
4357
(path,filename) = os.path.split(inFilename) #split the file into parts
@@ -66,25 +80,23 @@ def loadMapFile(mapFilename):
6680
num_dirs = len(remainder)
6781
directories = [os.path.join(root_dir, line.strip()) for line in remainder]
6882

69-
return red_dir, np.array(directories)
83+
return pd.Series(red_dir), pd.Series(directories, index=np.arange(len(directories))+1)
7084

7185
def makeSignalFileName(directory, fluor_filename):
7286
return os.path.join(directory, '%s.signal'%os.path.splitext(os.path.basename(fluor_filename))[0])
7387

7488
def getFluorFileNames(directories, tileNames):
75-
"""
76-
return dict whose keys are tile numbers and entries are a list of CPfluor files by concentration
77-
"""
78-
filenameDict = {}
79-
filesPerTile = ['']*len(directories)
80-
for tile in tileNames:
81-
filenameDict[tile] = ['']*len(directories)
82-
for i, directory in enumerate(directories):
83-
try:
84-
filename = subprocess.check_output('find %s -maxdepth 1 -name "*CPfluor" -type f | grep tile%s'%(directory, tile), shell=True).strip()
85-
except: filename = ''
86-
filenameDict[tile][i] = filename
87-
return filenameDict
89+
""" Return dict with keys tile numbers and entries list of CPfluor files."""
90+
d = collections.defaultdict(list)
91+
for idx, directory in directories.iteritems():
92+
new_dict = addIndexToDir((findTileFilesInDirectory(directory, ['CPfluor'])), index=[idx])
93+
for tile, filenames in new_dict.items():
94+
d[tile].append(filenames)
95+
# consolidate tiles
96+
newdict = {}
97+
for tile, files in d.items():
98+
newdict[tile] = pd.concat(files)
99+
return newdict
88100

89101
def parseTimeStampFromFilename(CPfluorFilename):
90102
try:
@@ -102,25 +114,22 @@ def getFluorFileNamesOffrates(directories, tileNames):
102114
"""
103115
return dict whose keys are tile numbers and entries are a list of CPfluor files by concentration
104116
"""
105-
filenameDict = {}
106117

107-
for tile in tileNames:
108-
filenameDict[tile] = []
109-
for i, directory in enumerate(directories):
110-
try:
111-
filenames = subprocess.check_output(('find %s -maxdepth 1 -name "*CPfluor" | '+
112-
'grep tile%s | sort')%(directory, tile),
113-
shell=True).split()
114-
for filename in filenames:
115-
filenameDict[tile].append(filename)
116-
except:
117-
pass
118+
filenameDict = collections.defaultdict(list)
119+
for idx, directory in directories.iteritems():
120+
new_dict = addIndexToDir((findTileFilesInDirectory(directory, ['CPfluor'])))
121+
for tile, filenames in new_dict.items():
122+
filenameDict[tile].append(filenames)
123+
124+
# make sure is flat
125+
for tile, filenames in filenameDict.items():
126+
filenameDict[tile] = pd.concat(filenameDict[tile])
118127

119128
# check time stamps
120129
timeStampDict = {}
121130
for tile in tileNames:
122131
timeStampDict[tile] = [parseTimeStampFromFilename(filename)
123-
for filename in filenameDict[tile]]
132+
for idx, filename in filenameDict[tile].iteritems()]
124133
allTimeStamps = []
125134
for times in timeStampDict.values():
126135
for time in times:
@@ -143,6 +152,17 @@ def getCPsignalDictfromCPseqDict(filteredCPseqFilenameDict, signal_directory):
143152
for tiles, cpSeqFilename in filteredCPseqFilenameDict.items():
144153
signalNamesByTileDict[tiles] = getCPsignalFileFromCPseq(cpSeqFilename, signal_directory)
145154
return signalNamesByTileDict
155+
156+
def getCPseriesDictfromCPseqDict(filteredCPseqFilenameDict, signal_directory):
157+
signalNamesByTileDict = {}
158+
for tiles, cpSeqFilename in filteredCPseqFilenameDict.items():
159+
signalNamesByTileDict[tiles] = getCPseriesFileFromCPseq(cpSeqFilename, signal_directory)
160+
return signalNamesByTileDict
161+
162+
def getCPseriesFileFromCPseq(cpSeqFilename, directory=None):
163+
if directory is None:
164+
directory = os.path.dirname(cpSeqFilename)
165+
return os.path.join(directory, os.path.splitext(os.path.basename(cpSeqFilename))[0] + '.CPseries.pkl')
146166

147167
def getCPsignalFileFromCPseq(cpSeqFilename, directory=None):
148168
if directory is None:
@@ -162,19 +182,29 @@ def getReducedCPsignalDict(signalNamesByTileDict, filterSet = None, directory =
162182
reducedSignalNamesByTileDict[tiles] = os.path.join(directory, os.path.splitext('__'+os.path.basename(cpSignalFilename))[0] + '_%s.CPsignal'%filterSet)
163183
return reducedSignalNamesByTileDict
164184

165-
def getReducedCPsignalFilename(reducedSignalNamesByTileDict):
166-
filename = reducedSignalNamesByTileDict.values()[0]
167-
dirname = os.path.dirname(filename)
168-
basename = os.path.basename(filename).lstrip('_')
185+
def getReducedCPsignalFilename(signalFilesByTile, directory, suffix=None):
186+
if suffix is None:
187+
postScript = '_reduced.CPseries.pkl'
188+
else:
189+
postScript = '_reduced_%s.CPseries.pkl'%suffix
190+
startFile = os.path.basename(signalFilesByTile.values()[0])
191+
noTileFile = startFile[:startFile.find('tile')] + startFile[startFile.find('tile')+8:]
192+
return os.path.join(directory, os.path.splitext(noTileFile[:noTileFile.find('.pkl')])[0] + postScript)
193+
194+
def getTileOutputFilename(signalFilesByTile, directory, suffix=None):
195+
if suffix is None:
196+
postScript = '_reduced.CPtiles.pkl'
197+
else:
198+
postScript = '_reduced_%s.CPtiles.pkl'%suffix
199+
startFile = os.path.basename(signalFilesByTile.values()[0])
200+
noTileFile = startFile[:startFile.find('tile')] + startFile[startFile.find('tile')+8:]
201+
return os.path.join(directory, os.path.splitext(noTileFile[:noTileFile.find('.pkl')])[0] + postScript)
169202

170-
removeIndex = basename.find('tile')
171-
newBasename = basename[:removeIndex] + basename[removeIndex+8:]
172-
return os.path.join(dirname, newBasename)
173203

174204
def getSignalFromCPFluor(CPfluorfilename):
175-
fitResults = pd.read_csv(CPfluorfilename, usecols=range(7, 12), sep=':', header=None, names=['success', 'amplitude', 'sigma', 'fit_X', 'fit_Y'] )
176-
signal = np.array(2*np.pi*fitResults['amplitude']*fitResults['sigma']*fitResults['sigma'])
177-
signal[np.array(fitResults['success']==0)] = np.nan
205+
fitResults = fileFun.loadFile(CPfluorfilename)
206+
signal = 2*np.pi*fitResults.amplitude*fitResults.sigma*fitResults.sigma
207+
signal.loc[~fitResults.success.astype(bool)] = np.nan
178208
return signal
179209

180210
def getBSfromCPsignal(signalNamesByTileDict):
@@ -189,6 +219,7 @@ def getFPfromCPsignal(signalNamesByTileDict):
189219
bindingSeriesFilenameDict[tiles] = os.path.splitext(cpSignalFilename)[0] + '.fit_parameters'
190220
return bindingSeriesFilenameDict
191221

222+
192223
def findCPsignalFile(cpSeqFilename, redFluors, greenFluors, cpSignalFilename, tile=None):
193224

194225
# find signal in red
@@ -231,6 +262,27 @@ def findCPsignalFile(cpSeqFilename, redFluors, greenFluors, cpSignalFilename, ti
231262
np.savetxt(cpSignalFilename, np.transpose(np.vstack((cp_seq, signal_comma_format))), fmt='%s', delimiter='\t')
232263
return signal_green
233264

265+
def makeCPseriesFile(cpseriesfilename, fluorFiles):
266+
signals = []
267+
signal_file_exists = [False]*len(fluorFiles)
268+
269+
# find signal for each fluor file
270+
for i, (idx, filename) in enumerate(fluorFiles.iteritems()):
271+
if os.path.exists(filename):
272+
signal_file_exists[i] = True
273+
signals.append(pd.Series(getSignalFromCPFluor(filename), name=idx))
274+
275+
# check any were made
276+
if not np.any(signal_file_exists):
277+
print 'Error: no CPfluor files found! Are directories in map file correct?'
278+
return
279+
280+
# save
281+
signal = pd.concat(signals, axis=1)
282+
signal.to_pickle(cpseriesfilename)
283+
return
284+
285+
234286
def reduceCPsignalFile(cpSignalFilename, reducedCPsignalFilename, filterPos=None):
235287

236288
if filterPos is None:
@@ -249,6 +301,50 @@ def reduceCPsignalFile(cpSignalFilename, reducedCPsignalFilename, filterPos=None
249301
os.system(to_run)
250302
return
251303

304+
def makeIndexFileNoGrep(filteredCPseqFilenameDict, filterPos, outputFile):
305+
# takes about 61 seconds to run on ~2 million indices
306+
awkFilterText = ' || '.join(['(a[i]==\"%s\")'%s for s in filterPos])
307+
call = ("cat %s | "
308+
"awk '{n=split($2, a,\":\"); b=0; for (i=1; i<=n; i++) if (%s) b=1; "
309+
"if (b==1) print $1}' > %s")%(' '.join(filteredCPseqFilenameDict.values()),
310+
awkFilterText, outputFile)
311+
print call
312+
os.system(call)
313+
return
314+
315+
def makeIndexFile(filteredCPseqFilenameDict, filterPos, outputFile):
316+
# takes about 35 seconds to run on ~2 million indices
317+
call = "cat %s | grep -w '%s' | cut -f1 > %s"%(' '.join(filteredCPseqFilenameDict.values()),
318+
'\|'.join(filterPos),
319+
outputFile)
320+
print call
321+
os.system(call)
322+
return
323+
324+
325+
def reduceCPseriesFiles(outputFiles, reducedOutputFile, indices=None, tileOutputFile=None):
326+
# load all files in dict outputFiles
327+
allTiles = [fileFun.loadFile(filename) for filename in outputFiles.values()]
328+
a = pd.concat(allTiles)
329+
a = a.groupby(level=0).first()
330+
331+
if indices is None:
332+
a.to_pickle(reducedOutputFile)
333+
else:
334+
a.loc[indices].to_pickle(reducedOutputFile)
335+
336+
# find tile dict if tile output file is given
337+
if tileOutputFile is not None:
338+
tiles = pd.concat([pd.Series(index=s.index, data=tile)
339+
for s, tile in itertools.izip(allTiles, outputFiles.keys())])
340+
tiles = tiles.groupby(level=0).first()
341+
if indices is None:
342+
tiles.to_pickle(tileOutputFile)
343+
else:
344+
tiles.loc[indices].to_pickle(tileOutputFile)
345+
346+
return
347+
252348
def saveTimeDeltaDict(filename, timeDeltaDict):
253349
with open(filename, "wb") as f:
254350
pickle.dump(timeDeltaDict, f, protocol=pickle.HIGHEST_PROTOCOL)
@@ -463,35 +559,7 @@ def loadBindingCurveFromCPsignal(filename, concentrations=None, subset=None, ind
463559
usecols=[index_col, 'all_cluster_signal'])
464560
return binding_series, all_cluster_signal.all_cluster_signal
465561

466-
def boundFluorescence(signal, plot=None):
467-
# take i.e. all cluster signal and bound it
468-
if plot is None: plot=False
469-
470-
signal = signal.copy()
471-
472-
# check if at least one element of signal is not nan
473-
if np.isfinite(signal).sum() > 0:
474-
lowerbound = np.percentile(signal.dropna(), 1)
475-
upperbound = signal.median() + 5*signal.std()
476-
477-
if plot:
478-
binwidth = (upperbound - lowerbound)/50.
479-
plt.figure(figsize=(4,3))
480-
sns.distplot(signal.dropna(), bins = np.arange(signal.min(), signal.max()+binwidth, binwidth), color='seagreen')
481-
ax = plt.gca()
482-
ax.tick_params(right='off', top='off')
483-
ylim = ax.get_ylim()
484-
plt.plot([lowerbound]*2, ylim, 'k:')
485-
plt.plot([upperbound]*2, ylim, 'k:')
486-
plt.xlim(0, upperbound + 2*signal.std())
487-
plt.tight_layout()
488-
signal.loc[signal < lowerbound] = lowerbound
489-
signal.loc[signal > upperbound] = upperbound
490-
491-
else:
492-
#if they are all nan, set to 1 for division
493-
signal.loc[:] = 1
494-
return signal
562+
495563

496564
def loadFittedCPsignal(fittedBindingFilename, annotatedClusterFile=None,
497565
bindingCurveFilename=None, pickled=None):

0 commit comments

Comments
 (0)