venkatesh-sivaraman
diff --git a/Diff for: ‎.DS_Store
0 Bytes b/Diff for: ‎.DS_Store
0 Bytes
diff --git a/Diff for: ‎PythonProteins.xcodeproj/project.pbxproj
+2 b/Diff for: ‎PythonProteins.xcodeproj/project.pbxproj
+2
diff --git a/Diff for: ‎central_distributions.py
+256 b/Diff for: ‎central_distributions.py
+256
diff --git a/Diff for: ‎decoys.py
+6-5 b/Diff for: ‎decoys.py
+6-5
diff --git a/Diff for: ‎folding.py
+18-8 b/Diff for: ‎folding.py
+18-8
@@ -32,6 +32,7 @@
 		96B6AF041C8A4A100004B68F /* loading_indicator.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = loading_indicator.py; sourceTree = "<group>"; };
 		96CA6AB11CA3537D0036E642 /* score.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = score.py; sourceTree = "<group>"; };
 		96CCF6951C9AF9BF00FA2A7C /* gensparc.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = gensparc.py; sourceTree = "<group>"; };
+		96D12B411CB5F11A0092B0F2 /* central_distributions.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = central_distributions.py; sourceTree = "<group>"; };
 		96D2926C1C5B0B5F002E842F /* .gitignore */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = .gitignore; sourceTree = "<group>"; };
 		96D2926D1C5B124C002E842F /* secondary_structure.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = secondary_structure.py; sourceTree = "<group>"; };
 		96DBD89A1BE161C500CA76A1 /* sparc_distribution.py */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.python; path = sparc_distribution.py; sourceTree = "<group>"; };
@@ -48,6 +49,7 @@
 				963ED0201A55A9F7005F0079 /* probsource.py */,
 				9652F91A1A3CC30500A37E22 /* distributions.py */,
 				96DBD89A1BE161C500CA76A1 /* sparc_distribution.py */,
+				96D12B411CB5F11A0092B0F2 /* central_distributions.py */,
 				96B6AF041C8A4A100004B68F /* loading_indicator.py */,
 				96E3E17C1C6D8B2B00A22AF3 /* reference_state.py */,
 				961253651A7EC28400231E37 /* permissions.py */,
 
@@ -0,0 +1,256 @@
+"""This module contains the SPARCCentralDistributionManager class, which can manage all orientation-based terms of SPARC."""
+
+from distributions import *
+from secondary_structure import *
+from memory_profiler import profile
+
+sparc_consecutive_mode = 'consec'  # score(): consec=1, skips pairs that share a secondary structure
+sparc_short_range_mode = 'short_range'  # score(): consec=0, keeps only pairs with a tag difference of at most 5
+sparc_long_range_mode = 'long_range'  # score(): consec=0, keeps only pairs with a tag difference above 5
+sparc_secondary_mode = 'secondary'  # score(): consec=1, keeps only pairs that share a secondary structure
+sparc_consec_secondary_mode = 'consec_secondary'  # score(): consec=1, no secondary structure filter
+sparc_default_mode = 'default'  # score(): no mode-specific pair filtering
+
+class SPARCCentralDistributionManager(FrequencyDistributionManager):
+	
+	def __init__(self, frequencies_path, references=None):
+		"""Build a manager from the frequency files under frequencies_path, a directory of alpha zones paired with frequencies for individual amino acid pairs.
+		If references is truthy it is handed to load_references() before the frequencies themselves are loaded."""
+		def blank_table():
+			# One 42-slot counter list for every ordered amino acid pair.
+			return [[[0 for n in xrange(42)] for i in xrange(AMINO_ACID_COUNT)] for j in xrange(AMINO_ACID_COUNT)]
+
+		self.alpha_frequencies = {}
+		# Stays None unless load_references() runs below.
+		self.reference_frequencies = None
+		self.reference_totals = blank_table()
+		self.total_interactions = blank_table()
+		self.median_frequencies = blank_table()
+		self.total_median = 0
+		self.identifier = os.path.basename(frequencies_path)
+		if references:
+			self.load_references(references)
+		self.load_frequencies(frequencies_path)
+		self.weight, self.defaultvalue, self.refstate = 1.0, 0, True
+
+	def __repr__(self):
+		"""Return a short debugging label built from the manager's identifier."""
+		label = self.identifier
+		return "<Distribution Manager for '{}' data>".format(label)
+	
+	def alpha_frequency(self, aa, aa2, sec_name):
+		"""This helper function retrieves the frequency of the orientation between aa and aa2 in the loaded frequency data. sec_name is the string type for the secondary structure shared by both amino acids, if any.
+		Returns a tuple (alpha_freq, reference_freq). Either value is 0 when nothing is recorded for the zone and separation, and reference_freq is also 0 when no references have been loaded."""
+		zone = aa.tolocal(aa2.acarbon).floor()
+		# Tag differences of 6 or more all share the last separation bucket.
+		sep_bucket = int(min(math.fabs(aa.tag - aa2.tag), 6) - 1)
+		
+		alpha_freq = 0
+		reference_freq = 0
+		if zone in self.alpha_frequencies:
+			# Out-of-range residue codes fall back to 0, matching subscore(); the per-pair tables only hold AMINO_ACID_COUNT entries per axis.
+			code1 = aacode(aa.type)
+			code2 = aacode(aa2.type)
+			if code1 >= AMINO_ACID_COUNT: code1 = 0
+			if code2 >= AMINO_ACID_COUNT: code2 = 0
+			data_dict = self.alpha_frequencies[zone][code1][code2]
+			# Each separation bucket spans 7 slots, one per secondary structure label; unlisted labels use slot 0.
+			structures = [None, "helix1", "helix5", "helix7", "sheet0", "sheet1", "sheet-1"]
+			separation = sep_bucket * 7
+			if sec_name and sec_name in structures:
+				separation += structures.index(sec_name)
+			if separation in data_dict:
+				alpha_freq = float(data_dict[separation])
+		# reference_frequencies is None until load_references() has run; treat that as "no reference data" instead of failing on the membership test.
+		if self.reference_frequencies and zone in self.reference_frequencies:
+			data_dict = self.reference_frequencies[zone][sec_name]
+			if sep_bucket in data_dict:
+				reference_freq = float(data_dict[sep_bucket])
+		return (alpha_freq, reference_freq)
+
+	def subscore(self, protein, aa, aa2, onlyone=False, zero_value=0.01):
+		"""Score one amino acid pair from its observed and reference orientation frequencies.
+		Frequencies equal to 0 are replaced with zero_value before use. With onlyone set, the two frequencies are printed and returned (multiplied by self.weight) together with the total_interactions entries for both orderings of the pair. Otherwise the result is the sum of two negative log ratios of normalized observed frequency to normalized reference frequency.
+		Returns None when self.refstate is false."""
+		code_a = aacode(aa.type)
+		code_b = aacode(aa2.type)
+		if code_a >= AMINO_ACID_COUNT: code_a = 0
+		if code_b >= AMINO_ACID_COUNT: code_b = 0
+		struct_a = protein.secondary_structure_aa(aa.tag)
+		struct_b = protein.secondary_structure_aa(aa2.tag)
+		sec_name = "default"
+		if struct_a and struct_b and struct_a[1].start == struct_b[1].start:
+			sec_name = struct_a[0].type + str(struct_a[1].identifiers[0])
+		
+		# Slot layout: 7 secondary structure slots inside each sequence separation bucket; unlisted labels use slot 0.
+		sep_bucket = int(min(math.fabs(aa.tag - aa2.tag), 6) - 1)
+		structures = [None, "helix1", "helix5", "helix7", "sheet0", "sheet1", "sheet-1"]
+		slot = sep_bucket * 7
+		if sec_name in structures:
+			slot += structures.index(sec_name)
+		
+		if not self.refstate:
+			# The legacy median-based formula that used to live here is disabled: it called alpha_frequency() with (tag, tag, zone) arguments, which do not match its current signature.
+			return None
+		
+		forward, forward_ref = self.alpha_frequency(aa2, aa, sec_name)
+		if forward == 0: forward = zero_value
+		if forward_ref == 0: forward_ref = zero_value
+		backward, backward_ref = self.alpha_frequency(aa, aa2, sec_name)
+		if backward == 0: backward = zero_value
+		if backward_ref == 0: backward_ref = zero_value
+		if onlyone:
+			print forward, backward
+			return (forward * self.weight, backward * self.weight, self.total_interactions[code_a][code_b], self.total_interactions[code_b][code_a])
+		# A zero total raises ZeroDivisionError here; score() catches it.
+		forward_term = -math.log((forward / self.total_interactions[code_a][code_b][slot]) / (forward_ref / self.reference_totals[sec_name][sep_bucket]))
+		return forward_term - math.log((backward / self.total_interactions[code_b][code_a][slot]) / (backward_ref / self.reference_totals[sec_name][sep_bucket]))
+
+	def score(self, protein, data, system=None, isolate=False, onlyone=False, prior=2, zero_value=0.01, mode='default'):
+		"""For frequency distributions, pass in an array of hypothetical aminoacids. Each qualifying pair is scored once with subscore(); the subscores are summed and the sum is returned multiplied by self.weight. If isolate=True, only the amino acids in data will be considered for the energy calculation.
+			Pass prior to consider ONLY the amino acid before (True) or after (False) each amino acid in data. This works best for consecutive modes.
+			mode is one of the sparc_*_mode constants and selects which pairs qualify. If onlyone is set, the subscore() result for the first qualifying pair is returned as-is. A pair whose subscore raises ZeroDivisionError counts as 0.0."""
+		# (consec, use_secondary, use_short_range) for each mode. A value of 2 in the last two positions disables that filter; consec is forwarded to nearby_aa().
+		mode_settings = {
+			sparc_consecutive_mode: (1, 0, 2),
+			sparc_secondary_mode: (1, 1, 2),
+			sparc_consec_secondary_mode: (1, 2, 2),
+			sparc_short_range_mode: (0, 2, 1),
+			sparc_long_range_mode: (0, 2, 0),
+		}
+		consec, use_secondary, use_short_range = mode_settings.get(mode, (2, 2, 2))
+
+		# Index the hypothetical residues by tag once (first occurrence wins) instead of rescanning data for every pair.
+		hypothetical = {}
+		for candidate in data:
+			if candidate and candidate.tag not in hypothetical:
+				hypothetical[candidate.tag] = candidate
+		# Pairs already scored, stored as (tag, tag) in the order they were visited.
+		scored_pairs = set()
+		score = 0.0
+		
+		for aa in data:
+			if not aa: continue
+			if prior != 2:
+				nearby = []
+				if aa.tag > 0 and prior != False: nearby.append(protein.aminoacids[aa.tag - 1])
+				if aa.tag < len(protein.aminoacids) - 1 and prior != True: nearby.append(protein.aminoacids[aa.tag + 1])
+			elif system and not consec and use_short_range != 0:
+				nearby = system.nearby_aa(aa, protein, 10.0, consec=consec)
+			else:
+				nearby = protein.nearby_aa(aa, 10.0, consec=consec)
+			for aa2 in nearby:
+				if not aa2: continue
+				sec_struct = protein.secondary_structure_aa(aa.tag)
+				sec_struct_2 = protein.secondary_structure_aa(aa2.tag)
+				seq_gap = math.fabs(aa2.tag - aa.tag)
+				if aa2.tag - aa.tag == 1 and aa.has_break: continue
+				elif seq_gap > 5 and use_short_range == 1: continue
+				elif seq_gap <= 5 and use_short_range == 0: continue
+				elif use_secondary == 0 and sec_struct and sec_struct_2 and sec_struct[1].start == sec_struct_2[1].start: continue
+				elif use_secondary == 1 and not (sec_struct and sec_struct_2 and sec_struct[1].start == sec_struct_2[1].start): continue
+				# Score every unordered pair only once.
+				if (aa.tag, aa2.tag) in scored_pairs or (aa2.tag, aa.tag) in scored_pairs:
+					continue
+				# Prefer the hypothetical version of the neighbor when data supplies one.
+				if aa2.tag in hypothetical: aa2 = hypothetical[aa2.tag]
+				elif isolate: continue
+				
+				try:
+					subscore = self.subscore(protein, aa, aa2, onlyone, zero_value)
+				except ZeroDivisionError:
+					subscore = 0.0
+				
+				if onlyone: return subscore
+				else: score += subscore
+				
+				scored_pairs.add((aa.tag, aa2.tag))
+		return score * self.weight
+					
+	def read_frequency_line(self, line, tag1, tag2):
+		"""Parse one "x,y,z;f0,f1,..." line for the pair (tag1, tag2). Subclasses can override this to change how load_frequencies reads its files.
+		Every nonzero frequency is stored under its position in the list and added to the matching total_interactions slot."""
+		coords, counts = line.strip().split(";")
+		zone = Point3D(*coords.split(","))
+		if zone not in self.alpha_frequencies:
+			self.alpha_frequencies[zone] = [[{} for i in xrange(AMINO_ACID_COUNT)] for k in xrange(AMINO_ACID_COUNT)]
+		# Empty fields are dropped before numbering, so positions count only the non-empty entries.
+		for sep, text in enumerate(text for text in counts.split(",") if text):
+			count = int(text)
+			if count == 0:
+				continue
+			self.alpha_frequencies[zone][tag1][tag2][sep] = count
+			self.total_interactions[tag1][tag2][sep] += count
+	
+	def load_frequencies(self, path):
+		"""Load the per-pair frequency files found in the directory at path.
+		Only files named "<tag1>-<tag2>.txt" are read; hidden files and files without the .txt suffix are skipped. Lines containing ";" go to read_frequency_line(); any other non-blank line is parsed as a comma-separated list of floats and stored in median_frequencies for the pair.
+		total_interactions is reset before loading. total_median is not recomputed here."""
+		files = os.listdir(path)
+		self.total_interactions = [[[0 for k in xrange(42)] for i in xrange(AMINO_ACID_COUNT)] for j in xrange(AMINO_ACID_COUNT)]
+		loading_indicator.add_loading_data(len(files))
+		for indfile in files:
+			loading_indicator.update_progress(1)
+			# Require a real .txt suffix: the name slicing below assumes the last four characters are the extension.
+			if not indfile.endswith(".txt") or indfile.startswith("."): continue
+			tag1, tag2 = [int(tag) for tag in indfile[:-4].split('-')]
+			with open(join(path, indfile), 'r') as infile:
+				for line in infile:
+					if ";" not in line:
+						if len(line.strip()) > 0:
+							self.median_frequencies[tag1][tag2] = [float(y) for y in line.split(",")]
+						continue
+					self.read_frequency_line(line, tag1, tag2)
+
+	def load_references(self, path):
+		"""Load the reference-state frequencies from the .txt files in the directory at path.
+		Each file name without its extension is used as the secondary structure key. Hidden files and files without the .txt suffix are skipped, as in load_frequencies. Every nonzero frequency is stored in reference_frequencies under its zone, key and position, and added to reference_totals for that key and position.
+		Both reference_frequencies and reference_totals are rebuilt from scratch."""
+		files = os.listdir(path)
+		self.reference_frequencies = {}
+		# NOTE(review): assumes secondary_structures_dict gives every key its own copy of the list; a shared list would merge the per-structure totals - confirm.
+		self.reference_totals = secondary_structures_dict([0 for i in xrange(7)])
+		self.reference_totals["default"] = [0 for i in xrange(7)]
+		self.reference_totals["all"] = [0 for i in xrange(7)]
+		loading_indicator.add_loading_data(len(files))
+		for indfile in files:
+			loading_indicator.update_progress(1)
+			# Require a real .txt suffix (the slicing below assumes it) and ignore hidden files such as "._name.txt".
+			if not indfile.endswith(".txt") or indfile.startswith("."): continue
+			sec_name = indfile[:-4]
+			with open(join(path, indfile), 'r') as infile:
+				for line in infile:
+					if ";" not in line:
+						continue
+					ptcomps, freqs = line.strip().split(";")
+					alpha = Point3D(*ptcomps.split(","))
+					if alpha not in self.reference_frequencies:
+						self.reference_frequencies[alpha] = secondary_structures_dict()
+						self.reference_frequencies[alpha]["default"] = {}
+						self.reference_frequencies[alpha]["all"] = {}
+					freqs = [freq for freq in freqs.split(",") if len(freq)]
+					for sep, freq in enumerate(freqs):
+						# Parse each field once and reuse the value.
+						count = int(freq)
+						if count != 0:
+							self.reference_frequencies[alpha][sec_name][sep] = count
+							self.reference_totals[sec_name][sep] += count
+		print "Loaded references"
+
+class SPARCCentralDistributionPuppet (object):
+	"""The SPARCCentralDistributionPuppet class provides objects that can act like individual frequency managers, while all the time referring back to a single centralized distribution manager. Simply initialize the object with a manager object and a mode (see the top of the module), then use it by calling the score() method as you would any FrequencyDistributionManager.
+	The type attribute is only assigned for the consecutive, secondary, short-range and long-range modes; it is left unset for any other mode, including the default."""
+	
+	def __init__(self, manager, mode='default', weight=1.0):
+		"""Store the shared manager, the mode passed on to it, and the weight applied to its scores."""
+		self.manager = manager
+		self.mode = mode
+		self.weight = weight
+		self.identifier = self.mode
+		# Both settings start at 2 and are only narrowed by the specific modes below.
+		self.short_range = 2
+		self.blocks_secondary_structures = 2
+		if self.mode == sparc_consecutive_mode:
+			self.type = frequency_consec_disttype
+			self.blocks_secondary_structures = 1
+		elif self.mode == sparc_secondary_mode or self.mode == sparc_consec_secondary_mode:
+			self.type = frequency_consec_disttype
+			self.blocks_secondary_structures = 0
+		elif self.mode == sparc_long_range_mode:
+			self.type = frequency_nonconsec_disttype
+			self.short_range = 0
+		elif self.mode == sparc_short_range_mode:
+			self.type = frequency_nonconsec_disttype
+			self.short_range = 1
+
+	def score(self, protein, data, system=None, isolate=False, onlyone=False, prior=2, zero_value=0.01):
+		"""This method funnels through to the puppet's original central manager, passing in the mode parameter, and scales the result by the puppet's weight.
+		With onlyone set the manager may return a tuple instead of a number; in that case only its first two entries (the frequencies) are scaled and the rest is passed through unchanged."""
+		result = self.manager.score(protein, data, system, isolate, onlyone, prior, zero_value, self.mode)
+		if isinstance(result, tuple):
+			# Multiplying the tuple itself would raise TypeError for a float weight, or repeat the tuple for an int weight.
+			return (result[0] * self.weight, result[1] * self.weight) + result[2:]
+		return result * self.weight
@@ -31,6 +31,7 @@ def process_decoys_file((input, output, sparc_dir, nativepath, old)):
 		distributions = dists_old + dists_noref + dists_yesref
 	else:
 		distributions = load_dists(sparc_dir, concurrent=False, secondary=False)
+		#distributions = load_central_dist(sparc_dir, secondary=False)
 
 	paths = os.listdir(input)
 	allpaths = [os.path.join(input, path) for path in paths]
@@ -61,11 +62,11 @@ def process_decoys_file((input, output, sparc_dir, nativepath, old)):
 		scores = None
 	for path in paths:
 		if path == "list" or path == "rmsds": continue
-		if True: #try:
+		try:
 			scores = sparc_scores_file(join(input, path), distributions, bounds=bounds, peptide=peptide) #, ignored_aas=gaps
-		'''except Exception as e:
+		except Exception as e:
 			print path, "exception ({})".format(e)
-			continue'''
+			continue
 		if output and scores is not None:
 			scorestr = ""
 			for s in scores: scorestr += str(s) + ","
@@ -87,8 +88,8 @@ def test_sparc(input, output, sparc_dir, natives=None, old=None):
 		os.mkdir(output)
 	print len(files), "files"
 	pool = multiprocessing.Pool(processes=2, maxtasksperchild=1)
-	zipped = [(join(input, file), join(output, file + ".txt"), sparc_dir, natives, old) for file in files]
-	map(process_decoys_file, zipped)
+	zipped = [(join(input, file), join(output, file + ".txt"), sparc_dir, natives, old) for file in files if file[0] not in "._"]
+	pool.map_async(process_decoys_file, zipped)
 	pool.close()
 	pool.join()
 	print "done"
 
@@ -20,7 +20,7 @@
 from molecular_systems import *
 from probsource import *
 import random
-from main import load_dists, apply_dist_weights
+from main import load_dists, apply_dist_weights, load_central_dist
 from pdbanalysis import *
 import os, sys
 import numpy
@@ -466,6 +466,7 @@ def segment_fold(sparc_dir, dists, seq, range1, range2, infiles, output, sec_str
 	seq1 = seq[range1[0] - 1 : range1[1]]
 	seq2 = seq[range2[0] - 1 : range2[1]]
 	seg_prob = AAConstructiveProbabilitySource(peptide, (0, len(seq1)), (len(seq1), len(seq1) + len(seq2)), dists, permissions, sec_struct_permissions, system=system)
+	seg_prob.steric_cutoff = 0.0
 	for i, inf in enumerate(infiles):
 		seg_prob.load_cluster_conformations(i + 1, inf, n=cluster_confs)
 
@@ -523,6 +524,7 @@ def segment_fold(sparc_dir, dists, seq, range1, range2, infiles, output, sec_str
 	pdb_model_idx = 1
 	prob = AAProbabilitySource(peptide, dists, permissions, sec_struct_permissions)
 	prob.mode = psource_gentle_mode
+	prob.steric_cutoff = 0.0
 	model_count = 10
 	best_models = [[] for i in xrange(model_count)]
 	best_scores = [1000000 for i in xrange(model_count)]
@@ -677,6 +679,7 @@ def simulate_fold(sparc_dir, dists, seq, range, output, outname="simulation.pdb"
 			peptide.add_secondary_structures(sec_structs, format='pdb', range=range)
 	system = MolecularSystem([peptide])
 	prob = AAProbabilitySource(peptide, dists, permissions, sec_struct_permissions, system=system)
+	prob.steric_cutoff = 0.0
 	best_models = [[] for i in xrange(model_count)]
 	best_scores = [1000000 for i in xrange(model_count)]
 	pdb_model_idx = 2
@@ -724,8 +727,8 @@ def simulate_fold(sparc_dir, dists, seq, range, output, outname="simulation.pdb"
 		proximity = 2.0
 		for i in xrange(n):
 			print "{} ({})".format(i, outname)
-			if gentle_cutoff != 0:
-				prob.erratic_proximity = math.fabs(proximity * ((gentle_cutoff / scores[-1]) ** 2)) #(1.0 - i / n)
+			#if gentle_cutoff != 0:
+			#	prob.erratic_proximity = math.fabs(proximity * ((gentle_cutoff / scores[-1]) ** 2)) #(1.0 - i / n)
 			seglen = segment_length(scores[-1] / len(peptide.aminoacids))
 			folding_iteration(system, [prob], seglen)
 			peptide.center()
@@ -829,8 +832,8 @@ def simulate_fold(sparc_dir, dists, seq, range, output, outname="simulation.pdb"
 
 def start_run((run, seq, sec_structs, sparc_dir)):
 	assert len(run) >= 2, "Run directive is invalid: {}".format(run)
-	weights = { "consec": 3.0, "secondary": 3.0, "short-range": 2.0, "nonconsec": 2.0, "medium": 3.0 }
-	distributions = load_dists(sparc_dir, concurrent=False, secondary=True, weights=weights)
+	weights = { "consec": 3.0, "secondary": 3.0, "short_range": 2.0, "long_range": 2.0, "medium": 3.0 }
+	distributions = load_dists(sparc_dir, concurrent=False, secondary=True, weights=weights) #load_central_dist(sparc_dir, secondary=True)
 
 	if len(run[1]) > 1:
 		# run[1] must be the input paths, and run[2] must be the output path name
@@ -844,7 +847,7 @@ def start_run((run, seq, sec_structs, sparc_dir)):
 		range1 = [int(x) for x in run[0][0].split("-")]
 		range2 = [int(x) for x in run[0][1].split("-")]
 		infiles = [os.path.join(output, nm) for nm in run[1]]
-		apply_dist_weights(distributions, { "consec": 1.0, "secondary": 1.0, "short-range": 5.0, "nonconsec": 5.0, "medium": 3.0 })
+		apply_dist_weights(distributions, { "consec": 1.0, "secondary": 1.0, "short_range": 3.0, "long_range": 5.0, "medium": 3.0 })
 		return segment_fold(sparc_dir, distributions, seq, range1, range2, infiles, output, outname=run[2][0], **extra_args)
 	else:
 		# run[1] must be the output path name
@@ -856,8 +859,13 @@ def start_run((run, seq, sec_structs, sparc_dir)):
 				kv = arg.split("=")
 				extra_args[kv[0]] = kv[1]
 		range = [int(x) for x in run[0][0].split("-")]
-		if range[1] - range[0] > 7:
-			apply_dist_weights(distributions, { "consec": 2.0, "secondary": 2.0, "short-range": 4.0, "nonconsec": 4.0, "medium": 5.0 })
+		if "weights" in extra_args:
+			weightlist = extra_args["weights"].split(",")
+			assert len(weightlist) == 5, "Need exactly 5 weight specifications, not {}".format(len(weightlist))
+			apply_dist_weights(distributions, { "consec": float(weightlist[0]), "secondary": float(weightlist[1]), "short_range": float(weightlist[2]), "long_range": float(weightlist[3]), "medium": float(weightlist[4]) })
+			del extra_args["weights"]
+		elif range[1] - range[0] > 7:
+			apply_dist_weights(distributions, { "consec": 2.0, "secondary": 2.0, "short_range": 4.0, "long_range": 5.0, "medium": 4.0 })
 		return simulate_fold(sparc_dir, distributions, seq, range, output, outname=run[1][0], **extra_args)
 
 def run_simulation(directives, output, sparc_dir):
@@ -915,4 +923,6 @@ def run_simulation(directives, output, sparc_dir):
 			i += 2
 		else:
 			assert False, "Unexpected command-line argument {}".format(args[i])
+	if not os.path.exists(output):
+		os.mkdir(output)
 	run_simulation(input, output, sparc_dir)