Moved to python 3 and Networkx 2.6 #148

Open
wants to merge 1 commit into base: master
12 changes: 6 additions & 6 deletions RE_sites.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import re
import sys
import argparse
@@ -11,7 +11,7 @@ def parse_fasta(fh):
if ln[0] == '>':
# check for bad name
if ":" in ln:
print >> sys.stderr, "Error, your fasta records contain a ':' (%s). SALSA requires fasta to not contain this character, please update your fasta and re-try"%(ln[1:].rstrip())
print("Error, your fasta records contain a ':' (%s). SALSA requires fasta to not contain this character, please update your fasta and re-try"%(ln[1:].rstrip()), file=sys.stderr)
sys.exit(1)

# new name line; remember current sequence's short name
@@ -22,7 +22,7 @@ def parse_fasta(fh):
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
for short_name, nuc_list in fa.items():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
@@ -39,14 +39,14 @@ def main():
if args.enzyme == "DNASE":
for key in f:
id, seq = key, f[key]
print id, len(seq)/2, len(seq)/2
print(id, len(seq)/2, len(seq)/2)
sys.exit(0)

enzymes_input = args.enzyme.replace(' ','').split(',')
final_enzymes = []
for each in enzymes_input:
if not re.match("^[ACGTN]+$", each):
print >> sys.stderr, "Error, enzyme should be restriction site sequence (e.g. AACTT) not enzyme name or DNASE, you input %s"%(each)
print("Error, enzyme should be restriction site sequence (e.g. AACTT) not enzyme name or DNASE, you input %s"%(each), file=sys.stderr)
sys.exit(1)

if 'N' in each:
@@ -85,7 +85,7 @@ def main():
# else:
# rigt_count += 1

print id, left_count, rigt_count
print(id, left_count, rigt_count)



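A note on the retained `len(seq)/2` expressions in the DNASE branch above: under Python 2 this was floor division on integers, while Python 3's `/` returns a float, so the printed counts change form (and possibly value) after this port. Below is a minimal sketch of the difference with an invented record; whether downstream SALSA steps expect integers here is worth confirming.

```python
import sys

name, seq = "contig_1", "ACGTACGTA"   # hypothetical 9 bp record

print(name, len(seq) / 2)    # Python 3 true division        -> contig_1 4.5
print(name, len(seq) // 2)   # Python 2-style floor division -> contig_1 4

# The stderr migration pattern applied throughout this diff:
print("Error, example diagnostic message", file=sys.stderr)
```
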
8 changes: 4 additions & 4 deletions alignments2txt.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import os,sys
import argparse
import pickle
@@ -48,7 +48,7 @@ def update_bed(expanded_scaffold):
path = expanded_scaffold[key]
scaffold_length[key] = 0
offset = 0
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
ori = path[i].split(':')[1] + path[i+1].split(':')[1]
@@ -117,15 +117,15 @@ def update_bed(expanded_scaffold):
#o_lines += curr_scaffold+'\t'+str(new_curr_start)+'\t'+str(new_curr_end)+'\t'+curr_attrs[3]+'\n'
count += 1
if count == 1000000:
print olines
print(olines)
#output.write(o_lines)
count = 0
olines = ""

prev_line = line

#write remaining lines
print olines
print(olines)
#output.write(o_lines)
#output.close()

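The `xrange` → `range` change above is a drop-in replacement for this loop: Python 3's `range` is itself a lazy sequence, so memory behaviour matches the old `xrange`. A small sketch of the pairwise walk these loops perform over a scaffold path, using a made-up path:

```python
# Hypothetical scaffold path: alternating contig end nodes, as walked in update_bed().
path = ["ctg1:B", "ctg1:E", "ctg2:E", "ctg2:B"]

for i in range(0, len(path) - 1, 2):
    contig = path[i].split(':')[0]
    ori = path[i].split(':')[1] + path[i + 1].split(':')[1]
    print(contig, ori)   # -> ctg1 BE, then ctg2 EB
```
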
6 changes: 3 additions & 3 deletions correct.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import sys

#reads input assembly, breakpoints given by the method and outputs new contig file with lengths
@@ -18,7 +18,7 @@ def parse_fasta(fh):
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
for short_name, nuc_list in fa.items():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
@@ -94,7 +94,7 @@ def parse_fasta(fh):
ofasta = open(sys.argv[4]+'/asm.cleaned.fasta','w')
for seq in contig2newseq:
contig_seq = contig2newseq[seq]
chunks = [contig_seq[i:i+80] for i in xrange(0,len(contig_seq),80)]
chunks = [contig_seq[i:i+80] for i in range(0,len(contig_seq),80)]
ofasta.write('>'+seq+'\n')
for chunk in chunks:
ofasta.write(chunk+'\n')
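The `iteritems()` → `items()` change above (also made in `RE_sites.py` and `get_seq.py`) is the standard port: Python 3 removed `iteritems()`, and the `items()` view is safe to iterate while overwriting values for existing keys, which is all `parse_fasta` does. A minimal sketch with invented records:

```python
# Invented parse_fasta-style intermediate state: short name -> list of sequence lines.
fa = {"ctg1": ["ACGT", "ACGT"], "ctg2": ["GGCC"]}

# Overwriting values during iteration is fine because no keys are added or removed.
for short_name, nuc_list in fa.items():
    fa[short_name] = ''.join(nuc_list)

print(fa)   # -> {'ctg1': 'ACGTACGT', 'ctg2': 'GGCC'}
```
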
2 changes: 1 addition & 1 deletion fast_scaled_scores.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import networkx as nx
import sys
import argparse
12 changes: 6 additions & 6 deletions get_seq.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import pickle
import argparse

@@ -16,7 +16,7 @@ def parse_fasta(fh):
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
for short_name, nuc_list in fa.items():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
@@ -30,7 +30,7 @@ def parse_fasta(fh):
args = parser.parse_args()

revcompl = lambda x: ''.join([{'A':'T','B':'N','C':'G','G':'C','T':'A','N':'N','R':'N','M':'N','Y':'N','S':'N','W':'N','K':'N','a':'t','c':'g','g':'c','t':'a','n':'n',' ':'',}[B] for B in x][::-1])
scaff_map = pickle.load(open(args.map,'r'))
scaff_map = pickle.load(open(args.map,'rb'))

contig_length = {}
id2seq = {}
@@ -46,11 +46,11 @@ def parse_fasta(fh):
for scaffold in scaff_map:
path = scaff_map[scaffold]
length = 0
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
length += contig_length[path[i].split(':')[0]]
scaff2length[scaffold] = length

sorted_scaffolds = sorted(scaff2length.items(), key=lambda x: x[1],reverse=True)
sorted_scaffolds = sorted(list(scaff2length.items()), key=lambda x: x[1],reverse=True)

c_id = 1
line = ""
@@ -114,7 +114,7 @@ def parse_fasta(fh):
# rec = SeqRecord(Seq(curr_contig,generic_dna),id='scaffold_'+str(c_id))
# recs.append(rec)
# print c_id
chunks = [curr_contig[i:i+80] for i in xrange(0,len(curr_contig),80)]
chunks = [curr_contig[i:i+80] for i in range(0,len(curr_contig),80)]
ofile.write('>scaffold_'+str(c_id)+'\n')
for chunk in chunks:
ofile.write(chunk+'\n')
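The `'r'` → `'rb'` mode change in the `pickle.load` call above is required on Python 3, where pickle data is bytes and loading through a text-mode handle fails. A round-trip sketch under that assumption (the filename and mapping are made up):

```python
import pickle

scaff_map = {"scaffold_1": ["ctg1:B", "ctg1:E"]}   # hypothetical scaffold map

# Write and read in binary mode, matching the 'wb'/'rb' changes in this PR.
with open("scaffolds_example.p", "wb") as fh:
    pickle.dump(scaff_map, fh)

with open("scaffolds_example.p", "rb") as fh:
    loaded = pickle.load(fh)

assert loaded == scaff_map
```
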
42 changes: 22 additions & 20 deletions layout_unitigs.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import networkx as nx
import sys
import operator
@@ -177,7 +177,7 @@ def load_unitig_mapping():
unitig_graph.add_edge(curr_unitig+':E',prev_unitig+':E')

prev_line = line
print >> sys.stderr, 'Unitig tiling graph loaded, nodes = ' + str(len(unitig_graph.nodes())) + ' edges = ' + str(len(unitig_graph.edges()))
print('Unitig tiling graph loaded, nodes = ' + str(len(unitig_graph.nodes())) + ' edges = ' + str(len(unitig_graph.edges())), file=sys.stderr)
return unitig_graph


@@ -219,7 +219,7 @@ def load_tenx_graph(cutoff):
contig2scaffold = {}
if int(args.iteration) > 1:
try:
previous_scaffolds = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','r'))
previous_scaffolds = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','rb'))
for key in previous_scaffolds:
contigs_1 = previous_scaffolds[key]
first = contigs_1[0].split(':')[0]
@@ -240,7 +240,7 @@ def load_tenx_graph(cutoff):


#Load the best score graph first, keep log of used and unused links
print >> sys.stderr, 'Loading Hi-C links '
print('Loading Hi-C links ', file=sys.stderr)
'''
Counts to keep log of each types of edge loaded in the graph
'''
@@ -257,11 +257,11 @@ def test_edge(scaffold_first,scaffold_second,G_test,type):
last_first = scaffold_first[-1]
first_second = scaffold_second[0]
last_second = scaffold_second[-1]
print 'testing'
print first_first
print first_second
print last_first
print last_second
print('testing')
print(first_first)
print(first_second)
print(last_first)
print(last_second)
if G_test.has_edge(last_first,first_second):
v1 = c1+':E'
v2 = c2 +':B'
@@ -579,8 +579,8 @@ def generate_scaffold_graph():
for ctg in list(contigs):
G.add_edge(ctg+":B", ctg+":E", t="c", score=0, linktype='implicit')

print >> sys.stderr, 'Hybrid scaffold graph loaded, nodes = ' + str(len(G.nodes())) + ' edges = ' + str(len(G.edges()))
print >> sys.stderr, 'Hi-C implied edges = ' + str(hic_edges)
print('Hybrid scaffold graph loaded, nodes = ' + str(len(G.nodes())) + ' edges = ' + str(len(G.edges())), file=sys.stderr)
print('Hi-C implied edges = ' + str(hic_edges), file=sys.stderr)


'''
@@ -592,7 +592,8 @@ def get_seed_scaffold():
seed_scaffolds = {} #this stores initial long scaffolds
to_merge = set()

for subg in nx.connected_component_subgraphs(G):
#for subg in nx.connected_component_subgraphs(G):
for subg in (G.subgraph(c) for c in nx.connected_components(G)):
p = []
for node in subg.nodes():
if subg.degree(node) == 1:
@@ -664,7 +665,7 @@ def insert(assignment, seed_scaffolds):
orientation = ''
pos = -1
#first check at all middle positions
for i in xrange(1,len(path)-1,2):
for i in range(1,len(path)-1,2):
score_fow = -1
score_rev = -1
if all_G.has_edge(five_prime,path[i]) and all_G.has_edge(three_prime,path[i+1]):
@@ -722,7 +723,7 @@ def update_bed(expanded_scaffold):
path = expanded_scaffold[key]
scaffold_length[key] = 0
offset = 0
for i in xrange(0,len(path),2):
for i in range(0,len(path),2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
ori = path[i].split(':')[1] + path[i+1].split(':')[1]
@@ -735,7 +736,7 @@ def update_bed(expanded_scaffold):

scaffold_re = {}
for key in expanded_scaffold:
print key
print(key)
path = expanded_scaffold[key]
length = scaffold_length[key]
offset = 0
Expand All @@ -751,7 +752,7 @@ def update_bed(expanded_scaffold):
scaffold_re[key] = (right,left)
else:
midpoint = length/2
for i in xrange(0,len(path),2):
for i in range(0,len(path),2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
left,right = re_counts[contig]
@@ -915,7 +916,8 @@ def merge(contigs):

#print best_hic_graph.nodes()

for g in nx.connected_component_subgraphs(best_hic_graph):
#for g in nx.connected_component_subgraphs(best_hic_graph):
for g in (best_hic_graph.subgraph(c) for c in nx.connected_components(best_hic_graph)):
#print g.nodes()
p = []
for node in g.nodes():
@@ -983,7 +985,7 @@ def correct_scaffolds(curr_scaffolds):
path = final_scaffolds[key]
new_path = []
#print path
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
#print path
contig = path[i].split(':')[0]
scaffolded_contigs[contig] = True
@@ -1010,7 +1012,7 @@ def correct_scaffolds(curr_scaffolds):
oline = ''
oline += 'scaffold_'+str(key) +'\t'
cum_len = 0
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
#print path
contig = path[i].split(':')[0]
cum_len += prev_len[contig]
@@ -1095,7 +1097,7 @@ def correct_scaffolds(curr_scaffolds):

# expanded_scaffold_paths = updated_scaffolds

pickle.dump(expanded_scaffold_paths,open(args.directory+'/scaffolds_iteration_'+str(args.iteration)+'.p','w'))
pickle.dump(expanded_scaffold_paths,open(args.directory+'/scaffolds_iteration_'+str(args.iteration)+'.p','wb'))
update_bed(expanded_scaffold_paths)


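The `connected_component_subgraphs` replacement above matches the NetworkX upgrade in the PR title: the helper was deprecated in NetworkX 2.1 and removed in 2.4, and the documented substitute is a generator of induced subgraphs over `connected_components`. A minimal sketch:

```python
import networkx as nx

G = nx.Graph()
G.add_edges_from([("ctg1:B", "ctg1:E"), ("ctg2:B", "ctg2:E")])   # two components

# NetworkX >= 2.4 replacement for nx.connected_component_subgraphs(G);
# append .copy() if the subgraph must be mutated independently of G.
for subg in (G.subgraph(c) for c in nx.connected_components(G)):
    print(sorted(subg.nodes()))
```
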
12 changes: 6 additions & 6 deletions make_links.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import operator
import math
import sys
@@ -55,7 +55,7 @@ def main():
coordinates_map = {}
first_in_pair = {}
second_in_pair = {}
print >> sys.stderr, 'bedfile started'
print('bedfile started', file=sys.stderr)
prev_line = ''
with open(args.mapping,'r') as f:
for line in f:
@@ -78,8 +78,8 @@ def main():
if prev_read.split('/')[0] == curr_read.split('/')[0]:
#print 'here'

pos1 = (long(prev_attrs[1]) + long(prev_attrs[2]))/2.0
pos2 = (long(attrs[1]) + long(attrs[2]))/2.0
pos1 = (int(prev_attrs[1]) + int(prev_attrs[2]))/2.0
pos2 = (int(attrs[1]) + int(attrs[2]))/2.0
# l1 = 300000.0
# l2 = 300000.0
#print pos1, pos2
@@ -152,11 +152,11 @@ def main():
# continue


print >> sys.stderr, 'bedfile loaded'
print('bedfile loaded', file=sys.stderr)



print len(contig_links)
print(len(contig_links))
ofile = open(args.directory+'/contig_links_iteration_'+str(iteration),'w')
for key in contig_links:
if key != '':
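The `long()` → `int()` changes in `make_links.py` above follow from Python 3 folding `long` into `int`, which is arbitrary precision, so large BED coordinates need no special handling. A small sketch of the midpoint computation with invented coordinates:

```python
# Hypothetical BED-style fields (chrom, start, end) as split from two mate lines.
prev_attrs = ["ctg1", "1000000", "1000300"]
attrs = ["ctg1", "2000000", "2000300"]

# int() replaces long(); dividing by 2.0 keeps the float midpoint either way.
pos1 = (int(prev_attrs[1]) + int(prev_attrs[2])) / 2.0
pos2 = (int(attrs[1]) + int(attrs[2])) / 2.0
print(pos1, pos2)   # -> 1000150.0 2000150.0
```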