Moved to python 3 and Networkx 2.6 #148

Open
wants to merge 1 commit into base: master
12 changes: 6 additions & 6 deletions RE_sites.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import re
import sys
import argparse
@@ -11,7 +11,7 @@ def parse_fasta(fh):
if ln[0] == '>':
# check for bad name
if ":" in ln:
print >> sys.stderr, "Error, your fasta records contain a ':' (%s). SALSA requires fasta to not contain this character, please update your fasta and re-try"%(ln[1:].rstrip())
print("Error, your fasta records contain a ':' (%s). SALSA requires fasta to not contain this character, please update your fasta and re-try"%(ln[1:].rstrip()), file=sys.stderr)
sys.exit(1)

# new name line; remember current sequence's short name
@@ -22,7 +22,7 @@ def parse_fasta(fh):
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
for short_name, nuc_list in fa.items():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
@@ -39,14 +39,14 @@ def main():
if args.enzyme == "DNASE":
for key in f:
id, seq = key, f[key]
print id, len(seq)/2, len(seq)/2
print(id, len(seq)/2, len(seq)/2)
sys.exit(0)

enzymes_input = args.enzyme.replace(' ','').split(',')
final_enzymes = []
for each in enzymes_input:
if not re.match("^[ACGTN]+$", each):
print >> sys.stderr, "Error, enzyme should be restriction site sequence (e.g. AACTT) not enzyme name or DNASE, you input %s"%(each)
print("Error, enzyme should be restriction site sequence (e.g. AACTT) not enzyme name or DNASE, you input %s"%(each), file=sys.stderr)
sys.exit(1)

if 'N' in each:
@@ -85,7 +85,7 @@ def main():
# else:
# rigt_count += 1

print id, left_count, rigt_count
print(id, left_count, rigt_count)



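A note on the retained `len(seq)/2` expressions in the DNASE branch above: under Python 2 this was floor division on integers, while Python 3's `/` returns a float, so the printed counts change form (and possibly value) after this port. Below is a minimal sketch of the difference with an invented record; whether downstream SALSA steps expect integers here is worth confirming.

```python
import sys

name, seq = "contig_1", "ACGTACGTA"   # hypothetical 9 bp record

print(name, len(seq) / 2)    # Python 3 true division        -> contig_1 4.5
print(name, len(seq) // 2)   # Python 2-style floor division -> contig_1 4

# The stderr migration pattern applied throughout this diff:
print("Error, example diagnostic message", file=sys.stderr)
```
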
8 changes: 4 additions & 4 deletions alignments2txt.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import os,sys
import argparse
import pickle
@@ -48,7 +48,7 @@ def update_bed(expanded_scaffold):
path = expanded_scaffold[key]
scaffold_length[key] = 0
offset = 0
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
ori = path[i].split(':')[1] + path[i+1].split(':')[1]
@@ -117,15 +117,15 @@ def update_bed(expanded_scaffold):
#o_lines += curr_scaffold+'\t'+str(new_curr_start)+'\t'+str(new_curr_end)+'\t'+curr_attrs[3]+'\n'
count += 1
if count == 1000000:
print olines
print(olines)
#output.write(o_lines)
count = 0
olines = ""

prev_line = line

#write remaining lines
print olines
print(olines)
#output.write(o_lines)
#output.close()

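The `xrange` → `range` change above is a drop-in replacement for this loop: Python 3's `range` is itself a lazy sequence, so memory behaviour matches the old `xrange`. A small sketch of the pairwise walk these loops perform over a scaffold path, using a made-up path:

```python
# Hypothetical scaffold path: alternating contig end nodes, as walked in update_bed().
path = ["ctg1:B", "ctg1:E", "ctg2:E", "ctg2:B"]

for i in range(0, len(path) - 1, 2):
    contig = path[i].split(':')[0]
    ori = path[i].split(':')[1] + path[i + 1].split(':')[1]
    print(contig, ori)   # -> ctg1 BE, then ctg2 EB
```
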
6 changes: 3 additions & 3 deletions correct.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import sys

#reads input assembly, breakpoints given by the method and outputs new contig file with lengths
@@ -18,7 +18,7 @@ def parse_fasta(fh):
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
for short_name, nuc_list in fa.items():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
@@ -94,7 +94,7 @@ def parse_fasta(fh):
ofasta = open(sys.argv[4]+'/asm.cleaned.fasta','w')
for seq in contig2newseq:
contig_seq = contig2newseq[seq]
chunks = [contig_seq[i:i+80] for i in xrange(0,len(contig_seq),80)]
chunks = [contig_seq[i:i+80] for i in range(0,len(contig_seq),80)]
ofasta.write('>'+seq+'\n')
for chunk in chunks:
ofasta.write(chunk+'\n')
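The `iteritems()` → `items()` change above (also made in `RE_sites.py` and `get_seq.py`) is the standard port: Python 3 removed `iteritems()`, and the `items()` view is safe to iterate while overwriting values for existing keys, which is all `parse_fasta` does. A minimal sketch with invented records:

```python
# Invented parse_fasta-style intermediate state: short name -> list of sequence lines.
fa = {"ctg1": ["ACGT", "ACGT"], "ctg2": ["GGCC"]}

# Overwriting values during iteration is fine because no keys are added or removed.
for short_name, nuc_list in fa.items():
    fa[short_name] = ''.join(nuc_list)

print(fa)   # -> {'ctg1': 'ACGTACGT', 'ctg2': 'GGCC'}
```
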
2 changes: 1 addition & 1 deletion fast_scaled_scores.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import networkx as nx
import sys
import argparse
12 changes: 6 additions & 6 deletions get_seq.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import pickle
import argparse

@@ -16,7 +16,7 @@ def parse_fasta(fh):
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
for short_name, nuc_list in fa.items():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
@@ -30,7 +30,7 @@ def parse_fasta(fh):
args = parser.parse_args()

revcompl = lambda x: ''.join([{'A':'T','B':'N','C':'G','G':'C','T':'A','N':'N','R':'N','M':'N','Y':'N','S':'N','W':'N','K':'N','a':'t','c':'g','g':'c','t':'a','n':'n',' ':'',}[B] for B in x][::-1])
scaff_map = pickle.load(open(args.map,'r'))
scaff_map = pickle.load(open(args.map,'rb'))

contig_length = {}
id2seq = {}
@@ -46,11 +46,11 @@ def parse_fasta(fh):
for scaffold in scaff_map:
path = scaff_map[scaffold]
length = 0
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
length += contig_length[path[i].split(':')[0]]
scaff2length[scaffold] = length

sorted_scaffolds = sorted(scaff2length.items(), key=lambda x: x[1],reverse=True)
sorted_scaffolds = sorted(list(scaff2length.items()), key=lambda x: x[1],reverse=True)

c_id = 1
line = ""
@@ -114,7 +114,7 @@ def parse_fasta(fh):
# rec = SeqRecord(Seq(curr_contig,generic_dna),id='scaffold_'+str(c_id))
# recs.append(rec)
# print c_id
chunks = [curr_contig[i:i+80] for i in xrange(0,len(curr_contig),80)]
chunks = [curr_contig[i:i+80] for i in range(0,len(curr_contig),80)]
ofile.write('>scaffold_'+str(c_id)+'\n')
for chunk in chunks:
ofile.write(chunk+'\n')
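The `'r'` → `'rb'` mode change in the `pickle.load` call above is required on Python 3, where pickle data is bytes and loading through a text-mode handle fails. A round-trip sketch under that assumption (the filename and mapping are made up):

```python
import pickle

scaff_map = {"scaffold_1": ["ctg1:B", "ctg1:E"]}   # hypothetical scaffold map

# Write and read in binary mode, matching the 'wb'/'rb' changes in this PR.
with open("scaffolds_example.p", "wb") as fh:
    pickle.dump(scaff_map, fh)

with open("scaffolds_example.p", "rb") as fh:
    loaded = pickle.load(fh)

assert loaded == scaff_map
```
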
42 changes: 22 additions & 20 deletions layout_unitigs.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import networkx as nx
import sys
import operator
@@ -177,7 +177,7 @@ def load_unitig_mapping():
unitig_graph.add_edge(curr_unitig+':E',prev_unitig+':E')

prev_line = line
print >> sys.stderr, 'Unitig tiling graph loaded, nodes = ' + str(len(unitig_graph.nodes())) + ' edges = ' + str(len(unitig_graph.edges()))
print('Unitig tiling graph loaded, nodes = ' + str(len(unitig_graph.nodes())) + ' edges = ' + str(len(unitig_graph.edges())), file=sys.stderr)
return unitig_graph


@@ -219,7 +219,7 @@ def load_tenx_graph(cutoff):
contig2scaffold = {}
if int(args.iteration) > 1:
try:
previous_scaffolds = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','r'))
previous_scaffolds = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','rb'))
for key in previous_scaffolds:
contigs_1 = previous_scaffolds[key]
first = contigs_1[0].split(':')[0]
@@ -240,7 +240,7 @@ def load_tenx_graph(cutoff):


#Load the best score graph first, keep log of used and unused links
print >> sys.stderr, 'Loading Hi-C links '
print('Loading Hi-C links ', file=sys.stderr)
'''
Counts to keep log of each types of edge loaded in the graph
'''
@@ -257,11 +257,11 @@ def test_edge(scaffold_first,scaffold_second,G_test,type):
last_first = scaffold_first[-1]
first_second = scaffold_second[0]
last_second = scaffold_second[-1]
print 'testing'
print first_first
print first_second
print last_first
print last_second
print('testing')
print(first_first)
print(first_second)
print(last_first)
print(last_second)
if G_test.has_edge(last_first,first_second):
v1 = c1+':E'
v2 = c2 +':B'
@@ -579,8 +579,8 @@ def generate_scaffold_graph():
for ctg in list(contigs):
G.add_edge(ctg+":B", ctg+":E", t="c", score=0, linktype='implicit')

print >> sys.stderr, 'Hybrid scaffold graph loaded, nodes = ' + str(len(G.nodes())) + ' edges = ' + str(len(G.edges()))
print >> sys.stderr, 'Hi-C implied edges = ' + str(hic_edges)
print('Hybrid scaffold graph loaded, nodes = ' + str(len(G.nodes())) + ' edges = ' + str(len(G.edges())), file=sys.stderr)
print('Hi-C implied edges = ' + str(hic_edges), file=sys.stderr)


'''
@@ -592,7 +592,8 @@ def get_seed_scaffold():
seed_scaffolds = {} #this stores initial long scaffolds
to_merge = set()

for subg in nx.connected_component_subgraphs(G):
#for subg in nx.connected_component_subgraphs(G):
for subg in (G.subgraph(c) for c in nx.connected_components(G)):
p = []
for node in subg.nodes():
if subg.degree(node) == 1:
@@ -664,7 +665,7 @@ def insert(assignment, seed_scaffolds):
orientation = ''
pos = -1
#first check at all middle positions
for i in xrange(1,len(path)-1,2):
for i in range(1,len(path)-1,2):
score_fow = -1
score_rev = -1
if all_G.has_edge(five_prime,path[i]) and all_G.has_edge(three_prime,path[i+1]):
@@ -722,7 +723,7 @@ def update_bed(expanded_scaffold):
path = expanded_scaffold[key]
scaffold_length[key] = 0
offset = 0
for i in xrange(0,len(path),2):
for i in range(0,len(path),2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
ori = path[i].split(':')[1] + path[i+1].split(':')[1]
@@ -735,7 +736,7 @@ def update_bed(expanded_scaffold):

scaffold_re = {}
for key in expanded_scaffold:
print key
print(key)
path = expanded_scaffold[key]
length = scaffold_length[key]
offset = 0
Expand All @@ -751,7 +752,7 @@ def update_bed(expanded_scaffold):
scaffold_re[key] = (right,left)
else:
midpoint = length/2
for i in xrange(0,len(path),2):
for i in range(0,len(path),2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
left,right = re_counts[contig]
@@ -915,7 +916,8 @@ def merge(contigs):

#print best_hic_graph.nodes()

for g in nx.connected_component_subgraphs(best_hic_graph):
#for g in nx.connected_component_subgraphs(best_hic_graph):
for g in (best_hic_graph.subgraph(c) for c in nx.connected_components(best_hic_graph)):
#print g.nodes()
p = []
for node in g.nodes():
@@ -983,7 +985,7 @@ def correct_scaffolds(curr_scaffolds):
path = final_scaffolds[key]
new_path = []
#print path
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
#print path
contig = path[i].split(':')[0]
scaffolded_contigs[contig] = True
@@ -1010,7 +1012,7 @@ def correct_scaffolds(curr_scaffolds):
oline = ''
oline += 'scaffold_'+str(key) +'\t'
cum_len = 0
for i in xrange(0,len(path)-1,2):
for i in range(0,len(path)-1,2):
#print path
contig = path[i].split(':')[0]
cum_len += prev_len[contig]
@@ -1095,7 +1097,7 @@ def correct_scaffolds(curr_scaffolds):

# expanded_scaffold_paths = updated_scaffolds

pickle.dump(expanded_scaffold_paths,open(args.directory+'/scaffolds_iteration_'+str(args.iteration)+'.p','w'))
pickle.dump(expanded_scaffold_paths,open(args.directory+'/scaffolds_iteration_'+str(args.iteration)+'.p','wb'))
update_bed(expanded_scaffold_paths)


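The `connected_component_subgraphs` replacement above matches the NetworkX upgrade in the PR title: the helper was deprecated in NetworkX 2.1 and removed in 2.4, and the documented substitute is a generator of induced subgraphs over `connected_components`. A minimal sketch:

```python
import networkx as nx

G = nx.Graph()
G.add_edges_from([("ctg1:B", "ctg1:E"), ("ctg2:B", "ctg2:E")])   # two components

# NetworkX >= 2.4 replacement for nx.connected_component_subgraphs(G);
# append .copy() if the subgraph must be mutated independently of G.
for subg in (G.subgraph(c) for c in nx.connected_components(G)):
    print(sorted(subg.nodes()))
```
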
12 changes: 6 additions & 6 deletions make_links.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
import operator
import math
import sys
@@ -55,7 +55,7 @@ def main():
coordinates_map = {}
first_in_pair = {}
second_in_pair = {}
print >> sys.stderr, 'bedfile started'
print('bedfile started', file=sys.stderr)
prev_line = ''
with open(args.mapping,'r') as f:
for line in f:
@@ -78,8 +78,8 @@ def main():
if prev_read.split('/')[0] == curr_read.split('/')[0]:
#print 'here'

pos1 = (long(prev_attrs[1]) + long(prev_attrs[2]))/2.0
pos2 = (long(attrs[1]) + long(attrs[2]))/2.0
pos1 = (int(prev_attrs[1]) + int(prev_attrs[2]))/2.0
pos2 = (int(attrs[1]) + int(attrs[2]))/2.0
# l1 = 300000.0
# l2 = 300000.0
#print pos1, pos2
@@ -152,11 +152,11 @@ def main():
# continue


print >> sys.stderr, 'bedfile loaded'
print('bedfile loaded', file=sys.stderr)



print len(contig_links)
print(len(contig_links))
ofile = open(args.directory+'/contig_links_iteration_'+str(iteration),'w')
for key in contig_links:
if key != '':
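The `long()` → `int()` changes in `make_links.py` above follow from Python 3 folding `long` into `int`, which is arbitrary precision, so large BED coordinates need no special handling. A small sketch of the midpoint computation with invented coordinates:

```python
# Hypothetical BED-style fields (chrom, start, end) as split from two mate lines.
prev_attrs = ["ctg1", "1000000", "1000300"]
attrs = ["ctg1", "2000000", "2000300"]

# int() replaces long(); dividing by 2.0 keeps the float midpoint either way.
pos1 = (int(prev_attrs[1]) + int(prev_attrs[2])) / 2.0
pos2 = (int(attrs[1]) + int(attrs[2])) / 2.0
print(pos1, pos2)   # -> 1000150.0 2000150.0
```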