-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathShortestCommonSuperstring.py
More file actions
63 lines (52 loc) · 2.3 KB
/
ShortestCommonSuperstring.py
File metadata and controls
63 lines (52 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from Overlap import Overlap
from itertools import permutations
from Overlap import pick_Maximal_overlap, pick_maximal_overlap_with_precomputed_kmers
import time
from collections import defaultdict, Counter
# brute force
def NaiveSCS(reads):
    """Return a shortest common superstring of ``reads`` by brute force.

    Every ordering of the reads is tried. For each ordering, adjacent
    reads are glued together, dropping from the right-hand read the number
    of leading characters reported by ``Overlap(left, right, min_length=1)``
    (presumably the length of the longest suffix/prefix overlap -- confirm
    against the ``Overlap`` module). The first ordering that produces the
    shortest result wins ties.

    Assumes no read is a strict substring of another.

    Args:
        reads: Iterable of strings.

    Returns:
        A shortest superstring found, or ``''`` when ``reads`` is empty.
    """
    # Materialise once so iterators/generators are accepted too.
    reads = list(reads)
    if not reads:
        # permutations([]) yields a single empty tuple, which used to blow
        # up on ordering[0]; the superstring of nothing is the empty string.
        return ''

    shortest_superstring = None
    for ordering in permutations(reads):  # every ordering, as a tuple
        pieces = [ordering[0]]
        for left, right in zip(ordering, ordering[1:]):
            overlap_len = Overlap(left, right, min_length=1)
            # Keep only the part of `right` not already covered by `left`.
            pieces.append(right[overlap_len:])
        sup = ''.join(pieces)
        if shortest_superstring is None or len(sup) < len(shortest_superstring):
            shortest_superstring = sup
    return shortest_superstring
def SCS_list(reads):
    """Return the set of all superstrings tied for shortest.

    Each ordering of ``reads`` is merged exactly once (adjacent reads are
    glued using ``Overlap(left, right, min_length=1)`` as the number of
    leading characters of the right-hand read to drop), and every distinct
    result whose length equals the minimum is collected.

    This is a single pass over the permutations; re-running the full
    brute-force search for every permutation yields the same set at a
    factorially higher cost.

    Args:
        reads: Iterable of strings.

    Returns:
        A ``set`` of the distinct shortest superstrings; ``{''}`` when
        ``reads`` is empty.
    """
    reads = list(reads)
    if not reads:
        return {''}

    best_length = None
    shortest_superstrings = set()
    for ordering in permutations(reads):  # every ordering, as a tuple
        pieces = [ordering[0]]
        for left, right in zip(ordering, ordering[1:]):
            pieces.append(right[Overlap(left, right, min_length=1):])
        candidate = ''.join(pieces)

        if best_length is None or len(candidate) < best_length:
            # Strictly better: discard everything collected so far.
            best_length = len(candidate)
            shortest_superstrings = {candidate}
        elif len(candidate) == best_length:
            shortest_superstrings.add(candidate)
    return shortest_superstrings
def GreedySCS(reads, k):
    """Greedy shortest-common-superstring merge.

    Repeatedly asks ``pick_Maximal_overlap`` for a pair of reads and their
    overlap length, replaces that pair with the merged read, and stops once
    the reported overlap length is no longer positive (presumably meaning
    no overlap of length >= k remains -- confirm against the ``Overlap``
    module). Whatever reads are left are concatenated.

    The elapsed CPU time of the merge loop is printed to stdout.

    Args:
        reads: Iterable of read strings. It is copied; the caller's
            collection is not modified.
        k: Minimum overlap length, forwarded to ``pick_Maximal_overlap``.

    Returns:
        The concatenation of the reads remaining after all merges.
    """
    # Work on a private copy so the caller's list is not consumed by the
    # remove/append calls below.
    reads = list(reads)

    start = time.process_time()
    read_a, read_b, max_overlap_length = pick_Maximal_overlap(reads, k)
    while max_overlap_length > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        # Merge: read_a followed by the part of read_b past the overlap.
        reads.append(read_a + read_b[max_overlap_length:])
        read_a, read_b, max_overlap_length = pick_Maximal_overlap(reads, k)
    end = time.process_time()
    print(end - start)
    return ''.join(reads)
def FasterGreedySCS(reads, k=10):
    """Greedy shortest-common-superstring merge using precomputed k-mers.

    Same merge loop as ``GreedySCS``, except that the next pair to merge is
    chosen by ``pick_maximal_overlap_with_precomputed_kmers``. Merging
    stops once the reported overlap length is no longer positive; the
    remaining reads are concatenated.

    The elapsed CPU time of the merge loop is printed to stdout.

    Args:
        reads: Iterable of read strings. It is copied; the caller's
            collection is not modified.
        k: Minimum overlap length, forwarded to the pair picker
            (default 10).

    Returns:
        The concatenation of the reads remaining after all merges.
    """
    # Work on a private copy so the caller's list is not consumed by the
    # remove/append calls below.
    reads = list(reads)

    start = time.process_time()
    read_a, read_b, max_overlap_length = (
        pick_maximal_overlap_with_precomputed_kmers(reads, k)
    )
    while max_overlap_length > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        # Merge: read_a followed by the part of read_b past the overlap.
        reads.append(read_a + read_b[max_overlap_length:])
        read_a, read_b, max_overlap_length = (
            pick_maximal_overlap_with_precomputed_kmers(reads, k)
        )
    end = time.process_time()
    print(end - start)
    return ''.join(reads)