-
Notifications
You must be signed in to change notification settings - Fork 119
/
Copy pathdetect_adapter.py
executable file
·84 lines (68 loc) · 2.37 KB
/
detect_adapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# written by Nathan Boley, from https://github.com/nboley/GGR_code
import sys
import gzip
VERBOSE = False
adapters = {
'Illumina': b'AGATCGGAAGAGC',
'Nextera ': b'CTGTCTCTTATA',
'smallRNA': b'TGGAATTCTCGG'
}
def open_gz(fname):
return gzip.open(fname) if fname.endswith('.gz') else open(fname, 'rb')
def detect_adapters_and_cnts(fname, max_n_lines=1000000):
adapter_cnts = {
'Illumina': 0,
'Nextera ': 0,
'smallRNA': 0
}
with open_gz(fname) as fp:
# read the first million sequences or to the end of the while -- whichever
# comes first, and then use the adapter for trimming which was found to
# occur most often
for seq_index, line in enumerate(fp):
if seq_index >= max_n_lines:
break
if seq_index % 4 != 1:
continue
for key in adapters:
if line.find(adapters[key]) > -1:
adapter_cnts[key] += 1
observed_adapters = [
adapter for adapter, cnt in sorted(
adapter_cnts.items(), key=lambda x: -x[1])
if cnt > 0
]
return observed_adapters, adapter_cnts, seq_index//4
def detect_most_likely_adapter(fname):
observed_adapters, adapter_cnts, n_obs_adapters = detect_adapters_and_cnts(
fname)
if observed_adapters:
best_adapter = observed_adapters[0]
else:
best_adapter = ""
if VERBOSE:
print("\n\nAUTO-DETECTING ADAPTER TYPE\n===========================")
print("Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> {} <<)\n".format(
fname)
)
print("Found perfect matches for the following adapter sequences:")
print("Adapter type\tCount\tSequence\tSequences analysed\tPercentage")
for adapter in observed_adapters:
print("{}\t{}\t{}\t{}\t\t\t{:.2%}".format(
adapter,
adapter_cnts[adapter],
adapters[adapter].decode(),
n_obs_adapters,
adapter_cnts[adapter]/n_obs_adapters)
)
if best_adapter:
return adapters[best_adapter].decode()
else:
return ""
def main():
global VERBOSE
VERBOSE = False
best_adapter = detect_most_likely_adapter(sys.argv[1])
print(best_adapter)
if __name__ == '__main__':
main()