-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path%gc_sliding_window.py
124 lines (97 loc) · 4 KB
/
%gc_sliding_window.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
Description:
- Sliding window program to compute the %GC in sequence of nucleotides.
Input:
- DNA sequence
- The window size (assume the window increment is 1)
Return:
- Nucleotide number
- %GC for each window
"""
import argparse
import os
def get_args():
parser = argparse.ArgumentParser(
description="",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('mode', metavar='mode', help='Do you want to write a sequence (1) or read a file (2)')
parser.add_argument('sequence', metavar='seq', help='Enter a DNA sequence or path of the .fasta file')
parser.add_argument('size', metavar='size', help='Enter window size')
args = parser.parse_args()
return args.mode, args.sequence, args.size
# Get window from sequence
def get_chunks(dna, window_size=4, increment=1):
for i in range(0, len(dna) - window_size + 1, increment):
chunk = dna[i:i + window_size]
assert len(chunk) == window_size
yield chunk
# Valid letters in DNA sequence
LETTERS_OF_DNA = ['A','G','T','C','N']
# Get sequence of nucleotides
def getSequence():
# Get the mode of read input from the user
mode = get_args()[0]
if mode == "1":
# Enter the sequence
sequence = get_args()[1]
elif mode == "2":
file_name = get_args()[1]
try:
path = f"{os.path.dirname(os.path.realpath(__file__))}\{file_name}"
with open(path, "r", encoding='utf-8-sig') as file:
# Read the text from a file
sequence = file.read()
except Exception as e:
if "No such file or directory" in e.__str__():
raise Exception("**ERROR** No such file or directory")
sequence = sequence.split("\n")[1:]
sequence = "".join(sequence)
# Remove spaces and line breaks from file
sequence.replace('\n', '').replace('\r', '')
else:
raise Exception("**ERROR** Invalid mode")
return sequence
def main():
sequence = getSequence()
# Go through all the letters in sequence to check the validity of it.
for letter in sequence:
if letter.upper() in LETTERS_OF_DNA:
# If letter is valid => pass to next letter;
pass
else:
# If one letter is not valid => break;
raise Exception("**ERROR** Invalid sequence")
# Convert the sequence to upper letters to unify the sequence
sequence = sequence.upper()
# Enter the window size
window_size = get_args()[2]
# Check is the window size entered is valid or not
try:
window_size = int(window_size)
except:
raise Exception("**ERROR** Invalid window size.")
if window_size > len(sequence):
raise Exception("**ERROR** window size must not be larger than sequence length.")
print("\nStart processing ...\n")
c = 1
for chunk in get_chunks(sequence, window_size):
g_count = float(chunk.count("G")) # Count G in sequence
c_count = float(chunk.count("C")) # Count C in sequence
a_count = float(chunk.count("A")) # Count A in sequence
t_count = float(chunk.count("T")) # Count T in sequence
n_count = float(chunk.count("N")) # Count N in sequence
print(f"Window {c}: {chunk}")
print(f"Nucleotide number: {len(chunk)}")
print(f"There are {g_count} Gs, {c_count} Cs, {a_count} As and {t_count} Ts, in the DNA strand.")
print(f"GC content is: {(g_count+c_count)*100/(a_count+t_count+g_count+c_count+n_count)}%")
print("======================")
c += 1
print("\nEnd processing ...\n")
if __name__ == "__main__":
main()
"""
Resources:
- https://www.geeksforgeeks.org/window-sliding-technique/
- https://en.wikipedia.org/wiki/GC-content#:~:text=In%20molecular%20biology%20and%20genetics,)%20or%20cytosine%20(C).
"""