Skip to content

Commit

Permalink
Add window and step as optional arguments for SWAN
Browse files Browse the repository at this point in the history
  • Loading branch information
darwinsorchid committed Aug 28, 2024
1 parent 480fcbb commit 8fa4974
Showing 1 changed file with 73 additions and 5 deletions.
78 changes: 73 additions & 5 deletions GC_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
# ===================================================== ARGPARSE SETUP =============================================================
# Create parser instance
parser = argparse.ArgumentParser(description='Calculate GC content of sequence')

# Add positional arguments
parser.add_argument('seq', nargs= '+', help = 'Enter a single or multiple DNA/RNA sequences of the same length')
parser.add_argument('seq', nargs= '+', help = 'Enter a single or multiple DNA/RNA sequences of the same length!')
parser.add_argument('-w', '--window', type = int, help = 'Enter the window size if you want to calculate GC content with sliding window analysis.')
parser.add_argument('-s', '--step', type = int, help = 'Enter step size if you want to calculate GC content with sliding window analysis.' )

# Parse command-line arguments that were passed to the script and store them as attributes to args object
args = parser.parse_args()
Expand All @@ -18,6 +21,8 @@
sequence_list= [Seq(seq) for seq in args.seq]

# ===================================================== FUNCTIONS ==================================================================

# ------------------------------------- CALCULATE GC CONTENT WITHOUT SLIDING WINDOW ------------------------------------------------
def calc_gc(sequence_list):
''' Calculates GC content of sequences and returns list of GC content of each sequence.
'''
Expand Down Expand Up @@ -59,9 +64,72 @@ def create_GC_barplot(sequence_list, gc_list):
# Display the plot
plt.show()

# ---------------------------------------CALCULATE GC CONTENT WITH SLIDING WINDOW -------------------------------------------
def calculate_window_gc(seq, window, step):
'''Function that calculates the GC content of sequenceS with the sliding window method.
Returns a tuple of a list of the windows the sequence has been split to and a list with
the respective GC values of every window analyzed.
[Sliding window analysis (SWAN) is a commonly used method for studying the properties of molecular sequences:
data are plotted as moving averages of a particular criterion, such as the number of nucleotide changes,
for a window of a certain length slid along a sequence or sequence alignment (Tajima, 1991).
SWAN developed to reveal patterns of variability along nucleotide sequences.]
'''
gc_parts = []
window_count = 0
window_list = []

# iterate through windows in a single sequence and calculate GC content for each window
for i in range(0, len(seq) - window +1, step):
gc_part = gc_fraction(seq[i: i+window])
gc_parts.append(gc_part)
window_count +=1
window_list.append(f"Window {window_count}")
return window_list, gc_parts


def multi_window_plot(x_windows, seq_dict):
'''Function that plots the SWAN calculated GC content of multiple sequences passed in by the user. Window and step size are the same
for all the sequences analyzed and they are defined by the user. The function takes in as an argument a dictionary with the sequences
as keys and a list of GC values of each window as values.
'''
colors_list = ['b', 'g', 'm','r', 'c', 'y', 'k']
for num, (key, value) in enumerate(seq_dict.items()):
color = colors_list[num % len(colors_list)]
plt.plot(x_windows[0], value, color = color, marker = "o", linestyle = "--", label = f"{key}")
plt.title("Sliding Window GC Content Calculation")
plt.xlabel("Sequence Windows")
plt.ylabel("GC content")
plt.legend()
plt.tight_layout()
plt.show()

# ===================================================== LOGIC ===============================================================
# Calculate gc_content of passed in sequences
y_values = calc_gc(sequence_list)

# Create barplot of GC content values
create_GC_barplot(sequence_list, y_values)
# Calculate GC content of one or more RNA/DNA sequences with sliding window analysis
if args.window is not None and args.step is not None:
windows = []
gc_content_list = []

# Iterate through each given sequence
for seq in sequence_list:
window_list, gc_content = calculate_window_gc(seq, args.window, args.step)
windows.append(window_list)
gc_content_list.append(gc_content)

# Create dictionary for given sequences and number of respective windows
pairs = zip(sequence_list, gc_content_list)
seq_dict = dict(pairs)
multi_window_plot(windows, seq_dict)
elif args.window is not None and args.step is None:
print("Please provide step size")
elif args.window is None and args.step is not None:
print("Please provide window size")

# Calculate GC content of one or more RNA/DNA sequences without sliding window analysis
else:
# Calculate gc_content of passed in sequences
y_values = calc_gc(sequence_list)

# Create barplot of GC content values
create_GC_barplot(sequence_list, y_values)

0 comments on commit 8fa4974

Please sign in to comment.