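"""Summarize an Oxford Nanopore run and classify its reads with Kraken2.

Builds sequencing_summary.pdf (quality-score and read-length histograms) from
the run's sequencing_summary_*.txt file, runs Kraken2 on every non-empty
barcode folder under fastq_pass/, and writes a combined, sample-labelled
kreport table in the run directory.
"""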
import os
import glob
import subprocess
import gzip
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


def plot_histogram(data, column_name, title, xlabel, ylabel, ax, log_scale=False):
    """Draw a histogram of one summary column on `ax` and mark its median."""
    n, bins, patches = ax.hist(data[column_name], bins=100, edgecolor='black', log=log_scale, color='white', lw=1.2, zorder=2)
    median_value = data[column_name].median()
    # Report read lengths as whole numbers, everything else to two decimals
    rounded_median = round(median_value) if column_name == "sequence_length_template" else round(median_value, 2)
    ax.set_title(f"{title}", loc='left')  # Align title to the left
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if log_scale:
        ax.set_yscale('log')
    ax.grid(axis='y', linestyle='-', alpha=0.5, color='lightgrey', zorder=1)  # Add light-grey gridlines
    # Plot the median as a vertical dashed line
    ax.axvline(x=median_value, color='red', linestyle='dashed', linewidth=2, label=f"Median: {rounded_median}")
    ax.legend()


def generate_sequencing_summary_pdf(directory):
    """Build sequencing_summary.pdf with Q-score and read-length histograms; return its path."""
    pdf_path = os.path.join(directory, "sequencing_summary.pdf")

    # Check if the sequencing_summary.pdf file already exists
    if os.path.exists(pdf_path):
        print("The sequencing_summary.pdf file has already been created.")
        return pdf_path

    # Find the file with the name "sequencing_summary_*.txt"
    file_pattern = os.path.join(directory, "sequencing_summary_*.txt")
    files = glob.glob(file_pattern)
    if not files:
        print("The sequencing_summary.txt file is not available yet.")
        return pdf_path

    # Take the first matching file (you can modify this logic if needed)
    file_path = files[0]

    # Read the data from the file
    data = pd.read_csv(file_path, sep='\t')

    # Create histograms and save them in one PDF file
    with PdfPages(pdf_path) as pdf:
        # Plot for all data
        fig, axs = plt.subplots(figsize=(8, 6))
        plot_histogram(data, "mean_qscore_template", "Mean Q score (all reads)",
                       "Mean Q score", "Frequency", axs)
        pdf.savefig(fig)
        plt.close(fig)

        # Filter based on the "passes_filtering" column
        filtered_data = data[data["passes_filtering"] == True]

        # Plot for filtered data
        fig, axs = plt.subplots(figsize=(8, 6))  # Create a new figure
        plot_histogram(filtered_data, "mean_qscore_template", "Mean Q score (quality-filtered reads)",
                       "Mean Q score", "Frequency", axs)
        pdf.savefig(fig)
        plt.close(fig)

        # Plot for sequence length (all reads)
        fig, axs = plt.subplots(figsize=(8, 6))  # Create a new figure
        plot_histogram(data, "sequence_length_template", "Sequence length (all reads)",
                       "Sequence length", "Frequency", axs, log_scale=True)
        pdf.savefig(fig)
        plt.close(fig)

        # Plot for sequence length (quality-filtered reads)
        fig, axs = plt.subplots(figsize=(8, 6))  # Create a new figure
        plot_histogram(filtered_data, "sequence_length_template", "Sequence length (quality-filtered reads)",
                       "Sequence length", "Frequency", axs, log_scale=True)
        pdf.savefig(fig)
        plt.close(fig)

    print(f"Plots saved in {pdf_path}")
    return pdf_path


def run_kraken(directory, kraken_db, analyze_bacterial):
    """Concatenate the fastq.gz reads in `directory` and classify them with Kraken2.

    The report is written as <directory name>.kreport.txt with the directory
    name prepended as an extra first column. `analyze_bacterial` is accepted
    for interface consistency but is not used inside this function.
    """
    # Path to the concatenated .fastq file handed to Kraken2
    output_fastq = os.path.join(directory, "concatenated.fastq")

    # Collect fastq.gz files in the directory
    fastq_files = [file for file in os.listdir(directory) if file.endswith('.fastq.gz')]

    # Check if there are fastq.gz files in the directory
    if fastq_files:
        # Decompress and concatenate the fastq.gz files so the output really is plain FASTQ
        with open(output_fastq, 'wb') as output_handle:
            for file in fastq_files:
                with gzip.open(os.path.join(directory, file), 'rb') as input_handle:
                    output_handle.write(input_handle.read())

        # Run Kraken2 on the concatenated .fastq file
        output_kraken = os.path.join(directory, f"{os.path.basename(directory)}.kreport.txt")
        kraken_command = f"kraken2 --db {kraken_db} --output {output_fastq.replace('.fastq', '.kraken')} --report {output_kraken} {output_fastq}"
        subprocess.run(kraken_command, shell=True)

        # Prepend a column with the sample title to the kreport.txt file
        title = os.path.basename(directory)
        with open(output_kraken, 'r') as f:
            lines = f.readlines()
        lines = [f"{title}\t{line}" for line in lines]  # Prepend the title column
        with open(output_kraken, 'w') as f:
            f.writelines(lines)

        # Cleanup: remove the concatenated .fastq file after the Kraken2 analysis
        os.remove(output_fastq)
    else:
        print(f"No fastq.gz files found in {directory}")


def main():
    # Use the current working directory as the analysis directory
    analysis_directory = os.getcwd()

    # Ask if bacterial reads analysis is desired
    analyze_bacterial = input("Do you wish to analyze bacterial reads? (yes/y or no/n): ").lower().strip()

    # Generate sequencing_summary.pdf
    pdf_path = generate_sequencing_summary_pdf(analysis_directory)

    # Set the Kraken2 database path based on user input
    if analyze_bacterial in ['yes', 'y']:
        kraken_db_path = "/data/kraken_databases/k2_pluspf_08gb_20231009/"
        output_file_name = "virus_bacteria.kraken.txt"
    else:
        kraken_db_path = "/data/kraken_databases/k2_human-viral_20240111/"
        output_file_name = "virus.kraken.txt"

    # List to store individual kreport.txt DataFrames
    kreport_dfs = []

    # Walk through each sub-folder in "fastq_pass" and run the Kraken2 analysis
    fastq_pass_directory = os.path.join(analysis_directory, "fastq_pass")
    for subdir in os.listdir(fastq_pass_directory):
        subdir_path = os.path.join(fastq_pass_directory, subdir)
        if os.path.isdir(subdir_path) and any(os.scandir(subdir_path)):
            run_kraken(subdir_path, kraken_db_path, analyze_bacterial)
            output_kraken = os.path.join(subdir_path, f"{os.path.basename(subdir_path)}.kreport.txt")
            kreport_df = pd.read_csv(output_kraken, sep='\t', header=None)
            kreport_dfs.append(kreport_df)
        else:
            # For real-time analysis when the folder already exists but holds no files yet
            print(f"Skipping empty subdirectory: {subdir}")
    # Combine all kreport.txt DataFrames into one DataFrame
    if not kreport_dfs:
        print("No Kraken2 reports were generated; nothing to combine.")
        return
    combined_kreport = pd.concat(kreport_dfs)

    # Duplicate the rows with the sample label replaced by "all"
    combined_kreport_all = combined_kreport.copy()
    combined_kreport_all[0] = "all"

    # Concatenate the original DataFrame and the relabelled copy
    combined_kreport_final = pd.concat([combined_kreport, combined_kreport_all])

    # Save the combined DataFrame under the dynamically chosen output file name
    combined_output_path = os.path.join(analysis_directory, output_file_name)
    combined_kreport_final.to_csv(combined_output_path, sep='\t', header=False, index=False)
    print(f"Kraken analysis results saved in {combined_output_path}")


if __name__ == "__main__":
    main()
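

# Usage sketch (an assumption of the intended setup: kraken2 must be on the
# PATH and the hard-coded database paths in main() must exist on the machine):
#
#   cd /path/to/run_directory   # contains fastq_pass/ and sequencing_summary_*.txt
#   python nanoporemet.py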