Skip to content

Commit 86056c6

Browse files
committed
added visualization script for introgressions; reorg of some funcs in postprocess_introgressions.py
1 parent 280ce78 commit 86056c6

File tree

3 files changed

+177
-57
lines changed

3 files changed

+177
-57
lines changed

panagram/postprocess_introgressions.py

+20-54
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
from pathlib import Path
1414
import subprocess
15-
import math
1615
import numpy as np
1716
import pandas as pd
1817
from panagram.index import Index
@@ -264,27 +263,37 @@ def read_introgressions(introgression_file, fix_names=False):
264263
return intro_df
265264

266265

267-
def score_introgressions(called_intro_file, gt_intro_file, threshold):
268-
# get confusion matrix for introgressions given ground truth in the same coordinate/bin space
266+
def threshold_introgressions_helper(intro_df, threshold):
    """Binarize a table of introgression scores in place.

    Every cell that is below ``threshold`` -- or missing (NaN) -- becomes 0,
    and every remaining non-zero cell becomes 1.

    Args:
        intro_df: pandas DataFrame of numeric scores. It is modified in place.
        threshold: minimum score a cell needs in order to be kept as a 1.

    Returns:
        The same ``intro_df`` object, so call sites may either re-assign the
        result or rely on the in-place modification.
    """
    # ``~(x >= threshold)`` rather than ``x < threshold``: any comparison with
    # NaN is False, so a plain ``<`` would skip missing cells and the ``!= 0``
    # step below would then promote them to 1 (a spurious introgression call).
    intro_df[~(intro_df >= threshold)] = 0
    intro_df[intro_df != 0] = 1
    return intro_df
270+
271+
272+
def threshold_introgressions(called_intro_file, gt_intro_file, threshold):
    """Load called and ground-truth introgressions and binarize both to 0/1.

    Args:
        called_intro_file: path of the called-introgression table.
        gt_intro_file: path of one ground-truth table, or a list/tuple of
            paths. When several are given, each table is thresholded on its
            own and the results are merged: a cell is 1 if it is 1 in any table.
        threshold: cutoff applied to the ground-truth scores.

    Returns:
        Tuple ``(called_intro_df, gt_intro_df)`` of binarized DataFrames.

    Raises:
        ValueError: if ``gt_intro_file`` is an empty list/tuple.
    """
    # if given multiple gt files, merge together
    if isinstance(gt_intro_file, (list, tuple)):
        if not gt_intro_file:
            raise ValueError("No ground-truth introgression files were given.")

        gt_intro_df = None
        for gt_file in gt_intro_file:
            # threshold each intro type before merging so that every table
            # contributes only 0/1 values to the sum
            gt_part = threshold_introgressions_helper(
                read_introgressions(gt_file, fix_names=True), threshold
            )
            gt_intro_df = gt_part if gt_intro_df is None else gt_intro_df + gt_part

        # a cell called in more than one table sums to > 1; clamp back to 1
        gt_intro_df[gt_intro_df > 0] = 1
    else:
        gt_intro_df = threshold_introgressions_helper(
            read_introgressions(gt_intro_file, fix_names=True), threshold
        )

    # make sure all called introgressions are scored as a 1
    called_intro_df = threshold_introgressions_helper(
        read_introgressions(called_intro_file), threshold=1
    )
    return called_intro_df, gt_intro_df
290+
291+
292+
def score_introgressions(called_intro_file, gt_intro_file, threshold):
293+
# get confusion matrix for introgressions given ground truth in the same coordinate/bin space
294+
called_intro_df, gt_intro_df = threshold_introgressions(
295+
called_intro_file, gt_intro_file, threshold
296+
)
288297

289298
# rotate dfs, sort cols, and drop all columns that aren't shared btwn called and gt
290299
shared_cols = list(set(called_intro_df.index).intersection(set(gt_intro_df.index)))
@@ -326,49 +335,6 @@ def score_introgressions(called_intro_file, gt_intro_file, threshold):
326335
return metrics
327336

328337

329-
def create_heatmap(distances_file):
330-
import plotly.express as px
331-
332-
distances_file = Path(distances_file)
333-
output_file = distances_file.parent / (distances_file.stem + ".png")
334-
distances = pd.read_csv(distances_file, sep="\t", index_col=0).fillna(0)
335-
new_index = list(distances.index)
336-
337-
# for sl4
338-
# new_index = [i.split(".")[0] for i in new_index]
339-
# for sl5 - get the name of the acession without number added by Jasmine
340-
new_index = [i.split("_")[1] for i in new_index]
341-
342-
distances.index = new_index
343-
distances = distances.sort_index()
344-
345-
fig = px.imshow(distances, color_continuous_scale="Greens", aspect="auto", zmin=0, zmax=1)
346-
fig.update_layout(yaxis=dict(tickmode="linear"), title=distances_file.stem)
347-
fig.write_image(output_file)
348-
return
349-
350-
351-
def self_dotplot(seq):
352-
import numpy as np
353-
import matplotlib.pyplot as plt
354-
355-
# look at a 10kb piece of a sequence
356-
middle = int(len(seq) / 2)
357-
seq = seq[middle - 5000 : middle + 5000]
358-
length = len(seq)
359-
360-
matrix = np.zeros((length, length))
361-
362-
for i in range(length):
363-
for j in range(length):
364-
if seq[i] == seq[j]:
365-
matrix[i, j] = 1
366-
367-
plt.imshow(matrix, cmap="Greys", interpolation="none")
368-
plt.savefig("./test.png")
369-
return
370-
371-
372338
def score_all_introgressions():
373339
# NOTE: change parameters here
374340
index_dir = Path("/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4")

panagram/view.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,9 @@ def view(params):
198198
children=[
199199
# left figure
200200
dcc.Graph(
201-
id="chromosome", config=config, style={"font-size": 20, "height": 350}
201+
id="chromosome",
202+
config=config,
203+
style={"font-size": 20, "height": 350},
202204
)
203205
],
204206
)
@@ -210,7 +212,9 @@ def view(params):
210212
className="w3-container",
211213
children=[
212214
dcc.Graph(
213-
id="primary", config=config, style={"height": 1000, "font-size": 20}
215+
id="primary",
216+
config=config,
217+
style={"height": 1000, "font-size": 20},
214218
)
215219
],
216220
)
@@ -888,7 +892,15 @@ def get_local_info(bar_sum_regional, anchor_name, chrs):
888892
return fig
889893

890894
def plot_interactive(
891-
anchor_name, chrom, start_coord, end_coord, step, bitmap, pancounts, paircounts, genes
895+
anchor_name,
896+
chrom,
897+
start_coord,
898+
end_coord,
899+
step,
900+
bitmap,
901+
pancounts,
902+
paircounts,
903+
genes,
892904
):
893905
t0 = time.perf_counter()
894906

panagram/visualize_introgressions.py

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
from pathlib import Path
2+
import numpy as np
3+
import pandas as pd
4+
import plotly.express as px
5+
from postprocess_introgressions import threshold_introgressions
6+
7+
8+
def self_dotplot(seq, window=10000, output_file="./test.png"):
    """Save a self-versus-self dot plot of the middle of a sequence.

    Args:
        seq: sequence to plot; anything sliceable whose items can be compared
            for equality (e.g. a ``str``).
        window: number of positions taken around the midpoint of ``seq``
            (``window // 2`` on each side). Defaults to 10kb.
        output_file: path the image is written to.
    """
    import matplotlib.pyplot as plt

    # look at a window-sized (10kb by default) piece from the middle of the sequence
    middle = len(seq) // 2
    half = window // 2
    # Clamp at 0: for a sequence shorter than the window a negative start
    # would wrap around and select only the tail of the sequence.
    start = max(middle - half, 0)
    symbols = np.array(list(seq[start : middle + half]))

    # matrix[i, j] is 1 where positions i and j hold the same symbol; the
    # broadcast comparison replaces a Python-level double loop.
    matrix = (symbols[:, None] == symbols[None, :]).astype(np.uint8)

    plt.imshow(matrix, cmap="Greys", interpolation="none")
    plt.savefig(output_file)
    return
26+
27+
28+
def create_heatmap(distances_file):
    """Render a tab-separated table as a green heatmap PNG.

    The image is written next to the input file, named after the input's stem
    with a ``.png`` extension. The colour range is fixed to 0..1.
    """
    source = Path(distances_file)
    image_path = source.parent / (source.stem + ".png")

    # first column is the row index; missing cells are plotted as 0
    table = pd.read_csv(source, sep="\t", index_col=0)
    table = table.fillna(0)

    # NOTE: accession-name clean-up is currently switched off. Variants that
    # were used before: for sl4, keep the text before the first "."; for sl5,
    # keep the second "_"-separated field (the accession name without the
    # number added by Jasmine).
    table = table.sort_index()

    figure = px.imshow(table, color_continuous_scale="Greens", aspect="auto", zmin=0, zmax=1)
    figure.update_layout(yaxis=dict(tickmode="linear"), title=source.stem)
    figure.write_image(image_path)
46+
47+
48+
def create_scored_heatmap(called_intro_file, gt_intro_file, threshold):
    """Plot called introgressions colour-coded by agreement with ground truth.

    The image is written next to ``called_intro_file`` as ``<stem>.scored.png``.

    Args:
        called_intro_file: path of the called-introgression table.
        gt_intro_file: ground-truth path, or a list/tuple of paths, passed
            through to ``threshold_introgressions``.
        threshold: cutoff passed through to ``threshold_introgressions``.
    """
    # numeric labels for each outcome; their order must match the colour list
    false_neg, false_pos, true_neg, true_pos = 2, 3, 4, 5

    called_intro_file = Path(called_intro_file)
    output_file = called_intro_file.parent / (called_intro_file.stem + ".scored.png")

    # threshold and rename the accessions to match
    called_intro_df, gt_intro_df = threshold_introgressions(
        called_intro_file, gt_intro_file, threshold
    )

    # only visualize the accessions that are shared btwn both files
    shared_accessions = list(set(called_intro_df.index) & set(gt_intro_df.index))
    called_intro_df = called_intro_df[called_intro_df.index.isin(shared_accessions)].sort_index()
    gt_intro_df = gt_intro_df[gt_intro_df.index.isin(shared_accessions)].sort_index()

    # classify every position before relabelling anything, so that no mask
    # depends on labels written by an earlier step
    is_true_pos = (called_intro_df == 1) & (gt_intro_df == 1)
    is_true_neg = (called_intro_df == 0) & (gt_intro_df == 0)
    is_false_pos = (called_intro_df == 1) & (gt_intro_df == 0)
    is_false_neg = (called_intro_df == 0) & (gt_intro_df == 1)

    # label all positions as green for TP, yellow for FP, red for FN, gray for TN
    called_intro_df[is_true_pos] = true_pos
    called_intro_df[is_true_neg] = true_neg
    called_intro_df[is_false_pos] = false_pos
    called_intro_df[is_false_neg] = false_neg

    # visualize; zmin/zmax pin the scale to the full label range so each
    # label keeps its colour even when some outcome is absent from the data
    fig = px.imshow(
        called_intro_df,
        color_continuous_scale=["red", "yellow", "gray", "green"],
        aspect="auto",
        zmin=false_neg,
        zmax=true_pos,
    )
    fig.update_layout(
        coloraxis_showscale=False, yaxis=dict(tickmode="linear"), title=output_file.stem
    )
    fig.write_image(output_file)
    return
86+
87+
88+
def create_heatmap_runner():
    """Run ``create_heatmap`` on every ``chr*.txt`` file in the configured folder."""
    # NOTE: change folder here
    postprocessed_dir = Path(
        "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4/introgression_analysis_v2/postprocessed"
    )

    # collect the matches first, then visit them one by one
    for distances_path in list(postprocessed_dir.glob("chr*.txt")):
        print(f"Visualizing {distances_path.name}")
        create_heatmap(distances_path)
99+
100+
101+
def create_scored_heatmap_runner():
    """Create a scored heatmap for every called chromosome file.

    Called files and ground-truth files are paired by sorted file name, so
    both folders are expected to hold one file per chromosome with matching
    name ordering.

    Raises:
        ValueError: if ``introgression_type`` is not one of "SP", "SLC",
            "merged" or "REF", or if the numbers of files to be paired differ.
    """
    # NOTE: change folders and parameters here
    called_intros_folder = Path(
        "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4/introgression_analysis_v2/postprocessed"
    )
    gt_intros_folder = Path(
        "/home/nbrown62/data_mschatz1/nbrown62/CallIntrogressions_data/tomato_sl4_paper"
    )
    introgression_type = "REF"
    threshold = 0.5

    # get gt and called intro files, sorted so chromosomes line up by position
    called_intros_files = sorted(called_intros_folder.glob(f"chr*{introgression_type}.txt"))
    gt_intros_files_sp = sorted(gt_intros_folder.glob("chr*SP.txt"))
    gt_intros_files_slc = sorted(gt_intros_folder.glob("chr*SLC.txt"))

    if introgression_type == "SP":
        gt_intros_files = gt_intros_files_sp
    elif introgression_type == "SLC":
        gt_intros_files = gt_intros_files_slc
    elif introgression_type in ("merged", "REF"):
        # merge files together: one (SLC, SP) pair per chromosome.
        # zip() would silently drop unmatched files, so check the counts first.
        if len(gt_intros_files_slc) != len(gt_intros_files_sp):
            raise ValueError("Unequal numbers of SLC and SP GT chromosome files...")
        gt_intros_files = list(zip(gt_intros_files_slc, gt_intros_files_sp))
    else:
        raise ValueError(f"Unknown introgression type: {introgression_type!r}")

    # make sure there are equal numbers of files
    if len(called_intros_files) != len(gt_intros_files):
        raise ValueError("Unequal numbers of GT and called chromosome files...")

    # pass tuples of files to create_scored_heatmap for each chromosome
    for called_intros_file, gt_intros_file in zip(called_intros_files, gt_intros_files):
        create_scored_heatmap(called_intros_file, gt_intros_file, threshold=threshold)
    return
138+
139+
140+
if __name__ == "__main__":
    # Script entry point: draw the scored heatmaps. To draw the plain
    # heatmaps instead, call create_heatmap_runner() here.
    create_scored_heatmap_runner()

0 commit comments

Comments
 (0)