Skip to content

Commit ebf3d1f

Browse files
committed
adding group-based intro caller
1 parent d5ba147 commit ebf3d1f

File tree

1 file changed

+152
-0
lines changed

1 file changed

+152
-0
lines changed
+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
from pathlib import Path
2+
import pandas as pd
3+
from panagram.index import Index
4+
import plotly.express as px
5+
6+
7+
def better_dir(item):
8+
# don't print hidden functions
9+
methods = dir(item)
10+
return [method for method in methods if not method.startswith("_")]
11+
12+
13+
def visualize(pair, output_file, inverse=False):
14+
# take a look at what pair looks like after manipulation
15+
# pair[pair >= 1] = 10
16+
if inverse:
17+
fig = px.imshow(
18+
pair,
19+
color_continuous_scale=px.colors.sequential.Greens[
20+
::-1
21+
], # px.colors.sequential.Plasma[::-1],
22+
x=pair.columns,
23+
y=pair.index,
24+
aspect="auto",
25+
zmin=0,
26+
zmax=1,
27+
)
28+
else:
29+
fig = px.imshow(
30+
pair,
31+
x=pair.columns,
32+
y=pair.index,
33+
aspect="auto",
34+
)
35+
fig.update_layout(
36+
xaxis=dict(
37+
dtick=2000000,
38+
),
39+
)
40+
fig.write_image(output_file)
41+
return
42+
43+
44+
def merge_adjacent():
45+
return
46+
47+
48+
def run_introgression_finder(
49+
index,
50+
anchor,
51+
chr_name,
52+
group_tsv,
53+
comp_group,
54+
bitmap_step,
55+
bin_size,
56+
output_dir,
57+
):
58+
# Step 1 - choose an anchor and re-create pairwise correlation matrix for it
59+
output_dir = Path(output_dir)
60+
output_dir.mkdir(parents=True, exist_ok=True)
61+
genome = index.genomes[anchor]
62+
groups = pd.read_csv(group_tsv, sep="\t", index_col=0)
63+
64+
# get an entire chr's bitmap
65+
chr_size = genome.sizes[chr_name]
66+
chr_bitmap = genome.query(chr_name, 0, chr_size, step=bitmap_step)
67+
68+
# get correlation matrix
69+
_, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
70+
71+
# show the original heatmap of kmer similarities that panagram shows
72+
# deeper green = more kmer dissimilarity
73+
# visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)
74+
75+
# get the kmer similarities for the anchor's group and the comparison group
76+
pair = pair.merge(groups, left_index=True, right_index=True, how='left')
77+
anchor_group = pair.loc[anchor, "group"] # get the group the anchor belongs to
78+
# make sure anchor's self-similarity gets dropped
79+
pair_anchor_group = pair[pair["group"] == anchor_group].drop(columns=["group"]).drop(anchor, axis=0)
80+
pair_comp_group = pair[pair["group"] == comp_group].drop(columns=["group"])
81+
82+
# get mean similarities per window for each group
83+
group_sims = pair_anchor_group.mean(axis=0).to_frame(name="anchor_sim")
84+
group_sims["comp_sim"] = pair_comp_group.mean(axis=0)
85+
group_sims["introgression"] = (group_sims.comp_sim >= group_sims.anchor_sim)
86+
87+
# show user introgressions labeled on the original heatmap from panagram
88+
pair = pair.drop(columns = ["group"])
89+
pair.loc["Intro?"] = (~group_sims["introgression"]).astype(int)
90+
visualize(pair, output_dir / f"{anchor}_{chr_name}_{comp_group}_heatmap.png", inverse=True)
91+
92+
# find start/end coordinates by merging adjacent introgressions
93+
introgressions = group_sims[group_sims.introgression > 0].copy()
94+
introgressions['start'] = introgressions.index
95+
introgressions['end'] = introgressions['start'] + bin_size
96+
97+
print(introgressions)
98+
99+
# write out a bed file of all found introgressions (chr, start, end)
100+
101+
exit()
102+
return
103+
104+
105+
def main():
106+
# USER PARAMS
107+
bitmap_step = 100
108+
bin_size = 1000000
109+
index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4"
110+
group_tsv = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4/group.tsv"
111+
comp_group = "SP"
112+
output_dir = Path(
113+
"/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4/introgression_analysis_v2/"
114+
)
115+
output_dir.mkdir(parents=True, exist_ok=True)
116+
index = Index(index_dir)
117+
118+
# For testing with tomato pangenome
119+
for anchor in ["SL4"]:
120+
genome = index.genomes[anchor]
121+
print(genome.sizes.keys())
122+
for chr_name in ["chr11", "chr4"]:
123+
for comp_group in ["SLC", "SP"]:
124+
print("Now running introgression analysis for", anchor, chr_name, comp_group)
125+
run_introgression_finder(
126+
index,
127+
anchor,
128+
chr_name,
129+
group_tsv,
130+
comp_group,
131+
bitmap_step,
132+
bin_size,
133+
output_dir,
134+
)
135+
136+
# for anchor in index.genomes.keys():
137+
# genome = index.genomes[anchor]
138+
# for chr_name in genome.sizes.keys():
139+
# print("Now running introgression analysis for", anchor, chr_name)
140+
# run_introgression_finder(
141+
# index,
142+
# anchor,
143+
# chr_name,
144+
# bitmap_step,
145+
# bin_size,
146+
# output_dir,
147+
# )
148+
return
149+
150+
151+
if __name__ == "__main__":
152+
main()

0 commit comments

Comments
 (0)