Skip to content

Commit 95c7d79

Browse files
committed
added ability to merge different introgression calls
1 parent 02fb392 commit 95c7d79

File tree

1 file changed

+74
-53
lines changed

1 file changed

+74
-53
lines changed

panagram/introgressions_group_based.py

+74-53
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def visualize(pair, output_file, inverse=False):
1616
y=pair.index,
1717
aspect="auto",
1818
zmin=0,
19-
zmax=1,
19+
# zmax=1,
2020
)
2121
else:
2222
fig = px.imshow(
@@ -83,6 +83,36 @@ def threshold_introgressions(pair, anchor, comp_group):
8383
return group_sims
8484

8585

86+
def threshold_to_bed(group_sims, bin_size, chr_name, comp_group, output_file):
87+
# find start/end coordinates
88+
introgressions = group_sims[group_sims.introgression > 0].copy()
89+
introgressions["start"] = introgressions.index
90+
introgressions["end"] = introgressions["start"] + bin_size
91+
92+
# call adjacent introgressions as the same
93+
# check if end of the prev col is the same as the start of the current
94+
introgressions["groups"] = (introgressions.start - introgressions.end.shift(1)).fillna(0)
95+
# use cumsum to assign adjacent intros to the same group
96+
introgressions["groups"] = introgressions["groups"].cumsum()
97+
# get the number of bins in each group and use to calculate new start/end
98+
bins_in_groups = introgressions.groupby("groups").count()["end"]
99+
introgressions = introgressions.drop_duplicates(subset="groups", keep="first")
100+
introgressions = introgressions.drop(columns="end").merge(bins_in_groups, on="groups")
101+
introgressions["end"] = introgressions.start + (introgressions.end * bin_size)
102+
103+
# write out a bed file of all found introgressions (chr, start, end)
104+
introgressions["chr"] = chr_name
105+
introgressions["name"] = f"{comp_group}_intro"
106+
introgressions = introgressions[["chr", "start", "end", "name"]]
107+
introgressions.to_csv(
108+
output_file,
109+
header=False,
110+
index=False,
111+
sep="\t",
112+
)
113+
return
114+
115+
86116
def run_introgression_finder(
87117
index,
88118
anchor,
@@ -113,45 +143,36 @@ def run_introgression_finder(
113143
# get the kmer similarities for the anchor's group and the comparison group
114144
pair = pair.merge(groups, left_index=True, right_index=True, how="left")
115145

116-
# TODO: for comp_group in comp_groups
146+
merged_sims = None
117147
for comp_group in comp_groups:
118148
group_sims = threshold_introgressions(pair, anchor, comp_group)
149+
if merged_sims is None:
150+
merged_sims = group_sims
151+
else:
152+
merged_sims += group_sims
119153

120154
# show user introgressions labeled on the original heatmap from panagram
121155
# invert values for figure so that introgressions = 0
122156
pair.loc["Intro. Score"] = (~(group_sims["introgression"].astype(bool))).astype(int)
123-
visualize(pair.drop(columns=["group"]), output_dir / f"{anchor}_{chr_name}_{comp_group}_heatmap.png", inverse=True)
124-
125-
print(comp_group)
126-
print(group_sims)
127-
exit(0)
128-
129-
# find start/end coordinates
130-
introgressions = group_sims[group_sims.introgression > 0].copy()
131-
introgressions["start"] = introgressions.index
132-
introgressions["end"] = introgressions["start"] + bin_size
133-
134-
# call adjacent introgressions as the same
135-
# check if end of the prev col is the same as the start of the current
136-
introgressions["groups"] = (introgressions.start - introgressions.end.shift(1)).fillna(0)
137-
# use cumsum to assign adjacent intros to the same group
138-
introgressions["groups"] = introgressions["groups"].cumsum()
139-
# get the number of bins in each group and use to calculate new start/end
140-
bins_in_groups = introgressions.groupby("groups").count()["end"]
141-
introgressions = introgressions.drop_duplicates(subset="groups", keep="first")
142-
introgressions = introgressions.drop(columns="end").merge(bins_in_groups, on="groups")
143-
introgressions["end"] = introgressions.start + (introgressions.end * bin_size)
144-
145-
# write out a bed file of all found introgressions (chr, start, end)
146-
introgressions["chr"] = chr_name
147-
introgressions["name"] = f"{comp_group}_intro"
148-
introgressions = introgressions[["chr", "start", "end", "name"]]
149-
introgressions.to_csv(
150-
output_dir / f"{anchor}_{chr_name}_{comp_group}.bed",
151-
header=False,
152-
index=False,
153-
sep="\t",
154-
)
157+
output_vis = output_dir / f"{anchor}_{chr_name}_{comp_group}_heatmap.png"
158+
visualize(pair.drop(columns=["group"]), output_vis, inverse=True)
159+
160+
# save introgressions to bed file
161+
output_bed = output_dir / f"{anchor}_{chr_name}_{comp_group}.bed"
162+
threshold_to_bed(group_sims, bin_size, chr_name, comp_group, output_bed)
163+
164+
# repeat analysis one more with merged introgressions from all comp_groups
165+
# pair.loc["Intro. Score"] = (~(merged_sims["introgression"].astype(bool))).astype(int)
166+
167+
# divide by the max to get between 0 and 1, do 1 - x to invert
168+
pair.loc["Intro. Score"] = 1 - (merged_sims["introgression"] / merged_sims["introgression"].max())
169+
170+
output_vis = output_dir / f"{anchor}_{chr_name}_merged_heatmap.png"
171+
visualize(pair.drop(columns=["group"]), output_vis, inverse=True)
172+
173+
# save introgressions to bed file
174+
output_bed = output_dir / f"{anchor}_{chr_name}_merged.bed"
175+
threshold_to_bed(merged_sims, bin_size, chr_name, comp_group, output_bed)
155176
return
156177

157178

@@ -169,25 +190,10 @@ def main():
169190
index = Index(index_dir)
170191

171192
# For testing with tomato pangenome
172-
for anchor in ["M82"]:
173-
genome = index.genomes[anchor]
174-
print(genome.sizes.keys())
175-
for chr_name in ["chr11", "chr4"]:
176-
print("Now running introgression analysis for", anchor, chr_name)
177-
run_introgression_finder(
178-
index,
179-
anchor,
180-
chr_name,
181-
group_tsv,
182-
comp_groups,
183-
bitmap_step,
184-
bin_size,
185-
output_dir,
186-
)
187-
188-
# for anchor in index.genomes.keys():
193+
# for anchor in ["M82"]:
189194
# genome = index.genomes[anchor]
190-
# for chr_name in genome.sizes.keys():
195+
# print(genome.sizes.keys())
196+
# for chr_name in ["chr11", "chr4"]:
191197
# print("Now running introgression analysis for", anchor, chr_name)
192198
# run_introgression_finder(
193199
# index,
@@ -199,7 +205,22 @@ def main():
199205
# bin_size,
200206
# output_dir,
201207
# )
202-
# NOTE: if anchor not in SLL group, we can skip
208+
209+
for anchor in index.genomes.keys():
210+
genome = index.genomes[anchor]
211+
for chr_name in genome.sizes.keys():
212+
print("Now running introgression analysis for", anchor, chr_name)
213+
run_introgression_finder(
214+
index,
215+
anchor,
216+
chr_name,
217+
group_tsv,
218+
comp_groups,
219+
bitmap_step,
220+
bin_size,
221+
output_dir,
222+
)
223+
# NOTE: if anchor not in SLL group, we could skip
203224
return
204225

205226

0 commit comments

Comments
 (0)