@@ -16,7 +16,7 @@ def visualize(pair, output_file, inverse=False):
16
16
y = pair .index ,
17
17
aspect = "auto" ,
18
18
zmin = 0 ,
19
- zmax = 1 ,
19
+ # zmax=1,
20
20
)
21
21
else :
22
22
fig = px .imshow (
@@ -83,6 +83,36 @@ def threshold_introgressions(pair, anchor, comp_group):
83
83
return group_sims
84
84
85
85
86
+ def threshold_to_bed (group_sims , bin_size , chr_name , comp_group , output_file ):
87
+ # find start/end coordinates
88
+ introgressions = group_sims [group_sims .introgression > 0 ].copy ()
89
+ introgressions ["start" ] = introgressions .index
90
+ introgressions ["end" ] = introgressions ["start" ] + bin_size
91
+
92
+ # call adjacent introgressions as the same
93
+ # check if end of the prev col is the same as the start of the current
94
+ introgressions ["groups" ] = (introgressions .start - introgressions .end .shift (1 )).fillna (0 )
95
+ # use cumsum to assign adjacent intros to the same group
96
+ introgressions ["groups" ] = introgressions ["groups" ].cumsum ()
97
+ # get the number of bins in each group and use to calculate new start/end
98
+ bins_in_groups = introgressions .groupby ("groups" ).count ()["end" ]
99
+ introgressions = introgressions .drop_duplicates (subset = "groups" , keep = "first" )
100
+ introgressions = introgressions .drop (columns = "end" ).merge (bins_in_groups , on = "groups" )
101
+ introgressions ["end" ] = introgressions .start + (introgressions .end * bin_size )
102
+
103
+ # write out a bed file of all found introgressions (chr, start, end)
104
+ introgressions ["chr" ] = chr_name
105
+ introgressions ["name" ] = f"{ comp_group } _intro"
106
+ introgressions = introgressions [["chr" , "start" , "end" , "name" ]]
107
+ introgressions .to_csv (
108
+ output_file ,
109
+ header = False ,
110
+ index = False ,
111
+ sep = "\t " ,
112
+ )
113
+ return
114
+
115
+
86
116
def run_introgression_finder (
87
117
index ,
88
118
anchor ,
@@ -113,45 +143,36 @@ def run_introgression_finder(
113
143
# get the kmer similarities for the anchor's group and the comparison group
114
144
pair = pair .merge (groups , left_index = True , right_index = True , how = "left" )
115
145
116
- # TODO: for comp_group in comp_groups
146
+ merged_sims = None
117
147
for comp_group in comp_groups :
118
148
group_sims = threshold_introgressions (pair , anchor , comp_group )
149
+ if merged_sims is None :
150
+ merged_sims = group_sims
151
+ else :
152
+ merged_sims += group_sims
119
153
120
154
# show user introgressions labeled on the original heatmap from panagram
121
155
# invert values for figure so that introgressions = 0
122
156
pair .loc ["Intro. Score" ] = (~ (group_sims ["introgression" ].astype (bool ))).astype (int )
123
- visualize (pair .drop (columns = ["group" ]), output_dir / f"{ anchor } _{ chr_name } _{ comp_group } _heatmap.png" , inverse = True )
124
-
125
- print (comp_group )
126
- print (group_sims )
127
- exit (0 )
128
-
129
- # find start/end coordinates
130
- introgressions = group_sims [group_sims .introgression > 0 ].copy ()
131
- introgressions ["start" ] = introgressions .index
132
- introgressions ["end" ] = introgressions ["start" ] + bin_size
133
-
134
- # call adjacent introgressions as the same
135
- # check if end of the prev col is the same as the start of the current
136
- introgressions ["groups" ] = (introgressions .start - introgressions .end .shift (1 )).fillna (0 )
137
- # use cumsum to assign adjacent intros to the same group
138
- introgressions ["groups" ] = introgressions ["groups" ].cumsum ()
139
- # get the number of bins in each group and use to calculate new start/end
140
- bins_in_groups = introgressions .groupby ("groups" ).count ()["end" ]
141
- introgressions = introgressions .drop_duplicates (subset = "groups" , keep = "first" )
142
- introgressions = introgressions .drop (columns = "end" ).merge (bins_in_groups , on = "groups" )
143
- introgressions ["end" ] = introgressions .start + (introgressions .end * bin_size )
144
-
145
- # write out a bed file of all found introgressions (chr, start, end)
146
- introgressions ["chr" ] = chr_name
147
- introgressions ["name" ] = f"{ comp_group } _intro"
148
- introgressions = introgressions [["chr" , "start" , "end" , "name" ]]
149
- introgressions .to_csv (
150
- output_dir / f"{ anchor } _{ chr_name } _{ comp_group } .bed" ,
151
- header = False ,
152
- index = False ,
153
- sep = "\t " ,
154
- )
157
+ output_vis = output_dir / f"{ anchor } _{ chr_name } _{ comp_group } _heatmap.png"
158
+ visualize (pair .drop (columns = ["group" ]), output_vis , inverse = True )
159
+
160
+ # save introgressions to bed file
161
+ output_bed = output_dir / f"{ anchor } _{ chr_name } _{ comp_group } .bed"
162
+ threshold_to_bed (group_sims , bin_size , chr_name , comp_group , output_bed )
163
+
164
+ # repeat analysis one more with merged introgressions from all comp_groups
165
+ # pair.loc["Intro. Score"] = (~(merged_sims["introgression"].astype(bool))).astype(int)
166
+
167
+ # divide by the max to get between 0 and 1, do 1 - x to invert
168
+ pair .loc ["Intro. Score" ] = 1 - (merged_sims ["introgression" ] / merged_sims ["introgression" ].max ())
169
+
170
+ output_vis = output_dir / f"{ anchor } _{ chr_name } _merged_heatmap.png"
171
+ visualize (pair .drop (columns = ["group" ]), output_vis , inverse = True )
172
+
173
+ # save introgressions to bed file
174
+ output_bed = output_dir / f"{ anchor } _{ chr_name } _merged.bed"
175
+ threshold_to_bed (merged_sims , bin_size , chr_name , comp_group , output_bed )
155
176
return
156
177
157
178
@@ -169,25 +190,10 @@ def main():
169
190
index = Index (index_dir )
170
191
171
192
# For testing with tomato pangenome
172
- for anchor in ["M82" ]:
173
- genome = index .genomes [anchor ]
174
- print (genome .sizes .keys ())
175
- for chr_name in ["chr11" , "chr4" ]:
176
- print ("Now running introgression analysis for" , anchor , chr_name )
177
- run_introgression_finder (
178
- index ,
179
- anchor ,
180
- chr_name ,
181
- group_tsv ,
182
- comp_groups ,
183
- bitmap_step ,
184
- bin_size ,
185
- output_dir ,
186
- )
187
-
188
- # for anchor in index.genomes.keys():
193
+ # for anchor in ["M82"]:
189
194
# genome = index.genomes[anchor]
190
- # for chr_name in genome.sizes.keys():
195
+ # print(genome.sizes.keys())
196
+ # for chr_name in ["chr11", "chr4"]:
191
197
# print("Now running introgression analysis for", anchor, chr_name)
192
198
# run_introgression_finder(
193
199
# index,
@@ -199,7 +205,22 @@ def main():
199
205
# bin_size,
200
206
# output_dir,
201
207
# )
202
- # NOTE: if anchor not in SLL group, we can skip
208
+
209
+ for anchor in index .genomes .keys ():
210
+ genome = index .genomes [anchor ]
211
+ for chr_name in genome .sizes .keys ():
212
+ print ("Now running introgression analysis for" , anchor , chr_name )
213
+ run_introgression_finder (
214
+ index ,
215
+ anchor ,
216
+ chr_name ,
217
+ group_tsv ,
218
+ comp_groups ,
219
+ bitmap_step ,
220
+ bin_size ,
221
+ output_dir ,
222
+ )
223
+ # NOTE: if anchor not in SLL group, we could skip
203
224
return
204
225
205
226
0 commit comments