1
- import sys
2
- import os
3
- # import os.path
4
- # from os import path
5
1
from pathlib import Path
6
- import subprocess
2
+ import math
7
3
import numpy as np
8
4
import pandas as pd
9
- import bgzip
10
- import gzip
11
- import csv
12
- import glob
13
- import pysam
14
- from collections import defaultdict , Counter
15
- from time import time
16
- from Bio import bgzf , SeqIO
17
- import yaml
18
- import multiprocessing as mp
19
- from types import SimpleNamespace
20
- import shutil
21
- import snakemake
22
- import re
23
- import logging
24
5
from panagram .index import Index
25
6
import plotly .express as px
26
7
@@ -35,19 +16,20 @@ def visualize(pair, output_file, inverse=False):
35
16
# take a look at what pair looks like after manipulation
36
17
# pair[pair >= 1] = 10
37
18
if inverse :
38
- fig = px .imshow (pair ,
39
- color_continuous_scale = px .colors .sequential .Plasma [::- 1 ],
40
- x = pair .columns ,
41
- y = pair .index )
19
+ fig = px .imshow (
20
+ pair ,
21
+ color_continuous_scale = px .colors .sequential .Plasma [::- 1 ],
22
+ x = pair .columns ,
23
+ y = pair .index ,
24
+ )
42
25
else :
43
- fig = px .imshow (pair ,
44
- x = pair .columns ,
45
- y = pair .index )
26
+ fig = px .imshow (pair , x = pair .columns , y = pair .index )
46
27
fig .write_image (output_file )
47
28
return
48
29
49
30
50
31
def fill_gaps (row , rounds = 2 ):
32
+ row = row .values
51
33
for j in range (rounds ):
52
34
# Find gaps of 0s surrounded by 1s
53
35
for i in range (1 , len (row ) - 1 ):
@@ -56,7 +38,17 @@ def fill_gaps(row, rounds=2):
56
38
row [i ] = (row [i - 1 ] + row [i + 1 ]) / 2
57
39
return row
58
40
59
- def run_introgression_finder (index , anchor , chr_name , bitmap_step , max_chr_bins , k , output_dir ):
41
+
42
+ def run_introgression_finder (
43
+ index ,
44
+ anchor ,
45
+ chr_name ,
46
+ bitmap_step ,
47
+ max_chr_bins ,
48
+ k ,
49
+ set_difference_threshold ,
50
+ output_dir ,
51
+ ):
60
52
# Step 1 - choose an anchor and re-create pairwise correlation matrix for it
61
53
# kmer size is 20-30ish; kmer at position X starts at position X (not centered at position X)
62
54
# there are multiple positions in a bin; there are <bin size> - k + 1 kmers in a bin
@@ -83,7 +75,9 @@ def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins,
83
75
print ("# Kmers in a bin" , num_kmers_in_bin )
84
76
85
77
pan , pair = index .bitmap_to_bins (chr_bitmap , bin_size )
86
- visualize (pair , output_dir / f"{ anchor } _{ chr_name } _original_heatmap.png" , inverse = True )
78
+ visualize (
79
+ pair , output_dir / f"{ anchor } _{ chr_name } _original_heatmap.png" , inverse = True
80
+ )
87
81
88
82
# # sanity check
89
83
# print(len(pair.columns))
@@ -110,64 +104,121 @@ def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins,
110
104
print ("Chosen dissimilarity threshold" , dissimilarity_threshold )
111
105
112
106
# Histogram of correlations
113
- fig = px .histogram (values , title = 'Histogram of Correlation Values' )
114
- fig .add_vline (x = dissimilarity_threshold , line_width = 2 , line_dash = "dash" , line_color = "red" , annotation_text = "Threshold" , annotation_position = "top left" )
107
+ fig = px .histogram (values , title = "Histogram of Correlation Values" )
108
+ fig .add_vline (
109
+ x = dissimilarity_threshold ,
110
+ line_width = 2 ,
111
+ line_dash = "dash" ,
112
+ line_color = "red" ,
113
+ annotation_text = "Threshold" ,
114
+ annotation_position = "top left" ,
115
+ )
115
116
fig .write_image (output_dir / f"{ anchor } _{ chr_name } _corr_hist.png" )
116
117
117
118
# NOTE: if there are no outliers, or threshold is very high, there are likely no introgressions
118
119
# test for this scenario
119
120
120
121
# flip matrix so we can use pandas rolling operation
121
122
transposed_pair = pair .transpose ().drop (columns = [anchor ])
123
+ pangenome_names = list (transposed_pair .columns ) # save for reference later
122
124
# print(transposed_pair)
123
125
124
126
# convert to binary array by applying dissimilarity threshold
125
127
transposed_pair [transposed_pair > dissimilarity_threshold ] = 0
126
128
transposed_pair [transposed_pair != 0 ] = 1
127
129
128
130
# Step 3 - create an introgression score for each position based on total dissimilarity
129
- transposed_pair ["introgression_score" ] = transposed_pair .sum (axis = 1 )
130
- # transposed_pair["genomes"] = transposed_pair.apply(lambda row: {col for col in transposed_pair.columns if row[col] > 0}, axis=1)
131
- # print(transposed_pair)
132
-
133
- # combine nearby locations as the same introgression
134
- transposed_pair ["introgression_score" ] = fill_gaps (transposed_pair ["introgression_score" ].values )#, transposed_pair["genomes"].values)
135
- visualize (transposed_pair .transpose (), output_dir / f"{ anchor } _{ chr_name } _introgressions_heatmap.png" )
131
+ # for each genome combine nearby locations as the same introgression - column by column
132
+ transposed_pair = transposed_pair .apply (fill_gaps )
133
+
134
+ # For each position, figure out which subset of genomes share an introgression at the position
135
+ transposed_pair ["genomes" ] = transposed_pair .apply (
136
+ lambda row : {col for col in pangenome_names if row [col ] > 0 }, axis = 1
137
+ )
138
+ # Calculate set difference with the next row and store the length
139
+ transposed_pair ["genomes_overlap" ] = (
140
+ transposed_pair ["genomes" ]
141
+ .shift (1 )
142
+ .combine (
143
+ transposed_pair ["genomes" ],
144
+ lambda next_row , current_row : (
145
+ len (current_row & (next_row or set ()))
146
+ / len (current_row | (next_row or set ()))
147
+ if current_row | (next_row or set ())
148
+ else 0
149
+ ),
150
+ )
151
+ )
152
+
153
+ # introgression score - counter of number of genomes that share the introgression
154
+ # TODO: normalize by number of genomes in the pangenome?
155
+ # transposed_pair["introgression_score"] = transposed_pair[pangenome_names].sum(axis=1)
156
+ # transposed_pair["introgression_score"] = fill_gaps(transposed_pair["introgression_score"].values, transposed_pair["genomes"].values, set_difference_threshold)
157
+
158
+ transposed_pair ["introgression_score" ] = transposed_pair ["genomes" ].apply (len )
159
+ visualize (
160
+ transposed_pair [pangenome_names + ["introgression_score" ]].transpose (),
161
+ output_dir / f"{ anchor } _{ chr_name } _introgressions_heatmap.png" ,
162
+ )
163
+ pair .loc ["intro_score" ] = 1 - (
164
+ transposed_pair ["introgression_score" ].values
165
+ / max (transposed_pair ["introgression_score" ].values )
166
+ )
167
+ visualize (
168
+ pair , output_dir / f"{ anchor } _{ chr_name } _hybrid_heatmap.png" , inverse = True
169
+ )
136
170
137
171
# Step 4 - report, sorted by size and then by score
138
- transposed_pair ['introgression_starts' ] = (transposed_pair ['introgression_score' ] != 0 ) & (transposed_pair ['introgression_score' ].shift (1 , fill_value = 0 ) == 0 )
172
+ # transposed_pair['introgression_starts'] = (transposed_pair['introgression_score'] != 0) & (transposed_pair['introgression_score'].shift(1, fill_value=0) == 0)
173
+ transposed_pair ["introgression_starts" ] = (
174
+ (transposed_pair ["genomes_overlap" ] > 0 )
175
+ & (transposed_pair ["genomes_overlap" ] < 0.9 )
176
+ ) | (
177
+ (transposed_pair ["genomes_overlap" ] == 0 )
178
+ & (transposed_pair ["introgression_score" ] == 1 )
179
+ )
139
180
140
181
# Create a group identifier for each set of non-zeros
141
- transposed_pair ['introgression_group' ] = transposed_pair ['introgression_starts' ].cumsum ()
182
+ transposed_pair ["introgression_group" ] = transposed_pair [
183
+ "introgression_starts"
184
+ ].cumsum ()
185
+
142
186
# sum of all introgression scores
143
- total_introgression_scores = transposed_pair [transposed_pair ['introgression_score' ] != 0 ].groupby ('introgression_group' )['introgression_score' ].mean ()
187
+ introgression_groupby = transposed_pair [
188
+ transposed_pair ["introgression_score" ] != 0
189
+ ].groupby ("introgression_group" )
190
+ total_introgression_scores = introgression_groupby ["introgression_score" ].mean ()
144
191
# end index for finding introgression length in bps
145
- last_indices = transposed_pair [transposed_pair ['introgression_score' ] != 0 ].groupby ('introgression_group' ).tail (1 ).index .values
192
+ last_indices = introgression_groupby .tail (1 ).index .values
193
+ # genomes that introgression belongs to (union of all sets in the group)
194
+ introgression_genomes = (
195
+ introgression_groupby ["genomes" ].agg (lambda sets : set .union (* sets )).values
196
+ )
197
+
198
+ introgressions = transposed_pair [
199
+ transposed_pair .introgression_starts == True
200
+ ].copy ()
146
201
147
- introgressions = transposed_pair [transposed_pair . introgression_starts == True ]. copy ( )
202
+ # print( transposed_pair[transposed_pair['introgression_score'] != 0][transposed_pair.columns[10:20]] )
148
203
introgressions ["introgression_score" ] = total_introgression_scores .values
149
204
introgressions ["introgression_end" ] = last_indices
150
- introgressions ["introgression_end" ] = introgressions ["introgression_end" ] + bin_size # account for the fact that the index is the start and not the end of a bin
151
- introgressions = introgressions [["introgression_end" , "introgression_score" ]].reset_index ().rename (columns = {"index" :"introgression_start" })
152
- introgressions ["introgression_length" ] = introgressions ["introgression_end" ] - introgressions ["introgression_start" ]
153
- print (introgressions .sort_values (by = ["introgression_length" , "introgression_score" ], ascending = False ))
154
- introgressions .sort_values (by = ["introgression_length" , "introgression_score" ], ascending = False ).to_csv (output_dir / f"{ anchor } _{ chr_name } _introgressions.csv" , index = False )
155
- # NOTE: don't need this loop anymore; df operations are faster
156
- # introgression_locations = {}
157
- # for location in transposed_pair["introgression_score"]:
158
- # print(location)
159
- # if bin >= 1 and not in_introgression:
160
- # # start new introgression
161
- # in_introgression = True
162
- # introgression_locations = [locatio]
163
- # elif bin >= 1 and in_introgression:
164
- # # append
165
- # else:
166
- # in_introgression = False
167
-
168
- # TODO: turn into function that can repeat across all chrs/all genomes if requested
169
- # Look at underlying sequence in found introgressions; compare/cluster? align with annotations?
170
- # we're using dissimilarity to find them though, so by definition, won't these mostly look unique?
205
+ introgressions ["introgression_end" ] = (
206
+ introgressions ["introgression_end" ] + bin_size
207
+ ) # account for the fact that the index is the start and not the end of a bin
208
+ introgressions = (
209
+ introgressions [["introgression_end" , "introgression_score" ]]
210
+ .reset_index ()
211
+ .rename (columns = {"index" : "introgression_start" })
212
+ )
213
+ introgressions ["introgression_length" ] = (
214
+ introgressions ["introgression_end" ] - introgressions ["introgression_start" ]
215
+ )
216
+ introgressions ["introgression_genomes" ] = introgression_genomes
217
+ print (introgressions .sort_values (by = ["introgression_start" ], ascending = True ))
218
+ introgressions .sort_values (by = ["introgression_start" ], ascending = True ).to_csv (
219
+ output_dir / f"{ anchor } _{ chr_name } _introgressions.csv" , index = False
220
+ )
221
+
171
222
return
172
223
173
224
@@ -177,29 +228,40 @@ def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins,
177
228
# chr_name = "BGV006775_MAS2.0ch11"
178
229
bitmap_step = 100
179
230
max_chr_bins = 350
180
- size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression
181
- k = 31 # TODO: k should be defined somewhere else; don't need from the user
231
+ size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression
232
+ k = 31 # TODO: k should be defined somewhere else; don't need from the user
182
233
output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
183
234
184
235
index = Index (index_dir )
236
+ set_difference_threshold = 2 # NOTE: could also make this proportionate to pangenome size int(len(index.genomes) / 10)
185
237
186
- # For testing
187
- # for anchor in ["SL5"]:#index.genomes.keys():
188
- # genome = index.genomes[anchor]
189
- # print(genome.sizes.keys())
190
- # for chr_name in [11]:#genome.sizes.keys():
191
- # print(anchor, chr_name)
192
- # run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
193
- # break
194
- # break
195
-
196
- for anchor in index .genomes .keys ():
238
+ # For testing with tomato pangenome
239
+ for anchor in ["SL5" ]:
197
240
genome = index .genomes [anchor ]
198
241
print (genome .sizes .keys ())
199
- for chr_name in genome . sizes . keys () :
242
+ for chr_name in [ 11 ] :
200
243
print (anchor , chr_name )
201
- run_introgression_finder (index , anchor , chr_name , bitmap_step , max_chr_bins , k , output_dir )
244
+ run_introgression_finder (
245
+ index ,
246
+ anchor ,
247
+ chr_name ,
248
+ bitmap_step ,
249
+ max_chr_bins ,
250
+ k ,
251
+ set_difference_threshold ,
252
+ output_dir ,
253
+ )
254
+ break
255
+ break
256
+
257
+ # for anchor in index.genomes.keys():
258
+ # genome = index.genomes[anchor]
259
+ # for chr_name in genome.sizes.keys():
260
+ # print("Now running introgression analysis for", anchor, chr_name)
261
+ # run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
202
262
203
263
# TODO: column for each position with a python set of genomes contributing to the introgression score
204
264
# column for each position with a python set of genomes that may share the same introgression (similarity above threshold)
205
265
# only merge adjacent areas where the set difference is <=2
266
+ # TODO: Look at underlying sequence in found introgressions; compare/cluster? align with annotations?
267
+ # we're using dissimilarity to find them though, so by definition, won't these mostly look unique?
0 commit comments