Skip to content

Commit 0eb5014

Browse files
committed
saving prototype introgression code
1 parent 2e0c935 commit 0eb5014

File tree

3 files changed

+378
-0
lines changed

3 files changed

+378
-0
lines changed

panagram/experimentation.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import sys
2+
import os
3+
import os.path
4+
from os import path
5+
import subprocess
6+
import numpy as np
7+
import pandas as pd
8+
import bgzip
9+
import gzip
10+
import csv
11+
import glob
12+
import pysam
13+
from collections import defaultdict, Counter
14+
from time import time
15+
from Bio import bgzf, SeqIO
16+
import yaml
17+
import multiprocessing as mp
18+
from types import SimpleNamespace
19+
import shutil
20+
import snakemake
21+
import re
22+
import logging
23+
from panagram.index import Index
24+
25+
26+
# let's try some toy examples of manipulating a bitmap
27+
28+
# import numpy as np
29+
# arrs = dict()
30+
# arr = np.array([3,4,5])
31+
# step = 0
32+
33+
# # for step,arr in byte_arrs.items():
34+
# arrs[step] = np.concatenate(arr, axis=1)
35+
# arrs[step] = np.concatenate(np.array([4,5,6,7]), axis=1)
36+
37+
def better_dir(item):
38+
# don't print hidden functions
39+
methods = dir(item)
40+
return [method for method in methods if not method.startswith("_")]
41+
42+
# Reading in an index - you can do this after running snakemake
43+
index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram/example_data"
44+
index = Index(index_dir)
45+
print(index)
46+
47+
# Gives you something like Index(input='samples.tsv', mode=None, prefix='', k=21, cores=1, lowres_step=100, max_bin_kbp=200, min_bin_count=100, max_view_chrs=50, gff_gene_types=['gene'], gff_anno_types=None, gff_name='Name', anchor_genomes=['ecoli', 'ecoli_k12', 'klebsiella', 'salmonella', 'shigella'], prepare=False, kmc=KMC(memory=8, threads=1, use_existing=False), use_existing=1, threads=1, memory=1)
48+
print(index.anchor_genomes)
49+
50+
# All available methods for Index class
51+
print(better_dir(index))
52+
53+
# Get a genome class - they are listed by their names in a dictionary
54+
print(index.genome_names[0])
55+
ecoli_genome = index.genomes[index.genome_names[0]]
56+
57+
# All available methods for Genome class
58+
print(better_dir(ecoli_genome))
59+
60+
# pandas df of chromosome names, sizes, and gene count
61+
# might tell you which chrs are most interesting
62+
chromosome_info = ecoli_genome.chrs
63+
print(chromosome_info)
64+
65+
# bitmaps themselves are compressed objects until accessed
66+
print(ecoli_genome.bitmaps)
67+
68+
# Get part of the bitmap from the genome
69+
# Query a chromosome at a certain position to get a piece of the bitmap
70+
# TODO: Not sure what step does yet? Sum of kmer occurrence across a window of size step?
71+
ecoli_bitmap = ecoli_genome.query('NZ_CP015023.1', 0, 5506781, step=1)
72+
73+
74+
# You can also query using the index itself
75+
# print(index.query_bitmap(ecoli_genome, chrom, start=None, end=None, step=1))
76+
77+
# what if you wanted to look at intergressions
78+
# set a threshold for what counts as an intergression
79+
threshold = 0.75 # 75% co-occurrence across the pangenome
80+
ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap.sum(axis=1)
81+
ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap["frac_co_occurrence"] / len(index.genomes)
82+
print(ecoli_bitmap)
83+
84+
# the index on the left (+ starting point index) gives you the indices of intergressions
85+
print(ecoli_bitmap[ecoli_bitmap["frac_co_occurrence"] >= threshold])
86+
87+
# in this example, there are 196262 indices (i.e., there is one at 34329 and one at 5465887)
88+
# TODO: merge nearby intergression locations into one
89+
# A run of length > run_threshold is a true intergression
90+
# TODO: how long are intergressions typically?
91+
# use a sliding window and take average co-occurance perhaps within a k-sized window
92+
93+
diff_threshold = 10000 # consider different intergression if more than n away?
94+
# index_diff = df['index'].diff().fillna(0)
95+

panagram/generate_corr_figure.py

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import sys
2+
import os
3+
# import os.path
4+
# from os import path
5+
from pathlib import Path
6+
import subprocess
7+
import numpy as np
8+
import pandas as pd
9+
import bgzip
10+
import gzip
11+
import csv
12+
import glob
13+
import pysam
14+
from collections import defaultdict, Counter
15+
from time import time
16+
from Bio import bgzf, SeqIO
17+
import yaml
18+
import multiprocessing as mp
19+
from types import SimpleNamespace
20+
import shutil
21+
import snakemake
22+
import re
23+
import logging
24+
from panagram.index import Index
25+
import plotly.express as px
26+
27+
28+
def better_dir(item):
29+
# don't print hidden functions
30+
methods = dir(item)
31+
return [method for method in methods if not method.startswith("_")]
32+
33+
34+
def visualize(pair, output_file, inverse=False):
35+
# take a look at what pair looks like after manipulation
36+
# pair[pair >= 1] = 10
37+
if inverse:
38+
fig = px.imshow(pair,
39+
color_continuous_scale=px.colors.sequential.Plasma[::-1],
40+
x=pair.columns,
41+
y=pair.index)
42+
else:
43+
fig = px.imshow(pair,
44+
x=pair.columns,
45+
y=pair.index)
46+
fig.write_image(output_file)
47+
return
48+
49+
index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato"
50+
anchor = "SL5" #"SL5"
51+
# chr_name = "BGV006775_MAS2.0ch11"
52+
output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
53+
54+
55+
output_dir = Path(output_dir)
56+
output_dir.mkdir(parents=True, exist_ok=True)
57+
index = Index(index_dir)
58+
genome = index.genomes[anchor]
59+
chrs = genome.sizes.keys()
60+
# # print(index.genomes)
61+
bitmap_step = 100
62+
max_chr_bins = 350
63+
k=31
64+
65+
for chr_name in chrs:
66+
# get an entire chr's bitmap
67+
chr_size = genome.sizes[chr_name]
68+
chr_bitmap = genome.query(chr_name, 0, chr_size, step=bitmap_step)
69+
70+
# get correlation matrix
71+
start_coord = 0
72+
end_coord = chr_size
73+
bin_size = ((end_coord - start_coord) // max_chr_bins) + 1
74+
num_kmers_in_bin = bin_size - k + 1
75+
76+
pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
77+
78+
visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)

panagram/introgressions.py

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
import sys
2+
import os
3+
# import os.path
4+
# from os import path
5+
from pathlib import Path
6+
import subprocess
7+
import numpy as np
8+
import pandas as pd
9+
import bgzip
10+
import gzip
11+
import csv
12+
import glob
13+
import pysam
14+
from collections import defaultdict, Counter
15+
from time import time
16+
from Bio import bgzf, SeqIO
17+
import yaml
18+
import multiprocessing as mp
19+
from types import SimpleNamespace
20+
import shutil
21+
import snakemake
22+
import re
23+
import logging
24+
from panagram.index import Index
25+
import plotly.express as px
26+
27+
28+
def better_dir(item):
29+
# don't print hidden functions
30+
methods = dir(item)
31+
return [method for method in methods if not method.startswith("_")]
32+
33+
34+
def visualize(pair, output_file, inverse=False):
35+
# take a look at what pair looks like after manipulation
36+
# pair[pair >= 1] = 10
37+
if inverse:
38+
fig = px.imshow(pair,
39+
color_continuous_scale=px.colors.sequential.Plasma[::-1],
40+
x=pair.columns,
41+
y=pair.index)
42+
else:
43+
fig = px.imshow(pair,
44+
x=pair.columns,
45+
y=pair.index)
46+
fig.write_image(output_file)
47+
return
48+
49+
50+
def fill_gaps(row, rounds=2):
51+
for j in range(rounds):
52+
# Find gaps of 0s surrounded by 1s
53+
for i in range(1, len(row) - 1):
54+
# TODO: if previous row and following row do not share identities, don't add together
55+
if row[i] == 0 and (row[i - 1] >= 1 and row[i + 1] >= 1):
56+
row[i] = (row[i - 1] + row[i + 1]) / 2
57+
return row
58+
59+
def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir):
60+
# Step 1 - choose an anchor and re-create pairwise correlation matrix for it
61+
# kmer size is 20-30ish; kmer at position X starts at position X (not centered at position X)
62+
# there are multiple positions in a bin; there are <bin size> - k + 1 kmers in a bin
63+
# default: bin_size = ((end_coord - start_coord) // max_chr_bins) + 1; max_chr_bins = 350; step = 100
64+
output_dir = Path(output_dir)
65+
output_dir.mkdir(parents=True, exist_ok=True)
66+
67+
# index = Index(index_dir)
68+
genome = index.genomes[anchor]
69+
# # print(index.genomes)
70+
# # print(genome.sizes)
71+
72+
# get an entire chr's bitmap
73+
chr_size = genome.sizes[chr_name]
74+
chr_bitmap = genome.query(chr_name, 0, chr_size, step=bitmap_step)
75+
76+
# get correlation matrix
77+
start_coord = 0
78+
end_coord = chr_size
79+
bin_size = ((end_coord - start_coord) // max_chr_bins) + 1
80+
num_kmers_in_bin = bin_size - k + 1
81+
82+
print("# Positions in a bin", bin_size)
83+
print("# Kmers in a bin", num_kmers_in_bin)
84+
85+
pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
86+
visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)
87+
88+
# # sanity check
89+
# print(len(pair.columns))
90+
91+
# Step 2 - slide through the matrix one genome at a time and calculate average pairwise correlation
92+
# introgression - area of yellow in panagram where all other accessions/samples are less similar to anchor
93+
# assumes other accessions are related variants without the introgression
94+
# sliding window size based on size of the matrix - 5% of the size? alternatively based on bps
95+
# threshold maybe around <=0.6; if average within threshold, mark all locations within window as part of introgression
96+
97+
# TODO: decide btwn using the mean/max of outliers to set threshold
98+
# Or using q3; could increase the threshold and use redundancy as an additional metric for deciding
99+
values = pair.values.flatten()
100+
q1 = np.percentile(values, 25)
101+
q3 = np.percentile(values, 75)
102+
iqr = q3 - q1
103+
lower_bound = q1 - 1.5 * iqr
104+
outliers = values[values < lower_bound]
105+
106+
if len(outliers) > 0:
107+
dissimilarity_threshold = outliers.max()
108+
else:
109+
dissimilarity_threshold = 0.6
110+
print("Chosen dissimilarity threshold", dissimilarity_threshold)
111+
112+
# Histogram of correlations
113+
fig = px.histogram(values, title='Histogram of Correlation Values')
114+
fig.add_vline(x=dissimilarity_threshold, line_width=2, line_dash="dash", line_color="red", annotation_text="Threshold", annotation_position="top left")
115+
fig.write_image(output_dir / f"{anchor}_{chr_name}_corr_hist.png")
116+
117+
# NOTE: if there are no outliers, or threshold is very high, there are likely no introgressions
118+
# test for this scenario
119+
120+
# flip matrix so we can use pandas rolling operation
121+
transposed_pair = pair.transpose().drop(columns=[anchor])
122+
# print(transposed_pair)
123+
124+
# convert to binary array by applying dissimilarity threshold
125+
transposed_pair[transposed_pair > dissimilarity_threshold] = 0
126+
transposed_pair[transposed_pair != 0] = 1
127+
128+
# Step 3 - create an introgression score for each position based on total dissimilarity
129+
transposed_pair["introgression_score"] = transposed_pair.sum(axis=1)
130+
# transposed_pair["genomes"] = transposed_pair.apply(lambda row: {col for col in transposed_pair.columns if row[col] > 0}, axis=1)
131+
# print(transposed_pair)
132+
133+
# combine nearby locations as the same introgression
134+
transposed_pair["introgression_score"] = fill_gaps(transposed_pair["introgression_score"].values)#, transposed_pair["genomes"].values)
135+
visualize(transposed_pair.transpose(), output_dir / f"{anchor}_{chr_name}_introgressions_heatmap.png")
136+
137+
# Step 4 - report, sorted by size and then by score
138+
transposed_pair['introgression_starts'] = (transposed_pair['introgression_score'] != 0) & (transposed_pair['introgression_score'].shift(1, fill_value=0) == 0)
139+
140+
# Create a group identifier for each set of non-zeros
141+
transposed_pair['introgression_group'] = transposed_pair['introgression_starts'].cumsum()
142+
# sum of all introgression scores
143+
total_introgression_scores = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group')['introgression_score'].mean()
144+
# end index for finding introgression length in bps
145+
last_indices = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group').tail(1).index.values
146+
147+
introgressions = transposed_pair[transposed_pair.introgression_starts == True].copy()
148+
introgressions["introgression_score"] = total_introgression_scores.values
149+
introgressions["introgression_end"] = last_indices
150+
introgressions["introgression_end"] = introgressions["introgression_end"] + bin_size # account for the fact that the index is the start and not the end of a bin
151+
introgressions = introgressions[["introgression_end", "introgression_score"]].reset_index().rename(columns={"index":"introgression_start"})
152+
introgressions["introgression_length"] = introgressions["introgression_end"] - introgressions["introgression_start"]
153+
print(introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False))
154+
introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False).to_csv(output_dir / f"{anchor}_{chr_name}_introgressions.csv", index=False)
155+
# NOTE: don't need this loop anymore; df operations are faster
156+
# introgression_locations = {}
157+
# for location in transposed_pair["introgression_score"]:
158+
# print(location)
159+
# if bin >= 1 and not in_introgression:
160+
# # start new introgression
161+
# in_introgression = True
162+
# introgression_locations = [locatio]
163+
# elif bin >= 1 and in_introgression:
164+
# # append
165+
# else:
166+
# in_introgression = False
167+
168+
# TODO: turn into function that can repeat across all chrs/all genomes if requested
169+
# Look at underlying sequence in found introgressions; compare/cluster? align with annotations?
170+
# we're using dissimilarity to find them though, so by definition, won't these mostly look unique?
171+
return
172+
173+
174+
# USER PARAMS
175+
index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato"
176+
# anchor = "BGV006775_MAS2" #"SL5"
177+
# chr_name = "BGV006775_MAS2.0ch11"
178+
bitmap_step = 100
179+
max_chr_bins = 350
180+
size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression
181+
k = 31 # TODO: k should be defined somewhere else; don't need from the user
182+
output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
183+
184+
index = Index(index_dir)
185+
186+
# For testing
187+
# for anchor in ["SL5"]:#index.genomes.keys():
188+
# genome = index.genomes[anchor]
189+
# print(genome.sizes.keys())
190+
# for chr_name in [11]:#genome.sizes.keys():
191+
# print(anchor, chr_name)
192+
# run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
193+
# break
194+
# break
195+
196+
for anchor in index.genomes.keys():
197+
genome = index.genomes[anchor]
198+
print(genome.sizes.keys())
199+
for chr_name in genome.sizes.keys():
200+
print(anchor, chr_name)
201+
run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
202+
203+
# TODO: column for each position with a python set of genomes contributing to the introgression score
204+
# column for each position with a python set of genomes that may share the same introgression (similarity above threshold)
205+
# only merge adjacent areas where the set difference is <=2

0 commit comments

Comments
 (0)