Skip to content

Commit da53114

Browse files
committed
added introgression genomes of origin; linting
1 parent 0eb5014 commit da53114

File tree

3 files changed

+163
-94
lines changed

3 files changed

+163
-94
lines changed

panagram/experimentation.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,13 @@
3434
# arrs[step] = np.concatenate(arr, axis=1)
3535
# arrs[step] = np.concatenate(np.array([4,5,6,7]), axis=1)
3636

37+
3738
def better_dir(item):
3839
# don't print hidden functions
3940
methods = dir(item)
4041
return [method for method in methods if not method.startswith("_")]
4142

43+
4244
# Reading in an index - you can do this after running snakemake
4345
index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram/example_data"
4446
index = Index(index_dir)
@@ -68,17 +70,19 @@ def better_dir(item):
6870
# Get part of the bitmap from the genome
6971
# Query a chromosome at a certain position to get a piece of the bitmap
7072
# TODO: Not sure what step does yet? Sum of kmer occurrence across a window of size step?
71-
ecoli_bitmap = ecoli_genome.query('NZ_CP015023.1', 0, 5506781, step=1)
73+
ecoli_bitmap = ecoli_genome.query("NZ_CP015023.1", 0, 5506781, step=1)
7274

7375

7476
# You can also query using the index itself
7577
# print(index.query_bitmap(ecoli_genome, chrom, start=None, end=None, step=1))
7678

7779
# what if you wanted to look at intergressions
7880
# set a threshold for what counts as an intergression
79-
threshold = 0.75 # 75% co-occurrence across the pangenome
81+
threshold = 0.75 # 75% co-occurrence across the pangenome
8082
ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap.sum(axis=1)
81-
ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap["frac_co_occurrence"] / len(index.genomes)
83+
ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap["frac_co_occurrence"] / len(
84+
index.genomes
85+
)
8286
print(ecoli_bitmap)
8387

8488
# the index on the left (+ starting point index) gives you the indices of intergressions
@@ -90,6 +94,5 @@ def better_dir(item):
9094
# TODO: how long are intergressions typically?
9195
# use a sliding window and take average co-occurance perhaps within a k-sized window
9296

93-
diff_threshold = 10000 # consider different intergression if more than n away?
97+
diff_threshold = 10000 # consider different intergression if more than n away?
9498
# index_diff = df['index'].diff().fillna(0)
95-

panagram/generate_corr_figure.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import sys
22
import os
3+
34
# import os.path
45
# from os import path
56
from pathlib import Path
@@ -35,19 +36,20 @@ def visualize(pair, output_file, inverse=False):
3536
# take a look at what pair looks like after manipulation
3637
# pair[pair >= 1] = 10
3738
if inverse:
38-
fig = px.imshow(pair,
39-
color_continuous_scale=px.colors.sequential.Plasma[::-1],
40-
x=pair.columns,
41-
y=pair.index)
39+
fig = px.imshow(
40+
pair,
41+
color_continuous_scale=px.colors.sequential.Plasma[::-1],
42+
x=pair.columns,
43+
y=pair.index,
44+
)
4245
else:
43-
fig = px.imshow(pair,
44-
x=pair.columns,
45-
y=pair.index)
46+
fig = px.imshow(pair, x=pair.columns, y=pair.index)
4647
fig.write_image(output_file)
4748
return
4849

50+
4951
index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato"
50-
anchor = "SL5" #"SL5"
52+
anchor = "SL5" # "SL5"
5153
# chr_name = "BGV006775_MAS2.0ch11"
5254
output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
5355

@@ -60,7 +62,7 @@ def visualize(pair, output_file, inverse=False):
6062
# # print(index.genomes)
6163
bitmap_step = 100
6264
max_chr_bins = 350
63-
k=31
65+
k = 31
6466

6567
for chr_name in chrs:
6668
# get an entire chr's bitmap
@@ -75,4 +77,6 @@ def visualize(pair, output_file, inverse=False):
7577

7678
pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
7779

78-
visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)
80+
visualize(
81+
pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True
82+
)

panagram/introgressions.py

+141-79
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,7 @@
1-
import sys
2-
import os
3-
# import os.path
4-
# from os import path
51
from pathlib import Path
6-
import subprocess
2+
import math
73
import numpy as np
84
import pandas as pd
9-
import bgzip
10-
import gzip
11-
import csv
12-
import glob
13-
import pysam
14-
from collections import defaultdict, Counter
15-
from time import time
16-
from Bio import bgzf, SeqIO
17-
import yaml
18-
import multiprocessing as mp
19-
from types import SimpleNamespace
20-
import shutil
21-
import snakemake
22-
import re
23-
import logging
245
from panagram.index import Index
256
import plotly.express as px
267

@@ -35,19 +16,20 @@ def visualize(pair, output_file, inverse=False):
3516
# take a look at what pair looks like after manipulation
3617
# pair[pair >= 1] = 10
3718
if inverse:
38-
fig = px.imshow(pair,
39-
color_continuous_scale=px.colors.sequential.Plasma[::-1],
40-
x=pair.columns,
41-
y=pair.index)
19+
fig = px.imshow(
20+
pair,
21+
color_continuous_scale=px.colors.sequential.Plasma[::-1],
22+
x=pair.columns,
23+
y=pair.index,
24+
)
4225
else:
43-
fig = px.imshow(pair,
44-
x=pair.columns,
45-
y=pair.index)
26+
fig = px.imshow(pair, x=pair.columns, y=pair.index)
4627
fig.write_image(output_file)
4728
return
4829

4930

5031
def fill_gaps(row, rounds=2):
32+
row = row.values
5133
for j in range(rounds):
5234
# Find gaps of 0s surrounded by 1s
5335
for i in range(1, len(row) - 1):
@@ -56,7 +38,17 @@ def fill_gaps(row, rounds=2):
5638
row[i] = (row[i - 1] + row[i + 1]) / 2
5739
return row
5840

59-
def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir):
41+
42+
def run_introgression_finder(
43+
index,
44+
anchor,
45+
chr_name,
46+
bitmap_step,
47+
max_chr_bins,
48+
k,
49+
set_difference_threshold,
50+
output_dir,
51+
):
6052
# Step 1 - choose an anchor and re-create pairwise correlation matrix for it
6153
# kmer size is 20-30ish; kmer at position X starts at position X (not centered at position X)
6254
# there are multiple positions in a bin; there are <bin size> - k + 1 kmers in a bin
@@ -83,7 +75,9 @@ def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins,
8375
print("# Kmers in a bin", num_kmers_in_bin)
8476

8577
pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
86-
visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)
78+
visualize(
79+
pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True
80+
)
8781

8882
# # sanity check
8983
# print(len(pair.columns))
@@ -110,64 +104,121 @@ def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins,
110104
print("Chosen dissimilarity threshold", dissimilarity_threshold)
111105

112106
# Histogram of correlations
113-
fig = px.histogram(values, title='Histogram of Correlation Values')
114-
fig.add_vline(x=dissimilarity_threshold, line_width=2, line_dash="dash", line_color="red", annotation_text="Threshold", annotation_position="top left")
107+
fig = px.histogram(values, title="Histogram of Correlation Values")
108+
fig.add_vline(
109+
x=dissimilarity_threshold,
110+
line_width=2,
111+
line_dash="dash",
112+
line_color="red",
113+
annotation_text="Threshold",
114+
annotation_position="top left",
115+
)
115116
fig.write_image(output_dir / f"{anchor}_{chr_name}_corr_hist.png")
116117

117118
# NOTE: if there are no outliers, or threshold is very high, there are likely no introgressions
118119
# test for this scenario
119120

120121
# flip matrix so we can use pandas rolling operation
121122
transposed_pair = pair.transpose().drop(columns=[anchor])
123+
pangenome_names = list(transposed_pair.columns) # save for reference later
122124
# print(transposed_pair)
123125

124126
# convert to binary array by applying dissimilarity threshold
125127
transposed_pair[transposed_pair > dissimilarity_threshold] = 0
126128
transposed_pair[transposed_pair != 0] = 1
127129

128130
# Step 3 - create an introgression score for each position based on total dissimilarity
129-
transposed_pair["introgression_score"] = transposed_pair.sum(axis=1)
130-
# transposed_pair["genomes"] = transposed_pair.apply(lambda row: {col for col in transposed_pair.columns if row[col] > 0}, axis=1)
131-
# print(transposed_pair)
132-
133-
# combine nearby locations as the same introgression
134-
transposed_pair["introgression_score"] = fill_gaps(transposed_pair["introgression_score"].values)#, transposed_pair["genomes"].values)
135-
visualize(transposed_pair.transpose(), output_dir / f"{anchor}_{chr_name}_introgressions_heatmap.png")
131+
# for each genome combine nearby locations as the same introgression - column by column
132+
transposed_pair = transposed_pair.apply(fill_gaps)
133+
134+
# For each position, figure out which subset of genomes share an introgression at the position
135+
transposed_pair["genomes"] = transposed_pair.apply(
136+
lambda row: {col for col in pangenome_names if row[col] > 0}, axis=1
137+
)
138+
# Calculate set difference with the next row and store the length
139+
transposed_pair["genomes_overlap"] = (
140+
transposed_pair["genomes"]
141+
.shift(1)
142+
.combine(
143+
transposed_pair["genomes"],
144+
lambda next_row, current_row: (
145+
len(current_row & (next_row or set()))
146+
/ len(current_row | (next_row or set()))
147+
if current_row | (next_row or set())
148+
else 0
149+
),
150+
)
151+
)
152+
153+
# introgression score - counter of number of genomes that share the introgression
154+
# TODO: normalize by number of genomes in the pangenome?
155+
# transposed_pair["introgression_score"] = transposed_pair[pangenome_names].sum(axis=1)
156+
# transposed_pair["introgression_score"] = fill_gaps(transposed_pair["introgression_score"].values, transposed_pair["genomes"].values, set_difference_threshold)
157+
158+
transposed_pair["introgression_score"] = transposed_pair["genomes"].apply(len)
159+
visualize(
160+
transposed_pair[pangenome_names + ["introgression_score"]].transpose(),
161+
output_dir / f"{anchor}_{chr_name}_introgressions_heatmap.png",
162+
)
163+
pair.loc["intro_score"] = 1 - (
164+
transposed_pair["introgression_score"].values
165+
/ max(transposed_pair["introgression_score"].values)
166+
)
167+
visualize(
168+
pair, output_dir / f"{anchor}_{chr_name}_hybrid_heatmap.png", inverse=True
169+
)
136170

137171
# Step 4 - report, sorted by size and then by score
138-
transposed_pair['introgression_starts'] = (transposed_pair['introgression_score'] != 0) & (transposed_pair['introgression_score'].shift(1, fill_value=0) == 0)
172+
# transposed_pair['introgression_starts'] = (transposed_pair['introgression_score'] != 0) & (transposed_pair['introgression_score'].shift(1, fill_value=0) == 0)
173+
transposed_pair["introgression_starts"] = (
174+
(transposed_pair["genomes_overlap"] > 0)
175+
& (transposed_pair["genomes_overlap"] < 0.9)
176+
) | (
177+
(transposed_pair["genomes_overlap"] == 0)
178+
& (transposed_pair["introgression_score"] == 1)
179+
)
139180

140181
# Create a group identifier for each set of non-zeros
141-
transposed_pair['introgression_group'] = transposed_pair['introgression_starts'].cumsum()
182+
transposed_pair["introgression_group"] = transposed_pair[
183+
"introgression_starts"
184+
].cumsum()
185+
142186
# sum of all introgression scores
143-
total_introgression_scores = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group')['introgression_score'].mean()
187+
introgression_groupby = transposed_pair[
188+
transposed_pair["introgression_score"] != 0
189+
].groupby("introgression_group")
190+
total_introgression_scores = introgression_groupby["introgression_score"].mean()
144191
# end index for finding introgression length in bps
145-
last_indices = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group').tail(1).index.values
192+
last_indices = introgression_groupby.tail(1).index.values
193+
# genomes that introgression belongs to (union of all sets in the group)
194+
introgression_genomes = (
195+
introgression_groupby["genomes"].agg(lambda sets: set.union(*sets)).values
196+
)
197+
198+
introgressions = transposed_pair[
199+
transposed_pair.introgression_starts == True
200+
].copy()
146201

147-
introgressions = transposed_pair[transposed_pair.introgression_starts == True].copy()
202+
# print(transposed_pair[transposed_pair['introgression_score'] != 0][transposed_pair.columns[10:20]])
148203
introgressions["introgression_score"] = total_introgression_scores.values
149204
introgressions["introgression_end"] = last_indices
150-
introgressions["introgression_end"] = introgressions["introgression_end"] + bin_size # account for the fact that the index is the start and not the end of a bin
151-
introgressions = introgressions[["introgression_end", "introgression_score"]].reset_index().rename(columns={"index":"introgression_start"})
152-
introgressions["introgression_length"] = introgressions["introgression_end"] - introgressions["introgression_start"]
153-
print(introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False))
154-
introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False).to_csv(output_dir / f"{anchor}_{chr_name}_introgressions.csv", index=False)
155-
# NOTE: don't need this loop anymore; df operations are faster
156-
# introgression_locations = {}
157-
# for location in transposed_pair["introgression_score"]:
158-
# print(location)
159-
# if bin >= 1 and not in_introgression:
160-
# # start new introgression
161-
# in_introgression = True
162-
# introgression_locations = [locatio]
163-
# elif bin >= 1 and in_introgression:
164-
# # append
165-
# else:
166-
# in_introgression = False
167-
168-
# TODO: turn into function that can repeat across all chrs/all genomes if requested
169-
# Look at underlying sequence in found introgressions; compare/cluster? align with annotations?
170-
# we're using dissimilarity to find them though, so by definition, won't these mostly look unique?
205+
introgressions["introgression_end"] = (
206+
introgressions["introgression_end"] + bin_size
207+
) # account for the fact that the index is the start and not the end of a bin
208+
introgressions = (
209+
introgressions[["introgression_end", "introgression_score"]]
210+
.reset_index()
211+
.rename(columns={"index": "introgression_start"})
212+
)
213+
introgressions["introgression_length"] = (
214+
introgressions["introgression_end"] - introgressions["introgression_start"]
215+
)
216+
introgressions["introgression_genomes"] = introgression_genomes
217+
print(introgressions.sort_values(by=["introgression_start"], ascending=True))
218+
introgressions.sort_values(by=["introgression_start"], ascending=True).to_csv(
219+
output_dir / f"{anchor}_{chr_name}_introgressions.csv", index=False
220+
)
221+
171222
return
172223

173224

@@ -177,29 +228,40 @@ def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins,
177228
# chr_name = "BGV006775_MAS2.0ch11"
178229
bitmap_step = 100
179230
max_chr_bins = 350
180-
size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression
181-
k = 31 # TODO: k should be defined somewhere else; don't need from the user
231+
size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression
232+
k = 31 # TODO: k should be defined somewhere else; don't need from the user
182233
output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
183234

184235
index = Index(index_dir)
236+
set_difference_threshold = 2 # NOTE: could also make this proportionate to pangenome size int(len(index.genomes) / 10)
185237

186-
# For testing
187-
# for anchor in ["SL5"]:#index.genomes.keys():
188-
# genome = index.genomes[anchor]
189-
# print(genome.sizes.keys())
190-
# for chr_name in [11]:#genome.sizes.keys():
191-
# print(anchor, chr_name)
192-
# run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
193-
# break
194-
# break
195-
196-
for anchor in index.genomes.keys():
238+
# For testing with tomato pangenome
239+
for anchor in ["SL5"]:
197240
genome = index.genomes[anchor]
198241
print(genome.sizes.keys())
199-
for chr_name in genome.sizes.keys():
242+
for chr_name in [11]:
200243
print(anchor, chr_name)
201-
run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
244+
run_introgression_finder(
245+
index,
246+
anchor,
247+
chr_name,
248+
bitmap_step,
249+
max_chr_bins,
250+
k,
251+
set_difference_threshold,
252+
output_dir,
253+
)
254+
break
255+
break
256+
257+
# for anchor in index.genomes.keys():
258+
# genome = index.genomes[anchor]
259+
# for chr_name in genome.sizes.keys():
260+
# print("Now running introgression analysis for", anchor, chr_name)
261+
# run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
202262

203263
# TODO: column for each position with a python set of genomes contributing to the introgression score
204264
# column for each position with a python set of genomes that may share the same introgression (similarity above threshold)
205265
# only merge adjacent areas where the set difference is <=2
266+
# TODO: Look at underlying sequence in found introgressions; compare/cluster? align with annotations?
267+
# we're using dissimilarity to find them though, so by definition, won't these mostly look unique?

0 commit comments

Comments
 (0)