saving prototype introgression code

nlbrown62 · nlbrown62 · commit 0eb5014f29b4 · 2024-12-10T15:43:24.000-05:00
diff --git a/panagram/experimentation.py b/panagram/experimentation.py
@@ -0,0 +1,95 @@
+import sys
+import os
+import os.path
+from os import path
+import subprocess
+import numpy as np
+import pandas as pd
+import bgzip
+import gzip
+import csv
+import glob
+import pysam
+from collections import defaultdict, Counter
+from time import time
+from Bio import bgzf, SeqIO
+import yaml
+import multiprocessing as mp
+from types import SimpleNamespace
+import shutil
+import snakemake
+import re
+import logging
+from panagram.index import Index
+
+
+# let's try some toy examples of manipulating a bitmap
+
+# import numpy as np
+# arrs = dict()
+# arr = np.array([3,4,5])
+# step = 0
+
+# # for step,arr in byte_arrs.items():
+# arrs[step] = np.concatenate(arr, axis=1)
+# arrs[step] = np.concatenate(np.array([4,5,6,7]), axis=1)
+
+def better_dir(item):
+    # don't print hidden functions
+    methods = dir(item)
+    return [method for method in methods if not method.startswith("_")]
+
+# Exploratory walkthrough of the panagram Index / Genome API.
+# Reading in an index - you can do this after running snakemake
+index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram/example_data"
+index = Index(index_dir)
+print(index)
+
+# print(index) gives you something like Index(input='samples.tsv', mode=None, prefix='', k=21, cores=1, lowres_step=100, max_bin_kbp=200, min_bin_count=100, max_view_chrs=50, gff_gene_types=['gene'], gff_anno_types=None, gff_name='Name', anchor_genomes=['ecoli', 'ecoli_k12', 'klebsiella', 'salmonella', 'shigella'], prepare=False, kmc=KMC(memory=8, threads=1, use_existing=False), use_existing=1, threads=1, memory=1)
+print(index.anchor_genomes)
+
+# All public attribute names (methods and data) available on the Index instance
+print(better_dir(index))
+
+# Get a genome object - they are stored by name in the index.genomes dictionary
+print(index.genome_names[0])
+ecoli_genome = index.genomes[index.genome_names[0]]
+
+# All public attribute names (methods and data) available on the Genome instance
+print(better_dir(ecoli_genome))
+
+# pandas df of chromosome names, sizes, and gene count
+# might tell you which chrs are most interesting
+chromosome_info = ecoli_genome.chrs
+print(chromosome_info)
+
+# bitmaps themselves are compressed objects until accessed
+print(ecoli_genome.bitmaps)
+
+# Get part of the bitmap from the genome
+# Query a chromosome over a coordinate range to get a piece of the bitmap
+# (the chromosome name and end coordinate below are hard-coded for the example data)
+# TODO: Not sure what step does yet? Sum of kmer occurrence across a window of size step?
+ecoli_bitmap = ecoli_genome.query('NZ_CP015023.1', 0, 5506781, step=1)
+
+
+# You can also query using the index itself
+# print(index.query_bitmap(ecoli_genome, chrom, start=None, end=None, step=1))
+
+# what if you wanted to look at introgressions
+# set a threshold for what counts as an introgression
+# NOTE(review): the filter below keeps rows with HIGH co-occurrence (>= threshold);
+# confirm that this is the intended introgression signal
+threshold = 0.75 # 75% co-occurrence across the pangenome
+# row-wise sum over the bitmap columns, then divided by the number of genomes in the index
+# (assumes one bitmap column per genome - TODO confirm)
+ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap.sum(axis=1)
+ecoli_bitmap["frac_co_occurrence"] = ecoli_bitmap["frac_co_occurrence"] / len(index.genomes)
+print(ecoli_bitmap)
+
+# the index on the left (+ starting point index) gives you the indices of introgressions
+print(ecoli_bitmap[ecoli_bitmap["frac_co_occurrence"] >= threshold])
+
+# in this example, there are 196262 indices (i.e., there is one at 34329 and one at 5465887)
+# TODO: merge nearby introgression locations into one
+# A run of length > run_threshold is a true introgression
+# TODO: how long are introgressions typically?
+# use a sliding window and take average co-occurrence perhaps within a k-sized window
+
+# not used yet; intended for the merge step described in the TODO above
+diff_threshold = 10000 # consider different introgression if more than n away?
+# index_diff = df['index'].diff().fillna(0)
+
diff --git a/panagram/generate_corr_figure.py b/panagram/generate_corr_figure.py
@@ -0,0 +1,78 @@
+import sys
+import os
+# import os.path
+# from os import path
+from pathlib import Path
+import subprocess
+import numpy as np
+import pandas as pd
+import bgzip
+import gzip
+import csv
+import glob
+import pysam
+from collections import defaultdict, Counter
+from time import time
+from Bio import bgzf, SeqIO
+import yaml
+import multiprocessing as mp
+from types import SimpleNamespace
+import shutil
+import snakemake
+import re
+import logging
+from panagram.index import Index
+import plotly.express as px
+
+
+def better_dir(item):
+    # don't print hidden functions
+    methods = dir(item)
+    return [method for method in methods if not method.startswith("_")]
+
+
+def visualize(pair, output_file, inverse=False):
+    # take a look at what pair looks like after manipulation
+    # pair[pair >= 1] = 10
+    if inverse:
+        fig = px.imshow(pair,
+                        color_continuous_scale=px.colors.sequential.Plasma[::-1],
+                        x=pair.columns,
+                        y=pair.index)
+    else:
+        fig = px.imshow(pair,
+                        x=pair.columns,
+                        y=pair.index)
+    fig.write_image(output_file)
+    return
+
+# Script body: write one pairwise heatmap PNG per chromosome of the anchor genome.
+# Input index directory, anchor genome name and output directory are hard-coded here.
+index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato"
+anchor = "SL5" #"SL5"
+# chr_name = "BGV006775_MAS2.0ch11"
+output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
+
+
+# create the output directory (and any missing parents) before writing figures
+output_dir = Path(output_dir)
+output_dir.mkdir(parents=True, exist_ok=True)
+index = Index(index_dir)
+genome = index.genomes[anchor]
+# chromosome names of the anchor genome; genome.sizes maps name -> size
+chrs = genome.sizes.keys()
+# # print(index.genomes)
+bitmap_step = 100
+max_chr_bins = 350
+k=31
+
+for chr_name in chrs:
+    # get an entire chr's bitmap
+    chr_size = genome.sizes[chr_name]
+    chr_bitmap = genome.query(chr_name, 0, chr_size, step=bitmap_step)
+
+    # get correlation matrix
+    # bin_size is chosen so that the chromosome spans at most max_chr_bins bins
+    start_coord = 0
+    end_coord = chr_size
+    bin_size = ((end_coord - start_coord) // max_chr_bins) + 1
+    # NOTE: computed for reference only; nothing below in this loop reads it
+    num_kmers_in_bin = bin_size - k + 1
+
+    # only `pair` is plotted; `pan` is not used in this script
+    pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
+
+    visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)
diff --git a/panagram/introgressions.py b/panagram/introgressions.py
@@ -0,0 +1,205 @@
+import sys
+import os
+# import os.path
+# from os import path
+from pathlib import Path
+import subprocess
+import numpy as np
+import pandas as pd
+import bgzip
+import gzip
+import csv
+import glob
+import pysam
+from collections import defaultdict, Counter
+from time import time
+from Bio import bgzf, SeqIO
+import yaml
+import multiprocessing as mp
+from types import SimpleNamespace
+import shutil
+import snakemake
+import re
+import logging
+from panagram.index import Index
+import plotly.express as px
+
+
+def better_dir(item):
+    # don't print hidden functions
+    methods = dir(item)
+    return [method for method in methods if not method.startswith("_")]
+
+
+def visualize(pair, output_file, inverse=False):
+    # take a look at what pair looks like after manipulation
+    # pair[pair >= 1] = 10
+    if inverse:
+        fig = px.imshow(pair,
+                        color_continuous_scale=px.colors.sequential.Plasma[::-1],
+                        x=pair.columns,
+                        y=pair.index)
+    else:
+        fig = px.imshow(pair,
+                        x=pair.columns,
+                        y=pair.index)
+    fig.write_image(output_file)
+    return
+
+
+def fill_gaps(row, rounds=2):
+    for j in range(rounds):
+        # Find gaps of 0s surrounded by 1s
+        for i in range(1, len(row) - 1):
+            # TODO: if previous row and following row do not share identities, don't add together
+            if row[i] == 0 and (row[i - 1] >= 1 and row[i + 1] >= 1):
+                row[i] = (row[i - 1] + row[i + 1]) / 2
+    return row
+
+def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir):
+    # Step 1 - choose an anchor and re-create pairwise correlation matrix for it
+    # kmer size is 20-30ish; kmer at position X starts at position X (not centered at position X)
+    # there are multiple positions in a bin; there are <bin size> - k + 1 kmers in a bin
+    # default: bin_size = ((end_coord - start_coord) // max_chr_bins) + 1; max_chr_bins = 350; step = 100
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # index = Index(index_dir)
+    genome = index.genomes[anchor]
+    # # print(index.genomes)
+    # # print(genome.sizes)
+
+    # get an entire chr's bitmap
+    chr_size = genome.sizes[chr_name]
+    chr_bitmap = genome.query(chr_name, 0, chr_size, step=bitmap_step)
+
+    # get correlation matrix
+    start_coord = 0
+    end_coord = chr_size
+    bin_size = ((end_coord - start_coord) // max_chr_bins) + 1
+    num_kmers_in_bin = bin_size - k + 1
+
+    print("# Positions in a bin", bin_size)
+    print("# Kmers in a bin", num_kmers_in_bin)
+
+    pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size)
+    visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True)
+
+    # # sanity check
+    # print(len(pair.columns))
+
+    # Step 2 - slide through the matrix one genome at a time and calculate average pairwise correlation
+    # introgression - area of yellow in panagram where all other accessions/samples are less similar to anchor
+    # assumes other accessions are related variants without the introgression
+    # sliding window size based on size of the matrix - 5% of the size? alternatively based on bps
+    # threshold maybe around <=0.6; if average within threshold, mark all locations within window as part of introgression
+
+    # TODO: decide btwn using the mean/max of outliers to set threshold
+    # Or using q3; could increase the threshold and use redundancy as an additional metric for deciding
+    values = pair.values.flatten()
+    q1 = np.percentile(values, 25)
+    q3 = np.percentile(values, 75)
+    iqr = q3 - q1
+    lower_bound = q1 - 1.5 * iqr
+    outliers = values[values < lower_bound]
+
+    if len(outliers) > 0:
+        dissimilarity_threshold = outliers.max()
+    else:
+        dissimilarity_threshold = 0.6
+    print("Chosen dissimilarity threshold", dissimilarity_threshold)
+
+    # Histogram of correlations
+    fig = px.histogram(values, title='Histogram of Correlation Values')
+    fig.add_vline(x=dissimilarity_threshold, line_width=2, line_dash="dash", line_color="red", annotation_text="Threshold", annotation_position="top left")
+    fig.write_image(output_dir / f"{anchor}_{chr_name}_corr_hist.png")
+
+    # NOTE: if there are no outliers, or threshold is very high, there are likely no introgressions
+    # test for this scenario
+
+    # flip matrix so we can use pandas rolling operation
+    transposed_pair = pair.transpose().drop(columns=[anchor])
+    # print(transposed_pair)
+
+    # convert to binary array by applying dissimilarity threshold
+    transposed_pair[transposed_pair > dissimilarity_threshold] = 0
+    transposed_pair[transposed_pair != 0] = 1
+
+    # Step 3 - create an introgression score for each position based on total dissimilarity
+    transposed_pair["introgression_score"] = transposed_pair.sum(axis=1)
+    # transposed_pair["genomes"] = transposed_pair.apply(lambda row: {col for col in transposed_pair.columns if row[col] > 0}, axis=1)
+    # print(transposed_pair)
+
+    # combine nearby locations as the same introgression
+    transposed_pair["introgression_score"] = fill_gaps(transposed_pair["introgression_score"].values)#, transposed_pair["genomes"].values)
+    visualize(transposed_pair.transpose(), output_dir / f"{anchor}_{chr_name}_introgressions_heatmap.png")
+
+    # Step 4 - report, sorted by size and then by score
+    transposed_pair['introgression_starts'] = (transposed_pair['introgression_score'] != 0) & (transposed_pair['introgression_score'].shift(1, fill_value=0) == 0)
+
+    # Create a group identifier for each set of non-zeros
+    transposed_pair['introgression_group'] = transposed_pair['introgression_starts'].cumsum()
+    # sum of all introgression scores
+    total_introgression_scores = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group')['introgression_score'].mean()
+    # end index for finding introgression length in bps
+    last_indices = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group').tail(1).index.values
+
+    introgressions = transposed_pair[transposed_pair.introgression_starts == True].copy()
+    introgressions["introgression_score"] = total_introgression_scores.values
+    introgressions["introgression_end"] = last_indices
+    introgressions["introgression_end"] = introgressions["introgression_end"] + bin_size # account for the fact that the index is the start and not the end of a bin
+    introgressions = introgressions[["introgression_end", "introgression_score"]].reset_index().rename(columns={"index":"introgression_start"})
+    introgressions["introgression_length"] = introgressions["introgression_end"] - introgressions["introgression_start"]
+    print(introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False))
+    introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False).to_csv(output_dir / f"{anchor}_{chr_name}_introgressions.csv", index=False)
+    # NOTE: don't need this loop anymore; df operations are faster
+    # introgression_locations = {}
+    # for location in transposed_pair["introgression_score"]:
+    #     print(location)
+        # if bin >= 1 and not in_introgression:
+        #     # start new introgression
+        #     in_introgression = True
+        #     introgression_locations = [locatio]
+        # elif bin >= 1 and in_introgression:
+        #     # append
+        # else:
+            # in_introgression = False
+
+    # TODO: turn into function that can repeat across all chrs/all genomes if requested
+    # Look at underlying sequence in found introgressions; compare/cluster? align with annotations?
+    # we're using dissimilarity to find them though, so by definition, won't these mostly look unique?
+    return
+
+
+# USER PARAMS
+# Everything below runs at import time: it loads the index and processes every
+# chromosome of every genome in it.
+index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato"
+# anchor = "BGV006775_MAS2" #"SL5"
+# chr_name = "BGV006775_MAS2.0ch11"
+bitmap_step = 100
+max_chr_bins = 350
+size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression
+k = 31 # TODO: k should be defined somewhere else; don't need from the user
+output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/"
+
+index = Index(index_dir)
+
+# For testing
+# for anchor in ["SL5"]:#index.genomes.keys():
+#     genome = index.genomes[anchor]
+#     print(genome.sizes.keys())
+#     for chr_name in [11]:#genome.sizes.keys():
+#         print(anchor, chr_name)
+#         run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
+#         break
+#     break
+
+# Use each genome in the index as the anchor in turn and analyse each of its chromosomes;
+# all outputs go to the same output_dir, with file names prefixed by anchor and chromosome
+for anchor in index.genomes.keys():
+    genome = index.genomes[anchor]
+    print(genome.sizes.keys())
+    for chr_name in genome.sizes.keys():
+        print(anchor, chr_name)
+        run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir)
+
+# TODO: column for each position with a python set of genomes contributing to the introgression score
+# column for each position with a python set of genomes that may share the same introgression (similarity above threshold)
+# only merge adjacent areas where the set difference is <=2