|
| 1 | +import sys |
| 2 | +import os |
| 3 | +# import os.path |
| 4 | +# from os import path |
| 5 | +from pathlib import Path |
| 6 | +import subprocess |
| 7 | +import numpy as np |
| 8 | +import pandas as pd |
| 9 | +import bgzip |
| 10 | +import gzip |
| 11 | +import csv |
| 12 | +import glob |
| 13 | +import pysam |
| 14 | +from collections import defaultdict, Counter |
| 15 | +from time import time |
| 16 | +from Bio import bgzf, SeqIO |
| 17 | +import yaml |
| 18 | +import multiprocessing as mp |
| 19 | +from types import SimpleNamespace |
| 20 | +import shutil |
| 21 | +import snakemake |
| 22 | +import re |
| 23 | +import logging |
| 24 | +from panagram.index import Index |
| 25 | +import plotly.express as px |
| 26 | + |
| 27 | + |
| 28 | +def better_dir(item): |
| 29 | + # don't print hidden functions |
| 30 | + methods = dir(item) |
| 31 | + return [method for method in methods if not method.startswith("_")] |
| 32 | + |
| 33 | + |
| 34 | +def visualize(pair, output_file, inverse=False): |
| 35 | + # take a look at what pair looks like after manipulation |
| 36 | + # pair[pair >= 1] = 10 |
| 37 | + if inverse: |
| 38 | + fig = px.imshow(pair, |
| 39 | + color_continuous_scale=px.colors.sequential.Plasma[::-1], |
| 40 | + x=pair.columns, |
| 41 | + y=pair.index) |
| 42 | + else: |
| 43 | + fig = px.imshow(pair, |
| 44 | + x=pair.columns, |
| 45 | + y=pair.index) |
| 46 | + fig.write_image(output_file) |
| 47 | + return |
| 48 | + |
| 49 | + |
| 50 | +def fill_gaps(row, rounds=2): |
| 51 | + for j in range(rounds): |
| 52 | + # Find gaps of 0s surrounded by 1s |
| 53 | + for i in range(1, len(row) - 1): |
| 54 | + # TODO: if previous row and following row do not share identities, don't add together |
| 55 | + if row[i] == 0 and (row[i - 1] >= 1 and row[i + 1] >= 1): |
| 56 | + row[i] = (row[i - 1] + row[i + 1]) / 2 |
| 57 | + return row |
| 58 | + |
| 59 | +def run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir): |
| 60 | + # Step 1 - choose an anchor and re-create pairwise correlation matrix for it |
| 61 | + # kmer size is 20-30ish; kmer at position X starts at position X (not centered at position X) |
| 62 | + # there are multiple positions in a bin; there are <bin size> - k + 1 kmers in a bin |
| 63 | + # default: bin_size = ((end_coord - start_coord) // max_chr_bins) + 1; max_chr_bins = 350; step = 100 |
| 64 | + output_dir = Path(output_dir) |
| 65 | + output_dir.mkdir(parents=True, exist_ok=True) |
| 66 | + |
| 67 | + # index = Index(index_dir) |
| 68 | + genome = index.genomes[anchor] |
| 69 | + # # print(index.genomes) |
| 70 | + # # print(genome.sizes) |
| 71 | + |
| 72 | + # get an entire chr's bitmap |
| 73 | + chr_size = genome.sizes[chr_name] |
| 74 | + chr_bitmap = genome.query(chr_name, 0, chr_size, step=bitmap_step) |
| 75 | + |
| 76 | + # get correlation matrix |
| 77 | + start_coord = 0 |
| 78 | + end_coord = chr_size |
| 79 | + bin_size = ((end_coord - start_coord) // max_chr_bins) + 1 |
| 80 | + num_kmers_in_bin = bin_size - k + 1 |
| 81 | + |
| 82 | + print("# Positions in a bin", bin_size) |
| 83 | + print("# Kmers in a bin", num_kmers_in_bin) |
| 84 | + |
| 85 | + pan, pair = index.bitmap_to_bins(chr_bitmap, bin_size) |
| 86 | + visualize(pair, output_dir / f"{anchor}_{chr_name}_original_heatmap.png", inverse=True) |
| 87 | + |
| 88 | + # # sanity check |
| 89 | + # print(len(pair.columns)) |
| 90 | + |
| 91 | + # Step 2 - slide through the matrix one genome at a time and calculate average pairwise correlation |
| 92 | + # introgression - area of yellow in panagram where all other accessions/samples are less similar to anchor |
| 93 | + # assumes other accessions are related variants without the introgression |
| 94 | + # sliding window size based on size of the matrix - 5% of the size? alternatively based on bps |
| 95 | + # threshold maybe around <=0.6; if average within threshold, mark all locations within window as part of introgression |
| 96 | + |
| 97 | + # TODO: decide btwn using the mean/max of outliers to set threshold |
| 98 | + # Or using q3; could increase the threshold and use redundancy as an additional metric for deciding |
| 99 | + values = pair.values.flatten() |
| 100 | + q1 = np.percentile(values, 25) |
| 101 | + q3 = np.percentile(values, 75) |
| 102 | + iqr = q3 - q1 |
| 103 | + lower_bound = q1 - 1.5 * iqr |
| 104 | + outliers = values[values < lower_bound] |
| 105 | + |
| 106 | + if len(outliers) > 0: |
| 107 | + dissimilarity_threshold = outliers.max() |
| 108 | + else: |
| 109 | + dissimilarity_threshold = 0.6 |
| 110 | + print("Chosen dissimilarity threshold", dissimilarity_threshold) |
| 111 | + |
| 112 | + # Histogram of correlations |
| 113 | + fig = px.histogram(values, title='Histogram of Correlation Values') |
| 114 | + fig.add_vline(x=dissimilarity_threshold, line_width=2, line_dash="dash", line_color="red", annotation_text="Threshold", annotation_position="top left") |
| 115 | + fig.write_image(output_dir / f"{anchor}_{chr_name}_corr_hist.png") |
| 116 | + |
| 117 | + # NOTE: if there are no outliers, or threshold is very high, there are likely no introgressions |
| 118 | + # test for this scenario |
| 119 | + |
| 120 | + # flip matrix so we can use pandas rolling operation |
| 121 | + transposed_pair = pair.transpose().drop(columns=[anchor]) |
| 122 | + # print(transposed_pair) |
| 123 | + |
| 124 | + # convert to binary array by applying dissimilarity threshold |
| 125 | + transposed_pair[transposed_pair > dissimilarity_threshold] = 0 |
| 126 | + transposed_pair[transposed_pair != 0] = 1 |
| 127 | + |
| 128 | + # Step 3 - create an introgression score for each position based on total dissimilarity |
| 129 | + transposed_pair["introgression_score"] = transposed_pair.sum(axis=1) |
| 130 | + # transposed_pair["genomes"] = transposed_pair.apply(lambda row: {col for col in transposed_pair.columns if row[col] > 0}, axis=1) |
| 131 | + # print(transposed_pair) |
| 132 | + |
| 133 | + # combine nearby locations as the same introgression |
| 134 | + transposed_pair["introgression_score"] = fill_gaps(transposed_pair["introgression_score"].values)#, transposed_pair["genomes"].values) |
| 135 | + visualize(transposed_pair.transpose(), output_dir / f"{anchor}_{chr_name}_introgressions_heatmap.png") |
| 136 | + |
| 137 | + # Step 4 - report, sorted by size and then by score |
| 138 | + transposed_pair['introgression_starts'] = (transposed_pair['introgression_score'] != 0) & (transposed_pair['introgression_score'].shift(1, fill_value=0) == 0) |
| 139 | + |
| 140 | + # Create a group identifier for each set of non-zeros |
| 141 | + transposed_pair['introgression_group'] = transposed_pair['introgression_starts'].cumsum() |
| 142 | + # sum of all introgression scores |
| 143 | + total_introgression_scores = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group')['introgression_score'].mean() |
| 144 | + # end index for finding introgression length in bps |
| 145 | + last_indices = transposed_pair[transposed_pair['introgression_score'] != 0].groupby('introgression_group').tail(1).index.values |
| 146 | + |
| 147 | + introgressions = transposed_pair[transposed_pair.introgression_starts == True].copy() |
| 148 | + introgressions["introgression_score"] = total_introgression_scores.values |
| 149 | + introgressions["introgression_end"] = last_indices |
| 150 | + introgressions["introgression_end"] = introgressions["introgression_end"] + bin_size # account for the fact that the index is the start and not the end of a bin |
| 151 | + introgressions = introgressions[["introgression_end", "introgression_score"]].reset_index().rename(columns={"index":"introgression_start"}) |
| 152 | + introgressions["introgression_length"] = introgressions["introgression_end"] - introgressions["introgression_start"] |
| 153 | + print(introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False)) |
| 154 | + introgressions.sort_values(by=["introgression_length", "introgression_score"], ascending=False).to_csv(output_dir / f"{anchor}_{chr_name}_introgressions.csv", index=False) |
| 155 | + # NOTE: don't need this loop anymore; df operations are faster |
| 156 | + # introgression_locations = {} |
| 157 | + # for location in transposed_pair["introgression_score"]: |
| 158 | + # print(location) |
| 159 | + # if bin >= 1 and not in_introgression: |
| 160 | + # # start new introgression |
| 161 | + # in_introgression = True |
| 162 | + # introgression_locations = [locatio] |
| 163 | + # elif bin >= 1 and in_introgression: |
| 164 | + # # append |
| 165 | + # else: |
| 166 | + # in_introgression = False |
| 167 | + |
| 168 | + # TODO: turn into function that can repeat across all chrs/all genomes if requested |
| 169 | + # Look at underlying sequence in found introgressions; compare/cluster? align with annotations? |
| 170 | + # we're using dissimilarity to find them though, so by definition, won't these mostly look unique? |
| 171 | + return |
| 172 | + |
| 173 | + |
| 174 | +# USER PARAMS |
| 175 | +index_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato" |
| 176 | +# anchor = "BGV006775_MAS2" #"SL5" |
| 177 | +# chr_name = "BGV006775_MAS2.0ch11" |
| 178 | +bitmap_step = 100 |
| 179 | +max_chr_bins = 350 |
| 180 | +size_threshold = 3000000 # NOTE: unused, minimum size in bps of the introgression |
| 181 | +k = 31 # TODO: k should be defined somewhere else; don't need from the user |
| 182 | +output_dir = "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato/introgression_analysis_v1/" |
| 183 | + |
| 184 | +index = Index(index_dir) |
| 185 | + |
| 186 | +# For testing |
| 187 | +# for anchor in ["SL5"]:#index.genomes.keys(): |
| 188 | +# genome = index.genomes[anchor] |
| 189 | +# print(genome.sizes.keys()) |
| 190 | +# for chr_name in [11]:#genome.sizes.keys(): |
| 191 | +# print(anchor, chr_name) |
| 192 | +# run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir) |
| 193 | +# break |
| 194 | +# break |
| 195 | + |
| 196 | +for anchor in index.genomes.keys(): |
| 197 | + genome = index.genomes[anchor] |
| 198 | + print(genome.sizes.keys()) |
| 199 | + for chr_name in genome.sizes.keys(): |
| 200 | + print(anchor, chr_name) |
| 201 | + run_introgression_finder(index, anchor, chr_name, bitmap_step, max_chr_bins, k, output_dir) |
| 202 | + |
| 203 | +# TODO: column for each position with a python set of genomes contributing to the introgression score |
| 204 | +# column for each position with a python set of genomes that may share the same introgression (similarity above threshold) |
| 205 | +# only merge adjacent areas where the set difference is <=2 |
0 commit comments