|
| 1 | +from pathlib import Path |
| 2 | +import numpy as np |
| 3 | +import pandas as pd |
| 4 | +import plotly.express as px |
| 5 | +from postprocess_introgressions import threshold_introgressions |
| 6 | + |
| 7 | + |
| 8 | +def self_dotplot(seq): |
| 9 | + import matplotlib.pyplot as plt |
| 10 | + |
| 11 | + # look at a 10kb piece of a sequence |
| 12 | + middle = int(len(seq) / 2) |
| 13 | + seq = seq[middle - 5000 : middle + 5000] |
| 14 | + length = len(seq) |
| 15 | + |
| 16 | + matrix = np.zeros((length, length)) |
| 17 | + |
| 18 | + for i in range(length): |
| 19 | + for j in range(length): |
| 20 | + if seq[i] == seq[j]: |
| 21 | + matrix[i, j] = 1 |
| 22 | + |
| 23 | + plt.imshow(matrix, cmap="Greys", interpolation="none") |
| 24 | + plt.savefig("./test.png") |
| 25 | + return |
| 26 | + |
| 27 | + |
| 28 | +def create_heatmap(distances_file): |
| 29 | + distances_file = Path(distances_file) |
| 30 | + output_file = distances_file.parent / (distances_file.stem + ".png") |
| 31 | + distances = pd.read_csv(distances_file, sep="\t", index_col=0).fillna(0) |
| 32 | + # new_index = list(distances.index) |
| 33 | + |
| 34 | + # for sl4 |
| 35 | + # new_index = [i.split(".")[0] for i in new_index] |
| 36 | + # for sl5 - get the name of the acession without number added by Jasmine |
| 37 | + # new_index = [i.split("_")[1] for i in new_index] |
| 38 | + |
| 39 | + # distances.index = new_index |
| 40 | + distances = distances.sort_index() |
| 41 | + |
| 42 | + fig = px.imshow(distances, color_continuous_scale="Greens", aspect="auto", zmin=0, zmax=1) |
| 43 | + fig.update_layout(yaxis=dict(tickmode="linear"), title=distances_file.stem) |
| 44 | + fig.write_image(output_file) |
| 45 | + return |
| 46 | + |
| 47 | + |
| 48 | +def create_scored_heatmap(called_intro_file, gt_intro_file, threshold): |
| 49 | + # read both distances files |
| 50 | + output_file = called_intro_file.parent / (called_intro_file.stem + ".scored.png") |
| 51 | + |
| 52 | + # threshold and rename the accessions to match |
| 53 | + called_intro_df, gt_intro_df = threshold_introgressions( |
| 54 | + called_intro_file, gt_intro_file, threshold |
| 55 | + ) |
| 56 | + |
| 57 | + # only visualize the accessions that are shared btwn both files |
| 58 | + shared_accessions = list( |
| 59 | + set(called_intro_df.index.values).intersection(set(gt_intro_df.index.values)) |
| 60 | + ) |
| 61 | + called_intro_df = called_intro_df[called_intro_df.index.isin(shared_accessions)].sort_index() |
| 62 | + gt_intro_df = gt_intro_df[gt_intro_df.index.isin(shared_accessions)].sort_index() |
| 63 | + |
| 64 | + # label all positions as green for TP, yellow for FP, red for FN, white/blank for TN |
| 65 | + # true pos |
| 66 | + called_intro_df[(called_intro_df == 1) & (gt_intro_df == 1)] = 5 |
| 67 | + |
| 68 | + # true neg |
| 69 | + called_intro_df[(called_intro_df == 0) & (gt_intro_df == 0)] = 4 |
| 70 | + |
| 71 | + # false pos |
| 72 | + called_intro_df[(called_intro_df == 1) & (gt_intro_df == 0)] = 3 |
| 73 | + |
| 74 | + # false neg |
| 75 | + called_intro_df[(called_intro_df == 0) & (gt_intro_df == 1)] = 2 |
| 76 | + |
| 77 | + # visualize |
| 78 | + fig = px.imshow( |
| 79 | + called_intro_df, color_continuous_scale=["red", "yellow", "gray", "green"], aspect="auto" |
| 80 | + ) |
| 81 | + fig.update_layout( |
| 82 | + coloraxis_showscale=False, yaxis=dict(tickmode="linear"), title=output_file.stem |
| 83 | + ) |
| 84 | + fig.write_image(output_file) |
| 85 | + return |
| 86 | + |
| 87 | + |
| 88 | +def create_heatmap_runner(): |
| 89 | + # NOTE: change folder here |
| 90 | + input_folder = Path( |
| 91 | + "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4/introgression_analysis_v2/postprocessed" |
| 92 | + ) |
| 93 | + |
| 94 | + distances_files = list(input_folder.glob("chr*.txt")) |
| 95 | + for file in distances_files: |
| 96 | + print(f"Visualizing {file.name}") |
| 97 | + create_heatmap(file) |
| 98 | + return |
| 99 | + |
| 100 | + |
| 101 | +def create_scored_heatmap_runner(): |
| 102 | + # NOTE: change folders here |
| 103 | + called_intros_folder = Path( |
| 104 | + "/home/nbrown62/data_mschatz1/nbrown62/panagram_data/tomato_sl4/introgression_analysis_v2/postprocessed" |
| 105 | + ) |
| 106 | + gt_intros_folder = Path( |
| 107 | + "/home/nbrown62/data_mschatz1/nbrown62/CallIntrogressions_data/tomato_sl4_paper" |
| 108 | + ) |
| 109 | + introgression_type = "REF" |
| 110 | + threshold = 0.5 |
| 111 | + |
| 112 | + # get gt and called intro files |
| 113 | + called_intros_files = list(called_intros_folder.glob(f"chr*{introgression_type}.txt")) |
| 114 | + gt_intros_files_sp = list(gt_intros_folder.glob(f"chr*SP.txt")) |
| 115 | + gt_intros_files_slc = list(gt_intros_folder.glob(f"chr*SLC.txt")) |
| 116 | + |
| 117 | + if introgression_type == "SP": |
| 118 | + gt_intros_files = gt_intros_files_sp |
| 119 | + elif introgression_type == "SLC": |
| 120 | + gt_intros_files = gt_intros_files_slc |
| 121 | + elif introgression_type == "merged" or introgression_type == "REF": |
| 122 | + # merge files together |
| 123 | + gt_intros_files_sp.sort() |
| 124 | + gt_intros_files_slc.sort() |
| 125 | + gt_intros_files = list(zip(gt_intros_files_slc, gt_intros_files_sp)) |
| 126 | + |
| 127 | + # sort the files and make sure there are equal numbers of files |
| 128 | + called_intros_files.sort() |
| 129 | + gt_intros_files.sort() |
| 130 | + |
| 131 | + if len(called_intros_files) != len(gt_intros_files): |
| 132 | + raise ValueError("Unequal numbers of GT and called chromosome files...") |
| 133 | + |
| 134 | + # pass tuples of files to create_scored_heatmap for each chromosome |
| 135 | + for called_intros_file, gt_intros_file in zip(called_intros_files, gt_intros_files): |
| 136 | + create_scored_heatmap(called_intros_file, gt_intros_file, threshold=0.5) |
| 137 | + return |
| 138 | + |
| 139 | + |
| 140 | +if __name__ == "__main__": |
| 141 | + # create_heatmap_runner() |
| 142 | + create_scored_heatmap_runner() |
0 commit comments