diff --git a/README.md b/README.md
index 2730ae4..e03eef4 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,12 @@ Outputs an alphafold model for each part of the targets that we were interested
 
 4. Find best CATH hit using merizo search.
 
+## Results collation
+
+hmm_drift_summary.R
+prottrans_drift_summary.R
+summarise_models.R
+
 ## ANALYSIS OF DRIFT QUESTIONS
 
 1. Analyse *_blast_summary.csv, to find out which families show drift and what kinds of drift
@@ -143,15 +149,3 @@ Outputs an alphafold model for each part of the targets that we were interested
 2. What kinds of drift pattern are there?
 3. Do drift families share the same folds or not? If not what happens if we build alphafold models with the MSAs at each iteration (see above)
 4. Do drifting families correlate with "hard" targets for alphafold/dmplfold. i.e. regions of low plDDT or just the poor models in CASP15
-
-## 3. Other Model coherence.
-
-1. Generate 100 seqs for each drift family using hhemit
-2. blast each seq against pfam seq db. What is the family of top hit? What is the consensus family of the top 5 and 10 hits?
-3. Repeat for protein ESM single and ESM-MSA
-
-# To do
-
-1. collate alphafold plDDT results (/home/dbuchan/Projects/profile_drift/results_data/alphafold_models)
-2. merizo search over alphfold models?
-3. some r-scripts to make barcharts
\ No newline at end of file
diff --git a/results_planning.md b/results_planning.md
index 3ffaa30..9ad3edc 100644
--- a/results_planning.md
+++ b/results_planning.md
@@ -20,4 +20,6 @@ From that we sample 530 families
 
 4. Model analysis
 
-  - results_data/generation_or_af_targets/alphafold_targets.csv
\ No newline at end of file
+  - results_data/generation_or_af_targets/alphafold_targets.csv
+  analyse_plddt.py
+
diff --git a/scripts/alphafold_modelling/analyse_merizo.py b/scripts/alphafold_modelling/analyse_merizo.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/alphafold_modelling/analyse_plddt.py b/scripts/alphafold_modelling/analyse_plddt.py
new file mode 100644
index 0000000..4f937c7
--- /dev/null
+++ b/scripts/alphafold_modelling/analyse_plddt.py
@@ -0,0 +1,73 @@
+import csv
+import sys
+#
+# python scripts/alphafold_modelling/analyse_plddt.py ./results_data/alphafold_models/plddt_summary.csv > ./results_data/alphafold_models/plddt_means.csv
+#
+
+# Running totals of pLDDT, keyed by drift class and then by MSA iteration
+results = {'contaminants_grew': {},
+           'contaminants_complex': {},
+           'contaminants_purified': {},
+           'insig_drift': {},
+           'non_drift': {},
+           'query_purified': {},
+           }
+classes = set()
+with open(sys.argv[1], "r", encoding="utf-8") as fhIn:
+    plddtreader = csv.reader(fhIn, delimiter=',')
+    next(plddtreader)  # skip the header row
+    for row in plddtreader:
+        classes.add(row[0])
+        if int(row[2]) in results[row[0]]:
+            results[row[0]][int(row[2])]["total_plddt"] += float(row[3])
+            results[row[0]][int(row[2])]["class_count"] += 1
+        else:
+            results[row[0]][int(row[2])] = {"total_plddt": float(row[3]),
+                                            "class_count": 1}
+
+# Print one CSV row per drift class: class, mean pLDDT at iteration 1,
+# mean pLDDT at iteration 5 (or over the intermediate iterations, or na),
+# and mean pLDDT at iteration 20
+for drift_class in results:
+    if "contaminants_grew" in drift_class:
+        mean_1 = results[drift_class][1]['total_plddt'] / results[drift_class][1]['class_count']
+        mean_5 = results[drift_class][5]['total_plddt'] / results[drift_class][5]['class_count']
+        mean_20 = results[drift_class][20]['total_plddt'] / results[drift_class][20]['class_count']
+        print(f'{drift_class},{mean_1},{mean_5},{mean_20}')
+    if "query_purified" in drift_class:
+        mean_1 = results[drift_class][1]['total_plddt'] / results[drift_class][1]['class_count']
+        mean_5 = results[drift_class][5]['total_plddt'] / results[drift_class][5]['class_count']
+        mean_20 = results[drift_class][20]['total_plddt'] / results[drift_class][20]['class_count']
+        print(f'{drift_class},{mean_1},{mean_5},{mean_20}')
+    if "insig_drift" in drift_class:
+        mean_1 = results[drift_class][1]['total_plddt'] / results[drift_class][1]['class_count']
+        mean_20 = results[drift_class][20]['total_plddt'] / results[drift_class][20]['class_count']
+        print(f'{drift_class},{mean_1},na,{mean_20}')
+    if "non_drift" in drift_class:
+        mean_1 = results[drift_class][1]['total_plddt'] / results[drift_class][1]['class_count']
+        mean_20 = results[drift_class][20]['total_plddt'] / results[drift_class][20]['class_count']
+        print(f'{drift_class},{mean_1},na,{mean_20}')
+    if "contaminants_complex" in drift_class:
+        mean_1 = results[drift_class][1]['total_plddt'] / results[drift_class][1]['class_count']
+        mean_20 = results[drift_class][20]['total_plddt'] / results[drift_class][20]['class_count']
+        tot_plddt = 0
+        tot_count = 0
+        for iteration in results[drift_class]:
+            if iteration == 1 or iteration == 20:
+                continue
+            tot_plddt += results[drift_class][iteration]['total_plddt']
+            tot_count += results[drift_class][iteration]['class_count']
+        mean_peak = tot_plddt / tot_count
+        print(f'{drift_class},{mean_1},{mean_peak},{mean_20}')
+    if "contaminants_purified" in drift_class:
+        mean_1 = results[drift_class][1]['total_plddt'] / results[drift_class][1]['class_count']
+        mean_20 = results[drift_class][20]['total_plddt'] / results[drift_class][20]['class_count']
+        tot_plddt = 0
+        tot_count = 0
+        for iteration in results[drift_class]:
+            if iteration == 1 or iteration == 20:
+                continue
+            tot_plddt += results[drift_class][iteration]['total_plddt']
+            tot_count += results[drift_class][iteration]['class_count']
+        mean_peak = tot_plddt / tot_count
+        print(f'{drift_class},{mean_1},{mean_peak},{mean_20}')
diff --git a/scripts/alphafold_modelling/summarise_models.R b/scripts/alphafold_modelling/summarise_models.R
new file mode 100644
index 0000000..299ce79
--- /dev/null
+++ b/scripts/alphafold_modelling/summarise_models.R
@@ -0,0 +1,8 @@
+library(ggplot2)
+library(reshape2)
+library(tidyr)
+library(tidyverse)
+
+plDDT_summary <- read.csv("/home/dbuchan/Projects/profile_drift/results_data/alphafold_models/plddt_summary.csv", header=T)
+
+merizo_summary <- read.csv("/home/dbuchan/Projects/profile_drift/results_data/alphafold_models/merizo_hits.csv", header=T)
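The committed summarise_models.R stops after loading the two summary tables. A minimal sketch (not part of the commit above) of how the collation step could bar-chart the per-class means that analyse_plddt.py writes to results_data/alphafold_models/plddt_means.csv is given below; that file is headerless, with columns drift class, mean pLDDT at iteration 1, mean at iteration 5 / the intermediate peak (or "na"), and mean at iteration 20, so the column names and plotting choices here are illustrative assumptions only.

library(ggplot2)
library(reshape2)

# plddt_means.csv as written by analyse_plddt.py: no header row; columns are
# drift class, mean pLDDT at iteration 1, at iteration 5 / peak (or "na"), at iteration 20.
# Column names below are illustrative, not taken from the commit.
plddt_means <- read.csv("results_data/alphafold_models/plddt_means.csv",
                        header = FALSE,
                        col.names = c("drift_class", "iter_1", "iter_peak", "iter_20"),
                        na.strings = "na")

# Reshape to long form: one row per (drift class, iteration) pair
plddt_long <- melt(plddt_means, id.vars = "drift_class",
                   variable.name = "iteration", value.name = "mean_plddt")

# Grouped bar chart of mean pLDDT per drift class across the sampled iterations
ggplot(na.omit(plddt_long), aes(x = drift_class, y = mean_plddt, fill = iteration)) +
  geom_col(position = "dodge") +
  labs(x = "Drift class", y = "Mean pLDDT", fill = "Iteration") +
  theme_bw()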