summarised merizo outputs

psipred · Oct 31, 2024 · b132c8b · b132c8b
1 parent 260b5ca
commit b132c8b
Show file tree

Hide file tree

Showing 5 changed files with 111 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -133,8 +133,13 @@ Outputs an alphafold model for each part of the targets that we were interested
 > ./results_data/alphafold_models/plddt_summary.csv
 
 4. Find best CATH hit using merizo search.
-> ./ run_merizo_search.py
+> ./run_merizo_search.py
 
+Runs merizo search and collates which cath h family and the TMscore
+
+> ./sum_merizo.py
+
+read the merzio summary and work out, per class, how often the cath class changes and how much the TMScor changes
 
 ## Results collation
 

diff --git a/insig_udpate/hmm_subset.hmm → results_data/insig_udpate/hmm_subset.hmm b/insig_udpate/hmm_subset.hmm → results_data/insig_udpate/hmm_subset.hmm
diff --git a/scripts/alphafold_modelling/run_merizo_search.py b/scripts/alphafold_modelling/run_merizo_search.py
@@ -16,6 +16,7 @@
 
 print('class,domain,iteration,hit,max_tm,h_family')
 for file in glob.glob(f'{sys.argv[1]}/*.pdb'):
+    # print(file)
     m = re.search('models/(.+_PF\d+_\d+)_', file)
     identifier = ''
     if m:

diff --git a/scripts/alphafold_modelling/sum_merizo.py b/scripts/alphafold_modelling/sum_merizo.py
@@ -0,0 +1,97 @@
+import csv
+import sys
+import statistics
+
+#
+# python sum_merizo.py results_data/alphafold_models/merizo_search_hits.csv
+#
+
+def read_merizo(file):
+    data = {}
+    with open(file, "r", encoding="utf-8") as fh:
+        next(fh)
+        merizoreader = csv.reader(fh, delimiter=",")
+        for row in merizoreader:
+            if row[0] not in data:
+                data[row[0]] = {}
+            if row[1] not in data[row[0]]:
+                data[row[0]][row[1]] = {}
+            data[row[0]][row[1]][int(row[2])] = {'cath_code': row[5], 'tm_score': row[4]}
+            # print(row)
+    return(data)
+
+def read_counts(data, iterations):
+    count_NAs = 0
+    count_matched = 0
+    count_matched_unassigned = 0
+
+    count_became_unassigned = 0
+    count_became_na = 0
+    count_missed = 0
+    first_iteration_tm_scores = []
+    final_iteration_tm_scores = []
+    for prot_id in data:
+        cath_codes = set()
+        if 1 not in data[prot_id] or 20 not in data[prot_id]:
+            continue
+        for iteration in iterations:
+            cath_codes.add(data[prot_id][iteration]['cath_code'])
+        if 'NA' not in data[prot_id][1]['tm_score']:
+            first_iteration_tm_scores.append(float(data[prot_id][1]['tm_score']))
+        if 'NA' not in data[prot_id][20]['tm_score']:
+            final_iteration_tm_scores.append(float(data[prot_id][20]['tm_score']))
+
+        if len(cath_codes) == 1:
+            if 'NA' in cath_codes:
+                count_NAs += 1
+            elif 'UNASSIGNED' in cath_codes:
+                count_matched_unassigned += 1
+            else:
+                count_matched += 1
+        else:
+            if 'NA' in cath_codes:
+                count_became_na += 1
+            elif 'UNASSIGNED' in cath_codes:
+                count_became_unassigned += 1
+            else:
+                count_missed += 1
+    print(f'Total Analysed {len(data)}')
+    print(f'No domains recognised: {count_NAs}')
+    print(f'Domain always the same: {count_matched}')
+    print(f'Domain always unassigned: {count_matched_unassigned}')
+    print(f'Domain id changed: {count_missed}')
+    print(f'Domain changed to NA: {count_became_na}')
+    print(f'Domain changed to unassigned: {count_became_unassigned}')
+    print(f'mean tmscore at 1: {statistics.mean(first_iteration_tm_scores)}')
+    print(f'mean tmscore at 10: {statistics.mean(final_iteration_tm_scores)}')
+
+
+
+
+types = ['contaminants_complex', 'contaminants_grew', 'contaminants_purified',
+         'insig_drift', 'non_drift', 'query_purified']
+
+
+merizo_data = read_merizo(sys.argv[1])
+
+c_complex_data = merizo_data['contaminants_complex']
+c_grew_data = merizo_data['contaminants_grew']
+c_purified_data = merizo_data['contaminants_purified']
+insig_data = merizo_data['insig_drift']
+non_data = merizo_data['non_drift']
+q_purified_data = merizo_data['query_purified']
+
+print("Non Drift")
+read_counts(non_data, [1, 20])
+print("Insig Drift")
+read_counts(insig_data, [1, 20])
+print("Query purified")
+read_counts(q_purified_data, [1, 20])
+print("Contaminants Grew")
+read_counts(c_grew_data, [1, 20])
+print("Contaminants Purified")
+read_counts(c_purified_data, [1, 20])
+print("Contaminants Complex")
+read_counts(c_complex_data, [1, 20])
+
+#print(merizo_data)
diff --git a/scripts/hmmer_seqs/find_closest_hmm_seq_family.py b/scripts/hmmer_seqs/find_closest_hmm_seq_family.py
@@ -34,8 +34,8 @@ def get_hmm_generated_sequences(hmm_seqs):
     return seqs       
 
 def find_closest_fasta(all_family_seqs, gen_seqs, resultspath, i):
-    seq_record = gen_seqs[i]     
-    match = re.search("^>.+\|(PF\d+)-", seq_record['header'])
+    seq_record = gen_seqs[i]    
+    match = re.search("^>.+\|(PF\d+-sample\d+)", seq_record['header'])
     query_name = ''
     if match:
         query_name = match.groups()[0]
@@ -46,10 +46,10 @@ def find_closest_fasta(all_family_seqs, gen_seqs, resultspath, i):
     fhtmp.write(seq_record["header"]+'\n')
     fhtmp.write(seq_record["seq"]+'\n')
     fhtmp.close()
-    args = ['/home/ucbcdwb/Applications/fasta36/bin/fasta36',
-    # args = ['/home/dbuchan/Applications/fasta36/bin/fasta36',
-            '-T',
-            '10'
+    # args = ['/home/ucbcdwb/Applications/fasta36/bin/fasta36',
+    args = ['/home/dbuchan/Applications/fasta36/bin/fasta36',
+            # '-T',
+            # '1',
             '-q',
             '-p',
             '-O',
@@ -115,4 +115,4 @@ def find_closest_fasta(all_family_seqs, gen_seqs, resultspath, i):
 # {PFID: [{header:,
 #          seq: },]}
 # Header format ">pfam_family_name|PFID-sampleNN"
-find_closest_fasta(sys.argv[2], generated_seqs, sys.argv[3], int(sys.argv[4])-1)
+find_closest_fasta(sys.argv[2], generated_seqs, sys.argv[3], int(sys.argv[4])-1)