Skip to content

Commit

Permalink
summarised merizo outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
DanBuchan committed Oct 31, 2024
1 parent 260b5ca commit b132c8b
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 8 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,13 @@ Outputs an alphafold model for each part of the targets that we were interested
> ./results_data/alphafold_models/plddt_summary.csv
4. Find best CATH hit using merizo search.
> ./ run_merizo_search.py
> ./run_merizo_search.py
Runs merizo search and collates which cath h family and the TMscore

> ./sum_merizo.py
read the merzio summary and work out, per class, how often the cath class changes and how much the TMScor changes

## Results collation

Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions scripts/alphafold_modelling/run_merizo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

print('class,domain,iteration,hit,max_tm,h_family')
for file in glob.glob(f'{sys.argv[1]}/*.pdb'):
# print(file)
m = re.search('models/(.+_PF\d+_\d+)_', file)
identifier = ''
if m:
Expand Down
97 changes: 97 additions & 0 deletions scripts/alphafold_modelling/sum_merizo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import csv
import sys
import statistics

#
# python sum_merizo.py results_data/alphafold_models/merizo_search_hits.csv
#

def read_merizo(file):
data = {}
with open(file, "r", encoding="utf-8") as fh:
next(fh)
merizoreader = csv.reader(fh, delimiter=",")
for row in merizoreader:
if row[0] not in data:
data[row[0]] = {}
if row[1] not in data[row[0]]:
data[row[0]][row[1]] = {}
data[row[0]][row[1]][int(row[2])] = {'cath_code': row[5], 'tm_score': row[4]}
# print(row)
return(data)

def read_counts(data, iterations):
count_NAs = 0
count_matched = 0
count_matched_unassigned = 0

count_became_unassigned = 0
count_became_na = 0
count_missed = 0
first_iteration_tm_scores = []
final_iteration_tm_scores = []
for prot_id in data:
cath_codes = set()
if 1 not in data[prot_id] or 20 not in data[prot_id]:
continue
for iteration in iterations:
cath_codes.add(data[prot_id][iteration]['cath_code'])
if 'NA' not in data[prot_id][1]['tm_score']:
first_iteration_tm_scores.append(float(data[prot_id][1]['tm_score']))
if 'NA' not in data[prot_id][20]['tm_score']:
final_iteration_tm_scores.append(float(data[prot_id][20]['tm_score']))

if len(cath_codes) == 1:
if 'NA' in cath_codes:
count_NAs += 1
elif 'UNASSIGNED' in cath_codes:
count_matched_unassigned += 1
else:
count_matched += 1
else:
if 'NA' in cath_codes:
count_became_na += 1
elif 'UNASSIGNED' in cath_codes:
count_became_unassigned += 1
else:
count_missed += 1
print(f'Total Analysed {len(data)}')
print(f'No domains recognised: {count_NAs}')
print(f'Domain always the same: {count_matched}')
print(f'Domain always unassigned: {count_matched_unassigned}')
print(f'Domain id changed: {count_missed}')
print(f'Domain changed to NA: {count_became_na}')
print(f'Domain changed to unassigned: {count_became_unassigned}')
print(f'mean tmscore at 1: {statistics.mean(first_iteration_tm_scores)}')
print(f'mean tmscore at 10: {statistics.mean(final_iteration_tm_scores)}')




types = ['contaminants_complex', 'contaminants_grew', 'contaminants_purified',
'insig_drift', 'non_drift', 'query_purified']


merizo_data = read_merizo(sys.argv[1])

c_complex_data = merizo_data['contaminants_complex']
c_grew_data = merizo_data['contaminants_grew']
c_purified_data = merizo_data['contaminants_purified']
insig_data = merizo_data['insig_drift']
non_data = merizo_data['non_drift']
q_purified_data = merizo_data['query_purified']

print("Non Drift")
read_counts(non_data, [1, 20])
print("Insig Drift")
read_counts(insig_data, [1, 20])
print("Query purified")
read_counts(q_purified_data, [1, 20])
print("Contaminants Grew")
read_counts(c_grew_data, [1, 20])
print("Contaminants Purified")
read_counts(c_purified_data, [1, 20])
print("Contaminants Complex")
read_counts(c_complex_data, [1, 20])

#print(merizo_data)
14 changes: 7 additions & 7 deletions scripts/hmmer_seqs/find_closest_hmm_seq_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ def get_hmm_generated_sequences(hmm_seqs):
return seqs

def find_closest_fasta(all_family_seqs, gen_seqs, resultspath, i):
seq_record = gen_seqs[i]
match = re.search("^>.+\|(PF\d+)-", seq_record['header'])
seq_record = gen_seqs[i]
match = re.search("^>.+\|(PF\d+-sample\d+)", seq_record['header'])
query_name = ''
if match:
query_name = match.groups()[0]
Expand All @@ -46,10 +46,10 @@ def find_closest_fasta(all_family_seqs, gen_seqs, resultspath, i):
fhtmp.write(seq_record["header"]+'\n')
fhtmp.write(seq_record["seq"]+'\n')
fhtmp.close()
args = ['/home/ucbcdwb/Applications/fasta36/bin/fasta36',
# args = ['/home/dbuchan/Applications/fasta36/bin/fasta36',
'-T',
'10'
# args = ['/home/ucbcdwb/Applications/fasta36/bin/fasta36',
args = ['/home/dbuchan/Applications/fasta36/bin/fasta36',
# '-T',
# '1',
'-q',
'-p',
'-O',
Expand Down Expand Up @@ -115,4 +115,4 @@ def find_closest_fasta(all_family_seqs, gen_seqs, resultspath, i):
# {PFID: [{header:,
# seq: },]}
# Header format ">pfam_family_name|PFID-sampleNN"
find_closest_fasta(sys.argv[2], generated_seqs, sys.argv[3], int(sys.argv[4])-1)
find_closest_fasta(sys.argv[2], generated_seqs, sys.argv[3], int(sys.argv[4])-1)

0 comments on commit b132c8b

Please sign in to comment.