added code to generate alignments for alphafold sample
DanBuchan committed Sep 20, 2024
1 parent 81e87d3 commit 7f0b504
Showing 9 changed files with 319 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@
*.out
*.txt
*.npy
*.a2m
*.a3m
*.tar.gz
58 changes: 58 additions & 0 deletions missing_distance.css
@@ -0,0 +1,58 @@
137_hits.csv

189_hits.csv
190_hits.csv

192_hits.csv

194_hits.csv
195_hits.csv

225_hits.csv

228_hits.csv

230_hits.csv
231_hits.csv
232_hits.csv
233_hits.csv
234_hits.csv
235_hits.csv
236_hits.csv
237_hits.csv

257_hits.csv
258_hits.csv
259_hits.csv
260_hits.csv
261_hits.csv
262_hits.csv
263_hits.csv
264_hits.csv
265_hits.csv
266_hits.csv

328_hits.csv
329_hits.csv
330_hits.csv
331_hits.csv
332_hits.csv

339_hits.csv

33_hits.csv

343_hits.csv
344_hits.csv

346_hits.csv

348_hits.csv

416_hits.csv
417_hits.csv
418_hits.csv

469_hits.csv

504_hits.csv
@@ -0,0 +1,4 @@
for i in {0..504}; do
  file_name="${i}_hits.csv"
  [ ! -f "$file_name" ] && echo "$file_name"
done
31 changes: 31 additions & 0 deletions scripts/alphafold_modelling/example_modelling_script.sh
@@ -0,0 +1,31 @@
#!/bin/bash -l
#$ -o /home/ucbcdwb/Scratch/output/alpha/std.out
#$ -e /home/ucbcdwb/Scratch/output/alpha/std.err

# Request an hour of run time
#$ -l h_rt=01:00:0

# Request 10 gigabyte of TMPDIR space (default is 10 GB - remove if cluster is diskless)
#$ -l tmpfs=10G

#$ -N runalpha
#$ -l tmem=2G
#$ -l gpu=true
#$ -pe gpu 1
#$ -R y

# Set up the job array. In this instance we have requested 10000 tasks
# numbered 1 to 10000.
#$ -t 1-10000
# Set up the environment and run the modelling pipeline
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/share/apps/cuda-11.8/lib64/
conda activate /cluster/project1/ProCovar/lmoffat/miniconda/envs/alphafold
cd /cluster/project1/ProCovar/colabfold-customtemplates
bash /cluster/project1/ProCovar/colabfold-customtemplates/main_db.sh test_run.single_sequence.fa test_run.single_sequence.a3m db_test

# move the model
cp db_test_*_unrelaxed_rank_1_model_*.pdb /home/dbuchan/alphafold_runs/
# tidy up
rm -rf db_test_*_all
rm -rf db_test_*_template
rm -f db_test*
201 changes: 201 additions & 0 deletions scripts/alphafold_modelling/prep_alignments.py
@@ -0,0 +1,201 @@
import sys
import csv
from collections import defaultdict
import glob
from subprocess import Popen, PIPE
import os

# This script untars the alignments and preps a fasta file and an a3m file for alphafold to do its work

# 1. read in target list to get class
# for each target
# 2. unpack the seq tarball
# 3. work out which set of seqs need to be aligned as per the readme rules
# 4. use Clustal Omega to align the seqs, outputting in aln or a3m format

# 1. non_drift - 2 models, the 1st and the last : 100 examples == 200 models
# 2. insig_drift - 2 models, the 1st and the last : 100 examples == 200 models
# 3. contaminants_grew - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models
# 4. contaminants_purified - 3 models, the 1st, last and peak contamination : 50 examples == 150 models
# 5. contaminants_complex - 3 models, the 1st, last and peak contamination : 100 examples == 300 models *
# 6. query_purified - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models

# It might be sensible to arrange these as sets by type and then run multiple array jobs of 100 on the cluster

# usage: python prep_alignments.py /home/dbuchan/Projects/profile_drift/results_data/generation_or_af_targets/alphafold_targets.csv /home/dbuchan/Projects/profile_drift/results_data/alphafold_targets/

# /home/dbuchan/Projects/profile_drift/results_data/alphafold_targets/sorted_target_data

def read_target_sample(file):
    with open(file, encoding="utf-8") as fhIn:
        next(fhIn)
        lookup = defaultdict(list)
        targetreader = csv.reader(fhIn, delimiter=',')
        for row in targetreader:
            # print(row)
            if len(row) > 0:
                lookup[row[0]].append(row[1])
    return lookup

def untar_file(file):
    args = ['/usr/bin/tar',
            '-zxvf',
            f'{file}'
            ]
    # print("Untarring", " ".join(args))
    try:
        p = Popen(args, stdout=PIPE, stderr=PIPE)
        result_stdout, err = p.communicate()
    except Exception as e:
        print(str(e))
        sys.exit(1)
    if p.returncode != 0:
        print("Non Zero Exit status: "+str(p.returncode))
        raise OSError("Non Zero Exit status: "+str(p.returncode))

def create_fasta(main_pfam, location):
    for file in glob.glob("*_iteration1_*"):
        # print(file)
        with open(file, "r") as fhIn:
            header = next(fhIn)
            seq = next(fhIn)
        fhOut = open(f'{sys.argv[2]}sorted_target_data/{location}/{main_pfam}.fa', 'w', encoding="utf-8")
        fhOut.write(header)
        fhOut.write(seq)
        fhOut.close()

def align_seqs(align_list, location, main_pfam):
    for iteration in align_list:
        # print(iteration)
        for file in glob.glob(f'*_iteration{iteration}_*'):
            args = ['/home/dbuchan/Applications/clustalo_1_2_4/clustalo',
                    '--in',
                    file,
                    '--out',
                    f'{sys.argv[2]}sorted_target_data/{location}/{main_pfam}_{iteration}.a2m',
                    '--outfmt=a2m',
                    '--threads=10',
                    '--force'
                    ]
            print("Clustering", " ".join(args))
            try:
                p = Popen(args, stdout=PIPE, stderr=PIPE)
                result_stdout, err = p.communicate()
            except Exception as e:
                print(str(e))
                sys.exit(1)
            if p.returncode != 0:
                print("Non Zero Exit status: "+str(p.returncode))
                raise OSError("Non Zero Exit status: "+str(p.returncode))

def process_complex(target_data, main_pfam):
    # 3 models 1st, last and peak contamination : 100 examples = 300 models
    peak_iteration = 1
    max_contamination_percentage = 0
    last_iteration = 0
    for iteration in list(target_data.keys()):
        family_count = 0
        contaminant_value = 0
        for family in target_data[iteration]:
            family_count = int(target_data[iteration][main_pfam])
            if family not in main_pfam:
                contaminant_value += int(target_data[iteration][family])
        contamination_percentage = contaminant_value/family_count
        if contamination_percentage > max_contamination_percentage:
            max_contamination_percentage = contamination_percentage
            peak_iteration = iteration
        last_iteration = int(iteration)
        # print(iteration, family_count, contaminant_value, contamination_percentage)
    # print(peak_iteration)
    create_fasta(main_pfam, "contaminants_complex")
    align_seqs([1, peak_iteration, last_iteration], "contaminants_complex", main_pfam)

def process_purified(target_data, main_pfam):
    # 3 models 1st, last and peak contamination : 50 examples = 150 models
    peak_iteration = 1
    max_contamination_percentage = 0
    last_iteration = 0
    for iteration in list(target_data.keys()):
        family_count = 0
        contaminant_value = 0
        for family in target_data[iteration]:
            family_count = int(target_data[iteration][main_pfam])
            if family not in main_pfam:
                contaminant_value += int(target_data[iteration][family])
        contamination_percentage = contaminant_value/family_count
        if contamination_percentage > max_contamination_percentage:
            max_contamination_percentage = contamination_percentage
            peak_iteration = iteration
        last_iteration = int(iteration)
        # print(iteration, family_count, contaminant_value, contamination_percentage)
    # print(peak_iteration)
    create_fasta(main_pfam, "contaminants_purified")
    align_seqs([1, peak_iteration, last_iteration], "contaminants_purified", main_pfam)


def process_non_drift(target_data, main_pfam):
    create_fasta(main_pfam, "non_drift")
    align_seqs([1, 20], "non_drift", main_pfam)

def process_insig(target_data, main_pfam):
    create_fasta(main_pfam, "insig_drift")
    align_seqs([1, 20], "insig_drift", main_pfam)

def process_grew(target_data, main_pfam):
    create_fasta(main_pfam, "contaminants_grew")
    align_seqs([1, 5, 15, 20], "contaminants_grew", main_pfam)

def process_q_purified(target_data, main_pfam):
    create_fasta(main_pfam, "query_purified")
    align_seqs([1, 5, 15, 20], "query_purified", main_pfam)

def tidy_up():
    for file in glob.glob("*.fa"):
        os.remove(file)

# generate the fasta sequence and alignment
def process_targets(af_dir, target_class):
    for file in glob.glob(f"{af_dir}/*.csv"):
        print(file)
        name_stub = file[:-17]
        tarball = f"{name_stub}seqs.tar.gz"
        untar_file(tarball)
        target_family = ''
        target_data = defaultdict(dict)
        with open(file, encoding="utf-8") as fhIn:
            next(fhIn)
            summaryreader = csv.reader(fhIn, delimiter=',')
            for row in summaryreader:
                # print(row)
                target_family = row[1]
                target_data[row[0]][row[2]] = row[3]
        #print(target_family)
        #print(target_data)

        for target_type in target_class[target_family]:
            print(target_type)
            # if "contaminants_complex" in target_type:
            #     continue
            if "contaminants_complex" in target_type:
                process_complex(target_data, target_family)
            if "non_drift" in target_type:
                process_non_drift(target_data, target_family)
            if "insig_drift" in target_type:
                process_insig(target_data, target_family)
            if "contaminants_grew" in target_type:
                process_grew(target_data, target_family)
            if "contaminants_purified" in target_type:
                process_purified(target_data, target_family)
            if "query_purified" in target_type:
                process_q_purified(target_data, target_family)

        tidy_up()
        # break


target_class = read_target_sample(sys.argv[1])
#print(target_class)
process_targets(sys.argv[2], target_class)
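
For the contaminants_purified and contaminants_complex targets, process_complex and process_purified above pick the iteration whose ratio of contaminant sequences to main-family sequences is highest. Below is a minimal standalone sketch of that selection on hypothetical family IDs and counts (it uses a plain equality test where the script uses a substring check):

```python
from collections import defaultdict

# Hypothetical per-iteration counts: target_data[iteration][family] -> sequence count.
# PF00001 stands in for the main family, PF99999 for a contaminant family.
target_data = defaultdict(dict)
target_data["1"] = {"PF00001": 100, "PF99999": 5}
target_data["2"] = {"PF00001": 100, "PF99999": 40}   # peak contamination
target_data["3"] = {"PF00001": 100, "PF99999": 10}

main_pfam = "PF00001"
peak_iteration = 1
max_contamination_percentage = 0
last_iteration = 0
for iteration, families in target_data.items():
    family_count = int(families[main_pfam])
    contaminant_value = sum(int(v) for fam, v in families.items() if fam != main_pfam)
    contamination_percentage = contaminant_value / family_count
    if contamination_percentage > max_contamination_percentage:
        max_contamination_percentage = contamination_percentage
        peak_iteration = iteration
    last_iteration = int(iteration)

print(peak_iteration, last_iteration)  # -> 2 3
# The three alignments would then come from iterations 1, peak_iteration and
# last_iteration, matching align_seqs([1, peak_iteration, last_iteration], ...).
```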



12 changes: 12 additions & 0 deletions scripts/alphafold_modelling/readme.md
@@ -0,0 +1,12 @@
# Models to build

1. non_drift - 2 models, the 1st and the last : 100 examples == 200 models
2. insig_drift - 2 models, the 1st and the last : 100 examples == 200 models
3. contaminants_grew - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models
4. contaminants_purified - 3 models, the 1st, last and peak contamination : 50 examples == 150 models
5. contaminants_complex - 3 models, the 1st, last and peak contamination : 100 examples == 300 models
6. query_purified - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models

Total: 1650 models == 23 GPU days

Or we could just do all 11,000 models == 153 GPU days
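
As a rough sanity check on these figures, the per-class counts sum to 1650 models, and a per-model cost of roughly 20 GPU-minutes (inferred from the 23 GPU-day estimate, not stated anywhere in the repository) is also consistent with the 153 GPU-day figure for all 11,000 models:

```python
# Models per class = models per example * number of examples (from the list above)
classes = {
    "non_drift":             2 * 100,
    "insig_drift":           2 * 100,
    "contaminants_grew":     4 * 100,
    "contaminants_purified": 3 * 50,
    "contaminants_complex":  3 * 100,
    "query_purified":        4 * 100,
}
total_models = sum(classes.values())
print(total_models)  # 1650

# 23 GPU days for 1650 models implies about 20 minutes of GPU time per model,
# which puts the full set of 11,000 models at roughly 153 GPU days.
minutes_per_model = 23 * 24 * 60 / total_models        # ~20.1
print(11_000 * minutes_per_model / (24 * 60))          # ~153.3
```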
4 changes: 2 additions & 2 deletions scripts/prottrans_seqs/prottrans_seq_gen.py
@@ -101,7 +101,7 @@ def predict_seq(seq_labels, input_labels, t5, device):
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)
model = T5ForConditionalGeneration.from_pretrained('Rostlab/prot_t5_xl_uniref50')
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
device = torch.device('cuda')
model = model.to(device)
model = model.eval()
print("generating seqs")
@@ -114,7 +114,7 @@ def predict_seq(seq_labels, input_labels, t5, device):
print(family)
# if("PF05086" not in family):
# continue
sample = random.choices(pfam_seqs[family], k=50)
sample = random.choices(pfam_seqs[family], k=25)
# print(len(sample))
for cnt, seq_data in enumerate(sample):
torch.cuda.empty_cache()
9 changes: 8 additions & 1 deletion scripts/psiblast_drift/get_target_id_list.py
@@ -34,10 +34,17 @@ def read_targets(file):
# print(id_list)
target_file = sys.argv[2]
target_list = read_targets(target_file)
s = set(target_list)
target_list = list(s)
# print(len(target_list))

count_targets = 0
for i, id in enumerate(id_list):
    # print(id)
    pfam_id = id[-7:]
    if pfam_id in target_list:
        #print(id, pfam_id, i+1)
        count_targets += 1
        print(i+1)


#print(count_targets)
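
The added lines deduplicate the target list before counting matches. A minimal illustration with hypothetical Pfam IDs (converting through a set removes duplicates but does not preserve the original order):

```python
target_list = ["PF00001", "PF00002", "PF00001"]
s = set(target_list)
target_list = list(s)      # duplicates removed; order not guaranteed
print(len(target_list))    # 2
```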
Empty file.
