added code to generate alignments for alphafold sample
DanBuchan committed Sep 20, 2024
1 parent 81e87d3 commit 7f0b504
Showing 9 changed files with 319 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@
*.out
*.txt
*.npy
*.a2m
*.a3m
*.tar.gz
58 changes: 58 additions & 0 deletions missing_distance.css
@@ -0,0 +1,58 @@
137_hits.csv

189_hits.csv
190_hits.csv

192_hits.csv

194_hits.csv
195_hits.csv

225_hits.csv

228_hits.csv

230_hits.csv
231_hits.csv
232_hits.csv
233_hits.csv
234_hits.csv
235_hits.csv
236_hits.csv
237_hits.csv

257_hits.csv
258_hits.csv
259_hits.csv
260_hits.csv
261_hits.csv
262_hits.csv
263_hits.csv
264_hits.csv
265_hits.csv
266_hits.csv

328_hits.csv
329_hits.csv
330_hits.csv
331_hits.csv
332_hits.csv

339_hits.csv

33_hits.csv

343_hits.csv
344_hits.csv

346_hits.csv

348_hits.csv

416_hits.csv
417_hits.csv
418_hits.csv

469_hits.csv

504_hits.csv
@@ -0,0 +1,4 @@
for i in {0..504}; do
  file_name="${i}_hits.csv"
  [ ! -f "$file_name" ] && echo "$file_name"
done
31 changes: 31 additions & 0 deletions scripts/alphafold_modelling/example_modelling_script.sh
@@ -0,0 +1,31 @@
#!/bin/bash -l
#$ -o /home/ucbcdwb/Scratch/output/alpha/std.out
#$ -e /home/ucbcdwb/Scratch/output/alpha/std.err

# Request an hour of run time
#$ -l h_rt=01:00:0

# Request 10 gigabyte of TMPDIR space (default is 10 GB - remove if cluster is diskless)
#$ -l tmpfs=10G

#$ -N runalpha
#$ -l tmem=2G
#$ -l gpu=true
#$ -pe gpu 1
#$ -R y

# Set up the job array. In this instance we have requested 10000 tasks
# numbered 1 to 10000.
#$ -t 1-10000
# Set up the environment and run the modelling pipeline
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/share/apps/cuda-11.8/lib64/
conda activate /cluster/project1/ProCovar/lmoffat/miniconda/envs/alphafold
cd /cluster/project1/ProCovar/colabfold-customtemplates
bash /cluster/project1/ProCovar/colabfold-customtemplates/main_db.sh test_run.single_sequence.fa test_run.single_sequence.a3m db_test

# move the model
cp db_test_*_unrelaxed_rank_1_model_*.pdb /home/dbuchan/alphafold_runs/
# tidy up
rm -rf db_test_*_all
rm -rf db_test_*_template
rm -f db_test*
201 changes: 201 additions & 0 deletions scripts/alphafold_modelling/prep_alignments.py
@@ -0,0 +1,201 @@
import sys
import csv
from collections import defaultdict
import glob
from subprocess import Popen, PIPE
import os

# This script untars the alignments and preps a fasta file and an a3m file for alphafold to do its work

# 1. read in target list to get class
# for each target
# 2. unpack the seq tarball
# 3. work out which set of seqs need to be aligned as per the readme rules
# 4. use Clustal Omega to align the seqs, outputting in aln or a3m format

# 1. non_drift - 2 models, the 1st and the last : 100 examples == 200 models
# 2. insig_drift - 2 models, the 1st and the last : 100 examples == 200 models
# 3. contaminants_grew - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models
# 4. contaminants_purified - 3 models, the 1st, last and peak contamination : 50 examples == 150 models
# 5. contaminants_complex - 3 models, the 1st, last and peak contamination : 100 examples == 300 models *
# 6. query_purified - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models

# It might be sensible to arrange these as sets by type and then run multiple array jobs of 100 on the cluster

# usage: python prep_alignments.py /home/dbuchan/Projects/profile_drift/results_data/generation_or_af_targets/alphafold_targets.csv /home/dbuchan/Projects/profile_drift/results_data/alphafold_targets/

# /home/dbuchan/Projects/profile_drift/results_data/alphafold_targets/sorted_target_data

def read_target_sample(file):
    with open(file, encoding="utf-8") as fhIn:
        next(fhIn)
        lookup = defaultdict(list)
        targetreader = csv.reader(fhIn, delimiter=',')
        for row in targetreader:
            # print(row)
            if len(row) > 0:
                lookup[row[0]].append(row[1])
    return lookup

def untar_file(file):
    args = ['/usr/bin/tar',
            '-zxvf',
            f'{file}'
            ]
    # print("Untarring", " ".join(args))
    try:
        p = Popen(args, stdout=PIPE, stderr=PIPE)
        result_stdout, err = p.communicate()
    except Exception as e:
        print(str(e))
        sys.exit(1)
    if p.returncode != 0:
        print("Non Zero Exit status: "+str(p.returncode))
        raise OSError("Non Zero Exit status: "+str(p.returncode))

def create_fasta(main_pfam, location):
    for file in glob.glob("*_iteration1_*"):
        # print(file)
        with open(file, "r") as fhIn:
            header = next(fhIn)
            seq = next(fhIn)
        fhOut = open(f'{sys.argv[2]}sorted_target_data/{location}/{main_pfam}.fa', 'w', encoding="utf-8")
        fhOut.write(header)
        fhOut.write(seq)
        fhOut.close()

def align_seqs(align_list, location, main_pfam):
    for iteration in align_list:
        # print(iteration)
        for file in glob.glob(f'*_iteration{iteration}_*'):
            args = ['/home/dbuchan/Applications/clustalo_1_2_4/clustalo',
                    '--in',
                    file,
                    '--out',
                    f'{sys.argv[2]}sorted_target_data/{location}/{main_pfam}_{iteration}.a2m',
                    '--outfmt=a2m',
                    '--threads=10',
                    '--force'
                    ]
            print("Clustering", " ".join(args))
            try:
                p = Popen(args, stdout=PIPE, stderr=PIPE)
                result_stdout, err = p.communicate()
            except Exception as e:
                print(str(e))
                sys.exit(1)
            if p.returncode != 0:
                print("Non Zero Exit status: "+str(p.returncode))
                raise OSError("Non Zero Exit status: "+str(p.returncode))

def process_complex(target_data, main_pfam):
    # 3 models 1st, last and peak contamination : 100 examples = 300 models
    peak_iteration = 1
    max_contamination_percentage = 0
    last_iteration = 0
    for iteration in list(target_data.keys()):
        family_count = 0
        contaminant_value = 0
        for family in target_data[iteration]:
            family_count = int(target_data[iteration][main_pfam])
            if family not in main_pfam:
                contaminant_value += int(target_data[iteration][family])
        contamination_percentage = contaminant_value/family_count
        if contamination_percentage > max_contamination_percentage:
            max_contamination_percentage = contamination_percentage
            peak_iteration = iteration
        last_iteration = int(iteration)
        # print(iteration, family_count, contaminant_value, contamination_percentage)
    # print(peak_iteration)
    create_fasta(main_pfam, "contaminants_complex")
    align_seqs([1, peak_iteration, last_iteration], "contaminants_complex", main_pfam)

def process_purified(target_data, main_pfam):
    # 3 models 1st, last and peak contamination : 50 examples = 150 models
    peak_iteration = 1
    max_contamination_percentage = 0
    last_iteration = 0
    for iteration in list(target_data.keys()):
        family_count = 0
        contaminant_value = 0
        for family in target_data[iteration]:
            family_count = int(target_data[iteration][main_pfam])
            if family not in main_pfam:
                contaminant_value += int(target_data[iteration][family])
        contamination_percentage = contaminant_value/family_count
        if contamination_percentage > max_contamination_percentage:
            max_contamination_percentage = contamination_percentage
            peak_iteration = iteration
        last_iteration = int(iteration)
        # print(iteration, family_count, contaminant_value, contamination_percentage)
    # print(peak_iteration)
    create_fasta(main_pfam, "contaminants_purified")
    align_seqs([1, peak_iteration, last_iteration], "contaminants_purified", main_pfam)


def process_non_drift(target_data, main_pfam):
    create_fasta(main_pfam, "non_drift")
    align_seqs([1, 20], "non_drift", main_pfam)

def process_insig(target_data, main_pfam):
    create_fasta(main_pfam, "insig_drift")
    align_seqs([1, 20], "insig_drift", main_pfam)

def process_grew(target_data, main_pfam):
    create_fasta(main_pfam, "contaminants_grew")
    align_seqs([1, 5, 15, 20], "contaminants_grew", main_pfam)

def process_q_purified(target_data, main_pfam):
    create_fasta(main_pfam, "query_purified")
    align_seqs([1, 5, 15, 20], "query_purified", main_pfam)

def tidy_up():
    for file in glob.glob("*.fa"):
        os.remove(file)

# generate the fasta sequence and alignment
def process_targets(af_dir, target_class):
    for file in glob.glob(f"{af_dir}/*.csv"):
        print(file)
        name_stub = file[:-17]
        tarball = f"{name_stub}seqs.tar.gz"
        untar_file(tarball)
        target_family = ''
        target_data = defaultdict(dict)
        with open(file, encoding="utf-8") as fhIn:
            next(fhIn)
            summaryreader = csv.reader(fhIn, delimiter=',')
            for row in summaryreader:
                # print(row)
                target_family = row[1]
                target_data[row[0]][row[2]] = row[3]
        #print(target_family)
        #print(target_data)

        for target_type in target_class[target_family]:
            print(target_type)
            # if "contaminants_complex" in target_type:
            #     continue
            if "contaminants_complex" in target_type:
                process_complex(target_data, target_family)
            if "non_drift" in target_type:
                process_non_drift(target_data, target_family)
            if "insig_drift" in target_type:
                process_insig(target_data, target_family)
            if "contaminants_grew" in target_type:
                process_grew(target_data, target_family)
            if "contaminants_purified" in target_type:
                process_purified(target_data, target_family)
            if "query_purified" in target_type:
                process_q_purified(target_data, target_family)

        tidy_up()
        # break


target_class = read_target_sample(sys.argv[1])
#print(target_class)
process_targets(sys.argv[2], target_class)
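
For the contaminants_purified and contaminants_complex targets, process_complex and process_purified above pick the iteration whose ratio of contaminant sequences to main-family sequences is highest. Below is a minimal standalone sketch of that selection on hypothetical family IDs and counts (it uses a plain equality test where the script uses a substring check):

```python
from collections import defaultdict

# Hypothetical per-iteration counts: target_data[iteration][family] -> sequence count.
# PF00001 stands in for the main family, PF99999 for a contaminant family.
target_data = defaultdict(dict)
target_data["1"] = {"PF00001": 100, "PF99999": 5}
target_data["2"] = {"PF00001": 100, "PF99999": 40}   # peak contamination
target_data["3"] = {"PF00001": 100, "PF99999": 10}

main_pfam = "PF00001"
peak_iteration = 1
max_contamination_percentage = 0
last_iteration = 0
for iteration, families in target_data.items():
    family_count = int(families[main_pfam])
    contaminant_value = sum(int(v) for fam, v in families.items() if fam != main_pfam)
    contamination_percentage = contaminant_value / family_count
    if contamination_percentage > max_contamination_percentage:
        max_contamination_percentage = contamination_percentage
        peak_iteration = iteration
    last_iteration = int(iteration)

print(peak_iteration, last_iteration)  # -> 2 3
# The three alignments would then come from iterations 1, peak_iteration and
# last_iteration, matching align_seqs([1, peak_iteration, last_iteration], ...).
```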



12 changes: 12 additions & 0 deletions scripts/alphafold_modelling/readme.md
@@ -0,0 +1,12 @@
# Models to build

1. non_drift - 2 models, the 1st and the last : 100 examples == 200 models
2. insig_drift - 2 models, the 1st and the last : 100 examples == 200 models
3. contaminants_grew - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models
4. contaminants_purified - 3 models, the 1st, last and peak contamination : 50 examples == 150 models
5. contaminants_complex - 3 models, the 1st, last and peak contamination : 100 examples == 300 models
6. query_purified - 4 models, the 1st, 5th, 15th and last : 100 examples == 400 models

Total: 1650 models == 23 GPU days

Or we could just do all 11,000 models == 153 GPU days
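
As a rough sanity check on these figures, the per-class counts sum to 1650 models, and a per-model cost of roughly 20 GPU-minutes (inferred from the 23 GPU-day estimate, not stated anywhere in the repository) is also consistent with the 153 GPU-day figure for all 11,000 models:

```python
# Models per class = models per example * number of examples (from the list above)
classes = {
    "non_drift":             2 * 100,
    "insig_drift":           2 * 100,
    "contaminants_grew":     4 * 100,
    "contaminants_purified": 3 * 50,
    "contaminants_complex":  3 * 100,
    "query_purified":        4 * 100,
}
total_models = sum(classes.values())
print(total_models)  # 1650

# 23 GPU days for 1650 models implies about 20 minutes of GPU time per model,
# which puts the full set of 11,000 models at roughly 153 GPU days.
minutes_per_model = 23 * 24 * 60 / total_models        # ~20.1
print(11_000 * minutes_per_model / (24 * 60))          # ~153.3
```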
4 changes: 2 additions & 2 deletions scripts/prottrans_seqs/prottrans_seq_gen.py
@@ -101,7 +101,7 @@ def predict_seq(seq_labels, input_labels, t5, device):
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)
model = T5ForConditionalGeneration.from_pretrained('Rostlab/prot_t5_xl_uniref50')
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
device = torch.device('cuda')
model = model.to(device)
model = model.eval()
print("generating seqs")
@@ -114,7 +114,7 @@ def predict_seq(seq_labels, input_labels, t5, device):
print(family)
# if("PF05086" not in family):
# continue
sample = random.choices(pfam_seqs[family], k=50)
sample = random.choices(pfam_seqs[family], k=25)
# print(len(sample))
for cnt, seq_data in enumerate(sample):
torch.cuda.empty_cache()
9 changes: 8 additions & 1 deletion scripts/psiblast_drift/get_target_id_list.py
@@ -34,10 +34,17 @@ def read_targets(file):
# print(id_list)
target_file = sys.argv[2]
target_list = read_targets(target_file)
s = set(target_list)
target_list = list(s)
# print(len(target_list))

count_targets = 0
for i, id in enumerate(id_list):
    # print(id)
    pfam_id = id[-7:]
    if pfam_id in target_list:
        #print(id, pfam_id, i+1)
        count_targets += 1
        print(i+1)


#print(count_targets)
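
The added lines deduplicate the target list before counting matches. A minimal illustration with hypothetical Pfam IDs (converting through a set removes duplicates but does not preserve the original order):

```python
target_list = ["PF00001", "PF00002", "PF00001"]
s = set(target_list)
target_list = list(s)      # duplicates removed; order not guaranteed
print(len(target_list))    # 2
```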
Empty file.
