Commit 05c30a5

move cache function and generalise

1 parent 770744a · commit 05c30a5

8 files changed: +73 -59 lines changed

ppac_merged_split_run_anno.py (+2 -2)

@@ -5,7 +5,7 @@
 from orthologue_analysis.orthogroups import init_orthogroup_df
 from orthologue_analysis.species import SpeciesList, PristionchusFromTool
 
-from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
+from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline
 
 wbps_col = "Ppac_LT"
 anno_col = "Ppac_anno_LT"
@@ -26,7 +26,7 @@
     load_blast=True
 )
 
-anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline("anno", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix="Transcript")
+anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline("anno", "ppac", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix="Transcript")
 
 num_genes = len(list(species_list.get_species_with_data_label("Ppac_anno_LT").db.all_features(featuretype="gene")))
 print(f"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}")

ppac_merged_split_run_braker3.py (+2 -2)

@@ -5,7 +5,7 @@
 from orthologue_analysis.orthogroups import init_orthogroup_df
 from orthologue_analysis.species import SpeciesList, PristionchusFromTool
 
-from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
+from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline
 
 
 wbps_col = "Ppac_LT"
@@ -27,7 +27,7 @@
     load_blast=True
 )
 
-braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline("braker", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix="Transcript")
+braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline("braker", "ppac", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix="Transcript")
 
 num_genes = len(list(species_list.get_species_with_data_label("Ppac_braker3_LT").db.all_features(featuretype="gene")))
 print(f"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}")

ppac_merged_split_run_helixer.py (+2 -2)

@@ -5,7 +5,7 @@
 from orthologue_analysis.orthogroups import init_orthogroup_df
 from orthologue_analysis.species import SpeciesList, PristionchusFromTool
 
-from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
+from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline
 
 wbps_col = "Ppac_LT"
 helixer_col = "Ppac_helixer_LT"
@@ -26,7 +26,7 @@
     load_blast=True
 )
 
-helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline("helixer", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix="Transcript")
+helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline("helixer", "ppac", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix="Transcript")
 
 num_genes = len(list(species_list.get_species_with_data_label("Ppac_helixer_LT").db.all_features(featuretype="gene")))
 print(f"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}")

ppac_merged_split_run_utils.py (-21)

This file was deleted.

reannotation/pipelines.py (+18 -1)

@@ -1,7 +1,7 @@
 from collections import Counter
-# import contextlib
 import os
 import os.path
+import pickle
 import re
 
 from gffutils.exceptions import FeatureNotFoundError
@@ -202,6 +202,23 @@ def suspicious_orthologue_pipeline(og_df, wbps_col, tool_col, species_list, seq_
     return genuine_merged, genuine_split
 
 
+def pickle_cache_suspicious_orthologue_pipeline(tool, sp_prefix, *args, **kwargs):
+    merged_path = os.path.join("data", "tmp", f"{sp_prefix}_{tool}_merged.pickle")
+    split_path = os.path.join("data", "tmp", f"{sp_prefix}_{tool}_split.pickle")
+    if os.path.isfile(merged_path) and os.path.isfile(split_path):
+        with open(merged_path, "rb") as f:
+            merged = pickle.load(f)
+        with open(split_path, "rb") as f:
+            split = pickle.load(f)
+    else:
+        merged, split = suspicious_orthologue_pipeline(*args, **kwargs)
+        with open(merged_path, 'wb') as f:
+            pickle.dump(merged, f, protocol=pickle.HIGHEST_PROTOCOL)
+        with open(split_path, 'wb') as f:
+            pickle.dump(split, f, protocol=pickle.HIGHEST_PROTOCOL)
+    return merged, split
+
+
 def novel_orthologue_pipeline(og_df, wbps_col, tool_col, species_list, out_dir="data/novel_orthologue_sequences/"):
     makedirs(out_dir)
     count = 0

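For context, the relocated helper wraps suspicious_orthologue_pipeline with an on-disk pickle cache keyed by species prefix and tool name: if both pickles exist they are loaded, otherwise the pipeline runs and its results are written out. A minimal usage sketch, assuming og_df, wbps_col, anno_col, species_list and seq_id_map have already been prepared as in ppac_merged_split_run_anno.py above (and that data/tmp already exists, since the helper does not create it):

# Sketch only: og_df, wbps_col, anno_col, species_list and seq_id_map are assumed
# to be built exactly as in the run scripts in this commit.
from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline

# First call computes the results and writes data/tmp/ppac_anno_merged.pickle
# and data/tmp/ppac_anno_split.pickle; subsequent calls load the cached pickles.
anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(
    "anno", "ppac", og_df, wbps_col, anno_col, species_list, seq_id_map,
    wbps_prefix="Transcript",
)
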
reannotation_hcontortus_all.ipynb (+4 -3)

@@ -39,6 +39,7 @@
 "    interpro_accession_pipeline,\n",
 "    interpro_accession_pipeline_all_tools,\n",
 "    suspicious_orthologue_pipeline,\n",
+"    pickle_cache_suspicious_orthologue_pipeline,\n",
 "    novel_orthologue_pipeline\n",
 ")\n",
 "from reannotation.statistics import fisher_exact_for_two_lists_of_accessions\n",
@@ -143,9 +144,9 @@
 }
 ],
 "source": [
-"braker_merged, braker_split = suspicious_orthologue_pipeline(og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
-"anno_merged, anno_split = suspicious_orthologue_pipeline(og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
-"helixer_merged, helixer_split = suspicious_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list, seq_id_map)"
+"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", \"hcon\", og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
+"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", \"hcon\", og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
+"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", \"hcon\", og_df, wbps_col, helixer_col, species_list, seq_id_map)"
 ]
 },
 {

reannotation_ppacificus_all.ipynb (+23 -13)

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -31,14 +31,18 @@
 "from orthologue_analysis.orthogroups import init_orthogroup_df\n",
 "from orthologue_analysis.species import PristionchusFromTool, SpeciesList\n",
 "from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths\n",
-"from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline\n",
 "from reannotation.analysis import (\n",
 "    interpro_accessions_frequently_missed_by_all_tools,\n",
 "    interpro_accessions_in_novel_transcripts,\n",
 "    interpro_accessions_in_missed_transcripts,\n",
 "    missed_transcripts_with_significantly_more_frequent_accessions\n",
 ")\n",
-"from reannotation.pipelines import interpro_accession_pipeline, suspicious_orthologue_pipeline, novel_orthologue_pipeline\n",
+"from reannotation.pipelines import (\n",
+"    interpro_accession_pipeline,\n",
+"    suspicious_orthologue_pipeline,\n",
+"    pickle_cache_suspicious_orthologue_pipeline,\n",
+"    novel_orthologue_pipeline\n",
+")\n",
 "from reannotation.statistics import fisher_exact_for_two_lists_of_accessions\n",
 "from reannotation.utils import extract_accessions_from_transcript\n",
 "from utils.esm import extract_esm_means\n",
@@ -129,33 +133,39 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
-"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
-"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")"
+"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", \"ppac\", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
+"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", \"ppac\", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
+"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", \"ppac\", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 6,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"BRAKER3: merged=19, split=96, total=0.56\n",
-"Helixer: merged=349, split=591, total=4.0\n",
-"Anno: merged=958, split=100, total=7.13\n"
+"BRAKER3: merged=708, split=102, total=6.3%\n",
+"\tTotal genes: 24077\n",
+"Helixer: merged=351, split=533, total=3.83%\n",
+"\tTotal genes: 32221\n",
+"Anno: merged=1009, split=96, total=7.47%\n",
+"\tTotal genes: 28283\n"
 ]
 }
 ],
 "source": [
 "num_genes = len(list(braker_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(helixer_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(anno_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}\")"
+"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")"
 ]
 },
 {
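As an aside on the "total" figure printed above: the notebooks compute it as 100*(split + 2*merged)/num_genes, rounded to two decimal places (presumably counting each merged prediction against two reference genes). A quick sanity check against the new P. pacificus BRAKER3 output shown in this diff (merged=708, split=102, 24077 genes):

# Worked check of the reported "total" percentage using the BRAKER3 figures above.
merged, split, num_genes = 708, 102, 24077
total = round(100 * (split + merged * 2) / num_genes, 2)
print(f"BRAKER3: merged={merged}, split={split}, total={total}%")  # total=6.3%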

reannotation_smansoni_all.ipynb (+22 -15)

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 40,
+"execution_count": 1,
 "metadata": {},
 "outputs": [
 {
@@ -47,6 +47,7 @@
 "    interpro_accession_pipeline_all_tools,\n",
 "    interpro_accession_pipeline,\n",
 "    suspicious_orthologue_pipeline,\n",
+"    pickle_cache_suspicious_orthologue_pipeline,\n",
 "    novel_orthologue_pipeline\n",
 ")\n",
 "from reannotation.statistics import fisher_exact_for_two_lists_of_accessions, count_transcripts_with_accession\n",
@@ -389,47 +390,53 @@
 },
 {
 "cell_type": "code",
-"execution_count": 17,
+"execution_count": 2,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████| 9122/9122 [00:46<00:00, 195.20it/s] \n",
-"100%|██████████| 9122/9122 [04:02<00:00, 37.68it/s] \n",
-"100%|██████████| 9122/9122 [01:20<00:00, 113.98it/s]\n"
+"100%|██████████| 9122/9122 [00:52<00:00, 172.94it/s] \n",
+"100%|██████████| 9122/9122 [04:36<00:00, 33.05it/s] \n",
+"100%|██████████| 9122/9122 [01:29<00:00, 101.93it/s]\n"
 ]
 }
 ],
 "source": [
-"braker_merged, braker_split = suspicious_orthologue_pipeline(og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
-"anno_merged, anno_split = suspicious_orthologue_pipeline(og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
-"helixer_merged, helixer_split = suspicious_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list, seq_id_map)"
+"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", \"sman\", og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
+"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", \"sman\", og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
+"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", \"sman\", og_df, wbps_col, helixer_col, species_list, seq_id_map)"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 20,
+"execution_count": 4,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"BRAKER3: merged=17, split=10, total=0.48\n",
-"Helixer: merged=161, split=20, total=3.45\n",
-"Anno: merged=204, split=27, total=3.4\n"
+"BRAKER3: merged=17, split=10, total=0.48%\n",
+"\tTotal genes: 9092\n",
+"Helixer: merged=161, split=20, total=3.45%\n",
+"\tTotal genes: 9901\n",
+"Anno: merged=204, split=27, total=3.4%\n",
+"\tTotal genes: 12798\n"
 ]
 }
 ],
 "source": [
 "num_genes = len(list(braker_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(helixer_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(anno_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}\")"
+"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")"
 ]
 },
 {
