Commit 05c30a5

move cache function and generalise

1 parent 770744a · commit 05c30a5

8 files changed: +73 -59 lines changed

ppac_merged_split_run_anno.py (+2 -2)

@@ -5,7 +5,7 @@
 from orthologue_analysis.orthogroups import init_orthogroup_df
 from orthologue_analysis.species import SpeciesList, PristionchusFromTool
 
-from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
+from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline
 
 wbps_col = "Ppac_LT"
 anno_col = "Ppac_anno_LT"
@@ -26,7 +26,7 @@
     load_blast=True
 )
 
-anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline("anno", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix="Transcript")
+anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline("anno", "ppac", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix="Transcript")
 
 num_genes = len(list(species_list.get_species_with_data_label("Ppac_anno_LT").db.all_features(featuretype="gene")))
 print(f"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}")

ppac_merged_split_run_braker3.py (+2 -2)

@@ -5,7 +5,7 @@
 from orthologue_analysis.orthogroups import init_orthogroup_df
 from orthologue_analysis.species import SpeciesList, PristionchusFromTool
 
-from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
+from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline
 
 
 wbps_col = "Ppac_LT"
@@ -27,7 +27,7 @@
     load_blast=True
 )
 
-braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline("braker", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix="Transcript")
+braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline("braker", "ppac", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix="Transcript")
 
 num_genes = len(list(species_list.get_species_with_data_label("Ppac_braker3_LT").db.all_features(featuretype="gene")))
 print(f"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}")

ppac_merged_split_run_helixer.py (+2 -2)

@@ -5,7 +5,7 @@
 from orthologue_analysis.orthogroups import init_orthogroup_df
 from orthologue_analysis.species import SpeciesList, PristionchusFromTool
 
-from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
+from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline
 
 wbps_col = "Ppac_LT"
 helixer_col = "Ppac_helixer_LT"
@@ -26,7 +26,7 @@
     load_blast=True
 )
 
-helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline("helixer", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix="Transcript")
+helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline("helixer", "ppac", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix="Transcript")
 
 num_genes = len(list(species_list.get_species_with_data_label("Ppac_helixer_LT").db.all_features(featuretype="gene")))
 print(f"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}")

ppac_merged_split_run_utils.py (-21)

This file was deleted.

reannotation/pipelines.py (+18 -1)

@@ -1,7 +1,7 @@
 from collections import Counter
-# import contextlib
 import os
 import os.path
+import pickle
 import re
 
 from gffutils.exceptions import FeatureNotFoundError
@@ -202,6 +202,23 @@ def suspicious_orthologue_pipeline(og_df, wbps_col, tool_col, species_list, seq_
     return genuine_merged, genuine_split
 
 
+def pickle_cache_suspicious_orthologue_pipeline(tool, sp_prefix, *args, **kwargs):
+    merged_path = os.path.join("data", "tmp", f"{sp_prefix}_{tool}_merged.pickle")
+    split_path = os.path.join("data", "tmp", f"{sp_prefix}_{tool}_split.pickle")
+    if os.path.isfile(merged_path) and os.path.isfile(split_path):
+        with open(merged_path, "rb") as f:
+            merged = pickle.load(f)
+        with open(split_path, "rb") as f:
+            split = pickle.load(f)
+    else:
+        merged, split = suspicious_orthologue_pipeline(*args, **kwargs)
+        with open(merged_path, 'wb') as f:
+            pickle.dump(merged, f, protocol=pickle.HIGHEST_PROTOCOL)
+        with open(split_path, 'wb') as f:
+            pickle.dump(split, f, protocol=pickle.HIGHEST_PROTOCOL)
+    return merged, split
+
+
 def novel_orthologue_pipeline(og_df, wbps_col, tool_col, species_list, out_dir="data/novel_orthologue_sequences/"):
     makedirs(out_dir)
     count = 0

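For context, the relocated helper wraps suspicious_orthologue_pipeline with an on-disk pickle cache keyed by species prefix and tool name: if both pickles exist they are loaded, otherwise the pipeline runs and its results are written out. A minimal usage sketch, assuming og_df, wbps_col, anno_col, species_list and seq_id_map have already been prepared as in ppac_merged_split_run_anno.py above (and that data/tmp already exists, since the helper does not create it):

# Sketch only: og_df, wbps_col, anno_col, species_list and seq_id_map are assumed
# to be built exactly as in the run scripts in this commit.
from reannotation.pipelines import pickle_cache_suspicious_orthologue_pipeline

# First call computes the results and writes data/tmp/ppac_anno_merged.pickle
# and data/tmp/ppac_anno_split.pickle; subsequent calls load the cached pickles.
anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(
    "anno", "ppac", og_df, wbps_col, anno_col, species_list, seq_id_map,
    wbps_prefix="Transcript",
)
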
reannotation_hcontortus_all.ipynb (+4 -3)

@@ -39,6 +39,7 @@
 "    interpro_accession_pipeline,\n",
 "    interpro_accession_pipeline_all_tools,\n",
 "    suspicious_orthologue_pipeline,\n",
+"    pickle_cache_suspicious_orthologue_pipeline,\n",
 "    novel_orthologue_pipeline\n",
 ")\n",
 "from reannotation.statistics import fisher_exact_for_two_lists_of_accessions\n",
@@ -143,9 +144,9 @@
 }
 ],
 "source": [
-"braker_merged, braker_split = suspicious_orthologue_pipeline(og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
-"anno_merged, anno_split = suspicious_orthologue_pipeline(og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
-"helixer_merged, helixer_split = suspicious_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list, seq_id_map)"
+"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", \"hcon\", og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
+"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", \"hcon\", og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
+"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", \"hcon\", og_df, wbps_col, helixer_col, species_list, seq_id_map)"
 ]
 },
 {

reannotation_ppacificus_all.ipynb (+23 -13)

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -31,14 +31,18 @@
 "from orthologue_analysis.orthogroups import init_orthogroup_df\n",
 "from orthologue_analysis.species import PristionchusFromTool, SpeciesList\n",
 "from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths\n",
-"from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline\n",
 "from reannotation.analysis import (\n",
 "    interpro_accessions_frequently_missed_by_all_tools,\n",
 "    interpro_accessions_in_novel_transcripts,\n",
 "    interpro_accessions_in_missed_transcripts,\n",
 "    missed_transcripts_with_significantly_more_frequent_accessions\n",
 ")\n",
-"from reannotation.pipelines import interpro_accession_pipeline, suspicious_orthologue_pipeline, novel_orthologue_pipeline\n",
+"from reannotation.pipelines import (\n",
+"    interpro_accession_pipeline,\n",
+"    suspicious_orthologue_pipeline,\n",
+"    pickle_cache_suspicious_orthologue_pipeline,\n",
+"    novel_orthologue_pipeline\n",
+")\n",
 "from reannotation.statistics import fisher_exact_for_two_lists_of_accessions\n",
 "from reannotation.utils import extract_accessions_from_transcript\n",
 "from utils.esm import extract_esm_means\n",
@@ -129,33 +133,39 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
-"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
-"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")"
+"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", \"ppac\", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
+"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", \"ppac\", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")\n",
+"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", \"ppac\", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix=\"Transcript\")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 6,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"BRAKER3: merged=19, split=96, total=0.56\n",
-"Helixer: merged=349, split=591, total=4.0\n",
-"Anno: merged=958, split=100, total=7.13\n"
+"BRAKER3: merged=708, split=102, total=6.3%\n",
+"\tTotal genes: 24077\n",
+"Helixer: merged=351, split=533, total=3.83%\n",
+"\tTotal genes: 32221\n",
+"Anno: merged=1009, split=96, total=7.47%\n",
+"\tTotal genes: 28283\n"
 ]
 }
 ],
 "source": [
 "num_genes = len(list(braker_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(helixer_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(anno_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}\")"
+"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")"
 ]
 },
 {
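As an aside on the "total" figure printed above: the notebooks compute it as 100*(split + 2*merged)/num_genes, rounded to two decimal places (presumably counting each merged prediction against two reference genes). A quick sanity check against the new P. pacificus BRAKER3 output shown in this diff (merged=708, split=102, 24077 genes):

# Worked check of the reported "total" percentage using the BRAKER3 figures above.
merged, split, num_genes = 708, 102, 24077
total = round(100 * (split + merged * 2) / num_genes, 2)
print(f"BRAKER3: merged={merged}, split={split}, total={total}%")  # total=6.3%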

reannotation_smansoni_all.ipynb (+22 -15)

@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 40,
+"execution_count": 1,
 "metadata": {},
 "outputs": [
 {
@@ -47,6 +47,7 @@
 "    interpro_accession_pipeline_all_tools,\n",
 "    interpro_accession_pipeline,\n",
 "    suspicious_orthologue_pipeline,\n",
+"    pickle_cache_suspicious_orthologue_pipeline,\n",
 "    novel_orthologue_pipeline\n",
 ")\n",
 "from reannotation.statistics import fisher_exact_for_two_lists_of_accessions, count_transcripts_with_accession\n",
@@ -389,47 +390,53 @@
 },
 {
 "cell_type": "code",
-"execution_count": 17,
+"execution_count": 2,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████| 9122/9122 [00:46<00:00, 195.20it/s] \n",
-"100%|██████████| 9122/9122 [04:02<00:00, 37.68it/s] \n",
-"100%|██████████| 9122/9122 [01:20<00:00, 113.98it/s]\n"
+"100%|██████████| 9122/9122 [00:52<00:00, 172.94it/s] \n",
+"100%|██████████| 9122/9122 [04:36<00:00, 33.05it/s] \n",
+"100%|██████████| 9122/9122 [01:29<00:00, 101.93it/s]\n"
 ]
 }
 ],
 "source": [
-"braker_merged, braker_split = suspicious_orthologue_pipeline(og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
-"anno_merged, anno_split = suspicious_orthologue_pipeline(og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
-"helixer_merged, helixer_split = suspicious_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list, seq_id_map)"
+"braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline(\"braker\", \"sman\", og_df, wbps_col, braker_col, species_list, seq_id_map)\n",
+"anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline(\"anno\", \"sman\", og_df, wbps_col, anno_col, species_list, seq_id_map)\n",
+"helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline(\"helixer\", \"sman\", og_df, wbps_col, helixer_col, species_list, seq_id_map)"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 20,
+"execution_count": 4,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"BRAKER3: merged=17, split=10, total=0.48\n",
-"Helixer: merged=161, split=20, total=3.45\n",
-"Anno: merged=204, split=27, total=3.4\n"
+"BRAKER3: merged=17, split=10, total=0.48%\n",
+"\tTotal genes: 9092\n",
+"Helixer: merged=161, split=20, total=3.45%\n",
+"\tTotal genes: 9901\n",
+"Anno: merged=204, split=27, total=3.4%\n",
+"\tTotal genes: 12798\n"
 ]
 }
 ],
 "source": [
 "num_genes = len(list(braker_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(helixer_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}\")\n",
+"print(f\"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")\n",
 "num_genes = len(list(anno_species.db.all_features(featuretype=\"gene\")))\n",
-"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}\")"
+"print(f\"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}%\")\n",
+"print(f\"\\tTotal genes: {num_genes}\")"
 ]
 },
 {
