option paths_limit
trvinh committed Dec 13, 2023
1 parent 567e4e2 commit c13f69f
Showing 6 changed files with 74 additions and 48 deletions.
5 changes: 4 additions & 1 deletion greedyFAS/calcFAS.py
@@ -133,6 +133,9 @@ def get_options():
help="Change to define the threshold for the maximal cardinality (number) of feature paths "
"in a graph. If max. cardinality is exceeded the priority mode will be used to for "
"the path evaluation. default=500")
thresholds.add_argument("--paths_limit", default=0, type=int,
help="Specify number of maximum paths to be considered (10^n). If this threshold is exceeded, "
"the corresponding protein will be ignored. Default: 0 for no limit")
obscure.add_argument("--priority_mode", action='store_false',
help="deactivates the greedy strategy priority mode for larger architectures, NOT RECOMMENDED")
obscure.add_argument("--timelimit", default=3600, type=int,
@@ -191,7 +194,7 @@ def fas(opts):
"timelimit": args.timelimit, "phyloprofile": args.phyloprofile, "score_weights": [],
"tsv": args.tsv, "json": args.json, "max_overlap_percentage": 0.0, "domain": args.domain, "pairwise": None,
"eInstance": args.eInstance, "eFeature": args.eFeature, "progress": True,
"empty_as_1": args.empty_as_1, "silent": args.silent
"empty_as_1": args.empty_as_1, "silent": args.silent, "paths_limit": 10**args.paths_limit
}
seedname = '.'.join(args.seed.split('/')[-1].split('.')[:-1])
option_dict["p_path"] = [args.annotation_dir + '/' + seedname + '.json']
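To make the new option concrete, here is a minimal sketch (not part of the commit; the helper name is invented) of how the value is interpreted: the CLI takes an exponent n, calcFAS.py stores 10**n in the option dictionary (see "paths_limit": 10**args.paths_limit above), and a protein whose feature graph has more paths than that limit is skipped. The default n = 0 yields 10**0 = 1, which the downstream check treats as "no limit".

# Illustrative sketch only; mirrors the guard added to fc_prep_query later in this commit.
def exceeds_paths_limit(path_number: int, paths_limit_exponent: int) -> bool:
    limit = 10 ** paths_limit_exponent  # default exponent 0 gives 1 == "no limit"
    return limit > 1 and path_number > limit

# With --paths_limit 6, a protein with 2,000,000 feature paths is skipped ...
assert exceeds_paths_limit(2_000_000, 6)
# ... while the default of 0 never skips anything.
assert not exceeds_paths_limit(2_000_000, 0)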
10 changes: 5 additions & 5 deletions greedyFAS/calcFASmulti.py
@@ -224,11 +224,11 @@ def get_prot_for_taxpair(opts):
taxa_pair = ''
if len(tmp) == 4:
taxa_pair = f'{tmp[1]}#{tmp[3]}'
if paths_limit > 0:
path_p1 = calc_path_number(tmp[0], f'{annotation_dir}/{tmp[1]}.json')
path_p2 = calc_path_number(tmp[2], f'{annotation_dir}/{tmp[3]}.json')
if path_p1 > 10**paths_limit or path_p2 > 10**paths_limit:
return('')
# if paths_limit > 0:
# path_p1 = calc_path_number(tmp[0], f'{annotation_dir}/{tmp[1]}.json')
# path_p2 = calc_path_number(tmp[2], f'{annotation_dir}/{tmp[3]}.json')
# if path_p1 > 10**paths_limit or path_p2 > 10**paths_limit:
# return('')
fp = open(f'{out_dir}/{out_name}_split_inputs/{taxa_pair}.txt', 'a+')
fp.write(f'{tmp[0]}\t{tmp[2]}\n')
fp.close()
29 changes: 19 additions & 10 deletions greedyFAS/mainFAS/fasOutput.py
@@ -30,12 +30,16 @@ def write_tsv_out(outpath, bidirectional, results):
outdict = {}
out.write('Seed\tQuery\tScore(Forward/Reverse)\tMS(Forward/Reverse)\tPS(Forward/Reverse)\tCS(Forward/Reverse)'
'\tLS(Forward/Reverse)\tMethod\n')
for result in results[0]:
outdict[result[0], result[1]] = (result[2], ('NA', 'NA', 'NA', 'NA', 'NA'), result[3])
if bidirectional:
if results[0]:
for result in results[0]:
outdict[result[0], result[1]] = (result[2], ('NA', 'NA', 'NA', 'NA', 'NA'), result[3])
if bidirectional:
for result in results[1]:
outdict[result[1], result[0]] = (outdict[result[1], result[0]][0], result[2],
outdict[result[1],result[0]][2])
else:
for result in results[1]:
outdict[result[1], result[0]] = (outdict[result[1], result[0]][0], result[2], outdict[result[1],
result[0]][2])
outdict[result[0], result[1]] = (result[2], ('NA', 'NA', 'NA', 'NA', 'NA'), result[3])
for pair in outdict:
out.write(pair[0] + '\t' + pair[1] + '\t' + f'{outdict[pair][0][0]:.4}' + '/'
+ f'{outdict[pair][1][0]:.4}' + '\t' + f'{outdict[pair][0][1]:.4}' + '/'
@@ -48,12 +52,17 @@ def write_tsv_out(outpath, bidirectional, results):

def write_json_out(outpath, bidirectional, results):
outdict = {}
for result in results[0]:
outdict[result[0], result[1]] = (result[2], ('NA', 'NA', 'NA', 'NA', 'NA'), result[3])
if bidirectional:
if results[0]:
for result in results[0]:
outdict[result[0], result[1]] = (result[2], ('NA', 'NA', 'NA', 'NA', 'NA'), result[3])
if bidirectional:
for result in results[1]:
outdict[result[1], result[0]] = (outdict[result[1], result[0]][0], result[2],
outdict[result[1],result[0]][2])
else:
for result in results[1]:
outdict[result[1], result[0]] = (outdict[result[1], result[0]][0], result[2], outdict[result[1],
result[0]][2])
outdict[result[0], result[1]] = (result[2], ('NA', 'NA', 'NA', 'NA', 'NA'), result[3])

json_dict = {}
for pair in outdict:
json_dict['_'.join(pair)] = [f'{outdict[pair][0][0]:.4}', f'{outdict[pair][1][0]:.4}']
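The restructuring above guards against an empty forward result set: when results[0] is falsy, the new else branch records the pairs from results[1] with 'NA' placeholders rather than updating forward entries that were never created. Below is a hypothetical placeholder record and, roughly, the TSV row it would produce (identifiers invented; column layout taken from the header string above).

# Example record for a skipped pair as emitted by fc_main (see greedyFAS.py below);
# write_tsv_out would render it approximately as the tab-separated comment line.
result = ('seed_prot1', 'query_prot7', ('NA', 'NA', 'NA', 'NA', 'NA'), 'NA')
# seed_prot1  query_prot7  NA/NA  NA/NA  NA/NA  NA/NA  NA/NA  NA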
72 changes: 42 additions & 30 deletions greedyFAS/mainFAS/greedyFAS.py
@@ -130,27 +130,30 @@ def fc_start(option):
if not option["silent"]:
print("calculating forward scores...")
f_results = fc_main(domain_count, seed_proteome, query_proteome, clan_dict, option, interprokeys, phmm)
if option["MS_uni"] == 0 and option["ref_2"]:
domain_count_2 = {}
for path in option["ref_2"]:
domain_count_2.update(read_json(path)["count"])
if option["weight_correction"]:
domain_count_2 = w_weight_correction(option["weight_correction"], domain_count_2)
option['ref_proteome'] = option['ref_2']
else:
domain_count_2 = domain_count
id_tmp = option["seed_id"]
option["reverse"] = True
option["seed_id"] = option["query_id"]
option["query_id"] = id_tmp
if option["pairwise"]:
pairtmp = []
for pair in option["pairwise"]:
pairtmp.append([pair[1], pair[0]])
option["pairwise"] = pairtmp
if not option["silent"]:
print("calculating backward scores...")
r_results = fc_main(domain_count_2, query_proteome, seed_proteome, clan_dict, option, interprokeys, phmm)
r_results = ()
if not f_results[-1] == 'NA':
if option["MS_uni"] == 0 and option["ref_2"]:
domain_count_2 = {}
for path in option["ref_2"]:
domain_count_2.update(read_json(path)["count"])
if option["weight_correction"]:
domain_count_2 = w_weight_correction(option["weight_correction"], domain_count_2)
option['ref_proteome'] = option['ref_2']
else:
domain_count_2 = domain_count
id_tmp = option["seed_id"]
option["reverse"] = True
option["seed_id"] = option["query_id"]
option["query_id"] = id_tmp
if option["pairwise"]:
pairtmp = []
for pair in option["pairwise"]:
pairtmp.append([pair[1], pair[0]])
option["pairwise"] = pairtmp
if not option["silent"]:
print("calculating backward scores...")
r_results = fc_main(domain_count_2, query_proteome, seed_proteome, clan_dict, option, interprokeys, phmm)

if option["phyloprofile"]:
phyloprofile_out(option["outpath"], True, option["phyloprofile"], (f_results, r_results))
if not option['tsv']:
@@ -241,15 +244,22 @@ def fc_main(domain_count, seed_proteome, query_proteome, clan_dict, option, interprokeys, phmm):
if protein not in seed_proteome:
raise Exception(protein + ' is missing in annotation!')
tmp_query = fc_prep_query(query, domain_count, query_proteome, option, clan_dict)
query_graph, all_query_paths, lin_query_set, query_features, a_q_f, query_clans, clan_dict = tmp_query[0:7]
go_priority, domain_count = tmp_query[7:9]

#### WRITE RESULTS ####################
results.append(fc_main_sub(protein, domain_count, seed_proteome, option, all_query_paths, query_features,
go_priority, a_q_f, clan_dict, query_graph, query_proteome, query, query_clans,
domain_out, interprokeys, phmm))
if option["progress"]:
progress.update(1)
if not tmp_query == None:
tmp_protein = fc_prep_query(protein, 'NA', seed_proteome, option, clan_dict)
if not tmp_protein == None:
query_graph, all_query_paths, lin_query_set, query_features, a_q_f, query_clans, clan_dict = tmp_query[0:7]
go_priority, domain_count = tmp_query[7:9]

#### WRITE RESULTS ####################
results.append(fc_main_sub(protein, domain_count, seed_proteome, option, all_query_paths, query_features,
go_priority, a_q_f, clan_dict, query_graph, query_proteome, query, query_clans,
domain_out, interprokeys, phmm))
if option["progress"]:
progress.update(1)
else:
results.append((protein, query, ('NA', 'NA', 'NA', 'NA', 'NA'), 'NA'))
else:
results.append((protein, query, ('NA', 'NA', 'NA', 'NA', 'NA'), 'NA'))

if option["progress"]:
progress.refresh()
@@ -268,6 +278,8 @@ def fc_prep_query(query, domain_count, query_proteome, option, clan_dict):
query, query_proteome, clan_dict, option)
tmp_query_graph, path_number = pb_region_paths(pb_region_mapper(
lin_query_set, query_features, option["max_overlap"], option["max_overlap_percentage"]))
if option["paths_limit"] > 1 and option["paths_limit"] < path_number:
return None
# PRIORITY CHECK: checking for number of instances - assess complexity of the feature graph
if (len(query_features) > option["priority_threshold"] or path_number > option["max_cardinality"]) and \
option["priority_mode"]:
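The pieces in this file fit together roughly as follows: fc_prep_query returns None once path_number exceeds option["paths_limit"], fc_main turns that into an 'NA' result record, and fc_start wraps the backward run in a check on f_results so that it is skipped when the forward call signals 'NA'. A condensed, self-contained sketch of that flow (names simplified, the actual scoring replaced by a placeholder tuple; not the real greedyFAS code):

# Simplified sketch of the control flow added in this commit.
NA_SCORES = ('NA', 'NA', 'NA', 'NA', 'NA')

def prep_query(path_number, paths_limit):
    # mirrors the new guard in fc_prep_query: bail out on oversized feature graphs
    if 1 < paths_limit < path_number:
        return None
    return path_number  # stand-in for the prepared query data

def score_pair(seed, query, path_number, paths_limit):
    # mirrors fc_main: skipped pairs are reported with 'NA' placeholders
    if prep_query(path_number, paths_limit) is None:
        return (seed, query, NA_SCORES, 'NA')
    return (seed, query, (0.8, 0.8, 0.9, 0.7, 0.6), 'greedy')  # placeholder scores

print(score_pair('seedA', 'queryB', path_number=10**8, paths_limit=10**6))
# -> ('seedA', 'queryB', ('NA', 'NA', 'NA', 'NA', 'NA'), 'NA')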
4 changes: 3 additions & 1 deletion greedyFAS/tsv2json.py
@@ -3,7 +3,9 @@
#######################################################################
# Copyright (C) 2023 Vinh Tran
#
# This file is part of FAS.
# This file is part of FAS. This script is used to convert FAS output
# from TSV (tab-delimited) to JSON format. Only protein IDs and their
# (pairwise) FAS scores will be stored in the JSON output.
#
# FAS is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
2 changes: 1 addition & 1 deletion setup.py
@@ -28,7 +28,7 @@

setup(
name='greedyFAS',
version='1.18.6',
version='1.18.7',
python_requires='>=3.7.0',
description='A tool to compare protein feature architectures',
long_description=long_description,
