update to 1.3.5

PedroMTQ · Jan 15, 2022 · d21ab43
1 parent 19517a8
commit d21ab43
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 66 deletions.
diff --git a/Workflows/Representative_function/Cluster_Representative_Function.py b/Workflows/Representative_function/Cluster_Representative_Function.py
@@ -4,7 +4,7 @@
 import os
 import sys
 from pathlib import Path
-
+from sys import stdin
 
 from unifunc import source
 
@@ -41,8 +41,15 @@ def __init__(self,
         self.OUTPUT_FUNC_SIMILARITY = f'{output_folder}func_sim.tsv'
         self.OUTPUT_FUNC_CLUSTER = f'{output_folder}rep_func_cluster.tsv'
         Path(self.output_folder).mkdir(parents=True, exist_ok=True)
-
-        self.get_representative_function()
+        print(f'Creating functional clusters for {self.input_path}')
+        if self.input_path == 'stdin':
+            self.input_path=stdin
+            self.get_representative_function()
+        else:
+            if os.path.exists(self.input_path):
+                self.get_representative_function()
+            else:
+                print('Input does not exist!')
 
     def pre_process_annotations(self,annotation_str):
         annotation_str=annotation_str.strip()
@@ -68,27 +75,27 @@ def pre_process_annotations(self,annotation_str):
         if not self.keep_hypothetical:
             if 'hypothetical' in annotation_str: annotation_str=''
             if 'predicted' in annotation_str: annotation_str=''
-        #if annotation_str in ['predicted protein','predicted protein, partial','hypothetical protein','hypothetical protein, partial',]:        annotation_str=''
         return annotation_str.strip()
 
     def read_clustered_annotations(self):
-        #this yields all clusters, one by one - less memory footprint
-        #FILE NEEDS TO BE SORTED BY CLUSTER_ID
-        with open(self.input_path) as file:
-            file.readline()
-            temp=[]
-            for line in file:
-                line=line.strip('\n')
+        if self.input_path is stdin:
+            file = sys.stdin
+        else:
+            file = open(self.input_path)
+        res={}
+        for line in file:
+            line=line.strip('\n')
+            if line:
                 line=line.split('\t')
                 gene_id,cluster_id,annotation=line
                 annotation=self.pre_process_annotations(annotation)
-                if temp and cluster_id!=previous_cluster_id:
-                    yield previous_cluster_id,temp
-                    temp=[]
-                temp.append([gene_id,annotation])
-                previous_cluster_id=cluster_id
-        if temp:
-            yield previous_cluster_id,temp
+                if cluster_id not in res: res[cluster_id]=[]
+                res[cluster_id].append([gene_id,annotation])
+            #breaking point for stdin
+            else:
+                break
+        for cluster_id in res:
+            yield cluster_id,res[cluster_id]
 
     def compare_annotations(self):
         #this will yield the cluster_id, the genes (gene_id+annotations) a non-redundant score of all-vs-all annotations

diff --git a/unifunc/__init__.py b/unifunc/__init__.py
@@ -1 +1 @@
-__version__ = "1.3.4"
+__version__ = "1.3.5"
diff --git a/unifunc/__main__.py b/unifunc/__main__.py
@@ -7,12 +7,12 @@
 
 def argv_cluster_representative_function():
     from Workflows.Representative_function.Cluster_Representative_Function import Cluster_Representative_Function
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser = argparse.ArgumentParser(description='This workflow selects a representative function per cluster of genes and respective functions',formatter_class=argparse.RawTextHelpFormatter)
     #just a placeholder
     parser.add_argument('workflow')
 
     parser.add_argument('-i', '--input_path',
-                        help='[required]\tInput with a tsv of clusters formatted as: gene_id|cluster|annotation. The tsv should be sorted by cluster! Otherwise this code will not work properly.')
+                        help='[required]\ttab seperated input with the following format: gene_id|cluster|annotation. Either <path/to/file.tsv> or <stdin> are accepted ')
     parser.add_argument('-o', '--output_folder', help='[required]\tOutput folder path')
 
     parser.add_argument('-v', '--verbose', help='Verbose mode for UniFunc. Default is False', action='store_true')
@@ -92,12 +92,11 @@ def main():
                                                      ' \\___/ |_| |_||_|\\_|     \\__,_||_| |_| \\___|, a functional annotation text similarity analysis tool.\n\n'+
                                                      'UniFunc can be run in two modes:\n' +
                                                      'The default mode returns the similarity score (float) between the provided strings, to run it use: '
-                                                     '<python UniFunc "this is string1" "this is string2">\n' +
+                                                     '<unifunc "this is string1" "this is string2">\n' +
                                                      'The secondary mode requires the user to set a threshold (e.g. 0.95), and True will be returned if the string similarity is above the threshold, and False otherwise. To run it use: ' +
-                                                     '<python UniFunc string1 string2 -t 0.95>\n' +
+                                                     '<unifunc string1 string2 -t 0.95>\n' +
                                                      'To use verbose mode add <-v>, to redirect output to a file, add <-t file_path>'
                                          ,formatter_class=argparse.RawTextHelpFormatter)
-
         parser.add_argument('str1')
         parser.add_argument('str2')
         parser.add_argument('-v','--verbose',action='store_true',help='Verbose mode for UniFunc')