Skip to content

Commit

Permalink
update to 1.3.5
Browse files Browse the repository at this point in the history
  • Loading branch information
PedroMTQ committed Jan 15, 2022
1 parent 19517a8 commit d21ab43
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import sys
from pathlib import Path

from sys import stdin

from unifunc import source

Expand Down Expand Up @@ -41,8 +41,15 @@ def __init__(self,
self.OUTPUT_FUNC_SIMILARITY = f'{output_folder}func_sim.tsv'
self.OUTPUT_FUNC_CLUSTER = f'{output_folder}rep_func_cluster.tsv'
Path(self.output_folder).mkdir(parents=True, exist_ok=True)

self.get_representative_function()
print(f'Creating functional clusters for {self.input_path}')
if self.input_path == 'stdin':
self.input_path=stdin
self.get_representative_function()
else:
if os.path.exists(self.input_path):
self.get_representative_function()
else:
print('Input does not exist!')

def pre_process_annotations(self,annotation_str):
annotation_str=annotation_str.strip()
Expand All @@ -68,27 +75,27 @@ def pre_process_annotations(self,annotation_str):
if not self.keep_hypothetical:
if 'hypothetical' in annotation_str: annotation_str=''
if 'predicted' in annotation_str: annotation_str=''
#if annotation_str in ['predicted protein','predicted protein, partial','hypothetical protein','hypothetical protein, partial',]: annotation_str=''
return annotation_str.strip()

def read_clustered_annotations(self):
#this yields all clusters, one by one - less memory footprint
#FILE NEEDS TO BE SORTED BY CLUSTER_ID
with open(self.input_path) as file:
file.readline()
temp=[]
for line in file:
line=line.strip('\n')
if self.input_path is stdin:
file = sys.stdin
else:
file = open(self.input_path)
res={}
for line in file:
line=line.strip('\n')
if line:
line=line.split('\t')
gene_id,cluster_id,annotation=line
annotation=self.pre_process_annotations(annotation)
if temp and cluster_id!=previous_cluster_id:
yield previous_cluster_id,temp
temp=[]
temp.append([gene_id,annotation])
previous_cluster_id=cluster_id
if temp:
yield previous_cluster_id,temp
if cluster_id not in res: res[cluster_id]=[]
res[cluster_id].append([gene_id,annotation])
#an empty line marks the end of the input (needed when reading from stdin), so stop reading here
else:
break
for cluster_id in res:
yield cluster_id,res[cluster_id]

def compare_annotations(self):
#this will yield the cluster_id, the genes (gene_id+annotations), and a non-redundant score of all-vs-all annotations
Expand Down
2 changes: 1 addition & 1 deletion unifunc/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.3.4"
__version__ = "1.3.5"
9 changes: 4 additions & 5 deletions unifunc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@

def argv_cluster_representative_function():
from Workflows.Representative_function.Cluster_Representative_Function import Cluster_Representative_Function
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser = argparse.ArgumentParser(description='This workflow selects a representative function per cluster of genes and respective functions',formatter_class=argparse.RawTextHelpFormatter)
#just a placeholder
parser.add_argument('workflow')

parser.add_argument('-i', '--input_path',
help='[required]\tInput with a tsv of clusters formatted as: gene_id|cluster|annotation. The tsv should be sorted by cluster! Otherwise this code will not work properly.')
help='[required]\ttab seperated input with the following format: gene_id|cluster|annotation. Either <path/to/file.tsv> or <stdin> are accepted ')
parser.add_argument('-o', '--output_folder', help='[required]\tOutput folder path')

parser.add_argument('-v', '--verbose', help='Verbose mode for UniFunc. Default is False', action='store_true')
Expand Down Expand Up @@ -92,12 +92,11 @@ def main():
' \\___/ |_| |_||_|\\_| \\__,_||_| |_| \\___|, a functional annotation text similarity analysis tool.\n\n'+
'UniFunc can be run in two modes:\n' +
'The default mode returns the similarity score (float) between the provided strings, to run it use: '
'<python UniFunc "this is string1" "this is string2">\n' +
'<unifunc "this is string1" "this is string2">\n' +
'The secondary mode requires the user to set a threshold (e.g. 0.95), and True will be returned if the string similarity is above the threshold, and False otherwise. To run it use: ' +
'<python UniFunc string1 string2 -t 0.95>\n' +
'<unifunc string1 string2 -t 0.95>\n' +
'To use verbose mode add <-v>, to redirect output to a file, add <-t file_path>'
,formatter_class=argparse.RawTextHelpFormatter)

parser.add_argument('str1')
parser.add_argument('str2')
parser.add_argument('-v','--verbose',action='store_true',help='Verbose mode for UniFunc')
Expand Down
Loading

0 comments on commit d21ab43

Please sign in to comment.